ICU 74.1  74.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
86 public:
92 
104  static const Normalizer2 *
106 
118  static const Normalizer2 *
120 
132  static const Normalizer2 *
134 
146  static const Normalizer2 *
148 
163  static const Normalizer2 *
165 
166 #ifndef U_HIDE_DRAFT_API
181  static const Normalizer2 *
183 #endif // U_HIDE_DRAFT_API
184 
/**
 * Returns a Normalizer2 instance that uses the data file identified by
 * packageName and name.
 *
 * @param packageName package part of the data file identification
 * @param name        name part of the data file identification
 * @param mode        one of the UNormalization2Mode normalization-mode constants
 * @param errorCode   ICU error code, used in place of exceptions
 * @return pointer to a const Normalizer2
 *         NOTE(review): ownership of the returned pointer and the value
 *         returned on failure are not visible in this listing -- confirm.
 */
206  static const Normalizer2 *
207  getInstance(const char *packageName,
208  const char *name,
209  UNormalization2Mode mode,
210  UErrorCode &errorCode);
211 
/**
 * Convenience overload: returns the normalized form of the source string
 * by value.
 *
 * Forwards to the three-argument normalize(src, dest, errorCode) with a
 * local UnicodeString as the destination, then returns that local.
 * The return value of the forwarded call is ignored; failures are visible
 * to the caller only through errorCode.
 *
 * @param src       source string
 * @param errorCode ICU error code, passed through unchanged
 * @return the local destination string, by value
 */
223  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
224  UnicodeString result;
225  normalize(src, result, errorCode);
226  return result;
227  }
241  virtual UnicodeString &
243  UnicodeString &dest,
244  UErrorCode &errorCode) const = 0;
245 
/**
 * Normalizes a UTF-8 string and optionally records how source substrings
 * relate to the result.
 *
 * Declared virtual but not pure virtual, unlike most members of this class.
 *
 * @param options   NOTE(review): option bits are not described in this listing -- confirm
 * @param src       UTF-8 input, as a StringPiece (pointer to a sized piece of memory)
 * @param sink      ByteSink that is filled with the output bytes
 * @param edits     optional Edits object recording lengths of string edits
 *                  (passed by pointer; presumably may be null -- confirm)
 * @param errorCode ICU error code, used in place of exceptions
 */
268  virtual void
269  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
270  Edits *edits, UErrorCode &errorCode) const;
271 
286  virtual UnicodeString &
288  const UnicodeString &second,
289  UErrorCode &errorCode) const = 0;
304  virtual UnicodeString &
306  const UnicodeString &second,
307  UErrorCode &errorCode) const = 0;
308 
/**
 * Gets the decomposition mapping of c.
 *
 * @param c             code point
 * @param decomposition string that receives the mapping
 * @return UBool; presumably true when c has a decomposition -- confirm
 */
322  virtual UBool
323  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
324 
/**
 * Gets the raw decomposition mapping of c.
 *
 * Declared virtual but not pure virtual, unlike getDecomposition().
 *
 * @param c             code point
 * @param decomposition string that receives the mapping
 * @return UBool; presumably true when c has a decomposition -- confirm
 */
349  virtual UBool
350  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
351 
367  virtual UChar32
369 
378  virtual uint8_t
380 
/**
 * Tests if the string is normalized.
 *
 * @param s         input string
 * @param errorCode ICU error code, used in place of exceptions
 * @return UBool test result
 */
395  virtual UBool
396  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
416  virtual UBool
418 
419 
436  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
437 
/**
 * Returns the end of the normalized substring of the input string.
 *
 * @param s         input string
 * @param errorCode ICU error code, used in place of exceptions
 * @return end of the normalized substring, as an int32_t
 */
460  virtual int32_t
461  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
462 
/**
 * Tests if the character always has a normalization boundary before it,
 * regardless of context.
 *
 * @param c code point to test
 */
476  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
477 
/**
 * Tests if the character always has a normalization boundary after it,
 * regardless of context.
 *
 * @param c code point to test
 */
492  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
493 
/**
 * Tests if the character is normalization-inert.
 *
 * @param c code point to test
 */
507  virtual UBool isInert(UChar32 c) const = 0;
508 };
509 
522 public:
/**
 * Constructs a filtered normalizer wrapping any Normalizer2 instance and a
 * filter set.
 *
 * Both arguments are bound to reference members (norm2 and set); nothing is
 * copied here. The caller must therefore keep n2 and filterSet alive for as
 * long as this object is used.
 *
 * @param n2        wrapped Normalizer2 instance
 * @param filterSet UnicodeSet used as the filter
 */
533  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
534  norm2(n2), set(filterSet) {}
535 
541 
555  virtual UnicodeString &
557  UnicodeString &dest,
558  UErrorCode &errorCode) const override;
559 
/**
 * Override of Normalizer2::normalizeUTF8(): normalizes a UTF-8 string and
 * optionally records how source substrings relate to the result.
 *
 * @param options   NOTE(review): option bits are not described in this listing -- confirm
 * @param src       UTF-8 input
 * @param sink      ByteSink that is filled with the output bytes
 * @param edits     optional Edits object recording lengths of string edits
 * @param errorCode ICU error code, used in place of exceptions
 */
582  virtual void
583  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
584  Edits *edits, UErrorCode &errorCode) const override;
585 
600  virtual UnicodeString &
602  const UnicodeString &second,
603  UErrorCode &errorCode) const override;
618  virtual UnicodeString &
620  const UnicodeString &second,
621  UErrorCode &errorCode) const override;
622 
/**
 * Override of Normalizer2::getDecomposition(): gets the decomposition
 * mapping of c.
 *
 * @param c             code point
 * @param decomposition string that receives the mapping
 */
634  virtual UBool
635  getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
636 
/**
 * Override of Normalizer2::getRawDecomposition(): gets the raw
 * decomposition mapping of c.
 *
 * @param c             code point
 * @param decomposition string that receives the mapping
 */
648  virtual UBool
649  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
650 
/**
 * Override of Normalizer2::composePair(): performs pairwise composition of
 * a & b and returns the composite if there is one.
 *
 * @param a first code point of the pair
 * @param b second code point of the pair
 * @return the composite; the value returned when there is none is not
 *         visible in this listing -- confirm
 */
661  virtual UChar32
662  composePair(UChar32 a, UChar32 b) const override;
663 
/**
 * Override of Normalizer2::getCombiningClass(): gets the combining class
 * of c.
 *
 * @param c code point
 * @return combining class as a uint8_t
 */
672  virtual uint8_t
673  getCombiningClass(UChar32 c) const override;
674 
/**
 * Override of Normalizer2::isNormalized(): tests if the string is
 * normalized.
 *
 * @param s         input string
 * @param errorCode ICU error code, used in place of exceptions
 */
686  virtual UBool
687  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
 * Override of Normalizer2::isNormalizedUTF8(): tests if the UTF-8 string
 * is normalized.
 *
 * @param s         UTF-8 input, as a StringPiece
 * @param errorCode ICU error code, used in place of exceptions
 */
707  virtual UBool
708  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
721  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
 * Override of Normalizer2::spanQuickCheckYes(): returns the end of the
 * normalized substring of the input string.
 *
 * @param s         input string
 * @param errorCode ICU error code, used in place of exceptions
 */
733  virtual int32_t
734  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
735 
/**
 * Override of Normalizer2::hasBoundaryBefore(): tests if the character
 * always has a normalization boundary before it, regardless of context.
 */
744  virtual UBool hasBoundaryBefore(UChar32 c) const override;
745 
/**
 * Override of Normalizer2::hasBoundaryAfter(): tests if the character
 * always has a normalization boundary after it, regardless of context.
 */
754  virtual UBool hasBoundaryAfter(UChar32 c) const override;
755 
/**
 * Override of Normalizer2::isInert(): tests if the character is
 * normalization-inert.
 */
763  virtual UBool isInert(UChar32 c) const override;
764 private:
/**
 * Private overload of normalize() that takes an additional
 * USetSpanCondition argument.
 *
 * NOTE(review): presumably the public normalize() delegates here and
 * spanCondition controls how the filter set is spanned over src; the
 * implementation is not part of this header -- confirm in the .cpp file.
 */
765  UnicodeString &
766  normalize(const UnicodeString &src,
767  UnicodeString &dest,
768  USetSpanCondition spanCondition,
769  UErrorCode &errorCode) const;
770 
/**
 * Private overload of normalizeUTF8() that takes the input as a
 * pointer/length pair instead of a StringPiece, plus an additional
 * USetSpanCondition argument.
 *
 * NOTE(review): presumably the public normalizeUTF8() delegates here; the
 * implementation is not part of this header -- confirm in the .cpp file.
 */
771  void
772  normalizeUTF8(uint32_t options, const char *src, int32_t length,
773  ByteSink &sink, Edits *edits,
774  USetSpanCondition spanCondition,
775  UErrorCode &errorCode) const;
776 
/**
 * Private overload of normalizeSecondAndAppend() that takes an additional
 * doNormalize flag.
 *
 * NOTE(review): presumably shared by the public normalizeSecondAndAppend()
 * and append(), with doNormalize selecting between them; the
 * implementation is not part of this header -- confirm in the .cpp file.
 */
777  UnicodeString &
778  normalizeSecondAndAppend(UnicodeString &first,
779  const UnicodeString &second,
780  UBool doNormalize,
781  UErrorCode &errorCode) const;
782 
// Both members are references, initialized by the constructor from its
// arguments: this class does not own the wrapped normalizer or the filter
// set, and the referenced objects must outlive it.
783  const Normalizer2 &norm2;
784  const UnicodeSet &set;
785 };
786 
787 U_NAMESPACE_END
788 
789 #endif // !UCONFIG_NO_NORMALIZATION
790 
791 #endif /* U_SHOW_CPLUSPLUS_API */
792 
793 #endif // __NORMALIZER2_H__
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
Records lengths of string edits but not replacement text.
Definition: edits.h:80
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:521
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const override
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override
Tests if the UTF-8 string is normalized.
~FilteredNormalizer2()
Destructor.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool isInert(UChar32 c) const override
Tests if the character is normalization-inert.
virtual UChar32 composePair(UChar32 a, UChar32 b) const override
Performs pairwise composition of a & b and returns the composite if there is one.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the decomposition mapping of c.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the raw decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const override
Gets the combining class of c.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const override
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool hasBoundaryBefore(UChar32 c) const override
Tests if the character always has a normalization boundary before it, regardless of context.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the second string to the first string (merging them at the boundary) and returns the first string.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:533
virtual UBool hasBoundaryAfter(UChar32 c) const override
Tests if the character always has a normalization boundary after it, regardless of context.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override
Returns the end of the normalized substring of the input string.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
~Normalizer2()
Destructor.
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
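Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.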
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:223
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
static const Normalizer2 * getNFKCSimpleCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Casefold mappings and then NFC.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:296
C++ API: StringPiece: Read-only byte string wrapper class.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:435
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
C++ API: Unicode Set.
C++ API: Unicode String.
C API: New API for Unicode Normalization.
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set.
Definition: uset.h:184
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300