ICU 69.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
91  ~Normalizer2();
92 
104  static const Normalizer2 *
105  getNFCInstance(UErrorCode &errorCode);
106 
118  static const Normalizer2 *
119  getNFDInstance(UErrorCode &errorCode);
120 
132  static const Normalizer2 *
133  getNFKCInstance(UErrorCode &errorCode);
134 
146  static const Normalizer2 *
147  getNFKDInstance(UErrorCode &errorCode);
148 
160  static const Normalizer2 *
161  getNFKCCasefoldInstance(UErrorCode &errorCode);
162 
184  static const Normalizer2 *
185  getInstance(const char *packageName,
186  const char *name,
187  UNormalization2Mode mode,
188  UErrorCode &errorCode);
189 
200  UnicodeString
201  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202  UnicodeString result;
203  normalize(src, result, errorCode);
204  return result;
205  }
219  virtual UnicodeString &
220  normalize(const UnicodeString &src,
221  UnicodeString &dest,
222  UErrorCode &errorCode) const = 0;
223 
246  virtual void
247  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248  Edits *edits, UErrorCode &errorCode) const;
249 
264  virtual UnicodeString &
265  normalizeSecondAndAppend(UnicodeString &first,
266  const UnicodeString &second,
267  UErrorCode &errorCode) const = 0;
282  virtual UnicodeString &
283  append(UnicodeString &first,
284  const UnicodeString &second,
285  UErrorCode &errorCode) const = 0;
286 
300  virtual UBool
301  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302 
327  virtual UBool
328  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329 
345  virtual UChar32
346  composePair(UChar32 a, UChar32 b) const;
347 
356  virtual uint8_t
357  getCombiningClass(UChar32 c) const;
358 
373  virtual UBool
374  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
394  virtual UBool
395  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
396 
397 
413  virtual UNormalizationCheckResult
414  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
415 
438  virtual int32_t
439  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
440 
454  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
455 
470  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
471 
485  virtual UBool isInert(UChar32 c) const = 0;
486 };
487 
499 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
500 public:
511  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
512  norm2(n2), set(filterSet) {}
513 
519 
533  virtual UnicodeString &
534  normalize(const UnicodeString &src,
535  UnicodeString &dest,
536  UErrorCode &errorCode) const U_OVERRIDE;
537 
560  virtual void
561  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
562  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
563 
578  virtual UnicodeString &
579  normalizeSecondAndAppend(UnicodeString &first,
580  const UnicodeString &second,
581  UErrorCode &errorCode) const U_OVERRIDE;
596  virtual UnicodeString &
597  append(UnicodeString &first,
598  const UnicodeString &second,
599  UErrorCode &errorCode) const U_OVERRIDE;
600 
612  virtual UBool
613  getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
614 
626  virtual UBool
627  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
628 
639  virtual UChar32
640  composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
641 
650  virtual uint8_t
651  getCombiningClass(UChar32 c) const U_OVERRIDE;
652 
664  virtual UBool
665  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
685  virtual UBool
686  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
698  virtual UNormalizationCheckResult
699  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
711  virtual int32_t
712  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
713 
722  virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
723 
732  virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
733 
741  virtual UBool isInert(UChar32 c) const U_OVERRIDE;
742 private:
743  UnicodeString &
744  normalize(const UnicodeString &src,
745  UnicodeString &dest,
746  USetSpanCondition spanCondition,
747  UErrorCode &errorCode) const;
748 
749  void
750  normalizeUTF8(uint32_t options, const char *src, int32_t length,
751  ByteSink &sink, Edits *edits,
752  USetSpanCondition spanCondition,
753  UErrorCode &errorCode) const;
754 
755  UnicodeString &
756  normalizeSecondAndAppend(UnicodeString &first,
757  const UnicodeString &second,
758  UBool doNormalize,
759  UErrorCode &errorCode) const;
760 
761  const Normalizer2 &norm2;
762  const UnicodeSet &set;
763 };
764 
765 U_NAMESPACE_END
766 
767 #endif // !UCONFIG_NO_NORMALIZATION
768 
769 #endif /* U_SHOW_CPLUSPLUS_API */
770 
771 #endif // __NORMALIZER2_H__
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
#define U_OVERRIDE
Defined to the C++11 "override" keyword if available.
Definition: umachine.h:130
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:201
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
Records lengths of string edits but not replacement text.
Definition: edits.h:80
C++ API: StringPiece: Read-only byte string wrapper class.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:467
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:511
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:279
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition: uset.h:159
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:295
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:499
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269
C++ API: Unicode Set.