ICU 67.1  67.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
86 public:
92 
104  static const Normalizer2 *
106 
118  static const Normalizer2 *
120 
132  static const Normalizer2 *
134 
146  static const Normalizer2 *
148 
160  static const Normalizer2 *
162 
184  static const Normalizer2 *
185  getInstance(const char *packageName,
186  const char *name,
187  UNormalization2Mode mode,
188  UErrorCode &errorCode);
189 
201  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202  UnicodeString result;
203  normalize(src, result, errorCode);
204  return result;
205  }
219  virtual UnicodeString &
221  UnicodeString &dest,
222  UErrorCode &errorCode) const = 0;
223 
248  virtual void
249  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
250  Edits *edits, UErrorCode &errorCode) const;
251 
266  virtual UnicodeString &
268  const UnicodeString &second,
269  UErrorCode &errorCode) const = 0;
284  virtual UnicodeString &
286  const UnicodeString &second,
287  UErrorCode &errorCode) const = 0;
288 
302  virtual UBool
303  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
304 
329  virtual UBool
330  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
331 
347  virtual UChar32
349 
358  virtual uint8_t
360 
375  virtual UBool
376  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
398  virtual UBool
400 
401 
418  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
419 
442  virtual int32_t
443  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
444 
458  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
459 
474  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
475 
489  virtual UBool isInert(UChar32 c) const = 0;
490 };
491 
504 public:
515  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
516  norm2(n2), set(filterSet) {}
517 
523 
537  virtual UnicodeString &
539  UnicodeString &dest,
540  UErrorCode &errorCode) const U_OVERRIDE;
541 
566  virtual void
567  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
568  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
569 
584  virtual UnicodeString &
586  const UnicodeString &second,
587  UErrorCode &errorCode) const U_OVERRIDE;
602  virtual UnicodeString &
604  const UnicodeString &second,
605  UErrorCode &errorCode) const U_OVERRIDE;
606 
618  virtual UBool
620 
632  virtual UBool
634 
645  virtual UChar32
647 
656  virtual uint8_t
658 
670  virtual UBool
671  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
693  virtual UBool
707  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
719  virtual int32_t
721 
731 
741 
749  virtual UBool isInert(UChar32 c) const U_OVERRIDE;
750 private:
751  UnicodeString &
752  normalize(const UnicodeString &src,
753  UnicodeString &dest,
754  USetSpanCondition spanCondition,
755  UErrorCode &errorCode) const;
756 
757  void
758  normalizeUTF8(uint32_t options, const char *src, int32_t length,
759  ByteSink &sink, Edits *edits,
760  USetSpanCondition spanCondition,
761  UErrorCode &errorCode) const;
762 
763  UnicodeString &
765  const UnicodeString &second,
766  UBool doNormalize,
767  UErrorCode &errorCode) const;
768 
769  const Normalizer2 &norm2;
770  const UnicodeSet &set;
771 };
772 
773 U_NAMESPACE_END
774 
775 #endif // !UCONFIG_NO_NORMALIZATION
776 
777 #endif /* U_SHOW_CPLUSPLUS_API */
778 
779 #endif // __NORMALIZER2_H__
icu::Normalizer2::getRawDecomposition
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
icu::FilteredNormalizer2::composePair
virtual UChar32 composePair(UChar32 a, UChar32 b) const U_OVERRIDE
Performs pairwise composition of a & b and returns the composite if there is one.
icu::Normalizer2::getDecomposition
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
icu::Normalizer2::spanQuickCheckYes
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
UNormalization2Mode
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:45
icu::Normalizer2::getInstance
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
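Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.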
icu::FilteredNormalizer2::normalizeUTF8
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const U_OVERRIDE
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
icu::UnicodeSet
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:281
icu::FilteredNormalizer2::getRawDecomposition
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the raw decomposition mapping of c.
UBool
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261
icu::FilteredNormalizer2::~FilteredNormalizer2
~FilteredNormalizer2()
Destructor.
icu::Normalizer2::getNFKCInstance
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
icu::Normalizer2::hasBoundaryAfter
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
icu::FilteredNormalizer2::normalizeSecondAndAppend
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
U_OVERRIDE
#define U_OVERRIDE
Defined to the C++11 "override" keyword if available.
Definition: umachine.h:129
U_COMMON_API
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300
USetSpanCondition
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition: uset.h:156
icu::FilteredNormalizer2::FilteredNormalizer2
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:515
icu::FilteredNormalizer2::append
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the second string to the first string (merging them at the boundary) and returns the first string.
stringpiece.h
C++ API: StringPiece: Read-only byte string wrapper class.
icu::FilteredNormalizer2::hasBoundaryBefore
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary before it, regardless of context.
icu::UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:295
icu::FilteredNormalizer2::isNormalized
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
icu::Normalizer2::getNFCInstance
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
icu::FilteredNormalizer2::quickCheck
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
UChar32
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
icu::UObject
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
UErrorCode
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
icu::Edits
Records lengths of string edits but not replacement text.
Definition: edits.h:80
icu::Normalizer2::isNormalizedUTF8
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
icu::FilteredNormalizer2::normalize
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const U_OVERRIDE
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
icu::ByteSink
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
icu::Normalizer2
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
icu::Normalizer2::quickCheck
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
icu::FilteredNormalizer2::getDecomposition
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the decomposition mapping of c.
icu::Normalizer2::append
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
icu::FilteredNormalizer2::hasBoundaryAfter
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary after it, regardless of context.
UNormalizationCheckResult
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:94
unorm2.h
C API: New API for Unicode Normalization.
icu::FilteredNormalizer2::spanQuickCheckYes
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Returns the end of the normalized substring of the input string.
icu::FilteredNormalizer2::getCombiningClass
virtual uint8_t getCombiningClass(UChar32 c) const U_OVERRIDE
Gets the combining class of c.
icu::FilteredNormalizer2::isInert
virtual UBool isInert(UChar32 c) const U_OVERRIDE
Tests if the character is normalization-inert.
icu::Normalizer2::isInert
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
icu::Normalizer2::normalizeSecondAndAppend
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
icu::Normalizer2::hasBoundaryBefore
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
icu::Normalizer2::getCombiningClass
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
icu::FilteredNormalizer2::isNormalizedUTF8
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the UTF-8 string is normalized.
icu::Normalizer2::isNormalized
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
icu::Normalizer2::~Normalizer2
~Normalizer2()
Destructor.
icu::Normalizer2::composePair
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
icu::Normalizer2::getNFKCCasefoldInstance
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
uniset.h
C++ API: Unicode Set.
icu::Normalizer2::normalizeUTF8
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
icu::StringPiece
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
icu::Normalizer2::getNFKDInstance
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
icu::Normalizer2::normalize
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
icu::Normalizer2::normalize
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:201
unistr.h
C++ API: Unicode String.
icu::Normalizer2::getNFDInstance
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
icu::FilteredNormalizer2
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:503