Where Online Learning is simpler!
The C and C++ Include Header Files
/usr/include/unicode/normalizer2.h
$ cat -n /usr/include/unicode/normalizer2.h 1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2_H__ 20 #define __NORMALIZER2_H__ 21 22 /** 23 * \file 24 * \brief C++ API: New API for Unicode Normalization. 25 */ 26 27 #include "unicode/utypes.h" 28 29 #if U_SHOW_CPLUSPLUS_API 30 31 #if !UCONFIG_NO_NORMALIZATION 32 33 #include "unicode/stringpiece.h" 34 #include "unicode/uniset.h" 35 #include "unicode/unistr.h" 36 #include "unicode/unorm2.h" 37 38 U_NAMESPACE_BEGIN 39 40 class ByteSink; 41 42 /** 43 * Unicode normalization functionality for standard Unicode normalization or 44 * for using custom mapping tables. 45 * All instances of this class are unmodifiable/immutable. 46 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 47 * The Normalizer2 class is not intended for public subclassing. 48 * 49 * The primary functions are to produce a normalized string and to detect whether 50 * a string is already normalized. 51 * The most commonly used normalization forms are those defined in 52 * http://www.unicode.org/unicode/reports/tr15/ 53 * However, this API supports additional normalization forms for specialized purposes. 54 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 55 * and can be used in implementations of UTS #46. 
56 * 57 * Not only are the standard compose and decompose modes supplied, 58 * but additional modes are provided as documented in the Mode enum. 59 * 60 * Some of the functions in this class identify normalization boundaries. 61 * At a normalization boundary, the portions of the string 62 * before it and starting from it do not interact and can be handled independently. 63 * 64 * The spanQuickCheckYes() stops at a normalization boundary. 65 * When the goal is a normalized string, then the text before the boundary 66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 67 * 68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 69 * a character is guaranteed to be at a normalization boundary, 70 * regardless of context. 71 * This is used for moving from one normalization boundary to the next 72 * or preceding boundary, and for performing iterative normalization. 73 * 74 * Iterative normalization is useful when only a small portion of a 75 * longer string needs to be processed. 76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 78 * (to process only the substring for which sort key bytes are computed). 79 * 80 * The set of normalization boundaries returned by these functions may not be 81 * complete: There may be more boundaries that could be returned. 82 * Different functions may return different boundaries. 83 * @stable ICU 4.4 84 */ 85 class U_COMMON_API Normalizer2 : public UObject { 86 public: 87 /** 88 * Destructor. 89 * @stable ICU 4.4 90 */ 91 ~Normalizer2(); 92 93 /** 94 * Returns a Normalizer2 instance for Unicode NFC normalization. 95 * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode). 96 * Returns an unmodifiable singleton instance. Do not delete it. 97 * @param errorCode Standard ICU error code. 
Its input value must 98 * pass the U_SUCCESS() test, or else the function returns 99 * immediately. Check for U_FAILURE() on output or use with 100 * function chaining. (See User Guide for details.) 101 * @return the requested Normalizer2, if successful 102 * @stable ICU 49 103 */ 104 static const Normalizer2 * 105 getNFCInstance(UErrorCode &errorCode); 106 107 /** 108 * Returns a Normalizer2 instance for Unicode NFD normalization. 109 * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode). 110 * Returns an unmodifiable singleton instance. Do not delete it. 111 * @param errorCode Standard ICU error code. Its input value must 112 * pass the U_SUCCESS() test, or else the function returns 113 * immediately. Check for U_FAILURE() on output or use with 114 * function chaining. (See User Guide for details.) 115 * @return the requested Normalizer2, if successful 116 * @stable ICU 49 117 */ 118 static const Normalizer2 * 119 getNFDInstance(UErrorCode &errorCode); 120 121 /** 122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 123 * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode). 124 * Returns an unmodifiable singleton instance. Do not delete it. 125 * @param errorCode Standard ICU error code. Its input value must 126 * pass the U_SUCCESS() test, or else the function returns 127 * immediately. Check for U_FAILURE() on output or use with 128 * function chaining. (See User Guide for details.) 129 * @return the requested Normalizer2, if successful 130 * @stable ICU 49 131 */ 132 static const Normalizer2 * 133 getNFKCInstance(UErrorCode &errorCode); 134 135 /** 136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 137 * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode). 138 * Returns an unmodifiable singleton instance. Do not delete it. 139 * @param errorCode Standard ICU error code. Its input value must 140 * pass the U_SUCCESS() test, or else the function returns 141 * immediately. 
Check for U_FAILURE() on output or use with 142 * function chaining. (See User Guide for details.) 143 * @return the requested Normalizer2, if successful 144 * @stable ICU 49 145 */ 146 static const Normalizer2 * 147 getNFKDInstance(UErrorCode &errorCode); 148 149 /** 150 * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization 151 * which is equivalent to applying the NFKC_Casefold mappings and then NFC. 152 * See https://www.unicode.org/reports/tr44/#NFKC_Casefold 153 * 154 * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode). 155 * Returns an unmodifiable singleton instance. Do not delete it. 156 * @param errorCode Standard ICU error code. Its input value must 157 * pass the U_SUCCESS() test, or else the function returns 158 * immediately. Check for U_FAILURE() on output or use with 159 * function chaining. (See User Guide for details.) 160 * @return the requested Normalizer2, if successful 161 * @stable ICU 49 162 */ 163 static const Normalizer2 * 164 getNFKCCasefoldInstance(UErrorCode &errorCode); 165 166 #ifndef U_HIDE_DRAFT_API 167 /** 168 * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization 169 * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC. 170 * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold 171 * 172 * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode). 173 * Returns an unmodifiable singleton instance. Do not delete it. 174 * @param errorCode Standard ICU error code. Its input value must 175 * pass the U_SUCCESS() test, or else the function returns 176 * immediately. Check for U_FAILURE() on output or use with 177 * function chaining. (See User Guide for details.) 
178 * @return the requested Normalizer2, if successful 179 * @draft ICU 74 180 */ 181 static const Normalizer2 * 182 getNFKCSimpleCasefoldInstance(UErrorCode &errorCode); 183 #endif // U_HIDE_DRAFT_API 184 185 /** 186 * Returns a Normalizer2 instance which uses the specified data file 187 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 188 * and which composes or decomposes text according to the specified mode. 189 * Returns an unmodifiable singleton instance. Do not delete it. 190 * 191 * Use packageName=nullptr for data files that are part of ICU's own data. 192 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 193 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 194 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 195 * 196 * @param packageName nullptr for ICU built-in data, otherwise application data package name 197 * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file 198 * @param mode normalization mode (compose or decompose etc.) 199 * @param errorCode Standard ICU error code. Its input value must 200 * pass the U_SUCCESS() test, or else the function returns 201 * immediately. Check for U_FAILURE() on output or use with 202 * function chaining. (See User Guide for details.) 203 * @return the requested Normalizer2, if successful 204 * @stable ICU 4.4 205 */ 206 static const Normalizer2 * 207 getInstance(const char *packageName, 208 const char *name, 209 UNormalization2Mode mode, 210 UErrorCode &errorCode); 211 212 /** 213 * Returns the normalized form of the source string. 214 * @param src source string 215 * @param errorCode Standard ICU error code. Its input value must 216 * pass the U_SUCCESS() test, or else the function returns 217 * immediately. Check for U_FAILURE() on output or use with 218 * function chaining. (See User Guide for details.) 
219 * @return normalized src 220 * @stable ICU 4.4 221 */ 222 UnicodeString 223 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 224 UnicodeString result; 225 normalize(src, result, errorCode); 226 return result; 227 } 228 /** 229 * Writes the normalized form of the source string to the destination string 230 * (replacing its contents) and returns the destination string. 231 * The source and destination strings must be different objects. 232 * @param src source string 233 * @param dest destination string; its contents is replaced with normalized src 234 * @param errorCode Standard ICU error code. Its input value must 235 * pass the U_SUCCESS() test, or else the function returns 236 * immediately. Check for U_FAILURE() on output or use with 237 * function chaining. (See User Guide for details.) 238 * @return dest 239 * @stable ICU 4.4 240 */ 241 virtual UnicodeString & 242 normalize(const UnicodeString &src, 243 UnicodeString &dest, 244 UErrorCode &errorCode) const = 0; 245 246 /** 247 * Normalizes a UTF-8 string and optionally records how source substrings 248 * relate to changed and unchanged result substrings. 249 * 250 * Implemented completely for all built-in modes except for FCD. 251 * The base class implementation converts to & from UTF-16 and does not support edits. 252 * 253 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 254 * @param src Source UTF-8 string. 255 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 256 * sink.Flush() is called at the end. 257 * @param edits Records edits for index mapping, working with styled text, 258 * and getting only changes (if any). 259 * The Edits contents is undefined if any error occurs. 260 * This function calls edits->reset() first unless 261 * options includes U_EDITS_NO_RESET. edits can be nullptr. 262 * @param errorCode Standard ICU error code. 
Its input value must 263 * pass the U_SUCCESS() test, or else the function returns 264 * immediately. Check for U_FAILURE() on output or use with 265 * function chaining. (See User Guide for details.) 266 * @stable ICU 60 267 */ 268 virtual void 269 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 270 Edits *edits, UErrorCode &errorCode) const; 271 272 /** 273 * Appends the normalized form of the second string to the first string 274 * (merging them at the boundary) and returns the first string. 275 * The result is normalized if the first string was normalized. 276 * The first and second strings must be different objects. 277 * @param first string, should be normalized 278 * @param second string, will be normalized 279 * @param errorCode Standard ICU error code. Its input value must 280 * pass the U_SUCCESS() test, or else the function returns 281 * immediately. Check for U_FAILURE() on output or use with 282 * function chaining. (See User Guide for details.) 283 * @return first 284 * @stable ICU 4.4 285 */ 286 virtual UnicodeString & 287 normalizeSecondAndAppend(UnicodeString &first, 288 const UnicodeString &second, 289 UErrorCode &errorCode) const = 0; 290 /** 291 * Appends the second string to the first string 292 * (merging them at the boundary) and returns the first string. 293 * The result is normalized if both the strings were normalized. 294 * The first and second strings must be different objects. 295 * @param first string, should be normalized 296 * @param second string, should be normalized 297 * @param errorCode Standard ICU error code. Its input value must 298 * pass the U_SUCCESS() test, or else the function returns 299 * immediately. Check for U_FAILURE() on output or use with 300 * function chaining. (See User Guide for details.) 
301 * @return first 302 * @stable ICU 4.4 303 */ 304 virtual UnicodeString & 305 append(UnicodeString &first, 306 const UnicodeString &second, 307 UErrorCode &errorCode) const = 0; 308 309 /** 310 * Gets the decomposition mapping of c. 311 * Roughly equivalent to normalizing the String form of c 312 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 313 * returns false and does not write a string 314 * if c does not have a decomposition mapping in this instance's data. 315 * This function is independent of the mode of the Normalizer2. 316 * @param c code point 317 * @param decomposition String object which will be set to c's 318 * decomposition mapping, if there is one. 319 * @return true if c has a decomposition, otherwise false 320 * @stable ICU 4.6 321 */ 322 virtual UBool 323 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 324 325 /** 326 * Gets the raw decomposition mapping of c. 327 * 328 * This is similar to the getDecomposition() method but returns the 329 * raw decomposition mapping as specified in UnicodeData.txt or 330 * (for custom data) in the mapping files processed by the gennorm2 tool. 331 * By contrast, getDecomposition() returns the processed, 332 * recursively-decomposed version of this mapping. 333 * 334 * When used on a standard NFKC Normalizer2 instance, 335 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 336 * 337 * When used on a standard NFC Normalizer2 instance, 338 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 339 * in this case, the result contains either one or two code points (=1..4 char16_ts). 340 * 341 * This function is independent of the mode of the Normalizer2. 342 * The default implementation returns false. 343 * @param c code point 344 * @param decomposition String object which will be set to c's 345 * raw decomposition mapping, if there is one. 
346 * @return true if c has a decomposition, otherwise false 347 * @stable ICU 49 348 */ 349 virtual UBool 350 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 351 352 /** 353 * Performs pairwise composition of a & b and returns the composite if there is one. 354 * 355 * Returns a composite code point c only if c has a two-way mapping to a+b. 356 * In standard Unicode normalization, this means that 357 * c has a canonical decomposition to a+b 358 * and c does not have the Full_Composition_Exclusion property. 359 * 360 * This function is independent of the mode of the Normalizer2. 361 * The default implementation returns a negative value. 362 * @param a A (normalization starter) code point. 363 * @param b Another code point. 364 * @return The non-negative composite code point if there is one; otherwise a negative value. 365 * @stable ICU 49 366 */ 367 virtual UChar32 368 composePair(UChar32 a, UChar32 b) const; 369 370 /** 371 * Gets the combining class of c. 372 * The default implementation returns 0 373 * but all standard implementations return the Unicode Canonical_Combining_Class value. 374 * @param c code point 375 * @return c's combining class 376 * @stable ICU 49 377 */ 378 virtual uint8_t 379 getCombiningClass(UChar32 c) const; 380 381 /** 382 * Tests if the string is normalized. 383 * Internally, in cases where the quickCheck() method would return "maybe" 384 * (which is only possible for the two COMPOSE modes) this method 385 * resolves to "yes" or "no" to provide a definitive result, 386 * at the cost of doing more work in those cases. 387 * @param s input string 388 * @param errorCode Standard ICU error code. Its input value must 389 * pass the U_SUCCESS() test, or else the function returns 390 * immediately. Check for U_FAILURE() on output or use with 391 * function chaining. (See User Guide for details.) 
392 * @return true if s is normalized 393 * @stable ICU 4.4 394 */ 395 virtual UBool 396 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 397 /** 398 * Tests if the UTF-8 string is normalized. 399 * Internally, in cases where the quickCheck() method would return "maybe" 400 * (which is only possible for the two COMPOSE modes) this method 401 * resolves to "yes" or "no" to provide a definitive result, 402 * at the cost of doing more work in those cases. 403 * 404 * This works for all normalization modes. 405 * It is optimized for UTF-8 for all built-in modes except for FCD. 406 * The base class implementation converts to UTF-16 and calls isNormalized(). 407 * 408 * @param s UTF-8 input string 409 * @param errorCode Standard ICU error code. Its input value must 410 * pass the U_SUCCESS() test, or else the function returns 411 * immediately. Check for U_FAILURE() on output or use with 412 * function chaining. (See User Guide for details.) 413 * @return true if s is normalized 414 * @stable ICU 60 415 */ 416 virtual UBool 417 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 418 419 420 /** 421 * Tests if the string is normalized. 422 * For the two COMPOSE modes, the result could be "maybe" in cases that 423 * would take a little more work to resolve definitively. 424 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 425 * combination of quick check + normalization, to avoid 426 * re-checking the "yes" prefix. 427 * @param s input string 428 * @param errorCode Standard ICU error code. Its input value must 429 * pass the U_SUCCESS() test, or else the function returns 430 * immediately. Check for U_FAILURE() on output or use with 431 * function chaining. (See User Guide for details.) 
432 * @return UNormalizationCheckResult 433 * @stable ICU 4.4 434 */ 435 virtual UNormalizationCheckResult 436 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 437 438 /** 439 * Returns the end of the normalized substring of the input string. 440 * In other words, with
<code>end=spanQuickCheckYes(s, ec);</code>
441 * the substring
<code>UnicodeString(s, 0, end)</code>
442 * will pass the quick check with a "yes" result. 443 * 444 * The returned end index is usually one or more characters before the 445 * "no" or "maybe" character: The end index is at a normalization boundary. 446 * (See the class documentation for more about normalization boundaries.) 447 * 448 * When the goal is a normalized string and most input strings are expected 449 * to be normalized already, then call this method, 450 * and if it returns a prefix shorter than the input string, 451 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 452 * @param s input string 453 * @param errorCode Standard ICU error code. Its input value must 454 * pass the U_SUCCESS() test, or else the function returns 455 * immediately. Check for U_FAILURE() on output or use with 456 * function chaining. (See User Guide for details.) 457 * @return "yes" span end index 458 * @stable ICU 4.4 459 */ 460 virtual int32_t 461 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 462 463 /** 464 * Tests if the character always has a normalization boundary before it, 465 * regardless of context. 466 * If true, then the character does not normalization-interact with 467 * preceding characters. 468 * In other words, a string containing this character can be normalized 469 * by processing portions before this character and starting from this 470 * character independently. 471 * This is used for iterative normalization. See the class documentation for details. 472 * @param c character to test 473 * @return true if c has a normalization boundary before it 474 * @stable ICU 4.4 475 */ 476 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 477 478 /** 479 * Tests if the character always has a normalization boundary after it, 480 * regardless of context. 481 * If true, then the character does not normalization-interact with 482 * following characters. 
483 * In other words, a string containing this character can be normalized 484 * by processing portions up to this character and after this 485 * character independently. 486 * This is used for iterative normalization. See the class documentation for details. 487 * Note that this operation may be significantly slower than hasBoundaryBefore(). 488 * @param c character to test 489 * @return true if c has a normalization boundary after it 490 * @stable ICU 4.4 491 */ 492 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 493 494 /** 495 * Tests if the character is normalization-inert. 496 * If true, then the character does not change, nor normalization-interact with 497 * preceding or following characters. 498 * In other words, a string containing this character can be normalized 499 * by processing portions before this character and after this 500 * character independently. 501 * This is used for iterative normalization. See the class documentation for details. 502 * Note that this operation may be significantly slower than hasBoundaryBefore(). 503 * @param c character to test 504 * @return true if c is normalization-inert 505 * @stable ICU 4.4 506 */ 507 virtual UBool isInert(UChar32 c) const = 0; 508 }; 509 510 /** 511 * Normalization filtered by a UnicodeSet. 512 * Normalizes portions of the text contained in the filter set and leaves 513 * portions not contained in the filter set unchanged. 514 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 515 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 516 * This class implements all of (and only) the Normalizer2 API. 517 * An instance of this class is unmodifiable/immutable but is constructed and 518 * must be destructed by the owner. 519 * @stable ICU 4.4 520 */ 521 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 522 public: 523 /** 524 * Constructs a filtered normalizer wrapping any Normalizer2 instance 525 * and a filter set. 
526 * Both are aliased and must not be modified or deleted while this object 527 * is used. 528 * The filter set should be frozen; otherwise the performance will suffer greatly. 529 * @param n2 wrapped Normalizer2 instance 530 * @param filterSet UnicodeSet which determines the characters to be normalized 531 * @stable ICU 4.4 532 */ 533 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 534 norm2(n2), set(filterSet) {} 535 536 /** 537 * Destructor. 538 * @stable ICU 4.4 539 */ 540 ~FilteredNormalizer2(); 541 542 /** 543 * Writes the normalized form of the source string to the destination string 544 * (replacing its contents) and returns the destination string. 545 * The source and destination strings must be different objects. 546 * @param src source string 547 * @param dest destination string; its contents is replaced with normalized src 548 * @param errorCode Standard ICU error code. Its input value must 549 * pass the U_SUCCESS() test, or else the function returns 550 * immediately. Check for U_FAILURE() on output or use with 551 * function chaining. (See User Guide for details.) 552 * @return dest 553 * @stable ICU 4.4 554 */ 555 virtual UnicodeString & 556 normalize(const UnicodeString &src, 557 UnicodeString &dest, 558 UErrorCode &errorCode) const override; 559 560 /** 561 * Normalizes a UTF-8 string and optionally records how source substrings 562 * relate to changed and unchanged result substrings. 563 * 564 * Implemented completely for most built-in modes except for FCD. 565 * The base class implementation converts to & from UTF-16 and does not support edits. 566 * 567 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 568 * @param src Source UTF-8 string. 569 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 570 * sink.Flush() is called at the end. 
571 * @param edits Records edits for index mapping, working with styled text, 572 * and getting only changes (if any). 573 * The Edits contents is undefined if any error occurs. 574 * This function calls edits->reset() first unless 575 * options includes U_EDITS_NO_RESET. edits can be nullptr. 576 * @param errorCode Standard ICU error code. Its input value must 577 * pass the U_SUCCESS() test, or else the function returns 578 * immediately. Check for U_FAILURE() on output or use with 579 * function chaining. (See User Guide for details.) 580 * @stable ICU 60 581 */ 582 virtual void 583 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 584 Edits *edits, UErrorCode &errorCode) const override; 585 586 /** 587 * Appends the normalized form of the second string to the first string 588 * (merging them at the boundary) and returns the first string. 589 * The result is normalized if the first string was normalized. 590 * The first and second strings must be different objects. 591 * @param first string, should be normalized 592 * @param second string, will be normalized 593 * @param errorCode Standard ICU error code. Its input value must 594 * pass the U_SUCCESS() test, or else the function returns 595 * immediately. Check for U_FAILURE() on output or use with 596 * function chaining. (See User Guide for details.) 597 * @return first 598 * @stable ICU 4.4 599 */ 600 virtual UnicodeString & 601 normalizeSecondAndAppend(UnicodeString &first, 602 const UnicodeString &second, 603 UErrorCode &errorCode) const override; 604 /** 605 * Appends the second string to the first string 606 * (merging them at the boundary) and returns the first string. 607 * The result is normalized if both the strings were normalized. 608 * The first and second strings must be different objects. 609 * @param first string, should be normalized 610 * @param second string, should be normalized 611 * @param errorCode Standard ICU error code. 
Its input value must 612 * pass the U_SUCCESS() test, or else the function returns 613 * immediately. Check for U_FAILURE() on output or use with 614 * function chaining. (See User Guide for details.) 615 * @return first 616 * @stable ICU 4.4 617 */ 618 virtual UnicodeString & 619 append(UnicodeString &first, 620 const UnicodeString &second, 621 UErrorCode &errorCode) const override; 622 623 /** 624 * Gets the decomposition mapping of c. 625 * For details see the base class documentation. 626 * 627 * This function is independent of the mode of the Normalizer2. 628 * @param c code point 629 * @param decomposition String object which will be set to c's 630 * decomposition mapping, if there is one. 631 * @return true if c has a decomposition, otherwise false 632 * @stable ICU 4.6 633 */ 634 virtual UBool 635 getDecomposition(UChar32 c, UnicodeString &decomposition) const override; 636 637 /** 638 * Gets the raw decomposition mapping of c. 639 * For details see the base class documentation. 640 * 641 * This function is independent of the mode of the Normalizer2. 642 * @param c code point 643 * @param decomposition String object which will be set to c's 644 * raw decomposition mapping, if there is one. 645 * @return true if c has a decomposition, otherwise false 646 * @stable ICU 49 647 */ 648 virtual UBool 649 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override; 650 651 /** 652 * Performs pairwise composition of a & b and returns the composite if there is one. 653 * For details see the base class documentation. 654 * 655 * This function is independent of the mode of the Normalizer2. 656 * @param a A (normalization starter) code point. 657 * @param b Another code point. 658 * @return The non-negative composite code point if there is one; otherwise a negative value. 659 * @stable ICU 49 660 */ 661 virtual UChar32 662 composePair(UChar32 a, UChar32 b) const override; 663 664 /** 665 * Gets the combining class of c. 
666 * The default implementation returns 0 667 * but all standard implementations return the Unicode Canonical_Combining_Class value. 668 * @param c code point 669 * @return c's combining class 670 * @stable ICU 49 671 */ 672 virtual uint8_t 673 getCombiningClass(UChar32 c) const override; 674 675 /** 676 * Tests if the string is normalized. 677 * For details see the Normalizer2 base class documentation. 678 * @param s input string 679 * @param errorCode Standard ICU error code. Its input value must 680 * pass the U_SUCCESS() test, or else the function returns 681 * immediately. Check for U_FAILURE() on output or use with 682 * function chaining. (See User Guide for details.) 683 * @return true if s is normalized 684 * @stable ICU 4.4 685 */ 686 virtual UBool 687 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override; 688 /** 689 * Tests if the UTF-8 string is normalized. 690 * Internally, in cases where the quickCheck() method would return "maybe" 691 * (which is only possible for the two COMPOSE modes) this method 692 * resolves to "yes" or "no" to provide a definitive result, 693 * at the cost of doing more work in those cases. 694 * 695 * This works for all normalization modes. 696 * It is optimized for UTF-8 for all built-in modes except for FCD. 697 * The base class implementation converts to UTF-16 and calls isNormalized(). 698 * 699 * @param s UTF-8 input string 700 * @param errorCode Standard ICU error code. Its input value must 701 * pass the U_SUCCESS() test, or else the function returns 702 * immediately. Check for U_FAILURE() on output or use with 703 * function chaining. (See User Guide for details.) 704 * @return true if s is normalized 705 * @stable ICU 60 706 */ 707 virtual UBool 708 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override; 709 /** 710 * Tests if the string is normalized. 711 * For details see the Normalizer2 base class documentation. 
712 * @param s input string 713 * @param errorCode Standard ICU error code. Its input value must 714 * pass the U_SUCCESS() test, or else the function returns 715 * immediately. Check for U_FAILURE() on output or use with 716 * function chaining. (See User Guide for details.) 717 * @return UNormalizationCheckResult 718 * @stable ICU 4.4 719 */ 720 virtual UNormalizationCheckResult 721 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override; 722 /** 723 * Returns the end of the normalized substring of the input string. 724 * For details see the Normalizer2 base class documentation. 725 * @param s input string 726 * @param errorCode Standard ICU error code. Its input value must 727 * pass the U_SUCCESS() test, or else the function returns 728 * immediately. Check for U_FAILURE() on output or use with 729 * function chaining. (See User Guide for details.) 730 * @return "yes" span end index 731 * @stable ICU 4.4 732 */ 733 virtual int32_t 734 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override; 735 736 /** 737 * Tests if the character always has a normalization boundary before it, 738 * regardless of context. 739 * For details see the Normalizer2 base class documentation. 740 * @param c character to test 741 * @return true if c has a normalization boundary before it 742 * @stable ICU 4.4 743 */ 744 virtual UBool hasBoundaryBefore(UChar32 c) const override; 745 746 /** 747 * Tests if the character always has a normalization boundary after it, 748 * regardless of context. 749 * For details see the Normalizer2 base class documentation. 750 * @param c character to test 751 * @return true if c has a normalization boundary after it 752 * @stable ICU 4.4 753 */ 754 virtual UBool hasBoundaryAfter(UChar32 c) const override; 755 756 /** 757 * Tests if the character is normalization-inert. 758 * For details see the Normalizer2 base class documentation. 
759 * @param c character to test 760 * @return true if c is normalization-inert 761 * @stable ICU 4.4 762 */ 763 virtual UBool isInert(UChar32 c) const override; 764 private: 765 UnicodeString & 766 normalize(const UnicodeString &src, 767 UnicodeString &dest, 768 USetSpanCondition spanCondition, 769 UErrorCode &errorCode) const; 770 771 void 772 normalizeUTF8(uint32_t options, const char *src, int32_t length, 773 ByteSink &sink, Edits *edits, 774 USetSpanCondition spanCondition, 775 UErrorCode &errorCode) const; 776 777 UnicodeString & 778 normalizeSecondAndAppend(UnicodeString &first, 779 const UnicodeString &second, 780 UBool doNormalize, 781 UErrorCode &errorCode) const; 782 783 const Normalizer2 &norm2; 784 const UnicodeSet &set; 785 }; 786 787 U_NAMESPACE_END 788 789 #endif // !UCONFIG_NO_NORMALIZATION 790 791 #endif /* U_SHOW_CPLUSPLUS_API */ 792 793 #endif // __NORMALIZER2_H__
Contact us
|
About us
|
Terms of use
|
Copyright © 2000-2025 MyWebUniversity.com ™