Where Online Learning is simpler!
The C and C++ Include Header Files
/usr/include/unicode/uniset.h
$ cat -n /usr/include/unicode/uniset.h 1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * Date Name Description 9 * 10/20/99 alan Creation. 10 *************************************************************************** 11 */ 12 13 #ifndef UNICODESET_H 14 #define UNICODESET_H 15 16 #include "unicode/utypes.h" 17 18 #if U_SHOW_CPLUSPLUS_API 19 20 #include "unicode/ucpmap.h" 21 #include "unicode/unifilt.h" 22 #include "unicode/unistr.h" 23 #include "unicode/uset.h" 24 25 /** 26 * \file 27 * \brief C++ API: Unicode Set 28 */ 29 30 U_NAMESPACE_BEGIN 31 32 // Forward Declarations. 33 class BMPSet; 34 class ParsePosition; 35 class RBBIRuleScanner; 36 class SymbolTable; 37 class UnicodeSetStringSpan; 38 class UVector; 39 class RuleCharacterIterator; 40 41 /** 42 * A mutable set of Unicode characters and multicharacter strings. Objects of this class 43 * represent <em>character classes</em> used in regular expressions. 44 * A character specifies a subset of Unicode code points. Legal 45 * code points are U+0000 to U+10FFFF, inclusive. 46 * 47 * <p>The UnicodeSet class is not designed to be subclassed. 48 * 49 * <p><code>UnicodeSet</code> supports two APIs. The first is the 50 * <em>operand</em> API that allows the caller to modify the value of 51 * a <code>UnicodeSet</code> object. It conforms to Java 2's 52 * <code>java.util.Set</code> interface, although 53 * <code>UnicodeSet</code> does not actually implement that 54 * interface. 
All methods of <code>Set</code> are supported, with the 55 * modification that they take a character range or single character 56 * instead of an <code>Object</code>, and they take a 57 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 58 * operand API may be thought of in terms of boolean logic: a boolean 59 * OR is implemented by <code>add</code>, a boolean AND is implemented 60 * by <code>retain</code>, a boolean XOR is implemented by 61 * <code>complement</code> taking an argument, and a boolean NOT is 62 * implemented by <code>complement</code> with no argument. In terms 63 * of traditional set theory function names, <code>add</code> is a 64 * union, <code>retain</code> is an intersection, <code>remove</code> 65 * is an asymmetric difference, and <code>complement</code> with no 66 * argument is a set complement with respect to the superset range 67 * <code>MIN_VALUE-MAX_VALUE</code> 68 * 69 * <p>The second API is the 70 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 71 * <code>java.text.Format</code>-derived classes. Unlike the 72 * methods that add characters, add categories, and control the logic 73 * of the set, the method <code>applyPattern()</code> sets all 74 * attributes of a <code>UnicodeSet</code> at once, based on a 75 * string pattern. 76 * 77 * <p><b>Pattern syntax</b></p> 78 * 79 * Patterns are accepted by the constructors and the 80 * <code>applyPattern()</code> methods and returned by the 81 * <code>toPattern()</code> method. These patterns follow a syntax 82 * similar to that employed by version 8 regular expression character 83 * classes. 
Here are some simple examples: 84 * 85 * \htmlonly<blockquote>\endhtmlonly 86 * <table> 87 * <tr align="top"> 88 * <td nowrap valign="top" align="left"><code>[]</code></td> 89 * <td valign="top">No characters</td> 90 * </tr><tr align="top"> 91 * <td nowrap valign="top" align="left"><code>[a]</code></td> 92 * <td valign="top">The character 'a'</td> 93 * </tr><tr align="top"> 94 * <td nowrap valign="top" align="left"><code>[ae]</code></td> 95 * <td valign="top">The characters 'a' and 'e'</td> 96 * </tr> 97 * <tr> 98 * <td nowrap valign="top" align="left"><code>[a-e]</code></td> 99 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 100 * point order</td> 101 * </tr> 102 * <tr> 103 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td> 104 * <td valign="top">The character U+4E01</td> 105 * </tr> 106 * <tr> 107 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td> 108 * <td valign="top">The character 'a' and the multicharacter strings "ab" and 109 * "ac"</td> 110 * </tr> 111 * <tr> 112 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td> 113 * <td valign="top">All characters in the general category Uppercase Letter</td> 114 * </tr> 115 * </table> 116 * \htmlonly</blockquote>\endhtmlonly 117 * 118 * Any character may be preceded by a backslash in order to remove any special 119 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are 120 * ignored, unless they are escaped. 121 * 122 * <p>Property patterns specify a set of characters having a certain 123 * property as defined by the Unicode standard. Both the POSIX-like 124 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a 125 * complete list of supported property patterns, see the User's Guide 126 * for UnicodeSet at 127 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset"> 128 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>. 
129 * Actual determination of property data is defined by the underlying 130 * Unicode database as implemented by UCharacter. 131 * 132 * <p>Patterns specify individual characters, ranges of characters, and 133 * Unicode property sets. When elements are concatenated, they 134 * specify their union. To complement a set, place a '^' immediately 135 * after the opening '['. Property patterns are inverted by modifying 136 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location, 137 * '^' has no special meaning. 138 * 139 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]" 140 * perform a “code point complement” (all code points minus the original set), 141 * removing all multicharacter strings, 142 * equivalent to <code>.complement().removeAllStrings()</code>. 143 * The complement() API function continues to perform a 144 * symmetric difference with all code points and thus retains all multicharacter strings. 145 * 146 * <p>Ranges are indicated by placing a '-' between two 147 * characters, as in "a-z". This specifies the range of all 148 * characters from the left to the right, in Unicode order. If the 149 * left character is greater than or equal to the 150 * right character it is a syntax error. If a '-' occurs as the first 151 * character after the opening '[' or '[^', or if it occurs as the 152 * last character before the closing ']', then it is taken as a 153 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same 154 * set of three characters, 'a', 'b', and '-'. 155 * 156 * <p>Sets may be intersected using the '&' operator or the asymmetric 157 * set difference may be taken using the '-' operator, for example, 158 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 159 * with values less than 4096. Operators ('&' and '-') have equal 160 * precedence and bind left-to-right. Thus 161 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 162 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".
This only really matters for 163 * difference; intersection is commutative. 164 * 165 * <table> 166 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a' 167 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a' 168 * through 'z' and all letters in between, in Unicode order 169 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing 170 * all characters but 'a' through 'z', 171 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 172 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 173 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 174 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 175 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 176 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 177 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 178 * <em>pat2</em> 179 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code> 180 * <td>The set of characters having the specified 181 * Unicode property; in 182 * this case, Unicode uppercase letters 183 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code> 184 * <td>The set of characters <em>not</em> having the given 185 * Unicode property 186 * </table> 187 * 188 * <p><b>Formal syntax</b></p> 189 * 190 * \htmlonly<blockquote>\endhtmlonly 191 * <table> 192 * <tr align="top"> 193 * <td nowrap valign="top" align="right"><code>pattern := </code></td> 194 * <td valign="top"><code>('[' '^'? 
item* ']') | 195 * property</code></td> 196 * </tr> 197 * <tr align="top"> 198 * <td nowrap valign="top" align="right"><code>item := </code></td> 199 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br> 200 * </code></td> 201 * </tr> 202 * <tr align="top"> 203 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td> 204 * <td valign="top"><code>pattern | pattern-expr pattern | 205 * pattern-expr op pattern<br> 206 * </code></td> 207 * </tr> 208 * <tr align="top"> 209 * <td nowrap valign="top" align="right"><code>op := </code></td> 210 * <td valign="top"><code>'&' | '-'<br> 211 * </code></td> 212 * </tr> 213 * <tr align="top"> 214 * <td nowrap valign="top" align="right"><code>special := </code></td> 215 * <td valign="top"><code>'[' | ']' | '-'<br> 216 * </code></td> 217 * </tr> 218 * <tr align="top"> 219 * <td nowrap valign="top" align="right"><code>char := </code></td> 220 * <td valign="top"><em>any character that is not</em><code> special<br> 221 * | ('\' </code><em>any character</em><code>)<br> 222 * | ('\\u' hex hex hex hex)<br> 223 * </code></td> 224 * </tr> 225 * <tr align="top"> 226 * <td nowrap valign="top" align="right"><code>hex := </code></td> 227 * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> 228 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> 229 * </tr> 230 * <tr> 231 * <td nowrap valign="top" align="right"><code>property := </code></td> 232 * <td valign="top"><em>a Unicode property set pattern</em></td> 233 * </tr> 234 * </table> 235 * <br> 236 * <table border="1"> 237 * <tr> 238 * <td>Legend: <table> 239 * <tr> 240 * <td nowrap valign="top"><code>a := b</code></td> 241 * <td width="20" valign="top"> </td> 242 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td> 243 * </tr> 244 * <tr> 245 * <td nowrap valign="top"><code>a?</code></td> 246 * <td valign="top"></td> 247 * <td valign="top">zero or one instance of 
<code>a</code><br> 248 * </td> 249 * </tr> 250 * <tr> 251 * <td nowrap valign="top"><code>a*</code></td> 252 * <td valign="top"></td> 253 * <td valign="top">zero or more instances of <code>a</code><br> 254 * </td> 255 * </tr> 256 * <tr> 257 * <td nowrap valign="top"><code>a | b</code></td> 258 * <td valign="top"></td> 259 * <td valign="top">either <code>a</code> or <code>b</code><br> 260 * </td> 261 * </tr> 262 * <tr> 263 * <td nowrap valign="top"><code>'a'</code></td> 264 * <td valign="top"></td> 265 * <td valign="top">the literal string between the quotes </td> 266 * </tr> 267 * </table> 268 * </td> 269 * </tr> 270 * </table> 271 * \htmlonly</blockquote>\endhtmlonly 272 * 273 * <p>Note: 274 * - Most UnicodeSet methods do not take a UErrorCode parameter because 275 * there are usually very few opportunities for failure other than a shortage 276 * of memory, error codes in low-level C++ string methods would be inconvenient, 277 * and the error code as the last parameter (ICU convention) would prevent 278 * the use of default parameter values. 279 * Instead, such methods set the UnicodeSet into a "bogus" state 280 * (see isBogus()) if an error occurs. 281 * 282 * @author Alan Liu 283 * @stable ICU 2.0 284 */ 285 class U_COMMON_API UnicodeSet final : public UnicodeFilter { 286 private: 287 /** 288 * Enough for sets with few ranges. 289 * For example, White_Space has 10 ranges, list length 21. 290 */ 291 static constexpr int32_t INITIAL_CAPACITY = 25; 292 // fFlags constant 293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) 294 295 UChar32* list = stackList; // MUST be terminated with HIGH 296 int32_t capacity = INITIAL_CAPACITY; // capacity of list 297 int32_t len = 1; // length of list used; 1 <= len <= capacity 298 uint8_t fFlags = 0; // Bit flag (see constants above) 299 300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.
301 UChar32* buffer = nullptr; // internal buffer, may be nullptr 302 int32_t bufferCapacity = 0; // capacity of buffer 303 304 /** 305 * The pattern representation of this set. This may not be the 306 * most economical pattern. It is the pattern supplied to 307 * applyPattern(), with variables substituted and whitespace 308 * removed. For sets constructed without applyPattern(), or 309 * modified using the non-pattern API, this string will be empty, 310 * indicating that toPattern() must generate a pattern 311 * representation from the inversion list. 312 */ 313 char16_t *pat = nullptr; 314 int32_t patLen = 0; 315 316 UVector* strings = nullptr; // maintained in sorted order 317 UnicodeSetStringSpan *stringSpan = nullptr; 318 319 /** 320 * Initial list array. 321 * Avoids some heap allocations, and list is never nullptr. 322 * Increases the object size a bit. 323 */ 324 UChar32 stackList[INITIAL_CAPACITY]; 325 326 public: 327 /** 328 * Determine if this object contains a valid set. 329 * A bogus set has no value. It is different from an empty set. 330 * It can be used to indicate that no set value is available. 331 * 332 * @return true if the set is bogus/invalid, false otherwise 333 * @see setToBogus() 334 * @stable ICU 4.0 335 */ 336 inline UBool isBogus(void) const; 337 338 /** 339 * Make this UnicodeSet object invalid. 340 * The set will test true with isBogus(). 341 * 342 * A bogus set has no value. It is different from an empty set. 343 * It can be used to indicate that no set value is available. 344 * 345 * This utility function is used throughout the UnicodeSet 346 * implementation to indicate that a UnicodeSet operation failed, 347 * and may be used in other functions, 348 * especially but not exclusively when such functions do not 349 * take a UErrorCode for simplicity. 350 * 351 * @see isBogus() 352 * @stable ICU 4.0 353 */ 354 void setToBogus(); 355 356 public: 357 358 enum { 359 /** 360 * Minimum value that can be stored in a UnicodeSet.
361 * @stable ICU 2.4 362 */ 363 MIN_VALUE = 0, 364 365 /** 366 * Maximum value that can be stored in a UnicodeSet. 367 * @stable ICU 2.4 368 */ 369 MAX_VALUE = 0x10ffff 370 }; 371 372 //---------------------------------------------------------------- 373 // Constructors &c 374 //---------------------------------------------------------------- 375 376 public: 377 378 /** 379 * Constructs an empty set. 380 * @stable ICU 2.0 381 */ 382 UnicodeSet(); 383 384 /** 385 * Constructs a set containing the given range. If <code>end < 386 * start</code> then an empty set is created. 387 * 388 * @param start first character, inclusive, of range 389 * @param end last character, inclusive, of range 390 * @stable ICU 2.4 391 */ 392 UnicodeSet(UChar32 start, UChar32 end); 393 394 #ifndef U_HIDE_INTERNAL_API 395 /** 396 * @internal 397 */ 398 enum ESerialization { 399 kSerialized /* result of serialize() */ 400 }; 401 402 /** 403 * Constructs a set from the output of serialize(). 404 * 405 * @param buffer the 16 bit array 406 * @param bufferLen the original length returned from serialize() 407 * @param serialization the value 'kSerialized' 408 * @param status error code 409 * 410 * @internal 411 */ 412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen, 413 ESerialization serialization, UErrorCode &status); 414 #endif /* U_HIDE_INTERNAL_API */ 415 416 /** 417 * Constructs a set from the given pattern. See the class 418 * description for the syntax of the pattern language. 419 * @param pattern a string specifying what characters are in the set 420 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 421 * contains a syntax error. 422 * @stable ICU 2.0 423 */ 424 UnicodeSet(const UnicodeString& pattern, 425 UErrorCode& status); 426 427 #ifndef U_HIDE_INTERNAL_API 428 /** 429 * Constructs a set from the given pattern. See the class 430 * description for the syntax of the pattern language. 
431 * @param pattern a string specifying what characters are in the set 432 * @param options bitmask for options to apply to the pattern. 433 * Valid options are USET_IGNORE_SPACE and 434 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 435 * These case options are mutually exclusive. 436 * @param symbols a symbol table mapping variable names to values 437 * and stand-in characters to UnicodeSets; may be nullptr 438 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 439 * contains a syntax error. 440 * @internal 441 */ 442 UnicodeSet(const UnicodeString& pattern, 443 uint32_t options, 444 const SymbolTable* symbols, 445 UErrorCode& status); 446 #endif /* U_HIDE_INTERNAL_API */ 447 448 /** 449 * Constructs a set from the given pattern. See the class description 450 * for the syntax of the pattern language. 451 * @param pattern a string specifying what characters are in the set 452 * @param pos on input, the position in pattern at which to start parsing. 453 * On output, the position after the last character parsed. 454 * @param options bitmask for options to apply to the pattern. 455 * Valid options are USET_IGNORE_SPACE and 456 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 457 * These case options are mutually exclusive. 458 * @param symbols a symbol table mapping variable names to values 459 * and stand-in characters to UnicodeSets; may be nullptr 460 * @param status input-output error code 461 * @stable ICU 2.8 462 */ 463 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 464 uint32_t options, 465 const SymbolTable* symbols, 466 UErrorCode& status); 467 468 /** 469 * Constructs a set that is identical to the given UnicodeSet. 470 * @stable ICU 2.0 471 */ 472 UnicodeSet(const UnicodeSet& o); 473 474 /** 475 * Destructs the set. 
476 * @stable ICU 2.0 477 */ 478 virtual ~UnicodeSet(); 479 480 /** 481 * Assigns this object to be a copy of another. 482 * A frozen set will not be modified. 483 * @stable ICU 2.0 484 */ 485 UnicodeSet& operator=(const UnicodeSet& o); 486 487 /** 488 * Compares the specified object with this set for equality. Returns 489 * <tt>true</tt> if the two sets 490 * have the same size, and every member of the specified set is 491 * contained in this set (or equivalently, every member of this set is 492 * contained in the specified set). 493 * 494 * @param o set to be compared for equality with this set. 495 * @return <tt>true</tt> if the specified set is equal to this set. 496 * @stable ICU 2.0 497 */ 498 virtual bool operator==(const UnicodeSet& o) const; 499 500 /** 501 * Compares the specified object with this set for equality. Returns 502 * <tt>true</tt> if the specified set is not equal to this set. 503 * @stable ICU 2.0 504 */ 505 inline bool operator!=(const UnicodeSet& o) const; 506 507 /** 508 * Returns a copy of this object. All UnicodeFunctor objects have 509 * to support cloning in order to allow classes using 510 * UnicodeFunctors, such as Transliterator, to implement cloning. 511 * If this set is frozen, then the clone will be frozen as well. 512 * Use cloneAsThawed() for a mutable clone of a frozen set. 513 * @see cloneAsThawed 514 * @stable ICU 2.0 515 */ 516 virtual UnicodeSet* clone() const override; 517 518 /** 519 * Returns the hash code value for this set. 520 * 521 * @return the hash code value for this set. 522 * @see Object#hashCode() 523 * @stable ICU 2.0 524 */ 525 virtual int32_t hashCode(void) const; 526 527 /** 528 * Get a UnicodeSet pointer from a USet 529 * 530 * @param uset a USet (the ICU plain C type for UnicodeSet) 531 * @return the corresponding UnicodeSet pointer. 
532 * 533 * @stable ICU 4.2 534 */ 535 inline static UnicodeSet *fromUSet(USet *uset); 536 537 /** 538 * Get a UnicodeSet pointer from a const USet 539 * 540 * @param uset a const USet (the ICU plain C type for UnicodeSet) 541 * @return the corresponding UnicodeSet pointer. 542 * 543 * @stable ICU 4.2 544 */ 545 inline static const UnicodeSet *fromUSet(const USet *uset); 546 547 /** 548 * Produce a USet * pointer for this UnicodeSet. 549 * USet is the plain C type for UnicodeSet 550 * 551 * @return a USet pointer for this UnicodeSet 552 * @stable ICU 4.2 553 */ 554 inline USet *toUSet(); 555 556 557 /** 558 * Produce a const USet * pointer for this UnicodeSet. 559 * USet is the plain C type for UnicodeSet 560 * 561 * @return a const USet pointer for this UnicodeSet 562 * @stable ICU 4.2 563 */ 564 inline const USet * toUSet() const; 565 566 567 //---------------------------------------------------------------- 568 // Freezable API 569 //---------------------------------------------------------------- 570 571 /** 572 * Determines whether the set has been frozen (made immutable) or not. 573 * See the ICU4J Freezable interface for details. 574 * @return true/false for whether the set has been frozen 575 * @see freeze 576 * @see cloneAsThawed 577 * @stable ICU 3.8 578 */ 579 inline UBool isFrozen() const; 580 581 /** 582 * Freeze the set (make it immutable). 583 * Once frozen, it cannot be unfrozen and is therefore thread-safe 584 * until it is deleted. 585 * See the ICU4J Freezable interface for details. 586 * Freezing the set may also make some operations faster, for example 587 * contains() and span(). 588 * A frozen set will not be modified. (It remains frozen.) 589 * @return this set. 590 * @see isFrozen 591 * @see cloneAsThawed 592 * @stable ICU 3.8 593 */ 594 UnicodeSet *freeze(); 595 596 /** 597 * Clone the set and make the clone mutable. 598 * See the ICU4J Freezable interface for details. 
599 * @return the mutable clone 600 * @see freeze 601 * @see isFrozen 602 * @stable ICU 3.8 603 */ 604 UnicodeSet *cloneAsThawed() const; 605 606 //---------------------------------------------------------------- 607 // Public API 608 //---------------------------------------------------------------- 609 610 /** 611 * Make this object represent the range `start - end`. 612 * If `start > end` then this object is set to an empty range. 613 * A frozen set will not be modified. 614 * 615 * @param start first character in the set, inclusive 616 * @param end last character in the set, inclusive 617 * @stable ICU 2.4 618 */ 619 UnicodeSet& set(UChar32 start, UChar32 end); 620 621 /** 622 * Return true if the given position, in the given pattern, appears 623 * to be the start of a UnicodeSet pattern. 624 * @stable ICU 2.4 625 */ 626 static UBool resemblesPattern(const UnicodeString& pattern, 627 int32_t pos); 628 629 /** 630 * Modifies this set to represent the set specified by the given 631 * pattern, ignoring Unicode Pattern_White_Space characters. 632 * See the class description for the syntax of the pattern language. 633 * A frozen set will not be modified. 634 * @param pattern a string specifying what characters are in the set 635 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 636 * contains a syntax error. 637 * <em> Empties the set passed before applying the pattern.</em> 638 * @return a reference to this 639 * @stable ICU 2.0 640 */ 641 UnicodeSet& applyPattern(const UnicodeString& pattern, 642 UErrorCode& status); 643 644 #ifndef U_HIDE_INTERNAL_API 645 /** 646 * Modifies this set to represent the set specified by the given 647 * pattern, optionally ignoring Unicode Pattern_White_Space characters. 648 * See the class description for the syntax of the pattern language. 649 * A frozen set will not be modified. 
650 * @param pattern a string specifying what characters are in the set 651 * @param options bitmask for options to apply to the pattern. 652 * Valid options are USET_IGNORE_SPACE and 653 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 654 * These case options are mutually exclusive. 655 * @param symbols a symbol table mapping variable names to 656 * values and stand-ins to UnicodeSets; may be nullptr 657 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 658 * contains a syntax error. 659 *<em> Empties the set passed before applying the pattern.</em> 660 * @return a reference to this 661 * @internal 662 */ 663 UnicodeSet& applyPattern(const UnicodeString& pattern, 664 uint32_t options, 665 const SymbolTable* symbols, 666 UErrorCode& status); 667 #endif /* U_HIDE_INTERNAL_API */ 668 669 /** 670 * Parses the given pattern, starting at the given position. The 671 * character at pattern.charAt(pos.getIndex()) must be '[', or the 672 * parse fails. Parsing continues until the corresponding closing 673 * ']'. If a syntax error is encountered between the opening and 674 * closing bracket, the parse fails. Upon return from a successful 675 * parse, the ParsePosition is updated to point to the character 676 * following the closing ']', and a StringBuffer containing a 677 * pairs list for the parsed pattern is returned. This method calls 678 * itself recursively to parse embedded subpatterns. 679 *<em> Empties the set passed before applying the pattern.</em> 680 * A frozen set will not be modified. 681 * 682 * @param pattern the string containing the pattern to be parsed. 683 * The portion of the string from pos.getIndex(), which must be a 684 * '[', to the corresponding closing ']', is parsed. 685 * @param pos upon entry, the position at which to begin parsing. 686 * The character at pattern.charAt(pos.getIndex()) must be a '['.
687 * Upon return from a successful parse, pos.getIndex() is either 688 * the character after the closing ']' of the parsed pattern, or 689 * pattern.length() if the closing ']' is the last character of 690 * the pattern string. 691 * @param options bitmask for options to apply to the pattern. 692 * Valid options are USET_IGNORE_SPACE and 693 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 694 * These case options are mutually exclusive. 695 * @param symbols a symbol table mapping variable names to 696 * values and stand-ins to UnicodeSets; may be nullptr 697 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 698 * contains a syntax error. 699 * @return a reference to this 700 * @stable ICU 2.8 701 */ 702 UnicodeSet& applyPattern(const UnicodeString& pattern, 703 ParsePosition& pos, 704 uint32_t options, 705 const SymbolTable* symbols, 706 UErrorCode& status); 707 708 /** 709 * Returns a string representation of this set. If the result of 710 * calling this function is passed to a UnicodeSet constructor, it 711 * will produce another set that is equal to this one. 712 * A frozen set will not be modified. 713 * @param result the string to receive the rules. Previous 714 * contents will be deleted. 715 * @param escapeUnprintable if true then convert unprintable 716 * character to their hex escape representations, \\uxxxx or 717 * \\Uxxxxxxxx. Unprintable characters are those other than 718 * U+000A, U+0020..U+007E. 719 * @stable ICU 2.0 720 */ 721 virtual UnicodeString& toPattern(UnicodeString& result, 722 UBool escapeUnprintable = false) const override; 723 724 /** 725 * Modifies this set to contain those code points which have the given value 726 * for the given binary or enumerated property, as returned by 727 * u_getIntPropertyValue. Prior contents of this set are lost. 728 * A frozen set will not be modified. 
729 * 730 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 731 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 732 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 733 * 734 * @param value a value in the range u_getIntPropertyMinValue(prop).. 735 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 736 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 737 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 738 * categories such as [:L:] to be represented. 739 * 740 * @param ec error code input/output parameter 741 * 742 * @return a reference to this set 743 * 744 * @stable ICU 2.4 745 */ 746 UnicodeSet& applyIntPropertyValue(UProperty prop, 747 int32_t value, 748 UErrorCode& ec); 749 750 /** 751 * Modifies this set to contain those code points which have the 752 * given value for the given property. Prior contents of this 753 * set are lost. 754 * A frozen set will not be modified. 755 * 756 * @param prop a property alias, either short or long. The name is matched 757 * loosely. See PropertyAliases.txt for names and a description of loose 758 * matching. If the value string is empty, then this string is interpreted 759 * as either a General_Category value alias, a Script value alias, a binary 760 * property alias, or a special ID. Special IDs are matched loosely and 761 * correspond to the following sets: 762 * 763 * "ANY" = [\\u0000-\\U0010FFFF], 764 * "ASCII" = [\\u0000-\\u007F], 765 * "Assigned" = [:^Cn:]. 766 * 767 * @param value a value alias, either short or long. The name is matched 768 * loosely. See PropertyValueAliases.txt for names and a description of 769 * loose matching. In addition to aliases listed, numeric values and 770 * canonical combining classes may be expressed numerically, e.g., ("nv", 771 * "0.5") or ("ccc", "220"). The value string may also be empty. 
772 * 773 * @param ec error code input/output parameter 774 * 775 * @return a reference to this set 776 * 777 * @stable ICU 2.4 778 */ 779 UnicodeSet& applyPropertyAlias(const UnicodeString& prop, 780 const UnicodeString& value, 781 UErrorCode& ec); 782 783 /** 784 * Returns the number of elements in this set (its cardinality). 785 * Note that the elements of a set may include both individual 786 * codepoints and strings. 787 * 788 * This is slower than getRangeCount() because 789 * it counts the code points of all ranges. 790 * 791 * @return the number of elements in this set (its cardinality). 792 * @stable ICU 2.0 793 * @see getRangeCount 794 */ 795 virtual int32_t size(void) const; 796 797 /** 798 * Returns <tt>true</tt> if this set contains no elements. 799 * 800 * @return <tt>true</tt> if this set contains no elements. 801 * @stable ICU 2.0 802 */ 803 virtual UBool isEmpty(void) const; 804 805 /** 806 * @return true if this set contains multi-character strings or the empty string. 807 * @stable ICU 70 808 */ 809 UBool hasStrings() const; 810 811 /** 812 * Returns true if this set contains the given character. 813 * This function works faster with a frozen set. 814 * @param c character to be checked for containment 815 * @return true if the test condition is met 816 * @stable ICU 2.0 817 */ 818 virtual UBool contains(UChar32 c) const override; 819 820 /** 821 * Returns true if this set contains every character 822 * of the given range. 823 * @param start first character, inclusive, of the range 824 * @param end last character, inclusive, of the range 825 * @return true if the test condition is met 826 * @stable ICU 2.0 827 */ 828 virtual UBool contains(UChar32 start, UChar32 end) const; 829 830 /** 831 * Returns <tt>true</tt> if this set contains the given 832 * multicharacter string.
833 * @param s string to be checked for containment 834 * @return <tt>true</tt> if this set contains the specified string 835 * @stable ICU 2.4 836 */ 837 UBool contains(const UnicodeString& s) const; 838 839 /** 840 * Returns true if this set contains all the characters and strings 841 * of the given set. 842 * @param c set to be checked for containment 843 * @return true if the test condition is met 844 * @stable ICU 2.4 845 */ 846 virtual UBool containsAll(const UnicodeSet& c) const; 847 848 /** 849 * Returns true if this set contains all the characters 850 * of the given string. 851 * @param s string containing characters to be checked for containment 852 * @return true if the test condition is met 853 * @stable ICU 2.4 854 */ 855 UBool containsAll(const UnicodeString& s) const; 856 857 /** 858 * Returns true if this set contains none of the characters 859 * of the given range. 860 * @param start first character, inclusive, of the range 861 * @param end last character, inclusive, of the range 862 * @return true if the test condition is met 863 * @stable ICU 2.4 864 */ 865 UBool containsNone(UChar32 start, UChar32 end) const; 866 867 /** 868 * Returns true if this set contains none of the characters and strings 869 * of the given set. 870 * @param c set to be checked for containment 871 * @return true if the test condition is met 872 * @stable ICU 2.4 873 */ 874 UBool containsNone(const UnicodeSet& c) const; 875 876 /** 877 * Returns true if this set contains none of the characters 878 * of the given string. 879 * @param s string containing characters to be checked for containment 880 * @return true if the test condition is met 881 * @stable ICU 2.4 882 */ 883 UBool containsNone(const UnicodeString& s) const; 884 885 /** 886 * Returns true if this set contains one or more of the characters 887 * in the given range. 
888 * @param start first character, inclusive, of the range 889 * @param end last character, inclusive, of the range 890 * @return true if the condition is met 891 * @stable ICU 2.4 892 */ 893 inline UBool containsSome(UChar32 start, UChar32 end) const; 894 895 /** 896 * Returns true if this set contains one or more of the characters 897 * and strings of the given set. 898 * @param s The set to be checked for containment 899 * @return true if the condition is met 900 * @stable ICU 2.4 901 */ 902 inline UBool containsSome(const UnicodeSet& s) const; 903 904 /** 905 * Returns true if this set contains one or more of the characters 906 * of the given string. 907 * @param s string containing characters to be checked for containment 908 * @return true if the condition is met 909 * @stable ICU 2.4 910 */ 911 inline UBool containsSome(const UnicodeString& s) const; 912 913 /** 914 * Returns the length of the initial substring of the input string which 915 * consists only of characters and strings that are contained in this set 916 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 917 * or only of characters and strings that are not contained 918 * in this set (USET_SPAN_NOT_CONTAINED). 919 * See USetSpanCondition for details. 920 * Similar to the strspn() C library function. 921 * Unpaired surrogates are treated according to contains() of their surrogate code points. 922 * This function works faster with a frozen set and with a non-negative string length argument. 
923 * @param s start of the string 924 * @param length of the string; can be -1 for NUL-terminated 925 * @param spanCondition specifies the containment condition 926 * @return the length of the initial substring according to the spanCondition; 927 * 0 if the start of the string does not fit the spanCondition 928 * @stable ICU 3.8 929 * @see USetSpanCondition 930 */ 931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 932 933 /** 934 * Returns the end of the substring of the input string according to the USetSpanCondition. 935 * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code> 936 * after pinning start to 0<=start<=s.length(). 937 * @param s the string 938 * @param start the start index in the string for the span operation 939 * @param spanCondition specifies the containment condition 940 * @return the exclusive end of the substring according to the spanCondition; 941 * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition 942 * @stable ICU 4.4 943 * @see USetSpanCondition 944 */ 945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; 946 947 /** 948 * Returns the start of the trailing substring of the input string which 949 * consists only of characters and strings that are contained in this set 950 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 951 * or only of characters and strings that are not contained 952 * in this set (USET_SPAN_NOT_CONTAINED). 953 * See USetSpanCondition for details. 954 * Unpaired surrogates are treated according to contains() of their surrogate code points. 955 * This function works faster with a frozen set and with a non-negative string length argument. 
956 * @param s start of the string 957 * @param length of the string; can be -1 for NUL-terminated 958 * @param spanCondition specifies the containment condition 959 * @return the start of the trailing substring according to the spanCondition; 960 * the string length if the end of the string does not fit the spanCondition 961 * @stable ICU 3.8 962 * @see USetSpanCondition 963 */ 964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 965 966 /** 967 * Returns the start of the substring of the input string according to the USetSpanCondition. 968 * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code> 969 * after pinning limit to 0<=limit<=s.length(). 970 * @param s the string 971 * @param limit the exclusive-end index in the string for the span operation 972 * (use s.length() or INT32_MAX for spanning back from the end of the string) 973 * @param spanCondition specifies the containment condition 974 * @return the start of the substring according to the spanCondition; 975 * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition 976 * @stable ICU 4.4 977 * @see USetSpanCondition 978 */ 979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; 980 981 /** 982 * Returns the length of the initial substring of the input string which 983 * consists only of characters and strings that are contained in this set 984 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 985 * or only of characters and strings that are not contained 986 * in this set (USET_SPAN_NOT_CONTAINED). 987 * See USetSpanCondition for details. 988 * Similar to the strspn() C library function. 989 * Malformed byte sequences are treated according to contains(0xfffd). 990 * This function works faster with a frozen set and with a non-negative string length argument. 
991 * @param s start of the string (UTF-8) 992 * @param length of the string; can be -1 for NUL-terminated 993 * @param spanCondition specifies the containment condition 994 * @return the length of the initial substring according to the spanCondition; 995 * 0 if the start of the string does not fit the spanCondition 996 * @stable ICU 3.8 997 * @see USetSpanCondition 998 */ 999 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1000 1001 /** 1002 * Returns the start of the trailing substring of the input string which 1003 * consists only of characters and strings that are contained in this set 1004 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1005 * or only of characters and strings that are not contained 1006 * in this set (USET_SPAN_NOT_CONTAINED). 1007 * See USetSpanCondition for details. 1008 * Malformed byte sequences are treated according to contains(0xfffd). 1009 * This function works faster with a frozen set and with a non-negative string length argument. 1010 * @param s start of the string (UTF-8) 1011 * @param length of the string; can be -1 for NUL-terminated 1012 * @param spanCondition specifies the containment condition 1013 * @return the start of the trailing substring according to the spanCondition; 1014 * the string length if the end of the string does not fit the spanCondition 1015 * @stable ICU 3.8 1016 * @see USetSpanCondition 1017 */ 1018 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1019 1020 /** 1021 * Implement UnicodeMatcher::matches() 1022 * @stable ICU 2.4 1023 */ 1024 virtual UMatchDegree matches(const Replaceable& text, 1025 int32_t& offset, 1026 int32_t limit, 1027 UBool incremental) override; 1028 1029 private: 1030 /** 1031 * Returns the longest match for s in text at the given position. 1032 * If limit > start then match forward from start+1 to limit 1033 * matching all characters except s.charAt(0). 
If limit < start, 1034 * go backward starting from start-1 matching all characters 1035 * except s.charAt(s.length()-1). This method assumes that the 1036 * first character, text.charAt(start), matches s, so it does not 1037 * check it. 1038 * @param text the text to match 1039 * @param start the first character to match. In the forward 1040 * direction, text.charAt(start) is matched against s.charAt(0). 1041 * In the reverse direction, it is matched against 1042 * s.charAt(s.length()-1). 1043 * @param limit the limit offset for matching, either last+1 in 1044 * the forward direction, or last-1 in the reverse direction, 1045 * where last is the index of the last character to match. 1046 * @param s 1047 * @return If part of s matches up to the limit, return |limit - 1048 * start|. If all of s matches before reaching the limit, return 1049 * s.length(). If there is a mismatch between s and text, return 1050 * 0 1051 */ 1052 static int32_t matchRest(const Replaceable& text, 1053 int32_t start, int32_t limit, 1054 const UnicodeString& s); 1055 1056 /** 1057 * Returns the smallest value i such that c < list[i]. Caller 1058 * must ensure that c is a legal value or this method will enter 1059 * an infinite loop. This method performs a binary search. 1060 * @param c a character in the range MIN_VALUE..MAX_VALUE 1061 * inclusive 1062 * @return the smallest integer i in the range 0..len-1, 1063 * inclusive, such that c < list[i] 1064 */ 1065 int32_t findCodePoint(UChar32 c) const; 1066 1067 public: 1068 1069 /** 1070 * Implementation of UnicodeMatcher API. Union the set of all 1071 * characters that may be matched by this object into the given 1072 * set. 1073 * @param toUnionTo the set into which to union the source characters 1074 * @stable ICU 2.4 1075 */ 1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; 1077 1078 /** 1079 * Returns the index of the given character within this set, where 1080 * the set is ordered by ascending code point. 
If the character 1081 * is not in this set, return -1. The inverse of this method is 1082 * <code>charAt()</code>. 1083 * @return an index from 0..size()-1, or -1 1084 * @stable ICU 2.4 1085 */ 1086 int32_t indexOf(UChar32 c) const; 1087 1088 /** 1089 * Returns the character at the given index within this set, where 1090 * the set is ordered by ascending code point. If the index is 1091 * out of range for characters, returns (UChar32)-1. 1092 * The inverse of this method is <code>indexOf()</code>. 1093 * 1094 * For iteration, this is slower than UnicodeSetIterator or 1095 * getRangeCount()/getRangeStart()/getRangeEnd(), 1096 * because for each call it skips linearly over <code>index</code> 1097 * characters in the ranges. 1098 * 1099 * @param index an index from 0..size()-1 1100 * @return the character at the given index, or (UChar32)-1. 1101 * @stable ICU 2.4 1102 */ 1103 UChar32 charAt(int32_t index) const; 1104 1105 /** 1106 * Adds the specified range to this set if it is not already 1107 * present. If this set already contains the specified range, 1108 * the call leaves this set unchanged. If <code>start > end</code> 1109 * then an empty range is added, leaving the set unchanged. 1110 * This is equivalent to a boolean logic OR, or a set UNION. 1111 * A frozen set will not be modified. 1112 * 1113 * @param start first character, inclusive, of range to be added 1114 * to this set. 1115 * @param end last character, inclusive, of range to be added 1116 * to this set. 1117 * @stable ICU 2.0 1118 */ 1119 virtual UnicodeSet& add(UChar32 start, UChar32 end); 1120 1121 /** 1122 * Adds the specified character to this set if it is not already 1123 * present. If this set already contains the specified character, 1124 * the call leaves this set unchanged. 1125 * A frozen set will not be modified. 
1126 * 1127 * @param c the character (code point) 1128 * @return this object, for chaining 1129 * @stable ICU 2.0 1130 */ 1131 UnicodeSet& add(UChar32 c); 1132 1133 /** 1134 * Adds the specified multicharacter to this set if it is not already 1135 * present. If this set already contains the multicharacter, 1136 * the call leaves this set unchanged. 1137 * Thus "ch" => {"ch"} 1138 * A frozen set will not be modified. 1139 * 1140 * @param s the source string 1141 * @return this object, for chaining 1142 * @stable ICU 2.4 1143 */ 1144 UnicodeSet& add(const UnicodeString& s); 1145 1146 private: 1147 /** 1148 * @return a code point IF the string consists of a single one. 1149 * otherwise returns -1. 1150 * @param s string to test 1151 */ 1152 static int32_t getSingleCP(const UnicodeString& s); 1153 1154 void _add(const UnicodeString& s); 1155 1156 public: 1157 /** 1158 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 1159 * If this set already contains any particular character, it has no effect on that character. 1160 * A frozen set will not be modified. 1161 * @param s the source string 1162 * @return this object, for chaining 1163 * @stable ICU 2.4 1164 */ 1165 UnicodeSet& addAll(const UnicodeString& s); 1166 1167 /** 1168 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1169 * A frozen set will not be modified. 1170 * @param s the source string 1171 * @return this object, for chaining 1172 * @stable ICU 2.4 1173 */ 1174 UnicodeSet& retainAll(const UnicodeString& s); 1175 1176 /** 1177 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1178 * A frozen set will not be modified. 1179 * @param s the source string 1180 * @return this object, for chaining 1181 * @stable ICU 2.4 1182 */ 1183 UnicodeSet& complementAll(const UnicodeString& s); 1184 1185 /** 1186 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1187 * A frozen set will not be modified. 
1188 * @param s the source string 1189 * @return this object, for chaining 1190 * @stable ICU 2.4 1191 */ 1192 UnicodeSet& removeAll(const UnicodeString& s); 1193 1194 /** 1195 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1196 * 1197 * @param s the source string 1198 * @return a newly created set containing the given string. 1199 * The caller owns the return object and is responsible for deleting it. 1200 * @stable ICU 2.4 1201 */ 1202 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s); 1203 1204 1205 /** 1206 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1207 * @param s the source string 1208 * @return a newly created set containing the given characters 1209 * The caller owns the return object and is responsible for deleting it. 1210 * @stable ICU 2.4 1211 */ 1212 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s); 1213 1214 /** 1215 * Retain only the elements in this set that are contained in the 1216 * specified range. If <code>start > end</code> then an empty range is 1217 * retained, leaving the set empty. This is equivalent to 1218 * a boolean logic AND, or a set INTERSECTION. 1219 * A frozen set will not be modified. 1220 * 1221 * @param start first character, inclusive, of range 1222 * @param end last character, inclusive, of range 1223 * @stable ICU 2.0 1224 */ 1225 virtual UnicodeSet& retain(UChar32 start, UChar32 end); 1226 1227 1228 /** 1229 * Retain the specified character from this set if it is present. 1230 * A frozen set will not be modified. 1231 * 1232 * @param c the character (code point) 1233 * @return this object, for chaining 1234 * @stable ICU 2.0 1235 */ 1236 UnicodeSet& retain(UChar32 c); 1237 1238 /** 1239 * Retains only the specified string from this set if it is present. 1240 * Upon return this set will be empty if it did not contain s, or 1241 * will only contain s if it did contain s. 1242 * A frozen set will not be modified. 
1243 * 1244 * @param s the source string 1245 * @return this object, for chaining 1246 * @stable ICU 69 1247 */ 1248 UnicodeSet& retain(const UnicodeString &s); 1249 1250 /** 1251 * Removes the specified range from this set if it is present. 1252 * The set will not contain the specified range once the call 1253 * returns. If <code>start > end</code> then an empty range is 1254 * removed, leaving the set unchanged. 1255 * A frozen set will not be modified. 1256 * 1257 * @param start first character, inclusive, of range to be removed 1258 * from this set. 1259 * @param end last character, inclusive, of range to be removed 1260 * from this set. 1261 * @stable ICU 2.0 1262 */ 1263 virtual UnicodeSet& remove(UChar32 start, UChar32 end); 1264 1265 /** 1266 * Removes the specified character from this set if it is present. 1267 * The set will not contain the specified character once the call 1268 * returns. 1269 * A frozen set will not be modified. 1270 * 1271 * @param c the character (code point) 1272 * @return this object, for chaining 1273 * @stable ICU 2.0 1274 */ 1275 UnicodeSet& remove(UChar32 c); 1276 1277 /** 1278 * Removes the specified string from this set if it is present. 1279 * The set will not contain the specified string once the call 1280 * returns. 1281 * A frozen set will not be modified. 1282 * @param s the source string 1283 * @return this object, for chaining 1284 * @stable ICU 2.4 1285 */ 1286 UnicodeSet& remove(const UnicodeString& s); 1287 1288 /** 1289 * This is equivalent to 1290 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1291 * 1292 * <strong>Note:</strong> This performs a symmetric difference with all code points 1293 * <em>and thus retains all multicharacter strings</em>. 1294 * In order to achieve a “code point complement” (all code points minus this set), 1295 * the easiest is to <code>.complement().removeAllStrings()</code>. 1296 * 1297 * A frozen set will not be modified. 
1298 * @stable ICU 2.0 1299 */ 1300 virtual UnicodeSet& complement(); 1301 1302 /** 1303 * Complements the specified range in this set. Any character in 1304 * the range will be removed if it is in this set, or will be 1305 * added if it is not in this set. If <code>start > end</code> 1306 * then an empty range is complemented, leaving the set unchanged. 1307 * This is equivalent to a boolean logic XOR. 1308 * A frozen set will not be modified. 1309 * 1310 * @param start first character, inclusive, of range 1311 * @param end last character, inclusive, of range 1312 * @stable ICU 2.0 1313 */ 1314 virtual UnicodeSet& complement(UChar32 start, UChar32 end); 1315 1316 /** 1317 * Complements the specified character in this set. The character 1318 * will be removed if it is in this set, or will be added if it is 1319 * not in this set. 1320 * A frozen set will not be modified. 1321 * 1322 * @param c the character (code point) 1323 * @return this object, for chaining 1324 * @stable ICU 2.0 1325 */ 1326 UnicodeSet& complement(UChar32 c); 1327 1328 /** 1329 * Complement the specified string in this set. 1330 * The string will be removed if it is in this set, or will be added if it is not in this set. 1331 * A frozen set will not be modified. 1332 * 1333 * @param s the string to complement 1334 * @return this object, for chaining 1335 * @stable ICU 2.4 1336 */ 1337 UnicodeSet& complement(const UnicodeString& s); 1338 1339 /** 1340 * Adds all of the elements in the specified set to this set if 1341 * they're not already present. This operation effectively 1342 * modifies this set so that its value is the <i>union</i> of the two 1343 * sets. The behavior of this operation is unspecified if the specified 1344 * collection is modified while the operation is in progress. 1345 * A frozen set will not be modified. 1346 * 1347 * @param c set whose elements are to be added to this set. 
1348 * @see #add(UChar32, UChar32) 1349 * @stable ICU 2.0 1350 */ 1351 virtual UnicodeSet& addAll(const UnicodeSet& c); 1352 1353 /** 1354 * Retains only the elements in this set that are contained in the 1355 * specified set. In other words, removes from this set all of 1356 * its elements that are not contained in the specified set. This 1357 * operation effectively modifies this set so that its value is 1358 * the <i>intersection</i> of the two sets. 1359 * A frozen set will not be modified. 1360 * 1361 * @param c set that defines which elements this set will retain. 1362 * @stable ICU 2.0 1363 */ 1364 virtual UnicodeSet& retainAll(const UnicodeSet& c); 1365 1366 /** 1367 * Removes from this set all of its elements that are contained in the 1368 * specified set. This operation effectively modifies this 1369 * set so that its value is the <i>asymmetric set difference</i> of 1370 * the two sets. 1371 * A frozen set will not be modified. 1372 * 1373 * @param c set that defines which elements will be removed from 1374 * this set. 1375 * @stable ICU 2.0 1376 */ 1377 virtual UnicodeSet& removeAll(const UnicodeSet& c); 1378 1379 /** 1380 * Complements in this set all elements contained in the specified 1381 * set. Any character in the other set will be removed if it is 1382 * in this set, or will be added if it is not in this set. 1383 * A frozen set will not be modified. 1384 * 1385 * @param c set that defines which elements will be xor'ed from 1386 * this set. 1387 * @stable ICU 2.4 1388 */ 1389 virtual UnicodeSet& complementAll(const UnicodeSet& c); 1390 1391 /** 1392 * Removes all of the elements from this set. This set will be 1393 * empty after this call returns. 1394 * A frozen set will not be modified. 1395 * @stable ICU 2.0 1396 */ 1397 virtual UnicodeSet& clear(void); 1398 1399 /** 1400 * Close this set over the given attribute. For the attribute 1401 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 1402 * 1403 * 1. 
For each character or string 'a' in this set, all strings or 1404 * characters 'b' such that foldCase(a) == foldCase(b) are added 1405 * to this set. 1406 * 1407 * 2. For each string 'e' in the resulting set, if e != 1408 * foldCase(e), 'e' will be removed. 1409 * 1410 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 1411 * 1412 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 1413 * == b denotes that the contents are the same, not pointer 1414 * comparison.) 1415 * 1416 * A frozen set will not be modified. 1417 * 1418 * @param attribute bitmask for attributes to close over. 1419 * Valid options: 1420 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 1421 * These case options are mutually exclusive. 1422 * Unrelated options bits are ignored. 1423 * @return a reference to this set. 1424 * @stable ICU 4.2 1425 */ 1426 UnicodeSet& closeOver(int32_t attribute); 1427 1428 /** 1429 * Remove all strings from this set. 1430 * 1431 * @return a reference to this set. 1432 * @stable ICU 4.2 1433 */ 1434 virtual UnicodeSet &removeAllStrings(); 1435 1436 /** 1437 * Iteration method that returns the number of ranges contained in 1438 * this set. 1439 * @see #getRangeStart 1440 * @see #getRangeEnd 1441 * @stable ICU 2.4 1442 */ 1443 virtual int32_t getRangeCount(void) const; 1444 1445 /** 1446 * Iteration method that returns the first character in the 1447 * specified range of this set. 1448 * @see #getRangeCount 1449 * @see #getRangeEnd 1450 * @stable ICU 2.4 1451 */ 1452 virtual UChar32 getRangeStart(int32_t index) const; 1453 1454 /** 1455 * Iteration method that returns the last character in the 1456 * specified range of this set. 1457 * @see #getRangeCount 1458 * @see #getRangeStart 1459 * @stable ICU 2.4 1460 */ 1461 virtual UChar32 getRangeEnd(int32_t index) const; 1462 1463 /** 1464 * Serializes this set into an array of 16-bit integers. 
Serialization 1465 * (currently) only records the characters in the set; multicharacter 1466 * strings are ignored. 1467 * 1468 * The array has following format (each line is one 16-bit 1469 * integer): 1470 * 1471 * length = (n+2*m) | (m!=0?0x8000:0) 1472 * bmpLength = n; present if m!=0 1473 * bmp[0] 1474 * bmp[1] 1475 * ... 1476 * bmp[n-1] 1477 * supp-high[0] 1478 * supp-low[0] 1479 * supp-high[1] 1480 * supp-low[1] 1481 * ... 1482 * supp-high[m-1] 1483 * supp-low[m-1] 1484 * 1485 * The array starts with a header. After the header are n bmp 1486 * code points, then m supplementary code points. Either n or m 1487 * or both may be zero. n+2*m is always <= 0x7FFF. 1488 * 1489 * If there are no supplementary characters (if m==0) then the 1490 * header is one 16-bit integer, 'length', with value n. 1491 * 1492 * If there are supplementary characters (if m!=0) then the header 1493 * is two 16-bit integers. The first, 'length', has value 1494 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1495 * 1496 * After the header the code points are stored in ascending order. 1497 * Supplementary code points are stored as most significant 16 1498 * bits followed by least significant 16 bits. 1499 * 1500 * @param dest pointer to buffer of destCapacity 16-bit integers. 1501 * May be nullptr only if destCapacity is zero. 1502 * @param destCapacity size of dest, or zero. Must not be negative. 1503 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR 1504 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if 1505 * n+2*m+(m!=0?2:1) > destCapacity. 1506 * @return the total length of the serialized format, including 1507 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1508 * than U_BUFFER_OVERFLOW_ERROR. 
1509 * @stable ICU 2.4 1510 */ 1511 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const; 1512 1513 /** 1514 * Reallocate this object's internal structures to take up the least 1515 * possible space, without changing this object's value. 1516 * A frozen set will not be modified. 1517 * @stable ICU 2.4 1518 */ 1519 virtual UnicodeSet& compact(); 1520 1521 /** 1522 * Return the class ID for this class. This is useful only for 1523 * comparing to a return value from getDynamicClassID(). For example: 1524 * <pre> 1525 * . Base* polymorphic_pointer = createPolymorphicObject(); 1526 * . if (polymorphic_pointer->getDynamicClassID() == 1527 * . Derived::getStaticClassID()) ... 1528 * </pre> 1529 * @return The class ID for all objects of this class. 1530 * @stable ICU 2.0 1531 */ 1532 static UClassID U_EXPORT2 getStaticClassID(void); 1533 1534 /** 1535 * Implement UnicodeFunctor API. 1536 * 1537 * @return The class ID for this object. All objects of a given 1538 * class have the same class ID. Objects of other classes have 1539 * different class IDs. 1540 * @stable ICU 2.4 1541 */ 1542 virtual UClassID getDynamicClassID(void) const override; 1543 1544 private: 1545 1546 // Private API for the USet API 1547 1548 friend class USetAccess; 1549 1550 const UnicodeString* getString(int32_t index) const; 1551 1552 //---------------------------------------------------------------- 1553 // RuleBasedTransliterator support 1554 //---------------------------------------------------------------- 1555 1556 private: 1557 1558 /** 1559 * Returns <tt>true</tt> if this set contains any character whose low byte 1560 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 1561 * indexing. 
1562 */ 1563 virtual UBool matchesIndexValue(uint8_t v) const override; 1564 1565 private: 1566 friend class RBBIRuleScanner; 1567 1568 //---------------------------------------------------------------- 1569 // Implementation: Clone as thawed (see ICU4J Freezable) 1570 //---------------------------------------------------------------- 1571 1572 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); 1573 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed); 1574 1575 //---------------------------------------------------------------- 1576 // Implementation: Pattern parsing 1577 //---------------------------------------------------------------- 1578 1579 void applyPatternIgnoreSpace(const UnicodeString& pattern, 1580 ParsePosition& pos, 1581 const SymbolTable* symbols, 1582 UErrorCode& status); 1583 1584 void applyPattern(RuleCharacterIterator& chars, 1585 const SymbolTable* symbols, 1586 UnicodeString& rebuiltPat, 1587 uint32_t options, 1588 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 1589 int32_t depth, 1590 UErrorCode& ec); 1591 1592 void closeOverCaseInsensitive(bool simple); 1593 void closeOverAddCaseMappings(); 1594 1595 //---------------------------------------------------------------- 1596 // Implementation: Utility methods 1597 //---------------------------------------------------------------- 1598 1599 static int32_t nextCapacity(int32_t minCapacity); 1600 1601 bool ensureCapacity(int32_t newLen); 1602 1603 bool ensureBufferCapacity(int32_t newLen); 1604 1605 void swapBuffers(void); 1606 1607 UBool allocateStrings(UErrorCode &status); 1608 int32_t stringsSize() const; 1609 UBool stringsContains(const UnicodeString &s) const; 1610 1611 UnicodeString& _toPattern(UnicodeString& result, 1612 UBool escapeUnprintable) const; 1613 1614 UnicodeString& _generatePattern(UnicodeString& result, 1615 UBool escapeUnprintable) const; 1616 1617 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); 1618 1619 static 
void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); 1620 1621 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end, 1622 UBool escapeUnprintable); 1623 1624 //---------------------------------------------------------------- 1625 // Implementation: Fundamental operators 1626 //---------------------------------------------------------------- 1627 1628 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity); 1629 1630 void add(const UChar32* other, int32_t otherLen, int8_t polarity); 1631 1632 void retain(const UChar32* other, int32_t otherLen, int8_t polarity); 1633 1634 /** 1635 * Return true if the given position, in the given pattern, appears 1636 * to be the start of a property set pattern [:foo:], \\p{foo}, or 1637 * \\P{foo}, or \\N{name}. 1638 */ 1639 static UBool resemblesPropertyPattern(const UnicodeString& pattern, 1640 int32_t pos); 1641 1642 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, 1643 int32_t iterOpts); 1644 1645 /** 1646 * Parse the given property pattern at the given parse position 1647 * and set this UnicodeSet to the result. 1648 * 1649 * The original design document is out of date, but still useful. 1650 * Ignore the property and value names: 1651 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html 1652 * 1653 * Recognized syntax: 1654 * 1655 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" 1656 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P" 1657 * \\N{name} - white space not allowed within "\\N" 1658 * 1659 * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored. 1660 * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading 1661 * and trailing space is deleted, and internal runs of whitespace 1662 * are collapsed to a single space. 
1663 * 1664 * We support binary properties, enumerated properties, and the 1665 * following non-enumerated properties: 1666 * 1667 * Numeric_Value 1668 * Name 1669 * Unicode_1_Name 1670 * 1671 * @param pattern the pattern string 1672 * @param ppos on entry, the position at which to begin parsing. 1673 * This should be one of the locations marked '^': 1674 * 1675 * [:blah:] \\p{blah} \\P{blah} \\N{name} 1676 * ^ % ^ % ^ % ^ % 1677 * 1678 * On return, the position after the last character parsed, that is, 1679 * the locations marked '%'. If the parse fails, ppos is returned 1680 * unchanged. 1681 * @param ec status 1682 * @return a reference to this. 1683 */ 1684 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, 1685 ParsePosition& ppos, 1686 UErrorCode &ec); 1687 1688 void applyPropertyPattern(RuleCharacterIterator& chars, 1689 UnicodeString& rebuiltPat, 1690 UErrorCode& ec); 1691 1692 /** 1693 * A filter that returns true if the given code point should be 1694 * included in the UnicodeSet being constructed. 1695 */ 1696 typedef UBool (*Filter)(UChar32 codePoint, void* context); 1697 1698 /** 1699 * Given a filter, set this UnicodeSet to the code points 1700 * contained by that filter. The filter MUST be 1701 * property-conformant. That is, if it returns value v for one 1702 * code point, then it must return v for all affiliated code 1703 * points, as defined by the inclusions list. See 1704 * getInclusions(). 1705 * src is a UPropertySource value. 1706 */ 1707 void applyFilter(Filter filter, 1708 void* context, 1709 const UnicodeSet* inclusions, 1710 UErrorCode &status); 1711 1712 /** 1713 * Set the new pattern to cache. 1714 */ 1715 void setPattern(const UnicodeString& newPat) { 1716 setPattern(newPat.getBuffer(), newPat.length()); 1717 } 1718 void setPattern(const char16_t *newPat, int32_t newPatLen); 1719 /** 1720 * Release existing cached pattern. 
1721 */ 1722 void releasePattern(); 1723 1724 friend class UnicodeSetIterator; 1725 }; 1726 1727 1728 1729 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const { 1730 return !operator==(o); 1731 } 1732 1733 inline UBool UnicodeSet::isFrozen() const { 1734 return (UBool)(bmpSet!=nullptr || stringSpan!=nullptr); 1735 } 1736 1737 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { 1738 return !containsNone(start, end); 1739 } 1740 1741 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { 1742 return !containsNone(s); 1743 } 1744 1745 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { 1746 return !containsNone(s); 1747 } 1748 1749 inline UBool UnicodeSet::isBogus() const { 1750 return (UBool)(fFlags & kIsBogus); 1751 } 1752 1753 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { 1754 return reinterpret_cast<UnicodeSet *>(uset); 1755 } 1756 1757 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { 1758 return reinterpret_cast<const UnicodeSet *>(uset); 1759 } 1760 1761 inline USet *UnicodeSet::toUSet() { 1762 return reinterpret_cast<USet *>(this); 1763 } 1764 1765 inline const USet *UnicodeSet::toUSet() const { 1766 return reinterpret_cast<const USet *>(this); 1767 } 1768 1769 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { 1770 int32_t sLength=s.length(); 1771 if(start<0) { 1772 start=0; 1773 } else if(start>sLength) { 1774 start=sLength; 1775 } 1776 return start+span(s.getBuffer()+start, sLength-start, spanCondition); 1777 } 1778 1779 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { 1780 int32_t sLength=s.length(); 1781 if(limit<0) { 1782 limit=0; 1783 } else if(limit>sLength) { 1784 limit=sLength; 1785 } 1786 return spanBack(s.getBuffer(), limit, spanCondition); 1787 } 1788 1789 U_NAMESPACE_END 1790 1791 #endif /* U_SHOW_CPLUSPLUS_API */ 1792 1793 #endif
Welcome to MyWebUniversity on July 19, 2025.
Contact us
|
About us
|
Terms of use
|
Copyright © 2000-2025 MyWebUniversity.com ™