This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.
enum URegexpFlag
150 * The contents of the pattern UText will be extracted and saved. Ownership of the 151 * UText struct itself remains with the caller. This is to match the behavior of 152 * uregex_open(). 153 * 154 * @param pattern The Regular Expression pattern to be compiled. 155 * @param flags Flags that alter the default matching behavior for 156 * the regular expression, UREGEX_CASE_INSENSITIVE, for 157 * example. For default behavior, set this parameter to zero. 158 * See enum URegexpFlag. All desired flags 159 * are bitwise-ORed together. 160 * @param pe Receives the position (line and column numbers) of any syntax 161 * error within the source regular expression string. If this 162 * information is not wanted, pass NULL for this parameter. 163 * @param status Receives error detected by this function. 164 * 165 * @stable ICU 4.6 166 */ 167 U_CAPI URegularExpression * U_EXPORT2 168 uregex_openUText(UText *pattern, 169 uint32_t flags, 170 UParseError *pe, 171 UErrorCode *status); 172 173 #if !UCONFIG_NO_CONVERSION 174 /** 175 * Open (compile) an ICU regular expression. The resulting regular expression 176 * handle can then be used to perform various matching operations. 177 *
178 * This function is the same as uregex_open, except that the pattern 179 * is supplied as an 8 bit char * string in the default code page. 180 * 181 * @param pattern The Regular Expression pattern to be compiled, 182 * NUL terminated. 183 * @param flags Flags that alter the default matching behavior for 184 * the regular expression, UREGEX_CASE_INSENSITIVE, for 185 * example. For default behavior, set this parameter to zero. 186 * See enum URegexpFlag. All desired flags 187 * are bitwise-ORed together. 188 * @param pe Receives the position (line and column numbers) of any syntax 189 * error within the source regular expression string. If this 190 * information is not wanted, pass NULL for this parameter. 191 * @param status Receives errors detected by this function. 192 * @return The URegularExpression object representing the compiled 193 * pattern. 194 * 195 * @stable ICU 3.0 196 */ 197 U_CAPI URegularExpression * U_EXPORT2 198 uregex_openC( const char *pattern, 199 uint32_t flags, 200 UParseError *pe, 201 UErrorCode *status); 202 #endif 203 204 205 206 /** 207 * Close the regular expression, recovering all resources (memory) it 208 * was holding. 209 * 210 * @param regexp The regular expression to be closed. 211 * @stable ICU 3.0 212 */ 213 U_CAPI void U_EXPORT2 214 uregex_close(URegularExpression *regexp); 215 216 #if U_SHOW_CPLUSPLUS_API 217 218 U_NAMESPACE_BEGIN 219 220 /** 221 * \class LocalURegularExpressionPointer 222 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 223 * For most methods see the LocalPointerBase base class. 224 * 225 * @see LocalPointerBase 226 * @see LocalPointer 227 * @stable ICU 4.4 228 */ 229 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 230 231 U_NAMESPACE_END 232 233 #endif 234 235 /** 236 * Make a copy of a compiled regular expression. 
Cloning a regular 237 * expression is faster than opening a second instance from the source 238 * form of the expression, and requires less memory. 239 *
240 * Note that the current input string and the position of any matched text 241 * within it are not cloned; only the pattern itself and the 242 * match mode flags are copied. 243 *
244 * Cloning can be particularly useful to threaded applications that perform 245 * multiple match operations in parallel. Each concurrent RE 246 * operation requires its own instance of a URegularExpression. 247 * 248 * @param regexp The compiled regular expression to be cloned. 249 * @param status Receives indication of any errors encountered 250 * @return the cloned copy of the compiled regular expression. 251 * @stable ICU 3.0 252 */ 253 U_CAPI URegularExpression * U_EXPORT2 254 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 255 256 /** 257 * Returns a pointer to the source form of the pattern for this regular expression. 258 * This function will work even if the pattern was originally specified as a UText. 259 * 260 * @param regexp The compiled regular expression. 261 * @param patLength This output parameter will be set to the length of the 262 * pattern string. A NULL pointer may be used here if the 263 * pattern length is not needed, as would be the case if 264 * the pattern is known in advance to be a NUL terminated 265 * string. 266 * @param status Receives errors detected by this function. 267 * @return a pointer to the pattern string. The storage for the string is 268 * owned by the regular expression object, and must not be 269 * altered or deleted by the application. The returned string 270 * will remain valid until the regular expression is closed. 271 * @stable ICU 3.0 272 */ 273 U_CAPI const UChar * U_EXPORT2 274 uregex_pattern(const URegularExpression *regexp, 275 int32_t *patLength, 276 UErrorCode *status); 277 278 /** 279 * Returns the source text of the pattern for this regular expression. 280 * This function will work even if the pattern was originally specified as a UChar string. 281 * 282 * @param regexp The compiled regular expression. 283 * @param status Receives errors detected by this function. 284 * @return the pattern text. 
The storage for the text is owned by the regular expression 285 * object, and must not be altered or deleted. 286 * 287 * @stable ICU 4.6 288 */ 289 U_CAPI UText * U_EXPORT2 290 uregex_patternUText(const URegularExpression *regexp, 291 UErrorCode *status); 292 293 /** 294 * Get the match mode flags that were specified when compiling this regular expression. 295 * @param status Receives errors detected by this function. 296 * @param regexp The compiled regular expression. 297 * @return The match mode flags 298 * @see URegexpFlag 299 * @stable ICU 3.0 300 */ 301 U_CAPI int32_t U_EXPORT2 302 uregex_flags(const URegularExpression *regexp, 303 UErrorCode *status); 304 305 306 /** 307 * Set the subject text string upon which the regular expression will look for matches. 308 * This function may be called any number of times, allowing the regular 309 * expression pattern to be applied to different strings. 310 *
311 * Regular expression matching operations work directly on the application's 312 * string data. No copy is made. The subject string data must not be 313 * altered after calling this function until after all regular expression 314 * operations involving this string data are completed. 315 *
316 * Zero length strings are permitted. In this case, no subsequent match 317 * operation will dereference the text string pointer. 318 * 319 * @param regexp The compiled regular expression. 320 * @param text The subject text string. 321 * @param textLength The length of the subject text, or -1 if the string 322 * is NUL terminated. 323 * @param status Receives errors detected by this function. 324 * @stable ICU 3.0 325 */ 326 U_CAPI void U_EXPORT2 327 uregex_setText(URegularExpression *regexp, 328 const UChar *text, 329 int32_t textLength, 330 UErrorCode *status); 331 332 333 /** 334 * Set the subject text string upon which the regular expression will look for matches. 335 * This function may be called any number of times, allowing the regular 336 * expression pattern to be applied to different strings. 337 *
338 * Regular expression matching operations work directly on the application's 339 * string data; only a shallow clone is made. The subject string data must not be 340 * altered after calling this function until after all regular expression 341 * operations involving this string data are completed. 342 * 343 * @param regexp The compiled regular expression. 344 * @param text The subject text string. 345 * @param status Receives errors detected by this function. 346 * 347 * @stable ICU 4.6 348 */ 349 U_CAPI void U_EXPORT2 350 uregex_setUText(URegularExpression *regexp, 351 UText *text, 352 UErrorCode *status); 353 354 /** 355 * Get the subject text that is currently associated with this 356 * regular expression object. If the input was supplied using uregex_setText(), 357 * that pointer will be returned. Otherwise, the characters in the input will 358 * be extracted to a buffer and returned. In either case, ownership remains 359 * with the regular expression object. 360 * 361 * This function will work even if the input was originally specified as a UText. 362 * 363 * @param regexp The compiled regular expression. 364 * @param textLength The length of the string is returned in this output parameter. 365 * A NULL pointer may be used here if the 366 * text length is not needed, as would be the case if 367 * the text is known in advance to be a NUL terminated 368 * string. 369 * @param status Receives errors detected by this function. 370 * @return Pointer to the subject text string currently associated with 371 * this regular expression. 372 * @stable ICU 3.0 373 */ 374 U_CAPI const UChar * U_EXPORT2 375 uregex_getText(URegularExpression *regexp, 376 int32_t *textLength, 377 UErrorCode *status); 378 379 /** 380 * Get the subject text that is currently associated with this 381 * regular expression object. 382 * 383 * This function will work even if the input was originally specified as a UChar string. 384 * 385 * @param regexp The compiled regular expression. 
386 * @param dest A mutable UText in which to store the current input. 387 * If NULL, a new UText will be created as an immutable shallow clone 388 * of the actual input string. 389 * @param status Receives errors detected by this function. 390 * @return The subject text currently associated with this regular expression. 391 * If a pre-allocated UText was provided, it will always be used and returned. 392 * 393 * @stable ICU 4.6 394 */ 395 U_CAPI UText * U_EXPORT2 396 uregex_getUText(URegularExpression *regexp, 397 UText *dest, 398 UErrorCode *status); 399 400 /** 401 * Set the subject text string upon which the regular expression is looking for matches 402 * without changing any other aspect of the matching state. 403 * The new and previous text strings must have the same content. 404 * 405 * This function is intended for use in environments where ICU is operating on 406 * strings that may move around in memory. It provides a mechanism for notifying 407 * ICU that the string has been relocated, and providing a new UText to access the 408 * string in its new position. 409 * 410 * Note that the regular expression implementation never copies the underlying text 411 * of a string being matched, but always operates directly on the original text 412 * provided by the user. Refreshing simply drops the references to the old text 413 * and replaces them with references to the new. 414 * 415 * Caution: this function is normally used only by very specialized 416 * system-level code. One example use case is with garbage collection 417 * that moves the text in memory. 418 * 419 * @param regexp The compiled regular expression. 420 * @param text The new (moved) text string. 421 * @param status Receives errors detected by this function. 422 * 423 * @stable ICU 4.8 424 */ 425 U_CAPI void U_EXPORT2 426 uregex_refreshUText(URegularExpression *regexp, 427 UText *text, 428 UErrorCode *status); 429 430 /** 431 * Attempts to match the input string against the pattern. 
432 * To succeed, the match must extend to the end of the string, 433 * or cover the complete match region. 434 * 435 * If startIndex >= zero the match operation starts at the specified 436 * index and must extend to the end of the input string. Any region 437 * that has been specified is reset. 438 * 439 * If startIndex == -1 the match must cover the input region, or the entire 440 * input string if no region has been set. This directly corresponds to 441 * Matcher.matches() in Java 442 * 443 * @param regexp The compiled regular expression. 444 * @param startIndex The input string (native) index at which to begin matching, or -1 445 * to match the input Region. 446 * @param status Receives errors detected by this function. 447 * @return true if there is a match 448 * @stable ICU 3.0 449 */ 450 U_CAPI UBool U_EXPORT2 451 uregex_matches(URegularExpression *regexp, 452 int32_t startIndex, 453 UErrorCode *status); 454 455 /** 456 * 64bit version of uregex_matches. 457 * Attempts to match the input string against the pattern. 458 * To succeed, the match must extend to the end of the string, 459 * or cover the complete match region. 460 * 461 * If startIndex >= zero the match operation starts at the specified 462 * index and must extend to the end of the input string. Any region 463 * that has been specified is reset. 464 * 465 * If startIndex == -1 the match must cover the input region, or the entire 466 * input string if no region has been set. This directly corresponds to 467 * Matcher.matches() in Java 468 * 469 * @param regexp The compiled regular expression. 470 * @param startIndex The input string (native) index at which to begin matching, or -1 471 * to match the input Region. 472 * @param status Receives errors detected by this function. 
473 * @return true if there is a match 474 * @stable ICU 4.6 475 */ 476 U_CAPI UBool U_EXPORT2 477 uregex_matches64(URegularExpression *regexp, 478 int64_t startIndex, 479 UErrorCode *status); 480 481 /** 482 * Attempts to match the input string, starting from the specified index, against the pattern. 483 * The match may be of any length, and is not required to extend to the end 484 * of the input string. Contrast with uregex_matches(). 485 * 486 *
If startIndex is >= 0 any input region that was set for this 487 * URegularExpression is reset before the operation begins. 488 * 489 *
If the specified starting index == -1 the match begins at the start of the input 490 * region, or at the start of the full string if no region has been specified. 491 * This corresponds directly with Matcher.lookingAt() in Java. 492 * 493 *
If the match succeeds then more information can be obtained via the 494 * uregex_start(), uregex_end(), 495 * and uregex_group() functions.
uregex_start()
uregex_end()
uregex_group()
If startIndex is >= 0 any input region that was set for this 516 * URegularExpression is reset before the operation begins. 517 * 518 *
If the specified starting index == -1 the match begins at the start of the input 519 * region, or at the start of the full string if no region has been specified. 520 * This corresponds directly with Matcher.lookingAt() in Java. 521 * 522 *
If the match succeeds then more information can be obtained via the 523 * uregex_start(), uregex_end(), 524 * and uregex_group() functions.
uregex_start(), uregex_end()
The input string, starting from the end of the previous match and ending at 1184 * the start of the current match, is appended to the destination string. Then the 1185 * replacement string is appended to the output string, 1186 * including handling any substitutions of captured text.
A note on preflight computation of buffer size and error handling: 1189 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1190 * designed to be chained, one after another, with the destination 1191 * buffer pointer and buffer capacity updated after each in preparation 1192 * for the next. If the destination buffer is exhausted partway through such a 1193 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1194 * ICU conventions are for a function to perform no action if it is 1195 * called with an error status, but for this one case, uregex_appendReplacement() 1196 * will operate normally so that buffer size computations will complete 1197 * correctly. 1198 * 1199 *
For simple, prepackaged, non-incremental find-and-replace 1200 * operations, see replaceFirst() or replaceAll().
The input string, starting from the end of the previous match and ending at 1238 * the start of the current match, is appended to the destination string. Then the 1239 * replacement string is appended to the output string, 1240 * including handling any substitutions of captured text.
For simple, prepackaged, non-incremental find-and-replace 1243 * operations, see replaceFirst() or replaceAll().
uregex_appendTail()
uregex_appendReplacement()
uregex_appendTailUText()
uregex_appendReplacementUText()
1380 * The behavior of this function is not very closely aligned with uregex_split(); 1381 * instead, it is based on (and implemented directly on top of) the C++ split method. 1382 * 1383 * @param regexp The compiled regular expression. 1384 * @param destFields An array of mutable UText structs to receive the results of the split. 1385 * If a field is NULL, a new UText is allocated to contain the results for 1386 * that field. This new UText is not guaranteed to be mutable. 1387 * @param destFieldsCapacity The number of elements in the destination array. 1388 * If the number of fields found is less than destFieldsCapacity, the 1389 * extra strings in the destination array are not altered. 1390 * If the number of destination strings is less than the number 1391 * of fields, the trailing part of the input string, including any 1392 * field delimiters, is placed in the last destination string. 1393 * This behavior mimics that of Perl. It is not an error condition, and no 1394 * error status is returned when all destField positions are used. 1395 * @param status A reference to a UErrorCode to receive any errors. 1396 * @return The number of fields into which the input string was split. 1397 * 1398 * @stable ICU 4.6 1399 */ 1400 U_CAPI int32_t U_EXPORT2 1401 uregex_splitUText(URegularExpression *regexp, 1402 UText *destFields[], 1403 int32_t destFieldsCapacity, 1404 UErrorCode *status); 1405 1406 /** 1407 * Set a processing time limit for match operations with this URegularExpression. 1408 * 1409 * Some patterns, when matching certain strings, can run in exponential time. 1410 * For practical purposes, the match operation may appear to be in an 1411 * infinite loop. 1412 * When a limit is set a match operation will fail with an error if the 1413 * limit is exceeded. 1414 *
1415 * The units of the limit are steps of the match engine. 1416 * Correspondence with actual processor time will depend on the speed 1417 * of the processor and the details of the specific pattern, but will 1418 * typically be on the order of milliseconds. 1419 *
1420 * By default, the matching time is not limited. 1421 *
1422 * 1423 * @param regexp The compiled regular expression. 1424 * @param limit The limit value, or 0 for no limit. 1425 * @param status A reference to a UErrorCode to receive any errors. 1426 * @stable ICU 4.0 1427 */ 1428 U_CAPI void U_EXPORT2 1429 uregex_setTimeLimit(URegularExpression *regexp, 1430 int32_t limit, 1431 UErrorCode *status); 1432 1433 /** 1434 * Get the time limit for matches with this URegularExpression. 1435 * A return value of zero indicates that there is no limit. 1436 * 1437 * @param regexp The compiled regular expression. 1438 * @param status A reference to a UErrorCode to receive any errors. 1439 * @return the maximum allowed time for a match, in units of processing steps. 1440 * @stable ICU 4.0 1441 */ 1442 U_CAPI int32_t U_EXPORT2 1443 uregex_getTimeLimit(const URegularExpression *regexp, 1444 UErrorCode *status); 1445 1446 /** 1447 * Set the amount of heap storage available for use by the match backtracking stack. 1448 *
1449 * ICU uses a backtracking regular expression engine, with the backtrack stack 1450 * maintained on the heap. This function sets the limit to the amount of memory 1451 * that can be used for this purpose. A backtracking stack overflow will 1452 * result in an error from the match operation that caused it. 1453 *
1454 * A limit is desirable because a malicious or poorly designed pattern can use 1455 * excessive memory, potentially crashing the process. A limit is enabled 1456 * by default. 1457 *
1458 * @param regexp The compiled regular expression. 1459 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1460 * A value of zero means no limit. 1461 * The limit must be greater than or equal to zero. 1462 * @param status A reference to a UErrorCode to receive any errors. 1463 * 1464 * @stable ICU 4.0 1465 */ 1466 U_CAPI void U_EXPORT2 1467 uregex_setStackLimit(URegularExpression *regexp, 1468 int32_t limit, 1469 UErrorCode *status); 1470 1471 /** 1472 * Get the size of the heap storage available for use by the back tracking stack. 1473 * 1474 * @return the maximum backtracking stack size, in bytes, or zero if the 1475 * stack size is unlimited. 1476 * @stable ICU 4.0 1477 */ 1478 U_CAPI int32_t U_EXPORT2 1479 uregex_getStackLimit(const URegularExpression *regexp, 1480 UErrorCode *status); 1481 1482 1483 /** 1484 * Function pointer for a regular expression matching callback function. 1485 * When set, a callback function will be called periodically during matching 1486 * operations. If the call back function returns false, the matching 1487 * operation will be terminated early. 1488 * 1489 * Note: the callback function must not call other functions on this 1490 * URegularExpression. 1491 * 1492 * @param context context pointer. The callback function will be invoked 1493 * with the context specified at the time that 1494 * uregex_setMatchCallback() is called. 1495 * @param steps the accumulated processing time, in match steps, 1496 * for this matching operation. 1497 * @return true to continue the matching operation. 1498 * false to terminate the matching operation. 1499 * @stable ICU 4.0 1500 */ 1501 U_CDECL_BEGIN 1502 typedef UBool U_CALLCONV URegexMatchCallback ( 1503 const void *context, 1504 int32_t steps); 1505 U_CDECL_END 1506 1507 /** 1508 * Set a callback function for this URegularExpression. 
1509 * During matching operations the function will be called periodically, 1510 * giving the application the opportunity to terminate a long-running 1511 * match. 1512 * 1513 * @param regexp The compiled regular expression. 1514 * @param callback A pointer to the user-supplied callback function. 1515 * @param context User context pointer. The value supplied at the 1516 * time the callback function is set will be saved 1517 * and passed to the callback each time that it is called. 1518 * @param status A reference to a UErrorCode to receive any errors. 1519 * @stable ICU 4.0 1520 */ 1521 U_CAPI void U_EXPORT2 1522 uregex_setMatchCallback(URegularExpression *regexp, 1523 URegexMatchCallback *callback, 1524 const void *context, 1525 UErrorCode *status); 1526 1527 1528 /** 1529 * Get the callback function for this URegularExpression. 1530 * 1531 * @param regexp The compiled regular expression. 1532 * @param callback Out parameter, receives a pointer to the user-supplied 1533 * callback function. 1534 * @param context Out parameter, receives the user context pointer that 1535 * was set when uregex_setMatchCallback() was called. 1536 * @param status A reference to a UErrorCode to receive any errors. 1537 * @stable ICU 4.0 1538 */ 1539 U_CAPI void U_EXPORT2 1540 uregex_getMatchCallback(const URegularExpression *regexp, 1541 URegexMatchCallback **callback, 1542 const void **context, 1543 UErrorCode *status); 1544 1545 /** 1546 * Function pointer for a regular expression find callback function. 1547 * 1548 * When set, a callback function will be called during a find operation 1549 * and for operations that depend on find, such as findNext, split and some replace 1550 * operations like replaceFirst. 1551 * The callback will usually be called after each attempt at a match, but this is not a 1552 * guarantee that the callback will be invoked at each character. 
For finds where the 1553 * match engine is invoked at each character, this may be close to true, but less likely 1554 * for more optimized loops where the pattern is known to only start, and the match 1555 * engine invoked, at certain characters. 1556 * When invoked, this callback will specify the index at which a match operation is about 1557 * to be attempted, giving the application the opportunity to terminate a long-running 1558 * find operation. 1559 * 1560 * If the call back function returns false, the find operation will be terminated early. 1561 * 1562 * Note: the callback function must not call other functions on this 1563 * URegularExpression 1564 * 1565 * @param context context pointer. The callback function will be invoked 1566 * with the context specified at the time that 1567 * uregex_setFindProgressCallback() is called. 1568 * @param matchIndex the next index at which a match attempt will be attempted for this 1569 * find operation. If this callback interrupts the search, this is the 1570 * index at which a find/findNext operation may be re-initiated. 1571 * @return true to continue the matching operation. 1572 * false to terminate the matching operation. 1573 * @stable ICU 4.6 1574 */ 1575 U_CDECL_BEGIN 1576 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1577 const void *context, 1578 int64_t matchIndex); 1579 U_CDECL_END 1580 1581 1582 /** 1583 * Set the find progress callback function for this URegularExpression. 1584 * 1585 * @param regexp The compiled regular expression. 1586 * @param callback A pointer to the user-supplied callback function. 1587 * @param context User context pointer. The value supplied at the 1588 * time the callback function is set will be saved 1589 * and passed to the callback each time that it is called. 1590 * @param status A reference to a UErrorCode to receive any errors. 
1591 * @stable ICU 4.6 1592 */ 1593 U_CAPI void U_EXPORT2 1594 uregex_setFindProgressCallback(URegularExpression *regexp, 1595 URegexFindProgressCallback *callback, 1596 const void *context, 1597 UErrorCode *status); 1598 1599 /** 1600 * Get the find progress callback function for this URegularExpression. 1601 * 1602 * @param regexp The compiled regular expression. 1603 * @param callback Out parameter, receives a pointer to the user-supplied 1604 * callback function. 1605 * @param context Out parameter, receives the user context pointer that 1606 * was set when uregex_setFindProgressCallback() was called. 1607 * @param status A reference to a UErrorCode to receive any errors. 1608 * @stable ICU 4.6 1609 */ 1610 U_CAPI void U_EXPORT2 1611 uregex_getFindProgressCallback(const URegularExpression *regexp, 1612 URegexFindProgressCallback **callback, 1613 const void **context, 1614 UErrorCode *status); 1615 1616 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1617 #endif /* UREGEX_H */