Where Online Learning is simpler!
The C and C++ Include Header Files
/usr/include/c++/13/bits/regex_scanner.tcc
$ cat -n /usr/include/c++/13/bits/regex_scanner.tcc 1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2023 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 //
. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 _GLIBCXX_BEGIN_NAMESPACE_VERSION 52 53 namespace __detail 54 { 55 template
56 _Scanner<_CharT>:: 57 _Scanner(const _CharT* __begin, const _CharT* __end, 58 _FlagT __flags, std::locale __loc) 59 : _ScannerBase(__flags), 60 _M_current(__begin), _M_end(__end), 61 _M_ctype(std::use_facet<_CtypeT>(__loc)), 62 _M_eat_escape(_M_is_ecma() 63 ? &_Scanner::_M_eat_escape_ecma 64 : &_Scanner::_M_eat_escape_posix) 65 { _M_advance(); } 66 67 template
68 void 69 _Scanner<_CharT>:: 70 _M_advance() 71 { 72 if (_M_current == _M_end) 73 { 74 _M_token = _S_token_eof; 75 return; 76 } 77 78 if (_M_state == _S_state_normal) 79 _M_scan_normal(); 80 else if (_M_state == _S_state_in_bracket) 81 _M_scan_in_bracket(); 82 else if (_M_state == _S_state_in_brace) 83 _M_scan_in_brace(); 84 else 85 { 86 __glibcxx_assert(!"unexpected state while processing regex"); 87 } 88 } 89 90 // Differences between styles: 91 // 1) "\(", "\)", "\{" in basic. It's not escaping. 92 // 2) "(?:", "(?=", "(?!" in ECMAScript. 93 template
94 void 95 _Scanner<_CharT>:: 96 _M_scan_normal() 97 { 98 auto __c = *_M_current++; 99 100 if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 101 { 102 _M_token = _S_token_ord_char; 103 _M_value.assign(1, __c); 104 return; 105 } 106 if (__c == '\\') 107 { 108 if (_M_current == _M_end) 109 __throw_regex_error( 110 regex_constants::error_escape, 111 "Invalid escape at end of regular expression"); 112 113 if (!_M_is_basic() 114 || (*_M_current != '(' 115 && *_M_current != ')' 116 && *_M_current != '{')) 117 { 118 (this->*_M_eat_escape)(); 119 return; 120 } 121 __c = *_M_current++; 122 } 123 if (__c == '(') 124 { 125 if (_M_is_ecma() && *_M_current == '?') 126 { 127 if (++_M_current == _M_end) 128 __throw_regex_error(regex_constants::error_paren); 129 130 if (*_M_current == ':') 131 { 132 ++_M_current; 133 _M_token = _S_token_subexpr_no_group_begin; 134 } 135 else if (*_M_current == '=') 136 { 137 ++_M_current; 138 _M_token = _S_token_subexpr_lookahead_begin; 139 _M_value.assign(1, 'p'); 140 } 141 else if (*_M_current == '!') 142 { 143 ++_M_current; 144 _M_token = _S_token_subexpr_lookahead_begin; 145 _M_value.assign(1, 'n'); 146 } 147 else 148 __throw_regex_error(regex_constants::error_paren, 149 "Invalid '(?...)' zero-width assertion " 150 "in regular expression"); 151 } 152 else if (_M_flags & regex_constants::nosubs) 153 _M_token = _S_token_subexpr_no_group_begin; 154 else 155 _M_token = _S_token_subexpr_begin; 156 } 157 else if (__c == ')') 158 _M_token = _S_token_subexpr_end; 159 else if (__c == '[') 160 { 161 _M_state = _S_state_in_bracket; 162 _M_at_bracket_start = true; 163 if (_M_current != _M_end && *_M_current == '^') 164 { 165 _M_token = _S_token_bracket_neg_begin; 166 ++_M_current; 167 } 168 else 169 _M_token = _S_token_bracket_begin; 170 } 171 else if (__c == '{') 172 { 173 _M_state = _S_state_in_brace; 174 _M_token = _S_token_interval_begin; 175 } 176 else if (__builtin_expect(__c == _CharT(0), false)) 177 { 178 if (!_M_is_ecma()) 179 __throw_regex_error(regex_constants::_S_null); 180 _M_token = _S_token_ord_char; 181 _M_value.assign(1, __c); 182 } 183 else if (__c != ']' && __c != '}') 184 { 185 auto __it = _M_token_tbl; 186 auto __narrowc = _M_ctype.narrow(__c, '\0'); 187 for (; __it->first != '\0'; ++__it) 188 if (__it->first == __narrowc) 189 { 190 _M_token = __it->second; 191 return; 192 } 193 __glibcxx_assert(!"unexpected special character in regex"); 194 } 195 else 196 { 197 _M_token = _S_token_ord_char; 198 _M_value.assign(1, __c); 199 } 200 } 201 202 // Differences between styles: 203 // 1) different semantics of "[]" and "[^]". 204 // 2) Escaping in bracket expr. 205 template
206 void 207 _Scanner<_CharT>:: 208 _M_scan_in_bracket() 209 { 210 if (_M_current == _M_end) 211 __throw_regex_error(regex_constants::error_brack); 212 213 auto __c = *_M_current++; 214 215 if (__c == '-') 216 _M_token = _S_token_bracket_dash; 217 else if (__c == '[') 218 { 219 if (_M_current == _M_end) 220 __throw_regex_error(regex_constants::error_brack, 221 "Incomplete '[[' character class in " 222 "regular expression"); 223 224 if (*_M_current == '.') 225 { 226 _M_token = _S_token_collsymbol; 227 _M_eat_class(*_M_current++); 228 } 229 else if (*_M_current == ':') 230 { 231 _M_token = _S_token_char_class_name; 232 _M_eat_class(*_M_current++); 233 } 234 else if (*_M_current == '=') 235 { 236 _M_token = _S_token_equiv_class_name; 237 _M_eat_class(*_M_current++); 238 } 239 else 240 { 241 _M_token = _S_token_ord_char; 242 _M_value.assign(1, __c); 243 } 244 } 245 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 246 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 247 // `.../empty_range.cc`. 248 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 249 { 250 _M_token = _S_token_bracket_end; 251 _M_state = _S_state_normal; 252 } 253 // ECMAScript and awk permits escaping in bracket. 254 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 255 (this->*_M_eat_escape)(); 256 else 257 { 258 _M_token = _S_token_ord_char; 259 _M_value.assign(1, __c); 260 } 261 _M_at_bracket_start = false; 262 } 263 264 // Differences between styles: 265 // 1) "\}" in basic style. 266 template
267 void 268 _Scanner<_CharT>:: 269 _M_scan_in_brace() 270 { 271 if (_M_current == _M_end) 272 __throw_regex_error(regex_constants::error_brace); 273 274 auto __c = *_M_current++; 275 276 if (_M_ctype.is(_CtypeT::digit, __c)) 277 { 278 _M_token = _S_token_dup_count; 279 _M_value.assign(1, __c); 280 while (_M_current != _M_end 281 && _M_ctype.is(_CtypeT::digit, *_M_current)) 282 _M_value += *_M_current++; 283 } 284 else if (__c == ',') 285 _M_token = _S_token_comma; 286 // basic use \}. 287 else if (_M_is_basic()) 288 { 289 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 290 { 291 _M_state = _S_state_normal; 292 _M_token = _S_token_interval_end; 293 ++_M_current; 294 } 295 else 296 __throw_regex_error(regex_constants::error_badbrace); 297 } 298 else if (__c == '}') 299 { 300 _M_state = _S_state_normal; 301 _M_token = _S_token_interval_end; 302 } 303 else 304 __throw_regex_error(regex_constants::error_badbrace); 305 } 306 307 template
308 void 309 _Scanner<_CharT>:: 310 _M_eat_escape_ecma() 311 { 312 if (_M_current == _M_end) 313 __throw_regex_error(regex_constants::error_escape); 314 315 auto __c = *_M_current++; 316 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 317 318 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 319 { 320 _M_token = _S_token_ord_char; 321 _M_value.assign(1, *__pos); 322 } 323 else if (__c == 'b') 324 { 325 _M_token = _S_token_word_bound; 326 _M_value.assign(1, 'p'); 327 } 328 else if (__c == 'B') 329 { 330 _M_token = _S_token_word_bound; 331 _M_value.assign(1, 'n'); 332 } 333 // N3376 28.13 334 else if (__c == 'd' 335 || __c == 'D' 336 || __c == 's' 337 || __c == 'S' 338 || __c == 'w' 339 || __c == 'W') 340 { 341 _M_token = _S_token_quoted_class; 342 _M_value.assign(1, __c); 343 } 344 else if (__c == 'c') 345 { 346 if (_M_current == _M_end) 347 __throw_regex_error(regex_constants::error_escape, 348 "invalid '\\cX' control character in " 349 "regular expression"); 350 _M_token = _S_token_ord_char; 351 _M_value.assign(1, *_M_current++); 352 } 353 else if (__c == 'x' || __c == 'u') 354 { 355 _M_value.clear(); 356 const int __n = __c == 'x' ? 2 : 4; 357 for (int __i = 0; __i < __n; __i++) 358 { 359 if (_M_current == _M_end 360 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 361 __throw_regex_error(regex_constants::error_escape, 362 __n == 2 363 ? "Invalid '\\xNN' control character in " 364 "regular expression" 365 : "Invalid '\\uNNNN' control character in " 366 "regular expression"); 367 _M_value += *_M_current++; 368 } 369 _M_token = _S_token_hex_num; 370 } 371 // ECMAScript recognizes multi-digit back-references. 372 else if (_M_ctype.is(_CtypeT::digit, __c)) 373 { 374 _M_value.assign(1, __c); 375 while (_M_current != _M_end 376 && _M_ctype.is(_CtypeT::digit, *_M_current)) 377 _M_value += *_M_current++; 378 _M_token = _S_token_backref; 379 } 380 else 381 { 382 _M_token = _S_token_ord_char; 383 _M_value.assign(1, __c); 384 } 385 } 386 387 // Differences between styles: 388 // 1) Extended doesn't support backref, but basic does. 389 template
390 void 391 _Scanner<_CharT>:: 392 _M_eat_escape_posix() 393 { 394 if (_M_current == _M_end) 395 __throw_regex_error(regex_constants::error_escape); 396 397 auto __c = *_M_current; 398 auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 399 400 if (__pos != nullptr && *__pos != '\0') 401 { 402 _M_token = _S_token_ord_char; 403 _M_value.assign(1, __c); 404 } 405 // We MUST judge awk before handling backrefs. There's no backref in awk. 406 else if (_M_is_awk()) 407 { 408 _M_eat_escape_awk(); 409 return; 410 } 411 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 412 { 413 _M_token = _S_token_backref; 414 _M_value.assign(1, __c); 415 } 416 else 417 { 418 #ifdef __STRICT_ANSI__ 419 // POSIX says it is undefined to escape ordinary characters 420 __throw_regex_error(regex_constants::error_escape); 421 #else 422 _M_token = _S_token_ord_char; 423 _M_value.assign(1, __c); 424 #endif 425 } 426 ++_M_current; 427 } 428 429 template
430 void 431 _Scanner<_CharT>:: 432 _M_eat_escape_awk() 433 { 434 auto __c = *_M_current++; 435 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 436 437 if (__pos != nullptr) 438 { 439 _M_token = _S_token_ord_char; 440 _M_value.assign(1, *__pos); 441 } 442 // \ddd for oct representation 443 else if (_M_ctype.is(_CtypeT::digit, __c) 444 && __c != '8' 445 && __c != '9') 446 { 447 _M_value.assign(1, __c); 448 for (int __i = 0; 449 __i < 2 450 && _M_current != _M_end 451 && _M_ctype.is(_CtypeT::digit, *_M_current) 452 && *_M_current != '8' 453 && *_M_current != '9'; 454 __i++) 455 _M_value += *_M_current++; 456 _M_token = _S_token_oct_num; 457 return; 458 } 459 else 460 __throw_regex_error(regex_constants::error_escape); 461 } 462 463 // Eats a character class or throws an exception. 464 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 465 // returning. 466 template
467 void 468 _Scanner<_CharT>:: 469 _M_eat_class(char __ch) 470 { 471 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 472 _M_value += *_M_current++; 473 if (_M_current == _M_end 474 || *_M_current++ != __ch 475 || _M_current == _M_end // skip __ch 476 || *_M_current++ != ']') // skip ']' 477 { 478 __throw_regex_error(__ch == ':' ? regex_constants::error_ctype 479 : regex_constants::error_collate); 480 } 481 } 482 483 #ifdef _GLIBCXX_DEBUG 484 template
485 std::ostream& 486 _Scanner<_CharT>:: 487 _M_print(std::ostream& __ostr) 488 { 489 switch (_M_token) 490 { 491 case _S_token_anychar: 492 __ostr << "any-character\n"; 493 break; 494 case _S_token_backref: 495 __ostr << "backref\n"; 496 break; 497 case _S_token_bracket_begin: 498 __ostr << "bracket-begin\n"; 499 break; 500 case _S_token_bracket_neg_begin: 501 __ostr << "bracket-neg-begin\n"; 502 break; 503 case _S_token_bracket_end: 504 __ostr << "bracket-end\n"; 505 break; 506 case _S_token_char_class_name: 507 __ostr << "char-class-name \"" << _M_value << "\"\n"; 508 break; 509 case _S_token_closure0: 510 __ostr << "closure0\n"; 511 break; 512 case _S_token_closure1: 513 __ostr << "closure1\n"; 514 break; 515 case _S_token_collsymbol: 516 __ostr << "collsymbol \"" << _M_value << "\"\n"; 517 break; 518 case _S_token_comma: 519 __ostr << "comma\n"; 520 break; 521 case _S_token_dup_count: 522 __ostr << "dup count: " << _M_value << "\n"; 523 break; 524 case _S_token_eof: 525 __ostr << "EOF\n"; 526 break; 527 case _S_token_equiv_class_name: 528 __ostr << "equiv-class-name \"" << _M_value << "\"\n"; 529 break; 530 case _S_token_interval_begin: 531 __ostr << "interval begin\n"; 532 break; 533 case _S_token_interval_end: 534 __ostr << "interval end\n"; 535 break; 536 case _S_token_line_begin: 537 __ostr << "line begin\n"; 538 break; 539 case _S_token_line_end: 540 __ostr << "line end\n"; 541 break; 542 case _S_token_opt: 543 __ostr << "opt\n"; 544 break; 545 case _S_token_or: 546 __ostr << "or\n"; 547 break; 548 case _S_token_ord_char: 549 __ostr << "ordinary character: \"" << _M_value << "\"\n"; 550 break; 551 case _S_token_subexpr_begin: 552 __ostr << "subexpr begin\n"; 553 break; 554 case _S_token_subexpr_no_group_begin: 555 __ostr << "no grouping subexpr begin\n"; 556 break; 557 case _S_token_subexpr_lookahead_begin: 558 __ostr << "lookahead subexpr begin\n"; 559 break; 560 case _S_token_subexpr_end: 561 __ostr << "subexpr end\n"; 562 break; 563 case _S_token_unknown: 564 __ostr << "-- unknown token --\n"; 565 break; 566 case _S_token_oct_num: 567 __ostr << "oct number " << _M_value << "\n"; 568 break; 569 case _S_token_hex_num: 570 __ostr << "hex number " << _M_value << "\n"; 571 break; 572 case _S_token_quoted_class: 573 __ostr << "quoted class " << "\\" << _M_value << "\n"; 574 break; 575 default: 576 _GLIBCXX_DEBUG_ASSERT(false); 577 } 578 return __ostr; 579 } 580 #endif 581 582 } // namespace __detail 583 _GLIBCXX_END_NAMESPACE_VERSION 584 } // namespace
Contact us
|
About us
|
Term of use
|
Copyright © 2000-2025 MyWebUniversity.com ™