Where Online Learning is simpler!
The C and C++ Include Header Files
/usr/include/c++/11/bits/regex_scanner.tcc
$ cat -n /usr/include/c++/11/bits/regex_scanner.tcc 1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2021 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 //
. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 _GLIBCXX_BEGIN_NAMESPACE_VERSION 52 53 namespace __detail 54 { 55 template
56 _Scanner<_CharT>:: 57 _Scanner(const _CharT* __begin, const _CharT* __end, 58 _FlagT __flags, std::locale __loc) 59 : _ScannerBase(__flags), 60 _M_current(__begin), _M_end(__end), 61 _M_ctype(std::use_facet<_CtypeT>(__loc)), 62 _M_eat_escape(_M_is_ecma() 63 ? &_Scanner::_M_eat_escape_ecma 64 : &_Scanner::_M_eat_escape_posix) 65 { _M_advance(); } 66 67 template
68 void 69 _Scanner<_CharT>:: 70 _M_advance() 71 { 72 if (_M_current == _M_end) 73 { 74 _M_token = _S_token_eof; 75 return; 76 } 77 78 if (_M_state == _S_state_normal) 79 _M_scan_normal(); 80 else if (_M_state == _S_state_in_bracket) 81 _M_scan_in_bracket(); 82 else if (_M_state == _S_state_in_brace) 83 _M_scan_in_brace(); 84 else 85 { 86 __glibcxx_assert(false); 87 } 88 } 89 90 // Differences between styles: 91 // 1) "\(", "\)", "\{" in basic. It's not escaping. 92 // 2) "(?:", "(?=", "(?!" in ECMAScript. 93 template
94 void 95 _Scanner<_CharT>:: 96 _M_scan_normal() 97 { 98 auto __c = *_M_current++; 99 100 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 101 { 102 _M_token = _S_token_ord_char; 103 _M_value.assign(1, __c); 104 return; 105 } 106 if (__c == '\\') 107 { 108 if (_M_current == _M_end) 109 __throw_regex_error( 110 regex_constants::error_escape, 111 "Unexpected end of regex when escaping."); 112 113 if (!_M_is_basic() 114 || (*_M_current != '(' 115 && *_M_current != ')' 116 && *_M_current != '{')) 117 { 118 (this->*_M_eat_escape)(); 119 return; 120 } 121 __c = *_M_current++; 122 } 123 if (__c == '(') 124 { 125 if (_M_is_ecma() && *_M_current == '?') 126 { 127 if (++_M_current == _M_end) 128 __throw_regex_error( 129 regex_constants::error_paren, 130 "Unexpected end of regex when in an open parenthesis."); 131 132 if (*_M_current == ':') 133 { 134 ++_M_current; 135 _M_token = _S_token_subexpr_no_group_begin; 136 } 137 else if (*_M_current == '=') 138 { 139 ++_M_current; 140 _M_token = _S_token_subexpr_lookahead_begin; 141 _M_value.assign(1, 'p'); 142 } 143 else if (*_M_current == '!') 144 { 145 ++_M_current; 146 _M_token = _S_token_subexpr_lookahead_begin; 147 _M_value.assign(1, 'n'); 148 } 149 else 150 __throw_regex_error( 151 regex_constants::error_paren, 152 "Invalid special open parenthesis."); 153 } 154 else if (_M_flags & regex_constants::nosubs) 155 _M_token = _S_token_subexpr_no_group_begin; 156 else 157 _M_token = _S_token_subexpr_begin; 158 } 159 else if (__c == ')') 160 _M_token = _S_token_subexpr_end; 161 else if (__c == '[') 162 { 163 _M_state = _S_state_in_bracket; 164 _M_at_bracket_start = true; 165 if (_M_current != _M_end && *_M_current == '^') 166 { 167 _M_token = _S_token_bracket_neg_begin; 168 ++_M_current; 169 } 170 else 171 _M_token = _S_token_bracket_begin; 172 } 173 else if (__c == '{') 174 { 175 _M_state = _S_state_in_brace; 176 _M_token = _S_token_interval_begin; 177 } 178 else if (__builtin_expect(__c == _CharT(0), false)) 
179 { 180 if (!_M_is_ecma()) 181 { 182 __throw_regex_error(regex_constants::_S_null, 183 "Unexpected null character in regular expression"); 184 } 185 _M_token = _S_token_ord_char; 186 _M_value.assign(1, __c); 187 } 188 else if (__c != ']' && __c != '}') 189 { 190 auto __it = _M_token_tbl; 191 auto __narrowc = _M_ctype.narrow(__c, '\0'); 192 for (; __it->first != '\0'; ++__it) 193 if (__it->first == __narrowc) 194 { 195 _M_token = __it->second; 196 return; 197 } 198 __glibcxx_assert(false); 199 } 200 else 201 { 202 _M_token = _S_token_ord_char; 203 _M_value.assign(1, __c); 204 } 205 } 206 207 // Differences between styles: 208 // 1) different semantics of "[]" and "[^]". 209 // 2) Escaping in bracket expr. 210 template
211 void 212 _Scanner<_CharT>:: 213 _M_scan_in_bracket() 214 { 215 if (_M_current == _M_end) 216 __throw_regex_error( 217 regex_constants::error_brack, 218 "Unexpected end of regex when in bracket expression."); 219 220 auto __c = *_M_current++; 221 222 if (__c == '-') 223 _M_token = _S_token_bracket_dash; 224 else if (__c == '[') 225 { 226 if (_M_current == _M_end) 227 __throw_regex_error(regex_constants::error_brack, 228 "Unexpected character class open bracket."); 229 230 if (*_M_current == '.') 231 { 232 _M_token = _S_token_collsymbol; 233 _M_eat_class(*_M_current++); 234 } 235 else if (*_M_current == ':') 236 { 237 _M_token = _S_token_char_class_name; 238 _M_eat_class(*_M_current++); 239 } 240 else if (*_M_current == '=') 241 { 242 _M_token = _S_token_equiv_class_name; 243 _M_eat_class(*_M_current++); 244 } 245 else 246 { 247 _M_token = _S_token_ord_char; 248 _M_value.assign(1, __c); 249 } 250 } 251 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 252 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 253 // `*/empty_range.cc`. 254 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 255 { 256 _M_token = _S_token_bracket_end; 257 _M_state = _S_state_normal; 258 } 259 // ECMAScript and awk permits escaping in bracket. 260 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 261 (this->*_M_eat_escape)(); 262 else 263 { 264 _M_token = _S_token_ord_char; 265 _M_value.assign(1, __c); 266 } 267 _M_at_bracket_start = false; 268 } 269 270 // Differences between styles: 271 // 1) "\}" in basic style. 272 template
273 void 274 _Scanner<_CharT>:: 275 _M_scan_in_brace() 276 { 277 if (_M_current == _M_end) 278 __throw_regex_error( 279 regex_constants::error_brace, 280 "Unexpected end of regex when in brace expression."); 281 282 auto __c = *_M_current++; 283 284 if (_M_ctype.is(_CtypeT::digit, __c)) 285 { 286 _M_token = _S_token_dup_count; 287 _M_value.assign(1, __c); 288 while (_M_current != _M_end 289 && _M_ctype.is(_CtypeT::digit, *_M_current)) 290 _M_value += *_M_current++; 291 } 292 else if (__c == ',') 293 _M_token = _S_token_comma; 294 // basic use \}. 295 else if (_M_is_basic()) 296 { 297 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 298 { 299 _M_state = _S_state_normal; 300 _M_token = _S_token_interval_end; 301 ++_M_current; 302 } 303 else 304 __throw_regex_error(regex_constants::error_badbrace, 305 "Unexpected character in brace expression."); 306 } 307 else if (__c == '}') 308 { 309 _M_state = _S_state_normal; 310 _M_token = _S_token_interval_end; 311 } 312 else 313 __throw_regex_error(regex_constants::error_badbrace, 314 "Unexpected character in brace expression."); 315 } 316 317 template
318 void 319 _Scanner<_CharT>:: 320 _M_eat_escape_ecma() 321 { 322 if (_M_current == _M_end) 323 __throw_regex_error(regex_constants::error_escape, 324 "Unexpected end of regex when escaping."); 325 326 auto __c = *_M_current++; 327 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 328 329 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 330 { 331 _M_token = _S_token_ord_char; 332 _M_value.assign(1, *__pos); 333 } 334 else if (__c == 'b') 335 { 336 _M_token = _S_token_word_bound; 337 _M_value.assign(1, 'p'); 338 } 339 else if (__c == 'B') 340 { 341 _M_token = _S_token_word_bound; 342 _M_value.assign(1, 'n'); 343 } 344 // N3376 28.13 345 else if (__c == 'd' 346 || __c == 'D' 347 || __c == 's' 348 || __c == 'S' 349 || __c == 'w' 350 || __c == 'W') 351 { 352 _M_token = _S_token_quoted_class; 353 _M_value.assign(1, __c); 354 } 355 else if (__c == 'c') 356 { 357 if (_M_current == _M_end) 358 __throw_regex_error( 359 regex_constants::error_escape, 360 "Unexpected end of regex when reading control code."); 361 _M_token = _S_token_ord_char; 362 _M_value.assign(1, *_M_current++); 363 } 364 else if (__c == 'x' || __c == 'u') 365 { 366 _M_value.erase(); 367 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 368 { 369 if (_M_current == _M_end 370 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 371 __throw_regex_error( 372 regex_constants::error_escape, 373 "Unexpected end of regex when ascii character."); 374 _M_value += *_M_current++; 375 } 376 _M_token = _S_token_hex_num; 377 } 378 // ECMAScript recognizes multi-digit back-references. 
379 else if (_M_ctype.is(_CtypeT::digit, __c)) 380 { 381 _M_value.assign(1, __c); 382 while (_M_current != _M_end 383 && _M_ctype.is(_CtypeT::digit, *_M_current)) 384 _M_value += *_M_current++; 385 _M_token = _S_token_backref; 386 } 387 else 388 { 389 _M_token = _S_token_ord_char; 390 _M_value.assign(1, __c); 391 } 392 } 393 394 // Differences between styles: 395 // 1) Extended doesn't support backref, but basic does. 396 template
397 void 398 _Scanner<_CharT>:: 399 _M_eat_escape_posix() 400 { 401 if (_M_current == _M_end) 402 __throw_regex_error(regex_constants::error_escape, 403 "Unexpected end of regex when escaping."); 404 405 auto __c = *_M_current; 406 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 407 408 if (__pos != nullptr && *__pos != '\0') 409 { 410 _M_token = _S_token_ord_char; 411 _M_value.assign(1, __c); 412 } 413 // We MUST judge awk before handling backrefs. There's no backref in awk. 414 else if (_M_is_awk()) 415 { 416 _M_eat_escape_awk(); 417 return; 418 } 419 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 420 { 421 _M_token = _S_token_backref; 422 _M_value.assign(1, __c); 423 } 424 else 425 { 426 #ifdef __STRICT_ANSI__ 427 // POSIX says it is undefined to escape ordinary characters 428 __throw_regex_error(regex_constants::error_escape, 429 "Unexpected escape character."); 430 #else 431 _M_token = _S_token_ord_char; 432 _M_value.assign(1, __c); 433 #endif 434 } 435 ++_M_current; 436 } 437 438 template
439 void 440 _Scanner<_CharT>:: 441 _M_eat_escape_awk() 442 { 443 auto __c = *_M_current++; 444 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 445 446 if (__pos != nullptr) 447 { 448 _M_token = _S_token_ord_char; 449 _M_value.assign(1, *__pos); 450 } 451 // \ddd for oct representation 452 else if (_M_ctype.is(_CtypeT::digit, __c) 453 && __c != '8' 454 && __c != '9') 455 { 456 _M_value.assign(1, __c); 457 for (int __i = 0; 458 __i < 2 459 && _M_current != _M_end 460 && _M_ctype.is(_CtypeT::digit, *_M_current) 461 && *_M_current != '8' 462 && *_M_current != '9'; 463 __i++) 464 _M_value += *_M_current++; 465 _M_token = _S_token_oct_num; 466 return; 467 } 468 else 469 __throw_regex_error(regex_constants::error_escape, 470 "Unexpected escape character."); 471 } 472 473 // Eats a character class or throws an exception. 474 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 475 // returning. 476 template
477 void 478 _Scanner<_CharT>:: 479 _M_eat_class(char __ch) 480 { 481 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 482 _M_value += *_M_current++; 483 if (_M_current == _M_end 484 || *_M_current++ != __ch 485 || _M_current == _M_end // skip __ch 486 || *_M_current++ != ']') // skip ']' 487 { 488 if (__ch == ':') 489 __throw_regex_error(regex_constants::error_ctype, 490 "Unexpected end of character class."); 491 else 492 __throw_regex_error(regex_constants::error_collate, 493 "Unexpected end of character class."); 494 } 495 } 496 497 #ifdef _GLIBCXX_DEBUG 498 template
499 std::ostream& 500 _Scanner<_CharT>:: 501 _M_print(std::ostream& ostr) 502 { 503 switch (_M_token) 504 { 505 case _S_token_anychar: 506 ostr << "any-character\n"; 507 break; 508 case _S_token_backref: 509 ostr << "backref\n"; 510 break; 511 case _S_token_bracket_begin: 512 ostr << "bracket-begin\n"; 513 break; 514 case _S_token_bracket_neg_begin: 515 ostr << "bracket-neg-begin\n"; 516 break; 517 case _S_token_bracket_end: 518 ostr << "bracket-end\n"; 519 break; 520 case _S_token_char_class_name: 521 ostr << "char-class-name \"" << _M_value << "\"\n"; 522 break; 523 case _S_token_closure0: 524 ostr << "closure0\n"; 525 break; 526 case _S_token_closure1: 527 ostr << "closure1\n"; 528 break; 529 case _S_token_collsymbol: 530 ostr << "collsymbol \"" << _M_value << "\"\n"; 531 break; 532 case _S_token_comma: 533 ostr << "comma\n"; 534 break; 535 case _S_token_dup_count: 536 ostr << "dup count: " << _M_value << "\n"; 537 break; 538 case _S_token_eof: 539 ostr << "EOF\n"; 540 break; 541 case _S_token_equiv_class_name: 542 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 543 break; 544 case _S_token_interval_begin: 545 ostr << "interval begin\n"; 546 break; 547 case _S_token_interval_end: 548 ostr << "interval end\n"; 549 break; 550 case _S_token_line_begin: 551 ostr << "line begin\n"; 552 break; 553 case _S_token_line_end: 554 ostr << "line end\n"; 555 break; 556 case _S_token_opt: 557 ostr << "opt\n"; 558 break; 559 case _S_token_or: 560 ostr << "or\n"; 561 break; 562 case _S_token_ord_char: 563 ostr << "ordinary character: \"" << _M_value << "\"\n"; 564 break; 565 case _S_token_subexpr_begin: 566 ostr << "subexpr begin\n"; 567 break; 568 case _S_token_subexpr_no_group_begin: 569 ostr << "no grouping subexpr begin\n"; 570 break; 571 case _S_token_subexpr_lookahead_begin: 572 ostr << "lookahead subexpr begin\n"; 573 break; 574 case _S_token_subexpr_end: 575 ostr << "subexpr end\n"; 576 break; 577 case _S_token_unknown: 578 ostr << "-- unknown token --\n"; 579 
break; 580 case _S_token_oct_num: 581 ostr << "oct number " << _M_value << "\n"; 582 break; 583 case _S_token_hex_num: 584 ostr << "hex number " << _M_value << "\n"; 585 break; 586 case _S_token_quoted_class: 587 ostr << "quoted class " << "\\" << _M_value << "\n"; 588 break; 589 default: 590 _GLIBCXX_DEBUG_ASSERT(false); 591 } 592 return ostr; 593 } 594 #endif 595 596 } // namespace __detail 597 _GLIBCXX_END_NAMESPACE_VERSION 598 } // namespace
Contact us
|
About us
|
Terms of use
|
Copyright © 2000-2025 MyWebUniversity.com ™