Where Online Learning is simpler!

The C and C++ Include Header Files

/usr/include/c++/11/bits/regex_scanner.tcc


$ cat -n /usr/include/c++/11/bits/regex_scanner.tcc

     1	// class template regex -*- C++ -*-
     2	
     3	// Copyright (C) 2013-2021 Free Software Foundation, Inc.
     4	//
     5	// This file is part of the GNU ISO C++ Library.  This library is free
     6	// software; you can redistribute it and/or modify it under the
     7	// terms of the GNU General Public License as published by the
     8	// Free Software Foundation; either version 3, or (at your option)
     9	// any later version.
    10	
    11	// This library is distributed in the hope that it will be useful,
    12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
    13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14	// GNU General Public License for more details.
    15	
    16	// Under Section 7 of GPL version 3, you are granted additional
    17	// permissions described in the GCC Runtime Library Exception, version
    18	// 3.1, as published by the Free Software Foundation.
    19	
    20	// You should have received a copy of the GNU General Public License and
    21	// a copy of the GCC Runtime Library Exception along with this program;
    22	// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    23	// <http://www.gnu.org/licenses/>.
    24	
    25	/**
    26	 *  @file bits/regex_scanner.tcc
    27	 *  This is an internal header file, included by other library headers.
    28	 *  Do not attempt to use it directly. @headername{regex}
    29	 */
    30	
    31	// FIXME make comments doxygen format.
    32	
    33	// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
    34	// and awk
    35	// 1) grep is basic except '\n' is treated as '|'
    36	// 2) egrep is extended except '\n' is treated as '|'
    37	// 3) awk is extended except that it has special escaping rules and no
    38	//    back-references.
    39	//
    40	// References:
    41	//
    42	// ECMAScript: ECMA-262 15.10
    43	//
    44	// basic, extended:
    45	// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
    46	//
    47	// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
    48	
    49	namespace std _GLIBCXX_VISIBILITY(default)
    50	{
    51	_GLIBCXX_BEGIN_NAMESPACE_VERSION
    52	
    53	namespace __detail
    54	{
    55	  template<typename _CharT>
      	    // Constructor: prepares scanning of the pattern [__begin, __end).
      	    //   __flags  is forwarded to the _ScannerBase base class.
      	    //   __loc    supplies the _CtypeT facet kept in _M_ctype.
      	    // The escape handler _M_eat_escape is selected once, here:
      	    // _M_eat_escape_ecma when _M_is_ecma() is true, otherwise
      	    // _M_eat_escape_posix.
    56	    _Scanner<_CharT>::
    57	    _Scanner(const _CharT* __begin, const _CharT* __end,
    58		     _FlagT __flags, std::locale __loc)
    59	    : _ScannerBase(__flags),
    60	      _M_current(__begin), _M_end(__end),
    61	      _M_ctype(std::use_facet<_CtypeT>(__loc)),
    62	      _M_eat_escape(_M_is_ecma()
    63			    ? &_Scanner::_M_eat_escape_ecma
    64			    : &_Scanner::_M_eat_escape_posix)
      	    // Scan the first token immediately.
    65	    { _M_advance(); }
    66	
      	  // Scans the next token.  Once the input is exhausted the token is
      	  // _S_token_eof; otherwise the work is handed to the scan routine
      	  // that matches the current scanner state.
    67	  template<typename _CharT>
    68	    void
    69	    _Scanner<_CharT>::
    70	    _M_advance()
    71	    {
    72	      if (_M_current == _M_end)
      		_M_token = _S_token_eof;
      	      else if (_M_state == _S_state_normal)
    79		_M_scan_normal();
    80	      else if (_M_state == _S_state_in_bracket)
    81		_M_scan_in_bracket();
    82	      else if (_M_state == _S_state_in_brace)
    83		_M_scan_in_brace();
    84	      else
    85		{
      		  // No other scanner state is handled here.
    86		  __glibcxx_assert(false);
    87		}
    88	    }
    89	
    90	  // Differences between styles:
    91	  // 1) "\(", "\)", "\{" in basic. It's not escaping.
    92	  // 2) "(?:", "(?=", "(?!" in ECMAScript.
      	  //
      	  // Scans one token in the normal state (outside any bracket or brace
      	  // expression).  _M_advance only calls this when _M_current != _M_end.
      	  // Consumes at least one character, sets _M_token and, for tokens
      	  // that carry text, _M_value.  May switch _M_state to
      	  // _S_state_in_bracket or _S_state_in_brace.
      	  //
      	  // Errors are reported through __throw_regex_error with
      	  //  - error_escape for a backslash that ends the pattern,
      	  //  - error_paren  for "(?" that ends the pattern or is followed by
      	  //    anything other than ':', '=' or '!',
      	  //  - _S_null      for a NUL character in a non-ECMAScript pattern.
    93	  template<typename _CharT>
    94	    void
    95	    _Scanner<_CharT>::
    96	    _M_scan_normal()
    97	    {
    98	      auto __c = *_M_current++;
    99	
      	      // Anything that is not listed in _M_spec_char is an ordinary
      	      // character (narrow() yields ' ' when there is no narrow form).
   100	      if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
   101		{
   102		  _M_token = _S_token_ord_char;
   103		  _M_value.assign(1, __c);
   104		  return;
   105		}
   106	      if (__c == '\\')
   107		{
   108		  if (_M_current == _M_end)
   109		    __throw_regex_error(
   110		      regex_constants::error_escape,
   111		      "Unexpected end of regex when escaping.");
   112	
      		  // Outside basic style, and in basic style unless the next
      		  // character is '(', ')' or '{', the backslash starts an
      		  // escape sequence handled by _M_eat_escape.
   113		  if (!_M_is_basic()
   114		      || (*_M_current != '('
   115			  && *_M_current != ')'
   116			  && *_M_current != '{'))
   117		    {
   118		      (this->*_M_eat_escape)();
   119		      return;
   120		    }
      		  // Basic style "\(", "\)" or "\{": drop the backslash and let
      		  // the branches below treat the character as special.
   121		  __c = *_M_current++;
   122		}
   123	      if (__c == '(')
   124		{
      		  // _M_current may already equal _M_end here (the pattern ends
      		  // in '('), so check before peeking at the next character.
      		  if (_M_is_ecma() && _M_current != _M_end && *_M_current == '?')
   126		    {
   127		      if (++_M_current == _M_end)
   128			__throw_regex_error(
   129			  regex_constants::error_paren,
   130			  "Unexpected end of regex when in an open parenthesis.");
   131	
   132		      if (*_M_current == ':')
   133			{
   134			  ++_M_current;
   135			  _M_token = _S_token_subexpr_no_group_begin;
   136			}
   137		      else if (*_M_current == '=')
   138			{
      			  // "(?=": lookahead, marked 'p' in _M_value.
   139			  ++_M_current;
   140			  _M_token = _S_token_subexpr_lookahead_begin;
   141			  _M_value.assign(1, 'p');
   142			}
   143		      else if (*_M_current == '!')
   144			{
      			  // "(?!": lookahead, marked 'n' in _M_value.
   145			  ++_M_current;
   146			  _M_token = _S_token_subexpr_lookahead_begin;
   147			  _M_value.assign(1, 'n');
   148			}
   149		      else
   150			__throw_regex_error(
   151			  regex_constants::error_paren,
   152			  "Invalid special open parenthesis.");
   153		    }
      		  // With the nosubs flag a plain '(' opens a non-grouping
      		  // subexpression.
   154		  else if (_M_flags & regex_constants::nosubs)
   155		    _M_token = _S_token_subexpr_no_group_begin;
   156		  else
   157		    _M_token = _S_token_subexpr_begin;
   158		}
   159	      else if (__c == ')')
   160		_M_token = _S_token_subexpr_end;
   161	      else if (__c == '[')
   162		{
   163		  _M_state = _S_state_in_bracket;
   164		  _M_at_bracket_start = true;
   165		  if (_M_current != _M_end && *_M_current == '^')
   166		    {
   167		      _M_token = _S_token_bracket_neg_begin;
   168		      ++_M_current;
   169		    }
   170		  else
   171		    _M_token = _S_token_bracket_begin;
   172		}
   173	      else if (__c == '{')
   174		{
   175		  _M_state = _S_state_in_brace;
   176		  _M_token = _S_token_interval_begin;
   177		}
   178	      else if (__builtin_expect(__c == _CharT(0), false))
   179		{
      		  // A NUL character is only accepted in ECMAScript mode.
   180		  if (!_M_is_ecma())
   181		    {
   182		      __throw_regex_error(regex_constants::_S_null,
   183			  "Unexpected null character in regular expression");
   184		    }
   185		  _M_token = _S_token_ord_char;
   186		  _M_value.assign(1, __c);
   187		}
   188	      else if (__c != ']' && __c != '}')
   189		{
      		  // Look the character up in _M_token_tbl; the table is walked
      		  // until an entry whose first member is '\0'.
   190		  auto __it = _M_token_tbl;
   191		  auto __narrowc = _M_ctype.narrow(__c, '\0');
   192		  for (; __it->first != '\0'; ++__it)
   193		    if (__it->first == __narrowc)
   194		      {
   195			_M_token = __it->second;
   196			return;
   197		      }
   198		  __glibcxx_assert(false);
   199		}
   200	      else
   201		{
      		  // A ']' or '}' met in the normal state is an ordinary
      		  // character.
   202		  _M_token = _S_token_ord_char;
   203		  _M_value.assign(1, __c);
   204		}
   205	    }
   206	
   207	  // Differences between styles:
   208	  // 1) different semantics of "[]" and "[^]".
   209	  // 2) Escaping in bracket expr.
      	  //
      	  // Scans one token inside a bracket expression.  Sets _M_token (and
      	  // _M_value for tokens that carry text) and switches back to
      	  // _S_state_normal on the closing ']'.  Running out of input is
      	  // reported through __throw_regex_error with error_brack.
   210	  template<typename _CharT>
   211	    void
   212	    _Scanner<_CharT>::
   213	    _M_scan_in_bracket()
   214	    {
   215	      if (_M_current == _M_end)
   216		__throw_regex_error(
   217		  regex_constants::error_brack,
   218		  "Unexpected end of regex when in bracket expression.");
   219	
   220	      auto __c = *_M_current++;
   221	
   222	      if (__c == '-')
   223		_M_token = _S_token_bracket_dash;
   224	      else if (__c == '[')
   225		{
   226		  if (_M_current == _M_end)
   227		    __throw_regex_error(regex_constants::error_brack,
   228					"Unexpected character class open bracket.");
   229	
      		  // "[.", "[:" and "[=" open a collating symbol, a character
      		  // class name and an equivalence class name respectively.  The
      		  // delimiter is consumed and passed to _M_eat_class.
   230		  if (*_M_current == '.')
   231		    {
   232		      _M_token = _S_token_collsymbol;
   233		      _M_eat_class(*_M_current++);
   234		    }
   235		  else if (*_M_current == ':')
   236		    {
   237		      _M_token = _S_token_char_class_name;
   238		      _M_eat_class(*_M_current++);
   239		    }
   240		  else if (*_M_current == '=')
   241		    {
   242		      _M_token = _S_token_equiv_class_name;
   243		      _M_eat_class(*_M_current++);
   244		    }
   245		  else
   246		    {
      		      // Any other '[' is an ordinary character; the character
      		      // after it is left for the next call.
   247		      _M_token = _S_token_ord_char;
   248		      _M_value.assign(1, __c);
   249		    }
   250		}
   251	      // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
   252	      // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
   253	      // `*/empty_range.cc`.
   254	      else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
   255		{
   256		  _M_token = _S_token_bracket_end;
   257		  _M_state = _S_state_normal;
   258		}
   259	      // ECMAScript and awk permit escaping in a bracket expression.
   260	      else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
   261		(this->*_M_eat_escape)();
   262	      else
   263		{
   264		  _M_token = _S_token_ord_char;
   265		  _M_value.assign(1, __c);
   266		}
      	      // Whatever was scanned, the next token is no longer the first one
      	      // after the opening bracket.
   267	      _M_at_bracket_start = false;
   268	    }
   269	
   270	  // Differences between styles:
   271	  // 1) "\}" in basic style.
      	  //
      	  // Scans one token inside an interval expression: a run of digits
      	  // becomes _S_token_dup_count (digits in _M_value), ',' becomes
      	  // _S_token_comma, and the closing delimiter ("\}" in basic style,
      	  // "}" otherwise) becomes _S_token_interval_end and puts the scanner
      	  // back into the normal state.  Any other character is reported with
      	  // error_badbrace, running out of input with error_brace.
   272	  template<typename _CharT>
   273	    void
   274	    _Scanner<_CharT>::
   275	    _M_scan_in_brace()
   276	    {
   277	      if (_M_current == _M_end)
   278		__throw_regex_error(
   279		  regex_constants::error_brace,
   280		  "Unexpected end of regex when in brace expression.");
   281	
      	      const auto __ch = *_M_current++;
      	      // A repeat count: collect the whole run of digits.
      	      if (_M_ctype.is(_CtypeT::digit, __ch))
      		{
      		  _M_token = _S_token_dup_count;
      		  _M_value.assign(1, __ch);
      		  for (; _M_current != _M_end
      			 && _M_ctype.is(_CtypeT::digit, *_M_current);
      		       ++_M_current)
      		    _M_value += *_M_current;
      		  return;
      		}
      	      if (__ch == ',')
      		{
      		  _M_token = _S_token_comma;
      		  return;
      		}
      	      // Does __ch (together with the next character in basic style)
      	      // close the interval?
      	      bool __closes;
      	      if (_M_is_basic())
      		{
      		  // basic use \}.
      		  __closes = (__ch == '\\' && _M_current != _M_end
      			      && *_M_current == '}');
      		  if (__closes)
      		    ++_M_current;
      		}
      	      else
      		__closes = (__ch == '}');
      	      if (__closes)
      		{
      		  _M_state = _S_state_normal;
      		  _M_token = _S_token_interval_end;
      		}
      	      else
      		__throw_regex_error(regex_constants::error_badbrace,
      				    "Unexpected character in brace expression.");
   315	    }
   316	
      	  // Scans the rest of an escape sequence in ECMAScript mode; the
      	  // backslash itself has already been consumed by the caller.
      	  // Sets _M_token and _M_value.  The cases are tried in this order:
      	  //  - a character for which _M_find_escape returns non-null becomes
      	  //    _S_token_ord_char holding the character pointed at ('b' takes
      	  //    this route only inside a bracket expression),
      	  //  - 'b' / 'B' become _S_token_word_bound with value 'p' / 'n',
      	  //  - d D s S w W become _S_token_quoted_class,
      	  //  - 'c' consumes one more character and yields it, as written, as
      	  //    _S_token_ord_char,
      	  //  - 'x' / 'u' consume exactly 2 / 4 hex digits and yield them as
      	  //    _S_token_hex_num,
      	  //  - a digit starts a (possibly multi-digit) _S_token_backref,
      	  //  - anything else is an ordinary character.
      	  // Reports error_escape through __throw_regex_error when the pattern
      	  // ends too early or a hex escape lacks the required hex digits.
   317	  template<typename _CharT>
   318	    void
   319	    _Scanner<_CharT>::
   320	    _M_eat_escape_ecma()
   321	    {
   322	      if (_M_current == _M_end)
   323		__throw_regex_error(regex_constants::error_escape,
   324				    "Unexpected end of regex when escaping.");
   325	
   326	      auto __c = *_M_current++;
   327	      auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
   328	
   329	      if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
   330		{
   331		  _M_token = _S_token_ord_char;
   332		  _M_value.assign(1, *__pos);
   333		}
   334	      else if (__c == 'b')
   335		{
   336		  _M_token = _S_token_word_bound;
   337		  _M_value.assign(1, 'p');
   338		}
   339	      else if (__c == 'B')
   340		{
   341		  _M_token = _S_token_word_bound;
   342		  _M_value.assign(1, 'n');
   343		}
   344	      // N3376 28.13
   345	      else if (__c == 'd'
   346		       || __c == 'D'
   347		       || __c == 's'
   348		       || __c == 'S'
   349		       || __c == 'w'
   350		       || __c == 'W')
   351		{
   352		  _M_token = _S_token_quoted_class;
   353		  _M_value.assign(1, __c);
   354		}
   355	      else if (__c == 'c')
   356		{
   357		  if (_M_current == _M_end)
   358		    __throw_regex_error(
   359		      regex_constants::error_escape,
   360		      "Unexpected end of regex when reading control code.");
   361		  _M_token = _S_token_ord_char;
   362		  _M_value.assign(1, *_M_current++);
   363		}
   364	      else if (__c == 'x' || __c == 'u')
   365		{
   366		  _M_value.erase();
      		  // \x takes exactly two hex digits, \u exactly four.
      		  const int __n = __c == 'x' ? 2 : 4;
      		  for (int __i = 0; __i < __n; __i++)
   368		    {
   369		      if (_M_current == _M_end
   370			  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
      			// Reached both when the pattern ends early and when a
      			// non-hex character is found, so the message names the
      			// requirement rather than one of the two causes.
   371			__throw_regex_error(
   372			  regex_constants::error_escape,
      			  __n == 2
      			  ? "Expected 2 hexadecimal digits after '\\x' in regex."
      			  : "Expected 4 hexadecimal digits after '\\u' in regex.");
   374		      _M_value += *_M_current++;
   375		    }
   376		  _M_token = _S_token_hex_num;
   377		}
   378	      // ECMAScript recognizes multi-digit back-references.
   379	      else if (_M_ctype.is(_CtypeT::digit, __c))
   380		{
   381		  _M_value.assign(1, __c);
   382		  while (_M_current != _M_end
   383			 && _M_ctype.is(_CtypeT::digit, *_M_current))
   384		    _M_value += *_M_current++;
   385		  _M_token = _S_token_backref;
   386		}
   387	      else
   388		{
   389		  _M_token = _S_token_ord_char;
   390		  _M_value.assign(1, __c);
   391		}
   392	    }
   393	
   394	  // Differences between styles:
   395	  // 1) Extended doesn't support backref, but basic does.
      	  //
      	  // Scans the character after a backslash for the non-ECMAScript
      	  // styles.  Sets _M_token and _M_value; reports error_escape through
      	  // __throw_regex_error when the pattern ends right after the
      	  // backslash and, under __STRICT_ANSI__, when an ordinary character
      	  // is escaped.
   396	  template<typename _CharT>
   397	    void
   398	    _Scanner<_CharT>::
   399	    _M_eat_escape_posix()
   400	    {
   401	      if (_M_current == _M_end)
   402		__throw_regex_error(regex_constants::error_escape,
   403				    "Unexpected end of regex when escaping.");
   404	
      	      // Only peek here: the character is consumed at the end of this
      	      // function, or by _M_eat_escape_awk when that takes over.
   405	      auto __c = *_M_current;
   406	      auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
   407	
      	      // strchr also matches the terminating '\0' of _M_spec_char, so
      	      // that hit has to be excluded explicitly.
   408	      if (__pos != nullptr && *__pos != '\0')
   409		{
      		  // An escaped special character stands for itself.
   410		  _M_token = _S_token_ord_char;
   411		  _M_value.assign(1, __c);
   412		}
   413	      // We MUST judge awk before handling backrefs. There's no backref in awk.
   414	      else if (_M_is_awk())
   415		{
      		  // _M_eat_escape_awk consumes the character(s) itself, hence the
      		  // return that skips the ++_M_current below.
   416		  _M_eat_escape_awk();
   417		  return;
   418		}
      	      // Basic style: a single non-zero digit is a back-reference.
   419	      else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
   420		{
   421		  _M_token = _S_token_backref;
   422		  _M_value.assign(1, __c);
   423		}
   424	      else
   425		{
   426	#ifdef __STRICT_ANSI__
   427		  // POSIX says it is undefined to escape ordinary characters
   428		  __throw_regex_error(regex_constants::error_escape,
   429				      "Unexpected escape character.");
   430	#else
      		  // Outside strict mode the escaped character is simply taken
      		  // literally.
   431		  _M_token = _S_token_ord_char;
   432		  _M_value.assign(1, __c);
   433	#endif
   434		}
      	      // Consume the character that was peeked at above.
   435	      ++_M_current;
   436	    }
   437	
      	  // Scans the character(s) after a backslash in awk style.  There is
      	  // no end-of-input check here: _M_eat_escape_posix performs it before
      	  // delegating.  A character known to _M_find_escape becomes
      	  // _S_token_ord_char holding the character pointed at; one to three
      	  // digits in the range 0-7 become _S_token_oct_num; anything else is
      	  // reported with error_escape.
   438	  template<typename _CharT>
   439	    void
   440	    _Scanner<_CharT>::
   441	    _M_eat_escape_awk()
   442	    {
      	      const auto __ch = *_M_current++;
      	      const auto __mapped = _M_find_escape(_M_ctype.narrow(__ch, '\0'));
      	      if (__mapped != nullptr)
      		{
      		  _M_token = _S_token_ord_char;
      		  _M_value.assign(1, *__mapped);
      		  return;
      		}
      	      // \ddd for oct representation
      	      const bool __octal = _M_ctype.is(_CtypeT::digit, __ch)
      				   && __ch != '8'
      				   && __ch != '9';
      	      if (!__octal)
      		__throw_regex_error(regex_constants::error_escape,
      				    "Unexpected escape character.");
      	      else
      		{
      		  _M_value.assign(1, __ch);
      		  // At most two further octal digits belong to the escape.
      		  int __extra = 0;
      		  while (__extra < 2
      			 && _M_current != _M_end
      			 && _M_ctype.is(_CtypeT::digit, *_M_current)
      			 && *_M_current != '8'
      			 && *_M_current != '9')
      		    {
      		      _M_value += *_M_current++;
      		      ++__extra;
      		    }
      		  _M_token = _S_token_oct_num;
      		}
   471	    }
   472	
   473	  // Eats a character class or throws an exception.
      	  // __ch is the delimiter that opened the class (':', '.' or '=').  The
      	  // text up to the matching delimiter is stored in _M_value, and on
      	  // return _M_current is the character after the closing ']'.  A
      	  // missing terminator is reported with error_ctype for ':' and with
      	  // error_collate otherwise.
   476	  template<typename _CharT>
   477	    void
   478	    _Scanner<_CharT>::
   479	    _M_eat_class(char __ch)
   480	    {
      	      // Gather the name: everything up to the next __ch, or the end.
      	      _M_value.clear();
      	      while (_M_current != _M_end && *_M_current != __ch)
      		_M_value += *_M_current++;
      	      // The name must be followed by __ch and then ']'; each one is
      	      // consumed as it is tested.
      	      const bool __closed = _M_current != _M_end
      				    && *_M_current++ == __ch
      				    && _M_current != _M_end
      				    && *_M_current++ == ']';
      	      if (__closed)
      		return;
      	      if (__ch == ':')
      		__throw_regex_error(regex_constants::error_ctype,
      				    "Unexpected end of character class.");
      	      else
      		__throw_regex_error(regex_constants::error_collate,
      				    "Unexpected end of character class.");
   495	    }
   496	
   497	#ifdef _GLIBCXX_DEBUG
      	  // Debug-only helper: writes a one-line description of the current
      	  // token to ostr and returns ostr.  Tokens that carry text also print
      	  // _M_value.
   498	  template<typename _CharT>
   499	    std::ostream&
   500	    _Scanner<_CharT>::
   501	    _M_print(std::ostream& ostr)
   502	    {
   503	      switch (_M_token)
   504	      {
   505	      case _S_token_anychar:
      		return ostr << "any-character\n";
   508	      case _S_token_backref:
      		return ostr << "backref\n";
   511	      case _S_token_bracket_begin:
      		return ostr << "bracket-begin\n";
   514	      case _S_token_bracket_neg_begin:
      		return ostr << "bracket-neg-begin\n";
   517	      case _S_token_bracket_end:
      		return ostr << "bracket-end\n";
   520	      case _S_token_char_class_name:
      		return ostr << "char-class-name \"" << _M_value << "\"\n";
   523	      case _S_token_closure0:
      		return ostr << "closure0\n";
   526	      case _S_token_closure1:
      		return ostr << "closure1\n";
   529	      case _S_token_collsymbol:
      		return ostr << "collsymbol \"" << _M_value << "\"\n";
   532	      case _S_token_comma:
      		return ostr << "comma\n";
   535	      case _S_token_dup_count:
      		return ostr << "dup count: " << _M_value << "\n";
   538	      case _S_token_eof:
      		return ostr << "EOF\n";
   541	      case _S_token_equiv_class_name:
      		return ostr << "equiv-class-name \"" << _M_value << "\"\n";
   544	      case _S_token_interval_begin:
      		return ostr << "interval begin\n";
   547	      case _S_token_interval_end:
      		return ostr << "interval end\n";
   550	      case _S_token_line_begin:
      		return ostr << "line begin\n";
   553	      case _S_token_line_end:
      		return ostr << "line end\n";
   556	      case _S_token_opt:
      		return ostr << "opt\n";
   559	      case _S_token_or:
      		return ostr << "or\n";
   562	      case _S_token_ord_char:
      		return ostr << "ordinary character: \"" << _M_value << "\"\n";
   565	      case _S_token_subexpr_begin:
      		return ostr << "subexpr begin\n";
   568	      case _S_token_subexpr_no_group_begin:
      		return ostr << "no grouping subexpr begin\n";
   571	      case _S_token_subexpr_lookahead_begin:
      		return ostr << "lookahead subexpr begin\n";
   574	      case _S_token_subexpr_end:
      		return ostr << "subexpr end\n";
   577	      case _S_token_unknown:
      		return ostr << "-- unknown token --\n";
   580	      case _S_token_oct_num:
      		return ostr << "oct number " << _M_value << "\n";
   583	      case _S_token_hex_num:
      		return ostr << "hex number " << _M_value << "\n";
   586	      case _S_token_quoted_class:
      		return ostr << "quoted class " << "\\" << _M_value << "\n";
   589	      default:
   590		_GLIBCXX_DEBUG_ASSERT(false);
   591	      }
      	      // Only reached for a token value not listed above.
   592	      return ostr;
   593	    }
   594	#endif
   595	
   596	} // namespace __detail
   597	_GLIBCXX_END_NAMESPACE_VERSION
   598	} // namespace