libstdc++
regex_compiler.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010-2026 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_compiler.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 _GLIBCXX_BEGIN_NAMESPACE_CXX11
35 
36  template<typename>
37  class regex_traits;
38 
39 _GLIBCXX_END_NAMESPACE_CXX11
40 
41 #pragma GCC diagnostic push
42 #pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
43 namespace __detail
44 {
45  /**
46  * @addtogroup regex-detail
47  * @{
48  */
49 
50  template<typename, bool, bool>
51  struct _BracketMatcher;
52 
53  /**
54  * @brief Builds an NFA from an input iterator range.
55  *
56  * The %_TraitsT type should fulfill the requirements in C++11 28.3 [re.req].
57  */
58  template<typename _TraitsT>
59  class _Compiler
60  {
61  public:
62  typedef typename _TraitsT::char_type _CharT;
63  typedef _NFA<_TraitsT> _RegexT;
65 
66  _Compiler(const _CharT* __b, const _CharT* __e,
67  const typename _TraitsT::locale_type& __traits, _FlagT __flags);
68 
70  _M_get_nfa() noexcept
71  { return std::move(_M_nfa); }
72 
73  private:
74  typedef _Scanner<_CharT> _ScannerT;
75  typedef typename _TraitsT::string_type _StringT;
76  typedef typename _ScannerT::_TokenT _TokenT;
80 
81  // accepts a specific token or returns false.
82  bool
83  _M_match_token(_TokenT __token);
84 
85  void
86  _M_disjunction();
87 
88  void
89  _M_alternative();
90 
91  bool
92  _M_term();
93 
94  bool
95  _M_assertion();
96 
97  bool
98  _M_quantifier();
99 
100  bool
101  _M_atom();
102 
103  bool
104  _M_bracket_expression();
105 
106  template<bool __icase, bool __collate>
107  void
108  _M_insert_any_matcher_ecma();
109 
110  template<bool __icase, bool __collate>
111  void
112  _M_insert_any_matcher_posix();
113 
114  template<bool __icase, bool __collate>
115  void
116  _M_insert_char_matcher();
117 
118  template<bool __icase, bool __collate>
119  void
120  _M_insert_character_class_matcher();
121 
122  template<bool __icase, bool __collate>
123  void
124  _M_insert_bracket_matcher(bool __neg);
125 
126  // Cache of the last atom seen in a bracketed range expression.
127  struct _BracketState
128  {
129  enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None;
130  _CharT _M_char = _CharT();
131 
132  void
133  set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; }
134 
135  _GLIBCXX_NODISCARD _CharT
136  get() const noexcept { return _M_char; }
137 
138  void
139  reset(_Type __t = _Type::_None) noexcept { _M_type = __t; }
140 
141  explicit operator bool() const noexcept
142  { return _M_type != _Type::_None; }
143 
144  // Previous token was a single character.
145  _GLIBCXX_NODISCARD bool
146  _M_is_char() const noexcept { return _M_type == _Type::_Char; }
147 
148  // Previous token was a character class, equivalent class,
149  // collating symbol etc.
150  _GLIBCXX_NODISCARD bool
151  _M_is_class() const noexcept { return _M_type == _Type::_Class; }
152  };
153 
154  template<bool __icase, bool __collate>
155  using _BracketMatcher
157 
158  // Returns true if successfully parsed one term and should continue
159  // compiling a bracket expression.
160  // Returns false if the compiler should move on.
161  template<bool __icase, bool __collate>
162  bool
163  _M_expression_term(_BracketState& __last_char,
165 
166  int
167  _M_cur_int_value(int __radix);
168 
169  bool
170  _M_try_char();
171 
172  _StateSeqT
173  _M_pop()
174  {
175  auto ret = _M_stack.top();
176  _M_stack.pop();
177  return ret;
178  }
179 
180  static _FlagT
181  _S_validate(_FlagT __f)
182  {
183  using namespace regex_constants;
184  switch (__f & (ECMAScript|basic|extended|awk|grep|egrep))
185  {
186  case ECMAScript:
187  case basic:
188  case extended:
189  case awk:
190  case grep:
191  case egrep:
192  return __f;
193  case _FlagT(0):
194  return __f | ECMAScript;
195  default:
196  std::__throw_regex_error(_S_grammar, "conflicting grammar options");
197  }
198  }
199 
200  _FlagT _M_flags;
201  _ScannerT _M_scanner;
202  shared_ptr<_RegexT> _M_nfa;
203  _StringT _M_value;
204  _StackT _M_stack;
205  const _TraitsT& _M_traits;
206  const _CtypeT& _M_ctype;
207  };
208 
209  // C++11 28.13 [re.grammar] p14
210  template<typename _TraitsT, bool __icase, bool __collate>
211  class _RegexTranslatorBase
212  {
213  public:
214  typedef typename _TraitsT::char_type _CharT;
215  typedef typename _TraitsT::string_type _StringT;
216  typedef _StringT _StrTransT;
217 
218  explicit
219  _RegexTranslatorBase(const _TraitsT& __traits)
220  : _M_traits(__traits)
221  { }
222 
223  _CharT
224  _M_translate(_CharT __ch) const
225  {
226  if constexpr (__icase)
227  return _M_traits.translate_nocase(__ch);
228  else if constexpr (__collate)
229  return _M_traits.translate(__ch);
230  else
231  return __ch;
232  }
233 
234  _StrTransT
235  _M_transform(_CharT __ch) const
236  {
237  _StrTransT __str(1, __ch);
238  return _M_traits.transform(__str.begin(), __str.end());
239  }
240 
241  // See LWG 523. It's not efficiently implementable when _TraitsT is not
242  // std::regex_traits<>, and __collate is true. See specializations for
243  // implementations of other cases.
244  bool
245  _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
246  const _StrTransT& __s) const
247  { return __first <= __s && __s <= __last; }
248 
249  protected:
250  bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
251  {
252  typedef std::ctype<_CharT> __ctype_type;
253  const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
254  auto __lower = __fctyp.tolower(__ch);
255  auto __upper = __fctyp.toupper(__ch);
256  return (__first <= __lower && __lower <= __last)
257  || (__first <= __upper && __upper <= __last);
258  }
259 
260  const _TraitsT& _M_traits;
261  };
262 
263  template<typename _TraitsT, bool __icase, bool __collate>
264  class _RegexTranslator
265  : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
266  {
267  public:
268  typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
269  using _Base::_Base;
270  };
271 
272  template<typename _TraitsT, bool __icase>
273  class _RegexTranslator<_TraitsT, __icase, false>
274  : public _RegexTranslatorBase<_TraitsT, __icase, false>
275  {
276  public:
277  typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
278  typedef typename _Base::_CharT _CharT;
279  typedef _CharT _StrTransT;
280 
281  using _Base::_Base;
282 
283  _StrTransT
284  _M_transform(_CharT __ch) const
285  { return __ch; }
286 
287  bool
288  _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
289  {
290  if constexpr (!__icase)
291  return __first <= __ch && __ch <= __last;
292  else
293  return this->_M_in_range_icase(__first, __last, __ch);
294  }
295  };
296 
297  template<typename _CharType>
298  class _RegexTranslator<std::regex_traits<_CharType>, true, true>
299  : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
300  {
301  public:
302  typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
303  _Base;
304  typedef typename _Base::_CharT _CharT;
305  typedef typename _Base::_StrTransT _StrTransT;
306 
307  using _Base::_Base;
308 
309  bool
310  _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
311  const _StrTransT& __str) const
312  {
313  __glibcxx_assert(__first.size() == 1);
314  __glibcxx_assert(__last.size() == 1);
315  __glibcxx_assert(__str.size() == 1);
316  return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
317  }
318  };
319 
320  template<typename _TraitsT>
321  class _RegexTranslator<_TraitsT, false, false>
322  {
323  public:
324  typedef typename _TraitsT::char_type _CharT;
325  typedef _CharT _StrTransT;
326 
327  explicit
328  _RegexTranslator(const _TraitsT&)
329  { }
330 
331  _CharT
332  _M_translate(_CharT __ch) const
333  { return __ch; }
334 
335  _StrTransT
336  _M_transform(_CharT __ch) const
337  { return __ch; }
338 
339  bool
340  _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
341  { return __first <= __ch && __ch <= __last; }
342  };
343 
344  template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
345  struct _AnyMatcher;
346 
347  template<typename _TraitsT, bool __icase, bool __collate>
348  struct _AnyMatcher<_TraitsT, false, __icase, __collate>
349  {
350  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
351  typedef typename _TransT::_CharT _CharT;
352 
353  explicit
354  _AnyMatcher(const _TraitsT& __traits)
355  : _M_translator(__traits)
356  { }
357 
358  bool
359  operator()(_CharT __ch) const
360  {
361  static auto __nul = _M_translator._M_translate('\0');
362  return _M_translator._M_translate(__ch) != __nul;
363  }
364 
365  _TransT _M_translator;
366  };
367 
368  template<typename _TraitsT, bool __icase, bool __collate>
369  struct _AnyMatcher<_TraitsT, true, __icase, __collate>
370  {
371  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
372  typedef typename _TransT::_CharT _CharT;
373 
374  explicit
375  _AnyMatcher(const _TraitsT& __traits)
376  : _M_translator(__traits)
377  { }
378 
379  bool
380  operator()(_CharT __ch) const
381  {
382  const auto __c = _M_translator._M_translate(__ch);
383  if (__c == _M_translator._M_translate('\n'))
384  return false;
385  if (__c == _M_translator._M_translate('\r'))
386  return false;
387  if constexpr (!is_same<_CharT, char>::value)
388  {
389  if (__c == _M_translator._M_translate(u'\u2028')) // line sep
390  return false;
391  if (__c == _M_translator._M_translate(u'\u2029')) // para sep
392  return false;
393  }
394  return true;
395  }
396 
397  _TransT _M_translator;
398  };
399 
400  template<typename _TraitsT, bool __icase, bool __collate>
401  struct _CharMatcher
402  {
403  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
404  typedef typename _TransT::_CharT _CharT;
405 
406  _CharMatcher(_CharT __ch, const _TraitsT& __traits)
407  : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
408  { }
409 
410  bool
411  operator()(_CharT __ch) const
412  { return _M_ch == _M_translator._M_translate(__ch); }
413 
414  _TransT _M_translator;
415  _CharT _M_ch;
416  };
417 
418  /// Matches a character range (bracket expression)
419  template<typename _TraitsT, bool __icase, bool __collate>
421  {
422  public:
423  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
424  typedef typename _TransT::_CharT _CharT;
425  typedef typename _TransT::_StrTransT _StrTransT;
426  typedef typename _TraitsT::string_type _StringT;
427  typedef typename _TraitsT::char_class_type _CharClassT;
428 
429  public:
430  _BracketMatcher(bool __is_non_matching,
431  const _TraitsT& __traits)
432  : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
433  _M_is_non_matching(__is_non_matching)
434  { }
435 
436  bool
437  operator()(_CharT __ch) const
438  {
439  _GLIBCXX_DEBUG_ASSERT(_M_is_ready);
440  if constexpr (_UseCache::value)
441  if (!(__ch & 0x80)) [[__likely__]]
442  return _M_cache[static_cast<_UnsignedCharT>(__ch)];
443  return _M_apply(__ch);
444  }
445 
446  void
447  _M_add_char(_CharT __c)
448  {
449  _M_char_set.push_back(_M_translator._M_translate(__c));
450  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
451  }
452 
453  _StringT
454  _M_add_collate_element(const _StringT& __s)
455  {
456  auto __st = _M_traits.lookup_collatename(__s.data(),
457  __s.data() + __s.size());
458  if (__st.empty())
459  __throw_regex_error(regex_constants::error_collate,
460  "Invalid collate element.");
461  _M_char_set.push_back(_M_translator._M_translate(__st[0]));
462  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
463  return __st;
464  }
465 
466  void
467  _M_add_equivalence_class(const _StringT& __s)
468  {
469  auto __st = _M_traits.lookup_collatename(__s.data(),
470  __s.data() + __s.size());
471  if (__st.empty())
472  __throw_regex_error(regex_constants::error_collate,
473  "Invalid equivalence class.");
474  __st = _M_traits.transform_primary(__st.data(),
475  __st.data() + __st.size());
476  _M_equiv_set.push_back(__st);
477  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
478  }
479 
480  // __neg should be true for \D, \S and \W only.
481  void
482  _M_add_character_class(const _StringT& __s, bool __neg)
483  {
484  auto __mask = _M_traits.lookup_classname(__s.data(),
485  __s.data() + __s.size(),
486  __icase);
487  if (__mask == 0)
488  __throw_regex_error(regex_constants::error_collate,
489  "Invalid character class.");
490  if (!__neg)
491  _M_class_set |= __mask;
492  else
493  _M_neg_class_set.push_back(__mask);
494  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
495  }
496 
497  void
498  _M_make_range(_CharT __l, _CharT __r)
499  {
500  if (__l > __r)
501  __throw_regex_error(regex_constants::error_range,
502  "Invalid range in bracket expression.");
503  _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
504  _M_translator._M_transform(__r)));
505  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
506  }
507 
508  void
509  _M_ready()
510  {
511  std::sort(_M_char_set.begin(), _M_char_set.end());
512  auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
513  _M_char_set.erase(__end, _M_char_set.end());
514  if constexpr (_UseCache::value)
515  for (unsigned __i = 0; __i < 128; __i++) // Only cache 7-bit chars
516  _M_cache[__i] = _M_apply(static_cast<_CharT>(__i));
517  _GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
518  }
519 
520  private:
521  // Currently we only use the cache for char
522  using _UseCache = typename std::is_same<_CharT, char>::type;
523 
524  static constexpr size_t
525  _S_cache_size =
526  1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
527 
528  struct _Dummy { };
529  using _CacheT = std::__conditional_t<_UseCache::value,
531  _Dummy>;
532  using _UnsignedCharT = typename std::make_unsigned<_CharT>::type;
533 
534  bool
535  _M_apply(_CharT __ch) const;
536 
537  private:
538  _GLIBCXX_STD_C::vector<_CharT> _M_char_set;
539  _GLIBCXX_STD_C::vector<_StringT> _M_equiv_set;
540  _GLIBCXX_STD_C::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
541  _GLIBCXX_STD_C::vector<_CharClassT> _M_neg_class_set;
542  _CharClassT _M_class_set;
543  _TransT _M_translator;
544  const _TraitsT& _M_traits;
545  bool _M_is_non_matching;
546  _CacheT _M_cache;
547 #ifdef _GLIBCXX_DEBUG
548  bool _M_is_ready = false;
549 #endif
550  };
551 
552  ///@} regex-detail
553 } // namespace __detail
554 #pragma GCC diagnostic pop
555 _GLIBCXX_END_NAMESPACE_VERSION
556 } // namespace std
557 
558 #include <bits/regex_compiler.tcc>
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition: move.h:138
ISO C++ entities toplevel namespace is std.
constexpr error_type error_range(_S_error_range)
constexpr syntax_option_type ECMAScript
constexpr syntax_option_type egrep
syntax_option_type
This is a bitmask type indicating how to interpret the regex.
constexpr syntax_option_type awk
constexpr syntax_option_type extended
constexpr syntax_option_type basic
constexpr error_type error_collate(_S_error_collate)
constexpr syntax_option_type grep
The bitset class represents a fixed-size sequence of bits.
Definition: bitset:822
is_same
Definition: type_traits:1623
Primary class template ctype facet.
Describes a sequence of one or more _State, its current start and end(s). This structure contains fra...
Matches a character range (bracket expression)
Builds an NFA from an input iterator range.
Scans an input range for regex tokens.
A smart pointer with reference-counted copy semantics.
A standard container made up of unique keys, which can be retrieved in logarithmic time.
Definition: stl_set.h:101
void pop()
Removes first element.
Definition: stl_stack.h:331
reference top()
Definition: stl_stack.h:258