libstdc++
regex_compiler.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010-2021 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_compiler.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 _GLIBCXX_BEGIN_NAMESPACE_CXX11
35 
36  template<typename>
37  class regex_traits;
38 
39 _GLIBCXX_END_NAMESPACE_CXX11
40 
41 namespace __detail
42 {
43  /**
44  * @addtogroup regex-detail
45  * @{
46  */
47 
48  template<typename, bool, bool>
49  struct _BracketMatcher;
50 
51  /**
52  * @brief Builds an NFA from an input iterator range.
53  *
54  * The %_TraitsT type should fulfill requirements [28.3].
55  */
56  template<typename _TraitsT>
57  class _Compiler
58  {
59  public:
60  typedef typename _TraitsT::char_type _CharT;
61  typedef _NFA<_TraitsT> _RegexT;
63 
64  _Compiler(const _CharT* __b, const _CharT* __e,
65  const typename _TraitsT::locale_type& __traits, _FlagT __flags);
66 
68  _M_get_nfa() noexcept
69  { return std::move(_M_nfa); }
70 
71  private:
72  typedef _Scanner<_CharT> _ScannerT;
73  typedef typename _TraitsT::string_type _StringT;
74  typedef typename _ScannerT::_TokenT _TokenT;
78 
79  // accepts a specific token or returns false.
80  bool
81  _M_match_token(_TokenT __token);
82 
83  void
84  _M_disjunction();
85 
86  void
87  _M_alternative();
88 
89  bool
90  _M_term();
91 
92  bool
93  _M_assertion();
94 
95  bool
96  _M_quantifier();
97 
98  bool
99  _M_atom();
100 
101  bool
102  _M_bracket_expression();
103 
104  template<bool __icase, bool __collate>
105  void
106  _M_insert_any_matcher_ecma();
107 
108  template<bool __icase, bool __collate>
109  void
110  _M_insert_any_matcher_posix();
111 
112  template<bool __icase, bool __collate>
113  void
114  _M_insert_char_matcher();
115 
116  template<bool __icase, bool __collate>
117  void
118  _M_insert_character_class_matcher();
119 
120  template<bool __icase, bool __collate>
121  void
122  _M_insert_bracket_matcher(bool __neg);
123 
124  // Returns true if successfully matched one term and should continue.
125  // Returns false if the compiler should move on.
126  template<bool __icase, bool __collate>
127  bool
128  _M_expression_term(pair<bool, _CharT>& __last_char,
130  __matcher);
131 
132  int
133  _M_cur_int_value(int __radix);
134 
135  bool
136  _M_try_char();
137 
138  _StateSeqT
139  _M_pop()
140  {
141  auto ret = _M_stack.top();
142  _M_stack.pop();
143  return ret;
144  }
145 
146  static _FlagT
147  _S_validate(_FlagT __f)
148  {
149  using namespace regex_constants;
150  switch (__f & (ECMAScript|basic|extended|awk|grep|egrep))
151  {
152  case ECMAScript:
153  case basic:
154  case extended:
155  case awk:
156  case grep:
157  case egrep:
158  return __f;
159  case _FlagT(0):
160  return __f | ECMAScript;
161  default:
162  std::__throw_regex_error(_S_grammar, "conflicting grammar options");
163  }
164  }
165 
166  _FlagT _M_flags;
167  _ScannerT _M_scanner;
168  shared_ptr<_RegexT> _M_nfa;
169  _StringT _M_value;
170  _StackT _M_stack;
171  const _TraitsT& _M_traits;
172  const _CtypeT& _M_ctype;
173  };
174 
175  // [28.13.14]
176  template<typename _TraitsT, bool __icase, bool __collate>
177  class _RegexTranslatorBase
178  {
179  public:
180  typedef typename _TraitsT::char_type _CharT;
181  typedef typename _TraitsT::string_type _StringT;
182  typedef _StringT _StrTransT;
183 
184  explicit
185  _RegexTranslatorBase(const _TraitsT& __traits)
186  : _M_traits(__traits)
187  { }
188 
189  _CharT
190  _M_translate(_CharT __ch) const
191  {
192  if (__icase)
193  return _M_traits.translate_nocase(__ch);
194  else if (__collate)
195  return _M_traits.translate(__ch);
196  else
197  return __ch;
198  }
199 
200  _StrTransT
201  _M_transform(_CharT __ch) const
202  {
203  _StrTransT __str(1, __ch);
204  return _M_traits.transform(__str.begin(), __str.end());
205  }
206 
207  // See LWG 523. It's not efficiently implementable when _TraitsT is not
208  // std::regex_traits<>, and __collate is true. See specializations for
209  // implementations of other cases.
210  bool
211  _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
212  const _StrTransT& __s) const
213  { return __first <= __s && __s <= __last; }
214 
215  protected:
216  bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
217  {
218  typedef std::ctype<_CharT> __ctype_type;
219  const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
220  auto __lower = __fctyp.tolower(__ch);
221  auto __upper = __fctyp.toupper(__ch);
222  return (__first <= __lower && __lower <= __last)
223  || (__first <= __upper && __upper <= __last);
224  }
225 
226  const _TraitsT& _M_traits;
227  };
228 
229  template<typename _TraitsT, bool __icase, bool __collate>
230  class _RegexTranslator
231  : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
232  {
233  public:
234  typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
235  using _Base::_Base;
236  };
237 
238  template<typename _TraitsT, bool __icase>
239  class _RegexTranslator<_TraitsT, __icase, false>
240  : public _RegexTranslatorBase<_TraitsT, __icase, false>
241  {
242  public:
243  typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
244  typedef typename _Base::_CharT _CharT;
245  typedef _CharT _StrTransT;
246 
247  using _Base::_Base;
248 
249  _StrTransT
250  _M_transform(_CharT __ch) const
251  { return __ch; }
252 
253  bool
254  _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
255  {
256  if (!__icase)
257  return __first <= __ch && __ch <= __last;
258  return this->_M_in_range_icase(__first, __last, __ch);
259  }
260  };
261 
262  template<typename _CharType>
263  class _RegexTranslator<std::regex_traits<_CharType>, true, true>
264  : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
265  {
266  public:
267  typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
268  _Base;
269  typedef typename _Base::_CharT _CharT;
270  typedef typename _Base::_StrTransT _StrTransT;
271 
272  using _Base::_Base;
273 
274  bool
275  _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
276  const _StrTransT& __str) const
277  {
278  __glibcxx_assert(__first.size() == 1);
279  __glibcxx_assert(__last.size() == 1);
280  __glibcxx_assert(__str.size() == 1);
281  return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
282  }
283  };
284 
285  template<typename _TraitsT>
286  class _RegexTranslator<_TraitsT, false, false>
287  {
288  public:
289  typedef typename _TraitsT::char_type _CharT;
290  typedef _CharT _StrTransT;
291 
292  explicit
293  _RegexTranslator(const _TraitsT&)
294  { }
295 
296  _CharT
297  _M_translate(_CharT __ch) const
298  { return __ch; }
299 
300  _StrTransT
301  _M_transform(_CharT __ch) const
302  { return __ch; }
303 
304  bool
305  _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
306  { return __first <= __ch && __ch <= __last; }
307  };
308 
309  template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
310  struct _AnyMatcher;
311 
312  template<typename _TraitsT, bool __icase, bool __collate>
313  struct _AnyMatcher<_TraitsT, false, __icase, __collate>
314  {
315  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
316  typedef typename _TransT::_CharT _CharT;
317 
318  explicit
319  _AnyMatcher(const _TraitsT& __traits)
320  : _M_translator(__traits)
321  { }
322 
323  bool
324  operator()(_CharT __ch) const
325  {
326  static auto __nul = _M_translator._M_translate('\0');
327  return _M_translator._M_translate(__ch) != __nul;
328  }
329 
330  _TransT _M_translator;
331  };
332 
333  template<typename _TraitsT, bool __icase, bool __collate>
334  struct _AnyMatcher<_TraitsT, true, __icase, __collate>
335  {
336  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
337  typedef typename _TransT::_CharT _CharT;
338 
339  explicit
340  _AnyMatcher(const _TraitsT& __traits)
341  : _M_translator(__traits)
342  { }
343 
344  bool
345  operator()(_CharT __ch) const
346  { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
347 
348  bool
349  _M_apply(_CharT __ch, true_type) const
350  {
351  auto __c = _M_translator._M_translate(__ch);
352  auto __n = _M_translator._M_translate('\n');
353  auto __r = _M_translator._M_translate('\r');
354  return __c != __n && __c != __r;
355  }
356 
357  bool
358  _M_apply(_CharT __ch, false_type) const
359  {
360  auto __c = _M_translator._M_translate(__ch);
361  auto __n = _M_translator._M_translate('\n');
362  auto __r = _M_translator._M_translate('\r');
363  auto __u2028 = _M_translator._M_translate(u'\u2028');
364  auto __u2029 = _M_translator._M_translate(u'\u2029');
365  return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
366  }
367 
368  _TransT _M_translator;
369  };
370 
371  template<typename _TraitsT, bool __icase, bool __collate>
372  struct _CharMatcher
373  {
374  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
375  typedef typename _TransT::_CharT _CharT;
376 
377  _CharMatcher(_CharT __ch, const _TraitsT& __traits)
378  : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
379  { }
380 
381  bool
382  operator()(_CharT __ch) const
383  { return _M_ch == _M_translator._M_translate(__ch); }
384 
385  _TransT _M_translator;
386  _CharT _M_ch;
387  };
388 
389  /// Matches a character range (bracket expression)
390  template<typename _TraitsT, bool __icase, bool __collate>
392  {
393  public:
394  typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
395  typedef typename _TransT::_CharT _CharT;
396  typedef typename _TransT::_StrTransT _StrTransT;
397  typedef typename _TraitsT::string_type _StringT;
398  typedef typename _TraitsT::char_class_type _CharClassT;
399 
400  public:
401  _BracketMatcher(bool __is_non_matching,
402  const _TraitsT& __traits)
403  : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
404  _M_is_non_matching(__is_non_matching)
405  { }
406 
407  bool
408  operator()(_CharT __ch) const
409  {
410  _GLIBCXX_DEBUG_ASSERT(_M_is_ready);
411  return _M_apply(__ch, _UseCache());
412  }
413 
414  void
415  _M_add_char(_CharT __c)
416  {
417  _M_char_set.push_back(_M_translator._M_translate(__c));
418  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
419  }
420 
421  _StringT
422  _M_add_collate_element(const _StringT& __s)
423  {
424  auto __st = _M_traits.lookup_collatename(__s.data(),
425  __s.data() + __s.size());
426  if (__st.empty())
427  __throw_regex_error(regex_constants::error_collate,
428  "Invalid collate element.");
429  _M_char_set.push_back(_M_translator._M_translate(__st[0]));
430  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
431  return __st;
432  }
433 
434  void
435  _M_add_equivalence_class(const _StringT& __s)
436  {
437  auto __st = _M_traits.lookup_collatename(__s.data(),
438  __s.data() + __s.size());
439  if (__st.empty())
440  __throw_regex_error(regex_constants::error_collate,
441  "Invalid equivalence class.");
442  __st = _M_traits.transform_primary(__st.data(),
443  __st.data() + __st.size());
444  _M_equiv_set.push_back(__st);
445  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
446  }
447 
448  // __neg should be true for \D, \S and \W only.
449  void
450  _M_add_character_class(const _StringT& __s, bool __neg)
451  {
452  auto __mask = _M_traits.lookup_classname(__s.data(),
453  __s.data() + __s.size(),
454  __icase);
455  if (__mask == 0)
456  __throw_regex_error(regex_constants::error_collate,
457  "Invalid character class.");
458  if (!__neg)
459  _M_class_set |= __mask;
460  else
461  _M_neg_class_set.push_back(__mask);
462  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
463  }
464 
465  void
466  _M_make_range(_CharT __l, _CharT __r)
467  {
468  if (__l > __r)
469  __throw_regex_error(regex_constants::error_range,
470  "Invalid range in bracket expression.");
471  _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
472  _M_translator._M_transform(__r)));
473  _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
474  }
475 
476  void
477  _M_ready()
478  {
479  std::sort(_M_char_set.begin(), _M_char_set.end());
480  auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
481  _M_char_set.erase(__end, _M_char_set.end());
482  _M_make_cache(_UseCache());
483  _GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
484  }
485 
486  private:
487  // Currently we only use the cache for char
488  using _UseCache = typename std::is_same<_CharT, char>::type;
489 
490  static constexpr size_t
491  _S_cache_size =
492  1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
493 
494  struct _Dummy { };
495  using _CacheT = std::__conditional_t<_UseCache::value,
497  _Dummy>;
498  using _UnsignedCharT = typename std::make_unsigned<_CharT>::type;
499 
500  bool
501  _M_apply(_CharT __ch, false_type) const;
502 
503  bool
504  _M_apply(_CharT __ch, true_type) const
505  { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
506 
507  void
508  _M_make_cache(true_type)
509  {
510  for (unsigned __i = 0; __i < _M_cache.size(); __i++)
511  _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
512  }
513 
514  void
515  _M_make_cache(false_type)
516  { }
517 
518  private:
519  _GLIBCXX_STD_C::vector<_CharT> _M_char_set;
520  _GLIBCXX_STD_C::vector<_StringT> _M_equiv_set;
521  _GLIBCXX_STD_C::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
522  _GLIBCXX_STD_C::vector<_CharClassT> _M_neg_class_set;
523  _CharClassT _M_class_set;
524  _TransT _M_translator;
525  const _TraitsT& _M_traits;
526  bool _M_is_non_matching;
527  _CacheT _M_cache;
528 #ifdef _GLIBCXX_DEBUG
529  bool _M_is_ready = false;
530 #endif
531  };
532 
533  ///@} regex-detail
534 } // namespace __detail
535 _GLIBCXX_END_NAMESPACE_VERSION
536 } // namespace std
537 
538 #include <bits/regex_compiler.tcc>
integral_constant< bool, true > true_type
The type used as a compile-time boolean with true value.
Definition: type_traits:82
integral_constant< bool, false > false_type
The type used as a compile-time boolean with false value.
Definition: type_traits:85
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition: move.h:104
ISO C++ entities toplevel namespace is std.
constexpr error_type error_range(_S_error_range)
constexpr syntax_option_type ECMAScript
constexpr syntax_option_type egrep
syntax_option_type
This is a bitmask type indicating how to interpret the regex.
constexpr syntax_option_type awk
constexpr syntax_option_type extended
constexpr syntax_option_type basic
constexpr error_type error_collate(_S_error_collate)
constexpr syntax_option_type grep
The bitset class represents a fixed-size sequence of bits.
Definition: bitset:753
integral_constant
Definition: type_traits:63
Primary class template ctype facet.
Describes a sequence of one or more _State, its current start and end(s). This structure contains fra...
Matches a character range (bracket expression)
Builds an NFA from an input iterator range.
Scans an input range for regex tokens.
A smart pointer with reference-counted copy semantics.
Struct holding two objects of arbitrary type.
Definition: stl_pair.h:201
void pop()
Removes first element.
Definition: stl_stack.h:293
reference top()
Definition: stl_stack.h:232