libstdc++
unicode.h
Go to the documentation of this file.
1 // Unicode utilities -*- C++ -*-
2 
3 // Copyright The GNU Toolchain Authors.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file include/bits/unicode.h
26  * This is an internal header file, included by other library headers.
27  * Do not attempt to use it directly. @headername{format}
28  */
29 
30 #ifndef _GLIBCXX_UNICODE_H
31 #define _GLIBCXX_UNICODE_H 1
32 
33 #if __cplusplus >= 202002L
34 #include <array>
35 #include <bit> // bit_width
36 #include <charconv> // __detail::__from_chars_alnum_to_val_table
37 #include <string_view>
38 #include <cstdint>
39 #include <bits/stl_algo.h>
40 #include <bits/stl_iterator.h>
41 #include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42 #include <bits/ranges_util.h> // view_interface
43 
44 namespace std _GLIBCXX_VISIBILITY(default)
45 {
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 namespace __unicode
48 {
49  // A Unicode code point that is not a high or low surrogate.
50  constexpr bool
51  __is_scalar_value(char32_t __c)
52  {
53  if (__c < 0xD800) [[likely]]
54  return true;
55  return 0xDFFF < __c && __c <= 0x10FFFF;
56  }
57 
58  // A code point that can be encoded in a single code unit of type _CharT.
59  template<typename _CharT>
60  constexpr bool
61  __is_single_code_unit(char32_t __c)
62  {
63  if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
64  return __c <= 0x7F; // ASCII character
65  else
66  return __c < __gnu_cxx::__int_traits<_CharT>::__max
67  && __is_scalar_value(__c);
68  }
69 
70  // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
71 
72  struct _Repl
73  {
74  constexpr char32_t
75  operator()() const noexcept
76  { return 0xFFFD; }
77  };
78 
79  struct _Null_sentinel_t
80  {
81  template<input_iterator _It>
82  requires default_initializable<iter_value_t<_It>>
83  && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84  friend constexpr auto
85  operator==(_It __it, _Null_sentinel_t)
86  { return *__it == iter_value_t<_It>{}; }
87  };
88 
89  // An iterator over an input range of FromFmt code units that yields either
90  // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
91  // The code units from the input range are interpreted as Unicode code points
92  // and the iterator produces the individual code unit for each code point.
93  // Invalid sequences in the input are replaced with U+FFDD so that the result
94  // is always valid UTF-8, UTF-16, or UTF-32.
95  //
96  // The iterator knows the bounds of the underlying input range and will not
97  // read outside those bounds (incrementing or decrementing at the boundary
98  // is erroneously idempotent).
99  //
100  // On construction, the iterator attemps to decode a single code point from
101  // the input range and then encode it into an internal buffer in the output
102  // format, e.g. if the input is UTF-8 and the output is UTF-16, it might read
103  // three char8_t code units from the input and store two char16_t code units
104  // in its buffer. Incrementing the iterator will first iterate over buffer,
105  // yielding each code unit in turn, and then extract another code point from
106  // the input. Failure to extract a valid code point from the input will store
107  // U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.
108  template<typename _FromFmt, typename _ToFmt,
109  input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
110  typename _ErrorHandler = _Repl>
111  requires convertible_to<iter_value_t<_Iter>, _FromFmt>
112  class _Utf_iterator
113  {
114  static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
115 
116  public:
117  using value_type = _ToFmt;
118  using difference_type = iter_difference_t<_Iter>;
119  using reference = value_type;
120  using iterator_concept
121  = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
122  bidirectional_iterator_tag>;
123 
124  constexpr _Utf_iterator() = default;
125 
126  constexpr
127  _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
128  requires bidirectional_iterator<_Iter>
129  : _M_first_and_curr{__first, __it}, _M_last(__last)
130  {
131  if (_M_curr() != _M_last)
132  _M_read();
133  else
134  _M_buf = {};
135  }
136 
137  constexpr
138  _Utf_iterator(_Iter __it, _Sent __last)
139  requires (!bidirectional_iterator<_Iter>)
140  : _M_first_and_curr{__it}, _M_last(__last)
141  {
142  if (_M_curr() != _M_last)
143  _M_read();
144  else
145  _M_buf = {};
146  }
147 
148  template<class _Iter2, class _Sent2>
149  requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
150  constexpr
151  _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
152  _ErrorHandler>& __other)
153  : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
154  _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
155  _M_last(__other._M_last)
156  { }
157 
158  [[nodiscard]]
159  constexpr _Iter
160  begin() const requires bidirectional_iterator<_Iter>
161  { return _M_first(); }
162 
163  [[nodiscard]]
164  constexpr _Sent
165  end() const { return _M_last; }
166 
167  [[nodiscard]]
168  constexpr _Iter
169  base() const requires forward_iterator<_Iter>
170  { return _M_curr(); }
171 
172  [[nodiscard]]
173  constexpr iter_difference_t<_Iter>
174  _M_units() const requires forward_iterator<_Iter>
175  { return _M_to_increment; }
176 
177  [[nodiscard]]
178  constexpr value_type
179  operator*() const { return _M_buf[_M_buf_index]; }
180 
181  constexpr _Utf_iterator&
182  operator++()
183  {
184  if (_M_buf_index + 1 < _M_buf_last)
185  ++_M_buf_index; // Move to the next code unit in the buffer.
186  else if (_M_curr() != _M_last)
187  {
188  // Advance past the current code point (for non-forward iterators
189  // we already moved there after decoding the last code point).
190  if constexpr (forward_iterator<_Iter>)
191  std::advance(_M_curr(), _M_to_increment);
192  if (_M_curr() == _M_last)
193  _M_buf_index = 0;
194  else // Decode next code point from the input and update buffer.
195  _M_read();
196  }
197  // else erroneous, but ignored for now.
198  return *this;
199  }
200 
201  constexpr _Utf_iterator
202  operator++(int)
203  {
204  auto __tmp = *this;
205  ++*this;
206  return __tmp;
207  }
208 
209  constexpr _Utf_iterator&
210  operator--() requires bidirectional_iterator<_Iter>
211  {
212  if (_M_buf_index > 0)
213  --_M_buf_index;
214  else if (_M_curr() != _M_first())
215  {
216  _M_read_reverse();
217  _M_buf_index = _M_buf_last - 1;
218  ranges::advance(_M_curr(), -_M_to_increment);
219  }
220  // else erroneous, but ignored for now.
221  return *this;
222  }
223 
224  constexpr _Utf_iterator
225  operator--(int)
226  {
227  auto __tmp = *this;
228  --*this;
229  return __tmp;
230  }
231 
232  [[nodiscard]]
233  friend constexpr bool
234  operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
235  requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
236  {
237  if constexpr (forward_iterator<_Iter>)
238  return __lhs._M_curr() == __rhs._M_curr()
239  && __lhs._M_buf_index == __rhs._M_buf_index;
240  else if (__lhs._M_curr() != __rhs._M_curr())
241  return false;
242  else if (__lhs._M_buf_index == __rhs._M_buf_index
243  && __lhs._M_buf_last == __rhs._M_buf_last)
244  return true;
245  else
246  return __lhs._M_buf_index == __lhs._M_buf_last
247  && __rhs._M_buf_index == __rhs._M_buf_last;
248  }
249 
250  [[nodiscard]]
251  friend constexpr bool
252  operator==(_Utf_iterator __lhs, _Sent __rhs)
253  {
254  if constexpr (forward_iterator<_Iter>)
255  return __lhs._M_curr() == __rhs;
256  else
257  return __lhs._M_curr() == __rhs
258  && __lhs._M_buf_index == __lhs._M_buf_last;
259  }
260 
261  private:
262  constexpr void
263  _M_read()
264  {
265  if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
266  _M_read_utf8();
267  else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
268  _M_read_utf16();
269  else
270  {
271  static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
272  _M_read_utf32();
273  }
274  }
275 
276  constexpr void
277  _M_read_reverse() requires bidirectional_iterator<_Iter>
278  {
279  if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
280  _M_read_reverse_utf8();
281  else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
282  _M_read_reverse_utf16();
283  else
284  {
285  static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
286  _M_read_reverse_utf32();
287  }
288  }
289 
290  template<typename>
291  struct _Guard
292  {
293  _Guard(void*, _Iter&) { }
294  };
295 
296  template<typename _It> requires forward_iterator<_It>
297  struct _Guard<_It>
298  {
299  constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
300  _Utf_iterator* _M_this;
301  _It _M_orig;
302  };
303 
304  constexpr char32_t
305  _M_read_utf8()
306  {
307  _Guard<_Iter> __g{this, _M_curr()};
308  char32_t __c{};
309  const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
310  uint8_t __u = *_M_curr()++;
311  uint8_t __to_incr = 1;
312  auto __incr = [&, this] {
313  ++__to_incr;
314  return ++_M_curr();
315  };
316 
317  if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
318  __c = __u;
319  else if (__u < 0xC2) [[unlikely]]
320  __c = _S_error();
321  else if (_M_curr() == _M_last) [[unlikely]]
322  __c = _S_error();
323  else if (__u <= 0xDF) // 0xC2 to 0xDF
324  {
325  __c = __u & 0x1F;
326  __u = *_M_curr();
327 
328  if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
329  __c = _S_error();
330  else
331  {
332  __c = (__c << 6) | (__u & 0x3F);
333  __incr();
334  }
335  }
336  else if (__u <= 0xEF) // 0xE0 to 0xEF
337  {
338  const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
339  const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
340 
341  __c = __u & 0x0F;
342  __u = *_M_curr();
343 
344  if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
345  __c = _S_error();
346  else if (__incr() == _M_last) [[unlikely]]
347  __c = _S_error();
348  else
349  {
350  __c = (__c << 6) | (__u & 0x3F);
351  __u = *_M_curr();
352 
353  if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
354  __c = _S_error();
355  else
356  {
357  __c = (__c << 6) | (__u & 0x3F);
358  __incr();
359  }
360  }
361  }
362  else if (__u <= 0xF4) // 0xF0 to 0xF4
363  {
364  const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
365  const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
366 
367  __c = __u & 0x07;
368  __u = *_M_curr();
369 
370  if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
371  __c = _S_error();
372  else if (__incr() == _M_last) [[unlikely]]
373  __c = _S_error();
374  else
375  {
376  __c = (__c << 6) | (__u & 0x3F);
377  __u = *_M_curr();
378 
379  if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
380  __c = _S_error();
381  else if (__incr() == _M_last) [[unlikely]]
382  __c = _S_error();
383  else
384  {
385  __c = (__c << 6) | (__u & 0x3F);
386  __u = *_M_curr();
387 
388  if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
389  __c = _S_error();
390  else
391  {
392  __c = (__c << 6) | (__u & 0x3F);
393  __incr();
394  }
395  }
396  }
397  }
398  else [[unlikely]]
399  __c = _S_error();
400 
401  _M_update(__c, __to_incr);
402 
403  return __c;
404  }
405 
406  constexpr void
407  _M_read_utf16()
408  {
409  _Guard<_Iter> __g{this, _M_curr()};
410  char32_t __c{};
411  uint16_t __u = *_M_curr()++;
412  uint8_t __to_incr = 1;
413 
414  if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
415  __c = __u;
416  else if (__u < 0xDC00 && _M_curr() != _M_last)
417  {
418  uint16_t __u2 = *_M_curr();
419  if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
420  __c = _S_error();
421  else
422  {
423  ++_M_curr();
424  __to_incr = 2;
425  uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
426  uint32_t __w = (__u >> 6) & 0x1F;
427  __c = (__w + 1) << 16 | __x;
428  }
429  }
430  else
431  __c = _S_error();
432 
433  _M_update(__c, __to_incr);
434  }
435 
436  constexpr void
437  _M_read_utf32()
438  {
439  _Guard<_Iter> __g{this, _M_curr()};
440  char32_t __c = *_M_curr()++;
441  if (!__is_scalar_value(__c)) [[unlikely]]
442  __c = _S_error();
443  _M_update(__c, 1);
444  }
445 
446  constexpr void
447  _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
448  {
449  const auto __first = _M_first();
450  auto __curr = _M_curr();
451  // The code point we decode:
452  char32_t __c{};
453  // The last code unit read:
454  uint8_t __u = *--__curr;
455  // Count of bytes read:
456  uint8_t __to_incr = 1;
457 
458  if (__u <= 0x7F) [[likely]]
459  {
460  _M_update(__u, 1);
461  return;
462  }
463 
464  // Continuation bytes match 10xxxxxx
465  auto __is_continuation = [](uint8_t __b) {
466  return (__b & 0xC0) == 0x80;
467  };
468  // 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
469  // 0xF5-0xFF would produce code points above U+10FFFF
470  auto __is_invalid = [](uint8_t __b) {
471  return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
472  };
473 
474  // No valid or invalid multibyte sequence is longer than 4 bytes,
475  // so skip back over at most four bytes.
476  while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
477  {
478  ++__to_incr;
479  __u = *--__curr;
480  }
481 
482  // If the last byte read was a continuation byte then either we read
483  // four continuation bytes, or stopped at the start of the sequence.
484  // Either way, the maximal subparts are the individual continuation
485  // bytes so each one should be replaced with U+FFFD.
486  if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
487  {
488  // Either found four continuation bytes (maximum allowed is three)
489  // or first non-continuation byte is an invalid UTF-8 code unit.
490  _M_update(_S_error(), 1);
491  return;
492  }
493  // __u is a valid start byte so use countl_one to get the expected
494  // length of the multibyte sequence that starts with this byte.
495  int __seq_length = std::countl_one((unsigned char)__u);
496  if (__seq_length < __to_incr) [[unlikely]]
497  {
498  // If the expected number of continuation bytes is less than
499  // the number we found, then the last continuation byte is a
500  // maximal subpart and the decremented iterator points to it.
501  _M_update(_S_error(), 1);
502  return;
503  }
504 
505  auto __orig = std::__exchange(_M_curr(), std::move(__curr));
506  if (_M_read_utf8() == _S_error()) [[unlikely]]
507  {
508  if (_M_to_increment < __to_incr) // Read truncated sequence, set
509  _M_to_increment = 1; // curr to last continuation byte.
510  }
511 
512  _M_curr() = std::move(__orig);
513  // operator--() will move back by _M_to_increment
514  }
515 
516  constexpr void
517  _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
518  {
519  _Guard<_Iter> __g{this, _M_curr()};
520  char32_t __c{};
521  uint16_t __u = *--_M_curr();
522  uint8_t __to_incr = 1;
523 
524  if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
525  __c = __u;
526  else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
527  {
528  // read a low surrogate, expect a high surrogate before it.
529  uint16_t __u2 = *--_M_curr();
530  if (__u2 < 0xD800 || __u2 >= 0xDC00) [[unlikely]]
531  __c = _S_error(); // unpaired low surrogate
532  else
533  {
534  __to_incr = 2;
535  uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
536  uint32_t __w = (__u2 >> 6) & 0x1F;
537  __c = (__w + 1) << 16 | __x;
538  }
539  }
540  else
541  __c = _S_error(); // unpaired surrogate
542 
543  _M_update(__c, __to_incr);
544  }
545 
546  constexpr void
547  _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
548  {
549  _Guard<_Iter> __g{this, _M_curr()};
550  char32_t __c = *--_M_curr();
551  if (!__is_scalar_value(__c)) [[unlikely]]
552  __c = _S_error();
553  _M_update(__c, 1);
554  }
555 
556  // Encode the code point __c as one or more code units in _M_buf.
557  constexpr void
558  _M_update(char32_t __c, uint8_t __to_incr)
559  {
560  _M_to_increment = __to_incr;
561  _M_buf_index = 0;
562  if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
563  {
564  _M_buf[0] = __c;
565  _M_buf_last = 1;
566  }
567  else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
568  {
569  if (__is_single_code_unit<_ToFmt>(__c))
570  {
571  _M_buf[0] = __c;
572  _M_buf[1] = 0;
573  _M_buf_last = 1;
574  }
575  else
576  {
577  // From http://www.unicode.org/faq/utf_bom.html#utf16-4
578  const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
579  char16_t __lead = __lead_offset + (__c >> 10);
580  char16_t __trail = 0xDC00 + (__c & 0x3FF);
581  _M_buf[0] = __lead;
582  _M_buf[1] = __trail;
583  _M_buf_last = 2;
584  }
585  }
586  else
587  {
588  static_assert(sizeof(_ToFmt) == 1);
589  int __bits = std::bit_width((uint32_t)__c);
590  if (__bits <= 7) [[likely]]
591  {
592  _M_buf[0] = __c;
593  _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
594  _M_buf_last = 1;
595  }
596  else if (__bits <= 11)
597  {
598  _M_buf[0] = 0xC0 | (__c >> 6);
599  _M_buf[1] = 0x80 | (__c & 0x3F);
600  _M_buf[2] = _M_buf[3] = 0;
601  _M_buf_last = 2;
602  }
603  else if (__bits <= 16)
604  {
605  _M_buf[0] = 0xE0 | (__c >> 12);
606  _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
607  _M_buf[2] = 0x80 | (__c & 0x3F);
608  _M_buf[3] = 0;
609  _M_buf_last = 3;
610  }
611  else
612  {
613  _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
614  _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
615  _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
616  _M_buf[3] = 0x80 | (__c & 0x3F);
617  _M_buf_last = 4;
618  }
619  }
620  }
621 
622  constexpr char32_t
623  _S_error()
624  {
625  char32_t __c = _ErrorHandler()();
626  __glibcxx_assert(__is_scalar_value(__c));
627  return __c;
628  }
629 
630  constexpr _Iter
631  _M_first() const requires bidirectional_iterator<_Iter>
632  { return _M_first_and_curr._M_first; }
633 
634  constexpr _Iter&
635  _M_curr() { return _M_first_and_curr._M_curr; }
636 
637  constexpr _Iter
638  _M_curr() const { return _M_first_and_curr._M_curr; }
639 
640  // _M_first is not needed for non-bidirectional ranges.
641  template<typename _It>
642  struct _First_and_curr
643  {
644  _First_and_curr() = default;
645 
646  constexpr
647  _First_and_curr(_It __curr) : _M_curr(__curr) { }
648 
649  template<convertible_to<_It> _It2>
650  constexpr
651  _First_and_curr(const _First_and_curr<_It2>& __other)
652  : _M_curr(__other._M_curr) { }
653 
654  // First code unit of the current code point for forward iterators,
655  // past-the-end of the current code point for input iterators.
656  _It _M_curr;
657  };
658 
659  template<typename _It> requires bidirectional_iterator<_It>
660  struct _First_and_curr<_It>
661  {
662  _First_and_curr() = default;
663 
664  constexpr
665  _First_and_curr(_It __first, _It __curr)
666  : _M_first(__first), _M_curr(__curr) { }
667 
668  template<convertible_to<_It> _It2>
669  constexpr
670  _First_and_curr(const _First_and_curr<_It2>& __other)
671  : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
672 
673  _It _M_first; // Start of the underlying range.
674  _It _M_curr; // First code unit of the current code point.
675  };
676 
677  // Iterators pointing to the start of the underlying range and to the
678  // start (or end, for non-forward iterators) of the current code point.
679  _First_and_curr<_Iter> _M_first_and_curr;
680 
681  // The end of the underlying input range.
682  [[no_unique_address]] _Sent _M_last;
683 
684  // Buffer holding the individual code units of the current code point.
685  array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
686 
687  uint8_t _M_buf_index = 0; // Index of current code unit in the buffer.
688  uint8_t _M_buf_last = 0; // Number of code units in the buffer.
689  uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment.
690 
691  template<typename _FromFmt2, typename _ToFmt2,
692  input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
693  typename _ErrHandler>
694  requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
695  friend class _Utf_iterator;
696  };
697 
698  template<typename _ToFormat, ranges::input_range _View>
699  requires ranges::view<_View>
700  class _Utf_view
701  : public ranges::view_interface<_Utf_view<_ToFormat, _View>>
702  {
703  using _Iterator = _Utf_iterator<ranges::range_value_t<_View>,
704  _ToFormat, ranges::iterator_t<_View>,
705  ranges::sentinel_t<_View>>;
706 
707  template<typename _Iter, typename _Sent>
708  constexpr auto
709  _M_begin(_Iter __first, _Sent __last)
710  {
711  if constexpr (bidirectional_iterator<_Iter>)
712  return _Iterator(__first, __first, __last);
713  else
714  return _Iterator(__first, __last);
715  }
716 
717  template<typename _Iter, typename _Sent>
718  constexpr auto
719  _M_end(_Iter __first, _Sent __last)
720  {
721  if constexpr (!is_same_v<_Iter, _Sent>)
722  return __last;
723  else if constexpr (bidirectional_iterator<_Iter>)
724  return _Iterator(__first, __last, __last);
725  else
726  return _Iterator(__last, __last);
727  }
728 
729  _View _M_base;
730 
731  public:
732  constexpr explicit
733  _Utf_view(_View __r) : _M_base(std::move(__r)) { }
734 
735  constexpr auto begin()
736  { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
737 
738  constexpr auto end()
739  { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
740 
741  constexpr bool empty() const { return ranges::empty(_M_base); }
742  };
743 
744 #ifdef __cpp_char8_t
745  template<typename _View>
746  using _Utf8_view = _Utf_view<char8_t, _View>;
747 #else
748  template<typename _View>
749  using _Utf8_view = _Utf_view<char, _View>;
750 #endif
751  template<typename _View>
752  using _Utf16_view = _Utf_view<char16_t, _View>;
753  template<typename _View>
754  using _Utf32_view = _Utf_view<char32_t, _View>;
755 
756 inline namespace __v16_0_0
757 {
758 #define _GLIBCXX_GET_UNICODE_DATA 160000
759 #include "unicode-data.h"
760 #ifdef _GLIBCXX_GET_UNICODE_DATA
761 # error "Invalid unicode data"
762 #endif
763 
764  // The field width of a code point.
765  constexpr int
766  __field_width(char32_t __c) noexcept
767  {
768  if (__c < __width_edges[0]) [[likely]]
769  return 1;
770 
771  auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
772  return (__p - __width_edges) % 2 + 1;
773  }
774 
775  // @pre c <= 0x10FFFF
776  constexpr bool
777  __should_escape_category(char32_t __c) noexcept
778  {
779  constexpr uint32_t __mask = 0x01;
780  auto* __end = std::end(__escape_edges);
781  auto* __p = std::lower_bound(__escape_edges, __end,
782  (__c << 1u) + 2);
783  return __p[-1] & __mask;
784  }
785 
786 
787  // @pre c <= 0x10FFFF
788  constexpr _Gcb_property
789  __grapheme_cluster_break_property(char32_t __c) noexcept
790  {
791  constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
792  auto* __end = std::end(__gcb_edges);
793  auto* __p = std::lower_bound(__gcb_edges, __end,
794  (__c << __gcb_shift_bits) | __mask);
795  return _Gcb_property(__p[-1] & __mask);
796  }
797 
798  constexpr bool
799  __is_incb_linker(char32_t __c) noexcept
800  {
801  const auto __end = std::end(__incb_linkers);
802  // Array is small enough that linear search is faster than binary search.
803  return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
804  }
805 
806  // @pre c <= 0x10FFFF
807  constexpr _InCB
808  __incb_property(char32_t __c) noexcept
809  {
810  if ((__c << 2) < __incb_edges[0]) [[likely]]
811  return _InCB(0);
812 
813  constexpr uint32_t __mask = 0x3;
814  auto* __end = std::end(__incb_edges);
815  auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
816  return _InCB(__p[-1] & __mask);
817  }
818 
819  constexpr bool
820  __is_extended_pictographic(char32_t __c)
821  {
822  if (__c < __xpicto_edges[0]) [[likely]]
823  return 0;
824 
825  auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
826  return (__p - __xpicto_edges) % 2;
827  }
828 
829  struct _Grapheme_cluster_iterator_base
830  {
831  char32_t _M_c; // First code point in the cluster.
832  _Gcb_property _M_prop; // GCB property of _M_c.
833  enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
834  _XPicto _M_xpicto_seq_state = _XPicto::_Init;
835  unsigned char _M_RI_count = 0;
836  bool _M_incb_linker_seen = false;
837 
838  constexpr void
839  _M_reset(char32_t __c, _Gcb_property __p)
840  {
841  _M_c = __c;
842  _M_prop = __p;
843  _M_xpicto_seq_state = _XPicto::_Init;
844  _M_RI_count = 0;
845  _M_incb_linker_seen = false;
846  }
847 
848  constexpr void
849  _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
850  {
851  if (_M_xpicto_seq_state == _XPicto::_Failed)
852  return;
853 
854  auto __next_state = _XPicto::_Failed;
855  if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
856  {
857  if (__p == _Gcb_property::_Gcb_ZWJ)
858  {
859  if (_M_xpicto_seq_state == _XPicto::_Matched)
860  __next_state = _XPicto::_Zwj;
861  // We check _M_c here so that we do the lookup at most once,
862  // and only for clusters containing at least one ZWJ.
863  else if (__is_extended_pictographic(_M_c))
864  __next_state = _XPicto::_Zwj;
865  }
866  else if (__p == _Gcb_property::_Gcb_Extend)
867  __next_state = _M_xpicto_seq_state; // no change
868  }
869  else // Zwj
870  {
871  // This assumes that all \p{Extended_Pictographic} emoji have
872  // Grapheme_Cluster_Break=Other.
873  if (__p == _Gcb_property::_Gcb_Other
874  && __is_extended_pictographic(__c))
875  __next_state = _XPicto::_Matched;
876  }
877  _M_xpicto_seq_state = __next_state;
878  }
879 
880  constexpr void
881  _M_update_ri_count(_Gcb_property __p)
882  {
883  if (__p == _Gcb_property::_Gcb_Regional_Indicator)
884  ++_M_RI_count;
885  else
886  _M_RI_count = 0;
887  }
888 
889  constexpr void
890  _M_update_incb_state(char32_t __c, _Gcb_property)
891  {
892  if (__is_incb_linker(__c))
893  _M_incb_linker_seen = true;
894  }
895  };
896 
897  // Split a range into extended grapheme clusters.
898  template<ranges::forward_range _View> requires ranges::view<_View>
899  class _Grapheme_cluster_view
900  : public ranges::view_interface<_Grapheme_cluster_view<_View>>
901  {
902  public:
903 
904  constexpr
905  _Grapheme_cluster_view(_View __v)
906  : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
907  { }
908 
909  constexpr auto begin() const { return _M_begin; }
910  constexpr auto end() const { return _M_begin.end(); }
911 
912  private:
913  struct _Iterator : private _Grapheme_cluster_iterator_base
914  {
915  private:
916  // Iterator over the underlying code points.
917  using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
918 
919  public:
920  // TODO: Change value_type to be subrange<_U32_iterator> instead?
921  // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
922  // That would be the whole cluster, not just the first code point.
923  // Would need to store two iterators and find end of current cluster
924  // on increment, so operator* returns value_type(_M_base, _M_next).
925  using value_type = char32_t;
926  using iterator_concept = forward_iterator_tag;
927  using difference_type = ptrdiff_t;
928 
929  constexpr
930  _Iterator(_U32_iterator __i)
931  : _M_base(__i)
932  {
933  if (__i != __i.end())
934  {
935  _M_c = *__i;
936  _M_prop = __grapheme_cluster_break_property(_M_c);
937  }
938  }
939 
940  // The first code point of the current extended grapheme cluster.
941  constexpr value_type
942  operator*() const
943  { return _M_c; }
944 
945  constexpr auto
946  operator->() const
947  { return &_M_c; }
948 
949  // Move to the next extended grapheme cluster.
950  constexpr _Iterator&
951  operator++()
952  {
953  const auto __end = _M_base.end();
954  if (_M_base != __end)
955  {
956  auto __p_prev = _M_prop;
957  auto __it = _M_base;
958  while (++__it != __end)
959  {
960  char32_t __c = *__it;
961  auto __p = __grapheme_cluster_break_property(*__it);
962  _M_update_xpicto_seq_state(__c, __p);
963  _M_update_ri_count(__p);
964  _M_update_incb_state(__c, __p);
965  if (_M_is_break(__p_prev, __p, __it))
966  {
967  // Found a grapheme cluster break
968  _M_reset(__c, __p);
969  break;
970  }
971  __p_prev = __p;
972  }
973  _M_base = __it;
974  }
975  return *this;
976  }
977 
978  constexpr _Iterator
979  operator++(int)
980  {
981  auto __tmp = *this;
982  ++*this;
983  return __tmp;
984  }
985 
986  constexpr bool
987  operator==(const _Iterator& __i) const
988  { return _M_base == __i._M_base; }
989 
990  // This supports iter != iter.end()
991  constexpr bool
992  operator==(const ranges::sentinel_t<_View>& __i) const
993  { return _M_base == __i; }
994 
995  // Iterator to the start of the current cluster.
996  constexpr auto base() const { return _M_base.base(); }
997 
998  // The end of the underlying view (not the end of the current cluster!)
999  constexpr auto end() const { return _M_base.end(); }
1000 
1001  // Field width of the first code point in the cluster.
1002  constexpr int
1003  width() const noexcept
1004  { return __field_width(_M_c); }
1005 
1006  private:
1007  _U32_iterator _M_base;
1008 
1009  // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
1010  // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
1011  // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
1012  // Return true if there is a break between code point with property p1
1013  // and code point with property p2.
1014  constexpr bool
1015  _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
1016  _U32_iterator __curr) const
1017  {
1018  using enum _Gcb_property;
1019 
1020  if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
1021  return true; // Break after Control or LF.
1022 
1023  if (__p1 == _Gcb_CR)
1024  return __p2 != _Gcb_LF; // Do not break between a CR and LF.
1025 
1026  // Rule GB5
1027  if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
1028  return true; // Break before Control, CR or LF.
1029 
1030  // Rule GB6
1031  if (__p1 == _Gcb_L)
1032  switch (__p2)
1033  {
1034  case _Gcb_L:
1035  case _Gcb_V:
1036  case _Gcb_LV:
1037  case _Gcb_LVT:
1038  return false; // Do not break Hangul syllable sequences.
1039  default:
1040  return true;
1041  }
1042 
1043  // Rule GB7
1044  if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
1045  switch (__p2)
1046  {
1047  case _Gcb_V:
1048  case _Gcb_T:
1049  return false; // Do not break Hangul syllable sequences.
1050  default:
1051  return true;
1052  }
1053 
1054  // Rule GB8
1055  if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
1056  return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
1057 
1058  // Rule GB9
1059  if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
1060  return false; // Do not break before extending characters or ZWJ.
1061 
1062  // The following GB9x rules only apply to extended grapheme clusters,
1063  // which is what the C++ standard uses (not legacy grapheme clusters).
1064 
1065  // Rule GB9a
1066  if (__p2 == _Gcb_SpacingMark)
1067  return false; // Do not break before SpacingMarks,
1068  // Rule GB9b
1069  if (__p1 == _Gcb_Prepend)
1070  return false; // or after Prepend characters.
1071 
1072  // Rule GB9c (Unicode 15.1.0)
1073  // Do not break within certain combinations with
1074  // Indic_Conjunct_Break (InCB)=Linker.
1075  if (_M_incb_linker_seen
1076  && __incb_property(_M_c) == _InCB::_Consonant
1077  && __incb_property(*__curr) == _InCB::_Consonant)
1078  {
1079  // Match [_M_base, __curr] against regular expression
1080  // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
1081  bool __have_linker = false;
1082  auto __it = _M_base;
1083  while (++__it != __curr)
1084  {
1085  if (__is_incb_linker(*__it))
1086  __have_linker = true;
1087  else
1088  {
1089  auto __incb = __incb_property(*__it);
1090  if (__incb == _InCB::_Consonant)
1091  __have_linker = false;
1092  else if (__incb != _InCB::_Extend)
1093  break;
1094  }
1095  }
1096  if (__it == __curr && __have_linker)
1097  return false;
1098  }
1099 
1100  // Rule GB11
1101  // Do not break within emoji modifier sequences
1102  // or emoji zwj sequences.
1103  if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
1104  return false;
1105 
1106  // Rules GB12 and GB13
1107  // Do not break within emoji flag sequences. That is, do not break
1108  // between regional indicator (RI) symbols if there is an odd number
1109  // of RI characters before the break point.
1110  if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
1111  return (_M_RI_count & 1) == 0;
1112 
1113  // Rule GB999
1114  return true; // Otherwise, break everywhere.
1115  }
1116  };
1117 
1118  _Iterator _M_begin;
1119  };
1120 
1121 } // namespace __v16_0_0
1122 
1123  // Return the field width of a string.
1124  template<typename _CharT>
1125  constexpr size_t
1126  __field_width(basic_string_view<_CharT> __s)
1127  {
1128  if (__s.empty()) [[unlikely]]
1129  return 0;
1130  _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1131  auto __it = __gc.begin();
1132  const auto __end = __gc.end();
1133  size_t __n = __it.width();
1134  while (++__it != __end)
1135  __n += __it.width();
1136  return __n;
1137  }
1138 
1139  // Truncate a string to at most `__max` field width units, and return the
1140  // resulting field width.
1141  template<typename _CharT>
1142  constexpr size_t
1143  __truncate(basic_string_view<_CharT>& __s, size_t __max)
1144  {
1145  if (__s.empty()) [[unlikely]]
1146  return 0;
1147 
1148  _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1149  auto __it = __gc.begin();
1150  const auto __end = __gc.end();
1151  size_t __n = __it.width();
1152  if (__n > __max)
1153  {
1154  __s = {};
1155  return 0;
1156  }
1157  while (++__it != __end)
1158  {
1159  size_t __n2 = __n + __it.width();
1160  if (__n2 > __max)
1161  {
1162  __s = basic_string_view<_CharT>(__s.begin(), __it.base());
1163  return __n;
1164  }
1165  __n = __n2;
1166  }
1167  return __n;
1168  }
1169 
1170  template<typename _CharT>
1171  consteval bool
1172  __literal_encoding_is_unicode()
1173  {
1174  if constexpr (is_same_v<_CharT, char16_t>)
1175  return true;
1176  else if constexpr (is_same_v<_CharT, char32_t>)
1177  return true;
1178 #ifdef __cpp_char8_t
1179  else if constexpr (is_same_v<_CharT, char8_t>)
1180  return true;
1181 #endif
1182 
1183  const char* __enc = "";
1184 
1185 #ifdef __GNUC_EXECUTION_CHARSET_NAME
1186  auto __remove_iso10646_prefix = [](const char* __s) {
1187  // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1188  if (__s[0] == 'I' || __s[0] == 'i')
1189  if (__s[1] == 'S' || __s[1] == 's')
1190  if (__s[2] == 'O' || __s[2] == 'o')
1191  if (string_view(__s + 3).starts_with("-10646/"))
1192  return __s + 10;
1193  return __s;
1194  };
1195 
1196  if constexpr (is_same_v<_CharT, char>)
1197  __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1198 # if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1199  else
1200  __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1201 # endif
1202 
1203  if ((__enc[0] == 'U' || __enc[0] == 'u')
1204  && (__enc[1] == 'T' || __enc[1] == 't')
1205  && (__enc[2] == 'F' || __enc[2] == 'f'))
1206  {
1207  __enc += 3;
1208  if (__enc[0] == '-')
1209  ++__enc;
1210  if (__enc[0] == '8')
1211  return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1212  else if constexpr (!is_same_v<_CharT, char>)
1213  {
1214  string_view __s(__enc);
1215  if (__s.ends_with("//"))
1216  __s.remove_suffix(2);
1217  if (__s.ends_with("LE") || __s.ends_with("BE"))
1218  __s.remove_suffix(2);
1219  return __s == "16" || __s == "32";
1220  }
1221  }
1222 #elif defined __clang_literal_encoding__
1223  if constexpr (is_same_v<_CharT, char>)
1224  __enc = __clang_literal_encoding__;
1225 # if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1226  else
1227  __enc = __clang_wide_literal_encoding__;
1228 # endif
1229  // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1230  string_view __s(__enc);
1231  if (__s == "UTF-8")
1232  return true;
1233  else if constexpr (!is_same_v<_CharT, char>)
1234  return __s == "UTF-16" || __s == "UTF-32";
1235 #endif
1236 
1237  return false;
1238  }
1239 
1240  consteval bool
1241  __literal_encoding_is_utf8()
1242  { return __literal_encoding_is_unicode<char>(); }
1243 
1244  consteval bool
1245  __literal_encoding_is_extended_ascii()
1246  {
1247  return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1248  && 'a' == 0x61 && 'z' == 0x7a;
1249  }
1250 
1251  // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1252  constexpr bool
1253  __charset_alias_match(string_view __a, string_view __b)
1254  {
1255  // Map alphanumeric chars to their base 64 value, everything else to 127.
1256  auto __map = [](char __c, bool& __num) -> unsigned char {
1257  if (__c == '0') [[unlikely]]
1258  return __num ? 0 : 127;
1259  const auto __v = __detail::__from_chars_alnum_to_val(__c);
1260  __num = __v < 10;
1261  return __v;
1262  };
1263 
1264  auto __ptr_a = __a.begin(), __end_a = __a.end();
1265  auto __ptr_b = __b.begin(), __end_b = __b.end();
1266  bool __num_a = false, __num_b = false;
1267 
1268  while (true)
1269  {
1270  // Find the value of the next alphanumeric character in each string.
1271  unsigned char __val_a{}, __val_b{};
1272  while (__ptr_a != __end_a
1273  && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1274  ++__ptr_a;
1275  while (__ptr_b != __end_b
1276  && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1277  ++__ptr_b;
1278  // Stop when we reach the end of a string, or get a mismatch.
1279  if (__ptr_a == __end_a)
1280  return __ptr_b == __end_b;
1281  else if (__ptr_b == __end_b)
1282  return false;
1283  else if (__val_a != __val_b)
1284  return false; // Found non-matching characters.
1285  ++__ptr_a;
1286  ++__ptr_b;
1287  }
1288  return true;
1289  }
1290 
1291 } // namespace __unicode
1292 
1293 namespace ranges
1294 {
1295  template<typename _To, typename _Range>
1296  inline constexpr bool
1297  enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1298  = enable_borrowed_range<_Range>;
1299 
1300  template<typename _Range>
1301  inline constexpr bool
1302  enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1303  = enable_borrowed_range<_Range>;
1304 } // namespace ranges
1305 
1306 _GLIBCXX_END_NAMESPACE_VERSION
1307 } // namespace std
1308 #endif // C++20
1309 #endif // _GLIBCXX_UNICODE_H
constexpr complex< _Tp > operator*(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x times y.
Definition: complex:434
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition: move.h:138
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition: valarray:1251
_Tp * begin(valarray< _Tp > &__va) noexcept
Return an iterator pointing to the first element of the valarray.
Definition: valarray:1229
ISO C++ entities toplevel namespace is std.
constexpr auto empty(const _Container &__cont) noexcept(noexcept(__cont.empty())) -> decltype(__cont.empty())
Return whether a container is empty.
Definition: range_access.h:294
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
GNU extensions for public use.
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.