libstdc++
unicode.h
Go to the documentation of this file.
1// Unicode utilities -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
28 */
29
30#ifndef _GLIBCXX_UNICODE_H
31#define _GLIBCXX_UNICODE_H 1
32
33#if __cplusplus >= 202002L
34#include <array>
35#include <bit> // bit_width
36#include <charconv> // __detail::__from_chars_alnum_to_val_table
37#include <cstdint>
38#include <bits/stl_algo.h>
39#include <bits/stl_iterator.h>
40#include <bits/ranges_base.h>
41
42namespace std _GLIBCXX_VISIBILITY(default)
43{
44_GLIBCXX_BEGIN_NAMESPACE_VERSION
45namespace __unicode
46{
// True if __c is a Unicode scalar value: a code point no greater than
// 0x10FFFF that is not a surrogate (0xD800 to 0xDFFF).
constexpr bool
__is_scalar_value(char32_t __c)
{
  if (__c >= 0xD800) [[unlikely]]
    return __c > 0xDFFF && __c <= 0x10FFFF;
  return true; // Everything below the surrogate block is a scalar value.
}
55
56 // A code point that can be encoded in a single code unit of type _CharT.
57 template<typename _CharT>
58 constexpr bool
59 __is_single_code_unit(char32_t __c)
60 {
61 if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
62 return __c < 0x7F; // ASCII character
63 else
64 return __c < __gnu_cxx::__int_traits<_CharT>::__max
65 && __is_scalar_value(__c);
66 }
67
68 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
69
// Default error handler for _Utf_iterator.  Calling it yields U+FFFD
// (REPLACEMENT CHARACTER) and never throws.
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    return U'\uFFFD';
  }
};
76
77 struct _Null_sentinel_t
78 {
79 template<input_iterator _It>
80 requires default_initializable<iter_value_t<_It>>
81 && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
82 friend constexpr auto
83 operator==(_It __it, _Null_sentinel_t)
84 { return *__it == iter_value_t<_It>{}; }
85 };
86
87 template<typename _FromFmt, typename _ToFmt,
88 input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
89 typename _ErrorHandler = _Repl>
90 requires convertible_to<iter_value_t<_Iter>, _FromFmt>
91 class _Utf_iterator
92 {
93 static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
94
95 public:
96 using value_type = _ToFmt;
97 using difference_type = iter_difference_t<_Iter>;
98 using reference = value_type;
99 using iterator_concept
100 = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
101 bidirectional_iterator_tag>;
102
103 constexpr _Utf_iterator() = default;
104
105 constexpr
106 _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
107 requires bidirectional_iterator<_Iter>
108 : _M_first_and_curr{__first, __it}, _M_last(__last)
109 {
110 if (_M_curr() != _M_last)
111 _M_read();
112 else
113 _M_buf = {};
114 }
115
116 constexpr
117 _Utf_iterator(_Iter __it, _Sent __last)
118 requires (!bidirectional_iterator<_Iter>)
119 : _M_first_and_curr{__it}, _M_last(__last)
120 {
121 if (_M_curr() != _M_last)
122 _M_read();
123 else
124 _M_buf = {};
125 }
126
127 template<class _Iter2, class _Sent2>
128 requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
129 constexpr
130 _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
131 _ErrorHandler>& __other)
132 : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
133 _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
134 _M_last(__other._M_last)
135 { }
136
137 [[nodiscard]]
138 constexpr _Iter
139 begin() const requires bidirectional_iterator<_Iter>
140 { return _M_first(); }
141
142 [[nodiscard]]
143 constexpr _Sent
144 end() const { return _M_last; }
145
146 [[nodiscard]]
147 constexpr _Iter
148 base() const requires forward_iterator<_Iter>
149 { return _M_curr(); }
150
151 [[nodiscard]]
152 constexpr value_type
153 operator*() const { return _M_buf[_M_buf_index]; }
154
155 constexpr _Utf_iterator&
156 operator++()
157 {
158 if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
159 {
160 if constexpr (forward_iterator<_Iter>)
161 std::advance(_M_curr(), _M_to_increment);
162 if (_M_curr() == _M_last)
163 _M_buf_index = 0;
164 else
165 _M_read();
166 }
167 else if (_M_buf_index + 1 < _M_buf_last)
168 ++_M_buf_index;
169 return *this;
170 }
171
172 constexpr _Utf_iterator
173 operator++(int)
174 {
175 auto __tmp = *this;
176 ++*this;
177 return __tmp;
178 }
179
180 constexpr _Utf_iterator&
181 operator--() requires bidirectional_iterator<_Iter>
182 {
183 if (!_M_buf_index && _M_curr() != _M_first())
184 _M_read_reverse();
185 else if (_M_buf_index)
186 --_M_buf_index;
187 return *this;
188 }
189
190 constexpr _Utf_iterator
191 operator--(int)
192 {
193 auto __tmp = *this;
194 --*this;
195 return __tmp;
196 }
197
198 [[nodiscard]]
199 friend constexpr bool
200 operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
201 requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
202 {
203 if constexpr (forward_iterator<_Iter>)
204 return __lhs._M_curr() == __rhs._M_curr()
205 && __lhs._M_buf_index == __rhs._M_buf_index;
206 else if (__lhs._M_curr() != __rhs._M_curr())
207 return false;
208 else if (__lhs._M_buf_index == __rhs._M_buf_index
209 && __lhs._M_buf_last == __rhs._M_buf_last)
210 return true;
211 else
212 return __lhs._M_buf_index == __lhs._M_buf_last
213 && __rhs._M_buf_index == __rhs._M_buf_last;
214 }
215
216 [[nodiscard]]
217 friend constexpr bool
218 operator==(_Utf_iterator __lhs, _Sent __rhs)
219 {
220 if constexpr (forward_iterator<_Iter>)
221 return __lhs._M_curr() == __rhs;
222 else
223 return __lhs._M_curr() == __rhs
224 && __lhs._M_buf_index == __lhs._M_buf_last;
225 }
226
227 private:
228 constexpr void
229 _M_read()
230 {
231 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
232 _M_read_utf8();
233 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
234 _M_read_utf16();
235 else
236 {
237 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
238 _M_read_utf32();
239 }
240 }
241
// Called by operator-- when the buffer index is at the start of the buffer.
// TODO: only declared here; no definition is provided in this header yet.
constexpr void _M_read_reverse();
244
245 template<typename>
246 struct _Guard
247 {
248 _Guard(void*, _Iter&) { }
249 };
250
251 template<typename _It> requires forward_iterator<_It>
252 struct _Guard<_It>
253 {
254 constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
255 _Utf_iterator* _M_this;
256 _It _M_orig;
257 };
258
259 constexpr void
260 _M_read_utf8()
261 {
262 _Guard<_Iter> __g{this, _M_curr()};
263 char32_t __c{};
264 uint8_t __u = *_M_curr()++;
265 const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
266 uint8_t __to_incr = 1;
267
268 if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
269 __c = __u;
270 else if (__u < 0xC2) [[unlikely]]
271 __c = _S_error();
272 else if (_M_curr() == _M_last) [[unlikely]]
273 __c = _S_error();
274 else if (__u <= 0xDF) // 0xC2 to 0xDF
275 {
276 __c = __u & 0x1F;
277 __u = *_M_curr();
278
279 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
280 __c = _S_error();
281 else
282 {
283 __c = (__c << 6) | (__u & 0x3F);
284 ++_M_curr();
285 ++__to_incr;
286 }
287 }
288 else if (__u <= 0xEF) // 0xE0 to 0xEF
289 {
290 const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
291 const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
292
293 __c = __u & 0x0F;
294 __u = *_M_curr();
295
296 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
297 __c = _S_error();
298 else if (++_M_curr() == _M_last) [[unlikely]]
299 __c = _S_error();
300 else
301 {
302 ++__to_incr;
303 __c = (__c << 6) | (__u & 0x3F);
304 __u = *_M_curr();
305
306 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
307 __c = _S_error();
308 else
309 {
310 __c = (__c << 6) | (__u & 0x3F);
311 ++_M_curr();
312 ++__to_incr;
313 }
314 }
315 }
316 else if (__u <= 0xF4) // 0xF0 to 0xF4
317 {
318 const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
319 const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
320
321 __c = __u & 0x07;
322 __u = *_M_curr();
323
324 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
325 __c = _S_error();
326 else if (++_M_curr() == _M_last) [[unlikely]]
327 __c = _S_error();
328 else
329 {
330 ++__to_incr;
331 __c = (__c << 6) | (__u & 0x3F);
332 __u = *_M_curr();
333
334 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
335 __c = _S_error();
336 else if (++_M_curr() == _M_last) [[unlikely]]
337 __c = _S_error();
338 else
339 {
340 ++__to_incr;
341 __c = (__c << 6) | (__u & 0x3F);
342 __u = *_M_curr();
343
344 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
345 __c = _S_error();
346 else
347 {
348 __c = (__c << 6) | (__u & 0x3F);
349 ++_M_curr();
350 ++__to_incr;
351 }
352 }
353 }
354 }
355 else [[unlikely]]
356 __c = _S_error();
357
358 _M_update(__c, __to_incr);
359 }
360
361 constexpr void
362 _M_read_utf16()
363 {
364 _Guard<_Iter> __g{this, _M_curr()};
365 char32_t __c{};
366 uint16_t __u = *_M_curr()++;
367 uint8_t __to_incr = 1;
368
369 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
370 __c = __u;
371 else if (__u < 0xDC00 && _M_curr() != _M_last)
372 {
373 uint16_t __u2 = *_M_curr();
374 if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
375 __c = _S_error();
376 else
377 {
378 ++_M_curr();
379 __to_incr = 2;
380 uint32_t __x = (__u & 0x3F) << 10 | __u2 & 0x3FF;
381 uint32_t __w = (__u >> 6) & 0x1F;
382 __c = (__w + 1) << 16 | __x;
383 }
384 }
385 else
386 __c = _S_error();
387
388 _M_update(__c, __to_incr);
389 }
390
391 constexpr void
392 _M_read_utf32()
393 {
394 _Guard<_Iter> __g{this, _M_curr()};
395 char32_t __c = *_M_curr()++;
396 if (!__is_scalar_value(__c)) [[unlikely]]
397 __c = _S_error();
398 _M_update(__c, 1);
399 }
400
401 // Encode the code point __c as one or more code units in _M_buf.
402 constexpr void
403 _M_update(char32_t __c, uint8_t __to_incr)
404 {
405 _M_to_increment = __to_incr;
406 _M_buf_index = 0;
407 if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
408 {
409 _M_buf[0] = __c;
410 _M_buf_last = 1;
411 }
412 else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
413 {
414 if (__is_single_code_unit<_ToFmt>(__c))
415 {
416 _M_buf[0] = __c;
417 _M_buf[1] = 0;
418 _M_buf_last = 1;
419 }
420 else
421 {
422 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
423 const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
424 char16_t __lead = __lead_offset + (__c >> 10);
425 char16_t __trail = 0xDC00 + (__c & 0x3FF);
426 _M_buf[0] = __lead;
427 _M_buf[1] = __trail;
428 _M_buf_last = 2;
429 }
430 }
431 else
432 {
433 static_assert(sizeof(_ToFmt) == 1);
434 int __bits = std::bit_width((uint32_t)__c);
435 if (__bits <= 7) [[likely]]
436 {
437 _M_buf[0] = __c;
438 _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
439 _M_buf_last = 1;
440 }
441 else if (__bits <= 11)
442 {
443 _M_buf[0] = 0xC0 | (__c >> 6);
444 _M_buf[1] = 0x80 | (__c & 0x3F);
445 _M_buf[2] = _M_buf[3] = 0;
446 _M_buf_last = 2;
447 }
448 else if (__bits <= 16)
449 {
450 _M_buf[0] = 0xE0 | (__c >> 12);
451 _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
452 _M_buf[2] = 0x80 | (__c & 0x3F);
453 _M_buf[3] = 0;
454 _M_buf_last = 3;
455 }
456 else
457 {
458 _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
459 _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
460 _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
461 _M_buf[3] = 0x80 | (__c & 0x3F);
462 _M_buf_last = 4;
463 }
464 }
465 }
466
467 constexpr char32_t
468 _S_error()
469 {
470 char32_t __c = _ErrorHandler()();
471 __glibcxx_assert(__is_scalar_value(__c));
472 return __c;
473 }
474
475 constexpr _Iter
476 _M_first() const requires bidirectional_iterator<_Iter>
477 { return _M_first_and_curr._M_first; }
478
479 constexpr _Iter&
480 _M_curr() { return _M_first_and_curr._M_curr; }
481
482 constexpr _Iter
483 _M_curr() const { return _M_first_and_curr._M_curr; }
484
485 array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
486
487 template<typename _It>
488 struct _First_and_curr
489 {
490 _First_and_curr() = default;
491
492 constexpr
493 _First_and_curr(_It __curr) : _M_curr(__curr) { }
494
495 template<convertible_to<_It> _It2>
496 constexpr
497 _First_and_curr(const _First_and_curr<_It2>& __other)
498 : _M_curr(__other._M_curr) { }
499
500 _It _M_curr;
501 };
502
503 template<typename _It> requires bidirectional_iterator<_It>
504 struct _First_and_curr<_It>
505 {
506 _First_and_curr() = default;
507
508 constexpr
509 _First_and_curr(_It __first, _It __curr)
510 : _M_first(__first), _M_curr(__curr) { }
511
512 template<convertible_to<_It> _It2>
513 constexpr
514 _First_and_curr(const _First_and_curr<_It2>& __other)
515 : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
516
517 _It _M_first;
518 _It _M_curr;
519 };
520
521 _First_and_curr<_Iter> _M_first_and_curr;
522
523 uint8_t _M_buf_index = 0;
524 uint8_t _M_buf_last = 0;
525 uint8_t _M_to_increment = 0;
526
527 [[no_unique_address]] _Sent _M_last;
528
529 template<typename _FromFmt2, typename _ToFmt2,
530 input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
531 typename _ErrHandler>
532 requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
533 friend class _Utf_iterator;
534 };
535
536 template<typename _ToFormat, ranges::input_range _Range>
537 class _Utf_view
538 : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
539 {
540 using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
541 _ToFormat, ranges::iterator_t<_Range>,
542 ranges::sentinel_t<_Range>>;
543
544 template<typename _Iter, typename _Sent>
545 constexpr auto
546 _M_begin(_Iter __first, _Sent __last)
547 {
548 if constexpr (bidirectional_iterator<_Iter>)
549 return _Iterator(__first, __first, __last);
550 else
551 return _Iterator(__first, __last);
552 }
553
554 template<typename _Iter, typename _Sent>
555 constexpr auto
556 _M_end(_Iter __first, _Sent __last)
557 {
558 if constexpr (!is_same_v<_Iter, _Sent>)
559 return __last;
560 else if constexpr (bidirectional_iterator<_Iter>)
561 return _Iterator(__first, __last, __last);
562 else
563 return _Iterator(__last, __last);
564 }
565
566 _Range _M_base;
567
568 public:
569 constexpr explicit
570 _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
571
572 constexpr auto begin()
573 { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
574
575 constexpr auto end()
576 { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
577
578 constexpr bool empty() const { return ranges::empty(_M_base); }
579 };
580
581 template<typename _View>
582 using _Utf8_view = _Utf_view<char8_t, _View>;
583 template<typename _View>
584 using _Utf16_view = _Utf_view<char16_t, _View>;
585 template<typename _View>
586 using _Utf32_view = _Utf_view<char32_t, _View>;
587
588inline namespace __v15_1_0
589{
590#define _GLIBCXX_GET_UNICODE_DATA 150100
591#include "unicode-data.h"
592#ifdef _GLIBCXX_GET_UNICODE_DATA
593# error "Invalid unicode data"
594#endif
595
596 // The field width of a code point.
597 constexpr int
598 __field_width(char32_t __c) noexcept
599 {
600 if (__c < __width_edges[0]) [[likely]]
601 return 1;
602
603 auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
604 return (__p - __width_edges) % 2 + 1;
605 }
606
607 // @pre c <= 0x10FFFF
608 constexpr _Gcb_property
609 __grapheme_cluster_break_property(char32_t __c) noexcept
610 {
611 constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
612 auto* __end = std::end(__gcb_edges);
613 auto* __p = std::lower_bound(__gcb_edges, __end,
614 (__c << __gcb_shift_bits) | __mask);
615 return _Gcb_property(__p[-1] & __mask);
616 }
617
618 constexpr bool
619 __is_incb_linker(char32_t __c) noexcept
620 {
621 const auto __end = std::end(__incb_linkers);
622 // Array is small enough that linear search is faster than binary search.
623 return std::find(__incb_linkers, __end, __c) != __end;
624 }
625
626 // @pre c <= 0x10FFFF
627 constexpr _InCB
628 __incb_property(char32_t __c) noexcept
629 {
630 if ((__c << 2) < __incb_edges[0]) [[likely]]
631 return _InCB(0);
632
633 constexpr uint32_t __mask = 0x3;
634 auto* __end = std::end(__incb_edges);
635 auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
636 return _InCB(__p[-1] & __mask);
637 }
638
639 constexpr bool
640 __is_extended_pictographic(char32_t __c)
641 {
642 if (__c < __xpicto_edges[0]) [[likely]]
643 return 0;
644
645 auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
646 return (__p - __xpicto_edges) % 2;
647 }
648
649 struct _Grapheme_cluster_iterator_base
650 {
651 char32_t _M_c; // First code point in the cluster.
652 _Gcb_property _M_prop; // GCB property of _M_c.
653 enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
654 _XPicto _M_xpicto_seq_state = _XPicto::_Init;
655 unsigned char _M_RI_count = 0;
656 bool _M_incb_linker_seen = false;
657
658 constexpr void
659 _M_reset(char32_t __c, _Gcb_property __p)
660 {
661 _M_c = __c;
662 _M_prop = __p;
663 _M_xpicto_seq_state = _XPicto::_Init;
664 _M_RI_count = 0;
665 _M_incb_linker_seen = false;
666 }
667
668 constexpr void
669 _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
670 {
671 if (_M_xpicto_seq_state == _XPicto::_Failed)
672 return;
673
674 auto __next_state = _XPicto::_Failed;
675 if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
676 {
677 if (__p == _Gcb_property::_Gcb_ZWJ)
678 {
679 if (_M_xpicto_seq_state == _XPicto::_Matched)
680 __next_state = _XPicto::_Zwj;
681 // We check _M_c here so that we do the lookup at most once,
682 // and only for clusters containing at least one ZWJ.
683 else if (__is_extended_pictographic(_M_c))
684 __next_state = _XPicto::_Zwj;
685 }
686 else if (__p == _Gcb_property::_Gcb_Extend)
687 __next_state = _M_xpicto_seq_state; // no change
688 }
689 else // Zwj
690 {
691 // This assumes that all \p{Extended_Pictographic} emoji have
692 // Grapheme_Cluster_Break=Other.
693 if (__p == _Gcb_property::_Gcb_Other
694 && __is_extended_pictographic(__c))
695 __next_state = _XPicto::_Matched;
696 }
697 _M_xpicto_seq_state = __next_state;
698 }
699
700 constexpr void
701 _M_update_ri_count(_Gcb_property __p)
702 {
703 if (__p == _Gcb_property::_Gcb_Regional_Indicator)
704 ++_M_RI_count;
705 else
706 _M_RI_count = 0;
707 }
708
709 constexpr void
710 _M_update_incb_state(char32_t __c, _Gcb_property)
711 {
712 if (__is_incb_linker(__c))
713 _M_incb_linker_seen = true;
714 }
715 };
716
717 // Split a range into extended grapheme clusters.
718 template<ranges::forward_range _View> requires ranges::view<_View>
719 class _Grapheme_cluster_view
720 : public ranges::view_interface<_Grapheme_cluster_view<_View>>
721 {
722 public:
723
724 constexpr
725 _Grapheme_cluster_view(_View __v)
726 : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
727 { }
728
729 constexpr auto begin() const { return _M_begin; }
730 constexpr auto end() const { return _M_begin.end(); }
731
732 private:
733 struct _Iterator : private _Grapheme_cluster_iterator_base
734 {
735 private:
736 // Iterator over the underlying code points.
737 using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
738
739 public:
740 // TODO: Change value_type to be subrange<_U32_iterator> instead?
741 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
742 // That would be the whole cluster, not just the first code point.
743 // Would need to store two iterators and find end of current cluster
744 // on increment, so operator* returns value_type(_M_base, _M_next).
745 using value_type = char32_t;
746 using iterator_concept = forward_iterator_tag;
747 using difference_type = ptrdiff_t;
748
749 constexpr
750 _Iterator(_U32_iterator __i)
751 : _M_base(__i)
752 {
753 if (__i != __i.end())
754 {
755 _M_c = *__i;
756 _M_prop = __grapheme_cluster_break_property(_M_c);
757 }
758 }
759
760 // The first code point of the current extended grapheme cluster.
761 constexpr value_type
762 operator*() const
763 { return _M_c; }
764
765 constexpr auto
766 operator->() const
767 { return &_M_c; }
768
769 // Move to the next extended grapheme cluster.
770 constexpr _Iterator&
771 operator++()
772 {
773 const auto __end = _M_base.end();
774 if (_M_base != __end)
775 {
776 auto __p_prev = _M_prop;
777 auto __it = _M_base;
778 while (++__it != __end)
779 {
780 char32_t __c = *__it;
781 auto __p = __grapheme_cluster_break_property(*__it);
782 _M_update_xpicto_seq_state(__c, __p);
783 _M_update_ri_count(__p);
784 _M_update_incb_state(__c, __p);
785 if (_M_is_break(__p_prev, __p, __it))
786 {
787 // Found a grapheme cluster break
788 _M_reset(__c, __p);
789 break;
790 }
791 __p_prev = __p;
792 }
793 _M_base = __it;
794 }
795 return *this;
796 }
797
798 constexpr _Iterator
799 operator++(int)
800 {
801 auto __tmp = *this;
802 ++this;
803 return __tmp;
804 }
805
806 constexpr bool
807 operator==(const _Iterator& __i) const
808 { return _M_base == __i._M_base; }
809
810 // This supports iter != iter.end()
811 constexpr bool
812 operator==(const ranges::sentinel_t<_View>& __i) const
813 { return _M_base == __i; }
814
815 // Iterator to the start of the current cluster.
816 constexpr auto base() const { return _M_base.base(); }
817
818 // The end of the underlying view (not the end of the current cluster!)
819 constexpr auto end() const { return _M_base.end(); }
820
821 // Field width of the first code point in the cluster.
822 constexpr int
823 width() const noexcept
824 { return __field_width(_M_c); }
825
826 private:
827 _U32_iterator _M_base;
828
829 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
830 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
831 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
832 // Return true if there is a break between code point with property p1
833 // and code point with property p2.
834 constexpr bool
835 _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
836 _U32_iterator __curr) const
837 {
838 using enum _Gcb_property;
839
840 if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
841 return true; // Break after Control or LF.
842
843 if (__p1 == _Gcb_CR)
844 return __p2 != _Gcb_LF; // Do not break between a CR and LF.
845
846 // Rule GB5
847 if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
848 return true; // Break before Control, CR or LF.
849
850 // Rule GB6
851 if (__p1 == _Gcb_L)
852 switch (__p2)
853 {
854 case _Gcb_L:
855 case _Gcb_V:
856 case _Gcb_LV:
857 case _Gcb_LVT:
858 return false; // Do not break Hangul syllable sequences.
859 default:
860 return true;
861 }
862
863 // Rule GB7
864 if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
865 switch (__p2)
866 {
867 case _Gcb_V:
868 case _Gcb_T:
869 return false; // Do not break Hangul syllable sequences.
870 default:
871 return true;
872 }
873
874 // Rule GB8
875 if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
876 return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
877
878 // Rule GB9
879 if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
880 return false; // Do not break before extending characters or ZWJ.
881
882 // The following GB9x rules only apply to extended grapheme clusters,
883 // which is what the C++ standard uses (not legacy grapheme clusters).
884
885 // Rule GB9a
886 if (__p2 == _Gcb_SpacingMark)
887 return false; // Do not break before SpacingMarks,
888 // Rule GB9b
889 if (__p1 == _Gcb_Prepend)
890 return false; // or after Prepend characters.
891
892 // Rule GB9c (Unicode 15.1.0)
893 // Do not break within certain combinations with
894 // Indic_Conjunct_Break (InCB)=Linker.
895 if (_M_incb_linker_seen
896 && __incb_property(_M_c) == _InCB::_Consonant
897 && __incb_property(*__curr) == _InCB::_Consonant)
898 {
899 // Match [_M_base, __curr] against regular expression
900 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
901 bool __have_linker = false;
902 auto __it = _M_base;
903 while (++__it != __curr)
904 {
905 if (__is_incb_linker(*__it))
906 __have_linker = true;
907 else
908 {
909 auto __incb = __incb_property(*__it);
910 if (__incb == _InCB::_Consonant)
911 __have_linker = false;
912 else if (__incb != _InCB::_Extend)
913 break;
914 }
915 }
916 if (__it == __curr && __have_linker)
917 return false;
918 }
919
920 // Rule GB11
921 // Do not break within emoji modifier sequences
922 // or emoji zwj sequences.
923 if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
924 return false;
925
926 // Rules GB12 and GB13
927 // Do not break within emoji flag sequences. That is, do not break
928 // between regional indicator (RI) symbols if there is an odd number
929 // of RI characters before the break point.
930 if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
931 return (_M_RI_count & 1) == 0;
932
933 // Rule GB999
934 return true; // Otherwise, break everywhere.
935 }
936 };
937
938 _Iterator _M_begin;
939 };
940
941} // namespace __v15_1_0
942
943 // Return the field width of a string.
944 template<typename _CharT>
945 constexpr size_t
946 __field_width(basic_string_view<_CharT> __s)
947 {
948 if (__s.empty()) [[unlikely]]
949 return 0;
950 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
951 auto __it = __gc.begin();
952 const auto __end = __gc.end();
953 size_t __n = __it.width();
954 while (++__it != __end)
955 __n += __it.width();
956 return __n;
957 }
958
959 // Truncate a string to at most `__max` field width units, and return the
960 // resulting field width.
961 template<typename _CharT>
962 constexpr size_t
963 __truncate(basic_string_view<_CharT>& __s, size_t __max)
964 {
965 if (__s.empty()) [[unlikely]]
966 return 0;
967
968 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
969 auto __it = __gc.begin();
970 const auto __end = __gc.end();
971 size_t __n = __it.width();
972 if (__n > __max)
973 {
974 __s = {};
975 return 0;
976 }
977 while (++__it != __end)
978 {
979 size_t __n2 = __n + __it.width();
980 if (__n2 > __max)
981 {
982 __s = basic_string_view<_CharT>(__s.begin(), __it.base());
983 return __n;
984 }
985 __n = __n2;
986 }
987 return __n;
988 }
989
990 template<typename _CharT>
991 consteval bool
992 __literal_encoding_is_unicode()
993 {
994 if constexpr (is_same_v<_CharT, char8_t>)
995 return true;
996 else if constexpr (is_same_v<_CharT, char16_t>)
997 return true;
998 else if constexpr (is_same_v<_CharT, char32_t>)
999 return true;
1000
1001 const char* __enc = "";
1002
1003#ifdef __GNUC_EXECUTION_CHARSET_NAME
1004 auto __remove_iso10646_prefix = [](const char* __s) {
1005 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1006 if (__s[0] == 'I' || __s[0] == 'i')
1007 if (__s[1] == 'S' || __s[1] == 's')
1008 if (__s[2] == 'O' || __s[2] == 'o')
1009 if (string_view(__s + 3).starts_with("-10646/"))
1010 return __s + 10;
1011 return __s;
1012 };
1013
1014 if constexpr (is_same_v<_CharT, char>)
1015 __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1016# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1017 else
1018 __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1019# endif
1020
1021 if ((__enc[0] == 'U' || __enc[0] == 'u')
1022 && (__enc[1] == 'T' || __enc[1] == 't')
1023 && (__enc[2] == 'F' || __enc[2] == 'f'))
1024 {
1025 __enc += 3;
1026 if (__enc[0] == '-')
1027 ++__enc;
1028 if (__enc[0] == '8')
1029 return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1030 else if constexpr (!is_same_v<_CharT, char>)
1031 {
1032 string_view __s(__enc);
1033 if (__s.ends_with("//"))
1034 __s.remove_suffix(2);
1035 return __s == "16" || __s == "32";
1036 }
1037 }
1038#elif defined __clang_literal_encoding__
1039 if constexpr (is_same_v<_CharT, char>)
1040 __enc = __clang_literal_encoding__;
1041# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1042 else
1043 __enc = __clang_wide_literal_encoding__;
1044# endif
1045 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1046 string_view __s(__enc);
1047 if (__s == "UTF-8")
1048 return true;
1049 else if constexpr (!is_same_v<_CharT, char>)
1050 return __s == "UTF-16" || __s == "UTF-32";
1051#endif
1052
1053 return false;
1054 }
1055
1056 consteval bool
1057 __literal_encoding_is_utf8()
1058 { return __literal_encoding_is_unicode<char>(); }
1059
1060 consteval bool
1061 __literal_encoding_is_extended_ascii()
1062 {
1063 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1064 && 'a' == 0x61 && 'z' == 0x7a;
1065 }
1066
1067 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1068 constexpr bool
1069 __charset_alias_match(string_view __a, string_view __b)
1070 {
1071 // Map alphanumeric chars to their base 64 value, everything else to 127.
1072 auto __map = [](char __c, bool& __num) -> unsigned char {
1073 if (__c == '0') [[unlikely]]
1074 return __num ? 0 : 127;
1075 const auto __v = __detail::__from_chars_alnum_to_val(__c);
1076 __num = __v < 10;
1077 return __v;
1078 };
1079
1080 auto __ptr_a = __a.begin(), __end_a = __a.end();
1081 auto __ptr_b = __b.begin(), __end_b = __b.end();
1082 bool __num_a = false, __num_b = false;
1083
1084 while (true)
1085 {
1086 // Find the value of the next alphanumeric character in each string.
1087 unsigned char __val_a{}, __val_b{};
1088 while (__ptr_a != __end_a
1089 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1090 ++__ptr_a;
1091 while (__ptr_b != __end_b
1092 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1093 ++__ptr_b;
1094 // Stop when we reach the end of a string, or get a mismatch.
1095 if (__ptr_a == __end_a)
1096 return __ptr_b == __end_b;
1097 else if (__ptr_b == __end_b)
1098 return false;
1099 else if (__val_a != __val_b)
1100 return false; // Found non-matching characters.
1101 ++__ptr_a;
1102 ++__ptr_b;
1103 }
1104 return true;
1105 }
1106
1107} // namespace __unicode
1108
1109namespace ranges
1110{
1111 template<typename _To, typename _Range>
1112 inline constexpr bool
1113 enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1114 = enable_borrowed_range<_Range>;
1115
1116 template<typename _Range>
1117 inline constexpr bool
1118 enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1119 = enable_borrowed_range<_Range>;
1120} // namespace ranges
1121
1122_GLIBCXX_END_NAMESPACE_VERSION
1123} // namespace std
1124#endif // C++20
1125#endif // _GLIBCXX_UNICODE_H
constexpr complex< _Tp > operator*(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x times y.
Definition complex:400
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition move.h:126
constexpr _Tp && forward(typename std::remove_reference< _Tp >::type &__t) noexcept
Forward an lvalue.
Definition move.h:70
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition valarray:1243
_Tp * begin(valarray< _Tp > &__va) noexcept
Return an iterator pointing to the first element of the valarray.
Definition valarray:1221
ISO C++ entities toplevel namespace is std.
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.