ICU 78.1 78.1
Loading...
Searching...
No Matches
utfiterator.h
Go to the documentation of this file.
1// © 2024 and later: Unicode, Inc. and others.
2// License & terms of use: https://www.unicode.org/copyright.html
3
4// utfiterator.h
5// created: 2024aug12 Markus W. Scherer
6
7#ifndef __UTFITERATOR_H__
8#define __UTFITERATOR_H__
9
10#include "unicode/utypes.h"
11
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13
14#include <iterator>
15#if defined(__cpp_lib_ranges)
16#include <ranges>
17#endif
18#include <string>
19#include <string_view>
20#include <type_traits>
21#include "unicode/utf16.h"
22#include "unicode/utf8.h"
23#include "unicode/uversion.h"
24
135#ifndef U_HIDE_DRAFT_API
136
169
170namespace U_HEADER_ONLY_NAMESPACE {
171
172namespace prv {
173#if U_CPLUSPLUS_VERSION >= 20
174
// C++20 build: each prv:: helper below forwards to the standard library
// facility of the same name, so the rest of the header can use one spelling.

// Value type of Iter, taken from std::iter_value_t.
176template<typename Iter>
177using iter_value_t = typename std::iter_value_t<Iter>;
178
// Difference type of Iter, taken from std::iter_difference_t.
180template<typename Iter>
181using iter_difference_t = std::iter_difference_t<Iter>;
182
// True when Iter satisfies the std::forward_iterator concept.
184template<typename Iter>
185constexpr bool forward_iterator = std::forward_iterator<Iter>;
186
// True when Iter satisfies the std::bidirectional_iterator concept.
188template<typename Iter>
189constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;
190
// True when Range satisfies the std::ranges::range concept.
192template<typename Range>
193constexpr bool range = std::ranges::range<Range>;
194
195#else
196
// Pre-C++20 build: the same prv:: helpers, derived from std::iterator_traits.

// Value type of Iter (iterator_traits<Iter>::value_type).
198template<typename Iter>
199using iter_value_t = typename std::iterator_traits<Iter>::value_type;
200
// Difference type of Iter (iterator_traits<Iter>::difference_type).
202template<typename Iter>
203using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;
204
// True when Iter's iterator_category is, or derives from,
// std::forward_iterator_tag.
206template<typename Iter>
207constexpr bool forward_iterator =
208 std::is_base_of_v<
209 std::forward_iterator_tag,
210 typename std::iterator_traits<Iter>::iterator_category>;
211
/**
 * Pre-C++20 stand-in for the std::bidirectional_iterator concept.
 *
 * True when Iter's iterator_category is, or derives from,
 * std::bidirectional_iterator_tag; this mirrors the forward_iterator
 * fallback that precedes it.
 *
 * The listing had lost the declarator line, leaving a template header
 * followed directly by the initializer expression; it is restored here.
 */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;
218
/**
 * Pre-C++20 range detection, primary template: a type is not a range
 * unless the partial specialization below matches.
 */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/**
 * Matches when both Range::begin() and Range::end() are callable on a
 * value of type Range (detected with std::void_t).
 */
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};

/**
 * Pre-C++20 stand-in for std::ranges::range, usable as prv::range<Range>
 * exactly like the C++20 variable of the same name.
 */
template<typename Range>
constexpr bool range = range_type<Range>::value;
233
234#endif
235
/**
 * Forward iterator that counts through code point values.
 *
 * Dereferencing yields the current value by value; incrementing adds one.
 * When skipSurrogates is true, stepping onto 0xd800 jumps straight to
 * 0xe000, so the surrogate range is never produced by ++.
 *
 * @tparam CP32 32-bit integer type that holds a code point
 * @tparam skipSurrogates if true, ++ skips 0xd800..0xdfff
 */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    // Iterator boilerplate. std::iterator_traits only exposes its members
    // when all five aliases are present, so `reference` must not be omitted.
    using value_type = CP32;
    // operator*() returns by value, hence a non-reference `reference`.
    using reference = CP32;
    using pointer = CP32 *;
    using difference_type = int32_t;
    using iterator_category = std::forward_iterator_tag;

    /** Starts iteration at code point value c. */
    inline CodePointsIterator(CP32 c) : c_(c) {}

    /** Two iterators are equal when they hold the same value. */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }

    /** @return the current code point value */
    inline CP32 operator*() const { return c_; }

    /** Pre-increment: advances by one, optionally hopping over the surrogates. */
    inline CodePointsIterator &operator++() {
        ++c_;
        if (skipSurrogates && c_ == 0xd800) {
            c_ = 0xe000;
        }
        return *this;
    }

    /** Post-increment: advances and returns a copy of the previous position. */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
278
279} // namespace prv
280
// Range-like class whose begin()/end() iterate the values 0 up to (but not
// including) 0x110000 using prv::CodePointsIterator with skipSurrogates=false,
// i.e. surrogate values are included.
// NOTE(review): the class-head line (listing line 292) is absent from this
// listing, so the class name is not visible here.
291template<typename CP32>
293 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
294public:
// Iterator positioned at value 0.
302 auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
// Exclusive end iterator at value 0x110000.
307 auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
308};
309
// Range-like class whose begin()/end() iterate the values 0 up to (but not
// including) 0x110000 using prv::CodePointsIterator with skipSurrogates=true,
// so incrementing never lands on 0xd800..0xdfff.
// NOTE(review): the class-head line (listing line 323) is absent from this
// listing, so the class name is not visible here.
322template<typename CP32>
324 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
325public:
// Iterator positioned at value 0.
333 auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
// Exclusive end iterator at value 0x110000.
338 auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
339};
340
// Result of decoding one code point: the code point value, the number of
// code units it occupied, and the [start, limit) unit iterators of that
// sequence. This primary template is for unit iterators that can be stored;
// a specialization elsewhere in the file handles single-pass iterators.
// NOTE(review): the class-head line (listing line 357) is absent from this
// listing; the name UnsafeCodeUnits is taken from the constructors below.
356template<typename CP32, typename UnitIter, typename = void>
358 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
// Code unit type, i.e. the value type of UnitIter.
359 using Unit = typename prv::iter_value_t<UnitIter>;
360public:
// Stores the decoded code point, its length in code units, and the
// iterators delimiting the sequence.
362 UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
363 c_(codePoint), len_(length), start_(start), limit_(limit) {}
364
366 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
368 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
369
// Returns the stored code point value.
377 CP32 codePoint() const { return c_; }
378
// Returns the iterator to the first code unit of the sequence.
384 UnitIter begin() const { return start_; }
385
// Returns the iterator just past the last code unit of the sequence.
391 UnitIter end() const { return limit_; }
392
// Returns the number of code units in the sequence.
397 uint8_t length() const { return len_; }
398
399#if U_CPLUSPLUS_VERSION >= 20
// C++20: available only when the unit iterator is contiguous; builds the
// view from the [begin(), end()) iterator pair.
405 template<std::contiguous_iterator Iter = UnitIter>
406 std::basic_string_view<Unit> stringView() const {
407 return std::basic_string_view<Unit>(begin(), end());
408 }
409#else
// C++17: enabled only for raw pointers and the (const_)iterator types of
// std::basic_string / std::basic_string_view; builds the view from the
// address of the first unit plus the stored length.
415 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
416 std::enable_if_t<std::is_pointer_v<Iter> ||
417 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
418 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
419 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
420 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
421 std::basic_string_view<Unit>>
422 stringView() const {
423 return std::basic_string_view<Unit>(&*start_, len_);
424 }
425#endif
426
427private:
428 // Order of fields with padding and access frequency in mind.
429 CP32 c_;
430 uint8_t len_;
431 UnitIter start_;
432 UnitIter limit_;
433};
434
435#ifndef U_IN_DOXYGEN
436// Partial template specialization for single-pass input iterator.
437// No UnitIter field, no getter for it, no stringView().
// Selected when prv::forward_iterator<UnitIter> is false. Only the code
// point and its length are kept; there are no iterator members, so begin(),
// end() and stringView() do not exist in this specialization.
438template<typename CP32, typename UnitIter>
439class UnsafeCodeUnits<
440 CP32,
441 UnitIter,
442 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
443 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
444public:
// Stores the decoded code point and its length in code units.
445 UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
446
447 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
448 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
449
// Returns the stored code point value.
450 CP32 codePoint() const { return c_; }
451
// Returns the number of code units that were read for this code point.
452 uint8_t length() const { return len_; }
453
454private:
455 // Order of fields with padding and access frequency in mind.
456 CP32 c_;
457 uint8_t len_;
458};
459#endif // U_IN_DOXYGEN
460
// Extends UnsafeCodeUnits with a flag recording whether the code unit
// sequence was well-formed. This primary template carries the start/limit
// iterators inherited from the base class.
476template<typename CP32, typename UnitIter, typename = void>
477class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
478public:
// Forwards code point, length and iterators to the base class and stores
// the well-formedness flag.
480 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
481 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
482
484 CodeUnits(const CodeUnits &other) = default;
486 CodeUnits &operator=(const CodeUnits &other) = default;
487
// Returns the flag passed to the constructor.
492 bool wellFormed() const { return ok_; }
493
494private:
495 bool ok_;
496};
497
498#ifndef U_IN_DOXYGEN
499// Partial template specialization for single-pass input iterator.
500// No UnitIter field, no getter for it, no stringView().
// Selected when prv::forward_iterator<UnitIter> is false. Same as the primary
// template but constructed without iterators, matching the iterator-less
// UnsafeCodeUnits base specialization.
501template<typename CP32, typename UnitIter>
502class CodeUnits<
503 CP32,
504 UnitIter,
505 std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
506 public UnsafeCodeUnits<CP32, UnitIter> {
507public:
// Forwards code point and length to the base class and stores the
// well-formedness flag.
508 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
509 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
510
511 CodeUnits(const CodeUnits &other) = default;
512 CodeUnits &operator=(const CodeUnits &other) = default;
513
// Returns the flag passed to the constructor.
514 bool wellFormed() const { return ok_; }
515
516private:
517 bool ok_;
518};
519#endif // U_IN_DOXYGEN
520
521// Validating implementations ---------------------------------------------- ***
522
523#ifndef U_IN_DOXYGEN
// Primary template of the validating implementation; declared only.
// The partial specializations that follow use the trailing defaulted
// parameter with std::enable_if_t to select on the code unit size (1, 2 or 4).
524template<typename CP32, UTFIllFormedBehavior behavior,
525 typename UnitIter, typename LimitIter = UnitIter, typename = void>
526class UTFImpl;
527
528// Note: readAndInc() functions take both a p0 and a p iterator.
529// They must have the same value.
530// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
531// and readAndInc() copies p0 and the incremented p into the CodeUnits.
532// For a single-pass UnitIter, which may not be default-constructible nor copyable,
533// the caller can pass p into both references, and readAndInc() does not use p0
534// and constructs CodeUnits without them.
535// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
536// which may not be possible for a single-pass iterator.
537
538// UTF-8
// Validating UTF-8 implementation: selected when the unit iterator's value
// type is one byte wide. All members are static; callers pass the iterators.
// NOTE(review): this listing skips source lines 551, 590, 599 and 679-680
// (see the gaps in the numbering), so a few expressions below are incomplete
// as shown.
539template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
540class UTFImpl<
541 CP32, behavior,
542 UnitIter, LimitIter,
543 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
544 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
545 static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
546 "For 8-bit strings, the SURROGATE option does not have an equivalent.");
547public:
// Returns the code point value reported for an ill-formed sequence,
// chosen by the `behavior` template argument (0xfffd for UTF_BEHAVIOR_FFFD).
548 // Handle ill-formed UTF-8
549 U_FORCE_INLINE static CP32 sub() {
550 switch (behavior) {
// NOTE(review): listing line 551 (presumably another case label) is missing here.
552 case UTF_BEHAVIOR_FFFD: return 0xfffd;
553 }
554 }
555
// Advances p past one code point or one ill-formed subsequence without
// decoding it. Always consumes the first byte; further bytes are consumed
// only while they validate and p has not reached limit.
556 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
557 // Very similar to U8_FWD_1().
558 uint8_t b = *p;
559 ++p;
560 if (U8_IS_LEAD(b) && p != limit) {
561 uint8_t t1 = *p;
562 if ((0xe0 <= b && b < 0xf0)) {
563 if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
564 ++p != limit && U8_IS_TRAIL(*p)) {
565 ++p;
566 }
567 } else if (b < 0xe0) {
568 if (U8_IS_TRAIL(t1)) {
569 ++p;
570 }
571 } else /* b >= 0xf0 */ {
572 if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
573 ++p != limit && U8_IS_TRAIL(*p) &&
574 ++p != limit && U8_IS_TRAIL(*p)) {
575 ++p;
576 }
577 }
578 }
579 }
580
// Moves p back over one code point or one ill-formed subsequence without
// decoding it. p is always decremented once; it moves further back (to p1)
// only when a matching lead byte is found within the previous three bytes
// and not before start.
581 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
582 // Very similar to U8_BACK_1().
583 uint8_t c = *--p;
584 if (U8_IS_TRAIL(c) && p != start) {
585 UnitIter p1 = p;
586 uint8_t b1 = *--p1;
587 if (U8_IS_LEAD(b1)) {
588 if (b1 < 0xe0 ||
589 (b1 < 0xf0 ?
// NOTE(review): listing line 590 (the true-branch of this ?:) is missing here.
591 U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
592 p = p1;
593 return;
594 }
595 } else if (U8_IS_TRAIL(b1) && p1 != start) {
596 uint8_t b2 = *--p1;
597 if (0xe0 <= b2 && b2 <= 0xf4) {
598 if (b2 < 0xf0 ?
// NOTE(review): listing line 599 (the true-branch of this ?:) is missing here.
600 U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
601 p = p1;
602 return;
603 }
604 } else if (U8_IS_TRAIL(b2) && p1 != start) {
605 uint8_t b3 = *--p1;
606 if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
607 p = p1;
608 return;
609 }
610 }
611 }
612 }
613 }
614
// Decodes the code point at p and advances p past it.
// For a multi-pass UnitIter the result carries [p0, p) as the unit range;
// otherwise the iterator-less CodeUnits constructor is used and p0 is unused.
// On ill-formed input returns sub() with wellFormed=false and the number of
// bytes consumed so far as the length.
615 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
616 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
617 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
618 // Very similar to U8_NEXT_OR_FFFD().
619 CP32 c = uint8_t(*p);
620 ++p;
621 if (U8_IS_SINGLE(c)) {
622 if constexpr (isMultiPass) {
623 return {c, 1, true, p0, p};
624 } else {
625 return {c, 1, true};
626 }
627 }
628 uint8_t length = 1;
629 uint8_t t = 0;
// One large condition: each accepted trail byte is folded into c and
// counted in length as a side effect of evaluating the expression.
630 if (p != limit &&
631 // fetch/validate/assemble all but last trail byte
632 (c >= 0xe0 ?
633 (c < 0xf0 ? // U+0800..U+FFFF except surrogates
634 U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
635 (t &= 0x3f, 1)
636 : // U+10000..U+10FFFF
637 (c -= 0xf0) <= 4 &&
638 U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
639 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
640 (t = *p - 0x80) <= 0x3f) &&
641 // valid second-to-last trail byte
642 (c = (c << 6) | t, ++length, ++p != limit)
643 : // U+0080..U+07FF
644 c >= 0xc2 && (c &= 0x1f, 1)) &&
645 // last trail byte
646 (t = *p - 0x80) <= 0x3f) {
647 c = (c << 6) | t;
648 ++length;
649 ++p;
650 if constexpr (isMultiPass) {
651 return {c, length, true, p0, p};
652 } else {
653 return {c, length, true};
654 }
655 }
656 if constexpr (isMultiPass) {
657 return {sub(), length, false, p0, p};
658 } else {
659 return {sub(), length, false};
660 }
661 }
662
// Moves p back over one code point (never before start) and decodes it.
// The returned unit range is [p, p0) where p0 is the position on entry.
// Truncated or otherwise ill-formed sequences return sub() with
// wellFormed=false.
663 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
664 // Very similar to U8_PREV_OR_FFFD().
665 UnitIter p0 = p;
666 CP32 c = uint8_t(*--p);
667 if (U8_IS_SINGLE(c)) {
668 return {c, 1, true, p, p0};
669 }
670 if (U8_IS_TRAIL(c) && p != start) {
671 UnitIter p1 = p;
672 uint8_t b1 = *--p1;
673 if (U8_IS_LEAD(b1)) {
674 if (b1 < 0xe0) {
675 p = p1;
676 c = ((b1 - 0xc0) << 6) | (c & 0x3f);
677 return {c, 2, true, p, p0};
678 } else if (b1 < 0xf0 ?
// NOTE(review): listing lines 679-680 (rest of this condition) are missing here.
681 // Truncated 3- or 4-byte sequence.
682 p = p1;
683 return {sub(), 2, false, p, p0};
684 }
685 } else if (U8_IS_TRAIL(b1) && p1 != start) {
686 // Extract the value bits from the last trail byte.
687 c &= 0x3f;
688 uint8_t b2 = *--p1;
689 if (0xe0 <= b2 && b2 <= 0xf4) {
690 if (b2 < 0xf0) {
691 b2 &= 0xf;
692 if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
693 p = p1;
694 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
695 return {c, 3, true, p, p0};
696 }
697 } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
698 // Truncated 4-byte sequence.
699 p = p1;
700 return {sub(), 3, false, p, p0};
701 }
702 } else if (U8_IS_TRAIL(b2) && p1 != start) {
703 uint8_t b3 = *--p1;
704 if (0xf0 <= b3 && b3 <= 0xf4) {
705 b3 &= 7;
706 if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
707 p = p1;
708 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
709 return {c, 4, true, p, p0};
710 }
711 }
712 }
713 }
714 }
// Fallback: a single ill-formed byte.
715 return {sub(), 1, false, p, p0};
716 }
717};
718
719// UTF-16
// Validating UTF-16 implementation: selected when the unit iterator's value
// type is two bytes wide. All members are static.
720template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
721class UTFImpl<
722 CP32, behavior,
723 UnitIter, LimitIter,
724 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
725 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
726public:
// Returns the value reported for an unpaired surrogate, chosen by `behavior`:
// 0xfffd for UTF_BEHAVIOR_FFFD, the surrogate itself for UTF_BEHAVIOR_SURROGATE.
727 // Handle ill-formed UTF-16: One unpaired surrogate.
728 U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
729 switch (behavior) {
// NOTE(review): listing line 730 (presumably another case label) is missing here.
731 case UTF_BEHAVIOR_FFFD: return 0xfffd;
732 case UTF_BEHAVIOR_SURROGATE: return surrogate;
733 }
734 }
735
// Advances p by one unit, or by two when a lead surrogate is followed
// (before limit) by a trail surrogate.
736 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
737 // Very similar to U16_FWD_1().
738 auto c = *p;
739 ++p;
740 if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
741 ++p;
742 }
743 }
744
// Moves p back by one unit, or by two when the unit stepped over is a trail
// surrogate preceded (not before start) by a lead surrogate.
745 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
746 // Very similar to U16_BACK_1().
747 UnitIter p1;
748 if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
749 p = p1;
750 }
751 }
752
// Decodes the code point at p and advances p past it.
// Multi-pass iterators get [p0, p) stored in the result; single-pass ones
// use the iterator-less constructor. An unpaired surrogate yields
// sub(c) with wellFormed=false and length 1.
753 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
754 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
755 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
756 // Very similar to U16_NEXT_OR_FFFD().
757 CP32 c = static_cast<CP32>(*p);
758 ++p;
759 if (!U16_IS_SURROGATE(c)) {
760 if constexpr (isMultiPass) {
761 return {c, 1, true, p0, p};
762 } else {
763 return {c, 1, true};
764 }
765 } else {
766 uint16_t c2;
767 if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
768 ++p;
769 c = U16_GET_SUPPLEMENTARY(c, c2);
770 if constexpr (isMultiPass) {
771 return {c, 2, true, p0, p};
772 } else {
773 return {c, 2, true};
774 }
775 } else {
776 if constexpr (isMultiPass) {
777 return {sub(c), 1, false, p0, p};
778 } else {
779 return {sub(c), 1, false};
780 }
781 }
782 }
783 }
784
// Moves p back over one code point (never before start) and decodes it.
// The returned unit range is [p, p0) where p0 is the position on entry.
785 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
786 // Very similar to U16_PREV_OR_FFFD().
787 UnitIter p0 = p;
788 CP32 c = static_cast<CP32>(*--p);
789 if (!U16_IS_SURROGATE(c)) {
790 return {c, 1, true, p, p0};
791 } else {
792 UnitIter p1;
793 uint16_t c2;
794 if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
795 p = p1;
796 c = U16_GET_SUPPLEMENTARY(c2, c);
797 return {c, 2, true, p, p0};
798 } else {
799 return {sub(c), 1, false, p, p0};
800 }
801 }
802 }
803};
804
805// UTF-32: trivial, but still validating
// Validating UTF-32 implementation: selected when the unit iterator's value
// type is four bytes wide. Every code point is exactly one unit, so only the
// value range needs checking.
806template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
807class UTFImpl<
808 CP32, behavior,
809 UnitIter, LimitIter,
810 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
811 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
812public:
// Returns the value reported for an invalid unit, chosen by `behavior`.
// With UTF_BEHAVIOR_SURROGATE the original value is kept only when
// forSurrogate is true; otherwise 0xfffd is returned.
813 // Handle ill-formed UTF-32
814 U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
815 switch (behavior) {
// NOTE(review): listing line 816 (presumably another case label) is missing here.
817 case UTF_BEHAVIOR_FFFD: return 0xfffd;
818 case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
819 }
820 }
821
// Advances p by exactly one unit; limit is not consulted.
822 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
823 ++p;
824 }
825
// Moves p back by exactly one unit; start is not consulted.
826 U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
827 --p;
828 }
829
// Reads the unit at p and advances p. The unit is well-formed when it is
// below 0xd800 or within 0xe000..0x10ffff; anything else yields sub(...)
// with wellFormed=false, where "uc < 0xe000" identifies a surrogate value.
830 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
831 UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
832 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
833 uint32_t uc = *p;
834 CP32 c = uc;
835 ++p;
836 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
837 if constexpr (isMultiPass) {
838 return {c, 1, true, p0, p};
839 } else {
840 return {c, 1, true};
841 }
842 } else {
843 if constexpr (isMultiPass) {
844 return {sub(uc < 0xe000, c), 1, false, p0, p};
845 } else {
846 return {sub(uc < 0xe000, c), 1, false};
847 }
848 }
849 }
850
// Moves p back one unit and reads it, applying the same validity test as
// readAndInc(). The returned unit range is [p, p0).
851 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
852 UnitIter p0 = p;
853 uint32_t uc = *--p;
854 CP32 c = uc;
855 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
856 return {c, 1, true, p, p0};
857 } else {
858 return {sub(uc < 0xe000, c), 1, false, p, p0};
859 }
860 }
861};
862
863// Non-validating implementations ------------------------------------------ ***
864
// Primary template of the non-validating implementation; declared only.
// The partial specializations that follow select on the code unit size
// through the trailing defaulted parameter.
865template<typename CP32, typename UnitIter, typename = void>
866class UnsafeUTFImpl;
867
868// UTF-8
// Non-validating UTF-8 implementation: selected when the unit iterator's
// value type is one byte wide. None of these functions take a start or limit
// and none of them check byte values for validity or stay within bounds.
869template<typename CP32, typename UnitIter>
870class UnsafeUTFImpl<
871 CP32,
872 UnitIter,
873 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
874 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
875public:
// Advances p by 1 plus the trail-byte count that
// U8_COUNT_TRAIL_BYTES_UNSAFE derives from the first byte.
876 U_FORCE_INLINE static void inc(UnitIter &p) {
877 // Very similar to U8_FWD_1_UNSAFE().
878 uint8_t b = *p;
879 std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
880 }
881
// Moves p back until it rests on a byte that is not a trail byte.
882 U_FORCE_INLINE static void dec(UnitIter &p) {
883 // Very similar to U8_BACK_1_UNSAFE().
884 while (U8_IS_TRAIL(*--p)) {}
885 }
886
// Decodes the code point at p and advances p past it. The sequence length
// (1 to 4) is decided from the first byte alone. Multi-pass iterators get
// [p0, p) stored in the result; single-pass ones get only value and length.
887 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
888 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
889 // Very similar to U8_NEXT_UNSAFE().
890 CP32 c = uint8_t(*p);
891 ++p;
892 if (U8_IS_SINGLE(c)) {
893 if constexpr (isMultiPass) {
894 return {c, 1, p0, p};
895 } else {
896 return {c, 1};
897 }
898 } else if (c < 0xe0) {
899 c = ((c & 0x1f) << 6) | (*p & 0x3f);
900 ++p;
901 if constexpr (isMultiPass) {
902 return {c, 2, p0, p};
903 } else {
904 return {c, 2};
905 }
906 } else if (c < 0xf0) {
907 // No need for (c&0xf) because the upper bits are truncated
908 // after <<12 in the cast to uint16_t.
909 c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
910 ++p;
911 c |= *p & 0x3f;
912 ++p;
913 if constexpr (isMultiPass) {
914 return {c, 3, p0, p};
915 } else {
916 return {c, 3};
917 }
918 } else {
919 c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
920 ++p;
921 c |= (*p & 0x3f) << 6;
922 ++p;
923 c |= *p & 0x3f;
924 ++p;
925 if constexpr (isMultiPass) {
926 return {c, 4, p0, p};
927 } else {
928 return {c, 4};
929 }
930 }
931 }
932
// Moves p back over one code point and decodes it. Walks backwards,
// accumulating six value bits per byte, until a byte >= 0xc0 (a lead byte)
// is found. The returned unit range is [p, p0).
933 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
934 // Very similar to U8_PREV_UNSAFE().
935 UnitIter p0 = p;
936 CP32 c = uint8_t(*--p);
937 if (U8_IS_SINGLE(c)) {
938 return {c, 1, p, p0};
939 }
940 // U8_IS_TRAIL(c) if well-formed
941 c &= 0x3f;
// count = trail bytes seen so far; shift = bit position for the next byte.
942 uint8_t count = 1;
943 for (uint8_t shift = 6;;) {
944 uint8_t b = *--p;
945 if (b >= 0xc0) {
946 U8_MASK_LEAD_BYTE(b, count);
947 c |= uint32_t{b} << shift;
948 break;
949 } else {
950 c |= (uint32_t{b} & 0x3f) << shift;
951 ++count;
952 shift += 6;
953 }
954 }
// Include the lead byte in the reported length.
955 ++count;
956 return {c, count, p, p0};
957 }
958};
959
960// UTF-16
// Non-validating UTF-16 implementation: selected when the unit iterator's
// value type is two bytes wide. A lead (or, going backwards, trail)
// surrogate is taken to be half of a pair without checking the other unit.
961template<typename CP32, typename UnitIter>
962class UnsafeUTFImpl<
963 CP32,
964 UnitIter,
965 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
966 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
967public:
// Advances p by one unit, or two when the first unit is a lead surrogate.
968 U_FORCE_INLINE static void inc(UnitIter &p) {
969 // Very similar to U16_FWD_1_UNSAFE().
970 auto c = *p;
971 ++p;
972 if (U16_IS_LEAD(c)) {
973 ++p;
974 }
975 }
976
// Moves p back by one unit, or two when the unit stepped over is a trail
// surrogate.
977 U_FORCE_INLINE static void dec(UnitIter &p) {
978 // Very similar to U16_BACK_1_UNSAFE().
979 if (U16_IS_TRAIL(*--p)) {
980 --p;
981 }
982 }
983
// Decodes the code point at p and advances p past it. Multi-pass iterators
// get [p0, p) stored in the result; single-pass ones get only value and
// length.
984 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
985 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
986 // Very similar to U16_NEXT_UNSAFE().
987 CP32 c = static_cast<CP32>(*p);
988 ++p;
989 if (!U16_IS_LEAD(c)) {
990 if constexpr (isMultiPass) {
991 return {c, 1, p0, p};
992 } else {
993 return {c, 1};
994 }
995 } else {
996 uint16_t c2 = *p;
997 ++p;
998 c = U16_GET_SUPPLEMENTARY(c, c2);
999 if constexpr (isMultiPass) {
1000 return {c, 2, p0, p};
1001 } else {
1002 return {c, 2};
1003 }
1004 }
1005 }
1006
// Moves p back over one code point and decodes it. The returned unit range
// is [p, p0) where p0 is the position on entry.
1007 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1008 // Very similar to U16_PREV_UNSAFE().
1009 UnitIter p0 = p;
1010 CP32 c = static_cast<CP32>(*--p);
1011 if (!U16_IS_TRAIL(c)) {
1012 return {c, 1, p, p0};
1013 } else {
1014 uint16_t c2 = *--p;
1015 c = U16_GET_SUPPLEMENTARY(c2, c);
1016 return {c, 2, p, p0};
1017 }
1018 }
1019};
1020
1021// UTF-32: trivial
// Non-validating UTF-32 implementation: selected when the unit iterator's
// value type is four bytes wide. Each unit is returned as-is as one code
// point; no range check is performed.
1022template<typename CP32, typename UnitIter>
1023class UnsafeUTFImpl<
1024 CP32,
1025 UnitIter,
1026 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1027 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1028public:
// Advances p by one unit.
1029 U_FORCE_INLINE static void inc(UnitIter &p) {
1030 ++p;
1031 }
1032
// Moves p back by one unit.
1033 U_FORCE_INLINE static void dec(UnitIter &p) {
1034 --p;
1035 }
1036
// Reads the unit at p and advances p. Multi-pass iterators get [p0, p)
// stored in the result; single-pass ones get only value and length 1.
1037 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1038 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1039 CP32 c = *p;
1040 ++p;
1041 if constexpr (isMultiPass) {
1042 return {c, 1, p0, p};
1043 } else {
1044 return {c, 1};
1045 }
1046 }
1047
// Moves p back one unit and reads it. The returned unit range is [p, p0).
1048 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1049 UnitIter p0 = p;
1050 CP32 c = *--p;
1051 return {c, 1, p, p0};
1052 }
1053};
1054
1055#endif
1056
1057// Validating iterators ---------------------------------------------------- ***
1058
// Validating code point iterator over a code unit iterator (multi-pass
// version). It caches the most recently decoded CodeUnits in units_ and
// tracks in state_ whether p_ is at, ahead of, or at the start of that
// cached sequence (see the member comments at the bottom of the class).
// NOTE(review): this listing skips several source lines, including the
// class head (1084), some type aliases, the Proxy data member (1096), the
// assignment operators, and the signature lines of three member functions
// (1250, 1267, 1282). The name UTFIterator is taken from the constructors.
1082template<typename CP32, UTFIllFormedBehavior behavior,
1083 typename UnitIter, typename LimitIter = UnitIter, typename = void>
1085 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1086 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1087
1088 // Proxy type for operator->() (required by LegacyInputIterator)
1089 // so that we don't promise always returning CodeUnits.
1090 class Proxy {
1091 public:
1092 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1093 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1094 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1095 private:
// NOTE(review): the units_ member declaration (listing line 1096) is missing here.
1097 };
1098
1099public:
1105 using pointer = Proxy;
// Bidirectional when the unit iterator is bidirectional, otherwise forward.
1109 using iterator_category = std::conditional_t<
1110 prv::bidirectional_iterator<UnitIter>,
1111 std::bidirectional_iterator_tag,
1112 std::forward_iterator_tag>;
1113
// Constructors: each stores the current position p_, the start_ and limit_
// bounds used for validation, and an empty placeholder in units_.
// (start, p, limit): explicit bounds with the current position p.
1127 U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1128 p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
// (p, limit): p is both the start and the current position.
1140 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1141 p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
// (p): start, position and limit are all p.
1153 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
// Default: value-initialized iterators.
1159 U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1160
1162 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1165
1167 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1170
// Two iterators are equal when their logical positions are equal,
// regardless of whether either one has read ahead.
1176 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1177 return getLogicalPosition() == other.getLogicalPosition();
1178 }
1184 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1185
1186 // Asymmetric equality & nonequality with a sentinel type.
1187
// Compares the logical position with a sentinel. Disabled when Sentinel is
// UTFIterator itself or the UnitIter type.
1194 template<typename Sentinel> U_FORCE_INLINE friend
1195 std::enable_if_t<
1196 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1197 bool>
1198 operator==(const UTFIterator &iter, const Sentinel &s) {
1199 return iter.getLogicalPosition() == s;
1200 }
1201
1202#if U_CPLUSPLUS_VERSION < 20
1203 // C++17: Need to define all four combinations of == / != vs. parameter order.
1204 // Once we require C++20, we could remove all but the first == because
1205 // the compiler would generate the rest.
1206
1213 template<typename Sentinel> U_FORCE_INLINE friend
1214 std::enable_if_t<
1215 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1216 bool>
1217 operator==(const Sentinel &s, const UTFIterator &iter) {
1218 return iter.getLogicalPosition() == s;
1219 }
1226 template<typename Sentinel> U_FORCE_INLINE friend
1227 std::enable_if_t<
1228 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1229 bool>
1230 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1237 template<typename Sentinel> U_FORCE_INLINE friend
1238 std::enable_if_t<
1239 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1240 bool>
1241 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1242#endif // C++17
1243
// NOTE(review): the signature line (listing line 1250) is missing; the body
// returns units_, so this is presumably operator*().
// Decodes lazily: only when state_ == 0 is readAndInc() called, after which
// p_ is ahead of the logical position and state_ becomes 1.
1251 if (state_ == 0) {
1252 UnitIter p0 = p_;
1253 units_ = Impl::readAndInc(p0, p_, limit_);
1254 state_ = 1;
1255 }
1256 return units_;
1257 }
1258
// NOTE(review): the signature line (listing line 1267) is missing; the body
// returns Proxy(units_), so this is presumably operator->().
1268 if (state_ == 0) {
1269 UnitIter p0 = p_;
1270 units_ = Impl::readAndInc(p0, p_, limit_);
1271 state_ = 1;
1272 }
1273 return Proxy(units_);
1274 }
1275
// NOTE(review): the signature line (listing line 1282) is missing; the body
// returns *this and its comments refer to operator*() / operator--(), so
// this is presumably the pre-increment operator++().
1283 if (state_ > 0) {
1284 // operator*() called readAndInc() so p_ is already ahead.
1285 state_ = 0;
1286 } else if (state_ == 0) {
1287 Impl::inc(p_, limit_);
1288 } else /* state_ < 0 */ {
1289 // operator--() called decAndRead() so we know how far to skip.
1290 p_ = units_.end();
1291 state_ = 0;
1292 }
1293 return *this;
1294 }
1295
// Post-increment: returns a copy positioned at the old logical position and
// leaves *this at the next code point with state_ == 0. When nothing was
// cached yet, the code point is decoded first so the returned copy already
// holds it (result.state_ = 1).
1304 U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1305 if (state_ > 0) {
1306 // operator*() called readAndInc() so p_ is already ahead.
1307 UTFIterator result(*this);
1308 state_ = 0;
1309 return result;
1310 } else if (state_ == 0) {
1311 UnitIter p0 = p_;
1312 units_ = Impl::readAndInc(p0, p_, limit_);
1313 UTFIterator result(*this);
1314 result.state_ = 1;
1315 // keep this->state_ == 0
1316 return result;
1317 } else /* state_ < 0 */ {
1318 UTFIterator result(*this);
1319 // operator--() called decAndRead() so we know how far to skip.
1320 p_ = units_.end();
1321 state_ = 0;
1322 return result;
1323 }
1324 }
1325
// Pre-decrement, available only for bidirectional unit iterators.
// Rewinds p_ to the logical position if it had read ahead, then decodes the
// preceding code point; afterwards p_ is at the start of units_ (state_ = -1).
1333 template<typename Iter = UnitIter>
1335 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1336 operator--() { // pre-decrement
1337 if (state_ > 0) {
1338 // operator*() called readAndInc() so p_ is ahead of the logical position.
1339 p_ = units_.begin();
1340 }
1341 units_ = Impl::decAndRead(start_, p_);
1342 state_ = -1;
1343 return *this;
1344 }
1345
// Post-decrement, available only for bidirectional unit iterators:
// copies, pre-decrements *this, and returns the copy.
1353 template<typename Iter = UnitIter>
1355 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1356 operator--(int) { // post-decrement
1357 UTFIterator result(*this);
1358 operator--();
1359 return result;
1360 }
1361
1362private:
1363 friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1364
// The position this iterator logically refers to: p_ itself unless
// readAndInc() has moved p_ ahead, in which case it is the start of the
// cached units_.
1365 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1366 return state_ <= 0 ? p_ : units_.begin();
1367 }
1368
1369 // operator*() etc. are logically const.
1370 mutable UnitIter p_;
1371 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1372 // (forward or backward) we can test if there are enough code units.
1373 UnitIter start_;
1374 LimitIter limit_;
1375 // Keep state so that we call readAndInc() only once for both operator*() and ++
1376 // to make it easy for the compiler to optimize.
1377 mutable CodeUnits<CP32, UnitIter> units_;
1378 // >0: units_ = readAndInc(), p_ = units limit
1379 // which means that p_ is ahead of its logical position
1380 // 0: initial state
1381 // <0: units_ = decAndRead(), p_ = units start
1382 mutable int8_t state_ = 0;
1383};
1384
1385#ifndef U_IN_DOXYGEN
1386// Partial template specialization for single-pass input iterator.
// Validating code point iterator for single-pass unit iterators (selected
// when prv::forward_iterator<UnitIter> is false). It is an input iterator:
// there is no start_, no decrement, and each code point is read exactly once
// through readAndInc(), with ahead_ recording whether that has happened.
1387template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1388class UTFIterator<
1389 CP32, behavior,
1390 UnitIter, LimitIter,
1391 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1392 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1393 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1394
1395 // Proxy type for post-increment return value, to make *iter++ work.
1396 // Also for operator->() (required by LegacyInputIterator)
1397 // so that we don't promise always returning CodeUnits.
1398 class Proxy {
1399 public:
// Copies the given CodeUnits; the proxy owns its own value.
1400 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1401 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1402 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1403 private:
1404 CodeUnits<CP32, UnitIter> units_;
1405 };
1406
1407public:
// Iterator boilerplate; dereferencing returns CodeUnits by value.
1408 using value_type = CodeUnits<CP32, UnitIter>;
1409 using reference = value_type;
1410 using pointer = Proxy;
1411 using difference_type = prv::iter_difference_t<UnitIter>;
1412 using iterator_category = std::input_iterator_tag;
1413
// Takes ownership of the unit iterator and limit by moving them in.
1414 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1415
1416 // Constructs an iterator start or limit sentinel.
1417 // Requires p to be copyable.
1418 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1419
1420 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1421 U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1422
1423 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1424 U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1425
// Equal when both the underlying position and the read-ahead flag match.
1426 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1427 return p_ == other.p_ && ahead_ == other.ahead_;
1428 // Strictly speaking, we should check if the logical position is the same.
1429 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1430 }
1431 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1432
// Sentinel comparison: an iterator that has read ahead still has a pending
// code point, so it never compares equal to the sentinel.
1433 template<typename Sentinel> U_FORCE_INLINE friend
1434 std::enable_if_t<
1435 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1436 bool>
1437 operator==(const UTFIterator &iter, const Sentinel &s) {
1438 return !iter.ahead_ && iter.p_ == s;
1439 }
1440
// Before C++20 the reversed == and both != overloads must be spelled out.
1441#if U_CPLUSPLUS_VERSION < 20
1442 template<typename Sentinel> U_FORCE_INLINE friend
1443 std::enable_if_t<
1444 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1445 bool>
1446 operator==(const Sentinel &s, const UTFIterator &iter) {
1447 return !iter.ahead_ && iter.p_ == s;
1448 }
1449
1450 template<typename Sentinel> U_FORCE_INLINE friend
1451 std::enable_if_t<
1452 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1453 bool>
1454 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1455
1456 template<typename Sentinel> U_FORCE_INLINE friend
1457 std::enable_if_t<
1458 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1459 bool>
1460 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1461#endif // C++17
1462
// Returns the current code point's CodeUnits by value, decoding it on first
// use. p_ is passed for both readAndInc() iterator parameters because a
// single-pass iterator cannot be copied into a separate start variable.
1463 U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1464 if (!ahead_) {
1465 units_ = Impl::readAndInc(p_, p_, limit_);
1466 ahead_ = true;
1467 }
1468 return units_;
1469 }
1470
// Same lazy decode as operator*(), returned wrapped in a Proxy.
1471 U_FORCE_INLINE Proxy operator->() const {
1472 if (!ahead_) {
1473 units_ = Impl::readAndInc(p_, p_, limit_);
1474 ahead_ = true;
1475 }
1476 return Proxy(units_);
1477 }
1478
// Pre-increment: if the code point was already read, just clear the flag;
// otherwise skip it without decoding.
1479 U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1480 if (ahead_) {
1481 // operator*() called readAndInc() so p_ is already ahead.
1482 ahead_ = false;
1483 } else {
1484 Impl::inc(p_, limit_);
1485 }
1486 return *this;
1487 }
1488
// Post-increment: returns a Proxy holding the code point that was current
// before the increment, reading it now if it had not been read yet.
1489 U_FORCE_INLINE Proxy operator++(int) { // post-increment
1490 if (ahead_) {
1491 // operator*() called readAndInc() so p_ is already ahead.
1492 ahead_ = false;
1493 } else {
1494 units_ = Impl::readAndInc(p_, p_, limit_);
1495 // keep this->ahead_ == false
1496 }
1497 return Proxy(units_);
1498 }
1499
1500private:
1501 // operator*() etc. are logically const.
1502 mutable UnitIter p_;
1503 // In a validating iterator, we need limit_ so that when we read a code point
1504 // we can test if there are enough code units.
1505 LimitIter limit_;
1506 // Keep state so that we call readAndInc() only once for both operator*() and ++
1507 // so that we can use a single-pass input iterator for UnitIter.
1508 mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1509 // true: units_ = readAndInc(), p_ = units limit
1510 // which means that p_ is ahead of its logical position
1511 // false: initial state
1512 mutable bool ahead_ = false;
1513};
1514#endif // U_IN_DOXYGEN
1515
1516} // namespace U_HEADER_ONLY_NAMESPACE
1517
1518#ifndef U_IN_DOXYGEN
1519// Bespoke specialization of reverse_iterator.
1520// The default implementation implements reverse operator*() and ++ in a way
1521// that does most of the same work twice for reading variable-length sequences.
1522template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1523class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1524 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1525 using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1527
1528 // Proxy type for operator->() (required by LegacyInputIterator)
1529 // so that we don't promise always returning CodeUnits.
1530 class Proxy {
1531 public:
1532 explicit Proxy(CodeUnits_ units) : units_(units) {}
1533 CodeUnits_ &operator*() { return units_; }
1534 CodeUnits_ *operator->() { return &units_; }
1535 private:
1536 CodeUnits_ units_;
1537 };
1538
1539public:
1540 using value_type = CodeUnits_;
1541 using reference = value_type;
1542 using pointer = Proxy;
1544 using iterator_category = std::bidirectional_iterator_tag;
1545
1547 p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1548 units_(0, 0, false, p_, p_) {}
1549 U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1550
1551 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1552 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1553
1554 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1555 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1556
1557 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1558 return getLogicalPosition() == other.getLogicalPosition();
1559 }
1560 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1561
1562 U_FORCE_INLINE CodeUnits_ operator*() const {
1563 if (state_ == 0) {
1564 units_ = Impl::decAndRead(start_, p_);
1565 state_ = -1;
1566 }
1567 return units_;
1568 }
1569
1570 U_FORCE_INLINE Proxy operator->() const {
1571 if (state_ == 0) {
1572 units_ = Impl::decAndRead(start_, p_);
1573 state_ = -1;
1574 }
1575 return Proxy(units_);
1576 }
1577
1578 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1579 if (state_ < 0) {
1580 // operator*() called decAndRead() so p_ is already behind.
1581 state_ = 0;
1582 } else if (state_ == 0) {
1583 Impl::dec(start_, p_);
1584 } else /* state_ > 0 */ {
1585 // operator--() called readAndInc() so we know how far to skip.
1586 p_ = units_.begin();
1587 state_ = 0;
1588 }
1589 return *this;
1590 }
1591
1592 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1593 if (state_ < 0) {
1594 // operator*() called decAndRead() so p_ is already behind.
1595 reverse_iterator result(*this);
1596 state_ = 0;
1597 return result;
1598 } else if (state_ == 0) {
1599 units_ = Impl::decAndRead(start_, p_);
1600 reverse_iterator result(*this);
1601 result.state_ = -1;
1602 // keep this->state_ == 0
1603 return result;
1604 } else /* state_ > 0 */ {
1605 reverse_iterator result(*this);
1606 // operator--() called readAndInc() so we know how far to skip.
1607 p_ = units_.begin();
1608 state_ = 0;
1609 return result;
1610 }
1611 }
1612
1613 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1614 if (state_ < 0) {
1615 // operator*() called decAndRead() so p_ is behind the logical position.
1616 p_ = units_.end();
1617 }
1618 UnitIter p0 = p_;
1619 units_ = Impl::readAndInc(p0, p_, limit_);
1620 state_ = 1;
1621 return *this;
1622 }
1623
1624 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1625 reverse_iterator result(*this);
1626 operator--();
1627 return result;
1628 }
1629
1630private:
1631 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1632 return state_ >= 0 ? p_ : units_.end();
1633 }
1634
1635 // operator*() etc. are logically const.
1636 mutable UnitIter p_;
1637 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1638 // (forward or backward) we can test if there are enough code units.
1639 UnitIter start_;
1640 UnitIter limit_;
1641 // Keep state so that we call decAndRead() only once for both operator*() and ++
1642 // to make it easy for the compiler to optimize.
1643 mutable CodeUnits_ units_;
1644 // >0: units_ = readAndInc(), p_ = units limit
1645 // 0: initial state
1646 // <0: units_ = decAndRead(), p_ = units start
1647 // which means that p_ is behind its logical position
1648 mutable int8_t state_ = 0;
1649};
1650#endif // U_IN_DOXYGEN
1651
1652namespace U_HEADER_ONLY_NAMESPACE {
1653
1676template<typename CP32, UTFIllFormedBehavior behavior,
1677 typename UnitIter, typename LimitIter = UnitIter>
1678auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1679 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1680 std::move(start), std::move(p), std::move(limit));
1681}
1682
1703template<typename CP32, UTFIllFormedBehavior behavior,
1704 typename UnitIter, typename LimitIter = UnitIter>
1705auto utfIterator(UnitIter p, LimitIter limit) {
1706 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1707 std::move(p), std::move(limit));
1708}
1709
1710// Note: We should only enable the following factory function for a copyable UnitIter.
1711// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1712// but a function template partial specialization is not allowed.
1713// In C++20, we might be able to require the std::copyable concept.
1714
1734template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1735auto utfIterator(UnitIter p) {
1736 return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1737}
1738
1751template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1752class UTFStringCodePoints {
1753 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1754public:
1759 UTFStringCodePoints() = default;
1760
1766 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1767 explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
1776 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1777 explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1778
1780 UTFStringCodePoints(const UTFStringCodePoints &other) = default;
1781
1783 UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default;
1784
1789 auto begin() {
1790 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1791 }
1792
1797 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1798 auto begin() const {
1799 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1800 }
1801
1806 auto end() {
1807 using UnitIter = decltype(unitRange.begin());
1808 using LimitIter = decltype(unitRange.end());
1809 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1810 // Return the code unit sentinel.
1811 return unitRange.end();
1812 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1813 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1814 } else {
1815 // The input iterator specialization has no three-argument constructor.
1816 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1817 }
1818 }
1819
1824 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1825 auto end() const {
1826 using UnitIter = decltype(unitRange.begin());
1827 using LimitIter = decltype(unitRange.end());
1828 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1829 // Return the code unit sentinel.
1830 return unitRange.end();
1831 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1832 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1833 } else {
1834 // The input iterator specialization has no three-argument constructor.
1835 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1836 }
1837 }
1838
1843 auto rbegin() const {
1844 return std::make_reverse_iterator(end());
1845 }
1846
1851 auto rend() const {
1852 return std::make_reverse_iterator(begin());
1853 }
1854
1855private:
1856 Range unitRange;
1857};
1858
1860template<typename CP32, UTFIllFormedBehavior behavior>
1861struct UTFStringCodePointsAdaptor
1862#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1863 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1864 : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1865#endif
1866{
1868 template<typename Range>
1869 auto operator()(Range &&unitRange) const {
1870#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1871 return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
1872 std::forward<Range>(unitRange));
1873#else
1874 return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1875#endif
1876 }
1877};
1878
1893template<typename CP32, UTFIllFormedBehavior behavior>
1894constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
1895
1896// Non-validating iterators ------------------------------------------------ ***
1897
1919template<typename CP32, typename UnitIter, typename = void>
1920class UnsafeUTFIterator {
1921 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1922 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1923
1924 // Proxy type for operator->() (required by LegacyInputIterator)
1925 // so that we don't promise always returning UnsafeCodeUnits.
1926 class Proxy {
1927 public:
1928 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1929 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1930 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1931 private:
1932 UnsafeCodeUnits<CP32, UnitIter> units_;
1933 };
1934
1935public:
1937 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
1939 using reference = value_type;
1941 using pointer = Proxy;
1943 using difference_type = prv::iter_difference_t<UnitIter>;
1945 using iterator_category = std::conditional_t<
1946 prv::bidirectional_iterator<UnitIter>,
1947 std::bidirectional_iterator_tag,
1948 std::forward_iterator_tag>;
1949
1959 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
1965 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
1966
1968 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
1970 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
1971
1973 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
1975 U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
1976
1982 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
1983 return getLogicalPosition() == other.getLogicalPosition();
1984 }
1990 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
1991
1998 template<typename Sentinel> U_FORCE_INLINE friend
1999 std::enable_if_t<
2000 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2001 bool>
2002 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2003 return iter.getLogicalPosition() == s;
2004 }
2005
2006#if U_CPLUSPLUS_VERSION < 20
2013 template<typename Sentinel> U_FORCE_INLINE friend
2014 std::enable_if_t<
2015 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2016 bool>
2017 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2018 return iter.getLogicalPosition() == s;
2019 }
2026 template<typename Sentinel> U_FORCE_INLINE friend
2027 std::enable_if_t<
2028 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2029 bool>
2030 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2037 template<typename Sentinel> U_FORCE_INLINE friend
2038 std::enable_if_t<
2039 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2040 bool>
2041 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2042#endif // C++17
2043
2050 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2051 if (state_ == 0) {
2052 UnitIter p0 = p_;
2053 units_ = Impl::readAndInc(p0, p_);
2054 state_ = 1;
2055 }
2056 return units_;
2057 }
2058
2067 U_FORCE_INLINE Proxy operator->() const {
2068 if (state_ == 0) {
2069 UnitIter p0 = p_;
2070 units_ = Impl::readAndInc(p0, p_);
2071 state_ = 1;
2072 }
2073 return Proxy(units_);
2074 }
2075
2082 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2083 if (state_ > 0) {
2084 // operator*() called readAndInc() so p_ is already ahead.
2085 state_ = 0;
2086 } else if (state_ == 0) {
2087 Impl::inc(p_);
2088 } else /* state_ < 0 */ {
2089 // operator--() called decAndRead() so we know how far to skip.
2090 p_ = units_.end();
2091 state_ = 0;
2092 }
2093 return *this;
2094 }
2095
2104 U_FORCE_INLINE UnsafeUTFIterator operator++(int) { // post-increment
2105 if (state_ > 0) {
2106 // operator*() called readAndInc() so p_ is already ahead.
2107 UnsafeUTFIterator result(*this);
2108 state_ = 0;
2109 return result;
2110 } else if (state_ == 0) {
2111 UnitIter p0 = p_;
2112 units_ = Impl::readAndInc(p0, p_);
2113 UnsafeUTFIterator result(*this);
2114 result.state_ = 1;
2115 // keep this->state_ == 0
2116 return result;
2117 } else /* state_ < 0 */ {
2118 UnsafeUTFIterator result(*this);
2119 // operator--() called decAndRead() so we know how far to skip.
2120 p_ = units_.end();
2121 state_ = 0;
2122 return result;
2123 }
2124 }
2125
2133 template<typename Iter = UnitIter>
2135 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2136 operator--() { // pre-decrement
2137 if (state_ > 0) {
2138 // operator*() called readAndInc() so p_ is ahead of the logical position.
2139 p_ = units_.begin();
2140 }
2141 units_ = Impl::decAndRead(p_);
2142 state_ = -1;
2143 return *this;
2144 }
2145
2153 template<typename Iter = UnitIter>
2155 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2156 operator--(int) { // post-decrement
2157 UnsafeUTFIterator result(*this);
2158 operator--();
2159 return result;
2160 }
2161
2162private:
2163 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2164
2165 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2166 return state_ <= 0 ? p_ : units_.begin();
2167 }
2168
2169 // operator*() etc. are logically const.
2170 mutable UnitIter p_;
2171 // Keep state so that we call readAndInc() only once for both operator*() and ++
2172 // to make it easy for the compiler to optimize.
2173 mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2174 // >0: units_ = readAndInc(), p_ = units limit
2175 // which means that p_ is ahead of its logical position
2176 // 0: initial state
2177 // <0: units_ = decAndRead(), p_ = units start
2178 mutable int8_t state_ = 0;
2179};
2180
2181#ifndef U_IN_DOXYGEN
2182// Partial template specialization for single-pass input iterator.
2183template<typename CP32, typename UnitIter>
2184class UnsafeUTFIterator<
2185 CP32,
2186 UnitIter,
2187 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2188 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2189 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2190
2191 // Proxy type for post-increment return value, to make *iter++ work.
2192 // Also for operator->() (required by LegacyInputIterator)
2193 // so that we don't promise always returning UnsafeCodeUnits.
2194 class Proxy {
2195 public:
2196 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2197 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2198 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2199 private:
2200 UnsafeCodeUnits<CP32, UnitIter> units_;
2201 };
2202
2203public:
2204 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2205 using reference = value_type;
2206 using pointer = Proxy;
2207 using difference_type = prv::iter_difference_t<UnitIter>;
2208 using iterator_category = std::input_iterator_tag;
2209
2210 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2211
2212 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2213 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2214
2215 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2216 U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
2217
2218 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2219 return p_ == other.p_ && ahead_ == other.ahead_;
2220 // Strictly speaking, we should check if the logical position is the same.
2221 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2222 }
2223 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2224
2225 template<typename Sentinel> U_FORCE_INLINE friend
2226 std::enable_if_t<
2227 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2228 bool>
2229 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2230 return !iter.ahead_ && iter.p_ == s;
2231 }
2232
2233#if U_CPLUSPLUS_VERSION < 20
2234 template<typename Sentinel> U_FORCE_INLINE friend
2235 std::enable_if_t<
2236 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2237 bool>
2238 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2239 return !iter.ahead_ && iter.p_ == s;
2240 }
2241
2242 template<typename Sentinel> U_FORCE_INLINE friend
2243 std::enable_if_t<
2244 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2245 bool>
2246 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2247
2248 template<typename Sentinel> U_FORCE_INLINE friend
2249 std::enable_if_t<
2250 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2251 bool>
2252 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2253#endif // C++17
2254
2255 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2256 if (!ahead_) {
2257 units_ = Impl::readAndInc(p_, p_);
2258 ahead_ = true;
2259 }
2260 return units_;
2261 }
2262
2263 U_FORCE_INLINE Proxy operator->() const {
2264 if (!ahead_) {
2265 units_ = Impl::readAndInc(p_, p_);
2266 ahead_ = true;
2267 }
2268 return Proxy(units_);
2269 }
2270
2271 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2272 if (ahead_) {
2273 // operator*() called readAndInc() so p_ is already ahead.
2274 ahead_ = false;
2275 } else {
2276 Impl::inc(p_);
2277 }
2278 return *this;
2279 }
2280
2281 U_FORCE_INLINE Proxy operator++(int) { // post-increment
2282 if (ahead_) {
2283 // operator*() called readAndInc() so p_ is already ahead.
2284 ahead_ = false;
2285 } else {
2286 units_ = Impl::readAndInc(p_, p_);
2287 // keep this->ahead_ == false
2288 }
2289 return Proxy(units_);
2290 }
2291
2292private:
2293 // operator*() etc. are logically const.
2294 mutable UnitIter p_;
2295 // Keep state so that we call readAndInc() only once for both operator*() and ++
2296 // so that we can use a single-pass input iterator for UnitIter.
2297 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2298 // true: units_ = readAndInc(), p_ = units limit
2299 // which means that p_ is ahead of its logical position
2300 // false: initial state
2301 mutable bool ahead_ = false;
2302};
2303#endif // U_IN_DOXYGEN
2304
2305} // namespace U_HEADER_ONLY_NAMESPACE
2306
2307#ifndef U_IN_DOXYGEN
2308// Bespoke specialization of reverse_iterator.
2309// The default implementation implements reverse operator*() and ++ in a way
2310// that does most of the same work twice for reading variable-length sequences.
2311template<typename CP32, typename UnitIter>
2312class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2313 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2314 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2316
2317 // Proxy type for operator->() (required by LegacyInputIterator)
2318 // so that we don't promise always returning UnsafeCodeUnits.
2319 class Proxy {
2320 public:
2321 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2322 UnsafeCodeUnits_ &operator*() { return units_; }
2323 UnsafeCodeUnits_ *operator->() { return &units_; }
2324 private:
2325 UnsafeCodeUnits_ units_;
2326 };
2327
2328public:
2329 using value_type = UnsafeCodeUnits_;
2330 using reference = value_type;
2331 using pointer = Proxy;
2333 using iterator_category = std::bidirectional_iterator_tag;
2334
2335 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
2336 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
2337 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2338
2339 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2340 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2341
2342 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2343 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2344
2345 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2346 return getLogicalPosition() == other.getLogicalPosition();
2347 }
2348 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2349
2350 U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2351 if (state_ == 0) {
2352 units_ = Impl::decAndRead(p_);
2353 state_ = -1;
2354 }
2355 return units_;
2356 }
2357
2358 U_FORCE_INLINE Proxy operator->() const {
2359 if (state_ == 0) {
2360 units_ = Impl::decAndRead(p_);
2361 state_ = -1;
2362 }
2363 return Proxy(units_);
2364 }
2365
2366 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2367 if (state_ < 0) {
2368 // operator*() called decAndRead() so p_ is already behind.
2369 state_ = 0;
2370 } else if (state_ == 0) {
2371 Impl::dec(p_);
2372 } else /* state_ > 0 */ {
2373 // operator--() called readAndInc() so we know how far to skip.
2374 p_ = units_.begin();
2375 state_ = 0;
2376 }
2377 return *this;
2378 }
2379
2380 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2381 if (state_ < 0) {
2382 // operator*() called decAndRead() so p_ is already behind.
2383 reverse_iterator result(*this);
2384 state_ = 0;
2385 return result;
2386 } else if (state_ == 0) {
2387 units_ = Impl::decAndRead(p_);
2388 reverse_iterator result(*this);
2389 result.state_ = -1;
2390 // keep this->state_ == 0
2391 return result;
2392 } else /* state_ > 0 */ {
2393 reverse_iterator result(*this);
2394 // operator--() called readAndInc() so we know how far to skip.
2395 p_ = units_.begin();
2396 state_ = 0;
2397 return result;
2398 }
2399 }
2400
2401 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2402 if (state_ < 0) {
2403 // operator*() called decAndRead() so p_ is behind the logical position.
2404 p_ = units_.end();
2405 }
2406 UnitIter p0 = p_;
2407 units_ = Impl::readAndInc(p0, p_);
2408 state_ = 1;
2409 return *this;
2410 }
2411
2412 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2413 reverse_iterator result(*this);
2414 operator--();
2415 return result;
2416 }
2417
2418private:
2419 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2420 return state_ >= 0 ? p_ : units_.end();
2421 }
2422
2423 // operator*() etc. are logically const.
2424 mutable UnitIter p_;
2425 // Keep state so that we call decAndRead() only once for both operator*() and ++
2426 // to make it easy for the compiler to optimize.
2427 mutable UnsafeCodeUnits_ units_;
2428 // >0: units_ = readAndInc(), p_ = units limit
2429 // 0: initial state
2430 // <0: units_ = decAndRead(), p_ = units start
2431 // which means that p_ is behind its logical position
2432 mutable int8_t state_ = 0;
2433};
2434#endif // U_IN_DOXYGEN
2435
2436namespace U_HEADER_ONLY_NAMESPACE {
2437
2453template<typename CP32, typename UnitIter>
2454auto unsafeUTFIterator(UnitIter iter) {
2455 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2456}
2457
2469template<typename CP32, typename Range>
2470class UnsafeUTFStringCodePoints {
2471 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2472public:
2477 UnsafeUTFStringCodePoints() = default;
2478
2484 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2485 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
2494 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2495 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2496
2498 UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default;
2499
2501 UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default;
2502
2507 auto begin() {
2508 return unsafeUTFIterator<CP32>(unitRange.begin());
2509 }
2510
2515 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2516 auto begin() const {
2517 return unsafeUTFIterator<CP32>(unitRange.begin());
2518 }
2519
2524 auto end() {
2525 using UnitIter = decltype(unitRange.begin());
2526 using LimitIter = decltype(unitRange.end());
2527 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2528 // Return the code unit sentinel.
2529 return unitRange.end();
2530 } else {
2531 return unsafeUTFIterator<CP32>(unitRange.end());
2532 }
2533 }
2534
2539 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2540 auto end() const {
2541 using UnitIter = decltype(unitRange.begin());
2542 using LimitIter = decltype(unitRange.end());
2543 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2544 // Return the code unit sentinel.
2545 return unitRange.end();
2546 } else {
2547 return unsafeUTFIterator<CP32>(unitRange.end());
2548 }
2549 }
2550
2555 auto rbegin() const {
2556 return std::make_reverse_iterator(end());
2557 }
2558
2563 auto rend() const {
2564 return std::make_reverse_iterator(begin());
2565 }
2566
2567private:
2568 Range unitRange;
2569};
2570
2572template<typename CP32>
2573struct UnsafeUTFStringCodePointsAdaptor
2574#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2575 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2576 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2577#endif
2578{
2580 template<typename Range>
2581 auto operator()(Range &&unitRange) const {
2582#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2583 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2584#else
2585 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2586#endif
2587 }
2588};
2589
2590
2603template<typename CP32>
2604constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
2605
2606} // namespace U_HEADER_ONLY_NAMESPACE
2607
2608#endif // U_HIDE_DRAFT_API
2609#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2610#endif // __UTFITERATOR_H__
A C++ "range" over all Unicode code points U+0000..U+10FFFF.
A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
Result of validating and decoding a code unit sequence for one code point.
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Validating iterator over the code points in a Unicode string.
U_FORCE_INLINE UTFIterator()
Default constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
value_type reference
C++ iterator boilerplate.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Proxy pointer
C++ iterator boilerplate.
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p < limit.
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p < limit.
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Result of decoding a code unit sequence for one code point.
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
int32_t difference_type
C++ iterator boilerplate.
bool operator==(const CodePointsIterator &other) const
bool operator!=(const CodePointsIterator &other) const
value_type reference
C++ iterator boilerplate.
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
CP32 * pointer
C++ iterator boilerplate.
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ... if C++.
Definition platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32). It is outside of the Unicode code point range 0..0x10ffff.
Definition umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
Definition utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
Definition utf8.h:190
typename std::iterator_traits< Iter >::difference_type iter_difference_t
constexpr bool forward_iterator
typename std::iterator_traits< Iter >::value_type iter_value_t
constexpr bool bidirectional_iterator
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.