7#ifndef __UTFITERATOR_H__
8#define __UTFITERATOR_H__
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
15#if defined(__cpp_lib_ranges)
135#ifndef U_HIDE_DRAFT_API
170namespace U_HEADER_ONLY_NAMESPACE {
173#if U_CPLUSPLUS_VERSION >= 20
176template<
typename Iter>
180template<
typename Iter>
184template<
typename Iter>
188template<
typename Iter>
/** Variable template whose value is exactly std::ranges::range<Range>. */
192template<
typename Range>
193constexpr bool range = std::ranges::range<Range>;
198template<
typename Iter>
202template<
typename Iter>
206template<
typename Iter>
209 std::forward_iterator_tag,
210 typename std::iterator_traits<Iter>::iterator_category>;
213template<
typename Iter>
216 std::bidirectional_iterator_tag,
217 typename std::iterator_traits<Iter>::iterator_category>;
220template<
typename Range,
typename =
void>
224template<
typename Range>
227 std::void_t<decltype(std::declval<Range>().begin()),
228 decltype(std::declval<Range>().end())>> : std::true_type {};
231template<
typename Range>
237template<
typename CP32,
bool skipSurrogates>
239 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
263 if (skipSurrogates && c_ == 0xd800) {
291template<
typename CP32>
293 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
322template<
typename CP32>
324 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
356template<
typename CP32,
typename UnitIter,
typename =
void>
358 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
/** @return a copy of the stored start_ iterator; const, does not modify this object. */
384 UnitIter
begin()
const {
return start_; }
/** @return a copy of the stored limit_ iterator; const, does not modify this object. */
391 UnitIter
end()
const {
return limit_; }
399#if U_CPLUSPLUS_VERSION >= 20
405 template<std::contiguous_iterator Iter = UnitIter>
406 std::basic_string_view<Unit>
stringView()
const {
407 return std::basic_string_view<Unit>(
begin(),
end());
415 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
416 std::enable_if_t<std::is_pointer_v<Iter> ||
417 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
418 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
419 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
420 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
421 std::basic_string_view<Unit>>
423 return std::basic_string_view<Unit>(&*start_, len_);
438template<
typename CP32,
typename UnitIter>
439class UnsafeCodeUnits<
442 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
443 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
/** @return the stored len_ value as a uint8_t; const, does not modify this object. */
452 uint8_t
length()
const {
return len_; }
476template<
typename CP32,
typename UnitIter,
typename =
void>
501template<
typename CP32,
typename UnitIter>
505 std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
506 public UnsafeCodeUnits<CP32, UnitIter> {
/** Copy constructor, explicitly defaulted (compiler-generated copy of base and members). */
511 CodeUnits(
const CodeUnits &other) =
default;
525 typename UnitIter,
typename LimitIter = UnitIter,
typename =
void>
539template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
543 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
544 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
546 "For 8-bit strings, the SURROGATE option does not have an equivalent.");
556 U_FORCE_INLINE static void inc(UnitIter &p,
const LimitIter &limit) {
562 if ((0xe0 <= b && b < 0xf0)) {
567 }
else if (b < 0xe0) {
597 if (0xe0 <= b2 && b2 <= 0xf4) {
616 UnitIter &p0, UnitIter &p,
const LimitIter &limit) {
617 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
619 CP32 c = uint8_t(*p);
622 if constexpr (isMultiPass) {
623 return {c, 1,
true, p0, p};
639 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
640 (t = *p - 0x80) <= 0x3f) &&
642 (c = (c << 6) | t, ++length, ++p != limit)
644 c >= 0xc2 && (c &= 0x1f, 1)) &&
646 (t = *p - 0x80) <= 0x3f) {
650 if constexpr (isMultiPass) {
651 return {c, length,
true, p0, p};
653 return {c, length,
true};
656 if constexpr (isMultiPass) {
657 return {sub(), length,
false, p0, p};
659 return {sub(), length,
false};
663 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
666 CP32 c = uint8_t(*--p);
668 return {c, 1,
true, p, p0};
676 c = ((b1 - 0xc0) << 6) | (c & 0x3f);
677 return {c, 2,
true, p, p0};
678 }
else if (b1 < 0xf0 ?
683 return {sub(), 2,
false, p, p0};
689 if (0xe0 <= b2 && b2 <= 0xf4) {
694 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
695 return {c, 3,
true, p, p0};
700 return {sub(), 3,
false, p, p0};
704 if (0xf0 <= b3 && b3 <= 0xf4) {
708 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
709 return {c, 4,
true, p, p0};
715 return {sub(), 1,
false, p, p0};
720template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
724 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
725 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
736 U_FORCE_INLINE static void inc(UnitIter &p,
const LimitIter &limit) {
754 UnitIter &p0, UnitIter &p,
const LimitIter &limit) {
755 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
757 CP32 c =
static_cast<CP32
>(*p);
760 if constexpr (isMultiPass) {
761 return {c, 1,
true, p0, p};
770 if constexpr (isMultiPass) {
771 return {c, 2,
true, p0, p};
776 if constexpr (isMultiPass) {
777 return {sub(c), 1,
false, p0, p};
779 return {sub(c), 1,
false};
785 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
788 CP32 c =
static_cast<CP32
>(*--p);
790 return {c, 1,
true, p, p0};
797 return {c, 2,
true, p, p0};
799 return {sub(c), 1,
false, p, p0};
806template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
810 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
811 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
814 U_FORCE_INLINE static CP32 sub(
bool forSurrogate, CP32 surrogate) {
831 UnitIter &p0, UnitIter &p,
const LimitIter &) {
832 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
836 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
837 if constexpr (isMultiPass) {
838 return {c, 1,
true, p0, p};
843 if constexpr (isMultiPass) {
844 return {sub(uc < 0xe000, c), 1,
false, p0, p};
846 return {sub(uc < 0xe000, c), 1,
false};
851 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter , UnitIter &p) {
855 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
856 return {c, 1,
true, p, p0};
858 return {sub(uc < 0xe000, c), 1,
false, p, p0};
865template<
typename CP32,
typename UnitIter,
typename =
void>
869template<
typename CP32,
typename UnitIter>
873 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
874 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
887 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
888 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
890 CP32 c = uint8_t(*p);
893 if constexpr (isMultiPass) {
894 return {c, 1, p0, p};
898 }
else if (c < 0xe0) {
899 c = ((c & 0x1f) << 6) | (*p & 0x3f);
901 if constexpr (isMultiPass) {
902 return {c, 2, p0, p};
906 }
else if (c < 0xf0) {
909 c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
913 if constexpr (isMultiPass) {
914 return {c, 3, p0, p};
919 c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
921 c |= (*p & 0x3f) << 6;
925 if constexpr (isMultiPass) {
926 return {c, 4, p0, p};
933 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
936 CP32 c = uint8_t(*--p);
938 return {c, 1, p, p0};
943 for (uint8_t shift = 6;;) {
947 c |= uint32_t{b} << shift;
950 c |= (uint32_t{b} & 0x3f) << shift;
956 return {c, count, p, p0};
961template<
typename CP32,
typename UnitIter>
965 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
966 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
984 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
985 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
987 CP32 c =
static_cast<CP32
>(*p);
990 if constexpr (isMultiPass) {
991 return {c, 1, p0, p};
999 if constexpr (isMultiPass) {
1000 return {c, 2, p0, p};
1007 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1010 CP32 c =
static_cast<CP32
>(*--p);
1012 return {c, 1, p, p0};
1016 return {c, 2, p, p0};
1022template<
typename CP32,
typename UnitIter>
1026 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1027 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1037 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1038 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1041 if constexpr (isMultiPass) {
1042 return {c, 1, p0, p};
1048 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1051 return {c, 1, p, p0};
1083 typename UnitIter,
typename LimitIter = UnitIter,
typename =
void>
1085 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1086 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1110 prv::bidirectional_iterator<UnitIter>,
1111 std::bidirectional_iterator_tag,
1112 std::forward_iterator_tag>;
1128 p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1141 p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1177 return getLogicalPosition() == other.getLogicalPosition();
1196 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1199 return iter.getLogicalPosition() == s;
1202#if U_CPLUSPLUS_VERSION < 20
1215 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1218 return iter.getLogicalPosition() == s;
1228 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1239 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1253 units_ = Impl::readAndInc(p0, p_, limit_);
1270 units_ = Impl::readAndInc(p0, p_, limit_);
1273 return Proxy(units_);
1286 }
else if (state_ == 0) {
1287 Impl::inc(p_, limit_);
1310 }
else if (state_ == 0) {
1312 units_ = Impl::readAndInc(p0, p_, limit_);
1333 template<
typename Iter = UnitIter>
1335 std::enable_if_t<prv::bidirectional_iterator<Iter>,
UTFIterator &>
1339 p_ = units_.begin();
1341 units_ = Impl::decAndRead(start_, p_);
1353 template<
typename Iter = UnitIter>
1355 std::enable_if_t<prv::bidirectional_iterator<Iter>,
UTFIterator>
1363 friend class std::reverse_iterator<
UTFIterator<CP32, behavior, UnitIter>>;
1366 return state_ <= 0 ? p_ : units_.begin();
1370 mutable UnitIter p_;
1377 mutable CodeUnits<CP32, UnitIter> units_;
1382 mutable int8_t state_ = 0;
1387template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
1390 UnitIter, LimitIter,
1391 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1392 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1393 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
/** Copy-initializes the Proxy's own units_ member from the referenced object. */
1400 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
/** @return a reference to the CodeUnits object held inside this Proxy. */
1401 CodeUnits<CP32, UnitIter> &operator*() {
return units_; }
/** @return the address of the CodeUnits object held inside this Proxy. */
1402 CodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
// Held by value (not by reference), so the Proxy carries its own copy.
1404 CodeUnits<CP32, UnitIter> units_;
1408 using value_type = CodeUnits<CP32, UnitIter>;
1427 return p_ == other.p_ && ahead_ == other.ahead_;
1435 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1438 return !iter.ahead_ && iter.p_ == s;
1441#if U_CPLUSPLUS_VERSION < 20
1444 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1447 return !iter.ahead_ && iter.p_ == s;
1452 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1458 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1465 units_ = Impl::readAndInc(p_, p_, limit_);
1473 units_ = Impl::readAndInc(p_, p_, limit_);
1476 return Proxy(units_);
1484 Impl::inc(p_, limit_);
1494 units_ = Impl::readAndInc(p_, p_, limit_);
1497 return Proxy(units_);
1502 mutable UnitIter p_;
1508 mutable CodeUnits<CP32, UnitIter> units_ = {0, 0,
false};
1512 mutable bool ahead_ =
false;
1522template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter>
1523class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1524 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1525 using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
/** Takes a CodeUnits_ by value and initializes the units_ member from it. */
1532 explicit Proxy(CodeUnits_ units) : units_(units) {}
/** @return a reference to the units_ member. */
1533 CodeUnits_ &operator*() {
return units_; }
/** @return the address of the units_ member. */
1534 CodeUnits_ *operator->() {
return &units_; }
1540 using value_type = CodeUnits_;
1541 using reference = value_type;
1542 using pointer = Proxy;
1544 using iterator_category = std::bidirectional_iterator_tag;
1547 p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1548 units_(0, 0, false, p_, p_) {}
1549 U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
/** Move constructor: explicitly defaulted and declared noexcept. */
1551 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src)
noexcept =
default;
/** Move assignment: explicitly defaulted and declared noexcept. */
1552 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src)
noexcept =
default;
/** Copy constructor: explicitly defaulted. */
1554 U_FORCE_INLINE reverse_iterator(
const reverse_iterator &other) =
default;
/** Copy assignment: explicitly defaulted. */
1555 U_FORCE_INLINE reverse_iterator &operator=(
const reverse_iterator &other) =
default;
1558 return getLogicalPosition() == other.getLogicalPosition();
1564 units_ = Impl::decAndRead(start_, p_);
1572 units_ = Impl::decAndRead(start_, p_);
1575 return Proxy(units_);
1582 }
else if (state_ == 0) {
1583 Impl::dec(start_, p_);
1586 p_ = units_.begin();
1595 reverse_iterator result(*
this);
1598 }
else if (state_ == 0) {
1599 units_ = Impl::decAndRead(start_, p_);
1600 reverse_iterator result(*
this);
1605 reverse_iterator result(*
this);
1607 p_ = units_.begin();
1619 units_ = Impl::readAndInc(p0, p_, limit_);
1625 reverse_iterator result(*
this);
1632 return state_ >= 0 ? p_ : units_.end();
1636 mutable UnitIter p_;
1643 mutable CodeUnits_ units_;
1648 mutable int8_t state_ = 0;
1652namespace U_HEADER_ONLY_NAMESPACE {
1677 typename UnitIter,
typename LimitIter = UnitIter>
1678auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1679 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1680 std::move(start), std::move(p), std::move(limit));
1704 typename UnitIter,
typename LimitIter = UnitIter>
1705auto utfIterator(UnitIter p, LimitIter limit) {
1706 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1707 std::move(p), std::move(limit));
1734template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter>
1735auto utfIterator(UnitIter p) {
1736 return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1751template<
typename CP32, UTFIllFormedBehavior behavior,
typename Range>
1752class UTFStringCodePoints {
1753 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
/** Default constructor, explicitly defaulted. */
1759 UTFStringCodePoints() =
default;
/**
 * Constructor enabled only when Range is not a reference type.
 * Takes the range by value and moves it into the unitRange member.
 */
1766 template<
typename R = Range,
typename = std::enable_if_t<!std::is_reference_v<R>>>
1767 explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
/**
 * Constructor enabled only when Range is a reference type.
 * Initializes the unitRange member directly from the parameter, without std::move.
 * NOTE(review): the trailing defaulted `typename = void` parameter presumably exists only to
 * give this template a different parameter list than the non-reference overload - confirm.
 */
1776 template<
typename R = Range,
typename = std::enable_if_t<std::is_reference_v<R>>,
typename =
void>
1777 explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
/** Copy constructor, explicitly defaulted. */
1780 UTFStringCodePoints(
const UTFStringCodePoints &other) =
default;
/** Copy assignment operator, explicitly defaulted. */
1783 UTFStringCodePoints &operator=(
const UTFStringCodePoints &other) =
default;
1790 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1797 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
1798 auto begin()
const {
1799 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1807 using UnitIter =
decltype(unitRange.begin());
1808 using LimitIter =
decltype(unitRange.end());
1809 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1811 return unitRange.end();
1812 }
else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1813 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1816 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1824 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
1826 using UnitIter =
decltype(unitRange.begin());
1827 using LimitIter =
decltype(unitRange.end());
1828 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1830 return unitRange.end();
1831 }
else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1832 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1835 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1843 auto rbegin()
const {
1844 return std::make_reverse_iterator(end());
1852 return std::make_reverse_iterator(begin());
1860template<
typename CP32, UTFIllFormedBehavior behavior>
1861struct UTFStringCodePointsAdaptor
1863 __cpp_lib_bind_back >= 2022'02
1864 : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1868 template<
typename Range>
1869 auto operator()(Range &&unitRange)
const {
1870#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10
1871 return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
1872 std::forward<Range>(unitRange));
1874 return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
/**
 * Variable template: a constexpr UTFStringCodePointsAdaptor<CP32, behavior> object,
 * one per (CP32, behavior) combination.
 */
1893template<
typename CP32, UTFIllFormedBehavior behavior>
1894constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
1919template<
typename CP32,
typename UnitIter,
typename =
void>
1920class UnsafeUTFIterator {
1921 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1922 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
/** Copy-initializes the Proxy's own units_ member from the referenced object. */
1928 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
/** @return a reference to the UnsafeCodeUnits object held inside this Proxy. */
1929 UnsafeCodeUnits<CP32, UnitIter> &operator*() {
return units_; }
/** @return the address of the UnsafeCodeUnits object held inside this Proxy. */
1930 UnsafeCodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
// Held by value (not by reference), so the Proxy carries its own copy.
1932 UnsafeCodeUnits<CP32, UnitIter> units_;
1937 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
1939 using reference = value_type;
1941 using pointer = Proxy;
1943 using difference_type = prv::iter_difference_t<UnitIter>;
1945 using iterator_category = std::conditional_t<
1946 prv::bidirectional_iterator<UnitIter>,
1947 std::bidirectional_iterator_tag,
1948 std::forward_iterator_tag>;
/**
 * Constructs an iterator at p: stores p in p_ and initializes units_ with (0, 0, p, p).
 * NOTE(review): the units_ arguments are presumably (code point, length, start, limit),
 * i.e. an empty placeholder - confirm against the UnsafeCodeUnits constructor.
 */
1959 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
/**
 * Default constructor: value-initializes p_ and initializes units_ with (0, 0, p_, p_).
 * Reading p_ in the units_ initializer relies on p_ being declared before units_ in the class.
 */
1965 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
/** Move constructor: explicitly defaulted and declared noexcept. */
1968 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src)
noexcept =
default;
/** Move assignment: explicitly defaulted and declared noexcept. */
1970 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src)
noexcept =
default;
/** Copy constructor: explicitly defaulted. */
1973 U_FORCE_INLINE UnsafeUTFIterator(
const UnsafeUTFIterator &other) =
default;
/** Copy assignment: explicitly defaulted. */
1975 U_FORCE_INLINE UnsafeUTFIterator &operator=(
const UnsafeUTFIterator &other) =
default;
1983 return getLogicalPosition() == other.getLogicalPosition();
2000 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2002 operator==(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
2003 return iter.getLogicalPosition() == s;
2006#if U_CPLUSPLUS_VERSION < 20
2015 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2017 operator==(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
2018 return iter.getLogicalPosition() == s;
2028 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2030 operator!=(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
return !(iter == s); }
2039 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2041 operator!=(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
return !(iter == s); }
2050 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*()
const {
2053 units_ = Impl::readAndInc(p0, p_);
2070 units_ = Impl::readAndInc(p0, p_);
2073 return Proxy(units_);
2086 }
else if (state_ == 0) {
2107 UnsafeUTFIterator result(*
this);
2110 }
else if (state_ == 0) {
2112 units_ = Impl::readAndInc(p0, p_);
2113 UnsafeUTFIterator result(*
this);
2118 UnsafeUTFIterator result(*
this);
2133 template<
typename Iter = UnitIter>
2135 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2139 p_ = units_.begin();
2141 units_ = Impl::decAndRead(p_);
2153 template<
typename Iter = UnitIter>
2155 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2157 UnsafeUTFIterator result(*
this);
2163 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2166 return state_ <= 0 ? p_ : units_.begin();
2170 mutable UnitIter p_;
2173 mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2178 mutable int8_t state_ = 0;
2183template<
typename CP32,
typename UnitIter>
2184class UnsafeUTFIterator<
2187 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2188 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
2189 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
/** Copy-initializes the Proxy's own units_ member from the referenced object. */
2196 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
/** @return a reference to the UnsafeCodeUnits object held inside this Proxy. */
2197 UnsafeCodeUnits<CP32, UnitIter> &operator*() {
return units_; }
/** @return the address of the UnsafeCodeUnits object held inside this Proxy. */
2198 UnsafeCodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
// Held by value (not by reference), so the Proxy carries its own copy.
2200 UnsafeCodeUnits<CP32, UnitIter> units_;
2204 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2205 using reference = value_type;
2206 using pointer = Proxy;
2207 using difference_type = prv::iter_difference_t<UnitIter>;
2208 using iterator_category = std::input_iterator_tag;
/**
 * Constructs an iterator from p, moving it into p_.
 * units_ and ahead_ are not mentioned here; they keep their in-class default initializers.
 */
2210 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
/** Move constructor: explicitly defaulted and declared noexcept. */
2212 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src)
noexcept =
default;
/** Move assignment: explicitly defaulted and declared noexcept. */
2213 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src)
noexcept =
default;
/** Copy constructor: explicitly defaulted. */
2215 U_FORCE_INLINE UnsafeUTFIterator(
const UnsafeUTFIterator &other) =
default;
/** Copy assignment: explicitly defaulted. */
2216 U_FORCE_INLINE UnsafeUTFIterator &operator=(
const UnsafeUTFIterator &other) =
default;
2219 return p_ == other.p_ && ahead_ == other.ahead_;
2227 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2229 operator==(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
2230 return !iter.ahead_ && iter.p_ == s;
2233#if U_CPLUSPLUS_VERSION < 20
2236 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2238 operator==(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
2239 return !iter.ahead_ && iter.p_ == s;
2244 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2246 operator!=(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
return !(iter == s); }
2250 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2252 operator!=(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
return !(iter == s); }
2255 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*()
const {
2257 units_ = Impl::readAndInc(p_, p_);
2265 units_ = Impl::readAndInc(p_, p_);
2268 return Proxy(units_);
2286 units_ = Impl::readAndInc(p_, p_);
2289 return Proxy(units_);
2294 mutable UnitIter p_;
2297 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2301 mutable bool ahead_ =
false;
2311template<
typename CP32,
typename UnitIter>
2312class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2313 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
2314 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
/** Takes an UnsafeCodeUnits_ by value and initializes the units_ member from it. */
2321 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
/** @return a reference to the units_ member. */
2322 UnsafeCodeUnits_ &operator*() {
return units_; }
/** @return the address of the units_ member. */
2323 UnsafeCodeUnits_ *operator->() {
return &units_; }
// Held by value, so the Proxy carries its own copy.
2325 UnsafeCodeUnits_ units_;
2329 using value_type = UnsafeCodeUnits_;
2330 using reference = value_type;
2331 using pointer = Proxy;
2333 using iterator_category = std::bidirectional_iterator_tag;
/**
 * Constructs the reverse iterator from a forward UnsafeUTFIterator:
 * p_ is set to iter.getLogicalPosition() and units_ to (0, 0, p_, p_).
 * NOTE(review): the units_ arguments are presumably (code point, length, start, limit),
 * i.e. an empty placeholder - confirm against the UnsafeCodeUnits constructor.
 */
2335 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
// Reading p_ in the units_ initializer relies on p_ being declared before units_ in the class.
2336 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
/** Default constructor: value-initializes p_ and initializes units_ with (0, 0, p_, p_). */
2337 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
/** Move constructor: explicitly defaulted and declared noexcept. */
2339 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src)
noexcept =
default;
/** Move assignment: explicitly defaulted and declared noexcept. */
2340 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src)
noexcept =
default;
/** Copy constructor: explicitly defaulted. */
2342 U_FORCE_INLINE reverse_iterator(
const reverse_iterator &other) =
default;
/** Copy assignment: explicitly defaulted. */
2343 U_FORCE_INLINE reverse_iterator &operator=(
const reverse_iterator &other) =
default;
2346 return getLogicalPosition() == other.getLogicalPosition();
2352 units_ = Impl::decAndRead(p_);
2360 units_ = Impl::decAndRead(p_);
2363 return Proxy(units_);
2370 }
else if (state_ == 0) {
2374 p_ = units_.begin();
2383 reverse_iterator result(*
this);
2386 }
else if (state_ == 0) {
2387 units_ = Impl::decAndRead(p_);
2388 reverse_iterator result(*
this);
2393 reverse_iterator result(*
this);
2395 p_ = units_.begin();
2407 units_ = Impl::readAndInc(p0, p_);
2413 reverse_iterator result(*
this);
2420 return state_ >= 0 ? p_ : units_.end();
2424 mutable UnitIter p_;
2427 mutable UnsafeCodeUnits_ units_;
2432 mutable int8_t state_ = 0;
2436namespace U_HEADER_ONLY_NAMESPACE {
2453template<
typename CP32,
typename UnitIter>
2454auto unsafeUTFIterator(UnitIter iter) {
2455 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2469template<
typename CP32,
typename Range>
2470class UnsafeUTFStringCodePoints {
2471 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
/** Default constructor, explicitly defaulted. */
2477 UnsafeUTFStringCodePoints() =
default;
/**
 * Constructor enabled only when Range is not a reference type.
 * Takes the range by value and moves it into the unitRange member.
 */
2484 template<
typename R = Range,
typename = std::enable_if_t<!std::is_reference_v<R>>>
2485 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
/**
 * Constructor enabled only when Range is a reference type.
 * Initializes the unitRange member directly from the parameter, without std::move.
 * NOTE(review): the trailing defaulted `typename = void` parameter presumably exists only to
 * give this template a different parameter list than the non-reference overload - confirm.
 */
2494 template<
typename R = Range,
typename = std::enable_if_t<std::is_reference_v<R>>,
typename =
void>
2495 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
/** Copy constructor, explicitly defaulted. */
2498 UnsafeUTFStringCodePoints(
const UnsafeUTFStringCodePoints &other) =
default;
/** Copy assignment operator, explicitly defaulted. */
2501 UnsafeUTFStringCodePoints &operator=(
const UnsafeUTFStringCodePoints &other) =
default;
2508 return unsafeUTFIterator<CP32>(unitRange.begin());
2515 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
2516 auto begin()
const {
2517 return unsafeUTFIterator<CP32>(unitRange.begin());
2525 using UnitIter =
decltype(unitRange.begin());
2526 using LimitIter =
decltype(unitRange.end());
2527 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2529 return unitRange.end();
2531 return unsafeUTFIterator<CP32>(unitRange.end());
2539 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
2541 using UnitIter =
decltype(unitRange.begin());
2542 using LimitIter =
decltype(unitRange.end());
2543 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2545 return unitRange.end();
2547 return unsafeUTFIterator<CP32>(unitRange.end());
2555 auto rbegin()
const {
2556 return std::make_reverse_iterator(end());
2564 return std::make_reverse_iterator(begin());
2572template<
typename CP32>
2573struct UnsafeUTFStringCodePointsAdaptor
2575 __cpp_lib_bind_back >= 2022'02
2576 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2580 template<
typename Range>
2581 auto operator()(Range &&unitRange)
const {
2582#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10
2583 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2585 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
/**
 * Variable template: a constexpr UnsafeUTFStringCodePointsAdaptor<CP32> object,
 * one per CP32 type.
 */
2603template<
typename CP32>
2604constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32...
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
typename std::iterator_traits< Iter >::difference_type iter_difference_t
constexpr bool forward_iterator
typename std::iterator_traits< Iter >::value_type iter_value_t
constexpr bool bidirectional_iterator
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point,...
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.