module Data.Text.Internal.Encoding
( validateUtf8Chunk
, validateUtf8More
, decodeUtf8Chunk
, decodeUtf8More
, decodeUtf8With1
, decodeUtf8With2
, Utf8State
, startUtf8State
, StrictBuilder()
, strictBuilderToText
, textToStrictBuilder
, skipIncomplete
, getCompleteLen
, getPartialUtf8
) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits ((.&.), shiftL, shiftR)
import Data.ByteString (ByteString)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup (Semigroup(..))
#endif
import Data.Word (Word32, Word8)
import Foreign.Storable (pokeElemOff)
import Data.Text.Encoding.Error (OnDecodeError)
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Encoding.Utf8
(DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState)
import Data.Text.Internal.StrictBuilder (StrictBuilder)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Short.Internal as SBS
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.StrictBuilder as SB
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif
#ifdef SIMDUTF
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.C.Types (CSize(..))
import Foreign.C.Types (CInt(..))
import Foreign.Ptr (Ptr)
#endif
strictBuilderToText :: StrictBuilder -> Text
strictBuilderToText = SB.toText
textToStrictBuilder :: Text -> StrictBuilder
textToStrictBuilder = SB.fromText
data Utf8State = Utf8State
{
utf8CodePointState :: !DecoderState
, partialUtf8CodePoint :: !PartialUtf8CodePoint
}
deriving (Eq, Show)
startUtf8State :: Utf8State
startUtf8State = Utf8State utf8AcceptState partUtf8Empty
newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
deriving (Eq, Show)
partUtf8Empty :: PartialUtf8CodePoint
partUtf8Empty = PartialUtf8CodePoint 0
partUtf8Len :: PartialUtf8CodePoint -> Int
partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
partUtf8CompleteLen c@(PartialUtf8CodePoint w)
| partUtf8Len c == 0 = 0
| 0xf0 <= firstByte = 4
| 0xe0 <= firstByte = 3
| 0xc2 <= firstByte = 2
| otherwise = 0
where
firstByte = (w `shiftR` 16) .&. 255
partUtf8UnsafeIndex ::
#if defined(ASSERTS)
HasCallStack =>
#endif
PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex _c@(PartialUtf8CodePoint w) n =
#if defined(ASSERTS)
assert (0 <= n && n < partUtf8Len _c) $
#endif
fromIntegral $ w `shiftR` (16 8 * n)
partUtf8UnsafeAppend ::
#if defined(ASSERTS)
HasCallStack =>
#endif
PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs =
#if defined(ASSERTS)
assert (lenc + lenbs <= 3) $
#endif
PartialUtf8CodePoint $
tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24)
where
lenc = partUtf8Len c
lenbs = B.length bs
tryPush i w =
if i < lenbs
then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 8 * (lenc + i)))
else w
partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr f x0 c = case partUtf8Len c of
0 -> x0
1 -> build 0 x0
2 -> build 0 (build 1 x0)
_ -> build 0 (build 1 (build 2 x0))
where
build i x = f (partUtf8UnsafeIndex c i) x
partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr ->
partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0
getCompleteLen :: Utf8State -> Int
getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint
getPartialUtf8 :: Utf8State -> B.ByteString
getPartialUtf8 = partUtf8ToByteString . partialUtf8CodePoint
#ifdef SIMDUTF
foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
:: Ptr Word8 -> CSize -> IO CInt
#endif
validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,)
validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom ofs bs k
#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
| guessUtf8Boundary > 0 &&
(
#ifdef SIMDUTF
withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $
unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
#else
B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
#endif
) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
| otherwise = slowValidateUtf8ChunkFrom ofs
where
len = B.length bs ofs
isBoundary n p = len >= n && p (B.index bs (ofs + len n))
guessUtf8Boundary
| isBoundary 1 (<= 0x80) = len
| isBoundary 1 (0xc2 <=) = len 1
| isBoundary 2 (0xe0 <=) = len 2
| isBoundary 3 (0xf0 <=) = len 3
| otherwise = len
#else
= slowValidateUtf8ChunkFrom ofs
where
#endif
slowValidateUtf8ChunkFrom :: Int -> r
slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8AcceptState
slowLoop !utf8End i s
| i < B.length bs =
case updateDecoderState (B.index bs i) s of
s' | s' == utf8RejectState -> k utf8End Nothing
| s' == utf8AcceptState -> slowLoop (i + 1) (i + 1) s'
| otherwise -> slowLoop utf8End (i + 1) s'
| otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
validateUtf8More st bs = validateUtf8MoreCont st bs (,)
validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont st@(Utf8State s0 part) bs k
| len > 0 = loop 0 s0
| otherwise = k ( partUtf8Len part) (Just st)
where
len = B.length bs
loop !i s
| s == utf8AcceptState = validateUtf8ChunkFrom i bs k
| i < len =
case updateDecoderState (B.index bs i) s of
s' | s' == utf8RejectState -> k ( partUtf8Len part) Nothing
| otherwise -> loop (i + 1) s'
| otherwise = k ( partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs)))
partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder c =
partUtf8Foldr ((<>) . SB.unsafeFromWord8) mempty c
utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint
decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More s bs =
validateUtf8MoreCont s bs $ \len ms ->
let builder | len <= 0 = mempty
| otherwise = utf8StateToStrictBuilder s
<> SB.unsafeFromByteString (B.take len bs)
in (builder, B.drop len bs, ms)
decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8Chunk = decodeUtf8More startUtf8State
skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete onErr msg s =
partUtf8Foldr
((<>) . handleUtf8Error onErr msg)
mempty (partialUtf8CodePoint s)
handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error onErr msg w = case onErr msg (Just w) of
Just c -> SB.fromChar c
Nothing -> mempty
decodeUtf8With1 ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> String -> ByteString -> Text
decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms of
Just s
| len == B.length bs ->
let !(SBS.SBS arr) = SBS.toShort bs in
Text (A.ByteArray arr) 0 len
| otherwise -> SB.toText $
SB.unsafeFromByteString (B.take len bs) <> skipIncomplete onErr msg s
Nothing ->
let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in
SB.toText $
SB.unsafeFromByteString (B.take len bs) <>
handleUtf8Error onErr msg (B.index bs len) <>
builder <>
skipIncomplete onErr msg s
decodeUtf8With2 ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 onErr msg s0 bs = loop s0 0 mempty
where
loop s i !builder =
let nonEmptyPrefix len = builder
<> utf8StateToStrictBuilder s
<> SB.unsafeFromByteString (B.take len (B.drop i bs))
in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of
Nothing ->
if len < 0
then
let builder' = builder <> skipIncomplete onErr msg s
in loop startUtf8State i builder'
else
let builder' = nonEmptyPrefix len
<> handleUtf8Error onErr msg (B.index bs (i + len))
in loop startUtf8State (i + len + 1) builder'
Just s' ->
let builder' = if len <= 0 then builder else nonEmptyPrefix len
undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s')
then B.drop (i + len) bs
else partUtf8ToByteString (partialUtf8CodePoint s')
in (builder', undecoded, s')