(* Title: Pure/General/utf8.ML
Author: Makarius
Variations on UTF-8.
*)
signature UTF8 =
sig
type codepoint = int
val decode_permissive: string -> codepoint list
end;
structure UTF8: UTF8 =
struct
type codepoint = int;
(* permissive UTF-8 decoding *)
(*see also https://en.wikipedia.org/wiki/UTF-8#Description
overlong encodings enable byte-stuffing of low-ASCII*)
local
type state = codepoint list * codepoint * int;
fun flush ((buf, code, rest): state) : state =
if code <> ~1 then
((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
else (buf, code, rest);
fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);
fun push x ((buf, code, rest): state) =
if rest <= 0 then init x ~1 (buf, code, rest)
else (buf, code * 64 + Word8.toInt x, rest - 1);
fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);
fun decode (c, state) =
if c < 0w128 then state |> flush |> append c
else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
else state;
in
fun decode_permissive text =
Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
|> flush |> #1 |> rev;
end;
end;