src/Pure/General/utf8.ML
changeset 69208 3e4edf43e254
equal deleted inserted replaced
69207:ae2074acbaa8 69208:3e4edf43e254
       
     1 (*  Title:      Pure/General/utf8.ML
       
     2     Author:     Makarius
       
     3 
       
     4 Variations on UTF-8.
       
     5 *)
       
     6 
       
     7 signature UTF8 =
       
     8 sig
       
     9   type codepoint = int
       
    10   val decode_permissive: string -> codepoint list
       
    11 end;
       
    12 
       
    13 structure UTF8: UTF8 =
       
    14 struct
       
    15 
       
    16 type codepoint = int;
       
    17 
       
    18 
       
    19 (* permissive UTF-8 decoding *)
       
    20 
       
    21 (*see also https://en.wikipedia.org/wiki/UTF-8#Description
       
    22   overlong encodings enable byte-stuffing of low-ASCII*)
       
    23 
       
    24 local
       
    25 
       
    26 type state = codepoint list * codepoint * int;
       
    27 
       
    28 fun flush ((buf, code, rest): state) : state =
       
    29   if code <> ~1 then
       
    30     ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
       
    31   else (buf, code, rest);
       
    32 
       
    33 fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);
       
    34 
       
    35 fun push x ((buf, code, rest): state) =
       
    36   if rest <= 0 then init x ~1 (buf, code, rest)
       
    37   else (buf, code * 64 + Word8.toInt x, rest - 1);
       
    38 
       
    39 fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);
       
    40 
       
    41 fun decode (c, state) =
       
    42   if c < 0w128 then state |> flush |> append c
       
    43   else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
       
    44   else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
       
    45   else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
       
    46   else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
       
    47   else state;
       
    48 
       
    49 in
       
    50 
       
    51 fun decode_permissive text =
       
    52   Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
       
    53   |> flush |> #1 |> rev;
       
    54 
       
    55 end;
       
    56 
       
    57 end;