some support for UTF-8 (similar to Isabelle/Scala version);
authorwenzelm
Tue Oct 30 19:14:31 2018 +0100 (9 months ago)
changeset 692083e4edf43e254
parent 69207 ae2074acbaa8
child 69209 3f4210c13356
some support for UTF-8 (similar to Isabelle/Scala version);
src/Pure/General/utf8.ML
src/Pure/ROOT.ML
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/Pure/General/utf8.ML	Tue Oct 30 19:14:31 2018 +0100
     1.3 @@ -0,0 +1,57 @@
     1.4 +(*  Title:      Pure/General/utf8.ML
     1.5 +    Author:     Makarius
     1.6 +
     1.7 +Variations on UTF-8.
     1.8 +*)
     1.9 +
    1.10 +signature UTF8 =
    1.11 +sig
    1.12 +  type codepoint = int
    1.13 +  val decode_permissive: string -> codepoint list
    1.14 +end;
    1.15 +
    1.16 +structure UTF8: UTF8 =
    1.17 +struct
    1.18 +
    1.19 +type codepoint = int;
    1.20 +
    1.21 +
    1.22 +(* permissive UTF-8 decoding *)
    1.23 +
    1.24 +(*see also https://en.wikipedia.org/wiki/UTF-8#Description
    1.25 +  overlong encodings enable byte-stuffing of low-ASCII*)
    1.26 +
    1.27 +local
    1.28 +
    1.29 +type state = codepoint list * codepoint * int;
    1.30 +
    1.31 +fun flush ((buf, code, rest): state) : state =
    1.32 +  if code <> ~1 then
    1.33 +    ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
    1.34 +  else (buf, code, rest);
    1.35 +
    1.36 +fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);
    1.37 +
    1.38 +fun push x ((buf, code, rest): state) =
    1.39 +  if rest <= 0 then init x ~1 (buf, code, rest)
    1.40 +  else (buf, code * 64 + Word8.toInt x, rest - 1);
    1.41 +
    1.42 +fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);
    1.43 +
    1.44 +fun decode (c, state) =
    1.45 +  if c < 0w128 then state |> flush |> append c
    1.46 +  else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
    1.47 +  else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
    1.48 +  else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
    1.49 +  else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
    1.50 +  else state;
    1.51 +
    1.52 +in
    1.53 +
    1.54 +fun decode_permissive text =
    1.55 +  Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
    1.56 +  |> flush |> #1 |> rev;
    1.57 +
    1.58 +end;
    1.59 +
    1.60 +end;
     2.1 --- a/src/Pure/ROOT.ML	Tue Oct 30 15:45:24 2018 +0100
     2.2 +++ b/src/Pure/ROOT.ML	Tue Oct 30 19:14:31 2018 +0100
     2.3 @@ -48,6 +48,7 @@
     2.4  ML_file "General/properties.ML";
     2.5  ML_file "General/output.ML";
     2.6  ML_file "PIDE/markup.ML";
     2.7 +ML_file "General/utf8.ML";
     2.8  ML_file "General/scan.ML";
     2.9  ML_file "General/source.ML";
    2.10  ML_file "General/symbol.ML";