some support for UTF-8 (similar to Isabelle/Scala version);
authorwenzelm
Tue, 30 Oct 2018 19:14:31 +0100
changeset 69208 3e4edf43e254
parent 69207 ae2074acbaa8
child 69209 3f4210c13356
some support for UTF-8 (similar to Isabelle/Scala version);
src/Pure/General/utf8.ML
src/Pure/ROOT.ML
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/General/utf8.ML	Tue Oct 30 19:14:31 2018 +0100
@@ -0,0 +1,57 @@
+(*  Title:      Pure/General/utf8.ML
+    Author:     Makarius
+
+Variations on UTF-8.
+*)
+
+signature UTF8 =
+sig
+  type codepoint = int
+  val decode_permissive: string -> codepoint list
+end;
+
+structure UTF8: UTF8 =
+struct
+
+type codepoint = int;
+
+
+(* permissive UTF-8 decoding *)
+
+(*see also https://en.wikipedia.org/wiki/UTF-8#Description
+  overlong encodings enable byte-stuffing of low-ASCII*)
+
+local
+
+type state = codepoint list * codepoint * int;
+
+fun flush ((buf, code, rest): state) : state =
+  if code <> ~1 then
+    ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
+  else (buf, code, rest);
+
+fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);
+
+fun push x ((buf, code, rest): state) =
+  if rest <= 0 then init x ~1 (buf, code, rest)
+  else (buf, code * 64 + Word8.toInt x, rest - 1);
+
+fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);
+
+fun decode (c, state) =
+  if c < 0w128 then state |> flush |> append c
+  else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
+  else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
+  else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
+  else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
+  else state;
+
+in
+
+fun decode_permissive text =
+  Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
+  |> flush |> #1 |> rev;
+
+end;
+
+end;
--- a/src/Pure/ROOT.ML	Tue Oct 30 15:45:24 2018 +0100
+++ b/src/Pure/ROOT.ML	Tue Oct 30 19:14:31 2018 +0100
@@ -48,6 +48,7 @@
 ML_file "General/properties.ML";
 ML_file "General/output.ML";
 ML_file "PIDE/markup.ML";
+ML_file "General/utf8.ML";
 ML_file "General/scan.ML";
 ML_file "General/source.ML";
 ML_file "General/symbol.ML";