# HG changeset patch # User wenzelm # Date 1540923271 -3600 # Node ID 3e4edf43e2548c775206b83f953a10298f0351cc # Parent ae2074acbaa88f5cca8f65513d38b80182b015d3 some support for UTF-8 (similar to Isabelle/Scala version); diff -r ae2074acbaa8 -r 3e4edf43e254 src/Pure/General/utf8.ML --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Pure/General/utf8.ML Tue Oct 30 19:14:31 2018 +0100 @@ -0,0 +1,57 @@ +(* Title: Pure/General/utf8.ML + Author: Makarius + +Variations on UTF-8. +*) + +signature UTF8 = +sig + type codepoint = int + val decode_permissive: string -> codepoint list +end; + +structure UTF8: UTF8 = +struct + +type codepoint = int; + + +(* permissive UTF-8 decoding *) + +(*see also https://en.wikipedia.org/wiki/UTF-8#Description + overlong encodings enable byte-stuffing of low-ASCII*) + +local + +type state = codepoint list * codepoint * int; + +fun flush ((buf, code, rest): state) : state = + if code <> ~1 then + ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0) + else (buf, code, rest); + +fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n); + +fun push x ((buf, code, rest): state) = + if rest <= 0 then init x ~1 (buf, code, rest) + else (buf, code * 64 + Word8.toInt x, rest - 1); + +fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest); + +fun decode (c, state) = + if c < 0w128 then state |> flush |> append c + else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F)) + else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1 + else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2 + else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3 + else state; + +in + +fun decode_permissive text = + Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text) + |> flush |> #1 |> rev; + +end; + +end; diff -r ae2074acbaa8 -r 3e4edf43e254 src/Pure/ROOT.ML --- a/src/Pure/ROOT.ML Tue Oct 30 15:45:24 2018 +0100 +++ b/src/Pure/ROOT.ML Tue Oct 30 19:14:31 2018 +0100 @@ -48,6 +48,7 @@ ML_file "General/properties.ML"; ML_file "General/output.ML"; ML_file "PIDE/markup.ML"; +ML_file "General/utf8.ML"; ML_file "General/scan.ML"; ML_file "General/source.ML"; ML_file "General/symbol.ML";