src/Pure/General/utf8.ML
author wenzelm
Thu, 22 Nov 2018 17:34:30 +0100
changeset 69327 264b44dce6be
parent 69208 3e4edf43e254
permissions -rw-r--r--
tuned;

(*  Title:      Pure/General/utf8.ML
    Author:     Makarius

Variations on UTF-8.
*)

signature UTF8 =
sig
  type codepoint = int
  val decode_permissive: string -> codepoint list
end;

structure UTF8: UTF8 =
struct

type codepoint = int;


(* permissive UTF-8 decoding *)

(*see also https://en.wikipedia.org/wiki/UTF-8#Description
  overlong encodings enable byte-stuffing of low-ASCII*)

local

type state = codepoint list * codepoint * int;

fun flush ((buf, code, rest): state) : state =
  if code <> ~1 then
    ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
  else (buf, code, rest);

fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);

fun push x ((buf, code, rest): state) =
  if rest <= 0 then init x ~1 (buf, code, rest)
  else (buf, code * 64 + Word8.toInt x, rest - 1);

fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);

fun decode (c, state) =
  if c < 0w128 then state |> flush |> append c
  else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
  else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
  else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
  else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
  else state;

in

fun decode_permissive text =
  Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
  |> flush |> #1 |> rev;

end;

end;