src/Pure/General/utf8.ML
author wenzelm
Sun, 27 Oct 2024 11:02:21 +0100
changeset 81266 8300511f4c45
parent 69208 3e4edf43e254
permissions -rw-r--r--
clarified signature;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
69208
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/General/utf8.ML
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     3
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     4
Variations on UTF-8.
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     5
*)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     6
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     7
signature UTF8 =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     8
sig
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
     9
  type codepoint = int
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    10
  val decode_permissive: string -> codepoint list
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    11
end;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    12
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    13
structure UTF8: UTF8 =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    14
struct
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    15
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    16
type codepoint = int;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    17
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    18
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    19
(* permissive UTF-8 decoding *)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    20
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    21
(*see also https://en.wikipedia.org/wiki/UTF-8#Description
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    22
  overlong encodings enable byte-stuffing of low-ASCII*)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    23
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    24
local
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    25
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    26
type state = codepoint list * codepoint * int;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    27
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    28
fun flush ((buf, code, rest): state) : state =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    29
  if code <> ~1 then
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    30
    ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    31
  else (buf, code, rest);
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    32
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    33
fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n);
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    34
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    35
fun push x ((buf, code, rest): state) =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    36
  if rest <= 0 then init x ~1 (buf, code, rest)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    37
  else (buf, code * 64 + Word8.toInt x, rest - 1);
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    38
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    39
fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest);
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    40
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    41
fun decode (c, state) =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    42
  if c < 0w128 then state |> flush |> append c
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    43
  else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F))
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    44
  else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    45
  else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    46
  else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    47
  else state;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    48
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    49
in
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    50
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    51
fun decode_permissive text =
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    52
  Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text)
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    53
  |> flush |> #1 |> rev;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    54
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    55
end;
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    56
3e4edf43e254 some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff changeset
    57
end;