author | wenzelm |
Sun, 27 Oct 2024 11:02:21 +0100 | |
changeset 81266 | 8300511f4c45 |
parent 69208 | 3e4edf43e254 |
permissions | -rw-r--r-- |
69208
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
1 |
(* Title: Pure/General/utf8.ML |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
2 |
Author: Makarius |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
3 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
4 |
Variations on UTF-8. |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
5 |
*) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
6 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
7 |
signature UTF8 = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
8 |
sig |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
9 |
type codepoint = int |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
10 |
val decode_permissive: string -> codepoint list |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
11 |
end; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
12 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
13 |
structure UTF8: UTF8 = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
14 |
struct |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
15 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
16 |
type codepoint = int; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
17 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
18 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
19 |
(* permissive UTF-8 decoding *) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
20 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
21 |
(*see also https://en.wikipedia.org/wiki/UTF-8#Description |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
22 |
overlong encodings enable byte-stuffing of low-ASCII*) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
23 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
24 |
local |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
25 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
26 |
type state = codepoint list * codepoint * int; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
27 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
28 |
fun flush ((buf, code, rest): state) : state = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
29 |
if code <> ~1 then |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
30 |
((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
31 |
else (buf, code, rest); |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
32 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
33 |
fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n); |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
34 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
35 |
fun push x ((buf, code, rest): state) = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
36 |
if rest <= 0 then init x ~1 (buf, code, rest) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
37 |
else (buf, code * 64 + Word8.toInt x, rest - 1); |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
38 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
39 |
fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest); |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
40 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
41 |
fun decode (c, state) = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
42 |
if c < 0w128 then state |> flush |> append c |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
43 |
else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F)) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
44 |
else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1 |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
45 |
else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2 |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
46 |
else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3 |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
47 |
else state; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
48 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
49 |
in |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
50 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
51 |
fun decode_permissive text = |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
52 |
Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text) |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
53 |
|> flush |> #1 |> rev; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
54 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
55 |
end; |
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
56 |
|
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
wenzelm
parents:
diff
changeset
|
57 |
end; |