|
1 (* Title: Pure/General/utf8.ML |
|
2 Author: Makarius |
|
3 |
|
4 Variations on UTF-8. |
|
5 *) |
|
6 |
|
7 signature UTF8 = |
|
8 sig |
|
9 type codepoint = int |
|
10 val decode_permissive: string -> codepoint list |
|
11 end; |
|
12 |
|
13 structure UTF8: UTF8 = |
|
14 struct |
|
15 |
|
16 type codepoint = int; |
|
17 |
|
18 |
|
19 (* permissive UTF-8 decoding *) |
|
20 |
|
21 (*see also https://en.wikipedia.org/wiki/UTF-8#Description |
|
22 overlong encodings enable byte-stuffing of low-ASCII*) |
|
23 |
|
24 local |
|
25 |
|
26 type state = codepoint list * codepoint * int; |
|
27 |
|
28 fun flush ((buf, code, rest): state) : state = |
|
29 if code <> ~1 then |
|
30 ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0) |
|
31 else (buf, code, rest); |
|
32 |
|
33 fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n); |
|
34 |
|
35 fun push x ((buf, code, rest): state) = |
|
36 if rest <= 0 then init x ~1 (buf, code, rest) |
|
37 else (buf, code * 64 + Word8.toInt x, rest - 1); |
|
38 |
|
39 fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest); |
|
40 |
|
41 fun decode (c, state) = |
|
42 if c < 0w128 then state |> flush |> append c |
|
43 else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F)) |
|
44 else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1 |
|
45 else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2 |
|
46 else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3 |
|
47 else state; |
|
48 |
|
49 in |
|
50 |
|
51 fun decode_permissive text = |
|
52 Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text) |
|
53 |> flush |> #1 |> rev; |
|
54 |
|
55 end; |
|
56 |
|
57 end; |