| author | wenzelm | 
| Wed, 19 May 2021 15:53:55 +0200 | |
| changeset 73746 | b2d47981c8dc | 
| parent 69208 | 3e4edf43e254 | 
| permissions | -rw-r--r-- | 
| 69208 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 1 | (* Title: Pure/General/utf8.ML | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 2 | Author: Makarius | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 3 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 4 | Variations on UTF-8. | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 5 | *) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 6 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 7 | signature UTF8 = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 8 | sig | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 9 | type codepoint = int | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 10 | val decode_permissive: string -> codepoint list | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 11 | end; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 12 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 13 | structure UTF8: UTF8 = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 14 | struct | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 15 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 16 | type codepoint = int; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 17 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 18 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 19 | (* permissive UTF-8 decoding *) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 20 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 21 | (*see also https://en.wikipedia.org/wiki/UTF-8#Description | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 22 | overlong encodings enable byte-stuffing of low-ASCII*) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 23 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 24 | local | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 25 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 26 | type state = codepoint list * codepoint * int; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 27 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 28 | fun flush ((buf, code, rest): state) : state = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 29 | if code <> ~1 then | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 30 | ((if rest = 0 andalso code <= 0x10FFFF then code else 0xFFFD) :: buf, ~1, 0) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 31 | else (buf, code, rest); | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 32 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 33 | fun init x n (state: state) : state = (#1 (flush state), Word8.toInt x, n); | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 34 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 35 | fun push x ((buf, code, rest): state) = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 36 | if rest <= 0 then init x ~1 (buf, code, rest) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 37 | else (buf, code * 64 + Word8.toInt x, rest - 1); | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 38 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 39 | fun append x ((buf, code, rest): state) : state = (Word8.toInt x :: buf, code, rest); | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 40 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 41 | fun decode (c, state) = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 42 | if c < 0w128 then state |> flush |> append c | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 43 | else if Word8.andb (c, 0wxC0) = 0wx80 then state |> push (Word8.andb (c, 0wx3F)) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 44 | else if Word8.andb (c, 0wxE0) = 0wxC0 then state |> init (Word8.andb (c, 0wx1F)) 1 | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 45 | else if Word8.andb (c, 0wxF0) = 0wxE0 then state |> init (Word8.andb (c, 0wx0F)) 2 | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 46 | else if Word8.andb (c, 0wxF8) = 0wxF0 then state |> init (Word8.andb (c, 0wx07)) 3 | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 47 | else state; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 48 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 49 | in | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 50 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 51 | fun decode_permissive text = | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 52 | Word8Vector.foldl decode ([], ~1, 0) (Byte.stringToBytes text) | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 53 | |> flush |> #1 |> rev; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 54 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 55 | end; | 
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 56 | |
| 
3e4edf43e254
some support for UTF-8 (similar to Isabelle/Scala version);
 wenzelm parents: diff
changeset | 57 | end; |