| author | blanchet | 
| Tue, 07 May 2013 10:18:59 +0200 | |
| changeset 51888 | 1cbcc0cc6bdf | 
| parent 50136 | a96bd08258a2 | 
| child 53315 | b102e20cec78 | 
| permissions | -rw-r--r-- | 
| 33823 | 1 | (* Title: Tools/WWW_Find/unicode_symbols.ML | 
| 33817 | 2 | Author: Timothy Bourke, NICTA | 
| 3 | ||
| 50135 | 4 | Ad-hoc parsing of ~~/etc/symbols. | 
| 33817 | 5 | *) | 
| 6 | ||
| 7 | signature UNICODE_SYMBOLS = | |
| 8 | sig | |
| 50135 | 9 | val symbol_to_unicode : string -> int option | 
| 10 | val abbrev_to_unicode : string -> int option | |
| 11 | val unicode_to_symbol : int -> string option | |
| 12 | val unicode_to_abbrev : int -> string option | |
| 13 | val utf8_to_symbols : string -> string | |
| 14 | val utf8 : int list -> string | |
| 33817 | 15 | end; | 
| 16 | ||
| 33823 | 17 | structure UnicodeSymbols : UNICODE_SYMBOLS = | 
| 33817 | 18 | struct | 
| 19 | ||
| 20 | local (* Lexer *) | |
| 21 | ||
| 22 | open Basic_Symbol_Pos | |
| 23 | ||
| 24 | val keywords = | |
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
50135diff
changeset | 25 | Scan.make_lexicon (map Symbol.explode ["code", "group", "font", "abbrev", ":"]); | 
| 33817 | 26 | |
| 27 | datatype token_kind = | |
| 28 | Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF; | |
| 29 | ||
| 30 | datatype token = Token of token_kind * string * Position.range; | |
| 31 | ||
| 32 | fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss); | |
| 33 | ||
| 34 | in | |
| 35 | ||
| 36 | fun mk_eof pos = Token (EOF, "", (pos, Position.none)); | |
| 37 | ||
| 38 | fun str_of_token (Token (_, s, _)) = s; | |
| 39 | ||
| 40 | fun pos_of_token (Token (_, _, (pos, _))) = pos; | |
| 41 | ||
| 42 | fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s) | |
| 43 | | int_of_code _ = error "internal error in UnicodeSymbols.int_of_code" | |
| 44 | ||
| 45 | fun is_proper (Token (Space, _, _)) = false | |
| 46 | | is_proper (Token (Comment, _, _)) = false | |
| 47 | | is_proper _ = true; | |
| 48 | ||
| 49 | fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw') | |
| 50 | | is_keyword _ _ = false; | |
| 51 | ||
| 52 | fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true | |
| 53 | | is_ascii_sym _ = false; | |
| 54 | ||
| 55 | fun is_hex_code (Token (Code, _, _)) = true | |
| 56 | | is_hex_code _ = false; | |
| 57 | ||
| 58 | fun is_symbol (Token (Symbol, _, _)) = true | |
| 59 | | is_symbol _ = false; | |
| 60 | ||
| 61 | fun is_name (Token (Name, _, _)) = true | |
| 62 | | is_name _ = false; | |
| 63 | ||
| 64 | fun is_eof (Token (EOF, _, _)) = true | |
| 65 | | is_eof _ = false; | |
| 66 | ||
| 67 | fun end_position_of (Token (_, _, (_, epos))) = epos; | |
| 68 | ||
| 40527 | 69 | val is_space = Symbol_Pos.symbol #> (fn s => Symbol.is_blank s andalso s <> "\n"); | 
| 33817 | 70 | val scan_space = | 
| 71 | (Scan.many1 is_space @@@ Scan.optional ($$$ "\n") [] | |
| 72 | || | |
| 73 | Scan.many is_space @@@ ($$$ "\n")) >> token Space; | |
| 74 | ||
| 75 | val scan_code = Lexicon.scan_hex #>> token Code; | |
| 76 | ||
| 77 | val scan_ascii_symbol = Scan.one | |
| 78 | ((fn x => Symbol.is_ascii x andalso | |
| 79 | not (Symbol.is_ascii_letter x | |
| 80 | orelse Symbol.is_ascii_digit x | |
| 40527 | 81 | orelse Symbol.is_ascii_blank x)) o Symbol_Pos.symbol) | 
| 82 | -- Scan.many (not o Symbol.is_ascii_blank o Symbol_Pos.symbol) | |
| 33817 | 83 | >> (token AsciiSymbol o op ::); | 
| 84 | ||
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40527diff
changeset | 85 | fun not_contains xs c = not (member (op =) (raw_explode xs) (Symbol_Pos.symbol c)); | 
| 33817 | 86 | val scan_comment = | 
| 87 | $$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n")) | |
| 88 | >> token Comment; | |
| 89 | ||
| 43469 | 90 | fun is_sym s = | 
| 43485 | 91 | (case Symbol.decode s of | 
| 92 | Symbol.Sym _ => true | |
| 93 | | Symbol.Ctrl _ => true | |
| 94 | | _ => false); | |
| 43469 | 95 | |
| 33817 | 96 | fun tokenize syms = | 
| 97 | let | |
| 98 | val scanner = | |
| 43469 | 99 | Scan.one (is_sym o Symbol_Pos.symbol) >> (token Symbol o single) || | 
| 33817 | 100 | scan_comment || | 
| 101 | scan_space || | |
| 102 | scan_code || | |
| 103 | Scan.literal keywords >> token Keyword || | |
| 104 | scan_ascii_symbol || | |
| 105 | Lexicon.scan_id >> token Name; | |
| 43947 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43602diff
changeset | 106 | val scan_token = Symbol_Pos.!!! (fn () => "Lexical error") (Scan.bulk scanner); | 
| 33817 | 107 | in | 
| 108 | (case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of | |
| 109 | (toks, []) => toks | |
| 43947 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43602diff
changeset | 110 | | (_, ss) => | 
| 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43602diff
changeset | 111 |         error ("Lexical error at: " ^ Symbol_Pos.content ss ^
 | 
| 48992 | 112 | Position.here (#1 (Symbol_Pos.range ss)))) | 
| 33817 | 113 | end; | 
| 114 | ||
| 115 | val stopper = | |
| 116 | Scan.stopper | |
| 117 | (fn [] => mk_eof Position.none | |
| 118 | | toks => mk_eof (end_position_of (List.last toks))) is_eof; | |
| 119 | ||
| 120 | end; | |
| 121 | ||
| 122 | local (* Parser *) | |
| 123 | ||
| 124 | fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token; | |
| 125 | val hex_code = Scan.one is_hex_code >> int_of_code; | |
| 126 | val ascii_sym = Scan.one is_ascii_sym >> str_of_token; | |
| 127 | val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t)); | |
| 128 | val name = Scan.one is_name >> str_of_token; | |
| 129 | ||
| 130 | val unicode = $$$ "code" -- $$$ ":" |-- hex_code; | |
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
50135diff
changeset | 131 | val group = Scan.option ($$$ "group" -- $$$ ":" |-- name); | 
| 33817 | 132 | val font = Scan.option ($$$ "font" -- $$$ ":" |-- name); | 
| 133 | val abbr = Scan.option ($$$ "abbrev" -- $$$ ":" | |
| 134 | |-- (ascii_sym || $$$ ":" || name)); | |
| 135 | ||
| 136 | in | |
| 137 | ||
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
50135diff
changeset | 138 | val line = (symbol -- unicode -- group -- font -- abbr) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
50135diff
changeset | 139 | >> (fn ((((a, b), _), _), c) => (a, b, c)); | 
| 33817 | 140 | |
| 141 | end; | |
| 142 | ||
| 143 | local (* build tables *) | |
| 144 | ||
| 145 | fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) = | |
| 146 | (case oabbr of | |
| 147 | NONE => | |
| 148 | (Symtab.update_new (sym, uni) fromsym, | |
| 149 | fromabbr, | |
| 150 | Inttab.update (uni, sym) tosym, | |
| 151 | toabbr) | |
| 152 | | SOME abbr => | |
| 153 | (Symtab.update_new (sym, uni) fromsym, | |
| 154 | Symtab.update_new (abbr, uni) fromabbr, | |
| 155 | Inttab.update (uni, sym) tosym, | |
| 156 | Inttab.update (uni, abbr) toabbr)) | |
| 48992 | 157 |   handle Symtab.DUP sym => error ("Duplicate at" ^ Position.here pos)
 | 
| 158 |        | Inttab.DUP sym => error ("Duplicate code at" ^ Position.here pos);
 | |
| 33817 | 159 | |
| 160 | in | |
| 161 | ||
| 162 | fun read_symbols path = | |
| 163 | let | |
| 164 | val parsed_lines = | |
| 43602 | 165 | Symbol_Pos.explode (File.read path, Path.position path) | 
| 33817 | 166 | |> tokenize | 
| 167 | |> filter is_proper | |
| 168 | |> Scan.finite stopper (Scan.repeat line) | |
| 169 | |> fst; | |
| 170 | in | |
| 171 | Library.foldl add_entries | |
| 172 | ((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty), | |
| 173 | parsed_lines) | |
| 174 | end; | |
| 175 | ||
| 176 | end; | |
| 177 | ||
| 178 | local | |
| 43602 | 179 | val (fromsym, fromabbr, tosym, toabbr) = read_symbols (Path.explode "~~/etc/symbols"); | 
| 33817 | 180 | in | 
| 181 | val symbol_to_unicode = Symtab.lookup fromsym; | |
| 182 | val abbrev_to_unicode = Symtab.lookup fromabbr; | |
| 183 | val unicode_to_symbol = Inttab.lookup tosym; | |
| 184 | val unicode_to_abbrev = Inttab.lookup toabbr; | |
| 185 | end; | |
| 186 | ||
| 187 | fun utf8_to_symbols utf8str = | |
| 188 | let | |
| 189 | val get_next = | |
| 190 | Substring.getc | |
| 191 | #> Option.map (apfst Byte.charToByte); | |
| 192 | val wstr = String.str o Byte.byteToChar; | |
| 193 | val replacement_char = "\<questiondown>"; | |
| 194 | ||
| 195 | fun word_to_symbol w = | |
| 196 | (case (unicode_to_symbol o Word32.toInt) w of | |
| 197 | NONE => "?" | |
| 198 | | SOME s => s); | |
| 199 | ||
| 200 | fun andb32 (w1, w2) = | |
| 201 | Word8.andb(w1, w2) | |
| 202 | |> Word8.toLarge | |
| 203 | |> Word32.fromLarge; | |
| 204 | ||
| 205 | fun read_next (ss, 0, c) = (word_to_symbol c, ss) | |
| 206 | | read_next (ss, n, c) = | |
| 207 | (case get_next ss of | |
| 208 | NONE => (replacement_char, ss) | |
| 209 | | SOME (w, ss') => | |
| 210 | if Word8.andb (w, 0wxc0) <> 0wx80 | |
| 211 | then (replacement_char, ss') | |
| 212 | else | |
| 213 | let | |
| 214 | val w' = (Word8.andb (w, 0wx3f)); | |
| 215 | val bw = (Word32.fromLarge o Word8.toLarge) w'; | |
| 216 | val c' = Word32.<< (c, 0wx6); | |
| 217 | in read_next (ss', n - 1, Word32.orb (c', bw)) end); | |
| 218 | ||
| 219 | fun do_char (w, ss) = | |
| 220 | if Word8.andb (w, 0wx80) = 0wx00 | |
| 221 | then (wstr w, ss) | |
| 222 | else if Word8.andb (w, 0wx60) = 0wx40 | |
| 223 | then read_next (ss, 1, andb32 (w, 0wx1f)) | |
| 224 | else if Word8.andb (w, 0wxf0) = 0wxe0 | |
| 225 | then read_next (ss, 2, andb32 (w, 0wx0f)) | |
| 226 | else if Word8.andb (w, 0wxf8) = 0wxf0 | |
| 227 | then read_next (ss, 3, andb32 (w, 0wx07)) | |
| 228 | else (replacement_char, ss); | |
| 229 | ||
| 230 | fun read (rs, ss) = | |
| 231 | (case Option.map do_char (get_next ss) of | |
| 232 | NONE => (implode o rev) rs | |
| 233 | | SOME (s, ss') => read (s::rs, ss')); | |
| 234 | in read ([], Substring.full utf8str) end; | |
| 235 | ||
| 236 | local | |
| 237 | ||
| 238 | fun consec n = | |
| 239 | fn w => ( | |
| 240 | Word32.>> (w, Word.fromInt (n * 6)) | |
| 241 | |> (curry Word32.andb) 0wx3f | |
| 242 | |> (curry Word32.orb) 0wx80 | |
| 243 | |> Word8.fromLargeWord o Word32.toLargeWord); | |
| 244 | ||
| 245 | fun stamp n = | |
| 246 | fn w => ( | |
| 247 | Word32.>> (w, Word.fromInt (n * 6)) | |
| 248 | |> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2))) | |
| 249 | |> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n))) | |
| 250 | |> Word8.fromLargeWord o Word32.toLargeWord); | |
| 251 | ||
| 252 | fun to_utf8_bytes i = | |
| 253 | if i <= 0x007f | |
| 254 | then [Word8.fromInt i] | |
| 255 | else let | |
| 256 | val w = Word32.fromInt i; | |
| 257 | in | |
| 258 | if i < 0x07ff | |
| 259 | then [stamp 1 w, consec 0 w] | |
| 260 | else if i < 0xffff | |
| 261 | then [stamp 2 w, consec 1 w, consec 0 w] | |
| 262 | else if i < 0x10ffff | |
| 263 | then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w] | |
| 264 | else [] | |
| 265 | end; | |
| 266 | ||
| 267 | in | |
| 268 | ||
| 269 | fun utf8 is = | |
| 270 | map to_utf8_bytes is | |
| 271 | |> flat | |
| 272 | |> Word8Vector.fromList | |
| 273 | |> Byte.bytesToString; | |
| 274 | ||
| 275 | end | |
| 276 | ||
| 277 | end; | |
| 278 |