author | nipkow |
Fri, 09 Sep 2011 06:47:14 +0200 | |
changeset 44851 | 4bc70ab28787 |
parent 43947 | 9b00f09f7721 |
child 48992 | 0518bf89c777 |
permissions | -rw-r--r-- |
33823 | 1 |
(* Title: Tools/WWW_Find/unicode_symbols.ML |
33817 | 2 |
Author: Timothy Bourke, NICTA |
3 |
||
33823 | 4 |
Parse the ISABELLE_HOME/etc/symbols file. |
33817 | 5 |
*) |
6 |
||
7 |
signature UNICODE_SYMBOLS = |
|
8 |
sig |
|
9 |
val symbol_to_unicode : string -> int option |
|
10 |
val abbrev_to_unicode : string -> int option |
|
11 |
val unicode_to_symbol : int -> string option |
|
12 |
val unicode_to_abbrev : int -> string option |
|
13 |
val utf8_to_symbols : string -> string |
|
14 |
val utf8 : int list -> string |
|
15 |
end; |
|
16 |
||
33823 | 17 |
structure UnicodeSymbols : UNICODE_SYMBOLS = |
33817 | 18 |
struct |
19 |
||
20 |
local (* Lexer *) |
|
21 |
||
22 |
open Basic_Symbol_Pos |
|
23 |
||
24 |
val keywords = |
|
25 |
Scan.make_lexicon (map Symbol.explode ["code", "font", "abbrev", ":"]); |
|
26 |
||
27 |
datatype token_kind = |
|
28 |
Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF; |
|
29 |
||
30 |
datatype token = Token of token_kind * string * Position.range; |
|
31 |
||
32 |
fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss); |
|
33 |
||
34 |
in |
|
35 |
||
36 |
fun mk_eof pos = Token (EOF, "", (pos, Position.none)); |
|
37 |
||
38 |
fun str_of_token (Token (_, s, _)) = s; |
|
39 |
||
40 |
fun pos_of_token (Token (_, _, (pos, _))) = pos; |
|
41 |
||
42 |
fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s) |
|
43 |
| int_of_code _ = error "internal error in UnicodeSymbols.int_of_code" |
|
44 |
||
45 |
fun is_proper (Token (Space, _, _)) = false |
|
46 |
| is_proper (Token (Comment, _, _)) = false |
|
47 |
| is_proper _ = true; |
|
48 |
||
49 |
fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw') |
|
50 |
| is_keyword _ _ = false; |
|
51 |
||
52 |
fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true |
|
53 |
| is_ascii_sym _ = false; |
|
54 |
||
55 |
fun is_hex_code (Token (Code, _, _)) = true |
|
56 |
| is_hex_code _ = false; |
|
57 |
||
58 |
fun is_symbol (Token (Symbol, _, _)) = true |
|
59 |
| is_symbol _ = false; |
|
60 |
||
61 |
fun is_name (Token (Name, _, _)) = true |
|
62 |
| is_name _ = false; |
|
63 |
||
64 |
fun is_eof (Token (EOF, _, _)) = true |
|
65 |
| is_eof _ = false; |
|
66 |
||
67 |
fun end_position_of (Token (_, _, (_, epos))) = epos; |
|
68 |
||
40527 | 69 |
val is_space = Symbol_Pos.symbol #> (fn s => Symbol.is_blank s andalso s <> "\n"); |
33817 | 70 |
val scan_space = |
71 |
(Scan.many1 is_space @@@ Scan.optional ($$$ "\n") [] |
|
72 |
|| |
|
73 |
Scan.many is_space @@@ ($$$ "\n")) >> token Space; |
|
74 |
||
75 |
val scan_code = Lexicon.scan_hex #>> token Code; |
|
76 |
||
77 |
val scan_ascii_symbol = Scan.one |
|
78 |
((fn x => Symbol.is_ascii x andalso |
|
79 |
not (Symbol.is_ascii_letter x |
|
80 |
orelse Symbol.is_ascii_digit x |
|
40527 | 81 |
orelse Symbol.is_ascii_blank x)) o Symbol_Pos.symbol) |
82 |
-- Scan.many (not o Symbol.is_ascii_blank o Symbol_Pos.symbol) |
|
33817 | 83 |
>> (token AsciiSymbol o op ::); |
84 |
||
40627
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents:
40527
diff
changeset
|
85 |
fun not_contains xs c = not (member (op =) (raw_explode xs) (Symbol_Pos.symbol c)); |
33817 | 86 |
val scan_comment = |
87 |
$$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n")) |
|
88 |
>> token Comment; |
|
89 |
||
43469 | 90 |
fun is_sym s = |
43485 | 91 |
(case Symbol.decode s of |
92 |
Symbol.Sym _ => true |
|
93 |
| Symbol.Ctrl _ => true |
|
94 |
| _ => false); |
|
43469 | 95 |
|
33817 | 96 |
fun tokenize syms = |
97 |
let |
|
98 |
val scanner = |
|
43469 | 99 |
Scan.one (is_sym o Symbol_Pos.symbol) >> (token Symbol o single) || |
33817 | 100 |
scan_comment || |
101 |
scan_space || |
|
102 |
scan_code || |
|
103 |
Scan.literal keywords >> token Keyword || |
|
104 |
scan_ascii_symbol || |
|
105 |
Lexicon.scan_id >> token Name; |
|
43947
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm
parents:
43602
diff
changeset
|
106 |
val scan_token = Symbol_Pos.!!! (fn () => "Lexical error") (Scan.bulk scanner); |
33817 | 107 |
in |
108 |
(case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of |
|
109 |
(toks, []) => toks |
|
43947
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm
parents:
43602
diff
changeset
|
110 |
| (_, ss) => |
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm
parents:
43602
diff
changeset
|
111 |
error ("Lexical error at: " ^ Symbol_Pos.content ss ^ |
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm
parents:
43602
diff
changeset
|
112 |
Position.str_of (#1 (Symbol_Pos.range ss)))) |
33817 | 113 |
end; |
114 |
||
115 |
val stopper = |
|
116 |
Scan.stopper |
|
117 |
(fn [] => mk_eof Position.none |
|
118 |
| toks => mk_eof (end_position_of (List.last toks))) is_eof; |
|
119 |
||
120 |
end; |
|
121 |
||
122 |
local (* Parser *) |
|
123 |
||
124 |
fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token; |
|
125 |
val hex_code = Scan.one is_hex_code >> int_of_code; |
|
126 |
val ascii_sym = Scan.one is_ascii_sym >> str_of_token; |
|
127 |
val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t)); |
|
128 |
val name = Scan.one is_name >> str_of_token; |
|
129 |
||
130 |
val unicode = $$$ "code" -- $$$ ":" |-- hex_code; |
|
131 |
val font = Scan.option ($$$ "font" -- $$$ ":" |-- name); |
|
132 |
val abbr = Scan.option ($$$ "abbrev" -- $$$ ":" |
|
133 |
|-- (ascii_sym || $$$ ":" || name)); |
|
134 |
||
135 |
in |
|
136 |
||
36960
01594f816e3a
prefer structure Keyword, Parse, Parse_Spec, Outer_Syntax;
wenzelm
parents:
36692
diff
changeset
|
137 |
val line = (symbol -- unicode --| font -- abbr) >> Parse.triple1; |
33817 | 138 |
|
139 |
end; |
|
140 |
||
141 |
local (* build tables *) |
|
142 |
||
143 |
fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) = |
|
144 |
(case oabbr of |
|
145 |
NONE => |
|
146 |
(Symtab.update_new (sym, uni) fromsym, |
|
147 |
fromabbr, |
|
148 |
Inttab.update (uni, sym) tosym, |
|
149 |
toabbr) |
|
150 |
| SOME abbr => |
|
151 |
(Symtab.update_new (sym, uni) fromsym, |
|
152 |
Symtab.update_new (abbr, uni) fromabbr, |
|
153 |
Inttab.update (uni, sym) tosym, |
|
154 |
Inttab.update (uni, abbr) toabbr)) |
|
155 |
handle Symtab.DUP sym => error ("Duplicate at" ^ Position.str_of pos) |
|
156 |
| Inttab.DUP sym => error ("Duplicate code at" ^ Position.str_of pos); |
|
157 |
||
158 |
in |
|
159 |
||
160 |
fun read_symbols path = |
|
161 |
let |
|
162 |
val parsed_lines = |
|
43602 | 163 |
Symbol_Pos.explode (File.read path, Path.position path) |
33817 | 164 |
|> tokenize |
165 |
|> filter is_proper |
|
166 |
|> Scan.finite stopper (Scan.repeat line) |
|
167 |
|> fst; |
|
168 |
in |
|
169 |
Library.foldl add_entries |
|
170 |
((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty), |
|
171 |
parsed_lines) |
|
172 |
end; |
|
173 |
||
174 |
end; |
|
175 |
||
176 |
local |
|
43602 | 177 |
val (fromsym, fromabbr, tosym, toabbr) = read_symbols (Path.explode "~~/etc/symbols"); |
33817 | 178 |
in |
179 |
val symbol_to_unicode = Symtab.lookup fromsym; |
|
180 |
val abbrev_to_unicode = Symtab.lookup fromabbr; |
|
181 |
val unicode_to_symbol = Inttab.lookup tosym; |
|
182 |
val unicode_to_abbrev = Inttab.lookup toabbr; |
|
183 |
end; |
|
184 |
||
185 |
fun utf8_to_symbols utf8str = |
|
186 |
let |
|
187 |
val get_next = |
|
188 |
Substring.getc |
|
189 |
#> Option.map (apfst Byte.charToByte); |
|
190 |
val wstr = String.str o Byte.byteToChar; |
|
191 |
val replacement_char = "\<questiondown>"; |
|
192 |
||
193 |
fun word_to_symbol w = |
|
194 |
(case (unicode_to_symbol o Word32.toInt) w of |
|
195 |
NONE => "?" |
|
196 |
| SOME s => s); |
|
197 |
||
198 |
fun andb32 (w1, w2) = |
|
199 |
Word8.andb(w1, w2) |
|
200 |
|> Word8.toLarge |
|
201 |
|> Word32.fromLarge; |
|
202 |
||
203 |
fun read_next (ss, 0, c) = (word_to_symbol c, ss) |
|
204 |
| read_next (ss, n, c) = |
|
205 |
(case get_next ss of |
|
206 |
NONE => (replacement_char, ss) |
|
207 |
| SOME (w, ss') => |
|
208 |
if Word8.andb (w, 0wxc0) <> 0wx80 |
|
209 |
then (replacement_char, ss') |
|
210 |
else |
|
211 |
let |
|
212 |
val w' = (Word8.andb (w, 0wx3f)); |
|
213 |
val bw = (Word32.fromLarge o Word8.toLarge) w'; |
|
214 |
val c' = Word32.<< (c, 0wx6); |
|
215 |
in read_next (ss', n - 1, Word32.orb (c', bw)) end); |
|
216 |
||
217 |
fun do_char (w, ss) = |
|
218 |
if Word8.andb (w, 0wx80) = 0wx00 |
|
219 |
then (wstr w, ss) |
|
220 |
else if Word8.andb (w, 0wx60) = 0wx40 |
|
221 |
then read_next (ss, 1, andb32 (w, 0wx1f)) |
|
222 |
else if Word8.andb (w, 0wxf0) = 0wxe0 |
|
223 |
then read_next (ss, 2, andb32 (w, 0wx0f)) |
|
224 |
else if Word8.andb (w, 0wxf8) = 0wxf0 |
|
225 |
then read_next (ss, 3, andb32 (w, 0wx07)) |
|
226 |
else (replacement_char, ss); |
|
227 |
||
228 |
fun read (rs, ss) = |
|
229 |
(case Option.map do_char (get_next ss) of |
|
230 |
NONE => (implode o rev) rs |
|
231 |
| SOME (s, ss') => read (s::rs, ss')); |
|
232 |
in read ([], Substring.full utf8str) end; |
|
233 |
||
234 |
local |
|
235 |
||
236 |
fun consec n = |
|
237 |
fn w => ( |
|
238 |
Word32.>> (w, Word.fromInt (n * 6)) |
|
239 |
|> (curry Word32.andb) 0wx3f |
|
240 |
|> (curry Word32.orb) 0wx80 |
|
241 |
|> Word8.fromLargeWord o Word32.toLargeWord); |
|
242 |
||
243 |
fun stamp n = |
|
244 |
fn w => ( |
|
245 |
Word32.>> (w, Word.fromInt (n * 6)) |
|
246 |
|> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2))) |
|
247 |
|> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n))) |
|
248 |
|> Word8.fromLargeWord o Word32.toLargeWord); |
|
249 |
||
250 |
fun to_utf8_bytes i = |
|
251 |
if i <= 0x007f |
|
252 |
then [Word8.fromInt i] |
|
253 |
else let |
|
254 |
val w = Word32.fromInt i; |
|
255 |
in |
|
256 |
if i < 0x07ff |
|
257 |
then [stamp 1 w, consec 0 w] |
|
258 |
else if i < 0xffff |
|
259 |
then [stamp 2 w, consec 1 w, consec 0 w] |
|
260 |
else if i < 0x10ffff |
|
261 |
then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w] |
|
262 |
else [] |
|
263 |
end; |
|
264 |
||
265 |
in |
|
266 |
||
267 |
fun utf8 is = |
|
268 |
map to_utf8_bytes is |
|
269 |
|> flat |
|
270 |
|> Word8Vector.fromList |
|
271 |
|> Byte.bytesToString; |
|
272 |
||
273 |
end |
|
274 |
||
275 |
end; |
|
276 |