1 (* Title: Pure/Syntax/lexicon.ML |
1 (* Title: Pure/Syntax/lexicon.ML |
2 ID: $Id$ |
2 ID: $Id$ |
3 Author: Tobias Nipkow and Markus Wenzel, TU Muenchen |
3 Author: Tobias Nipkow and Markus Wenzel, TU Muenchen |
4 |
4 |
5 Scanner combinators and the main lexer (for terms and types). |
5 Lexer for the inner Isabelle syntax (terms and types). |
6 *) |
6 *) |
7 |
|
8 infix 5 -- ^^; |
|
9 infix 3 >>; |
|
10 infix 0 ||; |
|
11 |
|
12 signature SCANNER = |
|
13 sig |
|
14 exception LEXICAL_ERROR |
|
15 val >> : ('a -> 'b * 'c) * ('b -> 'd) -> 'a -> 'd * 'c |
|
16 val || : ('a -> 'b) * ('a -> 'b) -> 'a -> 'b |
|
17 val -- : ('a -> 'b * 'c) * ('c -> 'd * 'e) -> 'a -> ('b * 'd) * 'e |
|
18 val ^^ : ('a -> string * 'b) * ('b -> string * 'c) -> 'a -> string * 'c |
|
19 val $$ : ''a -> ''a list -> ''a * ''a list |
|
20 val scan_empty: 'a list -> 'b list * 'a list |
|
21 val scan_one: ('a -> bool) -> 'a list -> 'a * 'a list |
|
22 val scan_any: ('a -> bool) -> 'a list -> 'a list * 'a list |
|
23 val scan_any1: ('a -> bool) -> 'a list -> 'a list * 'a list |
|
24 val scan_end: 'a list -> 'b list * 'a list |
|
25 val optional: ('a -> 'b * 'a) -> 'a -> 'b option * 'a |
|
26 val repeat: ('a -> 'b * 'a) -> 'a -> 'b list * 'a |
|
27 val repeat1: ('a -> 'b * 'a) -> 'a -> 'b list * 'a |
|
28 val max_of: ('a -> ('b * string) * 'a) -> ('a -> ('b * string) * 'a) |
|
29 -> 'a -> ('b * string) option * 'a |
|
30 val scan_id: string list -> string * string list |
|
31 val scan_longid: string list -> string * string list |
|
32 val scan_tid: string list -> string * string list |
|
33 val scan_nat: string list -> string * string list |
|
34 type lexicon |
|
35 val dest_lexicon: lexicon -> string list |
|
36 val empty_lexicon: lexicon |
|
37 val extend_lexicon: lexicon -> string list -> lexicon |
|
38 val make_lexicon: string list -> lexicon |
|
39 val merge_lexicons: lexicon -> lexicon -> lexicon |
|
40 val scan_literal: lexicon -> string list -> string * string list |
|
41 end; |
|
42 |
7 |
43 signature LEXICON0 = |
8 signature LEXICON0 = |
44 sig |
9 sig |
45 val is_identifier: string -> bool |
10 val is_identifier: string -> bool |
46 val implode_xstr: string list -> string |
11 val implode_xstr: string list -> string |
47 val explode_xstr: string -> string list |
12 val explode_xstr: string -> string list |
|
13 val scan_id: string list -> string * string list |
|
14 val scan_longid: string list -> string * string list |
|
15 val scan_var: string list -> string * string list |
|
16 val scan_tid: string list -> string * string list |
|
17 val scan_nat: string list -> string * string list |
|
18 val scan_int: string list -> string * string list |
48 val string_of_vname: indexname -> string |
19 val string_of_vname: indexname -> string |
49 val string_of_vname': indexname -> string |
20 val string_of_vname': indexname -> string |
50 val scan_varname: string list -> indexname * string list |
21 val indexname: string list -> indexname |
51 val scan_var: string -> term |
22 val read_var: string -> term |
52 val const: string -> term |
23 val const: string -> term |
53 val free: string -> term |
24 val free: string -> term |
54 val var: indexname -> term |
25 val var: indexname -> term |
55 val read_var: string -> string * int |
|
56 end; |
26 end; |
57 |
27 |
58 signature LEXICON = |
28 signature LEXICON = |
59 sig |
29 sig |
60 include SCANNER |
|
61 include LEXICON0 |
30 include LEXICON0 |
62 val is_xid: string -> bool |
31 val is_xid: string -> bool |
63 val is_tid: string -> bool |
32 val is_tid: string -> bool |
64 datatype token = |
33 datatype token = |
65 Token of string | |
34 Token of string | |
82 val display_token: token -> string |
51 val display_token: token -> string |
83 val matching_tokens: token * token -> bool |
52 val matching_tokens: token * token -> bool |
84 val token_assoc: (token option * 'a list) list * token -> 'a list |
53 val token_assoc: (token option * 'a list) list * token -> 'a list |
85 val valued_token: token -> bool |
54 val valued_token: token -> bool |
86 val predef_term: string -> token option |
55 val predef_term: string -> token option |
87 val tokenize: lexicon -> bool -> string list -> token list |
56 val tokenize: Scan.lexicon -> bool -> string list -> token list |
88 end; |
57 end; |
89 |
58 |
90 structure Lexicon : LEXICON = |
59 structure Lexicon : LEXICON = |
91 struct |
60 struct |
92 |
61 |
93 |
62 |
94 (** is_identifier etc. **) |
63 (** is_identifier etc. **) |
95 |
64 |
96 fun is_ident [] = false |
65 fun is_ident [] = false |
97 | is_ident (c :: cs) = is_letter c andalso forall is_letdig cs; |
66 | is_ident (c :: cs) = Symbol.is_letter c andalso forall Symbol.is_letdig cs; |
98 |
67 |
99 val is_identifier = is_ident o explode; |
68 val is_identifier = is_ident o Symbol.explode; |
100 |
69 |
101 fun is_xid s = |
70 fun is_xid s = |
102 (case explode s of |
71 (case Symbol.explode s of |
103 "_" :: cs => is_ident cs |
72 "_" :: cs => is_ident cs |
104 | cs => is_ident cs); |
73 | cs => is_ident cs); |
105 |
74 |
106 fun is_tid s = |
75 fun is_tid s = |
107 (case explode s of |
76 (case Symbol.explode s of |
108 "'" :: cs => is_ident cs |
77 "'" :: cs => is_ident cs |
109 | _ => false); |
78 | _ => false); |
110 |
79 |
111 |
80 |
112 |
81 |
|
82 (** basic scanners **) |
|
83 |
|
84 val scan_letter_letdigs = Scan.one Symbol.is_letter -- Scan.any Symbol.is_letdig >> op ::; |
|
85 val scan_digits1 = Scan.any1 Symbol.is_digit; |
|
86 |
|
87 val scan_id = scan_letter_letdigs >> implode; |
|
88 val scan_longid = scan_id ^^ (Scan.repeat1 ($$ "." ^^ scan_id) >> implode); |
|
89 val scan_tid = $$ "'" ^^ scan_id; |
|
90 |
|
91 val scan_nat = scan_digits1 >> implode; |
|
92 val scan_int = $$ "~" ^^ scan_nat || scan_nat; |
|
93 |
|
94 val scan_id_nat = scan_id ^^ Scan.optional ($$ "." ^^ scan_nat) ""; |
|
95 val scan_var = $$ "?" ^^ scan_id_nat; |
|
96 |
|
97 |
|
98 |
113 (** string_of_vname **) |
99 (** string_of_vname **) |
114 |
100 |
115 fun string_of_vname (x, i) = |
101 fun string_of_vname (x, i) = |
116 let |
102 let |
117 val si = string_of_int i; |
103 val si = string_of_int i; |
|
104 val dot = Symbol.is_digit (last_elem (Symbol.explode x)) handle _ => true; |
118 in |
105 in |
119 (if is_digit (last_elem (explode x)) then "?" ^ x ^ "." ^ si |
106 if dot then "?" ^ x ^ "." ^ si |
120 else if i = 0 then "?" ^ x |
107 else if i = 0 then "?" ^ x |
121 else "?" ^ x ^ si) |
108 else "?" ^ x ^ si |
122 handle LIST _ => "?NULL_VARIABLE." ^ si |
|
123 end; |
109 end; |
124 |
110 |
125 fun string_of_vname' (xi as (x, i)) = |
111 fun string_of_vname' (x, ~1) = x |
126 if i < 0 then x else string_of_vname xi; |
112 | string_of_vname' xi = string_of_vname xi; |
127 |
113 |
128 |
114 |
129 |
115 |
130 (** datatype token **) |
116 (** datatype token **) |
131 |
117 |
229 | predef_term "xnum" = Some (NumSy "xnum") |
215 | predef_term "xnum" = Some (NumSy "xnum") |
230 | predef_term "xstr" = Some (StrSy "xstr") |
216 | predef_term "xstr" = Some (StrSy "xstr") |
231 | predef_term _ = None; |
217 | predef_term _ = None; |
232 |
218 |
233 |
219 |
234 (* xstr tokens *) |
220 (* xstr tokens *) |
235 |
221 |
236 fun implode_xstr cs = enclose "''" "''" (implode cs); |
222 val scan_chr = |
|
223 $$ "\\" |-- Scan.one Symbol.not_eof || |
|
224 Scan.one (not_equal "'" andf Symbol.not_eof) || |
|
225 $$ "'" --| Scan.ahead (Scan.one (not_equal "'")); |
|
226 |
|
227 val scan_str = |
|
228 $$ "'" |-- $$ "'" |-- |
|
229 !! (fn cs => "Inner lexical error: malformed literal string at " ^ quote ("''" ^ beginning cs)) |
|
230 (Scan.repeat scan_chr --| $$ "'" --| $$ "'"); |
|
231 |
|
232 |
|
233 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs)); |
237 |
234 |
238 fun explode_xstr str = |
235 fun explode_xstr str = |
239 rev (tl (tl (rev (tl (tl (explode str)))))); |
236 #1 (Scan.error (Scan.finite Symbol.eof scan_str) (Symbol.explode str)); |
240 |
|
241 |
|
242 |
|
243 (** datatype lexicon **) |
|
244 |
|
245 datatype lexicon = |
|
246 Empty | |
|
247 Branch of string * string * lexicon * lexicon * lexicon; |
|
248 |
|
249 val no_token = ""; |
|
250 |
|
251 |
|
252 (* dest_lexicon *) |
|
253 |
|
254 fun dest_lexicon Empty = [] |
|
255 | dest_lexicon (Branch (_, "", lt, eq, gt)) = |
|
256 dest_lexicon eq @ dest_lexicon lt @ dest_lexicon gt |
|
257 | dest_lexicon (Branch (_, str, lt, eq, gt)) = |
|
258 str :: (dest_lexicon eq @ dest_lexicon lt @ dest_lexicon gt); |
|
259 |
|
260 |
|
261 (* empty, extend, make, merge lexicons *) |
|
262 |
|
263 val empty_lexicon = Empty; |
|
264 |
|
265 fun extend_lexicon lexicon strs = |
|
266 let |
|
267 fun ext (lex, s) = |
|
268 let |
|
269 fun add (Branch (d, a, lt, eq, gt)) (chs as c :: cs) = |
|
270 if c < d then Branch (d, a, add lt chs, eq, gt) |
|
271 else if c > d then Branch (d, a, lt, eq, add gt chs) |
|
272 else Branch (d, if null cs then s else a, lt, add eq cs, gt) |
|
273 | add Empty [c] = |
|
274 Branch (c, s, Empty, Empty, Empty) |
|
275 | add Empty (c :: cs) = |
|
276 Branch (c, no_token, Empty, add Empty cs, Empty) |
|
277 | add lex [] = lex; |
|
278 |
|
279 val cs = explode s; |
|
280 in |
|
281 if exists is_blank cs then |
|
282 sys_error ("extend_lexicon: blank in delimiter " ^ quote s) |
|
283 else add lex cs |
|
284 end; |
|
285 in |
|
286 foldl ext (lexicon, strs \\ dest_lexicon lexicon) |
|
287 end; |
|
288 |
|
289 val make_lexicon = extend_lexicon empty_lexicon; |
|
290 |
|
291 fun merge_lexicons lex1 lex2 = |
|
292 let |
|
293 val strs1 = dest_lexicon lex1; |
|
294 val strs2 = dest_lexicon lex2; |
|
295 in |
|
296 if strs2 subset strs1 then lex1 |
|
297 else if strs1 subset strs2 then lex2 |
|
298 else extend_lexicon lex1 strs2 |
|
299 end; |
|
300 |
|
301 |
|
302 |
|
303 (** scanners **) |
|
304 |
|
305 exception LEXICAL_ERROR; |
|
306 |
|
307 |
|
308 (* scanner combinators *) |
|
309 |
|
310 fun (scan >> f) cs = apfst f (scan cs); |
|
311 |
|
312 fun (scan1 || scan2) cs = scan1 cs handle LEXICAL_ERROR => scan2 cs; |
|
313 |
|
314 fun (scan1 -- scan2) cs = |
|
315 let |
|
316 val (x, cs') = scan1 cs; |
|
317 val (y, cs'') = scan2 cs'; |
|
318 in |
|
319 ((x, y), cs'') |
|
320 end; |
|
321 |
|
322 fun (scan1 ^^ scan2) = scan1 -- scan2 >> op ^; |
|
323 |
|
324 |
|
325 (* generic scanners *) |
|
326 |
|
327 fun $$ _ [] = raise LEXICAL_ERROR |
|
328 | $$ a (c :: cs) = |
|
329 if a = c then (c, cs) else raise LEXICAL_ERROR; |
|
330 |
|
331 fun scan_empty cs = ([], cs); |
|
332 |
|
333 fun scan_one _ [] = raise LEXICAL_ERROR |
|
334 | scan_one pred (c :: cs) = |
|
335 if pred c then (c, cs) else raise LEXICAL_ERROR; |
|
336 |
|
337 fun scan_any _ [] = ([], []) |
|
338 | scan_any pred (chs as c :: cs) = |
|
339 if pred c then apfst (cons c) (scan_any pred cs) |
|
340 else ([], chs); |
|
341 |
|
342 fun scan_any1 pred = scan_one pred -- scan_any pred >> op ::; |
|
343 |
|
344 fun scan_rest cs = (cs, []); |
|
345 |
|
346 fun scan_end [] = ([], []) |
|
347 | scan_end _ = raise LEXICAL_ERROR; |
|
348 |
|
349 fun optional scan = scan >> Some || scan_empty >> K None; |
|
350 |
|
351 fun repeat scan cs = (scan -- repeat scan >> op :: || scan_empty) cs; |
|
352 |
|
353 fun repeat1 scan = scan -- repeat scan >> op ::; |
|
354 |
|
355 fun max_of scan1 scan2 cs = |
|
356 (case (optional scan1 cs, optional scan2 cs) of |
|
357 (tok1, (None, _)) => tok1 |
|
358 | ((None, _), tok2) => tok2 |
|
359 | (tok1 as (Some (_, s1), _), tok2 as (Some (_, s2), _)) => |
|
360 if size s1 >= size s2 then tok1 else tok2); |
|
361 |
|
362 |
|
363 (* other scanners *) |
|
364 |
|
365 val scan_letter_letdigs = scan_one is_letter -- scan_any is_letdig >> op ::; |
|
366 |
|
367 val scan_digits1 = scan_any1 is_digit; |
|
368 |
|
369 val scan_id = scan_letter_letdigs >> implode; |
|
370 |
|
371 val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode); |
|
372 |
|
373 val scan_id_nat = |
|
374 scan_id ^^ ($$ "." ^^ (scan_digits1 >> implode) || scan_empty >> K ""); |
|
375 |
|
376 val scan_tid = $$ "'" ^^ scan_id; |
|
377 |
|
378 val scan_nat = scan_digits1 >> implode; |
|
379 |
|
380 val scan_int = $$ "~" ^^ scan_nat || scan_nat; |
|
381 |
|
382 |
|
383 (* scan_literal *) |
|
384 |
|
385 fun scan_literal lex chrs = |
|
386 let |
|
387 fun scan_lit _ s_cs [] = s_cs |
|
388 | scan_lit Empty s_cs _ = s_cs |
|
389 | scan_lit (Branch (d, a, lt, eq, gt)) s_cs (chs as c :: cs) = |
|
390 if c < d then scan_lit lt s_cs chs |
|
391 else if c > d then scan_lit gt s_cs chs |
|
392 else scan_lit eq (if a = no_token then s_cs else Some (a, cs)) cs; |
|
393 in |
|
394 (case scan_lit lex None chrs of |
|
395 None => raise LEXICAL_ERROR |
|
396 | Some s_cs => s_cs) |
|
397 end; |
|
398 |
237 |
399 |
238 |
400 |
239 |
401 (** tokenize **) |
240 (** tokenize **) |
402 |
241 |
404 let |
243 let |
405 val scan_xid = |
244 val scan_xid = |
406 if xids then $$ "_" ^^ scan_id || scan_id |
245 if xids then $$ "_" ^^ scan_id || scan_id |
407 else scan_id; |
246 else scan_id; |
408 |
247 |
409 val scan_lit = scan_literal lex >> pair Token; |
|
410 |
|
411 val scan_val = |
248 val scan_val = |
412 $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy || |
249 $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy || |
413 $$ "?" ^^ scan_id_nat >> pair VarSy || |
250 scan_var >> pair VarSy || |
414 $$ "'" ^^ scan_id >> pair TFreeSy || |
251 scan_tid >> pair TFreeSy || |
415 $$ "#" ^^ scan_int >> pair NumSy || |
252 $$ "#" ^^ scan_int >> pair NumSy || (* FIXME remove "#" *) |
416 scan_longid >> pair LongIdentSy || |
253 scan_longid >> pair LongIdentSy || |
417 scan_xid >> pair IdentSy; |
254 scan_xid >> pair IdentSy; |
418 |
255 |
419 fun scan_str ("'" :: "'" :: cs) = ([], cs) |
256 val scan_lit = Scan.literal lex >> (pair Token o implode); |
420 | scan_str ("\\" :: c :: cs) = apfst (cons c) (scan_str cs) |
257 |
421 | scan_str (c :: cs) = apfst (cons c) (scan_str cs) |
258 val scan_token = |
422 | scan_str [] = raise ERROR; |
259 Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => Some (tk s)) || |
423 |
260 scan_str >> (Some o StrSy o implode_xstr) || |
424 |
261 Scan.one Symbol.is_blank >> K None; |
425 fun scan (rev_toks, []) = rev (EndToken :: rev_toks) |
|
426 | scan (rev_toks, chs as "'" :: "'" :: cs) = |
|
427 let |
|
428 val (cs', cs'') = scan_str cs handle ERROR => |
|
429 error ("Lexical error: missing quotes at end of string " ^ |
|
430 quote (implode chs)); |
|
431 in |
|
432 scan (StrSy (implode_xstr cs') :: rev_toks, cs'') |
|
433 end |
|
434 | scan (rev_toks, chs as c :: cs) = |
|
435 if is_blank c then scan (rev_toks, cs) |
|
436 else |
|
437 (case max_of scan_lit scan_val chs of |
|
438 (None, _) => error ("Lexical error at: " ^ quote (implode chs)) |
|
439 | (Some (tk, s), chs') => scan (tk s :: rev_toks, chs')); |
|
440 in |
262 in |
441 scan ([], chs) |
263 (case Scan.error (Scan.finite Symbol.eof (Scan.repeat scan_token)) chs of |
|
264 (toks, []) => mapfilter I toks @ [EndToken] |
|
265 | (_, cs) => error ("Inner lexical error at: " ^ quote (implode cs))) |
442 end; |
266 end; |
443 |
267 |
444 |
268 |
445 |
269 |
446 (** scan variables **) |
270 (** scan variables **) |
450 fun scan_vname chrs = |
274 fun scan_vname chrs = |
451 let |
275 let |
452 fun nat_of_chs n [] = n |
276 fun nat_of_chs n [] = n |
453 | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs; |
277 | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs; |
454 |
278 |
455 val nat_of = nat_of_chs 0; |
279 val nat = nat_of_chs 0; |
456 |
280 |
457 fun split_vname chs = |
281 fun split_vname chs = |
458 let val (cs, ds) = take_suffix is_digit chs |
282 let val (cs, ds) = take_suffix Symbol.is_digit chs |
459 in (implode cs, nat_of ds) end |
283 in (implode cs, nat ds) end |
460 |
284 |
461 val scan = |
285 val scan = |
462 scan_letter_letdigs -- |
286 scan_letter_letdigs -- Scan.optional ($$ "." |-- scan_digits1 >> nat) ~1; |
463 ($$ "." -- scan_digits1 >> (nat_of o #2) || scan_empty >> K ~1); |
|
464 in |
287 in |
465 (case scan chrs of |
288 (case scan chrs of |
466 ((cs, ~1), cs') => (split_vname cs, cs') |
289 ((cs, ~1), cs') => (split_vname cs, cs') |
467 | ((cs, i), cs') => ((implode cs, i), cs')) |
290 | ((cs, i), cs') => ((implode cs, i), cs')) |
468 end; |
291 end; |
469 |
292 |
470 |
293 |
471 (* scan_varname *) |
294 (* indexname *) |
472 |
295 |
473 fun scan_varname chs = |
296 fun indexname cs = |
474 scan_vname chs handle LEXICAL_ERROR |
297 (case Scan.error (Scan.finite Symbol.eof (Scan.option scan_vname)) cs of |
475 => error ("scan_varname: bad varname " ^ quote (implode chs)); |
298 (Some xi, []) => xi |
476 |
299 | _ => error ("Lexical error in variable name: " ^ quote (implode cs))); |
477 |
300 |
478 (* scan_var *) |
301 |
|
302 (* read_var *) |
479 |
303 |
480 fun const c = Const (c, dummyT); |
304 fun const c = Const (c, dummyT); |
481 fun free x = Free (x, dummyT); |
305 fun free x = Free (x, dummyT); |
482 fun var xi = Var (xi, dummyT); |
306 fun var xi = Var (xi, dummyT); |
483 |
307 |
484 fun scan_var str = |
308 fun read_var str = |
485 let |
309 let |
486 fun tvar (x, i) = var ("'" ^ x, i); |
310 fun tvar (x, i) = var ("'" ^ x, i); |
487 |
311 |
488 val scan = |
312 val scan = |
489 $$ "?" -- $$ "'" -- scan_vname -- scan_end >> (tvar o #2 o #1) || |
313 $$ "?" |-- $$ "'" |-- scan_vname >> tvar || |
490 $$ "?" -- scan_vname -- scan_end >> (var o #2 o #1) || |
314 $$ "?" |-- scan_vname >> var || |
491 scan_rest >> (free o implode); |
315 Scan.any Symbol.not_eof >> (free o implode); |
492 in |
316 in |
493 #1 (scan (explode str)) |
317 #1 (Scan.error (Scan.finite Symbol.eof scan) (Symbol.explode str)) |
494 end; |
318 end; |
495 |
319 |
496 |
320 |
497 (* read_var *) |
|
498 |
|
499 fun read_var str = |
|
500 let val scan = $$ "?" -- scan_vname -- scan_end >> (#2 o #1) in |
|
501 #1 (scan (explode str)) handle LEXICAL_ERROR |
|
502 => error ("Bad variable " ^ quote str) |
|
503 end; |
|
504 |
|
505 |
|
506 end; |
321 end; |