1 (* Title: Pure/Thy/thy_scan.ML |
1 (* Title: Pure/Thy/thy_scan.ML |
2 ID: $Id$ |
2 ID: $Id$ |
3 Author: Markus Wenzel, TU Muenchen |
3 Author: Markus Wenzel, TU Muenchen |
4 |
4 |
5 The scanner for theory files. |
5 Lexer for the outer Isabelle syntax. |
|
6 |
|
7 TODO: |
|
8 - old vs. new: interpreted strings, no 'ML', var!?; |
6 *) |
9 *) |
7 |
10 |
8 signature THY_SCAN = |
11 signature THY_SCAN = |
9 sig |
12 sig |
10 datatype token_kind = |
13 datatype token_kind = |
11 Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF |
14 Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF |
12 val name_of_kind: token_kind -> string |
15 val name_of_kind: token_kind -> string |
13 type lexicon |
16 val tokenize: Scan.lexicon -> string -> (token_kind * string * int) list |
14 val make_lexicon: string list -> lexicon |
17 end; |
15 val tokenize: lexicon -> string -> (token_kind * string * int) list |
|
16 end; |
|
17 |
18 |
18 |
19 structure ThyScan: THY_SCAN = |
19 structure ThyScan : THY_SCAN = |
|
20 struct |
20 struct |
21 |
|
22 open Scanner; |
|
23 |
21 |
24 |
22 |
25 (** token kinds **) |
23 (** token kinds **) |
26 |
24 |
27 datatype token_kind = |
25 datatype token_kind = |
28 Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF; |
26 Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF; |
29 |
27 |
30 fun name_of_kind Keyword = "keyword" |
28 fun name_of_kind Keyword = "keyword" |
31 | name_of_kind Ident = "identifier" |
29 | name_of_kind Ident = "identifier" |
32 | name_of_kind LongIdent = "long identifier" |
30 | name_of_kind LongIdent = "long identifier" |
|
31 | name_of_kind Var = "schematic variable" |
33 | name_of_kind TypeVar = "type variable" |
32 | name_of_kind TypeVar = "type variable" |
34 | name_of_kind Nat = "natural number" |
33 | name_of_kind Nat = "natural number" |
35 | name_of_kind String = "string" |
34 | name_of_kind String = "string" |
36 | name_of_kind Verbatim = "verbatim text" |
35 | name_of_kind Verbatim = "verbatim text" |
|
36 | name_of_kind Ignore = "ignore" |
37 | name_of_kind EOF = "end-of-file"; |
37 | name_of_kind EOF = "end-of-file"; |
38 |
38 |
39 |
39 |
40 |
40 |
41 (** scanners **) |
41 (** scanners **) |
42 |
42 |
43 fun lex_err line msg = |
43 (* diagnostics *) |
44 error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg); |
44 |
|
45 fun lex_err None msg = "Outer lexical error: " ^ msg |
|
46 | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg; |
45 |
47 |
46 |
48 |
47 (* misc utilities *) |
49 (* line numbering *) |
48 |
50 |
49 fun count_lines cs = |
51 val incr = apsome (fn n:int => n + 1); |
50 foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs); |
|
51 |
52 |
52 fun inc_line n s = n + count_lines (explode s); |
53 fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n)); |
|
54 val keep_line = Scan.lift; |
53 |
55 |
54 fun cons_fst c (cs, x, y) = (c :: cs, x, y); |
56 val scan_blank = |
|
57 incr_line ($$ "\n") || |
|
58 keep_line (Scan.one Symbol.is_blank); |
55 |
59 |
56 |
60 |
57 (* scan strings *) |
61 (* scan ML-style strings *) |
58 |
62 |
59 val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95); |
63 val scan_ctrl = |
|
64 $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95); |
60 |
65 |
61 val scan_dig = scan_one is_digit; |
66 val scan_dig = Scan.one Symbol.is_digit; |
62 |
|
63 val scan_blanks1 = scan_any1 is_blank >> implode; |
|
64 |
67 |
65 val scan_esc = |
68 val scan_esc = |
66 $$ "n" || $$ "t" || $$ "\"" || $$ "\\" || |
69 keep_line ($$ "\\") ^^ !! (fn (n, _) => lex_err n "bad escape sequence in string") |
67 scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig || |
70 (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" || |
68 scan_blanks1 ^^ $$ "\\"; |
71 scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) || |
|
72 (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\")); |
69 |
73 |
70 fun string ("\"" :: cs) n = (["\""], cs, n) |
74 val scan_str = |
71 | string ("\\" :: cs) n = |
75 scan_esc || |
72 (case optional scan_esc cs of |
76 scan_blank >> K " " || |
73 (None, _) => lex_err n "bad escape sequence in string" |
77 keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof)); |
74 | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s))) |
78 |
75 | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1)) |
79 val scan_string = |
76 | string (c :: cs) n = |
80 keep_line ($$ "\"") ^^ |
77 if is_blank c then cons_fst " " (string cs n) |
81 !! (fn (n, _) => lex_err n "missing quote at end of string") |
78 else cons_fst c (string cs n) |
82 ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\"")); |
79 | string [] n = lex_err n "missing quote at end of string"; |
|
80 |
83 |
81 |
84 |
82 (* scan verbatim text *) |
85 (* scan verbatim text *) |
83 |
86 |
84 fun verbatim ("|" :: "}" :: cs) n = ([], cs, n) |
87 val scan_verb = |
85 | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c)) |
88 scan_blank || |
86 | verbatim [] n = lex_err n "missing end of verbatim text"; |
89 keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) || |
|
90 keep_line (Scan.one (not_equal "|" andf Symbol.not_eof)); |
|
91 |
|
92 val scan_verbatim = |
|
93 keep_line ($$ "{" -- $$ "|") |-- |
|
94 !! (fn (n, _) => lex_err n "missing end of verbatim text") |
|
95 ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}")); |
87 |
96 |
88 |
97 |
89 (* scan nested comments *) |
98 (* scan nested comments *) |
90 |
99 |
91 fun comment ("*" :: ")" :: cs) n 1 = (cs, n) |
100 val scan_cmt = |
92 | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1) |
101 Scan.lift scan_blank || |
93 | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1) |
102 Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) || |
94 | comment (c :: cs) n d = comment cs (inc_line n c) d |
103 Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || |
95 | comment [] n _ = lex_err n "missing end of comment"; |
104 Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) || |
|
105 Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof))); |
|
106 |
|
107 val scan_comment = |
|
108 keep_line ($$ "(" -- $$ "*") |-- |
|
109 !! (fn (n, _) => lex_err n "missing end of comment") |
|
110 (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K ""); |
96 |
111 |
97 |
112 |
|
113 (* scan token *) |
98 |
114 |
99 (** tokenize **) |
115 fun token k None x = (k, x, 0) |
|
116 | token k (Some n) x = (k, x, n); |
100 |
117 |
101 fun scan_word lex = |
118 fun scan_tok lex (n, cs) = |
102 max_of |
119 (scan_string >> token String n || |
103 (scan_literal lex >> pair Keyword) |
120 scan_verbatim >> token Verbatim n || |
104 (scan_longid >> pair LongIdent || |
121 Scan.repeat1 scan_blank >> (token Ignore n o implode) || |
105 scan_id >> pair Ident || |
122 scan_comment >> token Ignore n || |
106 scan_tid >> pair TypeVar || |
123 keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x') |
107 scan_nat >> pair Nat); |
124 (Scan.literal lex >> (token Keyword n o implode)) |
|
125 (Syntax.scan_longid >> token LongIdent n || |
|
126 Syntax.scan_id >> token Ident n || |
|
127 Syntax.scan_var >> token Var n || |
|
128 Syntax.scan_tid >> token TypeVar n || |
|
129 Syntax.scan_nat >> token Nat n))) (n, cs); |
108 |
130 |
109 fun add_tok toks kind n (cs, cs', n') = |
131 val scan_rest = Scan.any Symbol.not_eof >> implode; |
110 ((kind, implode cs, n) :: toks, cs', n'); |
|
111 |
132 |
112 val take_line = implode o fst o take_prefix (not_equal "\n"); |
133 fun scan_token lex x = |
|
134 (case scan_tok lex x of |
|
135 ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x' |
|
136 | tok_x' => tok_x'); |
113 |
137 |
|
138 |
|
139 (* tokenize *) |
114 |
140 |
115 fun tokenize lex str = |
141 fun tokenize lex str = |
116 let |
142 let |
117 fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks) |
143 val chs = explode str; (*sic!*) |
118 | scan (toks, "\"" :: cs, n) = |
144 val scan_toks = Scan.repeat (scan_token lex); |
119 scan (add_tok toks String n (cons_fst "\"" (string cs n))) |
145 val ignore = fn (Ignore, _, _) => true | _ => false; |
120 | scan (toks, "{" :: "|" :: cs, n) = |
|
121 scan (add_tok toks Verbatim n (verbatim cs n)) |
|
122 | scan (toks, "(" :: "*" :: cs, n) = |
|
123 scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1)) |
|
124 | scan (toks, chs as c :: cs, n) = |
|
125 if is_blank c then scan (toks, cs, inc_line n c) |
|
126 else |
|
127 (case scan_word lex chs of |
|
128 (None, _) => lex_err n (quote (take_line chs)) |
|
129 | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n) |
|
130 :: (Keyword, "ML", n) :: toks, [], n + count_lines chs') |
|
131 | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n)); |
|
132 in |
146 in |
133 scan ([], explode str, 1) |
147 (case Scan.error (Scan.finite' Symbol.eof scan_toks) (Some 1, chs) of |
|
148 (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"] |
|
149 | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (beginning cs)))) |
134 end; |
150 end; |
135 |
151 |
136 |
152 |
137 end; |
153 end; |
138 |
|