author | wenzelm |
Mon, 25 Jun 2007 00:36:42 +0200 | |
changeset 23493 | a056eefb76e5 |
parent 22873 | decd2ff5f503 |
child 23678 | f5d315390edc |
permissions | -rw-r--r-- |
5825 | 1 |
(* Title: Pure/Isar/outer_lex.ML |
2 |
ID: $Id$ |
|
3 |
Author: Markus Wenzel, TU Muenchen |
|
4 |
||
5 |
Outer lexical syntax for Isabelle/Isar. |
|
6 |
*) |
|
7 |
||
8 |
signature OUTER_LEX = |
|
9 |
sig |
|
10 |
datatype token_kind = |
|
7477 | 11 |
Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | |
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
12 |
Nat | String | AltString | Verbatim | Space | Comment | Sync | Malformed | EOF |
15143
05b5995f214e
Make token an eqtype to assist reconstructing input
aspinall
parents:
14991
diff
changeset
|
13 |
eqtype token |
5825 | 14 |
val str_of_kind: token_kind -> string |
15 |
val stopper: token * (token -> bool) |
|
6859 | 16 |
val not_sync: token -> bool |
5825 | 17 |
val not_eof: token -> bool |
18 |
val position_of: token -> Position.T |
|
19 |
val pos_of: token -> string |
|
20 |
val is_kind: token_kind -> token -> bool |
|
7026 | 21 |
val keyword_with: (string -> bool) -> token -> bool |
16029 | 22 |
val ident_with: (string -> bool) -> token -> bool |
5825 | 23 |
val name_of: token -> string |
24 |
val is_proper: token -> bool |
|
9130 | 25 |
val is_semicolon: token -> bool |
17069 | 26 |
val is_comment: token -> bool |
8580 | 27 |
val is_begin_ignore: token -> bool |
28 |
val is_end_ignore: token -> bool |
|
17069 | 29 |
val is_blank: token -> bool |
8651 | 30 |
val is_newline: token -> bool |
14991 | 31 |
val unparse: token -> string |
5825 | 32 |
val val_of: token -> string |
5876 | 33 |
val is_sid: string -> bool |
9130 | 34 |
val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b |
35 |
val incr_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c) |
|
36 |
val keep_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c) |
|
37 |
val scan_blank: Position.T * Symbol.symbol list |
|
38 |
-> Symbol.symbol * (Position.T * Symbol.symbol list) |
|
39 |
val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list) |
|
7026 | 40 |
val scan: (Scan.lexicon * Scan.lexicon) -> |
5825 | 41 |
Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list) |
7026 | 42 |
val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) -> |
43 |
Position.T -> (Symbol.symbol, 'a) Source.source -> |
|
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
44 |
(token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source |
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
45 |
val source_proper: (token, 'a) Source.source -> |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
46 |
(token, (token, 'a) Source.source) Source.source |
9130 | 47 |
val make_lexicon: string list -> Scan.lexicon |
5825 | 48 |
end; |
49 |
||
50 |
structure OuterLex: OUTER_LEX = |
|
51 |
struct |
|
52 |
||
53 |
||
54 |
(** tokens **) |
|
55 |
||
56 |
(* datatype token *) |
|
57 |
||
58 |
datatype token_kind = |
|
7477 | 59 |
Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | |
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
60 |
Nat | String | AltString | Verbatim | Space | Comment | Sync | Malformed | EOF; |
5825 | 61 |
|
62 |
datatype token = Token of Position.T * (token_kind * string); |
|
63 |
||
64 |
val str_of_kind = |
|
7026 | 65 |
fn Command => "command" |
66 |
| Keyword => "keyword" |
|
5825 | 67 |
| Ident => "identifier" |
68 |
| LongIdent => "long identifier" |
|
69 |
| SymIdent => "symbolic identifier" |
|
70 |
| Var => "schematic variable" |
|
71 |
| TypeIdent => "type variable" |
|
72 |
| TypeVar => "schematic type variable" |
|
73 |
| Nat => "number" |
|
74 |
| String => "string" |
|
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
75 |
| AltString => "back-quoted string" |
5825 | 76 |
| Verbatim => "verbatim text" |
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
77 |
| Space => "white space" |
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
78 |
| Comment => "comment text" |
6859 | 79 |
| Sync => "sync marker" |
10748 | 80 |
| Malformed => "bad input" |
5825 | 81 |
| EOF => "end-of-file"; |
82 |
||
83 |
||
10748 | 84 |
(* control tokens *) |
6859 | 85 |
|
86 |
fun not_sync (Token (_, (Sync, _))) = false |
|
87 |
| not_sync _ = true; |
|
88 |
||
10748 | 89 |
val malformed = Token (Position.none, (Malformed, "")); |
15224
1bd35fd87963
Allow scanning to recover and reconstruct bad input
aspinall
parents:
15209
diff
changeset
|
90 |
fun malformed_of xs = Token (Position.none, (Malformed, implode xs)); |
10748 | 91 |
|
6859 | 92 |
|
5825 | 93 |
(* eof token *) |
94 |
||
95 |
val eof = Token (Position.none, (EOF, "")); |
|
96 |
||
97 |
fun is_eof (Token (_, (EOF, _))) = true |
|
98 |
| is_eof _ = false; |
|
99 |
||
100 |
val stopper = (eof, is_eof); |
|
101 |
val not_eof = not o is_eof; |
|
102 |
||
103 |
||
104 |
(* get position *) |
|
105 |
||
106 |
fun position_of (Token (pos, _)) = pos; |
|
107 |
val pos_of = Position.str_of o position_of; |
|
108 |
||
109 |
||
110 |
(* kind of token *) |
|
111 |
||
112 |
fun is_kind k (Token (_, (k', _))) = k = k'; |
|
113 |
||
7026 | 114 |
fun keyword_with pred (Token (_, (Keyword, x))) = pred x |
115 |
| keyword_with _ _ = false; |
|
5825 | 116 |
|
16029 | 117 |
fun ident_with pred (Token (_, (Ident, x))) = pred x |
118 |
| ident_with _ _ = false; |
|
119 |
||
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
120 |
fun is_proper (Token (_, (Space, _))) = false |
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
121 |
| is_proper (Token (_, (Comment, _))) = false |
5825 | 122 |
| is_proper _ = true; |
123 |
||
9195 | 124 |
fun is_semicolon (Token (_, (Keyword, ";"))) = true |
9130 | 125 |
| is_semicolon _ = false; |
126 |
||
17069 | 127 |
fun is_comment (Token (_, (Comment, _))) = true |
128 |
| is_comment _ = false; |
|
129 |
||
8580 | 130 |
fun is_begin_ignore (Token (_, (Comment, "<"))) = true |
131 |
| is_begin_ignore _ = false; |
|
132 |
||
133 |
fun is_end_ignore (Token (_, (Comment, ">"))) = true |
|
134 |
| is_end_ignore _ = false; |
|
135 |
||
8651 | 136 |
|
17069 | 137 |
(* blanks and newlines -- space tokens obey lines *) |
8651 | 138 |
|
17069 | 139 |
fun is_blank (Token (_, (Space, s))) = not (String.isSuffix "\n" s) |
140 |
| is_blank _ = false; |
|
141 |
||
142 |
fun is_newline (Token (_, (Space, s))) = String.isSuffix "\n" s |
|
8651 | 143 |
| is_newline _ = false; |
144 |
||
5825 | 145 |
|
14991 | 146 |
(* token content *) |
9155 | 147 |
|
148 |
fun name_of (tok as Token (_, (k, x))) = |
|
10748 | 149 |
if is_semicolon tok then "terminator" |
150 |
else if x = "" then str_of_kind k |
|
151 |
else str_of_kind k ^ " " ^ quote x; |
|
5825 | 152 |
|
18547 | 153 |
fun escape q = |
154 |
implode o map (fn s => if s = q orelse s = "\\" then "\\" ^ s else s) o Symbol.explode; |
|
155 |
||
14991 | 156 |
fun unparse (Token (_, (kind, x))) = |
157 |
(case kind of |
|
18547 | 158 |
String => x |> quote o escape "\"" |
159 |
| AltString => x |> enclose "`" "`" o escape "`" |
|
14991 | 160 |
| Verbatim => x |> enclose "{*" "*}" |
161 |
| Comment => x |> enclose "(*" "*)" |
|
162 |
| _ => x); |
|
163 |
||
5825 | 164 |
fun val_of (Token (_, (_, x))) = x; |
165 |
||
166 |
fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x'; |
|
167 |
||
168 |
||
169 |
||
170 |
(** scanners **) |
|
171 |
||
172 |
fun change_prompt scan = Scan.prompt "# " scan; |
|
173 |
||
174 |
||
175 |
(* diagnostics *) |
|
176 |
||
177 |
fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs; |
|
9130 | 178 |
fun !!! msg scan = Scan.!! (lex_err (K msg)) scan; |
5825 | 179 |
|
180 |
||
181 |
(* line numbering *) |
|
182 |
||
183 |
fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos)); |
|
184 |
val keep_line = Scan.lift; |
|
185 |
||
186 |
val scan_blank = |
|
187 |
incr_line ($$ "\n") || |
|
188 |
keep_line (Scan.one Symbol.is_blank); |
|
189 |
||
190 |
||
191 |
(* scan symbolic idents *) |
|
192 |
||
20664 | 193 |
val is_sym_char = member (op =) (explode "!#$%&*+-/<=>?@^_|~"); |
5825 | 194 |
|
8231 | 195 |
val scan_symid = |
21858
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
20982
diff
changeset
|
196 |
Scan.many1 is_sym_char >> implode || |
8231 | 197 |
Scan.one Symbol.is_symbolic; |
5825 | 198 |
|
8231 | 199 |
fun is_symid str = |
200 |
(case try Symbol.explode str of |
|
15531 | 201 |
SOME [s] => Symbol.is_symbolic s orelse is_sym_char s |
202 |
| SOME ss => forall is_sym_char ss |
|
8231 | 203 |
| _ => false); |
204 |
||
20982 | 205 |
fun is_sid "begin" = false |
206 |
| is_sid ":" = true |
|
22873 | 207 |
| is_sid "::" = true |
20982 | 208 |
| is_sid s = is_symid s orelse Syntax.is_identifier s; |
5825 | 209 |
|
210 |
||
211 |
(* scan strings *) |
|
212 |
||
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
213 |
local |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
214 |
|
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
215 |
fun scan_str q = |
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
216 |
scan_blank || |
9130 | 217 |
keep_line ($$ "\\") |-- !!! "bad escape character in string" |
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
218 |
(scan_blank || keep_line ($$ q || $$ "\\")) || |
19305 | 219 |
keep_line (Scan.one (fn s => s <> q andalso s <> "\\" andalso |
220 |
Symbol.not_sync s andalso Symbol.not_eof s)); |
|
5825 | 221 |
|
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
222 |
fun scan_strs q = |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
223 |
keep_line ($$ q) |-- |
9130 | 224 |
!!! "missing quote at end of string" |
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
225 |
(change_prompt ((Scan.repeat (scan_str q) >> implode) --| keep_line ($$ q))); |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
226 |
|
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
227 |
in |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
228 |
|
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
229 |
val scan_string = scan_strs "\""; |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
230 |
val scan_alt_string = scan_strs "`"; |
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
231 |
|
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
232 |
end; |
5825 | 233 |
|
234 |
||
235 |
(* scan verbatim text *) |
|
236 |
||
237 |
val scan_verb = |
|
238 |
scan_blank || |
|
19305 | 239 |
keep_line ($$ "*" --| Scan.ahead (~$$ "}")) || |
240 |
keep_line (Scan.one (fn s => s <> "*" andalso Symbol.not_sync s andalso Symbol.not_eof s)); |
|
5825 | 241 |
|
242 |
val scan_verbatim = |
|
6743
5d50225637c8
changed {| |} verbatim syntax to {* *} in order to simplify ProofGeneral setup;
wenzelm
parents:
5876
diff
changeset
|
243 |
keep_line ($$ "{" -- $$ "*") |-- |
9130 | 244 |
!!! "missing end of verbatim text" |
6743
5d50225637c8
changed {| |} verbatim syntax to {* *} in order to simplify ProofGeneral setup;
wenzelm
parents:
5876
diff
changeset
|
245 |
(change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}"))); |
5825 | 246 |
|
247 |
||
248 |
(* scan space *) |
|
249 |
||
19305 | 250 |
fun is_space s = Symbol.is_blank s andalso s <> "\n"; |
5825 | 251 |
|
252 |
val scan_space = |
|
21858
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
20982
diff
changeset
|
253 |
(keep_line (Scan.many1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" || |
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
20982
diff
changeset
|
254 |
keep_line (Scan.many is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c); |
5825 | 255 |
|
256 |
||
257 |
(* scan nested comments *) |
|
258 |
||
259 |
val scan_cmt = |
|
260 |
Scan.lift scan_blank || |
|
261 |
Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) || |
|
262 |
Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || |
|
19305 | 263 |
Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) || |
264 |
Scan.lift (keep_line (Scan.one (fn s => |
|
265 |
s <> "*" andalso Symbol.not_sync s andalso Symbol.not_eof s))); |
|
5825 | 266 |
|
267 |
val scan_comment = |
|
268 |
keep_line ($$ "(" -- $$ "*") |-- |
|
9130 | 269 |
!!! "missing end of comment" |
5825 | 270 |
(change_prompt |
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
271 |
(Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")"))); |
5825 | 272 |
|
273 |
||
274 |
(* scan token *) |
|
275 |
||
9130 | 276 |
fun scan (lex1, lex2) = |
5825 | 277 |
let |
9130 | 278 |
val scanner = Scan.state :-- (fn pos => |
279 |
let |
|
280 |
fun token k x = Token (pos, (k, x)); |
|
281 |
fun sync _ = token Sync Symbol.sync; |
|
282 |
in |
|
283 |
scan_string >> token String || |
|
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
284 |
scan_alt_string >> token AltString || |
9130 | 285 |
scan_verbatim >> token Verbatim || |
286 |
scan_space >> token Space || |
|
287 |
scan_comment >> token Comment || |
|
288 |
keep_line (Scan.one Symbol.is_sync >> sync) || |
|
289 |
keep_line (Scan.max token_leq |
|
290 |
(Scan.max token_leq |
|
291 |
(Scan.literal lex1 >> (token Keyword o implode)) |
|
292 |
(Scan.literal lex2 >> (token Command o implode))) |
|
293 |
(Syntax.scan_longid >> token LongIdent || |
|
294 |
Syntax.scan_id >> token Ident || |
|
295 |
Syntax.scan_var >> token Var || |
|
296 |
Syntax.scan_tid >> token TypeIdent || |
|
297 |
Syntax.scan_tvar >> token TypeVar || |
|
298 |
Syntax.scan_nat >> token Nat || |
|
299 |
scan_symid >> token SymIdent)) |
|
300 |
end) >> #2; |
|
14729 | 301 |
in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end; |
5825 | 302 |
|
303 |
||
9130 | 304 |
(* token sources *) |
5825 | 305 |
|
6859 | 306 |
val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof; |
21858
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
20982
diff
changeset
|
307 |
fun recover xs = (keep_line (Scan.many is_junk) >> (fn ts => [malformed_of ts])) xs; |
5825 | 308 |
|
309 |
fun source do_recover get_lex pos src = |
|
310 |
Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs)) |
|
15531 | 311 |
(if do_recover then SOME recover else NONE) src; |
5825 | 312 |
|
9130 | 313 |
fun source_proper src = src |> Source.filter is_proper; |
314 |
||
315 |
||
316 |
(* lexicons *) |
|
317 |
||
318 |
val make_lexicon = Scan.make_lexicon o map Symbol.explode; |
|
5825 | 319 |
|
320 |
end; |