author | wenzelm |
Sun, 20 Jul 2008 23:06:59 +0200 | |
changeset 27663 | 098798321622 |
parent 27358 | d6679949a869 |
child 27733 | d3d7038fb7b5 |
permissions | -rw-r--r-- |
5825 | 1 |
(* Title: Pure/Isar/outer_lex.ML |
2 |
ID: $Id$ |
|
3 |
Author: Markus Wenzel, TU Muenchen |
|
4 |
||
5 |
Outer lexical syntax for Isabelle/Isar. |
|
6 |
*) |
|
7 |
||
8 |
signature OUTER_LEX =
sig
  (*token kinds and tokens*)
  datatype token_kind =
    Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | Nat |
    String | AltString | Verbatim | Space | Comment | Malformed | Error of string | Sync | EOF
  eqtype token
  val str_of_kind: token_kind -> string

  (*control tokens*)
  val stopper: token * (token -> bool)
  val not_sync: token -> bool
  val not_eof: token -> bool

  (*positions*)
  val range_of: token -> Position.range
  val position_of: token -> Position.T
  val pos_of: token -> string

  (*classification of tokens*)
  val kind_of: token -> token_kind
  val is_kind: token_kind -> token -> bool
  val keyword_with: (string -> bool) -> token -> bool
  val ident_with: (string -> bool) -> token -> bool
  val is_proper: token -> bool
  val is_semicolon: token -> bool
  val is_comment: token -> bool
  val is_begin_ignore: token -> bool
  val is_end_ignore: token -> bool
  val is_blank: token -> bool
  val is_newline: token -> bool

  (*token content*)
  val val_of: token -> string
  val unparse: token -> string
  val text_of: token -> string * string
  val is_sid: string -> bool

  (*scanners over symbol lists with position state*)
  val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b
  val count: (Symbol.symbol list -> Symbol.symbol * Symbol.symbol list) ->
    Position.T * Symbol.symbol list -> Symbol.symbol * (Position.T * Symbol.symbol list)
  val counted: (Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list) ->
    Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list)
  val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list)
  val scan: (Scan.lexicon * Scan.lexicon) ->
    Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)

  (*token sources*)
  val source: bool option -> (unit -> Scan.lexicon * Scan.lexicon) ->
    Position.T -> (Symbol.symbol, 'a) Source.source ->
    (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source
  val source_proper: (token, 'a) Source.source ->
    (token, (token, 'a) Source.source) Source.source
end;
50 |
||
51 |
structure OuterLex: OUTER_LEX = |
|
52 |
struct |
|
53 |
||
54 |
||
55 |
(** tokens **) |
|
56 |
||
57 |
(* datatype token *) |
|
58 |
||
59 |
(*kinds of outer syntax tokens; Error carries a string argument*)
datatype token_kind =
    Command
  | Keyword
  | Ident
  | LongIdent
  | SymIdent
  | Var
  | TypeIdent
  | TypeVar
  | Nat
  | String
  | AltString
  | Verbatim
  | Space
  | Comment
  | Malformed
  | Error of string
  | Sync
  | EOF;

(*a token pairs a position range with its kind and content string*)
datatype token = Token of Position.range * (token_kind * string);
5825 | 64 |
|
65 |
(*textual description of a token kind*)
fun str_of_kind Command = "command"
  | str_of_kind Keyword = "keyword"
  | str_of_kind Ident = "identifier"
  | str_of_kind LongIdent = "long identifier"
  | str_of_kind SymIdent = "symbolic identifier"
  | str_of_kind Var = "schematic variable"
  | str_of_kind TypeIdent = "type variable"
  | str_of_kind TypeVar = "schematic type variable"
  | str_of_kind Nat = "number"
  | str_of_kind String = "string"
  | str_of_kind AltString = "back-quoted string"
  | str_of_kind Verbatim = "verbatim text"
  | str_of_kind Space = "white space"
  | str_of_kind Comment = "comment text"
  | str_of_kind Malformed = "malformed symbolic character"
  | str_of_kind (Error _) = "bad input"
  | str_of_kind Sync = "sync marker"
  | str_of_kind EOF = "end-of-file";
84 |
||
85 |
||
10748 | 86 |
(* control tokens *) |
6859 | 87 |
|
27663 | 88 |
(*the end-of-file token: no position, empty content*)
val eof = Token ((Position.none, Position.none), (EOF, ""));

(*recognize any token of kind EOF, whatever its range or content*)
fun is_eof tok =
  (case tok of
    Token (_, (EOF, _)) => true
  | _ => false);

(*stopper: the EOF token together with its recognizer*)
val stopper = (eof, is_eof);

fun not_eof tok = not (is_eof tok);
|
95 |
||
96 |
||
23678 | 97 |
(*true for every token except those of kind Sync*)
fun not_sync tok =
  (case tok of
    Token (_, (Sync, _)) => false
  | _ => true);
|
99 |
||
100 |
||
5825 | 101 |
(* get position *) |
102 |
||
27663 | 103 |
(*full position range stored in the token*)
fun range_of (Token (rng, _)) = rng;

(*first component of the token range*)
fun position_of (Token ((pos, _), _)) = pos;

(*first position of the token, rendered via Position.str_of*)
fun pos_of tok = Position.str_of (position_of tok);
107 |
||
108 |
||
109 |
(* kind of token *) |
|
110 |
||
23721 | 111 |
(*kind component of a token*)
fun kind_of tok = (case tok of Token (_, (kind, _)) => kind);

(*does the token have exactly kind k?*)
fun is_kind k tok = (k = kind_of tok);
114 |
||
7026 | 115 |
(*Keyword token whose content satisfies pred; false for all other kinds*)
fun keyword_with pred tok =
  (case tok of
    Token (_, (Keyword, x)) => pred x
  | _ => false);

(*Ident token whose content satisfies pred; false for all other kinds*)
fun ident_with pred tok =
  (case tok of
    Token (_, (Ident, x)) => pred x
  | _ => false);
|
120 |
||
7682
46de8064c93c
added Space, Comment token kinds (keep actual text);
wenzelm
parents:
7477
diff
changeset
|
121 |
(*proper tokens are all tokens except white space and comments*)
fun is_proper tok =
  (case kind_of tok of
    Space => false
  | Comment => false
  | _ => true);
124 |
||
9195 | 125 |
(*the keyword token with content ";"*)
fun is_semicolon tok = keyword_with (fn x => x = ";") tok;

(*any token of kind Comment*)
fun is_comment tok = is_kind Comment tok;
|
130 |
||
8580 | 131 |
local

(*Comment token whose content is exactly txt*)
fun is_comment_text txt tok =
  (case tok of
    Token (_, (Comment, x)) => x = txt
  | _ => false);

in

(*comment tokens with content "<" and ">", respectively*)
fun is_begin_ignore tok = is_comment_text "<" tok;
fun is_end_ignore tok = is_comment_text ">" tok;

end;
|
136 |
||
8651 | 137 |
|
17069 | 138 |
(* blanks and newlines -- space tokens obey lines *) |
8651 | 139 |
|
23678 | 140 |
(*Space token whose content ends with a newline*)
fun is_newline tok =
  (case tok of
    Token (_, (Space, x)) => String.isSuffix "\n" x
  | _ => false);

(*Space token whose content does not end with a newline*)
fun is_blank tok = is_kind Space tok andalso not (is_newline tok);
145 |
||
5825 | 146 |
|
14991 | 147 |
(* token content *) |
9155 | 148 |
|
25642
ebdff0dca2a5
text_of: made even more robust against recurrent errors;
wenzelm
parents:
25582
diff
changeset
|
149 |
(*raw content string of a token*)
fun val_of tok = (case tok of Token (_, (_, content)) => content);

(*prefix every occurrence of the quote symbol q and of backslash with a backslash*)
fun escape q str =
  let
    fun protect s = if s = q orelse s = "\\" then "\\" ^ s else s;
  in implode (map protect (Symbol.explode str)) end;
|
153 |
||
14991 | 154 |
(*Reconstruct source text from a token: delimited kinds get their delimiters
  back (with escapes for the two string kinds), Sync and EOF yield the empty
  string, all remaining kinds yield their content unchanged.*)
fun unparse tok =
  let val x = val_of tok in
    (case kind_of tok of
      String => quote (escape "\"" x)
    | AltString => enclose "`" "`" (escape "`" x)
    | Verbatim => enclose "{*" "*}" x
    | Comment => enclose "(*" "*)" x
    | Malformed => Output.escape (translate_string Output.output x)
    | Sync => ""
    | EOF => ""
    | _ => x)
  end;
164 |
||
23788
54ce229dc858
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
23729
diff
changeset
|
165 |
(*Describe a token as a pair of strings.  A semicolon is reported as
  "terminator".  Otherwise the first component is the kind description; content
  of fewer than 40 characters without newline is appended to it, longer or
  multi-line content is returned separately as the second component.  If unparse
  raises ERROR, the raw content passed through Symbol.separate_chars is used.*)
fun text_of tok =
  if is_semicolon tok then ("terminator", "")
  else
    let
      val kind_text = str_of_kind (kind_of tok);
      val content =
        unparse tok handle ERROR _ => Symbol.separate_chars (val_of tok);
      fun fits_inline s =
        size s < 40 andalso not (exists_string (fn c => c = "\n") s);
    in
      (case content of
        "" => (kind_text, "")
      | _ =>
          if fits_inline content then (kind_text ^ " " ^ content, "")
          else (kind_text, content))
    end;
23729
d1ba656978c5
separated Malformed (symbolic char) from Error (bad input);
wenzelm
parents:
23721
diff
changeset
|
177 |
|
5825 | 178 |
|
179 |
||
180 |
(** scanners **) |
|
181 |
||
182 |
(*run the given scanner under the prompt "# " (kept as a function so that it stays polymorphic)*)
fun change_prompt scanner = Scan.prompt "# " scanner;
|
183 |
||
184 |
||
185 |
(* diagnostics *) |
|
186 |
||
187 |
(*error message for the scanner state (pos, cs); msg is applied to the remaining input cs*)
fun lex_err msg ((pos, cs), _) =
  String.concat ["Outer lexical error", Position.str_of pos, ": ", msg cs];

(*wrap a scanner with Scan.!! using the fixed message msg*)
fun !!! msg scan = Scan.!! (lex_err (fn _ => msg)) scan;
5825 | 189 |
|
190 |
||
26004
2abb3005660f
added count/counted: improved position handling for token syntax;
wenzelm
parents:
25642
diff
changeset
|
191 |
(* position *) |
2abb3005660f
added count/counted: improved position handling for token syntax;
wenzelm
parents:
25642
diff
changeset
|
192 |
|
2abb3005660f
added count/counted: improved position handling for token syntax;
wenzelm
parents:
25642
diff
changeset
|
193 |
local

(*lift a plain symbol scanner to one that carries a position as state:
  the new position is obtained by applying update to the scanned result*)
fun with_position update (scanner: Symbol.symbol list -> 'a * Symbol.symbol list) =
  Scan.depend (fn (pos: Position.T) =>
    scanner >> (fn result => (update result pos, result)));

in

(*scanner for a single symbol: advance the position by that symbol*)
fun count scanner = with_position Position.advance scanner;

(*scanner for a symbol list: advance over all symbols, return them as one string*)
fun counted scanner = with_position (fold Position.advance) scanner >> implode;

end;
5825 | 204 |
|
205 |
||
206 |
(* scan symbolic idents *) |
|
207 |
||
20664 | 208 |
local
  val sym_chars = explode "!#$%&*+-/<=>?@^_|~";
in
  (*membership test for the special characters of symbolic identifiers*)
  fun is_sym_char s = member (op =) sym_chars s;
end;

(*symbolic identifier: a sequence of special characters (via Scan.many1),
  or one symbol accepted by Symbol.is_symbolic*)
val scan_symid =
  let
    val special_chars = Scan.many1 is_sym_char;
    val symbolic = Scan.one Symbol.is_symbolic >> (fn s => [s]);
  in special_chars || symbolic end;
5825 | 213 |
|
8231 | 214 |
(*Check whether str is a symbolic identifier: either a single symbol that is
  symbolic or a special character, or a sequence of special characters.

  The empty string is rejected explicitly.  Without that case it would fall
  into the general list clause, where forall holds vacuously on the empty
  list, so "" would count as a symbolic identifier although scan_symid
  (built from Scan.many1 / Scan.one) is not meant to produce empty input.

  Strings on which Symbol.explode raises an exception are rejected as well.*)
fun is_symid str =
  (case try Symbol.explode str of
    SOME [] => false
  | SOME [s] => Symbol.is_symbolic s orelse is_sym_char s
  | SOME ss => forall is_sym_char ss
  | NONE => false);
219 |
||
20982 | 220 |
(*"begin" is excluded, ":" and "::" are accepted outright; otherwise the string
  must be a symbolic identifier or satisfy Syntax.is_identifier*)
fun is_sid s =
  (case s of
    "begin" => false
  | ":" => true
  | "::" => true
  | _ => is_symid s orelse Syntax.is_identifier s);
5825 | 224 |
|
225 |
||
226 |
(* scan strings *) |
|
227 |
||
17164
a786e1a1ce02
added AltString token (delimited by ASCII back-quotes);
wenzelm
parents:
17069
diff
changeset
|
228 |
local

val digit = count (Scan.one Symbol.is_ascii_digit);

(*turn three scanned digits into a character; codes above 255 make the scanner fail*)
fun decode ((a, b), c) =
  let val (n, _) = Library.read_int [a, b, c]
  in if n <= 255 then Scan.succeed (chr n) else Scan.fail end;

(*explicit character code: exactly three decimal digits*)
val char_code = digit -- digit -- digit :|-- decode;

(*one element of a string delimited by q: either a backslash escape
  (escaped q, escaped backslash, or character code), or any regular symbol
  other than q and backslash*)
fun scan_str q =
  let
    val escaped =
      count ($$ "\\") |--
        !!! "bad escape character in string" (count ($$ q || $$ "\\") || char_code);
    val plain =
      count (Scan.one (fn s => s <> q andalso s <> "\\" andalso Symbol.is_regular s));
  in escaped || plain end;

(*complete string delimited by q on both sides; yields the content without delimiters*)
fun scan_strs q =
  let
    val delimiter = count ($$ q);
    val content = Scan.repeat (scan_str q) >> implode;
  in
    delimiter |--
      !!! "missing quote at end of string" (change_prompt (content --| delimiter))
  end;

in

(*strings in double quotes and in back-quotes, respectively*)
val scan_string = scan_strs "\"";
val scan_alt_string = scan_strs "`";

end;
5825 | 253 |
|
254 |
||
255 |
(* scan verbatim text *) |
|
256 |
||
257 |
(*one symbol of verbatim text: a star that is not followed by a closing brace,
  or any regular symbol other than a star*)
val scan_verb =
  let
    val inner_star = count ($$ "*" --| Scan.ahead (~$$ "}"));
    val other = count (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s));
  in inner_star || other end;

(*verbatim text between brace-star delimiters; yields the enclosed text*)
val scan_verbatim =
  let
    val open_brace = count ($$ "{");
    val star = count ($$ "*");
    val close_brace = count ($$ "}");
    val text = Scan.repeat scan_verb >> implode;
  in
    open_brace |-- star |--
      !!! "missing end of verbatim text" (change_prompt (text --| star --| close_brace))
  end;
5825 | 265 |
|
266 |
||
267 |
(* scan space *) |
|
268 |
||
19305 | 269 |
(*blank symbols other than newline*)
fun is_space s = s <> "\n" andalso Symbol.is_blank s;

(*white space that extends at most to the end of the line: either at least one
  space with an optional final newline, or optional spaces with a final newline*)
val scan_space =
  let
    val newline = $$ "\n" >> (fn s => [s]);
    val spaces_then_opt_newline = Scan.many1 is_space @@@ Scan.optional newline [];
    val opt_spaces_then_newline = Scan.many is_space @@@ newline;
  in spaces_then_opt_newline || opt_spaces_then_newline end;
5825 | 274 |
|
275 |
||
276 |
(* scan nested comments *) |
|
277 |
||
278 |
(*one step inside a comment; the scanner state carries the nesting depth:
  an opening delimiter increments it, a closing delimiter decrements it and is
  only accepted at depth greater than 0, anything else leaves it unchanged*)
val scan_cmt =
  let
    val opening = count ($$ "(") ^^ count ($$ "*");
    val closing = count ($$ "*") ^^ count ($$ ")");
    val enter = Scan.depend (fn depth => opening >> pair (depth + 1));
    val leave =
      Scan.depend (fn depth =>
        if depth = 0 then Scan.fail else closing >> pair (depth - 1));
    val inner_star = Scan.lift (count ($$ "*" --| Scan.ahead (~$$ ")")));
    val other =
      Scan.lift (count (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
  in enter || leave || inner_star || other end;

(*complete comment, possibly with nested comments inside; yields the text
  between the outermost delimiters, starting at nesting depth 0*)
val scan_comment =
  let
    val lparen = count ($$ "(");
    val star = count ($$ "*");
    val rparen = count ($$ ")");
    val body = Scan.pass 0 (Scan.repeat scan_cmt >> implode);
  in
    lparen |-- star |--
      !!! "missing end of comment" (change_prompt (body --| star --| rparen))
  end;
5825 | 289 |
|
290 |
||
23678 | 291 |
(* scan malformed symbols *) |
292 |
||
293 |
(*malformed symbol: the Symbol.malformed marker, then regular symbols, then an
  optional Symbol.end_malformed marker; only the symbols in between are returned*)
val scan_malformed =
  let
    val opening = $$ Symbol.malformed;
    val body = change_prompt (Scan.many Symbol.is_regular);
    val closing = Scan.option ($$ Symbol.end_malformed);
  in opening |-- body --| closing end;
23678 | 297 |
|
298 |
||
5825 | 299 |
(* scan token *) |
300 |
||
27663 | 301 |
(*order pairs by their string component only; the first components are ignored*)
fun token_leq ((_, x1), (_, x2)) =
  (case String.compare (x1, x2) of
    GREATER => false
  | _ => true);
302 |
||
9130 | 303 |
(*Scan one token.  lex2 is the lexicon of commands, lex1 that of keywords.
  Delimited text, white space, malformed symbols and the sync marker are tried
  first; otherwise Scan.max with token_leq selects between command/keyword
  literals and the various identifier forms.  The positions before and after
  the token become its range.  Failure is reported as "bad input" together
  with the beginning of the remaining input.*)
fun scan (lex1, lex2) =
  let
    val delimited =
      scan_string >> pair String ||
      scan_alt_string >> pair AltString ||
      scan_verbatim >> pair Verbatim ||
      scan_comment >> pair Comment;

    val blank_or_control =
      counted scan_space >> pair Space ||
      counted scan_malformed >> pair Malformed ||
      Scan.lift (Scan.one Symbol.is_sync >> K (Sync, Symbol.sync));

    val literals =
      Scan.max token_leq
        (counted (Scan.literal lex2) >> pair Command)
        (counted (Scan.literal lex1) >> pair Keyword);

    val names =
      counted Syntax.scan_longid >> pair LongIdent ||
      counted Syntax.scan_id >> pair Ident ||
      counted Syntax.scan_var >> pair Var ||
      counted Syntax.scan_tid >> pair TypeIdent ||
      counted Syntax.scan_tvar >> pair TypeVar ||
      counted Syntax.scan_nat >> pair Nat ||
      counted scan_symid >> pair SymIdent;

    val content = delimited || blank_or_control || Scan.max token_leq literals names;

    fun make_token ((pos, (k, x)), pos') = Token ((pos, pos'), (k, x));
    val scanner = Scan.state -- content -- Scan.state >> make_token;

    fun bad_input cs = "bad input " ^ quote (Symbol.beginning 10 cs);
  in !! (lex_err bad_input) scanner end;
5825 | 327 |
|
328 |
||
9130 | 329 |
(* token sources *) |
5825 | 330 |
|
23678 | 331 |
local

(*regular symbols that are not blank*)
fun is_junk s = not (Symbol.is_blank s) andalso Symbol.is_regular s;

(*recovery scanner: collect junk symbols into a single token of kind Error msg*)
fun recover msg =
  let
    fun error_token ((pos, s), pos') = [Token ((pos, pos'), (Error msg, s))];
  in Scan.state -- counted (Scan.many is_junk) -- Scan.state >> error_token end;

in

(*token source on top of a symbol source, starting at position pos; the
  lexicons are fetched via get_lex for every token; if do_recover is SOME flag,
  the flag is passed to Source.source' together with the recovery scanner*)
fun source do_recover get_lex pos src =
  let
    val scan_tokens = Scan.bulk (fn xs => scan (get_lex ()) xs);
    val recovery = Option.map (fn flag => (flag, recover)) do_recover;
  in Source.source' pos Symbol.stopper scan_tokens recovery src end;

end;
|
5825 | 345 |
|
9130 | 346 |
(*restrict a token source to proper tokens, dropping white space and comments*)
fun source_proper src = Source.filter is_proper src;
347 |
||
348 |
||
5825 | 349 |
end; |