| author | wenzelm |
| Sun, 30 Nov 2014 15:11:50 +0100 | |
| changeset 59069 | ec6ce25a630d |
| parent 59067 | dd8ec9138112 |
| child 59082 | 25501ba956ac |
| permissions | -rw-r--r-- |
| 24579 | 1 |
(* Title: Pure/ML/ml_lex.ML |
2 |
Author: Makarius |
|
3 |
||
4 |
Lexical syntax for SML. |
|
5 |
*) |
|
6 |
||
7 |
signature ML_LEX = |
|
8 |
sig |
|
| 55505 | 9 |
val keywords: string list |
| 24579 | 10 |
datatype token_kind = |
| 24596 | 11 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String | |
| 24579 | 12 |
Space | Comment | Error of string | EOF |
13 |
eqtype token |
|
| 27732 | 14 |
val stopper: token Scan.stopper |
| 24596 | 15 |
val is_regular: token -> bool |
16 |
val is_improper: token -> bool |
|
|
30683
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
17 |
val set_range: Position.range -> token -> token |
| 30636 | 18 |
val pos_of: token -> Position.T |
| 31474 | 19 |
val end_pos_of: token -> Position.T |
| 24579 | 20 |
val kind_of: token -> token_kind |
| 27817 | 21 |
val content_of: token -> string |
| 31426 | 22 |
val check_content_of: token -> string |
| 31332 | 23 |
val flatten: token list -> string |
| 24596 | 24 |
val source: (Symbol.symbol, 'a) Source.source -> |
| 30573 | 25 |
(token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) |
| 27772 | 26 |
Source.source) Source.source |
| 30591 | 27 |
val tokenize: string -> token list |
| 59067 | 28 |
val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list |
29 |
val read: Symbol_Pos.text -> token Antiquote.antiquote list |
|
|
58978
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
wenzelm
parents:
58933
diff
changeset
|
30 |
val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list |
| 59064 | 31 |
val read_source: bool -> Input.source -> token Antiquote.antiquote list |
| 24579 | 32 |
end; |
33 |
||
34 |
structure ML_Lex: ML_LEX = |
|
35 |
struct |
|
36 |
||
| 55505 | 37 |
(** keywords **) |
38 |
||
39 |
val keywords = |
|
40 |
["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
|
|
41 |
"]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
|
|
42 |
"datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", |
|
43 |
"functor", "handle", "if", "in", "include", "infix", "infixr", |
|
44 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", |
|
45 |
"rec", "sharing", "sig", "signature", "struct", "structure", "then", |
|
46 |
"type", "val", "where", "while", "with", "withtype"]; |
|
47 |
||
48 |
val keywords2 = |
|
| 58933 | 49 |
["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", |
| 55505 | 50 |
"sig", "struct", "then", "while", "with"]; |
51 |
||
52 |
val keywords3 = |
|
53 |
["handle", "open", "raise"]; |
|
54 |
||
55 |
val lexicon = Scan.make_lexicon (map raw_explode keywords); |
|
56 |
||
57 |
||
58 |
||
| 24579 | 59 |
(** tokens **) |
60 |
||
61 |
(* datatype token *) |
|
62 |
||
63 |
datatype token_kind = |
|
| 24596 | 64 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String | |
| 24579 | 65 |
Space | Comment | Error of string | EOF; |
66 |
||
| 27772 | 67 |
datatype token = Token of Position.range * (token_kind * string); |
68 |
||
69 |
||
70 |
(* position *) |
|
71 |
||
|
30683
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
72 |
fun set_range range (Token (_, x)) = Token (range, x); |
|
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
73 |
|
| 30636 | 74 |
fun pos_of (Token ((pos, _), _)) = pos; |
75 |
fun end_pos_of (Token ((_, pos), _)) = pos; |
|
| 24579 | 76 |
|
77 |
||
| 58855 | 78 |
(* stopper *) |
| 24579 | 79 |
|
| 27772 | 80 |
fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); |
81 |
val eof = mk_eof Position.none; |
|
| 24579 | 82 |
|
83 |
fun is_eof (Token (_, (EOF, _))) = true |
|
84 |
| is_eof _ = false; |
|
85 |
||
| 27772 | 86 |
val stopper = |
| 30636 | 87 |
Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; |
| 27772 | 88 |
|
| 24579 | 89 |
|
| 27772 | 90 |
(* token content *) |
91 |
||
| 31426 | 92 |
fun kind_of (Token (_, (k, _))) = k; |
93 |
||
| 27817 | 94 |
fun content_of (Token (_, (_, x))) = x; |
95 |
fun token_leq (tok, tok') = content_of tok <= content_of tok'; |
|
| 27772 | 96 |
|
| 55505 | 97 |
fun is_keyword (Token (_, (Keyword, _))) = true |
98 |
| is_keyword _ = false; |
|
99 |
||
100 |
fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) |
|
101 |
| is_delimiter _ = false; |
|
102 |
||
|
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
103 |
fun is_regular (Token (_, (Error _, _))) = false |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
104 |
| is_regular (Token (_, (EOF, _))) = false |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
105 |
| is_regular _ = true; |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
106 |
|
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
107 |
fun is_improper (Token (_, (Space, _))) = true |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
108 |
| is_improper (Token (_, (Comment, _))) = true |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
109 |
| is_improper _ = false; |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
110 |
|
|
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
111 |
fun warn tok = |
|
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
112 |
(case tok of |
|
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
113 |
Token (_, (Keyword, ":>")) => |
|
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
114 |
warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
|
|
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
115 |
\prefer non-opaque matching (:) possibly with abstype" ^ |
| 48992 | 116 |
Position.here (pos_of tok)) |
|
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
117 |
| _ => ()); |
|
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
118 |
|
| 31426 | 119 |
fun check_content_of tok = |
| 30636 | 120 |
(case kind_of tok of |
| 30645 | 121 |
Error msg => error msg |
|
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
122 |
| _ => content_of tok); |
| 30636 | 123 |
|
|
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
124 |
fun flatten_content (tok :: (toks as tok' :: _)) = |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
125 |
Symbol.escape (check_content_of tok) :: |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
126 |
(if is_improper tok orelse is_improper tok' then flatten_content toks |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
127 |
else Symbol.space :: flatten_content toks) |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
128 |
| flatten_content toks = map (Symbol.escape o check_content_of) toks; |
| 31332 | 129 |
|
|
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
130 |
val flatten = implode o flatten_content; |
| 24596 | 131 |
|
132 |
||
| 30614 | 133 |
(* markup *) |
134 |
||
| 37195 | 135 |
local |
136 |
||
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
137 |
fun token_kind_markup SML = |
| 55505 | 138 |
fn Keyword => (Markup.empty, "") |
|
48768
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
139 |
| Ident => (Markup.empty, "") |
|
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
140 |
| LongIdent => (Markup.empty, "") |
|
50201
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
141 |
| TypeVar => (Markup.ML_tvar, "") |
|
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
142 |
| Word => (Markup.ML_numeral, "") |
|
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
143 |
| Int => (Markup.ML_numeral, "") |
|
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
144 |
| Real => (Markup.ML_numeral, "") |
|
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
145 |
| Char => (Markup.ML_char, "") |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
146 |
| String => (if SML then Markup.SML_string else Markup.ML_string, "") |
|
48768
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
147 |
| Space => (Markup.empty, "") |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
148 |
| Comment => (if SML then Markup.SML_comment else Markup.ML_comment, "") |
|
50201
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents:
48992
diff
changeset
|
149 |
| Error msg => (Markup.bad, msg) |
|
48768
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
150 |
| EOF => (Markup.empty, ""); |
| 30614 | 151 |
|
| 37195 | 152 |
in |
153 |
||
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
154 |
fun report_of_token SML (tok as Token ((pos, _), (kind, x))) = |
| 55505 | 155 |
let |
156 |
val (markup, txt) = |
|
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
157 |
if not (is_keyword tok) then token_kind_markup SML kind |
| 55505 | 158 |
else if is_delimiter tok then (Markup.ML_delimiter, "") |
159 |
else if member (op =) keywords2 x then (Markup.ML_keyword2, "") |
|
160 |
else if member (op =) keywords3 x then (Markup.ML_keyword3, "") |
|
161 |
else (Markup.ML_keyword1, ""); |
|
|
48768
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
162 |
in ((pos, markup), txt) end; |
| 37195 | 163 |
|
164 |
end; |
|
| 30614 | 165 |
|
166 |
||
| 24579 | 167 |
|
168 |
(** scanners **) |
|
169 |
||
| 30573 | 170 |
open Basic_Symbol_Pos; |
| 24579 | 171 |
|
| 55105 | 172 |
val err_prefix = "SML lexical error: "; |
173 |
||
174 |
fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); |
|
| 24579 | 175 |
|
176 |
||
177 |
(* identifiers *) |
|
178 |
||
| 24596 | 179 |
local |
180 |
||
| 24579 | 181 |
val scan_letdigs = |
| 55496 | 182 |
Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); |
| 24579 | 183 |
|
|
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
184 |
val scan_alphanumeric = |
|
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
185 |
Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) -- scan_letdigs >> op ::; |
| 24579 | 186 |
|
|
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
187 |
val scan_symbolic = |
|
40627
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents:
40525
diff
changeset
|
188 |
Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol); |
| 24579 | 189 |
|
| 24596 | 190 |
in |
191 |
||
| 24579 | 192 |
val scan_ident = scan_alphanumeric || scan_symbolic; |
193 |
||
| 55496 | 194 |
val scan_long_ident = |
| 27772 | 195 |
(Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "="); |
| 24579 | 196 |
|
| 55496 | 197 |
val scan_type_var = $$$ "'" @@@ scan_letdigs; |
| 24579 | 198 |
|
| 24596 | 199 |
end; |
| 24579 | 200 |
|
201 |
||
202 |
(* numerals *) |
|
203 |
||
| 24596 | 204 |
local |
205 |
||
|
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
206 |
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); |
|
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
207 |
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); |
| 27772 | 208 |
val scan_sign = Scan.optional ($$$ "~") []; |
209 |
val scan_decint = scan_sign @@@ scan_dec; |
|
| 55496 | 210 |
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; |
| 24579 | 211 |
|
| 24596 | 212 |
in |
213 |
||
| 27772 | 214 |
val scan_word = |
215 |
$$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || |
|
216 |
$$$ "0" @@@ $$$ "w" @@@ scan_dec; |
|
| 24579 | 217 |
|
| 27772 | 218 |
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); |
| 24579 | 219 |
|
220 |
val scan_real = |
|
| 27772 | 221 |
scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || |
222 |
scan_decint @@@ scan_exp; |
|
| 24579 | 223 |
|
| 24596 | 224 |
end; |
225 |
||
| 24579 | 226 |
|
227 |
(* chars and strings *) |
|
228 |
||
| 55496 | 229 |
val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); |
230 |
||
| 24596 | 231 |
local |
232 |
||
233 |
val scan_escape = |
|
|
40627
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents:
40525
diff
changeset
|
234 |
Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single || |
| 27772 | 235 |
$$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) || |
|
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
236 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- |
|
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
237 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- |
|
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
238 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); |
| 24596 | 239 |
|
240 |
val scan_str = |
|
| 58854 | 241 |
Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso |
|
30600
de241396389c
allow non-printable symbols within string tokens;
wenzelm
parents:
30593
diff
changeset
|
242 |
(not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || |
| 27772 | 243 |
$$$ "\\" @@@ !!! "bad escape character in string" scan_escape; |
| 24596 | 244 |
|
| 27772 | 245 |
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; |
246 |
val scan_gaps = Scan.repeat scan_gap >> flat; |
|
| 24579 | 247 |
|
| 24596 | 248 |
in |
| 24579 | 249 |
|
250 |
val scan_char = |
|
| 27772 | 251 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; |
| 24579 | 252 |
|
|
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
253 |
val recover_char = |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
254 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
255 |
|
| 24579 | 256 |
val scan_string = |
| 55103 | 257 |
Scan.ahead ($$ "\"") |-- |
258 |
!!! "unclosed string literal" |
|
259 |
($$$ "\"" @@@ (Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\""); |
|
| 24596 | 260 |
|
|
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
261 |
val recover_string = |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
262 |
$$$ "\"" @@@ (Scan.repeat (scan_gap || Scan.permissive scan_str) >> flat); |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
263 |
|
| 24596 | 264 |
end; |
| 24579 | 265 |
|
266 |
||
| 30645 | 267 |
(* scan tokens *) |
| 24579 | 268 |
|
269 |
local |
|
270 |
||
| 30593 | 271 |
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); |
| 24579 | 272 |
|
| 30593 | 273 |
val scan_ml = |
| 27772 | 274 |
(scan_char >> token Char || |
275 |
scan_string >> token String || |
|
276 |
scan_blanks1 >> token Space || |
|
| 55105 | 277 |
Symbol_Pos.scan_comment err_prefix >> token Comment || |
| 27772 | 278 |
Scan.max token_leq |
| 55505 | 279 |
(Scan.literal lexicon >> token Keyword) |
| 27772 | 280 |
(scan_word >> token Word || |
281 |
scan_real >> token Real || |
|
282 |
scan_int >> token Int || |
|
| 55496 | 283 |
scan_long_ident >> token LongIdent || |
| 27772 | 284 |
scan_ident >> token Ident || |
| 55496 | 285 |
scan_type_var >> token TypeVar)); |
| 27772 | 286 |
|
| 53167 | 287 |
val scan_antiq = Antiquote.scan_antiq >> Antiquote.Antiq || scan_ml >> Antiquote.Text; |
| 30645 | 288 |
|
| 27772 | 289 |
fun recover msg = |
|
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
290 |
(recover_char || |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
291 |
recover_string || |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
292 |
Symbol_Pos.recover_comment || |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
293 |
Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) |
|
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
294 |
>> (single o token (Error msg)); |
| 24579 | 295 |
|
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
296 |
fun gen_read SML pos text = |
|
37198
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents:
37195
diff
changeset
|
297 |
let |
|
55828
42ac3cfb89f6
clarified language markup: added "delimited" property;
wenzelm
parents:
55612
diff
changeset
|
298 |
val syms = Symbol_Pos.explode (text, pos); |
|
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
299 |
val termination = |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
300 |
if null syms then [] |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
301 |
else |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
302 |
let |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
303 |
val pos1 = List.last syms |-> Position.advance; |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
304 |
val pos2 = Position.advance Symbol.space pos1; |
|
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
305 |
in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end; |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
306 |
|
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
307 |
val scan = if SML then scan_ml >> Antiquote.Text else scan_antiq; |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
308 |
fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok) |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
309 |
| check _ = (); |
|
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
310 |
val input = |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
311 |
Source.of_list syms |
| 58864 | 312 |
|> Source.source Symbol_Pos.stopper |
313 |
(Scan.recover (Scan.bulk (!!! "bad input" scan)) |
|
314 |
(fn msg => recover msg >> map Antiquote.Text)) |
|
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
315 |
|> Source.exhaust |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
316 |
|> tap (Position.reports_text o Antiquote.antiquote_reports (single o report_of_token SML)) |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
317 |
|> tap (List.app check); |
|
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
318 |
in input @ termination end; |
| 30645 | 319 |
|
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
320 |
in |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
321 |
|
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
322 |
fun source src = |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
323 |
Symbol_Pos.source (Position.line 1) src |
| 58864 | 324 |
|> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_ml)) recover); |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
325 |
|
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
326 |
val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust; |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
327 |
|
| 59067 | 328 |
val read_pos = gen_read false; |
329 |
||
330 |
val read = read_pos Position.none; |
|
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
331 |
|
|
58978
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
wenzelm
parents:
58933
diff
changeset
|
332 |
fun read_set_range range = |
| 59067 | 333 |
read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); |
|
58978
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
wenzelm
parents:
58933
diff
changeset
|
334 |
|
| 59064 | 335 |
fun read_source SML source = |
|
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
336 |
let |
| 59064 | 337 |
val pos = Input.pos_of source; |
|
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
338 |
val language = if SML then Markup.language_SML else Markup.language_ML; |
|
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
339 |
val _ = |
| 59064 | 340 |
if Position.is_reported_range pos |
341 |
then Position.report pos (language (Input.is_delimited source)) |
|
|
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
342 |
else (); |
| 59064 | 343 |
in gen_read SML pos (Input.text_of source) end; |
|
55828
42ac3cfb89f6
clarified language markup: added "delimited" property;
wenzelm
parents:
55612
diff
changeset
|
344 |
|
| 24579 | 345 |
end; |
346 |
||
| 24596 | 347 |
end; |
| 24579 | 348 |