author | wenzelm |
Thu, 15 Aug 2024 12:43:17 +0200 | |
changeset 80712 | 05b16602a683 |
parent 78701 | c7b2697094ac |
permissions | -rw-r--r-- |
24579 | 1 |
(* Title: Pure/ML/ml_lex.ML |
2 |
Author: Makarius |
|
3 |
||
59109 | 4 |
Lexical syntax for Isabelle/ML and Standard ML. |
24579 | 5 |
*) |
6 |
||
7 |
signature ML_LEX = |
|
8 |
sig |
|
55505 | 9 |
val keywords: string list |
24579 | 10 |
datatype token_kind = |
59082 | 11 |
Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | |
67425
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
wenzelm
parents:
67363
diff
changeset
|
12 |
Space | Comment of Comment.kind option | Error of string | EOF |
24579 | 13 |
eqtype token |
27732 | 14 |
val stopper: token Scan.stopper |
78701 | 15 |
val is_ident_with: (string -> bool) -> token -> bool |
24596 | 16 |
val is_regular: token -> bool |
17 |
val is_improper: token -> bool |
|
67362 | 18 |
val is_comment: token -> bool |
30683
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
19 |
val set_range: Position.range -> token -> token |
59112 | 20 |
val range_of: token -> Position.range |
30636 | 21 |
val pos_of: token -> Position.T |
31474 | 22 |
val end_pos_of: token -> Position.T |
24579 | 23 |
val kind_of: token -> token_kind |
27817 | 24 |
val content_of: token -> string |
31426 | 25 |
val check_content_of: token -> string |
31332 | 26 |
val flatten: token list -> string |
24596 | 27 |
val source: (Symbol.symbol, 'a) Source.source -> |
30573 | 28 |
(token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) |
27772 | 29 |
Source.source) Source.source |
30591 | 30 |
val tokenize: string -> token list |
73549
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
wenzelm
parents:
69891
diff
changeset
|
31 |
val tokenize_range: Position.range -> string -> token list |
74291
b83fa8f3a271
ML antiquotations for type constructors and term constants;
wenzelm
parents:
74174
diff
changeset
|
32 |
val tokenize_no_range: string -> token list |
68823 | 33 |
val read_text: Symbol_Pos.text * Position.T -> token Antiquote.antiquote list |
59067 | 34 |
val read: Symbol_Pos.text -> token Antiquote.antiquote list |
73549
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
wenzelm
parents:
69891
diff
changeset
|
35 |
val read_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list |
68823 | 36 |
val read_source': {language: bool -> Markup.T, opaque_warning: bool, symbols: bool} -> |
37 |
token Antiquote.antiquote Symbol_Pos.scanner -> Input.source -> token Antiquote.antiquote list |
|
38 |
val read_source: Input.source -> token Antiquote.antiquote list |
|
39 |
val read_source_sml: Input.source -> token Antiquote.antiquote list |
|
74373
6e4093927dbb
outer syntax: support for control-cartouche tokens;
wenzelm
parents:
74291
diff
changeset
|
40 |
val read_symbols: Symbol_Pos.T list -> token Antiquote.antiquote list |
69841
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents:
68823
diff
changeset
|
41 |
val read_symbols_sml: Symbol_Pos.T list -> token Antiquote.antiquote list |
24579 | 42 |
end; |
43 |
||
44 |
structure ML_Lex: ML_LEX = |
|
45 |
struct |
|
46 |
||
55505 | 47 |
(** keywords **) |
48 |
||
49 |
val keywords = |
|
50 |
["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[", |
|
51 |
"]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case", |
|
52 |
"datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", |
|
53 |
"functor", "handle", "if", "in", "include", "infix", "infixr", |
|
54 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", |
|
55 |
"rec", "sharing", "sig", "signature", "struct", "structure", "then", |
|
56 |
"type", "val", "where", "while", "with", "withtype"]; |
|
57 |
||
58 |
val keywords2 = |
|
78687 | 59 |
Symset.make ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", |
55505 | 60 |
"sig", "struct", "then", "while", "with"]; |
61 |
||
62 |
val keywords3 = |
|
78687 | 63 |
Symset.make ["handle", "open", "raise"]; |
55505 | 64 |
|
65 |
val lexicon = Scan.make_lexicon (map raw_explode keywords); |
|
66 |
||
67 |
||
68 |
||
24579 | 69 |
(** tokens **) |
70 |
||
71 |
(* datatype token *) |
|
72 |
||
73 |
datatype token_kind = |
|
59082 | 74 |
Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | |
67425
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
wenzelm
parents:
67363
diff
changeset
|
75 |
Space | Comment of Comment.kind option | Error of string | EOF; |
24579 | 76 |
|
27772 | 77 |
datatype token = Token of Position.range * (token_kind * string); |
78 |
||
79 |
||
80 |
(* position *) |
|
81 |
||
30683
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
82 |
fun set_range range (Token (_, x)) = Token (range, x); |
59112 | 83 |
fun range_of (Token (range, _)) = range; |
30683
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents:
30645
diff
changeset
|
84 |
|
59112 | 85 |
val pos_of = #1 o range_of; |
86 |
val end_pos_of = #2 o range_of; |
|
24579 | 87 |
|
88 |
||
58855 | 89 |
(* stopper *) |
24579 | 90 |
|
27772 | 91 |
fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); |
92 |
val eof = mk_eof Position.none; |
|
24579 | 93 |
|
94 |
fun is_eof (Token (_, (EOF, _))) = true |
|
95 |
| is_eof _ = false; |
|
96 |
||
27772 | 97 |
val stopper = |
30636 | 98 |
Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; |
27772 | 99 |
|
24579 | 100 |
|
27772 | 101 |
(* token content *) |
102 |
||
31426 | 103 |
fun kind_of (Token (_, (k, _))) = k; |
104 |
||
27817 | 105 |
fun content_of (Token (_, (_, x))) = x; |
106 |
fun token_leq (tok, tok') = content_of tok <= content_of tok'; |
|
27772 | 107 |
|
55505 | 108 |
fun is_keyword (Token (_, (Keyword, _))) = true |
109 |
| is_keyword _ = false; |
|
110 |
||
111 |
fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) |
|
112 |
| is_delimiter _ = false; |
|
113 |
||
78701 | 114 |
fun is_ident_with pred (Token (_, (Ident, x))) = pred x |
115 |
| is_ident_with _ _ = false; |
|
116 |
||
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
117 |
fun is_regular (Token (_, (Error _, _))) = false |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
118 |
| is_regular (Token (_, (EOF, _))) = false |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
119 |
| is_regular _ = true; |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
120 |
|
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
121 |
fun is_improper (Token (_, (Space, _))) = true |
67425
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
wenzelm
parents:
67363
diff
changeset
|
122 |
| is_improper (Token (_, (Comment _, _))) = true |
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
123 |
| is_improper _ = false; |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
124 |
|
67425
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
wenzelm
parents:
67363
diff
changeset
|
125 |
fun is_comment (Token (_, (Comment _, _))) = true |
67362 | 126 |
| is_comment _ = false; |
127 |
||
68823 | 128 |
fun warning_opaque tok = |
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
129 |
(case tok of |
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
130 |
Token (_, (Keyword, ":>")) => |
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
131 |
warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\ |
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
132 |
\prefer non-opaque matching (:) possibly with abstype" ^ |
48992 | 133 |
Position.here (pos_of tok)) |
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
134 |
| _ => ()); |
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
135 |
|
31426 | 136 |
fun check_content_of tok = |
30636 | 137 |
(case kind_of tok of |
30645 | 138 |
Error msg => error msg |
40337
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents:
40319
diff
changeset
|
139 |
| _ => content_of tok); |
30636 | 140 |
|
59112 | 141 |
|
142 |
(* flatten *) |
|
143 |
||
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
144 |
fun flatten_content (tok :: (toks as tok' :: _)) = |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
145 |
Symbol.escape (check_content_of tok) :: |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
146 |
(if is_improper tok orelse is_improper tok' then flatten_content toks |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
147 |
else Symbol.space :: flatten_content toks) |
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
148 |
| flatten_content toks = map (Symbol.escape o check_content_of) toks; |
31332 | 149 |
|
41502
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents:
40627
diff
changeset
|
150 |
val flatten = implode o flatten_content; |
24596 | 151 |
|
152 |
||
30614 | 153 |
(* markup *) |
154 |
||
37195 | 155 |
local |
156 |
||
68822 | 157 |
val token_kind_markup = |
59124 | 158 |
fn Type_Var => (Markup.ML_tvar, "") |
59082 | 159 |
| Word => (Markup.ML_numeral, "") |
160 |
| Int => (Markup.ML_numeral, "") |
|
161 |
| Real => (Markup.ML_numeral, "") |
|
162 |
| Char => (Markup.ML_char, "") |
|
68822 | 163 |
| String => (Markup.ML_string, "") |
164 |
| Comment _ => (Markup.ML_comment, "") |
|
64677
8dc24130e8fe
more uniform treatment of "bad" like other messages (with serial number);
wenzelm
parents:
64275
diff
changeset
|
165 |
| Error msg => (Markup.bad (), msg) |
59124 | 166 |
| _ => (Markup.empty, ""); |
30614 | 167 |
|
37195 | 168 |
in |
169 |
||
68822 | 170 |
fun token_report (tok as Token ((pos, _), (kind, x))) = |
55505 | 171 |
let |
172 |
val (markup, txt) = |
|
68822 | 173 |
if not (is_keyword tok) then token_kind_markup kind |
55505 | 174 |
else if is_delimiter tok then (Markup.ML_delimiter, "") |
78687 | 175 |
else if Symset.member keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "") |
176 |
else if Symset.member keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "") |
|
66067 | 177 |
else (Markup.ML_keyword1 |> Markup.keyword_properties, ""); |
48768
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents:
48756
diff
changeset
|
178 |
in ((pos, markup), txt) end; |
37195 | 179 |
|
180 |
end; |
|
30614 | 181 |
|
182 |
||
24579 | 183 |
|
184 |
(** scanners **) |
|
185 |
||
30573 | 186 |
open Basic_Symbol_Pos; |
24579 | 187 |
|
55105 | 188 |
val err_prefix = "SML lexical error: "; |
189 |
||
190 |
fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); |
|
24579 | 191 |
|
192 |
||
193 |
(* identifiers *) |
|
194 |
||
24596 | 195 |
local |
196 |
||
24579 | 197 |
val scan_letdigs = |
55496 | 198 |
Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); |
24579 | 199 |
|
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
200 |
val scan_alphanumeric = |
61814
1ca1142e1711
clarified type Token.src: plain token list, with usual implicit value assignment;
wenzelm
parents:
61596
diff
changeset
|
201 |
Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs; |
24579 | 202 |
|
78687 | 203 |
val symbolic = Symset.make (raw_explode "!#$%&*+-/:<=>?@\\^`|~"); |
204 |
||
205 |
val scan_symbolic = Scan.many1 (Symset.member symbolic o Symbol_Pos.symbol); |
|
24579 | 206 |
|
24596 | 207 |
in |
208 |
||
24579 | 209 |
val scan_ident = scan_alphanumeric || scan_symbolic; |
210 |
||
55496 | 211 |
val scan_long_ident = |
61476 | 212 |
Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "="); |
24579 | 213 |
|
55496 | 214 |
val scan_type_var = $$$ "'" @@@ scan_letdigs; |
24579 | 215 |
|
24596 | 216 |
end; |
24579 | 217 |
|
218 |
||
219 |
(* numerals *) |
|
220 |
||
24596 | 221 |
local |
222 |
||
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
223 |
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); |
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
224 |
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); |
27772 | 225 |
val scan_sign = Scan.optional ($$$ "~") []; |
226 |
val scan_decint = scan_sign @@@ scan_dec; |
|
55496 | 227 |
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; |
24579 | 228 |
|
24596 | 229 |
in |
230 |
||
27772 | 231 |
val scan_word = |
232 |
$$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || |
|
233 |
$$$ "0" @@@ $$$ "w" @@@ scan_dec; |
|
24579 | 234 |
|
27772 | 235 |
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); |
24579 | 236 |
|
63204
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
237 |
val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) []; |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
238 |
|
24579 | 239 |
val scan_real = |
27772 | 240 |
scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || |
241 |
scan_decint @@@ scan_exp; |
|
24579 | 242 |
|
24596 | 243 |
end; |
244 |
||
24579 | 245 |
|
246 |
(* chars and strings *) |
|
247 |
||
55496 | 248 |
val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); |
249 |
||
24596 | 250 |
local |
251 |
||
78687 | 252 |
val escape = Symset.make (raw_explode "\"\\abtnvfr"); |
253 |
||
24596 | 254 |
val scan_escape = |
78687 | 255 |
Scan.one (Symset.member escape o Symbol_Pos.symbol) >> single || |
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
63936
diff
changeset
|
256 |
$$$ "^" @@@ |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
63936
diff
changeset
|
257 |
(Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) || |
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
258 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- |
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
259 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- |
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40523
diff
changeset
|
260 |
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); |
24596 | 261 |
|
262 |
val scan_str = |
|
58854 | 263 |
Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso |
30600
de241396389c
allow non-printable symbols within string tokens;
wenzelm
parents:
30593
diff
changeset
|
264 |
(not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || |
27772 | 265 |
$$$ "\\" @@@ !!! "bad escape character in string" scan_escape; |
24596 | 266 |
|
27772 | 267 |
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; |
61476 | 268 |
val scan_gaps = Scan.repeats scan_gap; |
24579 | 269 |
|
24596 | 270 |
in |
24579 | 271 |
|
272 |
val scan_char = |
|
27772 | 273 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; |
24579 | 274 |
|
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
275 |
val recover_char = |
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
276 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; |
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
277 |
|
24579 | 278 |
val scan_string = |
55103 | 279 |
Scan.ahead ($$ "\"") |-- |
280 |
!!! "unclosed string literal" |
|
61476 | 281 |
($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\""); |
24596 | 282 |
|
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
283 |
val recover_string = |
61476 | 284 |
$$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str); |
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
285 |
|
24596 | 286 |
end; |
24579 | 287 |
|
288 |
||
63204
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
289 |
(* rat antiquotation *) |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
290 |
|
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
291 |
val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none); |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
292 |
|
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
293 |
val scan_rat_antiq = |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
294 |
Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
295 |
>> (fn ((pos1, (pos2, body)), pos3) => |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
296 |
{start = Position.range_position (pos1, pos2), |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
297 |
stop = Position.none, |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
298 |
range = Position.range (pos1, pos3), |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
299 |
body = rat_name @ body}); |
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
300 |
|
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
301 |
|
30645 | 302 |
(* scan tokens *) |
24579 | 303 |
|
304 |
local |
|
305 |
||
30593 | 306 |
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); |
24579 | 307 |
|
67427
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents:
67425
diff
changeset
|
308 |
val scan_sml = |
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents:
67425
diff
changeset
|
309 |
scan_char >> token Char || |
27772 | 310 |
scan_string >> token String || |
311 |
scan_blanks1 >> token Space || |
|
67425
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
wenzelm
parents:
67363
diff
changeset
|
312 |
Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) || |
27772 | 313 |
Scan.max token_leq |
55505 | 314 |
(Scan.literal lexicon >> token Keyword) |
27772 | 315 |
(scan_word >> token Word || |
316 |
scan_real >> token Real || |
|
317 |
scan_int >> token Int || |
|
59082 | 318 |
scan_long_ident >> token Long_Ident || |
27772 | 319 |
scan_ident >> token Ident || |
67427
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents:
67425
diff
changeset
|
320 |
scan_type_var >> token Type_Var); |
59112 | 321 |
|
68823 | 322 |
val scan_sml_antiq = scan_sml >> Antiquote.Text; |
323 |
||
59112 | 324 |
val scan_ml_antiq = |
69891
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
wenzelm
parents:
69842
diff
changeset
|
325 |
Comment.scan_inner >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) || |
74373
6e4093927dbb
outer syntax: support for control-cartouche tokens;
wenzelm
parents:
74291
diff
changeset
|
326 |
Antiquote.scan_control Antiquote.err_prefix >> Antiquote.Control || |
59112 | 327 |
Antiquote.scan_antiq >> Antiquote.Antiq || |
63204
921a5be54132
support rat numerals via special antiquotation syntax;
wenzelm
parents:
62797
diff
changeset
|
328 |
scan_rat_antiq >> Antiquote.Antiq || |
68823 | 329 |
scan_sml_antiq; |
30645 | 330 |
|
27772 | 331 |
fun recover msg = |
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
332 |
(recover_char || |
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
333 |
recover_string || |
59112 | 334 |
Symbol_Pos.recover_cartouche || |
48743
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
335 |
Symbol_Pos.recover_comment || |
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
336 |
Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) |
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents:
45666
diff
changeset
|
337 |
>> (single o token (Error msg)); |
24579 | 338 |
|
68823 | 339 |
fun reader {opaque_warning} scan syms = |
37198
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents:
37195
diff
changeset
|
340 |
let |
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
341 |
val termination = |
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
342 |
if null syms then [] |
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
343 |
else |
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
344 |
let |
74174 | 345 |
val pos1 = List.last syms |-> Position.symbol; |
346 |
val pos2 = Position.symbol Symbol.space pos1; |
|
62797 | 347 |
in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end; |
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
348 |
|
68823 | 349 |
fun check (Antiquote.Text tok) = |
350 |
(check_content_of tok; if opaque_warning then warning_opaque tok else ()) |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
351 |
| check _ = (); |
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
352 |
val input = |
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
353 |
Source.of_list syms |
58864 | 354 |
|> Source.source Symbol_Pos.stopper |
355 |
(Scan.recover (Scan.bulk (!!! "bad input" scan)) |
|
356 |
(fn msg => recover msg >> map Antiquote.Text)) |
|
61482 | 357 |
|> Source.exhaust; |
358 |
val _ = Position.reports (Antiquote.antiq_reports input); |
|
68822 | 359 |
val _ = Position.reports_text (maps (fn Antiquote.Text t => [token_report t] | _ => []) input); |
61482 | 360 |
val _ = List.app check input; |
37209
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents:
37198
diff
changeset
|
361 |
in input @ termination end; |
30645 | 362 |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
363 |
in |
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
364 |
|
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
365 |
fun source src = |
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
366 |
Symbol_Pos.source (Position.line 1) src |
67427
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents:
67425
diff
changeset
|
367 |
|> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover); |
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
368 |
|
63936 | 369 |
val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust; |
73549
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
wenzelm
parents:
69891
diff
changeset
|
370 |
fun tokenize_range range = tokenize #> map (set_range range); |
74291
b83fa8f3a271
ML antiquotations for type constructors and term constants;
wenzelm
parents:
74174
diff
changeset
|
371 |
val tokenize_no_range = tokenize_range Position.no_range; |
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
372 |
|
68823 | 373 |
val read_text = reader {opaque_warning = true} scan_ml_antiq o Symbol_Pos.explode; |
374 |
fun read text = read_text (text, Position.none); |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55828
diff
changeset
|
375 |
|
73549
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
wenzelm
parents:
69891
diff
changeset
|
376 |
fun read_range range = |
59067 | 377 |
read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); |
58978
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
wenzelm
parents:
58933
diff
changeset
|
378 |
|
68823 | 379 |
fun read_source' {language, symbols, opaque_warning} scan source = |
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
380 |
let |
59064 | 381 |
val pos = Input.pos_of source; |
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
382 |
val _ = |
59064 | 383 |
if Position.is_reported_range pos |
384 |
then Position.report pos (language (Input.is_delimited source)) |
|
56459
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents:
56278
diff
changeset
|
385 |
else (); |
69842 | 386 |
in |
387 |
Input.source_explode source |
|
388 |
|> not symbols ? maps (fn (s, p) => raw_explode s |> map (rpair p)) |
|
389 |
|> reader {opaque_warning = opaque_warning} scan |
|
390 |
end; |
|
68823 | 391 |
|
392 |
val read_source = |
|
393 |
read_source' {language = Markup.language_ML, symbols = true, opaque_warning = true} |
|
394 |
scan_ml_antiq; |
|
395 |
||
396 |
val read_source_sml = |
|
397 |
read_source' {language = Markup.language_SML, symbols = false, opaque_warning = false} |
|
398 |
scan_sml_antiq; |
|
55828
42ac3cfb89f6
clarified language markup: added "delimited" property;
wenzelm
parents:
55612
diff
changeset
|
399 |
|
74373
6e4093927dbb
outer syntax: support for control-cartouche tokens;
wenzelm
parents:
74291
diff
changeset
|
400 |
val read_symbols = reader {opaque_warning = true} scan_ml_antiq; |
69841
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents:
68823
diff
changeset
|
401 |
val read_symbols_sml = reader {opaque_warning = false} scan_sml_antiq; |
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents:
68823
diff
changeset
|
402 |
|
24579 | 403 |
end; |
404 |
||
24596 | 405 |
end; |