| author | wenzelm | 
| Tue, 18 Aug 2015 14:07:27 +0200 | |
| changeset 60965 | 49c797cb9f46 | 
| parent 59124 | 60c134fdd290 | 
| child 61457 | 3e21699bb83b | 
| permissions | -rw-r--r-- | 
| 24579 | 1  | 
(* Title: Pure/ML/ml_lex.ML  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 59109 | 4  | 
Lexical syntax for Isabelle/ML and Standard ML.  | 
| 24579 | 5  | 
*)  | 
6  | 
||
7  | 
signature ML_LEX =  | 
|
8  | 
sig  | 
|
| 55505 | 9  | 
val keywords: string list  | 
| 24579 | 10  | 
datatype token_kind =  | 
| 59082 | 11  | 
Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |  | 
| 59112 | 12  | 
Space | Cartouche | Comment | Error of string | EOF  | 
| 24579 | 13  | 
eqtype token  | 
| 27732 | 14  | 
val stopper: token Scan.stopper  | 
| 59112 | 15  | 
val is_cartouche: token -> bool  | 
| 24596 | 16  | 
val is_regular: token -> bool  | 
17  | 
val is_improper: token -> bool  | 
|
| 
30683
 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 
wenzelm 
parents: 
30645 
diff
changeset
 | 
18  | 
val set_range: Position.range -> token -> token  | 
| 59112 | 19  | 
val range_of: token -> Position.range  | 
| 30636 | 20  | 
val pos_of: token -> Position.T  | 
| 31474 | 21  | 
val end_pos_of: token -> Position.T  | 
| 24579 | 22  | 
val kind_of: token -> token_kind  | 
| 27817 | 23  | 
val content_of: token -> string  | 
| 31426 | 24  | 
val check_content_of: token -> string  | 
| 31332 | 25  | 
val flatten: token list -> string  | 
| 24596 | 26  | 
val source: (Symbol.symbol, 'a) Source.source ->  | 
| 30573 | 27  | 
(token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)  | 
| 27772 | 28  | 
Source.source) Source.source  | 
| 30591 | 29  | 
val tokenize: string -> token list  | 
| 59067 | 30  | 
val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list  | 
31  | 
val read: Symbol_Pos.text -> token Antiquote.antiquote list  | 
|
| 
58978
 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 
wenzelm 
parents: 
58933 
diff
changeset
 | 
32  | 
val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list  | 
| 59064 | 33  | 
val read_source: bool -> Input.source -> token Antiquote.antiquote list  | 
| 24579 | 34  | 
end;  | 
35  | 
||
36  | 
structure ML_Lex: ML_LEX =  | 
|
37  | 
struct  | 
|
38  | 
||
| 55505 | 39  | 
(** keywords **)  | 
40  | 
||
41  | 
val keywords =  | 
|
42  | 
 ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
 | 
|
43  | 
  "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
 | 
|
44  | 
"datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun",  | 
|
45  | 
"functor", "handle", "if", "in", "include", "infix", "infixr",  | 
|
46  | 
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise",  | 
|
47  | 
"rec", "sharing", "sig", "signature", "struct", "structure", "then",  | 
|
48  | 
"type", "val", "where", "while", "with", "withtype"];  | 
|
49  | 
||
50  | 
val keywords2 =  | 
|
| 58933 | 51  | 
["and", "case", "do", "else", "end", "if", "in", "let", "local", "of",  | 
| 55505 | 52  | 
"sig", "struct", "then", "while", "with"];  | 
53  | 
||
54  | 
val keywords3 =  | 
|
55  | 
["handle", "open", "raise"];  | 
|
56  | 
||
57  | 
val lexicon = Scan.make_lexicon (map raw_explode keywords);  | 
|
58  | 
||
59  | 
||
60  | 
||
| 24579 | 61  | 
(** tokens **)  | 
62  | 
||
63  | 
(* datatype token *)  | 
|
64  | 
||
65  | 
datatype token_kind =  | 
|
| 59082 | 66  | 
Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |  | 
| 59112 | 67  | 
Space | Cartouche | Comment | Error of string | EOF;  | 
| 24579 | 68  | 
|
| 27772 | 69  | 
datatype token = Token of Position.range * (token_kind * string);  | 
70  | 
||
71  | 
||
72  | 
(* position *)  | 
|
73  | 
||
| 
30683
 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 
wenzelm 
parents: 
30645 
diff
changeset
 | 
74  | 
fun set_range range (Token (_, x)) = Token (range, x);  | 
| 59112 | 75  | 
fun range_of (Token (range, _)) = range;  | 
| 
30683
 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 
wenzelm 
parents: 
30645 
diff
changeset
 | 
76  | 
|
| 59112 | 77  | 
val pos_of = #1 o range_of;  | 
78  | 
val end_pos_of = #2 o range_of;  | 
|
| 24579 | 79  | 
|
80  | 
||
| 58855 | 81  | 
(* stopper *)  | 
| 24579 | 82  | 
|
| 27772 | 83  | 
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));  | 
84  | 
val eof = mk_eof Position.none;  | 
|
| 24579 | 85  | 
|
86  | 
fun is_eof (Token (_, (EOF, _))) = true  | 
|
87  | 
| is_eof _ = false;  | 
|
88  | 
||
| 27772 | 89  | 
val stopper =  | 
| 30636 | 90  | 
Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;  | 
| 27772 | 91  | 
|
| 24579 | 92  | 
|
| 27772 | 93  | 
(* token content *)  | 
94  | 
||
| 31426 | 95  | 
fun kind_of (Token (_, (k, _))) = k;  | 
96  | 
||
| 27817 | 97  | 
fun content_of (Token (_, (_, x))) = x;  | 
98  | 
fun token_leq (tok, tok') = content_of tok <= content_of tok';  | 
|
| 27772 | 99  | 
|
| 55505 | 100  | 
fun is_keyword (Token (_, (Keyword, _))) = true  | 
101  | 
| is_keyword _ = false;  | 
|
102  | 
||
103  | 
fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x)  | 
|
104  | 
| is_delimiter _ = false;  | 
|
105  | 
||
| 59112 | 106  | 
fun is_cartouche (Token (_, (Cartouche, _))) = true  | 
107  | 
| is_cartouche _ = false;  | 
|
108  | 
||
| 
41502
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
109  | 
fun is_regular (Token (_, (Error _, _))) = false  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
110  | 
| is_regular (Token (_, (EOF, _))) = false  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
111  | 
| is_regular _ = true;  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
112  | 
|
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
113  | 
fun is_improper (Token (_, (Space, _))) = true  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
114  | 
| is_improper (Token (_, (Comment, _))) = true  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
115  | 
| is_improper _ = false;  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
116  | 
|
| 
40337
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
117  | 
fun warn tok =  | 
| 
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
118  | 
(case tok of  | 
| 
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
119  | 
Token (_, (Keyword, ":>")) =>  | 
| 
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
120  | 
      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
 | 
| 
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
121  | 
\prefer non-opaque matching (:) possibly with abstype" ^  | 
| 48992 | 122  | 
Position.here (pos_of tok))  | 
| 
40337
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
123  | 
| _ => ());  | 
| 
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
124  | 
|
| 31426 | 125  | 
fun check_content_of tok =  | 
| 30636 | 126  | 
(case kind_of tok of  | 
| 30645 | 127  | 
Error msg => error msg  | 
| 
40337
 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 
wenzelm 
parents: 
40319 
diff
changeset
 | 
128  | 
| _ => content_of tok);  | 
| 30636 | 129  | 
|
| 59112 | 130  | 
|
131  | 
(* flatten *)  | 
|
132  | 
||
| 
41502
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
133  | 
fun flatten_content (tok :: (toks as tok' :: _)) =  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
134  | 
Symbol.escape (check_content_of tok) ::  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
135  | 
(if is_improper tok orelse is_improper tok' then flatten_content toks  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
136  | 
else Symbol.space :: flatten_content toks)  | 
| 
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
137  | 
| flatten_content toks = map (Symbol.escape o check_content_of) toks;  | 
| 31332 | 138  | 
|
| 
41502
 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 
wenzelm 
parents: 
40627 
diff
changeset
 | 
139  | 
val flatten = implode o flatten_content;  | 
| 24596 | 140  | 
|
141  | 
||
| 30614 | 142  | 
(* markup *)  | 
143  | 
||
| 37195 | 144  | 
local  | 
145  | 
||
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
146  | 
fun token_kind_markup SML =  | 
| 59124 | 147  | 
fn Type_Var => (Markup.ML_tvar, "")  | 
| 59082 | 148  | 
| Word => (Markup.ML_numeral, "")  | 
149  | 
| Int => (Markup.ML_numeral, "")  | 
|
150  | 
| Real => (Markup.ML_numeral, "")  | 
|
151  | 
| Char => (Markup.ML_char, "")  | 
|
152  | 
| String => (if SML then Markup.SML_string else Markup.ML_string, "")  | 
|
| 59112 | 153  | 
| Cartouche => (Markup.ML_cartouche, "")  | 
| 59082 | 154  | 
| Comment => (if SML then Markup.SML_comment else Markup.ML_comment, "")  | 
| 
50201
 
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
 
wenzelm 
parents: 
48992 
diff
changeset
 | 
155  | 
| Error msg => (Markup.bad, msg)  | 
| 59124 | 156  | 
| _ => (Markup.empty, "");  | 
| 30614 | 157  | 
|
| 37195 | 158  | 
in  | 
159  | 
||
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
160  | 
fun report_of_token SML (tok as Token ((pos, _), (kind, x))) =  | 
| 55505 | 161  | 
let  | 
162  | 
val (markup, txt) =  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
163  | 
if not (is_keyword tok) then token_kind_markup SML kind  | 
| 55505 | 164  | 
else if is_delimiter tok then (Markup.ML_delimiter, "")  | 
165  | 
else if member (op =) keywords2 x then (Markup.ML_keyword2, "")  | 
|
166  | 
else if member (op =) keywords3 x then (Markup.ML_keyword3, "")  | 
|
167  | 
else (Markup.ML_keyword1, "");  | 
|
| 
48768
 
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
 
wenzelm 
parents: 
48756 
diff
changeset
 | 
168  | 
in ((pos, markup), txt) end;  | 
| 37195 | 169  | 
|
170  | 
end;  | 
|
| 30614 | 171  | 
|
172  | 
||
| 24579 | 173  | 
|
174  | 
(** scanners **)  | 
|
175  | 
||
| 30573 | 176  | 
open Basic_Symbol_Pos;  | 
| 24579 | 177  | 
|
| 55105 | 178  | 
val err_prefix = "SML lexical error: ";  | 
179  | 
||
180  | 
fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);  | 
|
| 24579 | 181  | 
|
182  | 
||
183  | 
(* identifiers *)  | 
|
184  | 
||
| 24596 | 185  | 
local  | 
186  | 
||
| 24579 | 187  | 
val scan_letdigs =  | 
| 55496 | 188  | 
Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol);  | 
| 24579 | 189  | 
|
| 
40525
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
190  | 
val scan_alphanumeric =  | 
| 
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
191  | 
Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) -- scan_letdigs >> op ::;  | 
| 24579 | 192  | 
|
| 
40525
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
193  | 
val scan_symbolic =  | 
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40525 
diff
changeset
 | 
194  | 
Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);  | 
| 24579 | 195  | 
|
| 24596 | 196  | 
in  | 
197  | 
||
| 24579 | 198  | 
val scan_ident = scan_alphanumeric || scan_symbolic;  | 
199  | 
||
| 55496 | 200  | 
val scan_long_ident =  | 
| 27772 | 201  | 
(Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");  | 
| 24579 | 202  | 
|
| 55496 | 203  | 
val scan_type_var = $$$ "'" @@@ scan_letdigs;  | 
| 24579 | 204  | 
|
| 24596 | 205  | 
end;  | 
| 24579 | 206  | 
|
207  | 
||
208  | 
(* numerals *)  | 
|
209  | 
||
| 24596 | 210  | 
local  | 
211  | 
||
| 
40525
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
212  | 
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);  | 
| 
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
213  | 
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);  | 
| 27772 | 214  | 
val scan_sign = Scan.optional ($$$ "~") [];  | 
215  | 
val scan_decint = scan_sign @@@ scan_dec;  | 
|
| 55496 | 216  | 
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;  | 
| 24579 | 217  | 
|
| 24596 | 218  | 
in  | 
219  | 
||
| 27772 | 220  | 
val scan_word =  | 
221  | 
$$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||  | 
|
222  | 
$$$ "0" @@@ $$$ "w" @@@ scan_dec;  | 
|
| 24579 | 223  | 
|
| 27772 | 224  | 
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);  | 
| 24579 | 225  | 
|
226  | 
val scan_real =  | 
|
| 27772 | 227  | 
scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||  | 
228  | 
scan_decint @@@ scan_exp;  | 
|
| 24579 | 229  | 
|
| 24596 | 230  | 
end;  | 
231  | 
||
| 24579 | 232  | 
|
233  | 
(* chars and strings *)  | 
|
234  | 
||
| 55496 | 235  | 
val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol);  | 
236  | 
||
| 24596 | 237  | 
local  | 
238  | 
||
239  | 
val scan_escape =  | 
|
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40525 
diff
changeset
 | 
240  | 
Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||  | 
| 27772 | 241  | 
$$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||  | 
| 
40525
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
242  | 
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --  | 
| 
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
243  | 
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --  | 
| 
 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
244  | 
Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);  | 
| 24596 | 245  | 
|
246  | 
val scan_str =  | 
|
| 58854 | 247  | 
Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso  | 
| 
30600
 
de241396389c
allow non-printable symbols within string tokens;
 
wenzelm 
parents: 
30593 
diff
changeset
 | 
248  | 
(not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||  | 
| 27772 | 249  | 
$$$ "\\" @@@ !!! "bad escape character in string" scan_escape;  | 
| 24596 | 250  | 
|
| 27772 | 251  | 
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";  | 
252  | 
val scan_gaps = Scan.repeat scan_gap >> flat;  | 
|
| 24579 | 253  | 
|
| 24596 | 254  | 
in  | 
| 24579 | 255  | 
|
256  | 
val scan_char =  | 
|
| 27772 | 257  | 
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";  | 
| 24579 | 258  | 
|
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
259  | 
val recover_char =  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
260  | 
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
261  | 
|
| 24579 | 262  | 
val scan_string =  | 
| 55103 | 263  | 
Scan.ahead ($$ "\"") |--  | 
264  | 
!!! "unclosed string literal"  | 
|
265  | 
($$$ "\"" @@@ (Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");  | 
|
| 24596 | 266  | 
|
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
267  | 
val recover_string =  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
268  | 
$$$ "\"" @@@ (Scan.repeat (scan_gap || Scan.permissive scan_str) >> flat);  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
269  | 
|
| 24596 | 270  | 
end;  | 
| 24579 | 271  | 
|
272  | 
||
| 30645 | 273  | 
(* scan tokens *)  | 
| 24579 | 274  | 
|
275  | 
local  | 
|
276  | 
||
| 30593 | 277  | 
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));  | 
| 24579 | 278  | 
|
| 30593 | 279  | 
val scan_ml =  | 
| 27772 | 280  | 
(scan_char >> token Char ||  | 
281  | 
scan_string >> token String ||  | 
|
282  | 
scan_blanks1 >> token Space ||  | 
|
| 55105 | 283  | 
Symbol_Pos.scan_comment err_prefix >> token Comment ||  | 
| 27772 | 284  | 
Scan.max token_leq  | 
| 55505 | 285  | 
(Scan.literal lexicon >> token Keyword)  | 
| 27772 | 286  | 
(scan_word >> token Word ||  | 
287  | 
scan_real >> token Real ||  | 
|
288  | 
scan_int >> token Int ||  | 
|
| 59082 | 289  | 
scan_long_ident >> token Long_Ident ||  | 
| 27772 | 290  | 
scan_ident >> token Ident ||  | 
| 59082 | 291  | 
scan_type_var >> token Type_Var));  | 
| 27772 | 292  | 
|
| 59112 | 293  | 
val scan_sml = scan_ml >> Antiquote.Text;  | 
294  | 
||
295  | 
val scan_ml_antiq =  | 
|
296  | 
Antiquote.scan_antiq >> Antiquote.Antiq ||  | 
|
297  | 
Symbol_Pos.scan_cartouche err_prefix >> (Antiquote.Text o token Cartouche) ||  | 
|
298  | 
scan_ml >> Antiquote.Text;  | 
|
| 30645 | 299  | 
|
| 27772 | 300  | 
fun recover msg =  | 
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
301  | 
(recover_char ||  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
302  | 
recover_string ||  | 
| 59112 | 303  | 
Symbol_Pos.recover_cartouche ||  | 
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
304  | 
Symbol_Pos.recover_comment ||  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
305  | 
Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)  | 
| 
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
45666 
diff
changeset
 | 
306  | 
>> (single o token (Error msg));  | 
| 24579 | 307  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
308  | 
fun gen_read SML pos text =  | 
| 
37198
 
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
 
wenzelm 
parents: 
37195 
diff
changeset
 | 
309  | 
let  | 
| 
59110
 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 
wenzelm 
parents: 
59109 
diff
changeset
 | 
310  | 
val syms =  | 
| 
 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 
wenzelm 
parents: 
59109 
diff
changeset
 | 
311  | 
Symbol_Pos.explode (text, pos)  | 
| 
 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 
wenzelm 
parents: 
59109 
diff
changeset
 | 
312  | 
|> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p));  | 
| 
 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 
wenzelm 
parents: 
59109 
diff
changeset
 | 
313  | 
|
| 
37209
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
314  | 
val termination =  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
315  | 
if null syms then []  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
316  | 
else  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
317  | 
let  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
318  | 
val pos1 = List.last syms |-> Position.advance;  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
319  | 
val pos2 = Position.advance Symbol.space pos1;  | 
| 
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
320  | 
in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end;  | 
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
321  | 
|
| 59112 | 322  | 
val scan = if SML then scan_sml else scan_ml_antiq;  | 
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
323  | 
fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok)  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
324  | 
| check _ = ();  | 
| 
37209
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
325  | 
val input =  | 
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
326  | 
Source.of_list syms  | 
| 58864 | 327  | 
|> Source.source Symbol_Pos.stopper  | 
328  | 
(Scan.recover (Scan.bulk (!!! "bad input" scan))  | 
|
329  | 
(fn msg => recover msg >> map Antiquote.Text))  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
330  | 
|> Source.exhaust  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
331  | 
|> tap (Position.reports_text o Antiquote.antiquote_reports (single o report_of_token SML))  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
332  | 
|> tap (List.app check);  | 
| 
37209
 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 
wenzelm 
parents: 
37198 
diff
changeset
 | 
333  | 
in input @ termination end;  | 
| 30645 | 334  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
335  | 
in  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
336  | 
|
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
337  | 
fun source src =  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
338  | 
Symbol_Pos.source (Position.line 1) src  | 
| 58864 | 339  | 
|> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_ml)) recover);  | 
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
340  | 
|
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
341  | 
val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;  | 
| 
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
342  | 
|
| 59067 | 343  | 
val read_pos = gen_read false;  | 
344  | 
||
345  | 
val read = read_pos Position.none;  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55828 
diff
changeset
 | 
346  | 
|
| 
58978
 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 
wenzelm 
parents: 
58933 
diff
changeset
 | 
347  | 
fun read_set_range range =  | 
| 59067 | 348  | 
read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq);  | 
| 
58978
 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 
wenzelm 
parents: 
58933 
diff
changeset
 | 
349  | 
|
| 59064 | 350  | 
fun read_source SML source =  | 
| 
56459
 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 
wenzelm 
parents: 
56278 
diff
changeset
 | 
351  | 
let  | 
| 59064 | 352  | 
val pos = Input.pos_of source;  | 
| 
56459
 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 
wenzelm 
parents: 
56278 
diff
changeset
 | 
353  | 
val language = if SML then Markup.language_SML else Markup.language_ML;  | 
| 
 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 
wenzelm 
parents: 
56278 
diff
changeset
 | 
354  | 
val _ =  | 
| 59064 | 355  | 
if Position.is_reported_range pos  | 
356  | 
then Position.report pos (language (Input.is_delimited source))  | 
|
| 
56459
 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 
wenzelm 
parents: 
56278 
diff
changeset
 | 
357  | 
else ();  | 
| 59064 | 358  | 
in gen_read SML pos (Input.text_of source) end;  | 
| 
55828
 
42ac3cfb89f6
clarified language markup: added "delimited" property;
 
wenzelm 
parents: 
55612 
diff
changeset
 | 
359  | 
|
| 24579 | 360  | 
end;  | 
361  | 
||
| 24596 | 362  | 
end;  | 
| 24579 | 363  |