| author | wenzelm | 
| Wed, 14 Sep 2016 12:51:40 +0200 | |
| changeset 63866 | 630eaf8fe9f3 | 
| parent 63204 | 921a5be54132 | 
| child 63936 | b87784e19a77 | 
| permissions | -rw-r--r-- | 
| 24579 | 1 | (* Title: Pure/ML/ml_lex.ML | 
| 2 | Author: Makarius | |
| 3 | ||
| 59109 | 4 | Lexical syntax for Isabelle/ML and Standard ML. | 
| 24579 | 5 | *) | 
| 6 | ||
| 7 | signature ML_LEX = | |
| 8 | sig | |
| 55505 | 9 | val keywords: string list | 
| 24579 | 10 | datatype token_kind = | 
| 59082 | 11 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 61596 | 12 | Space | Comment | Error of string | EOF | 
| 24579 | 13 | eqtype token | 
| 27732 | 14 | val stopper: token Scan.stopper | 
| 24596 | 15 | val is_regular: token -> bool | 
| 16 | val is_improper: token -> bool | |
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 17 | val set_range: Position.range -> token -> token | 
| 59112 | 18 | val range_of: token -> Position.range | 
| 30636 | 19 | val pos_of: token -> Position.T | 
| 31474 | 20 | val end_pos_of: token -> Position.T | 
| 24579 | 21 | val kind_of: token -> token_kind | 
| 27817 | 22 | val content_of: token -> string | 
| 31426 | 23 | val check_content_of: token -> string | 
| 31332 | 24 | val flatten: token list -> string | 
| 24596 | 25 | val source: (Symbol.symbol, 'a) Source.source -> | 
| 30573 | 26 | (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) | 
| 27772 | 27 | Source.source) Source.source | 
| 30591 | 28 | val tokenize: string -> token list | 
| 59067 | 29 | val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list | 
| 30 | val read: Symbol_Pos.text -> token Antiquote.antiquote list | |
| 58978 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 wenzelm parents: 
58933diff
changeset | 31 | val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list | 
| 59064 | 32 | val read_source: bool -> Input.source -> token Antiquote.antiquote list | 
| 24579 | 33 | end; | 
| 34 | ||
| 35 | structure ML_Lex: ML_LEX = | |
| 36 | struct | |
| 37 | ||
| 55505 | 38 | (** keywords **) | 
| 39 | ||
| 40 | val keywords = | |
| 41 |  ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
 | |
| 42 |   "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
 | |
| 43 | "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", | |
| 44 | "functor", "handle", "if", "in", "include", "infix", "infixr", | |
| 45 | "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", | |
| 46 | "rec", "sharing", "sig", "signature", "struct", "structure", "then", | |
| 47 | "type", "val", "where", "while", "with", "withtype"]; | |
| 48 | ||
| 49 | val keywords2 = | |
| 58933 | 50 | ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", | 
| 55505 | 51 | "sig", "struct", "then", "while", "with"]; | 
| 52 | ||
| 53 | val keywords3 = | |
| 54 | ["handle", "open", "raise"]; | |
| 55 | ||
| 56 | val lexicon = Scan.make_lexicon (map raw_explode keywords); | |
| 57 | ||
| 58 | ||
| 59 | ||
| 24579 | 60 | (** tokens **) | 
| 61 | ||
| 62 | (* datatype token *) | |
| 63 | ||
| 64 | datatype token_kind = | |
| 59082 | 65 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 61596 | 66 | Space | Comment | Error of string | EOF; | 
| 24579 | 67 | |
| 27772 | 68 | datatype token = Token of Position.range * (token_kind * string); | 
| 69 | ||
| 70 | ||
| 71 | (* position *) | |
| 72 | ||
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 73 | fun set_range range (Token (_, x)) = Token (range, x); | 
| 59112 | 74 | fun range_of (Token (range, _)) = range; | 
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 75 | |
| 59112 | 76 | val pos_of = #1 o range_of; | 
| 77 | val end_pos_of = #2 o range_of; | |
| 24579 | 78 | |
| 79 | ||
| 58855 | 80 | (* stopper *) | 
| 24579 | 81 | |
| 27772 | 82 | fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); | 
| 83 | val eof = mk_eof Position.none; | |
| 24579 | 84 | |
| 85 | fun is_eof (Token (_, (EOF, _))) = true | |
| 86 | | is_eof _ = false; | |
| 87 | ||
| 27772 | 88 | val stopper = | 
| 30636 | 89 | Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; | 
| 27772 | 90 | |
| 24579 | 91 | |
| 27772 | 92 | (* token content *) | 
| 93 | ||
| 31426 | 94 | fun kind_of (Token (_, (k, _))) = k; | 
| 95 | ||
| 27817 | 96 | fun content_of (Token (_, (_, x))) = x; | 
| 97 | fun token_leq (tok, tok') = content_of tok <= content_of tok'; | |
| 27772 | 98 | |
| 55505 | 99 | fun is_keyword (Token (_, (Keyword, _))) = true | 
| 100 | | is_keyword _ = false; | |
| 101 | ||
| 102 | fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) | |
| 103 | | is_delimiter _ = false; | |
| 104 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 105 | fun is_regular (Token (_, (Error _, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 106 | | is_regular (Token (_, (EOF, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 107 | | is_regular _ = true; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 108 | |
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 109 | fun is_improper (Token (_, (Space, _))) = true | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 110 | | is_improper (Token (_, (Comment, _))) = true | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 111 | | is_improper _ = false; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 112 | |
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 113 | fun warn tok = | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 114 | (case tok of | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 115 | Token (_, (Keyword, ":>")) => | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 116 |       warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
 | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 117 | \prefer non-opaque matching (:) possibly with abstype" ^ | 
| 48992 | 118 | Position.here (pos_of tok)) | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 119 | | _ => ()); | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 120 | |
| 31426 | 121 | fun check_content_of tok = | 
| 30636 | 122 | (case kind_of tok of | 
| 30645 | 123 | Error msg => error msg | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 124 | | _ => content_of tok); | 
| 30636 | 125 | |
| 59112 | 126 | |
| 127 | (* flatten *) | |
| 128 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 129 | fun flatten_content (tok :: (toks as tok' :: _)) = | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 130 | Symbol.escape (check_content_of tok) :: | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 131 | (if is_improper tok orelse is_improper tok' then flatten_content toks | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 132 | else Symbol.space :: flatten_content toks) | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 133 | | flatten_content toks = map (Symbol.escape o check_content_of) toks; | 
| 31332 | 134 | |
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 135 | val flatten = implode o flatten_content; | 
| 24596 | 136 | |
| 137 | ||
| 30614 | 138 | (* markup *) | 
| 139 | ||
| 37195 | 140 | local | 
| 141 | ||
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 142 | fun token_kind_markup SML = | 
| 59124 | 143 | fn Type_Var => (Markup.ML_tvar, "") | 
| 59082 | 144 | | Word => (Markup.ML_numeral, "") | 
| 145 | | Int => (Markup.ML_numeral, "") | |
| 146 | | Real => (Markup.ML_numeral, "") | |
| 147 | | Char => (Markup.ML_char, "") | |
| 148 | | String => (if SML then Markup.SML_string else Markup.ML_string, "") | |
| 149 | | Comment => (if SML then Markup.SML_comment else Markup.ML_comment, "") | |
| 50201 
c26369c9eda6
Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
 wenzelm parents: 
48992diff
changeset | 150 | | Error msg => (Markup.bad, msg) | 
| 59124 | 151 | | _ => (Markup.empty, ""); | 
| 30614 | 152 | |
| 37195 | 153 | in | 
| 154 | ||
| 61457 | 155 | fun token_report SML (tok as Token ((pos, _), (kind, x))) = | 
| 55505 | 156 | let | 
| 157 | val (markup, txt) = | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 158 | if not (is_keyword tok) then token_kind_markup SML kind | 
| 55505 | 159 | else if is_delimiter tok then (Markup.ML_delimiter, "") | 
| 160 | else if member (op =) keywords2 x then (Markup.ML_keyword2, "") | |
| 161 | else if member (op =) keywords3 x then (Markup.ML_keyword3, "") | |
| 162 | else (Markup.ML_keyword1, ""); | |
| 48768 
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
 wenzelm parents: 
48756diff
changeset | 163 | in ((pos, markup), txt) end; | 
| 37195 | 164 | |
| 165 | end; | |
| 30614 | 166 | |
| 167 | ||
| 24579 | 168 | |
| 169 | (** scanners **) | |
| 170 | ||
| 30573 | 171 | open Basic_Symbol_Pos; | 
| 24579 | 172 | |
| 55105 | 173 | val err_prefix = "SML lexical error: "; | 
| 174 | ||
| 175 | fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); | |
| 24579 | 176 | |
| 177 | ||
| 178 | (* identifiers *) | |
| 179 | ||
| 24596 | 180 | local | 
| 181 | ||
| 24579 | 182 | val scan_letdigs = | 
| 55496 | 183 | Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); | 
| 24579 | 184 | |
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 185 | val scan_alphanumeric = | 
| 61814 
1ca1142e1711
clarified type Token.src: plain token list, with usual implicit value assignment;
 wenzelm parents: 
61596diff
changeset | 186 | Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs; | 
| 24579 | 187 | |
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 188 | val scan_symbolic = | 
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40525diff
changeset | 189 | Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol); | 
| 24579 | 190 | |
| 24596 | 191 | in | 
| 192 | ||
| 24579 | 193 | val scan_ident = scan_alphanumeric || scan_symbolic; | 
| 194 | ||
| 55496 | 195 | val scan_long_ident = | 
| 61476 | 196 | Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "="); | 
| 24579 | 197 | |
| 55496 | 198 | val scan_type_var = $$$ "'" @@@ scan_letdigs; | 
| 24579 | 199 | |
| 24596 | 200 | end; | 
| 24579 | 201 | |
| 202 | ||
| 203 | (* numerals *) | |
| 204 | ||
| 24596 | 205 | local | 
| 206 | ||
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 207 | val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 208 | val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); | 
| 27772 | 209 | val scan_sign = Scan.optional ($$$ "~") []; | 
| 210 | val scan_decint = scan_sign @@@ scan_dec; | |
| 55496 | 211 | val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; | 
| 24579 | 212 | |
| 24596 | 213 | in | 
| 214 | ||
| 27772 | 215 | val scan_word = | 
| 216 | $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || | |
| 217 | $$$ "0" @@@ $$$ "w" @@@ scan_dec; | |
| 24579 | 218 | |
| 27772 | 219 | val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); | 
| 24579 | 220 | |
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 221 | val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) []; | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 222 | |
| 24579 | 223 | val scan_real = | 
| 27772 | 224 | scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || | 
| 225 | scan_decint @@@ scan_exp; | |
| 24579 | 226 | |
| 24596 | 227 | end; | 
| 228 | ||
| 24579 | 229 | |
| 230 | (* chars and strings *) | |
| 231 | ||
| 55496 | 232 | val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); | 
| 233 | ||
| 24596 | 234 | local | 
| 235 | ||
| 236 | val scan_escape = | |
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40525diff
changeset | 237 | Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single || | 
| 27772 | 238 | $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) || | 
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 239 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 240 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 241 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); | 
| 24596 | 242 | |
| 243 | val scan_str = | |
| 58854 | 244 | Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso | 
| 30600 
de241396389c
allow non-printable symbols within string tokens;
 wenzelm parents: 
30593diff
changeset | 245 | (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || | 
| 27772 | 246 | $$$ "\\" @@@ !!! "bad escape character in string" scan_escape; | 
| 24596 | 247 | |
| 27772 | 248 | val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; | 
| 61476 | 249 | val scan_gaps = Scan.repeats scan_gap; | 
| 24579 | 250 | |
| 24596 | 251 | in | 
| 24579 | 252 | |
| 253 | val scan_char = | |
| 27772 | 254 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; | 
| 24579 | 255 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 256 | val recover_char = | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 257 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 258 | |
| 24579 | 259 | val scan_string = | 
| 55103 | 260 | Scan.ahead ($$ "\"") |-- | 
| 261 | !!! "unclosed string literal" | |
| 61476 | 262 | ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\""); | 
| 24596 | 263 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 264 | val recover_string = | 
| 61476 | 265 | $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str); | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 266 | |
| 24596 | 267 | end; | 
| 24579 | 268 | |
| 269 | ||
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 270 | (* rat antiquotation *) | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 271 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 272 | val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 273 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 274 | val scan_rat_antiq = | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 275 | Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 276 | >> (fn ((pos1, (pos2, body)), pos3) => | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 277 |       {start = Position.range_position (pos1, pos2),
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 278 | stop = Position.none, | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 279 | range = Position.range (pos1, pos3), | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 280 | body = rat_name @ body}); | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 281 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 282 | |
| 30645 | 283 | (* scan tokens *) | 
| 24579 | 284 | |
| 285 | local | |
| 286 | ||
| 30593 | 287 | fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); | 
| 24579 | 288 | |
| 30593 | 289 | val scan_ml = | 
| 27772 | 290 | (scan_char >> token Char || | 
| 291 | scan_string >> token String || | |
| 292 | scan_blanks1 >> token Space || | |
| 55105 | 293 | Symbol_Pos.scan_comment err_prefix >> token Comment || | 
| 27772 | 294 | Scan.max token_leq | 
| 55505 | 295 | (Scan.literal lexicon >> token Keyword) | 
| 27772 | 296 | (scan_word >> token Word || | 
| 297 | scan_real >> token Real || | |
| 298 | scan_int >> token Int || | |
| 59082 | 299 | scan_long_ident >> token Long_Ident || | 
| 27772 | 300 | scan_ident >> token Ident || | 
| 59082 | 301 | scan_type_var >> token Type_Var)); | 
| 27772 | 302 | |
| 59112 | 303 | val scan_sml = scan_ml >> Antiquote.Text; | 
| 304 | ||
| 305 | val scan_ml_antiq = | |
| 61471 | 306 | Antiquote.scan_control >> Antiquote.Control || | 
| 59112 | 307 | Antiquote.scan_antiq >> Antiquote.Antiq || | 
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 308 | scan_rat_antiq >> Antiquote.Antiq || | 
| 59112 | 309 | scan_ml >> Antiquote.Text; | 
| 30645 | 310 | |
| 27772 | 311 | fun recover msg = | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 312 | (recover_char || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 313 | recover_string || | 
| 59112 | 314 | Symbol_Pos.recover_cartouche || | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 315 | Symbol_Pos.recover_comment || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 316 | Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 317 | >> (single o token (Error msg)); | 
| 24579 | 318 | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 319 | fun gen_read SML pos text = | 
| 37198 
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
 wenzelm parents: 
37195diff
changeset | 320 | let | 
| 59110 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 wenzelm parents: 
59109diff
changeset | 321 | val syms = | 
| 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 wenzelm parents: 
59109diff
changeset | 322 | Symbol_Pos.explode (text, pos) | 
| 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 wenzelm parents: 
59109diff
changeset | 323 | |> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p)); | 
| 
8a78c7cb5b14
some special cases for official SML, to treat Isabelle symbols like raw characters;
 wenzelm parents: 
59109diff
changeset | 324 | |
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 325 | val termination = | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 326 | if null syms then [] | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 327 | else | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 328 | let | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 329 | val pos1 = List.last syms |-> Position.advance; | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 330 | val pos2 = Position.advance Symbol.space pos1; | 
| 62797 | 331 | in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 332 | |
| 59112 | 333 | val scan = if SML then scan_sml else scan_ml_antiq; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 334 | fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok) | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 335 | | check _ = (); | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 336 | val input = | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 337 | Source.of_list syms | 
| 58864 | 338 | |> Source.source Symbol_Pos.stopper | 
| 339 | (Scan.recover (Scan.bulk (!!! "bad input" scan)) | |
| 340 | (fn msg => recover msg >> map Antiquote.Text)) | |
| 61482 | 341 | |> Source.exhaust; | 
| 342 | val _ = Position.reports (Antiquote.antiq_reports input); | |
| 343 | val _ = | |
| 344 | Position.reports_text (maps (fn Antiquote.Text t => [token_report SML t] | _ => []) input); | |
| 345 | val _ = List.app check input; | |
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 346 | in input @ termination end; | 
| 30645 | 347 | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 348 | in | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 349 | |
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 350 | fun source src = | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 351 | Symbol_Pos.source (Position.line 1) src | 
| 58864 | 352 | |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_ml)) recover); | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 353 | |
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 354 | val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust; | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 355 | |
| 59067 | 356 | val read_pos = gen_read false; | 
| 357 | ||
| 358 | val read = read_pos Position.none; | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 359 | |
| 58978 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 wenzelm parents: 
58933diff
changeset | 360 | fun read_set_range range = | 
| 59067 | 361 | read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); | 
| 58978 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 wenzelm parents: 
58933diff
changeset | 362 | |
| 59064 | 363 | fun read_source SML source = | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 364 | let | 
| 59064 | 365 | val pos = Input.pos_of source; | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 366 | val language = if SML then Markup.language_SML else Markup.language_ML; | 
| 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 367 | val _ = | 
| 59064 | 368 | if Position.is_reported_range pos | 
| 369 | then Position.report pos (language (Input.is_delimited source)) | |
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 370 | else (); | 
| 59064 | 371 | in gen_read SML pos (Input.text_of source) end; | 
| 55828 
42ac3cfb89f6
clarified language markup: added "delimited" property;
 wenzelm parents: 
55612diff
changeset | 372 | |
| 24579 | 373 | end; | 
| 374 | ||
| 24596 | 375 | end; |