| author | desharna | 
| Mon, 19 Feb 2024 11:39:00 +0100 | |
| changeset 79668 | 9f36a31fe7ae | 
| parent 78701 | c7b2697094ac | 
| permissions | -rw-r--r-- | 
| 24579 | 1 | (* Title: Pure/ML/ml_lex.ML | 
| 2 | Author: Makarius | |
| 3 | ||
| 59109 | 4 | Lexical syntax for Isabelle/ML and Standard ML. | 
| 24579 | 5 | *) | 
| 6 | ||
| 7 | signature ML_LEX = | |
| 8 | sig | |
| 55505 | 9 | val keywords: string list | 
| 24579 | 10 | datatype token_kind = | 
| 59082 | 11 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 12 | Space | Comment of Comment.kind option | Error of string | EOF | 
| 24579 | 13 | eqtype token | 
| 27732 | 14 | val stopper: token Scan.stopper | 
| 78701 | 15 | val is_ident_with: (string -> bool) -> token -> bool | 
| 24596 | 16 | val is_regular: token -> bool | 
| 17 | val is_improper: token -> bool | |
| 67362 | 18 | val is_comment: token -> bool | 
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 19 | val set_range: Position.range -> token -> token | 
| 59112 | 20 | val range_of: token -> Position.range | 
| 30636 | 21 | val pos_of: token -> Position.T | 
| 31474 | 22 | val end_pos_of: token -> Position.T | 
| 24579 | 23 | val kind_of: token -> token_kind | 
| 27817 | 24 | val content_of: token -> string | 
| 31426 | 25 | val check_content_of: token -> string | 
| 31332 | 26 | val flatten: token list -> string | 
| 24596 | 27 | val source: (Symbol.symbol, 'a) Source.source -> | 
| 30573 | 28 | (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) | 
| 27772 | 29 | Source.source) Source.source | 
| 30591 | 30 | val tokenize: string -> token list | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 31 | val tokenize_range: Position.range -> string -> token list | 
| 74291 
b83fa8f3a271
ML antiquotations for type constructors and term constants;
 wenzelm parents: 
74174diff
changeset | 32 | val tokenize_no_range: string -> token list | 
| 68823 | 33 | val read_text: Symbol_Pos.text * Position.T -> token Antiquote.antiquote list | 
| 59067 | 34 | val read: Symbol_Pos.text -> token Antiquote.antiquote list | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 35 | val read_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list | 
| 68823 | 36 |   val read_source': {language: bool -> Markup.T, opaque_warning: bool, symbols: bool} ->
 | 
| 37 | token Antiquote.antiquote Symbol_Pos.scanner -> Input.source -> token Antiquote.antiquote list | |
| 38 | val read_source: Input.source -> token Antiquote.antiquote list | |
| 39 | val read_source_sml: Input.source -> token Antiquote.antiquote list | |
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 40 | val read_symbols: Symbol_Pos.T list -> token Antiquote.antiquote list | 
| 69841 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 41 | val read_symbols_sml: Symbol_Pos.T list -> token Antiquote.antiquote list | 
| 24579 | 42 | end; | 
| 43 | ||
| 44 | structure ML_Lex: ML_LEX = | |
| 45 | struct | |
| 46 | ||
| 55505 | 47 | (** keywords **) | 
| 48 | ||
| 49 | val keywords = | |
| 50 |  ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
 | |
| 51 |   "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
 | |
| 52 | "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", | |
| 53 | "functor", "handle", "if", "in", "include", "infix", "infixr", | |
| 54 | "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", | |
| 55 | "rec", "sharing", "sig", "signature", "struct", "structure", "then", | |
| 56 | "type", "val", "where", "while", "with", "withtype"]; | |
| 57 | ||
| 58 | val keywords2 = | |
| 78687 | 59 | Symset.make ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", | 
| 55505 | 60 | "sig", "struct", "then", "while", "with"]; | 
| 61 | ||
| 62 | val keywords3 = | |
| 78687 | 63 | Symset.make ["handle", "open", "raise"]; | 
| 55505 | 64 | |
| 65 | val lexicon = Scan.make_lexicon (map raw_explode keywords); | |
| 66 | ||
| 67 | ||
| 68 | ||
| 24579 | 69 | (** tokens **) | 
| 70 | ||
| 71 | (* datatype token *) | |
| 72 | ||
| 73 | datatype token_kind = | |
| 59082 | 74 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 75 | Space | Comment of Comment.kind option | Error of string | EOF; | 
| 24579 | 76 | |
| 27772 | 77 | datatype token = Token of Position.range * (token_kind * string); | 
| 78 | ||
| 79 | ||
| 80 | (* position *) | |
| 81 | ||
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 82 | fun set_range range (Token (_, x)) = Token (range, x); | 
| 59112 | 83 | fun range_of (Token (range, _)) = range; | 
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 84 | |
| 59112 | 85 | val pos_of = #1 o range_of; | 
| 86 | val end_pos_of = #2 o range_of; | |
| 24579 | 87 | |
| 88 | ||
| 58855 | 89 | (* stopper *) | 
| 24579 | 90 | |
| 27772 | 91 | fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); | 
| 92 | val eof = mk_eof Position.none; | |
| 24579 | 93 | |
| 94 | fun is_eof (Token (_, (EOF, _))) = true | |
| 95 | | is_eof _ = false; | |
| 96 | ||
| 27772 | 97 | val stopper = | 
| 30636 | 98 | Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; | 
| 27772 | 99 | |
| 24579 | 100 | |
| 27772 | 101 | (* token content *) | 
| 102 | ||
| 31426 | 103 | fun kind_of (Token (_, (k, _))) = k; | 
| 104 | ||
| 27817 | 105 | fun content_of (Token (_, (_, x))) = x; | 
| 106 | fun token_leq (tok, tok') = content_of tok <= content_of tok'; | |
| 27772 | 107 | |
| 55505 | 108 | fun is_keyword (Token (_, (Keyword, _))) = true | 
| 109 | | is_keyword _ = false; | |
| 110 | ||
| 111 | fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) | |
| 112 | | is_delimiter _ = false; | |
| 113 | ||
| 78701 | 114 | fun is_ident_with pred (Token (_, (Ident, x))) = pred x | 
| 115 | | is_ident_with _ _ = false; | |
| 116 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 117 | fun is_regular (Token (_, (Error _, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 118 | | is_regular (Token (_, (EOF, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 119 | | is_regular _ = true; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 120 | |
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 121 | fun is_improper (Token (_, (Space, _))) = true | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 122 | | is_improper (Token (_, (Comment _, _))) = true | 
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 123 | | is_improper _ = false; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 124 | |
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 125 | fun is_comment (Token (_, (Comment _, _))) = true | 
| 67362 | 126 | | is_comment _ = false; | 
| 127 | ||
| 68823 | 128 | fun warning_opaque tok = | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 129 | (case tok of | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 130 | Token (_, (Keyword, ":>")) => | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 131 |       warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
 | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 132 | \prefer non-opaque matching (:) possibly with abstype" ^ | 
| 48992 | 133 | Position.here (pos_of tok)) | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 134 | | _ => ()); | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 135 | |
| 31426 | 136 | fun check_content_of tok = | 
| 30636 | 137 | (case kind_of tok of | 
| 30645 | 138 | Error msg => error msg | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 139 | | _ => content_of tok); | 
| 30636 | 140 | |
| 59112 | 141 | |
| 142 | (* flatten *) | |
| 143 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 144 | fun flatten_content (tok :: (toks as tok' :: _)) = | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 145 | Symbol.escape (check_content_of tok) :: | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 146 | (if is_improper tok orelse is_improper tok' then flatten_content toks | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 147 | else Symbol.space :: flatten_content toks) | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 148 | | flatten_content toks = map (Symbol.escape o check_content_of) toks; | 
| 31332 | 149 | |
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 150 | val flatten = implode o flatten_content; | 
| 24596 | 151 | |
| 152 | ||
| 30614 | 153 | (* markup *) | 
| 154 | ||
| 37195 | 155 | local | 
| 156 | ||
| 68822 | 157 | val token_kind_markup = | 
| 59124 | 158 | fn Type_Var => (Markup.ML_tvar, "") | 
| 59082 | 159 | | Word => (Markup.ML_numeral, "") | 
| 160 | | Int => (Markup.ML_numeral, "") | |
| 161 | | Real => (Markup.ML_numeral, "") | |
| 162 | | Char => (Markup.ML_char, "") | |
| 68822 | 163 | | String => (Markup.ML_string, "") | 
| 164 | | Comment _ => (Markup.ML_comment, "") | |
| 64677 
8dc24130e8fe
more uniform treatment of "bad" like other messages (with serial number);
 wenzelm parents: 
64275diff
changeset | 165 | | Error msg => (Markup.bad (), msg) | 
| 59124 | 166 | | _ => (Markup.empty, ""); | 
| 30614 | 167 | |
| 37195 | 168 | in | 
| 169 | ||
| 68822 | 170 | fun token_report (tok as Token ((pos, _), (kind, x))) = | 
| 55505 | 171 | let | 
| 172 | val (markup, txt) = | |
| 68822 | 173 | if not (is_keyword tok) then token_kind_markup kind | 
| 55505 | 174 | else if is_delimiter tok then (Markup.ML_delimiter, "") | 
| 78687 | 175 | else if Symset.member keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "") | 
| 176 | else if Symset.member keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "") | |
| 66067 | 177 | else (Markup.ML_keyword1 |> Markup.keyword_properties, ""); | 
| 48768 
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
 wenzelm parents: 
48756diff
changeset | 178 | in ((pos, markup), txt) end; | 
| 37195 | 179 | |
| 180 | end; | |
| 30614 | 181 | |
| 182 | ||
| 24579 | 183 | |
| 184 | (** scanners **) | |
| 185 | ||
| 30573 | 186 | open Basic_Symbol_Pos; | 
| 24579 | 187 | |
| 55105 | 188 | val err_prefix = "SML lexical error: "; | 
| 189 | ||
| 190 | fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); | |
| 24579 | 191 | |
| 192 | ||
| 193 | (* identifiers *) | |
| 194 | ||
| 24596 | 195 | local | 
| 196 | ||
| 24579 | 197 | val scan_letdigs = | 
| 55496 | 198 | Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); | 
| 24579 | 199 | |
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 200 | val scan_alphanumeric = | 
| 61814 
1ca1142e1711
clarified type Token.src: plain token list, with usual implicit value assignment;
 wenzelm parents: 
61596diff
changeset | 201 | Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs; | 
| 24579 | 202 | |
| 78687 | 203 | val symbolic = Symset.make (raw_explode "!#$%&*+-/:<=>?@\\^`|~"); | 
| 204 | ||
| 205 | val scan_symbolic = Scan.many1 (Symset.member symbolic o Symbol_Pos.symbol); | |
| 24579 | 206 | |
| 24596 | 207 | in | 
| 208 | ||
| 24579 | 209 | val scan_ident = scan_alphanumeric || scan_symbolic; | 
| 210 | ||
| 55496 | 211 | val scan_long_ident = | 
| 61476 | 212 | Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "="); | 
| 24579 | 213 | |
| 55496 | 214 | val scan_type_var = $$$ "'" @@@ scan_letdigs; | 
| 24579 | 215 | |
| 24596 | 216 | end; | 
| 24579 | 217 | |
| 218 | ||
| 219 | (* numerals *) | |
| 220 | ||
| 24596 | 221 | local | 
| 222 | ||
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 223 | val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 224 | val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); | 
| 27772 | 225 | val scan_sign = Scan.optional ($$$ "~") []; | 
| 226 | val scan_decint = scan_sign @@@ scan_dec; | |
| 55496 | 227 | val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; | 
| 24579 | 228 | |
| 24596 | 229 | in | 
| 230 | ||
| 27772 | 231 | val scan_word = | 
| 232 | $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || | |
| 233 | $$$ "0" @@@ $$$ "w" @@@ scan_dec; | |
| 24579 | 234 | |
| 27772 | 235 | val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); | 
| 24579 | 236 | |
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 237 | val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) []; | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 238 | |
| 24579 | 239 | val scan_real = | 
| 27772 | 240 | scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || | 
| 241 | scan_decint @@@ scan_exp; | |
| 24579 | 242 | |
| 24596 | 243 | end; | 
| 244 | ||
| 24579 | 245 | |
| 246 | (* chars and strings *) | |
| 247 | ||
| 55496 | 248 | val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); | 
| 249 | ||
| 24596 | 250 | local | 
| 251 | ||
| 78687 | 252 | val escape = Symset.make (raw_explode "\"\\abtnvfr"); | 
| 253 | ||
| 24596 | 254 | val scan_escape = | 
| 78687 | 255 | Scan.one (Symset.member escape o Symbol_Pos.symbol) >> single || | 
| 64275 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
63936diff
changeset | 256 | $$$ "^" @@@ | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
63936diff
changeset | 257 | (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) || | 
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 258 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 259 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 260 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); | 
| 24596 | 261 | |
| 262 | val scan_str = | |
| 58854 | 263 | Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso | 
| 30600 
de241396389c
allow non-printable symbols within string tokens;
 wenzelm parents: 
30593diff
changeset | 264 | (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || | 
| 27772 | 265 | $$$ "\\" @@@ !!! "bad escape character in string" scan_escape; | 
| 24596 | 266 | |
| 27772 | 267 | val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; | 
| 61476 | 268 | val scan_gaps = Scan.repeats scan_gap; | 
| 24579 | 269 | |
| 24596 | 270 | in | 
| 24579 | 271 | |
| 272 | val scan_char = | |
| 27772 | 273 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; | 
| 24579 | 274 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 275 | val recover_char = | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 276 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 277 | |
| 24579 | 278 | val scan_string = | 
| 55103 | 279 | Scan.ahead ($$ "\"") |-- | 
| 280 | !!! "unclosed string literal" | |
| 61476 | 281 | ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\""); | 
| 24596 | 282 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 283 | val recover_string = | 
| 61476 | 284 | $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str); | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 285 | |
| 24596 | 286 | end; | 
| 24579 | 287 | |
| 288 | ||
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 289 | (* rat antiquotation *) | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 290 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 291 | val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 292 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 293 | val scan_rat_antiq = | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 294 | Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 295 | >> (fn ((pos1, (pos2, body)), pos3) => | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 296 |       {start = Position.range_position (pos1, pos2),
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 297 | stop = Position.none, | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 298 | range = Position.range (pos1, pos3), | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 299 | body = rat_name @ body}); | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 300 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 301 | |
| 30645 | 302 | (* scan tokens *) | 
| 24579 | 303 | |
| 304 | local | |
| 305 | ||
| 30593 | 306 | fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); | 
| 24579 | 307 | |
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 308 | val scan_sml = | 
| 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 309 | scan_char >> token Char || | 
| 27772 | 310 | scan_string >> token String || | 
| 311 | scan_blanks1 >> token Space || | |
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 312 | Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) || | 
| 27772 | 313 | Scan.max token_leq | 
| 55505 | 314 | (Scan.literal lexicon >> token Keyword) | 
| 27772 | 315 | (scan_word >> token Word || | 
| 316 | scan_real >> token Real || | |
| 317 | scan_int >> token Int || | |
| 59082 | 318 | scan_long_ident >> token Long_Ident || | 
| 27772 | 319 | scan_ident >> token Ident || | 
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 320 | scan_type_var >> token Type_Var); | 
| 59112 | 321 | |
| 68823 | 322 | val scan_sml_antiq = scan_sml >> Antiquote.Text; | 
| 323 | ||
| 59112 | 324 | val scan_ml_antiq = | 
| 69891 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69842diff
changeset | 325 | Comment.scan_inner >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) || | 
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 326 | Antiquote.scan_control Antiquote.err_prefix >> Antiquote.Control || | 
| 59112 | 327 | Antiquote.scan_antiq >> Antiquote.Antiq || | 
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 328 | scan_rat_antiq >> Antiquote.Antiq || | 
| 68823 | 329 | scan_sml_antiq; | 
| 30645 | 330 | |
| 27772 | 331 | fun recover msg = | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 332 | (recover_char || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 333 | recover_string || | 
| 59112 | 334 | Symbol_Pos.recover_cartouche || | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 335 | Symbol_Pos.recover_comment || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 336 | Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 337 | >> (single o token (Error msg)); | 
| 24579 | 338 | |
| 68823 | 339 | fun reader {opaque_warning} scan syms =
 | 
| 37198 
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
 wenzelm parents: 
37195diff
changeset | 340 | let | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 341 | val termination = | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 342 | if null syms then [] | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 343 | else | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 344 | let | 
| 74174 | 345 | val pos1 = List.last syms |-> Position.symbol; | 
| 346 | val pos2 = Position.symbol Symbol.space pos1; | |
| 62797 | 347 | in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 348 | |
| 68823 | 349 | fun check (Antiquote.Text tok) = | 
| 350 | (check_content_of tok; if opaque_warning then warning_opaque tok else ()) | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 351 | | check _ = (); | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 352 | val input = | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 353 | Source.of_list syms | 
| 58864 | 354 | |> Source.source Symbol_Pos.stopper | 
| 355 | (Scan.recover (Scan.bulk (!!! "bad input" scan)) | |
| 356 | (fn msg => recover msg >> map Antiquote.Text)) | |
| 61482 | 357 | |> Source.exhaust; | 
| 358 | val _ = Position.reports (Antiquote.antiq_reports input); | |
| 68822 | 359 | val _ = Position.reports_text (maps (fn Antiquote.Text t => [token_report t] | _ => []) input); | 
| 61482 | 360 | val _ = List.app check input; | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 361 | in input @ termination end; | 
| 30645 | 362 | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 363 | in | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 364 | |
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 365 | fun source src = | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 366 | Symbol_Pos.source (Position.line 1) src | 
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 367 | |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover); | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 368 | |
| 63936 | 369 | val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust; | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 370 | fun tokenize_range range = tokenize #> map (set_range range); | 
| 74291 
b83fa8f3a271
ML antiquotations for type constructors and term constants;
 wenzelm parents: 
74174diff
changeset | 371 | val tokenize_no_range = tokenize_range Position.no_range; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 372 | |
| 68823 | 373 | val read_text = reader {opaque_warning = true} scan_ml_antiq o Symbol_Pos.explode;
 | 
| 374 | fun read text = read_text (text, Position.none); | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 375 | |
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 376 | fun read_range range = | 
| 59067 | 377 | read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); | 
| 58978 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 wenzelm parents: 
58933diff
changeset | 378 | |
| 68823 | 379 | fun read_source' {language, symbols, opaque_warning} scan source =
 | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 380 | let | 
| 59064 | 381 | val pos = Input.pos_of source; | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 382 | val _ = | 
| 59064 | 383 | if Position.is_reported_range pos | 
| 384 | then Position.report pos (language (Input.is_delimited source)) | |
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 385 | else (); | 
| 69842 | 386 | in | 
| 387 | Input.source_explode source | |
| 388 | |> not symbols ? maps (fn (s, p) => raw_explode s |> map (rpair p)) | |
| 389 |     |> reader {opaque_warning = opaque_warning} scan
 | |
| 390 | end; | |
| 68823 | 391 | |
| 392 | val read_source = | |
| 393 |   read_source' {language = Markup.language_ML, symbols = true, opaque_warning = true}
 | |
| 394 | scan_ml_antiq; | |
| 395 | ||
| 396 | val read_source_sml = | |
| 397 |   read_source' {language = Markup.language_SML, symbols = false, opaque_warning = false}
 | |
| 398 | scan_sml_antiq; | |
| 55828 
42ac3cfb89f6
clarified language markup: added "delimited" property;
 wenzelm parents: 
55612diff
changeset | 399 | |
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 400 | val read_symbols = reader {opaque_warning = true} scan_ml_antiq;
 | 
| 69841 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 401 | val read_symbols_sml = reader {opaque_warning = false} scan_sml_antiq;
 | 
| 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 402 | |
| 24579 | 403 | end; | 
| 404 | ||
| 24596 | 405 | end; |