| author | wenzelm | 
| Wed, 26 Oct 2022 15:02:11 +0200 | |
| changeset 76378 | c0566b6f6ca8 | 
| parent 74373 | 6e4093927dbb | 
| child 78687 | 5fe4c11b5ecb | 
| permissions | -rw-r--r-- | 
| 24579 | 1 | (* Title: Pure/ML/ml_lex.ML | 
| 2 | Author: Makarius | |
| 3 | ||
| 59109 | 4 | Lexical syntax for Isabelle/ML and Standard ML. | 
| 24579 | 5 | *) | 
| 6 | ||
| 7 | signature ML_LEX = | |
| 8 | sig | |
| 55505 | 9 | val keywords: string list | 
| 24579 | 10 | datatype token_kind = | 
| 59082 | 11 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 12 | Space | Comment of Comment.kind option | Error of string | EOF | 
| 24579 | 13 | eqtype token | 
| 27732 | 14 | val stopper: token Scan.stopper | 
| 24596 | 15 | val is_regular: token -> bool | 
| 16 | val is_improper: token -> bool | |
| 67362 | 17 | val is_comment: token -> bool | 
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 18 | val set_range: Position.range -> token -> token | 
| 59112 | 19 | val range_of: token -> Position.range | 
| 30636 | 20 | val pos_of: token -> Position.T | 
| 31474 | 21 | val end_pos_of: token -> Position.T | 
| 24579 | 22 | val kind_of: token -> token_kind | 
| 27817 | 23 | val content_of: token -> string | 
| 31426 | 24 | val check_content_of: token -> string | 
| 31332 | 25 | val flatten: token list -> string | 
| 24596 | 26 | val source: (Symbol.symbol, 'a) Source.source -> | 
| 30573 | 27 | (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) | 
| 27772 | 28 | Source.source) Source.source | 
| 30591 | 29 | val tokenize: string -> token list | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 30 | val tokenize_range: Position.range -> string -> token list | 
| 74291 
b83fa8f3a271
ML antiquotations for type constructors and term constants;
 wenzelm parents: 
74174diff
changeset | 31 | val tokenize_no_range: string -> token list | 
| 68823 | 32 | val read_text: Symbol_Pos.text * Position.T -> token Antiquote.antiquote list | 
| 59067 | 33 | val read: Symbol_Pos.text -> token Antiquote.antiquote list | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 34 | val read_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list | 
| 68823 | 35 |   val read_source': {language: bool -> Markup.T, opaque_warning: bool, symbols: bool} ->
 | 
| 36 | token Antiquote.antiquote Symbol_Pos.scanner -> Input.source -> token Antiquote.antiquote list | |
| 37 | val read_source: Input.source -> token Antiquote.antiquote list | |
| 38 | val read_source_sml: Input.source -> token Antiquote.antiquote list | |
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 39 | val read_symbols: Symbol_Pos.T list -> token Antiquote.antiquote list | 
| 69841 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 40 | val read_symbols_sml: Symbol_Pos.T list -> token Antiquote.antiquote list | 
| 24579 | 41 | end; | 
| 42 | ||
| 43 | structure ML_Lex: ML_LEX = | |
| 44 | struct | |
| 45 | ||
| 55505 | 46 | (** keywords **) | 
| 47 | ||
| 48 | val keywords = | |
| 49 |  ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
 | |
| 50 |   "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
 | |
| 51 | "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", | |
| 52 | "functor", "handle", "if", "in", "include", "infix", "infixr", | |
| 53 | "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", | |
| 54 | "rec", "sharing", "sig", "signature", "struct", "structure", "then", | |
| 55 | "type", "val", "where", "while", "with", "withtype"]; | |
| 56 | ||
| 57 | val keywords2 = | |
| 58933 | 58 | ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", | 
| 55505 | 59 | "sig", "struct", "then", "while", "with"]; | 
| 60 | ||
| 61 | val keywords3 = | |
| 62 | ["handle", "open", "raise"]; | |
| 63 | ||
| 64 | val lexicon = Scan.make_lexicon (map raw_explode keywords); | |
| 65 | ||
| 66 | ||
| 67 | ||
| 24579 | 68 | (** tokens **) | 
| 69 | ||
| 70 | (* datatype token *) | |
| 71 | ||
| 72 | datatype token_kind = | |
| 59082 | 73 | Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 74 | Space | Comment of Comment.kind option | Error of string | EOF; | 
| 24579 | 75 | |
| 27772 | 76 | datatype token = Token of Position.range * (token_kind * string); | 
| 77 | ||
| 78 | ||
| 79 | (* position *) | |
| 80 | ||
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 81 | fun set_range range (Token (_, x)) = Token (range, x); | 
| 59112 | 82 | fun range_of (Token (range, _)) = range; | 
| 30683 
e8ac1f9d9469
datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
 wenzelm parents: 
30645diff
changeset | 83 | |
| 59112 | 84 | val pos_of = #1 o range_of; | 
| 85 | val end_pos_of = #2 o range_of; | |
| 24579 | 86 | |
| 87 | ||
| 58855 | 88 | (* stopper *) | 
| 24579 | 89 | |
| 27772 | 90 | fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); | 
| 91 | val eof = mk_eof Position.none; | |
| 24579 | 92 | |
| 93 | fun is_eof (Token (_, (EOF, _))) = true | |
| 94 | | is_eof _ = false; | |
| 95 | ||
| 27772 | 96 | val stopper = | 
| 30636 | 97 | Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; | 
| 27772 | 98 | |
| 24579 | 99 | |
| 27772 | 100 | (* token content *) | 
| 101 | ||
| 31426 | 102 | fun kind_of (Token (_, (k, _))) = k; | 
| 103 | ||
| 27817 | 104 | fun content_of (Token (_, (_, x))) = x; | 
| 105 | fun token_leq (tok, tok') = content_of tok <= content_of tok'; | |
| 27772 | 106 | |
| 55505 | 107 | fun is_keyword (Token (_, (Keyword, _))) = true | 
| 108 | | is_keyword _ = false; | |
| 109 | ||
| 110 | fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) | |
| 111 | | is_delimiter _ = false; | |
| 112 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 113 | fun is_regular (Token (_, (Error _, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 114 | | is_regular (Token (_, (EOF, _))) = false | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 115 | | is_regular _ = true; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 116 | |
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 117 | fun is_improper (Token (_, (Space, _))) = true | 
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 118 | | is_improper (Token (_, (Comment _, _))) = true | 
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 119 | | is_improper _ = false; | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 120 | |
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 121 | fun is_comment (Token (_, (Comment _, _))) = true | 
| 67362 | 122 | | is_comment _ = false; | 
| 123 | ||
| 68823 | 124 | fun warning_opaque tok = | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 125 | (case tok of | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 126 | Token (_, (Keyword, ":>")) => | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 127 |       warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
 | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 128 | \prefer non-opaque matching (:) possibly with abstype" ^ | 
| 48992 | 129 | Position.here (pos_of tok)) | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 130 | | _ => ()); | 
| 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 131 | |
| 31426 | 132 | fun check_content_of tok = | 
| 30636 | 133 | (case kind_of tok of | 
| 30645 | 134 | Error msg => error msg | 
| 40337 
d25bbb5f1c9e
warn in correlation with report -- avoid spurious message duplicates;
 wenzelm parents: 
40319diff
changeset | 135 | | _ => content_of tok); | 
| 30636 | 136 | |
| 59112 | 137 | |
| 138 | (* flatten *) | |
| 139 | ||
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 140 | fun flatten_content (tok :: (toks as tok' :: _)) = | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 141 | Symbol.escape (check_content_of tok) :: | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 142 | (if is_improper tok orelse is_improper tok' then flatten_content toks | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 143 | else Symbol.space :: flatten_content toks) | 
| 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 144 | | flatten_content toks = map (Symbol.escape o check_content_of) toks; | 
| 31332 | 145 | |
| 41502 
967cbbc77abd
refined ML_Lex.flatten: ensure proper separation of tokens;
 wenzelm parents: 
40627diff
changeset | 146 | val flatten = implode o flatten_content; | 
| 24596 | 147 | |
| 148 | ||
| 30614 | 149 | (* markup *) | 
| 150 | ||
| 37195 | 151 | local | 
| 152 | ||
| 68822 | 153 | val token_kind_markup = | 
| 59124 | 154 | fn Type_Var => (Markup.ML_tvar, "") | 
| 59082 | 155 | | Word => (Markup.ML_numeral, "") | 
| 156 | | Int => (Markup.ML_numeral, "") | |
| 157 | | Real => (Markup.ML_numeral, "") | |
| 158 | | Char => (Markup.ML_char, "") | |
| 68822 | 159 | | String => (Markup.ML_string, "") | 
| 160 | | Comment _ => (Markup.ML_comment, "") | |
| 64677 
8dc24130e8fe
more uniform treatment of "bad" like other messages (with serial number);
 wenzelm parents: 
64275diff
changeset | 161 | | Error msg => (Markup.bad (), msg) | 
| 59124 | 162 | | _ => (Markup.empty, ""); | 
| 30614 | 163 | |
| 37195 | 164 | in | 
| 165 | ||
| 68822 | 166 | fun token_report (tok as Token ((pos, _), (kind, x))) = | 
| 55505 | 167 | let | 
| 168 | val (markup, txt) = | |
| 68822 | 169 | if not (is_keyword tok) then token_kind_markup kind | 
| 55505 | 170 | else if is_delimiter tok then (Markup.ML_delimiter, "") | 
| 66067 | 171 | else if member (op =) keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "") | 
| 172 | else if member (op =) keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "") | |
| 173 | else (Markup.ML_keyword1 |> Markup.keyword_properties, ""); | |
| 48768 
abc45de5bb22
clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
 wenzelm parents: 
48756diff
changeset | 174 | in ((pos, markup), txt) end; | 
| 37195 | 175 | |
| 176 | end; | |
| 30614 | 177 | |
| 178 | ||
| 24579 | 179 | |
| 180 | (** scanners **) | |
| 181 | ||
| 30573 | 182 | open Basic_Symbol_Pos; | 
| 24579 | 183 | |
| 55105 | 184 | val err_prefix = "SML lexical error: "; | 
| 185 | ||
| 186 | fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); | |
| 24579 | 187 | |
| 188 | ||
| 189 | (* identifiers *) | |
| 190 | ||
| 24596 | 191 | local | 
| 192 | ||
| 24579 | 193 | val scan_letdigs = | 
| 55496 | 194 | Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); | 
| 24579 | 195 | |
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 196 | val scan_alphanumeric = | 
| 61814 
1ca1142e1711
clarified type Token.src: plain token list, with usual implicit value assignment;
 wenzelm parents: 
61596diff
changeset | 197 | Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs; | 
| 24579 | 198 | |
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 199 | val scan_symbolic = | 
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40525diff
changeset | 200 | Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol); | 
| 24579 | 201 | |
| 24596 | 202 | in | 
| 203 | ||
| 24579 | 204 | val scan_ident = scan_alphanumeric || scan_symbolic; | 
| 205 | ||
| 55496 | 206 | val scan_long_ident = | 
| 61476 | 207 | Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "="); | 
| 24579 | 208 | |
| 55496 | 209 | val scan_type_var = $$$ "'" @@@ scan_letdigs; | 
| 24579 | 210 | |
| 24596 | 211 | end; | 
| 24579 | 212 | |
| 213 | ||
| 214 | (* numerals *) | |
| 215 | ||
| 24596 | 216 | local | 
| 217 | ||
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 218 | val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 219 | val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); | 
| 27772 | 220 | val scan_sign = Scan.optional ($$$ "~") []; | 
| 221 | val scan_decint = scan_sign @@@ scan_dec; | |
| 55496 | 222 | val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; | 
| 24579 | 223 | |
| 24596 | 224 | in | 
| 225 | ||
| 27772 | 226 | val scan_word = | 
| 227 | $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || | |
| 228 | $$$ "0" @@@ $$$ "w" @@@ scan_dec; | |
| 24579 | 229 | |
| 27772 | 230 | val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); | 
| 24579 | 231 | |
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 232 | val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) []; | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 233 | |
| 24579 | 234 | val scan_real = | 
| 27772 | 235 | scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || | 
| 236 | scan_decint @@@ scan_exp; | |
| 24579 | 237 | |
| 24596 | 238 | end; | 
| 239 | ||
| 24579 | 240 | |
| 241 | (* chars and strings *) | |
| 242 | ||
| 55496 | 243 | val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); | 
| 244 | ||
| 24596 | 245 | local | 
| 246 | ||
| 247 | val scan_escape = | |
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40525diff
changeset | 248 | Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single || | 
| 64275 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
63936diff
changeset | 249 | $$$ "^" @@@ | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
63936diff
changeset | 250 | (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) || | 
| 40525 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 251 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 252 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- | 
| 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
 wenzelm parents: 
40523diff
changeset | 253 | Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); | 
| 24596 | 254 | |
| 255 | val scan_str = | |
| 58854 | 256 | Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso | 
| 30600 
de241396389c
allow non-printable symbols within string tokens;
 wenzelm parents: 
30593diff
changeset | 257 | (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || | 
| 27772 | 258 | $$$ "\\" @@@ !!! "bad escape character in string" scan_escape; | 
| 24596 | 259 | |
| 27772 | 260 | val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; | 
| 61476 | 261 | val scan_gaps = Scan.repeats scan_gap; | 
| 24579 | 262 | |
| 24596 | 263 | in | 
| 24579 | 264 | |
| 265 | val scan_char = | |
| 27772 | 266 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; | 
| 24579 | 267 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 268 | val recover_char = | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 269 | $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 270 | |
| 24579 | 271 | val scan_string = | 
| 55103 | 272 | Scan.ahead ($$ "\"") |-- | 
| 273 | !!! "unclosed string literal" | |
| 61476 | 274 | ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\""); | 
| 24596 | 275 | |
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 276 | val recover_string = | 
| 61476 | 277 | $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str); | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 278 | |
| 24596 | 279 | end; | 
| 24579 | 280 | |
| 281 | ||
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 282 | (* rat antiquotation *) | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 283 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 284 | val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 285 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 286 | val scan_rat_antiq = | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 287 | Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 288 | >> (fn ((pos1, (pos2, body)), pos3) => | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 289 |       {start = Position.range_position (pos1, pos2),
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 290 | stop = Position.none, | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 291 | range = Position.range (pos1, pos3), | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 292 | body = rat_name @ body}); | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 293 | |
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 294 | |
| 30645 | 295 | (* scan tokens *) | 
| 24579 | 296 | |
| 297 | local | |
| 298 | ||
| 30593 | 299 | fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); | 
| 24579 | 300 | |
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 301 | val scan_sml = | 
| 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 302 | scan_char >> token Char || | 
| 27772 | 303 | scan_string >> token String || | 
| 304 | scan_blanks1 >> token Space || | |
| 67425 
7d4a088dbc0e
clarified modules: uniform notion of formal comments;
 wenzelm parents: 
67363diff
changeset | 305 | Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) || | 
| 27772 | 306 | Scan.max token_leq | 
| 55505 | 307 | (Scan.literal lexicon >> token Keyword) | 
| 27772 | 308 | (scan_word >> token Word || | 
| 309 | scan_real >> token Real || | |
| 310 | scan_int >> token Int || | |
| 59082 | 311 | scan_long_ident >> token Long_Ident || | 
| 27772 | 312 | scan_ident >> token Ident || | 
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 313 | scan_type_var >> token Type_Var); | 
| 59112 | 314 | |
| 68823 | 315 | val scan_sml_antiq = scan_sml >> Antiquote.Text; | 
| 316 | ||
| 59112 | 317 | val scan_ml_antiq = | 
| 69891 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69842diff
changeset | 318 | Comment.scan_inner >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) || | 
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 319 | Antiquote.scan_control Antiquote.err_prefix >> Antiquote.Control || | 
| 59112 | 320 | Antiquote.scan_antiq >> Antiquote.Antiq || | 
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
62797diff
changeset | 321 | scan_rat_antiq >> Antiquote.Antiq || | 
| 68823 | 322 | scan_sml_antiq; | 
| 30645 | 323 | |
| 27772 | 324 | fun recover msg = | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 325 | (recover_char || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 326 | recover_string || | 
| 59112 | 327 | Symbol_Pos.recover_cartouche || | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 328 | Symbol_Pos.recover_comment || | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 329 | Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) | 
| 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
45666diff
changeset | 330 | >> (single o token (Error msg)); | 
| 24579 | 331 | |
| 68823 | 332 | fun reader {opaque_warning} scan syms =
 | 
| 37198 
3af985b10550
replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
 wenzelm parents: 
37195diff
changeset | 333 | let | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 334 | val termination = | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 335 | if null syms then [] | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 336 | else | 
| 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 337 | let | 
| 74174 | 338 | val pos1 = List.last syms |-> Position.symbol; | 
| 339 | val pos2 = Position.symbol Symbol.space pos1; | |
| 62797 | 340 | in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 341 | |
| 68823 | 342 | fun check (Antiquote.Text tok) = | 
| 343 | (check_content_of tok; if opaque_warning then warning_opaque tok else ()) | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 344 | | check _ = (); | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 345 | val input = | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 346 | Source.of_list syms | 
| 58864 | 347 | |> Source.source Symbol_Pos.stopper | 
| 348 | (Scan.recover (Scan.bulk (!!! "bad input" scan)) | |
| 349 | (fn msg => recover msg >> map Antiquote.Text)) | |
| 61482 | 350 | |> Source.exhaust; | 
| 351 | val _ = Position.reports (Antiquote.antiq_reports input); | |
| 68822 | 352 | val _ = Position.reports_text (maps (fn Antiquote.Text t => [token_report t] | _ => []) input); | 
| 61482 | 353 | val _ = List.app check input; | 
| 37209 
1c8cf0048934
terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
 wenzelm parents: 
37198diff
changeset | 354 | in input @ termination end; | 
| 30645 | 355 | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 356 | in | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 357 | |
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 358 | fun source src = | 
| 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 359 | Symbol_Pos.source (Position.line 1) src | 
| 67427 
5409cfd41e7f
clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
 wenzelm parents: 
67425diff
changeset | 360 | |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover); | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 361 | |
| 63936 | 362 | val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust; | 
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 363 | fun tokenize_range range = tokenize #> map (set_range range); | 
| 74291 
b83fa8f3a271
ML antiquotations for type constructors and term constants;
 wenzelm parents: 
74174diff
changeset | 364 | val tokenize_no_range = tokenize_range Position.no_range; | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 365 | |
| 68823 | 366 | val read_text = reader {opaque_warning = true} scan_ml_antiq o Symbol_Pos.explode;
 | 
| 367 | fun read text = read_text (text, Position.none); | |
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55828diff
changeset | 368 | |
| 73549 
a2c589d5e1e4
clarified signature: more detailed token positions for antiquotations;
 wenzelm parents: 
69891diff
changeset | 369 | fun read_range range = | 
| 59067 | 370 | read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); | 
| 58978 
e42da880c61e
more position information, e.g. relevant for errors in generated ML source;
 wenzelm parents: 
58933diff
changeset | 371 | |
| 68823 | 372 | fun read_source' {language, symbols, opaque_warning} scan source =
 | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 373 | let | 
| 59064 | 374 | val pos = Input.pos_of source; | 
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 375 | val _ = | 
| 59064 | 376 | if Position.is_reported_range pos | 
| 377 | then Position.report pos (language (Input.is_delimited source)) | |
| 56459 
38d0b2099743
no report for position singularity, notably for aux. file, especially empty one;
 wenzelm parents: 
56278diff
changeset | 378 | else (); | 
| 69842 | 379 | in | 
| 380 | Input.source_explode source | |
| 381 | |> not symbols ? maps (fn (s, p) => raw_explode s |> map (rpair p)) | |
| 382 |     |> reader {opaque_warning = opaque_warning} scan
 | |
| 383 | end; | |
| 68823 | 384 | |
| 385 | val read_source = | |
| 386 |   read_source' {language = Markup.language_ML, symbols = true, opaque_warning = true}
 | |
| 387 | scan_ml_antiq; | |
| 388 | ||
| 389 | val read_source_sml = | |
| 390 |   read_source' {language = Markup.language_SML, symbols = false, opaque_warning = false}
 | |
| 391 | scan_sml_antiq; | |
| 55828 
42ac3cfb89f6
clarified language markup: added "delimited" property;
 wenzelm parents: 
55612diff
changeset | 392 | |
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
74291diff
changeset | 393 | val read_symbols = reader {opaque_warning = true} scan_ml_antiq;
 | 
| 69841 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 394 | val read_symbols_sml = reader {opaque_warning = false} scan_sml_antiq;
 | 
| 
b3c9291b5fc7
clarified signature, notably for hol4isabelle (by Fabian Immler);
 wenzelm parents: 
68823diff
changeset | 395 | |
| 24579 | 396 | end; | 
| 397 | ||
| 24596 | 398 | end; |