| author | nipkow | 
| Mon, 07 Jul 2003 17:58:21 +0200 | |
| changeset 14091 | ad6ba9c55190 | 
| parent 10748 | 74ed77fa5310 | 
| child 14729 | 0e987111a17e | 
| permissions | -rw-r--r-- | 
| 5825 | 1 | (* Title: Pure/Isar/outer_lex.ML | 
| 2 | ID: $Id$ | |
| 3 | Author: Markus Wenzel, TU Muenchen | |
| 8807 | 4 | License: GPL (GNU GENERAL PUBLIC LICENSE) | 
| 5825 | 5 | |
| 6 | Outer lexical syntax for Isabelle/Isar. | |
| 7 | *) | |
| 8 | ||
| 9 | signature OUTER_LEX = | |
| 10 | sig | |
| 11 | datatype token_kind = | |
| 7477 | 12 | Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | | 
| 10748 | 13 | Nat | String | Verbatim | Space | Comment | Sync | Malformed | EOF | 
| 5825 | 14 | type token | 
| 15 | val str_of_kind: token_kind -> string | |
| 16 | val stopper: token * (token -> bool) | |
| 6859 | 17 | val not_sync: token -> bool | 
| 5825 | 18 | val not_eof: token -> bool | 
| 19 | val position_of: token -> Position.T | |
| 20 | val pos_of: token -> string | |
| 21 | val is_kind: token_kind -> token -> bool | |
| 7026 | 22 | val keyword_with: (string -> bool) -> token -> bool | 
| 5825 | 23 | val name_of: token -> string | 
| 24 | val is_proper: token -> bool | |
| 9130 | 25 | val is_semicolon: token -> bool | 
| 8580 | 26 | val is_begin_ignore: token -> bool | 
| 27 | val is_end_ignore: token -> bool | |
| 8651 | 28 | val is_newline: token -> bool | 
| 7902 | 29 | val is_indent: token -> bool | 
| 5825 | 30 | val val_of: token -> string | 
| 5876 | 31 | val is_sid: string -> bool | 
| 9130 | 32 | val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b | 
| 33 |   val incr_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
 | |
| 34 |   val keep_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
 | |
| 35 | val scan_blank: Position.T * Symbol.symbol list | |
| 36 | -> Symbol.symbol * (Position.T * Symbol.symbol list) | |
| 37 | val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list) | |
| 7026 | 38 | val scan: (Scan.lexicon * Scan.lexicon) -> | 
| 5825 | 39 | Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list) | 
| 7026 | 40 | val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) -> | 
| 41 | Position.T -> (Symbol.symbol, 'a) Source.source -> | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 42 | (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source | 
| 9130 | 43 | val source_proper: (token, 'a) Source.source -> (token, (token, 'a) Source.source) Source.source | 
| 44 | val make_lexicon: string list -> Scan.lexicon | |
| 5825 | 45 | end; | 
| 46 | ||
| 47 | structure OuterLex: OUTER_LEX = | |
| 48 | struct | |
| 49 | ||
| 50 | ||
| 51 | (** tokens **) | |
| 52 | ||
| 53 | (* datatype token *) | |
| 54 | ||
| 55 | datatype token_kind = | |
| 7477 | 56 | Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | | 
| 10748 | 57 | Nat | String | Verbatim | Space | Comment | Sync | Malformed | EOF; | 
| 5825 | 58 | |
| 59 | datatype token = Token of Position.T * (token_kind * string); | |
| 60 | ||
| 61 | val str_of_kind = | |
| 7026 | 62 | fn Command => "command" | 
| 63 | | Keyword => "keyword" | |
| 5825 | 64 | | Ident => "identifier" | 
| 65 | | LongIdent => "long identifier" | |
| 66 | | SymIdent => "symbolic identifier" | |
| 67 | | Var => "schematic variable" | |
| 68 | | TypeIdent => "type variable" | |
| 69 | | TypeVar => "schematic type variable" | |
| 70 | | Nat => "number" | |
| 71 | | String => "string" | |
| 72 | | Verbatim => "verbatim text" | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 73 | | Space => "white space" | 
| 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 74 | | Comment => "comment text" | 
| 6859 | 75 | | Sync => "sync marker" | 
| 10748 | 76 | | Malformed => "bad input" | 
| 5825 | 77 | | EOF => "end-of-file"; | 
| 78 | ||
| 79 | ||
| 10748 | 80 | (* control tokens *) | 
| 6859 | 81 | |
| 82 | fun not_sync (Token (_, (Sync, _))) = false | |
| 83 | | not_sync _ = true; | |
| 84 | ||
| 10748 | 85 | val malformed = Token (Position.none, (Malformed, "")); | 
| 86 | ||
| 6859 | 87 | |
| 5825 | 88 | (* eof token *) | 
| 89 | ||
| 90 | val eof = Token (Position.none, (EOF, "")); | |
| 91 | ||
| 92 | fun is_eof (Token (_, (EOF, _))) = true | |
| 93 | | is_eof _ = false; | |
| 94 | ||
| 95 | val stopper = (eof, is_eof); | |
| 96 | val not_eof = not o is_eof; | |
| 97 | ||
| 98 | ||
| 99 | (* get position *) | |
| 100 | ||
| 101 | fun position_of (Token (pos, _)) = pos; | |
| 102 | val pos_of = Position.str_of o position_of; | |
| 103 | ||
| 104 | ||
| 105 | (* kind of token *) | |
| 106 | ||
| 107 | fun is_kind k (Token (_, (k', _))) = k = k'; | |
| 108 | ||
| 7026 | 109 | fun keyword_with pred (Token (_, (Keyword, x))) = pred x | 
| 110 | | keyword_with _ _ = false; | |
| 5825 | 111 | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 112 | fun is_proper (Token (_, (Space, _))) = false | 
| 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 113 | | is_proper (Token (_, (Comment, _))) = false | 
| 5825 | 114 | | is_proper _ = true; | 
| 115 | ||
| 9195 | 116 | fun is_semicolon (Token (_, (Keyword, ";"))) = true | 
| 9130 | 117 | | is_semicolon _ = false; | 
| 118 | ||
| 8580 | 119 | fun is_begin_ignore (Token (_, (Comment, "<"))) = true | 
| 120 | | is_begin_ignore _ = false; | |
| 121 | ||
| 122 | fun is_end_ignore (Token (_, (Comment, ">"))) = true | |
| 123 | | is_end_ignore _ = false; | |
| 124 | ||
| 8651 | 125 | |
| 126 | (* newline and indentations (note that space tokens obey lines) *) | |
| 127 | ||
| 128 | fun is_newline (Token (_, (Space, "\n"))) = true | |
| 129 | | is_newline _ = false; | |
| 130 | ||
| 7902 | 131 | fun is_indent (Token (_, (Space, s))) = | 
| 132 | let val n = size s in n > 0 andalso String.substring (s, n - 1, 1) <> "\n" end | |
| 133 | | is_indent _ = false; | |
| 134 | ||
| 5825 | 135 | |
| 9155 | 136 | (* name and value of token *) | 
| 137 | ||
| 138 | fun name_of (tok as Token (_, (k, x))) = | |
| 10748 | 139 | if is_semicolon tok then "terminator" | 
| 140 | else if x = "" then str_of_kind k | |
| 141 | else str_of_kind k ^ " " ^ quote x; | |
| 5825 | 142 | |
| 143 | fun val_of (Token (_, (_, x))) = x; | |
| 144 | ||
| 145 | fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x'; | |
| 146 | ||
| 147 | ||
| 148 | ||
| 149 | (** scanners **) | |
| 150 | ||
| 151 | fun change_prompt scan = Scan.prompt "# " scan; | |
| 152 | ||
| 153 | ||
| 154 | (* diagnostics *) | |
| 155 | ||
| 156 | fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs; | |
| 9130 | 157 | fun !!! msg scan = Scan.!! (lex_err (K msg)) scan; | 
| 5825 | 158 | |
| 159 | ||
| 160 | (* line numbering *) | |
| 161 | ||
| 162 | fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos)); | |
| 163 | val keep_line = Scan.lift; | |
| 164 | ||
| 165 | val scan_blank = | |
| 166 | incr_line ($$ "\n") || | |
| 167 | keep_line (Scan.one Symbol.is_blank); | |
| 168 | ||
| 169 | ||
| 170 | (* scan symbolic idents *) | |
| 171 | ||
| 172 | val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~"; | |
| 173 | fun is_sym_char s = s mem sym_chars; | |
| 174 | ||
| 8231 | 175 | val scan_symid = | 
| 176 | Scan.any1 is_sym_char >> implode || | |
| 177 | Scan.one Symbol.is_symbolic; | |
| 5825 | 178 | |
| 8231 | 179 | fun is_symid str = | 
| 180 | (case try Symbol.explode str of | |
| 181 | Some [s] => Symbol.is_symbolic s orelse is_sym_char s | |
| 182 | | Some ss => forall is_sym_char ss | |
| 183 | | _ => false); | |
| 184 | ||
| 5876 | 185 | val is_sid = is_symid orf Syntax.is_identifier; | 
| 5825 | 186 | |
| 187 | ||
| 188 | (* scan strings *) | |
| 189 | ||
| 190 | val scan_str = | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 191 | scan_blank || | 
| 9130 | 192 | keep_line ($$ "\\") |-- !!! "bad escape character in string" | 
| 9051 | 193 | (scan_blank || keep_line ($$ "\"" || $$ "\\")) || | 
| 6859 | 194 | keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf | 
| 195 | Symbol.not_sync andf Symbol.not_eof)); | |
| 5825 | 196 | |
| 197 | val scan_string = | |
| 198 | keep_line ($$ "\"") |-- | |
| 9130 | 199 | !!! "missing quote at end of string" | 
| 5825 | 200 | (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\""))); | 
| 201 | ||
| 202 | ||
| 203 | (* scan verbatim text *) | |
| 204 | ||
| 205 | val scan_verb = | |
| 206 | scan_blank || | |
| 6743 
5d50225637c8
changed {| |} verbatim syntax to {* *} in order to simplify ProofGeneral setup;
 wenzelm parents: 
5876diff
changeset | 207 | keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal "}"))) || | 
| 6859 | 208 | keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof)); | 
| 5825 | 209 | |
| 210 | val scan_verbatim = | |
| 6743 
5d50225637c8
changed {| |} verbatim syntax to {* *} in order to simplify ProofGeneral setup;
 wenzelm parents: 
5876diff
changeset | 211 |   keep_line ($$ "{" -- $$ "*") |--
 | 
| 9130 | 212 | !!! "missing end of verbatim text" | 
| 6743 
5d50225637c8
changed {| |} verbatim syntax to {* *} in order to simplify ProofGeneral setup;
 wenzelm parents: 
5876diff
changeset | 213 | (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}"))); | 
| 5825 | 214 | |
| 215 | ||
| 216 | (* scan space *) | |
| 217 | ||
| 218 | val is_space = Symbol.is_blank andf not_equal "\n"; | |
| 219 | ||
| 220 | val scan_space = | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 221 | (keep_line (Scan.any1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" || | 
| 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 222 | keep_line (Scan.any is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c); | 
| 5825 | 223 | |
| 224 | ||
| 225 | (* scan nested comments *) | |
| 226 | ||
| 227 | val scan_cmt = | |
| 228 | Scan.lift scan_blank || | |
| 229 |   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
 | |
| 230 | Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || | |
| 231 | Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) || | |
| 6859 | 232 | Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof))); | 
| 5825 | 233 | |
| 234 | val scan_comment = | |
| 235 |   keep_line ($$ "(" -- $$ "*") |--
 | |
| 9130 | 236 | !!! "missing end of comment" | 
| 5825 | 237 | (change_prompt | 
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 238 | (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")"))); | 
| 5825 | 239 | |
| 240 | ||
| 241 | (* scan token *) | |
| 242 | ||
| 9130 | 243 | fun scan (lex1, lex2) = | 
| 5825 | 244 | let | 
| 9130 | 245 | val scanner = Scan.state :-- (fn pos => | 
| 246 | let | |
| 247 | fun token k x = Token (pos, (k, x)); | |
| 248 | fun sync _ = token Sync Symbol.sync; | |
| 249 | in | |
| 250 | scan_string >> token String || | |
| 251 | scan_verbatim >> token Verbatim || | |
| 252 | scan_space >> token Space || | |
| 253 | scan_comment >> token Comment || | |
| 254 | keep_line (Scan.one Symbol.is_sync >> sync) || | |
| 255 | keep_line (Scan.max token_leq | |
| 256 | (Scan.max token_leq | |
| 257 | (Scan.literal lex1 >> (token Keyword o implode)) | |
| 258 | (Scan.literal lex2 >> (token Command o implode))) | |
| 259 | (Syntax.scan_longid >> token LongIdent || | |
| 260 | Syntax.scan_id >> token Ident || | |
| 261 | Syntax.scan_var >> token Var || | |
| 262 | Syntax.scan_tid >> token TypeIdent || | |
| 263 | Syntax.scan_tvar >> token TypeVar || | |
| 264 | Syntax.scan_nat >> token Nat || | |
| 265 | scan_symid >> token SymIdent)) | |
| 266 | end) >> #2; | |
| 267 | in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner end; | |
| 5825 | 268 | |
| 269 | ||
| 9130 | 270 | (* token sources *) | 
| 5825 | 271 | |
| 6859 | 272 | val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof; | 
| 10748 | 273 | fun recover xs = (keep_line (Scan.any is_junk) >> K [malformed]) xs; | 
| 5825 | 274 | |
| 275 | fun source do_recover get_lex pos src = | |
| 276 | Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs)) | |
| 7682 
46de8064c93c
added Space, Comment token kinds (keep actual text);
 wenzelm parents: 
7477diff
changeset | 277 | (if do_recover then Some recover else None) src; | 
| 5825 | 278 | |
| 9130 | 279 | fun source_proper src = src |> Source.filter is_proper; | 
| 280 | ||
| 281 | ||
| 282 | (* lexicons *) | |
| 283 | ||
| 284 | val make_lexicon = Scan.make_lexicon o map Symbol.explode; | |
| 5825 | 285 | |
| 286 | end; |