| 5825 |      1 | (*  Title:      Pure/Isar/outer_lex.ML
 | 
|  |      2 |     ID:         $Id$
 | 
|  |      3 |     Author:     Markus Wenzel, TU Muenchen
 | 
|  |      4 | 
 | 
|  |      5 | Outer lexical syntax for Isabelle/Isar.
 | 
|  |      6 | *)
 | 
|  |      7 | 
 | 
|  |      8 | signature OUTER_LEX =
 | 
|  |      9 | sig
 | 
|  |     10 |   datatype token_kind =
 | 
|  |     11 |     Keyword | Ident | LongIdent | SymIdent | Var | TextVar | TypeIdent | TypeVar | Nat |
 | 
|  |     12 |     String | Verbatim | Ignore | EOF
 | 
|  |     13 |   type token
 | 
|  |     14 |   val str_of_kind: token_kind -> string
 | 
|  |     15 |   val stopper: token * (token -> bool)
 | 
|  |     16 |   val not_eof: token -> bool
 | 
|  |     17 |   val position_of: token -> Position.T
 | 
|  |     18 |   val pos_of: token -> string
 | 
|  |     19 |   val is_kind: token_kind -> token -> bool
 | 
|  |     20 |   val keyword_pred: (string -> bool) -> token -> bool
 | 
|  |     21 |   val name_of: token -> string
 | 
|  |     22 |   val is_proper: token -> bool
 | 
|  |     23 |   val val_of: token -> string
 | 
| 5876 |     24 |   val is_sid: string -> bool
 | 
| 5825 |     25 |   val scan: Scan.lexicon ->
 | 
|  |     26 |     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
 | 
|  |     27 |   val source: bool -> (unit -> Scan.lexicon) -> Position.T -> (Symbol.symbol, 'a) Source.source ->
 | 
|  |     28 |     (token, (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source) Source.source
 | 
|  |     29 | end;
 | 
|  |     30 | 
 | 
|  |     31 | structure OuterLex: OUTER_LEX =
 | 
|  |     32 | struct
 | 
|  |     33 | 
 | 
|  |     34 | 
 | 
|  |     35 | (** tokens **)
 | 
|  |     36 | 
 | 
|  |     37 | (* datatype token *)
 | 
|  |     38 | 
 | 
|  |     39 | datatype token_kind =
 | 
|  |     40 |   Keyword | Ident | LongIdent | SymIdent | Var | TextVar | TypeIdent | TypeVar | Nat |
 | 
|  |     41 |   String | Verbatim | Ignore | EOF;
 | 
|  |     42 | 
 | 
|  |     43 | datatype token = Token of Position.T * (token_kind * string);
 | 
|  |     44 | 
 | 
|  |     45 | val str_of_kind =
 | 
|  |     46 |  fn Keyword => "keyword"
 | 
|  |     47 |   | Ident => "identifier"
 | 
|  |     48 |   | LongIdent => "long identifier"
 | 
|  |     49 |   | SymIdent => "symbolic identifier"
 | 
|  |     50 |   | Var => "schematic variable"
 | 
|  |     51 |   | TextVar => "text variable"
 | 
|  |     52 |   | TypeIdent => "type variable"
 | 
|  |     53 |   | TypeVar => "schematic type variable"
 | 
|  |     54 |   | Nat => "number"
 | 
|  |     55 |   | String => "string"
 | 
|  |     56 |   | Verbatim => "verbatim text"
 | 
|  |     57 |   | Ignore => "ignored text"
 | 
|  |     58 |   | EOF => "end-of-file";
 | 
|  |     59 | 
 | 
|  |     60 | 
 | 
|  |     61 | (* eof token *)
 | 
|  |     62 | 
 | 
|  |     63 | val eof = Token (Position.none, (EOF, ""));
 | 
|  |     64 | 
 | 
|  |     65 | fun is_eof (Token (_, (EOF, _))) = true
 | 
|  |     66 |   | is_eof _ = false;
 | 
|  |     67 | 
 | 
|  |     68 | val stopper = (eof, is_eof);
 | 
|  |     69 | val not_eof = not o is_eof;
 | 
|  |     70 | 
 | 
|  |     71 | 
 | 
|  |     72 | (* get position *)
 | 
|  |     73 | 
 | 
|  |     74 | fun position_of (Token (pos, _)) = pos;
 | 
|  |     75 | val pos_of = Position.str_of o position_of;
 | 
|  |     76 | 
 | 
|  |     77 | 
 | 
|  |     78 | (* kind of token *)
 | 
|  |     79 | 
 | 
|  |     80 | fun is_kind k (Token (_, (k', _))) = k = k';
 | 
|  |     81 | 
 | 
|  |     82 | fun keyword_pred pred (Token (_, (Keyword, x))) = pred x
 | 
|  |     83 |   | keyword_pred _ _ = false;
 | 
|  |     84 | 
 | 
|  |     85 | fun name_of (Token (_, (k, _))) = str_of_kind k;
 | 
|  |     86 | 
 | 
|  |     87 | fun is_proper (Token (_, (Ignore, _))) = false
 | 
|  |     88 |   | is_proper _ = true;
 | 
|  |     89 | 
 | 
|  |     90 | 
 | 
|  |     91 | (* value of token *)
 | 
|  |     92 | 
 | 
|  |     93 | fun val_of (Token (_, (_, x))) = x;
 | 
|  |     94 | 
 | 
|  |     95 | fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
 | 
|  |     96 | 
 | 
|  |     97 | 
 | 
|  |     98 | 
 | 
|  |     99 | (** scanners **)
 | 
|  |    100 | 
 | 
|  |    101 | fun change_prompt scan = Scan.prompt "# " scan;
 | 
|  |    102 | 
 | 
|  |    103 | 
 | 
|  |    104 | (* diagnostics *)
 | 
|  |    105 | 
 | 
|  |    106 | fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
 | 
|  |    107 | 
 | 
|  |    108 | 
 | 
|  |    109 | (* line numbering *)
 | 
|  |    110 | 
 | 
|  |    111 | fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
 | 
|  |    112 | val keep_line = Scan.lift;
 | 
|  |    113 | 
 | 
|  |    114 | val scan_blank =
 | 
|  |    115 |   incr_line ($$ "\n") ||
 | 
|  |    116 |   keep_line (Scan.one Symbol.is_blank);
 | 
|  |    117 | 
 | 
|  |    118 | 
 | 
|  |    119 | (* scan symbolic idents *)
 | 
|  |    120 | 
 | 
|  |    121 | val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~";
 | 
|  |    122 | fun is_sym_char s = s mem sym_chars;
 | 
|  |    123 | 
 | 
| 5876 |    124 | val scan_symid = Scan.any1 is_sym_char >> implode;
 | 
| 5825 |    125 | 
 | 
| 5876 |    126 | fun is_symid s = s <> "" andalso forall is_sym_char (Symbol.explode s);
 | 
|  |    127 | val is_sid = is_symid orf Syntax.is_identifier;
 | 
| 5825 |    128 | 
 | 
|  |    129 | 
 | 
|  |    130 | (* scan strings *)
 | 
|  |    131 | 
 | 
|  |    132 | val scan_str =
 | 
|  |    133 |   scan_blank >> K Symbol.space ||
 | 
|  |    134 |   keep_line ($$ "\\" |-- Scan.one Symbol.not_eof) ||
 | 
|  |    135 |   keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf Symbol.not_eof));
 | 
|  |    136 | 
 | 
|  |    137 | val scan_string =
 | 
|  |    138 |   keep_line ($$ "\"") |--
 | 
|  |    139 |     !! (lex_err (K "missing quote at end of string"))
 | 
|  |    140 |       (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\"")));
 | 
|  |    141 | 
 | 
|  |    142 | 
 | 
|  |    143 | (* scan verbatim text *)
 | 
|  |    144 | 
 | 
|  |    145 | val scan_verb =
 | 
|  |    146 |   scan_blank ||
 | 
|  |    147 |   keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
 | 
|  |    148 |   keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
 | 
|  |    149 | 
 | 
|  |    150 | val scan_verbatim =
 | 
|  |    151 |   keep_line ($$ "{" -- $$ "|") |--
 | 
|  |    152 |     !! (lex_err (K "missing end of verbatim text"))
 | 
|  |    153 |       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}")));
 | 
|  |    154 | 
 | 
|  |    155 | 
 | 
|  |    156 | (* scan space *)
 | 
|  |    157 | 
 | 
|  |    158 | val is_space = Symbol.is_blank andf not_equal "\n";
 | 
|  |    159 | 
 | 
|  |    160 | val scan_space =
 | 
|  |    161 |   keep_line (Scan.any1 is_space) |-- Scan.optional (incr_line ($$ "\n")) "" ||
 | 
|  |    162 |   keep_line (Scan.any is_space) |-- incr_line ($$ "\n");
 | 
|  |    163 | 
 | 
|  |    164 | 
 | 
|  |    165 | (* scan nested comments *)
 | 
|  |    166 | 
 | 
|  |    167 | val scan_cmt =
 | 
|  |    168 |   Scan.lift scan_blank ||
 | 
|  |    169 |   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
 | 
|  |    170 |   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
 | 
|  |    171 |   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
 | 
|  |    172 |   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
 | 
|  |    173 | 
 | 
|  |    174 | val scan_comment =
 | 
|  |    175 |   keep_line ($$ "(" -- $$ "*") |--
 | 
|  |    176 |     !! (lex_err (K "missing end of comment"))
 | 
|  |    177 |       (change_prompt
 | 
|  |    178 |         (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K ""));
 | 
|  |    179 | 
 | 
|  |    180 | 
 | 
|  |    181 | (* scan token *)
 | 
|  |    182 | 
 | 
|  |    183 | fun scan lex (pos, cs) =
 | 
|  |    184 |   let
 | 
|  |    185 |     fun token k x = Token (pos, (k, x));
 | 
|  |    186 |     fun ignore _ = token Ignore "";
 | 
|  |    187 | 
 | 
|  |    188 |     val scanner =
 | 
|  |    189 |       scan_string >> token String ||
 | 
|  |    190 |       scan_verbatim >> token Verbatim ||
 | 
|  |    191 |       scan_space >> ignore ||
 | 
|  |    192 |       scan_comment >> ignore ||
 | 
|  |    193 |       keep_line (Scan.max token_leq
 | 
|  |    194 |         (Scan.literal lex >> (token Keyword o implode))
 | 
|  |    195 |         (Syntax.scan_longid >> token LongIdent ||
 | 
|  |    196 |           Syntax.scan_id >> token Ident ||
 | 
|  |    197 |           Syntax.scan_var >> token Var ||
 | 
|  |    198 |           $$ "?" ^^ $$ "?" ^^ Syntax.scan_id >> token TextVar ||
 | 
|  |    199 |           Syntax.scan_tid >> token TypeIdent ||
 | 
|  |    200 |           Syntax.scan_tvar >> token TypeVar ||
 | 
|  |    201 |           Syntax.scan_nat >> token Nat ||
 | 
|  |    202 |           scan_symid >> token SymIdent));
 | 
|  |    203 |   in
 | 
|  |    204 |     !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner (pos, cs)
 | 
|  |    205 |   end;
 | 
|  |    206 | 
 | 
|  |    207 | 
 | 
|  |    208 | (* source of (proper) tokens *)
 | 
|  |    209 | 
 | 
|  |    210 | fun recover xs = keep_line (Scan.any1 ((not o Symbol.is_blank) andf Symbol.not_eof)) xs;
 | 
|  |    211 | 
 | 
|  |    212 | fun source do_recover get_lex pos src =
 | 
|  |    213 |   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
 | 
|  |    214 |     (if do_recover then Some recover else None) src
 | 
|  |    215 |   |> Source.filter is_proper;
 | 
|  |    216 | 
 | 
|  |    217 | 
 | 
|  |    218 | end;
 |