| 4705 |      1 | (*  Title:	Pure/Thy/thy_scan.ML
 | 
|  |      2 |     ID:		$Id$
 | 
|  |      3 |     Author:	Markus Wenzel, TU Muenchen
 | 
| 388 |      4 | 
 | 
| 4705 |      5 | Lexer for the outer Isabelle syntax.
 | 
| 388 |      6 | *)
 | 
|  |      7 | 
 | 
|  |      8 | signature THY_SCAN =
 | 
| 4705 |      9 | sig
 | 
| 388 |     10 |   datatype token_kind =
 | 
| 4705 |     11 |     Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
 | 
| 388 |     12 |   val name_of_kind: token_kind -> string
 | 
| 6207 |     13 |   val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
 | 
| 4705 |     14 | end;
 | 
| 388 |     15 | 
 | 
| 4705 |     16 | structure ThyScan: THY_SCAN =
 | 
| 388 |     17 | struct
 | 
|  |     18 | 
 | 
|  |     19 | 
 | 
|  |     20 | (** token kinds **)
 | 
|  |     21 | 
 | 
|  |     22 | datatype token_kind =
 | 
| 4705 |     23 |   Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
 | 
| 388 |     24 | 
 | 
|  |     25 | fun name_of_kind Keyword = "keyword"
 | 
|  |     26 |   | name_of_kind Ident = "identifier"
 | 
|  |     27 |   | name_of_kind LongIdent = "long identifier"
 | 
| 4705 |     28 |   | name_of_kind Var = "schematic variable"
 | 
| 388 |     29 |   | name_of_kind TypeVar = "type variable"
 | 
|  |     30 |   | name_of_kind Nat = "natural number"
 | 
|  |     31 |   | name_of_kind String = "string"
 | 
|  |     32 |   | name_of_kind Verbatim = "verbatim text"
 | 
| 4705 |     33 |   | name_of_kind Ignore = "ignore"
 | 
| 388 |     34 |   | name_of_kind EOF = "end-of-file";
 | 
|  |     35 | 
 | 
|  |     36 | 
 | 
|  |     37 | 
 | 
|  |     38 | (** scanners **)
 | 
|  |     39 | 
 | 
| 4705 |     40 | (* diagnostics *)
 | 
|  |     41 | 
 | 
|  |     42 | fun lex_err None msg = "Outer lexical error: " ^ msg
 | 
|  |     43 |   | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
 | 
| 388 |     44 | 
 | 
|  |     45 | 
 | 
| 4705 |     46 | (* line numbering *)
 | 
|  |     47 | 
 | 
|  |     48 | val incr = apsome (fn n:int => n + 1);
 | 
| 388 |     49 | 
 | 
| 4705 |     50 | fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
 | 
|  |     51 | val keep_line = Scan.lift;
 | 
| 388 |     52 | 
 | 
| 4705 |     53 | val scan_blank =
 | 
|  |     54 |   incr_line ($$ "\n") ||
 | 
|  |     55 |   keep_line (Scan.one Symbol.is_blank);
 | 
| 388 |     56 | 
 | 
|  |     57 | 
 | 
| 4705 |     58 | (* scan ML-style strings *)
 | 
| 388 |     59 | 
 | 
| 4705 |     60 | val scan_ctrl =
 | 
|  |     61 |   $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
 | 
| 388 |     62 | 
 | 
| 4705 |     63 | val scan_dig = Scan.one Symbol.is_digit;
 | 
| 388 |     64 | 
 | 
|  |     65 | val scan_esc =
 | 
| 4921 |     66 |   keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
 | 
| 4705 |     67 |     (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
 | 
|  |     68 |       scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
 | 
|  |     69 |       (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
 | 
| 388 |     70 | 
 | 
| 4705 |     71 | val scan_str =
 | 
|  |     72 |   scan_esc ||
 | 
| 5910 |     73 |   scan_blank >> K Symbol.space ||
 | 
| 4705 |     74 |   keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
 | 
|  |     75 | 
 | 
|  |     76 | val scan_string =
 | 
|  |     77 |   keep_line ($$ "\"") ^^
 | 
| 4921 |     78 |   !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
 | 
| 4705 |     79 |     ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
 | 
| 388 |     80 | 
 | 
|  |     81 | 
 | 
|  |     82 | (* scan verbatim text *)
 | 
|  |     83 | 
 | 
| 4705 |     84 | val scan_verb =
 | 
|  |     85 |   scan_blank ||
 | 
|  |     86 |   keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
 | 
|  |     87 |   keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
 | 
|  |     88 | 
 | 
|  |     89 | val scan_verbatim =
 | 
|  |     90 |   keep_line ($$ "{" -- $$ "|") |--
 | 
| 4921 |     91 |   !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
 | 
| 4705 |     92 |     ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
 | 
| 388 |     93 | 
 | 
|  |     94 | 
 | 
|  |     95 | (* scan nested comments *)
 | 
|  |     96 | 
 | 
| 4705 |     97 | val scan_cmt =
 | 
|  |     98 |   Scan.lift scan_blank ||
 | 
|  |     99 |   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
 | 
|  |    100 |   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
 | 
|  |    101 |   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
 | 
|  |    102 |   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
 | 
| 388 |    103 | 
 | 
| 4705 |    104 | val scan_comment =
 | 
|  |    105 |   keep_line ($$ "(" -- $$ "*") |--
 | 
| 4921 |    106 |   !! (fn ((n, _), _) => lex_err n "missing end of comment")
 | 
| 4705 |    107 |     (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
 | 
| 388 |    108 | 
 | 
|  |    109 | 
 | 
| 4705 |    110 | (* scan token *)
 | 
|  |    111 | 
 | 
|  |    112 | fun token k None x = (k, x, 0)
 | 
|  |    113 |   | token k (Some n) x = (k, x, n);
 | 
| 388 |    114 | 
 | 
| 4705 |    115 | fun scan_tok lex (n, cs) =
 | 
|  |    116 |  (scan_string >> token String n ||
 | 
|  |    117 |   scan_verbatim >> token Verbatim n ||
 | 
|  |    118 |   Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
 | 
|  |    119 |   scan_comment >> token Ignore n ||
 | 
|  |    120 |   keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
 | 
|  |    121 |     (Scan.literal lex >> (token Keyword n o implode))
 | 
|  |    122 |     (Syntax.scan_longid >> token LongIdent n ||
 | 
|  |    123 |       Syntax.scan_id >> token Ident n ||
 | 
|  |    124 |       Syntax.scan_var >> token Var n ||
 | 
|  |    125 |       Syntax.scan_tid >> token TypeVar n ||
 | 
|  |    126 |       Syntax.scan_nat >> token Nat n))) (n, cs);
 | 
| 388 |    127 | 
 | 
| 4705 |    128 | val scan_rest = Scan.any Symbol.not_eof >> implode;
 | 
| 388 |    129 | 
 | 
| 4705 |    130 | fun scan_token lex x =
 | 
|  |    131 |   (case scan_tok lex x of
 | 
|  |    132 |     ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
 | 
|  |    133 |   | tok_x' => tok_x');
 | 
| 388 |    134 | 
 | 
|  |    135 | 
 | 
| 4705 |    136 | (* tokenize *)
 | 
|  |    137 | 
 | 
| 6207 |    138 | fun tokenize lex chs =
 | 
| 388 |    139 |   let
 | 
| 4705 |    140 |     val scan_toks = Scan.repeat (scan_token lex);
 | 
|  |    141 |     val ignore = fn (Ignore, _, _) => true | _ => false;
 | 
| 388 |    142 |   in
 | 
| 4938 |    143 |     (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
 | 
| 4705 |    144 |       (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
 | 
| 5112 |    145 |     | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning cs))))
 | 
| 388 |    146 |   end;
 | 
|  |    147 | 
 | 
|  |    148 | 
 | 
|  |    149 | end;
 |