| 4705 |      1 | (*  Title:	Pure/Thy/thy_scan.ML
 | 
|  |      2 |     ID:		$Id$
 | 
|  |      3 |     Author:	Markus Wenzel, TU Muenchen
 | 
| 11819 |      4 |     License:    GPL (GNU GENERAL PUBLIC LICENSE)
 | 
| 388 |      5 | 
 | 
| 11819 |      6 | Lexer for *old-style* outer syntax.
 | 
| 388 |      7 | *)
 | 
|  |      8 | 
 | 
|  |      9 | signature THY_SCAN =
 | 
| 4705 |     10 | sig
 | 
| 388 |     11 |   datatype token_kind =
 | 
| 4705 |     12 |     Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
 | 
| 388 |     13 |   val name_of_kind: token_kind -> string
 | 
| 6207 |     14 |   val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
 | 
| 4705 |     15 | end;
 | 
| 388 |     16 | 
 | 
| 4705 |     17 | structure ThyScan: THY_SCAN =
 | 
| 388 |     18 | struct
 | 
|  |     19 | 
 | 
|  |     20 | 
 | 
|  |     21 | (** token kinds **)
 | 
|  |     22 | 
 | 
|  |     23 | datatype token_kind =
 | 
| 4705 |     24 |   Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
 | 
| 388 |     25 | 
 | 
|  |     26 | fun name_of_kind Keyword = "keyword"
 | 
|  |     27 |   | name_of_kind Ident = "identifier"
 | 
|  |     28 |   | name_of_kind LongIdent = "long identifier"
 | 
| 4705 |     29 |   | name_of_kind Var = "schematic variable"
 | 
| 388 |     30 |   | name_of_kind TypeVar = "type variable"
 | 
|  |     31 |   | name_of_kind Nat = "natural number"
 | 
|  |     32 |   | name_of_kind String = "string"
 | 
|  |     33 |   | name_of_kind Verbatim = "verbatim text"
 | 
| 4705 |     34 |   | name_of_kind Ignore = "ignore"
 | 
| 388 |     35 |   | name_of_kind EOF = "end-of-file";
 | 
|  |     36 | 
 | 
|  |     37 | 
 | 
|  |     38 | 
 | 
|  |     39 | (** scanners **)
 | 
|  |     40 | 
 | 
| 4705 |     41 | (* diagnostics *)
 | 
|  |     42 | 
 | 
|  |     43 | fun lex_err None msg = "Outer lexical error: " ^ msg
 | 
|  |     44 |   | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
 | 
| 388 |     45 | 
 | 
|  |     46 | 
 | 
| 4705 |     47 | (* line numbering *)
 | 
|  |     48 | 
 | 
|  |     49 | val incr = apsome (fn n:int => n + 1);
 | 
| 388 |     50 | 
 | 
| 4705 |     51 | fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
 | 
|  |     52 | val keep_line = Scan.lift;
 | 
| 388 |     53 | 
 | 
| 4705 |     54 | val scan_blank =
 | 
|  |     55 |   incr_line ($$ "\n") ||
 | 
|  |     56 |   keep_line (Scan.one Symbol.is_blank);
 | 
| 388 |     57 | 
 | 
|  |     58 | 
 | 
| 4705 |     59 | (* scan ML-style strings *)
 | 
| 388 |     60 | 
 | 
| 4705 |     61 | val scan_ctrl =
 | 
|  |     62 |   $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
 | 
| 388 |     63 | 
 | 
| 4705 |     64 | val scan_dig = Scan.one Symbol.is_digit;
 | 
| 388 |     65 | 
 | 
|  |     66 | val scan_esc =
 | 
| 4921 |     67 |   keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
 | 
| 4705 |     68 |     (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
 | 
|  |     69 |       scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
 | 
|  |     70 |       (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
 | 
| 388 |     71 | 
 | 
| 4705 |     72 | val scan_str =
 | 
|  |     73 |   scan_esc ||
 | 
| 5910 |     74 |   scan_blank >> K Symbol.space ||
 | 
| 4705 |     75 |   keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
 | 
|  |     76 | 
 | 
|  |     77 | val scan_string =
 | 
|  |     78 |   keep_line ($$ "\"") ^^
 | 
| 4921 |     79 |   !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
 | 
| 4705 |     80 |     ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
 | 
| 388 |     81 | 
 | 
|  |     82 | 
 | 
|  |     83 | (* scan verbatim text *)
 | 
|  |     84 | 
 | 
| 4705 |     85 | val scan_verb =
 | 
|  |     86 |   scan_blank ||
 | 
|  |     87 |   keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
 | 
|  |     88 |   keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
 | 
|  |     89 | 
 | 
|  |     90 | val scan_verbatim =
 | 
|  |     91 |   keep_line ($$ "{" -- $$ "|") |--
 | 
| 4921 |     92 |   !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
 | 
| 4705 |     93 |     ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
 | 
| 388 |     94 | 
 | 
|  |     95 | 
 | 
|  |     96 | (* scan nested comments *)
 | 
|  |     97 | 
 | 
| 4705 |     98 | val scan_cmt =
 | 
|  |     99 |   Scan.lift scan_blank ||
 | 
|  |    100 |   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
 | 
|  |    101 |   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
 | 
|  |    102 |   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
 | 
|  |    103 |   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
 | 
| 388 |    104 | 
 | 
| 4705 |    105 | val scan_comment =
 | 
|  |    106 |   keep_line ($$ "(" -- $$ "*") |--
 | 
| 4921 |    107 |   !! (fn ((n, _), _) => lex_err n "missing end of comment")
 | 
| 4705 |    108 |     (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
 | 
| 388 |    109 | 
 | 
|  |    110 | 
 | 
| 4705 |    111 | (* scan token *)
 | 
|  |    112 | 
 | 
|  |    113 | fun token k None x = (k, x, 0)
 | 
|  |    114 |   | token k (Some n) x = (k, x, n);
 | 
| 388 |    115 | 
 | 
| 4705 |    116 | fun scan_tok lex (n, cs) =
 | 
|  |    117 |  (scan_string >> token String n ||
 | 
|  |    118 |   scan_verbatim >> token Verbatim n ||
 | 
|  |    119 |   Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
 | 
|  |    120 |   scan_comment >> token Ignore n ||
 | 
|  |    121 |   keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
 | 
|  |    122 |     (Scan.literal lex >> (token Keyword n o implode))
 | 
|  |    123 |     (Syntax.scan_longid >> token LongIdent n ||
 | 
|  |    124 |       Syntax.scan_id >> token Ident n ||
 | 
|  |    125 |       Syntax.scan_var >> token Var n ||
 | 
|  |    126 |       Syntax.scan_tid >> token TypeVar n ||
 | 
|  |    127 |       Syntax.scan_nat >> token Nat n))) (n, cs);
 | 
| 388 |    128 | 
 | 
| 4705 |    129 | val scan_rest = Scan.any Symbol.not_eof >> implode;
 | 
| 388 |    130 | 
 | 
| 4705 |    131 | fun scan_token lex x =
 | 
|  |    132 |   (case scan_tok lex x of
 | 
|  |    133 |     ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
 | 
|  |    134 |   | tok_x' => tok_x');
 | 
| 388 |    135 | 
 | 
|  |    136 | 
 | 
| 4705 |    137 | (* tokenize *)
 | 
|  |    138 | 
 | 
| 6207 |    139 | fun tokenize lex chs =
 | 
| 388 |    140 |   let
 | 
| 4705 |    141 |     val scan_toks = Scan.repeat (scan_token lex);
 | 
|  |    142 |     val ignore = fn (Ignore, _, _) => true | _ => false;
 | 
| 388 |    143 |   in
 | 
| 4938 |    144 |     (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
 | 
| 4705 |    145 |       (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
 | 
| 5112 |    146 |     | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning cs))))
 | 
| 388 |    147 |   end;
 | 
|  |    148 | 
 | 
|  |    149 | 
 | 
|  |    150 | end;
 |