src/Pure/Thy/thy_scan.ML
(*  Title:      Pure/Thy/thy_scan.ML
    ID:         $Id$
    Author:     Markus Wenzel, TU Muenchen

Lexer for *old-style* outer syntax.
*)

signature THY_SCAN =
sig
  datatype token_kind =
    Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
  val name_of_kind: token_kind -> string
  val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
end;
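(*Interface note: [tokenize] takes the keyword lexicon of the outer syntax and
  the input as a list of symbols; it returns (kind, text, line number) triples,
  with line counting starting at 1.  [name_of_kind] gives a printable name for
  each token kind, e.g. for diagnostic messages.*)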
       

structure ThyScan: THY_SCAN =
struct


(** token kinds **)

datatype token_kind =
  Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;

fun name_of_kind Keyword = "keyword"
  | name_of_kind Ident = "identifier"
  | name_of_kind LongIdent = "long identifier"
  | name_of_kind Var = "schematic variable"
  | name_of_kind TypeVar = "type variable"
  | name_of_kind Nat = "natural number"
  | name_of_kind String = "string"
  | name_of_kind Verbatim = "verbatim text"
  | name_of_kind Ignore = "ignore"
  | name_of_kind EOF = "end-of-file";



(** scanners **)

(* diagnostics *)

fun lex_err NONE msg = "Outer lexical error: " ^ msg
  | lex_err (SOME n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;


(* line numbering *)

val incr = Option.map (fn n:int => n + 1);

fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
val keep_line = Scan.lift;

val scan_blank =
  incr_line ($$ "\n") ||
  keep_line (Scan.one Symbol.is_blank);
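(*All scanners below work on pairs of (optional line number, remaining symbols):
  [incr_line] applies a scanner and increments the counter, [keep_line] applies
  one without touching it, so only the "\n" case of [scan_blank] advances the
  line count.*)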
       


(* scan ML-style strings *)

val scan_ctrl =
  $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);

val scan_dig = Scan.one Symbol.is_digit;

val scan_esc =
  keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
    (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
      scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
      (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
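(*Accepted ML-style escapes: \n, \t, \", \\, control escapes \^C with C between
  "@" and "_", three-digit \ddd, and a run of blanks enclosed in backslashes
  (the ML "gap" convention); any other sequence after a backslash is rejected
  with "bad escape sequence in string".*)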
       

val scan_str =
  scan_esc ||
  scan_blank >> K Symbol.space ||
  keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));

val scan_string =
  keep_line ($$ "\"") ^^
  !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
    ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
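(*Strings are returned verbatim, including the surrounding quotes; each blank
  inside a string is normalized to a single space, while embedded newlines
  still advance the line counter.  A missing closing quote is a hard error.*)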
       


(* scan verbatim text *)

val scan_verb =
  scan_blank ||
  keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
  keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));

val scan_verbatim =
  keep_line ($$ "{" -- $$ "|") |--
  !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
    ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
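(*Verbatim text is enclosed in {| ... |}; the delimiters themselves are
  dropped and only the enclosed text is returned.  A "|" may occur inside as
  long as it is not immediately followed by "}".*)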
       


(* scan nested comments *)

val scan_cmt =
  Scan.lift scan_blank ||
  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
  Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
  Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));

val scan_comment =
  keep_line ($$ "(" -- $$ "*") |--
  !! (fn ((n, _), _) => lex_err n "missing end of comment")
    (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
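(*Comments (* ... *) may be nested: [Scan.pass 0] supplies the nesting depth,
  which increases on "(*" and decreases on "*)"; the comment text itself is
  discarded and the resulting token is classified as Ignore by [scan_tok]
  below.*)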
       


(* scan token *)

fun token k NONE x = (k, x, 0)
  | token k (SOME n) x = (k, x, n);

fun scan_tok lex (n, cs) =
 (scan_string >> token String n ||
  scan_verbatim >> token Verbatim n ||
  Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
  scan_comment >> token Ignore n ||
  keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
    (Scan.literal lex >> (token Keyword n o implode))
    (Syntax.scan_longid >> token LongIdent n ||
      Syntax.scan_id >> token Ident n ||
      Syntax.scan_var >> token Var n ||
      Syntax.scan_tid >> token TypeVar n ||
      Syntax.scan_nat >> token Nat n))) (n, cs);
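(*Token dispatch: strings, verbatim text, runs of blanks, and comments are
  tried first (blanks and comments become Ignore tokens); otherwise [Scan.max]
  runs the keyword scanner for the lexicon in parallel with the scanners for
  (long) identifiers, schematic variables, type variables and numerals, and
  keeps whichever alternative matches the longer text.*)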
       

val scan_rest = Scan.any Symbol.not_eof >> implode;

fun scan_token lex x =
  (case scan_tok lex x of
    ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (SOME n)) x'
  | tok_x' => tok_x');
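(*Special case for old-style theory files: once the keyword "ML" has been
  scanned, all remaining input is swallowed as one Verbatim token, i.e. the
  trailing ML section of the theory file.*)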
       


(* tokenize *)

fun tokenize lex chs =
  let
    val scan_toks = Scan.repeat (scan_token lex);
    val ignore = fn (Ignore, _, _) => true | _ => false;
  in
    (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (SOME 1, chs) of
      (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
    | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning 10 cs))))
  end;
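(*Usage sketch (hypothetical lexicon and input, result shown only roughly):
  assuming a lexicon [lex] that contains the keyword "=",

    tokenize lex (Symbol.explode "foo = \"bar\"")

  should yield approximately
    [(Ident, "foo", 1), (Keyword, "=", 1), (String, "\"bar\"", 1),
     (EOF, "end-of-file", 1)]
  with the intermediate blanks already dropped as Ignore tokens.*)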
       


end;