src/Pure/Thy/thy_scan.ML
author wenzelm
Wed Oct 17 18:52:30 2001 +0200 (2001-10-17)
changeset 11819 9283b3c11234
parent 6667 58b9785f8534
child 14729 0e987111a17e
permissions -rw-r--r--
tuned comments;
     1 (*  Title:	Pure/Thy/thy_scan.ML
     2     ID:		$Id$
     3     Author:	Markus Wenzel, TU Muenchen
     4     License:    GPL (GNU GENERAL PUBLIC LICENSE)
     5 
     6 Lexer for *old-style* outer syntax.
     7 *)
     8 
     9 signature THY_SCAN =
    10 sig
    11   datatype token_kind =
    12     Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
    13   val name_of_kind: token_kind -> string
    14   val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
    15 end;
    16 
    17 structure ThyScan: THY_SCAN =
    18 struct
    19 
    20 
    21 (** token kinds **)
    22 
    23 datatype token_kind =
    24   Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
    25 
    26 fun name_of_kind Keyword = "keyword"
    27   | name_of_kind Ident = "identifier"
    28   | name_of_kind LongIdent = "long identifier"
    29   | name_of_kind Var = "schematic variable"
    30   | name_of_kind TypeVar = "type variable"
    31   | name_of_kind Nat = "natural number"
    32   | name_of_kind String = "string"
    33   | name_of_kind Verbatim = "verbatim text"
    34   | name_of_kind Ignore = "ignore"
    35   | name_of_kind EOF = "end-of-file";
    36 
    37 
    38 
    39 (** scanners **)
    40 
    41 (* diagnostics *)
    42 
    43 fun lex_err None msg = "Outer lexical error: " ^ msg
    44   | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
    45 
    46 
    47 (* line numbering *)
    48 
    49 val incr = apsome (fn n:int => n + 1);
    50 
    51 fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
    52 val keep_line = Scan.lift;
    53 
    54 val scan_blank =
    55   incr_line ($$ "\n") ||
    56   keep_line (Scan.one Symbol.is_blank);
    57 
    58 
    59 (* scan ML-style strings *)
    60 
    61 val scan_ctrl =
    62   $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
    63 
    64 val scan_dig = Scan.one Symbol.is_digit;
    65 
    66 val scan_esc =
    67   keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
    68     (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
    69       scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
    70       (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
    71 
    72 val scan_str =
    73   scan_esc ||
    74   scan_blank >> K Symbol.space ||
    75   keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
    76 
    77 val scan_string =
    78   keep_line ($$ "\"") ^^
    79   !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
    80     ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
    81 
    82 
    83 (* scan verbatim text *)
    84 
    85 val scan_verb =
    86   scan_blank ||
    87   keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
    88   keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
    89 
    90 val scan_verbatim =
    91   keep_line ($$ "{" -- $$ "|") |--
    92   !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
    93     ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
    94 
    95 
    96 (* scan nested comments *)
    97 
    98 val scan_cmt =
    99   Scan.lift scan_blank ||
   100   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   101   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   102   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   103   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
   104 
   105 val scan_comment =
   106   keep_line ($$ "(" -- $$ "*") |--
   107   !! (fn ((n, _), _) => lex_err n "missing end of comment")
   108     (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
   109 
   110 
   111 (* scan token *)
   112 
   113 fun token k None x = (k, x, 0)
   114   | token k (Some n) x = (k, x, n);
   115 
   116 fun scan_tok lex (n, cs) =
   117  (scan_string >> token String n ||
   118   scan_verbatim >> token Verbatim n ||
   119   Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
   120   scan_comment >> token Ignore n ||
   121   keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
   122     (Scan.literal lex >> (token Keyword n o implode))
   123     (Syntax.scan_longid >> token LongIdent n ||
   124       Syntax.scan_id >> token Ident n ||
   125       Syntax.scan_var >> token Var n ||
   126       Syntax.scan_tid >> token TypeVar n ||
   127       Syntax.scan_nat >> token Nat n))) (n, cs);
   128 
   129 val scan_rest = Scan.any Symbol.not_eof >> implode;
   130 
   131 fun scan_token lex x =
   132   (case scan_tok lex x of
   133     ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
   134   | tok_x' => tok_x');
   135 
   136 
   137 (* tokenize *)
   138 
   139 fun tokenize lex chs =
   140   let
   141     val scan_toks = Scan.repeat (scan_token lex);
   142     val ignore = fn (Ignore, _, _) => true | _ => false;
   143   in
   144     (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
   145       (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
   146     | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning cs))))
   147   end;
   148 
   149 
   150 end;