(replaces Thy/scan.ML)
author wenzelm
Thu May 19 16:25:03 1994 +0200 (1994-05-19)
changeset 388 dcf6c0774075
parent 387 69f4356d915d
child 389 85105a7fb668
(replaces Thy/scan.ML)
scanner now based on combinators of structure Scanner;
improved handling of keywords;
now supports long.ids, (* (* nested *) comments *), {| verbatim text |};
src/Pure/Thy/thy_scan.ML
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/Pure/Thy/thy_scan.ML	Thu May 19 16:25:03 1994 +0200
     1.3 @@ -0,0 +1,138 @@
     1.4 +(*  Title:      Pure/Thy/thy_scan.ML
     1.5 +    ID:         $Id$
     1.6 +    Author:     Markus Wenzel, TU Muenchen
     1.7 +
     1.8 +The scanner for theory files.
     1.9 +*)
    1.10 +
    1.11 +signature THY_SCAN =
    1.12 +sig
    1.13 +  datatype token_kind =
    1.14 +    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
    1.15 +  val name_of_kind: token_kind -> string
    1.16 +  type lexicon
    1.17 +  val make_lexicon: string list -> lexicon
    1.18 +  val tokenize: lexicon -> string -> (token_kind * string * int) list
    1.19 +end;
    1.20 +
    1.21 +
    1.22 +functor ThyScanFun(Scanner: SCANNER): THY_SCAN =
    1.23 +struct
    1.24 +
    1.25 +open Scanner;
    1.26 +
    1.27 +
    1.28 +(** token kinds **)
    1.29 +
    1.30 +datatype token_kind =
    1.31 +  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;
    1.32 +
    1.33 +fun name_of_kind Keyword = "keyword"
    1.34 +  | name_of_kind Ident = "identifier"
    1.35 +  | name_of_kind LongIdent = "long identifier"
    1.36 +  | name_of_kind TypeVar = "type variable"
    1.37 +  | name_of_kind Nat = "natural number"
    1.38 +  | name_of_kind String = "string"
    1.39 +  | name_of_kind Verbatim = "verbatim text"
    1.40 +  | name_of_kind EOF = "end-of-file";
    1.41 +
    1.42 +
    1.43 +
    1.44 +(** scanners **)
    1.45 +
    1.46 +fun lex_err line msg =
    1.47 +  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);
    1.48 +
    1.49 +
    1.50 +(* misc utilities *)
    1.51 +
    1.52 +fun count_lines cs =
    1.53 +  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);
    1.54 +
    1.55 +fun inc_line n s = n + count_lines (explode s);
    1.56 +
    1.57 +fun cons_fst c (cs, x, y) = (c :: cs, x, y);
    1.58 +
    1.59 +
    1.60 +(* scan strings *)
    1.61 +
    1.62 +val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);
    1.63 +
    1.64 +val scan_dig = scan_one is_digit;
    1.65 +
    1.66 +val scan_blanks1 = scan_any1 is_blank >> implode;
    1.67 +
    1.68 +val scan_esc =
    1.69 +  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
    1.70 +  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
    1.71 +  scan_blanks1 ^^ $$ "\\";
    1.72 +
    1.73 +
    1.74 +fun string ("\"" :: cs) n = (["\""], cs, n)
    1.75 +  | string ("\\" :: cs) n =
    1.76 +      (case optional scan_esc cs of
    1.77 +        (None, _) => lex_err n "bad escape sequence in string"
    1.78 +      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
    1.79 +  | string (c :: cs) n = cons_fst c (string cs n)
    1.80 +  | string [] n = lex_err n "missing quote at end of string";
    1.81 +
    1.82 +
    1.83 +(* scan verbatim text *)
    1.84 +
    1.85 +fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
    1.86 +  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
    1.87 +  | verbatim [] n = lex_err n "missing end of verbatim text";
    1.88 +
    1.89 +
    1.90 +(* scan nested comments *)
    1.91 +
    1.92 +fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
    1.93 +  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
    1.94 +  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
    1.95 +  | comment (c :: cs) n d = comment cs (inc_line n c) d
    1.96 +  | comment [] n _ = lex_err n "missing end of comment";
    1.97 +
    1.98 +
    1.99 +
   1.100 +(** tokenize **)
   1.101 +
   1.102 +val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode);
   1.103 +
   1.104 +fun scan_word lex =
   1.105 +  max_of
   1.106 +    (scan_literal lex >> pair Keyword)
   1.107 +    (scan_longid >> pair LongIdent ||
   1.108 +      scan_id >> pair Ident ||
   1.109 +      scan_tid >> pair TypeVar ||
   1.110 +      scan_nat >> pair Nat);
   1.111 +
   1.112 +fun add_tok toks kind n (cs, cs', n') =
   1.113 +  ((kind, implode cs, n) :: toks, cs', n');
   1.114 +
   1.115 +val take_line = implode o fst o take_prefix (not_equal "\n");
   1.116 +
   1.117 +
   1.118 +fun tokenize lex str =
   1.119 +  let
   1.120 +    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
   1.121 +      | scan (toks, "\"" :: cs, n) =
   1.122 +          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
   1.123 +      | scan (toks, "{" :: "|" :: cs, n) =
   1.124 +          scan (add_tok toks Verbatim n (verbatim cs n))
   1.125 +      | scan (toks, "(" :: "*" :: cs, n) =
   1.126 +          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
   1.127 +      | scan (toks, chs as c :: cs, n) =
   1.128 +          if is_blank c then scan (toks, cs, inc_line n c)
   1.129 +          else
   1.130 +            (case scan_word lex chs of
   1.131 +              (None, _) => lex_err n (quote (take_line chs))
   1.132 +            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
   1.133 +                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
   1.134 +            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
   1.135 +  in
   1.136 +    scan ([], explode str, 1)
   1.137 +  end;
   1.138 +
   1.139 +
   1.140 +end;
   1.141 +