src/Pure/Thy/thy_scan.ML
changeset 4705 f71017706887
parent 3831 45e2e7ba31b8
child 4921 74bc10921f7d
     1.1 --- a/src/Pure/Thy/thy_scan.ML	Mon Mar 09 16:13:21 1998 +0100
     1.2 +++ b/src/Pure/Thy/thy_scan.ML	Mon Mar 09 16:14:15 1998 +0100
     1.3 @@ -1,138 +1,153 @@
     1.4 -(*  Title:      Pure/Thy/thy_scan.ML
     1.5 -    ID:         $Id$
     1.6 -    Author:     Markus Wenzel, TU Muenchen
     1.7 +(*  Title:	Pure/Thy/thy_scan.ML
     1.8 +    ID:		$Id$
     1.9 +    Author:	Markus Wenzel, TU Muenchen
    1.10  
    1.11 -The scanner for theory files.
    1.12 +Lexer for the outer Isabelle syntax.
    1.13 +
    1.14 +TODO:
    1.15 +  - old vs. new: interpreted strings, no 'ML', var!?;
    1.16  *)
    1.17  
    1.18  signature THY_SCAN =
    1.19 -  sig
    1.20 +sig
    1.21    datatype token_kind =
    1.22 -    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
    1.23 +    Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
    1.24    val name_of_kind: token_kind -> string
    1.25 -  type lexicon
    1.26 -  val make_lexicon: string list -> lexicon
    1.27 -  val tokenize: lexicon -> string -> (token_kind * string * int) list
    1.28 -  end;
    1.29 +  val tokenize: Scan.lexicon -> string -> (token_kind * string * int) list
    1.30 +end;
    1.31  
    1.32 -
    1.33 -structure ThyScan : THY_SCAN =
    1.34 +structure ThyScan: THY_SCAN =
    1.35  struct
    1.36  
    1.37 -open Scanner;
    1.38 -
    1.39  
    1.40  (** token kinds **)
    1.41  
    1.42  datatype token_kind =
    1.43 -  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;
    1.44 +  Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
    1.45  
    1.46  fun name_of_kind Keyword = "keyword"
    1.47    | name_of_kind Ident = "identifier"
    1.48    | name_of_kind LongIdent = "long identifier"
    1.49 +  | name_of_kind Var = "schematic variable"
    1.50    | name_of_kind TypeVar = "type variable"
    1.51    | name_of_kind Nat = "natural number"
    1.52    | name_of_kind String = "string"
    1.53    | name_of_kind Verbatim = "verbatim text"
    1.54 +  | name_of_kind Ignore = "ignore"
    1.55    | name_of_kind EOF = "end-of-file";
    1.56  
    1.57  
    1.58  
    1.59  (** scanners **)
    1.60  
    1.61 -fun lex_err line msg =
    1.62 -  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);
    1.63 +(* diagnostics *)
    1.64 +
    1.65 +fun lex_err None msg = "Outer lexical error: " ^ msg
    1.66 +  | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
    1.67  
    1.68  
    1.69 -(* misc utilities *)
    1.70 +(* line numbering *)
    1.71 +
    1.72 +val incr = apsome (fn n:int => n + 1);
    1.73  
    1.74 -fun count_lines cs =
    1.75 -  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);
    1.76 +fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
    1.77 +val keep_line = Scan.lift;
    1.78  
    1.79 -fun inc_line n s = n + count_lines (explode s);
    1.80 -
    1.81 -fun cons_fst c (cs, x, y) = (c :: cs, x, y);
    1.82 +val scan_blank =
    1.83 +  incr_line ($$ "\n") ||
    1.84 +  keep_line (Scan.one Symbol.is_blank);
    1.85  
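For illustration only (not part of the changeset): incr_line and keep_line thread the optional line counter through every scanner, so that a newline bumps the count while all other input leaves it alone. A minimal plain-SML sketch of the same idea, with an illustrative plain int counter instead of the int option used above:

exception SYNTAX_FAIL;

infix 5 ||;

(* scan exactly one character satisfying pred, or fail *)
fun one pred (c :: cs) = if pred c then (c, cs) else raise SYNTAX_FAIL
  | one _ [] = raise SYNTAX_FAIL;

(* keep_line: run a scanner on the character stream, leaving the line count alone *)
fun keep_line scan (n, cs) = let val (x, cs') = scan cs in (x, (n, cs')) end;

(* incr_line: bump the line count after a successful scan *)
fun incr_line scan (n, cs) = let val (x, cs') = scan cs in (x, (n + 1, cs')) end;

(* ||: try the first scanner, fall back to the second if it fails *)
fun (scan1 || scan2) input = scan1 input handle SYNTAX_FAIL => scan2 input;

(* counterpart of scan_blank: a newline advances the counter, other blanks do not *)
val scan_blank_sketch =
  incr_line (one (fn c => c = #"\n")) || keep_line (one Char.isSpace);

(* scan_blank_sketch (1, explode "\n  x") = (#"\n", (2, [#" ", #" ", #"x"])) *)

The real code keeps the counter as an int option so that the same scanners can also run without position information (compare lex_err None above).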
    1.86  
    1.87 -(* scan strings *)
    1.88 -
    1.89 -val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);
    1.90 +(* scan ML-style strings *)
    1.91  
    1.92 -val scan_dig = scan_one is_digit;
    1.93 +val scan_ctrl =
    1.94 +  $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
    1.95  
    1.96 -val scan_blanks1 = scan_any1 is_blank >> implode;
    1.97 +val scan_dig = Scan.one Symbol.is_digit;
    1.98  
    1.99  val scan_esc =
   1.100 -  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
   1.101 -  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
   1.102 -  scan_blanks1 ^^ $$ "\\";
   1.103 +  keep_line ($$ "\\") ^^ !! (fn (n, _) => lex_err n "bad escape sequence in string")
   1.104 +    (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
   1.105 +      scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
   1.106 +      (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
   1.107  
   1.108 -fun string ("\"" :: cs) n = (["\""], cs, n)
   1.109 -  | string ("\\" :: cs) n =
   1.110 -      (case optional scan_esc cs of
   1.111 -        (None, _) => lex_err n "bad escape sequence in string"
   1.112 -      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
   1.113 -  | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
   1.114 -  | string (c :: cs) n = 
   1.115 -      if is_blank c then cons_fst " " (string cs n)
   1.116 -      else cons_fst c (string cs n)
   1.117 -  | string [] n = lex_err n "missing quote at end of string";
   1.118 +val scan_str =
   1.119 +  scan_esc ||
   1.120 +  scan_blank >> K " " ||
   1.121 +  keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
   1.122 +
   1.123 +val scan_string =
   1.124 +  keep_line ($$ "\"") ^^
   1.125 +  !! (fn (n, _) => lex_err n "missing quote at end of string")
   1.126 +    ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
   1.127  
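For illustration only: scan_string commits after the opening quote by wrapping the remainder in !!, so a missing closing quote is reported as a hard lexical error with a line number instead of being backtracked over. A plain-SML sketch of that commit-and-report pattern (the escape sequences and blank normalization handled by scan_esc and scan_str above are omitted here):

exception SYNTAX_FAIL;
exception LEX_ERROR of string;

(* read string content up to the closing quote *)
fun string_body acc (#"\"" :: cs) = (String.implode (rev acc), cs)
  | string_body acc (c :: cs) = string_body (c :: acc) cs
  | string_body _ [] = raise SYNTAX_FAIL;

(* the "cut": once the opening quote is seen, failure is reported, not backtracked *)
fun scan_string_sketch line (#"\"" :: cs) =
      (string_body [] cs
        handle SYNTAX_FAIL =>
          raise LEX_ERROR ("Outer lexical error on line " ^ Int.toString line ^
            ": missing quote at end of string"))
  | scan_string_sketch _ _ = raise SYNTAX_FAIL;

(* scan_string_sketch 1 (explode "\"abc\" rest") = ("abc", [#" ", #"r", ...]) *)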
   1.128  
   1.129  (* scan verbatim text *)
   1.130  
   1.131 -fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
   1.132 -  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
   1.133 -  | verbatim [] n = lex_err n "missing end of verbatim text";
   1.134 +val scan_verb =
   1.135 +  scan_blank ||
   1.136 +  keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
   1.137 +  keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
   1.138 +
   1.139 +val scan_verbatim =
   1.140 +  keep_line ($$ "{" -- $$ "|") |--
   1.141 +  !! (fn (n, _) => lex_err n "missing end of verbatim text")
   1.142 +    ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
   1.143  
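For illustration only: scan_verb consumes a "|" only when the one-symbol lookahead (Scan.ahead) shows that it is not followed by "}", so "|}" always terminates the verbatim block. In direct recursive style the same lookahead is just a two-character pattern, as in this plain-SML sketch (line counting omitted):

exception LEX_ERROR of string;

(* collect verbatim text up to the closing "|}" *)
fun verbatim_body acc (#"|" :: #"}" :: cs) = (String.implode (rev acc), cs)
  | verbatim_body acc (c :: cs) = verbatim_body (c :: acc) cs
  | verbatim_body _ [] = raise LEX_ERROR "missing end of verbatim text";

(* verbatim_body [] (explode "a | b |} rest") = ("a | b ", [#" ", #"r", ...]) *)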
   1.144  
   1.145  (* scan nested comments *)
   1.146  
   1.147 -fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
   1.148 -  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
   1.149 -  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
   1.150 -  | comment (c :: cs) n d = comment cs (inc_line n c) d
   1.151 -  | comment [] n _ = lex_err n "missing end of comment";
   1.152 +val scan_cmt =
   1.153 +  Scan.lift scan_blank ||
   1.154 +  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   1.155 +  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   1.156 +  Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   1.157 +  Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
   1.158  
   1.159 +val scan_comment =
   1.160 +  keep_line ($$ "(" -- $$ "*") |--
   1.161 +  !! (fn (n, _) => lex_err n "missing end of comment")
   1.162 +    (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
   1.163  
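For illustration only: scan_cmt threads the comment nesting depth through Scan.depend, started at 0 by Scan.pass in scan_comment, so comment brackets may nest arbitrarily. A direct-recursive plain-SML sketch of the same depth and line bookkeeping:

exception LEX_ERROR of string;

(* skip a possibly nested comment, tracking nesting depth and line numbers *)
fun skip_comment line cs =
  let
    fun go 0 n cs = (n, cs)                            (* back at depth 0: done *)
      | go d n (#"(" :: #"*" :: cs) = go (d + 1) n cs  (* nested open *)
      | go d n (#"*" :: #")" :: cs) = go (d - 1) n cs  (* close one level *)
      | go d n (#"\n" :: cs) = go d (n + 1) cs         (* keep counting lines *)
      | go d n (_ :: cs) = go d n cs
      | go _ n [] =
          raise LEX_ERROR ("Outer lexical error on line " ^ Int.toString n ^
            ": missing end of comment")
  in
    case cs of
      #"(" :: #"*" :: rest => go 1 line rest
    | _ => raise Fail "skip_comment: not at a comment"
  end;

val demo = skip_comment 1 (explode "(* a (* b *) c *)x");   (* = (1, [#"x"]) *)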
   1.164  
   1.165 -(** tokenize **)
   1.166 +(* scan token *)
   1.167 +
   1.168 +fun token k None x = (k, x, 0)
   1.169 +  | token k (Some n) x = (k, x, n);
   1.170  
   1.171 -fun scan_word lex =
   1.172 -  max_of
   1.173 -    (scan_literal lex >> pair Keyword)
   1.174 -    (scan_longid >> pair LongIdent ||
   1.175 -      scan_id >> pair Ident ||
   1.176 -      scan_tid >> pair TypeVar ||
   1.177 -      scan_nat >> pair Nat);
   1.178 +fun scan_tok lex (n, cs) =
   1.179 + (scan_string >> token String n ||
   1.180 +  scan_verbatim >> token Verbatim n ||
   1.181 +  Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
   1.182 +  scan_comment >> token Ignore n ||
   1.183 +  keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
   1.184 +    (Scan.literal lex >> (token Keyword n o implode))
   1.185 +    (Syntax.scan_longid >> token LongIdent n ||
   1.186 +      Syntax.scan_id >> token Ident n ||
   1.187 +      Syntax.scan_var >> token Var n ||
   1.188 +      Syntax.scan_tid >> token TypeVar n ||
   1.189 +      Syntax.scan_nat >> token Nat n))) (n, cs);
   1.190  
   1.191 -fun add_tok toks kind n (cs, cs', n') =
   1.192 -  ((kind, implode cs, n) :: toks, cs', n');
   1.193 +val scan_rest = Scan.any Symbol.not_eof >> implode;
   1.194  
   1.195 -val take_line = implode o fst o take_prefix (not_equal "\n");
   1.196 +fun scan_token lex x =
   1.197 +  (case scan_tok lex x of
   1.198 +    ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
   1.199 +  | tok_x' => tok_x');
   1.200  
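For illustration only: Scan.max above runs the keyword scanner and the identifier-family scanners on the same input and keeps the result whose scanned text is greater under the supplied ordering; the practical effect is that a keyword which is a proper prefix of an identifier (say a keyword "the" against the identifier "theorem") does not truncate it. A plain-SML sketch of that longest-match rule, reduced to keywords versus alphanumeric identifiers:

(* split off the longest prefix of characters satisfying pred *)
fun take_while pred cs =
  let
    fun go acc (c :: cs) = if pred c then go (c :: acc) cs else (rev acc, c :: cs)
      | go acc [] = (rev acc, [])
    val (pre, rest) = go [] cs
  in (String.implode pre, rest) end;

(* an identifier, simplified to a maximal alphanumeric run *)
fun scan_ident cs = take_while Char.isAlphaNum cs;

(* the longest keyword from the lexicon that is a literal prefix of the input *)
fun scan_keyword keywords cs =
  let
    val s = String.implode cs
    val hits = List.filter (fn kw => String.isPrefix kw s) keywords
  in
    case hits of
      [] => NONE
    | _ =>
        let val kw = List.foldl (fn (a, b) => if size a > size b then a else b) "" hits
        in SOME (kw, List.drop (cs, size kw)) end
  end;

(* keep whichever alternative scanned the longer text (keywords win ties) *)
fun keyword_or_ident keywords cs =
  let val (id, rest_id) = scan_ident cs in
    (case scan_keyword keywords cs of
      SOME (kw, rest_kw) =>
        if size kw >= size id then ("keyword", kw, rest_kw)
        else ("identifier", id, rest_id)
    | NONE => ("identifier", id, rest_id))
  end;

(* keyword_or_ident ["the", "end"] (explode "theorem") = ("identifier", "theorem", []) *)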
   1.201  
   1.202 +(* tokenize *)
   1.203 +
   1.204  fun tokenize lex str =
   1.205    let
   1.206 -    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
   1.207 -      | scan (toks, "\"" :: cs, n) =
   1.208 -          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
   1.209 -      | scan (toks, "{" :: "|" :: cs, n) =
   1.210 -          scan (add_tok toks Verbatim n (verbatim cs n))
   1.211 -      | scan (toks, "(" :: "*" :: cs, n) =
   1.212 -          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
   1.213 -      | scan (toks, chs as c :: cs, n) =
   1.214 -          if is_blank c then scan (toks, cs, inc_line n c)
   1.215 -          else
   1.216 -            (case scan_word lex chs of
   1.217 -              (None, _) => lex_err n (quote (take_line chs))
   1.218 -            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
   1.219 -                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
   1.220 -            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
   1.221 +    val chs = explode str;	(*sic!*)
   1.222 +    val scan_toks = Scan.repeat (scan_token lex);
   1.223 +    val ignore = fn (Ignore, _, _) => true | _ => false;
   1.224    in
   1.225 -    scan ([], explode str, 1)
   1.226 +    (case Scan.error (Scan.finite' Symbol.eof scan_toks) (Some 1, chs) of
   1.227 +      (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
   1.228 +    | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (beginning cs))))
   1.229    end;
   1.230  
   1.231  
   1.232  end;
   1.233 -
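
For orientation, a hypothetical usage sketch of the revised interface: tokenize now expects a ready-made Scan.lexicon instead of offering its own make_lexicon. The lexicon constructor and the keyword set below are illustrative assumptions, not taken from this changeset or verified against this Isabelle version:

(* hypothetical Isabelle/ML session; Scan.make_lexicon and the keywords are assumptions *)
val lex = Scan.make_lexicon ["=", "+"];
val toks = ThyScan.tokenize lex "Foo = Bar +\n Baz";
(* roughly:
   [(Ident, "Foo", 1), (Keyword, "=", 1), (Ident, "Bar", 1), (Keyword, "+", 1),
    (Ident, "Baz", 2), (EOF, "end-of-file", 2)]
   with blank and comment tokens produced as Ignore and filtered out *)

Note that the keyword "ML" is special-cased in scan_token above: once it is seen, the entire rest of the input is returned as a single Verbatim token.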