src/Pure/Thy/thy_scan.ML
author wenzelm
Tue, 17 Nov 1998 14:09:29 +0100
changeset 5910 151ee1a5c09c
parent 5112 9e74cf11e4a4
child 6207 58e9f980bd4f
permissions -rw-r--r--
Symbol.space;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     1
(*  Title:	Pure/Thy/thy_scan.ML
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     2
    ID:		$Id$
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     3
    Author:	Markus Wenzel, TU Muenchen
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     4
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     5
Lexer for the outer Isabelle syntax.
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     6
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     7
TODO:
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     8
  - old vs. new: interpreted strings, no 'ML', var!?;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     9
*)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    10
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    11
signature THY_SCAN =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    12
sig
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    13
  datatype token_kind =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    14
    Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    15
  val name_of_kind: token_kind -> string
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    16
  val tokenize: Scan.lexicon -> string -> (token_kind * string * int) list
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    17
end;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    18
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    19
structure ThyScan: THY_SCAN =
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    20
struct
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    21
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    22
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    23
(** token kinds **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    24
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    25
datatype token_kind =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    26
  Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    27
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    28
fun name_of_kind Keyword = "keyword"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    29
  | name_of_kind Ident = "identifier"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    30
  | name_of_kind LongIdent = "long identifier"
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    31
  | name_of_kind Var = "schematic variable"
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    32
  | name_of_kind TypeVar = "type variable"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    33
  | name_of_kind Nat = "natural number"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    34
  | name_of_kind String = "string"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    35
  | name_of_kind Verbatim = "verbatim text"
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    36
  | name_of_kind Ignore = "ignore"
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    37
  | name_of_kind EOF = "end-of-file";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    38
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    39
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    40
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    41
(** scanners **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    42
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    43
(* diagnostics *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    44
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    45
fun lex_err None msg = "Outer lexical error: " ^ msg
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    46
  | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    47
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    48
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    49
(* line numbering *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    50
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    51
val incr = apsome (fn n:int => n + 1);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    52
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    53
fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    54
val keep_line = Scan.lift;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    55
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    56
val scan_blank =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    57
  incr_line ($$ "\n") ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    58
  keep_line (Scan.one Symbol.is_blank);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    59
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    60
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    61
(* scan ML-style strings *)
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    62
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    63
val scan_ctrl =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    64
  $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    65
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    66
val scan_dig = Scan.one Symbol.is_digit;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    67
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    68
val scan_esc =
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    69
  keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    70
    (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    71
      scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    72
      (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    73
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    74
val scan_str =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    75
  scan_esc ||
5910
151ee1a5c09c Symbol.space;
wenzelm
parents: 5112
diff changeset
    76
  scan_blank >> K Symbol.space ||
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    77
  keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    78
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    79
val scan_string =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    80
  keep_line ($$ "\"") ^^
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    81
  !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    82
    ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    83
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    84
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    85
(* scan verbatim text *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    86
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    87
val scan_verb =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    88
  scan_blank ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    89
  keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    90
  keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    91
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    92
val scan_verbatim =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    93
  keep_line ($$ "{" -- $$ "|") |--
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    94
  !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    95
    ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    96
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    97
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    98
(* scan nested comments *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    99
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   100
val scan_cmt =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   101
  Scan.lift scan_blank ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   102
  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   103
  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   104
  Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   105
  Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   106
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   107
val scan_comment =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   108
  keep_line ($$ "(" -- $$ "*") |--
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
   109
  !! (fn ((n, _), _) => lex_err n "missing end of comment")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   110
    (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   111
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   112
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   113
(* scan token *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   114
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   115
fun token k None x = (k, x, 0)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   116
  | token k (Some n) x = (k, x, n);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   117
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   118
fun scan_tok lex (n, cs) =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   119
 (scan_string >> token String n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   120
  scan_verbatim >> token Verbatim n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   121
  Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   122
  scan_comment >> token Ignore n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   123
  keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   124
    (Scan.literal lex >> (token Keyword n o implode))
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   125
    (Syntax.scan_longid >> token LongIdent n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   126
      Syntax.scan_id >> token Ident n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   127
      Syntax.scan_var >> token Var n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   128
      Syntax.scan_tid >> token TypeVar n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   129
      Syntax.scan_nat >> token Nat n))) (n, cs);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   130
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   131
val scan_rest = Scan.any Symbol.not_eof >> implode;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   132
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   133
fun scan_token lex x =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   134
  (case scan_tok lex x of
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   135
    ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   136
  | tok_x' => tok_x');
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   137
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   138
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   139
(* tokenize *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   140
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   141
fun tokenize lex str =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   142
  let
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   143
    val chs = explode str;	(*sic!*)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   144
    val scan_toks = Scan.repeat (scan_token lex);
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   145
    val ignore = fn (Ignore, _, _) => true | _ => false;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   146
  in
4938
c8bbbf3c59fa Symbol.stopper;
wenzelm
parents: 4921
diff changeset
   147
    (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   148
      (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
5112
9e74cf11e4a4 Symbol.beginning;
wenzelm
parents: 4938
diff changeset
   149
    | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning cs))))
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   150
  end;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   151
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   152
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   153
end;