src/Pure/Thy/thy_scan.ML
author kleing
Sat, 17 Apr 2004 14:57:50 +0200
changeset 14616 b167b1b848d8
parent 11819 9283b3c11234
child 14729 0e987111a17e
permissions -rw-r--r--
added sydney mirror
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     1
(*  Title:	Pure/Thy/thy_scan.ML
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     2
    ID:		$Id$
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
     3
    Author:	Markus Wenzel, TU Muenchen
11819
9283b3c11234 tuned comments;
wenzelm
parents: 6667
diff changeset
     4
    License:    GPL (GNU GENERAL PUBLIC LICENSE)
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     5
11819
9283b3c11234 tuned comments;
wenzelm
parents: 6667
diff changeset
     6
Lexer for *old-style* outer syntax.
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     7
*)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     8
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     9
signature THY_SCAN =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    10
sig
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    11
  datatype token_kind =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    12
    Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    13
  val name_of_kind: token_kind -> string
6207
58e9f980bd4f tokenize: get exploded args;
wenzelm
parents: 5910
diff changeset
    14
  val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    15
end;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    16
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    17
structure ThyScan: THY_SCAN =
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    18
struct
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    19
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    20
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    21
(** token kinds **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    22
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    23
datatype token_kind =
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    24
  Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    25
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    26
fun name_of_kind Keyword = "keyword"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    27
  | name_of_kind Ident = "identifier"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    28
  | name_of_kind LongIdent = "long identifier"
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    29
  | name_of_kind Var = "schematic variable"
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    30
  | name_of_kind TypeVar = "type variable"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    31
  | name_of_kind Nat = "natural number"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    32
  | name_of_kind String = "string"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    33
  | name_of_kind Verbatim = "verbatim text"
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    34
  | name_of_kind Ignore = "ignore"
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    35
  | name_of_kind EOF = "end-of-file";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    36
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    37
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    38
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    39
(** scanners **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    40
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    41
(* diagnostics *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    42
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    43
fun lex_err None msg = "Outer lexical error: " ^ msg
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    44
  | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    45
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    46
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    47
(* line numbering *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    48
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    49
val incr = apsome (fn n:int => n + 1);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    50
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    51
fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    52
val keep_line = Scan.lift;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    53
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    54
val scan_blank =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    55
  incr_line ($$ "\n") ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    56
  keep_line (Scan.one Symbol.is_blank);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    57
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    58
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    59
(* scan ML-style strings *)
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    60
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    61
val scan_ctrl =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    62
  $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    63
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    64
val scan_dig = Scan.one Symbol.is_digit;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    65
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    66
val scan_esc =
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    67
  keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    68
    (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    69
      scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    70
      (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    71
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    72
val scan_str =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    73
  scan_esc ||
5910
151ee1a5c09c Symbol.space;
wenzelm
parents: 5112
diff changeset
    74
  scan_blank >> K Symbol.space ||
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    75
  keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    76
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    77
val scan_string =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    78
  keep_line ($$ "\"") ^^
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    79
  !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    80
    ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    81
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    82
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    83
(* scan verbatim text *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    84
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    85
val scan_verb =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    86
  scan_blank ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    87
  keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    88
  keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    89
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    90
val scan_verbatim =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    91
  keep_line ($$ "{" -- $$ "|") |--
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
    92
  !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    93
    ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    94
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    95
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    96
(* scan nested comments *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    97
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    98
val scan_cmt =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
    99
  Scan.lift scan_blank ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   100
  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   101
  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   102
  Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   103
  Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   104
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   105
val scan_comment =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   106
  keep_line ($$ "(" -- $$ "*") |--
4921
74bc10921f7d adapted to new Scan.fail_with / Scan.!!;
wenzelm
parents: 4705
diff changeset
   107
  !! (fn ((n, _), _) => lex_err n "missing end of comment")
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   108
    (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   109
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   110
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   111
(* scan token *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   112
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   113
fun token k None x = (k, x, 0)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   114
  | token k (Some n) x = (k, x, n);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   115
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   116
fun scan_tok lex (n, cs) =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   117
 (scan_string >> token String n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   118
  scan_verbatim >> token Verbatim n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   119
  Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   120
  scan_comment >> token Ignore n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   121
  keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   122
    (Scan.literal lex >> (token Keyword n o implode))
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   123
    (Syntax.scan_longid >> token LongIdent n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   124
      Syntax.scan_id >> token Ident n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   125
      Syntax.scan_var >> token Var n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   126
      Syntax.scan_tid >> token TypeVar n ||
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   127
      Syntax.scan_nat >> token Nat n))) (n, cs);
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   128
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   129
val scan_rest = Scan.any Symbol.not_eof >> implode;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   130
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   131
fun scan_token lex x =
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   132
  (case scan_tok lex x of
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   133
    ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   134
  | tok_x' => tok_x');
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   135
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   136
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   137
(* tokenize *)
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   138
6207
58e9f980bd4f tokenize: get exploded args;
wenzelm
parents: 5910
diff changeset
   139
fun tokenize lex chs =
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   140
  let
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   141
    val scan_toks = Scan.repeat (scan_token lex);
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   142
    val ignore = fn (Ignore, _, _) => true | _ => false;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   143
  in
4938
c8bbbf3c59fa Symbol.stopper;
wenzelm
parents: 4921
diff changeset
   144
    (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
4705
f71017706887 adapted to new scanner, baroque chars;
wenzelm
parents: 3831
diff changeset
   145
      (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
5112
9e74cf11e4a4 Symbol.beginning;
wenzelm
parents: 4938
diff changeset
   146
    | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning cs))))
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   147
  end;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   148
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   149
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   150
end;