src/Pure/Thy/thy_scan.ML
author wenzelm
Mon May 18 17:57:47 1998 +0200 (1998-05-18)
changeset 4938 c8bbbf3c59fa
parent 4921 74bc10921f7d
child 5112 9e74cf11e4a4
permissions -rw-r--r--
Symbol.stopper;
wenzelm@4705
     1
(*  Title:	Pure/Thy/thy_scan.ML
wenzelm@4705
     2
    ID:		$Id$
wenzelm@4705
     3
    Author:	Markus Wenzel, TU Muenchen
wenzelm@388
     4
wenzelm@4705
     5
Lexer for the outer Isabelle syntax.
wenzelm@4705
     6
wenzelm@4705
     7
TODO:
wenzelm@4705
     8
  - old vs. new: interpreted strings, no 'ML', var!?;
wenzelm@388
     9
*)
wenzelm@388
    10
wenzelm@388
    11
signature THY_SCAN =
wenzelm@4705
    12
sig
wenzelm@388
    13
  datatype token_kind =
wenzelm@4705
    14
    Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
wenzelm@388
    15
  val name_of_kind: token_kind -> string
wenzelm@4705
    16
  val tokenize: Scan.lexicon -> string -> (token_kind * string * int) list
wenzelm@4705
    17
end;
wenzelm@388
    18
wenzelm@4705
    19
structure ThyScan: THY_SCAN =
wenzelm@388
    20
struct
wenzelm@388
    21
wenzelm@388
    22
wenzelm@388
    23
(** token kinds **)
wenzelm@388
    24
wenzelm@388
    25
datatype token_kind =
wenzelm@4705
    26
  Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
wenzelm@388
    27
wenzelm@388
    28
fun name_of_kind Keyword = "keyword"
wenzelm@388
    29
  | name_of_kind Ident = "identifier"
wenzelm@388
    30
  | name_of_kind LongIdent = "long identifier"
wenzelm@4705
    31
  | name_of_kind Var = "schematic variable"
wenzelm@388
    32
  | name_of_kind TypeVar = "type variable"
wenzelm@388
    33
  | name_of_kind Nat = "natural number"
wenzelm@388
    34
  | name_of_kind String = "string"
wenzelm@388
    35
  | name_of_kind Verbatim = "verbatim text"
wenzelm@4705
    36
  | name_of_kind Ignore = "ignore"
wenzelm@388
    37
  | name_of_kind EOF = "end-of-file";
wenzelm@388
    38
wenzelm@388
    39
wenzelm@388
    40
wenzelm@388
    41
(** scanners **)
wenzelm@388
    42
wenzelm@4705
    43
(* diagnostics *)
wenzelm@4705
    44
wenzelm@4705
    45
fun lex_err None msg = "Outer lexical error: " ^ msg
wenzelm@4705
    46
  | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
wenzelm@388
    47
wenzelm@388
    48
wenzelm@4705
    49
(* line numbering *)
wenzelm@4705
    50
wenzelm@4705
    51
val incr = apsome (fn n:int => n + 1);
wenzelm@388
    52
wenzelm@4705
    53
fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
wenzelm@4705
    54
val keep_line = Scan.lift;
wenzelm@388
    55
wenzelm@4705
    56
val scan_blank =
wenzelm@4705
    57
  incr_line ($$ "\n") ||
wenzelm@4705
    58
  keep_line (Scan.one Symbol.is_blank);
wenzelm@388
    59
wenzelm@388
    60
wenzelm@4705
    61
(* scan ML-style strings *)
wenzelm@388
    62
wenzelm@4705
    63
val scan_ctrl =
wenzelm@4705
    64
  $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
wenzelm@388
    65
wenzelm@4705
    66
val scan_dig = Scan.one Symbol.is_digit;
wenzelm@388
    67
wenzelm@388
    68
val scan_esc =
wenzelm@4921
    69
  keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string")
wenzelm@4705
    70
    (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
wenzelm@4705
    71
      scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
wenzelm@4705
    72
      (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
wenzelm@388
    73
wenzelm@4705
    74
val scan_str =
wenzelm@4705
    75
  scan_esc ||
wenzelm@4705
    76
  scan_blank >> K " " ||
wenzelm@4705
    77
  keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
wenzelm@4705
    78
wenzelm@4705
    79
val scan_string =
wenzelm@4705
    80
  keep_line ($$ "\"") ^^
wenzelm@4921
    81
  !! (fn ((n, _), _) => lex_err n "missing quote at end of string")
wenzelm@4705
    82
    ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
wenzelm@388
    83
wenzelm@388
    84
wenzelm@388
    85
(* scan verbatim text *)
wenzelm@388
    86
wenzelm@4705
    87
val scan_verb =
wenzelm@4705
    88
  scan_blank ||
wenzelm@4705
    89
  keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
wenzelm@4705
    90
  keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
wenzelm@4705
    91
wenzelm@4705
    92
val scan_verbatim =
wenzelm@4705
    93
  keep_line ($$ "{" -- $$ "|") |--
wenzelm@4921
    94
  !! (fn ((n, _), _) => lex_err n "missing end of verbatim text")
wenzelm@4705
    95
    ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
wenzelm@388
    96
wenzelm@388
    97
wenzelm@388
    98
(* scan nested comments *)
wenzelm@388
    99
wenzelm@4705
   100
val scan_cmt =
wenzelm@4705
   101
  Scan.lift scan_blank ||
wenzelm@4705
   102
  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
wenzelm@4705
   103
  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
wenzelm@4705
   104
  Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
wenzelm@4705
   105
  Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
wenzelm@388
   106
wenzelm@4705
   107
val scan_comment =
wenzelm@4705
   108
  keep_line ($$ "(" -- $$ "*") |--
wenzelm@4921
   109
  !! (fn ((n, _), _) => lex_err n "missing end of comment")
wenzelm@4705
   110
    (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
wenzelm@388
   111
wenzelm@388
   112
wenzelm@4705
   113
(* scan token *)
wenzelm@4705
   114
wenzelm@4705
   115
fun token k None x = (k, x, 0)
wenzelm@4705
   116
  | token k (Some n) x = (k, x, n);
wenzelm@388
   117
wenzelm@4705
   118
fun scan_tok lex (n, cs) =
wenzelm@4705
   119
 (scan_string >> token String n ||
wenzelm@4705
   120
  scan_verbatim >> token Verbatim n ||
wenzelm@4705
   121
  Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
wenzelm@4705
   122
  scan_comment >> token Ignore n ||
wenzelm@4705
   123
  keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
wenzelm@4705
   124
    (Scan.literal lex >> (token Keyword n o implode))
wenzelm@4705
   125
    (Syntax.scan_longid >> token LongIdent n ||
wenzelm@4705
   126
      Syntax.scan_id >> token Ident n ||
wenzelm@4705
   127
      Syntax.scan_var >> token Var n ||
wenzelm@4705
   128
      Syntax.scan_tid >> token TypeVar n ||
wenzelm@4705
   129
      Syntax.scan_nat >> token Nat n))) (n, cs);
wenzelm@388
   130
wenzelm@4705
   131
val scan_rest = Scan.any Symbol.not_eof >> implode;
wenzelm@388
   132
wenzelm@4705
   133
fun scan_token lex x =
wenzelm@4705
   134
  (case scan_tok lex x of
wenzelm@4705
   135
    ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
wenzelm@4705
   136
  | tok_x' => tok_x');
wenzelm@388
   137
wenzelm@388
   138
wenzelm@4705
   139
(* tokenize *)
wenzelm@4705
   140
wenzelm@388
   141
fun tokenize lex str =
wenzelm@388
   142
  let
wenzelm@4705
   143
    val chs = explode str;	(*sic!*)
wenzelm@4705
   144
    val scan_toks = Scan.repeat (scan_token lex);
wenzelm@4705
   145
    val ignore = fn (Ignore, _, _) => true | _ => false;
wenzelm@388
   146
  in
wenzelm@4938
   147
    (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (Some 1, chs) of
wenzelm@4705
   148
      (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
wenzelm@4705
   149
    | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (beginning cs))))
wenzelm@388
   150
  end;
wenzelm@388
   151
wenzelm@388
   152
wenzelm@388
   153
end;