src/Pure/Thy/thy_scan.ML
author wenzelm
Fri, 10 Oct 1997 15:48:43 +0200
changeset 3831 45e2e7ba31b8
parent 2564 9d66b758bce5
child 4705 f71017706887
permissions -rw-r--r--
scan_longid moved to Syntax/lexicon.ML;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/Thy/thy_scan.ML
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     2
    ID:         $Id$
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     3
    Author:     Markus Wenzel, TU Muenchen
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     4
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     5
The scanner for theory files.
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     6
*)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     7
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
     8
signature THY_SCAN =
1512
ce37c64244c0 Elimination of fully-functorial style.
paulson
parents: 978
diff changeset
     9
  sig
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    10
  datatype token_kind =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    11
    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    12
  val name_of_kind: token_kind -> string
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    13
  type lexicon
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    14
  val make_lexicon: string list -> lexicon
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    15
  val tokenize: lexicon -> string -> (token_kind * string * int) list
1512
ce37c64244c0 Elimination of fully-functorial style.
paulson
parents: 978
diff changeset
    16
  end;
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    17
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    18
1512
ce37c64244c0 Elimination of fully-functorial style.
paulson
parents: 978
diff changeset
    19
structure ThyScan : THY_SCAN =
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    20
struct
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    21
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    22
open Scanner;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    23
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    24
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    25
(** token kinds **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    26
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    27
datatype token_kind =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    28
  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    29
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    30
fun name_of_kind Keyword = "keyword"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    31
  | name_of_kind Ident = "identifier"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    32
  | name_of_kind LongIdent = "long identifier"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    33
  | name_of_kind TypeVar = "type variable"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    34
  | name_of_kind Nat = "natural number"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    35
  | name_of_kind String = "string"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    36
  | name_of_kind Verbatim = "verbatim text"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    37
  | name_of_kind EOF = "end-of-file";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    38
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    39
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    40
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    41
(** scanners **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    42
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    43
fun lex_err line msg =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    44
  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    45
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    46
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    47
(* misc utilities *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    48
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    49
fun count_lines cs =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    50
  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    51
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    52
fun inc_line n s = n + count_lines (explode s);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    53
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    54
fun cons_fst c (cs, x, y) = (c :: cs, x, y);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    55
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    56
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    57
(* scan strings *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    58
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    59
val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    60
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    61
val scan_dig = scan_one is_digit;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    62
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    63
val scan_blanks1 = scan_any1 is_blank >> implode;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    64
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    65
val scan_esc =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    66
  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    67
  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    68
  scan_blanks1 ^^ $$ "\\";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    69
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    70
fun string ("\"" :: cs) n = (["\""], cs, n)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    71
  | string ("\\" :: cs) n =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    72
      (case optional scan_esc cs of
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    73
        (None, _) => lex_err n "bad escape sequence in string"
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    74
      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
2204
64cb98f841e4 improved string scanner: converts 8 bit chars to escape sequences;
wenzelm
parents: 2151
diff changeset
    75
  | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
712
1f5800d2c366 added check for newlines not enclosed by '\' inside strings
clasohm
parents: 388
diff changeset
    76
  | string (c :: cs) n = 
2204
64cb98f841e4 improved string scanner: converts 8 bit chars to escape sequences;
wenzelm
parents: 2151
diff changeset
    77
      if is_blank c then cons_fst " " (string cs n)
2564
9d66b758bce5 removed warning for unprintable chars in strings (functionality will
wenzelm
parents: 2387
diff changeset
    78
      else cons_fst c (string cs n)
388
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    79
  | string [] n = lex_err n "missing quote at end of string";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    80
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    81
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    82
(* scan verbatim text *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    83
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    84
fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    85
  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    86
  | verbatim [] n = lex_err n "missing end of verbatim text";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    87
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    88
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    89
(* scan nested comments *)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    90
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    91
fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    92
  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    93
  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    94
  | comment (c :: cs) n d = comment cs (inc_line n c) d
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    95
  | comment [] n _ = lex_err n "missing end of comment";
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    96
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    97
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    98
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
    99
(** tokenize **)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   100
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   101
fun scan_word lex =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   102
  max_of
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   103
    (scan_literal lex >> pair Keyword)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   104
    (scan_longid >> pair LongIdent ||
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   105
      scan_id >> pair Ident ||
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   106
      scan_tid >> pair TypeVar ||
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   107
      scan_nat >> pair Nat);
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   108
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   109
fun add_tok toks kind n (cs, cs', n') =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   110
  ((kind, implode cs, n) :: toks, cs', n');
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   111
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   112
val take_line = implode o fst o take_prefix (not_equal "\n");
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   113
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   114
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   115
fun tokenize lex str =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   116
  let
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   117
    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   118
      | scan (toks, "\"" :: cs, n) =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   119
          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   120
      | scan (toks, "{" :: "|" :: cs, n) =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   121
          scan (add_tok toks Verbatim n (verbatim cs n))
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   122
      | scan (toks, "(" :: "*" :: cs, n) =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   123
          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   124
      | scan (toks, chs as c :: cs, n) =
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   125
          if is_blank c then scan (toks, cs, inc_line n c)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   126
          else
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   127
            (case scan_word lex chs of
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   128
              (None, _) => lex_err n (quote (take_line chs))
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   129
            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   130
                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   131
            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   132
  in
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   133
    scan ([], explode str, 1)
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   134
  end;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   135
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   136
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   137
end;
dcf6c0774075 (replaces Thy/scan.ML)
wenzelm
parents:
diff changeset
   138