src/Pure/Thy/thy_scan.ML
author wenzelm
Fri Dec 13 17:38:56 1996 +0100 (1996-12-13)
changeset 2387 1b37895b607a
parent 2338 1871df9900bf
child 2564 9d66b758bce5
permissions -rw-r--r--
added warning for unprintable chars in strings;
wenzelm@388
     1
(*  Title:      Pure/Thy/thy_scan.ML
wenzelm@388
     2
    ID:         $Id$
wenzelm@388
     3
    Author:     Markus Wenzel, TU Muenchen
wenzelm@388
     4
wenzelm@388
     5
The scanner for theory files.
wenzelm@388
     6
*)
wenzelm@388
     7
wenzelm@388
     8
signature THY_SCAN =
paulson@1512
     9
  sig
wenzelm@388
    10
  datatype token_kind =
wenzelm@388
    11
    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
wenzelm@388
    12
  val name_of_kind: token_kind -> string
wenzelm@388
    13
  type lexicon
wenzelm@388
    14
  val make_lexicon: string list -> lexicon
wenzelm@388
    15
  val tokenize: lexicon -> string -> (token_kind * string * int) list
paulson@1512
    16
  end;
wenzelm@388
    17
wenzelm@388
    18
paulson@1512
    19
structure ThyScan : THY_SCAN =
wenzelm@388
    20
struct
wenzelm@388
    21
wenzelm@388
    22
open Scanner;
wenzelm@388
    23
wenzelm@388
    24
wenzelm@388
    25
(** token kinds **)
wenzelm@388
    26
wenzelm@388
    27
datatype token_kind =
wenzelm@388
    28
  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;
wenzelm@388
    29
wenzelm@388
    30
fun name_of_kind Keyword = "keyword"
wenzelm@388
    31
  | name_of_kind Ident = "identifier"
wenzelm@388
    32
  | name_of_kind LongIdent = "long identifier"
wenzelm@388
    33
  | name_of_kind TypeVar = "type variable"
wenzelm@388
    34
  | name_of_kind Nat = "natural number"
wenzelm@388
    35
  | name_of_kind String = "string"
wenzelm@388
    36
  | name_of_kind Verbatim = "verbatim text"
wenzelm@388
    37
  | name_of_kind EOF = "end-of-file";
wenzelm@388
    38
wenzelm@388
    39
wenzelm@388
    40
wenzelm@388
    41
(** scanners **)
wenzelm@388
    42
wenzelm@388
    43
fun lex_err line msg =
wenzelm@388
    44
  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);
wenzelm@388
    45
wenzelm@388
    46
wenzelm@388
    47
(* misc utilities *)
wenzelm@388
    48
wenzelm@388
    49
fun count_lines cs =
wenzelm@388
    50
  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);
wenzelm@388
    51
wenzelm@388
    52
fun inc_line n s = n + count_lines (explode s);
wenzelm@388
    53
wenzelm@388
    54
fun cons_fst c (cs, x, y) = (c :: cs, x, y);
wenzelm@388
    55
wenzelm@388
    56
wenzelm@388
    57
(* scan strings *)
wenzelm@388
    58
wenzelm@388
    59
val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);
wenzelm@388
    60
wenzelm@388
    61
val scan_dig = scan_one is_digit;
wenzelm@388
    62
wenzelm@388
    63
val scan_blanks1 = scan_any1 is_blank >> implode;
wenzelm@388
    64
wenzelm@388
    65
val scan_esc =
wenzelm@388
    66
  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
wenzelm@388
    67
  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
wenzelm@388
    68
  scan_blanks1 ^^ $$ "\\";
wenzelm@388
    69
wenzelm@2387
    70
fun warn_unprintable c n =
wenzelm@2387
    71
  (if not (is_printable c) then
wenzelm@2387
    72
    warning ("Unprintable char \"\\" ^ string_of_int (ord c) ^
wenzelm@2387
    73
      "\" in string at line " ^ string_of_int n)
wenzelm@2387
    74
  else (); c);
wenzelm@388
    75
wenzelm@388
    76
fun string ("\"" :: cs) n = (["\""], cs, n)
wenzelm@388
    77
  | string ("\\" :: cs) n =
wenzelm@388
    78
      (case optional scan_esc cs of
wenzelm@388
    79
        (None, _) => lex_err n "bad escape sequence in string"
wenzelm@388
    80
      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
wenzelm@2204
    81
  | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
clasohm@712
    82
  | string (c :: cs) n = 
wenzelm@2204
    83
      if is_blank c then cons_fst " " (string cs n)
wenzelm@2387
    84
      else cons_fst (warn_unprintable c n) (string cs n)
wenzelm@388
    85
  | string [] n = lex_err n "missing quote at end of string";
wenzelm@388
    86
wenzelm@388
    87
wenzelm@388
    88
(* scan verbatim text *)
wenzelm@388
    89
wenzelm@388
    90
fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
wenzelm@388
    91
  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
wenzelm@388
    92
  | verbatim [] n = lex_err n "missing end of verbatim text";
wenzelm@388
    93
wenzelm@388
    94
wenzelm@388
    95
(* scan nested comments *)
wenzelm@388
    96
wenzelm@388
    97
fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
wenzelm@388
    98
  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
wenzelm@388
    99
  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
wenzelm@388
   100
  | comment (c :: cs) n d = comment cs (inc_line n c) d
wenzelm@388
   101
  | comment [] n _ = lex_err n "missing end of comment";
wenzelm@388
   102
wenzelm@388
   103
wenzelm@388
   104
wenzelm@388
   105
(** tokenize **)
wenzelm@388
   106
wenzelm@388
   107
val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode);
wenzelm@388
   108
wenzelm@388
   109
fun scan_word lex =
wenzelm@388
   110
  max_of
wenzelm@388
   111
    (scan_literal lex >> pair Keyword)
wenzelm@388
   112
    (scan_longid >> pair LongIdent ||
wenzelm@388
   113
      scan_id >> pair Ident ||
wenzelm@388
   114
      scan_tid >> pair TypeVar ||
wenzelm@388
   115
      scan_nat >> pair Nat);
wenzelm@388
   116
wenzelm@388
   117
fun add_tok toks kind n (cs, cs', n') =
wenzelm@388
   118
  ((kind, implode cs, n) :: toks, cs', n');
wenzelm@388
   119
wenzelm@388
   120
val take_line = implode o fst o take_prefix (not_equal "\n");
wenzelm@388
   121
wenzelm@388
   122
wenzelm@388
   123
fun tokenize lex str =
wenzelm@388
   124
  let
wenzelm@388
   125
    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
wenzelm@388
   126
      | scan (toks, "\"" :: cs, n) =
wenzelm@388
   127
          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
wenzelm@388
   128
      | scan (toks, "{" :: "|" :: cs, n) =
wenzelm@388
   129
          scan (add_tok toks Verbatim n (verbatim cs n))
wenzelm@388
   130
      | scan (toks, "(" :: "*" :: cs, n) =
wenzelm@388
   131
          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
wenzelm@388
   132
      | scan (toks, chs as c :: cs, n) =
wenzelm@388
   133
          if is_blank c then scan (toks, cs, inc_line n c)
wenzelm@388
   134
          else
wenzelm@388
   135
            (case scan_word lex chs of
wenzelm@388
   136
              (None, _) => lex_err n (quote (take_line chs))
wenzelm@388
   137
            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
wenzelm@388
   138
                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
wenzelm@388
   139
            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
wenzelm@388
   140
  in
wenzelm@388
   141
    scan ([], explode str, 1)
wenzelm@388
   142
  end;
wenzelm@388
   143
wenzelm@388
   144
wenzelm@388
   145
end;
wenzelm@388
   146