src/Pure/Thy/thy_scan.ML
author wenzelm
Mon Nov 18 17:32:38 1996 +0100 (1996-11-18)
changeset 2204 64cb98f841e4
parent 2151 000767995143
child 2338 1871df9900bf
permissions -rw-r--r--
improved string scanner: converts 8 bit chars to escape sequences;
(*  Title:      Pure/Thy/thy_scan.ML
    ID:         $Id$
    Author:     Markus Wenzel, TU Muenchen

The scanner for theory files.
*)

signature THY_SCAN =
  sig
  datatype token_kind =
    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
  val name_of_kind: token_kind -> string
  type lexicon
  val make_lexicon: string list -> lexicon
  val tokenize: lexicon -> string -> (token_kind * string * int) list
  end;
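
(*Illustration only, not part of the original file: make_lexicon fixes the set
  of keywords and tokenize turns the text of a theory file into a list of
  (kind, text, line number) triples, e.g.

    tokenize (make_lexicon ["=", "+"]) "foo = bar"

  may be expected to yield
    [(Ident, "foo", 1), (Keyword, "=", 1), (Ident, "bar", 1),
     (EOF, "end-of-file", 1)]*)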


structure ThyScan : THY_SCAN =
struct

open Scanner;


(** token kinds **)

datatype token_kind =
  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;

fun name_of_kind Keyword = "keyword"
  | name_of_kind Ident = "identifier"
  | name_of_kind LongIdent = "long identifier"
  | name_of_kind TypeVar = "type variable"
  | name_of_kind Nat = "natural number"
  | name_of_kind String = "string"
  | name_of_kind Verbatim = "verbatim text"
  | name_of_kind EOF = "end-of-file";



(** scanners **)

fun lex_err line msg =
  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);


(* misc utilities *)

(*number of newline characters in a list of characters*)
fun count_lines cs =
  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);

fun inc_line n s = n + count_lines (explode s);

(*cons onto the first component of a triple*)
fun cons_fst c (cs, x, y) = (c :: cs, x, y);


(* scan strings *)

(*control escape: "^" followed by a character in the range chr 64 .. chr 95*)
val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);

val scan_dig = scan_one is_digit;

val scan_blanks1 = scan_any1 is_blank >> implode;

(*the body of an escape sequence, the leading backslash being consumed already*)
val scan_esc =
  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
  scan_blanks1 ^^ $$ "\\";
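
(*Illustration only, not part of the original file: assuming the usual
  behaviour of the Scanner combinators, e.g.
    scan_esc (explode "nx")    (*presumably ("n", ["x"])*)
    scan_esc (explode "^Gx")   (*presumably ("^G", ["x"])*)
    scan_esc (explode "228x")  (*presumably ("228", ["x"])*)
    scan_esc (explode "qx")    (*presumably fails: not a valid escape*)
*)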


(*scan the body of a string, the opening quote being consumed already; the
  result is the list of scanned text pieces (ending with the closing quote),
  the remaining input and the current line number*)
fun string ("\"" :: cs) n = (["\""], cs, n)
  | string ("\\" :: cs) n =
      (case optional scan_esc cs of
        (None, _) => lex_err n "bad escape sequence in string"
      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
  | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
  | string (c :: cs) n =
      if is_blank c then cons_fst " " (string cs n)
      else if is_printable c then cons_fst c (string cs n)
      else cons_fst ("\\" ^ string_of_int (ord c)) (string cs n)
  | string [] n = lex_err n "missing quote at end of string";
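
(*Illustration only, not part of the original file: in line with the changeset
  comment above (8 bit characters become escape sequences), e.g.

    string (explode "a\228b\"rest") 1

  may be expected to yield (["a", "\\228", "b", "\""], explode "rest", 1):
  non-printable characters turn into decimal escapes, blanks and newlines
  into single spaces, and the closing quote ends the scan.*)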


(* scan verbatim text *)

fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
  | verbatim [] n = lex_err n "missing end of verbatim text";


(* scan nested comments *)

fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
  | comment (c :: cs) n d = comment cs (inc_line n c) d
  | comment [] n _ = lex_err n "missing end of comment";
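
(*Illustration only, not part of the original file: the last argument counts
  the comment brackets that are still open.  An opening delimiter increments
  it, a closing delimiter decrements it, and when the outermost comment is
  closed the remaining input and the updated line number are returned; the
  text of the comment itself is discarded.*)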



(** tokenize **)

val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode);

(*scan the next word: either a keyword of the lexicon or an identifier,
  long identifier, type variable or natural number*)
fun scan_word lex =
  max_of
    (scan_literal lex >> pair Keyword)
    (scan_longid >> pair LongIdent ||
      scan_id >> pair Ident ||
      scan_tid >> pair TypeVar ||
      scan_nat >> pair Nat);
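
(*Illustration only, not part of the original file: max_of presumably prefers
  the alternative that consumes more input, so that with "end" in the lexicon
  the text "ending" is read as the identifier "ending" rather than as the
  keyword "end".*)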

fun add_tok toks kind n (cs, cs', n') =
  ((kind, implode cs, n) :: toks, cs', n');

(*text of the current line, for error messages*)
val take_line = implode o fst o take_prefix (not_equal "\n");


fun tokenize lex str =
  let
    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
      | scan (toks, "\"" :: cs, n) =
          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
      | scan (toks, "{" :: "|" :: cs, n) =
          scan (add_tok toks Verbatim n (verbatim cs n))
      | scan (toks, "(" :: "*" :: cs, n) =
          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
      | scan (toks, chs as c :: cs, n) =
          if is_blank c then scan (toks, cs, inc_line n c)
          else
            (case scan_word lex chs of
              (None, _) => lex_err n (quote (take_line chs))
              (*everything after the keyword "ML" is one verbatim token*)
            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
  in
    scan ([], explode str, 1)
  end;
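
(*Illustration only, not part of the original file: line numbers start at 1 and
  everything following the keyword "ML" becomes a single verbatim token.
  Assuming the keyword reading of "ML" wins over the identifier reading,

    tokenize (make_lexicon ["ML", "="]) "A = B\nML \"some text\""

  may be expected to yield
    [(Ident, "A", 1), (Keyword, "=", 1), (Ident, "B", 1),
     (Keyword, "ML", 2), (Verbatim, " \"some text\"", 2),
     (EOF, "end-of-file", 2)]*)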


end;