wenzelm@388: (* Title: Pure/Thy/thy_scan.ML wenzelm@388: ID: $Id$ wenzelm@388: Author: Markus Wenzel, TU Muenchen wenzelm@388: wenzelm@388: The scanner for theory files. wenzelm@388: *) wenzelm@388: wenzelm@388: signature THY_SCAN = paulson@1512: sig wenzelm@388: datatype token_kind = wenzelm@388: Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF wenzelm@388: val name_of_kind: token_kind -> string wenzelm@388: type lexicon wenzelm@388: val make_lexicon: string list -> lexicon wenzelm@388: val tokenize: lexicon -> string -> (token_kind * string * int) list paulson@1512: end; wenzelm@388: wenzelm@388: paulson@1512: structure ThyScan : THY_SCAN = wenzelm@388: struct wenzelm@388: wenzelm@388: open Scanner; wenzelm@388: wenzelm@388: wenzelm@388: (** token kinds **) wenzelm@388: wenzelm@388: datatype token_kind = wenzelm@388: Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF; wenzelm@388: wenzelm@388: fun name_of_kind Keyword = "keyword" wenzelm@388: | name_of_kind Ident = "identifier" wenzelm@388: | name_of_kind LongIdent = "long identifier" wenzelm@388: | name_of_kind TypeVar = "type variable" wenzelm@388: | name_of_kind Nat = "natural number" wenzelm@388: | name_of_kind String = "string" wenzelm@388: | name_of_kind Verbatim = "verbatim text" wenzelm@388: | name_of_kind EOF = "end-of-file"; wenzelm@388: wenzelm@388: wenzelm@388: wenzelm@388: (** scanners **) wenzelm@388: wenzelm@388: fun lex_err line msg = wenzelm@388: error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg); wenzelm@388: wenzelm@388: wenzelm@388: (* misc utilities *) wenzelm@388: wenzelm@388: fun count_lines cs = wenzelm@388: foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs); wenzelm@388: wenzelm@388: fun inc_line n s = n + count_lines (explode s); wenzelm@388: wenzelm@388: fun cons_fst c (cs, x, y) = (c :: cs, x, y); wenzelm@388: wenzelm@388: wenzelm@388: (* scan strings *) wenzelm@388: wenzelm@388: val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95); wenzelm@388: wenzelm@388: val scan_dig = scan_one is_digit; wenzelm@388: wenzelm@388: val scan_blanks1 = scan_any1 is_blank >> implode; wenzelm@388: wenzelm@388: val scan_esc = wenzelm@388: $$ "n" || $$ "t" || $$ "\"" || $$ "\\" || wenzelm@388: scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig || wenzelm@388: scan_blanks1 ^^ $$ "\\"; wenzelm@388: wenzelm@2387: fun warn_unprintable c n = wenzelm@2387: (if not (is_printable c) then wenzelm@2387: warning ("Unprintable char \"\\" ^ string_of_int (ord c) ^ wenzelm@2387: "\" in string at line " ^ string_of_int n) wenzelm@2387: else (); c); wenzelm@388: wenzelm@388: fun string ("\"" :: cs) n = (["\""], cs, n) wenzelm@388: | string ("\\" :: cs) n = wenzelm@388: (case optional scan_esc cs of wenzelm@388: (None, _) => lex_err n "bad escape sequence in string" wenzelm@388: | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s))) wenzelm@2204: | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1)) clasohm@712: | string (c :: cs) n = wenzelm@2204: if is_blank c then cons_fst " " (string cs n) wenzelm@2387: else cons_fst (warn_unprintable c n) (string cs n) wenzelm@388: | string [] n = lex_err n "missing quote at end of string"; wenzelm@388: wenzelm@388: wenzelm@388: (* scan verbatim text *) wenzelm@388: wenzelm@388: fun verbatim ("|" :: "}" :: cs) n = ([], cs, n) wenzelm@388: | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c)) wenzelm@388: | verbatim [] n = lex_err n "missing end of verbatim text"; wenzelm@388: wenzelm@388: wenzelm@388: (* scan nested comments *) wenzelm@388: wenzelm@388: fun comment ("*" :: ")" :: cs) n 1 = (cs, n) wenzelm@388: | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1) wenzelm@388: | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1) wenzelm@388: | comment (c :: cs) n d = comment cs (inc_line n c) d wenzelm@388: | comment [] n _ = lex_err n "missing end of comment"; wenzelm@388: wenzelm@388: wenzelm@388: wenzelm@388: (** tokenize **) wenzelm@388: wenzelm@388: val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode); wenzelm@388: wenzelm@388: fun scan_word lex = wenzelm@388: max_of wenzelm@388: (scan_literal lex >> pair Keyword) wenzelm@388: (scan_longid >> pair LongIdent || wenzelm@388: scan_id >> pair Ident || wenzelm@388: scan_tid >> pair TypeVar || wenzelm@388: scan_nat >> pair Nat); wenzelm@388: wenzelm@388: fun add_tok toks kind n (cs, cs', n') = wenzelm@388: ((kind, implode cs, n) :: toks, cs', n'); wenzelm@388: wenzelm@388: val take_line = implode o fst o take_prefix (not_equal "\n"); wenzelm@388: wenzelm@388: wenzelm@388: fun tokenize lex str = wenzelm@388: let wenzelm@388: fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks) wenzelm@388: | scan (toks, "\"" :: cs, n) = wenzelm@388: scan (add_tok toks String n (cons_fst "\"" (string cs n))) wenzelm@388: | scan (toks, "{" :: "|" :: cs, n) = wenzelm@388: scan (add_tok toks Verbatim n (verbatim cs n)) wenzelm@388: | scan (toks, "(" :: "*" :: cs, n) = wenzelm@388: scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1)) wenzelm@388: | scan (toks, chs as c :: cs, n) = wenzelm@388: if is_blank c then scan (toks, cs, inc_line n c) wenzelm@388: else wenzelm@388: (case scan_word lex chs of wenzelm@388: (None, _) => lex_err n (quote (take_line chs)) wenzelm@388: | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n) wenzelm@388: :: (Keyword, "ML", n) :: toks, [], n + count_lines chs') wenzelm@388: | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n)); wenzelm@388: in wenzelm@388: scan ([], explode str, 1) wenzelm@388: end; wenzelm@388: wenzelm@388: wenzelm@388: end; wenzelm@388: