| author | paulson |
| date | Fri, 14 Jun 1996 12:25:19 +0200 |
| changeset 1799 | 1b4d20a06ba0 |
| parent 1512 | ce37c64244c0 |
| child 2151 | 000767995143 |
| permissions | -rw-r--r-- |
(*  Title:      Pure/Thy/thy_scan.ML
    ID:         $Id$
    Author:     Markus Wenzel, TU Muenchen

The scanner for theory files.
*)

signature THY_SCAN =
sig
  datatype token_kind =
    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
  val name_of_kind: token_kind -> string
  type lexicon
  val make_lexicon: string list -> lexicon
  val tokenize: lexicon -> string -> (token_kind * string * int) list
end;

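(*Each token produced by tokenize is a triple (kind, text, line number);
  line numbering starts at 1 and a final EOF token is always appended.*)
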
structure ThyScan : THY_SCAN =
struct

open Scanner;


(** token kinds **)

datatype token_kind =
  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;

fun name_of_kind Keyword = "keyword"
  | name_of_kind Ident = "identifier"
  | name_of_kind LongIdent = "long identifier"
  | name_of_kind TypeVar = "type variable"
  | name_of_kind Nat = "natural number"
  | name_of_kind String = "string"
  | name_of_kind Verbatim = "verbatim text"
  | name_of_kind EOF = "end-of-file";


(** scanners **)

fun lex_err line msg =
  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);


(* misc utilities *)

fun count_lines cs =
  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);

fun inc_line n s = n + count_lines (explode s);

fun cons_fst c (cs, x, y) = (c :: cs, x, y);

(* scan strings *)

val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);

val scan_dig = scan_one is_digit;

val scan_blanks1 = scan_any1 is_blank >> implode;

val scan_esc =
  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
  scan_blanks1 ^^ $$ "\\";
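(*Escape sequences accepted after a backslash: n, t, ", \, a control
  escape ^C with C in the range "@".."_", a three-digit decimal code, or
  a run of blanks (newlines included) closed by another backslash.*)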

fun string ("\"" :: cs) n = (["\""], cs, n)
  | string ("\\" :: cs) n =
      (case optional scan_esc cs of
        (None, _) => lex_err n "bad escape sequence in string"
      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
  | string (c :: cs) n =
      if c = "\n" then string cs (n + 1)
      else cons_fst c (string cs n)
  | string [] n = lex_err n "missing quote at end of string";
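(*Note: a bare newline inside a string advances the line count but is
  dropped from the token text; escape sequences are kept verbatim,
  including the leading backslash.*)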


(* scan verbatim text *)

fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
  | verbatim [] n = lex_err n "missing end of verbatim text";
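(*The delimiters are not part of the token text: the opening "{|" is
  consumed by tokenize below, and the closing "|}" is dropped here.*)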


(* scan nested comments *)

fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
  | comment (c :: cs) n d = comment cs (inc_line n c) d
  | comment [] n _ = lex_err n "missing end of comment";
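(*Comments nest: d counts the currently open "(*" .. "*)" pairs, and
  comments produce no token at all.*)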



(** tokenize **)

val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode);

fun scan_word lex =
  max_of
    (scan_literal lex >> pair Keyword)
    (scan_longid >> pair LongIdent ||
      scan_id >> pair Ident ||
      scan_tid >> pair TypeVar ||
      scan_nat >> pair Nat);

fun add_tok toks kind n (cs, cs', n') =
  ((kind, implode cs, n) :: toks, cs', n');

val take_line = implode o fst o take_prefix (not_equal "\n");

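(*Main scanning loop.  Note the special case for the keyword "ML": once it
  is seen, the whole rest of the input is returned as a single Verbatim
  token, so the ML section of a theory file is not scanned any further.*)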
fun tokenize lex str =
  let
    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
      | scan (toks, "\"" :: cs, n) =
          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
      | scan (toks, "{" :: "|" :: cs, n) =
          scan (add_tok toks Verbatim n (verbatim cs n))
      | scan (toks, "(" :: "*" :: cs, n) =
          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
      | scan (toks, chs as c :: cs, n) =
          if is_blank c then scan (toks, cs, inc_line n c)
          else
            (case scan_word lex chs of
              (None, _) => lex_err n (quote (take_line chs))
            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
  in
    scan ([], explode str, 1)
  end;


end;
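
(*A minimal usage sketch (illustrative only; the keyword list below is
  hypothetical, real keywords come from the theory file syntax):

    val lex = ThyScan.make_lexicon ["=", "+", "end"];
    val toks = ThyScan.tokenize lex "Foo = Bar + Baz";

  This should yield, roughly,
    [(Ident, "Foo", 1), (Keyword, "=", 1), (Ident, "Bar", 1),
     (Keyword, "+", 1), (Ident, "Baz", 1), (EOF, "end-of-file", 1)].*)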