author:       wenzelm
date:         Fri, 10 Oct 1997 15:48:43 +0200
changeset:    3831 45e2e7ba31b8
parent:       2564 9d66b758bce5
child:        4705 f71017706887
permissions:  -rw-r--r--

(*  Title:      Pure/Thy/thy_scan.ML
    ID:         $Id$
    Author:     Markus Wenzel, TU Muenchen

The scanner for theory files.
*)

signature THY_SCAN =
sig
  datatype token_kind =
    Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
  val name_of_kind: token_kind -> string
  type lexicon
  val make_lexicon: string list -> lexicon
  val tokenize: lexicon -> string -> (token_kind * string * int) list
end;

structure ThyScan : THY_SCAN =
struct

open Scanner;


(** token kinds **)

datatype token_kind =
  Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;

fun name_of_kind Keyword = "keyword"
  | name_of_kind Ident = "identifier"
  | name_of_kind LongIdent = "long identifier"
  | name_of_kind TypeVar = "type variable"
  | name_of_kind Nat = "natural number"
  | name_of_kind String = "string"
  | name_of_kind Verbatim = "verbatim text"
  | name_of_kind EOF = "end-of-file";



(** scanners **)

fun lex_err line msg =
  error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);


(* misc utilities *)

fun count_lines cs =
  foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);

fun inc_line n s = n + count_lines (explode s);

fun cons_fst c (cs, x, y) = (c :: cs, x, y);


(* scan strings *)

val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);

val scan_dig = scan_one is_digit;

val scan_blanks1 = scan_any1 is_blank >> implode;

val scan_esc =
  $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
  scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
  scan_blanks1 ^^ $$ "\\";

fun string ("\"" :: cs) n = (["\""], cs, n)
  | string ("\\" :: cs) n =
      (case optional scan_esc cs of
        (None, _) => lex_err n "bad escape sequence in string"
      | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
  | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
  | string (c :: cs) n =
      if is_blank c then cons_fst " " (string cs n)
      else cons_fst c (string cs n)
  | string [] n = lex_err n "missing quote at end of string";


(* scan verbatim text *)

fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
  | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
  | verbatim [] n = lex_err n "missing end of verbatim text";


(* scan nested comments *)

fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
  | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
  | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
  | comment (c :: cs) n d = comment cs (inc_line n c) d
  | comment [] n _ = lex_err n "missing end of comment";



(** tokenize **)

fun scan_word lex =
  max_of
    (scan_literal lex >> pair Keyword)
    (scan_longid >> pair LongIdent ||
      scan_id >> pair Ident ||
      scan_tid >> pair TypeVar ||
      scan_nat >> pair Nat);

fun add_tok toks kind n (cs, cs', n') =
  ((kind, implode cs, n) :: toks, cs', n');

val take_line = implode o fst o take_prefix (not_equal "\n");


fun tokenize lex str =
  let
    fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
      | scan (toks, "\"" :: cs, n) =
          scan (add_tok toks String n (cons_fst "\"" (string cs n)))
      | scan (toks, "{" :: "|" :: cs, n) =
          scan (add_tok toks Verbatim n (verbatim cs n))
      | scan (toks, "(" :: "*" :: cs, n) =
          scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
      | scan (toks, chs as c :: cs, n) =
          if is_blank c then scan (toks, cs, inc_line n c)
          else
            (case scan_word lex chs of
              (None, _) => lex_err n (quote (take_line chs))
            | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
                :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
            | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
  in
    scan ([], explode str, 1)
  end;


end;
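
(*  Usage sketch -- a minimal illustration, assuming the Scanner module
    (opened above) supplies make_lexicon, and that the keyword list below is
    merely an example a caller might declare:

      val lex = ThyScan.make_lexicon ["=", "+", "end"];
      val toks = ThyScan.tokenize lex "MyThy = Pure + Foo";

    toks is then a list of (token_kind, text, line) triples, e.g. starting
    with (Ident, "MyThy", 1), (Keyword, "=", 1), ... and always ending in
    (EOF, "end-of-file", n).
*)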