24579
|
1 |
(* Title: Pure/ML/ml_lex.ML
|
|
2 |
ID: $Id$
|
|
3 |
Author: Makarius
|
|
4 |
|
|
5 |
Lexical syntax for SML.
|
|
6 |
*)
|
|
7 |
|
|
8 |
signature ML_LEX =
|
|
9 |
sig
|
|
10 |
datatype token_kind =
|
24596
|
11 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
|
24579
|
12 |
Space | Comment | Error of string | EOF
|
|
13 |
eqtype token
|
27732
|
14 |
val stopper: token Scan.stopper
|
24596
|
15 |
val is_regular: token -> bool
|
|
16 |
val is_improper: token -> bool
|
|
17 |
val pos_of: token -> string
|
24579
|
18 |
val kind_of: token -> token_kind
|
|
19 |
val val_of: token -> string
|
|
20 |
val keywords: string list
|
24596
|
21 |
val source: (Symbol.symbol, 'a) Source.source ->
|
|
22 |
(token, int * (Symbol.symbol, 'a) Source.source) Source.source
|
24579
|
23 |
end;
|
|
24 |
|
|
25 |
structure ML_Lex: ML_LEX =
|
|
26 |
struct
|
|
27 |
|
|
28 |
(** tokens **)
|
|
29 |
|
|
30 |
(* datatype token *)
|
|
31 |
|
|
32 |
datatype token_kind =
|
24596
|
33 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
|
24579
|
34 |
Space | Comment | Error of string | EOF;
|
|
35 |
|
|
36 |
datatype token = Token of int * (token_kind * string);
|
|
37 |
|
|
38 |
|
24596
|
39 |
(* control tokens *)
|
24579
|
40 |
|
|
41 |
val eof = Token (0, (EOF, ""));
|
|
42 |
|
|
43 |
fun is_eof (Token (_, (EOF, _))) = true
|
|
44 |
| is_eof _ = false;
|
|
45 |
|
27732
|
46 |
val stopper = Scan.stopper (K eof) is_eof;
|
24579
|
47 |
|
|
48 |
|
24596
|
49 |
fun is_regular (Token (_, (Error _, _))) = false
|
|
50 |
| is_regular (Token (_, (EOF, _))) = false
|
|
51 |
| is_regular _ = true;
|
|
52 |
|
|
53 |
fun is_improper (Token (_, (Space, _))) = true
|
|
54 |
| is_improper (Token (_, (Comment, _))) = true
|
|
55 |
| is_improper _ = false;
|
|
56 |
|
|
57 |
|
|
58 |
(* token content *)
|
|
59 |
|
|
60 |
fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
|
24579
|
61 |
|
|
62 |
fun kind_of (Token (_, (k, _))) = k;
|
|
63 |
fun val_of (Token (_, (_, x))) = x;
|
|
64 |
|
|
65 |
fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
|
|
66 |
|
|
67 |
|
|
68 |
|
|
69 |
(** scanners **)
|
|
70 |
|
|
71 |
(* diagnostics *)
|
|
72 |
|
|
73 |
fun lex_err msg ((n, cs), _) = "SML lexical error (line " ^ string_of_int n ^ "): " ^ msg cs;
|
|
74 |
fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
|
|
75 |
|
|
76 |
|
|
77 |
(* line numbering *)
|
|
78 |
|
24596
|
79 |
fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
|
24579
|
80 |
val keep_line = Scan.lift;
|
|
81 |
|
|
82 |
val scan_blank =
|
|
83 |
incr_line ($$ "\n") ||
|
|
84 |
keep_line (Scan.one Symbol.is_ascii_blank);
|
|
85 |
|
|
86 |
val scan_blanks = Scan.repeat scan_blank >> implode;
|
|
87 |
val scan_blanks1 = Scan.repeat1 scan_blank >> implode;
|
|
88 |
|
|
89 |
|
|
90 |
(* keywords *)
|
|
91 |
|
|
92 |
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
|
|
93 |
"=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
|
|
94 |
"case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
|
|
95 |
"fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
|
|
96 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
|
|
97 |
"sharing", "sig", "signature", "struct", "structure", "then", "type",
|
|
98 |
"val", "where", "while", "with", "withtype"];
|
|
99 |
|
|
100 |
val scan_keyword = Scan.literal (Scan.make_lexicon (map explode keywords)) >> implode;
|
|
101 |
|
|
102 |
|
|
103 |
(* identifiers *)
|
|
104 |
|
24596
|
105 |
local
|
|
106 |
|
24579
|
107 |
val scan_letdigs =
|
|
108 |
Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
|
|
109 |
|
|
110 |
val scan_alphanumeric = Scan.one Symbol.is_ascii_letter ^^ scan_letdigs;
|
|
111 |
|
|
112 |
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
|
|
113 |
|
24596
|
114 |
in
|
|
115 |
|
24579
|
116 |
val scan_ident = scan_alphanumeric || scan_symbolic;
|
|
117 |
|
|
118 |
val scan_longident =
|
|
119 |
(Scan.repeat1 (scan_alphanumeric ^^ $$ ".") >> implode) ^^ (scan_ident || $$ "=");
|
|
120 |
|
|
121 |
val scan_typevar = $$ "'" ^^ scan_letdigs;
|
|
122 |
|
24596
|
123 |
end;
|
24579
|
124 |
|
|
125 |
|
|
126 |
(* numerals *)
|
|
127 |
|
24596
|
128 |
local
|
|
129 |
|
24579
|
130 |
val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
|
|
131 |
val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
|
|
132 |
val scan_sign = Scan.optional ($$ "~") "";
|
|
133 |
val scan_decint = scan_sign ^^ scan_dec;
|
|
134 |
|
24596
|
135 |
in
|
|
136 |
|
24579
|
137 |
val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
|
|
138 |
|
|
139 |
val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
|
|
140 |
|
|
141 |
val scan_exp = ($$ "E" || $$ "e") ^^ scan_decint;
|
|
142 |
|
|
143 |
val scan_real =
|
|
144 |
scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
|
|
145 |
scan_decint ^^ scan_exp;
|
|
146 |
|
24596
|
147 |
end;
|
|
148 |
|
24579
|
149 |
|
|
150 |
(* chars and strings *)
|
|
151 |
|
24596
|
152 |
local
|
|
153 |
|
|
154 |
val scan_escape =
|
|
155 |
Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
|
|
156 |
$$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
|
|
157 |
Scan.one Symbol.is_ascii_digit ^^
|
|
158 |
Scan.one Symbol.is_ascii_digit ^^
|
|
159 |
Scan.one Symbol.is_ascii_digit;
|
|
160 |
|
|
161 |
val scan_str =
|
|
162 |
keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
|
|
163 |
keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
|
|
164 |
|
24579
|
165 |
val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
|
|
166 |
val scan_gaps = Scan.repeat scan_gap >> implode;
|
|
167 |
|
24596
|
168 |
in
|
24579
|
169 |
|
|
170 |
val scan_char =
|
|
171 |
keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
|
|
172 |
|
|
173 |
val scan_string =
|
|
174 |
keep_line ($$ "\"") ^^
|
24596
|
175 |
!!! "missing quote at end of string"
|
|
176 |
((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
|
|
177 |
|
|
178 |
end;
|
24579
|
179 |
|
|
180 |
|
|
181 |
(* scan nested comments *)
|
|
182 |
|
24596
|
183 |
local
|
|
184 |
|
24579
|
185 |
val scan_cmt =
|
|
186 |
Scan.lift scan_blank ||
|
24596
|
187 |
Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
|
24579
|
188 |
Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
|
|
189 |
Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
|
|
190 |
Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
|
|
191 |
|
24596
|
192 |
in
|
|
193 |
|
24579
|
194 |
val scan_comment =
|
|
195 |
keep_line ($$ "(" ^^ $$ "*") ^^
|
24596
|
196 |
!!! "missing end of comment"
|
|
197 |
(Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
|
|
198 |
|
|
199 |
end;
|
24579
|
200 |
|
|
201 |
|
|
202 |
(* scan token *)
|
|
203 |
|
|
204 |
val scan =
|
|
205 |
let
|
|
206 |
val scanner = Scan.state :|-- (fn n =>
|
|
207 |
let
|
|
208 |
fun token k x = Token (n, (k, x));
|
|
209 |
in
|
|
210 |
scan_char >> token Char ||
|
|
211 |
scan_string >> token String ||
|
|
212 |
scan_blanks1 >> token Space ||
|
|
213 |
scan_comment >> token Comment ||
|
|
214 |
keep_line (Scan.max token_leq
|
|
215 |
(scan_keyword >> token Keyword)
|
24596
|
216 |
(scan_word >> token Word ||
|
|
217 |
scan_real >> token Real ||
|
|
218 |
scan_int >> token Int ||
|
|
219 |
scan_longident >> token LongIdent ||
|
24579
|
220 |
scan_ident >> token Ident ||
|
24596
|
221 |
scan_typevar >> token TypeVar))
|
24579
|
222 |
end);
|
|
223 |
in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
|
|
224 |
|
|
225 |
|
24596
|
226 |
(* token source *)
|
24579
|
227 |
|
|
228 |
local
|
|
229 |
|
|
230 |
val is_junk = (not o Symbol.is_blank) andf Symbol.is_regular;
|
|
231 |
|
|
232 |
fun recover msg = Scan.state :|-- (fn n =>
|
|
233 |
keep_line (Scan.many is_junk) >> (fn cs => [Token (n, (Error msg, implode cs))]));
|
|
234 |
|
|
235 |
in
|
|
236 |
|
24596
|
237 |
fun source src =
|
|
238 |
Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
|
24579
|
239 |
|
|
240 |
end;
|
|
241 |
|
24596
|
242 |
end;
|
24579
|
243 |
|