24579
|
1 |
(* Title: Pure/ML/ml_lex.ML
|
|
2 |
ID: $Id$
|
|
3 |
Author: Makarius
|
|
4 |
|
|
5 |
Lexical syntax for SML.
|
|
6 |
*)
|
|
7 |
|
|
8 |
signature ML_LEX =
|
|
9 |
sig
|
|
10 |
datatype token_kind =
|
|
11 |
Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
|
|
12 |
Space | Comment | Error of string | EOF
|
|
13 |
eqtype token
|
|
14 |
val str_of_kind: token_kind -> string
|
|
15 |
val stopper: token * (token -> bool)
|
|
16 |
val not_eof: token -> bool
|
|
17 |
val kind_of: token -> token_kind
|
|
18 |
val is_kind: token_kind -> token -> bool
|
|
19 |
val is_proper: token -> bool
|
|
20 |
val val_of: token -> string
|
|
21 |
val !!! : string -> (int * 'a -> 'b) -> int * 'a -> 'b
|
|
22 |
val keywords: string list
|
|
23 |
val scan: int * string list -> token * (int * string list)
|
|
24 |
val source: bool option -> (string, 'a) Source.source ->
|
|
25 |
(token, int * (string, 'a) Source.source) Source.source
|
|
26 |
val source_proper: (token, 'a) Source.source ->
|
|
27 |
(token, (token, 'a) Source.source) Source.source
|
|
28 |
end;
|
|
29 |
|
|
30 |
structure ML_Lex: ML_LEX =
|
|
31 |
struct
|
|
32 |
|
|
33 |
(** tokens **)
|
|
34 |
|
|
35 |
(* datatype token *)
|
|
36 |
|
|
37 |
datatype token_kind =
|
|
38 |
Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
|
|
39 |
Space | Comment | Error of string | EOF;
|
|
40 |
|
|
41 |
datatype token = Token of int * (token_kind * string);
|
|
42 |
|
|
43 |
val str_of_kind =
|
|
44 |
fn Keyword => "keyword"
|
|
45 |
| Ident => "identifier"
|
|
46 |
| LongIdent => "long identifier"
|
|
47 |
| TypeVar => "type variable"
|
|
48 |
| Selector => "record selector"
|
|
49 |
| Word => "word"
|
|
50 |
| Int => "integer"
|
|
51 |
| Real => "real"
|
|
52 |
| Char => "character"
|
|
53 |
| String => "string"
|
|
54 |
| Space => "white space"
|
|
55 |
| Comment => "comment"
|
|
56 |
| Error _ => "bad input"
|
|
57 |
| EOF => "end-of-file";
|
|
58 |
|
|
59 |
|
|
60 |
(* end-of-file *)
|
|
61 |
|
|
62 |
val eof = Token (0, (EOF, ""));
|
|
63 |
|
|
64 |
fun is_eof (Token (_, (EOF, _))) = true
|
|
65 |
| is_eof _ = false;
|
|
66 |
|
|
67 |
val stopper = (eof, is_eof);
|
|
68 |
val not_eof = not o is_eof;
|
|
69 |
|
|
70 |
|
|
71 |
(* token kind *)
|
|
72 |
|
|
73 |
fun kind_of (Token (_, (k, _))) = k;
|
|
74 |
|
|
75 |
fun is_kind k (Token (_, (k', _))) = k = k';
|
|
76 |
|
|
77 |
fun is_proper (Token (_, (Space, _))) = false
|
|
78 |
| is_proper (Token (_, (Comment, _))) = false
|
|
79 |
| is_proper _ = true;
|
|
80 |
|
|
81 |
|
|
82 |
(* token value *)
|
|
83 |
|
|
84 |
fun val_of (Token (_, (_, x))) = x;
|
|
85 |
|
|
86 |
fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
|
|
87 |
|
|
88 |
|
|
89 |
|
|
90 |
(** scanners **)
|
|
91 |
|
|
92 |
(* diagnostics *)
|
|
93 |
|
|
94 |
fun lex_err msg ((n, cs), _) = "SML lexical error (line " ^ string_of_int n ^ "): " ^ msg cs;
|
|
95 |
fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
|
|
96 |
|
|
97 |
|
|
98 |
(* line numbering *)
|
|
99 |
|
|
100 |
fun incr_line scan = Scan.depend (fn n => scan >> pair (n + 1));
|
|
101 |
val keep_line = Scan.lift;
|
|
102 |
|
|
103 |
val scan_blank =
|
|
104 |
incr_line ($$ "\n") ||
|
|
105 |
keep_line (Scan.one Symbol.is_ascii_blank);
|
|
106 |
|
|
107 |
val scan_blanks = Scan.repeat scan_blank >> implode;
|
|
108 |
val scan_blanks1 = Scan.repeat1 scan_blank >> implode;
|
|
109 |
|
|
110 |
|
|
111 |
(* keywords *)
|
|
112 |
|
|
113 |
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
|
|
114 |
"=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
|
|
115 |
"case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
|
|
116 |
"fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
|
|
117 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
|
|
118 |
"sharing", "sig", "signature", "struct", "structure", "then", "type",
|
|
119 |
"val", "where", "while", "with", "withtype"];
|
|
120 |
|
|
121 |
val scan_keyword = Scan.literal (Scan.make_lexicon (map explode keywords)) >> implode;
|
|
122 |
|
|
123 |
|
|
124 |
(* identifiers *)
|
|
125 |
|
|
126 |
val scan_letdigs =
|
|
127 |
Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
|
|
128 |
|
|
129 |
val scan_alphanumeric = Scan.one Symbol.is_ascii_letter ^^ scan_letdigs;
|
|
130 |
|
|
131 |
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
|
|
132 |
|
|
133 |
val scan_ident = scan_alphanumeric || scan_symbolic;
|
|
134 |
|
|
135 |
val scan_longident =
|
|
136 |
(Scan.repeat1 (scan_alphanumeric ^^ $$ ".") >> implode) ^^ (scan_ident || $$ "=");
|
|
137 |
|
|
138 |
val scan_typevar = $$ "'" ^^ scan_letdigs;
|
|
139 |
|
|
140 |
|
|
141 |
(* selectors -- not fully standard conformant *)
|
|
142 |
|
|
143 |
val scan_numeric =
|
|
144 |
Scan.one (member (op =) (explode "123456789")) ^^ (Scan.many Symbol.is_ascii_digit >> implode);
|
|
145 |
|
|
146 |
val scan_selector =
|
|
147 |
keep_line ($$ "#") ^^ scan_blanks ^^
|
|
148 |
!!! "malformed record selector" (keep_line (scan_numeric || scan_alphanumeric || scan_symbolic));
|
|
149 |
|
|
150 |
|
|
151 |
(* numerals *)
|
|
152 |
|
|
153 |
val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
|
|
154 |
val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
|
|
155 |
val scan_sign = Scan.optional ($$ "~") "";
|
|
156 |
val scan_decint = scan_sign ^^ scan_dec;
|
|
157 |
|
|
158 |
val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
|
|
159 |
|
|
160 |
val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
|
|
161 |
|
|
162 |
val scan_exp = ($$ "E" || $$ "e") ^^ scan_decint;
|
|
163 |
|
|
164 |
val scan_real =
|
|
165 |
scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
|
|
166 |
scan_decint ^^ scan_exp;
|
|
167 |
|
|
168 |
|
|
169 |
(* chars and strings *)
|
|
170 |
|
|
171 |
val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
|
|
172 |
val scan_gaps = Scan.repeat scan_gap >> implode;
|
|
173 |
|
|
174 |
val scan_str =
|
|
175 |
scan_blank ||
|
|
176 |
scan_gap ||
|
|
177 |
keep_line ($$ "\\") |-- !!! "bad escape character in string"
|
|
178 |
(scan_blank || keep_line ($$ "\"" || $$ "\\")) ||
|
|
179 |
keep_line (Scan.one (fn s => s <> "\"" andalso s <> "\\" andalso Symbol.is_regular s));
|
|
180 |
|
|
181 |
val scan_char =
|
|
182 |
keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
|
|
183 |
|
|
184 |
val scan_string =
|
|
185 |
keep_line ($$ "\"") ^^
|
|
186 |
!!! "missing quote at end of string" ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
|
|
187 |
|
|
188 |
|
|
189 |
(* scan nested comments *)
|
|
190 |
|
|
191 |
val scan_cmt =
|
|
192 |
Scan.lift scan_blank ||
|
|
193 |
Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
|
|
194 |
Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
|
|
195 |
Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
|
|
196 |
Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
|
|
197 |
|
|
198 |
val scan_comment =
|
|
199 |
keep_line ($$ "(" ^^ $$ "*") ^^
|
|
200 |
!!! "missing end of comment"
|
|
201 |
(Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
|
|
202 |
|
|
203 |
|
|
204 |
(* scan token *)
|
|
205 |
|
|
206 |
val scan =
|
|
207 |
let
|
|
208 |
val scanner = Scan.state :|-- (fn n =>
|
|
209 |
let
|
|
210 |
fun token k x = Token (n, (k, x));
|
|
211 |
in
|
|
212 |
scan_char >> token Char ||
|
|
213 |
scan_selector >> token Selector ||
|
|
214 |
scan_string >> token String ||
|
|
215 |
scan_blanks1 >> token Space ||
|
|
216 |
scan_comment >> token Comment ||
|
|
217 |
keep_line (Scan.max token_leq
|
|
218 |
(scan_keyword >> token Keyword)
|
|
219 |
(scan_longident >> token LongIdent ||
|
|
220 |
scan_ident >> token Ident ||
|
|
221 |
scan_typevar >> token TypeVar ||
|
|
222 |
scan_real >> token Real ||
|
|
223 |
scan_word >> token Word ||
|
|
224 |
scan_int >> token Int))
|
|
225 |
end);
|
|
226 |
in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
|
|
227 |
|
|
228 |
|
|
229 |
(* token sources *)
|
|
230 |
|
|
231 |
local
|
|
232 |
|
|
233 |
val is_junk = (not o Symbol.is_blank) andf Symbol.is_regular;
|
|
234 |
|
|
235 |
fun recover msg = Scan.state :|-- (fn n =>
|
|
236 |
keep_line (Scan.many is_junk) >> (fn cs => [Token (n, (Error msg, implode cs))]));
|
|
237 |
|
|
238 |
in
|
|
239 |
|
|
240 |
fun source do_recover src =
|
|
241 |
Source.source' 1 Symbol.stopper (Scan.bulk scan) (Option.map (rpair recover) do_recover) src;
|
|
242 |
|
|
243 |
end;
|
|
244 |
|
|
245 |
fun source_proper src = src |> Source.filter is_proper;
|
|
246 |
|
|
247 |
|
|
248 |
end;
|