author | wenzelm |
Fri, 20 Mar 2009 20:22:13 +0100 | |
changeset 30614 | 845bcfd3e9cd |
parent 30600 | de241396389c |
child 30636 | bd8813d7774d |
permissions | -rw-r--r-- |
24579 | 1 |
(* Title: Pure/ML/ml_lex.ML |
2 |
Author: Makarius |
|
3 |
||
4 |
Lexical syntax for SML. |
|
5 |
*) |
|
6 |
||
7 |
signature ML_LEX = |
|
8 |
sig |
|
9 |
datatype token_kind = |
|
24596 | 10 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String | |
24579 | 11 |
Space | Comment | Error of string | EOF |
12 |
eqtype token |
|
27732 | 13 |
val stopper: token Scan.stopper |
24596 | 14 |
val is_regular: token -> bool |
15 |
val is_improper: token -> bool |
|
16 |
val pos_of: token -> string |
|
24579 | 17 |
val kind_of: token -> token_kind |
27817 | 18 |
val content_of: token -> string |
30614 | 19 |
val markup_of: token -> Markup.T |
20 |
val report_token: token -> unit |
|
24579 | 21 |
val keywords: string list |
30593 | 22 |
val scan_antiq: Symbol_Pos.T list -> token Antiquote.antiquote * Symbol_Pos.T list |
24596 | 23 |
val source: (Symbol.symbol, 'a) Source.source -> |
30573 | 24 |
(token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) |
27772 | 25 |
Source.source) Source.source |
30591 | 26 |
val tokenize: string -> token list |
24579 | 27 |
end; |
28 |
||
29 |
structure ML_Lex: ML_LEX = |
|
30 |
struct |
|
31 |
||
32 |
(** tokens **) |
|
33 |
||
34 |
(* datatype token *) |
|
35 |
||
36 |
datatype token_kind = |
|
24596 | 37 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String | |
24579 | 38 |
Space | Comment | Error of string | EOF; |
39 |
||
27772 | 40 |
datatype token = Token of Position.range * (token_kind * string); |
41 |
||
42 |
||
43 |
(* position *) |
|
44 |
||
45 |
fun position_of (Token ((pos, _), _)) = pos; |
|
46 |
fun end_position_of (Token ((_, pos), _)) = pos; |
|
47 |
||
48 |
val pos_of = Position.str_of o position_of; |
|
24579 | 49 |
|
50 |
||
24596 | 51 |
(* control tokens *) |
24579 | 52 |
|
27772 | 53 |
fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); |
54 |
val eof = mk_eof Position.none; |
|
24579 | 55 |
|
56 |
fun is_eof (Token (_, (EOF, _))) = true |
|
57 |
| is_eof _ = false; |
|
58 |
||
27772 | 59 |
val stopper = |
60 |
Scan.stopper (fn [] => eof | toks => mk_eof (end_position_of (List.last toks))) is_eof; |
|
61 |
||
24579 | 62 |
|
27772 | 63 |
(* token content *) |
64 |
||
27817 | 65 |
fun content_of (Token (_, (_, x))) = x; |
66 |
fun token_leq (tok, tok') = content_of tok <= content_of tok'; |
|
27772 | 67 |
|
68 |
fun kind_of (Token (_, (k, _))) = k; |
|
24579 | 69 |
|
24596 | 70 |
fun is_regular (Token (_, (Error _, _))) = false |
71 |
| is_regular (Token (_, (EOF, _))) = false |
|
72 |
| is_regular _ = true; |
|
73 |
||
74 |
fun is_improper (Token (_, (Space, _))) = true |
|
75 |
| is_improper (Token (_, (Comment, _))) = true |
|
76 |
| is_improper _ = false; |
|
77 |
||
78 |
||
30614 | 79 |
(* markup *) |
80 |
||
81 |
val markup_of = kind_of #> |
|
82 |
(fn Keyword => Markup.ML_keyword |
|
83 |
| Ident => Markup.ML_ident |
|
84 |
| LongIdent => Markup.ML_ident |
|
85 |
| TypeVar => Markup.ML_tvar |
|
86 |
| Word => Markup.ML_numeral |
|
87 |
| Int => Markup.ML_numeral |
|
88 |
| Real => Markup.ML_numeral |
|
89 |
| Char => Markup.ML_char |
|
90 |
| String => Markup.ML_string |
|
91 |
| Space => Markup.none |
|
92 |
| Comment => Markup.ML_comment |
|
93 |
| Error _ => Markup.ML_malformed |
|
94 |
| EOF => Markup.none); |
|
95 |
||
96 |
fun report_token tok = Position.report (markup_of tok) (position_of tok); |
|
97 |
||
98 |
||
24579 | 99 |
|
100 |
(** scanners **) |
|
101 |
||
30573 | 102 |
open Basic_Symbol_Pos; |
24579 | 103 |
|
30573 | 104 |
fun !!! msg = Symbol_Pos.!!! ("SML lexical error: " ^ msg); |
24579 | 105 |
|
106 |
||
27772 | 107 |
(* blanks *) |
24579 | 108 |
|
27772 | 109 |
val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol); |
110 |
val scan_blanks1 = Scan.repeat1 scan_blank; |
|
24579 | 111 |
|
112 |
||
113 |
(* keywords *) |
|
114 |
||
115 |
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", |
|
116 |
"=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", |
|
117 |
"case", "datatype", "do", "else", "end", "eqtype", "exception", "fn", |
|
118 |
"fun", "functor", "handle", "if", "in", "include", "infix", "infixr", |
|
119 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec", |
|
120 |
"sharing", "sig", "signature", "struct", "structure", "then", "type", |
|
121 |
"val", "where", "while", "with", "withtype"]; |
|
122 |
||
27772 | 123 |
val lex = Scan.make_lexicon (map explode keywords); |
124 |
fun scan_keyword x = Scan.literal lex x; |
|
24579 | 125 |
|
126 |
||
127 |
(* identifiers *) |
|
128 |
||
24596 | 129 |
local |
130 |
||
24579 | 131 |
val scan_letdigs = |
27772 | 132 |
Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol); |
24579 | 133 |
|
27772 | 134 |
val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::; |
24579 | 135 |
|
27772 | 136 |
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol); |
24579 | 137 |
|
24596 | 138 |
in |
139 |
||
24579 | 140 |
val scan_ident = scan_alphanumeric || scan_symbolic; |
141 |
||
142 |
val scan_longident = |
|
27772 | 143 |
(Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "="); |
24579 | 144 |
|
27772 | 145 |
val scan_typevar = $$$ "'" @@@ scan_letdigs; |
24579 | 146 |
|
24596 | 147 |
end; |
24579 | 148 |
|
149 |
||
150 |
(* numerals *) |
|
151 |
||
24596 | 152 |
local |
153 |
||
27772 | 154 |
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol); |
155 |
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol); |
|
156 |
val scan_sign = Scan.optional ($$$ "~") []; |
|
157 |
val scan_decint = scan_sign @@@ scan_dec; |
|
24579 | 158 |
|
24596 | 159 |
in |
160 |
||
27772 | 161 |
val scan_word = |
162 |
$$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || |
|
163 |
$$$ "0" @@@ $$$ "w" @@@ scan_dec; |
|
24579 | 164 |
|
27772 | 165 |
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); |
24579 | 166 |
|
27772 | 167 |
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; |
24579 | 168 |
|
169 |
val scan_real = |
|
27772 | 170 |
scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || |
171 |
scan_decint @@@ scan_exp; |
|
24579 | 172 |
|
24596 | 173 |
end; |
174 |
||
24579 | 175 |
|
176 |
(* chars and strings *) |
|
177 |
||
24596 | 178 |
local |
179 |
||
180 |
val scan_escape = |
|
27772 | 181 |
Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single || |
182 |
$$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) || |
|
183 |
Scan.one (Symbol.is_ascii_digit o symbol) -- |
|
184 |
Scan.one (Symbol.is_ascii_digit o symbol) -- |
|
185 |
Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]); |
|
24596 | 186 |
|
187 |
val scan_str = |
|
30600
de241396389c
allow non-printable symbols within string tokens;
wenzelm
parents:
30593
diff
changeset
|
188 |
Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso |
de241396389c
allow non-printable symbols within string tokens;
wenzelm
parents:
30593
diff
changeset
|
189 |
(not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || |
27772 | 190 |
$$$ "\\" @@@ !!! "bad escape character in string" scan_escape; |
24596 | 191 |
|
27772 | 192 |
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; |
193 |
val scan_gaps = Scan.repeat scan_gap >> flat; |
|
24579 | 194 |
|
24596 | 195 |
in |
24579 | 196 |
|
197 |
val scan_char = |
|
27772 | 198 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; |
24579 | 199 |
|
200 |
val scan_string = |
|
27772 | 201 |
$$$ "\"" @@@ !!! "missing quote at end of string" |
202 |
((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\""); |
|
24596 | 203 |
|
204 |
end; |
|
24579 | 205 |
|
206 |
||
24596 | 207 |
(* token source *) |
24579 | 208 |
|
209 |
local |
|
210 |
||
30593 | 211 |
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); |
24579 | 212 |
|
30593 | 213 |
val scan_ml = |
27772 | 214 |
(scan_char >> token Char || |
215 |
scan_string >> token String || |
|
216 |
scan_blanks1 >> token Space || |
|
30573 | 217 |
Symbol_Pos.scan_comment !!! >> token Comment || |
27772 | 218 |
Scan.max token_leq |
219 |
(scan_keyword >> token Keyword) |
|
220 |
(scan_word >> token Word || |
|
221 |
scan_real >> token Real || |
|
222 |
scan_int >> token Int || |
|
223 |
scan_longident >> token LongIdent || |
|
224 |
scan_ident >> token Ident || |
|
225 |
scan_typevar >> token TypeVar)); |
|
226 |
||
227 |
fun recover msg = |
|
228 |
Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol) |
|
229 |
>> (fn cs => [token (Error msg) cs]); |
|
24579 | 230 |
|
231 |
in |
|
232 |
||
30593 | 233 |
val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text; |
234 |
||
24596 | 235 |
fun source src = |
30573 | 236 |
Symbol_Pos.source (Position.line 1) src |
30593 | 237 |
|> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover)); |
30591 | 238 |
|
239 |
val tokenize = Source.of_string #> source #> Source.exhaust; |
|
24579 | 240 |
|
241 |
end; |
|
242 |
||
24596 | 243 |
end; |
24579 | 244 |