author | wenzelm |
Sun, 07 Jan 2018 21:04:51 +0100 | |
changeset 67365 | fb539f83683a |
parent 67364 | f74672cf83c6 |
child 67366 | e2575ccc0f5c |
permissions | -rw-r--r-- |
55497 | 1 |
/* Title: Pure/ML/ml_lex.scala |
2 |
Author: Makarius |
|
3 |
||
59109 | 4 |
Lexical syntax for Isabelle/ML and Standard ML. |
55497 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
55499 | 9 |
|
10 |
import scala.collection.mutable |
|
64824 | 11 |
import scala.util.parsing.input.Reader |
55497 | 12 |
|
13 |
||
14 |
object ML_Lex |
|
15 |
{ |
|
55505 | 16 |
/** keywords **/ |
17 |
||
18 |
/* all keywords: symbolic words first, then alphabetic words */
val keywords: Set[String] =
{
  val symbolic_words =
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
      "[", "]", "_", "{", "|", "}")

  val alphabetic_words =
    Set("abstype", "and", "andalso", "as",
      "case", "datatype", "do", "else", "end", "eqtype", "exception",
      "fn", "fun", "functor", "handle", "if", "in", "include",
      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
      "orelse", "raise", "rec", "sharing", "sig", "signature",
      "struct", "structure", "then", "type", "val", "where", "while",
      "with", "withtype")

  symbolic_words ++ alphabetic_words
}

/* second selection of keywords (all members also occur in keywords);
   NOTE(review): its consumers are not visible in this file */
val keywords2: Set[String] =
  Set(
    "and", "case", "do", "else", "end",
    "if", "in", "let", "local", "of",
    "sig", "struct", "then", "while", "with")

/* third selection of keywords (all members also occur in keywords);
   NOTE(review): its consumers are not visible in this file */
val keywords3: Set[String] = Set("handle", "open", "raise")
|
34 |
||
35 |
/* lexicon built from all keywords, used by the keyword parser via literal(lexicon) */
private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)
|
36 |
||
37 |
||
38 |
||
55497 | 39 |
/** tokens **/ |
40 |
||
41 |
/* token kinds: the string given to each Value is its printed name */
object Kind extends Enumeration
{
  /* names and literals */
  val KEYWORD: Value = Value("keyword")
  val IDENT: Value = Value("identifier")
  val LONG_IDENT: Value = Value("long identifier")
  val TYPE_VAR: Value = Value("type variable")
  val WORD: Value = Value("word")
  val INT: Value = Value("integer")
  val REAL: Value = Value("real")
  val CHAR: Value = Value("character")
  val STRING: Value = Value("quoted string")

  /* white space and comments */
  val SPACE: Value = Value("white space")
  val COMMENT: Value = Value("comment text")
  val COMMENT_CARTOUCHE: Value = Value("comment cartouche")

  /* control symbols and antiquotations */
  val CONTROL: Value = Value("control symbol antiquotation")
  val ANTIQ: Value = Value("antiquotation")
  val ANTIQ_START: Value = Value("antiquotation: start")
  val ANTIQ_STOP: Value = Value("antiquotation: stop")
  val ANTIQ_OTHER: Value = Value("antiquotation: other")
  val ANTIQ_STRING: Value = Value("antiquotation: quoted string")
  val ANTIQ_ALT_STRING: Value = Value("antiquotation: back-quoted string")
  val ANTIQ_CARTOUCHE: Value = Value("antiquotation: text cartouche")

  /* malformed input */
  val ERROR: Value = Value("bad input")
}
|
65 |
||
60215 | 66 |
/* token: kind together with its source text */
sealed case class Token(kind: Kind.Value, source: String)
{
  def is_keyword: Boolean = kind == Kind.KEYWORD

  /* keyword whose source is not an ASCII identifier */
  def is_delimiter: Boolean =
    if (is_keyword) !Symbol.is_ascii_identifier(source) else false

  def is_space: Boolean = kind == Kind.SPACE

  /* both comment kinds count as comment */
  def is_comment: Boolean =
    kind match {
      case Kind.COMMENT | Kind.COMMENT_CARTOUCHE => true
      case _ => false
    }
}
55497 | 73 |
|
74 |
||
75 |
||
76 |
/** parsers **/ |
|
77 |
||
55510
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
wenzelm
parents:
55505
diff
changeset
|
78 |
/* line context: unfinished ML string, as produced and consumed by ml_string_line */
case object ML_String extends Scan.Line_Context
/* line context: within an antiquotation, wrapping the context of its body */
case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context
55499 | 80 |
|
55512 | 81 |
private object Parsers extends Scan.Parsers with Antiquote.Parsers |
55497 | 82 |
{ |
83 |
/* string material */ |
|
84 |
||
55500 | 85 |
/* ASCII blanks: possibly empty vs. non-empty */
private val blanks: Parser[String] = many(character(Symbol.is_ascii_blank))
private val blanks1: Parser[String] = many1(character(Symbol.is_ascii_blank))

/* gap: backslash, non-empty blanks, backslash */
private val gap: Parser[String] =
  "\\" ~ blanks1 ~ "\\" ^^ { case open ~ spaces ~ close => open + spaces + close }

/* start of gap: backslash and blanks, followed by the end of input (not part of the result) */
private val gap_start: Parser[String] =
  "\\" ~ blanks ~ """\z""".r ^^ { case open ~ spaces ~ _ => open + spaces }

/* material after a backslash: single escape character, caret with control character,
   or exactly three decimal digits */
private val escape: Parser[String] =
  one(character(c => "\"\\abtnvfr".contains(c))) |
  "^" ~ one(character(c => c >= '@' && c <= '_')) ^^ { case caret ~ ctrl => caret + ctrl } |
  repeated(character(Symbol.is_ascii_digit), 3, 3)

/* single string element: printable ASCII other than quote and backslash,
   symbolic or control symbol, or backslash escape */
private val str: Parser[String] =
  one(character(c => ' ' <= c && c <= '~' && !(c == '"' || c == '\\'))) |
  one(s => Symbol.is_symbolic(s) | Symbol.is_control(s)) |
  "\\" ~ escape ^^ { case backslash ~ esc => backslash + esc }
100 |
||
55499 | 101 |
|
102 |
/* ML char -- without gaps */ |
|
103 |
||
104 |
/* complete character literal: #" followed by one string element and closing quote */
private val ml_char: Parser[Token] =
  "#\"" ~ str ~ "\"" ^^
    { case open ~ body ~ close => Token(Kind.CHAR, open + body + close) }

/* unfinished character literal: #" with optional string element, without closing quote */
private val recover_ml_char: Parser[String] =
  "#\"" ~ opt(str) ^^ { case open ~ body => open + body.getOrElse("") }
|
109 |
||
110 |
||
111 |
/* ML string */ |
|
112 |
||
113 |
/* string content: any sequence of gaps and string elements, concatenated */
private val ml_string_body: Parser[String] =
  rep(gap | str) ^^ (parts => parts.mkString)

/* unfinished string: opening quote and content, without closing quote */
private val recover_ml_string: Parser[String] =
  "\"" ~ ml_string_body ^^ { case open ~ body => open + body }

/* complete string literal, including both quotes */
private val ml_string: Parser[Token] =
  "\"" ~ ml_string_body ~ "\"" ^^
    { case open ~ body ~ close => Token(Kind.STRING, open + body + close) }
|
121 |
||
55510
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
wenzelm
parents:
55505
diff
changeset
|
122 |
/* line-oriented ML string: a string may stop at the start of a gap (context ML_String)
   and is resumed in that context on a later call; other contexts fail */
private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
{
  def string_token(text: String, next: Scan.Line_Context): (Token, Scan.Line_Context) =
    (Token(Kind.STRING, text), next)

  /* a closing quote finishes the string, otherwise it remains open */
  def next_context(end: String): Scan.Line_Context =
    if (end == "\"") Scan.Finished else ML_String

  def string_end: Parser[String] = "\"" | gap_start

  ctxt match {
    case Scan.Finished =>
      "\"" ~ ml_string_body ~ string_end ^^
        { case open ~ body ~ end => string_token(open + body + end, next_context(end)) }
    case ML_String =>
      /* NOTE(review): opt_term is defined outside this file -- it presumably yields None
         only at the end of input; confirm against Scan.Parsers */
      blanks ~ opt_term("\\" ~ ml_string_body ~ string_end) ^^
        {
          case spaces ~ Some(backslash ~ body ~ end) =>
            string_token(spaces + backslash + body + end, next_context(end))
          case spaces ~ None => string_token(spaces, ML_String)
        }
    case _ => failure("")
  }
}
|
138 |
||
139 |
||
140 |
/* ML comment */ |
|
141 |
||
142 |
/* complete comment as COMMENT token */
private val ml_comment: Parser[Token] =
  comment ^^ (Token(Kind.COMMENT, _))

/* line-oriented comment: COMMENT token with the resulting line context */
private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
  comment_line(ctxt) ^^ (res => (Token(Kind.COMMENT, res._1), res._2))
55497 | 147 |
|
67365
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
wenzelm
parents:
67364
diff
changeset
|
148 |
/* complete comment cartouche as COMMENT_CARTOUCHE token */
private val ml_comment_cartouche: Parser[Token] =
  comment_cartouche ^^ (Token(Kind.COMMENT_CARTOUCHE, _))

/* line-oriented comment cartouche: COMMENT_CARTOUCHE token with the resulting line context */
private def ml_comment_cartouche_line(ctxt: Scan.Line_Context)
  : Parser[(Token, Scan.Line_Context)] =
  comment_cartouche_line(ctxt) ^^ (res => (Token(Kind.COMMENT_CARTOUCHE, res._1), res._2))
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
wenzelm
parents:
67364
diff
changeset
|
154 |
|
55497 | 155 |
|
156 |
/* tokens other than characters, strings and comments: white space, control symbols,
   unfinished literals (as ERROR), antiquotations, numerals, names, keywords;
   a single arbitrary symbol becomes ERROR as last resort */
private def other_token: Parser[Token] =
{
  /* identifiers */

  val letdigs: Parser[String] = many(character(Symbol.is_ascii_letdig))

  val alphanumeric: Parser[String] =
    one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case head ~ tail => head + tail }

  val symbolic: Parser[String] =
    many1(character(c => "!#$%&*+-/:<=>?@\\^`|~".contains(c)))

  val ident: Parser[Token] = (alphanumeric | symbolic) ^^ (Token(Kind.IDENT, _))

  /* alphanumeric name with trailing dot */
  val qualifier: Parser[String] = alphanumeric ~ "." ^^ { case name ~ dot => name + dot }

  val long_ident: Parser[Token] =
    rep1(qualifier) ~ (alphanumeric | (symbolic | "=")) ^^
      { case prefix ~ base => Token(Kind.LONG_IDENT, prefix.mkString + base) }

  val type_var: Parser[Token] =
    "'" ~ letdigs ^^ { case quote ~ name => Token(Kind.TYPE_VAR, quote + name) }


  /* numerals */

  val dec: Parser[String] = many1(character(Symbol.is_ascii_digit))
  val hex: Parser[String] = many1(character(Symbol.is_ascii_hex))

  /* optional sign "~", empty if absent */
  val sign: Parser[String] = opt("~") ^^ (_.getOrElse(""))

  val decint: Parser[String] = sign ~ dec ^^ { case sgn ~ digits => sgn + digits }
  val exp: Parser[String] = ("E" | "e") ~ decint ^^ { case marker ~ num => marker + num }

  val word: Parser[Token] =
  {
    val hex_word = "0wx" ~ hex ^^ { case prefix ~ digits => prefix + digits }
    val dec_word = "0w" ~ dec ^^ { case prefix ~ digits => prefix + digits }
    (hex_word | dec_word) ^^ (Token(Kind.WORD, _))
  }

  val int: Parser[Token] =
  {
    val hex_num = "0x" ~ hex ^^ { case prefix ~ digits => prefix + digits }
    sign ~ (hex_num | dec) ^^ { case sgn ~ magnitude => Token(Kind.INT, sgn + magnitude) }
  }

  /* signed decimal with optional "/" denominator */
  val rat: Parser[String] =
    decint ~ opt("/" ~ dec) ^^
      { case num ~ denom => num + denom.fold("") { case slash ~ digits => slash + digits } }

  val real: Parser[Token] =
  {
    val opt_exp = opt(exp) ^^ (_.getOrElse(""))
    val with_fraction =
      decint ~ "." ~ dec ~ opt_exp ^^ { case num ~ dot ~ frac ~ e => num + dot + frac + e }
    val with_exponent = decint ~ exp ^^ { case num ~ e => num + e }
    (with_fraction | with_exponent) ^^ (Token(Kind.REAL, _))
  }


  /* main */

  val space: Parser[Token] = blanks1 ^^ (Token(Kind.SPACE, _))

  val keyword: Parser[Token] = literal(lexicon) ^^ (Token(Kind.KEYWORD, _))

  val ml_control: Parser[Token] = control ^^ (Token(Kind.CONTROL, _))

  /* "@" with rat numeral, or regular antiquotation */
  val ml_antiq: Parser[Token] =
    "@" ~ rat ^^ { case at ~ num => Token(Kind.ANTIQ, at + num) } |
    antiq ^^ (Token(Kind.ANTIQ, _))

  val bad: Parser[Token] = one(_ => true) ^^ (Token(Kind.ERROR, _))

  val recover: Parser[Token] =
    (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^
      (Token(Kind.ERROR, _))

  /* numerals and names compete with keywords via ||| */
  val name_or_keyword: Parser[Token] =
    (word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword

  space | (ml_control | (recover | (ml_antiq | (name_or_keyword | bad))))
}
222 |
||
55499 | 223 |
|
55512 | 224 |
/* antiquotations (line-oriented) */ |
225 |
||
67095
91ffe1f8bf5c
proper treatment of multi-line cartouche as rudiment of antiquotation, e.g. relevant for syntax-highlighting in Isabelle/jEdit;
wenzelm
parents:
64824
diff
changeset
|
226 |
/* line-oriented cartouche: ANTIQ_CARTOUCHE token with the resulting line context */
def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
  cartouche_line(ctxt) ^^ (res => (Token(Kind.ANTIQ_CARTOUCHE, res._1), res._2))
91ffe1f8bf5c
proper treatment of multi-line cartouche as rudiment of antiquotation, e.g. relevant for syntax-highlighting in Isabelle/jEdit;
wenzelm
parents:
64824
diff
changeset
|
228 |
|
55512 | 229 |
/* start of antiquotation "@{": only in context Scan.Finished,
   the result context is Antiq(Scan.Finished) */
def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
  if (ctxt == Scan.Finished)
    "@{" ^^ (open => (Token(Kind.ANTIQ_START, open), Antiq(Scan.Finished)))
  else failure("")
|
234 |
||
235 |
/* end of antiquotation "}": only in context Antiq(Scan.Finished),
   the result context is Scan.Finished */
def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
  if (ctxt == Antiq(Scan.Finished))
    "}" ^^ (close => (Token(Kind.ANTIQ_STOP, close), Scan.Finished))
  else failure("")
|
240 |
||
241 |
/* material within an antiquotation: only in context Antiq(ctxt), where line-oriented
   strings and cartouches continue in ctxt and re-wrap their result context in Antiq */
def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
{
  type Result = (Token, Scan.Line_Context)

  context match {
    case Antiq(ctxt) =>
      def nested(kind: Kind.Value, p: Parser[(String, Scan.Line_Context)]): Parser[Result] =
        p ^^ { case (text, next) => (Token(kind, text), Antiq(next)) }

      /* other material is only accepted outside of unfinished strings or cartouches,
         and leaves the context unchanged */
      val other: Parser[Result] =
        if (ctxt == Scan.Finished)
          antiq_other ^^ (text => (Token(Kind.ANTIQ_OTHER, text), context))
        else failure("")

      other |
      nested(Kind.ANTIQ_STRING, quoted_line("\"", ctxt)) |
      nested(Kind.ANTIQ_ALT_STRING, quoted_line("`", ctxt)) |
      nested(Kind.ANTIQ_CARTOUCHE, cartouche_line(ctxt))
    case _ => failure("")
  }
}
|
251 |
||
252 |
||
55499 | 253 |
/* token */ |
254 |
||
67365
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
wenzelm
parents:
67364
diff
changeset
|
255 |
/* single token: character, string, comment, comment cartouche, then any other token */
def token: Parser[Token] =
{
  def rest: Parser[Token] = ml_comment | (ml_comment_cartouche | other_token)

  ml_char | (ml_string | rest)
}
55499 | 257 |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55512
diff
changeset
|
258 |
/* single token within a line, relative to the given line context; with SML = true
   the parsers for comment cartouches, cartouches and antiquotations are left out */
def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
{
  type Result = (Token, Scan.Line_Context)

  /* tokens without line structure always end in context Scan.Finished */
  val other: Parser[Result] = (ml_char | other_token) ^^ (tok => (tok, Scan.Finished))

  def antiquotes: Parser[Result] =
    ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))

  def cartouches: Parser[Result] =
    ml_comment_cartouche_line(ctxt) | (ml_cartouche_line(ctxt) | antiquotes)

  def rest: Parser[Result] = if (SML) other else cartouches

  ml_string_line(ctxt) | (ml_comment_line(ctxt) | rest)
}
55497 | 271 |
} |
272 |
||
55499 | 273 |
|
274 |
/* tokenize */ |
|
275 |
||
55497 | 276 |
/* tokenize complete input: all of it has to be consumed by a sequence of tokens,
   otherwise error is raised with the input text */
def tokenize(input: CharSequence): List[Token] =
{
  val all_tokens = Parsers.rep(Parsers.token)
  val result = Parsers.parseAll(all_tokens, Scan.char_reader(input))

  result match {
    case Parsers.Success(tokens, _) => tokens
    case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
  }
}
|
55499 | 281 |
|
56278
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents:
55512
diff
changeset
|
282 |
/* tokenize one line relative to the given line context: the result consists of the
   tokens in order of occurrence and the context reached at the end of the input */
def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
  : (List[Token], Scan.Line_Context) =
{
  /* rev_toks accumulates tokens in reverse order */
  @scala.annotation.tailrec
  def scan(in: Reader[Char], ctxt: Scan.Line_Context, rev_toks: List[Token])
    : (List[Token], Scan.Line_Context) =
  {
    if (in.atEnd) (rev_toks.reverse, ctxt)
    else {
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
        case Parsers.Success((tok, next), rest) => scan(rest, next, tok :: rev_toks)
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
  }

  scan(Scan.char_reader(input), context, Nil)
}
|
55497 | 297 |
} |