/*  Title:      Pure/ML/ml_lex.scala
    Author:     Makarius

Lexical syntax for SML.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.input.{Reader, CharSequenceReader}


/* Tokenizer for Standard ML source text: token kinds plus scanner
   combinators built on the project-local Scan.Parsers infrastructure. */
object ML_Lex
{
  /** tokens **/

  // Token classification; the Value strings are descriptive labels
  // (e.g. suitable for messages), not the token text itself.
  object Kind extends Enumeration
  {
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val TYPE_VAR = Value("type variable")
    val WORD = Value("word")
    val INT = Value("integer")
    val REAL = Value("real")
    val CHAR = Value("character")
    val STRING = Value("quoted string")
    val SPACE = Value("white space")
    val COMMENT = Value("comment text")
    val ERROR = Value("bad input")
  }

  // A scanned token: its kind and the exact source text it covers.
  sealed case class Token(val kind: Kind.Value, val source: String)
  {
    def is_keyword: Boolean = kind == Kind.KEYWORD
    // Symbolic keyword, i.e. a keyword that is not an ASCII identifier
    // (Symbol.is_ascii_identifier is project-defined).
    def is_operator: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
  }



  /** parsers **/

  // Scan context marking "inside an unfinished quoted string"; threaded
  // through token_context so scanning can resume on the next input chunk.
  case object ML_String extends Scan.Context

  // Reserved words and symbolic delimiters of SML.
  private val lexicon =
    Scan.Lexicon("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
      "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
      "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
      "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
      "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
      "sharing", "sig", "signature", "struct", "structure", "then", "type",
      "val", "where", "while", "with", "withtype")

  private object Parsers extends Scan.Parsers
  {
    /* string material */

    private val blanks = many(character(Symbol.is_ascii_blank))
    private val blanks1 = many1(character(Symbol.is_ascii_blank))

    // SML string "gap": backslash, white space, backslash — ignored content
    // that lets a string literal span lines.
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
    // A gap whose closing backslash lies beyond this input chunk
    // ("""\z""" matches end of input) — forces continuation in ML_String context.
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }

    // Escape sequence after a backslash: single-character escape,
    // control escape "^C" with '@' <= C <= '_', or exactly 3 decimal digits.
    private val escape =
      one(character("\"\\abtnvfr".contains(_))) |
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
      repeated(character(Symbol.is_ascii_digit), 3, 3)

    // One string element: an Isabelle symbol (Symbol.is_symbolic is
    // project-defined), a printable ASCII char other than quote/backslash,
    // or a backslash escape.
    private val str =
      one(Symbol.is_symbolic) |
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
      "\\" ~ escape ^^ { case x ~ y => x + y }


    /* ML char -- without gaps */

    private val ml_char: Parser[Token] =
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }

    // Error recovery: swallow a partial char literal (possibly just "#\"").
    private val recover_ml_char: Parser[String] =
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }


    /* ML string */

    private val ml_string_body: Parser[String] =
      rep(gap | str) ^^ (_.mkString)

    // Error recovery: an opened but unterminated string.
    private val recover_ml_string: Parser[String] =
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }

    private val ml_string: Parser[Token] =
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }

    // Incremental string scanning across input chunks: the returned context
    // records whether the string literal terminated (Scan.Finished) or a
    // gap runs past the end of this chunk (ML_String).
    private def ml_string_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
    {
      def result(x: String, c: Scan.Context) = (Token(Kind.STRING, x), c)

      ctxt match {
        case Scan.Finished =>
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
        case ML_String =>
          // Resuming inside a gap: skip blanks, then optionally close the gap
          // with "\\" and continue the string body (opt_term tolerates end of
          // input -- presumably project-defined; see Scan.Parsers).
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
            { case x ~ Some(y ~ z ~ w) =>
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
              case x ~ None => result(x, ML_String) }
        case _ => failure("")
      }
    }


    /* ML comment */

    // Comment scanning (comment / comment_context inherited from Scan.Parsers).
    private val ml_comment: Parser[Token] =
      comment ^^ (x => Token(Kind.COMMENT, x))

    private def ml_comment_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
      comment_context(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT, x), c) }


    /* delimited token */

    private def delimited_token: Parser[Token] =
      ml_char | (ml_string | ml_comment)

    // Partial delimited tokens become ERROR tokens instead of scan failures.
    private val recover_delimited: Parser[Token] =
      (recover_ml_char | (recover_ml_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))


    // All non-delimited tokens: identifiers, numerals, keywords, white space;
    // any single remaining character is classified as ERROR.
    private def other_token: Parser[Token] =
    {
      /* identifiers */

      val letdigs = many(character(Symbol.is_ascii_letdig))

      val alphanumeric =
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }

      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))

      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))

      // Qualified name: one or more "structure." prefixes, then a final
      // identifier (alphanumeric, symbolic, or bare "=").
      val long_ident =
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
          (alphanumeric | (symbolic | "=")) ^^
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }

      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }


      /* numerals */

      val dec = many1(character(Symbol.is_ascii_digit))
      val hex = many1(character(Symbol.is_ascii_hex))
      // SML negation sign is "~", optional.
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }

      // Word literals: "0w" decimal or "0wx" hexadecimal (the longer prefix
      // is tried first).
      val word =
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
          (x => Token(Kind.WORD, x))

      val int =
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
          { case x ~ y => Token(Kind.INT, x + y) }

      // Real literals require a fractional part or an exponent (or both);
      // a bare decint stays an INT.
      val real =
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
          { case x ~ y ~ z ~ w => x + y + z + w } |
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))


      /* main */

      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))

      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))

      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))

      // NOTE: "|||" takes the longest match between name/numeral tokens and
      // the keyword table, so e.g. symbolic identifiers win over keyword
      // prefixes; alternation order is significant throughout.
      space | (recover_delimited |
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))
    }


    /* token */

    def token: Parser[Token] = delimited_token | other_token

    // One token under an explicit scan context; strings and comments may
    // leave a non-Finished context for the next chunk of input.
    def token_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
    {
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))

      ml_string_context(ctxt) | (ml_comment_context(ctxt) | other)
    }
  }


  /* tokenize */

  // Scan complete input into a token list; the grammar always consumes
  // something (ERROR recovery), so parseAll failure indicates an internal
  // problem and is reported via the project-local error(...).
  def tokenize(input: CharSequence): List[Token] =
  {
    Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
      case Parsers.Success(tokens, _) => tokens
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
    }
  }

  // Scan one chunk of input under the given context (e.g. one source line),
  // returning the tokens together with the context to carry into the next
  // chunk.  Local mutability (reader position, buffer, context) is confined
  // to this method.
  def tokenize_context(input: CharSequence, context: Scan.Context): (List[Token], Scan.Context) =
  {
    var in: Reader[Char] = new CharSequenceReader(input)
    val toks = new mutable.ListBuffer[Token]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.token_context(ctxt), in) match {
        case Parsers.Success((x, c), rest) => { toks += x; ctxt = c; in = rest }
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
    (toks.toList, ctxt)
  }
}
|
|
226 |
|