| author | desharna |
| date | Mon, 06 Oct 2014 13:42:48 +0200 |
| changeset 58586 | 1b11669a5c66 |
| parent 56278 | 2576d3a40ed6 |
| child 58933 | 6585e59aee3e |
| permissions | -rw-r--r-- |

/*  Title:      Pure/ML/ml_lex.scala
    Author:     Makarius

Lexical syntax for SML.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.input.{Reader, CharSequenceReader}


object ML_Lex
{
  /** keywords **/

  val keywords: Set[String] =
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
      "case", "datatype", "do", "else", "end", "eqtype", "exception",
      "fn", "fun", "functor", "handle", "if", "in", "include",
      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
      "orelse", "raise", "rec", "sharing", "sig", "signature",
      "struct", "structure", "then", "type", "val", "where", "while",
      "with", "withtype")

  val keywords2: Set[String] =
    Set("case", "do", "else", "end", "if", "in", "let", "local", "of",
      "sig", "struct", "then", "while", "with")

  val keywords3: Set[String] =
    Set("handle", "open", "raise")

  private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)



  /** tokens **/

  object Kind extends Enumeration
  {
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val TYPE_VAR = Value("type variable")
    val WORD = Value("word")
    val INT = Value("integer")
    val REAL = Value("real")
    val CHAR = Value("character")
    val STRING = Value("quoted string")
    val SPACE = Value("white space")
    val COMMENT = Value("comment text")
    val ANTIQ = Value("antiquotation")
    val ANTIQ_START = Value("antiquotation: start")
    val ANTIQ_STOP = Value("antiquotation: stop")
    val ANTIQ_OTHER = Value("antiquotation: other")
    val ANTIQ_STRING = Value("antiquotation: quoted string")
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
    val ERROR = Value("bad input")
  }

  sealed case class Token(val kind: Kind.Value, val source: String)
  {
    def is_keyword: Boolean = kind == Kind.KEYWORD
    def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
  }
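
  /* Token classification, illustrated (a sketch inferred from the parsers below,
     not part of the original source):

       fun  =>  andalso      Kind.KEYWORD
       map  +                Kind.IDENT
       List.map              Kind.LONG_IDENT
       'a                    Kind.TYPE_VAR
       0wx1A                 Kind.WORD
       ~42  0x2F             Kind.INT
       ~1.5E2                Kind.REAL
       #"a"                  Kind.CHAR
       "hello\n"             Kind.STRING
       (* increment x *)     Kind.COMMENT
  */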


  /** parsers **/

  case object ML_String extends Scan.Line_Context
  case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context

  private object Parsers extends Scan.Parsers with Antiquote.Parsers
  {
    /* string material */

    private val blanks = many(character(Symbol.is_ascii_blank))
    private val blanks1 = many1(character(Symbol.is_ascii_blank))

    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }

    private val escape =
      one(character("\"\\abtnvfr".contains(_))) |
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
      repeated(character(Symbol.is_ascii_digit), 3, 3)

    private val str =
      one(Symbol.is_symbolic) |
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
      "\\" ~ escape ^^ { case x ~ y => x + y }


    /* ML char -- without gaps */

    private val ml_char: Parser[Token] =
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }

    private val recover_ml_char: Parser[String] =
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }


    /* ML string */

    private val ml_string_body: Parser[String] =
      rep(gap | str) ^^ (_.mkString)

    private val recover_ml_string: Parser[String] =
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }

    private val ml_string: Parser[Token] =
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }

    private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)

      ctxt match {
        case Scan.Finished =>
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
        case ML_String =>
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
            { case x ~ Some(y ~ z ~ w) =>
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
              case x ~ None => result(x, ML_String) }
        case _ => failure("")
      }
    }
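
    /* Illustrative note (a sketch, not part of the original source): an SML
       string literal may span lines via a backslash "gap", e.g.

         val s = "foo\
                 \bar";

       On the first line, ml_string_line returns the partial string as a STRING
       token together with context ML_String; the continuation line closes the
       string and switches back to Scan.Finished. */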


    /* ML comment */

    private val ml_comment: Parser[Token] =
      comment ^^ (x => Token(Kind.COMMENT, x))

    private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT, x), c) }


    /* delimited token */

    private def delimited_token: Parser[Token] =
      ml_char | (ml_string | ml_comment)

    private val recover_delimited: Parser[Token] =
      (recover_ml_char | (recover_ml_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))


    private def other_token: Parser[Token] =
    {
      /* identifiers */

      val letdigs = many(character(Symbol.is_ascii_letdig))

      val alphanumeric =
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }

      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))

      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))

      val long_ident =
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
          (alphanumeric | (symbolic | "=")) ^^
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }

      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }


      /* numerals */

      val dec = many1(character(Symbol.is_ascii_digit))
      val hex = many1(character(Symbol.is_ascii_hex))
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }

      val word =
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
          (x => Token(Kind.WORD, x))

      val int =
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
          { case x ~ y => Token(Kind.INT, x + y) }

      val real =
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
          { case x ~ y ~ z ~ w => x + y + z + w } |
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))


      /* main */

      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))

      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))

      val ml_antiq = antiq ^^ (x => Token(Kind.ANTIQ, x))

      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))

      space | (recover_delimited | (ml_antiq |
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad)))
    }


    /* antiquotations (line-oriented) */

    def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
        case _ => failure("")
      }

    def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))
        case _ => failure("")
      }

    def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      context match {
        case Antiq(ctxt) =>
          (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))
           else failure("")) |
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
        case _ => failure("")
      }


    /* token */

    def token: Parser[Token] = delimited_token | other_token

    def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))

      if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)
      else
        ml_string_line(ctxt) |
          (ml_comment_line(ctxt) |
            (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))
    }
  }


  /* tokenize */

  def tokenize(input: CharSequence): List[Token] =
  {
    Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
      case Parsers.Success(tokens, _) => tokens
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
    }
  }

  def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
    : (List[Token], Scan.Line_Context) =
  {
    var in: Reader[Char] = new CharSequenceReader(input)
    val toks = new mutable.ListBuffer[Token]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
        case Parsers.Success((x, c), rest) => { toks += x; ctxt = c; in = rest }
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
    (toks.toList, ctxt)
  }
}
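
A minimal usage sketch for the batch entry point follows. The object and method names (ML_Lex_Demo, demo_tokenize) are hypothetical, and the surrounding Isabelle/Scala environment providing Scan, Symbol, Antiquote and error is assumed to be on the classpath.

object ML_Lex_Demo
{
  // Tokenize one complete piece of ML source and print each token's kind and text.
  def demo_tokenize(): Unit =
  {
    val source = "fun inc x = x + 1  (* increment *)"
    val tokens: List[isabelle.ML_Lex.Token] = isabelle.ML_Lex.tokenize(source)
    for (tok <- tokens) println(tok.kind.toString + ": " + tok.source)
  }
}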
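The line-oriented entry point is meant to be driven one line at a time, threading the returned Scan.Line_Context into the next call so that strings and comments (and, for Isabelle/ML, antiquotations) can span lines. Below is a sketch along those lines, assuming Scan.Finished as the neutral starting context (as the parsers above use it); ML_Lex_Line_Demo and demo_lines are hypothetical names.

object ML_Lex_Line_Demo
{
  // Tokenize a buffer line by line, carrying the scanner context across lines.
  def demo_lines(): Unit =
  {
    val lines = List("val s = \"foo\\", "  \\bar\";")  // a string with a backslash gap
    var ctxt: isabelle.Scan.Line_Context = isabelle.Scan.Finished
    for (line <- lines) {
      // SML = true: treat the input as plain SML (no Isabelle antiquotations).
      val (toks, ctxt1) = isabelle.ML_Lex.tokenize_line(true, line, ctxt)
      ctxt = ctxt1
      println(toks.map(_.kind).mkString(", "))
    }
  }
}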