| author | nipkow | 
| Thu, 03 Feb 2022 10:33:55 +0100 | |
| changeset 75061 | 57df04e4f018 | 
| parent 67509 | 524dc5c2a031 | 
| child 75384 | 20093a63d03b | 
| permissions | -rw-r--r-- | 
| 55497 | 1  | 
/* Title: Pure/ML/ml_lex.scala  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 59109 | 4  | 
Lexical syntax for Isabelle/ML and Standard ML.  | 
| 55497 | 5  | 
*/  | 
6  | 
||
7  | 
package isabelle  | 
|
8  | 
||
| 55499 | 9  | 
|
10  | 
import scala.collection.mutable  | 
|
| 64824 | 11  | 
import scala.util.parsing.input.Reader  | 
| 55497 | 12  | 
|
13  | 
||
14  | 
object ML_Lex  | 
|
15  | 
{
 | 
|
| 55505 | 16  | 
/** keywords **/  | 
17  | 
||
18  | 
val keywords: Set[String] =  | 
|
19  | 
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
 | 
|
20  | 
      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
 | 
|
21  | 
"case", "datatype", "do", "else", "end", "eqtype", "exception",  | 
|
22  | 
"fn", "fun", "functor", "handle", "if", "in", "include",  | 
|
23  | 
"infix", "infixr", "let", "local", "nonfix", "of", "op", "open",  | 
|
24  | 
"orelse", "raise", "rec", "sharing", "sig", "signature",  | 
|
25  | 
"struct", "structure", "then", "type", "val", "where", "while",  | 
|
26  | 
"with", "withtype")  | 
|
27  | 
||
28  | 
val keywords2: Set[String] =  | 
|
| 58933 | 29  | 
    Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
 | 
30  | 
"of", "sig", "struct", "then", "while", "with")  | 
|
| 55505 | 31  | 
|
32  | 
val keywords3: Set[String] =  | 
|
33  | 
    Set("handle", "open", "raise")
 | 
|
34  | 
||
35  | 
private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)  | 
|
36  | 
||
37  | 
||
38  | 
||
| 55497 | 39  | 
/** tokens **/  | 
40  | 
||
41  | 
object Kind extends Enumeration  | 
|
42  | 
  {
 | 
|
43  | 
    val KEYWORD = Value("keyword")
 | 
|
44  | 
    val IDENT = Value("identifier")
 | 
|
45  | 
    val LONG_IDENT = Value("long identifier")
 | 
|
46  | 
    val TYPE_VAR = Value("type variable")
 | 
|
47  | 
    val WORD = Value("word")
 | 
|
48  | 
    val INT = Value("integer")
 | 
|
49  | 
    val REAL = Value("real")
 | 
|
50  | 
    val CHAR = Value("character")
 | 
|
51  | 
    val STRING = Value("quoted string")
 | 
|
52  | 
    val SPACE = Value("white space")
 | 
|
| 67438 | 53  | 
    val INFORMAL_COMMENT = Value("informal comment")
 | 
54  | 
    val FORMAL_COMMENT = Value("formal comment")
 | 
|
| 61471 | 55  | 
    val CONTROL = Value("control symbol antiquotation")
 | 
| 55512 | 56  | 
    val ANTIQ = Value("antiquotation")
 | 
57  | 
    val ANTIQ_START = Value("antiquotation: start")
 | 
|
58  | 
    val ANTIQ_STOP = Value("antiquotation: stop")
 | 
|
59  | 
    val ANTIQ_OTHER = Value("antiquotation: other")
 | 
|
60  | 
    val ANTIQ_STRING = Value("antiquotation: quoted string")
 | 
|
61  | 
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
 | 
|
62  | 
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
 | 
|
| 55497 | 63  | 
    val ERROR = Value("bad input")
 | 
64  | 
}  | 
|
65  | 
||
| 60215 | 66  | 
sealed case class Token(kind: Kind.Value, source: String)  | 
| 55500 | 67  | 
  {
 | 
| 55501 | 68  | 
def is_keyword: Boolean = kind == Kind.KEYWORD  | 
| 55505 | 69  | 
def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)  | 
| 63610 | 70  | 
def is_space: Boolean = kind == Kind.SPACE  | 
| 67438 | 71  | 
def is_comment: Boolean = kind == Kind.INFORMAL_COMMENT || kind == Kind.FORMAL_COMMENT  | 
| 55500 | 72  | 
}  | 
| 55497 | 73  | 
|
74  | 
||
75  | 
||
76  | 
/** parsers **/  | 
|
77  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55505 
diff
changeset
 | 
78  | 
case object ML_String extends Scan.Line_Context  | 
| 55512 | 79  | 
case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context  | 
| 55499 | 80  | 
|
| 67438 | 81  | 
private object Parsers extends Scan.Parsers with Antiquote.Parsers with Comment.Parsers  | 
| 55497 | 82  | 
  {
 | 
83  | 
/* string material */  | 
|
84  | 
||
| 55500 | 85  | 
private val blanks = many(character(Symbol.is_ascii_blank))  | 
| 55497 | 86  | 
private val blanks1 = many1(character(Symbol.is_ascii_blank))  | 
87  | 
||
| 55499 | 88  | 
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
 | 
| 55500 | 89  | 
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }
 | 
| 55499 | 90  | 
|
| 55497 | 91  | 
private val escape =  | 
92  | 
      one(character("\"\\abtnvfr".contains(_))) |
 | 
|
93  | 
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
 | 
|
94  | 
repeated(character(Symbol.is_ascii_digit), 3, 3)  | 
|
95  | 
||
96  | 
private val str =  | 
|
97  | 
one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |  | 
|
| 67509 | 98  | 
one(s =>  | 
99  | 
Symbol.is_open(s) || Symbol.is_close(s) || Symbol.is_symbolic(s) || Symbol.is_control(s)) |  | 
|
| 55497 | 100  | 
      "\\" ~ escape ^^ { case x ~ y => x + y }
 | 
101  | 
||
| 55499 | 102  | 
|
103  | 
/* ML char -- without gaps */  | 
|
104  | 
||
105  | 
private val ml_char: Parser[Token] =  | 
|
106  | 
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }
 | 
|
107  | 
||
108  | 
private val recover_ml_char: Parser[String] =  | 
|
109  | 
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }
 | 
|
110  | 
||
111  | 
||
112  | 
/* ML string */  | 
|
113  | 
||
114  | 
private val ml_string_body: Parser[String] =  | 
|
115  | 
rep(gap | str) ^^ (_.mkString)  | 
|
116  | 
||
117  | 
private val recover_ml_string: Parser[String] =  | 
|
118  | 
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }
 | 
|
119  | 
||
120  | 
private val ml_string: Parser[Token] =  | 
|
121  | 
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
 | 
|
122  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55505 
diff
changeset
 | 
123  | 
private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
| 55499 | 124  | 
    {
 | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55505 
diff
changeset
 | 
125  | 
def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)  | 
| 55499 | 126  | 
|
127  | 
      ctxt match {
 | 
|
128  | 
case Scan.Finished =>  | 
|
129  | 
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
 | 
|
130  | 
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
 | 
|
131  | 
case ML_String =>  | 
|
| 55500 | 132  | 
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
 | 
| 55499 | 133  | 
            { case x ~ Some(y ~ z ~ w) =>
 | 
134  | 
result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)  | 
|
135  | 
case x ~ None => result(x, ML_String) }  | 
|
136  | 
        case _ => failure("")
 | 
|
137  | 
}  | 
|
138  | 
}  | 
|
139  | 
||
140  | 
||
141  | 
/* ML comment */  | 
|
142  | 
||
143  | 
private val ml_comment: Parser[Token] =  | 
|
| 67438 | 144  | 
comment ^^ (x => Token(Kind.INFORMAL_COMMENT, x))  | 
| 55499 | 145  | 
|
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55505 
diff
changeset
 | 
146  | 
private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
| 67438 | 147  | 
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.INFORMAL_COMMENT, x), c) }
 | 
| 55497 | 148  | 
|
| 67438 | 149  | 
private val ml_formal_comment: Parser[Token] =  | 
150  | 
comment_cartouche ^^ (x => Token(Kind.FORMAL_COMMENT, x))  | 
|
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
151  | 
|
| 67438 | 152  | 
private def ml_formal_comment_line(ctxt: Scan.Line_Context)  | 
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
153  | 
: Parser[(Token, Scan.Line_Context)] =  | 
| 67438 | 154  | 
      comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.FORMAL_COMMENT, x), c) }
 | 
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
155  | 
|
| 55497 | 156  | 
|
| 67366 | 157  | 
/* antiquotations (line-oriented) */  | 
158  | 
||
159  | 
def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
|
160  | 
      cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), c) }
 | 
|
161  | 
||
162  | 
def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
|
163  | 
      ctxt match {
 | 
|
164  | 
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
 | 
|
165  | 
        case _ => failure("")
 | 
|
166  | 
}  | 
|
167  | 
||
168  | 
def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
|
169  | 
      ctxt match {
 | 
|
170  | 
case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))  | 
|
171  | 
        case _ => failure("")
 | 
|
172  | 
}  | 
|
173  | 
||
174  | 
def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
|
175  | 
      context match {
 | 
|
176  | 
case Antiq(ctxt) =>  | 
|
177  | 
(if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))  | 
|
178  | 
           else failure("")) |
 | 
|
179  | 
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
 | 
|
180  | 
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
 | 
|
181  | 
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
 | 
|
182  | 
        case _ => failure("")
 | 
|
183  | 
}  | 
|
184  | 
||
185  | 
||
186  | 
/* token */  | 
|
187  | 
||
| 55497 | 188  | 
private def other_token: Parser[Token] =  | 
189  | 
    {
 | 
|
190  | 
/* identifiers */  | 
|
191  | 
||
192  | 
val letdigs = many(character(Symbol.is_ascii_letdig))  | 
|
193  | 
||
194  | 
val alphanumeric =  | 
|
195  | 
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
 | 
|
196  | 
||
197  | 
      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
 | 
|
198  | 
||
199  | 
val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))  | 
|
200  | 
||
201  | 
val long_ident =  | 
|
202  | 
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
 | 
|
203  | 
(alphanumeric | (symbolic | "=")) ^^  | 
|
204  | 
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
 | 
|
205  | 
||
206  | 
      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
 | 
|
207  | 
||
208  | 
||
209  | 
/* numerals */  | 
|
210  | 
||
211  | 
val dec = many1(character(Symbol.is_ascii_digit))  | 
|
212  | 
val hex = many1(character(Symbol.is_ascii_hex))  | 
|
213  | 
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
 | 
|
214  | 
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
 | 
|
215  | 
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
 | 
|
216  | 
||
217  | 
val word =  | 
|
218  | 
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
 | 
|
219  | 
(x => Token(Kind.WORD, x))  | 
|
220  | 
||
221  | 
val int =  | 
|
222  | 
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
 | 
|
223  | 
          { case x ~ y => Token(Kind.INT, x + y) }
 | 
|
224  | 
||
| 
63204
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
225  | 
val rat =  | 
| 
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
226  | 
        decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }
 | 
| 
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
227  | 
|
| 55497 | 228  | 
val real =  | 
229  | 
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
 | 
|
230  | 
          { case x ~ y ~ z ~ w => x + y + z + w } |
 | 
|
231  | 
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
 | 
|
232  | 
||
233  | 
||
| 55499 | 234  | 
/* main */  | 
| 55497 | 235  | 
|
236  | 
val space = blanks1 ^^ (x => Token(Kind.SPACE, x))  | 
|
237  | 
||
238  | 
val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))  | 
|
239  | 
||
| 61471 | 240  | 
val ml_control = control ^^ (x => Token(Kind.CONTROL, x))  | 
| 
63204
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
241  | 
val ml_antiq =  | 
| 
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
242  | 
        "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
 | 
| 
 
921a5be54132
support rat numerals via special antiquotation syntax;
 
wenzelm 
parents: 
61596 
diff
changeset
 | 
243  | 
antiq ^^ (x => Token(Kind.ANTIQ, x))  | 
| 55512 | 244  | 
|
| 55497 | 245  | 
val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))  | 
246  | 
||
| 67364 | 247  | 
val recover =  | 
248  | 
(recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^  | 
|
249  | 
(x => Token(Kind.ERROR, x))  | 
|
250  | 
||
251  | 
space | (ml_control | (recover | (ml_antiq |  | 
|
| 61471 | 252  | 
(((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))))  | 
| 55497 | 253  | 
}  | 
254  | 
||
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
255  | 
def token: Parser[Token] =  | 
| 67438 | 256  | 
ml_char | (ml_string | (ml_comment | (ml_formal_comment | other_token)))  | 
| 55499 | 257  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55512 
diff
changeset
 | 
258  | 
def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =  | 
| 55499 | 259  | 
    {
 | 
260  | 
val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))  | 
|
261  | 
||
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55512 
diff
changeset
 | 
262  | 
if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)  | 
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
263  | 
      else {
 | 
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55512 
diff
changeset
 | 
264  | 
ml_string_line(ctxt) |  | 
| 61596 | 265  | 
(ml_comment_line(ctxt) |  | 
| 67438 | 266  | 
(ml_formal_comment_line(ctxt) |  | 
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
267  | 
(ml_cartouche_line(ctxt) |  | 
| 
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
268  | 
(ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))))  | 
| 
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
269  | 
}  | 
| 55499 | 270  | 
}  | 
| 55497 | 271  | 
}  | 
272  | 
||
| 55499 | 273  | 
|
274  | 
/* tokenize */  | 
|
275  | 
||
| 55497 | 276  | 
def tokenize(input: CharSequence): List[Token] =  | 
| 64824 | 277  | 
    Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
 | 
| 55497 | 278  | 
case Parsers.Success(tokens, _) => tokens  | 
279  | 
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
 | 
|
280  | 
}  | 
|
| 55499 | 281  | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55512 
diff
changeset
 | 
282  | 
def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)  | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55505 
diff
changeset
 | 
283  | 
: (List[Token], Scan.Line_Context) =  | 
| 55499 | 284  | 
  {
 | 
| 64824 | 285  | 
var in: Reader[Char] = Scan.char_reader(input)  | 
| 55499 | 286  | 
val toks = new mutable.ListBuffer[Token]  | 
287  | 
var ctxt = context  | 
|
288  | 
    while (!in.atEnd) {
 | 
|
| 
56278
 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 
wenzelm 
parents: 
55512 
diff
changeset
 | 
289  | 
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
 | 
| 60215 | 290  | 
case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest  | 
| 55499 | 291  | 
case Parsers.NoSuccess(_, rest) =>  | 
292  | 
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
 | 
|
293  | 
}  | 
|
294  | 
}  | 
|
295  | 
(toks.toList, ctxt)  | 
|
296  | 
}  | 
|
| 55497 | 297  | 
}  |