| author | wenzelm | 
| Thu, 11 Jul 2024 22:39:04 +0200 | |
| changeset 80557 | 08034bb75ee8 | 
| parent 75420 | 73a2f3fe0e8c | 
| permissions | -rw-r--r-- | 
| 55497 | 1 | /* Title: Pure/ML/ml_lex.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 59109 | 4 | Lexical syntax for Isabelle/ML and Standard ML. | 
| 55497 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 55499 | 9 | |
| 10 | import scala.collection.mutable | |
| 64824 | 11 | import scala.util.parsing.input.Reader | 
| 55497 | 12 | |
| 13 | ||
| 75393 | 14 | object ML_Lex {
 | 
| 55505 | 15 | /** keywords **/ | 
| 16 | ||
| 17 | val keywords: Set[String] = | |
| 18 |     Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
 | |
| 19 |       "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
 | |
| 20 | "case", "datatype", "do", "else", "end", "eqtype", "exception", | |
| 21 | "fn", "fun", "functor", "handle", "if", "in", "include", | |
| 22 | "infix", "infixr", "let", "local", "nonfix", "of", "op", "open", | |
| 23 | "orelse", "raise", "rec", "sharing", "sig", "signature", | |
| 24 | "struct", "structure", "then", "type", "val", "where", "while", | |
| 25 | "with", "withtype") | |
| 26 | ||
| 27 | val keywords2: Set[String] = | |
| 58933 | 28 |     Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
 | 
| 29 | "of", "sig", "struct", "then", "while", "with") | |
| 55505 | 30 | |
| 31 | val keywords3: Set[String] = | |
| 32 |     Set("handle", "open", "raise")
 | |
| 33 | ||
| 34 | private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*) | |
| 35 | ||
| 36 | ||
| 37 | ||
| 55497 | 38 | /** tokens **/ | 
| 39 | ||
| 75393 | 40 |   object Kind extends Enumeration {
 | 
| 55497 | 41 |     val KEYWORD = Value("keyword")
 | 
| 42 |     val IDENT = Value("identifier")
 | |
| 43 |     val LONG_IDENT = Value("long identifier")
 | |
| 44 |     val TYPE_VAR = Value("type variable")
 | |
| 45 |     val WORD = Value("word")
 | |
| 46 |     val INT = Value("integer")
 | |
| 47 |     val REAL = Value("real")
 | |
| 48 |     val CHAR = Value("character")
 | |
| 49 |     val STRING = Value("quoted string")
 | |
| 50 |     val SPACE = Value("white space")
 | |
| 67438 | 51 |     val INFORMAL_COMMENT = Value("informal comment")
 | 
| 52 |     val FORMAL_COMMENT = Value("formal comment")
 | |
| 61471 | 53 |     val CONTROL = Value("control symbol antiquotation")
 | 
| 55512 | 54 |     val ANTIQ = Value("antiquotation")
 | 
| 55 |     val ANTIQ_START = Value("antiquotation: start")
 | |
| 56 |     val ANTIQ_STOP = Value("antiquotation: stop")
 | |
| 57 |     val ANTIQ_OTHER = Value("antiquotation: other")
 | |
| 58 |     val ANTIQ_STRING = Value("antiquotation: quoted string")
 | |
| 59 |     val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
 | |
| 60 |     val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
 | |
| 55497 | 61 |     val ERROR = Value("bad input")
 | 
| 62 | } | |
| 63 | ||
| 75393 | 64 |   sealed case class Token(kind: Kind.Value, source: String) {
 | 
| 55501 | 65 | def is_keyword: Boolean = kind == Kind.KEYWORD | 
| 55505 | 66 | def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source) | 
| 63610 | 67 | def is_space: Boolean = kind == Kind.SPACE | 
| 67438 | 68 | def is_comment: Boolean = kind == Kind.INFORMAL_COMMENT || kind == Kind.FORMAL_COMMENT | 
| 55500 | 69 | } | 
| 55497 | 70 | |
| 71 | ||
| 72 | ||
| 73 | /** parsers **/ | |
| 74 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 75 | case object ML_String extends Scan.Line_Context | 
| 55512 | 76 | case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context | 
| 55499 | 77 | |
| 75393 | 78 |   private object Parsers extends Scan.Parsers with Antiquote.Parsers with Comment.Parsers {
 | 
| 55497 | 79 | /* string material */ | 
| 80 | ||
| 55500 | 81 | private val blanks = many(character(Symbol.is_ascii_blank)) | 
| 55497 | 82 | private val blanks1 = many1(character(Symbol.is_ascii_blank)) | 
| 83 | ||
| 55499 | 84 |     private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
 | 
| 55500 | 85 |     private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }
 | 
| 55499 | 86 | |
| 55497 | 87 | private val escape = | 
| 88 |       one(character("\"\\abtnvfr".contains(_))) |
 | |
| 89 |       "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
 | |
| 90 | repeated(character(Symbol.is_ascii_digit), 3, 3) | |
| 91 | ||
| 92 | private val str = | |
| 93 | one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) | | |
| 67509 | 94 | one(s => | 
| 95 | Symbol.is_open(s) || Symbol.is_close(s) || Symbol.is_symbolic(s) || Symbol.is_control(s)) | | |
| 55497 | 96 |       "\\" ~ escape ^^ { case x ~ y => x + y }
 | 
| 97 | ||
| 55499 | 98 | |
| 99 | /* ML char -- without gaps */ | |
| 100 | ||
| 101 | private val ml_char: Parser[Token] = | |
| 102 |       "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }
 | |
| 103 | ||
| 104 | private val recover_ml_char: Parser[String] = | |
| 105 |       "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }
 | |
| 106 | ||
| 107 | ||
| 108 | /* ML string */ | |
| 109 | ||
| 110 | private val ml_string_body: Parser[String] = | |
| 111 | rep(gap | str) ^^ (_.mkString) | |
| 112 | ||
| 113 | private val recover_ml_string: Parser[String] = | |
| 114 |       "\"" ~ ml_string_body ^^ { case x ~ y => x + y }
 | |
| 115 | ||
| 116 | private val ml_string: Parser[Token] = | |
| 117 |       "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
 | |
| 118 | ||
| 75393 | 119 |     private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = {
 | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 120 | def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c) | 
| 55499 | 121 | |
| 122 |       ctxt match {
 | |
| 123 | case Scan.Finished => | |
| 124 |           "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
 | |
| 125 |             { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
 | |
| 126 | case ML_String => | |
| 55500 | 127 |           blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
 | 
| 55499 | 128 |             { case x ~ Some(y ~ z ~ w) =>
 | 
| 129 | result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String) | |
| 130 | case x ~ None => result(x, ML_String) } | |
| 131 |         case _ => failure("")
 | |
| 132 | } | |
| 133 | } | |
| 134 | ||
| 135 | ||
| 136 | /* ML comment */ | |
| 137 | ||
| 138 | private val ml_comment: Parser[Token] = | |
| 67438 | 139 | comment ^^ (x => Token(Kind.INFORMAL_COMMENT, x)) | 
| 55499 | 140 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 141 | private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = | 
| 67438 | 142 |       comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.INFORMAL_COMMENT, x), c) }
 | 
| 55497 | 143 | |
| 67438 | 144 | private val ml_formal_comment: Parser[Token] = | 
| 145 | comment_cartouche ^^ (x => Token(Kind.FORMAL_COMMENT, x)) | |
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 146 | |
| 67438 | 147 | private def ml_formal_comment_line(ctxt: Scan.Line_Context) | 
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 148 | : Parser[(Token, Scan.Line_Context)] = | 
| 67438 | 149 |       comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.FORMAL_COMMENT, x), c) }
 | 
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 150 | |
| 55497 | 151 | |
| 67366 | 152 | /* antiquotations (line-oriented) */ | 
| 153 | ||
| 154 | def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = | |
| 155 |       cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), c) }
 | |
| 156 | ||
| 157 | def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = | |
| 158 |       ctxt match {
 | |
| 159 |         case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
 | |
| 160 |         case _ => failure("")
 | |
| 161 | } | |
| 162 | ||
| 163 | def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = | |
| 164 |       ctxt match {
 | |
| 165 | case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished)) | |
| 166 |         case _ => failure("")
 | |
| 167 | } | |
| 168 | ||
| 169 | def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = | |
| 170 |       context match {
 | |
| 171 | case Antiq(ctxt) => | |
| 172 | (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context)) | |
| 173 |            else failure("")) |
 | |
| 174 |           quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
 | |
| 175 |           quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
 | |
| 176 |           cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
 | |
| 177 |         case _ => failure("")
 | |
| 178 | } | |
| 179 | ||
| 180 | ||
| 181 | /* token */ | |
| 182 | ||
| 75393 | 183 |     private def other_token: Parser[Token] = {
 | 
| 55497 | 184 | /* identifiers */ | 
| 185 | ||
| 186 | val letdigs = many(character(Symbol.is_ascii_letdig)) | |
| 187 | ||
| 188 | val alphanumeric = | |
| 189 |         one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
 | |
| 190 | ||
| 191 |       val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
 | |
| 192 | ||
| 193 | val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x)) | |
| 194 | ||
| 195 | val long_ident = | |
| 196 |         rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
 | |
| 197 | (alphanumeric | (symbolic | "=")) ^^ | |
| 198 |           { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
 | |
| 199 | ||
| 200 |       val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
 | |
| 201 | ||
| 202 | ||
| 203 | /* numerals */ | |
| 204 | ||
| 205 | val dec = many1(character(Symbol.is_ascii_digit)) | |
| 206 | val hex = many1(character(Symbol.is_ascii_hex)) | |
| 207 |       val sign = opt("~") ^^ { case Some(x) => x case None => "" }
 | |
| 208 |       val decint = sign ~ dec ^^ { case x ~ y => x + y }
 | |
| 209 |       val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
 | |
| 210 | ||
| 211 | val word = | |
| 212 |         ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
 | |
| 213 | (x => Token(Kind.WORD, x)) | |
| 214 | ||
| 215 | val int = | |
| 216 |         sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
 | |
| 217 |           { case x ~ y => Token(Kind.INT, x + y) }
 | |
| 218 | ||
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 219 | val rat = | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 220 |         decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 221 | |
| 55497 | 222 | val real = | 
| 223 |         (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
 | |
| 224 |           { case x ~ y ~ z ~ w => x + y + z + w } |
 | |
| 225 |          decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
 | |
| 226 | ||
| 227 | ||
| 55499 | 228 | /* main */ | 
| 55497 | 229 | |
| 230 | val space = blanks1 ^^ (x => Token(Kind.SPACE, x)) | |
| 231 | ||
| 232 | val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x)) | |
| 233 | ||
| 61471 | 234 | val ml_control = control ^^ (x => Token(Kind.CONTROL, x)) | 
| 63204 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 235 | val ml_antiq = | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 236 |         "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
 | 
| 
921a5be54132
support rat numerals via special antiquotation syntax;
 wenzelm parents: 
61596diff
changeset | 237 | antiq ^^ (x => Token(Kind.ANTIQ, x)) | 
| 55512 | 238 | |
| 55497 | 239 | val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x)) | 
| 240 | ||
| 67364 | 241 | val recover = | 
| 242 | (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^ | |
| 243 | (x => Token(Kind.ERROR, x)) | |
| 244 | ||
| 245 | space | (ml_control | (recover | (ml_antiq | | |
| 75384 
20093a63d03b
updated to scala-parser-combinators 2.1.0, which also fits to scala-3.0.2;
 wenzelm parents: 
67509diff
changeset | 246 | ((keyword ||| (word | (real | (int | (long_ident | (ident | type_var)))))) | bad)))) | 
| 55497 | 247 | } | 
| 248 | ||
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 249 | def token: Parser[Token] = | 
| 67438 | 250 | ml_char | (ml_string | (ml_comment | (ml_formal_comment | other_token))) | 
| 55499 | 251 | |
| 75393 | 252 |     def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] = {
 | 
| 55499 | 253 | val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished)) | 
| 254 | ||
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55512diff
changeset | 255 | if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other) | 
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 256 |       else {
 | 
| 56278 
2576d3a40ed6
separate tokenization and language context for SML: no symbols, no antiquotes;
 wenzelm parents: 
55512diff
changeset | 257 | ml_string_line(ctxt) | | 
| 61596 | 258 | (ml_comment_line(ctxt) | | 
| 67438 | 259 | (ml_formal_comment_line(ctxt) | | 
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 260 | (ml_cartouche_line(ctxt) | | 
| 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 261 | (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other)))))) | 
| 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 262 | } | 
| 55499 | 263 | } | 
| 55497 | 264 | } | 
| 265 | ||
| 55499 | 266 | |
| 267 | /* tokenize */ | |
| 268 | ||
| 55497 | 269 | def tokenize(input: CharSequence): List[Token] = | 
| 64824 | 270 |     Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
 | 
| 55497 | 271 | case Parsers.Success(tokens, _) => tokens | 
| 272 |       case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
 | |
| 273 | } | |
| 55499 | 274 | |
| 75393 | 275 | def tokenize_line( | 
| 276 | SML: Boolean, | |
| 277 | input: CharSequence, | |
| 278 | context: Scan.Line_Context | |
| 279 |   ) : (List[Token], Scan.Line_Context) = {
 | |
| 64824 | 280 | var in: Reader[Char] = Scan.char_reader(input) | 
| 55499 | 281 | val toks = new mutable.ListBuffer[Token] | 
| 282 | var ctxt = context | |
| 283 |     while (!in.atEnd) {
 | |
| 75420 | 284 | val result = Parsers.parse(Parsers.token_line(SML, ctxt), in) | 
| 285 |       (result : @unchecked) match {
 | |
| 60215 | 286 | case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest | 
| 55499 | 287 | case Parsers.NoSuccess(_, rest) => | 
| 288 |           error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
 | |
| 289 | } | |
| 290 | } | |
| 291 | (toks.toList, ctxt) | |
| 292 | } | |
| 55497 | 293 | } |