/*  hgweb annotate-view metadata (scrape residue, kept for provenance):
    file:       src/Pure/ML/ml_lex.scala
    author:     wenzelm
    date:       Sat Nov 04 15:24:40 2017 +0100 (20 months ago)
    changeset:  67003 49850a679c2c  (parent 64824 330ec9bc4b75, child 67095 91ffe1f8bf5c)
    permissions: -rw-r--r--
    log:        more robust sorted_entries;
*/
/*  Title:      Pure/ML/ml_lex.scala
    Author:     Makarius

Lexical syntax for Isabelle/ML and Standard ML.
*/

package isabelle


import scala.collection.mutable

import scala.util.parsing.input.Reader
object ML_Lex
{
  /** keywords **/

  /* full set of SML reserved words and delimiters */
  val keywords: Set[String] =
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
      "case", "datatype", "do", "else", "end", "eqtype", "exception",
      "fn", "fun", "functor", "handle", "if", "in", "include",
      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
      "orelse", "raise", "rec", "sharing", "sig", "signature",
      "struct", "structure", "then", "type", "val", "where", "while",
      "with", "withtype")

  /* secondary keyword class -- presumably for markup/highlighting; subset of keywords */
  val keywords2: Set[String] =
    Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
      "of", "sig", "struct", "then", "while", "with")

  /* tertiary keyword class -- presumably for markup/highlighting; subset of keywords */
  val keywords3: Set[String] =
    Set("handle", "open", "raise")

  /* scanner lexicon built from all keywords (longest-match literal scanning) */
  private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)



  /** tokens **/

  /* token kinds, with human-readable names used in Value(...) */
  object Kind extends Enumeration
  {
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val TYPE_VAR = Value("type variable")
    val WORD = Value("word")
    val INT = Value("integer")
    val REAL = Value("real")
    val CHAR = Value("character")
    val STRING = Value("quoted string")
    val SPACE = Value("white space")
    val COMMENT = Value("comment text")
    val CONTROL = Value("control symbol antiquotation")
    val ANTIQ = Value("antiquotation")
    val ANTIQ_START = Value("antiquotation: start")
    val ANTIQ_STOP = Value("antiquotation: stop")
    val ANTIQ_OTHER = Value("antiquotation: other")
    val ANTIQ_STRING = Value("antiquotation: quoted string")
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
    val ERROR = Value("bad input")
  }

  /* a lexed token: its kind plus the exact source text it covers */
  sealed case class Token(kind: Kind.Value, source: String)
  {
    def is_keyword: Boolean = kind == Kind.KEYWORD
    // delimiter = keyword that is not an ASCII identifier (e.g. "=>", "|")
    def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
    def is_space: Boolean = kind == Kind.SPACE
    def is_comment: Boolean = kind == Kind.COMMENT
  }



  /** parsers **/

  /* line contexts for line-oriented scanning: inside an ML string, or
     inside an antiquotation (which carries its own nested context) */
  case object ML_String extends Scan.Line_Context
  case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context

  private object Parsers extends Scan.Parsers with Antiquote.Parsers
  {
    /* string material */

    private val blanks = many(character(Symbol.is_ascii_blank))
    private val blanks1 = many1(character(Symbol.is_ascii_blank))

    // SML "gap": backslash, blanks, backslash -- ignored inside string literals
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
    // gap that runs to end of line ("""\z""" = end of input), i.e. string continues on next line
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }

    // escape sequences after "\": named escapes, control chars "^@".."^_", or 3-digit decimal
    private val escape =
      one(character("\"\\abtnvfr".contains(_))) |
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
      repeated(character(Symbol.is_ascii_digit), 3, 3)

    // one string element: printable ASCII (except quote/backslash), Isabelle symbol, or escape
    private val str =
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
      one(s => Symbol.is_symbolic(s) | Symbol.is_control(s)) |
      "\\" ~ escape ^^ { case x ~ y => x + y }


    /* ML char -- without gaps */

    private val ml_char: Parser[Token] =
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }

    // error recovery: consume "#\"" and at most one element of a malformed char literal
    private val recover_ml_char: Parser[String] =
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }


    /* ML string */

    private val ml_string_body: Parser[String] =
      rep(gap | str) ^^ (_.mkString)

    // error recovery: opening quote plus body, without requiring the closing quote
    private val recover_ml_string: Parser[String] =
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }

    private val ml_string: Parser[Token] =
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }

    // line-oriented string scanning: a string may span lines via gap_start/ML_String context
    private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)

      ctxt match {
        case Scan.Finished =>
          // fresh string: ends either with closing quote or with a gap to next line
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
        case ML_String =>
          // continuation line: leading blanks, then "\" resuming the string (if present)
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
            { case x ~ Some(y ~ z ~ w) =>
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
              case x ~ None => result(x, ML_String) }
        case _ => failure("")
      }
    }


    /* ML comment */

    private val ml_comment: Parser[Token] =
      comment ^^ (x => Token(Kind.COMMENT, x))

    private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT, x), c) }


    /* delimited token */

    private def delimited_token: Parser[Token] =
      ml_char | (ml_string | ml_comment)

    // unterminated delimited material becomes a single ERROR token
    private val recover_delimited: Parser[Token] =
      (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^
        (x => Token(Kind.ERROR, x))


    private def other_token: Parser[Token] =
    {
      /* identifiers */

      val letdigs = many(character(Symbol.is_ascii_letdig))

      val alphanumeric =
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }

      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))

      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))

      // qualified name: one or more "Struct." prefixes, then final id (or "=")
      val long_ident =
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
          (alphanumeric | (symbolic | "=")) ^^
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }

      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }


      /* numerals */

      val dec = many1(character(Symbol.is_ascii_digit))
      val hex = many1(character(Symbol.is_ascii_hex))
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }  // SML negation sign "~"
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }

      // SML words: 0wx<hex> or 0w<dec>
      val word =
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
          (x => Token(Kind.WORD, x))

      val int =
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
          { case x ~ y => Token(Kind.INT, x + y) }

      // rational literal used in "@"-antiquotations: decint with optional "/dec"
      val rat =
        decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }

      val real =
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
          { case x ~ y ~ z ~ w => x + y + z + w } |
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))


      /* main */

      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))

      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))

      val ml_control = control ^^ (x => Token(Kind.CONTROL, x))
      val ml_antiq =
        "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
        antiq ^^ (x => Token(Kind.ANTIQ, x))

      // any single remaining character is flagged as bad input
      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))

      // NOTE: "|||" takes the longest match between name-like tokens and keywords;
      // "bad" is the final catch-all, so other_token never fails on nonempty input
      space | (ml_control | (recover_delimited | (ml_antiq |
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))))
    }


    /* antiquotations (line-oriented) */

    def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
        case _ => failure("")
      }

    def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        // closing "}" only legal when no quoted/cartouche material is open inside the antiq
        case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))
        case _ => failure("")
      }

    def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      context match {
        case Antiq(ctxt) =>
          (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))
           else failure("")) |
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
        case _ => failure("")
      }


    /* token */

    def token: Parser[Token] = delimited_token | other_token

    // one token on a single line, threading the line context; SML = true restricts
    // to Standard ML (no Isabelle antiquotations)
    def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))

      if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)
      else
        ml_string_line(ctxt) |
          (ml_comment_line(ctxt) |
            (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))
    }
  }


  /* tokenize */

  // tokenize complete input; total: raises error() only if parsing fails unexpectedly
  def tokenize(input: CharSequence): List[Token] =
    Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
      case Parsers.Success(tokens, _) => tokens
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
    }

  // tokenize a single line, starting from (and returning) a Scan.Line_Context so
  // multi-line strings/comments/antiquotations can be resumed on the next line
  def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
    : (List[Token], Scan.Line_Context) =
  {
    var in: Reader[Char] = Scan.char_reader(input)
    val toks = new mutable.ListBuffer[Token]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
        case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
    (toks.toList, ctxt)
  }
}