src/Pure/ML/ml_lex.scala
author wenzelm
Fri, 14 Feb 2014 21:06:20 +0100
changeset 55497 c0f8aebfb43d
child 55499 2581fbee5b95
permissions -rw-r--r--
lexical syntax for SML (in Scala); tuned;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/ML/ml_lex.scala
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     3
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     4
Lexical syntax for SML.
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     5
*/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     6
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     7
package isabelle
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     8
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     9
import scala.util.parsing.input.{Reader, CharSequenceReader}
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    10
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    11
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    12
object ML_Lex
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    13
{
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    14
  /** tokens **/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    15
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    16
  object Kind extends Enumeration
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    17
  {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    18
    val KEYWORD = Value("keyword")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    19
    val IDENT = Value("identifier")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    20
    val LONG_IDENT = Value("long identifier")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    21
    val TYPE_VAR = Value("type variable")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    22
    val WORD = Value("word")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    23
    val INT = Value("integer")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    24
    val REAL = Value("real")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    25
    val CHAR = Value("character")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    26
    val STRING = Value("quoted string")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    27
    val SPACE = Value("white space")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    28
    val COMMENT = Value("comment text")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    29
    val ERROR = Value("bad input")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    30
  }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    31
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    32
  sealed case class Token(val kind: Kind.Value, val source: String)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    33
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    34
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    35
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    36
  /** parsers **/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    37
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    38
  private val lexicon =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    39
    Scan.Lexicon("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    40
      "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    41
      "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    42
      "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    43
      "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    44
      "sharing", "sig", "signature", "struct", "structure", "then", "type",
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    45
      "val", "where", "while", "with", "withtype")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    46
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    47
  private object Parsers extends Scan.Parsers
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    48
  {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    49
    /* string material */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    50
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    51
    private val blanks1 = many1(character(Symbol.is_ascii_blank))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    52
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    53
    private val escape =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    54
      one(character("\"\\abtnvfr".contains(_))) |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    55
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    56
      repeated(character(Symbol.is_ascii_digit), 3, 3)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    57
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    58
    private val str =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    59
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    60
      "\\" ~ escape ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    61
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    62
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    63
    private val gaps = rep(gap) ^^ (_.mkString)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    64
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    65
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    66
    /* delimited token */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    67
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    68
    private def delimited_token: Parser[Token] =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    69
    {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    70
      val char =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    71
        "#\"" ~ gaps ~ str ~ gaps ~ "\"" ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    72
        { case u ~ v ~ x ~ y ~ z => Token(Kind.CHAR, u + v + x + y + z) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    73
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    74
      val string =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    75
        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ~ "\"" ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    76
        { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    77
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    78
      val cmt = comment ^^ (x => Token(Kind.COMMENT, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    79
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    80
      char | (string | cmt)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    81
    }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    82
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    83
    private def other_token: Parser[Token] =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    84
    {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    85
      /* identifiers */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    86
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    87
      val letdigs = many(character(Symbol.is_ascii_letdig))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    88
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    89
      val alphanumeric =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    90
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    91
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    92
      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    93
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    94
      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    95
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    96
      val long_ident =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    97
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    98
          (alphanumeric | (symbolic | "=")) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    99
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   100
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   101
      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   102
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   103
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   104
      /* numerals */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   105
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   106
      val dec = many1(character(Symbol.is_ascii_digit))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   107
      val hex = many1(character(Symbol.is_ascii_hex))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   108
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   109
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   110
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   111
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   112
      val word =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   113
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   114
          (x => Token(Kind.WORD, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   115
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   116
      val int =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   117
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   118
          { case x ~ y => Token(Kind.INT, x + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   119
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   120
      val real =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   121
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   122
          { case x ~ y ~ z ~ w => x + y + z + w } |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   123
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   124
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   125
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   126
      /* recover delimited */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   127
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   128
      val recover_char =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   129
        "#\"" ~ gaps ~ (opt(str ~ gaps) ^^ { case Some(x ~ y) => x + y case None => "" }) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   130
          { case x ~ y ~ z => x + y + z }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   131
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   132
      val recover_string =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   133
        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   134
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   135
      val recover_delimited =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   136
        (recover_char | (recover_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   137
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   138
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   139
      /* token */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   140
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   141
      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   142
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   143
      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   144
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   145
      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   146
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   147
      space | (recover_delimited |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   148
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   149
    }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   150
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   151
    def token: Parser[Token] = delimited_token | other_token
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   152
  }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   153
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   154
  def tokenize(input: CharSequence): List[Token] =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   155
  {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   156
    Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   157
      case Parsers.Success(tokens, _) => tokens
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   158
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   159
    }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   160
  }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   161
}
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   162