lexical syntax for SML (in Scala); tuned;

author     wenzelm
date       Fri, 14 Feb 2014 21:06:20 +0100
changeset  55497 c0f8aebfb43d
parent     55496 5d2835453ad3
child      55498 cf829d10d1d4
src/Pure/General/scan.scala
src/Pure/General/symbol.scala
src/Pure/Isar/completion.scala
src/Pure/Isar/token.scala
src/Pure/ML/ml_lex.scala
src/Pure/build-jars
--- a/src/Pure/General/scan.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/General/scan.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -43,9 +43,9 @@
       p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
 
 
-    /* symbol range */
+    /* repeated symbols */
 
-    def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
+    def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
       new Parser[String]
       {
         def apply(in: Input) =
@@ -66,16 +66,22 @@
           if (count < min_count) Failure("bad input", in)
           else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
         }
-      }.named("symbol_range")
+      }.named("repeated")
 
     def one(pred: Symbol.Symbol => Boolean): Parser[String] =
-      symbol_range(pred, 1, 1)
+      repeated(pred, 1, 1)
 
     def many(pred: Symbol.Symbol => Boolean): Parser[String] =
-      symbol_range(pred, 0, Integer.MAX_VALUE)
+      repeated(pred, 0, Integer.MAX_VALUE)
 
     def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
-      symbol_range(pred, 1, Integer.MAX_VALUE)
+      repeated(pred, 1, Integer.MAX_VALUE)
+
+
+    /* character */
+
+    def character(pred: Char => Boolean): Symbol.Symbol => Boolean =
+      (s: Symbol.Symbol) => s.length == 1 && pred(s.charAt(0))
 
 
     /* quoted strings */
@@ -279,7 +285,7 @@
 
     /* keyword */
 
-    def keyword(lexicon: Lexicon): Parser[String] = new Parser[String]
+    def literal(lexicon: Lexicon): Parser[String] = new Parser[String]
     {
       def apply(in: Input) =
       {
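
A minimal sketch of how the renamed combinators compose. This is
hypothetical usage, not part of the changeset; it assumes the
Scan.Parsers object above and the Symbol.is_ascii_hex predicate added
below in symbol.scala:

    // recognize between 1 and 8 ASCII hex digits as a single string
    val hex: Scan.Parsers.Parser[String] =
      Scan.Parsers.repeated(Scan.Parsers.character(Symbol.is_ascii_hex), 1, 8)

    // literal(lexicon) is the former keyword(lexicon): it matches one
    // of the fixed tokens enumerated in the lexicon
    val lex = Scan.Lexicon("->", "=>", "=")
    Scan.Parsers.parse(Scan.Parsers.literal(lex), "=>")  // Success("=>", _)
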
--- a/src/Pure/General/symbol.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/General/symbol.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -19,9 +19,16 @@
   /* ASCII characters */
 
   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
+
   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
+
+  def is_ascii_hex(c: Char): Boolean =
+    '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
+
   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
 
+  def is_ascii_blank(c: Char): Boolean = " \t\n\u000b\f\r".contains(c)
+
   def is_ascii_letdig(c: Char): Boolean =
     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
 
--- a/src/Pure/Isar/completion.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/Isar/completion.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -185,7 +185,7 @@
     line: CharSequence): Option[Completion.Result] =
   {
     val raw_result =
-      Scan.Parsers.parse(Scan.Parsers.keyword(abbrevs_lex), new Library.Reverse(line)) match {
+      Scan.Parsers.parse(Scan.Parsers.literal(abbrevs_lex), new Library.Reverse(line)) match {
         case Scan.Parsers.Success(reverse_a, _) =>
           val abbrevs = abbrevs_map.get_list(reverse_a)
           abbrevs match {
--- a/src/Pure/Isar/token.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/Isar/token.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -81,7 +81,7 @@
         (x => Token(Token.Kind.SYM_IDENT, x))
 
       val command_keyword =
-        keyword(lexicon) ^^
+        literal(lexicon) ^^
           (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))
 
       val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/ML/ml_lex.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -0,0 +1,162 @@
+/*  Title:      Pure/ML/ml_lex.scala
+    Author:     Makarius
+
+Lexical syntax for SML.
+*/
+
+package isabelle
+
+import scala.util.parsing.input.{Reader, CharSequenceReader}
+
+
+object ML_Lex
+{
+  /** tokens **/
+
+  object Kind extends Enumeration
+  {
+    val KEYWORD = Value("keyword")
+    val IDENT = Value("identifier")
+    val LONG_IDENT = Value("long identifier")
+    val TYPE_VAR = Value("type variable")
+    val WORD = Value("word")
+    val INT = Value("integer")
+    val REAL = Value("real")
+    val CHAR = Value("character")
+    val STRING = Value("quoted string")
+    val SPACE = Value("white space")
+    val COMMENT = Value("comment text")
+    val ERROR = Value("bad input")
+  }
+
+  sealed case class Token(val kind: Kind.Value, val source: String)
+
+
+
+  /** parsers **/
+
+  private val lexicon =
+    Scan.Lexicon("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
+      "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
+      "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
+      "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
+      "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
+      "sharing", "sig", "signature", "struct", "structure", "then", "type",
+      "val", "where", "while", "with", "withtype")
+
+  private object Parsers extends Scan.Parsers
+  {
+    /* string material */
+
+    private val blanks1 = many1(character(Symbol.is_ascii_blank))
+
+    private val escape =
+      one(character("\"\\abtnvfr".contains(_))) |
+      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
+      repeated(character(Symbol.is_ascii_digit), 3, 3)
+
+    private val str =
+      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
+      "\\" ~ escape ^^ { case x ~ y => x + y }
+
+    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
+    private val gaps = rep(gap) ^^ (_.mkString)
+
+
+    /* delimited token */
+
+    private def delimited_token: Parser[Token] =
+    {
+      val char =
+        "#\"" ~ gaps ~ str ~ gaps ~ "\"" ^^
+        { case u ~ v ~ x ~ y ~ z => Token(Kind.CHAR, u + v + x + y + z) }
+
+      val string =
+        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ~ "\"" ^^
+        { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
+
+      val cmt = comment ^^ (x => Token(Kind.COMMENT, x))
+
+      char | (string | cmt)
+    }
+
+    private def other_token: Parser[Token] =
+    {
+      /* identifiers */
+
+      val letdigs = many(character(Symbol.is_ascii_letdig))
+
+      val alphanumeric =
+        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
+
+      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
+
+      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))
+
+      val long_ident =
+        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
+          (alphanumeric | (symbolic | "=")) ^^
+          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
+
+      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
+
+
+      /* numerals */
+
+      val dec = many1(character(Symbol.is_ascii_digit))
+      val hex = many1(character(Symbol.is_ascii_hex))
+      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
+      val decint = sign ~ dec ^^ { case x ~ y => x + y }
+      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
+
+      val word =
+        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
+          (x => Token(Kind.WORD, x))
+
+      val int =
+        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
+          { case x ~ y => Token(Kind.INT, x + y) }
+
+      val real =
+        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
+          { case x ~ y ~ z ~ w => x + y + z + w } |
+         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
+
+
+      /* recover delimited */
+
+      val recover_char =
+        "#\"" ~ gaps ~ (opt(str ~ gaps) ^^ { case Some(x ~ y) => x + y case None => "" }) ^^
+          { case x ~ y ~ z => x + y + z }
+
+      val recover_string =
+        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ^^ { case x ~ y => x + y }
+
+      val recover_delimited =
+        (recover_char | (recover_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))
+
+
+      /* token */
+
+      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))
+
+      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))
+
+      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))
+
+      space | (recover_delimited |
+        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))
+    }
+
+    def token: Parser[Token] = delimited_token | other_token
+  }
+
+  def tokenize(input: CharSequence): List[Token] =
+  {
+    Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
+      case Parsers.Success(tokens, _) => tokens
+      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
+    }
+  }
+}
+
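
A hypothetical use of the new lexer, with the expected classification
spelled out in comments:

    ML_Lex.tokenize("val x = 0wx1A (*note*)")
    // KEYWORD "val", SPACE " ", IDENT "x", SPACE " ", KEYWORD "=",
    // SPACE " ", WORD "0wx1A", SPACE " ", COMMENT "(*note*)"

    ML_Lex.tokenize("#\"a\"")  // CHAR "#\"a\""

Note that an alphanumeric reserved word such as "val" is matched by both
the identifier parser and the lexicon; the ||| combinator from the Scala
parser combinators keeps the longer result and, on a tie, the result of
its second argument, so the KEYWORD classification wins.
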
--- a/src/Pure/build-jars	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/build-jars	Fri Feb 14 21:06:20 2014 +0100
@@ -43,6 +43,7 @@
   Isar/outer_syntax.scala
   Isar/parse.scala
   Isar/token.scala
+  ML/ml_lex.scala
   PIDE/command.scala
   PIDE/document.scala
   PIDE/document_id.scala