partial scans via ML_Lex.tokenize_context;
author wenzelm
Sat, 15 Feb 2014 14:52:51 +0100
changeset 55499 2581fbee5b95
parent 55498 cf829d10d1d4
child 55500 cdbbaa3074a8
partial scans via ML_Lex.tokenize_context; simplified ML_char: no gaps (hardly relevant in practice); tuned;
src/Pure/General/scan.scala
src/Pure/ML/ml_lex.scala
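The new ML_Lex.tokenize_context entry point scans a text in portions, carrying
the scanner context across chunk boundaries (in practice: line boundaries).
A minimal usage sketch, assuming line-wise chunks that retain their terminating
newline; this helper is illustrative and not part of the changeset:

    import scala.collection.mutable

    // Illustrative only: tokenize chunk by chunk, threading the context so
    // that ML strings with gaps and ML comments may span several chunks.
    def tokenize_lines(lines: List[CharSequence]): List[ML_Lex.Token] =
    {
      var ctxt: Scan.Context = Scan.Finished
      val result = new mutable.ListBuffer[ML_Lex.Token]
      for (line <- lines) {
        val (toks, ctxt1) = ML_Lex.tokenize_context(line, ctxt)
        result ++= toks
        ctxt = ctxt1
      }
      result.toList
    }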
--- a/src/Pure/General/scan.scala	Fri Feb 14 22:03:48 2014 +0100
+++ b/src/Pure/General/scan.scala	Sat Feb 15 14:52:51 2014 +0100
@@ -17,9 +17,9 @@
 
 object Scan
 {
-  /** context of partial scans **/
+  /** context of partial scans (line boundary) **/
 
-  sealed abstract class Context
+  abstract class Context
   case object Finished extends Context
   case class Quoted(quote: String) extends Context
   case object Verbatim extends Context
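Dropping "sealed" here opens the Context hierarchy, so client modules may
introduce their own scanner states; the updated comment records that a Context
value describes the scanner state at a line boundary.  The ml_lex.scala part
below uses exactly this extension point:

    // now possible outside of Scan:
    case object ML_String extends Scan.Context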
--- a/src/Pure/ML/ml_lex.scala	Fri Feb 14 22:03:48 2014 +0100
+++ b/src/Pure/ML/ml_lex.scala	Sat Feb 15 14:52:51 2014 +0100
@@ -6,6 +6,8 @@
 
 package isabelle
 
+
+import scala.collection.mutable
 import scala.util.parsing.input.{Reader, CharSequenceReader}
 
 
@@ -35,6 +37,8 @@
 
   /** parsers **/
 
+  case object ML_String extends Scan.Context
+
   private val lexicon =
     Scan.Lexicon("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
       "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
@@ -50,6 +54,9 @@
 
     private val blanks1 = many1(character(Symbol.is_ascii_blank))
 
+    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
+    private val gap_start = "\\" ~ blanks1 ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }
+
     private val escape =
       one(character("\"\\abtnvfr".contains(_))) |
       "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
@@ -59,26 +66,62 @@
       one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
       "\\" ~ escape ^^ { case x ~ y => x + y }
 
-    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
-    private val gaps = rep(gap) ^^ (_.mkString)
+
+    /* ML char -- without gaps */
+
+    private val ml_char: Parser[Token] =
+      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }
+
+    private val recover_ml_char: Parser[String] =
+      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }
+
+
+    /* ML string */
+
+    private val ml_string_body: Parser[String] =
+      rep(gap | str) ^^ (_.mkString)
+
+    private val recover_ml_string: Parser[String] =
+      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }
+
+    private val ml_string: Parser[Token] =
+      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
+
+    private def ml_string_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
+    {
+      def result(x: String, c: Scan.Context) = (Token(Kind.STRING, x), c)
+
+      ctxt match {
+        case Scan.Finished =>
+          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
+            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
+        case ML_String =>
+          blanks1 ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
+            { case x ~ Some(y ~ z ~ w) =>
+                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
+              case x ~ None => result(x, ML_String) }
+        case _ => failure("")
+      }
+    }
+
+
+    /* ML comment */
+
+    private val ml_comment: Parser[Token] =
+      comment ^^ (x => Token(Kind.COMMENT, x))
+
+    private def ml_comment_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
+      comment_context(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT, x), c) }
 
 
     /* delimited token */
 
     private def delimited_token: Parser[Token] =
-    {
-      val char =
-        "#\"" ~ gaps ~ str ~ gaps ~ "\"" ^^
-        { case u ~ v ~ x ~ y ~ z => Token(Kind.CHAR, u + v + x + y + z) }
+      ml_char | (ml_string | ml_comment)
 
-      val string =
-        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ~ "\"" ^^
-        { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
+    private val recover_delimited: Parser[Token] =
+      (recover_ml_char | (recover_ml_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))
 
-      val cmt = comment ^^ (x => Token(Kind.COMMENT, x))
-
-      char | (string | cmt)
-    }
 
     private def other_token: Parser[Token] =
     {
@@ -123,20 +166,7 @@
          decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
 
 
-      /* recover delimited */
-
-      val recover_char =
-        "#\"" ~ gaps ~ (opt(str ~ gaps) ^^ { case Some(x ~ y) => x + y case None => "" }) ^^
-          { case x ~ y ~ z => x + y + z }
-
-      val recover_string =
-        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ^^ { case x ~ y => x + y }
-
-      val recover_delimited =
-        (recover_char | (recover_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))
-
-
-      /* token */
+      /* main */
 
       val space = blanks1 ^^ (x => Token(Kind.SPACE, x))
 
@@ -148,9 +178,22 @@
         (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))
     }
 
+
+    /* token */
+
     def token: Parser[Token] = delimited_token | other_token
+
+    def token_context(ctxt: Scan.Context): Parser[(Token, Scan.Context)] =
+    {
+      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))
+
+      ml_string_context(ctxt) | (ml_comment_context(ctxt) | other)
+    }
   }
 
+
+  /* tokenize */
+
   def tokenize(input: CharSequence): List[Token] =
   {
     Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
@@ -158,5 +201,20 @@
       case Parsers.Success(tokens, _) => tokens
       case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
     }
   }
+
+  def tokenize_context(input: CharSequence, context: Scan.Context): (List[Token], Scan.Context) =
+  {
+    var in: Reader[Char] = new CharSequenceReader(input)
+    val toks = new mutable.ListBuffer[Token]
+    var ctxt = context
+    while (!in.atEnd) {
+      Parsers.parse(Parsers.token_context(ctxt), in) match {
+        case Parsers.Success((x, c), rest) => { toks += x; ctxt = c; in = rest }
+        case Parsers.NoSuccess(_, rest) =>
+          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
+      }
+    }
+    (toks.toList, ctxt)
+  }
 }
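
To illustrate the context threading (behaviour inferred from the parsers above,
not asserted by the diff itself): a string gap that is still open at the end of
one chunk leaves the scanner in the ML_String context, and the next call
resumes inside the string.

    // chunk 1: a gap is opened by "\" and trailing blanks at the chunk end
    val (toks1, ctxt1) =
      ML_Lex.tokenize_context("val s = \"abc\\ \n", Scan.Finished)
    // toks1 ends with a partial STRING token; ctxt1 == ML_Lex.ML_String

    // chunk 2: leading blanks continue the gap, "\" closes it, "\"" ends the string
    val (toks2, ctxt2) = ML_Lex.tokenize_context(" \\def\"\n", ctxt1)
    // toks2 starts with the STRING continuation; ctxt2 == Scan.Finished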