more explicit item kind;
author wenzelm
date Fri, 03 Oct 2014 23:33:47 +0200
changeset 58528 7d6b8f8893e8
parent 58527 4b190c763097
child 58529 cd4439d8799c
more explicit item kind; clarified recover_delimited: skip to next @; support for line context parsing; tuned signature;
src/Pure/Tools/bibtex.scala
--- a/src/Pure/Tools/bibtex.scala	Fri Oct 03 20:19:42 2014 +0200
+++ b/src/Pure/Tools/bibtex.scala	Fri Oct 03 23:33:47 2014 +0200
@@ -7,6 +7,7 @@
 package isabelle
 
 
+import scala.collection.mutable
 import scala.util.parsing.input.{Reader, CharSequenceReader}
 import scala.util.parsing.combinator.RegexParsers
 
@@ -123,34 +124,41 @@
     def is_error: Boolean = kind == Token.Kind.ERROR
   }
 
-  abstract class Chunk { def size: Int; def kind: String = "" }
-  case class Ignored(source: String) extends Chunk { def size: Int = source.size }
-  case class Malformed(source: String) extends Chunk { def size: Int = source.size }
-  case class Item(tokens: List[Token]) extends Chunk
+  abstract class Chunk
+  {
+    def size: Int
+    def kind: String
+  }
+
+  case class Ignored(source: String) extends Chunk
+  {
+    def size: Int = source.size
+    def kind: String = ""
+  }
+
+  case class Item(kind: String, tokens: List[Token]) extends Chunk
   {
     def size: Int = (0 /: tokens)({ case (n, token) => n + token.source.size })
 
-    private val content: (String, List[Token]) =
+    private val wellformed_content: Option[List[Token]] =
       tokens match {
         case Token(Token.Kind.KEYWORD, "@") :: body
         if !body.isEmpty && !body.exists(_.is_error) =>
           (body.init.filterNot(_.is_space), body.last) match {
-            case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "{") :: toks,
-                  Token(Token.Kind.KEYWORD, "}")) => (id, toks)
-            case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "(") :: toks,
-                  Token(Token.Kind.KEYWORD, ")")) => (id, toks)
-            case _ => ("", Nil)
+            case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "{") :: toks,
+                  Token(Token.Kind.KEYWORD, "}")) => Some(toks)
+            case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "(") :: toks,
+                  Token(Token.Kind.KEYWORD, ")")) => Some(toks)
+            case _ => None
           }
-        case _ => ("", Nil)
+        case _ => None
       }
-    override def kind: String = content._1
-    def content_tokens: List[Token] = content._2
-
-    def is_wellformed: Boolean = kind != ""
+    def is_wellformed: Boolean = kind != "" && wellformed_content.isDefined
+    def content_tokens: List[Token] = wellformed_content getOrElse Nil
 
     def name: String =
       content_tokens match {
-        case Token(Token.Kind.IDENT, name) :: _ if is_wellformed => name
+        case Token(Token.Kind.IDENT, id) :: _ if is_wellformed => id
         case _ => ""
       }
   }
@@ -161,8 +169,10 @@
 
   // context of partial line-oriented scans
   abstract class Line_Context
-  case class Delimited(quoted: Boolean, depth: Int) extends Line_Context
-  val Finished = Delimited(false, 0)
+  case object Ignored_Context extends Line_Context
+  case class Item_Context(kind: String, delim: Delimited, right: String) extends Line_Context
+  case class Delimited(quoted: Boolean, depth: Int)
+  val Closed = Delimited(false, 0)
 
   private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
   private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)
@@ -180,8 +190,17 @@
     private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
     private val strict_space = """[ \t]+""".r ^^ token(Token.Kind.SPACE)
 
-    private val ignored =
-      rep1("""(?mi)([^@]+|@[ \t\n\r]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) }
+
+    /* ignored material outside items */
+
+    private val ignored: Parser[Chunk] =
+      rep1("""(?mi)([^@]+|@[ \t]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) }
+
+    private def ignored_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
+      ctxt match {
+        case Ignored_Context => ignored ^^ { case a => (a, ctxt) }
+        case _ => failure("")
+      }
 
 
     /* delimited string: outermost "..." or {...} and body with balanced {...} */
@@ -210,29 +229,29 @@
             else finished = true
           }
           if (i == start) Failure("bad input", in)
-          else
-            Success((in.source.subSequence(start, i).toString,
-              Delimited(q, d)), in.drop(i - start))
+          else {
+            val s = in.source.subSequence(start, i).toString
+            Success((s, Delimited(q, d)), in.drop(i - start))
+          }
         }
       }.named("delimited_depth")
 
-    private def delimited: Parser[String] =
-      delimited_depth(Finished) ^? { case (x, delim) if delim == Finished => x }
+    private def delimited: Parser[Token] =
+      delimited_depth(Closed) ^?
+        { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) }
 
-    private def delimited_line(ctxt: Line_Context): Parser[(String, Line_Context)] =
+    private def delimited_line(ctxt: Line_Context): Parser[(Item, Line_Context)] =
     {
       ctxt match {
-        case delim: Delimited => delimited_depth(delim)
+        case Item_Context(kind, delim, right) =>
+          delimited_depth(delim) ^^ { case (s, delim1) =>
+            (Item(kind, List(Token(Token.Kind.STRING, s))), Item_Context(kind, delim1, right)) }
         case _ => failure("")
       }
     }
 
-    private val recover_delimited: Parser[String] =
-      delimited_depth(Finished) ^^ (_._1)
-
-    private val delimited_token =
-      delimited ^^ token(Token.Kind.STRING) |
-      recover_delimited ^^ token(Token.Kind.ERROR)
+    private def recover_delimited: Parser[Token] =
+      """(?m)["{][^@]+""".r ^^ token(Token.Kind.ERROR)
 
 
     /* other tokens */
@@ -248,32 +267,80 @@
     private val ident =
       """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r ^^ token(Token.Kind.IDENT)
 
+    val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space))
+
+
+    /* items */
+
+    private val item_start: Parser[(String, List[Token])] =
+      at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^
+        { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) }
+
+    private val item_body =
+      delimited | (recover_delimited | other_token)
+
+    private val item: Parser[Item] =
+      (item_start ~ left_brace ~ rep(item_body) ~ opt(right_brace) |
+       item_start ~ left_paren ~ rep(item_body) ~ opt(right_paren)) ^^
+        { case (kind, a) ~ b ~ c ~ d => Item(kind, a ::: List(b) ::: c ::: d.toList) }
+
+    private val recover_item: Parser[Item] =
+      at ~ "(?m)[^@]+".r ^^ { case a ~ b => Item("", List(a, Token(Token.Kind.ERROR, b))) }
+
+    def item_line(ctxt: Line_Context): Parser[(Item, Line_Context)] =
+    {
+      ctxt match {
+        case Ignored_Context =>
+          item_start ~ (left_brace | left_paren) ^^
+            { case (kind, a) ~ b =>
+                val right = if (b.source == "{") "}" else ")"
+                (Item(kind, a ::: List(b)), Item_Context(kind, Closed, right)) }
+        case Item_Context(kind, delim, right) =>
+          if (delim.depth > 0)
+            delimited_line(ctxt)
+          else {
+            delimited_line(ctxt) |
+            other_token ^^ { case a => (Item(kind, List(a)), ctxt) } |
+            right ^^ { case a => (Item(kind, List(keyword(a))), Ignored_Context) }
+          }
+        case _ => failure("")
+      }
+    }
+
 
     /* chunks */
 
-    private val item_start =
-      at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^
-        { case a ~ b ~ c ~ d => List(a) ::: b ::: List(c) ::: d }
-
-    private val body_token = delimited_token | ("[=#,]".r ^^ keyword | (nat | (ident | space)))
+    val chunk: Parser[Chunk] = ignored | (item | recover_item)
 
-    private val item =
-      (item_start ~ left_brace ~ rep(body_token) ~ opt(right_brace) |
-       item_start ~ left_paren ~ rep(body_token) ~ opt(right_paren)) ^^
-        { case a ~ b ~ c ~ d => Item(a ::: List(b) ::: c ::: d.toList) }
+    def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
+      ignored_line(ctxt) | item_line(ctxt)
+  }
 
-    private val recover_item = "(?m)@[^@]+".r ^^ (s => Malformed(s))
 
-    val chunks: Parser[List[Chunk]] = rep(ignored | (item | recover_item))
-  }
+  /* parse */
 
   def parse(input: CharSequence): List[Chunk] =
   {
     val in: Reader[Char] = new CharSequenceReader(input)
-    Parsers.parseAll(Parsers.chunks, in) match {
+    Parsers.parseAll(Parsers.rep(Parsers.chunk), in) match {
       case Parsers.Success(result, _) => result
       case _ => error("Unexpected failure to parse input:\n" + input.toString)
     }
   }
+
+  def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) =
+  {
+    var in: Reader[Char] = new CharSequenceReader(input)
+    val chunks = new mutable.ListBuffer[Chunk]
+    var ctxt = context
+    while (!in.atEnd) {
+      Parsers.parse(Parsers.chunk_line(ctxt), in) match {
+        case Parsers.Success((x, c), rest) => { chunks += x; ctxt = c; in = rest }
+        case Parsers.NoSuccess(_, rest) =>
+          error("Unepected failure to parse input:\n" + rest.source.toString)
+      }
+    }
+    (chunks.toList, ctxt)
+  }
 }