diff -r 4b190c763097 -r 7d6b8f8893e8 src/Pure/Tools/bibtex.scala --- a/src/Pure/Tools/bibtex.scala Fri Oct 03 20:19:42 2014 +0200 +++ b/src/Pure/Tools/bibtex.scala Fri Oct 03 23:33:47 2014 +0200 @@ -7,6 +7,7 @@ package isabelle +import scala.collection.mutable import scala.util.parsing.input.{Reader, CharSequenceReader} import scala.util.parsing.combinator.RegexParsers @@ -123,34 +124,41 @@ def is_error: Boolean = kind == Token.Kind.ERROR } - abstract class Chunk { def size: Int; def kind: String = "" } - case class Ignored(source: String) extends Chunk { def size: Int = source.size } - case class Malformed(source: String) extends Chunk { def size: Int = source.size } - case class Item(tokens: List[Token]) extends Chunk + abstract class Chunk + { + def size: Int + def kind: String + } + + case class Ignored(source: String) extends Chunk + { + def size: Int = source.size + def kind: String = "" + } + + case class Item(kind: String, tokens: List[Token]) extends Chunk { def size: Int = (0 /: tokens)({ case (n, token) => n + token.source.size }) - private val content: (String, List[Token]) = + private val wellformed_content: Option[List[Token]] = tokens match { case Token(Token.Kind.KEYWORD, "@") :: body if !body.isEmpty && !body.exists(_.is_error) => (body.init.filterNot(_.is_space), body.last) match { - case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "{") :: toks, - Token(Token.Kind.KEYWORD, "}")) => (id, toks) - case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "(") :: toks, - Token(Token.Kind.KEYWORD, ")")) => (id, toks) - case _ => ("", Nil) + case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "{") :: toks, + Token(Token.Kind.KEYWORD, "}")) => Some(toks) + case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "(") :: toks, + Token(Token.Kind.KEYWORD, ")")) => Some(toks) + case _ => None } - case _ => ("", Nil) + case _ => None } - override def kind: String = content._1 - def content_tokens: List[Token] = content._2 - - def is_wellformed: Boolean = kind != "" + def is_wellformed: Boolean = kind != "" && wellformed_content.isDefined + def content_tokens: List[Token] = wellformed_content getOrElse Nil def name: String = content_tokens match { - case Token(Token.Kind.IDENT, name) :: _ if is_wellformed => name + case Token(Token.Kind.IDENT, id) :: _ if is_wellformed => id case _ => "" } } @@ -161,8 +169,10 @@ // context of partial line-oriented scans abstract class Line_Context - case class Delimited(quoted: Boolean, depth: Int) extends Line_Context - val Finished = Delimited(false, 0) + case object Ignored_Context extends Line_Context + case class Item_Context(kind: String, delim: Delimited, right: String) extends Line_Context + case class Delimited(quoted: Boolean, depth: Int) + val Closed = Delimited(false, 0) private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source) private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source) @@ -180,8 +190,17 @@ private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE) private val strict_space = """[ \t]+""".r ^^ token(Token.Kind.SPACE) - private val ignored = - rep1("""(?mi)([^@]+|@[ \t\n\r]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) } + + /* ignored material outside items */ + + private val ignored: Parser[Chunk] = + rep1("""(?mi)([^@]+|@[ \t]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) } + + private def ignored_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] = + ctxt match { + case Ignored_Context => ignored ^^ { case a => (a, ctxt) } + case _ => failure("") + } /* delimited string: outermost "..." or {...} and body with balanced {...} */ @@ -210,29 +229,29 @@ else finished = true } if (i == start) Failure("bad input", in) - else - Success((in.source.subSequence(start, i).toString, - Delimited(q, d)), in.drop(i - start)) + else { + val s = in.source.subSequence(start, i).toString + Success((s, Delimited(q, d)), in.drop(i - start)) + } } }.named("delimited_depth") - private def delimited: Parser[String] = - delimited_depth(Finished) ^? { case (x, delim) if delim == Finished => x } + private def delimited: Parser[Token] = + delimited_depth(Closed) ^? + { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) } - private def delimited_line(ctxt: Line_Context): Parser[(String, Line_Context)] = + private def delimited_line(ctxt: Line_Context): Parser[(Item, Line_Context)] = { ctxt match { - case delim: Delimited => delimited_depth(delim) + case Item_Context(kind, delim, right) => + delimited_depth(delim) ^^ { case (s, delim1) => + (Item(kind, List(Token(Token.Kind.STRING, s))), Item_Context(kind, delim1, right)) } case _ => failure("") } } - private val recover_delimited: Parser[String] = - delimited_depth(Finished) ^^ (_._1) - - private val delimited_token = - delimited ^^ token(Token.Kind.STRING) | - recover_delimited ^^ token(Token.Kind.ERROR) + private def recover_delimited: Parser[Token] = + """(?m)["{][^@]+""".r ^^ token(Token.Kind.ERROR) /* other tokens */ @@ -248,32 +267,80 @@ private val ident = """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r ^^ token(Token.Kind.IDENT) + val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space)) + + + /* items */ + + private val item_start: Parser[(String, List[Token])] = + at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^ + { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) } + + private val item_body = + delimited | (recover_delimited | other_token) + + private val item: Parser[Item] = + (item_start ~ left_brace ~ rep(item_body) ~ opt(right_brace) | + item_start ~ left_paren ~ rep(item_body) ~ opt(right_paren)) ^^ + { case (kind, a) ~ b ~ c ~ d => Item(kind, a ::: List(b) ::: c ::: d.toList) } + + private val recover_item: Parser[Item] = + at ~ "(?m)[^@]+".r ^^ { case a ~ b => Item("", List(a, Token(Token.Kind.ERROR, b))) } + + def item_line(ctxt: Line_Context): Parser[(Item, Line_Context)] = + { + ctxt match { + case Ignored_Context => + item_start ~ (left_brace | left_paren) ^^ + { case (kind, a) ~ b => + val right = if (b.source == "{") "}" else ")" + (Item(kind, a ::: List(b)), Item_Context(kind, Closed, right)) } + case Item_Context(kind, delim, right) => + if (delim.depth > 0) + delimited_line(ctxt) + else { + delimited_line(ctxt) | + other_token ^^ { case a => (Item(kind, List(a)), ctxt) } | + right ^^ { case a => (Item(kind, List(keyword(a))), Ignored_Context) } + } + case _ => failure("") + } + } + /* chunks */ - private val item_start = - at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^ - { case a ~ b ~ c ~ d => List(a) ::: b ::: List(c) ::: d } - - private val body_token = delimited_token | ("[=#,]".r ^^ keyword | (nat | (ident | space))) + val chunk: Parser[Chunk] = ignored | (item | recover_item) - private val item = - (item_start ~ left_brace ~ rep(body_token) ~ opt(right_brace) | - item_start ~ left_paren ~ rep(body_token) ~ opt(right_paren)) ^^ - { case a ~ b ~ c ~ d => Item(a ::: List(b) ::: c ::: d.toList) } + def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] = + ignored_line(ctxt) | item_line(ctxt) + } - private val recover_item = "(?m)@[^@]+".r ^^ (s => Malformed(s)) - val chunks: Parser[List[Chunk]] = rep(ignored | (item | recover_item)) - } + /* parse */ def parse(input: CharSequence): List[Chunk] = { val in: Reader[Char] = new CharSequenceReader(input) - Parsers.parseAll(Parsers.chunks, in) match { + Parsers.parseAll(Parsers.rep(Parsers.chunk), in) match { case Parsers.Success(result, _) => result case _ => error("Unexpected failure to parse input:\n" + input.toString) } } + + def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) = + { + var in: Reader[Char] = new CharSequenceReader(input) + val chunks = new mutable.ListBuffer[Chunk] + var ctxt = context + while (!in.atEnd) { + Parsers.parse(Parsers.chunk_line(ctxt), in) match { + case Parsers.Success((x, c), rest) => { chunks += x; ctxt = c; in = rest } + case Parsers.NoSuccess(_, rest) => + error("Unepected failure to parse input:\n" + rest.source.toString) + } + } + (chunks.toList, ctxt) + } }