src/Pure/Tools/bibtex.scala
changeset 58528 7d6b8f8893e8
parent 58527 4b190c763097
child 58529 cd4439d8799c
equal deleted inserted replaced
58527:4b190c763097 58528:7d6b8f8893e8
     5 */
     5 */
     6 
     6 
     7 package isabelle
     7 package isabelle
     8 
     8 
     9 
     9 
       
    10 import scala.collection.mutable
    10 import scala.util.parsing.input.{Reader, CharSequenceReader}
    11 import scala.util.parsing.input.{Reader, CharSequenceReader}
    11 import scala.util.parsing.combinator.RegexParsers
    12 import scala.util.parsing.combinator.RegexParsers
    12 
    13 
    13 
    14 
    14 object Bibtex
    15 object Bibtex
   121   {
   122   {
   122     def is_space: Boolean = kind == Token.Kind.SPACE
   123     def is_space: Boolean = kind == Token.Kind.SPACE
   123     def is_error: Boolean = kind == Token.Kind.ERROR
   124     def is_error: Boolean = kind == Token.Kind.ERROR
   124   }
   125   }
   125 
   126 
   126   abstract class Chunk { def size: Int; def kind: String = "" }
   127   abstract class Chunk
   127   case class Ignored(source: String) extends Chunk { def size: Int = source.size }
   128   {
   128   case class Malformed(source: String) extends Chunk { def size: Int = source.size }
   129     def size: Int
   129   case class Item(tokens: List[Token]) extends Chunk
   130     def kind: String
       
   131   }
       
   132 
       
   133   case class Ignored(source: String) extends Chunk
       
   134   {
       
   135     def size: Int = source.size
       
   136     def kind: String = ""
       
   137   }
       
   138 
       
   139   case class Item(kind: String, tokens: List[Token]) extends Chunk
   130   {
   140   {
   131     def size: Int = (0 /: tokens)({ case (n, token) => n + token.source.size })
   141     def size: Int = (0 /: tokens)({ case (n, token) => n + token.source.size })
   132 
   142 
   133     private val content: (String, List[Token]) =
   143     private val wellformed_content: Option[List[Token]] =
   134       tokens match {
   144       tokens match {
   135         case Token(Token.Kind.KEYWORD, "@") :: body
   145         case Token(Token.Kind.KEYWORD, "@") :: body
   136         if !body.isEmpty && !body.exists(_.is_error) =>
   146         if !body.isEmpty && !body.exists(_.is_error) =>
   137           (body.init.filterNot(_.is_space), body.last) match {
   147           (body.init.filterNot(_.is_space), body.last) match {
   138             case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "{") :: toks,
   148             case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "{") :: toks,
   139                   Token(Token.Kind.KEYWORD, "}")) => (id, toks)
   149                   Token(Token.Kind.KEYWORD, "}")) => Some(toks)
   140             case (Token(Token.Kind.IDENT, id) :: Token(Token.Kind.KEYWORD, "(") :: toks,
   150             case (Token(Token.Kind.IDENT, _) :: Token(Token.Kind.KEYWORD, "(") :: toks,
   141                   Token(Token.Kind.KEYWORD, ")")) => (id, toks)
   151                   Token(Token.Kind.KEYWORD, ")")) => Some(toks)
   142             case _ => ("", Nil)
   152             case _ => None
   143           }
   153           }
   144         case _ => ("", Nil)
   154         case _ => None
   145       }
   155       }
   146     override def kind: String = content._1
   156     def is_wellformed: Boolean = kind != "" && wellformed_content.isDefined
   147     def content_tokens: List[Token] = content._2
   157     def content_tokens: List[Token] = wellformed_content getOrElse Nil
   148 
       
   149     def is_wellformed: Boolean = kind != ""
       
   150 
   158 
   151     def name: String =
   159     def name: String =
   152       content_tokens match {
   160       content_tokens match {
   153         case Token(Token.Kind.IDENT, name) :: _ if is_wellformed => name
   161         case Token(Token.Kind.IDENT, id) :: _ if is_wellformed => id
   154         case _ => ""
   162         case _ => ""
   155       }
   163       }
   156   }
   164   }
   157 
   165 
   158 
   166 
   159 
   167 
   160   /** parsing **/
   168   /** parsing **/
   161 
   169 
   162   // context of partial line-oriented scans
   170   // context of partial line-oriented scans
   163   abstract class Line_Context
   171   abstract class Line_Context
   164   case class Delimited(quoted: Boolean, depth: Int) extends Line_Context
   172   case object Ignored_Context extends Line_Context
   165   val Finished = Delimited(false, 0)
   173   case class Item_Context(kind: String, delim: Delimited, right: String) extends Line_Context
       
   174   case class Delimited(quoted: Boolean, depth: Int)
       
   175   val Closed = Delimited(false, 0)
   166 
   176 
   167   private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
   177   private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
   168   private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)
   178   private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)
   169 
   179 
   170 
   180 
   178     override val whiteSpace = "".r
   188     override val whiteSpace = "".r
   179 
   189 
   180     private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
   190     private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
   181     private val strict_space = """[ \t]+""".r ^^ token(Token.Kind.SPACE)
   191     private val strict_space = """[ \t]+""".r ^^ token(Token.Kind.SPACE)
   182 
   192 
   183     private val ignored =
   193 
   184       rep1("""(?mi)([^@]+|@[ \t\n\r]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) }
   194     /* ignored material outside items */
       
   195 
       
   196     private val ignored: Parser[Chunk] =
       
   197       rep1("""(?mi)([^@]+|@[ \t]*comment)""".r) ^^ { case ss => Ignored(ss.mkString) }
       
   198 
       
   199     private def ignored_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
       
   200       ctxt match {
       
   201         case Ignored_Context => ignored ^^ { case a => (a, ctxt) }
       
   202         case _ => failure("")
       
   203       }
   185 
   204 
   186 
   205 
   187     /* delimited string: outermost "..." or {...} and body with balanced {...} */
   206     /* delimited string: outermost "..." or {...} and body with balanced {...} */
   188 
   207 
   189     private def delimited_depth(delim: Delimited): Parser[(String, Delimited)] =
   208     private def delimited_depth(delim: Delimited): Parser[(String, Delimited)] =
   208             else if (c == '}' && d > 0) { i += 1; d -= 1; if (d == 0) finished = true }
   227             else if (c == '}' && d > 0) { i += 1; d -= 1; if (d == 0) finished = true }
   209             else if (d > 0) i += 1
   228             else if (d > 0) i += 1
   210             else finished = true
   229             else finished = true
   211           }
   230           }
   212           if (i == start) Failure("bad input", in)
   231           if (i == start) Failure("bad input", in)
   213           else
   232           else {
   214             Success((in.source.subSequence(start, i).toString,
   233             val s = in.source.subSequence(start, i).toString
   215               Delimited(q, d)), in.drop(i - start))
   234             Success((s, Delimited(q, d)), in.drop(i - start))
       
   235           }
   216         }
   236         }
   217       }.named("delimited_depth")
   237       }.named("delimited_depth")
   218 
   238 
   219     private def delimited: Parser[String] =
   239     private def delimited: Parser[Token] =
   220       delimited_depth(Finished) ^? { case (x, delim) if delim == Finished => x }
   240       delimited_depth(Closed) ^?
   221 
   241         { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) }
   222     private def delimited_line(ctxt: Line_Context): Parser[(String, Line_Context)] =
   242 
       
   243     private def delimited_line(ctxt: Line_Context): Parser[(Item, Line_Context)] =
   223     {
   244     {
   224       ctxt match {
   245       ctxt match {
   225         case delim: Delimited => delimited_depth(delim)
   246         case Item_Context(kind, delim, right) =>
       
   247           delimited_depth(delim) ^^ { case (s, delim1) =>
       
   248             (Item(kind, List(Token(Token.Kind.STRING, s))), Item_Context(kind, delim1, right)) }
   226         case _ => failure("")
   249         case _ => failure("")
   227       }
   250       }
   228     }
   251     }
   229 
   252 
   230     private val recover_delimited: Parser[String] =
   253     private def recover_delimited: Parser[Token] =
   231       delimited_depth(Finished) ^^ (_._1)
   254       """(?m)["{][^@]+""".r ^^ token(Token.Kind.ERROR)
   232 
       
   233     private val delimited_token =
       
   234       delimited ^^ token(Token.Kind.STRING) |
       
   235       recover_delimited ^^ token(Token.Kind.ERROR)
       
   236 
   255 
   237 
   256 
   238     /* other tokens */
   257     /* other tokens */
   239 
   258 
   240     private val at = "@" ^^ keyword
   259     private val at = "@" ^^ keyword
   246     private val nat = "[0-9]+".r ^^ token(Token.Kind.NAT)
   265     private val nat = "[0-9]+".r ^^ token(Token.Kind.NAT)
   247 
   266 
   248     private val ident =
   267     private val ident =
   249       """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r ^^ token(Token.Kind.IDENT)
   268       """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r ^^ token(Token.Kind.IDENT)
   250 
   269 
       
   270     val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space))
       
   271 
       
   272 
       
   273     /* items */
       
   274 
       
   275     private val item_start: Parser[(String, List[Token])] =
       
   276       at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^
       
   277         { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) }
       
   278 
       
   279     private val item_body =
       
   280       delimited | (recover_delimited | other_token)
       
   281 
       
   282     private val item: Parser[Item] =
       
   283       (item_start ~ left_brace ~ rep(item_body) ~ opt(right_brace) |
       
   284        item_start ~ left_paren ~ rep(item_body) ~ opt(right_paren)) ^^
       
   285         { case (kind, a) ~ b ~ c ~ d => Item(kind, a ::: List(b) ::: c ::: d.toList) }
       
   286 
       
   287     private val recover_item: Parser[Item] =
       
   288       at ~ "(?m)[^@]+".r ^^ { case a ~ b => Item("", List(a, Token(Token.Kind.ERROR, b))) }
       
   289 
       
   290     def item_line(ctxt: Line_Context): Parser[(Item, Line_Context)] =
       
   291     {
       
   292       ctxt match {
       
   293         case Ignored_Context =>
       
   294           item_start ~ (left_brace | left_paren) ^^
       
   295             { case (kind, a) ~ b =>
       
   296                 val right = if (b.source == "{") "}" else ")"
       
   297                 (Item(kind, a ::: List(b)), Item_Context(kind, Closed, right)) }
       
   298         case Item_Context(kind, delim, right) =>
       
   299           if (delim.depth > 0)
       
   300             delimited_line(ctxt)
       
   301           else {
       
   302             delimited_line(ctxt) |
       
   303             other_token ^^ { case a => (Item(kind, List(a)), ctxt) } |
       
   304             right ^^ { case a => (Item(kind, List(keyword(a))), Ignored_Context) }
       
   305           }
       
   306         case _ => failure("")
       
   307       }
       
   308     }
       
   309 
   251 
   310 
   252     /* chunks */
   311     /* chunks */
   253 
   312 
   254     private val item_start =
   313     val chunk: Parser[Chunk] = ignored | (item | recover_item)
   255       at ~ rep(strict_space) ~ ident ~ rep(strict_space) ^^
   314 
   256         { case a ~ b ~ c ~ d => List(a) ::: b ::: List(c) ::: d }
   315     def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
   257 
   316       ignored_line(ctxt) | item_line(ctxt)
   258     private val body_token = delimited_token | ("[=#,]".r ^^ keyword | (nat | (ident | space)))
   317   }
   259 
   318 
   260     private val item =
   319 
   261       (item_start ~ left_brace ~ rep(body_token) ~ opt(right_brace) |
   320   /* parse */
   262        item_start ~ left_paren ~ rep(body_token) ~ opt(right_paren)) ^^
       
   263         { case a ~ b ~ c ~ d => Item(a ::: List(b) ::: c ::: d.toList) }
       
   264 
       
   265     private val recover_item = "(?m)@[^@]+".r ^^ (s => Malformed(s))
       
   266 
       
   267     val chunks: Parser[List[Chunk]] = rep(ignored | (item | recover_item))
       
   268   }
       
   269 
   321 
   270   def parse(input: CharSequence): List[Chunk] =
   322   def parse(input: CharSequence): List[Chunk] =
   271   {
   323   {
   272     val in: Reader[Char] = new CharSequenceReader(input)
   324     val in: Reader[Char] = new CharSequenceReader(input)
   273     Parsers.parseAll(Parsers.chunks, in) match {
   325     Parsers.parseAll(Parsers.rep(Parsers.chunk), in) match {
   274       case Parsers.Success(result, _) => result
   326       case Parsers.Success(result, _) => result
   275       case _ => error("Unexpected failure to parse input:\n" + input.toString)
   327       case _ => error("Unexpected failure to parse input:\n" + input.toString)
   276     }
   328     }
   277   }
   329   }
       
   330 
       
   331   def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) =
       
   332   {
       
   333     var in: Reader[Char] = new CharSequenceReader(input)
       
   334     val chunks = new mutable.ListBuffer[Chunk]
       
   335     var ctxt = context
       
   336     while (!in.atEnd) {
       
   337       Parsers.parse(Parsers.chunk_line(ctxt), in) match {
       
   338         case Parsers.Success((x, c), rest) => { chunks += x; ctxt = c; in = rest }
       
   339         case Parsers.NoSuccess(_, rest) =>
       
   340           error("Unepected failure to parse input:\n" + rest.source.toString)
       
   341       }
       
   342     }
       
   343     (chunks.toList, ctxt)
       
   344   }
   278 }
   345 }
   279 
   346