src/Pure/Tools/bibtex.scala
author wenzelm
Wed Nov 26 20:05:34 2014 +0100 (2014-11-26)
changeset 59058 a78612c67ec0
parent 58609 d0cb70d66bc1
child 59319 677615cba30d
permissions -rw-r--r--
renamed "pairself" to "apply2", in accordance with @{apply 2};
     1 /*  Title:      Pure/Tools/bibtex.scala
     2     Author:     Makarius
     3 
     4 BibTeX support.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.mutable
    11 import scala.util.parsing.input.{Reader, CharSequenceReader}
    12 import scala.util.parsing.combinator.RegexParsers
    13 
    14 
    15 object Bibtex
    16 {
    17   /** content **/
    18 
    19   private val months = List(
    20     "jan",
    21     "feb",
    22     "mar",
    23     "apr",
    24     "may",
    25     "jun",
    26     "jul",
    27     "aug",
    28     "sep",
    29     "oct",
    30     "nov",
    31     "dec")
    32   def is_month(s: String): Boolean = months.contains(s.toLowerCase)
    33 
    34   private val commands = List("preamble", "string")
    35   def is_command(s: String): Boolean = commands.contains(s.toLowerCase)
    36 
    37   sealed case class Entry(
    38     kind: String,
    39     required: List[String],
    40     optional_crossref: List[String],
    41     optional_other: List[String])
    42   {
    43     def is_required(s: String): Boolean = required.contains(s.toLowerCase)
    44     def is_optional(s: String): Boolean =
    45       optional_crossref.contains(s.toLowerCase) || optional_other.contains(s.toLowerCase)
    46 
    47     def fields: List[String] = required ::: optional_crossref ::: optional_other
    48     def template: String =
    49       "@" + kind + "{,\n" + fields.map(x => "  " + x + " = {},\n").mkString + "}\n"
    50   }
    51 
  // standard BibTeX entry kinds with their required and optional fields;
  // NOTE(review): "optional_crossref" presumably marks fields that may instead be
  // supplied via a cross-referenced entry — confirm against the BibTeX manual
  val entries: List[Entry] =
    List(
      Entry("Article",
        List("author", "title"),
        List("journal", "year"),
        List("volume", "number", "pages", "month", "note")),
      Entry("InProceedings",
        List("author", "title"),
        List("booktitle", "year"),
        List("editor", "volume", "number", "series", "pages", "month", "address",
          "organization", "publisher", "note")),
      Entry("InCollection",
        List("author", "title", "booktitle"),
        List("publisher", "year"),
        List("editor", "volume", "number", "series", "type", "chapter", "pages",
          "edition", "month", "address", "note")),
      Entry("InBook",
        List("author", "editor", "title", "chapter"),
        List("publisher", "year"),
        List("volume", "number", "series", "type", "address", "edition", "month", "pages", "note")),
      Entry("Proceedings",
        List("title", "year"),
        List(),
        List("booktitle", "editor", "volume", "number", "series", "address", "month",
          "organization", "publisher", "note")),
      Entry("Book",
        List("author", "editor", "title"),
        List("publisher", "year"),
        List("volume", "number", "series", "address", "edition", "month", "note")),
      Entry("Booklet",
        List("title"),
        List(),
        List("author", "howpublished", "address", "month", "year", "note")),
      Entry("PhdThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("MastersThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("TechReport",
        List("author", "title", "institution", "year"),
        List(),
        List("type", "number", "address", "month", "note")),
      Entry("Manual",
        List("title"),
        List(),
        List("author", "organization", "address", "edition", "month", "year", "note")),
      Entry("Unpublished",
        List("author", "title", "note"),
        List(),
        List("month", "year")),
      Entry("Misc",
        List(),
        List(),
        List("author", "title", "howpublished", "month", "year", "note")))
   109 
   110   def get_entry(kind: String): Option[Entry] =
   111     entries.find(entry => entry.kind.toLowerCase == kind.toLowerCase)
   112 
   113   def is_entry(kind: String): Boolean = get_entry(kind).isDefined
   114 
   115 
   116 
   117   /** tokens and chunks **/
   118 
  object Token
  {
    // classification of scanned tokens; the Value names double as
    // human-readable descriptions (do not reorder: Enumeration ids depend on it)
    object Kind extends Enumeration
    {
      val COMMAND = Value("command")
      val ENTRY = Value("entry")
      val KEYWORD = Value("keyword")
      val NAT = Value("natural number")
      val STRING = Value("string")
      val NAME = Value("name")
      val IDENT = Value("identifier")
      val SPACE = Value("white space")
      val COMMENT = Value("ignored text")
      val ERROR = Value("bad input")
    }
  }
   135 
   136   sealed case class Token(kind: Token.Kind.Value, val source: String)
   137   {
   138     def is_kind: Boolean =
   139       kind == Token.Kind.COMMAND ||
   140       kind == Token.Kind.ENTRY ||
   141       kind == Token.Kind.IDENT
   142     def is_name: Boolean =
   143       kind == Token.Kind.NAME ||
   144       kind == Token.Kind.IDENT
   145     def is_ignored: Boolean =
   146       kind == Token.Kind.SPACE ||
   147       kind == Token.Kind.COMMENT
   148     def is_malformed: Boolean = kind ==
   149       Token.Kind.ERROR
   150   }
   151 
  // a maximal piece of input: an item ("@kind{...}" / "@kind(...)") or ignored text
  case class Chunk(kind: String, tokens: List[Token])
  {
    // concatenated source text of all tokens
    val source = tokens.map(_.source).mkString

    // inner tokens of a well-formed item: strips the leading "@", the kind
    // token, the matching outer delimiters, and ignored (space/comment) tokens;
    // None if the token list does not have that shape
    private val content: Option[List[Token]] =
      tokens match {
        case Token(Token.Kind.KEYWORD, "@") :: body if !body.isEmpty =>
          (body.init.filterNot(_.is_ignored), body.last) match {
            case (tok :: Token(Token.Kind.KEYWORD, "{") :: toks, Token(Token.Kind.KEYWORD, "}"))
            if tok.is_kind => Some(toks)

            case (tok :: Token(Token.Kind.KEYWORD, "(") :: toks, Token(Token.Kind.KEYWORD, ")"))
            if tok.is_kind => Some(toks)

            case _ => None
          }
        case _ => None
      }

    // citation key of an entry (or name of a @string command); "" if absent
    def name: String =
      content match {
        case Some(tok :: _) if tok.is_name => tok.source
        case _ => ""
      }

    def is_ignored: Boolean = kind == "" && tokens.forall(_.is_ignored)
    def is_malformed: Boolean = kind == "" || tokens.exists(_.is_malformed)
    def is_command: Boolean = Bibtex.is_command(kind) && name != "" && content.isDefined
    def is_entry: Boolean = Bibtex.is_entry(kind) && name != "" && content.isDefined
  }
   182 
   183 
   184 
   185   /** parsing **/
   186 
  // context of partial line-oriented scans: the scanner state carried
  // from one line to the next
  abstract class Line_Context
  case object Ignored extends Line_Context                   // outside of any item
  case object At extends Line_Context                        // just after "@"
  case class Item_Start(kind: String) extends Line_Context   // after "@kind"
  // after the opening delimiter; end is the required closing delimiter ("}" or ")")
  case class Item_Open(kind: String, end: String) extends Line_Context
  // inside the item body, with the current delimited-string state
  case class Item(kind: String, end: String, delim: Delimited) extends Line_Context

  // state of delimited-string scanning: quoted = within "..."; depth = {} nesting
  case class Delimited(quoted: Boolean, depth: Int)
  val Closed = Delimited(false, 0)
   197 
   198   private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
   199   private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)
   200 
   201 
   202   // See also http://ctan.org/tex-archive/biblio/bibtex/base/bibtex.web
   203   // module @<Scan for and process a \.{.bib} command or database entry@>.
   204 
  object Parsers extends RegexParsers
  {
    /* white space and comments */

    // disable implicit skipping: white space is scanned as explicit SPACE tokens
    override val whiteSpace = "".r

    private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
    private val spaces = rep(space)


    /* ignored text */

    // anything before "@", and "@comment", counts as ignored text (case-insensitive)
    private val ignored: Parser[Chunk] =
      rep1("""(?i)([^@]+|@[ \t]*comment)""".r) ^^ {
        case ss => Chunk("", List(Token(Token.Kind.COMMENT, ss.mkString))) }

    private def ignored_line: Parser[(Chunk, Line_Context)] =
      ignored ^^ { case a => (a, Ignored) }


    /* delimited string: outermost "..." or {...} and body with balanced {...} */

    // see also bibtex.web: scan_a_field_token_and_eat_white, scan_balanced_braces
    // hand-rolled scanner: consumes as much of a (possibly partial) delimited
    // string as the input allows and reports the resulting quote/brace state
    private def delimited_depth(delim: Delimited): Parser[(String, Delimited)] =
      new Parser[(String, Delimited)]
      {
        require(if (delim.quoted) delim.depth > 0 else delim.depth >= 0)

        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length

          var i = start
          var q = delim.quoted
          var d = delim.depth
          var finished = false
          while (!finished && i < end) {
            val c = in.source.charAt(i)

            // opening quote at top level starts a "..." string (depth 1)
            if (c == '"' && d == 0) { i += 1; d = 1; q = true }
            // closing quote of a "..." string finishes it
            else if (c == '"' && d == 1 && q) {
              i += 1; d = 0; q = false; finished = true
            }
            else if (c == '{') { i += 1; d += 1 }
            else if (c == '}') {
              // a brace may close nesting, or the whole {...} string (if unquoted);
              // an unbalanced "}" resets to start, i.e. the parser fails below
              if (d == 1 && !q || d > 1) { i += 1; d -= 1; if (d == 0) finished = true }
              else {i = start; finished = true }
            }
            else if (d > 0) i += 1
            else finished = true
          }
          if (i == start) Failure("bad input", in)
          else {
            val s = in.source.subSequence(start, i).toString
            Success((s, Delimited(q, d)), in.drop(i - start))
          }
        }
      }.named("delimited_depth")

    // a complete delimited string, i.e. scanning ends in the Closed state
    private def delimited: Parser[Token] =
      delimited_depth(Closed) ^?
        { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) }

    // error recovery: swallow a malformed string up to the next "@"
    private def recover_delimited: Parser[Token] =
      """["{][^@]*""".r ^^ token(Token.Kind.ERROR)

    // line-oriented variant: a partial string updates the Delimited state in ctxt
    def delimited_line(ctxt: Item): Parser[(Chunk, Line_Context)] =
      delimited_depth(ctxt.delim) ^^ { case (s, delim1) =>
        (Chunk(ctxt.kind, List(Token(Token.Kind.STRING, s))), ctxt.copy(delim = delim1)) } |
      recover_delimited ^^ { case a => (Chunk(ctxt.kind, List(a)), Ignored) }


    /* other tokens */

    private val at = "@" ^^ keyword

    private val nat = "[0-9]+".r ^^ token(Token.Kind.NAT)

    // printable ASCII without BibTeX delimiters/separators
    private val name = """[\x21-\x7f&&[^"#%'(),={}]]+""".r ^^ token(Token.Kind.NAME)

    // like name, but must not start with a digit
    private val identifier =
      """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r

    private val ident = identifier ^^ token(Token.Kind.IDENT)

    val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space))


    /* body */

    private val body =
      delimited | (recover_delimited | other_token)

    // line-oriented item body: within an open string only string scanning applies;
    // otherwise the closing delimiter ctxt.end returns to the Ignored state
    private def body_line(ctxt: Item) =
      if (ctxt.delim.depth > 0)
        delimited_line(ctxt)
      else
        delimited_line(ctxt) |
        other_token ^^ { case a => (Chunk(ctxt.kind, List(a)), ctxt) } |
        ctxt.end ^^ { case a => (Chunk(ctxt.kind, List(keyword(a))), Ignored) }


    /* items: command or entry */

    // classify the identifier after "@" as command, entry kind, or plain identifier
    private val item_kind =
      identifier ^^ { case a =>
        val kind =
          if (is_command(a)) Token.Kind.COMMAND
          else if (is_entry(a)) Token.Kind.ENTRY
          else Token.Kind.IDENT
        Token(kind, a)
      }

    // opening delimiter, paired with its required closing counterpart
    private val item_begin =
      "{" ^^ { case a => ("}", keyword(a)) } |
      "(" ^^ { case a => (")", keyword(a)) }

    // name following the opening delimiter: @preamble has none,
    // @string names are identifiers, entries have free-form citation keys
    private def item_name(kind: String) =
      kind.toLowerCase match {
        case "preamble" => failure("")
        case "string" => identifier ^^ token(Token.Kind.NAME)
        case _ => name
      }

    private val item_start =
      at ~ spaces ~ item_kind ~ spaces ^^
        { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) }

    // a complete item: "@kind{ name body }" (or parenthesized); name and the
    // closing delimiter are optional so that malformed tails are still chunked
    private val item: Parser[Chunk] =
      (item_start ~ item_begin ~ spaces) into
        { case (kind, a) ~ ((end, b)) ~ c =>
            opt(item_name(kind)) ~ rep(body) ~ opt(end ^^ keyword) ^^ {
              case d ~ e ~ f => Chunk(kind, a ::: List(b) ::: c ::: d.toList ::: e ::: f.toList) } }

    // error recovery: "@" followed by arbitrary text up to the next "@"
    private val recover_item: Parser[Chunk] =
      at ~ "[^@]*".r ^^ { case a ~ b => Chunk("", List(a, Token(Token.Kind.ERROR, b))) }


    /* chunks */

    val chunk: Parser[Chunk] = ignored | (item | recover_item)

    // line-oriented scanning: parse one step in the given context and
    // return the produced chunk together with the successor context
    def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
    {
      ctxt match {
        case Ignored =>
          ignored_line |
          at ^^ { case a => (Chunk("", List(a)), At) }

        case At =>
          space ^^ { case a => (Chunk("", List(a)), ctxt) } |
          item_kind ^^ { case a => (Chunk(a.source, List(a)), Item_Start(a.source)) } |
          recover_item ^^ { case a => (a, Ignored) } |
          ignored_line

        case Item_Start(kind) =>
          space ^^ { case a => (Chunk(kind, List(a)), ctxt) } |
          item_begin ^^ { case (end, a) => (Chunk(kind, List(a)), Item_Open(kind, end)) } |
          recover_item ^^ { case a => (a, Ignored) } |
          ignored_line

        case Item_Open(kind, end) =>
          space ^^ { case a => (Chunk(kind, List(a)), ctxt) } |
          item_name(kind) ^^ { case a => (Chunk(kind, List(a)), Item(kind, end, Closed)) } |
          body_line(Item(kind, end, Closed)) |
          ignored_line

        case item_ctxt: Item =>
          body_line(item_ctxt) |
          ignored_line

        case _ => failure("")
      }
    }
  }
   381 
   382 
   383   /* parse */
   384 
   385   def parse(input: CharSequence): List[Chunk] =
   386   {
   387     val in: Reader[Char] = new CharSequenceReader(input)
   388     Parsers.parseAll(Parsers.rep(Parsers.chunk), in) match {
   389       case Parsers.Success(result, _) => result
   390       case _ => error("Unexpected failure to parse input:\n" + input.toString)
   391     }
   392   }
   393 
   394   def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) =
   395   {
   396     var in: Reader[Char] = new CharSequenceReader(input)
   397     val chunks = new mutable.ListBuffer[Chunk]
   398     var ctxt = context
   399     while (!in.atEnd) {
   400       Parsers.parse(Parsers.chunk_line(ctxt), in) match {
   401         case Parsers.Success((x, c), rest) => { chunks += x; ctxt = c; in = rest }
   402         case Parsers.NoSuccess(_, rest) =>
   403           error("Unepected failure to parse input:\n" + rest.source.toString)
   404       }
   405     }
   406     (chunks.toList, ctxt)
   407   }
   408 }
   409