src/Pure/Tools/bibtex.scala
author wenzelm
Mon Oct 09 21:12:22 2017 +0200
changeset 66822 4642cf4a7ebb
parent 66152 18e1aba549f6
child 67014 e6a695d6a6b2
permissions -rw-r--r--
tuned signature;
/*  Title:      Pure/Tools/bibtex.scala
    Author:     Makarius

BibTeX support.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.combinator.RegexParsers
import scala.util.parsing.input.Reader


object Bibtex
{
  /** document model **/

  def check_name(name: String): Boolean = name.endsWith(".bib")
  def check_name(name: Document.Node.Name): Boolean = check_name(name.node)

  def entries(text: String): List[Text.Info[String]] =
  {
    val result = new mutable.ListBuffer[Text.Info[String]]
    var offset = 0
    for (chunk <- Bibtex.parse(text)) {
      val end_offset = offset + chunk.source.length
      if (chunk.name != "" && !chunk.is_command)
        result += Text.Info(Text.Range(offset, end_offset), chunk.name)
      offset = end_offset
    }
    result.toList
  }
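
  /* Example (illustrative sketch, not part of the original sources): for the input
     "@Article{smith2020, author = {Smith}, title = {Work}}" the function "entries"
     yields a single Text.Info whose range covers the whole chunk and whose content
     is the entry name "smith2020". */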

  def entries_iterator[A, B <: Document.Model](models: Map[A, B])
    : Iterator[Text.Info[(String, B)]] =
  {
    for {
      (_, model) <- models.iterator
      info <- model.bibtex_entries.iterator
    } yield info.map((_, model))
  }


  /* completion */

  def completion[A, B <: Document.Model](
    history: Completion.History, rendering: Rendering, caret: Text.Offset,
    models: Map[A, B]): Option[Completion.Result] =
  {
    for {
      Text.Info(r, name) <- rendering.citation(rendering.before_caret_range(caret))
      name1 <- Completion.clean_name(name)

      original <- rendering.model.try_get_text(r)
      original1 <- Completion.clean_name(Library.perhaps_unquote(original))

      entries =
        (for {
          Text.Info(_, (entry, _)) <- entries_iterator(models)
          if entry.toLowerCase.containsSlice(name1.toLowerCase) && entry != original1
        } yield entry).toList
      if entries.nonEmpty

      items =
        entries.sorted.map({
          case entry =>
            val full_name = Long_Name.qualify(Markup.CITATION, entry)
            val description = List(entry, "(BibTeX entry)")
            val replacement = quote(entry)
            Completion.Item(r, original, full_name, description, replacement, 0, false)
        }).sorted(history.ordering).take(rendering.options.int("completion_limit"))
    } yield Completion.Result(r, original, false, items)
  }



  /** content **/

  private val months = List(
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec")
  def is_month(s: String): Boolean = months.contains(s.toLowerCase)

  private val commands = List("preamble", "string")
  def is_command(s: String): Boolean = commands.contains(s.toLowerCase)

  sealed case class Entry(
    kind: String,
    required: List[String],
    optional_crossref: List[String],
    optional_other: List[String])
  {
    def is_required(s: String): Boolean = required.contains(s.toLowerCase)
    def is_optional(s: String): Boolean =
      optional_crossref.contains(s.toLowerCase) || optional_other.contains(s.toLowerCase)

    def fields: List[String] = required ::: optional_crossref ::: optional_other
    def template: String =
      "@" + kind + "{,\n" + fields.map(x => "  " + x + " = {},\n").mkString + "}\n"
  }
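
  /* Example (illustrative sketch, not part of the original sources): the "template"
     of the Article entry from known_entries below expands to the skeleton

       @Article{,
         author = {},
         title = {},
         journal = {},
         year = {},
         volume = {},
         number = {},
         pages = {},
         month = {},
         note = {},
       }
  */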

  val known_entries: List[Entry] =
    List(
      Entry("Article",
        List("author", "title"),
        List("journal", "year"),
        List("volume", "number", "pages", "month", "note")),
      Entry("InProceedings",
        List("author", "title"),
        List("booktitle", "year"),
        List("editor", "volume", "number", "series", "pages", "month", "address",
          "organization", "publisher", "note")),
      Entry("InCollection",
        List("author", "title", "booktitle"),
        List("publisher", "year"),
        List("editor", "volume", "number", "series", "type", "chapter", "pages",
          "edition", "month", "address", "note")),
      Entry("InBook",
        List("author", "editor", "title", "chapter"),
        List("publisher", "year"),
        List("volume", "number", "series", "type", "address", "edition", "month", "pages", "note")),
      Entry("Proceedings",
        List("title", "year"),
        List(),
        List("booktitle", "editor", "volume", "number", "series", "address", "month",
          "organization", "publisher", "note")),
      Entry("Book",
        List("author", "editor", "title"),
        List("publisher", "year"),
        List("volume", "number", "series", "address", "edition", "month", "note")),
      Entry("Booklet",
        List("title"),
        List(),
        List("author", "howpublished", "address", "month", "year", "note")),
      Entry("PhdThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("MastersThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("TechReport",
        List("author", "title", "institution", "year"),
        List(),
        List("type", "number", "address", "month", "note")),
      Entry("Manual",
        List("title"),
        List(),
        List("author", "organization", "address", "edition", "month", "year", "note")),
      Entry("Unpublished",
        List("author", "title", "note"),
        List(),
        List("month", "year")),
      Entry("Misc",
        List(),
        List(),
        List("author", "title", "howpublished", "month", "year", "note")))

  def get_entry(kind: String): Option[Entry] =
    known_entries.find(entry => entry.kind.toLowerCase == kind.toLowerCase)

  def is_entry(kind: String): Boolean = get_entry(kind).isDefined
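
  /* Example (illustrative sketch, not part of the original sources): entry kinds
     are matched case-insensitively against known_entries, e.g.

       Bibtex.get_entry("ARTICLE").map(_.required)   // Some(List("author", "title"))
       Bibtex.is_entry("article")                    // true
       Bibtex.is_entry("comment")                    // false
  */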



  /** tokens and chunks **/

  object Token
  {
    object Kind extends Enumeration
    {
      val COMMAND = Value("command")
      val ENTRY = Value("entry")
      val KEYWORD = Value("keyword")
      val NAT = Value("natural number")
      val STRING = Value("string")
      val NAME = Value("name")
      val IDENT = Value("identifier")
      val SPACE = Value("white space")
      val COMMENT = Value("ignored text")
      val ERROR = Value("bad input")
    }
  }

  sealed case class Token(kind: Token.Kind.Value, source: String)
  {
    def is_kind: Boolean =
      kind == Token.Kind.COMMAND ||
      kind == Token.Kind.ENTRY ||
      kind == Token.Kind.IDENT
    def is_name: Boolean =
      kind == Token.Kind.NAME ||
      kind == Token.Kind.IDENT
    def is_ignored: Boolean =
      kind == Token.Kind.SPACE ||
      kind == Token.Kind.COMMENT
    def is_malformed: Boolean = kind == Token.Kind.ERROR
  }

  case class Chunk(kind: String, tokens: List[Token])
  {
    val source = tokens.map(_.source).mkString

    private val content: Option[List[Token]] =
      tokens match {
        case Token(Token.Kind.KEYWORD, "@") :: body if body.nonEmpty =>
          (body.init.filterNot(_.is_ignored), body.last) match {
            case (tok :: Token(Token.Kind.KEYWORD, "{") :: toks, Token(Token.Kind.KEYWORD, "}"))
            if tok.is_kind => Some(toks)

            case (tok :: Token(Token.Kind.KEYWORD, "(") :: toks, Token(Token.Kind.KEYWORD, ")"))
            if tok.is_kind => Some(toks)

            case _ => None
          }
        case _ => None
      }

    def name: String =
      content match {
        case Some(tok :: _) if tok.is_name => tok.source
        case _ => ""
      }

    def is_ignored: Boolean = kind == "" && tokens.forall(_.is_ignored)
    def is_malformed: Boolean = kind == "" || tokens.exists(_.is_malformed)
    def is_command: Boolean = Bibtex.is_command(kind) && name != "" && content.isDefined
    def is_entry: Boolean = Bibtex.is_entry(kind) && name != "" && content.isDefined
  }
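
  /* Example (illustrative sketch, not part of the original sources): parsing the
     source "@string{ACM = {Association for Computing Machinery}}" produces one
     chunk of kind "string" with name "ACM", for which is_command holds, whereas
     free text between items is collected into ignored chunks. */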



  /** parsing **/

  // context of partial line-oriented scans
  abstract class Line_Context
  case object Ignored extends Line_Context
  case object At extends Line_Context
  case class Item_Start(kind: String) extends Line_Context
  case class Item_Open(kind: String, end: String) extends Line_Context
  case class Item(kind: String, end: String, delim: Delimited) extends Line_Context

  case class Delimited(quoted: Boolean, depth: Int)
  val Closed = Delimited(false, 0)
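
  /* Added commentary (not from the original sources): the intended transitions are
     Ignored --"@"--> At --entry/command kind--> Item_Start --"{" or "("--> Item_Open
     --item name--> Item; within Item, Delimited records whether a quoted string or
     brace group is still open across lines, and the closing "}" or ")" leads back
     to Ignored. */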

  private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
  private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)


  // See also http://ctan.org/tex-archive/biblio/bibtex/base/bibtex.web
  // module @<Scan for and process a \.{.bib} command or database entry@>.

  object Parsers extends RegexParsers
  {
    /* white space and comments */

    override val whiteSpace = "".r

    private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
    private val spaces = rep(space)


    /* ignored text */

    private val ignored: Parser[Chunk] =
      rep1("""(?i)([^@]+|@[ \t]*comment)""".r) ^^ {
        case ss => Chunk("", List(Token(Token.Kind.COMMENT, ss.mkString))) }

    private def ignored_line: Parser[(Chunk, Line_Context)] =
      ignored ^^ { case a => (a, Ignored) }


    /* delimited string: outermost "..." or {...} and body with balanced {...} */

    // see also bibtex.web: scan_a_field_token_and_eat_white, scan_balanced_braces
    private def delimited_depth(delim: Delimited): Parser[(String, Delimited)] =
      new Parser[(String, Delimited)]
      {
        require(if (delim.quoted) delim.depth > 0 else delim.depth >= 0)

        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length

          var i = start
          var q = delim.quoted
          var d = delim.depth
          var finished = false
          while (!finished && i < end) {
            val c = in.source.charAt(i)

            if (c == '"' && d == 0) { i += 1; d = 1; q = true }
            else if (c == '"' && d == 1 && q) {
              i += 1; d = 0; q = false; finished = true
            }
            else if (c == '{') { i += 1; d += 1 }
            else if (c == '}') {
              if (d == 1 && !q || d > 1) { i += 1; d -= 1; if (d == 0) finished = true }
              else { i = start; finished = true }
            }
            else if (d > 0) i += 1
            else finished = true
          }
          if (i == start) Failure("bad input", in)
          else {
            val s = in.source.subSequence(start, i).toString
            Success((s, Delimited(q, d)), in.drop(i - start))
          }
        }
      }.named("delimited_depth")

    private def delimited: Parser[Token] =
      delimited_depth(Closed) ^?
        { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) }

    private def recover_delimited: Parser[Token] =
      """["{][^@]*""".r ^^ token(Token.Kind.ERROR)

    def delimited_line(ctxt: Item): Parser[(Chunk, Line_Context)] =
      delimited_depth(ctxt.delim) ^^ { case (s, delim1) =>
        (Chunk(ctxt.kind, List(Token(Token.Kind.STRING, s))), ctxt.copy(delim = delim1)) } |
      recover_delimited ^^ { case a => (Chunk(ctxt.kind, List(a)), Ignored) }
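
    /* Added commentary (not from the original sources): "delimited" only succeeds
       on a complete outermost group such as {Foo {Bar}} or "Foo Bar", while
       "delimited_line" may stop at a line boundary with an open brace or quote
       and reports the remaining nesting via the resulting Delimited value. */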


    /* other tokens */

    private val at = "@" ^^ keyword

    private val nat = "[0-9]+".r ^^ token(Token.Kind.NAT)

    private val name = """[\x21-\x7f&&[^"#%'(),={}]]+""".r ^^ token(Token.Kind.NAME)

    private val identifier =
      """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r

    private val ident = identifier ^^ token(Token.Kind.IDENT)

    val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space))


    /* body */

    private val body =
      delimited | (recover_delimited | other_token)

    private def body_line(ctxt: Item) =
      if (ctxt.delim.depth > 0)
        delimited_line(ctxt)
      else
        delimited_line(ctxt) |
        other_token ^^ { case a => (Chunk(ctxt.kind, List(a)), ctxt) } |
        ctxt.end ^^ { case a => (Chunk(ctxt.kind, List(keyword(a))), Ignored) }


    /* items: command or entry */

    private val item_kind =
      identifier ^^ { case a =>
        val kind =
          if (is_command(a)) Token.Kind.COMMAND
          else if (is_entry(a)) Token.Kind.ENTRY
          else Token.Kind.IDENT
        Token(kind, a)
      }

    private val item_begin =
      "{" ^^ { case a => ("}", keyword(a)) } |
      "(" ^^ { case a => (")", keyword(a)) }

    private def item_name(kind: String) =
      kind.toLowerCase match {
        case "preamble" => failure("")
        case "string" => identifier ^^ token(Token.Kind.NAME)
        case _ => name
      }

    private val item_start =
      at ~ spaces ~ item_kind ~ spaces ^^
        { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) }

    private val item: Parser[Chunk] =
      (item_start ~ item_begin ~ spaces) into
        { case (kind, a) ~ ((end, b)) ~ c =>
            opt(item_name(kind)) ~ rep(body) ~ opt(end ^^ keyword) ^^ {
              case d ~ e ~ f => Chunk(kind, a ::: List(b) ::: c ::: d.toList ::: e ::: f.toList) } }

    private val recover_item: Parser[Chunk] =
      at ~ "[^@]*".r ^^ { case a ~ b => Chunk("", List(a, Token(Token.Kind.ERROR, b))) }


    /* chunks */

    val chunk: Parser[Chunk] = ignored | (item | recover_item)

    def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =
    {
      ctxt match {
        case Ignored =>
          ignored_line |
          at ^^ { case a => (Chunk("", List(a)), At) }

        case At =>
          space ^^ { case a => (Chunk("", List(a)), ctxt) } |
          item_kind ^^ { case a => (Chunk(a.source, List(a)), Item_Start(a.source)) } |
          recover_item ^^ { case a => (a, Ignored) } |
          ignored_line

        case Item_Start(kind) =>
          space ^^ { case a => (Chunk(kind, List(a)), ctxt) } |
          item_begin ^^ { case (end, a) => (Chunk(kind, List(a)), Item_Open(kind, end)) } |
          recover_item ^^ { case a => (a, Ignored) } |
          ignored_line

        case Item_Open(kind, end) =>
          space ^^ { case a => (Chunk(kind, List(a)), ctxt) } |
          item_name(kind) ^^ { case a => (Chunk(kind, List(a)), Item(kind, end, Closed)) } |
          body_line(Item(kind, end, Closed)) |
          ignored_line

        case item_ctxt: Item =>
          body_line(item_ctxt) |
          ignored_line

        case _ => failure("")
      }
    }
  }


  /* parse */

  def parse(input: CharSequence): List[Chunk] =
    Parsers.parseAll(Parsers.rep(Parsers.chunk), Scan.char_reader(input)) match {
      case Parsers.Success(result, _) => result
      case _ => error("Unexpected failure to parse input:\n" + input.toString)
    }
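
  /* Example (illustrative sketch, not part of the original sources):

       Bibtex.parse("Some text @Misc{note1, note = {x}} more text")

     yields three chunks: ignored text, a "Misc" entry named "note1", and again
     ignored text. */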

  def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) =
  {
    var in: Reader[Char] = Scan.char_reader(input)
    val chunks = new mutable.ListBuffer[Chunk]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.chunk_line(ctxt), in) match {
        case Parsers.Success((x, c), rest) => chunks += x; ctxt = c; in = rest
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure to parse input:\n" + rest.source.toString)
      }
    }
    (chunks.toList, ctxt)
  }
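
  /* Example (illustrative sketch, not part of the original sources): "parse_line"
     supports incremental line-oriented scanning, threading the Line_Context through
     successive text fragments, e.g.

       val (chunks1, ctxt1) = Bibtex.parse_line("@Misc{note1,", Bibtex.Ignored)
       val (chunks2, ctxt2) = Bibtex.parse_line("  note = {x}}", ctxt1)

     after the second call the closing brace has been seen and ctxt2 is
     Bibtex.Ignored again. */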
}