/*  Title:      Pure/Tools/bibtex.scala
    Author:     Makarius

BibTeX support.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.input.{Reader, CharSequenceReader}
import scala.util.parsing.combinator.RegexParsers


object Bibtex
{
  /** content **/

  private val months = List(
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec")
  def is_month(s: String): Boolean = months.contains(s.toLowerCase)

  private val commands = List("preamble", "string")
  def is_command(s: String): Boolean = commands.contains(s.toLowerCase)

  sealed case class Entry(
    kind: String,
    required: List[String],
    optional_crossref: List[String],
    optional_other: List[String])
  {
    def is_required(s: String): Boolean = required.contains(s.toLowerCase)
    def is_optional(s: String): Boolean =
      optional_crossref.contains(s.toLowerCase) || optional_other.contains(s.toLowerCase)

    def fields: List[String] = required ::: optional_crossref ::: optional_other
    def template: String =
      "@" + kind + "{,\n" + fields.map(x => "  " + x + " = {},\n").mkString + "}\n"
  }
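  /* Example (for illustration only): for the "Article" entry declared below,
     template produces a skeleton with all required and optional fields left
     empty, of the form

       @Article{,
         author = {},
         title = {},
         journal = {},
         year = {},
         volume = {},
         number = {},
         pages = {},
         month = {},
         note = {},
       }
  */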
  val entries: List[Entry] =
    List(
      Entry("Article",
        List("author", "title"),
        List("journal", "year"),
        List("volume", "number", "pages", "month", "note")),
      Entry("InProceedings",
        List("author", "title"),
        List("booktitle", "year"),
        List("editor", "volume", "number", "series", "pages", "month", "address",
          "organization", "publisher", "note")),
      Entry("InCollection",
        List("author", "title", "booktitle"),
        List("publisher", "year"),
        List("editor", "volume", "number", "series", "type", "chapter", "pages",
          "edition", "month", "address", "note")),
      Entry("InBook",
        List("author", "editor", "title", "chapter"),
        List("publisher", "year"),
        List("volume", "number", "series", "type", "address", "edition", "month", "pages", "note")),
      Entry("Proceedings",
        List("title", "year"),
        List(),
        List("booktitle", "editor", "volume", "number", "series", "address", "month",
          "organization", "publisher", "note")),
      Entry("Book",
        List("author", "editor", "title"),
        List("publisher", "year"),
        List("volume", "number", "series", "address", "edition", "month", "note")),
      Entry("Booklet",
        List("title"),
        List(),
        List("author", "howpublished", "address", "month", "year", "note")),
      Entry("PhdThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("MastersThesis",
        List("author", "title", "school", "year"),
        List(),
        List("type", "address", "month", "note")),
      Entry("TechReport",
        List("author", "title", "institution", "year"),
        List(),
        List("type", "number", "address", "month", "note")),
      Entry("Manual",
        List("title"),
        List(),
        List("author", "organization", "address", "edition", "month", "year", "note")),
      Entry("Unpublished",
        List("author", "title", "note"),
        List(),
        List("month", "year")),
      Entry("Misc",
        List(),
        List(),
        List("author", "title", "howpublished", "month", "year", "note")))

  def get_entry(kind: String): Option[Entry] =
    entries.find(entry => entry.kind.toLowerCase == kind.toLowerCase)

  def is_entry(kind: String): Boolean = get_entry(kind).isDefined
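  /* Note (for illustration): entry lookup is case-insensitive, so
     get_entry("article") and get_entry("ARTICLE") both return the Article
     entry above, while is_entry("foo") is false. */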


  /** tokens and chunks **/

  object Token
  {
    object Kind extends Enumeration
    {
      val COMMAND = Value("command")
      val ENTRY = Value("entry")
      val KEYWORD = Value("keyword")
      val NAT = Value("natural number")
      val STRING = Value("string")
      val NAME = Value("name")
      val IDENT = Value("identifier")
      val SPACE = Value("white space")
      val COMMENT = Value("ignored text")
      val ERROR = Value("bad input")
    }
  }

|
137  | 
  {
 | 
|
| 58530 | 138  | 
def is_kind: Boolean =  | 
139  | 
kind == Token.Kind.COMMAND ||  | 
|
140  | 
kind == Token.Kind.ENTRY ||  | 
|
141  | 
kind == Token.Kind.IDENT  | 
|
| 58531 | 142  | 
def is_name: Boolean =  | 
143  | 
kind == Token.Kind.NAME ||  | 
|
144  | 
kind == Token.Kind.IDENT  | 
|
| 58535 | 145  | 
def is_ignored: Boolean =  | 
146  | 
kind == Token.Kind.SPACE ||  | 
|
147  | 
kind == Token.Kind.COMMENT  | 
|
148  | 
def is_malformed: Boolean = kind ==  | 
|
149  | 
Token.Kind.ERROR  | 
|
| 58523 | 150  | 
}  | 
151  | 
||
  case class Chunk(kind: String, tokens: List[Token])
  {
    val source = tokens.map(_.source).mkString

    private val content: Option[List[Token]] =
      tokens match {
        case Token(Token.Kind.KEYWORD, "@") :: body if !body.isEmpty =>
          (body.init.filterNot(_.is_ignored), body.last) match {
            case (tok :: Token(Token.Kind.KEYWORD, "{") :: toks, Token(Token.Kind.KEYWORD, "}"))
            if tok.is_kind => Some(toks)

            case (tok :: Token(Token.Kind.KEYWORD, "(") :: toks, Token(Token.Kind.KEYWORD, ")"))
            if tok.is_kind => Some(toks)

            case _ => None
          }
        case _ => None
      }

    def name: String =
      content match {
        case Some(tok :: _) if tok.is_name => tok.source
        case _ => ""
      }

    def is_ignored: Boolean = kind == "" && tokens.forall(_.is_ignored)
    def is_malformed: Boolean = kind == "" || tokens.exists(_.is_malformed)
    def is_command: Boolean = Bibtex.is_command(kind) && name != "" && content.isDefined
    def is_entry: Boolean = Bibtex.is_entry(kind) && name != "" && content.isDefined
  }
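  /* Example (for illustration): an entry such as
       @Article{Church1940, author = {Alonzo Church}}
     is represented as a single Chunk with kind = "Article", name = "Church1940"
     and is_entry = true, while its source field holds the original text verbatim. */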


  /** parsing **/

  // context of partial line-oriented scans
  abstract class Line_Context
  case object Ignored extends Line_Context
  case class Item(kind: String, delim: Delimited, right: String) extends Line_Context

  case class Delimited(quoted: Boolean, depth: Int)
  val Closed = Delimited(false, 0)

  private def token(kind: Token.Kind.Value)(source: String): Token = Token(kind, source)
  private def keyword(source: String): Token = Token(Token.Kind.KEYWORD, source)


  // See also http://ctan.org/tex-archive/biblio/bibtex/base/bibtex.web
  // module @<Scan for and process a \.{.bib} command or database entry@>.

  object Parsers extends RegexParsers
  {
    /* white space and comments */

    override val whiteSpace = "".r

    private val space = """[ \t\n\r]+""".r ^^ token(Token.Kind.SPACE)
    private val strict_space = """[ \t]+""".r ^^ token(Token.Kind.SPACE)


    /* ignored text */

    private val ignored: Parser[Chunk] =
      rep1("""(?mi)([^@]+|@[ \t]*comment)""".r) ^^ {
        case ss => Chunk("", List(Token(Token.Kind.COMMENT, ss.mkString))) }

    private def ignored_line: Parser[(Chunk, Line_Context)] =
      ignored ^^ { case a => (a, Ignored) }

    /* delimited string: outermost "..." or {...} and body with balanced {...} */

    // see also bibtex.web: scan_a_field_token_and_eat_white, scan_balanced_braces
    private def delimited_depth(delim: Delimited): Parser[(String, Delimited)] =
      new Parser[(String, Delimited)]
      {
        require(if (delim.quoted) delim.depth > 0 else delim.depth >= 0)

        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length

          var i = start
          var q = delim.quoted
          var d = delim.depth
          var finished = false
          while (!finished && i < end) {
            val c = in.source.charAt(i)

            if (c == '"' && d == 0) { i += 1; d = 1; q = true }
            else if (c == '"' && d == 1 && q) {
              i += 1; d = 0; q = false; finished = true
            }
            else if (c == '{') { i += 1; d += 1 }
            else if (c == '}') {
              if (d == 1 && !q || d > 1) { i += 1; d -= 1; if (d == 0) finished = true }
              else { i = start; finished = true }
            }
            else if (d > 0) i += 1
            else finished = true
          }
          if (i == start) Failure("bad input", in)
          else {
            val s = in.source.subSequence(start, i).toString
            Success((s, Delimited(q, d)), in.drop(i - start))
          }
        }
      }.named("delimited_depth")
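    /* Example (for illustration): starting from Closed, scanning
       {Alonzo {Church}} consumes the whole string and ends balanced again,
       whereas a line that stops inside an open group, e.g. {Alonzo {Church,
       ends with Delimited(false, 2); that pending depth is carried over to
       the next line via delimited_line and chunk_line below. */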
    private def delimited: Parser[Token] =
      delimited_depth(Closed) ^?
        { case (s, delim) if delim == Closed => Token(Token.Kind.STRING, s) }

    private def recover_delimited: Parser[Token] =
      """(?m)["{][^@]*""".r ^^ token(Token.Kind.ERROR)

    def delimited_line(item_ctxt: Item): Parser[(Chunk, Line_Context)] =
      item_ctxt match {
        case Item(kind, delim, _) =>
          delimited_depth(delim) ^^ { case (s, delim1) =>
            (Chunk(kind, List(Token(Token.Kind.STRING, s))), item_ctxt.copy(delim = delim1)) } |
          recover_delimited ^^ { case a => (Chunk(kind, List(a)), Ignored) }
      }

    /* other tokens */

    private val at = "@" ^^ keyword
    private val left_brace = "{" ^^ keyword
    private val right_brace = "}" ^^ keyword
    private val left_paren = "(" ^^ keyword
    private val right_paren = ")" ^^ keyword

    private val nat = "[0-9]+".r ^^ token(Token.Kind.NAT)

    private val identifier =
      """[\x21-\x7f&&[^"#%'(),={}0-9]][\x21-\x7f&&[^"#%'(),={}]]*""".r

    private val ident = identifier ^^ token(Token.Kind.IDENT)

    val other_token = "[=#,]".r ^^ keyword | (nat | (ident | space))

    /* items: command or entry */

    private val item_kind =
      identifier ^^ { case a =>
        val kind =
          if (is_command(a)) Token.Kind.COMMAND
          else if (is_entry(a)) Token.Kind.ENTRY
          else Token.Kind.IDENT
        Token(kind, a)
      }

    private val item_start =
      at ~ rep(strict_space) ~ item_kind ~ rep(strict_space) ^^
        { case a ~ b ~ c ~ d => (c.source, List(a) ::: b ::: List(c) ::: d) }

    private val item_name =
      rep(strict_space) ~ identifier ^^
        { case a ~ b => a ::: List(Token(Token.Kind.NAME, b)) }

    private val item_body =
      delimited | (recover_delimited | other_token)

    private val item: Parser[Chunk] =
      (item_start ~ left_brace ~ item_name ~ rep(item_body) ~ opt(right_brace) |
       item_start ~ left_paren ~ item_name ~ rep(item_body) ~ opt(right_paren)) ^^
        { case (kind, a) ~ b ~ c ~ d ~ e => Chunk(kind, a ::: List(b) ::: c ::: d ::: e.toList) }

    private val recover_item: Parser[Chunk] =
      at ~ "(?m)[^@]*".r ^^ { case a ~ b => Chunk("", List(a, Token(Token.Kind.ERROR, b))) }
/* chunks */  | 
|
327  | 
||
| 58528 | 328  | 
val chunk: Parser[Chunk] = ignored | (item | recover_item)  | 
| 58523 | 329  | 
|
| 58528 | 330  | 
def chunk_line(ctxt: Line_Context): Parser[(Chunk, Line_Context)] =  | 
| 58530 | 331  | 
    {
 | 
332  | 
      ctxt match {
 | 
|
| 58589 | 333  | 
case Ignored =>  | 
| 
58538
 
299b82d12d53
proper treatment of @comment (amending 402a8e8107a7);
 
wenzelm 
parents: 
58536 
diff
changeset
 | 
334  | 
ignored_line |  | 
| 58531 | 335  | 
item_start ~ (left_brace | left_paren) ~ opt(item_name) ^^  | 
336  | 
            { case (kind, a) ~ b ~ c =>
 | 
|
| 58530 | 337  | 
                val right = if (b.source == "{") "}" else ")"
 | 
| 58531 | 338  | 
val chunk = Chunk(kind, a ::: List(b) ::: (c getOrElse Nil))  | 
| 58589 | 339  | 
(chunk, Item(kind, Closed, right)) } |  | 
340  | 
          recover_item ^^ { case a => (a, Ignored) }
 | 
|
341  | 
case item_ctxt @ Item(kind, delim, right) =>  | 
|
| 58530 | 342  | 
if (delim.depth > 0)  | 
| 
58536
 
402a8e8107a7
more total chunk_line: recovery via ignored_line;
 
wenzelm 
parents: 
58535 
diff
changeset
 | 
343  | 
delimited_line(item_ctxt) |  | 
| 
 
402a8e8107a7
more total chunk_line: recovery via ignored_line;
 
wenzelm 
parents: 
58535 
diff
changeset
 | 
344  | 
ignored_line  | 
| 58530 | 345  | 
          else {
 | 
| 58534 | 346  | 
delimited_line(item_ctxt) |  | 
| 58530 | 347  | 
            other_token ^^ { case a => (Chunk(kind, List(a)), ctxt) } |
 | 
| 58589 | 348  | 
            right ^^ { case a => (Chunk(kind, List(keyword(a))), Ignored) } |
 | 
| 
58536
 
402a8e8107a7
more total chunk_line: recovery via ignored_line;
 
wenzelm 
parents: 
58535 
diff
changeset
 | 
349  | 
ignored_line  | 
| 58530 | 350  | 
}  | 
351  | 
        case _ => failure("")
 | 
|
352  | 
}  | 
|
353  | 
}  | 
|
| 58528 | 354  | 
}  | 
| 58523 | 355  | 
|
356  | 
||
  /* parse */

  def parse(input: CharSequence): List[Chunk] =
  {
    val in: Reader[Char] = new CharSequenceReader(input)
    Parsers.parseAll(Parsers.rep(Parsers.chunk), in) match {
      case Parsers.Success(result, _) => result
      case _ => error("Unexpected failure to parse input:\n" + input.toString)
    }
  }
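  /* Usage sketch (for illustration):
       Bibtex.parse("@Misc{x, note = {y}}")
     yields a single Chunk of kind "Misc"; free text between items is returned
     as additional chunks whose is_ignored is true. */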
  def parse_line(input: CharSequence, context: Line_Context): (List[Chunk], Line_Context) =
  {
    var in: Reader[Char] = new CharSequenceReader(input)
    val chunks = new mutable.ListBuffer[Chunk]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.chunk_line(ctxt), in) match {
        case Parsers.Success((x, c), rest) => { chunks += x; ctxt = c; in = rest }
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure to parse input:\n" + rest.source.toString)
      }
    }
    (chunks.toList, ctxt)
  }
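  /* Usage sketch (for illustration): parse_line scans incrementally, one line
     at a time, threading the context through:

       val (chunks1, ctxt1) = Bibtex.parse_line("@Misc{x, note = {line one", Bibtex.Ignored)
       val (chunks2, ctxt2) = Bibtex.parse_line("line two}}", ctxt1)

     Here ctxt1 is an Item context with positive delimiter depth, and ctxt2 is
     Ignored again once the entry has been closed. */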
}