| author | wenzelm | 
| Sat, 09 Apr 2022 15:28:55 +0200 | |
| changeset 75436 | 40630fec3b5d | 
| parent 75393 | 87ebf5a50283 | 
| child 80425 | efa212a6699a | 
| permissions | -rw-r--r-- | 
| 44698 | 1  | 
/* Title: Pure/PIDE/yxml.scala  | 
| 27930 | 2  | 
Author: Makarius  | 
3  | 
||
| 44698 | 4  | 
Efficient text representation of XML trees. Suitable for direct  | 
5  | 
inlining into plain text.  | 
|
| 27930 | 6  | 
*/  | 
7  | 
||
8  | 
package isabelle  | 
|
9  | 
||
10  | 
||
| 40453 | 11  | 
import scala.collection.mutable  | 
| 36684 | 12  | 
|
13  | 
||
| 75393 | 14  | 
object YXML {
 | 
| 
27943
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
15  | 
/* chunk markers */  | 
| 27930 | 16  | 
|
| 
56661
 
ef623f6f036b
avoid octal escape literals -- deprecated in scala-2.11.0;
 
wenzelm 
parents: 
56600 
diff
changeset
 | 
17  | 
val X = '\u0005'  | 
| 
 
ef623f6f036b
avoid octal escape literals -- deprecated in scala-2.11.0;
 
wenzelm 
parents: 
56600 
diff
changeset
 | 
18  | 
val Y = '\u0006'  | 
| 56600 | 19  | 
|
| 73029 | 20  | 
val is_X: Char => Boolean = _ == X  | 
21  | 
val is_Y: Char => Boolean = _ == Y  | 
|
| 56600 | 22  | 
|
| 71601 | 23  | 
val X_string: String = X.toString  | 
24  | 
val Y_string: String = Y.toString  | 
|
25  | 
val XY_string: String = X_string + Y_string  | 
|
26  | 
val XYX_string: String = XY_string + X_string  | 
|
| 27930 | 27  | 
|
| 43782 | 28  | 
def detect(s: String): Boolean = s.exists(c => c == X || c == Y)  | 
| 
67820
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
29  | 
def detect_elem(s: String): Boolean = s.startsWith(XY_string)  | 
| 43782 | 30  | 
|
| 27930 | 31  | 
|
| 73029 | 32  | 
/* string representation */  | 
| 
38268
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
33  | 
|
| 75393 | 34  | 
  def traversal(string: String => Unit, body: XML.Body): Unit = {
 | 
| 
38268
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
35  | 
def tree(t: XML.Tree): Unit =  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
36  | 
      t match {
 | 
| 
73556
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
37  | 
case XML.Elem(markup @ Markup(name, atts), ts) =>  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
38  | 
if (markup.is_empty) ts.foreach(tree)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
39  | 
          else {
 | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
40  | 
string(XY_string)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
41  | 
string(name)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
42  | 
            for ((a, x) <- atts) { string(Y_string); string(a); string("="); string(x) }
 | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
43  | 
string(X_string)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
44  | 
ts.foreach(tree)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
45  | 
string(XYX_string)  | 
| 
 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 
wenzelm 
parents: 
73340 
diff
changeset
 | 
46  | 
}  | 
| 71534 | 47  | 
case XML.Text(text) => string(text)  | 
| 
38268
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
48  | 
}  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
49  | 
body.foreach(tree)  | 
| 71534 | 50  | 
}  | 
51  | 
||
| 75393 | 52  | 
  def string_of_body(body: XML.Body): String = {
 | 
| 71534 | 53  | 
val s = new StringBuilder  | 
54  | 
traversal(str => s ++= str, body)  | 
|
| 
38268
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
55  | 
s.toString  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
56  | 
}  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
57  | 
|
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
58  | 
def string_of_tree(tree: XML.Tree): String = string_of_body(List(tree))  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
59  | 
|
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
60  | 
|
| 27930 | 61  | 
/* parsing */  | 
62  | 
||
| 
27993
 
6dd90ef9f927
simplified exceptions: use plain error function / RuntimeException;
 
wenzelm 
parents: 
27971 
diff
changeset
 | 
63  | 
  private def err(msg: String) = error("Malformed YXML: " + msg)
 | 
| 27930 | 64  | 
  private def err_attribute() = err("bad attribute")
 | 
65  | 
  private def err_element() = err("bad element")
 | 
|
66  | 
private def err_unbalanced(name: String) =  | 
|
67  | 
    if (name == "") err("unbalanced element")
 | 
|
| 44181 | 68  | 
    else err("unbalanced element " + quote(name))
 | 
| 27930 | 69  | 
|
| 73029 | 70  | 
private def parse_attrib(source: CharSequence): (String, String) =  | 
| 73712 | 71  | 
Properties.Eq.unapply(source.toString) getOrElse err_attribute()  | 
| 27930 | 72  | 
|
73  | 
||
| 75393 | 74  | 
  def parse_body(source: CharSequence, cache: XML.Cache = XML.Cache.none): XML.Body = {
 | 
| 27930 | 75  | 
/* stack operations */  | 
76  | 
||
| 40453 | 77  | 
def buffer(): mutable.ListBuffer[XML.Tree] = new mutable.ListBuffer[XML.Tree]  | 
78  | 
var stack: List[(Markup, mutable.ListBuffer[XML.Tree])] = List((Markup.Empty, buffer()))  | 
|
| 27930 | 79  | 
|
| 73340 | 80  | 
def add(x: XML.Tree): Unit =  | 
| 73029 | 81  | 
      (stack: @unchecked) match { case (_, body) :: _ => body += x }
 | 
| 27930 | 82  | 
|
| 73340 | 83  | 
def push(name: String, atts: XML.Attributes): Unit =  | 
| 27930 | 84  | 
if (name == "") err_element()  | 
| 73030 | 85  | 
else stack = (cache.markup(Markup(name, atts)), buffer()) :: stack  | 
| 27930 | 86  | 
|
| 73340 | 87  | 
def pop(): Unit =  | 
| 36684 | 88  | 
      (stack: @unchecked) match {
 | 
| 73029 | 89  | 
        case (Markup.Empty, _) :: _ => err_unbalanced("")
 | 
90  | 
case (markup, body) :: pending =>  | 
|
| 
38230
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
91  | 
stack = pending  | 
| 73030 | 92  | 
add(cache.tree0(XML.Elem(markup, body.toList)))  | 
| 36684 | 93  | 
}  | 
| 27930 | 94  | 
|
95  | 
||
96  | 
/* parse chunks */  | 
|
97  | 
||
| 56600 | 98  | 
    for (chunk <- Library.separated_chunks(is_X, source) if chunk.length != 0) {
 | 
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
99  | 
if (chunk.length == 1 && chunk.charAt(0) == Y) pop()  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
100  | 
      else {
 | 
| 56600 | 101  | 
        Library.separated_chunks(is_Y, chunk).toList match {
 | 
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
102  | 
case ch :: name :: atts if ch.length == 0 =>  | 
| 
38234
 
4b610fbb2d83
YXML.parse: refrain from interning, let XML.Cache do it (partially);
 
wenzelm 
parents: 
38231 
diff
changeset
 | 
103  | 
push(name.toString, atts.map(parse_attrib))  | 
| 73030 | 104  | 
case txts => for (txt <- txts) add(cache.tree0(XML.Text(cache.string(txt.toString))))  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
105  | 
}  | 
| 27930 | 106  | 
}  | 
107  | 
}  | 
|
| 42719 | 108  | 
    (stack: @unchecked) match {
 | 
| 38475 | 109  | 
case List((Markup.Empty, body)) => body.toList  | 
| 
38230
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
110  | 
case (Markup(name, _), _) :: _ => err_unbalanced(name)  | 
| 27930 | 111  | 
}  | 
112  | 
}  | 
|
113  | 
||
| 73030 | 114  | 
def parse(source: CharSequence, cache: XML.Cache = XML.Cache.none): XML.Tree =  | 
115  | 
    parse_body(source, cache = cache) match {
 | 
|
| 27930 | 116  | 
case List(result) => result  | 
| 73028 | 117  | 
case Nil => XML.no_text  | 
| 
67820
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
118  | 
      case _ => err("multiple XML trees")
 | 
| 
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
119  | 
}  | 
| 
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
120  | 
|
| 73030 | 121  | 
def parse_elem(source: CharSequence, cache: XML.Cache = XML.Cache.none): XML.Tree =  | 
122  | 
    parse_body(source, cache = cache) match {
 | 
|
| 
67820
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
123  | 
case List(elem: XML.Elem) => elem  | 
| 
 
e30d6368c7c8
clarified argument formats: explicit Unit, allow XML.Elem as well;
 
wenzelm 
parents: 
64370 
diff
changeset
 | 
124  | 
      case _ => err("single XML element expected")
 | 
| 27930 | 125  | 
}  | 
| 27960 | 126  | 
|
| 29521 | 127  | 
|
128  | 
/* failsafe parsing */  | 
|
129  | 
||
| 45666 | 130  | 
private def markup_broken(source: CharSequence) =  | 
| 55551 | 131  | 
XML.Elem(Markup.Broken, List(XML.Text(source.toString)))  | 
| 29521 | 132  | 
|
| 75393 | 133  | 
  def parse_body_failsafe(source: CharSequence, cache: XML.Cache = XML.Cache.none): XML.Body = {
 | 
| 73030 | 134  | 
    try { parse_body(source, cache = cache) }
 | 
| 45666 | 135  | 
    catch { case ERROR(_) => List(markup_broken(source)) }
 | 
| 29521 | 136  | 
}  | 
137  | 
||
| 75393 | 138  | 
  def parse_failsafe(source: CharSequence, cache: XML.Cache = XML.Cache.none): XML.Tree = {
 | 
| 73030 | 139  | 
    try { parse(source, cache = cache) }
 | 
| 45666 | 140  | 
    catch { case ERROR(_) => markup_broken(source) }
 | 
| 27960 | 141  | 
}  | 
| 27930 | 142  | 
}  |