| author | wenzelm | 
| Mon, 30 Aug 2010 10:36:55 +0200 | |
| changeset 38855 | 35b2d91e88d7 | 
| parent 38475 | 1dd4f545ac24 | 
| child 40453 | a81346e57cef | 
| permissions | -rw-r--r-- | 
| 27930 | 1  | 
/* Title: Pure/General/yxml.scala  | 
2  | 
Author: Makarius  | 
|
3  | 
||
4  | 
Efficient text representation of XML trees.  | 
|
5  | 
*/  | 
|
6  | 
||
7  | 
package isabelle  | 
|
8  | 
||
9  | 
||
| 36684 | 10  | 
import scala.collection.mutable.ListBuffer  | 
11  | 
||
12  | 
||
| 32450 | 13  | 
object YXML  | 
14  | 
{
 | 
|
| 
27943
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
15  | 
/* chunk markers */  | 
| 27930 | 16  | 
|
17  | 
private val X = '\5'  | 
|
18  | 
private val Y = '\6'  | 
|
| 27960 | 19  | 
private val X_string = X.toString  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
20  | 
private val Y_string = Y.toString  | 
| 27930 | 21  | 
|
22  | 
||
| 
38268
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
23  | 
/* string representation */ // FIXME byte array version with pseudo-utf-8 (!?)  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
24  | 
|
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
25  | 
def string_of_body(body: XML.Body): String =  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
26  | 
  {
 | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
27  | 
val s = new StringBuilder  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
28  | 
    def attrib(p: (String, String)) { s += Y; s ++= p._1; s += '='; s ++= p._2 }
 | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
29  | 
def tree(t: XML.Tree): Unit =  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
30  | 
      t match {
 | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
31  | 
case XML.Elem(Markup(name, atts), ts) =>  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
32  | 
s += X; s += Y; s++= name; atts.foreach(attrib); s += X  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
33  | 
ts.foreach(tree)  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
34  | 
s += X; s += Y; s += X  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
35  | 
case XML.Text(text) => s ++= text  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
36  | 
}  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
37  | 
body.foreach(tree)  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
38  | 
s.toString  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
39  | 
}  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
40  | 
|
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
41  | 
def string_of_tree(tree: XML.Tree): String = string_of_body(List(tree))  | 
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
42  | 
|
| 
 
beb86b805590
more uniform XML/YXML string_of_body/string_of_tree;
 
wenzelm 
parents: 
38267 
diff
changeset
 | 
43  | 
|
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
44  | 
/* decoding pseudo UTF-8 */  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
45  | 
|
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
46  | 
private class Decode_Chars(decode: String => String,  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
47  | 
buffer: Array[Byte], start: Int, end: Int) extends CharSequence  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
48  | 
  {
 | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
49  | 
def length: Int = end - start  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
50  | 
def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
51  | 
def subSequence(i: Int, j: Int): CharSequence =  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
52  | 
new Decode_Chars(decode, buffer, start + i, start + j)  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
53  | 
|
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
54  | 
// toString with adhoc decoding: abuse of CharSequence interface  | 
| 
34201
 
c95dcd12f48a
separate Standard_System (Cygwin/Posix compatibility) vs. Isabelle_System (settings environment etc.);
 
wenzelm 
parents: 
34198 
diff
changeset
 | 
55  | 
override def toString: String = decode(Standard_System.decode_permissive_utf8(this))  | 
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
56  | 
}  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
57  | 
|
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
58  | 
def decode_chars(decode: String => String,  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
59  | 
buffer: Array[Byte], start: Int, end: Int): CharSequence =  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
60  | 
  {
 | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
61  | 
require(0 <= start && start <= end && end <= buffer.length)  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
62  | 
new Decode_Chars(decode, buffer, start, end)  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
63  | 
}  | 
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
64  | 
|
| 
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
65  | 
|
| 27930 | 66  | 
/* parsing */  | 
67  | 
||
| 
27993
 
6dd90ef9f927
simplified exceptions: use plain error function / RuntimeException;
 
wenzelm 
parents: 
27971 
diff
changeset
 | 
68  | 
  private def err(msg: String) = error("Malformed YXML: " + msg)
 | 
| 27930 | 69  | 
  private def err_attribute() = err("bad attribute")
 | 
70  | 
  private def err_element() = err("bad element")
 | 
|
71  | 
private def err_unbalanced(name: String) =  | 
|
72  | 
    if (name == "") err("unbalanced element")
 | 
|
73  | 
    else err("unbalanced element \"" + name + "\"")
 | 
|
74  | 
||
| 
27944
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
75  | 
  private def parse_attrib(source: CharSequence) = {
 | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
76  | 
val s = source.toString  | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
77  | 
    val i = s.indexOf('=')
 | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
78  | 
if (i <= 0) err_attribute()  | 
| 
38234
 
4b610fbb2d83
YXML.parse: refrain from interning, let XML.Cache do it (partially);
 
wenzelm 
parents: 
38231 
diff
changeset
 | 
79  | 
(s.substring(0, i), s.substring(i + 1))  | 
| 
27944
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
80  | 
}  | 
| 27930 | 81  | 
|
82  | 
||
| 
38267
 
e50c283dd125
type XML.Body as basic data representation language (Scala version);
 
wenzelm 
parents: 
38234 
diff
changeset
 | 
83  | 
def parse_body(source: CharSequence): XML.Body =  | 
| 32450 | 84  | 
  {
 | 
| 27930 | 85  | 
/* stack operations */  | 
86  | 
||
| 36684 | 87  | 
def buffer(): ListBuffer[XML.Tree] = new ListBuffer[XML.Tree]  | 
| 38475 | 88  | 
var stack: List[(Markup, ListBuffer[XML.Tree])] = List((Markup.Empty, buffer()))  | 
| 27930 | 89  | 
|
| 36684 | 90  | 
def add(x: XML.Tree)  | 
91  | 
    {
 | 
|
92  | 
      (stack: @unchecked) match { case ((_, body) :: _) => body += x }
 | 
|
| 27930 | 93  | 
}  | 
94  | 
||
| 36684 | 95  | 
def push(name: String, atts: XML.Attributes)  | 
96  | 
    {
 | 
|
| 27930 | 97  | 
if (name == "") err_element()  | 
| 
38230
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
98  | 
else stack = (Markup(name, atts), buffer()) :: stack  | 
| 36684 | 99  | 
}  | 
| 27930 | 100  | 
|
| 36684 | 101  | 
def pop()  | 
102  | 
    {
 | 
|
103  | 
      (stack: @unchecked) match {
 | 
|
| 38475 | 104  | 
        case ((Markup.Empty, _) :: _) => err_unbalanced("")
 | 
| 
38230
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
105  | 
case ((markup, body) :: pending) =>  | 
| 
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
106  | 
stack = pending  | 
| 
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
107  | 
add(XML.Elem(markup, body.toList))  | 
| 36684 | 108  | 
}  | 
| 27930 | 109  | 
}  | 
110  | 
||
111  | 
||
112  | 
/* parse chunks */  | 
|
113  | 
||
| 36685 | 114  | 
    for (chunk <- Library.chunks(source, X) if chunk.length != 0) {
 | 
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
115  | 
if (chunk.length == 1 && chunk.charAt(0) == Y) pop()  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
116  | 
      else {
 | 
| 36685 | 117  | 
        Library.chunks(chunk, Y).toList match {
 | 
| 
34099
 
2541de190d92
added decode_chars, with raw character view on byte buffer and adhoc decoding via toString;
 
wenzelm 
parents: 
32450 
diff
changeset
 | 
118  | 
case ch :: name :: atts if ch.length == 0 =>  | 
| 
38234
 
4b610fbb2d83
YXML.parse: refrain from interning, let XML.Cache do it (partially);
 
wenzelm 
parents: 
38231 
diff
changeset
 | 
119  | 
push(name.toString, atts.map(parse_attrib))  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
120  | 
case txts => for (txt <- txts) add(XML.Text(txt.toString))  | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
121  | 
}  | 
| 27930 | 122  | 
}  | 
123  | 
}  | 
|
124  | 
    stack match {
 | 
|
| 38475 | 125  | 
case List((Markup.Empty, body)) => body.toList  | 
| 
38230
 
ed147003de4b
simplified type XML.Tree: embed Markup directly, avoid slightly odd triple;
 
wenzelm 
parents: 
36685 
diff
changeset
 | 
126  | 
case (Markup(name, _), _) :: _ => err_unbalanced(name)  | 
| 27930 | 127  | 
}  | 
128  | 
}  | 
|
129  | 
||
| 32450 | 130  | 
def parse(source: CharSequence): XML.Tree =  | 
| 27930 | 131  | 
    parse_body(source) match {
 | 
132  | 
case List(result) => result  | 
|
133  | 
      case Nil => XML.Text("")
 | 
|
134  | 
      case _ => err("multiple results")
 | 
|
135  | 
}  | 
|
| 27960 | 136  | 
|
| 29521 | 137  | 
|
138  | 
/* failsafe parsing */  | 
|
139  | 
||
140  | 
private def markup_failsafe(source: CharSequence) =  | 
|
| 38231 | 141  | 
XML.elem (Markup.MALFORMED,  | 
| 29521 | 142  | 
List(XML.Text(source.toString.replace(X_string, "\\<^X>").replace(Y_string, "\\<^Y>"))))  | 
143  | 
||
| 
38267
 
e50c283dd125
type XML.Body as basic data representation language (Scala version);
 
wenzelm 
parents: 
38234 
diff
changeset
 | 
144  | 
def parse_body_failsafe(source: CharSequence): XML.Body =  | 
| 32450 | 145  | 
  {
 | 
| 29521 | 146  | 
    try { parse_body(source) }
 | 
147  | 
    catch { case _: RuntimeException => List(markup_failsafe(source)) }
 | 
|
148  | 
}  | 
|
149  | 
||
| 32450 | 150  | 
def parse_failsafe(source: CharSequence): XML.Tree =  | 
151  | 
  {
 | 
|
| 27960 | 152  | 
    try { parse(source) }
 | 
| 29521 | 153  | 
    catch { case _: RuntimeException => markup_failsafe(source) }
 | 
| 27960 | 154  | 
}  | 
| 27930 | 155  | 
}  |