| author | haftmann | 
| Tue, 16 Sep 2008 09:21:24 +0200 | |
| changeset 28228 | 7ebe8dc06cbb | 
| parent 27993 | 6dd90ef9f927 | 
| child 29140 | e7ac5bb20aed | 
| permissions | -rw-r--r-- | 
| 27930 | 1  | 
/* Title: Pure/General/yxml.scala  | 
2  | 
ID: $Id$  | 
|
3  | 
Author: Makarius  | 
|
4  | 
||
5  | 
Efficient text representation of XML trees.  | 
|
6  | 
*/  | 
|
7  | 
||
8  | 
package isabelle  | 
|
9  | 
||
10  | 
import java.util.regex.Pattern  | 
|
11  | 
||
12  | 
||
13  | 
object YXML {
 | 
|
14  | 
||
| 
27943
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
15  | 
/* chunk markers */  | 
| 27930 | 16  | 
|
17  | 
private val X = '\5'  | 
|
18  | 
private val Y = '\6'  | 
|
| 27960 | 19  | 
private val X_string = X.toString  | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
20  | 
private val Y_string = Y.toString  | 
| 27930 | 21  | 
|
22  | 
  def detect(source: CharSequence) = {
 | 
|
23  | 
source.length >= 2 &&  | 
|
24  | 
source.charAt(0) == X &&  | 
|
25  | 
source.charAt(1) == Y  | 
|
26  | 
}  | 
|
27  | 
||
28  | 
||
| 
27943
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
29  | 
/* iterate over chunks (resembles space_explode in ML) */  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
30  | 
|
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
31  | 
  private def chunks(sep: Char, source: CharSequence) = new Iterator[CharSequence] {
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
32  | 
private val end = source.length  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
33  | 
private var state = if (end == 0) None else get_chunk(-1)  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
34  | 
    private def get_chunk(i: Int) = {
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
35  | 
      if (i < end) {
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
36  | 
var j = i; do j += 1 while (j < end && source.charAt(j) != sep)  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
37  | 
Some((source.subSequence(i + 1, j), j))  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
38  | 
}  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
39  | 
else None  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
40  | 
}  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
41  | 
|
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
42  | 
def hasNext() = state.isDefined  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
43  | 
    def next() = state match {
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
44  | 
      case Some((s, i)) => { state = get_chunk(i); s }
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
45  | 
      case None => throw new NoSuchElementException("next on empty iterator")
 | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
46  | 
}  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
47  | 
}  | 
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
48  | 
|
| 
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
49  | 
|
| 27930 | 50  | 
/* parsing */  | 
51  | 
||
| 
27993
 
6dd90ef9f927
simplified exceptions: use plain error function / RuntimeException;
 
wenzelm 
parents: 
27971 
diff
changeset
 | 
52  | 
  private def err(msg: String) = error("Malformed YXML: " + msg)
 | 
| 27930 | 53  | 
  private def err_attribute() = err("bad attribute")
 | 
54  | 
  private def err_element() = err("bad element")
 | 
|
55  | 
private def err_unbalanced(name: String) =  | 
|
56  | 
    if (name == "") err("unbalanced element")
 | 
|
57  | 
    else err("unbalanced element \"" + name + "\"")
 | 
|
58  | 
||
| 
27944
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
59  | 
  private def parse_attrib(source: CharSequence) = {
 | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
60  | 
val s = source.toString  | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
61  | 
    val i = s.indexOf('=')
 | 
| 
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
62  | 
if (i <= 0) err_attribute()  | 
| 27946 | 63  | 
(s.substring(0, i), s.substring(i + 1))  | 
| 
27944
 
2bf3f30558ed
parse_attrib: more efficient due to indexOf('=');
 
wenzelm 
parents: 
27943 
diff
changeset
 | 
64  | 
}  | 
| 27930 | 65  | 
|
66  | 
||
67  | 
  def parse_body(source: CharSequence) = {
 | 
|
68  | 
||
69  | 
/* stack operations */  | 
|
70  | 
||
71  | 
var stack: List[((String, XML.Attributes), List[XML.Tree])] = null  | 
|
72  | 
||
73  | 
    def add(x: XML.Tree) = stack match {
 | 
|
74  | 
case ((elem, body) :: pending) => stack = (elem, x :: body) :: pending  | 
|
75  | 
}  | 
|
76  | 
||
77  | 
def push(name: String, atts: XML.Attributes) =  | 
|
78  | 
if (name == "") err_element()  | 
|
79  | 
else stack = ((name, atts), Nil) :: stack  | 
|
80  | 
||
81  | 
    def pop() = stack match {
 | 
|
82  | 
      case ((("", _), _) :: _) => err_unbalanced("")
 | 
|
83  | 
case (((name, atts), body) :: pending) =>  | 
|
84  | 
stack = pending; add(XML.Elem(name, atts, body.reverse))  | 
|
85  | 
}  | 
|
86  | 
||
87  | 
||
88  | 
/* parse chunks */  | 
|
89  | 
||
90  | 
    stack = List((("", Nil), Nil))
 | 
|
| 
27943
 
f34ff5e7728f
replaced Pattern.split by chunks iterator (more efficient, resembles ML version more closely);
 
wenzelm 
parents: 
27930 
diff
changeset
 | 
91  | 
    for (chunk <- chunks(X, source) if chunk != "") {
 | 
| 
27945
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
92  | 
if (chunk == Y_string) pop()  | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
93  | 
      else {
 | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
94  | 
        chunks(Y, chunk).toList match {
 | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
95  | 
case "" :: name :: atts => push(name.toString, atts.map(parse_attrib))  | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
96  | 
case txts => for (txt <- txts) add(XML.Text(txt.toString))  | 
| 
 
d2dc5a1903e8
tuned parse performance: avoid splitting terminal Y chunk;
 
wenzelm 
parents: 
27944 
diff
changeset
 | 
97  | 
}  | 
| 27930 | 98  | 
}  | 
99  | 
}  | 
|
100  | 
    stack match {
 | 
|
101  | 
      case List((("", _), result)) => result.reverse
 | 
|
102  | 
case ((name, _), _) :: _ => err_unbalanced(name)  | 
|
103  | 
}  | 
|
104  | 
}  | 
|
105  | 
||
106  | 
def parse(source: CharSequence) =  | 
|
107  | 
    parse_body(source) match {
 | 
|
108  | 
case List(result) => result  | 
|
109  | 
      case Nil => XML.Text("")
 | 
|
110  | 
      case _ => err("multiple results")
 | 
|
111  | 
}  | 
|
| 27960 | 112  | 
|
113  | 
  def parse_failsafe(source: CharSequence) = {
 | 
|
114  | 
    try { parse(source) }
 | 
|
115  | 
    catch {
 | 
|
| 
27993
 
6dd90ef9f927
simplified exceptions: use plain error function / RuntimeException;
 
wenzelm 
parents: 
27971 
diff
changeset
 | 
116  | 
case _: RuntimeException => XML.Elem (Markup.BAD, Nil,  | 
| 27960 | 117  | 
List(XML.Text(source.toString.replace(X_string, "\\<^X>").replace(Y_string, "\\<^Y>"))))  | 
118  | 
}  | 
|
119  | 
}  | 
|
120  | 
||
| 27930 | 121  | 
}  |