| author | haftmann | 
| Thu, 05 Feb 2009 14:14:02 +0100 | |
| changeset 29806 | bebe5a254ba6 | 
| parent 29606 | fedb8be05f24 | 
| child 31469 | 40f815edbde4 | 
| permissions | -rw-r--r-- | 
| 26528 | 1  | 
(* Title: Pure/General/yxml.ML  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
4  | 
Efficient text representation of XML trees using extra characters X  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
5  | 
and Y -- no escaping, may nest marked text verbatim.  | 
| 26528 | 6  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
7  | 
Markup <elem att="val" ...>...body...</elem> is encoded as:  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
8  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
9  | 
X Y name Y att=val ... X  | 
| 26528 | 10  | 
...  | 
11  | 
body  | 
|
12  | 
...  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
13  | 
X Y X  | 
| 26528 | 14  | 
*)  | 
15  | 
||
16  | 
signature YXML =  | 
|
17  | 
sig  | 
|
18  | 
val detect: string -> bool  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
19  | 
val output_markup: Markup.T -> string * string  | 
| 26528 | 20  | 
val element: string -> XML.attributes -> string list -> string  | 
21  | 
val string_of: XML.tree -> string  | 
|
22  | 
val parse_body: string -> XML.tree list  | 
|
23  | 
val parse: string -> XML.tree  | 
|
24  | 
end;  | 
|
25  | 
||
26  | 
structure YXML: YXML =  | 
|
27  | 
struct  | 
|
28  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
29  | 
(** string representation **)  | 
| 26528 | 30  | 
|
| 26547 | 31  | 
(* markers *)  | 
32  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
33  | 
val X = Symbol.ENQ;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
34  | 
val Y = Symbol.ACK;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
35  | 
val XY = X ^ Y;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
36  | 
val XYX = XY ^ X;  | 
| 26528 | 37  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
38  | 
val detect = String.isPrefix XY;  | 
| 26528 | 39  | 
|
40  | 
||
| 26547 | 41  | 
(* output *)  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
42  | 
|
| 27884 | 43  | 
fun output_markup (markup as (name, atts)) =  | 
| 29325 | 44  | 
if Markup.is_none markup then Markup.no_output  | 
| 27884 | 45  | 
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
46  | 
|
| 26528 | 47  | 
fun element name atts body =  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
48  | 
let val (pre, post) = output_markup (name, atts)  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
49  | 
in pre ^ implode body ^ post end;  | 
| 26528 | 50  | 
|
51  | 
fun string_of t =  | 
|
52  | 
let  | 
|
53  | 
fun attrib (a, x) =  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
54  | 
Buffer.add Y #>  | 
| 26528 | 55  | 
Buffer.add a #> Buffer.add "=" #> Buffer.add x;  | 
56  | 
fun tree (XML.Elem (name, atts, ts)) =  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
57  | 
Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #>  | 
| 26528 | 58  | 
fold tree ts #>  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
59  | 
Buffer.add XYX  | 
| 28033 | 60  | 
| tree (XML.Text s) = Buffer.add s;  | 
| 26528 | 61  | 
in Buffer.empty |> tree t |> Buffer.content end;  | 
62  | 
||
63  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
64  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
65  | 
(** efficient YXML parsing **)  | 
| 26528 | 66  | 
|
67  | 
local  | 
|
68  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
69  | 
(* splitting *)  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
70  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
71  | 
fun is_char s c = ord s = Char.ord c;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
72  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
73  | 
val split_string =  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
74  | 
Substring.full #>  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
75  | 
Substring.tokens (is_char X) #>  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
76  | 
map (Substring.fields (is_char Y) #> map Substring.string);  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
77  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
78  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
79  | 
(* structural errors *)  | 
| 26528 | 80  | 
|
81  | 
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
 | 
|
82  | 
fun err_attribute () = err "bad attribute";  | 
|
83  | 
fun err_element () = err "bad element";  | 
|
84  | 
fun err_unbalanced "" = err "unbalanced element"  | 
|
85  | 
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | 
|
86  | 
||
87  | 
||
88  | 
(* stack operations *)  | 
|
89  | 
||
90  | 
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;  | 
|
91  | 
||
92  | 
fun push "" _ _ = err_element ()  | 
|
93  | 
| push name atts pending = ((name, atts), []) :: pending;  | 
|
94  | 
||
95  | 
fun pop ((("", _), _) :: _) = err_unbalanced ""
 | 
|
96  | 
| pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending;  | 
|
97  | 
||
98  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
99  | 
(* parsing *)  | 
| 26528 | 100  | 
|
101  | 
fun parse_attrib s =  | 
|
| 28025 | 102  | 
(case first_field "=" s of  | 
| 
28023
 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 
wenzelm 
parents: 
27932 
diff
changeset
 | 
103  | 
NONE => err_attribute ()  | 
| 28025 | 104  | 
  | SOME ("", _) => err_attribute ()
 | 
105  | 
| SOME att => att);  | 
|
| 26528 | 106  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
107  | 
fun parse_chunk ["", ""] = pop  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
108  | 
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
109  | 
| parse_chunk txts = fold (add o XML.Text) txts;  | 
| 26528 | 110  | 
|
111  | 
in  | 
|
112  | 
||
113  | 
fun parse_body source =  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
114  | 
  (case fold parse_chunk (split_string source) [(("", []), [])] of
 | 
| 26528 | 115  | 
    [(("", _), result)] => rev result
 | 
116  | 
| ((name, _), _) :: _ => err_unbalanced name);  | 
|
117  | 
||
118  | 
fun parse source =  | 
|
119  | 
(case parse_body source of  | 
|
| 
27798
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
120  | 
[result] => result  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
121  | 
| [] => XML.Text ""  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
122  | 
| _ => err "multiple results");  | 
| 26528 | 123  | 
|
124  | 
end;  | 
|
125  | 
||
126  | 
end;  | 
|
127  |