author | wenzelm |
Fri, 08 Aug 2008 19:28:59 +0200 | |
changeset 27794 | bdea6e17cbe3 |
parent 26684 | 0701201def95 |
child 27798 | b96c73f11a23 |
permissions | -rw-r--r-- |
26528 | 1 |
(* Title: Pure/General/yxml.ML |
2 |
ID: $Id$ |
|
3 |
Author: Makarius |
|
4 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
5 |
Efficient text representation of XML trees using extra characters X |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
6 |
and Y -- no escaping, may nest marked text verbatim. |
26528 | 7 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
8 |
Markup <elem att="val" ...>...body...</elem> is encoded as: |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
9 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
10 |
X Y name Y att=val ... X |
26528 | 11 |
... |
12 |
body |
|
13 |
... |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
14 |
X Y X |
26528 | 15 |
*) |
16 |
||
17 |
signature YXML = |
|
18 |
sig |
|
19 |
val detect: string -> bool |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
20 |
val output_markup: Markup.T -> string * string |
26528 | 21 |
val element: string -> XML.attributes -> string list -> string |
22 |
val string_of: XML.tree -> string |
|
23 |
val parse_body: string -> XML.tree list |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
24 |
val parse_element: string -> string * XML.attributes * XML.tree list |
26528 | 25 |
val parse: string -> XML.tree |
26 |
end; |
|
27 |
||
28 |
structure YXML: YXML = |
|
29 |
struct |
|
30 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
31 |
(** string representation **) |
26528 | 32 |
|
26547 | 33 |
(* markers *) |
34 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
35 |
val X = Symbol.ENQ; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
36 |
val Y = Symbol.ACK; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
37 |
val XY = X ^ Y; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
38 |
val XYX = XY ^ X; |
26528 | 39 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
40 |
val detect = String.isPrefix XY; |
26528 | 41 |
|
42 |
||
26547 | 43 |
(* output *) |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
44 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
45 |
fun output_markup (name, atts) = |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
46 |
(XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
47 |
|
26528 | 48 |
fun element name atts body = |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
49 |
let val (pre, post) = output_markup (name, atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
50 |
in pre ^ implode body ^ post end; |
26528 | 51 |
|
52 |
fun string_of t = |
|
53 |
let |
|
54 |
fun attrib (a, x) = |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
55 |
Buffer.add Y #> |
26528 | 56 |
Buffer.add a #> Buffer.add "=" #> Buffer.add x; |
57 |
fun tree (XML.Elem (name, atts, ts)) = |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
58 |
Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #> |
26528 | 59 |
fold tree ts #> |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
60 |
Buffer.add XYX |
26528 | 61 |
| tree (XML.Text s) = Buffer.add s |
62 |
| tree (XML.Output s) = Buffer.add s; |
|
63 |
in Buffer.empty |> tree t |> Buffer.content end; |
|
64 |
||
65 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
66 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
67 |
(** efficient YXML parsing **) |
26528 | 68 |
|
69 |
local |
|
70 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
71 |
(* splitting *) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
72 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
73 |
fun is_char s c = ord s = Char.ord c; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
74 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
75 |
val split_string = |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
76 |
Substring.full #> |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
77 |
Substring.tokens (is_char X) #> |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
78 |
map (Substring.fields (is_char Y) #> map Substring.string); |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
79 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
80 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
81 |
(* structural errors *) |
26528 | 82 |
|
83 |
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg); |
|
84 |
fun err_attribute () = err "bad attribute"; |
|
85 |
fun err_element () = err "bad element"; |
|
86 |
fun err_unbalanced "" = err "unbalanced element" |
|
87 |
| err_unbalanced name = err ("unbalanced element " ^ quote name); |
|
88 |
||
89 |
||
90 |
(* stack operations *) |
|
91 |
||
92 |
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; |
|
93 |
||
94 |
fun push "" _ _ = err_element () |
|
95 |
| push name atts pending = ((name, atts), []) :: pending; |
|
96 |
||
97 |
fun pop ((("", _), _) :: _) = err_unbalanced "" |
|
98 |
| pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending; |
|
99 |
||
100 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
101 |
(* parsing *) |
26528 | 102 |
|
103 |
fun parse_attrib s = |
|
26684 | 104 |
(case space_explode "=" s of |
26528 | 105 |
[] => err_attribute () |
106 |
| "" :: _ => err_attribute () |
|
107 |
| a :: xs => (a, space_implode "=" xs)); |
|
108 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
109 |
fun parse_chunk ["", ""] = pop |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
110 |
| parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
111 |
| parse_chunk txts = fold (add o XML.Text) txts; |
26528 | 112 |
|
113 |
in |
|
114 |
||
115 |
fun parse_body source = |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
116 |
(case fold parse_chunk (split_string source) [(("", []), [])] of |
26528 | 117 |
[(("", _), result)] => rev result |
118 |
| ((name, _), _) :: _ => err_unbalanced name); |
|
119 |
||
120 |
fun parse source = |
|
121 |
(case parse_body source of |
|
122 |
[result as XML.Elem _] => result |
|
123 |
| _ => err "no root element"); |
|
124 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
125 |
val parse_element = parse #> (fn XML.Elem elem => elem); |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
126 |
|
26528 | 127 |
end; |
128 |
||
129 |
end; |
|
130 |