| author | kuncar | 
| Tue, 18 Nov 2014 16:19:57 +0100 | |
| changeset 60221 | 45545e6c0984 | 
| parent 59433 | 9da5b2c61049 | 
| child 64275 | ac2abc987cf9 | 
| permissions | -rw-r--r-- | 
| 44698 | 1  | 
(* Title: Pure/PIDE/yxml.ML  | 
| 26528 | 2  | 
Author: Makarius  | 
3  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
4  | 
Efficient text representation of XML trees using extra characters X  | 
| 44698 | 5  | 
and Y -- no escaping, may nest marked text verbatim. Suitable for  | 
6  | 
direct inlining into plain text.  | 
|
| 26528 | 7  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
8  | 
Markup <elem att="val" ...>...body...</elem> is encoded as:  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
9  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
10  | 
X Y name Y att=val ... X  | 
| 26528 | 11  | 
...  | 
12  | 
body  | 
|
13  | 
...  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
14  | 
X Y X  | 
| 26528 | 15  | 
*)  | 
16  | 
||
17  | 
signature YXML =  | 
|
18  | 
sig  | 
|
| 43777 | 19  | 
val X: Symbol.symbol  | 
20  | 
val Y: Symbol.symbol  | 
|
| 
43772
 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 
wenzelm 
parents: 
43731 
diff
changeset
 | 
21  | 
val embed_controls: string -> string  | 
| 
43731
 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 
wenzelm 
parents: 
43728 
diff
changeset
 | 
22  | 
val detect: string -> bool  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
23  | 
val output_markup: Markup.T -> string * string  | 
| 43728 | 24  | 
val string_of_body: XML.body -> string  | 
| 26528 | 25  | 
val string_of: XML.tree -> string  | 
| 
49656
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
26  | 
val output_markup_elem: Markup.T -> (string * string) * string  | 
| 
38266
 
492d377ecfe2
type XML.body as basic data representation language;
 
wenzelm 
parents: 
38265 
diff
changeset
 | 
27  | 
val parse_body: string -> XML.body  | 
| 26528 | 28  | 
val parse: string -> XML.tree  | 
| 59433 | 29  | 
val content_of: string -> string  | 
| 26528 | 30  | 
end;  | 
31  | 
||
32  | 
structure YXML: YXML =  | 
|
33  | 
struct  | 
|
34  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
35  | 
(** string representation **)  | 
| 26528 | 36  | 
|
| 
38265
 
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
 
wenzelm 
parents: 
38228 
diff
changeset
 | 
37  | 
(* idempotent recoding of certain low ASCII control characters *)  | 
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
38  | 
|
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
39  | 
fun pseudo_utf8 c =  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
40  | 
if Symbol.is_ascii_control c  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
41  | 
then chr 192 ^ chr (128 + ord c)  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
42  | 
else c;  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
43  | 
|
| 
43772
 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 
wenzelm 
parents: 
43731 
diff
changeset
 | 
44  | 
fun embed_controls str =  | 
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
45  | 
if exists_string Symbol.is_ascii_control str  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
46  | 
then translate_string pseudo_utf8 str  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
47  | 
else str;  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
48  | 
|
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
49  | 
|
| 26547 | 50  | 
(* markers *)  | 
51  | 
||
| 43777 | 52  | 
val X = chr 5;  | 
53  | 
val Y = chr 6;  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
54  | 
val XY = X ^ Y;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
55  | 
val XYX = XY ^ X;  | 
| 26528 | 56  | 
|
| 43782 | 57  | 
val detect = exists_string (fn s => s = X orelse s = Y);  | 
| 
43731
 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 
wenzelm 
parents: 
43728 
diff
changeset
 | 
58  | 
|
| 26528 | 59  | 
|
| 26547 | 60  | 
(* output *)  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
61  | 
|
| 27884 | 62  | 
fun output_markup (markup as (name, atts)) =  | 
| 
38474
 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 
wenzelm 
parents: 
38266 
diff
changeset
 | 
63  | 
if Markup.is_empty markup then Markup.no_output  | 
| 27884 | 64  | 
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
65  | 
|
| 43728 | 66  | 
fun string_of_body body =  | 
| 26528 | 67  | 
let  | 
| 46831 | 68  | 
fun attrib (a, x) = Buffer.add Y #> Buffer.add a #> Buffer.add "=" #> Buffer.add x;  | 
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
69  | 
fun tree (XML.Elem ((name, atts), ts)) =  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
70  | 
Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #>  | 
| 43728 | 71  | 
trees ts #>  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
72  | 
Buffer.add XYX  | 
| 43728 | 73  | 
| tree (XML.Text s) = Buffer.add s  | 
74  | 
and trees ts = fold tree ts;  | 
|
75  | 
in Buffer.empty |> trees body |> Buffer.content end;  | 
|
76  | 
||
77  | 
val string_of = string_of_body o single;  | 
|
| 26528 | 78  | 
|
79  | 
||
| 
49656
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
80  | 
(* wrapped elements *)  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
81  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
82  | 
val Z = chr 0;  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
83  | 
val Z_text = [XML.Text Z];  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
84  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
85  | 
fun output_markup_elem markup =  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
86  | 
let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
87  | 
in ((bg1, bg2), en) end;  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
88  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
89  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
90  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
91  | 
(** efficient YXML parsing **)  | 
| 26528 | 92  | 
|
93  | 
local  | 
|
94  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
95  | 
(* splitting *)  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
96  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
97  | 
fun is_char s c = ord s = Char.ord c;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
98  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
99  | 
val split_string =  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
100  | 
Substring.full #>  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
101  | 
Substring.tokens (is_char X) #>  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
102  | 
map (Substring.fields (is_char Y) #> map Substring.string);  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
103  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
104  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
105  | 
(* structural errors *)  | 
| 26528 | 106  | 
|
| 46832 | 107  | 
fun err msg = raise Fail ("Malformed YXML: " ^ msg);
 | 
| 26528 | 108  | 
fun err_attribute () = err "bad attribute";  | 
109  | 
fun err_element () = err "bad element";  | 
|
110  | 
fun err_unbalanced "" = err "unbalanced element"  | 
|
111  | 
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | 
|
112  | 
||
113  | 
||
114  | 
(* stack operations *)  | 
|
115  | 
||
116  | 
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;  | 
|
117  | 
||
118  | 
fun push "" _ _ = err_element ()  | 
|
119  | 
| push name atts pending = ((name, atts), []) :: pending;  | 
|
120  | 
||
121  | 
fun pop ((("", _), _) :: _) = err_unbalanced ""
 | 
|
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
122  | 
| pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending;  | 
| 26528 | 123  | 
|
124  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
125  | 
(* parsing *)  | 
| 26528 | 126  | 
|
127  | 
fun parse_attrib s =  | 
|
| 28025 | 128  | 
(case first_field "=" s of  | 
| 
28023
 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 
wenzelm 
parents: 
27932 
diff
changeset
 | 
129  | 
NONE => err_attribute ()  | 
| 28025 | 130  | 
  | SOME ("", _) => err_attribute ()
 | 
131  | 
| SOME att => att);  | 
|
| 26528 | 132  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
133  | 
fun parse_chunk ["", ""] = pop  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
134  | 
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
135  | 
| parse_chunk txts = fold (add o XML.Text) txts;  | 
| 26528 | 136  | 
|
137  | 
in  | 
|
138  | 
||
139  | 
fun parse_body source =  | 
|
| 
43615
 
8e0f6cfa8eb2
reverted ce00462f,b3759dce, 7a165592: unwanted generalisation
 
noschinl 
parents: 
42355 
diff
changeset
 | 
140  | 
  (case fold parse_chunk (split_string source) [(("", []), [])] of
 | 
| 26528 | 141  | 
    [(("", _), result)] => rev result
 | 
142  | 
| ((name, _), _) :: _ => err_unbalanced name);  | 
|
143  | 
||
144  | 
fun parse source =  | 
|
145  | 
(case parse_body source of  | 
|
| 
27798
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
146  | 
[result] => result  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
147  | 
| [] => XML.Text ""  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
148  | 
| _ => err "multiple results");  | 
| 26528 | 149  | 
|
150  | 
end;  | 
|
151  | 
||
| 59433 | 152  | 
val content_of = parse_body #> XML.content_of;  | 
153  | 
||
| 26528 | 154  | 
end;  | 
155  |