| author | wenzelm | 
| Mon, 07 Oct 2019 14:23:58 +0200 | |
| changeset 70798 | 9ee3558a7e99 | 
| parent 69845 | d28e8199dcb9 | 
| child 70989 | 64a20b562e63 | 
| permissions | -rw-r--r-- | 
| 44698 | 1  | 
(* Title: Pure/PIDE/yxml.ML  | 
| 26528 | 2  | 
Author: Makarius  | 
3  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
4  | 
Efficient text representation of XML trees using extra characters X  | 
| 44698 | 5  | 
and Y -- no escaping, may nest marked text verbatim. Suitable for  | 
6  | 
direct inlining into plain text.  | 
|
| 26528 | 7  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
8  | 
Markup <elem att="val" ...>...body...</elem> is encoded as:  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
9  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
10  | 
X Y name Y att=val ... X  | 
| 26528 | 11  | 
...  | 
12  | 
body  | 
|
13  | 
...  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
14  | 
X Y X  | 
| 26528 | 15  | 
*)  | 
16  | 
||
17  | 
signature YXML =  | 
|
18  | 
sig  | 
|
| 43777 | 19  | 
val X: Symbol.symbol  | 
20  | 
val Y: Symbol.symbol  | 
|
| 
43772
 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 
wenzelm 
parents: 
43731 
diff
changeset
 | 
21  | 
val embed_controls: string -> string  | 
| 
43731
 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 
wenzelm 
parents: 
43728 
diff
changeset
 | 
22  | 
val detect: string -> bool  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
23  | 
val output_markup: Markup.T -> string * string  | 
| 68228 | 24  | 
val buffer_body: XML.body -> Buffer.T -> Buffer.T  | 
25  | 
val buffer: XML.tree -> Buffer.T -> Buffer.T  | 
|
| 69845 | 26  | 
val chunks_of_body: XML.body -> string list  | 
| 43728 | 27  | 
val string_of_body: XML.body -> string  | 
| 26528 | 28  | 
val string_of: XML.tree -> string  | 
| 
49656
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
29  | 
val output_markup_elem: Markup.T -> (string * string) * string  | 
| 
38266
 
492d377ecfe2
type XML.body as basic data representation language;
 
wenzelm 
parents: 
38265 
diff
changeset
 | 
30  | 
val parse_body: string -> XML.body  | 
| 26528 | 31  | 
val parse: string -> XML.tree  | 
| 59433 | 32  | 
val content_of: string -> string  | 
| 26528 | 33  | 
end;  | 
34  | 
||
35  | 
structure YXML: YXML =  | 
|
36  | 
struct  | 
|
37  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
38  | 
(** string representation **)  | 
| 26528 | 39  | 
|
| 
38265
 
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
 
wenzelm 
parents: 
38228 
diff
changeset
 | 
40  | 
(* idempotent recoding of certain low ASCII control characters *)  | 
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
41  | 
|
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
42  | 
fun pseudo_utf8 c =  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
43  | 
if Symbol.is_ascii_control c  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
44  | 
then chr 192 ^ chr (128 + ord c)  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
45  | 
else c;  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
46  | 
|
| 
43772
 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 
wenzelm 
parents: 
43731 
diff
changeset
 | 
47  | 
fun embed_controls str =  | 
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
48  | 
if exists_string Symbol.is_ascii_control str  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
49  | 
then translate_string pseudo_utf8 str  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
50  | 
else str;  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
51  | 
|
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
52  | 
|
| 26547 | 53  | 
(* markers *)  | 
54  | 
||
| 
64275
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
55  | 
val X_char = Char.chr 5;  | 
| 
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
56  | 
val Y_char = Char.chr 6;  | 
| 
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
57  | 
|
| 
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
58  | 
val X = String.str X_char;  | 
| 
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
59  | 
val Y = String.str Y_char;  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
60  | 
val XY = X ^ Y;  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
61  | 
val XYX = XY ^ X;  | 
| 26528 | 62  | 
|
| 
64275
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
63  | 
fun detect s = Char.contains s X_char orelse Char.contains s Y_char;  | 
| 
43731
 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 
wenzelm 
parents: 
43728 
diff
changeset
 | 
64  | 
|
| 26528 | 65  | 
|
| 26547 | 66  | 
(* output *)  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
67  | 
|
| 27884 | 68  | 
fun output_markup (markup as (name, atts)) =  | 
| 
38474
 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 
wenzelm 
parents: 
38266 
diff
changeset
 | 
69  | 
if Markup.is_empty markup then Markup.no_output  | 
| 27884 | 70  | 
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
71  | 
|
| 68228 | 72  | 
fun buffer_attrib (a, x) =  | 
73  | 
Buffer.add Y #> Buffer.add a #> Buffer.add "=" #> Buffer.add x;  | 
|
| 43728 | 74  | 
|
| 68228 | 75  | 
fun buffer_body ts = fold buffer ts  | 
76  | 
and buffer (XML.Elem ((name, atts), ts)) =  | 
|
77  | 
Buffer.add XY #> Buffer.add name #> fold buffer_attrib atts #> Buffer.add X #>  | 
|
78  | 
buffer_body ts #>  | 
|
79  | 
Buffer.add XYX  | 
|
80  | 
| buffer (XML.Text s) = Buffer.add s  | 
|
81  | 
||
| 69845 | 82  | 
fun chunks_of_body body = Buffer.empty |> buffer_body body |> Buffer.chunks;  | 
| 68228 | 83  | 
fun string_of_body body = Buffer.empty |> buffer_body body |> Buffer.content;  | 
| 43728 | 84  | 
val string_of = string_of_body o single;  | 
| 26528 | 85  | 
|
86  | 
||
| 
49656
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
87  | 
(* wrapped elements *)  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
88  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
89  | 
val Z = chr 0;  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
90  | 
val Z_text = [XML.Text Z];  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
91  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
92  | 
fun output_markup_elem markup =  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
93  | 
let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
94  | 
in ((bg1, bg2), en) end;  | 
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
95  | 
|
| 
 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 
wenzelm 
parents: 
46832 
diff
changeset
 | 
96  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
97  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
98  | 
(** efficient YXML parsing **)  | 
| 26528 | 99  | 
|
100  | 
local  | 
|
101  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
102  | 
(* splitting *)  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
103  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
104  | 
val split_string =  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
105  | 
Substring.full #>  | 
| 
64275
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
106  | 
Substring.tokens (fn c => c = X_char) #>  | 
| 
 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 
wenzelm 
parents: 
59433 
diff
changeset
 | 
107  | 
map (Substring.fields (fn c => c = Y_char) #> map Substring.string);  | 
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
108  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
109  | 
|
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
110  | 
(* structural errors *)  | 
| 26528 | 111  | 
|
| 46832 | 112  | 
fun err msg = raise Fail ("Malformed YXML: " ^ msg);
 | 
| 26528 | 113  | 
fun err_attribute () = err "bad attribute";  | 
114  | 
fun err_element () = err "bad element";  | 
|
115  | 
fun err_unbalanced "" = err "unbalanced element"  | 
|
116  | 
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | 
|
117  | 
||
118  | 
||
119  | 
(* stack operations *)  | 
|
120  | 
||
121  | 
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;  | 
|
122  | 
||
123  | 
fun push "" _ _ = err_element ()  | 
|
124  | 
| push name atts pending = ((name, atts), []) :: pending;  | 
|
125  | 
||
126  | 
fun pop ((("", _), _) :: _) = err_unbalanced ""
 | 
|
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
127  | 
| pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending;  | 
| 26528 | 128  | 
|
129  | 
||
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
130  | 
(* parsing *)  | 
| 26528 | 131  | 
|
132  | 
fun parse_attrib s =  | 
|
| 28025 | 133  | 
(case first_field "=" s of  | 
| 
28023
 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 
wenzelm 
parents: 
27932 
diff
changeset
 | 
134  | 
NONE => err_attribute ()  | 
| 28025 | 135  | 
  | SOME ("", _) => err_attribute ()
 | 
136  | 
| SOME att => att);  | 
|
| 26528 | 137  | 
|
| 
26540
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
138  | 
fun parse_chunk ["", ""] = pop  | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
139  | 
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 
wenzelm 
parents: 
26528 
diff
changeset
 | 
140  | 
| parse_chunk txts = fold (add o XML.Text) txts;  | 
| 26528 | 141  | 
|
142  | 
in  | 
|
143  | 
||
144  | 
fun parse_body source =  | 
|
| 
43615
 
8e0f6cfa8eb2
reverted ce00462f,b3759dce, 7a165592: unwanted generalisation
 
noschinl 
parents: 
42355 
diff
changeset
 | 
145  | 
  (case fold parse_chunk (split_string source) [(("", []), [])] of
 | 
| 26528 | 146  | 
    [(("", _), result)] => rev result
 | 
147  | 
| ((name, _), _) :: _ => err_unbalanced name);  | 
|
148  | 
||
149  | 
fun parse source =  | 
|
150  | 
(case parse_body source of  | 
|
| 
27798
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
151  | 
[result] => result  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
152  | 
| [] => XML.Text ""  | 
| 
 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 
wenzelm 
parents: 
26684 
diff
changeset
 | 
153  | 
| _ => err "multiple results");  | 
| 26528 | 154  | 
|
155  | 
end;  | 
|
156  | 
||
| 59433 | 157  | 
val content_of = parse_body #> XML.content_of;  | 
158  | 
||
| 26528 | 159  | 
end;  | 
160  |