author | wenzelm |
Sun, 24 Mar 2019 17:53:46 +0100 | |
changeset 69968 | 1a400b14fd3a |
parent 69845 | d28e8199dcb9 |
child 70989 | 64a20b562e63 |
permissions | -rw-r--r-- |
44698 | 1 |
(* Title: Pure/PIDE/yxml.ML |
26528 | 2 |
Author: Makarius |
3 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
4 |
Efficient text representation of XML trees using extra characters X |
44698 | 5 |
and Y -- no escaping, may nest marked text verbatim. Suitable for |
6 |
direct inlining into plain text. |
|
26528 | 7 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
8 |
Markup <elem att="val" ...>...body...</elem> is encoded as: |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
9 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
10 |
X Y name Y att=val ... X |
26528 | 11 |
... |
12 |
body |
|
13 |
... |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
14 |
X Y X |
26528 | 15 |
*) |
16 |
||
17 |
signature YXML = |
|
18 |
sig |
|
43777 | 19 |
val X: Symbol.symbol |
20 |
val Y: Symbol.symbol |
|
43772
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
wenzelm
parents:
43731
diff
changeset
|
21 |
val embed_controls: string -> string |
43731
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
wenzelm
parents:
43728
diff
changeset
|
22 |
val detect: string -> bool |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
23 |
val output_markup: Markup.T -> string * string |
68228 | 24 |
val buffer_body: XML.body -> Buffer.T -> Buffer.T |
25 |
val buffer: XML.tree -> Buffer.T -> Buffer.T |
|
69845 | 26 |
val chunks_of_body: XML.body -> string list |
43728 | 27 |
val string_of_body: XML.body -> string |
26528 | 28 |
val string_of: XML.tree -> string |
49656
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
29 |
val output_markup_elem: Markup.T -> (string * string) * string |
38266
492d377ecfe2
type XML.body as basic data representation language;
wenzelm
parents:
38265
diff
changeset
|
30 |
val parse_body: string -> XML.body |
26528 | 31 |
val parse: string -> XML.tree |
59433 | 32 |
val content_of: string -> string |
26528 | 33 |
end; |
34 |
||
35 |
structure YXML: YXML = |
|
36 |
struct |
|
37 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
38 |
(** string representation **) |
26528 | 39 |
|
38265
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents:
38228
diff
changeset
|
40 |
(* idempotent recoding of certain low ASCII control characters *) |
34095
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
41 |
|
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
42 |
fun pseudo_utf8 c = |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
43 |
if Symbol.is_ascii_control c |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
44 |
then chr 192 ^ chr (128 + ord c) |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
45 |
else c; |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
46 |
|
43772
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
wenzelm
parents:
43731
diff
changeset
|
47 |
fun embed_controls str = |
34095
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
48 |
if exists_string Symbol.is_ascii_control str |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
49 |
then translate_string pseudo_utf8 str |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
50 |
else str; |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
51 |
|
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
52 |
|
26547 | 53 |
(* markers *) |
54 |
||
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
55 |
val X_char = Char.chr 5; |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
56 |
val Y_char = Char.chr 6; |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
57 |
|
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
58 |
val X = String.str X_char; |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
59 |
val Y = String.str Y_char; |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
60 |
val XY = X ^ Y; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
61 |
val XYX = XY ^ X; |
26528 | 62 |
|
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
63 |
fun detect s = Char.contains s X_char orelse Char.contains s Y_char; |
43731
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
wenzelm
parents:
43728
diff
changeset
|
64 |
|
26528 | 65 |
|
26547 | 66 |
(* output *) |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
67 |
|
27884 | 68 |
fun output_markup (markup as (name, atts)) = |
38474
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
wenzelm
parents:
38266
diff
changeset
|
69 |
if Markup.is_empty markup then Markup.no_output |
27884 | 70 |
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
71 |
|
68228 | 72 |
fun buffer_attrib (a, x) = |
73 |
Buffer.add Y #> Buffer.add a #> Buffer.add "=" #> Buffer.add x; |
|
43728 | 74 |
|
68228 | 75 |
fun buffer_body ts = fold buffer ts |
76 |
and buffer (XML.Elem ((name, atts), ts)) = |
|
77 |
Buffer.add XY #> Buffer.add name #> fold buffer_attrib atts #> Buffer.add X #> |
|
78 |
buffer_body ts #> |
|
79 |
Buffer.add XYX |
|
80 |
| buffer (XML.Text s) = Buffer.add s |
|
81 |
||
69845 | 82 |
fun chunks_of_body body = Buffer.empty |> buffer_body body |> Buffer.chunks; |
68228 | 83 |
fun string_of_body body = Buffer.empty |> buffer_body body |> Buffer.content; |
43728 | 84 |
val string_of = string_of_body o single; |
26528 | 85 |
|
86 |
||
49656
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
87 |
(* wrapped elements *) |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
88 |
|
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
89 |
val Z = chr 0; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
90 |
val Z_text = [XML.Text Z]; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
91 |
|
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
92 |
fun output_markup_elem markup = |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
93 |
let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text))) |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
94 |
in ((bg1, bg2), en) end; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
95 |
|
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
96 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
97 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
98 |
(** efficient YXML parsing **) |
26528 | 99 |
|
100 |
local |
|
101 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
102 |
(* splitting *) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
103 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
104 |
val split_string = |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
105 |
Substring.full #> |
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
106 |
Substring.tokens (fn c => c = X_char) #> |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
107 |
map (Substring.fields (fn c => c = Y_char) #> map Substring.string); |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
108 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
109 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
110 |
(* structural errors *) |
26528 | 111 |
|
46832 | 112 |
fun err msg = raise Fail ("Malformed YXML: " ^ msg); |
26528 | 113 |
fun err_attribute () = err "bad attribute"; |
114 |
fun err_element () = err "bad element"; |
|
115 |
fun err_unbalanced "" = err "unbalanced element" |
|
116 |
| err_unbalanced name = err ("unbalanced element " ^ quote name); |
|
117 |
||
118 |
||
119 |
(* stack operations *) |
|
120 |
||
121 |
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; |
|
122 |
||
123 |
fun push "" _ _ = err_element () |
|
124 |
| push name atts pending = ((name, atts), []) :: pending; |
|
125 |
||
126 |
fun pop ((("", _), _) :: _) = err_unbalanced "" |
|
38228
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents:
34095
diff
changeset
|
127 |
| pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending; |
26528 | 128 |
|
129 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
130 |
(* parsing *) |
26528 | 131 |
|
132 |
fun parse_attrib s = |
|
28025 | 133 |
(case first_field "=" s of |
28023
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
wenzelm
parents:
27932
diff
changeset
|
134 |
NONE => err_attribute () |
28025 | 135 |
| SOME ("", _) => err_attribute () |
136 |
| SOME att => att); |
|
26528 | 137 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
138 |
fun parse_chunk ["", ""] = pop |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
139 |
| parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
140 |
| parse_chunk txts = fold (add o XML.Text) txts; |
26528 | 141 |
|
142 |
in |
|
143 |
||
144 |
fun parse_body source = |
|
43615
8e0f6cfa8eb2
reverted ce00462f,b3759dce, 7a165592: unwanted generalisation
noschinl
parents:
42355
diff
changeset
|
145 |
(case fold parse_chunk (split_string source) [(("", []), [])] of |
26528 | 146 |
[(("", _), result)] => rev result |
147 |
| ((name, _), _) :: _ => err_unbalanced name); |
|
148 |
||
149 |
fun parse source = |
|
150 |
(case parse_body source of |
|
27798
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
151 |
[result] => result |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
152 |
| [] => XML.Text "" |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
153 |
| _ => err "multiple results"); |
26528 | 154 |
|
155 |
end; |
|
156 |
||
59433 | 157 |
val content_of = parse_body #> XML.content_of; |
158 |
||
26528 | 159 |
end; |
160 |