| author | wenzelm | 
| Sun, 09 Jun 2024 21:16:38 +0200 | |
| changeset 80313 | a828e47c867c | 
| parent 77768 | 65008644d394 | 
| child 80504 | 7ea69c26524b | 
| permissions | -rw-r--r-- | 
| 44698 | 1 | (* Title: Pure/PIDE/yxml.ML | 
| 26528 | 2 | Author: Makarius | 
| 3 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 4 | Efficient text representation of XML trees using extra characters X | 
| 44698 | 5 | and Y -- no escaping, may nest marked text verbatim. Suitable for | 
| 6 | direct inlining into plain text. | |
| 26528 | 7 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 8 | Markup <elem att="val" ...>...body...</elem> is encoded as: | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 9 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 10 | X Y name Y att=val ... X | 
| 26528 | 11 | ... | 
| 12 | body | |
| 13 | ... | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 14 | X Y X | 
| 26528 | 15 | *) | 
| 16 | ||
| 17 | signature YXML = | |
| 18 | sig | |
| 43777 | 19 | val X: Symbol.symbol | 
| 20 | val Y: Symbol.symbol | |
| 43772 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 wenzelm parents: 
43731diff
changeset | 21 | val embed_controls: string -> string | 
| 43731 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 wenzelm parents: 
43728diff
changeset | 22 | val detect: string -> bool | 
| 70990 | 23 | val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a | 
| 73557 | 24 | val tree_size: XML.tree -> int | 
| 25 | val body_size: XML.body -> int | |
| 43728 | 26 | val string_of_body: XML.body -> string | 
| 26528 | 27 | val string_of: XML.tree -> string | 
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70990diff
changeset | 28 | val write_body: Path.T -> XML.body -> unit | 
| 70989 | 29 | val output_markup: Markup.T -> string * string | 
| 49656 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 30 | val output_markup_elem: Markup.T -> (string * string) * string | 
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38265diff
changeset | 31 | val parse_body: string -> XML.body | 
| 75626 | 32 | val parse_body_bytes: Bytes.T -> XML.body | 
| 26528 | 33 | val parse: string -> XML.tree | 
| 75626 | 34 | val parse_bytes: Bytes.T -> XML.tree | 
| 59433 | 35 | val content_of: string -> string | 
| 71456 
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
 wenzelm parents: 
70996diff
changeset | 36 | val is_wellformed: string -> bool | 
| 26528 | 37 | end; | 
| 38 | ||
| 39 | structure YXML: YXML = | |
| 40 | struct | |
| 41 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 42 | (** string representation **) | 
| 26528 | 43 | |
| 38265 
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
 wenzelm parents: 
38228diff
changeset | 44 | (* idempotent recoding of certain low ASCII control characters *) | 
| 34095 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 45 | |
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 46 | fun pseudo_utf8 c = | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 47 | if Symbol.is_ascii_control c | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 48 | then chr 192 ^ chr (128 + ord c) | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 49 | else c; | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 50 | |
| 43772 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 wenzelm parents: 
43731diff
changeset | 51 | fun embed_controls str = | 
| 34095 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 52 | if exists_string Symbol.is_ascii_control str | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 53 | then translate_string pseudo_utf8 str | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 54 | else str; | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 55 | |
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 56 | |
| 26547 | 57 | (* markers *) | 
| 58 | ||
| 64275 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 59 | val X_char = Char.chr 5; | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 60 | val Y_char = Char.chr 6; | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 61 | |
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 62 | val X = String.str X_char; | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 63 | val Y = String.str Y_char; | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 64 | val XY = X ^ Y; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 65 | val XYX = XY ^ X; | 
| 26528 | 66 | |
| 64275 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 67 | fun detect s = Char.contains s X_char orelse Char.contains s Y_char; | 
| 43731 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 wenzelm parents: 
43728diff
changeset | 68 | |
| 26528 | 69 | |
| 73557 | 70 | (* traverse *) | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 71 | |
| 70990 | 72 | fun traverse string = | 
| 73 | let | |
| 74 | fun attrib (a, x) = string Y #> string a #> string "=" #> string x; | |
| 73556 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 75 | fun tree (XML.Elem (markup as (name, atts), ts)) = | 
| 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 76 | if Markup.is_empty markup then fold tree ts | 
| 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 77 | else | 
| 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 78 | string XY #> string name #> fold attrib atts #> string X #> | 
| 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 79 | fold tree ts #> | 
| 
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
 wenzelm parents: 
71456diff
changeset | 80 | string XYX | 
| 70990 | 81 | | tree (XML.Text s) = string s; | 
| 82 | in tree end; | |
| 43728 | 83 | |
| 77768 | 84 | val tree_size = Integer.build o traverse (Integer.add o size); | 
| 85 | val body_size = Integer.build o fold (Integer.add o tree_size); | |
| 73557 | 86 | |
| 87 | ||
| 88 | (* output *) | |
| 89 | ||
| 74231 | 90 | val string_of_body = Buffer.build_content o fold (traverse Buffer.add); | 
| 43728 | 91 | val string_of = string_of_body o single; | 
| 26528 | 92 | |
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70990diff
changeset | 93 | fun write_body path body = | 
| 75615 | 94 | path |> File_Stream.open_output (fn file => | 
| 95 | fold (traverse (fn s => fn () => File_Stream.output file s)) body ()); | |
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70990diff
changeset | 96 | |
| 26528 | 97 | |
| 70989 | 98 | (* markup elements *) | 
| 49656 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 99 | |
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 100 | val Z = chr 0; | 
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 101 | val Z_text = [XML.Text Z]; | 
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 102 | |
| 70989 | 103 | fun output_markup (markup as (name, atts)) = | 
| 104 | if Markup.is_empty markup then Markup.no_output | |
| 105 | else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); | |
| 106 | ||
| 49656 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 107 | fun output_markup_elem markup = | 
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 108 | let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text))) | 
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 109 | in ((bg1, bg2), en) end; | 
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 110 | |
| 
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
 wenzelm parents: 
46832diff
changeset | 111 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 112 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 113 | (** efficient YXML parsing **) | 
| 26528 | 114 | |
| 115 | local | |
| 116 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 117 | (* splitting *) | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 118 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 119 | val split_string = | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 120 | Substring.full #> | 
| 64275 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 121 | Substring.tokens (fn c => c = X_char) #> | 
| 
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
 wenzelm parents: 
59433diff
changeset | 122 | map (Substring.fields (fn c => c = Y_char) #> map Substring.string); | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 123 | |
| 75626 | 124 | val split_bytes = | 
| 125 | Bytes.space_explode X | |
| 126 | #> filter (fn "" => false | _ => true) | |
| 127 | #> map (space_explode Y); | |
| 128 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 129 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 130 | (* structural errors *) | 
| 26528 | 131 | |
| 46832 | 132 | fun err msg = raise Fail ("Malformed YXML: " ^ msg);
 | 
| 26528 | 133 | fun err_attribute () = err "bad attribute"; | 
| 134 | fun err_element () = err "bad element"; | |
| 135 | fun err_unbalanced "" = err "unbalanced element" | |
| 136 |   | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | |
| 137 | ||
| 138 | ||
| 139 | (* stack operations *) | |
| 140 | ||
| 141 | fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; | |
| 142 | ||
| 143 | fun push "" _ _ = err_element () | |
| 144 | | push name atts pending = ((name, atts), []) :: pending; | |
| 145 | ||
| 146 | fun pop ((("", _), _) :: _) = err_unbalanced ""
 | |
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
34095diff
changeset | 147 | | pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending; | 
| 26528 | 148 | |
| 149 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 150 | (* parsing *) | 
| 26528 | 151 | |
| 152 | fun parse_attrib s = | |
| 28025 | 153 | (case first_field "=" s of | 
| 28023 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 wenzelm parents: 
27932diff
changeset | 154 | NONE => err_attribute () | 
| 28025 | 155 |   | SOME ("", _) => err_attribute ()
 | 
| 156 | | SOME att => att); | |
| 26528 | 157 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 158 | fun parse_chunk ["", ""] = pop | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 159 |   | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 160 | | parse_chunk txts = fold (add o XML.Text) txts; | 
| 26528 | 161 | |
| 75626 | 162 | fun parse_body' chunks = | 
| 163 |   (case fold parse_chunk chunks [(("", []), [])] of
 | |
| 26528 | 164 |     [(("", _), result)] => rev result
 | 
| 165 | | ((name, _), _) :: _ => err_unbalanced name); | |
| 166 | ||
| 75626 | 167 | fun parse' chunks = | 
| 168 | (case parse_body' chunks of | |
| 27798 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 169 | [result] => result | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 170 | | [] => XML.Text "" | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 171 | | _ => err "multiple results"); | 
| 26528 | 172 | |
| 75626 | 173 | in | 
| 174 | ||
| 175 | val parse_body = parse_body' o split_string; | |
| 176 | val parse_body_bytes = parse_body' o split_bytes; | |
| 177 | ||
| 178 | val parse = parse' o split_string; | |
| 179 | val parse_bytes = parse' o split_bytes; | |
| 180 | ||
| 26528 | 181 | end; | 
| 182 | ||
| 59433 | 183 | val content_of = parse_body #> XML.content_of; | 
| 184 | ||
| 71456 
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
 wenzelm parents: 
70996diff
changeset | 185 | fun is_wellformed s = string_of_body (parse_body s) = s | 
| 
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
 wenzelm parents: 
70996diff
changeset | 186 | handle Fail _ => false; | 
| 
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
 wenzelm parents: 
70996diff
changeset | 187 | |
| 26528 | 188 | end; |