author | wenzelm |
Thu, 10 Apr 2025 14:12:33 +0200 | |
changeset 82475 | 0a6d57c4d58b |
parent 80861 | 9de19e3a7231 |
permissions | -rw-r--r-- |
44698 | 1 |
(* Title: Pure/PIDE/yxml.ML |
26528 | 2 |
Author: Makarius |
3 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
4 |
Efficient text representation of XML trees using extra characters X |
44698 | 5 |
and Y -- no escaping, may nest marked text verbatim. Suitable for |
6 |
direct inlining into plain text. |
|
26528 | 7 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
8 |
Markup <elem att="val" ...>...body...</elem> is encoded as: |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
9 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
10 |
X Y name Y att=val ... X |
26528 | 11 |
... |
12 |
body |
|
13 |
... |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
14 |
X Y X |
26528 | 15 |
*) |
16 |
||
17 |
signature YXML = |
|
18 |
sig |
|
80860 | 19 |
include YXML |
70990 | 20 |
val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a |
73557 | 21 |
val tree_size: XML.tree -> int |
22 |
val body_size: XML.body -> int |
|
43728 | 23 |
val string_of_body: XML.body -> string |
26528 | 24 |
val string_of: XML.tree -> string |
80563 | 25 |
val bytes_of_body: XML.body -> Bytes.T |
26 |
val bytes_of: XML.tree -> Bytes.T |
|
70991
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents:
70990
diff
changeset
|
27 |
val write_body: Path.T -> XML.body -> unit |
80826
7feaa04d332b
prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents:
80821
diff
changeset
|
28 |
val output_markup_only: Markup.T -> string |
49656
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
29 |
val output_markup_elem: Markup.T -> (string * string) * string |
38266
492d377ecfe2
type XML.body as basic data representation language;
wenzelm
parents:
38265
diff
changeset
|
30 |
val parse_body: string -> XML.body |
75626 | 31 |
val parse_body_bytes: Bytes.T -> XML.body |
26528 | 32 |
val parse: string -> XML.tree |
75626 | 33 |
val parse_bytes: Bytes.T -> XML.tree |
71456
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents:
70996
diff
changeset
|
34 |
val is_wellformed: string -> bool |
26528 | 35 |
end; |
36 |
||
37 |
structure YXML: YXML = |
|
38 |
struct |
|
39 |
||
80860 | 40 |
open YXML; |
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
41 |
|
26528 | 42 |
|
80860 | 43 |
(** string representation **) |
26528 | 44 |
|
73557 | 45 |
(* traverse *) |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
46 |
|
70990 | 47 |
fun traverse string = |
48 |
let |
|
49 |
fun attrib (a, x) = string Y #> string a #> string "=" #> string x; |
|
73556
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
50 |
fun tree (XML.Elem (markup as (name, atts), ts)) = |
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
51 |
if Markup.is_empty markup then fold tree ts |
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
52 |
else |
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
53 |
string XY #> string name #> fold attrib atts #> string X #> |
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
54 |
fold tree ts #> |
192bcee4f8b8
more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents:
71456
diff
changeset
|
55 |
string XYX |
70990 | 56 |
| tree (XML.Text s) = string s; |
57 |
in tree end; |
|
43728 | 58 |
|
77768 | 59 |
val tree_size = Integer.build o traverse (Integer.add o size); |
60 |
val body_size = Integer.build o fold (Integer.add o tree_size); |
|
73557 | 61 |
|
62 |
||
63 |
(* output *) |
|
64 |
||
74231 | 65 |
val string_of_body = Buffer.build_content o fold (traverse Buffer.add); |
43728 | 66 |
val string_of = string_of_body o single; |
26528 | 67 |
|
80563 | 68 |
val bytes_of_body = Bytes.build o fold (traverse Bytes.add); |
69 |
val bytes_of = bytes_of_body o single; |
|
70 |
||
70991
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents:
70990
diff
changeset
|
71 |
fun write_body path body = |
75615 | 72 |
path |> File_Stream.open_output (fn file => |
73 |
fold (traverse (fn s => fn () => File_Stream.output file s)) body ()); |
|
70991
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents:
70990
diff
changeset
|
74 |
|
26528 | 75 |
|
70989 | 76 |
(* markup elements *) |
49656
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
77 |
|
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
78 |
val Z = chr 0; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
79 |
val Z_text = [XML.Text Z]; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
80 |
|
80826
7feaa04d332b
prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents:
80821
diff
changeset
|
81 |
val output_markup_only = op ^ o output_markup; |
7feaa04d332b
prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents:
80821
diff
changeset
|
82 |
|
49656
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
83 |
fun output_markup_elem markup = |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
84 |
let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text))) |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
85 |
in ((bg1, bg2), en) end; |
7ff712de5747
treat wrapped markup elements as raw markup delimiters;
wenzelm
parents:
46832
diff
changeset
|
86 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
87 |
|
80860 | 88 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
89 |
(** efficient YXML parsing **) |
26528 | 90 |
|
91 |
local |
|
92 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
93 |
(* splitting *) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
94 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
95 |
val split_string = |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
96 |
Substring.full #> |
64275
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
97 |
Substring.tokens (fn c => c = X_char) #> |
ac2abc987cf9
accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents:
59433
diff
changeset
|
98 |
map (Substring.fields (fn c => c = Y_char) #> map Substring.string); |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
99 |
|
75626 | 100 |
val split_bytes = |
101 |
Bytes.space_explode X |
|
80561 | 102 |
#> map_filter (fn "" => NONE | s => SOME (space_explode Y s)); |
75626 | 103 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
104 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
105 |
(* structural errors *) |
26528 | 106 |
|
46832 | 107 |
fun err msg = raise Fail ("Malformed YXML: " ^ msg); |
26528 | 108 |
fun err_attribute () = err "bad attribute"; |
109 |
fun err_element () = err "bad element"; |
|
110 |
fun err_unbalanced "" = err "unbalanced element" |
|
111 |
| err_unbalanced name = err ("unbalanced element " ^ quote name); |
|
112 |
||
113 |
||
114 |
(* stack operations *) |
|
115 |
||
116 |
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; |
|
117 |
||
118 |
fun push "" _ _ = err_element () |
|
119 |
| push name atts pending = ((name, atts), []) :: pending; |
|
120 |
||
121 |
fun pop ((("", _), _) :: _) = err_unbalanced "" |
|
38228
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents:
34095
diff
changeset
|
122 |
| pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending; |
26528 | 123 |
|
124 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
125 |
(* parsing *) |
26528 | 126 |
|
127 |
fun parse_attrib s = |
|
28025 | 128 |
(case first_field "=" s of |
28023
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
wenzelm
parents:
27932
diff
changeset
|
129 |
NONE => err_attribute () |
28025 | 130 |
| SOME ("", _) => err_attribute () |
131 |
| SOME att => att); |
|
26528 | 132 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
133 |
fun parse_chunk ["", ""] = pop |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
134 |
| parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
135 |
| parse_chunk txts = fold (add o XML.Text) txts; |
26528 | 136 |
|
75626 | 137 |
fun parse_body' chunks = |
138 |
(case fold parse_chunk chunks [(("", []), [])] of |
|
26528 | 139 |
[(("", _), result)] => rev result |
140 |
| ((name, _), _) :: _ => err_unbalanced name); |
|
141 |
||
75626 | 142 |
fun parse' chunks = |
143 |
(case parse_body' chunks of |
|
27798
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
144 |
[result] => result |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
145 |
| [] => XML.Text "" |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
146 |
| _ => err "multiple results"); |
26528 | 147 |
|
75626 | 148 |
in |
149 |
||
150 |
val parse_body = parse_body' o split_string; |
|
151 |
val parse_body_bytes = parse_body' o split_bytes; |
|
152 |
||
153 |
val parse = parse' o split_string; |
|
154 |
val parse_bytes = parse' o split_bytes; |
|
155 |
||
26528 | 156 |
end; |
157 |
||
71456
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents:
70996
diff
changeset
|
158 |
fun is_wellformed s = string_of_body (parse_body s) = s |
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents:
70996
diff
changeset
|
159 |
handle Fail _ => false; |
09c850e82258
more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents:
70996
diff
changeset
|
160 |
|
26528 | 161 |
end; |