| author | wenzelm | 
| Mon, 09 Jan 2012 23:09:03 +0100 | |
| changeset 46165 | 0e131ca93a49 | 
| parent 45155 | 3216d65d8f34 | 
| child 46837 | 5bdd68f380b3 | 
| permissions | -rw-r--r-- | 
| 44698 | 1  | 
(* Title: Pure/PIDE/xml.ML  | 
2  | 
Author: David Aspinall  | 
|
3  | 
Author: Stefan Berghofer  | 
|
4  | 
Author: Makarius  | 
|
| 24264 | 5  | 
|
| 44698 | 6  | 
Untyped XML trees and basic data representation.  | 
| 24264 | 7  | 
*)  | 
8  | 
||
| 43767 | 9  | 
signature XML_DATA_OPS =  | 
10  | 
sig  | 
|
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
11  | 
type 'a A  | 
| 43767 | 12  | 
type 'a T  | 
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
13  | 
type 'a V  | 
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
14  | 
val int_atom: int A  | 
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
15  | 
val bool_atom: bool A  | 
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
16  | 
val unit_atom: unit A  | 
| 43767 | 17  | 
val properties: Properties.T T  | 
18  | 
val string: string T  | 
|
19  | 
val int: int T  | 
|
20  | 
val bool: bool T  | 
|
21  | 
val unit: unit T  | 
|
22  | 
  val pair: 'a T -> 'b T -> ('a * 'b) T
 | 
|
23  | 
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
 | 
|
24  | 
val list: 'a T -> 'a list T  | 
|
25  | 
val option: 'a T -> 'a option T  | 
|
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
26  | 
val variant: 'a V list -> 'a T  | 
| 43767 | 27  | 
end;  | 
28  | 
||
| 24264 | 29  | 
signature XML =  | 
30  | 
sig  | 
|
| 28017 | 31  | 
type attributes = Properties.T  | 
| 24264 | 32  | 
datatype tree =  | 
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
33  | 
Elem of Markup.T * tree list  | 
| 24264 | 34  | 
| Text of string  | 
| 
38266
 
492d377ecfe2
type XML.body as basic data representation language;
 
wenzelm 
parents: 
38228 
diff
changeset
 | 
35  | 
type body = tree list  | 
| 26546 | 36  | 
val add_content: tree -> Buffer.T -> Buffer.T  | 
| 
39555
 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 
wenzelm 
parents: 
38474 
diff
changeset
 | 
37  | 
val content_of: body -> string  | 
| 26546 | 38  | 
val header: string  | 
39  | 
val text: string -> string  | 
|
40  | 
val element: string -> attributes -> string list -> string  | 
|
| 
40131
 
7cbebd636e79
explicitly qualify type Output.output, which is a slightly odd internal feature;
 
wenzelm 
parents: 
39555 
diff
changeset
 | 
41  | 
val output_markup: Markup.T -> Output.output * Output.output  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
42  | 
val string_of: tree -> string  | 
| 43791 | 43  | 
val pretty: int -> tree -> Pretty.T  | 
| 26546 | 44  | 
val output: tree -> TextIO.outstream -> unit  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
45  | 
val parse_comments: string list -> unit * string list  | 
| 24264 | 46  | 
val parse_string : string -> string option  | 
| 26546 | 47  | 
val parse_element: string list -> tree * string list  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
48  | 
val parse_document: string list -> tree * string list  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
49  | 
val parse: string -> tree  | 
| 44808 | 50  | 
val cache: unit -> tree -> tree  | 
| 43767 | 51  | 
exception XML_ATOM of string  | 
52  | 
exception XML_BODY of body  | 
|
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
53  | 
structure Encode: XML_DATA_OPS  | 
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
54  | 
structure Decode: XML_DATA_OPS  | 
| 24264 | 55  | 
end;  | 
56  | 
||
57  | 
structure XML: XML =  | 
|
58  | 
struct  | 
|
59  | 
||
| 26546 | 60  | 
(** XML trees **)  | 
61  | 
||
| 28017 | 62  | 
type attributes = Properties.T;  | 
| 26546 | 63  | 
|
64  | 
datatype tree =  | 
|
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
65  | 
Elem of Markup.T * tree list  | 
| 28033 | 66  | 
| Text of string;  | 
| 26546 | 67  | 
|
| 
38266
 
492d377ecfe2
type XML.body as basic data representation language;
 
wenzelm 
parents: 
38228 
diff
changeset
 | 
68  | 
type body = tree list;  | 
| 
 
492d377ecfe2
type XML.body as basic data representation language;
 
wenzelm 
parents: 
38228 
diff
changeset
 | 
69  | 
|
| 
38228
 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 
wenzelm 
parents: 
31469 
diff
changeset
 | 
70  | 
fun add_content (Elem (_, ts)) = fold add_content ts  | 
| 28033 | 71  | 
| add_content (Text s) = Buffer.add s;  | 
| 26546 | 72  | 
|
| 
39555
 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 
wenzelm 
parents: 
38474 
diff
changeset
 | 
73  | 
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;  | 
| 
 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 
wenzelm 
parents: 
38474 
diff
changeset
 | 
74  | 
|
| 26546 | 75  | 
|
| 24264 | 76  | 
|
| 26525 | 77  | 
(** string representation **)  | 
78  | 
||
| 24264 | 79  | 
val header = "<?xml version=\"1.0\"?>\n";  | 
80  | 
||
81  | 
||
| 26546 | 82  | 
(* escaped text *)  | 
| 24264 | 83  | 
|
84  | 
fun decode "<" = "<"  | 
|
85  | 
| decode ">" = ">"  | 
|
86  | 
| decode "&" = "&"  | 
|
87  | 
| decode "'" = "'"  | 
|
88  | 
| decode """ = "\""  | 
|
89  | 
| decode c = c;  | 
|
90  | 
||
91  | 
fun encode "<" = "<"  | 
|
92  | 
| encode ">" = ">"  | 
|
93  | 
| encode "&" = "&"  | 
|
94  | 
| encode "'" = "'"  | 
|
95  | 
| encode "\"" = """  | 
|
96  | 
| encode c = c;  | 
|
97  | 
||
| 25838 | 98  | 
val text = translate_string encode;  | 
| 24264 | 99  | 
|
100  | 
||
101  | 
(* elements *)  | 
|
102  | 
||
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
103  | 
fun elem name atts =  | 
| 26551 | 104  | 
space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);  | 
| 24264 | 105  | 
|
| 26525 | 106  | 
fun element name atts body =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
107  | 
let val b = implode body in  | 
| 
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
108  | 
if b = "" then enclose "<" "/>" (elem name atts)  | 
| 
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
109  | 
else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name  | 
| 24264 | 110  | 
end;  | 
111  | 
||
| 27884 | 112  | 
fun output_markup (markup as (name, atts)) =  | 
| 
38474
 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 
wenzelm 
parents: 
38266 
diff
changeset
 | 
113  | 
if Markup.is_empty markup then Markup.no_output  | 
| 27884 | 114  | 
else (enclose "<" ">" (elem name atts), enclose "</" ">" name);  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
115  | 
|
| 24264 | 116  | 
|
| 26546 | 117  | 
(* output *)  | 
| 24264 | 118  | 
|
| 43791 | 119  | 
fun buffer_of depth tree =  | 
| 24264 | 120  | 
let  | 
| 43791 | 121  | 
fun traverse _ (Elem ((name, atts), [])) =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
122  | 
Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"  | 
| 43791 | 123  | 
| traverse d (Elem ((name, atts), ts)) =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
124  | 
Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>  | 
| 43791 | 125  | 
traverse_body d ts #>  | 
| 26525 | 126  | 
Buffer.add "</" #> Buffer.add name #> Buffer.add ">"  | 
| 43791 | 127  | 
| traverse _ (Text s) = Buffer.add (text s)  | 
128  | 
and traverse_body 0 _ = Buffer.add "..."  | 
|
129  | 
| traverse_body d ts = fold (traverse (d - 1)) ts;  | 
|
130  | 
in Buffer.empty |> traverse depth tree end;  | 
|
| 24264 | 131  | 
|
| 43791 | 132  | 
val string_of = Buffer.content o buffer_of ~1;  | 
133  | 
val output = Buffer.output o buffer_of ~1;  | 
|
134  | 
||
135  | 
fun pretty depth tree =  | 
|
136  | 
Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));  | 
|
| 25838 | 137  | 
|
| 24264 | 138  | 
|
139  | 
||
| 44698 | 140  | 
(** XML parsing **)  | 
| 26546 | 141  | 
|
142  | 
local  | 
|
| 24264 | 143  | 
|
| 
43947
 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 
wenzelm 
parents: 
43844 
diff
changeset
 | 
144  | 
fun err msg (xs, _) =  | 
| 
 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 
wenzelm 
parents: 
43844 
diff
changeset
 | 
145  | 
fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);  | 
| 24264 | 146  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
147  | 
fun ignored _ = [];  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
148  | 
|
| 
45155
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
149  | 
fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";  | 
| 
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
150  | 
fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";  | 
| 
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
151  | 
val parse_name = Scan.one name_start_char ::: Scan.many name_char;  | 
| 
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
152  | 
|
| 26551 | 153  | 
val blanks = Scan.many Symbol.is_blank;  | 
| 
45155
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
154  | 
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;  | 
| 26551 | 155  | 
val regular = Scan.one Symbol.is_regular;  | 
156  | 
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);  | 
|
| 24264 | 157  | 
|
| 26551 | 158  | 
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;  | 
| 24264 | 159  | 
|
| 26551 | 160  | 
val parse_cdata =  | 
161  | 
Scan.this_string "<![CDATA[" |--  | 
|
162  | 
(Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|  | 
|
163  | 
Scan.this_string "]]>";  | 
|
| 24264 | 164  | 
|
165  | 
val parse_att =  | 
|
| 
45155
 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 
wenzelm 
parents: 
44809 
diff
changeset
 | 
166  | 
((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --  | 
| 26551 | 167  | 
(($$ "\"" || $$ "'") :|-- (fn s =>  | 
168  | 
(Scan.repeat (special || regular_except s) >> implode) --| $$ s));  | 
|
| 24264 | 169  | 
|
| 26551 | 170  | 
val parse_comment =  | 
171  | 
Scan.this_string "<!--" --  | 
|
172  | 
Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
173  | 
Scan.this_string "-->" >> ignored;  | 
| 24264 | 174  | 
|
| 26551 | 175  | 
val parse_processing_instruction =  | 
176  | 
Scan.this_string "<?" --  | 
|
177  | 
Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
178  | 
Scan.this_string "?>" >> ignored;  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
179  | 
|
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
180  | 
val parse_doctype =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
181  | 
Scan.this_string "<!DOCTYPE" --  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
182  | 
Scan.repeat (Scan.unless ($$ ">") regular) --  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
183  | 
$$ ">" >> ignored;  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
184  | 
|
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
185  | 
val parse_misc =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
186  | 
Scan.one Symbol.is_blank >> ignored ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
187  | 
parse_processing_instruction ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
188  | 
parse_comment;  | 
| 26551 | 189  | 
|
190  | 
val parse_optional_text =  | 
|
191  | 
Scan.optional (parse_chars >> (single o Text)) [];  | 
|
| 24264 | 192  | 
|
| 26546 | 193  | 
in  | 
194  | 
||
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
195  | 
val parse_comments =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
196  | 
blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
197  | 
|
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40131 
diff
changeset
 | 
198  | 
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;  | 
| 26546 | 199  | 
|
| 24264 | 200  | 
fun parse_content xs =  | 
| 26551 | 201  | 
(parse_optional_text @@@  | 
202  | 
(Scan.repeat  | 
|
203  | 
((parse_element >> single ||  | 
|
204  | 
parse_cdata >> (single o Text) ||  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
205  | 
parse_processing_instruction ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
206  | 
parse_comment)  | 
| 26551 | 207  | 
@@@ parse_optional_text) >> flat)) xs  | 
| 24264 | 208  | 
|
| 26546 | 209  | 
and parse_element xs =  | 
| 
43949
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
210  | 
($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--  | 
| 
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
211  | 
(fn (name, _) =>  | 
| 
43947
 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 
wenzelm 
parents: 
43844 
diff
changeset
 | 
212  | 
!! (err (fn () => "Expected > or />"))  | 
| 
43949
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
213  | 
($$ "/" -- $$ ">" >> ignored ||  | 
| 
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
214  | 
$$ ">" |-- parse_content --|  | 
| 
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
215  | 
!! (err (fn () => "Expected </" ^ implode name ^ ">"))  | 
| 
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
216  | 
($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))  | 
| 
 
94033767ef9b
more precise parse_name according to XML standard;
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
217  | 
>> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;  | 
| 24264 | 218  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
219  | 
val parse_document =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
220  | 
(Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
221  | 
|-- parse_element;  | 
| 24264 | 222  | 
|
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
223  | 
fun parse s =  | 
| 
43947
 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 
wenzelm 
parents: 
43844 
diff
changeset
 | 
224  | 
(case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))  | 
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40131 
diff
changeset
 | 
225  | 
(blanks |-- parse_document --| blanks))) (raw_explode s) of  | 
| 24264 | 226  | 
(x, []) => x  | 
227  | 
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | 
|
228  | 
||
229  | 
end;  | 
|
| 26546 | 230  | 
|
| 43767 | 231  | 
|
| 
44809
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
232  | 
(** cache for substructural sharing **)  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
233  | 
|
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
234  | 
fun tree_ord tu =  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
235  | 
if pointer_eq tu then EQUAL  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
236  | 
else  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
237  | 
(case tu of  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
238  | 
(Text _, Elem _) => LESS  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
239  | 
| (Elem _, Text _) => GREATER  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
240  | 
| (Text s, Text s') => fast_string_ord (s, s')  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
241  | 
| (Elem e, Elem e') =>  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
242  | 
prod_ord  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
243  | 
(prod_ord fast_string_ord (list_ord (prod_ord fast_string_ord fast_string_ord)))  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
244  | 
(list_ord tree_ord) (e, e'));  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
245  | 
|
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
246  | 
structure Treetab = Table(type key = tree val ord = tree_ord);  | 
| 44808 | 247  | 
|
248  | 
fun cache () =  | 
|
249  | 
let  | 
|
250  | 
val strings = Unsynchronized.ref (Symtab.empty: unit Symtab.table);  | 
|
| 
44809
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
251  | 
val trees = Unsynchronized.ref (Treetab.empty: unit Treetab.table);  | 
| 44808 | 252  | 
|
253  | 
fun string s =  | 
|
| 
44809
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
254  | 
if size s <= 1 then s  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
255  | 
else  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
256  | 
(case Symtab.lookup_key (! strings) s of  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
257  | 
SOME (s', ()) => s'  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
258  | 
| NONE => (Unsynchronized.change strings (Symtab.update (s, ())); s));  | 
| 44808 | 259  | 
|
| 
44809
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
260  | 
fun tree t =  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
261  | 
(case Treetab.lookup_key (! trees) t of  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
262  | 
SOME (t', ()) => t'  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
263  | 
| NONE =>  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
264  | 
let  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
265  | 
val t' =  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
266  | 
(case t of  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
267  | 
Elem ((a, ps), b) => Elem ((string a, map (pairself string) ps), map tree b)  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
268  | 
| Text s => Text (string s));  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
269  | 
val _ = Unsynchronized.change trees (Treetab.update (t', ()));  | 
| 
 
df3626d1066e
more substructural sharing to gain significant compression;
 
wenzelm 
parents: 
44808 
diff
changeset
 | 
270  | 
in t' end);  | 
| 44808 | 271  | 
in tree end;  | 
272  | 
||
273  | 
||
| 43767 | 274  | 
|
275  | 
(** XML as data representation language **)  | 
|
276  | 
||
277  | 
exception XML_ATOM of string;  | 
|
278  | 
exception XML_BODY of tree list;  | 
|
279  | 
||
280  | 
||
281  | 
structure Encode =  | 
|
282  | 
struct  | 
|
283  | 
||
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
284  | 
type 'a A = 'a -> string;  | 
| 43767 | 285  | 
type 'a T = 'a -> body;  | 
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
286  | 
type 'a V = 'a -> string list * body;  | 
| 43767 | 287  | 
|
288  | 
||
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
289  | 
(* atomic values *)  | 
| 43767 | 290  | 
|
291  | 
fun int_atom i = signed_string_of_int i;  | 
|
292  | 
||
293  | 
fun bool_atom false = "0"  | 
|
294  | 
| bool_atom true = "1";  | 
|
295  | 
||
296  | 
fun unit_atom () = "";  | 
|
297  | 
||
298  | 
||
299  | 
(* structural nodes *)  | 
|
300  | 
||
301  | 
fun node ts = Elem ((":", []), ts);
 | 
|
302  | 
||
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
303  | 
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;  | 
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
304  | 
|
| 
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
305  | 
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);  | 
| 43767 | 306  | 
|
307  | 
||
308  | 
(* representation of standard types *)  | 
|
309  | 
||
310  | 
fun properties props = [Elem ((":", props), [])];
 | 
|
311  | 
||
312  | 
fun string "" = []  | 
|
313  | 
| string s = [Text s];  | 
|
314  | 
||
315  | 
val int = string o int_atom;  | 
|
316  | 
||
317  | 
val bool = string o bool_atom;  | 
|
318  | 
||
319  | 
val unit = string o unit_atom;  | 
|
320  | 
||
321  | 
fun pair f g (x, y) = [node (f x), node (g y)];  | 
|
322  | 
||
323  | 
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];  | 
|
324  | 
||
325  | 
fun list f xs = map (node o f) xs;  | 
|
326  | 
||
327  | 
fun option _ NONE = []  | 
|
328  | 
| option f (SOME x) = [node (f x)];  | 
|
329  | 
||
330  | 
fun variant fs x = [tagged (the (get_index (fn f => try f x) fs))];  | 
|
331  | 
||
| 26546 | 332  | 
end;  | 
| 43767 | 333  | 
|
334  | 
||
335  | 
structure Decode =  | 
|
336  | 
struct  | 
|
337  | 
||
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
338  | 
type 'a A = string -> 'a;  | 
| 43767 | 339  | 
type 'a T = body -> 'a;  | 
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
340  | 
type 'a V = string list * body -> 'a;  | 
| 43767 | 341  | 
|
342  | 
||
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
343  | 
(* atomic values *)  | 
| 43767 | 344  | 
|
345  | 
fun int_atom s =  | 
|
| 
43797
 
fad7758421bf
more precise integer Markup.properties/XML.attributes: disallow ML-style ~ minus;
 
wenzelm 
parents: 
43791 
diff
changeset
 | 
346  | 
Markup.parse_int s  | 
| 
 
fad7758421bf
more precise integer Markup.properties/XML.attributes: disallow ML-style ~ minus;
 
wenzelm 
parents: 
43791 
diff
changeset
 | 
347  | 
handle Fail _ => raise XML_ATOM s;  | 
| 43767 | 348  | 
|
349  | 
fun bool_atom "0" = false  | 
|
350  | 
| bool_atom "1" = true  | 
|
351  | 
| bool_atom s = raise XML_ATOM s;  | 
|
352  | 
||
353  | 
fun unit_atom "" = ()  | 
|
354  | 
| unit_atom s = raise XML_ATOM s;  | 
|
355  | 
||
356  | 
||
357  | 
(* structural nodes *)  | 
|
358  | 
||
359  | 
fun node (Elem ((":", []), ts)) = ts
 | 
|
360  | 
| node t = raise XML_BODY [t];  | 
|
361  | 
||
| 43783 | 362  | 
fun vector atts =  | 
| 43844 | 363  | 
#1 (fold_map (fn (a, x) =>  | 
364  | 
fn i => if int_atom a = i then (x, i + 1) else raise XML_ATOM a) atts 0);  | 
|
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
365  | 
|
| 43844 | 366  | 
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))  | 
| 43767 | 367  | 
| tagged t = raise XML_BODY [t];  | 
368  | 
||
369  | 
||
370  | 
(* representation of standard types *)  | 
|
371  | 
||
372  | 
fun properties [Elem ((":", props), [])] = props
 | 
|
373  | 
| properties ts = raise XML_BODY ts;  | 
|
374  | 
||
375  | 
fun string [] = ""  | 
|
376  | 
| string [Text s] = s  | 
|
377  | 
| string ts = raise XML_BODY ts;  | 
|
378  | 
||
379  | 
val int = int_atom o string;  | 
|
380  | 
||
381  | 
val bool = bool_atom o string;  | 
|
382  | 
||
383  | 
val unit = unit_atom o string;  | 
|
384  | 
||
385  | 
fun pair f g [t1, t2] = (f (node t1), g (node t2))  | 
|
386  | 
| pair _ _ ts = raise XML_BODY ts;  | 
|
387  | 
||
388  | 
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))  | 
|
389  | 
| triple _ _ _ ts = raise XML_BODY ts;  | 
|
390  | 
||
391  | 
fun list f ts = map (f o node) ts;  | 
|
392  | 
||
393  | 
fun option _ [] = NONE  | 
|
394  | 
| option f [t] = SOME (f (node t))  | 
|
395  | 
| option _ ts = raise XML_BODY ts;  | 
|
396  | 
||
| 43768 | 397  | 
fun variant fs [t] =  | 
398  | 
let  | 
|
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
399  | 
val (tag, (xs, ts)) = tagged t;  | 
| 43768 | 400  | 
val f = nth fs tag handle General.Subscript => raise XML_BODY [t];  | 
| 
43778
 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 
wenzelm 
parents: 
43768 
diff
changeset
 | 
401  | 
in f (xs, ts) end  | 
| 43767 | 402  | 
| variant _ ts = raise XML_BODY ts;  | 
403  | 
||
404  | 
end;  | 
|
405  | 
||
406  | 
end;  |