author | wenzelm |
Tue, 12 Jul 2011 10:44:30 +0200 | |
changeset 43767 | e0219ef7f84c |
parent 40627 | becf5d5187cc |
child 43768 | d52ab827d62b |
permissions | -rw-r--r-- |
24584 | 1 |
(*  Title:      Pure/General/xml.ML
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel

Simple XML tree values.
*)
6 |
||
43767 | 7 |
(*Common interface of the XML data combinators.  The abstract type 'a T is
  fixed by each instance (see structures Encode and Decode of signature XML).*)
signature XML_DATA_OPS =
sig
  type 'a T

  (*atomic values*)
  val properties: Properties.T T
  val string: string T
  val int: int T
  val bool: bool T
  val unit: unit T

  (*compound values, built from the operations for their components*)
  val pair: 'a T -> 'b T -> ('a * 'b) T
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
  val list: 'a T -> 'a list T
  val option: 'a T -> 'a option T

  (*alternatives, given as a list of operations*)
  val variant: 'a T list -> 'a T
end;
|
21 |
||
24264 | 22 |
(*Interface of structure XML: tree datatype, string output, parsing, and
  XML as data representation language.*)
signature XML =
sig
  (*trees*)
  type attributes = Properties.T
  datatype tree =
      Elem of Markup.T * tree list
    | Text of string
  type body = tree list
  val add_content: tree -> Buffer.T -> Buffer.T
  val content_of: body -> string

  (*string representation and output*)
  val header: string
  val text: string -> string
  val element: string -> attributes -> string list -> string
  val output_markup: Markup.T -> Output.output * Output.output
  val string_of: tree -> string
  val output: tree -> TextIO.outstream -> unit

  (*parsing*)
  val parse_comments: string list -> unit * string list
  val parse_string: string -> string option
  val parse_element: string list -> tree * string list
  val parse_document: string list -> tree * string list
  val parse: string -> tree

  (*data representation: failures of Decode are signalled by these exceptions*)
  exception XML_ATOM of string
  exception XML_BODY of body
  structure Encode: XML_DATA_OPS where type 'a T = 'a -> body
  structure Decode: XML_DATA_OPS where type 'a T = body -> 'a
end;
47 |
||
48 |
structure XML: XML =
struct

(** XML trees **)

type attributes = Properties.T;

(*a tree is either an element (markup plus children) or plain text*)
datatype tree =
    Elem of Markup.T * tree list
  | Text of string;

type body = tree list;

(*add the text of all Text nodes below a tree to a buffer; markup is skipped*)
fun add_content (Elem (_, ts)) = fold add_content ts
  | add_content (Text s) = Buffer.add s;

(*text content of a whole body, collected via add_content*)
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;



(** string representation **)

val header = "<?xml version=\"1.0\"?>\n";


(* escaped text *)

(*map one of the five predefined XML entity references to its character;
  any other string is returned unchanged*)
fun decode "&lt;" = "<"
  | decode "&gt;" = ">"
  | decode "&amp;" = "&"
  | decode "&apos;" = "'"
  | decode "&quot;" = "\""
  | decode c = c;

(*inverse of decode on single characters: escape the five XML special
  characters, leave everything else unchanged*)
fun encode "<" = "&lt;"
  | encode ">" = "&gt;"
  | encode "&" = "&amp;"
  | encode "'" = "&apos;"
  | encode "\"" = "&quot;"
  | encode c = c;

(*escape a string via encode; translate_string is defined elsewhere and
  presumably applies the function to each character -- TODO confirm*)
val text = translate_string encode;


(* elements *)

(*tag interior: element name followed by blank-separated name="value"
  attributes, with values escaped via text (names are used as given)*)
fun elem name atts =
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);

(*render an element around body strings that are inserted as given;
  an empty concatenated body produces the short form <name atts/>*)
fun element name atts body =
  let val b = implode body in
    if b = "" then enclose "<" "/>" (elem name atts)
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
  end;

(*pair of opening and closing tag for some markup; empty markup (according
  to Markup.is_empty) yields Markup.no_output*)
fun output_markup (markup as (name, atts)) =
  if Markup.is_empty markup then Markup.no_output
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);


(* output *)

(*serialize a tree into a fresh buffer: elements without children use the
  short form <name atts/>, text nodes are escaped via text*)
fun buffer_of tree =
  let
    fun traverse (Elem ((name, atts), [])) =
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
      | traverse (Elem ((name, atts), ts)) =
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
          fold traverse ts #>
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
      | traverse (Text s) = Buffer.add (text s);
  in Buffer.empty |> traverse tree end;

val string_of = Buffer.content o buffer_of;
val output = Buffer.output o buffer_of;



(** XML parsing (slow) **)

local

(*error message for Scan's !! combinator: reason plus the beginning of the
  remaining input (at most 100 symbols, via Symbol.beginning)*)
fun err s (xs, _) =
  "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);

(*discard a parse result, producing no tree nodes*)
fun ignored _ = [];

val blanks = Scan.many Symbol.is_blank;

(*entity reference "&name;", passed through decode; unknown entities are
  kept as their original text, since decode returns them unchanged*)
val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;

val regular = Scan.one Symbol.is_regular;
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);

(*non-empty character data up to the next "<", with entities decoded*)
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;

(*CDATA section: content is taken verbatim, without entity decoding*)
val parse_cdata =
  Scan.this_string "<![CDATA[" |--
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
  Scan.this_string "]]>";

(*attribute name = "value" or name = 'value'; the value ends at the same
  quote character that opened it*)
val parse_att =
  (Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) --
  (($$ "\"" || $$ "'") :|-- (fn s =>
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));

(*comments, processing instructions and DOCTYPE are recognized and dropped*)
val parse_comment =
  Scan.this_string "<!--" --
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
  Scan.this_string "-->" >> ignored;

val parse_processing_instruction =
  Scan.this_string "<?" --
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
  Scan.this_string "?>" >> ignored;

val parse_doctype =
  Scan.this_string "<!DOCTYPE" --
  Scan.repeat (Scan.unless ($$ ">") regular) --
  $$ ">" >> ignored;

(*one item of ignorable material around the document element*)
val parse_misc =
  Scan.one Symbol.is_blank >> ignored ||
  parse_processing_instruction ||
  parse_comment;

(*optional text node: singleton Text list, or [] if no characters follow*)
val parse_optional_text =
  Scan.optional (parse_chars >> (single o Text)) [];

in

(*skip blanks and any number of comments*)
val parse_comments =
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();

(*decode plain character data given as a string*)
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;

(*element content: text interleaved with child elements, CDATA sections,
  processing instructions and comments; the latter two contribute no nodes*)
fun parse_content xs =
  (parse_optional_text @@@
    (Scan.repeat
      ((parse_element >> single ||
        parse_cdata >> (single o Text) ||
        parse_processing_instruction ||
        parse_comment)
      @@@ parse_optional_text) >> flat)) xs

(*element: either the short form <name atts/> or <name atts>content</name>,
  where the closing tag must repeat the name of the opening tag*)
and parse_element xs =
  ($$ "<" |-- Symbol.scan_id --
    Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) =>
      !! (err "Expected > or />")
        (Scan.this_string "/>" >> ignored
         || $$ ">" |-- parse_content --|
            !! (err ("Expected </" ^ s ^ ">"))
              (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >> Elem) xs;

(*document: optional unchecked header material, then the root element*)
val parse_document =
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
  |-- parse_element;

(*parse a complete document from a string; any input left over after the
  root element and trailing blanks is an error*)
fun parse s =
  (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
    (x, []) => x
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));

end;



(** XML as data representation language **)

exception XML_ATOM of string;
exception XML_BODY of tree list;


structure Encode =
struct

type 'a T = 'a -> body;


(* basic values *)

fun int_atom i = signed_string_of_int i;

fun bool_atom false = "0"
  | bool_atom true = "1";

fun unit_atom () = "";


(* structural nodes *)

(*anonymous wrapper element named ":" without attributes*)
fun node ts = Elem ((":", []), ts);

(*wrapper element whose name is the variant index*)
fun tagged (tag, ts) = Elem ((int_atom tag, []), ts);


(* representation of standard types *)

(*properties become the attributes of a single ":" element*)
fun properties props = [Elem ((":", props), [])];

(*the empty string is represented by the empty body*)
fun string "" = []
  | string s = [Text s];

val int = string o int_atom;

val bool = string o bool_atom;

val unit = string o unit_atom;

fun pair f g (x, y) = [node (f x), node (g y)];

fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];

fun list f xs = map (node o f) xs;

fun option _ NONE = []
  | option f (SOME x) = [node (f x)];

(*use the first encoder in fs that applies to x (tried via "try") and tag the
  result with its index; "the" fails if no encoder applies*)
fun variant fs x = [tagged (the (get_index (fn f => try f x) fs))];

end;


structure Decode =
struct

type 'a T = body -> 'a;


(* basic values *)

fun int_atom s =
  (case Int.fromString s of
    SOME i => i
  | NONE => raise XML_ATOM s);

fun bool_atom "0" = false
  | bool_atom "1" = true
  | bool_atom s = raise XML_ATOM s;

fun unit_atom "" = ()
  | unit_atom s = raise XML_ATOM s;


(* structural nodes *)

(*inverse of Encode.node; anything else is a malformed body*)
fun node (Elem ((":", []), ts)) = ts
  | node t = raise XML_BODY [t];

(*inverse of Encode.tagged: the element name is read back as variant index*)
fun tagged (Elem ((s, []), ts)) = (int_atom s, ts)
  | tagged t = raise XML_BODY [t];


(* representation of standard types *)

fun properties [Elem ((":", props), [])] = props
  | properties ts = raise XML_BODY ts;

fun string [] = ""
  | string [Text s] = s
  | string ts = raise XML_BODY ts;

val int = int_atom o string;

val bool = bool_atom o string;

val unit = unit_atom o string;

fun pair f g [t1, t2] = (f (node t1), g (node t2))
  | pair _ _ ts = raise XML_BODY ts;

fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
  | triple _ _ _ ts = raise XML_BODY ts;

fun list f ts = map (f o node) ts;

fun option _ [] = NONE
  | option f [t] = SOME (f (node t))
  | option _ ts = raise XML_BODY ts;

(*select the decoder by the tag found in the input; a tag outside the range
  of fs is malformed input and reported as XML_BODY like all other decoding
  failures, instead of leaking Subscript from the list lookup.  Only the
  lookup is guarded: exceptions of the selected decoder pass through.*)
fun variant fs [t] =
      let
        val (tag, ts) = tagged t;
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
      in f ts end
  | variant _ ts = raise XML_BODY ts;

end;

end;