|
1 (* Title: Pure/PIDE/xml.ML |
|
2 Author: David Aspinall |
|
3 Author: Stefan Berghofer |
|
4 Author: Makarius |
|
5 |
|
6 Untyped XML trees and basic data representation. |
|
7 *) |
|
8 |
|
(*Common interface of the Encode and Decode structures of XML: the same
  names are provided in both directions, only the meaning of the three
  type constructors differs.*)
signature XML_DATA_OPS =
sig
  (*conversions: atomic string, XML body, tagged variant case*)
  type 'a A
  type 'a T
  type 'a V

  (*atomic values*)
  val int_atom: int A
  val bool_atom: bool A
  val unit_atom: unit A

  (*basic types*)
  val properties: Properties.T T
  val string: string T
  val int: int T
  val bool: bool T
  val unit: unit T

  (*combinators for structured types*)
  val pair: 'a T -> 'b T -> ('a * 'b) T
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
  val list: 'a T -> 'a list T
  val option: 'a T -> 'a option T
  val variant: 'a V list -> 'a T
end;
|
28 |
|
(*Untyped XML trees: plain text content, string output, parsing, and
  the Encode/Decode data representation.*)
signature XML =
sig
  (*trees*)
  type attributes = Properties.T
  datatype tree =
      Elem of Markup.T * tree list
    | Text of string
  type body = tree list
  val add_content: tree -> Buffer.T -> Buffer.T
  val content_of: body -> string

  (*string representation*)
  val header: string
  val text: string -> string
  val element: string -> attributes -> string list -> string
  val output_markup: Markup.T -> Output.output * Output.output
  val string_of: tree -> string
  val pretty: int -> tree -> Pretty.T
  val output: tree -> TextIO.outstream -> unit

  (*parsing*)
  val parse_comments: string list -> unit * string list
  val parse_string : string -> string option
  val parse_element: string list -> tree * string list
  val parse_document: string list -> tree * string list
  val parse: string -> tree

  (*data representation*)
  exception XML_ATOM of string
  exception XML_BODY of body
  structure Encode: XML_DATA_OPS
  structure Decode: XML_DATA_OPS
end;
|
55 |
|
56 structure XML: XML = |
|
57 struct |
|
58 |
|
59 (** XML trees **) |
|
60 |
|
(*attributes of an element*)
type attributes = Properties.T;

(*a tree is either an element (markup with child trees) or plain text*)
datatype tree =
    Elem of Markup.T * tree list
  | Text of string;

(*a sequence of sibling trees*)
type body = tree list;

(*add the text below a tree to a buffer, dropping all markup*)
fun add_content tree =
  (case tree of
    Text s => Buffer.add s
  | Elem (_, ts) => fold add_content ts);

(*concatenated text of a body, without markup*)
fun content_of body = Buffer.content (fold add_content body Buffer.empty);
|
73 |
|
74 |
|
75 |
|
76 (** string representation **) |
|
77 |
|
(*XML declaration line, including the final newline*)
val header = "<?xml version=\"1.0\"?>\n";
|
79 |
|
80 |
|
81 (* escaped text *) |
|
82 |
|
(*Translate one of the five predefined XML entity references to the
  character it denotes; any other string is returned unchanged.

  The entity names had been lost from the patterns (HTML-decoded to the
  bare characters), which made decode the identity and left the clause
  for the double quote syntactically invalid.*)
fun decode "&lt;" = "<"
  | decode "&gt;" = ">"
  | decode "&amp;" = "&"
  | decode "&apos;" = "'"
  | decode "&quot;" = "\""
  | decode c = c;
|
89 |
|
(*Escape a single character for use in XML text or attribute values:
  the five special characters become their predefined entity
  references, everything else is returned unchanged.

  The entity references had been lost from the right-hand sides
  (HTML-decoded to the bare characters), so nothing was escaped and the
  clause for the double quote was syntactically invalid.*)
fun encode "<" = "&lt;"
  | encode ">" = "&gt;"
  | encode "&" = "&amp;"
  | encode "'" = "&apos;"
  | encode "\"" = "&quot;"
  | encode c = c;
|
96 |
|
(*escape a whole string by applying encode via translate_string*)
fun text s = translate_string encode s;
|
98 |
|
99 |
|
100 (* elements *) |
|
101 |
|
(*inner part of a tag: the name followed by space-separated name="value"
  pairs, with attribute values escaped by text*)
fun elem name atts =
  let fun attribute (a, x) = a ^ "=\"" ^ text x ^ "\""
  in space_implode " " (name :: map attribute atts) end;
|
104 |
|
(*string form of an element whose body is given as already rendered
  strings; an empty body yields the short form <name ... /> *)
fun element name atts body =
  (case implode body of
    "" => enclose "<" "/>" (elem name atts)
  | b => enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name);
|
110 |
|
(*opening and closing tag for some markup; empty markup (according to
  Markup.is_empty) yields Markup.no_output*)
fun output_markup markup =
  if Markup.is_empty markup then Markup.no_output
  else
    let val (name, atts) = markup
    in (enclose "<" ">" (elem name atts), enclose "</" ">" name) end;
|
114 |
|
115 |
|
116 (* output *) |
|
117 |
|
(*Render a tree into a fresh buffer.  The body of an element reached
  with remaining depth 0 is replaced by "..."; the counter is decremented
  per nesting level, so a negative depth never reaches 0 and nothing is
  cut off.  Elements without children use the short form <name ... />.*)
fun buffer_of depth tree =
  let
    fun open_tag name atts close =
      Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add close;

    fun traverse d t =
      (case t of
        Text s => Buffer.add (text s)
      | Elem ((name, atts), []) => open_tag name atts "/>"
      | Elem ((name, atts), ts) =>
          open_tag name atts ">" #>
          traverse_body d ts #>
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">")
    and traverse_body d ts =
      if d = 0 then Buffer.add "..."
      else fold (traverse (d - 1)) ts;
  in traverse depth tree Buffer.empty end;
|
130 |
|
(*complete string form of a tree (depth ~1, i.e. no cut-off)*)
fun string_of tree = Buffer.content (buffer_of ~1 tree);

(*write the complete string form of a tree via Buffer.output*)
fun output tree = Buffer.output (buffer_of ~1 tree);

(*string form of a tree as a single Pretty.str; negative depth is
  treated as 0*)
fun pretty depth tree =
  buffer_of (if depth < 0 then 0 else depth) tree
  |> Buffer.content
  |> Pretty.str;
|
136 |
|
137 |
|
138 |
|
139 (** XML parsing **) |
|
140 |
|
141 local |
|
142 |
|
(*delayed error message: the given text plus an excerpt of the remaining
  input as produced by Symbol.beginning 100*)
fun err msg (xs, _) () =
  "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);

(*replace any scanned result by the empty list*)
fun ignored _ = [];


(* elementary scanners *)

val blanks = Scan.many Symbol.is_blank;

(*entity reference "&" id ";", translated by decode*)
val special = ($$ "&" ^^ Symbol.scan_id ^^ $$ ";") >> decode;

val regular = Scan.one Symbol.is_regular;

(*one regular symbol different from x*)
fun regular_except x = Scan.one (fn c => c <> x andalso Symbol.is_regular c);

(*start, then regular symbols up to the first position where stop
  succeeds, then stop; the result is discarded*)
fun delimited start stop =
  start -- Scan.repeat (Scan.unless stop regular) -- stop >> ignored;


(* text *)

(*non-empty character data: entity references and regular symbols other than "<" *)
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;

(*CDATA section, returning its contents verbatim*)
val parse_cdata =
  let
    val close = Scan.this_string "]]>";
    val contents = Scan.repeat (Scan.unless close regular) >> implode;
  in Scan.this_string "<![CDATA[" |-- contents --| close end;

val parse_optional_text =
  Scan.optional (parse_chars >> (fn s => [Text s])) [];


(* attributes *)

(*name = value, where the value is enclosed in matching double or single
  quotes and may contain entity references*)
val parse_att =
  let
    val equals = blanks -- $$ "=" -- blanks;
    fun quoted_value q =
      (Scan.repeat (special || regular_except q) >> implode) --| $$ q;
  in (Symbol.scan_id --| equals) -- (($$ "\"" || $$ "'") :|-- quoted_value) end;


(* ignored items *)

val parse_comment =
  delimited (Scan.this_string "<!--") (Scan.this_string "-->");

val parse_processing_instruction =
  delimited (Scan.this_string "<?") (Scan.this_string "?>");

val parse_doctype =
  delimited (Scan.this_string "<!DOCTYPE") ($$ ">");

val parse_misc =
  Scan.one Symbol.is_blank >> ignored ||
  parse_processing_instruction ||
  parse_comment;


(* element names *)

fun name_start_char c =
  Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";

fun name_char c =
  name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";

(*element name as a list of symbols*)
val parse_name = Scan.one name_start_char ::: Scan.many name_char;
|
191 |
|
192 in |
|
193 |
|
(*skip blanks and any number of comments, each followed by blanks*)
fun parse_comments xs =
  (blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ()) xs;

(*read character data from a string via Scan.read, decoding entity
  references*)
fun parse_string s = Scan.read Symbol.stopper parse_chars (raw_explode s);
|
198 |
|
(*element content: optional text, then any number of items (nested
  element, CDATA section, processing instruction, comment), each
  followed by optional text; processing instructions and comments
  contribute nothing to the result*)
fun parse_content xs =
  let
    val item =
      parse_element >> single ||
      parse_cdata >> (single o Text) ||
      parse_processing_instruction ||
      parse_comment;
  in (parse_optional_text @@@ (Scan.repeat (item @@@ parse_optional_text) >> flat)) xs end

(*a single element: start tag with attributes, then either "/>" or
  content followed by the matching end tag*)
and parse_element xs =
  let
    val start_tag =
      $$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks;

    (*end tag with exactly the name of the start tag*)
    fun end_tag name =
      !! (err (fn () => "Expected </" ^ implode name ^ ">"))
        ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">");

    (*remainder after the start tag, depending on its name*)
    fun rest (name, _) =
      !! (err (fn () => "Expected > or />"))
        ($$ "/" -- $$ ">" >> ignored ||
         $$ ">" |-- parse_content --| end_tag name);

    fun make ((name, atts), body) = Elem ((implode name, atts), body);
  in (start_tag :-- rest >> make) xs end;
|
217 |
|
(*document: misc items (blanks, processing instructions, comments), an
  optional DOCTYPE declaration, more misc items, then the root element*)
fun parse_document xs =
  let
    val prolog =
      Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc;
  in (prolog |-- parse_element) xs end;

(*parse a complete document from a string; input left over after the
  document is reported as an error*)
fun parse s =
  let
    val document = blanks |-- parse_document --| blanks;
    val scan =
      Scan.finite Symbol.stopper
        (Scan.error (!! (err (fn () => "Malformed element")) document));
  in
    (case scan (raw_explode s) of
      (x, []) => x
    | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys))
  end;
|
227 |
|
228 end; |
|
229 |
|
230 |
|
231 |
|
232 (** XML as data representation language **) |
|
233 |
|
(*raised by Decode for an atom string or an XML body that does not have
  the expected form; the offending value is attached*)
exception XML_ATOM of string;
exception XML_BODY of body;
|
236 |
|
237 |
|
(*Encoding of ML values as XML bodies.*)
structure Encode =
struct

(*atom: value to string; body: value to XML body; variant case: value
  to a list of atomic strings plus a body*)
type 'a A = 'a -> string;
type 'a T = 'a -> body;
type 'a V = 'a -> string list * body;


(* atomic values *)

val int_atom = signed_string_of_int;

fun bool_atom b = if b then "1" else "0";

fun unit_atom (_: unit) = "";


(* structural nodes *)

(*anonymous node: element named ":" without attributes*)
fun node ts = Elem ((":", []), ts);

(*attributes named by position: "0", "1", ...*)
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;

(*variant node: the element name is the index of the case, the atomic
  arguments become positional attributes*)
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);


(* representation of standard types *)

fun properties props = [Elem ((":", props), [])];

(*the empty string is represented by the empty body*)
fun string s = if s = "" then [] else [Text s];

fun int i = string (int_atom i);

fun bool b = string (bool_atom b);

fun unit u = string (unit_atom u);

fun pair f g (x, y) = [node (f x), node (g y)];

fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];

fun list f = map (node o f);

fun option f opt =
  (case opt of
    NONE => []
  | SOME x => [node (f x)]);

(*use the first case function that applies to x without raising an
  exception, tagged with its index; "the" fails if there is none*)
fun variant fs x = [fs |> get_index (fn f => try f x) |> the |> tagged];

end;
|
290 |
|
291 |
|
(*Decoding of XML bodies back into ML values.  Input of unexpected form
  is rejected with XML_ATOM (atom strings) or XML_BODY (trees).*)
structure Decode =
struct

(*atom: string to value; body: XML body to value; variant case: a list
  of atomic strings plus a body to value*)
type 'a A = string -> 'a;
type 'a T = body -> 'a;
type 'a V = string list * body -> 'a;


(* atomic values *)

fun int_atom s =
  (Markup.parse_int s handle Fail _ => raise XML_ATOM s);

fun bool_atom s =
  (case s of
    "0" => false
  | "1" => true
  | _ => raise XML_ATOM s);

fun unit_atom s =
  (case s of
    "" => ()
  | _ => raise XML_ATOM s);


(* structural nodes *)

(*body of an anonymous node: element named ":" without attributes*)
fun node t =
  (case t of
    Elem ((":", []), ts) => ts
  | _ => raise XML_BODY [t]);

(*values of positional attributes; the names must be exactly the
  consecutive indices 0, 1, ... in this order*)
fun vector atts =
  let
    fun check _ [] = []
      | check i ((a, x) :: rest) =
          if int_atom a = i then x :: check (i + 1) rest
          else raise XML_ATOM a;
  in check 0 atts end;

(*variant node: index of the case from the element name, atomic
  arguments from the attributes, remaining body*)
fun tagged t =
  (case t of
    Elem ((name, atts), ts) => (int_atom name, (vector atts, ts))
  | _ => raise XML_BODY [t]);


(* representation of standard types *)

fun properties ts =
  (case ts of
    [Elem ((":", props), [])] => props
  | _ => raise XML_BODY ts);

(*the empty body represents the empty string*)
fun string ts =
  (case ts of
    [] => ""
  | [Text s] => s
  | _ => raise XML_BODY ts);

fun int ts = int_atom (string ts);

fun bool ts = bool_atom (string ts);

fun unit ts = unit_atom (string ts);

fun pair f g ts =
  (case ts of
    [t1, t2] => (f (node t1), g (node t2))
  | _ => raise XML_BODY ts);

fun triple f g h ts =
  (case ts of
    [t1, t2, t3] => (f (node t1), g (node t2), h (node t3))
  | _ => raise XML_BODY ts);

fun list f = map (f o node);

fun option f ts =
  (case ts of
    [] => NONE
  | [t] => SOME (f (node t))
  | _ => raise XML_BODY ts);

(*select the case function by the index stored in the node; an index
  outside of fs is reported as XML_BODY*)
fun variant fs ts =
  (case ts of
    [t] =>
      let
        val (tag, args) = tagged t;
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
      in f args end
  | _ => raise XML_BODY ts);

end;
|
362 |
|
363 end; |