| author | wenzelm | 
| Mon, 23 Mar 2009 22:56:03 +0100 | |
| changeset 30678 | 35d40d961ed2 | 
| parent 29606 | fedb8be05f24 | 
| child 31469 | 40f815edbde4 | 
| permissions | -rw-r--r-- | 
| 24584 | 1  | 
(* Title: Pure/General/xml.ML  | 
| 24264 | 2  | 
Author: David Aspinall, Stefan Berghofer and Markus Wenzel  | 
3  | 
||
4  | 
Basic support for XML.  | 
|
5  | 
*)  | 
|
6  | 
||
7  | 
signature XML =  | 
|
8  | 
sig  | 
|
| 28017 | 9  | 
type attributes = Properties.T  | 
| 24264 | 10  | 
datatype tree =  | 
11  | 
Elem of string * attributes * tree list  | 
|
12  | 
| Text of string  | 
|
| 26546 | 13  | 
val add_content: tree -> Buffer.T -> Buffer.T  | 
14  | 
val detect: string -> bool  | 
|
15  | 
val header: string  | 
|
16  | 
val text: string -> string  | 
|
17  | 
val element: string -> attributes -> string list -> string  | 
|
18  | 
val output_markup: Markup.T -> output * output  | 
|
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
19  | 
val string_of: tree -> string  | 
| 26546 | 20  | 
val output: tree -> TextIO.outstream -> unit  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
21  | 
val parse_comments: string list -> unit * string list  | 
| 24264 | 22  | 
val parse_string : string -> string option  | 
| 26546 | 23  | 
val parse_element: string list -> tree * string list  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
24  | 
val parse_document: string list -> tree * string list  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
25  | 
val parse: string -> tree  | 
| 24264 | 26  | 
end;  | 
27  | 
||
28  | 
structure XML: XML =  | 
|
29  | 
struct  | 
|
30  | 
||
| 26546 | 31  | 
(** XML trees **)  | 
32  | 
||
| 28017 | 33  | 
type attributes = Properties.T;  | 
| 26546 | 34  | 
|
35  | 
datatype tree =  | 
|
36  | 
Elem of string * attributes * tree list  | 
|
| 28033 | 37  | 
| Text of string;  | 
| 26546 | 38  | 
|
39  | 
fun add_content (Elem (_, _, ts)) = fold add_content ts  | 
|
| 28033 | 40  | 
| add_content (Text s) = Buffer.add s;  | 
| 26546 | 41  | 
|
42  | 
||
| 24264 | 43  | 
|
| 26525 | 44  | 
(** string representation **)  | 
45  | 
||
46  | 
val detect = String.isPrefix "<?xml";  | 
|
| 24264 | 47  | 
val header = "<?xml version=\"1.0\"?>\n";  | 
48  | 
||
49  | 
||
| 26546 | 50  | 
(* escaped text *)  | 
| 24264 | 51  | 
|
52  | 
fun decode "<" = "<"  | 
|
53  | 
| decode ">" = ">"  | 
|
54  | 
| decode "&" = "&"  | 
|
55  | 
| decode "'" = "'"  | 
|
56  | 
| decode """ = "\""  | 
|
57  | 
| decode c = c;  | 
|
58  | 
||
59  | 
fun encode "<" = "<"  | 
|
60  | 
| encode ">" = ">"  | 
|
61  | 
| encode "&" = "&"  | 
|
62  | 
| encode "'" = "'"  | 
|
63  | 
| encode "\"" = """  | 
|
64  | 
| encode c = c;  | 
|
65  | 
||
| 25838 | 66  | 
val text = translate_string encode;  | 
| 24264 | 67  | 
|
68  | 
||
69  | 
(* elements *)  | 
|
70  | 
||
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
71  | 
fun elem name atts =  | 
| 26551 | 72  | 
space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);  | 
| 24264 | 73  | 
|
| 26525 | 74  | 
fun element name atts body =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
75  | 
let val b = implode body in  | 
| 
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
76  | 
if b = "" then enclose "<" "/>" (elem name atts)  | 
| 
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
77  | 
else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name  | 
| 24264 | 78  | 
end;  | 
79  | 
||
| 27884 | 80  | 
fun output_markup (markup as (name, atts)) =  | 
| 29325 | 81  | 
if Markup.is_none markup then Markup.no_output  | 
| 27884 | 82  | 
else (enclose "<" ">" (elem name atts), enclose "</" ">" name);  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
83  | 
|
| 24264 | 84  | 
|
| 26546 | 85  | 
(* output *)  | 
| 24264 | 86  | 
|
| 26546 | 87  | 
fun buffer_of tree =  | 
| 24264 | 88  | 
let  | 
| 26546 | 89  | 
fun traverse (Elem (name, atts, [])) =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
90  | 
Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"  | 
| 26546 | 91  | 
| traverse (Elem (name, atts, ts)) =  | 
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
92  | 
Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>  | 
| 26546 | 93  | 
fold traverse ts #>  | 
| 26525 | 94  | 
Buffer.add "</" #> Buffer.add name #> Buffer.add ">"  | 
| 28033 | 95  | 
| traverse (Text s) = Buffer.add (text s);  | 
| 26546 | 96  | 
in Buffer.empty |> traverse tree end;  | 
| 24264 | 97  | 
|
| 26546 | 98  | 
val string_of = Buffer.content o buffer_of;  | 
99  | 
val output = Buffer.output o buffer_of;  | 
|
| 25838 | 100  | 
|
| 24264 | 101  | 
|
102  | 
||
| 26546 | 103  | 
(** XML parsing (slow) **)  | 
104  | 
||
105  | 
local  | 
|
| 24264 | 106  | 
|
107  | 
fun err s (xs, _) =  | 
|
108  | 
"XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);  | 
|
109  | 
||
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
110  | 
fun ignored _ = [];  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
111  | 
|
| 26551 | 112  | 
val blanks = Scan.many Symbol.is_blank;  | 
113  | 
val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;  | 
|
114  | 
val regular = Scan.one Symbol.is_regular;  | 
|
115  | 
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);  | 
|
| 24264 | 116  | 
|
| 26551 | 117  | 
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;  | 
| 24264 | 118  | 
|
| 26551 | 119  | 
val parse_cdata =  | 
120  | 
Scan.this_string "<![CDATA[" |--  | 
|
121  | 
(Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|  | 
|
122  | 
Scan.this_string "]]>";  | 
|
| 24264 | 123  | 
|
124  | 
val parse_att =  | 
|
| 26551 | 125  | 
(Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) --  | 
126  | 
(($$ "\"" || $$ "'") :|-- (fn s =>  | 
|
127  | 
(Scan.repeat (special || regular_except s) >> implode) --| $$ s));  | 
|
| 24264 | 128  | 
|
| 26551 | 129  | 
val parse_comment =  | 
130  | 
Scan.this_string "<!--" --  | 
|
131  | 
Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
132  | 
Scan.this_string "-->" >> ignored;  | 
| 24264 | 133  | 
|
| 26551 | 134  | 
val parse_processing_instruction =  | 
135  | 
Scan.this_string "<?" --  | 
|
136  | 
Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
137  | 
Scan.this_string "?>" >> ignored;  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
138  | 
|
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
139  | 
val parse_doctype =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
140  | 
Scan.this_string "<!DOCTYPE" --  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
141  | 
Scan.repeat (Scan.unless ($$ ">") regular) --  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
142  | 
$$ ">" >> ignored;  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
143  | 
|
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
144  | 
val parse_misc =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
145  | 
Scan.one Symbol.is_blank >> ignored ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
146  | 
parse_processing_instruction ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
147  | 
parse_comment;  | 
| 26551 | 148  | 
|
149  | 
val parse_optional_text =  | 
|
150  | 
Scan.optional (parse_chars >> (single o Text)) [];  | 
|
| 24264 | 151  | 
|
| 26546 | 152  | 
in  | 
153  | 
||
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
154  | 
val parse_comments =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
155  | 
blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
156  | 
|
| 26546 | 157  | 
val parse_string = Scan.read Symbol.stopper parse_chars o explode;  | 
158  | 
||
| 24264 | 159  | 
fun parse_content xs =  | 
| 26551 | 160  | 
(parse_optional_text @@@  | 
161  | 
(Scan.repeat  | 
|
162  | 
((parse_element >> single ||  | 
|
163  | 
parse_cdata >> (single o Text) ||  | 
|
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
164  | 
parse_processing_instruction ||  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
165  | 
parse_comment)  | 
| 26551 | 166  | 
@@@ parse_optional_text) >> flat)) xs  | 
| 24264 | 167  | 
|
| 26546 | 168  | 
and parse_element xs =  | 
| 24264 | 169  | 
($$ "<" |-- Symbol.scan_id --  | 
| 26551 | 170  | 
Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) =>  | 
| 24264 | 171  | 
!! (err "Expected > or />")  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
172  | 
(Scan.this_string "/>" >> ignored  | 
| 24264 | 173  | 
|| $$ ">" |-- parse_content --|  | 
174  | 
            !! (err ("Expected </" ^ s ^ ">"))
 | 
|
| 26551 | 175  | 
              (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >>
 | 
| 24264 | 176  | 
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs;  | 
177  | 
||
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
178  | 
val parse_document =  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
179  | 
(Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)  | 
| 
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
180  | 
|-- parse_element;  | 
| 24264 | 181  | 
|
| 
26539
 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 
wenzelm 
parents: 
26525 
diff
changeset
 | 
182  | 
fun parse s =  | 
| 24264 | 183  | 
(case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")  | 
| 
26984
 
d0e098e206f3
added parse_document (optional unchecked header material);
 
wenzelm 
parents: 
26554 
diff
changeset
 | 
184  | 
(blanks |-- parse_document --| blanks))) (explode s) of  | 
| 24264 | 185  | 
(x, []) => x  | 
186  | 
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | 
|
187  | 
||
188  | 
end;  | 
|
| 26546 | 189  | 
|
190  | 
end;  |