| author | bulwahn | 
| Wed, 15 Dec 2010 17:46:45 +0100 | |
| changeset 41176 | ede546773fd6 | 
| parent 32960 | 69916a850301 | 
| child 41491 | a2ad5b824051 | 
| permissions | -rw-r--r-- | 
| 24584 | 1  | 
(* Title: HOL/Import/xml.ML  | 
| 19064 | 2  | 
|
| 26537 | 3  | 
Yet another version of XML support.  | 
| 19064 | 4  | 
*)  | 
5  | 
||
6  | 
signature XML =  | 
|
7  | 
sig  | 
|
8  | 
val header: string  | 
|
9  | 
val text: string -> string  | 
|
10  | 
val text_charref: string -> string  | 
|
11  | 
val cdata: string -> string  | 
|
12  | 
val element: string -> (string * string) list -> string list -> string  | 
|
| 19089 | 13  | 
|
| 19064 | 14  | 
datatype tree =  | 
15  | 
Elem of string * (string * string) list * tree list  | 
|
16  | 
| Text of string  | 
|
| 19089 | 17  | 
|
| 19064 | 18  | 
val string_of_tree: tree -> string  | 
19  | 
val tree_of_string: string -> tree  | 
|
| 19089 | 20  | 
|
21  | 
val encoded_string_of_tree : tree -> string  | 
|
22  | 
val tree_of_encoded_string : string -> tree  | 
|
| 19064 | 23  | 
end;  | 
24  | 
||
| 19089 | 25  | 
structure XML :> XML =  | 
| 19064 | 26  | 
struct  | 
27  | 
||
| 19093 | 28  | 
(*structure Seq = VectorScannerSeq  | 
29  | 
structure Scan = Scanner (structure Seq = Seq)*)  | 
|
30  | 
structure Seq = StringScannerSeq  | 
|
31  | 
structure Scan = StringScanner  | 
|
32  | 
||
| 19095 | 33  | 
|
| 19064 | 34  | 
open Scan  | 
35  | 
||
36  | 
(** string based representation (small scale) **)  | 
|
37  | 
||
38  | 
val header = "<?xml version=\"1.0\"?>\n";  | 
|
39  | 
||
40  | 
(* text and character data *)  | 
|
41  | 
||
42  | 
fun decode "<" = "<"  | 
|
43  | 
| decode ">" = ">"  | 
|
44  | 
| decode "&" = "&"  | 
|
45  | 
| decode "'" = "'"  | 
|
46  | 
| decode """ = "\""  | 
|
47  | 
| decode c = c;  | 
|
48  | 
||
49  | 
fun encode "<" = "<"  | 
|
50  | 
| encode ">" = ">"  | 
|
51  | 
| encode "&" = "&"  | 
|
52  | 
| encode "'" = "'"  | 
|
53  | 
| encode "\"" = """  | 
|
54  | 
| encode c = c;  | 
|
55  | 
||
56  | 
fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";"  | 
|
57  | 
||
58  | 
val text = Library.translate_string encode  | 
|
59  | 
||
60  | 
val text_charref = translate_string encode_charref;  | 
|
61  | 
||
62  | 
val cdata = enclose "<![CDATA[" "]]>\n"  | 
|
63  | 
||
64  | 
(* elements *)  | 
|
65  | 
||
66  | 
fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";  | 
|
67  | 
||
68  | 
fun element name atts cs =  | 
|
69  | 
let val elem = space_implode " " (name :: map attribute atts) in  | 
|
70  | 
if null cs then enclose "<" "/>" elem  | 
|
71  | 
else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name  | 
|
72  | 
end;  | 
|
73  | 
||
74  | 
(** explicit XML trees **)  | 
|
75  | 
||
76  | 
datatype tree =  | 
|
77  | 
Elem of string * (string * string) list * tree list  | 
|
78  | 
| Text of string;  | 
|
79  | 
||
80  | 
fun string_of_tree tree =  | 
|
81  | 
let  | 
|
82  | 
fun string_of (Elem (name, atts, ts)) buf =  | 
|
83  | 
let val buf' =  | 
|
84  | 
buf |> Buffer.add "<"  | 
|
85  | 
|> fold Buffer.add (separate " " (name :: map attribute atts))  | 
|
86  | 
in  | 
|
87  | 
if null ts then  | 
|
88  | 
buf' |> Buffer.add "/>"  | 
|
89  | 
else  | 
|
90  | 
buf' |> Buffer.add ">"  | 
|
91  | 
|> fold string_of ts  | 
|
92  | 
|> Buffer.add "</" |> Buffer.add name |> Buffer.add ">"  | 
|
93  | 
end  | 
|
94  | 
| string_of (Text s) buf = Buffer.add (text s) buf;  | 
|
95  | 
in Buffer.content (string_of tree Buffer.empty) end;  | 
|
96  | 
||
97  | 
(** XML parsing **)  | 
|
98  | 
||
| 19089 | 99  | 
fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n)  | 
| 19064 | 100  | 
|
101  | 
fun err s xs =  | 
|
102  | 
"XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ;  | 
|
103  | 
||
104  | 
val scan_whspc = Scan.any Symbol.is_blank;  | 
|
105  | 
||
106  | 
val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode;  | 
|
107  | 
||
108  | 
val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
109  | 
(scan_special || Scan.one Symbol.is_regular)) >> implode;  | 
| 19064 | 110  | 
|
111  | 
val parse_cdata = Scan.this_string "<![CDATA[" |--  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
112  | 
(Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >>  | 
| 19064 | 113  | 
implode) --| Scan.this_string "]]>";  | 
114  | 
||
115  | 
val parse_att =  | 
|
116  | 
scan_id --| scan_whspc --| $$ "=" --| scan_whspc --  | 
|
117  | 
(($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s)  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
118  | 
(scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd);  | 
| 19064 | 119  | 
|
120  | 
val parse_comment = Scan.this_string "<!--" --  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
121  | 
Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) --  | 
| 19064 | 122  | 
Scan.this_string "-->";  | 
123  | 
||
124  | 
val scan_comment_whspc =  | 
|
125  | 
(scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));  | 
|
126  | 
||
127  | 
val parse_pi = Scan.this_string "<?" |--  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
128  | 
Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --|  | 
| 19064 | 129  | 
Scan.this_string "?>";  | 
130  | 
||
131  | 
fun parse_content xs =  | 
|
132  | 
((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --  | 
|
133  | 
(Scan.repeat ((* scan_whspc |-- *)  | 
|
134  | 
( parse_elem >> single  | 
|
135  | 
|| parse_cdata >> (single o Text)  | 
|
136  | 
|| parse_pi >> K []  | 
|
137  | 
|| parse_comment >> K []) --  | 
|
138  | 
Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []  | 
|
| 32952 | 139  | 
>> op @) >> flat) >> op @)(* --| scan_whspc*)) xs  | 
| 19064 | 140  | 
|
141  | 
and parse_elem xs =  | 
|
142  | 
($$ "<" |-- scan_id --  | 
|
143  | 
Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>  | 
|
144  | 
!! (err "Expected > or />")  | 
|
145  | 
(Scan.this_string "/>" >> K []  | 
|
146  | 
|| $$ ">" |-- parse_content --|  | 
|
147  | 
            !! (err ("Expected </" ^ s ^ ">"))
 | 
|
148  | 
              (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | 
|
149  | 
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs;  | 
|
150  | 
||
151  | 
val parse_document =  | 
|
152  | 
Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--  | 
|
153  | 
(Scan.repeat (Scan.unless ($$ ">")  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
19095 
diff
changeset
 | 
154  | 
(Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) --  | 
| 19064 | 155  | 
parse_elem;  | 
156  | 
||
157  | 
fun tree_of_string s =  | 
|
158  | 
let  | 
|
| 
32960
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
159  | 
val seq = Seq.fromString s  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
160  | 
val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc)  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
161  | 
val (x, toks) = scanner seq  | 
| 19064 | 162  | 
in  | 
| 
32960
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
163  | 
        if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 19064 | 164  | 
end  | 
| 19089 | 165  | 
|
166  | 
(* More efficient saving and loading of xml trees employing a proprietary external format *)  | 
|
167  | 
||
168  | 
fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s  | 
|
169  | 
fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks  | 
|
170  | 
||
171  | 
fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l  | 
|
172  | 
fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd  | 
|
173  | 
||
174  | 
fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s  | 
|
175  | 
| write_tree (Elem (name, attrs, children)) buf =  | 
|
176  | 
buf |> Buffer.add "E"  | 
|
| 
32960
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
177  | 
|> write_lstring name  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
178  | 
|> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
179  | 
|> write_list write_tree children  | 
| 19089 | 180  | 
|
181  | 
fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks  | 
|
182  | 
and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks  | 
|
183  | 
||
184  | 
fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty)  | 
|
185  | 
||
186  | 
fun tree_of_encoded_string s =  | 
|
187  | 
let  | 
|
| 
32960
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
188  | 
fun print (a,b) = writeln (a^" "^(string_of_int b))  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
189  | 
        val _ = print ("length of encoded string: ", size s)
 | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
190  | 
val _ = writeln "Seq.fromString..."  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
191  | 
val seq = timeit (fn () => Seq.fromString s)  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
192  | 
        val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
 | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
193  | 
val scanner = !! (err "Malformed encoded xml") parse_tree  | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
194  | 
val (x, toks) = scanner seq  | 
| 19089 | 195  | 
in  | 
| 
32960
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
196  | 
        if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 
 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 
wenzelm 
parents: 
32952 
diff
changeset
 | 
197  | 
end  | 
| 19089 | 198  | 
|
| 19064 | 199  | 
end;  |