author | haftmann |
Fri, 11 Jun 2010 17:14:02 +0200 | |
changeset 37407 | 61dd8c145da7 |
parent 32960 | 69916a850301 |
child 41491 | a2ad5b824051 |
permissions | -rw-r--r-- |
24584 | 1 |
(* Title: HOL/Import/xml.ML |
19064 | 2 |
|
26537 | 3 |
Yet another version of XML support. |
19064 | 4 |
*) |
5 |
||
6 |
signature XML = |
|
7 |
sig |
|
8 |
val header: string |
|
9 |
val text: string -> string |
|
10 |
val text_charref: string -> string |
|
11 |
val cdata: string -> string |
|
12 |
val element: string -> (string * string) list -> string list -> string |
|
19089 | 13 |
|
19064 | 14 |
datatype tree = |
15 |
Elem of string * (string * string) list * tree list |
|
16 |
| Text of string |
|
19089 | 17 |
|
19064 | 18 |
val string_of_tree: tree -> string |
19 |
val tree_of_string: string -> tree |
|
19089 | 20 |
|
21 |
val encoded_string_of_tree : tree -> string |
|
22 |
val tree_of_encoded_string : string -> tree |
|
19064 | 23 |
end; |
24 |
||
19089 | 25 |
structure XML :> XML = |
19064 | 26 |
struct |
27 |
||
19093 | 28 |
(*structure Seq = VectorScannerSeq |
29 |
structure Scan = Scanner (structure Seq = Seq)*) |
|
30 |
structure Seq = StringScannerSeq |
|
31 |
structure Scan = StringScanner |
|
32 |
||
19095 | 33 |
|
19064 | 34 |
open Scan |
35 |
||
36 |
(** string based representation (small scale) **) |
|
37 |
||
38 |
val header = "<?xml version=\"1.0\"?>\n"; |
|
39 |
||
40 |
(* text and character data *) |
|
41 |
||
42 |
fun decode "<" = "<" |
|
43 |
| decode ">" = ">" |
|
44 |
| decode "&" = "&" |
|
45 |
| decode "'" = "'" |
|
46 |
| decode """ = "\"" |
|
47 |
| decode c = c; |
|
48 |
||
49 |
fun encode "<" = "<" |
|
50 |
| encode ">" = ">" |
|
51 |
| encode "&" = "&" |
|
52 |
| encode "'" = "'" |
|
53 |
| encode "\"" = """ |
|
54 |
| encode c = c; |
|
55 |
||
56 |
fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" |
|
57 |
||
58 |
val text = Library.translate_string encode |
|
59 |
||
60 |
val text_charref = translate_string encode_charref; |
|
61 |
||
62 |
val cdata = enclose "<![CDATA[" "]]>\n" |
|
63 |
||
64 |
(* elements *) |
|
65 |
||
66 |
fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; |
|
67 |
||
68 |
fun element name atts cs = |
|
69 |
let val elem = space_implode " " (name :: map attribute atts) in |
|
70 |
if null cs then enclose "<" "/>" elem |
|
71 |
else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name |
|
72 |
end; |
|
73 |
||
74 |
(** explicit XML trees **) |
|
75 |
||
76 |
datatype tree = |
|
77 |
Elem of string * (string * string) list * tree list |
|
78 |
| Text of string; |
|
79 |
||
80 |
fun string_of_tree tree = |
|
81 |
let |
|
82 |
fun string_of (Elem (name, atts, ts)) buf = |
|
83 |
let val buf' = |
|
84 |
buf |> Buffer.add "<" |
|
85 |
|> fold Buffer.add (separate " " (name :: map attribute atts)) |
|
86 |
in |
|
87 |
if null ts then |
|
88 |
buf' |> Buffer.add "/>" |
|
89 |
else |
|
90 |
buf' |> Buffer.add ">" |
|
91 |
|> fold string_of ts |
|
92 |
|> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" |
|
93 |
end |
|
94 |
| string_of (Text s) buf = Buffer.add (text s) buf; |
|
95 |
in Buffer.content (string_of tree Buffer.empty) end; |
|
96 |
||
97 |
(** XML parsing **) |
|
98 |
||
19089 | 99 |
fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n) |
19064 | 100 |
|
101 |
fun err s xs = |
|
102 |
"XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ; |
|
103 |
||
104 |
val scan_whspc = Scan.any Symbol.is_blank; |
|
105 |
||
106 |
val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode; |
|
107 |
||
108 |
val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
109 |
(scan_special || Scan.one Symbol.is_regular)) >> implode; |
19064 | 110 |
|
111 |
val parse_cdata = Scan.this_string "<![CDATA[" |-- |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
112 |
(Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >> |
19064 | 113 |
implode) --| Scan.this_string "]]>"; |
114 |
||
115 |
val parse_att = |
|
116 |
scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- |
|
117 |
(($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
118 |
(scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd); |
19064 | 119 |
|
120 |
val parse_comment = Scan.this_string "<!--" -- |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
121 |
Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) -- |
19064 | 122 |
Scan.this_string "-->"; |
123 |
||
124 |
val scan_comment_whspc = |
|
125 |
(scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); |
|
126 |
||
127 |
val parse_pi = Scan.this_string "<?" |-- |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
128 |
Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --| |
19064 | 129 |
Scan.this_string "?>"; |
130 |
||
131 |
fun parse_content xs = |
|
132 |
((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- |
|
133 |
(Scan.repeat ((* scan_whspc |-- *) |
|
134 |
( parse_elem >> single |
|
135 |
|| parse_cdata >> (single o Text) |
|
136 |
|| parse_pi >> K [] |
|
137 |
|| parse_comment >> K []) -- |
|
138 |
Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] |
|
32952 | 139 |
>> op @) >> flat) >> op @)(* --| scan_whspc*)) xs |
19064 | 140 |
|
141 |
and parse_elem xs = |
|
142 |
($$ "<" |-- scan_id -- |
|
143 |
Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => |
|
144 |
!! (err "Expected > or />") |
|
145 |
(Scan.this_string "/>" >> K [] |
|
146 |
|| $$ ">" |-- parse_content --| |
|
147 |
!! (err ("Expected </" ^ s ^ ">")) |
|
148 |
(Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >> |
|
149 |
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs; |
|
150 |
||
151 |
val parse_document = |
|
152 |
Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- |
|
153 |
(Scan.repeat (Scan.unless ($$ ">") |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
19095
diff
changeset
|
154 |
(Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) -- |
19064 | 155 |
parse_elem; |
156 |
||
157 |
fun tree_of_string s = |
|
158 |
let |
|
32960
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
159 |
val seq = Seq.fromString s |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
160 |
val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc) |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
161 |
val (x, toks) = scanner seq |
19064 | 162 |
in |
32960
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
163 |
if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'") |
19064 | 164 |
end |
19089 | 165 |
|
166 |
(* More efficient saving and loading of xml trees employing a proprietary external format *) |
|
167 |
||
168 |
fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s |
|
169 |
fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks |
|
170 |
||
171 |
fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l |
|
172 |
fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd |
|
173 |
||
174 |
fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s |
|
175 |
| write_tree (Elem (name, attrs, children)) buf = |
|
176 |
buf |> Buffer.add "E" |
|
32960
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
177 |
|> write_lstring name |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
178 |
|> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
179 |
|> write_list write_tree children |
19089 | 180 |
|
181 |
fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks |
|
182 |
and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks |
|
183 |
||
184 |
fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty) |
|
185 |
||
186 |
fun tree_of_encoded_string s = |
|
187 |
let |
|
32960
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
188 |
fun print (a,b) = writeln (a^" "^(string_of_int b)) |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
189 |
val _ = print ("length of encoded string: ", size s) |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
190 |
val _ = writeln "Seq.fromString..." |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
191 |
val seq = timeit (fn () => Seq.fromString s) |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
192 |
val _ = print ("length of sequence", timeit (fn () => Seq.length seq)) |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
193 |
val scanner = !! (err "Malformed encoded xml") parse_tree |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
194 |
val (x, toks) = scanner seq |
19089 | 195 |
in |
32960
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
196 |
if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'") |
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
wenzelm
parents:
32952
diff
changeset
|
197 |
end |
19089 | 198 |
|
19064 | 199 |
end; |