author | wenzelm |
Thu, 01 Feb 2007 20:40:17 +0100 | |
changeset 22227 | 7f88a6848fc2 |
parent 22207 | 4bc4a930a8bc |
child 22334 | 4c96d3370186 |
permissions | -rw-r--r-- |
12416 | 1 |
(* Title: Pure/General/xml.ML |
2 |
ID: $Id$ |
|
14728 | 3 |
Author: David Aspinall, Stefan Berghofer and Markus Wenzel |
12416 | 4 |
|
14969 | 5 |
Basic support for XML. |
12416 | 6 |
*) |
7 |
||
8 |
signature XML = |
|
9 |
sig |
|
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
10 |
(* string functions *) |
14969 | 11 |
val header: string |
12 |
val text: string -> string |
|
15211
5f54721547a7
Add text_charref to encode a string using character references
aspinall
parents:
15207
diff
changeset
|
13 |
val text_charref: string -> string |
14969 | 14 |
val cdata: string -> string |
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
15 |
type attributes = (string * string) list |
21628
7ab9ad8d6757
Type abbreviations for element args and attributes
aspinall
parents:
19482
diff
changeset
|
16 |
val element: string -> attributes -> string list -> string |
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
17 |
(* tree functions *) |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
18 |
datatype tree = |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
19 |
Elem of string * attributes * tree list |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
20 |
| Text of string (* native string, subject to XML.text on output *) |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
21 |
| Rawtext of string (* XML string: not parsed and output directly *) |
21654 | 22 |
type content = tree list |
23 |
type element = string * attributes * content |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
24 |
val string_of_tree: tree -> string |
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
25 |
val buffer_of_tree: tree -> Buffer.T |
21636
88b815dca68d
Add parse_string for attribute values and other string content
aspinall
parents:
21629
diff
changeset
|
26 |
val parse_string : string -> string option |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
27 |
val parse_content: string list -> tree list * string list |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
28 |
val parse_elem: string list -> tree * string list |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
29 |
val parse_document: string list -> (string option * tree) * string list |
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
30 |
val tree_of_string: string -> tree |
15142
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
aspinall
parents:
15010
diff
changeset
|
31 |
val scan_comment_whspc : string list -> unit * string list |
12416 | 32 |
end; |
33 |
||
34 |
structure XML: XML = |
|
35 |
struct |
|
36 |
||
14969 | 37 |
(** string based representation (small scale) **) |
38 |
||
39 |
val header = "<?xml version=\"1.0\"?>\n"; |
|
40 |
||
41 |
||
42 |
(* text and character data *) |
|
43 |
||
44 |
fun decode "<" = "<" |
|
45 |
| decode ">" = ">" |
|
46 |
| decode "&" = "&" |
|
47 |
| decode "'" = "'" |
|
48 |
| decode """ = "\"" |
|
49 |
| decode c = c; |
|
12416 | 50 |
|
51 |
fun encode "<" = "<" |
|
52 |
| encode ">" = ">" |
|
53 |
| encode "&" = "&" |
|
54 |
| encode "'" = "'" |
|
55 |
| encode "\"" = """ |
|
56 |
| encode c = c; |
|
57 |
||
17756 | 58 |
fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" |
15211
5f54721547a7
Add text_charref to encode a string using character references
aspinall
parents:
15207
diff
changeset
|
59 |
|
5f54721547a7
Add text_charref to encode a string using character references
aspinall
parents:
15207
diff
changeset
|
60 |
val text = Library.translate_string encode |
5f54721547a7
Add text_charref to encode a string using character references
aspinall
parents:
15207
diff
changeset
|
61 |
|
17756 | 62 |
val text_charref = translate_string encode_charref; |
12416 | 63 |
|
15207 | 64 |
val cdata = enclose "<![CDATA[" "]]>\n" |
12416 | 65 |
|
14728 | 66 |
|
12416 | 67 |
(* elements *) |
68 |
||
14728 | 69 |
fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; |
12416 | 70 |
|
71 |
fun element name atts cs = |
|
72 |
let val elem = space_implode " " (name :: map attribute atts) in |
|
73 |
if null cs then enclose "<" "/>" elem |
|
74 |
else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name |
|
75 |
end; |
|
76 |
||
77 |
||
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
78 |
|
15207 | 79 |
|
14969 | 80 |
(** explicit XML trees **) |
81 |
||
21628
7ab9ad8d6757
Type abbreviations for element args and attributes
aspinall
parents:
19482
diff
changeset
|
82 |
type attributes = (string * string) list |
7ab9ad8d6757
Type abbreviations for element args and attributes
aspinall
parents:
19482
diff
changeset
|
83 |
|
14969 | 84 |
datatype tree = |
21628
7ab9ad8d6757
Type abbreviations for element args and attributes
aspinall
parents:
19482
diff
changeset
|
85 |
Elem of string * attributes * tree list |
22227 | 86 |
| Text of string |
87 |
| Rawtext of string; |
|
14969 | 88 |
|
21654 | 89 |
type content = tree list |
90 |
||
91 |
type element = string * attributes * content |
|
21628
7ab9ad8d6757
Type abbreviations for element args and attributes
aspinall
parents:
19482
diff
changeset
|
92 |
|
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
93 |
fun buffer_of_tree tree = |
14969 | 94 |
let |
95 |
fun string_of (Elem (name, atts, ts)) buf = |
|
96 |
let val buf' = |
|
97 |
buf |> Buffer.add "<" |
|
98 |
|> fold Buffer.add (separate " " (name :: map attribute atts)) |
|
99 |
in |
|
100 |
if null ts then |
|
101 |
buf' |> Buffer.add "/>" |
|
102 |
else |
|
103 |
buf' |> Buffer.add ">" |
|
104 |
|> fold string_of ts |
|
105 |
|> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" |
|
106 |
end |
|
21629
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
107 |
| string_of (Text s) buf = Buffer.add (text s) buf |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
108 |
| string_of (Rawtext s) buf = Buffer.add s buf; |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
109 |
in string_of tree Buffer.empty end; |
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
110 |
|
c762bdc2b504
Add string buffer getter. Add Rawtext constructor to allow XML-escaped strings in tree.
aspinall
parents:
21628
diff
changeset
|
111 |
val string_of_tree = Buffer.content o buffer_of_tree; |
14969 | 112 |
|
113 |
||
114 |
||
115 |
(** XML parsing **) |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
116 |
|
14728 | 117 |
fun err s (xs, _) = |
118 |
"XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
119 |
|
21858
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
21654
diff
changeset
|
120 |
val scan_whspc = Scan.many Symbol.is_blank; |
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
121 |
|
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
122 |
val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; |
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
123 |
|
15216
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
aspinall
parents:
15211
diff
changeset
|
124 |
val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
125 |
(scan_special || Scan.one Symbol.not_eof)) >> implode; |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
126 |
|
21636
88b815dca68d
Add parse_string for attribute values and other string content
aspinall
parents:
21629
diff
changeset
|
127 |
val parse_string = Scan.read Symbol.stopper parse_chars o explode; |
88b815dca68d
Add parse_string for attribute values and other string content
aspinall
parents:
21629
diff
changeset
|
128 |
|
14910 | 129 |
val parse_cdata = Scan.this_string "<![CDATA[" |-- |
130 |
(Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >> |
|
131 |
implode) --| Scan.this_string "]]>"; |
|
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
132 |
|
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
133 |
val parse_att = |
14865 | 134 |
Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- |
135 |
(($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) |
|
136 |
(scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd); |
|
14863 | 137 |
|
14910 | 138 |
val parse_comment = Scan.this_string "<!--" -- |
139 |
Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) -- |
|
140 |
Scan.this_string "-->"; |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
141 |
|
22227 | 142 |
val scan_comment_whspc = |
143 |
(scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); |
|
15142
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
aspinall
parents:
15010
diff
changeset
|
144 |
|
14910 | 145 |
val parse_pi = Scan.this_string "<?" |-- |
146 |
Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --| |
|
147 |
Scan.this_string "?>"; |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
148 |
|
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
149 |
fun parse_content xs = |
15216
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
aspinall
parents:
15211
diff
changeset
|
150 |
((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- |
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
aspinall
parents:
15211
diff
changeset
|
151 |
(Scan.repeat ((* scan_whspc |-- *) |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
152 |
( parse_elem >> single |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
153 |
|| parse_cdata >> (single o Text) |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
154 |
|| parse_pi >> K [] |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
155 |
|| parse_comment >> K []) -- |
15216
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
aspinall
parents:
15211
diff
changeset
|
156 |
Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] |
19482
9f11af8f7ef9
tuned basic list operators (flat, maps, map_filter);
wenzelm
parents:
17756
diff
changeset
|
157 |
>> op @) >> flat) >> op @)(* --| scan_whspc*)) xs |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
158 |
|
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
159 |
and parse_elem xs = |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
160 |
($$ "<" |-- Symbol.scan_id -- |
14865 | 161 |
Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
162 |
!! (err "Expected > or />") |
14910 | 163 |
(Scan.this_string "/>" >> K [] |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
164 |
|| $$ ">" |-- parse_content --| |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
165 |
!! (err ("Expected </" ^ s ^ ">")) |
14910 | 166 |
(Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >> |
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
167 |
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs; |
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
168 |
|
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
169 |
val parse_document = |
14910 | 170 |
Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
171 |
(Scan.repeat (Scan.unless ($$ ">") |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
172 |
(Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) -- |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
173 |
parse_elem; |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
174 |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
175 |
fun tree_of_string s = |
14185
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
176 |
(case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") |
9b3841638c06
Tried to make parser a bit more standard-conforming.
berghofe
parents:
13729
diff
changeset
|
177 |
(scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of |
14728 | 178 |
(x, []) => x |
179 |
| (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys)); |
|
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset
|
180 |
|
12416 | 181 |
end; |