author | aspinall |
Fri, 07 May 2004 13:40:24 +0200 | |
changeset 14713 | 6d203f6f0e8d |
parent 14596 | c36e116b578b |
child 14714 | 38ff9c8a7de0 |
permissions | -rw-r--r-- |
12416 | 1 |
(*  Title:      Pure/General/xml.ML
    ID:         $Id$
    Author:     Markus Wenzel, LMU Muenchen
                Stefan Berghofer, TU Muenchen
    License:    GPL (GNU GENERAL PUBLIC LICENSE)

Basic support for XML input and output.

FIXME da: missing input raises FAIL (scan.ML), should give error message.
*)
11 |
||
12 |
(* Interface of the XML module: a tree datatype, string-level output
   helpers, and scanners that read XML from a list of symbols. *)
signature XML =
sig
  (* XML tree: either an element carrying its name, a list of
     attribute name/value pairs and its children, or a text node. *)
  datatype tree =
      Elem of string * (string * string) list * tree list
    | Text of string
  (* element name atts cs: render one element; the children cs are
     already-rendered strings and are concatenated unchanged. *)
  val element: string -> (string * string) list -> string list -> string
  (* Escape the five XML special characters in a string. *)
  val text: string -> string
  (* Wrap a string in a CDATA section; the argument is not escaped. *)
  val cdata: string -> string
  (* The XML declaration line, including a trailing newline. *)
  val header: string
  (* Render a tree as a string. *)
  val string_of_tree: tree -> string
  (* Parse exactly one element (surrounding whitespace allowed) from a
     string; calls error on unprocessed trailing input. *)
  val tree_of_string: string -> tree
  (* Scanners over exploded input: each returns the parsed result
     together with the remaining input. *)
  val parse_content: string list -> tree list * string list
  val parse_elem: string list -> tree * string list
  (* Optional DOCTYPE declaration text followed by the root element. *)
  val parse_document: string list -> (string option * tree) * string list
end;
27 |
||
28 |
(* Implementation of the XML signature: escaping of character data,
   string rendering of elements and trees, and a scanner-combinator
   parser.  Relies on Scan, Symbol and the Library helpers
   (space_implode, enclose, implode, explode, take, single, flat, K,
   error) being in scope. *)
structure XML: XML =
struct

(* character data *)

(* Map a one-character string to its XML entity; any other string is
   returned unchanged. *)
fun encode "<" = "&lt;"
  | encode ">" = "&gt;"
  | encode "&" = "&amp;"
  | encode "'" = "&apos;"
  | encode "\"" = "&quot;"
  | encode c = c;

(* Inverse of encode: map one of the five predefined entities to its
   character.  Anything else, including unknown entities, is returned
   unchanged. *)
fun decode "&lt;" = "<"
  | decode "&gt;" = ">"
  | decode "&amp;" = "&"
  | decode "&apos;" = "'"
  | decode "&quot;" = "\""
  | decode c = c;

(* Escape every character of a string via encode. *)
val text = String.translate (encode o String.str);

val cdata_open = "<![CDATA["
val cdata_close = "]]>"

(* Wrap s in a CDATA section.  s is inserted verbatim, so an
   occurrence of the closing delimiter inside s is not handled. *)
fun cdata s = cdata_open ^ s ^ cdata_close;


(* elements *)

datatype tree =
    Elem of string * (string * string) list * tree list
  | Text of string;

(* Render one attribute as: name = "escaped value" *)
fun attribute (a, x) = a ^ " = " ^ "\"" ^ text x ^ "\"";

(* Render an element from its name, attributes and already-rendered
   children.  With no children the self-closing form is produced. *)
fun element name atts cs =
  let val elem = space_implode " " (name :: map attribute atts) in
    if null cs then enclose "<" "/>" elem
    else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name
  end;

(* Render a tree.  Text nodes are emitted as they are, without
   escaping; attribute values are escaped by element/attribute. *)
fun string_of_tree (Elem (name, atts, ts)) =
      element name atts (map string_of_tree ts)
  | string_of_tree (Text s) = s

val header = "<?xml version=\"1.0\"?>\n";


(* parser *)

(* Build an error message from a description and at most the first
   100 symbols of the remaining input. *)
fun err s (xs, _) = "XML parsing error: " ^ s ^ "\nfound:\n" ^
  implode (take (100, xs));

(* Zero or more whitespace symbols: space, newline or tab. *)
val scan_whspc = Scan.repeat ($$ " " || $$ "\n" || $$ "\t");

(* Scanner for one fixed keyword, built from a single-entry lexicon. *)
val literal = Scan.literal o Scan.make_lexicon o single o explode;

(* Entity reference: "&", an identifier and ";" are joined into one
   string and passed through decode. *)
val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;

(* Non-empty run of character data, with entity references decoded.
   The run stops before optional whitespace followed by "<". *)
val parse_chars = Scan.repeat1 (Scan.unless (scan_whspc -- $$ "<")
  (scan_special || Scan.one Symbol.not_eof)) >> implode;

(* CDATA section: yields the text between the delimiters as is. *)
val parse_cdata = literal "<![CDATA[" |--
  (Scan.repeat (Scan.unless (literal "]]>") (Scan.one Symbol.not_eof)) >>
    implode) --| literal "]]>";

(* Attribute: identifier, "=", then a double-quoted value with entity
   references decoded.  Only double quotes are accepted as delimiter. *)
val parse_att =
  Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc --| $$ "\"" --
  (Scan.repeat (Scan.unless ($$ "\"")
    (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ "\"";

(* Comment: everything from the opening to the closing marker. *)
val parse_comment = literal "<!--" --
  Scan.repeat (Scan.unless (literal "-->") (Scan.one Symbol.not_eof)) --
  literal "-->";

(* Processing instruction: everything between "<?" and "?>". *)
val parse_pi = literal "<?" |--
  Scan.repeat (Scan.unless (literal "?>") (Scan.one Symbol.not_eof)) --|
  literal "?>";

(* Element content: optional leading character data, then any number
   of items (element, CDATA section, processing instruction or
   comment), each optionally followed by character data.  Processing
   instructions and comments contribute nothing to the result. *)
fun parse_content xs =
  ((Scan.optional (scan_whspc |-- parse_chars >> (single o Text)) [] --
    (Scan.repeat (scan_whspc |--
       (   parse_elem >> single
        || parse_cdata >> (single o Text)
        || parse_pi >> K []
        || parse_comment >> K []) --
       Scan.optional (scan_whspc |-- parse_chars >> (single o Text)) []
         >> op @) >> flat) >> op @) --| scan_whspc) xs

(* Element: "<", name and attributes, then either "/>" (no children)
   or ">", content and the closing tag for the same name.  The err
   messages are attached via !! at the points where one of these
   alternatives is expected. *)
and parse_elem xs =
  ($$ "<" |-- Symbol.scan_id --
    Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
      !! (err "Expected > or />")
        (   literal "/>" >> K []
         || $$ ">" |-- parse_content --|
            !! (err ("Expected </" ^ s ^ ">"))
              (literal ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
    (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;

(* Document: optional DOCTYPE declaration (its text up to the first
   ">" is returned as a string), followed by the root element. *)
val parse_document =
  Scan.option (literal "<!DOCTYPE" -- scan_whspc |--
    (Scan.repeat (Scan.unless ($$ ">")
      (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) --
  parse_elem;

(* Parse a string holding exactly one element, optionally surrounded
   by whitespace.  Leftover input is reported through error, quoting
   at most its first 100 symbols. *)
fun tree_of_string s =
  (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
      (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of
    (x, []) => x
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^
      implode (take (100, ys))));

end;