| author | aspinall | 
| Fri, 25 Mar 2005 14:14:01 +0100 | |
| changeset 15627 | 7a108ae4c798 | 
| parent 15570 | 8d8c70b41bab | 
| child 17756 | d4a35f82fbb4 | 
| permissions | -rw-r--r-- | 
| 12416 | 1  | 
(* Title: Pure/General/xml.ML  | 
2  | 
ID: $Id$  | 
|
| 14728 | 3  | 
Author: David Aspinall, Stefan Berghofer and Markus Wenzel  | 
| 12416 | 4  | 
|
| 14969 | 5  | 
Basic support for XML.  | 
| 12416 | 6  | 
*)  | 
7  | 
||
8  | 
signature XML =  | 
|
9  | 
sig  | 
|
| 14969 | 10  | 
val header: string  | 
11  | 
val text: string -> string  | 
|
| 
15211
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
12  | 
val text_charref: string -> string  | 
| 14969 | 13  | 
val cdata: string -> string  | 
14  | 
val element: string -> (string * string) list -> string list -> string  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
15  | 
datatype tree =  | 
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
16  | 
Elem of string * (string * string) list * tree list  | 
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
17  | 
| Text of string  | 
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
18  | 
val string_of_tree: tree -> string  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
19  | 
val parse_content: string list -> tree list * string list  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
20  | 
val parse_elem: string list -> tree * string list  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
21  | 
val parse_document: string list -> (string option * tree) * string list  | 
| 
15142
 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 
aspinall 
parents: 
15010 
diff
changeset
 | 
22  | 
val scan_comment_whspc : string list -> unit * string list  | 
| 14969 | 23  | 
val tree_of_string: string -> tree  | 
| 12416 | 24  | 
end;  | 
25  | 
||
26  | 
structure XML: XML =  | 
|
27  | 
struct  | 
|
28  | 
||
| 14969 | 29  | 
(** string based representation (small scale) **)  | 
30  | 
||
31  | 
val header = "<?xml version=\"1.0\"?>\n";  | 
|
32  | 
||
33  | 
||
34  | 
(* text and character data *)  | 
|
35  | 
||
36  | 
fun decode "<" = "<"  | 
|
37  | 
| decode ">" = ">"  | 
|
38  | 
| decode "&" = "&"  | 
|
39  | 
| decode "'" = "'"  | 
|
40  | 
| decode """ = "\""  | 
|
41  | 
| decode c = c;  | 
|
| 12416 | 42  | 
|
43  | 
fun encode "<" = "<"  | 
|
44  | 
| encode ">" = ">"  | 
|
45  | 
| encode "&" = "&"  | 
|
46  | 
| encode "'" = "'"  | 
|
47  | 
| encode "\"" = """  | 
|
48  | 
| encode c = c;  | 
|
49  | 
||
| 
15211
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
50  | 
fun encode_charref c = "&#"^Int.toString (Char.ord c)^ ";"  | 
| 
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
51  | 
|
| 
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
52  | 
val text = Library.translate_string encode  | 
| 
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
53  | 
|
| 
 
5f54721547a7
Add text_charref to encode a string using character references
 
aspinall 
parents: 
15207 
diff
changeset
 | 
54  | 
val text_charref = implode o (map encode_charref) o String.explode  | 
| 12416 | 55  | 
|
| 15207 | 56  | 
val cdata = enclose "<![CDATA[" "]]>\n"  | 
| 12416 | 57  | 
|
| 14728 | 58  | 
|
| 12416 | 59  | 
(* elements *)  | 
60  | 
||
| 14728 | 61  | 
fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";  | 
| 12416 | 62  | 
|
63  | 
fun element name atts cs =  | 
|
64  | 
let val elem = space_implode " " (name :: map attribute atts) in  | 
|
65  | 
if null cs then enclose "<" "/>" elem  | 
|
66  | 
else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name  | 
|
67  | 
end;  | 
|
68  | 
||
69  | 
||
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
70  | 
|
| 15207 | 71  | 
|
| 14969 | 72  | 
(** explicit XML trees **)  | 
73  | 
||
74  | 
datatype tree =  | 
|
75  | 
Elem of string * (string * string) list * tree list  | 
|
76  | 
| Text of string;  | 
|
77  | 
||
78  | 
fun string_of_tree tree =  | 
|
79  | 
let  | 
|
80  | 
fun string_of (Elem (name, atts, ts)) buf =  | 
|
81  | 
let val buf' =  | 
|
82  | 
buf |> Buffer.add "<"  | 
|
83  | 
|> fold Buffer.add (separate " " (name :: map attribute atts))  | 
|
84  | 
in  | 
|
85  | 
if null ts then  | 
|
86  | 
buf' |> Buffer.add "/>"  | 
|
87  | 
else  | 
|
88  | 
buf' |> Buffer.add ">"  | 
|
89  | 
|> fold string_of ts  | 
|
90  | 
|> Buffer.add "</" |> Buffer.add name |> Buffer.add ">"  | 
|
91  | 
end  | 
|
92  | 
| string_of (Text s) buf = Buffer.add (text s) buf;  | 
|
93  | 
in Buffer.content (string_of tree Buffer.empty) end;  | 
|
94  | 
||
95  | 
||
96  | 
||
97  | 
(** XML parsing **)  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
98  | 
|
| 14728 | 99  | 
fun err s (xs, _) =  | 
100  | 
"XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
101  | 
|
| 14728 | 102  | 
val scan_whspc = Scan.any Symbol.is_blank;  | 
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
103  | 
|
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
104  | 
val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;  | 
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
105  | 
|
| 
15216
 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 
aspinall 
parents: 
15211 
diff
changeset
 | 
106  | 
val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
107  | 
(scan_special || Scan.one Symbol.not_eof)) >> implode;  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
108  | 
|
| 14910 | 109  | 
val parse_cdata = Scan.this_string "<![CDATA[" |--  | 
110  | 
(Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >>  | 
|
111  | 
implode) --| Scan.this_string "]]>";  | 
|
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
112  | 
|
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
113  | 
val parse_att =  | 
| 14865 | 114  | 
Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc --  | 
115  | 
(($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s)  | 
|
116  | 
(scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd);  | 
|
| 14863 | 117  | 
|
| 14910 | 118  | 
val parse_comment = Scan.this_string "<!--" --  | 
119  | 
Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) --  | 
|
120  | 
Scan.this_string "-->";  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
121  | 
|
| 
15142
 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 
aspinall 
parents: 
15010 
diff
changeset
 | 
122  | 
val scan_comment_whspc =  | 
| 
 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 
aspinall 
parents: 
15010 
diff
changeset
 | 
123  | 
(scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));  | 
| 
 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 
aspinall 
parents: 
15010 
diff
changeset
 | 
124  | 
|
| 14910 | 125  | 
val parse_pi = Scan.this_string "<?" |--  | 
126  | 
Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --|  | 
|
127  | 
Scan.this_string "?>";  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
128  | 
|
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
129  | 
fun parse_content xs =  | 
| 
15216
 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 
aspinall 
parents: 
15211 
diff
changeset
 | 
130  | 
((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --  | 
| 
 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 
aspinall 
parents: 
15211 
diff
changeset
 | 
131  | 
(Scan.repeat ((* scan_whspc |-- *)  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
132  | 
( parse_elem >> single  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
133  | 
|| parse_cdata >> (single o Text)  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
134  | 
|| parse_pi >> K []  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
135  | 
|| parse_comment >> K []) --  | 
| 
15216
 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 
aspinall 
parents: 
15211 
diff
changeset
 | 
136  | 
Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []  | 
| 15570 | 137  | 
>> op @) >> List.concat) >> op @)(* --| scan_whspc*)) xs  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
138  | 
|
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
139  | 
and parse_elem xs =  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
140  | 
($$ "<" |-- Symbol.scan_id --  | 
| 14865 | 141  | 
Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
142  | 
!! (err "Expected > or />")  | 
| 14910 | 143  | 
(Scan.this_string "/>" >> K []  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
144  | 
|| $$ ">" |-- parse_content --|  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
145  | 
            !! (err ("Expected </" ^ s ^ ">"))
 | 
| 14910 | 146  | 
              (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | 
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
147  | 
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs;  | 
| 
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
148  | 
|
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
149  | 
val parse_document =  | 
| 14910 | 150  | 
Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
151  | 
(Scan.repeat (Scan.unless ($$ ">")  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
152  | 
(Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) --  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
153  | 
parse_elem;  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
154  | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
155  | 
fun tree_of_string s =  | 
| 
14185
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
156  | 
(case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")  | 
| 
 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 
berghofe 
parents: 
13729 
diff
changeset
 | 
157  | 
(scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of  | 
| 14728 | 158  | 
(x, []) => x  | 
159  | 
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | 
|
| 
13729
 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 
berghofe 
parents: 
12416 
diff
changeset
 | 
160  | 
|
| 12416 | 161  | 
end;  |