| author | wenzelm | 
| Tue, 03 Apr 2007 19:24:15 +0200 | |
| changeset 22569 | e5d7d9de7d85 | 
| parent 22334 | 4c96d3370186 | 
| permissions | -rw-r--r-- | 
| 12416 | 1 | (* Title: Pure/General/xml.ML | 
| 2 | ID: $Id$ | |
| 14728 | 3 | Author: David Aspinall, Stefan Berghofer and Markus Wenzel | 
| 12416 | 4 | |
| 14969 | 5 | Basic support for XML. | 
| 12416 | 6 | *) | 
| 7 | ||
| 8 | signature XML = | |
| 9 | sig | |
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 10 | (* string functions *) | 
| 14969 | 11 | val header: string | 
| 12 | val text: string -> string | |
| 15211 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 13 | val text_charref: string -> string | 
| 14969 | 14 | val cdata: string -> string | 
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 15 | type attributes = (string * string) list | 
| 21628 
7ab9ad8d6757
Type abbreviations for element args and attributes
 aspinall parents: 
19482diff
changeset | 16 | val element: string -> attributes -> string list -> string | 
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 17 | (* tree functions *) | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 18 | datatype tree = | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 19 | Elem of string * attributes * tree list | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 20 | | Text of string (* native string, subject to XML.text on output *) | 
| 22334 | 21 | | Rawtext of string (* XML string: output directly, not parsed *) | 
| 21654 | 22 | type content = tree list | 
| 23 | type element = string * attributes * content | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 24 | val string_of_tree: tree -> string | 
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 25 | val buffer_of_tree: tree -> Buffer.T | 
| 21636 
88b815dca68d
Add parse_string for attribute values and other string content
 aspinall parents: 
21629diff
changeset | 26 | val parse_string : string -> string option | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 27 | val parse_content: string list -> tree list * string list | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 28 | val parse_elem: string list -> tree * string list | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 29 | val parse_document: string list -> (string option * tree) * string list | 
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 30 | val tree_of_string: string -> tree | 
| 15142 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 31 | val scan_comment_whspc : string list -> unit * string list | 
| 12416 | 32 | end; | 
| 33 | ||
| 34 | structure XML: XML = | |
| 35 | struct | |
| 36 | ||
| 14969 | 37 | (** string based representation (small scale) **) | 
| 38 | ||
| 39 | val header = "<?xml version=\"1.0\"?>\n"; | |
| 40 | ||
| 41 | ||
| 42 | (* text and character data *) | |
| 43 | ||
| 44 | fun decode "<" = "<" | |
| 45 | | decode ">" = ">" | |
| 46 | | decode "&" = "&" | |
| 47 | | decode "'" = "'" | |
| 48 | | decode """ = "\"" | |
| 49 | | decode c = c; | |
| 12416 | 50 | |
| 51 | fun encode "<" = "<" | |
| 52 | | encode ">" = ">" | |
| 53 | | encode "&" = "&" | |
| 54 | | encode "'" = "'" | |
| 55 | | encode "\"" = """ | |
| 56 | | encode c = c; | |
| 57 | ||
| 17756 | 58 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" | 
| 15211 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 59 | |
| 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 60 | val text = Library.translate_string encode | 
| 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 61 | |
| 17756 | 62 | val text_charref = translate_string encode_charref; | 
| 12416 | 63 | |
| 15207 | 64 | val cdata = enclose "<![CDATA[" "]]>\n" | 
| 12416 | 65 | |
| 14728 | 66 | |
| 12416 | 67 | (* elements *) | 
| 68 | ||
| 14728 | 69 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; | 
| 12416 | 70 | |
| 71 | fun element name atts cs = | |
| 72 | let val elem = space_implode " " (name :: map attribute atts) in | |
| 73 | if null cs then enclose "<" "/>" elem | |
| 74 | else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name | |
| 75 | end; | |
| 76 | ||
| 77 | ||
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 78 | |
| 15207 | 79 | |
| 14969 | 80 | (** explicit XML trees **) | 
| 81 | ||
| 21628 
7ab9ad8d6757
Type abbreviations for element args and attributes
 aspinall parents: 
19482diff
changeset | 82 | type attributes = (string * string) list | 
| 
7ab9ad8d6757
Type abbreviations for element args and attributes
 aspinall parents: 
19482diff
changeset | 83 | |
| 14969 | 84 | datatype tree = | 
| 21628 
7ab9ad8d6757
Type abbreviations for element args and attributes
 aspinall parents: 
19482diff
changeset | 85 | Elem of string * attributes * tree list | 
| 22227 | 86 | | Text of string | 
| 87 | | Rawtext of string; | |
| 14969 | 88 | |
| 21654 | 89 | type content = tree list | 
| 90 | ||
| 91 | type element = string * attributes * content | |
| 21628 
7ab9ad8d6757
Type abbreviations for element args and attributes
 aspinall parents: 
19482diff
changeset | 92 | |
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 93 | fun buffer_of_tree tree = | 
| 14969 | 94 | let | 
| 95 | fun string_of (Elem (name, atts, ts)) buf = | |
| 96 | let val buf' = | |
| 97 | buf |> Buffer.add "<" | |
| 98 | |> fold Buffer.add (separate " " (name :: map attribute atts)) | |
| 99 | in | |
| 100 | if null ts then | |
| 101 | buf' |> Buffer.add "/>" | |
| 102 | else | |
| 103 | buf' |> Buffer.add ">" | |
| 104 | |> fold string_of ts | |
| 105 | |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" | |
| 106 | end | |
| 21629 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 107 | | string_of (Text s) buf = Buffer.add (text s) buf | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 108 | | string_of (Rawtext s) buf = Buffer.add s buf; | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 109 | in string_of tree Buffer.empty end; | 
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 110 | |
| 
c762bdc2b504
Add string buffer getter.  Add Rawtext constructor to allow XML-escaped strings in tree.
 aspinall parents: 
21628diff
changeset | 111 | val string_of_tree = Buffer.content o buffer_of_tree; | 
| 14969 | 112 | |
| 113 | ||
| 114 | ||
| 115 | (** XML parsing **) | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 116 | |
| 14728 | 117 | fun err s (xs, _) = | 
| 118 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 119 | |
| 21858 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 wenzelm parents: 
21654diff
changeset | 120 | val scan_whspc = Scan.many Symbol.is_blank; | 
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 121 | |
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 122 | val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 123 | |
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 124 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 125 | (scan_special || Scan.one Symbol.not_eof)) >> implode; | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 126 | |
| 21636 
88b815dca68d
Add parse_string for attribute values and other string content
 aspinall parents: 
21629diff
changeset | 127 | val parse_string = Scan.read Symbol.stopper parse_chars o explode; | 
| 
88b815dca68d
Add parse_string for attribute values and other string content
 aspinall parents: 
21629diff
changeset | 128 | |
| 14910 | 129 | val parse_cdata = Scan.this_string "<![CDATA[" |-- | 
| 130 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >> | |
| 131 | implode) --| Scan.this_string "]]>"; | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 132 | |
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 133 | val parse_att = | 
| 14865 | 134 | Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- | 
| 135 | (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) | |
| 136 | (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd); | |
| 14863 | 137 | |
| 14910 | 138 | val parse_comment = Scan.this_string "<!--" -- | 
| 139 | Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) -- | |
| 140 | Scan.this_string "-->"; | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 141 | |
| 22227 | 142 | val scan_comment_whspc = | 
| 143 | (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); | |
| 15142 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 144 | |
| 14910 | 145 | val parse_pi = Scan.this_string "<?" |-- | 
| 146 | Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --| | |
| 147 | Scan.this_string "?>"; | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 148 | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 149 | fun parse_content xs = | 
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 150 | ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- | 
| 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 151 | (Scan.repeat ((* scan_whspc |-- *) | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 152 | ( parse_elem >> single | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 153 | || parse_cdata >> (single o Text) | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 154 | || parse_pi >> K [] | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 155 | || parse_comment >> K []) -- | 
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 156 | Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] | 
| 19482 
9f11af8f7ef9
tuned basic list operators (flat, maps, map_filter);
 wenzelm parents: 
17756diff
changeset | 157 | >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 158 | |
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 159 | and parse_elem xs = | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 160 | ($$ "<" |-- Symbol.scan_id -- | 
| 14865 | 161 | Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 162 | !! (err "Expected > or />") | 
| 14910 | 163 | (Scan.this_string "/>" >> K [] | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 164 | || $$ ">" |-- parse_content --| | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 165 |             !! (err ("Expected </" ^ s ^ ">"))
 | 
| 14910 | 166 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | 
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 167 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 168 | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 169 | val parse_document = | 
| 14910 | 170 | Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 171 | (Scan.repeat (Scan.unless ($$ ">") | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 172 | (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) -- | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 173 | parse_elem; | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 174 | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 175 | fun tree_of_string s = | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 176 | (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 177 | (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of | 
| 14728 | 178 | (x, []) => x | 
| 179 |   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 180 | |
| 12416 | 181 | end; |