| author | berghofe | 
| Wed, 02 Nov 2005 16:37:39 +0100 | |
| changeset 18068 | e8c3d371594e | 
| parent 17756 | d4a35f82fbb4 | 
| child 19482 | 9f11af8f7ef9 | 
| permissions | -rw-r--r-- | 
| 12416 | 1 | (* Title: Pure/General/xml.ML | 
| 2 | ID: $Id$ | |
| 14728 | 3 | Author: David Aspinall, Stefan Berghofer and Markus Wenzel | 
| 12416 | 4 | |
| 14969 | 5 | Basic support for XML. | 
| 12416 | 6 | *) | 
| 7 | ||
| 8 | signature XML = | |
| 9 | sig | |
| 14969 | 10 | val header: string | 
| 11 | val text: string -> string | |
| 15211 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 12 | val text_charref: string -> string | 
| 14969 | 13 | val cdata: string -> string | 
| 14 | val element: string -> (string * string) list -> string list -> string | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 15 | datatype tree = | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 16 | Elem of string * (string * string) list * tree list | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 17 | | Text of string | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 18 | val string_of_tree: tree -> string | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 19 | val parse_content: string list -> tree list * string list | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 20 | val parse_elem: string list -> tree * string list | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 21 | val parse_document: string list -> (string option * tree) * string list | 
| 15142 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 22 | val scan_comment_whspc : string list -> unit * string list | 
| 14969 | 23 | val tree_of_string: string -> tree | 
| 12416 | 24 | end; | 
| 25 | ||
| 26 | structure XML: XML = | |
| 27 | struct | |
| 28 | ||
| 14969 | 29 | (** string based representation (small scale) **) | 
| 30 | ||
| 31 | val header = "<?xml version=\"1.0\"?>\n"; | |
| 32 | ||
| 33 | ||
| 34 | (* text and character data *) | |
| 35 | ||
| 36 | fun decode "<" = "<" | |
| 37 | | decode ">" = ">" | |
| 38 | | decode "&" = "&" | |
| 39 | | decode "'" = "'" | |
| 40 | | decode """ = "\"" | |
| 41 | | decode c = c; | |
| 12416 | 42 | |
| 43 | fun encode "<" = "<" | |
| 44 | | encode ">" = ">" | |
| 45 | | encode "&" = "&" | |
| 46 | | encode "'" = "'" | |
| 47 | | encode "\"" = """ | |
| 48 | | encode c = c; | |
| 49 | ||
| 17756 | 50 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" | 
| 15211 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 51 | |
| 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 52 | val text = Library.translate_string encode | 
| 
5f54721547a7
Add text_charref to encode a string using character references
 aspinall parents: 
15207diff
changeset | 53 | |
| 17756 | 54 | val text_charref = translate_string encode_charref; | 
| 12416 | 55 | |
| 15207 | 56 | val cdata = enclose "<![CDATA[" "]]>\n" | 
| 12416 | 57 | |
| 14728 | 58 | |
| 12416 | 59 | (* elements *) | 
| 60 | ||
| 14728 | 61 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; | 
| 12416 | 62 | |
| 63 | fun element name atts cs = | |
| 64 | let val elem = space_implode " " (name :: map attribute atts) in | |
| 65 | if null cs then enclose "<" "/>" elem | |
| 66 | else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name | |
| 67 | end; | |
| 68 | ||
| 69 | ||
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 70 | |
| 15207 | 71 | |
| 14969 | 72 | (** explicit XML trees **) | 
| 73 | ||
| 74 | datatype tree = | |
| 75 | Elem of string * (string * string) list * tree list | |
| 76 | | Text of string; | |
| 77 | ||
| 78 | fun string_of_tree tree = | |
| 79 | let | |
| 80 | fun string_of (Elem (name, atts, ts)) buf = | |
| 81 | let val buf' = | |
| 82 | buf |> Buffer.add "<" | |
| 83 | |> fold Buffer.add (separate " " (name :: map attribute atts)) | |
| 84 | in | |
| 85 | if null ts then | |
| 86 | buf' |> Buffer.add "/>" | |
| 87 | else | |
| 88 | buf' |> Buffer.add ">" | |
| 89 | |> fold string_of ts | |
| 90 | |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" | |
| 91 | end | |
| 92 | | string_of (Text s) buf = Buffer.add (text s) buf; | |
| 93 | in Buffer.content (string_of tree Buffer.empty) end; | |
| 94 | ||
| 95 | ||
| 96 | ||
| 97 | (** XML parsing **) | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 98 | |
| 14728 | 99 | fun err s (xs, _) = | 
| 100 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 101 | |
| 14728 | 102 | val scan_whspc = Scan.any Symbol.is_blank; | 
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 103 | |
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 104 | val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 105 | |
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 106 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 107 | (scan_special || Scan.one Symbol.not_eof)) >> implode; | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 108 | |
| 14910 | 109 | val parse_cdata = Scan.this_string "<![CDATA[" |-- | 
| 110 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >> | |
| 111 | implode) --| Scan.this_string "]]>"; | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 112 | |
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 113 | val parse_att = | 
| 14865 | 114 | Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- | 
| 115 | (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) | |
| 116 | (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd); | |
| 14863 | 117 | |
| 14910 | 118 | val parse_comment = Scan.this_string "<!--" -- | 
| 119 | Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) -- | |
| 120 | Scan.this_string "-->"; | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 121 | |
| 15142 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 122 | val scan_comment_whspc = | 
| 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 123 | (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); | 
| 
7b7109f22224
Add scan_comment_whspc to skip space and comments in PGIP stream
 aspinall parents: 
15010diff
changeset | 124 | |
| 14910 | 125 | val parse_pi = Scan.this_string "<?" |-- | 
| 126 | Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --| | |
| 127 | Scan.this_string "?>"; | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 128 | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 129 | fun parse_content xs = | 
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 130 | ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- | 
| 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 131 | (Scan.repeat ((* scan_whspc |-- *) | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 132 | ( parse_elem >> single | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 133 | || parse_cdata >> (single o Text) | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 134 | || parse_pi >> K [] | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 135 | || parse_comment >> K []) -- | 
| 15216 
2fac1f11b7f6
Remove white space skipping in element content; XML specification clearly requires whitespace to be passed to application.
 aspinall parents: 
15211diff
changeset | 136 | Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] | 
| 15570 | 137 | >> op @) >> List.concat) >> op @)(* --| scan_whspc*)) xs | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 138 | |
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 139 | and parse_elem xs = | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 140 | ($$ "<" |-- Symbol.scan_id -- | 
| 14865 | 141 | Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 142 | !! (err "Expected > or />") | 
| 14910 | 143 | (Scan.this_string "/>" >> K [] | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 144 | || $$ ">" |-- parse_content --| | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 145 |             !! (err ("Expected </" ^ s ^ ">"))
 | 
| 14910 | 146 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | 
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 147 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | 
| 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 148 | |
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 149 | val parse_document = | 
| 14910 | 150 | Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 151 | (Scan.repeat (Scan.unless ($$ ">") | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 152 | (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) -- | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 153 | parse_elem; | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 154 | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 155 | fun tree_of_string s = | 
| 14185 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 156 | (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") | 
| 
9b3841638c06
Tried to make parser a bit more standard-conforming.
 berghofe parents: 
13729diff
changeset | 157 | (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of | 
| 14728 | 158 | (x, []) => x | 
| 159 |   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | |
| 13729 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
 berghofe parents: 
12416diff
changeset | 160 | |
| 12416 | 161 | end; |