| author | wenzelm | 
| Thu, 07 Aug 2008 13:44:42 +0200 | |
| changeset 27764 | e0ee3cc240fe | 
| parent 26984 | d0e098e206f3 | 
| child 27884 | 10c927e4abf5 | 
| permissions | -rw-r--r-- | 
| 24584 | 1 | (* Title: Pure/General/xml.ML | 
| 24264 | 2 | ID: $Id$ | 
| 3 | Author: David Aspinall, Stefan Berghofer and Markus Wenzel | |
| 4 | ||
| 5 | Basic support for XML. | |
| 6 | *) | |
| 7 | ||
| 8 | signature XML = | |
| 9 | sig | |
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 10 | type attributes = Markup.property list | 
| 24264 | 11 | datatype tree = | 
| 12 | Elem of string * attributes * tree list | |
| 13 | | Text of string | |
| 14 | | Output of output | |
| 26546 | 15 | val add_content: tree -> Buffer.T -> Buffer.T | 
| 16 | val detect: string -> bool | |
| 17 | val header: string | |
| 18 | val text: string -> string | |
| 19 | val element: string -> attributes -> string list -> string | |
| 20 | val output_markup: Markup.T -> output * output | |
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 21 | val string_of: tree -> string | 
| 26546 | 22 | val output: tree -> TextIO.outstream -> unit | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 23 | val parse_comments: string list -> unit * string list | 
| 24264 | 24 | val parse_string : string -> string option | 
| 26546 | 25 | val parse_element: string list -> tree * string list | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 26 | val parse_document: string list -> tree * string list | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 27 | val parse: string -> tree | 
| 24264 | 28 | end; | 
| 29 | ||
| 30 | structure XML: XML = | |
| 31 | struct | |
| 32 | ||
| 26546 | 33 | (** XML trees **) | 
| 34 | ||
| 35 | type attributes = Markup.property list; | |
| 36 | ||
| 37 | datatype tree = | |
| 38 | Elem of string * attributes * tree list | |
| 39 | | Text of string | |
| 40 | | Output of output; (* FIXME !? *) | |
| 41 | ||
| 42 | fun add_content (Elem (_, _, ts)) = fold add_content ts | |
| 43 | | add_content (Text s) = Buffer.add s | |
| 44 | | add_content (Output _) = I; (* FIXME !? *) | |
| 45 | ||
| 46 | ||
| 24264 | 47 | |
| 26525 | 48 | (** string representation **) | 
| 49 | ||
| 50 | val detect = String.isPrefix "<?xml"; | |
| 24264 | 51 | val header = "<?xml version=\"1.0\"?>\n"; | 
| 52 | ||
| 53 | ||
| 26546 | 54 | (* escaped text *) | 
| 24264 | 55 | |
| 56 | fun decode "<" = "<" | |
| 57 | | decode ">" = ">" | |
| 58 | | decode "&" = "&" | |
| 59 | | decode "'" = "'" | |
| 60 | | decode """ = "\"" | |
| 61 | | decode c = c; | |
| 62 | ||
| 63 | fun encode "<" = "<" | |
| 64 | | encode ">" = ">" | |
| 65 | | encode "&" = "&" | |
| 66 | | encode "'" = "'" | |
| 67 | | encode "\"" = """ | |
| 68 | | encode c = c; | |
| 69 | ||
| 25838 | 70 | val text = translate_string encode; | 
| 24264 | 71 | |
| 72 | ||
| 73 | (* elements *) | |
| 74 | ||
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 75 | fun elem name atts = | 
| 26551 | 76 | space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts); | 
| 24264 | 77 | |
| 26525 | 78 | fun element name atts body = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 79 | let val b = implode body in | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 80 | if b = "" then enclose "<" "/>" (elem name atts) | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 81 | else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name | 
| 24264 | 82 | end; | 
| 83 | ||
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 84 | fun output_markup (name, atts) = | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 85 | (enclose "<" ">" (elem name atts), | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 86 | enclose "</" ">" name); | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 87 | |
| 24264 | 88 | |
| 26546 | 89 | (* output *) | 
| 24264 | 90 | |
| 26546 | 91 | fun buffer_of tree = | 
| 24264 | 92 | let | 
| 26546 | 93 | fun traverse (Elem (name, atts, [])) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 94 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>" | 
| 26546 | 95 | | traverse (Elem (name, atts, ts)) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 96 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #> | 
| 26546 | 97 | fold traverse ts #> | 
| 26525 | 98 | Buffer.add "</" #> Buffer.add name #> Buffer.add ">" | 
| 26546 | 99 | | traverse (Text s) = Buffer.add (text s) | 
| 100 | | traverse (Output s) = Buffer.add s; | |
| 101 | in Buffer.empty |> traverse tree end; | |
| 24264 | 102 | |
| 26546 | 103 | val string_of = Buffer.content o buffer_of; | 
| 104 | val output = Buffer.output o buffer_of; | |
| 25838 | 105 | |
| 24264 | 106 | |
| 107 | ||
| 26546 | 108 | (** XML parsing (slow) **) | 
| 109 | ||
| 110 | local | |
| 24264 | 111 | |
| 112 | fun err s (xs, _) = | |
| 113 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); | |
| 114 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 115 | fun ignored _ = []; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 116 | |
| 26551 | 117 | val blanks = Scan.many Symbol.is_blank; | 
| 118 | val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; | |
| 119 | val regular = Scan.one Symbol.is_regular; | |
| 120 | fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x); | |
| 24264 | 121 | |
| 26551 | 122 | val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode; | 
| 24264 | 123 | |
| 26551 | 124 | val parse_cdata = | 
| 125 | Scan.this_string "<![CDATA[" |-- | |
| 126 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --| | |
| 127 | Scan.this_string "]]>"; | |
| 24264 | 128 | |
| 129 | val parse_att = | |
| 26551 | 130 | (Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) -- | 
| 131 | (($$ "\"" || $$ "'") :|-- (fn s => | |
| 132 | (Scan.repeat (special || regular_except s) >> implode) --| $$ s)); | |
| 24264 | 133 | |
| 26551 | 134 | val parse_comment = | 
| 135 | Scan.this_string "<!--" -- | |
| 136 | Scan.repeat (Scan.unless (Scan.this_string "-->") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 137 | Scan.this_string "-->" >> ignored; | 
| 24264 | 138 | |
| 26551 | 139 | val parse_processing_instruction = | 
| 140 | Scan.this_string "<?" -- | |
| 141 | Scan.repeat (Scan.unless (Scan.this_string "?>") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 142 | Scan.this_string "?>" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 143 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 144 | val parse_doctype = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 145 | Scan.this_string "<!DOCTYPE" -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 146 | Scan.repeat (Scan.unless ($$ ">") regular) -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 147 | $$ ">" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 148 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 149 | val parse_misc = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 150 | Scan.one Symbol.is_blank >> ignored || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 151 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 152 | parse_comment; | 
| 26551 | 153 | |
| 154 | val parse_optional_text = | |
| 155 | Scan.optional (parse_chars >> (single o Text)) []; | |
| 24264 | 156 | |
| 26546 | 157 | in | 
| 158 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 159 | val parse_comments = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 160 | blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K (); | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 161 | |
| 26546 | 162 | val parse_string = Scan.read Symbol.stopper parse_chars o explode; | 
| 163 | ||
| 24264 | 164 | fun parse_content xs = | 
| 26551 | 165 | (parse_optional_text @@@ | 
| 166 | (Scan.repeat | |
| 167 | ((parse_element >> single || | |
| 168 | parse_cdata >> (single o Text) || | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 169 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 170 | parse_comment) | 
| 26551 | 171 | @@@ parse_optional_text) >> flat)) xs | 
| 24264 | 172 | |
| 26546 | 173 | and parse_element xs = | 
| 24264 | 174 | ($$ "<" |-- Symbol.scan_id -- | 
| 26551 | 175 | Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) => | 
| 24264 | 176 | !! (err "Expected > or />") | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 177 | (Scan.this_string "/>" >> ignored | 
| 24264 | 178 | || $$ ">" |-- parse_content --| | 
| 179 |             !! (err ("Expected </" ^ s ^ ">"))
 | |
| 26551 | 180 |               (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >>
 | 
| 24264 | 181 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | 
| 182 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 183 | val parse_document = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 184 | (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc) | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 185 | |-- parse_element; | 
| 24264 | 186 | |
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 187 | fun parse s = | 
| 24264 | 188 | (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 189 | (blanks |-- parse_document --| blanks))) (explode s) of | 
| 24264 | 190 | (x, []) => x | 
| 191 |   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | |
| 192 | ||
| 193 | end; | |
| 26546 | 194 | |
| 195 | end; |