| author | blanchet | 
| Tue, 07 Jun 2011 08:52:35 +0200 | |
| changeset 43228 | 2ed2f092e990 | 
| parent 40627 | becf5d5187cc | 
| child 43767 | e0219ef7f84c | 
| permissions | -rw-r--r-- | 
| 24584 | 1 | (* Title: Pure/General/xml.ML | 
| 24264 | 2 | Author: David Aspinall, Stefan Berghofer and Markus Wenzel | 
| 3 | ||
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 4 | Simple XML tree values. | 
| 24264 | 5 | *) | 
| 6 | ||
| 7 | signature XML = | |
| 8 | sig | |
| 28017 | 9 | type attributes = Properties.T | 
| 24264 | 10 | datatype tree = | 
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 11 | Elem of Markup.T * tree list | 
| 24264 | 12 | | Text of string | 
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38228diff
changeset | 13 | type body = tree list | 
| 26546 | 14 | val add_content: tree -> Buffer.T -> Buffer.T | 
| 39555 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 wenzelm parents: 
38474diff
changeset | 15 | val content_of: body -> string | 
| 26546 | 16 | val header: string | 
| 17 | val text: string -> string | |
| 18 | val element: string -> attributes -> string list -> string | |
| 40131 
7cbebd636e79
explicitly qualify type Output.output, which is a slightly odd internal feature;
 wenzelm parents: 
39555diff
changeset | 19 | val output_markup: Markup.T -> Output.output * Output.output | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 20 | val string_of: tree -> string | 
| 26546 | 21 | val output: tree -> TextIO.outstream -> unit | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 22 | val parse_comments: string list -> unit * string list | 
| 24264 | 23 | val parse_string : string -> string option | 
| 26546 | 24 | val parse_element: string list -> tree * string list | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 25 | val parse_document: string list -> tree * string list | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 26 | val parse: string -> tree | 
| 24264 | 27 | end; | 
| 28 | ||
| 29 | structure XML: XML = | |
| 30 | struct | |
| 31 | ||
| 26546 | 32 | (** XML trees **) | 
| 33 | ||
| 28017 | 34 | type attributes = Properties.T; | 
| 26546 | 35 | |
| 36 | datatype tree = | |
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 37 | Elem of Markup.T * tree list | 
| 28033 | 38 | | Text of string; | 
| 26546 | 39 | |
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38228diff
changeset | 40 | type body = tree list; | 
| 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38228diff
changeset | 41 | |
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 42 | fun add_content (Elem (_, ts)) = fold add_content ts | 
| 28033 | 43 | | add_content (Text s) = Buffer.add s; | 
| 26546 | 44 | |
| 39555 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 wenzelm parents: 
38474diff
changeset | 45 | fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content; | 
| 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 wenzelm parents: 
38474diff
changeset | 46 | |
| 26546 | 47 | |
| 24264 | 48 | |
| 26525 | 49 | (** string representation **) | 
| 50 | ||
| 24264 | 51 | val header = "<?xml version=\"1.0\"?>\n"; | 
| 52 | ||
| 53 | ||
| 26546 | 54 | (* escaped text *) | 
| 24264 | 55 | |
| 56 | fun decode "<" = "<" | |
| 57 | | decode ">" = ">" | |
| 58 | | decode "&" = "&" | |
| 59 | | decode "'" = "'" | |
| 60 | | decode """ = "\"" | |
| 61 | | decode c = c; | |
| 62 | ||
| 63 | fun encode "<" = "<" | |
| 64 | | encode ">" = ">" | |
| 65 | | encode "&" = "&" | |
| 66 | | encode "'" = "'" | |
| 67 | | encode "\"" = """ | |
| 68 | | encode c = c; | |
| 69 | ||
| 25838 | 70 | val text = translate_string encode; | 
| 24264 | 71 | |
| 72 | ||
| 73 | (* elements *) | |
| 74 | ||
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 75 | fun elem name atts = | 
| 26551 | 76 | space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts); | 
| 24264 | 77 | |
| 26525 | 78 | fun element name atts body = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 79 | let val b = implode body in | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 80 | if b = "" then enclose "<" "/>" (elem name atts) | 
| 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 81 | else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name | 
| 24264 | 82 | end; | 
| 83 | ||
| 27884 | 84 | fun output_markup (markup as (name, atts)) = | 
| 38474 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 wenzelm parents: 
38266diff
changeset | 85 | if Markup.is_empty markup then Markup.no_output | 
| 27884 | 86 | else (enclose "<" ">" (elem name atts), enclose "</" ">" name); | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 87 | |
| 24264 | 88 | |
| 26546 | 89 | (* output *) | 
| 24264 | 90 | |
| 26546 | 91 | fun buffer_of tree = | 
| 24264 | 92 | let | 
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 93 | fun traverse (Elem ((name, atts), [])) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 94 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>" | 
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 95 | | traverse (Elem ((name, atts), ts)) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 96 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #> | 
| 26546 | 97 | fold traverse ts #> | 
| 26525 | 98 | Buffer.add "</" #> Buffer.add name #> Buffer.add ">" | 
| 28033 | 99 | | traverse (Text s) = Buffer.add (text s); | 
| 26546 | 100 | in Buffer.empty |> traverse tree end; | 
| 24264 | 101 | |
| 26546 | 102 | val string_of = Buffer.content o buffer_of; | 
| 103 | val output = Buffer.output o buffer_of; | |
| 25838 | 104 | |
| 24264 | 105 | |
| 106 | ||
| 26546 | 107 | (** XML parsing (slow) **) | 
| 108 | ||
| 109 | local | |
| 24264 | 110 | |
| 111 | fun err s (xs, _) = | |
| 112 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); | |
| 113 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 114 | fun ignored _ = []; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 115 | |
| 26551 | 116 | val blanks = Scan.many Symbol.is_blank; | 
| 117 | val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; | |
| 118 | val regular = Scan.one Symbol.is_regular; | |
| 119 | fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x); | |
| 24264 | 120 | |
| 26551 | 121 | val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode; | 
| 24264 | 122 | |
| 26551 | 123 | val parse_cdata = | 
| 124 | Scan.this_string "<![CDATA[" |-- | |
| 125 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --| | |
| 126 | Scan.this_string "]]>"; | |
| 24264 | 127 | |
| 128 | val parse_att = | |
| 26551 | 129 | (Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) -- | 
| 130 | (($$ "\"" || $$ "'") :|-- (fn s => | |
| 131 | (Scan.repeat (special || regular_except s) >> implode) --| $$ s)); | |
| 24264 | 132 | |
| 26551 | 133 | val parse_comment = | 
| 134 | Scan.this_string "<!--" -- | |
| 135 | Scan.repeat (Scan.unless (Scan.this_string "-->") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 136 | Scan.this_string "-->" >> ignored; | 
| 24264 | 137 | |
| 26551 | 138 | val parse_processing_instruction = | 
| 139 | Scan.this_string "<?" -- | |
| 140 | Scan.repeat (Scan.unless (Scan.this_string "?>") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 141 | Scan.this_string "?>" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 142 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 143 | val parse_doctype = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 144 | Scan.this_string "<!DOCTYPE" -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 145 | Scan.repeat (Scan.unless ($$ ">") regular) -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 146 | $$ ">" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 147 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 148 | val parse_misc = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 149 | Scan.one Symbol.is_blank >> ignored || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 150 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 151 | parse_comment; | 
| 26551 | 152 | |
| 153 | val parse_optional_text = | |
| 154 | Scan.optional (parse_chars >> (single o Text)) []; | |
| 24264 | 155 | |
| 26546 | 156 | in | 
| 157 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 158 | val parse_comments = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 159 | blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K (); | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 160 | |
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40131diff
changeset | 161 | val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode; | 
| 26546 | 162 | |
| 24264 | 163 | fun parse_content xs = | 
| 26551 | 164 | (parse_optional_text @@@ | 
| 165 | (Scan.repeat | |
| 166 | ((parse_element >> single || | |
| 167 | parse_cdata >> (single o Text) || | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 168 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 169 | parse_comment) | 
| 26551 | 170 | @@@ parse_optional_text) >> flat)) xs | 
| 24264 | 171 | |
| 26546 | 172 | and parse_element xs = | 
| 24264 | 173 | ($$ "<" |-- Symbol.scan_id -- | 
| 26551 | 174 | Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) => | 
| 24264 | 175 | !! (err "Expected > or />") | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 176 | (Scan.this_string "/>" >> ignored | 
| 24264 | 177 | || $$ ">" |-- parse_content --| | 
| 178 |             !! (err ("Expected </" ^ s ^ ">"))
 | |
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
31469diff
changeset | 179 |               (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >> Elem) xs;
 | 
| 24264 | 180 | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 181 | val parse_document = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 182 | (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc) | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 183 | |-- parse_element; | 
| 24264 | 184 | |
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 185 | fun parse s = | 
| 24264 | 186 | (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") | 
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40131diff
changeset | 187 | (blanks |-- parse_document --| blanks))) (raw_explode s) of | 
| 24264 | 188 | (x, []) => x | 
| 189 |   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
 | |
| 190 | ||
| 191 | end; | |
| 26546 | 192 | |
| 193 | end; |