| author | wenzelm | 
| Thu, 08 Dec 2022 22:38:03 +0100 | |
| changeset 76610 | 6e2383488a55 | 
| parent 74789 | a5c23da73fca | 
| child 80461 | 38d020af64aa | 
| permissions | -rw-r--r-- | 
| 44698 | 1 | (* Title: Pure/PIDE/xml.ML | 
| 2 | Author: David Aspinall | |
| 3 | Author: Stefan Berghofer | |
| 4 | Author: Makarius | |
| 24264 | 5 | |
| 46840 | 6 | Untyped XML trees and representation of ML values. | 
| 24264 | 7 | *) | 
| 8 | ||
| 43767 | 9 | signature XML_DATA_OPS = | 
| 10 | sig | |
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 11 | type 'a A | 
| 43767 | 12 | type 'a T | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 13 | type 'a V | 
| 70828 | 14 | type 'a P | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 15 | val int_atom: int A | 
| 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 16 | val bool_atom: bool A | 
| 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 17 | val unit_atom: unit A | 
| 43767 | 18 | val properties: Properties.T T | 
| 19 | val string: string T | |
| 20 | val int: int T | |
| 21 | val bool: bool T | |
| 22 | val unit: unit T | |
| 23 |   val pair: 'a T -> 'b T -> ('a * 'b) T
 | |
| 24 |   val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
 | |
| 25 | val list: 'a T -> 'a list T | |
| 26 | val option: 'a T -> 'a option T | |
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 27 | val variant: 'a V list -> 'a T | 
| 43767 | 28 | end; | 
| 29 | ||
| 24264 | 30 | signature XML = | 
| 31 | sig | |
| 46837 
5bdd68f380b3
clarified XML signature (again) -- coincide with basic Markup without explicit dependency;
 wenzelm parents: 
45155diff
changeset | 32 | type attributes = (string * string) list | 
| 24264 | 33 | datatype tree = | 
| 46837 
5bdd68f380b3
clarified XML signature (again) -- coincide with basic Markup without explicit dependency;
 wenzelm parents: 
45155diff
changeset | 34 | Elem of (string * attributes) * tree list | 
| 24264 | 35 | | Text of string | 
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38228diff
changeset | 36 | type body = tree list | 
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70828diff
changeset | 37 | val blob: string list -> body | 
| 70994 | 38 | val is_empty: tree -> bool | 
| 39 | val is_empty_body: body -> bool | |
| 74789 | 40 | val string: string -> body | 
| 41 | val enclose: string -> string -> body -> body | |
| 69234 | 42 | val xml_elemN: string | 
| 43 | val xml_nameN: string | |
| 44 | val xml_bodyN: string | |
| 49650 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 45 | val wrap_elem: ((string * attributes) * tree list) * tree list -> tree | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 46 | val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option | 
| 26546 | 47 | val add_content: tree -> Buffer.T -> Buffer.T | 
| 39555 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 wenzelm parents: 
38474diff
changeset | 48 | val content_of: body -> string | 
| 56059 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 49 | val trim_blanks: body -> body | 
| 26546 | 50 | val header: string | 
| 51 | val text: string -> string | |
| 52 | val element: string -> attributes -> string list -> string | |
| 69345 
6bd63c94cf62
tuned signature (see also src/Tools/Haskell/Markup.hs);
 wenzelm parents: 
69234diff
changeset | 53 | val output_markup: Markup.T -> Markup.output | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 54 | val string_of: tree -> string | 
| 43791 | 55 | val pretty: int -> tree -> Pretty.T | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 56 | val parse_comments: string list -> unit * string list | 
| 24264 | 57 | val parse_string : string -> string option | 
| 26546 | 58 | val parse_element: string list -> tree * string list | 
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 59 | val parse_document: string list -> tree * string list | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 60 | val parse: string -> tree | 
| 43767 | 61 | exception XML_ATOM of string | 
| 62 | exception XML_BODY of body | |
| 65333 | 63 | structure Encode: | 
| 64 | sig | |
| 65 | include XML_DATA_OPS | |
| 66 | val tree: tree T | |
| 67 | end | |
| 68 | structure Decode: | |
| 69 | sig | |
| 70 | include XML_DATA_OPS | |
| 71 | val tree: tree T | |
| 72 | end | |
| 24264 | 73 | end; | 
| 74 | ||
| 75 | structure XML: XML = | |
| 76 | struct | |
| 77 | ||
| 26546 | 78 | (** XML trees **) | 
| 79 | ||
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70828diff
changeset | 80 | open Output_Primitives.XML; | 
| 26546 | 81 | |
| 70991 
f9f7c34b7dd4
more scalable protocol_message: use XML.body directly (Output.output hook is not required);
 wenzelm parents: 
70828diff
changeset | 82 | val blob = map Text; | 
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38228diff
changeset | 83 | |
| 70994 | 84 | fun is_empty (Text "") = true | 
| 85 | | is_empty _ = false; | |
| 86 | ||
| 87 | val is_empty_body = forall is_empty; | |
| 88 | ||
| 74789 | 89 | fun string "" = [] | 
| 90 | | string s = [Text s]; | |
| 91 | ||
| 92 | fun enclose bg en body = string bg @ body @ string en; | |
| 93 | ||
| 49650 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 94 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 95 | (* wrapped elements *) | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 96 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 97 | val xml_elemN = "xml_elem"; | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 98 | val xml_nameN = "xml_name"; | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 99 | val xml_bodyN = "xml_body"; | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 100 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 101 | fun wrap_elem (((a, atts), body1), body2) = | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 102 | Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2); | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 103 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 104 | fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) = | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 105 | if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts' | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 106 | then SOME (((a, atts), body1), body2) else NONE | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 107 | | unwrap_elem _ = NONE; | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 108 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 109 | |
| 69224 | 110 | (* text content *) | 
| 49650 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 111 | |
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 112 | fun add_content tree = | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 113 | (case unwrap_elem tree of | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 114 | SOME (_, ts) => fold add_content ts | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 115 | | NONE => | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 116 | (case tree of | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 117 | Elem (_, ts) => fold add_content ts | 
| 
9fad6480300d
support for wrapped XML elements, which allows to preserve full markup tree information in to_XML/from_XML conversion;
 wenzelm parents: 
49599diff
changeset | 118 | | Text s => Buffer.add s)); | 
| 26546 | 119 | |
| 74231 | 120 | val content_of = Buffer.build_content o fold add_content; | 
| 39555 
ccb223a4d49c
added XML.content_of convenience -- cover XML.body, which is the general situation;
 wenzelm parents: 
38474diff
changeset | 121 | |
| 26546 | 122 | |
| 56059 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 123 | (* trim blanks *) | 
| 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 124 | |
| 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 125 | fun trim_blanks trees = | 
| 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 126 | trees |> maps | 
| 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 127 | (fn Elem (markup, body) => [Elem (markup, trim_blanks body)] | 
| 74785 | 128 | | Text s => s |> raw_explode |> trim Symbol.is_blank |> implode |> string); | 
| 56059 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 129 | |
| 
2390391584c2
some document antiquotations for Isabelle/jEdit elements;
 wenzelm parents: 
49650diff
changeset | 130 | |
| 24264 | 131 | |
| 26525 | 132 | (** string representation **) | 
| 133 | ||
| 69806 | 134 | val header = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"; | 
| 24264 | 135 | |
| 136 | ||
| 26546 | 137 | (* escaped text *) | 
| 24264 | 138 | |
| 139 | fun decode "<" = "<" | |
| 140 | | decode ">" = ">" | |
| 141 | | decode "&" = "&" | |
| 142 | | decode "'" = "'" | |
| 143 | | decode """ = "\"" | |
| 144 | | decode c = c; | |
| 145 | ||
| 146 | fun encode "<" = "<" | |
| 147 | | encode ">" = ">" | |
| 148 | | encode "&" = "&" | |
| 149 | | encode "'" = "'" | |
| 150 | | encode "\"" = """ | |
| 151 | | encode c = c; | |
| 152 | ||
| 25838 | 153 | val text = translate_string encode; | 
| 24264 | 154 | |
| 155 | ||
| 156 | (* elements *) | |
| 157 | ||
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 158 | fun elem name atts = | 
| 26551 | 159 | space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts); | 
| 24264 | 160 | |
| 26525 | 161 | fun element name atts body = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 162 | let val b = implode body in | 
| 74789 | 163 | if b = "" then Library.enclose "<" "/>" (elem name atts) | 
| 164 | else Library.enclose "<" ">" (elem name atts) ^ b ^ Library.enclose "</" ">" name | |
| 24264 | 165 | end; | 
| 166 | ||
| 27884 | 167 | fun output_markup (markup as (name, atts)) = | 
| 38474 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 wenzelm parents: 
38266diff
changeset | 168 | if Markup.is_empty markup then Markup.no_output | 
| 74789 | 169 | else (Library.enclose "<" ">" (elem name atts), Library.enclose "</" ">" name); | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 170 | |
| 24264 | 171 | |
| 74231 | 172 | (* output content *) | 
| 24264 | 173 | |
| 74231 | 174 | fun content_depth depth = | 
| 24264 | 175 | let | 
| 43791 | 176 | fun traverse _ (Elem ((name, atts), [])) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 177 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>" | 
| 43791 | 178 | | traverse d (Elem ((name, atts), ts)) = | 
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 179 | Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #> | 
| 43791 | 180 | traverse_body d ts #> | 
| 26525 | 181 | Buffer.add "</" #> Buffer.add name #> Buffer.add ">" | 
| 43791 | 182 | | traverse _ (Text s) = Buffer.add (text s) | 
| 183 | and traverse_body 0 _ = Buffer.add "..." | |
| 184 | | traverse_body d ts = fold (traverse (d - 1)) ts; | |
| 74231 | 185 | in Buffer.build_content o traverse depth end; | 
| 24264 | 186 | |
| 74231 | 187 | val string_of = content_depth ~1; | 
| 43791 | 188 | |
| 74231 | 189 | fun pretty depth tree = Pretty.str (content_depth (Int.max (0, depth)) tree); | 
| 25838 | 190 | |
| 62819 
d3ff367a16a0
careful export of type-dependent functions, without losing their special status;
 wenzelm parents: 
62663diff
changeset | 191 | val _ = ML_system_pp (fn depth => fn _ => Pretty.to_polyml o pretty (FixedInt.toInt depth)); | 
| 62663 | 192 | |
| 24264 | 193 | |
| 194 | ||
| 44698 | 195 | (** XML parsing **) | 
| 26546 | 196 | |
| 197 | local | |
| 24264 | 198 | |
| 43947 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43844diff
changeset | 199 | fun err msg (xs, _) = | 
| 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43844diff
changeset | 200 | fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs); | 
| 24264 | 201 | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 202 | fun ignored _ = []; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 203 | |
| 45155 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 204 | fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_"; | 
| 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 205 | fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = "."; | 
| 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 206 | val parse_name = Scan.one name_start_char ::: Scan.many name_char; | 
| 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 207 | |
| 26551 | 208 | val blanks = Scan.many Symbol.is_blank; | 
| 45155 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 209 | val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode; | 
| 58854 | 210 | val regular = Scan.one Symbol.not_eof; | 
| 211 | fun regular_except x = Scan.one (fn c => Symbol.not_eof c andalso c <> x); | |
| 24264 | 212 | |
| 26551 | 213 | val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode; | 
| 24264 | 214 | |
| 26551 | 215 | val parse_cdata = | 
| 216 | Scan.this_string "<![CDATA[" |-- | |
| 217 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --| | |
| 218 | Scan.this_string "]]>"; | |
| 24264 | 219 | |
| 220 | val parse_att = | |
| 45155 
3216d65d8f34
slightly more standard-conformant XML parsing (see also 94033767ef9b);
 wenzelm parents: 
44809diff
changeset | 221 | ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) -- | 
| 26551 | 222 | (($$ "\"" || $$ "'") :|-- (fn s => | 
| 223 | (Scan.repeat (special || regular_except s) >> implode) --| $$ s)); | |
| 24264 | 224 | |
| 26551 | 225 | val parse_comment = | 
| 226 | Scan.this_string "<!--" -- | |
| 227 | Scan.repeat (Scan.unless (Scan.this_string "-->") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 228 | Scan.this_string "-->" >> ignored; | 
| 24264 | 229 | |
| 26551 | 230 | val parse_processing_instruction = | 
| 231 | Scan.this_string "<?" -- | |
| 232 | Scan.repeat (Scan.unless (Scan.this_string "?>") regular) -- | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 233 | Scan.this_string "?>" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 234 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 235 | val parse_doctype = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 236 | Scan.this_string "<!DOCTYPE" -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 237 | Scan.repeat (Scan.unless ($$ ">") regular) -- | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 238 | $$ ">" >> ignored; | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 239 | |
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 240 | val parse_misc = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 241 | Scan.one Symbol.is_blank >> ignored || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 242 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 243 | parse_comment; | 
| 26551 | 244 | |
| 245 | val parse_optional_text = | |
| 246 | Scan.optional (parse_chars >> (single o Text)) []; | |
| 24264 | 247 | |
| 26546 | 248 | in | 
| 249 | ||
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 250 | val parse_comments = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 251 | blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K (); | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 252 | |
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40131diff
changeset | 253 | val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode; | 
| 26546 | 254 | |
| 24264 | 255 | fun parse_content xs = | 
| 26551 | 256 | (parse_optional_text @@@ | 
| 61476 | 257 | Scan.repeats | 
| 26551 | 258 | ((parse_element >> single || | 
| 259 | parse_cdata >> (single o Text) || | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 260 | parse_processing_instruction || | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 261 | parse_comment) | 
| 61476 | 262 | @@@ parse_optional_text)) xs | 
| 24264 | 263 | |
| 26546 | 264 | and parse_element xs = | 
| 43949 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 265 | ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :-- | 
| 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 266 | (fn (name, _) => | 
| 43947 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43844diff
changeset | 267 | !! (err (fn () => "Expected > or />")) | 
| 43949 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 268 | ($$ "/" -- $$ ">" >> ignored || | 
| 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 269 | $$ ">" |-- parse_content --| | 
| 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 270 | !! (err (fn () => "Expected </" ^ implode name ^ ">")) | 
| 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 271 | ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">"))) | 
| 
94033767ef9b
more precise parse_name according to XML standard;
 wenzelm parents: 
43947diff
changeset | 272 | >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs; | 
| 24264 | 273 | |
| 26984 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 274 | val parse_document = | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 275 | (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc) | 
| 
d0e098e206f3
added parse_document (optional unchecked header material);
 wenzelm parents: 
26554diff
changeset | 276 | |-- parse_element; | 
| 24264 | 277 | |
| 26539 
a0754be538ab
added output_markup (from Tools/isabelle_process.ML);
 wenzelm parents: 
26525diff
changeset | 278 | fun parse s = | 
| 43947 
9b00f09f7721
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
 wenzelm parents: 
43844diff
changeset | 279 | (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element")) | 
| 40627 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 wenzelm parents: 
40131diff
changeset | 280 | (blanks |-- parse_document --| blanks))) (raw_explode s) of | 
| 24264 | 281 | (x, []) => x | 
| 48769 | 282 |   | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
 | 
| 24264 | 283 | |
| 284 | end; | |
| 26546 | 285 | |
| 43767 | 286 | |
| 287 | ||
| 288 | (** XML as data representation language **) | |
| 289 | ||
| 290 | exception XML_ATOM of string; | |
| 291 | exception XML_BODY of tree list; | |
| 292 | ||
| 293 | ||
| 294 | structure Encode = | |
| 295 | struct | |
| 296 | ||
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 297 | type 'a A = 'a -> string; | 
| 43767 | 298 | type 'a T = 'a -> body; | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 299 | type 'a V = 'a -> string list * body; | 
| 70828 | 300 | type 'a P = 'a -> string list; | 
| 43767 | 301 | |
| 302 | ||
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 303 | (* atomic values *) | 
| 43767 | 304 | |
| 69574 | 305 | fun int_atom i = Value.print_int i; | 
| 43767 | 306 | |
| 307 | fun bool_atom false = "0" | |
| 308 | | bool_atom true = "1"; | |
| 309 | ||
| 310 | fun unit_atom () = ""; | |
| 311 | ||
| 312 | ||
| 313 | (* structural nodes *) | |
| 314 | ||
| 315 | fun node ts = Elem ((":", []), ts);
 | |
| 316 | ||
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 317 | fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs; | 
| 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 318 | |
| 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 319 | fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts); | 
| 43767 | 320 | |
| 321 | ||
| 322 | (* representation of standard types *) | |
| 323 | ||
| 65333 | 324 | fun tree (t: tree) = [t]; | 
| 325 | ||
| 43767 | 326 | fun properties props = [Elem ((":", props), [])];
 | 
| 327 | ||
| 328 | fun string "" = [] | |
| 329 | | string s = [Text s]; | |
| 330 | ||
| 331 | val int = string o int_atom; | |
| 332 | ||
| 333 | val bool = string o bool_atom; | |
| 334 | ||
| 335 | val unit = string o unit_atom; | |
| 336 | ||
| 337 | fun pair f g (x, y) = [node (f x), node (g y)]; | |
| 338 | ||
| 339 | fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)]; | |
| 340 | ||
| 341 | fun list f xs = map (node o f) xs; | |
| 342 | ||
| 343 | fun option _ NONE = [] | |
| 344 | | option f (SOME x) = [node (f x)]; | |
| 345 | ||
| 47199 
15ede9f1da3f
more specific notion of partiality (cf. Scala version);
 wenzelm parents: 
46840diff
changeset | 346 | fun variant fs x = | 
| 
15ede9f1da3f
more specific notion of partiality (cf. Scala version);
 wenzelm parents: 
46840diff
changeset | 347 | [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))]; | 
| 43767 | 348 | |
| 26546 | 349 | end; | 
| 43767 | 350 | |
| 351 | ||
| 352 | structure Decode = | |
| 353 | struct | |
| 354 | ||
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 355 | type 'a A = string -> 'a; | 
| 43767 | 356 | type 'a T = body -> 'a; | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 357 | type 'a V = string list * body -> 'a; | 
| 70828 | 358 | type 'a P = string list -> 'a; | 
| 43767 | 359 | |
| 360 | ||
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 361 | (* atomic values *) | 
| 43767 | 362 | |
| 363 | fun int_atom s = | |
| 63806 | 364 | Value.parse_int s | 
| 43797 
fad7758421bf
more precise integer Markup.properties/XML.attributes: disallow ML-style ~ minus;
 wenzelm parents: 
43791diff
changeset | 365 | handle Fail _ => raise XML_ATOM s; | 
| 43767 | 366 | |
| 367 | fun bool_atom "0" = false | |
| 368 | | bool_atom "1" = true | |
| 369 | | bool_atom s = raise XML_ATOM s; | |
| 370 | ||
| 371 | fun unit_atom "" = () | |
| 372 | | unit_atom s = raise XML_ATOM s; | |
| 373 | ||
| 374 | ||
| 375 | (* structural nodes *) | |
| 376 | ||
| 377 | fun node (Elem ((":", []), ts)) = ts
 | |
| 378 | | node t = raise XML_BODY [t]; | |
| 379 | ||
| 43783 | 380 | fun vector atts = | 
| 46839 
f7232c078fa5
simplified -- plain map_index is sufficient (pointed out by Enrico Tassi);
 wenzelm parents: 
46837diff
changeset | 381 | map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts; | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 382 | |
| 43844 | 383 | fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts)) | 
| 43767 | 384 | | tagged t = raise XML_BODY [t]; | 
| 385 | ||
| 386 | ||
| 387 | (* representation of standard types *) | |
| 388 | ||
| 65333 | 389 | fun tree [t] = t | 
| 390 | | tree ts = raise XML_BODY ts; | |
| 391 | ||
| 43767 | 392 | fun properties [Elem ((":", props), [])] = props
 | 
| 393 | | properties ts = raise XML_BODY ts; | |
| 394 | ||
| 395 | fun string [] = "" | |
| 396 | | string [Text s] = s | |
| 397 | | string ts = raise XML_BODY ts; | |
| 398 | ||
| 399 | val int = int_atom o string; | |
| 400 | ||
| 401 | val bool = bool_atom o string; | |
| 402 | ||
| 403 | val unit = unit_atom o string; | |
| 404 | ||
| 405 | fun pair f g [t1, t2] = (f (node t1), g (node t2)) | |
| 406 | | pair _ _ ts = raise XML_BODY ts; | |
| 407 | ||
| 408 | fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3)) | |
| 409 | | triple _ _ _ ts = raise XML_BODY ts; | |
| 410 | ||
| 411 | fun list f ts = map (f o node) ts; | |
| 412 | ||
| 413 | fun option _ [] = NONE | |
| 414 | | option f [t] = SOME (f (node t)) | |
| 415 | | option _ ts = raise XML_BODY ts; | |
| 416 | ||
| 43768 | 417 | fun variant fs [t] = | 
| 418 | let | |
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 419 | val (tag, (xs, ts)) = tagged t; | 
| 43768 | 420 | val f = nth fs tag handle General.Subscript => raise XML_BODY [t]; | 
| 43778 
ce9189450447
more compact representation of XML data (notably sort/typ/term), using properties as vector of atomic values;
 wenzelm parents: 
43768diff
changeset | 421 | in f (xs, ts) end | 
| 43767 | 422 | | variant _ ts = raise XML_BODY ts; | 
| 423 | ||
| 424 | end; | |
| 425 | ||
| 426 | end; |