src/Pure/General/xml.ML
author wenzelm
Sat Jul 23 16:37:17 2011 +0200 (2011-07-23)
changeset 43947 9b00f09f7721
parent 43844 33e20b7d7e72
child 43949 94033767ef9b
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
     1 (*  Title:      Pure/General/xml.ML
     2     Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
     3 
     4 Untyped XML trees.
     5 *)
     6 
     (*Common interface of the XML data encoders and decoders: structures Encode
       and Decode of this file both provide these operations, with the function
       types A, T, V pointing in opposite directions.*)
     signature XML_DATA_OPS =
     sig
       (*operation types: A for atomic values, T for whole values, V for variant cases*)
       type 'a A
       type 'a T
       type 'a V

       (*atomic values*)
       val int_atom: int A
       val bool_atom: bool A
       val unit_atom: unit A

       (*basic types*)
       val properties: Properties.T T
       val string: string T
       val int: int T
       val bool: bool T
       val unit: unit T

       (*compound types*)
       val pair: 'a T -> 'b T -> ('a * 'b) T
       val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
       val list: 'a T -> 'a list T
       val option: 'a T -> 'a option T
       val variant: 'a V list -> 'a T
     end;
    26 
    (*Untyped XML trees: plain content, string output, parsing, and XML as a
      data representation language.*)
    signature XML =
    sig
      (*trees*)
      type attributes = Properties.T
      datatype tree =
          Elem of Markup.T * tree list
        | Text of string
      type body = tree list
      val add_content: tree -> Buffer.T -> Buffer.T
      val content_of: body -> string

      (*string representation*)
      val header: string
      val text: string -> string
      val element: string -> attributes -> string list -> string
      val output_markup: Markup.T -> Output.output * Output.output
      val string_of: tree -> string
      val pretty: int -> tree -> Pretty.T
      val output: tree -> TextIO.outstream -> unit

      (*parsing*)
      val parse_comments: string list -> unit * string list
      val parse_string : string -> string option
      val parse_element: string list -> tree * string list
      val parse_document: string list -> tree * string list
      val parse: string -> tree

      (*data representation*)
      exception XML_ATOM of string
      exception XML_BODY of body
      structure Encode: XML_DATA_OPS
      structure Decode: XML_DATA_OPS
    end;
    53 
    54 structure XML: XML =
    55 struct
    56 
    57 (** XML trees **)
    58 
    (*attributes of an element*)
    type attributes = Properties.T;

    (*a tree is either an element (markup with child trees) or plain text;
      body abbreviates a list of trees*)
    datatype tree =
        Elem of Markup.T * body
      | Text of string
    withtype body = tree list;
    66 
    (*add the text of all Text nodes below tree to a buffer, ignoring markup*)
    fun add_content tree =
      (case tree of
        Text s => Buffer.add s
      | Elem (_, children) => fold add_content children);
    69 
    (*plain text content of an XML body, without any markup*)
    fun content_of body = Buffer.content (fold add_content body Buffer.empty);
    71 
    72 
    73 
    74 (** string representation **)
    75 
    (*XML declaration (version 1.0), terminated by a newline*)
    val header = "<?xml version=\"1.0\"?>\n";
    77 
    78 
    79 (* escaped text *)
    80 
    (*replace one of the five predefined entity references by its character;
      any other string is returned unchanged*)
    fun decode s =
      (case s of
        "&lt;" => "<"
      | "&gt;" => ">"
      | "&amp;" => "&"
      | "&apos;" => "'"
      | "&quot;" => "\""
      | _ => s);
    87 
    (*replace one of the five special characters by its entity reference;
      any other string is returned unchanged*)
    fun encode s =
      (case s of
        "<" => "&lt;"
      | ">" => "&gt;"
      | "&" => "&amp;"
      | "'" => "&apos;"
      | "\"" => "&quot;"
      | _ => s);
    94 
    (*escaped form of a string: encode is applied throughout via translate_string*)
    fun text s = translate_string encode s;
    96 
    97 
    98 (* elements *)
    99 
   (*inner part of a start tag: element name followed by its attributes in the
     form a="x", separated by single spaces; attribute values are escaped*)
   fun elem name atts =
     let fun attribute (a, x) = a ^ "=\"" ^ text x ^ "\""
     in space_implode " " (name :: map attribute atts) end;
   102 
   (*string form of an element whose body is given as a list of strings, which
     are concatenated as they are; an empty body yields the short form <.../>*)
   fun element name atts body =
     (case implode body of
       "" => enclose "<" "/>" (elem name atts)
     | b => enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name);
   108 
   (*start tag and end tag for the given markup; empty markup (according to
     Markup.is_empty) yields Markup.no_output*)
   fun output_markup markup =
     if Markup.is_empty markup then Markup.no_output
     else
       let
         val (name, atts) = markup;
         val start_tag = enclose "<" ">" (elem name atts);
         val end_tag = enclose "</" ">" name;
       in (start_tag, end_tag) end;
   112 
   113 
   114 (* output *)
   115 
   (*Render tree into a fresh buffer.  The depth is decremented for each level
     of nesting; the body of an element is replaced by "..." when the remaining
     depth is 0.  Elements without children always use the short form <.../>,
     text is escaped.*)
   fun buffer_of depth tree =
     let
       (*"<" followed by element name and attributes, without the closing bracket*)
       fun open_tag name atts = Buffer.add "<" #> Buffer.add (elem name atts);

       fun traverse d t =
         (case t of
           Text s => Buffer.add (text s)
         | Elem ((name, atts), []) => open_tag name atts #> Buffer.add "/>"
         | Elem ((name, atts), ts) =>
             open_tag name atts #> Buffer.add ">" #>
             traverse_body d ts #>
             Buffer.add "</" #> Buffer.add name #> Buffer.add ">")
       and traverse_body d ts =
         if d = 0 then Buffer.add "..." else fold (traverse (d - 1)) ts;
     in traverse depth tree Buffer.empty end;
   128 
   (*full rendering of a tree: depth ~1 is never counted down to 0, so no body
     is cut off*)
   fun string_of tree = Buffer.content (buffer_of ~1 tree);
   fun output tree = Buffer.output (buffer_of ~1 tree);
   131 
   (*rendering of a tree as a single Pretty.str, cut off at the given depth;
     a negative depth is treated as 0*)
   fun pretty depth tree =
     tree
     |> buffer_of (Int.max (0, depth))
     |> Buffer.content
     |> Pretty.str;
   134 
   135 
   136 
   137 (** XML parsing (slow) **)
   138 
   139 local
   140 
   (*delayed error message: the text is only produced when the resulting
     function is applied to (); it consists of msg () and a quoted beginning of
     the remaining input xs*)
   fun err msg (xs, _) () =
     implode ["XML parsing error: ", msg (), "\nfound: ", quote (Symbol.beginning 100 xs)];
   143 
   (*discard a parse result: always the empty list*)
   fun ignored _ = nil;
   145 
   (*any number of blank symbols*)
   val blanks = Scan.many Symbol.is_blank;

   (*entity reference "&" id ";", passed through decode*)
   val special = (($$ "&" ^^ Symbol.scan_id) ^^ $$ ";") >> decode;

   (*a single regular symbol*)
   val regular = Scan.one Symbol.is_regular;

   (*a single regular symbol different from x*)
   fun regular_except x =
     let fun admissible c = Symbol.is_regular c andalso not (c = x)
     in Scan.one admissible end;
   150 
   (*non-empty character data: entity references and regular symbols other
     than "<", joined into one string*)
   val parse_chars =
     let val item = special || regular_except "<"
     in Scan.repeat1 item >> implode end;
   152 
   (*CDATA section: the regular symbols between "<![CDATA[" and the first
     "]]>", joined into one string without decoding*)
   val parse_cdata =
     let
       val opening = Scan.this_string "<![CDATA[";
       val closing = Scan.this_string "]]>";
       val payload = Scan.repeat (Scan.unless closing regular) >> implode;
     in (opening |-- payload) --| closing end;
   157 
   (*attribute: id, "=" with optional blanks around it, then a value enclosed
     in double or single quotes; the value ends at the same kind of quote that
     opened it and may contain entity references*)
   val parse_att =
     let
       val equals = blanks -- $$ "=" -- blanks;
       val delimiter = $$ "\"" || $$ "'";
       fun value_until q =
         (Scan.repeat (special || regular_except q) >> implode) --| $$ q;
     in (Symbol.scan_id --| equals) -- (delimiter :|-- value_until) end;
   162 
   (*comment "<!--" ... "-->": recognized and discarded*)
   val parse_comment =
     let
       val closing = Scan.this_string "-->";
       val inner = Scan.repeat (Scan.unless closing regular);
     in (Scan.this_string "<!--" -- inner -- closing) >> ignored end;
   167 
   (*processing instruction "<?" ... "?>": recognized and discarded*)
   val parse_processing_instruction =
     let
       val closing = Scan.this_string "?>";
       val inner = Scan.repeat (Scan.unless closing regular);
     in (Scan.this_string "<?" -- inner -- closing) >> ignored end;
   172 
   (*document type declaration: "<!DOCTYPE" up to the first ">", recognized
     and discarded*)
   val parse_doctype =
     let
       val closing = $$ ">";
       val inner = Scan.repeat (Scan.unless closing regular);
     in (Scan.this_string "<!DOCTYPE" -- inner -- closing) >> ignored end;
   177 
   (*one item that may surround the root element: a blank symbol, a processing
     instruction, or a comment; the result is always empty*)
   val parse_misc =
     let val blank = Scan.one Symbol.is_blank >> ignored
     in blank || parse_processing_instruction || parse_comment end;
   182 
   (*character data as a singleton Text node, or the empty list if there is none*)
   val parse_optional_text =
     let val text_node = parse_chars >> (fn s => single (Text s))
     in Scan.optional text_node [] end;
   185 
   186 in
   187 
   (*skip blanks and any number of comments, each followed by blanks*)
   val parse_comments =
     let val comment = (parse_comment -- blanks) >> K ()
     in (blanks -- Scan.repeat comment) >> K () end;
   190 
   (*read a string as XML character data (see parse_chars), via Scan.read on
     the symbols of raw_explode*)
   fun parse_string s = Scan.read Symbol.stopper parse_chars (raw_explode s);
   192 
   (*content of an element: optional text, then any number of items (element,
     CDATA section, processing instruction, comment), each followed by optional
     text; processing instructions and comments contribute nothing*)
   fun parse_content xs =
     let
       val item =
         parse_element >> single ||
         parse_cdata >> (single o Text) ||
         parse_processing_instruction ||
         parse_comment;
       val items = Scan.repeat (item @@@ parse_optional_text) >> flat;
     in (parse_optional_text @@@ items) xs end

   (*element: "<" name attributes, then either "/>" (empty body) or ">" content
     and the end tag for the same name; once the start of the tag has been
     read, the remaining alternatives are wrapped in !! with an err message*)
   and parse_element xs =
     let
       val start =
         (($$ "<" |-- Symbol.scan_id) -- Scan.repeat (blanks |-- parse_att)) --| blanks;

       fun rest (name, _) =
         let
           val end_tag = (Scan.this_string ("</" ^ name) --| blanks) --| $$ ">";
           val empty_body = Scan.this_string "/>" >> ignored;
           val proper_body =
             ($$ ">" |-- parse_content) --|
               !! (err (fn () => "Expected </" ^ name ^ ">")) end_tag;
         in !! (err (fn () => "Expected > or />")) (empty_body || proper_body) end;
     in ((start :-- rest) >> Elem) xs end;
   210 
   (*document: misc items, an optional DOCTYPE declaration, further misc items,
     then the root element; only the root element is returned*)
   val parse_document =
     let
       val misc = Scan.repeat parse_misc;
       val prolog = misc -- Scan.option parse_doctype -- misc;
     in prolog |-- parse_element end;
   214 
   (*parse a complete string as one XML document, with blanks allowed around
     it; input left over after the document is reported via error*)
   fun parse s =
     let
       val document = (blanks |-- parse_document) --| blanks;
       val scanner = Scan.error (!! (err (fn () => "Malformed element")) document);
     in
       (case Scan.finite Symbol.stopper scanner (raw_explode s) of
         (x, []) => x
       | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys))
     end;
   220 
   221 end;
   222 
   223 
   224 
   225 (** XML as data representation language **)
   226 
   (*decoding failures: XML_ATOM carries the offending atom string, XML_BODY
     the offending trees*)
   exception XML_ATOM of string;
   exception XML_BODY of body;
   229 
   230 
   (*Encoders from ML values to XML: atoms become strings, whole values become
     bodies, variant cases become a list of strings plus a body.*)
   structure Encode =
   struct

   type 'a A = 'a -> string;
   type 'a T = 'a -> body;
   type 'a V = 'a -> string list * body;


   (* atomic values *)

   val int_atom = signed_string_of_int;

   fun bool_atom b = if b then "1" else "0";

   fun unit_atom () = "";


   (* structural nodes *)

   (*anonymous node: element named ":" without attributes*)
   fun node ts = Elem ((":", []), ts);

   (*attributes whose names are the encoded positions of the values*)
   fun vector xs =
     let fun indexed (i, x) = (int_atom i, x)
     in map_index indexed xs end;

   (*element named by the encoded tag, with xs as attribute vector and ts as body*)
   fun tagged (tag, (xs, ts)) =
     let val markup = (int_atom tag, vector xs)
     in Elem (markup, ts) end;


   (* representation of standard types *)

   (*properties become the attributes of a single ":" element with empty body*)
   fun properties props = [Elem ((":", props), [])];

   (*the empty string is the empty body, any other string a single Text node*)
   fun string s = if s = "" then [] else [Text s];

   fun int i = string (int_atom i);

   fun bool b = string (bool_atom b);

   fun unit u = string (unit_atom u);

   (*components are wrapped into separate nodes, encoded from left to right*)
   fun pair f g (x, y) =
     let
       val a = node (f x);
       val b = node (g y);
     in [a, b] end;

   fun triple f g h (x, y, z) =
     let
       val a = node (f x);
       val b = node (g y);
       val c = node (h z);
     in [a, b, c] end;

   fun list f xs = map (fn x => node (f x)) xs;

   fun option f opt =
     (case opt of
       NONE => []
     | SOME x => [node (f x)]);

   (*the case is selected by get_index over "try f x"; presumably this is the
     first function of fs that applies to x, with its position as tag -- the
     result of get_index is unpacked with "the", without further checks*)
   fun variant fs x =
     let val selected = get_index (fn f => try f x) fs
     in [tagged (the selected)] end;

   end;
   283 
   284 
   (*Decoders from XML to ML values, with the function types of Encode turned
     around.  Malformed input raises XML_ATOM (bad atom string) or XML_BODY
     (unexpected trees).*)
   structure Decode =
   struct

   type 'a A = string -> 'a;
   type 'a T = body -> 'a;
   type 'a V = string list * body -> 'a;


   (* atomic values *)

   (*a Fail from Markup.parse_int is turned into XML_ATOM*)
   fun int_atom s =
     Markup.parse_int s
       handle Fail _ => raise XML_ATOM s;

   fun bool_atom s =
     (case s of
       "0" => false
     | "1" => true
     | _ => raise XML_ATOM s);

   fun unit_atom s = if s = "" then () else raise XML_ATOM s;


   (* structural nodes *)

   (*body of an anonymous node: element named ":" without attributes*)
   fun node t =
     (case t of
       Elem ((":", []), ts) => ts
     | _ => raise XML_BODY [t]);

   (*attribute values in order; the attribute names must decode to the
     consecutive indices 0, 1, 2, ..., otherwise XML_ATOM is raised*)
   fun vector atts =
     let
       fun check (a, x) i =
         if int_atom a = i then (x, i + 1) else raise XML_ATOM a;
       val (xs, _) = fold_map check atts 0;
     in xs end;

   (*decoded element name as tag, with attribute vector and body*)
   fun tagged t =
     (case t of
       Elem ((name, atts), ts) => (int_atom name, (vector atts, ts))
     | _ => raise XML_BODY [t]);


   (* representation of standard types *)

   fun properties ts =
     (case ts of
       [Elem ((":", props), [])] => props
     | _ => raise XML_BODY ts);

   (*the empty body is the empty string, a single Text node its content*)
   fun string ts =
     (case ts of
       [] => ""
     | [Text s] => s
     | _ => raise XML_BODY ts);

   fun int ts = int_atom (string ts);

   fun bool ts = bool_atom (string ts);

   fun unit ts = unit_atom (string ts);

   (*exactly two nodes, decoded from left to right*)
   fun pair f g ts =
     (case ts of
       [t1, t2] => (f (node t1), g (node t2))
     | _ => raise XML_BODY ts);

   (*exactly three nodes, decoded from left to right*)
   fun triple f g h ts =
     (case ts of
       [t1, t2, t3] => (f (node t1), g (node t2), h (node t3))
     | _ => raise XML_BODY ts);

   fun list f ts = map (fn t => f (node t)) ts;

   fun option f ts =
     (case ts of
       [] => NONE
     | [t] => SOME (f (node t))
     | _ => raise XML_BODY ts);

   (*a single tagged element; the tag selects the decoder from fs by position,
     and a tag without corresponding decoder raises XML_BODY*)
   fun variant fs ts =
     (case ts of
       [t] =>
         let
           val (tag, (xs, ts')) = tagged t;
           val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
         in f (xs, ts') end
     | _ => raise XML_BODY ts);

   end;
   355 
   356 end;