src/Pure/PIDE/xml.ML
author wenzelm
Thu Aug 02 12:36:54 2012 +0200 (2012-08-02)
changeset 48646 91281e9472d8
parent 47199 15ede9f1da3f
child 48769 e3b7087bb923
permissions -rw-r--r--
more official command specifications, including source position;
     1 (*  Title:      Pure/PIDE/xml.ML
     2     Author:     David Aspinall
     3     Author:     Stefan Berghofer
     4     Author:     Makarius
     5 
     6 Untyped XML trees and representation of ML values.
     7 *)
     8 
(*Operations for representing ML values as XML.  This interface is shared by
  the Encode and Decode structures declared in signature XML.*)
signature XML_DATA_OPS =
sig
  type 'a A  (*atomic values: 'a -> string in Encode, string -> 'a in Decode*)
  type 'a T  (*XML bodies: 'a -> body in Encode, body -> 'a in Decode*)
  type 'a V  (*variant cases, operating on string list * body*)
  (*atomic values*)
  val int_atom: int A
  val bool_atom: bool A
  val unit_atom: unit A
  (*standard types*)
  val properties: Properties.T T
  val string: string T
  val int: int T
  val bool: bool T
  val unit: unit T
  (*combinators for structured types*)
  val pair: 'a T -> 'b T -> ('a * 'b) T
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
  val list: 'a T -> 'a list T
  val option: 'a T -> 'a option T
  val variant: 'a V list -> 'a T
end;
    28 
(*Interface of structure XML: untyped XML trees, their string representation,
  parsing, a cache for substructural sharing, and ML values as XML.*)
signature XML =
sig
  (*XML trees*)
  type attributes = (string * string) list
  datatype tree =
      Elem of (string * attributes) * tree list
    | Text of string
  type body = tree list
  val add_content: tree -> Buffer.T -> Buffer.T
  val content_of: body -> string
  (*string representation*)
  val header: string
  val text: string -> string
  val element: string -> attributes -> string list -> string
  val output_markup: Markup.T -> Output.output * Output.output
  val string_of: tree -> string
  val pretty: int -> tree -> Pretty.T
  val output: tree -> TextIO.outstream -> unit
  (*parsing: the scanners work on lists of symbols (string list)*)
  val parse_comments: string list -> unit * string list
  val parse_string : string -> string option
  val parse_element: string list -> tree * string list
  val parse_document: string list -> tree * string list
  val parse: string -> tree
  (*cache for substructural sharing*)
  val cache: unit -> tree -> tree
  (*XML as data representation language*)
  exception XML_ATOM of string
  exception XML_BODY of body
  structure Encode: XML_DATA_OPS
  structure Decode: XML_DATA_OPS
end;
    56 
    57 structure XML: XML =
    58 struct
    59 
    60 (** XML trees **)
    61 
(*attributes: list of name/value pairs, both given as strings*)
type attributes = (string * string) list;

(*tree: either an element with name, attributes and child trees, or plain text*)
datatype tree =
    Elem of (string * attributes) * tree list
  | Text of string;

(*body: sequence of trees, e.g. the children of an element*)
type body = tree list;
    69 
    70 fun add_content (Elem (_, ts)) = fold add_content ts
    71   | add_content (Text s) = Buffer.add s;
    72 
    73 fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
    74 
    75 
    76 
    77 (** string representation **)
    78 
(*XML declaration (version 1.0), followed by a newline*)
val header = "<?xml version=\"1.0\"?>\n";
    80 
    81 
    82 (* escaped text *)
    83 
    84 fun decode "&lt;" = "<"
    85   | decode "&gt;" = ">"
    86   | decode "&amp;" = "&"
    87   | decode "&apos;" = "'"
    88   | decode "&quot;" = "\""
    89   | decode c = c;
    90 
    91 fun encode "<" = "&lt;"
    92   | encode ">" = "&gt;"
    93   | encode "&" = "&amp;"
    94   | encode "'" = "&apos;"
    95   | encode "\"" = "&quot;"
    96   | encode c = c;
    97 
    98 val text = translate_string encode;
    99 
   100 
   101 (* elements *)
   102 
   103 fun elem name atts =
   104   space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
   105 
   106 fun element name atts body =
   107   let val b = implode body in
   108     if b = "" then enclose "<" "/>" (elem name atts)
   109     else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
   110   end;
   111 
   112 fun output_markup (markup as (name, atts)) =
   113   if Markup.is_empty markup then Markup.no_output
   114   else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
   115 
   116 
   117 (* output *)
   118 
   119 fun buffer_of depth tree =
   120   let
   121     fun traverse _ (Elem ((name, atts), [])) =
   122           Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
   123       | traverse d (Elem ((name, atts), ts)) =
   124           Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
   125           traverse_body d ts #>
   126           Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
   127       | traverse _ (Text s) = Buffer.add (text s)
   128     and traverse_body 0 _ = Buffer.add "..."
   129       | traverse_body d ts = fold (traverse (d - 1)) ts;
   130   in Buffer.empty |> traverse depth tree end;
   131 
   132 val string_of = Buffer.content o buffer_of ~1;
   133 val output = Buffer.output o buffer_of ~1;
   134 
   135 fun pretty depth tree =
   136   Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
   137 
   138 
   139 
   140 (** XML parsing **)
   141 
   142 local
   143 
   144 fun err msg (xs, _) =
   145   fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
   146 
   147 fun ignored _ = [];
   148 
   149 fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
   150 fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
   151 val parse_name = Scan.one name_start_char ::: Scan.many name_char;
   152 
   153 val blanks = Scan.many Symbol.is_blank;
   154 val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
   155 val regular = Scan.one Symbol.is_regular;
   156 fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
   157 
   158 val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
   159 
   160 val parse_cdata =
   161   Scan.this_string "<![CDATA[" |--
   162   (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
   163   Scan.this_string "]]>";
   164 
   165 val parse_att =
   166   ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
   167   (($$ "\"" || $$ "'") :|-- (fn s =>
   168     (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
   169 
   170 val parse_comment =
   171   Scan.this_string "<!--" --
   172   Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
   173   Scan.this_string "-->" >> ignored;
   174 
   175 val parse_processing_instruction =
   176   Scan.this_string "<?" --
   177   Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
   178   Scan.this_string "?>" >> ignored;
   179 
   180 val parse_doctype =
   181   Scan.this_string "<!DOCTYPE" --
   182   Scan.repeat (Scan.unless ($$ ">") regular) --
   183   $$ ">" >> ignored;
   184 
   185 val parse_misc =
   186   Scan.one Symbol.is_blank >> ignored ||
   187   parse_processing_instruction ||
   188   parse_comment;
   189 
   190 val parse_optional_text =
   191   Scan.optional (parse_chars >> (single o Text)) [];
   192 
   193 in
   194 
   195 val parse_comments =
   196   blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
   197 
   198 val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
   199 
   200 fun parse_content xs =
   201   (parse_optional_text @@@
   202     (Scan.repeat
   203       ((parse_element >> single ||
   204         parse_cdata >> (single o Text) ||
   205         parse_processing_instruction ||
   206         parse_comment)
   207       @@@ parse_optional_text) >> flat)) xs
   208 
   209 and parse_element xs =
   210   ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
   211     (fn (name, _) =>
   212       !! (err (fn () => "Expected > or />"))
   213        ($$ "/" -- $$ ">" >> ignored ||
   214         $$ ">" |-- parse_content --|
   215           !! (err (fn () => "Expected </" ^ implode name ^ ">"))
   216               ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
   217     >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
   218 
   219 val parse_document =
   220   (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
   221   |-- parse_element;
   222 
   223 fun parse s =
   224   (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
   225       (blanks |-- parse_document --| blanks))) (raw_explode s) of
   226     (x, []) => x
   227   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
   228 
   229 end;
   230 
   231 
   232 (** cache for substructural sharing **)
   233 
   234 fun tree_ord tu =
   235   if pointer_eq tu then EQUAL
   236   else
   237     (case tu of
   238       (Text _, Elem _) => LESS
   239     | (Elem _, Text _) => GREATER
   240     | (Text s, Text s') => fast_string_ord (s, s')
   241     | (Elem e, Elem e') =>
   242         prod_ord
   243           (prod_ord fast_string_ord (list_ord (prod_ord fast_string_ord fast_string_ord)))
   244           (list_ord tree_ord) (e, e'));
   245 
(*tables with XML trees as keys, ordered by tree_ord*)
structure Treetab = Table(type key = tree val ord = tree_ord);
   247 
   248 fun cache () =
   249   let
   250     val strings = Unsynchronized.ref (Symtab.empty: unit Symtab.table);
   251     val trees = Unsynchronized.ref (Treetab.empty: unit Treetab.table);
   252 
   253     fun string s =
   254       if size s <= 1 then s
   255       else
   256         (case Symtab.lookup_key (! strings) s of
   257           SOME (s', ()) => s'
   258         | NONE => (Unsynchronized.change strings (Symtab.update (s, ())); s));
   259 
   260     fun tree t =
   261       (case Treetab.lookup_key (! trees) t of
   262         SOME (t', ()) => t'
   263       | NONE =>
   264           let
   265             val t' =
   266               (case t of
   267                 Elem ((a, ps), b) => Elem ((string a, map (pairself string) ps), map tree b)
   268               | Text s => Text (string s));
   269             val _ = Unsynchronized.change trees (Treetab.update (t', ()));
   270           in t' end);
   271   in tree end;
   272 
   273 
   274 
   275 (** XML as data representation language **)
   276 
(*raised by the Decode operations: XML_ATOM carries an atom string and
  XML_BODY carries trees that do not have the expected form*)
exception XML_ATOM of string;
exception XML_BODY of tree list;
   279 
   280 
   281 structure Encode =
   282 struct
   283 
   284 type 'a A = 'a -> string;
   285 type 'a T = 'a -> body;
   286 type 'a V = 'a -> string list * body;
   287 
   288 
   289 (* atomic values *)
   290 
   291 fun int_atom i = signed_string_of_int i;
   292 
   293 fun bool_atom false = "0"
   294   | bool_atom true = "1";
   295 
   296 fun unit_atom () = "";
   297 
   298 
   299 (* structural nodes *)
   300 
   301 fun node ts = Elem ((":", []), ts);
   302 
   303 fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
   304 
   305 fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
   306 
   307 
   308 (* representation of standard types *)
   309 
   310 fun properties props = [Elem ((":", props), [])];
   311 
   312 fun string "" = []
   313   | string s = [Text s];
   314 
   315 val int = string o int_atom;
   316 
   317 val bool = string o bool_atom;
   318 
   319 val unit = string o unit_atom;
   320 
   321 fun pair f g (x, y) = [node (f x), node (g y)];
   322 
   323 fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
   324 
   325 fun list f xs = map (node o f) xs;
   326 
   327 fun option _ NONE = []
   328   | option f (SOME x) = [node (f x)];
   329 
   330 fun variant fs x =
   331   [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
   332 
   333 end;
   334 
   335 
   336 structure Decode =
   337 struct
   338 
   339 type 'a A = string -> 'a;
   340 type 'a T = body -> 'a;
   341 type 'a V = string list * body -> 'a;
   342 
   343 
   344 (* atomic values *)
   345 
   346 fun int_atom s =
   347   Markup.parse_int s
   348     handle Fail _ => raise XML_ATOM s;
   349 
   350 fun bool_atom "0" = false
   351   | bool_atom "1" = true
   352   | bool_atom s = raise XML_ATOM s;
   353 
   354 fun unit_atom "" = ()
   355   | unit_atom s = raise XML_ATOM s;
   356 
   357 
   358 (* structural nodes *)
   359 
   360 fun node (Elem ((":", []), ts)) = ts
   361   | node t = raise XML_BODY [t];
   362 
   363 fun vector atts =
   364   map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
   365 
   366 fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
   367   | tagged t = raise XML_BODY [t];
   368 
   369 
   370 (* representation of standard types *)
   371 
   372 fun properties [Elem ((":", props), [])] = props
   373   | properties ts = raise XML_BODY ts;
   374 
   375 fun string [] = ""
   376   | string [Text s] = s
   377   | string ts = raise XML_BODY ts;
   378 
   379 val int = int_atom o string;
   380 
   381 val bool = bool_atom o string;
   382 
   383 val unit = unit_atom o string;
   384 
   385 fun pair f g [t1, t2] = (f (node t1), g (node t2))
   386   | pair _ _ ts = raise XML_BODY ts;
   387 
   388 fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
   389   | triple _ _ _ ts = raise XML_BODY ts;
   390 
   391 fun list f ts = map (f o node) ts;
   392 
   393 fun option _ [] = NONE
   394   | option f [t] = SOME (f (node t))
   395   | option _ ts = raise XML_BODY ts;
   396 
   397 fun variant fs [t] =
   398       let
   399         val (tag, (xs, ts)) = tagged t;
   400         val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
   401       in f (xs, ts) end
   402   | variant _ ts = raise XML_BODY ts;
   403 
   404 end;
   405 
   406 end;