src/Pure/PIDE/xml.ML
author wenzelm
Sun Mar 10 14:19:30 2019 +0100 (4 months ago)
changeset 70070 be04e9a053a7
parent 69985 2156053c4ce9
permissions -rw-r--r--
markup and document markers for some meta data from "Dublin Core Metadata Element Set";
     1 (*  Title:      Pure/PIDE/xml.ML
     2     Author:     David Aspinall
     3     Author:     Stefan Berghofer
     4     Author:     Makarius
     5 
     6 Untyped XML trees and representation of ML values.
     7 *)
     8 
(*Common interface shared by the XML data encoder and decoder: both provide the same
  combinator names, only the meaning of the type constructors A, T, V differs.*)
signature XML_DATA_OPS =
sig
  type 'a A  (*conversion of an atomic value to or from a plain string*)
  type 'a T  (*conversion of a value to or from an XML body*)
  type 'a V  (*one case of a variant: to or from string arguments plus XML body*)
  val int_atom: int A
  val bool_atom: bool A
  val unit_atom: unit A
  val properties: Properties.T T
  val string: string T
  val int: int T
  val bool: bool T
  val unit: unit T
  val pair: 'a T -> 'b T -> ('a * 'b) T
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
  val list: 'a T -> 'a list T
  val option: 'a T -> 'a option T
  val variant: 'a V list -> 'a T  (*choice among cases, identified by position in the list*)
end;
    28 
(*Untyped XML trees: construction, text content, string output, parsing, and the
  encoding/decoding of ML values as XML bodies.*)
signature XML =
sig
  (*trees*)
  type attributes = (string * string) list
  datatype tree =
      Elem of (string * attributes) * tree list
    | Text of string
  type body = tree list
  (*wrapped elements*)
  val xml_elemN: string
  val xml_nameN: string
  val xml_bodyN: string
  val wrap_elem: ((string * attributes) * tree list) * tree list -> tree
  val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option
  (*text content*)
  val add_content: tree -> Buffer.T -> Buffer.T
  val content_of: body -> string
  val trim_blanks: body -> body
  (*string representation*)
  val header: string
  val text: string -> string
  val element: string -> attributes -> string list -> string
  val output_markup: Markup.T -> Markup.output
  val string_of: tree -> string
  val pretty: int -> tree -> Pretty.T
  val output: tree -> BinIO.outstream -> unit
  (*parsing: scanners operate on lists of symbols and return the remaining input*)
  val parse_comments: string list -> unit * string list
  val parse_string : string -> string option
  val parse_element: string list -> tree * string list
  val parse_document: string list -> tree * string list
  val parse: string -> tree
  (*XML as data representation language*)
  exception XML_ATOM of string
  exception XML_BODY of body
  structure Encode:
  sig
    include XML_DATA_OPS
    val tree: tree T
  end
  structure Decode:
  sig
    include XML_DATA_OPS
    val tree: tree T
  end
end;
    69 
    70 structure XML: XML =
    71 struct
    72 
    73 (** XML trees **)
    74 
(*attributes of an element: list of name/value pairs*)
type attributes = (string * string) list;

(*untyped XML tree: either a named element with attributes and child trees, or plain text*)
datatype tree =
    Elem of (string * attributes) * tree list
  | Text of string;

(*a body is a sequence of trees, e.g. the children of an element*)
type body = tree list;
    82 
    83 
    84 (* wrapped elements *)
    85 
    86 val xml_elemN = "xml_elem";
    87 val xml_nameN = "xml_name";
    88 val xml_bodyN = "xml_body";
    89 
    90 fun wrap_elem (((a, atts), body1), body2) =
    91   Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2);
    92 
    93 fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) =
    94       if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts'
    95       then SOME (((a, atts), body1), body2) else NONE
    96   | unwrap_elem _ = NONE;
    97 
    98 
    99 (* text content *)
   100 
   101 fun add_content tree =
   102   (case unwrap_elem tree of
   103     SOME (_, ts) => fold add_content ts
   104   | NONE =>
   105       (case tree of
   106         Elem (_, ts) => fold add_content ts
   107       | Text s => Buffer.add s));
   108 
   109 fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
   110 
   111 
   112 (* trim blanks *)
   113 
   114 fun trim_blanks trees =
   115   trees |> maps
   116     (fn Elem (markup, body) => [Elem (markup, trim_blanks body)]
   117       | Text s =>
   118           let val s' = s |> raw_explode |> trim Symbol.is_blank |> implode;
   119           in if s' = "" then [] else [Text s'] end);
   120 
   121 
   122 
   123 (** string representation **)
   124 
(*XML declaration line (version 1.0, encoding utf-8), including the trailing newline*)
val header = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
   126 
   127 
   128 (* escaped text *)
   129 
   130 fun decode "&lt;" = "<"
   131   | decode "&gt;" = ">"
   132   | decode "&amp;" = "&"
   133   | decode "&apos;" = "'"
   134   | decode "&quot;" = "\""
   135   | decode c = c;
   136 
   137 fun encode "<" = "&lt;"
   138   | encode ">" = "&gt;"
   139   | encode "&" = "&amp;"
   140   | encode "'" = "&apos;"
   141   | encode "\"" = "&quot;"
   142   | encode c = c;
   143 
   144 val text = translate_string encode;
   145 
   146 
   147 (* elements *)
   148 
   149 fun elem name atts =
   150   space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
   151 
   152 fun element name atts body =
   153   let val b = implode body in
   154     if b = "" then enclose "<" "/>" (elem name atts)
   155     else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
   156   end;
   157 
   158 fun output_markup (markup as (name, atts)) =
   159   if Markup.is_empty markup then Markup.no_output
   160   else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
   161 
   162 
   163 (* output *)
   164 
   165 fun buffer_of depth tree =
   166   let
   167     fun traverse _ (Elem ((name, atts), [])) =
   168           Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
   169       | traverse d (Elem ((name, atts), ts)) =
   170           Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
   171           traverse_body d ts #>
   172           Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
   173       | traverse _ (Text s) = Buffer.add (text s)
   174     and traverse_body 0 _ = Buffer.add "..."
   175       | traverse_body d ts = fold (traverse (d - 1)) ts;
   176   in Buffer.empty |> traverse depth tree end;
   177 
   178 val string_of = Buffer.content o buffer_of ~1;
   179 val output = Buffer.output o buffer_of ~1;
   180 
   181 fun pretty depth tree =
   182   Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
   183 
   184 val _ = ML_system_pp (fn depth => fn _ => Pretty.to_polyml o pretty (FixedInt.toInt depth));
   185 
   186 
   187 
   188 (** XML parsing **)
   189 
   190 local
   191 
   192 fun err msg (xs, _) =
   193   fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
   194 
   195 fun ignored _ = [];
   196 
   197 fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
   198 fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
   199 val parse_name = Scan.one name_start_char ::: Scan.many name_char;
   200 
(*any number of blank symbols, possibly none*)
val blanks = Scan.many Symbol.is_blank;
(*entity reference of the form &name; -- the scanned reference is passed through decode,
  which leaves unknown references unchanged*)
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
(*any single symbol accepted by Symbol.not_eof*)
val regular = Scan.one Symbol.not_eof;
(*like regular, but excluding the given symbol x*)
fun regular_except x = Scan.one (fn c => Symbol.not_eof c andalso c <> x);

(*non-empty run of character data up to the next "<", with entity references decoded*)
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
   207 
(*CDATA section: the text between "<![CDATA[" and the first following "]]>", taken
  verbatim, i.e. without decoding of entity references*)
val parse_cdata =
  Scan.this_string "<![CDATA[" |--
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
  Scan.this_string "]]>";
   212 
(*Attribute of the form name = "value" or name = 'value', with optional blanks around "=".
  The value extends to the next occurrence of the quote symbol that opened it; entity
  references within the value are decoded.  The result is the pair (name, value).*)
val parse_att =
  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
  (($$ "\"" || $$ "'") :|-- (fn s =>
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
   217 
   218 val parse_comment =
   219   Scan.this_string "<!--" --
   220   Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
   221   Scan.this_string "-->" >> ignored;
   222 
   223 val parse_processing_instruction =
   224   Scan.this_string "<?" --
   225   Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
   226   Scan.this_string "?>" >> ignored;
   227 
   228 val parse_doctype =
   229   Scan.this_string "<!DOCTYPE" --
   230   Scan.repeat (Scan.unless ($$ ">") regular) --
   231   $$ ">" >> ignored;
   232 
   233 val parse_misc =
   234   Scan.one Symbol.is_blank >> ignored ||
   235   parse_processing_instruction ||
   236   parse_comment;
   237 
   238 val parse_optional_text =
   239   Scan.optional (parse_chars >> (single o Text)) [];
   240 
   241 in
   242 
   243 val parse_comments =
   244   blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
   245 
   246 val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
   247 
(*Content of an element: optional leading text, then any number of items (nested element,
  CDATA section, processing instruction, comment), each followed by optional text.
  Processing instructions and comments contribute nothing to the resulting body.*)
fun parse_content xs =
  (parse_optional_text @@@
    Scan.repeats
      ((parse_element >> single ||
        parse_cdata >> (single o Text) ||
        parse_processing_instruction ||
        parse_comment)
      @@@ parse_optional_text)) xs

(*A single element: "<", name, attributes and optional blanks, followed by either "/>"
  (empty body) or ">" with content and the end tag for the same name (blanks are allowed
  before its final ">").  The two !! positions attach the err messages "Expected > or />"
  and "Expected </name>" to failures after the start of the tag has been read
  (NOTE(review): presumably !! turns failure into a hard scanner error -- confirm in Scan).*)
and parse_element xs =
  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
    (fn (name, _) =>
      !! (err (fn () => "Expected > or />"))
       ($$ "/" -- $$ ">" >> ignored ||
        $$ ">" |-- parse_content --|
          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
   266 
(*Whole document: any number of ignored items (see parse_misc), an optional DOCTYPE
  declaration, further ignored items, and then the root element, which is the result*)
val parse_document =
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
  |-- parse_element;
   270 
   271 fun parse s =
   272   (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
   273       (blanks |-- parse_document --| blanks))) (raw_explode s) of
   274     (x, []) => x
   275   | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
   276 
   277 end;
   278 
   279 
   280 
   281 (** XML as data representation language **)
   282 
(*malformed atomic value: carries the offending string*)
exception XML_ATOM of string;
(*unexpected structure of an XML body: carries the offending trees*)
exception XML_BODY of tree list;
   285 
   286 
   287 structure Encode =
   288 struct
   289 
   290 type 'a A = 'a -> string;
   291 type 'a T = 'a -> body;
   292 type 'a V = 'a -> string list * body;
   293 
   294 
   295 (* atomic values *)
   296 
   297 fun int_atom i = Value.print_int i;
   298 
   299 fun bool_atom false = "0"
   300   | bool_atom true = "1";
   301 
   302 fun unit_atom () = "";
   303 
   304 
   305 (* structural nodes *)
   306 
   307 fun node ts = Elem ((":", []), ts);
   308 
   309 fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
   310 
   311 fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
   312 
   313 
   314 (* representation of standard types *)
   315 
   316 fun tree (t: tree) = [t];
   317 
   318 fun properties props = [Elem ((":", props), [])];
   319 
   320 fun string "" = []
   321   | string s = [Text s];
   322 
   323 val int = string o int_atom;
   324 
   325 val bool = string o bool_atom;
   326 
   327 val unit = string o unit_atom;
   328 
   329 fun pair f g (x, y) = [node (f x), node (g y)];
   330 
   331 fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
   332 
   333 fun list f xs = map (node o f) xs;
   334 
   335 fun option _ NONE = []
   336   | option f (SOME x) = [node (f x)];
   337 
   338 fun variant fs x =
   339   [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
   340 
   341 end;
   342 
   343 
   344 structure Decode =
   345 struct
   346 
   347 type 'a A = string -> 'a;
   348 type 'a T = body -> 'a;
   349 type 'a V = string list * body -> 'a;
   350 
   351 
   352 (* atomic values *)
   353 
   354 fun int_atom s =
   355   Value.parse_int s
   356     handle Fail _ => raise XML_ATOM s;
   357 
   358 fun bool_atom "0" = false
   359   | bool_atom "1" = true
   360   | bool_atom s = raise XML_ATOM s;
   361 
   362 fun unit_atom "" = ()
   363   | unit_atom s = raise XML_ATOM s;
   364 
   365 
   366 (* structural nodes *)
   367 
   368 fun node (Elem ((":", []), ts)) = ts
   369   | node t = raise XML_BODY [t];
   370 
   371 fun vector atts =
   372   map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
   373 
   374 fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
   375   | tagged t = raise XML_BODY [t];
   376 
   377 
   378 (* representation of standard types *)
   379 
   380 fun tree [t] = t
   381   | tree ts = raise XML_BODY ts;
   382 
   383 fun properties [Elem ((":", props), [])] = props
   384   | properties ts = raise XML_BODY ts;
   385 
   386 fun string [] = ""
   387   | string [Text s] = s
   388   | string ts = raise XML_BODY ts;
   389 
   390 val int = int_atom o string;
   391 
   392 val bool = bool_atom o string;
   393 
   394 val unit = unit_atom o string;
   395 
   396 fun pair f g [t1, t2] = (f (node t1), g (node t2))
   397   | pair _ _ ts = raise XML_BODY ts;
   398 
   399 fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
   400   | triple _ _ _ ts = raise XML_BODY ts;
   401 
   402 fun list f ts = map (f o node) ts;
   403 
   404 fun option _ [] = NONE
   405   | option f [t] = SOME (f (node t))
   406   | option _ ts = raise XML_BODY ts;
   407 
   408 fun variant fs [t] =
   409       let
   410         val (tag, (xs, ts)) = tagged t;
   411         val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
   412       in f (xs, ts) end
   413   | variant _ ts = raise XML_BODY ts;
   414 
   415 end;
   416 
   417 end;