src/Pure/General/xml.ML
changeset 23614 4724a6b90af4
parent 23613 3f2a6c66e089
child 23615 40ab945ef5ff
equal deleted inserted replaced
23613:3f2a6c66e089 23614:4724a6b90af4
     1 (*  Title:      Pure/General/xml.ML
       
     2     ID:         $Id$
       
     3     Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
       
     4 
       
     5 Basic support for XML.
       
     6 *)
       
     7 
       
     8 signature XML =
       
     9 sig
       
    10   (* string functions *)
       
    11   val header: string
       
    12   val text: string -> string
       
    13   val text_charref: string -> string
       
    14   val cdata: string -> string
       
    15   type attributes = (string * string) list
       
    16   val element: string -> attributes -> string list -> string
       
    17   (* tree functions *)
       
    18   datatype tree =
       
    19       Elem of string * attributes * tree list
       
    20     | Text of string    (* native string, subject to XML.text on output *)
       
    21     | Rawtext of string (* XML string: output directly, not parsed      *)
       
    22   type content = tree list
       
    23   type element = string * attributes * content
       
    24   val string_of_tree: tree -> string
       
    25   val buffer_of_tree: tree -> Buffer.T
       
    26   val parse_string : string -> string option
       
    27   val parse_content: string list -> tree list * string list
       
    28   val parse_elem: string list -> tree * string list
       
    29   val parse_document: string list -> (string option * tree) * string list
       
    30   val tree_of_string: string -> tree
       
    31   val scan_comment_whspc : string list -> unit * string list
       
    32 end;
       
    33 
       
    34 structure XML: XML =
       
    35 struct
       
    36 
       
    37 (** string based representation (small scale) **)
       
    38 
       
    39 val header = "<?xml version=\"1.0\"?>\n";
       
    40 
       
    41 
       
    42 (* text and character data *)
       
    43 
       
    44 fun decode "&lt;" = "<"
       
    45   | decode "&gt;" = ">"
       
    46   | decode "&amp;" = "&"
       
    47   | decode "&apos;" = "'"
       
    48   | decode "&quot;" = "\""
       
    49   | decode c = c;
       
    50 
       
    51 fun encode "<" = "&lt;"
       
    52   | encode ">" = "&gt;"
       
    53   | encode "&" = "&amp;"
       
    54   | encode "'" = "&apos;"
       
    55   | encode "\"" = "&quot;"
       
    56   | encode c = c;
       
    57 
       
    58 fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";"
       
    59 
       
    60 val text = Library.translate_string encode
       
    61 
       
    62 val text_charref = translate_string encode_charref;
       
    63 
       
    64 val cdata = enclose "<![CDATA[" "]]>\n"
       
    65 
       
    66 
       
    67 (* elements *)
       
    68 
       
    69 fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";
       
    70 
       
    71 fun element name atts cs =
       
    72   let val elem = space_implode " " (name :: map attribute atts) in
       
    73     if null cs then enclose "<" "/>" elem
       
    74     else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name
       
    75   end;
       
    76 
       
    77 
       
    78 
       
    79 
       
    80 (** explicit XML trees **)
       
    81 
       
    82 type attributes = (string * string) list
       
    83 
       
    84 datatype tree =
       
    85     Elem of string * attributes * tree list
       
    86   | Text of string
       
    87   | Rawtext of string;
       
    88 
       
    89 type content = tree list
       
    90 
       
    91 type element = string * attributes * content
       
    92 
       
    93 fun buffer_of_tree tree =
       
    94   let
       
    95     fun string_of (Elem (name, atts, ts)) buf =
       
    96         let val buf' =
       
    97           buf |> Buffer.add "<"
       
    98           |> fold Buffer.add (separate " " (name :: map attribute atts))
       
    99         in
       
   100           if null ts then
       
   101             buf' |> Buffer.add "/>"
       
   102           else
       
   103             buf' |> Buffer.add ">"
       
   104             |> fold string_of ts
       
   105             |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">"
       
   106         end
       
   107       | string_of (Text s) buf = Buffer.add (text s) buf
       
   108       | string_of (Rawtext s) buf = Buffer.add s buf;
       
   109   in string_of tree Buffer.empty end;
       
   110 
       
   111 val string_of_tree = Buffer.content o buffer_of_tree;
       
   112 
       
   113 
       
   114 
       
   115 (** XML parsing **)
       
   116 
       
   117 fun err s (xs, _) =
       
   118   "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
       
   119 
       
   120 val scan_whspc = Scan.many Symbol.is_blank;
       
   121 
       
   122 val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;
       
   123 
       
   124 val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")
       
   125   (scan_special || Scan.one Symbol.not_eof)) >> implode;
       
   126 
       
   127 val parse_string = Scan.read Symbol.stopper parse_chars o explode;
       
   128 
       
   129 val parse_cdata = Scan.this_string "<![CDATA[" |--
       
   130   (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >>
       
   131     implode) --| Scan.this_string "]]>";
       
   132 
       
   133 val parse_att =
       
   134   Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc --
       
   135   (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s)
       
   136     (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd);
       
   137 
       
   138 val parse_comment = Scan.this_string "<!--" --
       
   139   Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) --
       
   140   Scan.this_string "-->";
       
   141 
       
   142 val scan_comment_whspc =
       
   143   (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));
       
   144 
       
   145 val parse_pi = Scan.this_string "<?" |--
       
   146   Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --|
       
   147   Scan.this_string "?>";
       
   148 
       
   149 fun parse_content xs =
       
   150   ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --
       
   151     (Scan.repeat ((* scan_whspc |-- *)
       
   152        (   parse_elem >> single
       
   153         || parse_cdata >> (single o Text)
       
   154         || parse_pi >> K []
       
   155         || parse_comment >> K []) --
       
   156        Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []
       
   157          >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs
       
   158 
       
   159 and parse_elem xs =
       
   160   ($$ "<" |-- Symbol.scan_id --
       
   161     Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
       
   162       !! (err "Expected > or />")
       
   163         (Scan.this_string "/>" >> K []
       
   164          || $$ ">" |-- parse_content --|
       
   165             !! (err ("Expected </" ^ s ^ ">"))
       
   166               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
       
   167     (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;
       
   168 
       
   169 val parse_document =
       
   170   Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--
       
   171     (Scan.repeat (Scan.unless ($$ ">")
       
   172       (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) --
       
   173   parse_elem;
       
   174 
       
   175 fun tree_of_string s =
       
   176   (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
       
   177       (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of
       
   178     (x, []) => x
       
   179   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
       
   180 
       
   181 end;