| 19064 |      1 | (*  Title:      Pure/General/xml.ML
 | 
|  |      2 |     ID:         $Id$
 | 
|  |      3 |     Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
 | 
|  |      4 | 
 | 
|  |      5 | Basic support for XML.
 | 
|  |      6 | *)
 | 
|  |      7 | 
 | 
|  |      8 | signature XML =
 | 
|  |      9 | sig
 | 
|  |     10 |   val header: string
 | 
|  |     11 |   val text: string -> string
 | 
|  |     12 |   val text_charref: string -> string
 | 
|  |     13 |   val cdata: string -> string
 | 
|  |     14 |   val element: string -> (string * string) list -> string list -> string
 | 
| 19089 |     15 |   
 | 
| 19064 |     16 |   datatype tree =
 | 
|  |     17 |       Elem of string * (string * string) list * tree list
 | 
|  |     18 |     | Text of string
 | 
| 19089 |     19 |   
 | 
| 19064 |     20 |   val string_of_tree: tree -> string
 | 
|  |     21 |   val tree_of_string: string -> tree
 | 
| 19089 |     22 | 
 | 
|  |     23 |   val encoded_string_of_tree : tree -> string
 | 
|  |     24 |   val tree_of_encoded_string : string -> tree
 | 
| 19064 |     25 | end;
 | 
|  |     26 | 
 | 
| 19089 |     27 | structure XML :> XML =
 | 
| 19064 |     28 | struct
 | 
|  |     29 | 
 | 
| 19093 |     30 | (*structure Seq = VectorScannerSeq
 | 
|  |     31 | structure Scan = Scanner (structure Seq = Seq)*)
 | 
|  |     32 | structure Seq = StringScannerSeq
 | 
|  |     33 | structure Scan = StringScanner
 | 
|  |     34 | 
 | 
| 19095 |     35 | 
 | 
| 19064 |     36 | open Scan
 | 
|  |     37 | 
 | 
|  |     38 | (** string based representation (small scale) **)
 | 
|  |     39 | 
 | 
|  |     40 | val header = "<?xml version=\"1.0\"?>\n";
 | 
|  |     41 | 
 | 
|  |     42 | (* text and character data *)
 | 
|  |     43 | 
 | 
|  |     44 | fun decode "<" = "<"
 | 
|  |     45 |   | decode ">" = ">"
 | 
|  |     46 |   | decode "&" = "&"
 | 
|  |     47 |   | decode "'" = "'"
 | 
|  |     48 |   | decode """ = "\""
 | 
|  |     49 |   | decode c = c;
 | 
|  |     50 | 
 | 
|  |     51 | fun encode "<" = "<"
 | 
|  |     52 |   | encode ">" = ">"
 | 
|  |     53 |   | encode "&" = "&"
 | 
|  |     54 |   | encode "'" = "'"
 | 
|  |     55 |   | encode "\"" = """
 | 
|  |     56 |   | encode c = c;
 | 
|  |     57 | 
 | 
|  |     58 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";"
 | 
|  |     59 | 
 | 
|  |     60 | val text = Library.translate_string encode
 | 
|  |     61 | 
 | 
|  |     62 | val text_charref = translate_string encode_charref;
 | 
|  |     63 | 
 | 
|  |     64 | val cdata = enclose "<![CDATA[" "]]>\n"
 | 
|  |     65 | 
 | 
|  |     66 | (* elements *)
 | 
|  |     67 | 
 | 
|  |     68 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";
 | 
|  |     69 | 
 | 
|  |     70 | fun element name atts cs =
 | 
|  |     71 |   let val elem = space_implode " " (name :: map attribute atts) in
 | 
|  |     72 |     if null cs then enclose "<" "/>" elem
 | 
|  |     73 |     else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name
 | 
|  |     74 |   end;
 | 
|  |     75 | 
 | 
|  |     76 | (** explicit XML trees **)
 | 
|  |     77 | 
 | 
|  |     78 | datatype tree =
 | 
|  |     79 |     Elem of string * (string * string) list * tree list
 | 
|  |     80 |   | Text of string;
 | 
|  |     81 | 
 | 
|  |     82 | fun string_of_tree tree =
 | 
|  |     83 |   let
 | 
|  |     84 |     fun string_of (Elem (name, atts, ts)) buf =
 | 
|  |     85 |         let val buf' =
 | 
|  |     86 |           buf |> Buffer.add "<"
 | 
|  |     87 |           |> fold Buffer.add (separate " " (name :: map attribute atts))
 | 
|  |     88 |         in
 | 
|  |     89 |           if null ts then
 | 
|  |     90 |             buf' |> Buffer.add "/>"
 | 
|  |     91 |           else
 | 
|  |     92 |             buf' |> Buffer.add ">"
 | 
|  |     93 |             |> fold string_of ts
 | 
|  |     94 |             |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">"
 | 
|  |     95 |         end
 | 
|  |     96 |       | string_of (Text s) buf = Buffer.add (text s) buf;
 | 
|  |     97 |   in Buffer.content (string_of tree Buffer.empty) end;
 | 
|  |     98 | 
 | 
|  |     99 | (** XML parsing **)
 | 
|  |    100 | 
 | 
| 19089 |    101 | fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n)
 | 
| 19064 |    102 | 
 | 
|  |    103 | fun err s xs =
 | 
|  |    104 |   "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ;
 | 
|  |    105 | 
 | 
|  |    106 | val scan_whspc = Scan.any Symbol.is_blank;
 | 
|  |    107 | 
 | 
|  |    108 | val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode;
 | 
|  |    109 | 
 | 
|  |    110 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")
 | 
|  |    111 |   (scan_special || Scan.one Symbol.not_eof)) >> implode;
 | 
|  |    112 | 
 | 
|  |    113 | val parse_cdata = Scan.this_string "<![CDATA[" |--
 | 
|  |    114 |   (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >>
 | 
|  |    115 |     implode) --| Scan.this_string "]]>";
 | 
|  |    116 | 
 | 
|  |    117 | val parse_att =
 | 
|  |    118 |     scan_id --| scan_whspc --| $$ "=" --| scan_whspc --
 | 
|  |    119 |     (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s)
 | 
|  |    120 |     (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ s) >> snd);
 | 
|  |    121 | 
 | 
|  |    122 | val parse_comment = Scan.this_string "<!--" --
 | 
|  |    123 |   Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.not_eof)) --
 | 
|  |    124 |   Scan.this_string "-->";
 | 
|  |    125 | 
 | 
|  |    126 | val scan_comment_whspc = 
 | 
|  |    127 |     (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));
 | 
|  |    128 | 
 | 
|  |    129 | val parse_pi = Scan.this_string "<?" |--
 | 
|  |    130 |   Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.not_eof)) --|
 | 
|  |    131 |   Scan.this_string "?>";
 | 
|  |    132 | 
 | 
|  |    133 | fun parse_content xs =
 | 
|  |    134 |   ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --
 | 
|  |    135 |     (Scan.repeat ((* scan_whspc |-- *)
 | 
|  |    136 |        (   parse_elem >> single
 | 
|  |    137 |         || parse_cdata >> (single o Text)
 | 
|  |    138 |         || parse_pi >> K []
 | 
|  |    139 |         || parse_comment >> K []) --
 | 
|  |    140 |        Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []
 | 
|  |    141 |          >> op @) >> List.concat) >> op @)(* --| scan_whspc*)) xs
 | 
|  |    142 | 
 | 
|  |    143 | and parse_elem xs =
 | 
|  |    144 |   ($$ "<" |-- scan_id --
 | 
|  |    145 |     Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
 | 
|  |    146 |       !! (err "Expected > or />")
 | 
|  |    147 |         (Scan.this_string "/>" >> K []
 | 
|  |    148 |          || $$ ">" |-- parse_content --|
 | 
|  |    149 |             !! (err ("Expected </" ^ s ^ ">"))
 | 
|  |    150 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | 
|  |    151 |     (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;
 | 
|  |    152 | 
 | 
|  |    153 | val parse_document =
 | 
|  |    154 |   Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--
 | 
|  |    155 |     (Scan.repeat (Scan.unless ($$ ">")
 | 
|  |    156 |       (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) --
 | 
|  |    157 |   parse_elem;
 | 
|  |    158 | 
 | 
|  |    159 | fun tree_of_string s =
 | 
|  |    160 |     let
 | 
| 19089 |    161 | 	val seq = Seq.fromString s
 | 
| 19064 |    162 | 	val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc)
 | 
|  |    163 | 	val (x, toks) = scanner seq
 | 
|  |    164 |     in
 | 
| 19089 |    165 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 19064 |    166 |     end
 | 
| 19089 |    167 | 
 | 
|  |    168 | (* More efficient saving and loading of xml trees employing a proprietary external format *)
 | 
|  |    169 | 
 | 
|  |    170 | fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s
 | 
|  |    171 | fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks
 | 
|  |    172 | 
 | 
|  |    173 | fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l     
 | 
|  |    174 | fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd
 | 
|  |    175 | 
 | 
|  |    176 | fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s
 | 
|  |    177 |   | write_tree (Elem (name, attrs, children)) buf = 
 | 
|  |    178 |     buf |> Buffer.add "E" 
 | 
|  |    179 | 	|> write_lstring name 
 | 
|  |    180 | 	|> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs 
 | 
|  |    181 | 	|> write_list write_tree children
 | 
|  |    182 | 
 | 
|  |    183 | fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks
 | 
|  |    184 | and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks
 | 
|  |    185 | 
 | 
|  |    186 | fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty)
 | 
|  |    187 | 
 | 
|  |    188 | fun tree_of_encoded_string s = 
 | 
|  |    189 |     let
 | 
|  |    190 | 	fun print (a,b) = writeln (a^" "^(string_of_int b))
 | 
|  |    191 | 	val _ = print ("length of encoded string: ", size s)
 | 
|  |    192 | 	val _ = writeln "Seq.fromString..."
 | 
|  |    193 | 	val seq = timeit (fn () => Seq.fromString s)
 | 
|  |    194 | 	val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
 | 
|  |    195 | 	val scanner = !! (err "Malformed encoded xml") parse_tree
 | 
|  |    196 | 	val (x, toks) = scanner seq
 | 
|  |    197 |     in
 | 
|  |    198 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
|  |    199 |     end	       
 | 
|  |    200 | 
 | 
| 19064 |    201 | end;
 |