src/HOL/Import/xml.ML
author wenzelm
Fri Dec 17 17:43:54 2010 +0100 (2010-12-17)
changeset 41229 d797baa3d57c
parent 32960 69916a850301
child 41491 a2ad5b824051
permissions -rw-r--r--
replaced command 'nonterminals' by slightly modernized version 'nonterminal';
     1 (*  Title:      HOL/Import/xml.ML
     2 
     3 Yet another version of XML support.
     4 *)
     5 
     6 signature XML =
     7 sig
     8   val header: string
     9   val text: string -> string
    10   val text_charref: string -> string
    11   val cdata: string -> string
    12   val element: string -> (string * string) list -> string list -> string
    13   
    14   datatype tree =
    15       Elem of string * (string * string) list * tree list
    16     | Text of string
    17   
    18   val string_of_tree: tree -> string
    19   val tree_of_string: string -> tree
    20 
    21   val encoded_string_of_tree : tree -> string
    22   val tree_of_encoded_string : string -> tree
    23 end;
    24 
    25 structure XML :> XML =
    26 struct
    27 
    28 (*structure Seq = VectorScannerSeq
    29 structure Scan = Scanner (structure Seq = Seq)*)
    30 structure Seq = StringScannerSeq
    31 structure Scan = StringScanner
    32 
    33 
    34 open Scan
    35 
    36 (** string based representation (small scale) **)
    37 
    38 val header = "<?xml version=\"1.0\"?>\n";
    39 
    40 (* text and character data *)
    41 
    42 fun decode "&lt;" = "<"
    43   | decode "&gt;" = ">"
    44   | decode "&amp;" = "&"
    45   | decode "&apos;" = "'"
    46   | decode "&quot;" = "\""
    47   | decode c = c;
    48 
    49 fun encode "<" = "&lt;"
    50   | encode ">" = "&gt;"
    51   | encode "&" = "&amp;"
    52   | encode "'" = "&apos;"
    53   | encode "\"" = "&quot;"
    54   | encode c = c;
    55 
    56 fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";"
    57 
    58 val text = Library.translate_string encode
    59 
    60 val text_charref = translate_string encode_charref;
    61 
    62 val cdata = enclose "<![CDATA[" "]]>\n"
    63 
    64 (* elements *)
    65 
    66 fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";
    67 
    68 fun element name atts cs =
    69   let val elem = space_implode " " (name :: map attribute atts) in
    70     if null cs then enclose "<" "/>" elem
    71     else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name
    72   end;
    73 
    74 (** explicit XML trees **)
    75 
    76 datatype tree =
    77     Elem of string * (string * string) list * tree list
    78   | Text of string;
    79 
    80 fun string_of_tree tree =
    81   let
    82     fun string_of (Elem (name, atts, ts)) buf =
    83         let val buf' =
    84           buf |> Buffer.add "<"
    85           |> fold Buffer.add (separate " " (name :: map attribute atts))
    86         in
    87           if null ts then
    88             buf' |> Buffer.add "/>"
    89           else
    90             buf' |> Buffer.add ">"
    91             |> fold string_of ts
    92             |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">"
    93         end
    94       | string_of (Text s) buf = Buffer.add (text s) buf;
    95   in Buffer.content (string_of tree Buffer.empty) end;
    96 
    97 (** XML parsing **)
    98 
    99 fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n)
   100 
   101 fun err s xs =
   102   "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ;
   103 
   104 val scan_whspc = Scan.any Symbol.is_blank;
   105 
   106 val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode;
   107 
   108 val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")
   109   (scan_special || Scan.one Symbol.is_regular)) >> implode;
   110 
   111 val parse_cdata = Scan.this_string "<![CDATA[" |--
   112   (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >>
   113     implode) --| Scan.this_string "]]>";
   114 
   115 val parse_att =
   116     scan_id --| scan_whspc --| $$ "=" --| scan_whspc --
   117     (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s)
   118     (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd);
   119 
   120 val parse_comment = Scan.this_string "<!--" --
   121   Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) --
   122   Scan.this_string "-->";
   123 
   124 val scan_comment_whspc = 
   125     (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));
   126 
   127 val parse_pi = Scan.this_string "<?" |--
   128   Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --|
   129   Scan.this_string "?>";
   130 
   131 fun parse_content xs =
   132   ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --
   133     (Scan.repeat ((* scan_whspc |-- *)
   134        (   parse_elem >> single
   135         || parse_cdata >> (single o Text)
   136         || parse_pi >> K []
   137         || parse_comment >> K []) --
   138        Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []
   139          >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs
   140 
   141 and parse_elem xs =
   142   ($$ "<" |-- scan_id --
   143     Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
   144       !! (err "Expected > or />")
   145         (Scan.this_string "/>" >> K []
   146          || $$ ">" |-- parse_content --|
   147             !! (err ("Expected </" ^ s ^ ">"))
   148               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
   149     (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;
   150 
   151 val parse_document =
   152   Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--
   153     (Scan.repeat (Scan.unless ($$ ">")
   154       (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) --
   155   parse_elem;
   156 
   157 fun tree_of_string s =
   158     let
   159         val seq = Seq.fromString s
   160         val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc)
   161         val (x, toks) = scanner seq
   162     in
   163         if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
   164     end
   165 
   166 (* More efficient saving and loading of xml trees employing a proprietary external format *)
   167 
   168 fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s
   169 fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks
   170 
   171 fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l     
   172 fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd
   173 
   174 fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s
   175   | write_tree (Elem (name, attrs, children)) buf = 
   176     buf |> Buffer.add "E" 
   177         |> write_lstring name 
   178         |> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs 
   179         |> write_list write_tree children
   180 
   181 fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks
   182 and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks
   183 
   184 fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty)
   185 
   186 fun tree_of_encoded_string s = 
   187     let
   188         fun print (a,b) = writeln (a^" "^(string_of_int b))
   189         val _ = print ("length of encoded string: ", size s)
   190         val _ = writeln "Seq.fromString..."
   191         val seq = timeit (fn () => Seq.fromString s)
   192         val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
   193         val scanner = !! (err "Malformed encoded xml") parse_tree
   194         val (x, toks) = scanner seq
   195     in
   196         if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
   197     end        
   198 
   199 end;