haftmann@24584: (* Title: HOL/Import/xml.ML obua@19064: wenzelm@26537: Yet another version of XML support. obua@19064: *) obua@19064: obua@19064: signature XML = obua@19064: sig obua@19064: val header: string obua@19064: val text: string -> string obua@19064: val text_charref: string -> string obua@19064: val cdata: string -> string obua@19064: val element: string -> (string * string) list -> string list -> string obua@19089: obua@19064: datatype tree = obua@19064: Elem of string * (string * string) list * tree list obua@19064: | Text of string obua@19089: obua@19064: val string_of_tree: tree -> string obua@19064: val tree_of_string: string -> tree obua@19089: obua@19089: val encoded_string_of_tree : tree -> string obua@19089: val tree_of_encoded_string : string -> tree obua@19064: end; obua@19064: obua@19089: structure XML :> XML = obua@19064: struct obua@19064: obua@19093: (*structure Seq = VectorScannerSeq obua@19093: structure Scan = Scanner (structure Seq = Seq)*) obua@19093: structure Seq = StringScannerSeq obua@19093: structure Scan = StringScanner obua@19093: obua@19095: obua@19064: open Scan obua@19064: obua@19064: (** string based representation (small scale) **) obua@19064: obua@19064: val header = "\n"; obua@19064: obua@19064: (* text and character data *) obua@19064: obua@19064: fun decode "<" = "<" obua@19064: | decode ">" = ">" obua@19064: | decode "&" = "&" obua@19064: | decode "'" = "'" obua@19064: | decode """ = "\"" obua@19064: | decode c = c; obua@19064: obua@19064: fun encode "<" = "<" obua@19064: | encode ">" = ">" obua@19064: | encode "&" = "&" obua@19064: | encode "'" = "'" obua@19064: | encode "\"" = """ obua@19064: | encode c = c; obua@19064: obua@19064: fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" obua@19064: obua@19064: val text = Library.translate_string encode obua@19064: obua@19064: val text_charref = translate_string encode_charref; obua@19064: obua@19064: val cdata = enclose "\n" obua@19064: obua@19064: (* elements *) obua@19064: obua@19064: fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; obua@19064: obua@19064: fun element name atts cs = obua@19064: let val elem = space_implode " " (name :: map attribute atts) in obua@19064: if null cs then enclose "<" "/>" elem obua@19064: else enclose "<" ">" elem ^ implode cs ^ enclose "" name obua@19064: end; obua@19064: obua@19064: (** explicit XML trees **) obua@19064: obua@19064: datatype tree = obua@19064: Elem of string * (string * string) list * tree list obua@19064: | Text of string; obua@19064: obua@19064: fun string_of_tree tree = obua@19064: let obua@19064: fun string_of (Elem (name, atts, ts)) buf = obua@19064: let val buf' = obua@19064: buf |> Buffer.add "<" obua@19064: |> fold Buffer.add (separate " " (name :: map attribute atts)) obua@19064: in obua@19064: if null ts then obua@19064: buf' |> Buffer.add "/>" obua@19064: else obua@19064: buf' |> Buffer.add ">" obua@19064: |> fold string_of ts obua@19064: |> Buffer.add " Buffer.add name |> Buffer.add ">" obua@19064: end obua@19064: | string_of (Text s) buf = Buffer.add (text s) buf; obua@19064: in Buffer.content (string_of tree Buffer.empty) end; obua@19064: obua@19064: (** XML parsing **) obua@19064: obua@19089: fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n) obua@19064: obua@19064: fun err s xs = obua@19064: "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ; obua@19064: obua@19064: val scan_whspc = Scan.any Symbol.is_blank; obua@19064: obua@19064: val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode; obua@19064: obua@19064: val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") wenzelm@23784: (scan_special || Scan.one Symbol.is_regular)) >> implode; obua@19064: obua@19064: val parse_cdata = Scan.this_string "") (Scan.one Symbol.is_regular)) >> obua@19064: implode) --| Scan.this_string "]]>"; obua@19064: obua@19064: val parse_att = obua@19064: scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- obua@19064: (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) wenzelm@23784: (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd); obua@19064: obua@19064: val parse_comment = Scan.this_string "") (Scan.one Symbol.is_regular)) -- obua@19064: Scan.this_string "-->"; obua@19064: obua@19064: val scan_comment_whspc = obua@19064: (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); obua@19064: obua@19064: val parse_pi = Scan.this_string "") (Scan.one Symbol.is_regular)) --| obua@19064: Scan.this_string "?>"; obua@19064: obua@19064: fun parse_content xs = obua@19064: ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- obua@19064: (Scan.repeat ((* scan_whspc |-- *) obua@19064: ( parse_elem >> single obua@19064: || parse_cdata >> (single o Text) obua@19064: || parse_pi >> K [] obua@19064: || parse_comment >> K []) -- obua@19064: Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] wenzelm@32952: >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs obua@19064: obua@19064: and parse_elem xs = obua@19064: ($$ "<" |-- scan_id -- obua@19064: Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => obua@19064: !! (err "Expected > or />") obua@19064: (Scan.this_string "/>" >> K [] obua@19064: || $$ ">" |-- parse_content --| obua@19064: !! (err ("Expected ")) obua@19064: (Scan.this_string (""))) >> obua@19064: (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; obua@19064: obua@19064: val parse_document = obua@19064: Scan.option (Scan.this_string "") wenzelm@23784: (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) -- obua@19064: parse_elem; obua@19064: obua@19064: fun tree_of_string s = obua@19064: let wenzelm@32960: val seq = Seq.fromString s wenzelm@32960: val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc) wenzelm@32960: val (x, toks) = scanner seq obua@19064: in wenzelm@32960: if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'") obua@19064: end obua@19089: obua@19089: (* More efficient saving and loading of xml trees employing a proprietary external format *) obua@19089: obua@19089: fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s obua@19089: fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks obua@19089: obua@19089: fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l obua@19089: fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd obua@19089: obua@19089: fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s obua@19089: | write_tree (Elem (name, attrs, children)) buf = obua@19089: buf |> Buffer.add "E" wenzelm@32960: |> write_lstring name wenzelm@32960: |> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs wenzelm@32960: |> write_list write_tree children obua@19089: obua@19089: fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks obua@19089: and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks obua@19089: obua@19089: fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty) obua@19089: obua@19089: fun tree_of_encoded_string s = obua@19089: let wenzelm@32960: fun print (a,b) = writeln (a^" "^(string_of_int b)) wenzelm@32960: val _ = print ("length of encoded string: ", size s) wenzelm@32960: val _ = writeln "Seq.fromString..." wenzelm@32960: val seq = timeit (fn () => Seq.fromString s) wenzelm@32960: val _ = print ("length of sequence", timeit (fn () => Seq.length seq)) wenzelm@32960: val scanner = !! (err "Malformed encoded xml") parse_tree wenzelm@32960: val (x, toks) = scanner seq obua@19089: in wenzelm@32960: if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'") wenzelm@32960: end obua@19089: obua@19064: end;