| author | wenzelm | 
| Tue, 24 Mar 2009 00:36:32 +0100 | |
| changeset 30681 | 27ee3f4ea99c | 
| parent 26537 | 188961eb1f08 | 
| child 32952 | aeb1e44fbc19 | 
| permissions | -rw-r--r-- | 
| 24584 | 1 | (* Title: HOL/Import/xml.ML | 
| 19064 | 2 | ID: $Id$ | 
| 3 | ||
| 26537 | 4 | Yet another version of XML support. | 
| 19064 | 5 | *) | 
| 6 | ||
| 7 | signature XML = | |
| 8 | sig | |
| 9 | val header: string | |
| 10 | val text: string -> string | |
| 11 | val text_charref: string -> string | |
| 12 | val cdata: string -> string | |
| 13 | val element: string -> (string * string) list -> string list -> string | |
| 19089 | 14 | |
| 19064 | 15 | datatype tree = | 
| 16 | Elem of string * (string * string) list * tree list | |
| 17 | | Text of string | |
| 19089 | 18 | |
| 19064 | 19 | val string_of_tree: tree -> string | 
| 20 | val tree_of_string: string -> tree | |
| 19089 | 21 | |
| 22 | val encoded_string_of_tree : tree -> string | |
| 23 | val tree_of_encoded_string : string -> tree | |
| 19064 | 24 | end; | 
| 25 | ||
| 19089 | 26 | structure XML :> XML = | 
| 19064 | 27 | struct | 
| 28 | ||
| 19093 | 29 | (*structure Seq = VectorScannerSeq | 
| 30 | structure Scan = Scanner (structure Seq = Seq)*) | |
| 31 | structure Seq = StringScannerSeq | |
| 32 | structure Scan = StringScanner | |
| 33 | ||
| 19095 | 34 | |
| 19064 | 35 | open Scan | 
| 36 | ||
| 37 | (** string based representation (small scale) **) | |
| 38 | ||
| 39 | val header = "<?xml version=\"1.0\"?>\n"; | |
| 40 | ||
| 41 | (* text and character data *) | |
| 42 | ||
| 43 | fun decode "<" = "<" | |
| 44 | | decode ">" = ">" | |
| 45 | | decode "&" = "&" | |
| 46 | | decode "'" = "'" | |
| 47 | | decode """ = "\"" | |
| 48 | | decode c = c; | |
| 49 | ||
| 50 | fun encode "<" = "<" | |
| 51 | | encode ">" = ">" | |
| 52 | | encode "&" = "&" | |
| 53 | | encode "'" = "'" | |
| 54 | | encode "\"" = """ | |
| 55 | | encode c = c; | |
| 56 | ||
| 57 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" | |
| 58 | ||
| 59 | val text = Library.translate_string encode | |
| 60 | ||
| 61 | val text_charref = translate_string encode_charref; | |
| 62 | ||
| 63 | val cdata = enclose "<![CDATA[" "]]>\n" | |
| 64 | ||
| 65 | (* elements *) | |
| 66 | ||
| 67 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; | |
| 68 | ||
| 69 | fun element name atts cs = | |
| 70 | let val elem = space_implode " " (name :: map attribute atts) in | |
| 71 | if null cs then enclose "<" "/>" elem | |
| 72 | else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name | |
| 73 | end; | |
| 74 | ||
| 75 | (** explicit XML trees **) | |
| 76 | ||
| 77 | datatype tree = | |
| 78 | Elem of string * (string * string) list * tree list | |
| 79 | | Text of string; | |
| 80 | ||
| 81 | fun string_of_tree tree = | |
| 82 | let | |
| 83 | fun string_of (Elem (name, atts, ts)) buf = | |
| 84 | let val buf' = | |
| 85 | buf |> Buffer.add "<" | |
| 86 | |> fold Buffer.add (separate " " (name :: map attribute atts)) | |
| 87 | in | |
| 88 | if null ts then | |
| 89 | buf' |> Buffer.add "/>" | |
| 90 | else | |
| 91 | buf' |> Buffer.add ">" | |
| 92 | |> fold string_of ts | |
| 93 | |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" | |
| 94 | end | |
| 95 | | string_of (Text s) buf = Buffer.add (text s) buf; | |
| 96 | in Buffer.content (string_of tree Buffer.empty) end; | |
| 97 | ||
| 98 | (** XML parsing **) | |
| 99 | ||
| 19089 | 100 | fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n) | 
| 19064 | 101 | |
| 102 | fun err s xs = | |
| 103 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ; | |
| 104 | ||
| 105 | val scan_whspc = Scan.any Symbol.is_blank; | |
| 106 | ||
| 107 | val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode; | |
| 108 | ||
| 109 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 110 | (scan_special || Scan.one Symbol.is_regular)) >> implode; | 
| 19064 | 111 | |
| 112 | val parse_cdata = Scan.this_string "<![CDATA[" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 113 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >> | 
| 19064 | 114 | implode) --| Scan.this_string "]]>"; | 
| 115 | ||
| 116 | val parse_att = | |
| 117 | scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- | |
| 118 | (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 119 | (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd); | 
| 19064 | 120 | |
| 121 | val parse_comment = Scan.this_string "<!--" -- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 122 | Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) -- | 
| 19064 | 123 | Scan.this_string "-->"; | 
| 124 | ||
| 125 | val scan_comment_whspc = | |
| 126 | (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); | |
| 127 | ||
| 128 | val parse_pi = Scan.this_string "<?" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 129 | Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --| | 
| 19064 | 130 | Scan.this_string "?>"; | 
| 131 | ||
| 132 | fun parse_content xs = | |
| 133 | ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- | |
| 134 | (Scan.repeat ((* scan_whspc |-- *) | |
| 135 | ( parse_elem >> single | |
| 136 | || parse_cdata >> (single o Text) | |
| 137 | || parse_pi >> K [] | |
| 138 | || parse_comment >> K []) -- | |
| 139 | Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] | |
| 140 | >> op @) >> List.concat) >> op @)(* --| scan_whspc*)) xs | |
| 141 | ||
| 142 | and parse_elem xs = | |
| 143 | ($$ "<" |-- scan_id -- | |
| 144 | Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => | |
| 145 | !! (err "Expected > or />") | |
| 146 | (Scan.this_string "/>" >> K [] | |
| 147 | || $$ ">" |-- parse_content --| | |
| 148 |             !! (err ("Expected </" ^ s ^ ">"))
 | |
| 149 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | |
| 150 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | |
| 151 | ||
| 152 | val parse_document = | |
| 153 | Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- | |
| 154 | (Scan.repeat (Scan.unless ($$ ">") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 155 | (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) -- | 
| 19064 | 156 | parse_elem; | 
| 157 | ||
| 158 | fun tree_of_string s = | |
| 159 | let | |
| 19089 | 160 | val seq = Seq.fromString s | 
| 19064 | 161 | val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc) | 
| 162 | val (x, toks) = scanner seq | |
| 163 | in | |
| 19089 | 164 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 19064 | 165 | end | 
| 19089 | 166 | |
| 167 | (* More efficient saving and loading of xml trees employing a proprietary external format *) | |
| 168 | ||
| 169 | fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s | |
| 170 | fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks | |
| 171 | ||
| 172 | fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l | |
| 173 | fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd | |
| 174 | ||
| 175 | fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s | |
| 176 | | write_tree (Elem (name, attrs, children)) buf = | |
| 177 | buf |> Buffer.add "E" | |
| 178 | |> write_lstring name | |
| 179 | |> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs | |
| 180 | |> write_list write_tree children | |
| 181 | ||
| 182 | fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks | |
| 183 | and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks | |
| 184 | ||
| 185 | fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty) | |
| 186 | ||
| 187 | fun tree_of_encoded_string s = | |
| 188 | let | |
| 189 | fun print (a,b) = writeln (a^" "^(string_of_int b)) | |
| 190 | 	val _ = print ("length of encoded string: ", size s)
 | |
| 191 | val _ = writeln "Seq.fromString..." | |
| 192 | val seq = timeit (fn () => Seq.fromString s) | |
| 193 | 	val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
 | |
| 194 | val scanner = !! (err "Malformed encoded xml") parse_tree | |
| 195 | val (x, toks) = scanner seq | |
| 196 | in | |
| 197 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | |
| 198 | end | |
| 199 | ||
| 19064 | 200 | end; |