| author | huffman | 
| Wed, 01 Aug 2007 19:59:12 +0200 | |
| changeset 24121 | a93b0f4df838 | 
| parent 23784 | 75e6b9dd5336 | 
| child 24584 | 01e83ffa6c54 | 
| permissions | -rw-r--r-- | 
| 19064 | 1 | (* Title: Pure/General/xml.ML | 
| 2 | ID: $Id$ | |
| 3 | Author: David Aspinall, Stefan Berghofer and Markus Wenzel | |
| 4 | ||
| 5 | Basic support for XML. | |
| 6 | *) | |
| 7 | ||
| 8 | signature XML = | |
| 9 | sig | |
| 10 | val header: string | |
| 11 | val text: string -> string | |
| 12 | val text_charref: string -> string | |
| 13 | val cdata: string -> string | |
| 14 | val element: string -> (string * string) list -> string list -> string | |
| 19089 | 15 | |
| 19064 | 16 | datatype tree = | 
| 17 | Elem of string * (string * string) list * tree list | |
| 18 | | Text of string | |
| 19089 | 19 | |
| 19064 | 20 | val string_of_tree: tree -> string | 
| 21 | val tree_of_string: string -> tree | |
| 19089 | 22 | |
| 23 | val encoded_string_of_tree : tree -> string | |
| 24 | val tree_of_encoded_string : string -> tree | |
| 19064 | 25 | end; | 
| 26 | ||
| 19089 | 27 | structure XML :> XML = | 
| 19064 | 28 | struct | 
| 29 | ||
| 19093 | 30 | (*structure Seq = VectorScannerSeq | 
| 31 | structure Scan = Scanner (structure Seq = Seq)*) | |
| 32 | structure Seq = StringScannerSeq | |
| 33 | structure Scan = StringScanner | |
| 34 | ||
| 19095 | 35 | |
| 19064 | 36 | open Scan | 
| 37 | ||
| 38 | (** string based representation (small scale) **) | |
| 39 | ||
| 40 | val header = "<?xml version=\"1.0\"?>\n"; | |
| 41 | ||
| 42 | (* text and character data *) | |
| 43 | ||
| 44 | fun decode "<" = "<" | |
| 45 | | decode ">" = ">" | |
| 46 | | decode "&" = "&" | |
| 47 | | decode "'" = "'" | |
| 48 | | decode """ = "\"" | |
| 49 | | decode c = c; | |
| 50 | ||
| 51 | fun encode "<" = "<" | |
| 52 | | encode ">" = ">" | |
| 53 | | encode "&" = "&" | |
| 54 | | encode "'" = "'" | |
| 55 | | encode "\"" = """ | |
| 56 | | encode c = c; | |
| 57 | ||
| 58 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" | |
| 59 | ||
| 60 | val text = Library.translate_string encode | |
| 61 | ||
| 62 | val text_charref = translate_string encode_charref; | |
| 63 | ||
| 64 | val cdata = enclose "<![CDATA[" "]]>\n" | |
| 65 | ||
| 66 | (* elements *) | |
| 67 | ||
| 68 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; | |
| 69 | ||
| 70 | fun element name atts cs = | |
| 71 | let val elem = space_implode " " (name :: map attribute atts) in | |
| 72 | if null cs then enclose "<" "/>" elem | |
| 73 | else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name | |
| 74 | end; | |
| 75 | ||
| 76 | (** explicit XML trees **) | |
| 77 | ||
| 78 | datatype tree = | |
| 79 | Elem of string * (string * string) list * tree list | |
| 80 | | Text of string; | |
| 81 | ||
| 82 | fun string_of_tree tree = | |
| 83 | let | |
| 84 | fun string_of (Elem (name, atts, ts)) buf = | |
| 85 | let val buf' = | |
| 86 | buf |> Buffer.add "<" | |
| 87 | |> fold Buffer.add (separate " " (name :: map attribute atts)) | |
| 88 | in | |
| 89 | if null ts then | |
| 90 | buf' |> Buffer.add "/>" | |
| 91 | else | |
| 92 | buf' |> Buffer.add ">" | |
| 93 | |> fold string_of ts | |
| 94 | |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" | |
| 95 | end | |
| 96 | | string_of (Text s) buf = Buffer.add (text s) buf; | |
| 97 | in Buffer.content (string_of tree Buffer.empty) end; | |
| 98 | ||
| 99 | (** XML parsing **) | |
| 100 | ||
| 19089 | 101 | fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n) | 
| 19064 | 102 | |
| 103 | fun err s xs = | |
| 104 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ; | |
| 105 | ||
| 106 | val scan_whspc = Scan.any Symbol.is_blank; | |
| 107 | ||
| 108 | val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode; | |
| 109 | ||
| 110 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 111 | (scan_special || Scan.one Symbol.is_regular)) >> implode; | 
| 19064 | 112 | |
| 113 | val parse_cdata = Scan.this_string "<![CDATA[" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 114 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >> | 
| 19064 | 115 | implode) --| Scan.this_string "]]>"; | 
| 116 | ||
| 117 | val parse_att = | |
| 118 | scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- | |
| 119 | (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 120 | (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd); | 
| 19064 | 121 | |
| 122 | val parse_comment = Scan.this_string "<!--" -- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 123 | Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) -- | 
| 19064 | 124 | Scan.this_string "-->"; | 
| 125 | ||
| 126 | val scan_comment_whspc = | |
| 127 | (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); | |
| 128 | ||
| 129 | val parse_pi = Scan.this_string "<?" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 130 | Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --| | 
| 19064 | 131 | Scan.this_string "?>"; | 
| 132 | ||
| 133 | fun parse_content xs = | |
| 134 | ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- | |
| 135 | (Scan.repeat ((* scan_whspc |-- *) | |
| 136 | ( parse_elem >> single | |
| 137 | || parse_cdata >> (single o Text) | |
| 138 | || parse_pi >> K [] | |
| 139 | || parse_comment >> K []) -- | |
| 140 | Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] | |
| 141 | >> op @) >> List.concat) >> op @)(* --| scan_whspc*)) xs | |
| 142 | ||
| 143 | and parse_elem xs = | |
| 144 | ($$ "<" |-- scan_id -- | |
| 145 | Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => | |
| 146 | !! (err "Expected > or />") | |
| 147 | (Scan.this_string "/>" >> K [] | |
| 148 | || $$ ">" |-- parse_content --| | |
| 149 |             !! (err ("Expected </" ^ s ^ ">"))
 | |
| 150 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | |
| 151 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | |
| 152 | ||
| 153 | val parse_document = | |
| 154 | Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- | |
| 155 | (Scan.repeat (Scan.unless ($$ ">") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 156 | (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) -- | 
| 19064 | 157 | parse_elem; | 
| 158 | ||
| 159 | fun tree_of_string s = | |
| 160 | let | |
| 19089 | 161 | val seq = Seq.fromString s | 
| 19064 | 162 | val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc) | 
| 163 | val (x, toks) = scanner seq | |
| 164 | in | |
| 19089 | 165 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 19064 | 166 | end | 
| 19089 | 167 | |
| 168 | (* More efficient saving and loading of xml trees employing a proprietary external format *) | |
| 169 | ||
| 170 | fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s | |
| 171 | fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks | |
| 172 | ||
| 173 | fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l | |
| 174 | fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd | |
| 175 | ||
| 176 | fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s | |
| 177 | | write_tree (Elem (name, attrs, children)) buf = | |
| 178 | buf |> Buffer.add "E" | |
| 179 | |> write_lstring name | |
| 180 | |> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs | |
| 181 | |> write_list write_tree children | |
| 182 | ||
| 183 | fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks | |
| 184 | and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks | |
| 185 | ||
| 186 | fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty) | |
| 187 | ||
| 188 | fun tree_of_encoded_string s = | |
| 189 | let | |
| 190 | fun print (a,b) = writeln (a^" "^(string_of_int b)) | |
| 191 | 	val _ = print ("length of encoded string: ", size s)
 | |
| 192 | val _ = writeln "Seq.fromString..." | |
| 193 | val seq = timeit (fn () => Seq.fromString s) | |
| 194 | 	val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
 | |
| 195 | val scanner = !! (err "Malformed encoded xml") parse_tree | |
| 196 | val (x, toks) = scanner seq | |
| 197 | in | |
| 198 | 	if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | |
| 199 | end | |
| 200 | ||
| 19064 | 201 | end; |