| author | haftmann | 
| Wed, 27 Jan 2010 14:03:08 +0100 | |
| changeset 34972 | cc1d4c3ca9db | 
| parent 32960 | 69916a850301 | 
| child 41491 | a2ad5b824051 | 
| permissions | -rw-r--r-- | 
| 24584 | 1 | (* Title: HOL/Import/xml.ML | 
| 19064 | 2 | |
| 26537 | 3 | Yet another version of XML support. | 
| 19064 | 4 | *) | 
| 5 | ||
| 6 | signature XML = | |
| 7 | sig | |
| 8 | val header: string | |
| 9 | val text: string -> string | |
| 10 | val text_charref: string -> string | |
| 11 | val cdata: string -> string | |
| 12 | val element: string -> (string * string) list -> string list -> string | |
| 19089 | 13 | |
| 19064 | 14 | datatype tree = | 
| 15 | Elem of string * (string * string) list * tree list | |
| 16 | | Text of string | |
| 19089 | 17 | |
| 19064 | 18 | val string_of_tree: tree -> string | 
| 19 | val tree_of_string: string -> tree | |
| 19089 | 20 | |
| 21 | val encoded_string_of_tree : tree -> string | |
| 22 | val tree_of_encoded_string : string -> tree | |
| 19064 | 23 | end; | 
| 24 | ||
| 19089 | 25 | structure XML :> XML = | 
| 19064 | 26 | struct | 
| 27 | ||
| 19093 | 28 | (*structure Seq = VectorScannerSeq | 
| 29 | structure Scan = Scanner (structure Seq = Seq)*) | |
| 30 | structure Seq = StringScannerSeq | |
| 31 | structure Scan = StringScanner | |
| 32 | ||
| 19095 | 33 | |
| 19064 | 34 | open Scan | 
| 35 | ||
| 36 | (** string based representation (small scale) **) | |
| 37 | ||
| 38 | val header = "<?xml version=\"1.0\"?>\n"; | |
| 39 | ||
| 40 | (* text and character data *) | |
| 41 | ||
| 42 | fun decode "<" = "<" | |
| 43 | | decode ">" = ">" | |
| 44 | | decode "&" = "&" | |
| 45 | | decode "'" = "'" | |
| 46 | | decode """ = "\"" | |
| 47 | | decode c = c; | |
| 48 | ||
| 49 | fun encode "<" = "<" | |
| 50 | | encode ">" = ">" | |
| 51 | | encode "&" = "&" | |
| 52 | | encode "'" = "'" | |
| 53 | | encode "\"" = """ | |
| 54 | | encode c = c; | |
| 55 | ||
| 56 | fun encode_charref c = "&#" ^ Int.toString (ord c) ^ ";" | |
| 57 | ||
| 58 | val text = Library.translate_string encode | |
| 59 | ||
| 60 | val text_charref = translate_string encode_charref; | |
| 61 | ||
| 62 | val cdata = enclose "<![CDATA[" "]]>\n" | |
| 63 | ||
| 64 | (* elements *) | |
| 65 | ||
| 66 | fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\""; | |
| 67 | ||
| 68 | fun element name atts cs = | |
| 69 | let val elem = space_implode " " (name :: map attribute atts) in | |
| 70 | if null cs then enclose "<" "/>" elem | |
| 71 | else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name | |
| 72 | end; | |
| 73 | ||
| 74 | (** explicit XML trees **) | |
| 75 | ||
| 76 | datatype tree = | |
| 77 | Elem of string * (string * string) list * tree list | |
| 78 | | Text of string; | |
| 79 | ||
| 80 | fun string_of_tree tree = | |
| 81 | let | |
| 82 | fun string_of (Elem (name, atts, ts)) buf = | |
| 83 | let val buf' = | |
| 84 | buf |> Buffer.add "<" | |
| 85 | |> fold Buffer.add (separate " " (name :: map attribute atts)) | |
| 86 | in | |
| 87 | if null ts then | |
| 88 | buf' |> Buffer.add "/>" | |
| 89 | else | |
| 90 | buf' |> Buffer.add ">" | |
| 91 | |> fold string_of ts | |
| 92 | |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">" | |
| 93 | end | |
| 94 | | string_of (Text s) buf = Buffer.add (text s) buf; | |
| 95 | in Buffer.content (string_of tree Buffer.empty) end; | |
| 96 | ||
| 97 | (** XML parsing **) | |
| 98 | ||
| 19089 | 99 | fun beginning n xs = Symbol.beginning n (Seq.take_at_most xs n) | 
| 19064 | 100 | |
| 101 | fun err s xs = | |
| 102 | "XML parsing error: " ^ s ^ "\nfound: " ^ quote (beginning 100 xs) ; | |
| 103 | ||
| 104 | val scan_whspc = Scan.any Symbol.is_blank; | |
| 105 | ||
| 106 | val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode; | |
| 107 | ||
| 108 | val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 109 | (scan_special || Scan.one Symbol.is_regular)) >> implode; | 
| 19064 | 110 | |
| 111 | val parse_cdata = Scan.this_string "<![CDATA[" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 112 | (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >> | 
| 19064 | 113 | implode) --| Scan.this_string "]]>"; | 
| 114 | ||
| 115 | val parse_att = | |
| 116 | scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- | |
| 117 | (($$ "\"" || $$ "'") :-- (fn s => (Scan.repeat (Scan.unless ($$ s) | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 118 | (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s) >> snd); | 
| 19064 | 119 | |
| 120 | val parse_comment = Scan.this_string "<!--" -- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 121 | Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) -- | 
| 19064 | 122 | Scan.this_string "-->"; | 
| 123 | ||
| 124 | val scan_comment_whspc = | |
| 125 | (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K()))); | |
| 126 | ||
| 127 | val parse_pi = Scan.this_string "<?" |-- | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 128 | Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --| | 
| 19064 | 129 | Scan.this_string "?>"; | 
| 130 | ||
| 131 | fun parse_content xs = | |
| 132 | ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] -- | |
| 133 | (Scan.repeat ((* scan_whspc |-- *) | |
| 134 | ( parse_elem >> single | |
| 135 | || parse_cdata >> (single o Text) | |
| 136 | || parse_pi >> K [] | |
| 137 | || parse_comment >> K []) -- | |
| 138 | Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] | |
| 32952 | 139 | >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs | 
| 19064 | 140 | |
| 141 | and parse_elem xs = | |
| 142 | ($$ "<" |-- scan_id -- | |
| 143 | Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) => | |
| 144 | !! (err "Expected > or />") | |
| 145 | (Scan.this_string "/>" >> K [] | |
| 146 | || $$ ">" |-- parse_content --| | |
| 147 |             !! (err ("Expected </" ^ s ^ ">"))
 | |
| 148 |               (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
 | |
| 149 | (fn ((s, atts), ts) => Elem (s, atts, ts))) xs; | |
| 150 | ||
| 151 | val parse_document = | |
| 152 | Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |-- | |
| 153 | (Scan.repeat (Scan.unless ($$ ">") | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
19095diff
changeset | 154 | (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) -- | 
| 19064 | 155 | parse_elem; | 
| 156 | ||
| 157 | fun tree_of_string s = | |
| 158 | let | |
| 32960 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 159 | val seq = Seq.fromString s | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 160 | val scanner = !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc) | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 161 | val (x, toks) = scanner seq | 
| 19064 | 162 | in | 
| 32960 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 163 |         if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 19064 | 164 | end | 
| 19089 | 165 | |
| 166 | (* More efficient saving and loading of xml trees employing a proprietary external format *) | |
| 167 | ||
| 168 | fun write_lstring s buf = buf |> Buffer.add (string_of_int (size s)) |> Buffer.add " " |> Buffer.add s | |
| 169 | fun parse_lstring toks = (scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i (one (K true))) >> snd >> implode) toks | |
| 170 | ||
| 171 | fun write_list w l buf = buf |> Buffer.add (string_of_int (length l)) |> Buffer.add " " |> fold w l | |
| 172 | fun parse_list sc = scan_nat --| one Symbol.is_blank :-- (fn i => repeat_fixed i sc) >> snd | |
| 173 | ||
| 174 | fun write_tree (Text s) buf = buf |> Buffer.add "T" |> write_lstring s | |
| 175 | | write_tree (Elem (name, attrs, children)) buf = | |
| 176 | buf |> Buffer.add "E" | |
| 32960 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 177 | |> write_lstring name | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 178 | |> write_list (fn (a,b) => fn buf => buf |> write_lstring a |> write_lstring b) attrs | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 179 | |> write_list write_tree children | 
| 19089 | 180 | |
| 181 | fun parse_tree toks = (one (K true) :-- (fn "T" => parse_lstring >> Text | "E" => parse_elem | _ => raise SyntaxError) >> snd) toks | |
| 182 | and parse_elem toks = (parse_lstring -- parse_list (parse_lstring -- parse_lstring) -- parse_list parse_tree >> (fn ((n, a), c) => Elem (n,a,c))) toks | |
| 183 | ||
| 184 | fun encoded_string_of_tree tree = Buffer.content (write_tree tree Buffer.empty) | |
| 185 | ||
| 186 | fun tree_of_encoded_string s = | |
| 187 | let | |
| 32960 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 188 | fun print (a,b) = writeln (a^" "^(string_of_int b)) | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 189 |         val _ = print ("length of encoded string: ", size s)
 | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 190 | val _ = writeln "Seq.fromString..." | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 191 | val seq = timeit (fn () => Seq.fromString s) | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 192 |         val  _ = print ("length of sequence", timeit (fn () => Seq.length seq))
 | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 193 | val scanner = !! (err "Malformed encoded xml") parse_tree | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 194 | val (x, toks) = scanner seq | 
| 19089 | 195 | in | 
| 32960 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 196 |         if Seq.null toks then x else error ("Unprocessed input: '"^(beginning 100 toks)^"'")
 | 
| 
69916a850301
eliminated hard tabulators, guessing at each author's individual tab-width;
 wenzelm parents: 
32952diff
changeset | 197 | end | 
| 19089 | 198 | |
| 19064 | 199 | end; |