src/HOL/Import/xml.ML
author obua
Wed, 15 Feb 2006 23:57:06 +0100
changeset 19064 bf19cc5a7899
child 19089 2e487fe9593a
permissions -rw-r--r--
fixed bugs, added caching
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
19064
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     1
(*  Title:      HOL/Import/xml.ML
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     2
    ID:         $Id$
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     3
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     4
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     5
Basic support for XML.
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     6
*)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     7
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
     8
(* Interface of the XML module: string-level rendering helpers, an explicit
   tree representation, and conversion between trees and strings. *)
signature XML =
sig
  (* XML declaration line *)
  val header: string
  (* escape the five XML special characters as named entities *)
  val text: string -> string
  (* replace every character by a numeric character reference *)
  val text_charref: string -> string
  (* wrap a string in a CDATA section (followed by a newline) *)
  val cdata: string -> string
  (* render an element from its name, attributes and already-rendered children *)
  val element: string -> (string * string) list -> string list -> string

  (* explicit trees: an element (name, attributes, children) or a text leaf *)
  datatype tree =
      Elem of string * (string * string) list * tree list
    | Text of string

  (* serialise a tree *)
  val string_of_tree: tree -> string
  (* parse a string holding a single element into a tree *)
  val tree_of_string: string -> tree
end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    21
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    22
structure XML =
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    23
struct
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    24
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    25
structure Scan = LazyScan
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    26
open Scan
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    27
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    28
(** string based representation (small scale) **)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    29
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    30
(* XML declaration line (version 1.0, no encoding attribute), terminated by a
   newline. *)
val header = "<?xml version=\"1.0\"?>\n";
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    31
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    32
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    33
(* text and character data *)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    34
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    35
(* Map one of the five predefined XML entities to the character it denotes.
   Any other string is returned unchanged. *)
fun decode s =
  (case s of
    "&lt;" => "<"
  | "&gt;" => ">"
  | "&amp;" => "&"
  | "&apos;" => "'"
  | "&quot;" => "\""
  | _ => s);
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    41
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    42
(* Map one of the five XML special characters to its predefined entity.
   Any other string is returned unchanged. *)
fun encode s =
  (case s of
    "<" => "&lt;"
  | ">" => "&gt;"
  | "&" => "&amp;"
  | "'" => "&apos;"
  | "\"" => "&quot;"
  | _ => s);
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    48
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    49
(* Render c as a decimal numeric character reference "&#N;", where N is
   `ord c`. *)
fun encode_charref c = String.concat ["&#", Int.toString (ord c), ";"];
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    50
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    51
(* Escape a string for use as XML text by passing it through
   Library.translate_string with `encode`. *)
fun text s = Library.translate_string encode s;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    52
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    53
(* Rewrite a string as numeric character references by passing it through
   translate_string with `encode_charref`. *)
fun text_charref s = translate_string encode_charref s;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    54
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    55
(* Wrap a string in a CDATA section, followed by a newline.

   A CDATA section ends at the first "]]>", so a payload containing that
   sequence would otherwise close the section early and yield malformed XML.
   Each occurrence is therefore split across two adjacent sections: "]]" ends
   the current one and ">" starts the next.  Output for payloads without
   "]]>" is exactly "<![CDATA[" ^ s ^ "]]>\n". *)
fun cdata s =
  let
    (* pieces of the payload, with every "]]>" replaced by the split form *)
    fun pieces ss =
      let val (before, rest) = Substring.position "]]>" ss in
        if Substring.isEmpty rest then [Substring.string before]
        else
          Substring.string before :: "]]]]><![CDATA[>" ::
            pieces (Substring.triml 3 rest)
      end
  in
    "<![CDATA[" ^ String.concat (pieces (Substring.full s)) ^ "]]>\n"
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    56
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    57
(* elements *)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    58
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    59
(* Render one attribute as: key = "value", with the value escaped by `text`. *)
fun attribute (key, value) = String.concat [key, " = \"", text value, "\""];
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    60
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    61
(* Render an element from its name, its attribute list and the list of
   already-rendered children.  An element without children is written in the
   self-closing form. *)
fun element name atts cs =
  let
    (* tag name followed by the rendered attributes, separated by blanks *)
    val tag = space_implode " " (name :: map attribute atts)
  in
    (case cs of
      [] => enclose "<" "/>" tag
    | _ => enclose "<" ">" tag ^ implode cs ^ enclose "</" ">" name)
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    66
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    67
(** explicit XML trees **)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    68
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    69
(* XML tree: either an element carrying its name, its attributes as
   (name, value) pairs and its children, or a leaf of character data. *)
datatype tree =
    Elem of string * (string * string) list * tree list
  | Text of string;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    72
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    73
(* Serialise a tree into a string, accumulating the output in a Buffer.
   Text leaves are escaped with `text`; elements without children are written
   in the self-closing form. *)
fun string_of_tree tree =
  let
    fun emit (Text s) buf = Buffer.add (text s) buf
      | emit (Elem (name, atts, children)) buf =
          let
            (* "<" followed by the name and attributes, separated by blanks *)
            val opened =
              fold Buffer.add (separate " " (name :: map attribute atts))
                (Buffer.add "<" buf)
          in
            (case children of
              [] => Buffer.add "/>" opened
            | _ =>
                opened
                |> Buffer.add ">"
                |> fold emit children
                |> Buffer.add "</" |> Buffer.add name |> Buffer.add ">")
          end
  in Buffer.content (emit tree Buffer.empty) end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    89
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    90
(** XML parsing **)
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    91
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    92
(* Take at most n items from the front of the lazy sequence xs and hand them
   to Symbol.beginning n; used to show the input position in error messages. *)
fun beginning n xs =
  let val prefix = LazySeq.take_at_most (xs, n)
  in Symbol.beginning n prefix end
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    93
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    94
(* Build a parse error message from the description s and a quoted excerpt
   (up to 100 items) of the remaining input xs. *)
fun err s xs =
  String.concat
    ["XML parsing error: ", s, "\nfound: ", quote (beginning 100 xs)];
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    96
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    97
(* Scanner for a run of symbols satisfying Symbol.is_blank.
   NOTE(review): presumably Scan.any also accepts an empty run -- confirm
   against LazyScan. *)
val scan_whspc = Scan.any Symbol.is_blank;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    98
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
    99
(* Scanner for an entity reference: "&", an identifier and ";" are joined
   with ^^ and the result is passed to `decode`, so the five predefined
   entities become their characters and any other reference is kept as
   written. *)
val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   100
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   101
(* Non-empty run of character data up to (not including) the next "<",
   returned as one string.  Entity references are decoded by scan_special.
   Whitespace is not skipped, so it is part of the result. *)
val parse_chars =
  let
    (* one decoded entity or one ordinary symbol, provided it is not "<" *)
    val content_char =
      Scan.unless ($$ "<") (scan_special || Scan.one Symbol.not_eof)
  in
    Scan.repeat1 content_char >> implode
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   103
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   104
(* CDATA section: returns the raw text between "<![CDATA[" and the first
   following "]]>"; no entity decoding is applied. *)
val parse_cdata =
  let
    val payload =
      Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.not_eof)) >>
        implode
  in
    Scan.this_string "<![CDATA[" |-- payload --| Scan.this_string "]]>"
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   107
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   108
(* Attribute: identifier, "=", and a value in double or single quotes, with
   optional whitespace around "=".  The value runs up to the matching quote
   of the kind that opened it; entity references in it are decoded. *)
val parse_att =
  let
    (* opening quote, then everything up to the same quote; keep the text only *)
    val quoted_value =
      ($$ "\"" || $$ "'") :-- (fn q => (Scan.repeat (Scan.unless ($$ q)
        (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ q) >> snd
  in
    scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- quoted_value
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   112
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   113
(* Comment: "<!--", then any symbols up to the first "-->", then "-->".
   The scanned pieces are returned as-is; callers in this file discard them. *)
val parse_comment =
  let
    val opener = Scan.this_string "<!--"
    val closer = Scan.this_string "-->"
    val body = Scan.repeat (Scan.unless closer (Scan.one Symbol.not_eof))
  in
    opener -- body -- closer
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   116
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   117
(* Skip whitespace, then any number of comments each followed by whitespace;
   yields unit. *)
val scan_comment_whspc =
  let
    val skip_blanks = scan_whspc >> K ()
  in
    skip_blanks --| Scan.repeat (parse_comment |-- skip_blanks)
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   119
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   120
(* Processing instruction: returns the symbols between "<?" and the first
   following "?>". *)
val parse_pi =
  let
    val closer = Scan.this_string "?>"
    val body = Scan.repeat (Scan.unless closer (Scan.one Symbol.not_eof))
  in
    Scan.this_string "<?" |-- body --| closer
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   123
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   124
(* Content of an element: character data interleaved with child elements,
   CDATA sections, processing instructions and comments, returned as a list
   of trees.  Whitespace is not skipped, so it ends up in the Text nodes.
   Processing instructions and comments contribute no nodes. *)
fun parse_content xs =
  let
    (* optional character data, as a list of zero or one Text node *)
    val text_run = Scan.optional (parse_chars >> (single o Text)) []
    (* one piece of markup, as the list of nodes it yields *)
    val markup =
         parse_elem >> single
      || parse_cdata >> (single o Text)
      || parse_pi >> K []
      || parse_comment >> K []
    (* markup items, each followed by optional character data *)
    val items = Scan.repeat (markup -- text_run >> op @) >> List.concat
  in
    (text_run -- items >> op @) xs
  end

(* Element: "<", name and attributes, then either "/>" (no children) or ">",
   the content, and the closing tag for the same name.  Once the attributes
   have been read, a missing "/>", ">" or closing tag is reported via `err`
   through `!!`. *)
and parse_elem xs =
  let
    (* remainder of the element after its attributes, given the tag name *)
    fun remainder (name, _) =
      let
        val closing_tag =
          !! (err ("Expected </" ^ name ^ ">"))
            (Scan.this_string ("</" ^ name) --| scan_whspc --| $$ ">")
      in
        !! (err "Expected > or />")
          (Scan.this_string "/>" >> K []
           || $$ ">" |-- parse_content --| closing_tag)
      end
    fun mk_elem ((name, atts), children) = Elem (name, atts, children)
  in
    ($$ "<" |-- scan_id --
      Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- remainder >>
      mk_elem) xs
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   143
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   144
(* Document: an optional "<!DOCTYPE ... >" declaration followed by the root
   element.  The declaration body is read up to the first ">". *)
val parse_document =
  let
    val decl_body =
      Scan.repeat (Scan.unless ($$ ">") (Scan.one Symbol.not_eof)) >> implode
    val doctype =
      Scan.this_string "<!DOCTYPE" -- scan_whspc |-- decl_body --| $$ ">" --| scan_whspc
  in
    Scan.option doctype -- parse_elem
  end;
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   149
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   150
(* Parse a string holding a single element (surrounding whitespace allowed)
   into a tree.  Calls `error` if any input is left over after the element;
   a scan failure is reported through `!!` with `err "Malformed element"`. *)
fun tree_of_string s =
  let
    val input = LazySeq.of_list (Symbol.explode s)
    val parse_root =
      !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc)
  in
    (case parse_root input of
      (root, rest) =>
        if LazySeq.null rest then root
        else error ("Unprocessed input: '" ^ beginning 100 rest ^ "'"))
  end
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   158
	
bf19cc5a7899 fixed bugs, added caching
obua
parents:
diff changeset
   159
end;