src/Pure/General/xml.ML
author wenzelm
Thu Apr 03 18:42:36 2008 +0200 (2008-04-03)
changeset 26538 d65504ffb47d
parent 26525 14a56f013469
child 26539 a0754be538ab
permissions -rw-r--r--
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
haftmann@24584
     1
(*  Title:      Pure/General/xml.ML
wenzelm@24264
     2
    ID:         $Id$
wenzelm@24264
     3
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
wenzelm@24264
     4
wenzelm@24264
     5
Basic support for XML.
wenzelm@24264
     6
*)
wenzelm@24264
     7
wenzelm@24264
     8
signature XML =
wenzelm@24264
     9
sig
wenzelm@24264
    10
  (*string functions*)
wenzelm@26525
    11
  val detect: string -> bool
wenzelm@24264
    12
  val header: string
wenzelm@24264
    13
  val text: string -> string
wenzelm@24264
    14
  type attributes = (string * string) list
wenzelm@24264
    15
  val attribute: string * string -> string
wenzelm@24264
    16
  val element: string -> attributes -> string list -> string
wenzelm@24264
    17
  (*tree functions*)
wenzelm@24264
    18
  datatype tree =
wenzelm@24264
    19
      Elem of string * attributes * tree list
wenzelm@24264
    20
    | Text of string
wenzelm@24264
    21
    | Output of output
wenzelm@24264
    22
  type content = tree list
wenzelm@24264
    23
  type element = string * attributes * content
wenzelm@24264
    24
  val string_of_tree: tree -> string
wenzelm@25838
    25
  val plain_content: tree -> string
wenzelm@24264
    26
  val parse_string : string -> string option
wenzelm@24264
    27
  val parse_content: string list -> tree list * string list
wenzelm@24264
    28
  val parse_elem: string list -> tree * string list
wenzelm@24264
    29
  val parse_document: string list -> (string option * tree) * string list
wenzelm@24264
    30
  val tree_of_string: string -> tree
wenzelm@25838
    31
  val scan_comment_whspc: string list -> unit * string list
wenzelm@24264
    32
end;
wenzelm@24264
    33
wenzelm@24264
    34
structure XML: XML =
wenzelm@24264
    35
struct
wenzelm@24264
    36
wenzelm@24264
    37
wenzelm@26525
    38
(** string representation **)
wenzelm@26525
    39
wenzelm@26525
    40
val detect = String.isPrefix "<?xml";
wenzelm@24264
    41
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
    42
wenzelm@24264
    43
wenzelm@24264
    44
(* text and character data *)
wenzelm@24264
    45
wenzelm@24264
    46
fun decode "&lt;" = "<"
wenzelm@24264
    47
  | decode "&gt;" = ">"
wenzelm@24264
    48
  | decode "&amp;" = "&"
wenzelm@24264
    49
  | decode "&apos;" = "'"
wenzelm@24264
    50
  | decode "&quot;" = "\""
wenzelm@24264
    51
  | decode c = c;
wenzelm@24264
    52
wenzelm@24264
    53
fun encode "<" = "&lt;"
wenzelm@24264
    54
  | encode ">" = "&gt;"
wenzelm@24264
    55
  | encode "&" = "&amp;"
wenzelm@24264
    56
  | encode "'" = "&apos;"
wenzelm@24264
    57
  | encode "\"" = "&quot;"
wenzelm@24264
    58
  | encode c = c;
wenzelm@24264
    59
wenzelm@25838
    60
val text = translate_string encode;
wenzelm@24264
    61
wenzelm@24264
    62
wenzelm@24264
    63
(* elements *)
wenzelm@24264
    64
wenzelm@24264
    65
fun attribute (a, x) = a ^ " = \"" ^ text x ^ "\"";
wenzelm@24264
    66
wenzelm@26525
    67
fun element name atts body =
wenzelm@26525
    68
  let
wenzelm@26525
    69
    val elem = space_implode " " (name :: map attribute atts);
wenzelm@26525
    70
    val b = implode body;
wenzelm@26525
    71
  in
wenzelm@26525
    72
    if b = "" then enclose "<" "/>" elem
wenzelm@26525
    73
    else enclose "<" ">" elem ^ b ^ enclose "</" ">" name
wenzelm@24264
    74
  end;
wenzelm@24264
    75
wenzelm@24264
    76
wenzelm@24264
    77
wenzelm@24264
    78
(** explicit XML trees **)
wenzelm@24264
    79
wenzelm@24264
    80
type attributes = (string * string) list;
wenzelm@24264
    81
wenzelm@24264
    82
datatype tree =
wenzelm@24264
    83
    Elem of string * attributes * tree list
wenzelm@24264
    84
  | Text of string
wenzelm@24264
    85
  | Output of output;
wenzelm@24264
    86
wenzelm@24264
    87
type content = tree list;
wenzelm@24264
    88
type element = string * attributes * content;
wenzelm@24264
    89
wenzelm@26525
    90
fun string_of_tree t =
wenzelm@24264
    91
  let
wenzelm@26525
    92
    fun name_atts name atts = fold Buffer.add (separate " " (name :: map attribute atts));
wenzelm@26525
    93
    fun tree (Elem (name, atts, [])) =
wenzelm@26525
    94
          Buffer.add "<" #> name_atts name atts #> Buffer.add "/>"
wenzelm@26525
    95
      | tree (Elem (name, atts, ts)) =
wenzelm@26525
    96
          Buffer.add "<" #> name_atts name atts #> Buffer.add ">" #>
wenzelm@26525
    97
          fold tree ts #>
wenzelm@26525
    98
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@26525
    99
      | tree (Text s) = Buffer.add (text s)
wenzelm@26525
   100
      | tree (Output s) = Buffer.add s;
wenzelm@26525
   101
  in Buffer.empty |> tree t |> Buffer.content end;
wenzelm@24264
   102
wenzelm@25838
   103
fun plain_content tree =
wenzelm@25838
   104
  let
wenzelm@25838
   105
    fun content (Elem (_, _, ts)) = fold content ts
wenzelm@25838
   106
      | content (Text s) = Buffer.add s
wenzelm@25838
   107
      | content (Output _) = I;    (* FIXME !? *)
wenzelm@25838
   108
  in Buffer.empty |> content tree |> Buffer.content end;
wenzelm@25838
   109
wenzelm@24264
   110
wenzelm@24264
   111
wenzelm@24264
   112
(** XML parsing **)
wenzelm@24264
   113
wenzelm@24264
   114
fun err s (xs, _) =
wenzelm@24264
   115
  "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   116
wenzelm@24264
   117
val scan_whspc = Scan.many Symbol.is_blank;
wenzelm@24264
   118
wenzelm@24264
   119
val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;
wenzelm@24264
   120
wenzelm@24264
   121
val parse_chars = Scan.repeat1 (Scan.unless ((* scan_whspc -- *)$$ "<")
wenzelm@24264
   122
  (scan_special || Scan.one Symbol.is_regular)) >> implode;
wenzelm@24264
   123
wenzelm@24264
   124
val parse_string = Scan.read Symbol.stopper parse_chars o explode;
wenzelm@24264
   125
wenzelm@24264
   126
val parse_cdata = Scan.this_string "<![CDATA[" |--
wenzelm@24264
   127
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") (Scan.one Symbol.is_regular)) >>
wenzelm@24264
   128
    implode) --| Scan.this_string "]]>";
wenzelm@24264
   129
wenzelm@24264
   130
val parse_att =
wenzelm@24264
   131
  Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc --
wenzelm@24264
   132
  (($$ "\"" || $$ "'") :|-- (fn s => (Scan.repeat (Scan.unless ($$ s)
wenzelm@24264
   133
    (scan_special || Scan.one Symbol.is_regular)) >> implode) --| $$ s));
wenzelm@24264
   134
wenzelm@24264
   135
val parse_comment = Scan.this_string "<!--" --
wenzelm@24264
   136
  Scan.repeat (Scan.unless (Scan.this_string "-->") (Scan.one Symbol.is_regular)) --
wenzelm@24264
   137
  Scan.this_string "-->";
wenzelm@24264
   138
wenzelm@24264
   139
val scan_comment_whspc =
wenzelm@24264
   140
  (scan_whspc >> K()) --| (Scan.repeat (parse_comment |-- (scan_whspc >> K())));
wenzelm@24264
   141
wenzelm@24264
   142
val parse_pi = Scan.this_string "<?" |--
wenzelm@24264
   143
  Scan.repeat (Scan.unless (Scan.this_string "?>") (Scan.one Symbol.is_regular)) --|
wenzelm@24264
   144
  Scan.this_string "?>";
wenzelm@24264
   145
wenzelm@24264
   146
fun parse_content xs =
wenzelm@24264
   147
  ((Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) [] --
wenzelm@24264
   148
    (Scan.repeat ((* scan_whspc |-- *)
wenzelm@24264
   149
       (   parse_elem >> single
wenzelm@24264
   150
        || parse_cdata >> (single o Text)
wenzelm@24264
   151
        || parse_pi >> K []
wenzelm@24264
   152
        || parse_comment >> K []) --
wenzelm@24264
   153
       Scan.optional ((* scan_whspc |-- *) parse_chars >> (single o Text)) []
wenzelm@24264
   154
         >> op @) >> flat) >> op @)(* --| scan_whspc*)) xs
wenzelm@24264
   155
wenzelm@24264
   156
and parse_elem xs =
wenzelm@24264
   157
  ($$ "<" |-- Symbol.scan_id --
wenzelm@24264
   158
    Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
wenzelm@24264
   159
      !! (err "Expected > or />")
wenzelm@24264
   160
        (Scan.this_string "/>" >> K []
wenzelm@24264
   161
         || $$ ">" |-- parse_content --|
wenzelm@24264
   162
            !! (err ("Expected </" ^ s ^ ">"))
wenzelm@24264
   163
              (Scan.this_string ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
wenzelm@24264
   164
    (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;
wenzelm@24264
   165
wenzelm@24264
   166
val parse_document =
wenzelm@24264
   167
  Scan.option (Scan.this_string "<!DOCTYPE" -- scan_whspc |--
wenzelm@24264
   168
    (Scan.repeat (Scan.unless ($$ ">")
wenzelm@24264
   169
      (Scan.one Symbol.is_regular)) >> implode) --| $$ ">" --| scan_whspc) --
wenzelm@24264
   170
  parse_elem;
wenzelm@24264
   171
wenzelm@24264
   172
fun tree_of_string s =
wenzelm@24264
   173
  (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
wenzelm@24264
   174
      (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of
wenzelm@24264
   175
    (x, []) => x
wenzelm@24264
   176
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   177
wenzelm@24264
   178
end;