src/Pure/General/xml.ML
author wenzelm
Tue Aug 10 22:26:23 2010 +0200 (2010-08-10)
changeset 38266 492d377ecfe2
parent 38228 ada3ab6b9085
child 38474 e498dc2eb576
permissions -rw-r--r--
type XML.body as basic data representation language;
tuned;
haftmann@24584
     1
(*  Title:      Pure/General/xml.ML
wenzelm@24264
     2
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
wenzelm@24264
     3
wenzelm@38228
     4
Simple XML tree values.
wenzelm@24264
     5
*)
wenzelm@24264
     6
wenzelm@24264
     7
signature XML =
wenzelm@24264
     8
sig
wenzelm@28017
     9
  type attributes = Properties.T
wenzelm@24264
    10
  datatype tree =
wenzelm@38228
    11
      Elem of Markup.T * tree list
wenzelm@24264
    12
    | Text of string
wenzelm@38266
    13
  type body = tree list
wenzelm@26546
    14
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@26546
    15
  val header: string
wenzelm@26546
    16
  val text: string -> string
wenzelm@26546
    17
  val element: string -> attributes -> string list -> string
wenzelm@26546
    18
  val output_markup: Markup.T -> output * output
wenzelm@26539
    19
  val string_of: tree -> string
wenzelm@26546
    20
  val output: tree -> TextIO.outstream -> unit
wenzelm@26984
    21
  val parse_comments: string list -> unit * string list
wenzelm@24264
    22
  val parse_string : string -> string option
wenzelm@26546
    23
  val parse_element: string list -> tree * string list
wenzelm@26984
    24
  val parse_document: string list -> tree * string list
wenzelm@26539
    25
  val parse: string -> tree
wenzelm@24264
    26
end;
wenzelm@24264
    27
wenzelm@24264
    28
structure XML: XML =
wenzelm@24264
    29
struct
wenzelm@24264
    30
wenzelm@26546
    31
(** XML trees **)
wenzelm@26546
    32
wenzelm@28017
    33
type attributes = Properties.T;
wenzelm@26546
    34
wenzelm@26546
    35
datatype tree =
wenzelm@38228
    36
    Elem of Markup.T * tree list
wenzelm@28033
    37
  | Text of string;
wenzelm@26546
    38
wenzelm@38266
    39
type body = tree list;
wenzelm@38266
    40
wenzelm@38228
    41
fun add_content (Elem (_, ts)) = fold add_content ts
wenzelm@28033
    42
  | add_content (Text s) = Buffer.add s;
wenzelm@26546
    43
wenzelm@26546
    44
wenzelm@24264
    45
wenzelm@26525
    46
(** string representation **)
wenzelm@26525
    47
wenzelm@24264
    48
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
    49
wenzelm@24264
    50
wenzelm@26546
    51
(* escaped text *)
wenzelm@24264
    52
wenzelm@24264
    53
fun decode "&lt;" = "<"
wenzelm@24264
    54
  | decode "&gt;" = ">"
wenzelm@24264
    55
  | decode "&amp;" = "&"
wenzelm@24264
    56
  | decode "&apos;" = "'"
wenzelm@24264
    57
  | decode "&quot;" = "\""
wenzelm@24264
    58
  | decode c = c;
wenzelm@24264
    59
wenzelm@24264
    60
fun encode "<" = "&lt;"
wenzelm@24264
    61
  | encode ">" = "&gt;"
wenzelm@24264
    62
  | encode "&" = "&amp;"
wenzelm@24264
    63
  | encode "'" = "&apos;"
wenzelm@24264
    64
  | encode "\"" = "&quot;"
wenzelm@24264
    65
  | encode c = c;
wenzelm@24264
    66
wenzelm@25838
    67
val text = translate_string encode;
wenzelm@24264
    68
wenzelm@24264
    69
wenzelm@24264
    70
(* elements *)
wenzelm@24264
    71
wenzelm@26539
    72
fun elem name atts =
wenzelm@26551
    73
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
    74
wenzelm@26525
    75
fun element name atts body =
wenzelm@26539
    76
  let val b = implode body in
wenzelm@26539
    77
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
    78
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
    79
  end;
wenzelm@24264
    80
wenzelm@27884
    81
fun output_markup (markup as (name, atts)) =
wenzelm@29325
    82
  if Markup.is_none markup then Markup.no_output
wenzelm@27884
    83
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
    84
wenzelm@24264
    85
wenzelm@26546
    86
(* output *)
wenzelm@24264
    87
wenzelm@26546
    88
fun buffer_of tree =
wenzelm@24264
    89
  let
wenzelm@38228
    90
    fun traverse (Elem ((name, atts), [])) =
wenzelm@26539
    91
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@38228
    92
      | traverse (Elem ((name, atts), ts)) =
wenzelm@26539
    93
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@26546
    94
          fold traverse ts #>
wenzelm@26525
    95
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@28033
    96
      | traverse (Text s) = Buffer.add (text s);
wenzelm@26546
    97
  in Buffer.empty |> traverse tree end;
wenzelm@24264
    98
wenzelm@26546
    99
val string_of = Buffer.content o buffer_of;
wenzelm@26546
   100
val output = Buffer.output o buffer_of;
wenzelm@25838
   101
wenzelm@24264
   102
wenzelm@24264
   103
wenzelm@26546
   104
(** XML parsing (slow) **)
wenzelm@26546
   105
wenzelm@26546
   106
local
wenzelm@24264
   107
wenzelm@24264
   108
fun err s (xs, _) =
wenzelm@24264
   109
  "XML parsing error: " ^ s ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   110
wenzelm@26984
   111
fun ignored _ = [];
wenzelm@26984
   112
wenzelm@26551
   113
val blanks = Scan.many Symbol.is_blank;
wenzelm@26551
   114
val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;
wenzelm@26551
   115
val regular = Scan.one Symbol.is_regular;
wenzelm@26551
   116
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
wenzelm@24264
   117
wenzelm@26551
   118
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   119
wenzelm@26551
   120
val parse_cdata =
wenzelm@26551
   121
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   122
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   123
  Scan.this_string "]]>";
wenzelm@24264
   124
wenzelm@24264
   125
val parse_att =
wenzelm@26551
   126
  (Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   127
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   128
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   129
wenzelm@26551
   130
val parse_comment =
wenzelm@26551
   131
  Scan.this_string "<!--" --
wenzelm@26551
   132
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   133
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   134
wenzelm@26551
   135
val parse_processing_instruction =
wenzelm@26551
   136
  Scan.this_string "<?" --
wenzelm@26551
   137
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   138
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   139
wenzelm@26984
   140
val parse_doctype =
wenzelm@26984
   141
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   142
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   143
  $$ ">" >> ignored;
wenzelm@26984
   144
wenzelm@26984
   145
val parse_misc =
wenzelm@26984
   146
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   147
  parse_processing_instruction ||
wenzelm@26984
   148
  parse_comment;
wenzelm@26551
   149
wenzelm@26551
   150
val parse_optional_text =
wenzelm@26551
   151
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   152
wenzelm@26546
   153
in
wenzelm@26546
   154
wenzelm@26984
   155
val parse_comments =
wenzelm@26984
   156
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   157
wenzelm@26546
   158
val parse_string = Scan.read Symbol.stopper parse_chars o explode;
wenzelm@26546
   159
wenzelm@24264
   160
fun parse_content xs =
wenzelm@26551
   161
  (parse_optional_text @@@
wenzelm@26551
   162
    (Scan.repeat
wenzelm@26551
   163
      ((parse_element >> single ||
wenzelm@26551
   164
        parse_cdata >> (single o Text) ||
wenzelm@26984
   165
        parse_processing_instruction ||
wenzelm@26984
   166
        parse_comment)
wenzelm@26551
   167
      @@@ parse_optional_text) >> flat)) xs
wenzelm@24264
   168
wenzelm@26546
   169
and parse_element xs =
wenzelm@24264
   170
  ($$ "<" |-- Symbol.scan_id --
wenzelm@26551
   171
    Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) =>
wenzelm@24264
   172
      !! (err "Expected > or />")
wenzelm@26984
   173
        (Scan.this_string "/>" >> ignored
wenzelm@24264
   174
         || $$ ">" |-- parse_content --|
wenzelm@24264
   175
            !! (err ("Expected </" ^ s ^ ">"))
wenzelm@38228
   176
              (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >> Elem) xs;
wenzelm@24264
   177
wenzelm@26984
   178
val parse_document =
wenzelm@26984
   179
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   180
  |-- parse_element;
wenzelm@24264
   181
wenzelm@26539
   182
fun parse s =
wenzelm@24264
   183
  (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
wenzelm@26984
   184
      (blanks |-- parse_document --| blanks))) (explode s) of
wenzelm@24264
   185
    (x, []) => x
wenzelm@24264
   186
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   187
wenzelm@24264
   188
end;
wenzelm@26546
   189
wenzelm@26546
   190
end;