src/Pure/General/xml.ML
author wenzelm
Sat Jul 23 16:37:17 2011 +0200 (2011-07-23)
changeset 43947 9b00f09f7721
parent 43844 33e20b7d7e72
child 43949 94033767ef9b
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
haftmann@24584
     1
(*  Title:      Pure/General/xml.ML
wenzelm@24264
     2
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel
wenzelm@24264
     3
wenzelm@43778
     4
Untyped XML trees.
wenzelm@24264
     5
*)
wenzelm@24264
     6
wenzelm@43767
     7
signature XML_DATA_OPS =
wenzelm@43767
     8
sig
wenzelm@43778
     9
  type 'a A
wenzelm@43767
    10
  type 'a T
wenzelm@43778
    11
  type 'a V
wenzelm@43778
    12
  val int_atom: int A
wenzelm@43778
    13
  val bool_atom: bool A
wenzelm@43778
    14
  val unit_atom: unit A
wenzelm@43767
    15
  val properties: Properties.T T
wenzelm@43767
    16
  val string: string T
wenzelm@43767
    17
  val int: int T
wenzelm@43767
    18
  val bool: bool T
wenzelm@43767
    19
  val unit: unit T
wenzelm@43767
    20
  val pair: 'a T -> 'b T -> ('a * 'b) T
wenzelm@43767
    21
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
wenzelm@43767
    22
  val list: 'a T -> 'a list T
wenzelm@43767
    23
  val option: 'a T -> 'a option T
wenzelm@43778
    24
  val variant: 'a V list -> 'a T
wenzelm@43767
    25
end;
wenzelm@43767
    26
wenzelm@24264
    27
signature XML =
wenzelm@24264
    28
sig
wenzelm@28017
    29
  type attributes = Properties.T
wenzelm@24264
    30
  datatype tree =
wenzelm@38228
    31
      Elem of Markup.T * tree list
wenzelm@24264
    32
    | Text of string
wenzelm@38266
    33
  type body = tree list
wenzelm@26546
    34
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@39555
    35
  val content_of: body -> string
wenzelm@26546
    36
  val header: string
wenzelm@26546
    37
  val text: string -> string
wenzelm@26546
    38
  val element: string -> attributes -> string list -> string
wenzelm@40131
    39
  val output_markup: Markup.T -> Output.output * Output.output
wenzelm@26539
    40
  val string_of: tree -> string
wenzelm@43791
    41
  val pretty: int -> tree -> Pretty.T
wenzelm@26546
    42
  val output: tree -> TextIO.outstream -> unit
wenzelm@26984
    43
  val parse_comments: string list -> unit * string list
wenzelm@24264
    44
  val parse_string : string -> string option
wenzelm@26546
    45
  val parse_element: string list -> tree * string list
wenzelm@26984
    46
  val parse_document: string list -> tree * string list
wenzelm@26539
    47
  val parse: string -> tree
wenzelm@43767
    48
  exception XML_ATOM of string
wenzelm@43767
    49
  exception XML_BODY of body
wenzelm@43778
    50
  structure Encode: XML_DATA_OPS
wenzelm@43778
    51
  structure Decode: XML_DATA_OPS
wenzelm@24264
    52
end;
wenzelm@24264
    53
wenzelm@24264
    54
structure XML: XML =
wenzelm@24264
    55
struct
wenzelm@24264
    56
wenzelm@26546
    57
(** XML trees **)
wenzelm@26546
    58
wenzelm@28017
    59
type attributes = Properties.T;
wenzelm@26546
    60
wenzelm@26546
    61
datatype tree =
wenzelm@38228
    62
    Elem of Markup.T * tree list
wenzelm@28033
    63
  | Text of string;
wenzelm@26546
    64
wenzelm@38266
    65
type body = tree list;
wenzelm@38266
    66
wenzelm@38228
    67
fun add_content (Elem (_, ts)) = fold add_content ts
wenzelm@28033
    68
  | add_content (Text s) = Buffer.add s;
wenzelm@26546
    69
wenzelm@39555
    70
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
wenzelm@39555
    71
wenzelm@26546
    72
wenzelm@24264
    73
wenzelm@26525
    74
(** string representation **)
wenzelm@26525
    75
wenzelm@24264
    76
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
    77
wenzelm@24264
    78
wenzelm@26546
    79
(* escaped text *)
wenzelm@24264
    80
wenzelm@24264
    81
fun decode "&lt;" = "<"
wenzelm@24264
    82
  | decode "&gt;" = ">"
wenzelm@24264
    83
  | decode "&amp;" = "&"
wenzelm@24264
    84
  | decode "&apos;" = "'"
wenzelm@24264
    85
  | decode "&quot;" = "\""
wenzelm@24264
    86
  | decode c = c;
wenzelm@24264
    87
wenzelm@24264
    88
fun encode "<" = "&lt;"
wenzelm@24264
    89
  | encode ">" = "&gt;"
wenzelm@24264
    90
  | encode "&" = "&amp;"
wenzelm@24264
    91
  | encode "'" = "&apos;"
wenzelm@24264
    92
  | encode "\"" = "&quot;"
wenzelm@24264
    93
  | encode c = c;
wenzelm@24264
    94
wenzelm@25838
    95
val text = translate_string encode;
wenzelm@24264
    96
wenzelm@24264
    97
wenzelm@24264
    98
(* elements *)
wenzelm@24264
    99
wenzelm@26539
   100
fun elem name atts =
wenzelm@26551
   101
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
   102
wenzelm@26525
   103
fun element name atts body =
wenzelm@26539
   104
  let val b = implode body in
wenzelm@26539
   105
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
   106
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
   107
  end;
wenzelm@24264
   108
wenzelm@27884
   109
fun output_markup (markup as (name, atts)) =
wenzelm@38474
   110
  if Markup.is_empty markup then Markup.no_output
wenzelm@27884
   111
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
   112
wenzelm@24264
   113
wenzelm@26546
   114
(* output *)
wenzelm@24264
   115
wenzelm@43791
   116
fun buffer_of depth tree =
wenzelm@24264
   117
  let
wenzelm@43791
   118
    fun traverse _ (Elem ((name, atts), [])) =
wenzelm@26539
   119
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@43791
   120
      | traverse d (Elem ((name, atts), ts)) =
wenzelm@26539
   121
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@43791
   122
          traverse_body d ts #>
wenzelm@26525
   123
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@43791
   124
      | traverse _ (Text s) = Buffer.add (text s)
wenzelm@43791
   125
    and traverse_body 0 _ = Buffer.add "..."
wenzelm@43791
   126
      | traverse_body d ts = fold (traverse (d - 1)) ts;
wenzelm@43791
   127
  in Buffer.empty |> traverse depth tree end;
wenzelm@24264
   128
wenzelm@43791
   129
val string_of = Buffer.content o buffer_of ~1;
wenzelm@43791
   130
val output = Buffer.output o buffer_of ~1;
wenzelm@43791
   131
wenzelm@43791
   132
fun pretty depth tree =
wenzelm@43791
   133
  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
wenzelm@25838
   134
wenzelm@24264
   135
wenzelm@24264
   136
wenzelm@26546
   137
(** XML parsing (slow) **)
wenzelm@26546
   138
wenzelm@26546
   139
local
wenzelm@24264
   140
wenzelm@43947
   141
fun err msg (xs, _) =
wenzelm@43947
   142
  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   143
wenzelm@26984
   144
fun ignored _ = [];
wenzelm@26984
   145
wenzelm@26551
   146
val blanks = Scan.many Symbol.is_blank;
wenzelm@26551
   147
val special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;
wenzelm@26551
   148
val regular = Scan.one Symbol.is_regular;
wenzelm@26551
   149
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
wenzelm@24264
   150
wenzelm@26551
   151
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   152
wenzelm@26551
   153
val parse_cdata =
wenzelm@26551
   154
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   155
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   156
  Scan.this_string "]]>";
wenzelm@24264
   157
wenzelm@24264
   158
val parse_att =
wenzelm@26551
   159
  (Symbol.scan_id --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   160
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   161
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   162
wenzelm@26551
   163
val parse_comment =
wenzelm@26551
   164
  Scan.this_string "<!--" --
wenzelm@26551
   165
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   166
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   167
wenzelm@26551
   168
val parse_processing_instruction =
wenzelm@26551
   169
  Scan.this_string "<?" --
wenzelm@26551
   170
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   171
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   172
wenzelm@26984
   173
val parse_doctype =
wenzelm@26984
   174
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   175
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   176
  $$ ">" >> ignored;
wenzelm@26984
   177
wenzelm@26984
   178
val parse_misc =
wenzelm@26984
   179
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   180
  parse_processing_instruction ||
wenzelm@26984
   181
  parse_comment;
wenzelm@26551
   182
wenzelm@26551
   183
val parse_optional_text =
wenzelm@26551
   184
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   185
wenzelm@26546
   186
in
wenzelm@26546
   187
wenzelm@26984
   188
val parse_comments =
wenzelm@26984
   189
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   190
wenzelm@40627
   191
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
wenzelm@26546
   192
wenzelm@24264
   193
fun parse_content xs =
wenzelm@26551
   194
  (parse_optional_text @@@
wenzelm@26551
   195
    (Scan.repeat
wenzelm@26551
   196
      ((parse_element >> single ||
wenzelm@26551
   197
        parse_cdata >> (single o Text) ||
wenzelm@26984
   198
        parse_processing_instruction ||
wenzelm@26984
   199
        parse_comment)
wenzelm@26551
   200
      @@@ parse_optional_text) >> flat)) xs
wenzelm@24264
   201
wenzelm@26546
   202
and parse_element xs =
wenzelm@24264
   203
  ($$ "<" |-- Symbol.scan_id --
wenzelm@26551
   204
    Scan.repeat (blanks |-- parse_att) --| blanks :-- (fn (s, _) =>
wenzelm@43947
   205
      !! (err (fn () => "Expected > or />"))
wenzelm@26984
   206
        (Scan.this_string "/>" >> ignored
wenzelm@24264
   207
         || $$ ">" |-- parse_content --|
wenzelm@43947
   208
            !! (err (fn () => "Expected </" ^ s ^ ">"))
wenzelm@38228
   209
              (Scan.this_string ("</" ^ s) --| blanks --| $$ ">"))) >> Elem) xs;
wenzelm@24264
   210
wenzelm@26984
   211
val parse_document =
wenzelm@26984
   212
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   213
  |-- parse_element;
wenzelm@24264
   214
wenzelm@26539
   215
fun parse s =
wenzelm@43947
   216
  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
wenzelm@40627
   217
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
wenzelm@24264
   218
    (x, []) => x
wenzelm@24264
   219
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   220
wenzelm@24264
   221
end;
wenzelm@26546
   222
wenzelm@43767
   223
wenzelm@43767
   224
wenzelm@43767
   225
(** XML as data representation language **)
wenzelm@43767
   226
wenzelm@43767
   227
exception XML_ATOM of string;
wenzelm@43767
   228
exception XML_BODY of tree list;
wenzelm@43767
   229
wenzelm@43767
   230
wenzelm@43767
   231
structure Encode =
wenzelm@43767
   232
struct
wenzelm@43767
   233
wenzelm@43778
   234
type 'a A = 'a -> string;
wenzelm@43767
   235
type 'a T = 'a -> body;
wenzelm@43778
   236
type 'a V = 'a -> string list * body;
wenzelm@43767
   237
wenzelm@43767
   238
wenzelm@43778
   239
(* atomic values *)
wenzelm@43767
   240
wenzelm@43767
   241
fun int_atom i = signed_string_of_int i;
wenzelm@43767
   242
wenzelm@43767
   243
fun bool_atom false = "0"
wenzelm@43767
   244
  | bool_atom true = "1";
wenzelm@43767
   245
wenzelm@43767
   246
fun unit_atom () = "";
wenzelm@43767
   247
wenzelm@43767
   248
wenzelm@43767
   249
(* structural nodes *)
wenzelm@43767
   250
wenzelm@43767
   251
fun node ts = Elem ((":", []), ts);
wenzelm@43767
   252
wenzelm@43778
   253
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
wenzelm@43778
   254
wenzelm@43778
   255
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
wenzelm@43767
   256
wenzelm@43767
   257
wenzelm@43767
   258
(* representation of standard types *)
wenzelm@43767
   259
wenzelm@43767
   260
fun properties props = [Elem ((":", props), [])];
wenzelm@43767
   261
wenzelm@43767
   262
fun string "" = []
wenzelm@43767
   263
  | string s = [Text s];
wenzelm@43767
   264
wenzelm@43767
   265
val int = string o int_atom;
wenzelm@43767
   266
wenzelm@43767
   267
val bool = string o bool_atom;
wenzelm@43767
   268
wenzelm@43767
   269
val unit = string o unit_atom;
wenzelm@43767
   270
wenzelm@43767
   271
fun pair f g (x, y) = [node (f x), node (g y)];
wenzelm@43767
   272
wenzelm@43767
   273
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
wenzelm@43767
   274
wenzelm@43767
   275
fun list f xs = map (node o f) xs;
wenzelm@43767
   276
wenzelm@43767
   277
fun option _ NONE = []
wenzelm@43767
   278
  | option f (SOME x) = [node (f x)];
wenzelm@43767
   279
wenzelm@43767
   280
fun variant fs x = [tagged (the (get_index (fn f => try f x) fs))];
wenzelm@43767
   281
wenzelm@26546
   282
end;
wenzelm@43767
   283
wenzelm@43767
   284
wenzelm@43767
   285
structure Decode =
wenzelm@43767
   286
struct
wenzelm@43767
   287
wenzelm@43778
   288
type 'a A = string -> 'a;
wenzelm@43767
   289
type 'a T = body -> 'a;
wenzelm@43778
   290
type 'a V = string list * body -> 'a;
wenzelm@43767
   291
wenzelm@43767
   292
wenzelm@43778
   293
(* atomic values *)
wenzelm@43767
   294
wenzelm@43767
   295
fun int_atom s =
wenzelm@43797
   296
  Markup.parse_int s
wenzelm@43797
   297
    handle Fail _ => raise XML_ATOM s;
wenzelm@43767
   298
wenzelm@43767
   299
fun bool_atom "0" = false
wenzelm@43767
   300
  | bool_atom "1" = true
wenzelm@43767
   301
  | bool_atom s = raise XML_ATOM s;
wenzelm@43767
   302
wenzelm@43767
   303
fun unit_atom "" = ()
wenzelm@43767
   304
  | unit_atom s = raise XML_ATOM s;
wenzelm@43767
   305
wenzelm@43767
   306
wenzelm@43767
   307
(* structural nodes *)
wenzelm@43767
   308
wenzelm@43767
   309
fun node (Elem ((":", []), ts)) = ts
wenzelm@43767
   310
  | node t = raise XML_BODY [t];
wenzelm@43767
   311
wenzelm@43783
   312
fun vector atts =
wenzelm@43844
   313
  #1 (fold_map (fn (a, x) =>
wenzelm@43844
   314
        fn i => if int_atom a = i then (x, i + 1) else raise XML_ATOM a) atts 0);
wenzelm@43778
   315
wenzelm@43844
   316
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
wenzelm@43767
   317
  | tagged t = raise XML_BODY [t];
wenzelm@43767
   318
wenzelm@43767
   319
wenzelm@43767
   320
(* representation of standard types *)
wenzelm@43767
   321
wenzelm@43767
   322
fun properties [Elem ((":", props), [])] = props
wenzelm@43767
   323
  | properties ts = raise XML_BODY ts;
wenzelm@43767
   324
wenzelm@43767
   325
fun string [] = ""
wenzelm@43767
   326
  | string [Text s] = s
wenzelm@43767
   327
  | string ts = raise XML_BODY ts;
wenzelm@43767
   328
wenzelm@43767
   329
val int = int_atom o string;
wenzelm@43767
   330
wenzelm@43767
   331
val bool = bool_atom o string;
wenzelm@43767
   332
wenzelm@43767
   333
val unit = unit_atom o string;
wenzelm@43767
   334
wenzelm@43767
   335
fun pair f g [t1, t2] = (f (node t1), g (node t2))
wenzelm@43767
   336
  | pair _ _ ts = raise XML_BODY ts;
wenzelm@43767
   337
wenzelm@43767
   338
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
wenzelm@43767
   339
  | triple _ _ _ ts = raise XML_BODY ts;
wenzelm@43767
   340
wenzelm@43767
   341
fun list f ts = map (f o node) ts;
wenzelm@43767
   342
wenzelm@43767
   343
fun option _ [] = NONE
wenzelm@43767
   344
  | option f [t] = SOME (f (node t))
wenzelm@43767
   345
  | option _ ts = raise XML_BODY ts;
wenzelm@43767
   346
wenzelm@43768
   347
fun variant fs [t] =
wenzelm@43768
   348
      let
wenzelm@43778
   349
        val (tag, (xs, ts)) = tagged t;
wenzelm@43768
   350
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
wenzelm@43778
   351
      in f (xs, ts) end
wenzelm@43767
   352
  | variant _ ts = raise XML_BODY ts;
wenzelm@43767
   353
wenzelm@43767
   354
end;
wenzelm@43767
   355
wenzelm@43767
   356
end;