src/Pure/PIDE/xml.ML
author wenzelm
Tue Aug 12 18:36:43 2014 +0200 (2014-08-12)
changeset 57916 2c2c24dbf0a4
parent 56059 2390391584c2
child 58854 b979c781c2db
permissions -rw-r--r--
generic process wrapping in Prover;
clarified module arrangement;
wenzelm@44698
     1
(*  Title:      Pure/PIDE/xml.ML
wenzelm@44698
     2
    Author:     David Aspinall
wenzelm@44698
     3
    Author:     Stefan Berghofer
wenzelm@44698
     4
    Author:     Makarius
wenzelm@24264
     5
wenzelm@46840
     6
Untyped XML trees and representation of ML values.
wenzelm@24264
     7
*)
wenzelm@24264
     8
wenzelm@43767
     9
signature XML_DATA_OPS =
wenzelm@43767
    10
sig
wenzelm@43778
    11
  type 'a A
wenzelm@43767
    12
  type 'a T
wenzelm@43778
    13
  type 'a V
wenzelm@43778
    14
  val int_atom: int A
wenzelm@43778
    15
  val bool_atom: bool A
wenzelm@43778
    16
  val unit_atom: unit A
wenzelm@43767
    17
  val properties: Properties.T T
wenzelm@43767
    18
  val string: string T
wenzelm@43767
    19
  val int: int T
wenzelm@43767
    20
  val bool: bool T
wenzelm@43767
    21
  val unit: unit T
wenzelm@43767
    22
  val pair: 'a T -> 'b T -> ('a * 'b) T
wenzelm@43767
    23
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
wenzelm@43767
    24
  val list: 'a T -> 'a list T
wenzelm@43767
    25
  val option: 'a T -> 'a option T
wenzelm@43778
    26
  val variant: 'a V list -> 'a T
wenzelm@43767
    27
end;
wenzelm@43767
    28
wenzelm@24264
    29
signature XML =
wenzelm@24264
    30
sig
wenzelm@46837
    31
  type attributes = (string * string) list
wenzelm@24264
    32
  datatype tree =
wenzelm@46837
    33
      Elem of (string * attributes) * tree list
wenzelm@24264
    34
    | Text of string
wenzelm@38266
    35
  type body = tree list
wenzelm@49650
    36
  val wrap_elem: ((string * attributes) * tree list) * tree list -> tree
wenzelm@49650
    37
  val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option
wenzelm@26546
    38
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@39555
    39
  val content_of: body -> string
wenzelm@56059
    40
  val trim_blanks: body -> body
wenzelm@26546
    41
  val header: string
wenzelm@26546
    42
  val text: string -> string
wenzelm@26546
    43
  val element: string -> attributes -> string list -> string
wenzelm@40131
    44
  val output_markup: Markup.T -> Output.output * Output.output
wenzelm@26539
    45
  val string_of: tree -> string
wenzelm@43791
    46
  val pretty: int -> tree -> Pretty.T
wenzelm@26546
    47
  val output: tree -> TextIO.outstream -> unit
wenzelm@26984
    48
  val parse_comments: string list -> unit * string list
wenzelm@24264
    49
  val parse_string : string -> string option
wenzelm@26546
    50
  val parse_element: string list -> tree * string list
wenzelm@26984
    51
  val parse_document: string list -> tree * string list
wenzelm@26539
    52
  val parse: string -> tree
wenzelm@43767
    53
  exception XML_ATOM of string
wenzelm@43767
    54
  exception XML_BODY of body
wenzelm@43778
    55
  structure Encode: XML_DATA_OPS
wenzelm@43778
    56
  structure Decode: XML_DATA_OPS
wenzelm@24264
    57
end;
wenzelm@24264
    58
wenzelm@24264
    59
structure XML: XML =
wenzelm@24264
    60
struct
wenzelm@24264
    61
wenzelm@26546
    62
(** XML trees **)
wenzelm@26546
    63
wenzelm@46837
    64
type attributes = (string * string) list;
wenzelm@26546
    65
wenzelm@26546
    66
datatype tree =
wenzelm@46837
    67
    Elem of (string * attributes) * tree list
wenzelm@28033
    68
  | Text of string;
wenzelm@26546
    69
wenzelm@38266
    70
type body = tree list;
wenzelm@38266
    71
wenzelm@49650
    72
wenzelm@49650
    73
(* wrapped elements *)
wenzelm@49650
    74
wenzelm@49650
    75
val xml_elemN = "xml_elem";
wenzelm@49650
    76
val xml_nameN = "xml_name";
wenzelm@49650
    77
val xml_bodyN = "xml_body";
wenzelm@49650
    78
wenzelm@49650
    79
fun wrap_elem (((a, atts), body1), body2) =
wenzelm@49650
    80
  Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2);
wenzelm@49650
    81
wenzelm@49650
    82
fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) =
wenzelm@49650
    83
      if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts'
wenzelm@49650
    84
      then SOME (((a, atts), body1), body2) else NONE
wenzelm@49650
    85
  | unwrap_elem _ = NONE;
wenzelm@49650
    86
wenzelm@49650
    87
wenzelm@49650
    88
(* text context *)
wenzelm@49650
    89
wenzelm@49650
    90
fun add_content tree =
wenzelm@49650
    91
  (case unwrap_elem tree of
wenzelm@49650
    92
    SOME (_, ts) => fold add_content ts
wenzelm@49650
    93
  | NONE =>
wenzelm@49650
    94
      (case tree of
wenzelm@49650
    95
        Elem (_, ts) => fold add_content ts
wenzelm@49650
    96
      | Text s => Buffer.add s));
wenzelm@26546
    97
wenzelm@39555
    98
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
wenzelm@39555
    99
wenzelm@26546
   100
wenzelm@56059
   101
(* trim blanks *)
wenzelm@56059
   102
wenzelm@56059
   103
fun trim_blanks trees =
wenzelm@56059
   104
  trees |> maps
wenzelm@56059
   105
    (fn Elem (markup, body) => [Elem (markup, trim_blanks body)]
wenzelm@56059
   106
      | Text s =>
wenzelm@56059
   107
          let
wenzelm@56059
   108
            val s' = s
wenzelm@56059
   109
              |> raw_explode
wenzelm@56059
   110
              |> take_prefix Symbol.is_blank |> #2
wenzelm@56059
   111
              |> take_suffix Symbol.is_blank |> #1
wenzelm@56059
   112
              |> implode;
wenzelm@56059
   113
          in if s' = "" then [] else [Text s'] end);
wenzelm@56059
   114
wenzelm@56059
   115
wenzelm@24264
   116
wenzelm@26525
   117
(** string representation **)
wenzelm@26525
   118
wenzelm@24264
   119
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
   120
wenzelm@24264
   121
wenzelm@26546
   122
(* escaped text *)
wenzelm@24264
   123
wenzelm@24264
   124
fun decode "&lt;" = "<"
wenzelm@24264
   125
  | decode "&gt;" = ">"
wenzelm@24264
   126
  | decode "&amp;" = "&"
wenzelm@24264
   127
  | decode "&apos;" = "'"
wenzelm@24264
   128
  | decode "&quot;" = "\""
wenzelm@24264
   129
  | decode c = c;
wenzelm@24264
   130
wenzelm@24264
   131
fun encode "<" = "&lt;"
wenzelm@24264
   132
  | encode ">" = "&gt;"
wenzelm@24264
   133
  | encode "&" = "&amp;"
wenzelm@24264
   134
  | encode "'" = "&apos;"
wenzelm@24264
   135
  | encode "\"" = "&quot;"
wenzelm@24264
   136
  | encode c = c;
wenzelm@24264
   137
wenzelm@25838
   138
val text = translate_string encode;
wenzelm@24264
   139
wenzelm@24264
   140
wenzelm@24264
   141
(* elements *)
wenzelm@24264
   142
wenzelm@26539
   143
fun elem name atts =
wenzelm@26551
   144
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
   145
wenzelm@26525
   146
fun element name atts body =
wenzelm@26539
   147
  let val b = implode body in
wenzelm@26539
   148
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
   149
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
   150
  end;
wenzelm@24264
   151
wenzelm@27884
   152
fun output_markup (markup as (name, atts)) =
wenzelm@38474
   153
  if Markup.is_empty markup then Markup.no_output
wenzelm@27884
   154
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
   155
wenzelm@24264
   156
wenzelm@26546
   157
(* output *)
wenzelm@24264
   158
wenzelm@43791
   159
fun buffer_of depth tree =
wenzelm@24264
   160
  let
wenzelm@43791
   161
    fun traverse _ (Elem ((name, atts), [])) =
wenzelm@26539
   162
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@43791
   163
      | traverse d (Elem ((name, atts), ts)) =
wenzelm@26539
   164
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@43791
   165
          traverse_body d ts #>
wenzelm@26525
   166
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@43791
   167
      | traverse _ (Text s) = Buffer.add (text s)
wenzelm@43791
   168
    and traverse_body 0 _ = Buffer.add "..."
wenzelm@43791
   169
      | traverse_body d ts = fold (traverse (d - 1)) ts;
wenzelm@43791
   170
  in Buffer.empty |> traverse depth tree end;
wenzelm@24264
   171
wenzelm@43791
   172
val string_of = Buffer.content o buffer_of ~1;
wenzelm@43791
   173
val output = Buffer.output o buffer_of ~1;
wenzelm@43791
   174
wenzelm@43791
   175
fun pretty depth tree =
wenzelm@43791
   176
  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
wenzelm@25838
   177
wenzelm@24264
   178
wenzelm@24264
   179
wenzelm@44698
   180
(** XML parsing **)
wenzelm@26546
   181
wenzelm@26546
   182
local
wenzelm@24264
   183
wenzelm@43947
   184
fun err msg (xs, _) =
wenzelm@43947
   185
  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   186
wenzelm@26984
   187
fun ignored _ = [];
wenzelm@26984
   188
wenzelm@45155
   189
fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
wenzelm@45155
   190
fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
wenzelm@45155
   191
val parse_name = Scan.one name_start_char ::: Scan.many name_char;
wenzelm@45155
   192
wenzelm@26551
   193
val blanks = Scan.many Symbol.is_blank;
wenzelm@45155
   194
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
wenzelm@26551
   195
val regular = Scan.one Symbol.is_regular;
wenzelm@26551
   196
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
wenzelm@24264
   197
wenzelm@26551
   198
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   199
wenzelm@26551
   200
val parse_cdata =
wenzelm@26551
   201
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   202
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   203
  Scan.this_string "]]>";
wenzelm@24264
   204
wenzelm@24264
   205
val parse_att =
wenzelm@45155
   206
  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   207
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   208
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   209
wenzelm@26551
   210
val parse_comment =
wenzelm@26551
   211
  Scan.this_string "<!--" --
wenzelm@26551
   212
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   213
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   214
wenzelm@26551
   215
val parse_processing_instruction =
wenzelm@26551
   216
  Scan.this_string "<?" --
wenzelm@26551
   217
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   218
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   219
wenzelm@26984
   220
val parse_doctype =
wenzelm@26984
   221
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   222
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   223
  $$ ">" >> ignored;
wenzelm@26984
   224
wenzelm@26984
   225
val parse_misc =
wenzelm@26984
   226
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   227
  parse_processing_instruction ||
wenzelm@26984
   228
  parse_comment;
wenzelm@26551
   229
wenzelm@26551
   230
val parse_optional_text =
wenzelm@26551
   231
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   232
wenzelm@26546
   233
in
wenzelm@26546
   234
wenzelm@26984
   235
val parse_comments =
wenzelm@26984
   236
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   237
wenzelm@40627
   238
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
wenzelm@26546
   239
wenzelm@24264
   240
fun parse_content xs =
wenzelm@26551
   241
  (parse_optional_text @@@
wenzelm@26551
   242
    (Scan.repeat
wenzelm@26551
   243
      ((parse_element >> single ||
wenzelm@26551
   244
        parse_cdata >> (single o Text) ||
wenzelm@26984
   245
        parse_processing_instruction ||
wenzelm@26984
   246
        parse_comment)
wenzelm@26551
   247
      @@@ parse_optional_text) >> flat)) xs
wenzelm@24264
   248
wenzelm@26546
   249
and parse_element xs =
wenzelm@43949
   250
  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
wenzelm@43949
   251
    (fn (name, _) =>
wenzelm@43947
   252
      !! (err (fn () => "Expected > or />"))
wenzelm@43949
   253
       ($$ "/" -- $$ ">" >> ignored ||
wenzelm@43949
   254
        $$ ">" |-- parse_content --|
wenzelm@43949
   255
          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
wenzelm@43949
   256
              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
wenzelm@43949
   257
    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
wenzelm@24264
   258
wenzelm@26984
   259
val parse_document =
wenzelm@26984
   260
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   261
  |-- parse_element;
wenzelm@24264
   262
wenzelm@26539
   263
fun parse s =
wenzelm@43947
   264
  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
wenzelm@40627
   265
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
wenzelm@24264
   266
    (x, []) => x
wenzelm@48769
   267
  | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   268
wenzelm@24264
   269
end;
wenzelm@26546
   270
wenzelm@43767
   271
wenzelm@43767
   272
wenzelm@43767
   273
(** XML as data representation language **)
wenzelm@43767
   274
wenzelm@43767
   275
exception XML_ATOM of string;
wenzelm@43767
   276
exception XML_BODY of tree list;
wenzelm@43767
   277
wenzelm@43767
   278
wenzelm@43767
   279
structure Encode =
wenzelm@43767
   280
struct
wenzelm@43767
   281
wenzelm@43778
   282
type 'a A = 'a -> string;
wenzelm@43767
   283
type 'a T = 'a -> body;
wenzelm@43778
   284
type 'a V = 'a -> string list * body;
wenzelm@43767
   285
wenzelm@43767
   286
wenzelm@43778
   287
(* atomic values *)
wenzelm@43767
   288
wenzelm@43767
   289
fun int_atom i = signed_string_of_int i;
wenzelm@43767
   290
wenzelm@43767
   291
fun bool_atom false = "0"
wenzelm@43767
   292
  | bool_atom true = "1";
wenzelm@43767
   293
wenzelm@43767
   294
fun unit_atom () = "";
wenzelm@43767
   295
wenzelm@43767
   296
wenzelm@43767
   297
(* structural nodes *)
wenzelm@43767
   298
wenzelm@43767
   299
fun node ts = Elem ((":", []), ts);
wenzelm@43767
   300
wenzelm@43778
   301
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
wenzelm@43778
   302
wenzelm@43778
   303
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
wenzelm@43767
   304
wenzelm@43767
   305
wenzelm@43767
   306
(* representation of standard types *)
wenzelm@43767
   307
wenzelm@43767
   308
fun properties props = [Elem ((":", props), [])];
wenzelm@43767
   309
wenzelm@43767
   310
fun string "" = []
wenzelm@43767
   311
  | string s = [Text s];
wenzelm@43767
   312
wenzelm@43767
   313
val int = string o int_atom;
wenzelm@43767
   314
wenzelm@43767
   315
val bool = string o bool_atom;
wenzelm@43767
   316
wenzelm@43767
   317
val unit = string o unit_atom;
wenzelm@43767
   318
wenzelm@43767
   319
fun pair f g (x, y) = [node (f x), node (g y)];
wenzelm@43767
   320
wenzelm@43767
   321
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
wenzelm@43767
   322
wenzelm@43767
   323
fun list f xs = map (node o f) xs;
wenzelm@43767
   324
wenzelm@43767
   325
fun option _ NONE = []
wenzelm@43767
   326
  | option f (SOME x) = [node (f x)];
wenzelm@43767
   327
wenzelm@47199
   328
fun variant fs x =
wenzelm@47199
   329
  [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
wenzelm@43767
   330
wenzelm@26546
   331
end;
wenzelm@43767
   332
wenzelm@43767
   333
wenzelm@43767
   334
structure Decode =
wenzelm@43767
   335
struct
wenzelm@43767
   336
wenzelm@43778
   337
type 'a A = string -> 'a;
wenzelm@43767
   338
type 'a T = body -> 'a;
wenzelm@43778
   339
type 'a V = string list * body -> 'a;
wenzelm@43767
   340
wenzelm@43767
   341
wenzelm@43778
   342
(* atomic values *)
wenzelm@43767
   343
wenzelm@43767
   344
fun int_atom s =
wenzelm@43797
   345
  Markup.parse_int s
wenzelm@43797
   346
    handle Fail _ => raise XML_ATOM s;
wenzelm@43767
   347
wenzelm@43767
   348
fun bool_atom "0" = false
wenzelm@43767
   349
  | bool_atom "1" = true
wenzelm@43767
   350
  | bool_atom s = raise XML_ATOM s;
wenzelm@43767
   351
wenzelm@43767
   352
fun unit_atom "" = ()
wenzelm@43767
   353
  | unit_atom s = raise XML_ATOM s;
wenzelm@43767
   354
wenzelm@43767
   355
wenzelm@43767
   356
(* structural nodes *)
wenzelm@43767
   357
wenzelm@43767
   358
fun node (Elem ((":", []), ts)) = ts
wenzelm@43767
   359
  | node t = raise XML_BODY [t];
wenzelm@43767
   360
wenzelm@43783
   361
fun vector atts =
wenzelm@46839
   362
  map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
wenzelm@43778
   363
wenzelm@43844
   364
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
wenzelm@43767
   365
  | tagged t = raise XML_BODY [t];
wenzelm@43767
   366
wenzelm@43767
   367
wenzelm@43767
   368
(* representation of standard types *)
wenzelm@43767
   369
wenzelm@43767
   370
fun properties [Elem ((":", props), [])] = props
wenzelm@43767
   371
  | properties ts = raise XML_BODY ts;
wenzelm@43767
   372
wenzelm@43767
   373
fun string [] = ""
wenzelm@43767
   374
  | string [Text s] = s
wenzelm@43767
   375
  | string ts = raise XML_BODY ts;
wenzelm@43767
   376
wenzelm@43767
   377
val int = int_atom o string;
wenzelm@43767
   378
wenzelm@43767
   379
val bool = bool_atom o string;
wenzelm@43767
   380
wenzelm@43767
   381
val unit = unit_atom o string;
wenzelm@43767
   382
wenzelm@43767
   383
fun pair f g [t1, t2] = (f (node t1), g (node t2))
wenzelm@43767
   384
  | pair _ _ ts = raise XML_BODY ts;
wenzelm@43767
   385
wenzelm@43767
   386
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
wenzelm@43767
   387
  | triple _ _ _ ts = raise XML_BODY ts;
wenzelm@43767
   388
wenzelm@43767
   389
fun list f ts = map (f o node) ts;
wenzelm@43767
   390
wenzelm@43767
   391
fun option _ [] = NONE
wenzelm@43767
   392
  | option f [t] = SOME (f (node t))
wenzelm@43767
   393
  | option _ ts = raise XML_BODY ts;
wenzelm@43767
   394
wenzelm@43768
   395
fun variant fs [t] =
wenzelm@43768
   396
      let
wenzelm@43778
   397
        val (tag, (xs, ts)) = tagged t;
wenzelm@43768
   398
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
wenzelm@43778
   399
      in f (xs, ts) end
wenzelm@43767
   400
  | variant _ ts = raise XML_BODY ts;
wenzelm@43767
   401
wenzelm@43767
   402
end;
wenzelm@43767
   403
wenzelm@43767
   404
end;