src/Pure/PIDE/xml.ML
author wenzelm
Sun Mar 10 14:19:30 2019 +0100 (4 months ago ago)
changeset 70070 be04e9a053a7
parent 69985 2156053c4ce9
permissions -rw-r--r--
markup and document markers for some meta data from "Dublin Core Metadata Element Set";
wenzelm@44698
     1
(*  Title:      Pure/PIDE/xml.ML
wenzelm@44698
     2
    Author:     David Aspinall
wenzelm@44698
     3
    Author:     Stefan Berghofer
wenzelm@44698
     4
    Author:     Makarius
wenzelm@24264
     5
wenzelm@46840
     6
Untyped XML trees and representation of ML values.
wenzelm@24264
     7
*)
wenzelm@24264
     8
wenzelm@43767
     9
signature XML_DATA_OPS =
wenzelm@43767
    10
sig
wenzelm@43778
    11
  type 'a A
wenzelm@43767
    12
  type 'a T
wenzelm@43778
    13
  type 'a V
wenzelm@43778
    14
  val int_atom: int A
wenzelm@43778
    15
  val bool_atom: bool A
wenzelm@43778
    16
  val unit_atom: unit A
wenzelm@43767
    17
  val properties: Properties.T T
wenzelm@43767
    18
  val string: string T
wenzelm@43767
    19
  val int: int T
wenzelm@43767
    20
  val bool: bool T
wenzelm@43767
    21
  val unit: unit T
wenzelm@43767
    22
  val pair: 'a T -> 'b T -> ('a * 'b) T
wenzelm@43767
    23
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
wenzelm@43767
    24
  val list: 'a T -> 'a list T
wenzelm@43767
    25
  val option: 'a T -> 'a option T
wenzelm@43778
    26
  val variant: 'a V list -> 'a T
wenzelm@43767
    27
end;
wenzelm@43767
    28
wenzelm@24264
    29
signature XML =
wenzelm@24264
    30
sig
wenzelm@46837
    31
  type attributes = (string * string) list
wenzelm@24264
    32
  datatype tree =
wenzelm@46837
    33
      Elem of (string * attributes) * tree list
wenzelm@24264
    34
    | Text of string
wenzelm@38266
    35
  type body = tree list
wenzelm@69239
    36
  val xml_elemN: string
wenzelm@69239
    37
  val xml_nameN: string
wenzelm@69239
    38
  val xml_bodyN: string
wenzelm@49650
    39
  val wrap_elem: ((string * attributes) * tree list) * tree list -> tree
wenzelm@49650
    40
  val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option
wenzelm@26546
    41
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@39555
    42
  val content_of: body -> string
wenzelm@56059
    43
  val trim_blanks: body -> body
wenzelm@26546
    44
  val header: string
wenzelm@26546
    45
  val text: string -> string
wenzelm@26546
    46
  val element: string -> attributes -> string list -> string
wenzelm@69355
    47
  val output_markup: Markup.T -> Markup.output
wenzelm@26539
    48
  val string_of: tree -> string
wenzelm@43791
    49
  val pretty: int -> tree -> Pretty.T
wenzelm@60982
    50
  val output: tree -> BinIO.outstream -> unit
wenzelm@26984
    51
  val parse_comments: string list -> unit * string list
wenzelm@24264
    52
  val parse_string : string -> string option
wenzelm@26546
    53
  val parse_element: string list -> tree * string list
wenzelm@26984
    54
  val parse_document: string list -> tree * string list
wenzelm@26539
    55
  val parse: string -> tree
wenzelm@43767
    56
  exception XML_ATOM of string
wenzelm@43767
    57
  exception XML_BODY of body
wenzelm@65333
    58
  structure Encode:
wenzelm@65333
    59
  sig
wenzelm@65333
    60
    include XML_DATA_OPS
wenzelm@65333
    61
    val tree: tree T
wenzelm@65333
    62
  end
wenzelm@65333
    63
  structure Decode:
wenzelm@65333
    64
  sig
wenzelm@65333
    65
    include XML_DATA_OPS
wenzelm@65333
    66
    val tree: tree T
wenzelm@65333
    67
  end
wenzelm@24264
    68
end;
wenzelm@24264
    69
wenzelm@24264
    70
structure XML: XML =
wenzelm@24264
    71
struct
wenzelm@24264
    72
wenzelm@26546
    73
(** XML trees **)
wenzelm@26546
    74
wenzelm@46837
    75
type attributes = (string * string) list;
wenzelm@26546
    76
wenzelm@26546
    77
datatype tree =
wenzelm@46837
    78
    Elem of (string * attributes) * tree list
wenzelm@28033
    79
  | Text of string;
wenzelm@26546
    80
wenzelm@38266
    81
type body = tree list;
wenzelm@38266
    82
wenzelm@49650
    83
wenzelm@49650
    84
(* wrapped elements *)
wenzelm@49650
    85
wenzelm@49650
    86
val xml_elemN = "xml_elem";
wenzelm@49650
    87
val xml_nameN = "xml_name";
wenzelm@49650
    88
val xml_bodyN = "xml_body";
wenzelm@49650
    89
wenzelm@49650
    90
fun wrap_elem (((a, atts), body1), body2) =
wenzelm@49650
    91
  Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2);
wenzelm@49650
    92
wenzelm@49650
    93
fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) =
wenzelm@49650
    94
      if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts'
wenzelm@49650
    95
      then SOME (((a, atts), body1), body2) else NONE
wenzelm@49650
    96
  | unwrap_elem _ = NONE;
wenzelm@49650
    97
wenzelm@49650
    98
wenzelm@69229
    99
(* text content *)
wenzelm@49650
   100
wenzelm@49650
   101
fun add_content tree =
wenzelm@49650
   102
  (case unwrap_elem tree of
wenzelm@49650
   103
    SOME (_, ts) => fold add_content ts
wenzelm@49650
   104
  | NONE =>
wenzelm@49650
   105
      (case tree of
wenzelm@49650
   106
        Elem (_, ts) => fold add_content ts
wenzelm@49650
   107
      | Text s => Buffer.add s));
wenzelm@26546
   108
wenzelm@39555
   109
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
wenzelm@39555
   110
wenzelm@26546
   111
wenzelm@56059
   112
(* trim blanks *)
wenzelm@56059
   113
wenzelm@56059
   114
fun trim_blanks trees =
wenzelm@56059
   115
  trees |> maps
wenzelm@56059
   116
    (fn Elem (markup, body) => [Elem (markup, trim_blanks body)]
wenzelm@56059
   117
      | Text s =>
wenzelm@61707
   118
          let val s' = s |> raw_explode |> trim Symbol.is_blank |> implode;
wenzelm@56059
   119
          in if s' = "" then [] else [Text s'] end);
wenzelm@56059
   120
wenzelm@56059
   121
wenzelm@24264
   122
wenzelm@26525
   123
(** string representation **)
wenzelm@26525
   124
wenzelm@69985
   125
val header = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
wenzelm@24264
   126
wenzelm@24264
   127
wenzelm@26546
   128
(* escaped text *)
wenzelm@24264
   129
wenzelm@24264
   130
fun decode "&lt;" = "<"
wenzelm@24264
   131
  | decode "&gt;" = ">"
wenzelm@24264
   132
  | decode "&amp;" = "&"
wenzelm@24264
   133
  | decode "&apos;" = "'"
wenzelm@24264
   134
  | decode "&quot;" = "\""
wenzelm@24264
   135
  | decode c = c;
wenzelm@24264
   136
wenzelm@24264
   137
fun encode "<" = "&lt;"
wenzelm@24264
   138
  | encode ">" = "&gt;"
wenzelm@24264
   139
  | encode "&" = "&amp;"
wenzelm@24264
   140
  | encode "'" = "&apos;"
wenzelm@24264
   141
  | encode "\"" = "&quot;"
wenzelm@24264
   142
  | encode c = c;
wenzelm@24264
   143
wenzelm@25838
   144
val text = translate_string encode;
wenzelm@24264
   145
wenzelm@24264
   146
wenzelm@24264
   147
(* elements *)
wenzelm@24264
   148
wenzelm@26539
   149
fun elem name atts =
wenzelm@26551
   150
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
   151
wenzelm@26525
   152
fun element name atts body =
wenzelm@26539
   153
  let val b = implode body in
wenzelm@26539
   154
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
   155
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
   156
  end;
wenzelm@24264
   157
wenzelm@27884
   158
fun output_markup (markup as (name, atts)) =
wenzelm@38474
   159
  if Markup.is_empty markup then Markup.no_output
wenzelm@27884
   160
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
   161
wenzelm@24264
   162
wenzelm@26546
   163
(* output *)
wenzelm@24264
   164
wenzelm@43791
   165
fun buffer_of depth tree =
wenzelm@24264
   166
  let
wenzelm@43791
   167
    fun traverse _ (Elem ((name, atts), [])) =
wenzelm@26539
   168
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@43791
   169
      | traverse d (Elem ((name, atts), ts)) =
wenzelm@26539
   170
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@43791
   171
          traverse_body d ts #>
wenzelm@26525
   172
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@43791
   173
      | traverse _ (Text s) = Buffer.add (text s)
wenzelm@43791
   174
    and traverse_body 0 _ = Buffer.add "..."
wenzelm@43791
   175
      | traverse_body d ts = fold (traverse (d - 1)) ts;
wenzelm@43791
   176
  in Buffer.empty |> traverse depth tree end;
wenzelm@24264
   177
wenzelm@43791
   178
val string_of = Buffer.content o buffer_of ~1;
wenzelm@43791
   179
val output = Buffer.output o buffer_of ~1;
wenzelm@43791
   180
wenzelm@43791
   181
fun pretty depth tree =
wenzelm@43791
   182
  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
wenzelm@25838
   183
wenzelm@62819
   184
val _ = ML_system_pp (fn depth => fn _ => Pretty.to_polyml o pretty (FixedInt.toInt depth));
wenzelm@62663
   185
wenzelm@24264
   186
wenzelm@24264
   187
wenzelm@44698
   188
(** XML parsing **)
wenzelm@26546
   189
wenzelm@26546
   190
local
wenzelm@24264
   191
wenzelm@43947
   192
fun err msg (xs, _) =
wenzelm@43947
   193
  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   194
wenzelm@26984
   195
fun ignored _ = [];
wenzelm@26984
   196
wenzelm@45155
   197
fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
wenzelm@45155
   198
fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
wenzelm@45155
   199
val parse_name = Scan.one name_start_char ::: Scan.many name_char;
wenzelm@45155
   200
wenzelm@26551
   201
val blanks = Scan.many Symbol.is_blank;
wenzelm@45155
   202
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
wenzelm@58854
   203
val regular = Scan.one Symbol.not_eof;
wenzelm@58854
   204
fun regular_except x = Scan.one (fn c => Symbol.not_eof c andalso c <> x);
wenzelm@24264
   205
wenzelm@26551
   206
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   207
wenzelm@26551
   208
val parse_cdata =
wenzelm@26551
   209
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   210
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   211
  Scan.this_string "]]>";
wenzelm@24264
   212
wenzelm@24264
   213
val parse_att =
wenzelm@45155
   214
  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   215
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   216
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   217
wenzelm@26551
   218
val parse_comment =
wenzelm@26551
   219
  Scan.this_string "<!--" --
wenzelm@26551
   220
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   221
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   222
wenzelm@26551
   223
val parse_processing_instruction =
wenzelm@26551
   224
  Scan.this_string "<?" --
wenzelm@26551
   225
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   226
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   227
wenzelm@26984
   228
val parse_doctype =
wenzelm@26984
   229
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   230
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   231
  $$ ">" >> ignored;
wenzelm@26984
   232
wenzelm@26984
   233
val parse_misc =
wenzelm@26984
   234
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   235
  parse_processing_instruction ||
wenzelm@26984
   236
  parse_comment;
wenzelm@26551
   237
wenzelm@26551
   238
val parse_optional_text =
wenzelm@26551
   239
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   240
wenzelm@26546
   241
in
wenzelm@26546
   242
wenzelm@26984
   243
val parse_comments =
wenzelm@26984
   244
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   245
wenzelm@40627
   246
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
wenzelm@26546
   247
wenzelm@24264
   248
fun parse_content xs =
wenzelm@26551
   249
  (parse_optional_text @@@
wenzelm@61476
   250
    Scan.repeats
wenzelm@26551
   251
      ((parse_element >> single ||
wenzelm@26551
   252
        parse_cdata >> (single o Text) ||
wenzelm@26984
   253
        parse_processing_instruction ||
wenzelm@26984
   254
        parse_comment)
wenzelm@61476
   255
      @@@ parse_optional_text)) xs
wenzelm@24264
   256
wenzelm@26546
   257
and parse_element xs =
wenzelm@43949
   258
  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
wenzelm@43949
   259
    (fn (name, _) =>
wenzelm@43947
   260
      !! (err (fn () => "Expected > or />"))
wenzelm@43949
   261
       ($$ "/" -- $$ ">" >> ignored ||
wenzelm@43949
   262
        $$ ">" |-- parse_content --|
wenzelm@43949
   263
          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
wenzelm@43949
   264
              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
wenzelm@43949
   265
    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
wenzelm@24264
   266
wenzelm@26984
   267
val parse_document =
wenzelm@26984
   268
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   269
  |-- parse_element;
wenzelm@24264
   270
wenzelm@26539
   271
fun parse s =
wenzelm@43947
   272
  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
wenzelm@40627
   273
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
wenzelm@24264
   274
    (x, []) => x
wenzelm@48769
   275
  | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   276
wenzelm@24264
   277
end;
wenzelm@26546
   278
wenzelm@43767
   279
wenzelm@43767
   280
wenzelm@43767
   281
(** XML as data representation language **)
wenzelm@43767
   282
wenzelm@43767
   283
exception XML_ATOM of string;
wenzelm@43767
   284
exception XML_BODY of tree list;
wenzelm@43767
   285
wenzelm@43767
   286
wenzelm@43767
   287
structure Encode =
wenzelm@43767
   288
struct
wenzelm@43767
   289
wenzelm@43778
   290
type 'a A = 'a -> string;
wenzelm@43767
   291
type 'a T = 'a -> body;
wenzelm@43778
   292
type 'a V = 'a -> string list * body;
wenzelm@43767
   293
wenzelm@43767
   294
wenzelm@43778
   295
(* atomic values *)
wenzelm@43767
   296
wenzelm@69586
   297
fun int_atom i = Value.print_int i;
wenzelm@43767
   298
wenzelm@43767
   299
fun bool_atom false = "0"
wenzelm@43767
   300
  | bool_atom true = "1";
wenzelm@43767
   301
wenzelm@43767
   302
fun unit_atom () = "";
wenzelm@43767
   303
wenzelm@43767
   304
wenzelm@43767
   305
(* structural nodes *)
wenzelm@43767
   306
wenzelm@43767
   307
fun node ts = Elem ((":", []), ts);
wenzelm@43767
   308
wenzelm@43778
   309
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
wenzelm@43778
   310
wenzelm@43778
   311
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
wenzelm@43767
   312
wenzelm@43767
   313
wenzelm@43767
   314
(* representation of standard types *)
wenzelm@43767
   315
wenzelm@65333
   316
fun tree (t: tree) = [t];
wenzelm@65333
   317
wenzelm@43767
   318
fun properties props = [Elem ((":", props), [])];
wenzelm@43767
   319
wenzelm@43767
   320
fun string "" = []
wenzelm@43767
   321
  | string s = [Text s];
wenzelm@43767
   322
wenzelm@43767
   323
val int = string o int_atom;
wenzelm@43767
   324
wenzelm@43767
   325
val bool = string o bool_atom;
wenzelm@43767
   326
wenzelm@43767
   327
val unit = string o unit_atom;
wenzelm@43767
   328
wenzelm@43767
   329
fun pair f g (x, y) = [node (f x), node (g y)];
wenzelm@43767
   330
wenzelm@43767
   331
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
wenzelm@43767
   332
wenzelm@43767
   333
fun list f xs = map (node o f) xs;
wenzelm@43767
   334
wenzelm@43767
   335
fun option _ NONE = []
wenzelm@43767
   336
  | option f (SOME x) = [node (f x)];
wenzelm@43767
   337
wenzelm@47199
   338
fun variant fs x =
wenzelm@47199
   339
  [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
wenzelm@43767
   340
wenzelm@26546
   341
end;
wenzelm@43767
   342
wenzelm@43767
   343
wenzelm@43767
   344
structure Decode =
wenzelm@43767
   345
struct
wenzelm@43767
   346
wenzelm@43778
   347
type 'a A = string -> 'a;
wenzelm@43767
   348
type 'a T = body -> 'a;
wenzelm@43778
   349
type 'a V = string list * body -> 'a;
wenzelm@43767
   350
wenzelm@43767
   351
wenzelm@43778
   352
(* atomic values *)
wenzelm@43767
   353
wenzelm@43767
   354
fun int_atom s =
wenzelm@63806
   355
  Value.parse_int s
wenzelm@43797
   356
    handle Fail _ => raise XML_ATOM s;
wenzelm@43767
   357
wenzelm@43767
   358
fun bool_atom "0" = false
wenzelm@43767
   359
  | bool_atom "1" = true
wenzelm@43767
   360
  | bool_atom s = raise XML_ATOM s;
wenzelm@43767
   361
wenzelm@43767
   362
fun unit_atom "" = ()
wenzelm@43767
   363
  | unit_atom s = raise XML_ATOM s;
wenzelm@43767
   364
wenzelm@43767
   365
wenzelm@43767
   366
(* structural nodes *)
wenzelm@43767
   367
wenzelm@43767
   368
fun node (Elem ((":", []), ts)) = ts
wenzelm@43767
   369
  | node t = raise XML_BODY [t];
wenzelm@43767
   370
wenzelm@43783
   371
fun vector atts =
wenzelm@46839
   372
  map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
wenzelm@43778
   373
wenzelm@43844
   374
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
wenzelm@43767
   375
  | tagged t = raise XML_BODY [t];
wenzelm@43767
   376
wenzelm@43767
   377
wenzelm@43767
   378
(* representation of standard types *)
wenzelm@43767
   379
wenzelm@65333
   380
fun tree [t] = t
wenzelm@65333
   381
  | tree ts = raise XML_BODY ts;
wenzelm@65333
   382
wenzelm@43767
   383
fun properties [Elem ((":", props), [])] = props
wenzelm@43767
   384
  | properties ts = raise XML_BODY ts;
wenzelm@43767
   385
wenzelm@43767
   386
fun string [] = ""
wenzelm@43767
   387
  | string [Text s] = s
wenzelm@43767
   388
  | string ts = raise XML_BODY ts;
wenzelm@43767
   389
wenzelm@43767
   390
val int = int_atom o string;
wenzelm@43767
   391
wenzelm@43767
   392
val bool = bool_atom o string;
wenzelm@43767
   393
wenzelm@43767
   394
val unit = unit_atom o string;
wenzelm@43767
   395
wenzelm@43767
   396
fun pair f g [t1, t2] = (f (node t1), g (node t2))
wenzelm@43767
   397
  | pair _ _ ts = raise XML_BODY ts;
wenzelm@43767
   398
wenzelm@43767
   399
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
wenzelm@43767
   400
  | triple _ _ _ ts = raise XML_BODY ts;
wenzelm@43767
   401
wenzelm@43767
   402
fun list f ts = map (f o node) ts;
wenzelm@43767
   403
wenzelm@43767
   404
fun option _ [] = NONE
wenzelm@43767
   405
  | option f [t] = SOME (f (node t))
wenzelm@43767
   406
  | option _ ts = raise XML_BODY ts;
wenzelm@43767
   407
wenzelm@43768
   408
fun variant fs [t] =
wenzelm@43768
   409
      let
wenzelm@43778
   410
        val (tag, (xs, ts)) = tagged t;
wenzelm@43768
   411
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
wenzelm@43778
   412
      in f (xs, ts) end
wenzelm@43767
   413
  | variant _ ts = raise XML_BODY ts;
wenzelm@43767
   414
wenzelm@43767
   415
end;
wenzelm@43767
   416
wenzelm@43767
   417
end;