src/Pure/PIDE/xml.ML
author wenzelm
Thu Aug 02 12:36:54 2012 +0200 (2012-08-02)
changeset 48646 91281e9472d8
parent 47199 15ede9f1da3f
child 48769 e3b7087bb923
permissions -rw-r--r--
more official command specifications, including source position;
wenzelm@44698
     1
(*  Title:      Pure/PIDE/xml.ML
wenzelm@44698
     2
    Author:     David Aspinall
wenzelm@44698
     3
    Author:     Stefan Berghofer
wenzelm@44698
     4
    Author:     Makarius
wenzelm@24264
     5
wenzelm@46840
     6
Untyped XML trees and representation of ML values.
wenzelm@24264
     7
*)
wenzelm@24264
     8
wenzelm@43767
     9
signature XML_DATA_OPS =
wenzelm@43767
    10
sig
wenzelm@43778
    11
  type 'a A
wenzelm@43767
    12
  type 'a T
wenzelm@43778
    13
  type 'a V
wenzelm@43778
    14
  val int_atom: int A
wenzelm@43778
    15
  val bool_atom: bool A
wenzelm@43778
    16
  val unit_atom: unit A
wenzelm@43767
    17
  val properties: Properties.T T
wenzelm@43767
    18
  val string: string T
wenzelm@43767
    19
  val int: int T
wenzelm@43767
    20
  val bool: bool T
wenzelm@43767
    21
  val unit: unit T
wenzelm@43767
    22
  val pair: 'a T -> 'b T -> ('a * 'b) T
wenzelm@43767
    23
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
wenzelm@43767
    24
  val list: 'a T -> 'a list T
wenzelm@43767
    25
  val option: 'a T -> 'a option T
wenzelm@43778
    26
  val variant: 'a V list -> 'a T
wenzelm@43767
    27
end;
wenzelm@43767
    28
wenzelm@24264
    29
signature XML =
wenzelm@24264
    30
sig
wenzelm@46837
    31
  type attributes = (string * string) list
wenzelm@24264
    32
  datatype tree =
wenzelm@46837
    33
      Elem of (string * attributes) * tree list
wenzelm@24264
    34
    | Text of string
wenzelm@38266
    35
  type body = tree list
wenzelm@26546
    36
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@39555
    37
  val content_of: body -> string
wenzelm@26546
    38
  val header: string
wenzelm@26546
    39
  val text: string -> string
wenzelm@26546
    40
  val element: string -> attributes -> string list -> string
wenzelm@40131
    41
  val output_markup: Markup.T -> Output.output * Output.output
wenzelm@26539
    42
  val string_of: tree -> string
wenzelm@43791
    43
  val pretty: int -> tree -> Pretty.T
wenzelm@26546
    44
  val output: tree -> TextIO.outstream -> unit
wenzelm@26984
    45
  val parse_comments: string list -> unit * string list
wenzelm@24264
    46
  val parse_string : string -> string option
wenzelm@26546
    47
  val parse_element: string list -> tree * string list
wenzelm@26984
    48
  val parse_document: string list -> tree * string list
wenzelm@26539
    49
  val parse: string -> tree
wenzelm@44808
    50
  val cache: unit -> tree -> tree
wenzelm@43767
    51
  exception XML_ATOM of string
wenzelm@43767
    52
  exception XML_BODY of body
wenzelm@43778
    53
  structure Encode: XML_DATA_OPS
wenzelm@43778
    54
  structure Decode: XML_DATA_OPS
wenzelm@24264
    55
end;
wenzelm@24264
    56
wenzelm@24264
    57
structure XML: XML =
wenzelm@24264
    58
struct
wenzelm@24264
    59
wenzelm@26546
    60
(** XML trees **)
wenzelm@26546
    61
wenzelm@46837
    62
type attributes = (string * string) list;
wenzelm@26546
    63
wenzelm@26546
    64
datatype tree =
wenzelm@46837
    65
    Elem of (string * attributes) * tree list
wenzelm@28033
    66
  | Text of string;
wenzelm@26546
    67
wenzelm@38266
    68
type body = tree list;
wenzelm@38266
    69
wenzelm@38228
    70
fun add_content (Elem (_, ts)) = fold add_content ts
wenzelm@28033
    71
  | add_content (Text s) = Buffer.add s;
wenzelm@26546
    72
wenzelm@39555
    73
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
wenzelm@39555
    74
wenzelm@26546
    75
wenzelm@24264
    76
wenzelm@26525
    77
(** string representation **)
wenzelm@26525
    78
wenzelm@24264
    79
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
    80
wenzelm@24264
    81
wenzelm@26546
    82
(* escaped text *)
wenzelm@24264
    83
wenzelm@24264
    84
fun decode "&lt;" = "<"
wenzelm@24264
    85
  | decode "&gt;" = ">"
wenzelm@24264
    86
  | decode "&amp;" = "&"
wenzelm@24264
    87
  | decode "&apos;" = "'"
wenzelm@24264
    88
  | decode "&quot;" = "\""
wenzelm@24264
    89
  | decode c = c;
wenzelm@24264
    90
wenzelm@24264
    91
fun encode "<" = "&lt;"
wenzelm@24264
    92
  | encode ">" = "&gt;"
wenzelm@24264
    93
  | encode "&" = "&amp;"
wenzelm@24264
    94
  | encode "'" = "&apos;"
wenzelm@24264
    95
  | encode "\"" = "&quot;"
wenzelm@24264
    96
  | encode c = c;
wenzelm@24264
    97
wenzelm@25838
    98
val text = translate_string encode;
wenzelm@24264
    99
wenzelm@24264
   100
wenzelm@24264
   101
(* elements *)
wenzelm@24264
   102
wenzelm@26539
   103
fun elem name atts =
wenzelm@26551
   104
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
   105
wenzelm@26525
   106
fun element name atts body =
wenzelm@26539
   107
  let val b = implode body in
wenzelm@26539
   108
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
   109
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
   110
  end;
wenzelm@24264
   111
wenzelm@27884
   112
fun output_markup (markup as (name, atts)) =
wenzelm@38474
   113
  if Markup.is_empty markup then Markup.no_output
wenzelm@27884
   114
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
   115
wenzelm@24264
   116
wenzelm@26546
   117
(* output *)
wenzelm@24264
   118
wenzelm@43791
   119
fun buffer_of depth tree =
wenzelm@24264
   120
  let
wenzelm@43791
   121
    fun traverse _ (Elem ((name, atts), [])) =
wenzelm@26539
   122
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@43791
   123
      | traverse d (Elem ((name, atts), ts)) =
wenzelm@26539
   124
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@43791
   125
          traverse_body d ts #>
wenzelm@26525
   126
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@43791
   127
      | traverse _ (Text s) = Buffer.add (text s)
wenzelm@43791
   128
    and traverse_body 0 _ = Buffer.add "..."
wenzelm@43791
   129
      | traverse_body d ts = fold (traverse (d - 1)) ts;
wenzelm@43791
   130
  in Buffer.empty |> traverse depth tree end;
wenzelm@24264
   131
wenzelm@43791
   132
val string_of = Buffer.content o buffer_of ~1;
wenzelm@43791
   133
val output = Buffer.output o buffer_of ~1;
wenzelm@43791
   134
wenzelm@43791
   135
fun pretty depth tree =
wenzelm@43791
   136
  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
wenzelm@25838
   137
wenzelm@24264
   138
wenzelm@24264
   139
wenzelm@44698
   140
(** XML parsing **)
wenzelm@26546
   141
wenzelm@26546
   142
local
wenzelm@24264
   143
wenzelm@43947
   144
fun err msg (xs, _) =
wenzelm@43947
   145
  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   146
wenzelm@26984
   147
fun ignored _ = [];
wenzelm@26984
   148
wenzelm@45155
   149
fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
wenzelm@45155
   150
fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
wenzelm@45155
   151
val parse_name = Scan.one name_start_char ::: Scan.many name_char;
wenzelm@45155
   152
wenzelm@26551
   153
val blanks = Scan.many Symbol.is_blank;
wenzelm@45155
   154
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
wenzelm@26551
   155
val regular = Scan.one Symbol.is_regular;
wenzelm@26551
   156
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
wenzelm@24264
   157
wenzelm@26551
   158
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   159
wenzelm@26551
   160
val parse_cdata =
wenzelm@26551
   161
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   162
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   163
  Scan.this_string "]]>";
wenzelm@24264
   164
wenzelm@24264
   165
val parse_att =
wenzelm@45155
   166
  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   167
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   168
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   169
wenzelm@26551
   170
val parse_comment =
wenzelm@26551
   171
  Scan.this_string "<!--" --
wenzelm@26551
   172
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   173
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   174
wenzelm@26551
   175
val parse_processing_instruction =
wenzelm@26551
   176
  Scan.this_string "<?" --
wenzelm@26551
   177
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   178
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   179
wenzelm@26984
   180
val parse_doctype =
wenzelm@26984
   181
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   182
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   183
  $$ ">" >> ignored;
wenzelm@26984
   184
wenzelm@26984
   185
val parse_misc =
wenzelm@26984
   186
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   187
  parse_processing_instruction ||
wenzelm@26984
   188
  parse_comment;
wenzelm@26551
   189
wenzelm@26551
   190
val parse_optional_text =
wenzelm@26551
   191
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   192
wenzelm@26546
   193
in
wenzelm@26546
   194
wenzelm@26984
   195
val parse_comments =
wenzelm@26984
   196
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   197
wenzelm@40627
   198
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
wenzelm@26546
   199
wenzelm@24264
   200
fun parse_content xs =
wenzelm@26551
   201
  (parse_optional_text @@@
wenzelm@26551
   202
    (Scan.repeat
wenzelm@26551
   203
      ((parse_element >> single ||
wenzelm@26551
   204
        parse_cdata >> (single o Text) ||
wenzelm@26984
   205
        parse_processing_instruction ||
wenzelm@26984
   206
        parse_comment)
wenzelm@26551
   207
      @@@ parse_optional_text) >> flat)) xs
wenzelm@24264
   208
wenzelm@26546
   209
and parse_element xs =
wenzelm@43949
   210
  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
wenzelm@43949
   211
    (fn (name, _) =>
wenzelm@43947
   212
      !! (err (fn () => "Expected > or />"))
wenzelm@43949
   213
       ($$ "/" -- $$ ">" >> ignored ||
wenzelm@43949
   214
        $$ ">" |-- parse_content --|
wenzelm@43949
   215
          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
wenzelm@43949
   216
              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
wenzelm@43949
   217
    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
wenzelm@24264
   218
wenzelm@26984
   219
val parse_document =
wenzelm@26984
   220
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   221
  |-- parse_element;
wenzelm@24264
   222
wenzelm@26539
   223
fun parse s =
wenzelm@43947
   224
  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
wenzelm@40627
   225
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
wenzelm@24264
   226
    (x, []) => x
wenzelm@24264
   227
  | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   228
wenzelm@24264
   229
end;
wenzelm@26546
   230
wenzelm@43767
   231
wenzelm@44809
   232
(** cache for substructural sharing **)
wenzelm@44809
   233
wenzelm@44809
   234
fun tree_ord tu =
wenzelm@44809
   235
  if pointer_eq tu then EQUAL
wenzelm@44809
   236
  else
wenzelm@44809
   237
    (case tu of
wenzelm@44809
   238
      (Text _, Elem _) => LESS
wenzelm@44809
   239
    | (Elem _, Text _) => GREATER
wenzelm@44809
   240
    | (Text s, Text s') => fast_string_ord (s, s')
wenzelm@44809
   241
    | (Elem e, Elem e') =>
wenzelm@44809
   242
        prod_ord
wenzelm@44809
   243
          (prod_ord fast_string_ord (list_ord (prod_ord fast_string_ord fast_string_ord)))
wenzelm@44809
   244
          (list_ord tree_ord) (e, e'));
wenzelm@44809
   245
wenzelm@44809
   246
structure Treetab = Table(type key = tree val ord = tree_ord);
wenzelm@44808
   247
wenzelm@44808
   248
fun cache () =
wenzelm@44808
   249
  let
wenzelm@44808
   250
    val strings = Unsynchronized.ref (Symtab.empty: unit Symtab.table);
wenzelm@44809
   251
    val trees = Unsynchronized.ref (Treetab.empty: unit Treetab.table);
wenzelm@44808
   252
wenzelm@44808
   253
    fun string s =
wenzelm@44809
   254
      if size s <= 1 then s
wenzelm@44809
   255
      else
wenzelm@44809
   256
        (case Symtab.lookup_key (! strings) s of
wenzelm@44809
   257
          SOME (s', ()) => s'
wenzelm@44809
   258
        | NONE => (Unsynchronized.change strings (Symtab.update (s, ())); s));
wenzelm@44808
   259
wenzelm@44809
   260
    fun tree t =
wenzelm@44809
   261
      (case Treetab.lookup_key (! trees) t of
wenzelm@44809
   262
        SOME (t', ()) => t'
wenzelm@44809
   263
      | NONE =>
wenzelm@44809
   264
          let
wenzelm@44809
   265
            val t' =
wenzelm@44809
   266
              (case t of
wenzelm@44809
   267
                Elem ((a, ps), b) => Elem ((string a, map (pairself string) ps), map tree b)
wenzelm@44809
   268
              | Text s => Text (string s));
wenzelm@44809
   269
            val _ = Unsynchronized.change trees (Treetab.update (t', ()));
wenzelm@44809
   270
          in t' end);
wenzelm@44808
   271
  in tree end;
wenzelm@44808
   272
wenzelm@44808
   273
wenzelm@43767
   274
wenzelm@43767
   275
(** XML as data representation language **)
wenzelm@43767
   276
wenzelm@43767
   277
exception XML_ATOM of string;
wenzelm@43767
   278
exception XML_BODY of tree list;
wenzelm@43767
   279
wenzelm@43767
   280
wenzelm@43767
   281
structure Encode =
wenzelm@43767
   282
struct
wenzelm@43767
   283
wenzelm@43778
   284
type 'a A = 'a -> string;
wenzelm@43767
   285
type 'a T = 'a -> body;
wenzelm@43778
   286
type 'a V = 'a -> string list * body;
wenzelm@43767
   287
wenzelm@43767
   288
wenzelm@43778
   289
(* atomic values *)
wenzelm@43767
   290
wenzelm@43767
   291
fun int_atom i = signed_string_of_int i;
wenzelm@43767
   292
wenzelm@43767
   293
fun bool_atom false = "0"
wenzelm@43767
   294
  | bool_atom true = "1";
wenzelm@43767
   295
wenzelm@43767
   296
fun unit_atom () = "";
wenzelm@43767
   297
wenzelm@43767
   298
wenzelm@43767
   299
(* structural nodes *)
wenzelm@43767
   300
wenzelm@43767
   301
fun node ts = Elem ((":", []), ts);
wenzelm@43767
   302
wenzelm@43778
   303
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
wenzelm@43778
   304
wenzelm@43778
   305
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
wenzelm@43767
   306
wenzelm@43767
   307
wenzelm@43767
   308
(* representation of standard types *)
wenzelm@43767
   309
wenzelm@43767
   310
fun properties props = [Elem ((":", props), [])];
wenzelm@43767
   311
wenzelm@43767
   312
fun string "" = []
wenzelm@43767
   313
  | string s = [Text s];
wenzelm@43767
   314
wenzelm@43767
   315
val int = string o int_atom;
wenzelm@43767
   316
wenzelm@43767
   317
val bool = string o bool_atom;
wenzelm@43767
   318
wenzelm@43767
   319
val unit = string o unit_atom;
wenzelm@43767
   320
wenzelm@43767
   321
fun pair f g (x, y) = [node (f x), node (g y)];
wenzelm@43767
   322
wenzelm@43767
   323
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
wenzelm@43767
   324
wenzelm@43767
   325
fun list f xs = map (node o f) xs;
wenzelm@43767
   326
wenzelm@43767
   327
fun option _ NONE = []
wenzelm@43767
   328
  | option f (SOME x) = [node (f x)];
wenzelm@43767
   329
wenzelm@47199
   330
fun variant fs x =
wenzelm@47199
   331
  [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
wenzelm@43767
   332
wenzelm@26546
   333
end;
wenzelm@43767
   334
wenzelm@43767
   335
wenzelm@43767
   336
structure Decode =
wenzelm@43767
   337
struct
wenzelm@43767
   338
wenzelm@43778
   339
type 'a A = string -> 'a;
wenzelm@43767
   340
type 'a T = body -> 'a;
wenzelm@43778
   341
type 'a V = string list * body -> 'a;
wenzelm@43767
   342
wenzelm@43767
   343
wenzelm@43778
   344
(* atomic values *)
wenzelm@43767
   345
wenzelm@43767
   346
fun int_atom s =
wenzelm@43797
   347
  Markup.parse_int s
wenzelm@43797
   348
    handle Fail _ => raise XML_ATOM s;
wenzelm@43767
   349
wenzelm@43767
   350
fun bool_atom "0" = false
wenzelm@43767
   351
  | bool_atom "1" = true
wenzelm@43767
   352
  | bool_atom s = raise XML_ATOM s;
wenzelm@43767
   353
wenzelm@43767
   354
fun unit_atom "" = ()
wenzelm@43767
   355
  | unit_atom s = raise XML_ATOM s;
wenzelm@43767
   356
wenzelm@43767
   357
wenzelm@43767
   358
(* structural nodes *)
wenzelm@43767
   359
wenzelm@43767
   360
fun node (Elem ((":", []), ts)) = ts
wenzelm@43767
   361
  | node t = raise XML_BODY [t];
wenzelm@43767
   362
wenzelm@43783
   363
fun vector atts =
wenzelm@46839
   364
  map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
wenzelm@43778
   365
wenzelm@43844
   366
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
wenzelm@43767
   367
  | tagged t = raise XML_BODY [t];
wenzelm@43767
   368
wenzelm@43767
   369
wenzelm@43767
   370
(* representation of standard types *)
wenzelm@43767
   371
wenzelm@43767
   372
fun properties [Elem ((":", props), [])] = props
wenzelm@43767
   373
  | properties ts = raise XML_BODY ts;
wenzelm@43767
   374
wenzelm@43767
   375
fun string [] = ""
wenzelm@43767
   376
  | string [Text s] = s
wenzelm@43767
   377
  | string ts = raise XML_BODY ts;
wenzelm@43767
   378
wenzelm@43767
   379
val int = int_atom o string;
wenzelm@43767
   380
wenzelm@43767
   381
val bool = bool_atom o string;
wenzelm@43767
   382
wenzelm@43767
   383
val unit = unit_atom o string;
wenzelm@43767
   384
wenzelm@43767
   385
fun pair f g [t1, t2] = (f (node t1), g (node t2))
wenzelm@43767
   386
  | pair _ _ ts = raise XML_BODY ts;
wenzelm@43767
   387
wenzelm@43767
   388
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
wenzelm@43767
   389
  | triple _ _ _ ts = raise XML_BODY ts;
wenzelm@43767
   390
wenzelm@43767
   391
fun list f ts = map (f o node) ts;
wenzelm@43767
   392
wenzelm@43767
   393
fun option _ [] = NONE
wenzelm@43767
   394
  | option f [t] = SOME (f (node t))
wenzelm@43767
   395
  | option _ ts = raise XML_BODY ts;
wenzelm@43767
   396
wenzelm@43768
   397
fun variant fs [t] =
wenzelm@43768
   398
      let
wenzelm@43778
   399
        val (tag, (xs, ts)) = tagged t;
wenzelm@43768
   400
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
wenzelm@43778
   401
      in f (xs, ts) end
wenzelm@43767
   402
  | variant _ ts = raise XML_BODY ts;
wenzelm@43767
   403
wenzelm@43767
   404
end;
wenzelm@43767
   405
wenzelm@43767
   406
end;