src/Pure/PIDE/xml.ML
author wenzelm
Thu Dec 13 18:00:24 2012 +0100 (2012-12-13 ago)
changeset 50503 50f141b34bb7
parent 49650 9fad6480300d
child 56059 2390391584c2
permissions -rw-r--r--
enable Isabelle/ML to produce uninterpreted result messages as well;
wenzelm@44698
     1
(*  Title:      Pure/PIDE/xml.ML
wenzelm@44698
     2
    Author:     David Aspinall
wenzelm@44698
     3
    Author:     Stefan Berghofer
wenzelm@44698
     4
    Author:     Makarius
wenzelm@24264
     5
wenzelm@46840
     6
Untyped XML trees and representation of ML values.
wenzelm@24264
     7
*)
wenzelm@24264
     8
wenzelm@43767
     9
signature XML_DATA_OPS =
wenzelm@43767
    10
sig
wenzelm@43778
    11
  type 'a A
wenzelm@43767
    12
  type 'a T
wenzelm@43778
    13
  type 'a V
wenzelm@43778
    14
  val int_atom: int A
wenzelm@43778
    15
  val bool_atom: bool A
wenzelm@43778
    16
  val unit_atom: unit A
wenzelm@43767
    17
  val properties: Properties.T T
wenzelm@43767
    18
  val string: string T
wenzelm@43767
    19
  val int: int T
wenzelm@43767
    20
  val bool: bool T
wenzelm@43767
    21
  val unit: unit T
wenzelm@43767
    22
  val pair: 'a T -> 'b T -> ('a * 'b) T
wenzelm@43767
    23
  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
wenzelm@43767
    24
  val list: 'a T -> 'a list T
wenzelm@43767
    25
  val option: 'a T -> 'a option T
wenzelm@43778
    26
  val variant: 'a V list -> 'a T
wenzelm@43767
    27
end;
wenzelm@43767
    28
wenzelm@24264
    29
signature XML =
wenzelm@24264
    30
sig
wenzelm@46837
    31
  type attributes = (string * string) list
wenzelm@24264
    32
  datatype tree =
wenzelm@46837
    33
      Elem of (string * attributes) * tree list
wenzelm@24264
    34
    | Text of string
wenzelm@38266
    35
  type body = tree list
wenzelm@49650
    36
  val wrap_elem: ((string * attributes) * tree list) * tree list -> tree
wenzelm@49650
    37
  val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option
wenzelm@26546
    38
  val add_content: tree -> Buffer.T -> Buffer.T
wenzelm@39555
    39
  val content_of: body -> string
wenzelm@26546
    40
  val header: string
wenzelm@26546
    41
  val text: string -> string
wenzelm@26546
    42
  val element: string -> attributes -> string list -> string
wenzelm@40131
    43
  val output_markup: Markup.T -> Output.output * Output.output
wenzelm@26539
    44
  val string_of: tree -> string
wenzelm@43791
    45
  val pretty: int -> tree -> Pretty.T
wenzelm@26546
    46
  val output: tree -> TextIO.outstream -> unit
wenzelm@26984
    47
  val parse_comments: string list -> unit * string list
wenzelm@24264
    48
  val parse_string : string -> string option
wenzelm@26546
    49
  val parse_element: string list -> tree * string list
wenzelm@26984
    50
  val parse_document: string list -> tree * string list
wenzelm@26539
    51
  val parse: string -> tree
wenzelm@43767
    52
  exception XML_ATOM of string
wenzelm@43767
    53
  exception XML_BODY of body
wenzelm@43778
    54
  structure Encode: XML_DATA_OPS
wenzelm@43778
    55
  structure Decode: XML_DATA_OPS
wenzelm@24264
    56
end;
wenzelm@24264
    57
wenzelm@24264
    58
structure XML: XML =
wenzelm@24264
    59
struct
wenzelm@24264
    60
wenzelm@26546
    61
(** XML trees **)
wenzelm@26546
    62
wenzelm@46837
    63
type attributes = (string * string) list;
wenzelm@26546
    64
wenzelm@26546
    65
datatype tree =
wenzelm@46837
    66
    Elem of (string * attributes) * tree list
wenzelm@28033
    67
  | Text of string;
wenzelm@26546
    68
wenzelm@38266
    69
type body = tree list;
wenzelm@38266
    70
wenzelm@49650
    71
wenzelm@49650
    72
(* wrapped elements *)
wenzelm@49650
    73
wenzelm@49650
    74
val xml_elemN = "xml_elem";
wenzelm@49650
    75
val xml_nameN = "xml_name";
wenzelm@49650
    76
val xml_bodyN = "xml_body";
wenzelm@49650
    77
wenzelm@49650
    78
fun wrap_elem (((a, atts), body1), body2) =
wenzelm@49650
    79
  Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2);
wenzelm@49650
    80
wenzelm@49650
    81
fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) =
wenzelm@49650
    82
      if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts'
wenzelm@49650
    83
      then SOME (((a, atts), body1), body2) else NONE
wenzelm@49650
    84
  | unwrap_elem _ = NONE;
wenzelm@49650
    85
wenzelm@49650
    86
wenzelm@49650
    87
(* text context *)
wenzelm@49650
    88
wenzelm@49650
    89
fun add_content tree =
wenzelm@49650
    90
  (case unwrap_elem tree of
wenzelm@49650
    91
    SOME (_, ts) => fold add_content ts
wenzelm@49650
    92
  | NONE =>
wenzelm@49650
    93
      (case tree of
wenzelm@49650
    94
        Elem (_, ts) => fold add_content ts
wenzelm@49650
    95
      | Text s => Buffer.add s));
wenzelm@26546
    96
wenzelm@39555
    97
fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
wenzelm@39555
    98
wenzelm@26546
    99
wenzelm@24264
   100
wenzelm@26525
   101
(** string representation **)
wenzelm@26525
   102
wenzelm@24264
   103
val header = "<?xml version=\"1.0\"?>\n";
wenzelm@24264
   104
wenzelm@24264
   105
wenzelm@26546
   106
(* escaped text *)
wenzelm@24264
   107
wenzelm@24264
   108
fun decode "&lt;" = "<"
wenzelm@24264
   109
  | decode "&gt;" = ">"
wenzelm@24264
   110
  | decode "&amp;" = "&"
wenzelm@24264
   111
  | decode "&apos;" = "'"
wenzelm@24264
   112
  | decode "&quot;" = "\""
wenzelm@24264
   113
  | decode c = c;
wenzelm@24264
   114
wenzelm@24264
   115
fun encode "<" = "&lt;"
wenzelm@24264
   116
  | encode ">" = "&gt;"
wenzelm@24264
   117
  | encode "&" = "&amp;"
wenzelm@24264
   118
  | encode "'" = "&apos;"
wenzelm@24264
   119
  | encode "\"" = "&quot;"
wenzelm@24264
   120
  | encode c = c;
wenzelm@24264
   121
wenzelm@25838
   122
val text = translate_string encode;
wenzelm@24264
   123
wenzelm@24264
   124
wenzelm@24264
   125
(* elements *)
wenzelm@24264
   126
wenzelm@26539
   127
fun elem name atts =
wenzelm@26551
   128
  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
wenzelm@24264
   129
wenzelm@26525
   130
fun element name atts body =
wenzelm@26539
   131
  let val b = implode body in
wenzelm@26539
   132
    if b = "" then enclose "<" "/>" (elem name atts)
wenzelm@26539
   133
    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
wenzelm@24264
   134
  end;
wenzelm@24264
   135
wenzelm@27884
   136
fun output_markup (markup as (name, atts)) =
wenzelm@38474
   137
  if Markup.is_empty markup then Markup.no_output
wenzelm@27884
   138
  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
wenzelm@26539
   139
wenzelm@24264
   140
wenzelm@26546
   141
(* output *)
wenzelm@24264
   142
wenzelm@43791
   143
fun buffer_of depth tree =
wenzelm@24264
   144
  let
wenzelm@43791
   145
    fun traverse _ (Elem ((name, atts), [])) =
wenzelm@26539
   146
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
wenzelm@43791
   147
      | traverse d (Elem ((name, atts), ts)) =
wenzelm@26539
   148
          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
wenzelm@43791
   149
          traverse_body d ts #>
wenzelm@26525
   150
          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
wenzelm@43791
   151
      | traverse _ (Text s) = Buffer.add (text s)
wenzelm@43791
   152
    and traverse_body 0 _ = Buffer.add "..."
wenzelm@43791
   153
      | traverse_body d ts = fold (traverse (d - 1)) ts;
wenzelm@43791
   154
  in Buffer.empty |> traverse depth tree end;
wenzelm@24264
   155
wenzelm@43791
   156
val string_of = Buffer.content o buffer_of ~1;
wenzelm@43791
   157
val output = Buffer.output o buffer_of ~1;
wenzelm@43791
   158
wenzelm@43791
   159
fun pretty depth tree =
wenzelm@43791
   160
  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
wenzelm@25838
   161
wenzelm@24264
   162
wenzelm@24264
   163
wenzelm@44698
   164
(** XML parsing **)
wenzelm@26546
   165
wenzelm@26546
   166
local
wenzelm@24264
   167
wenzelm@43947
   168
fun err msg (xs, _) =
wenzelm@43947
   169
  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
wenzelm@24264
   170
wenzelm@26984
   171
fun ignored _ = [];
wenzelm@26984
   172
wenzelm@45155
   173
fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
wenzelm@45155
   174
fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
wenzelm@45155
   175
val parse_name = Scan.one name_start_char ::: Scan.many name_char;
wenzelm@45155
   176
wenzelm@26551
   177
val blanks = Scan.many Symbol.is_blank;
wenzelm@45155
   178
val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
wenzelm@26551
   179
val regular = Scan.one Symbol.is_regular;
wenzelm@26551
   180
fun regular_except x = Scan.one (fn c => Symbol.is_regular c andalso c <> x);
wenzelm@24264
   181
wenzelm@26551
   182
val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
wenzelm@24264
   183
wenzelm@26551
   184
val parse_cdata =
wenzelm@26551
   185
  Scan.this_string "<![CDATA[" |--
wenzelm@26551
   186
  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
wenzelm@26551
   187
  Scan.this_string "]]>";
wenzelm@24264
   188
wenzelm@24264
   189
val parse_att =
wenzelm@45155
   190
  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
wenzelm@26551
   191
  (($$ "\"" || $$ "'") :|-- (fn s =>
wenzelm@26551
   192
    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
wenzelm@24264
   193
wenzelm@26551
   194
val parse_comment =
wenzelm@26551
   195
  Scan.this_string "<!--" --
wenzelm@26551
   196
  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
wenzelm@26984
   197
  Scan.this_string "-->" >> ignored;
wenzelm@24264
   198
wenzelm@26551
   199
val parse_processing_instruction =
wenzelm@26551
   200
  Scan.this_string "<?" --
wenzelm@26551
   201
  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
wenzelm@26984
   202
  Scan.this_string "?>" >> ignored;
wenzelm@26984
   203
wenzelm@26984
   204
val parse_doctype =
wenzelm@26984
   205
  Scan.this_string "<!DOCTYPE" --
wenzelm@26984
   206
  Scan.repeat (Scan.unless ($$ ">") regular) --
wenzelm@26984
   207
  $$ ">" >> ignored;
wenzelm@26984
   208
wenzelm@26984
   209
val parse_misc =
wenzelm@26984
   210
  Scan.one Symbol.is_blank >> ignored ||
wenzelm@26984
   211
  parse_processing_instruction ||
wenzelm@26984
   212
  parse_comment;
wenzelm@26551
   213
wenzelm@26551
   214
val parse_optional_text =
wenzelm@26551
   215
  Scan.optional (parse_chars >> (single o Text)) [];
wenzelm@24264
   216
wenzelm@26546
   217
in
wenzelm@26546
   218
wenzelm@26984
   219
val parse_comments =
wenzelm@26984
   220
  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
wenzelm@26984
   221
wenzelm@40627
   222
val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
wenzelm@26546
   223
wenzelm@24264
   224
fun parse_content xs =
wenzelm@26551
   225
  (parse_optional_text @@@
wenzelm@26551
   226
    (Scan.repeat
wenzelm@26551
   227
      ((parse_element >> single ||
wenzelm@26551
   228
        parse_cdata >> (single o Text) ||
wenzelm@26984
   229
        parse_processing_instruction ||
wenzelm@26984
   230
        parse_comment)
wenzelm@26551
   231
      @@@ parse_optional_text) >> flat)) xs
wenzelm@24264
   232
wenzelm@26546
   233
and parse_element xs =
wenzelm@43949
   234
  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
wenzelm@43949
   235
    (fn (name, _) =>
wenzelm@43947
   236
      !! (err (fn () => "Expected > or />"))
wenzelm@43949
   237
       ($$ "/" -- $$ ">" >> ignored ||
wenzelm@43949
   238
        $$ ">" |-- parse_content --|
wenzelm@43949
   239
          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
wenzelm@43949
   240
              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
wenzelm@43949
   241
    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
wenzelm@24264
   242
wenzelm@26984
   243
val parse_document =
wenzelm@26984
   244
  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
wenzelm@26984
   245
  |-- parse_element;
wenzelm@24264
   246
wenzelm@26539
   247
fun parse s =
wenzelm@43947
   248
  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
wenzelm@40627
   249
      (blanks |-- parse_document --| blanks))) (raw_explode s) of
wenzelm@24264
   250
    (x, []) => x
wenzelm@48769
   251
  | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
wenzelm@24264
   252
wenzelm@24264
   253
end;
wenzelm@26546
   254
wenzelm@43767
   255
wenzelm@43767
   256
wenzelm@43767
   257
(** XML as data representation language **)
wenzelm@43767
   258
wenzelm@43767
   259
exception XML_ATOM of string;
wenzelm@43767
   260
exception XML_BODY of tree list;
wenzelm@43767
   261
wenzelm@43767
   262
wenzelm@43767
   263
structure Encode =
wenzelm@43767
   264
struct
wenzelm@43767
   265
wenzelm@43778
   266
type 'a A = 'a -> string;
wenzelm@43767
   267
type 'a T = 'a -> body;
wenzelm@43778
   268
type 'a V = 'a -> string list * body;
wenzelm@43767
   269
wenzelm@43767
   270
wenzelm@43778
   271
(* atomic values *)
wenzelm@43767
   272
wenzelm@43767
   273
fun int_atom i = signed_string_of_int i;
wenzelm@43767
   274
wenzelm@43767
   275
fun bool_atom false = "0"
wenzelm@43767
   276
  | bool_atom true = "1";
wenzelm@43767
   277
wenzelm@43767
   278
fun unit_atom () = "";
wenzelm@43767
   279
wenzelm@43767
   280
wenzelm@43767
   281
(* structural nodes *)
wenzelm@43767
   282
wenzelm@43767
   283
fun node ts = Elem ((":", []), ts);
wenzelm@43767
   284
wenzelm@43778
   285
fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
wenzelm@43778
   286
wenzelm@43778
   287
fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
wenzelm@43767
   288
wenzelm@43767
   289
wenzelm@43767
   290
(* representation of standard types *)
wenzelm@43767
   291
wenzelm@43767
   292
fun properties props = [Elem ((":", props), [])];
wenzelm@43767
   293
wenzelm@43767
   294
fun string "" = []
wenzelm@43767
   295
  | string s = [Text s];
wenzelm@43767
   296
wenzelm@43767
   297
val int = string o int_atom;
wenzelm@43767
   298
wenzelm@43767
   299
val bool = string o bool_atom;
wenzelm@43767
   300
wenzelm@43767
   301
val unit = string o unit_atom;
wenzelm@43767
   302
wenzelm@43767
   303
fun pair f g (x, y) = [node (f x), node (g y)];
wenzelm@43767
   304
wenzelm@43767
   305
fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
wenzelm@43767
   306
wenzelm@43767
   307
fun list f xs = map (node o f) xs;
wenzelm@43767
   308
wenzelm@43767
   309
fun option _ NONE = []
wenzelm@43767
   310
  | option f (SOME x) = [node (f x)];
wenzelm@43767
   311
wenzelm@47199
   312
fun variant fs x =
wenzelm@47199
   313
  [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
wenzelm@43767
   314
wenzelm@26546
   315
end;
wenzelm@43767
   316
wenzelm@43767
   317
wenzelm@43767
   318
structure Decode =
wenzelm@43767
   319
struct
wenzelm@43767
   320
wenzelm@43778
   321
type 'a A = string -> 'a;
wenzelm@43767
   322
type 'a T = body -> 'a;
wenzelm@43778
   323
type 'a V = string list * body -> 'a;
wenzelm@43767
   324
wenzelm@43767
   325
wenzelm@43778
   326
(* atomic values *)
wenzelm@43767
   327
wenzelm@43767
   328
fun int_atom s =
wenzelm@43797
   329
  Markup.parse_int s
wenzelm@43797
   330
    handle Fail _ => raise XML_ATOM s;
wenzelm@43767
   331
wenzelm@43767
   332
fun bool_atom "0" = false
wenzelm@43767
   333
  | bool_atom "1" = true
wenzelm@43767
   334
  | bool_atom s = raise XML_ATOM s;
wenzelm@43767
   335
wenzelm@43767
   336
fun unit_atom "" = ()
wenzelm@43767
   337
  | unit_atom s = raise XML_ATOM s;
wenzelm@43767
   338
wenzelm@43767
   339
wenzelm@43767
   340
(* structural nodes *)
wenzelm@43767
   341
wenzelm@43767
   342
fun node (Elem ((":", []), ts)) = ts
wenzelm@43767
   343
  | node t = raise XML_BODY [t];
wenzelm@43767
   344
wenzelm@43783
   345
fun vector atts =
wenzelm@46839
   346
  map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
wenzelm@43778
   347
wenzelm@43844
   348
fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
wenzelm@43767
   349
  | tagged t = raise XML_BODY [t];
wenzelm@43767
   350
wenzelm@43767
   351
wenzelm@43767
   352
(* representation of standard types *)
wenzelm@43767
   353
wenzelm@43767
   354
fun properties [Elem ((":", props), [])] = props
wenzelm@43767
   355
  | properties ts = raise XML_BODY ts;
wenzelm@43767
   356
wenzelm@43767
   357
fun string [] = ""
wenzelm@43767
   358
  | string [Text s] = s
wenzelm@43767
   359
  | string ts = raise XML_BODY ts;
wenzelm@43767
   360
wenzelm@43767
   361
val int = int_atom o string;
wenzelm@43767
   362
wenzelm@43767
   363
val bool = bool_atom o string;
wenzelm@43767
   364
wenzelm@43767
   365
val unit = unit_atom o string;
wenzelm@43767
   366
wenzelm@43767
   367
fun pair f g [t1, t2] = (f (node t1), g (node t2))
wenzelm@43767
   368
  | pair _ _ ts = raise XML_BODY ts;
wenzelm@43767
   369
wenzelm@43767
   370
fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
wenzelm@43767
   371
  | triple _ _ _ ts = raise XML_BODY ts;
wenzelm@43767
   372
wenzelm@43767
   373
fun list f ts = map (f o node) ts;
wenzelm@43767
   374
wenzelm@43767
   375
fun option _ [] = NONE
wenzelm@43767
   376
  | option f [t] = SOME (f (node t))
wenzelm@43767
   377
  | option _ ts = raise XML_BODY ts;
wenzelm@43767
   378
wenzelm@43768
   379
fun variant fs [t] =
wenzelm@43768
   380
      let
wenzelm@43778
   381
        val (tag, (xs, ts)) = tagged t;
wenzelm@43768
   382
        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
wenzelm@43778
   383
      in f (xs, ts) end
wenzelm@43767
   384
  | variant _ ts = raise XML_BODY ts;
wenzelm@43767
   385
wenzelm@43767
   386
end;
wenzelm@43767
   387
wenzelm@43767
   388
end;