src/Pure/General/yxml.ML
author wenzelm
Tue Aug 10 20:13:52 2010 +0200 (2010-08-10)
changeset 38265 cc9fde54311f
parent 38228 ada3ab6b9085
child 38266 492d377ecfe2
permissions -rw-r--r--
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm@26528
     1
(*  Title:      Pure/General/yxml.ML
wenzelm@26528
     2
    Author:     Makarius
wenzelm@26528
     3
wenzelm@26540
     4
Efficient text representation of XML trees using extra characters X
wenzelm@26540
     5
and Y -- no escaping, may nest marked text verbatim.
wenzelm@26528
     6
wenzelm@26540
     7
Markup <elem att="val" ...>...body...</elem> is encoded as:
wenzelm@26540
     8
wenzelm@26540
     9
  X Y name Y att=val ... X
wenzelm@26528
    10
  ...
wenzelm@26528
    11
  body
wenzelm@26528
    12
  ...
wenzelm@26540
    13
  X Y X
wenzelm@26528
    14
*)
wenzelm@26528
    15
wenzelm@26528
    16
signature YXML =
sig
  (*string representation: recoding of control characters and output of markup / trees*)
  val escape_controls: string -> string
  val output_markup: Markup.T -> string * string
  val element: string -> XML.attributes -> string list -> string
  val string_of: XML.tree -> string
  (*parsing of YXML text; malformed input is reported via exception Fail*)
  val parse_body: string -> XML.tree list
  val parse: string -> XML.tree
end;
wenzelm@26528
    25
wenzelm@26528
    26
structure YXML: YXML =
wenzelm@26528
    27
struct
wenzelm@26528
    28
wenzelm@26540
    29
(** string representation **)
wenzelm@26528
    30
wenzelm@38265
    31
(* idempotent recoding of certain low ASCII control characters *)
wenzelm@34095
    32
wenzelm@34095
    33
(*recode a single symbol: anything that Symbol.is_ascii_control accepts becomes the
  two-character sequence chr 192, chr (128 + code); every other symbol is returned as is*)
fun pseudo_utf8 c =
  if not (Symbol.is_ascii_control c) then c
  else
    let
      val lead = chr 192;
      val trail = chr (128 + ord c);
    in lead ^ trail end;
wenzelm@34095
    37
wenzelm@38265
    38
(*apply pseudo_utf8 to the whole string; a string without any ASCII control
  character is handed back unchanged instead of being rebuilt*)
fun escape_controls str =
  if not (exists_string Symbol.is_ascii_control str) then str
  else translate_string pseudo_utf8 str;
wenzelm@34095
    42
wenzelm@34095
    43
wenzelm@26547
    44
(* markers *)
wenzelm@26547
    45
wenzelm@26540
    46
(*X delimits chunks, Y separates the fields within a chunk*)
val (X, Y) = (Symbol.ENQ, Symbol.ACK);

(*XY starts an opening chunk, XYX is the complete closing chunk*)
val XY = X ^ Y;
val XYX = X ^ Y ^ X;
wenzelm@26528
    50
wenzelm@26528
    51
wenzelm@26547
    52
(* output *)
wenzelm@26540
    53
wenzelm@27884
    54
(*opening and closing text for the given markup: (X Y name Y a=x ... X, X Y X);
  markup accepted by Markup.is_none yields Markup.no_output instead*)
fun output_markup (markup as (name, atts)) =
  if Markup.is_none markup then Markup.no_output
  else
    let
      fun attrib (a, x) = Y ^ a ^ "=" ^ x;
      val bra = XY ^ name ^ implode (map attrib atts) ^ X;
    in (bra, XYX) end;
wenzelm@26540
    57
wenzelm@26528
    58
(*enclose already rendered body strings in the output of output_markup (name, atts)*)
fun element name atts body =
  let val (bra, ket) = output_markup (name, atts)
  in implode (bra :: body @ [ket]) end;
wenzelm@26528
    61
wenzelm@26528
    62
(*render an XML tree as YXML text, accumulating the pieces in a Buffer.

  Elements are written as  X Y name Y att=val ... X  body  X Y X,  text nodes verbatim.

  An element whose markup satisfies Markup.is_none contributes only its body.  This
  agrees with output_markup / element, which emit no markers in that case, and it
  keeps the result readable by parse_body: an opening chunk without a name is either
  indistinguishable from the closing marker X Y X or rejected as "bad element".*)
fun string_of t =
  let
    (*one attribute field: Y name=value*)
    fun attrib (a, x) =
      Buffer.add Y #>
      Buffer.add a #> Buffer.add "=" #> Buffer.add x;
    fun tree (XML.Elem (markup as (name, atts), ts)) =
          if Markup.is_none markup then fold tree ts
          else
            (Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #>
              fold tree ts #>
              Buffer.add XYX)
      | tree (XML.Text s) = Buffer.add s;
  in Buffer.empty |> tree t |> Buffer.content end;
wenzelm@26528
    73
wenzelm@26528
    74
wenzelm@26540
    75
wenzelm@26540
    76
(** efficient YXML parsing **)
wenzelm@26528
    77
wenzelm@26528
    78
local
wenzelm@26528
    79
wenzelm@26540
    80
(* splitting *)
wenzelm@26540
    81
wenzelm@26540
    82
(*does character c have the same code as the one-character string s?*)
fun is_char s c = (Char.ord c = ord s);
wenzelm@26540
    83
wenzelm@26540
    84
(*split the source into chunks at X, then each chunk into fields at Y.
  Substring.tokens discards empty chunks, whereas Substring.fields keeps empty
  fields -- so a chunk that begins with Y has "" as its first field.*)
fun split_string source =
  Substring.full source
  |> Substring.tokens (is_char X)
  |> map (fn chunk => map Substring.string (Substring.fields (is_char Y) chunk));
wenzelm@26540
    88
wenzelm@26540
    89
wenzelm@26540
    90
(* structural errors *)
wenzelm@26528
    91
wenzelm@26528
    92
(*every structural error is raised as Fail with a common message prefix*)
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);

fun err_attribute () = err "bad attribute";

fun err_element () = err "bad element";

(*the element name is appended (via quote) only when it is non-empty*)
fun err_unbalanced name =
  if name = "" then err "unbalanced element"
  else err ("unbalanced element " ^ quote name);
wenzelm@26528
    97
wenzelm@26528
    98
wenzelm@26528
    99
(* stack operations *)
wenzelm@26528
   100
wenzelm@26528
   101
(*prepend x to the body of the topmost stack entry; bodies are kept in reverse
  order.  The stack is expected to be non-empty (Match otherwise).*)
fun add x pending =
  (case pending of
    (elem, body) :: rest => (elem, x :: body) :: rest);
wenzelm@26528
   102
wenzelm@26528
   103
(*open a new element with empty body on top of the stack; an empty name is an error*)
fun push name atts pending =
  if name = "" then err_element ()
  else ((name, atts), []) :: pending;
wenzelm@26528
   105
wenzelm@26528
   106
(*close the topmost element and add it to the body of the entry below.  An entry
  with empty name on top (as used for the root by parse_body) means there is
  nothing left to close.*)
fun pop (((name, atts), body) :: pending) =
  if name = "" then err_unbalanced ""
  else add (XML.Elem ((name, atts), rev body)) pending;
wenzelm@26528
   108
wenzelm@26528
   109
wenzelm@26540
   110
(* parsing *)
wenzelm@26528
   111
wenzelm@26528
   112
(*split "name=value" at the first "="; a missing "=" or an empty name is an error*)
fun parse_attrib s =
  (case first_field "=" s of
    SOME (att as (name, _)) => if name = "" then err_attribute () else att
  | NONE => err_attribute ());
wenzelm@26528
   117
wenzelm@26540
   118
(*turn the fields of one chunk into a stack transformation:
    ["", ""]              closing marker  -> pop
    "" :: name :: atts    opening marker  -> push
    anything else         plain text      -> one XML.Text per field*)
fun parse_chunk chunk =
  (case chunk of
    ["", ""] => pop
  | "" :: name :: atts => push name (map parse_attrib atts)
  | txts => fold (add o XML.Text) txts);
wenzelm@26528
   121
wenzelm@26528
   122
in
wenzelm@26528
   123
wenzelm@26528
   124
(*parse a sequence of trees.  The stack starts with a single root entry of empty
  name; after all chunks are processed exactly that entry must remain, otherwise
  some element was left open.*)
fun parse_body source =
  let
    val chunks = split_string source;
    val stack = fold parse_chunk chunks [(("", []), [])];
  in
    (case stack of
      [(("", _), result)] => rev result
    | ((name, _), _) :: _ => err_unbalanced name)
  end;
wenzelm@26528
   128
wenzelm@26528
   129
(*parse a single tree: empty input yields XML.Text "", more than one tree is an error*)
fun parse source =
  (case parse_body source of
    [] => XML.Text ""
  | [result] => result
  | _ :: _ :: _ => err "multiple results");
wenzelm@26528
   134
wenzelm@26528
   135
end;
wenzelm@26528
   136
wenzelm@26528
   137
end;
wenzelm@26528
   138