src/Pure/General/yxml.ML
author wenzelm
Thu, 03 Apr 2008 21:23:38 +0200
changeset 26547 1112375f6a69
parent 26540 173d548ce9d2
child 26684 0701201def95
permissions -rw-r--r--
tuned comments;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/General/yxml.ML
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     2
    ID:         $Id$
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     3
    Author:     Makarius
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     4
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     5
Efficient text representation of XML trees using extra characters X
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     6
and Y -- no escaping, may nest marked text verbatim.
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     7
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     8
Markup <elem att="val" ...>...body...</elem> is encoded as:
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     9
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    10
  X Y name Y att=val ... X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    11
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    12
  body
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    13
  ...
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    14
  X Y X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    15
*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    16
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    17
signature YXML =
sig
  (*does the text start with the YXML marker prefix?*)
  val detect: string -> bool
  (*opening and closing marker text for the given markup*)
  val output_markup: Markup.T -> string * string
  (*encoded element from name, attributes and already encoded body pieces*)
  val element: string -> XML.attributes -> string list -> string
  (*encoded text of a whole tree*)
  val string_of: XML.tree -> string
  (*decode a sequence of trees; raises Fail on malformed input*)
  val parse_body: string -> XML.tree list
  (*decode a single root element and return its components*)
  val parse_element: string -> string * XML.attributes * XML.tree list
  (*decode a single root element; raises Fail if there is none*)
  val parse: string -> XML.tree
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    27
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    28
structure YXML: YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    29
struct
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    30
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    31
(** string representation **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    32
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    33
(* markers *)
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    34
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    35
(*the two control characters that delimit markup*)
val X = Symbol.ENQ;
val Y = Symbol.ACK;

(*prefix shared by opening and closing markers*)
val XY = X ^ Y;

(*complete closing marker*)
val XYX = X ^ Y ^ X;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    39
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    40
(*text counts as YXML when it begins with the marker prefix X Y*)
fun detect text = String.isPrefix XY text;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    41
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    42
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    43
(* output *)
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    44
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    45
(*Opening and closing marker text for markup (name, atts).

  An empty name yields no markers at all: its opening marker would be
  X Y X, which is exactly the closing marker and would be read back as the
  end of an element instead of the start of one.*)
fun output_markup ("", _) = ("", "")
  | output_markup (name, atts) =
      let
        (*each attribute is introduced by Y and written as name=value*)
        fun attrib (a, x) = Y ^ a ^ "=" ^ x;
      in (XY ^ name ^ implode (map attrib atts) ^ X, XYX) end;
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    47
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    48
(*encoded element: opening marker, body pieces in order, closing marker*)
fun element name atts body =
  let val (opening, closing) = output_markup (name, atts)
  in implode (opening :: body @ [closing]) end;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    51
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    52
(*encode a tree by appending its pieces to a buffer, then extract the text*)
fun string_of t =
  let
    (*one attribute: Y name = value*)
    fun attrib (a, x) buf =
      buf |> Buffer.add Y |> Buffer.add a |> Buffer.add "=" |> Buffer.add x;

    (*text and output nodes are copied verbatim, without escaping*)
    fun tree (XML.Text s) buf = Buffer.add s buf
      | tree (XML.Output s) buf = Buffer.add s buf
      | tree (XML.Elem (name, atts, ts)) buf =
          buf
          |> Buffer.add XY |> Buffer.add name
          |> fold attrib atts
          |> Buffer.add X
          |> fold tree ts
          |> Buffer.add XYX;
  in Buffer.content (tree t Buffer.empty) end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    64
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    65
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    66
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    67
(** efficient YXML parsing **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    68
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    69
local
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    70
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    71
(* splitting *)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    72
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    73
(*does character c have the same code as symbol s?*)
fun is_char s c = (ord s = Char.ord c);
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    74
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    75
(*split the source into chunks at X (dropping empty chunks), then split each
  chunk into fields at Y (keeping empty fields)*)
fun split_string source =
  let
    val chunks = Substring.tokens (is_char X) (Substring.full source);
    fun fields_of chunk = map Substring.string (Substring.fields (is_char Y) chunk);
  in map fields_of chunks end;
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    79
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    80
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    81
(* structural errors *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    82
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    83
(*all structural errors are reported as Fail with a common prefix*)
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);

fun err_attribute () = err "bad attribute";

fun err_element () = err "bad element";

(*mention the element name only when there is one*)
fun err_unbalanced name =
  if name = "" then err "unbalanced element"
  else err ("unbalanced element " ^ quote name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    88
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    89
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    90
(* stack operations *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    91
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    92
(*prepend a node to the body (kept in reverse order) of the topmost open element*)
fun add node ((frame, nodes) :: rest) = (frame, node :: nodes) :: rest;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    93
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    94
(*open a new element with an empty body; an empty name is a structural error*)
fun push name atts pending =
  if name = "" then err_element ()
  else ((name, atts), []) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    96
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    97
(*close the topmost element and add it to its parent; the entry with the
  empty name is the bottom of the stack and must never be closed*)
fun pop (((name, atts), body) :: pending) =
  if name = "" then err_unbalanced ""
  else add (XML.Elem (name, atts, rev body)) pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    99
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   100
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   101
(* parsing *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   102
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   103
(*Decode one attribute of the form name=value.

  Only the first "=" separates name from value; any further "=" belong to
  the value and are restored.  A missing "=" or an empty name is a
  structural error (the encoder always writes name=value).*)
fun parse_attrib s =
  (case String.fields (is_char "=") s of
    a :: x :: xs =>
      if a = "" then err_attribute ()
      else (a, space_implode "=" (x :: xs))
  | _ => err_attribute ());  (*fewer than two fields: no "=" present*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   108
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   109
(*turn one chunk (list of Y-separated fields) into a stack transformation:
  a lone Y closes an element, a leading Y opens one, anything else is text*)
fun parse_chunk chunk =
  (case chunk of
    ["", ""] => pop
  | "" :: name :: atts => push name (map parse_attrib atts)
  | texts => fold (add o XML.Text) texts);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   112
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   113
in
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   114
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   115
(*decode a sequence of trees; the stack starts with a nameless bottom entry
  that collects the top-level nodes, and only that entry may remain at the end*)
fun parse_body source =
  let
    val bottom = (("", []), []);
    val stack = fold parse_chunk (split_string source) [bottom];
  in
    (case stack of
      [(("", _), result)] => rev result
    | ((name, _), _) :: _ => err_unbalanced name)
  end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   119
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   120
(*decode text that must consist of exactly one root element*)
fun parse source =
  let val trees = parse_body source in
    (case trees of
      [root as XML.Elem _] => root
    | _ => err "no root element")
  end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   124
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   125
(*decode a root element and return its name, attributes and body*)
fun parse_element source =
  (case parse source of XML.Elem elem => elem);
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   126
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   127
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   128
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   129
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   130