src/Pure/General/yxml.ML
author wenzelm
Thu, 03 Apr 2008 16:03:59 +0200
changeset 26528 944f9bf26d2d
child 26540 173d548ce9d2
permissions -rw-r--r--
Why XML notation?
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/General/yxml.ML
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     2
    ID:         $Id$
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     3
    Author:     Makarius
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     4
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     5
Why XML notation?  Efficient text representation of XML trees, using
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     6
extra characters ETX and EOT -- no escaping, may nest marked text
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     7
verbatim.  Markup <elem att="val" ...>...body...</elem> is encoded as:
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     8
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     9
  ETX EOT name EOT att=val ... ETX
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    10
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    11
  body
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    12
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    13
  ETX EOT ETX
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    14
*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    15
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    16
signature YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    17
sig
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    18
  val detect: string -> bool
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    19
  val element: string -> XML.attributes -> string list -> string
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    20
  val string_of: XML.tree -> string
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    21
  val parse_body: string -> XML.tree list
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    22
  val parse: string -> XML.tree
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    23
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    24
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    25
structure YXML: YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    26
struct
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    27
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    28
(* string representation *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    29
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    30
val ETX = Symbol.ETX;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    31
val EOT = Symbol.EOT;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    32
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    33
fun detect s = ord s = ord EOT;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    34
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    35
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    36
(*naive pasting of strings*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    37
fun element name atts body =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    38
  ETX ^ EOT ^ name ^ implode (map (fn (a, x) => EOT ^ a ^ "=" ^ x) atts) ^ ETX ^
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    39
  implode body ^
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    40
  ETX ^ EOT ^ ETX;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    41
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    42
(*scalable buffer output*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    43
fun string_of t =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    44
  let
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    45
    fun attrib (a, x) =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    46
      Buffer.add EOT #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    47
      Buffer.add a #> Buffer.add "=" #> Buffer.add x;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    48
    fun tree (XML.Elem (name, atts, ts)) =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    49
          Buffer.add ETX #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    50
          Buffer.add EOT #> Buffer.add name #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    51
          fold attrib atts #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    52
          Buffer.add ETX #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    53
          fold tree ts #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    54
          Buffer.add ETX #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    55
          Buffer.add EOT #>
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    56
          Buffer.add ETX
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    57
      | tree (XML.Text s) = Buffer.add s
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    58
      | tree (XML.Output s) = Buffer.add s;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    59
  in Buffer.empty |> tree t |> Buffer.content end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    60
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    61
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    62
(* efficient YXML parsing *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    63
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    64
local
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    65
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    66
(* errors *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    67
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    68
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    69
fun err_attribute () = err "bad attribute";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    70
fun err_element () = err "bad element";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    71
fun err_unbalanced "" = err "unbalanced element"
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    72
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    73
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    74
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    75
(* stack operations *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    76
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    77
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    78
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    79
fun push "" _ _ = err_element ()
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    80
  | push name atts pending = ((name, atts), []) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    81
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    82
fun pop ((("", _), _) :: _) = err_unbalanced ""
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    83
  | pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    84
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    85
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    86
(* parsers *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    87
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    88
fun is_char s c = ord s = Char.ord c;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    89
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    90
fun parse_attrib s =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    91
  (case String.fields (is_char "=") s of
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    92
    [] => err_attribute ()
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    93
  | "" :: _ => err_attribute ()
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    94
  | a :: xs => (a, space_implode "=" xs));
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    95
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    96
fun parse_chunk chunk =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    97
  (case String.fields (is_char EOT) chunk of
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    98
    ["", ""] => pop
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    99
  | "" :: name :: atts => push name (map parse_attrib atts)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   100
  | [_] => add (XML.Text chunk)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   101
  | _ => err "bad text");
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   102
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   103
in
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   104
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   105
fun parse_body source =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   106
  (case fold parse_chunk (String.tokens (is_char ETX) source) [(("", []), [])] of
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   107
    [(("", _), result)] => rev result
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   108
  | ((name, _), _) :: _ => err_unbalanced name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   109
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   110
fun parse source =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   111
  (case parse_body source of
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   112
    [result as XML.Elem _] => result
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   113
  | _ => err "no root element");
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   114
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   115
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   116
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   117
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   118