src/Pure/General/yxml.ML
author noschinl
Wed, 13 Apr 2011 21:23:30 +0200
changeset 42330 7a1655920fe8
parent 38474 e498dc2eb576
child 42331 b3759dcea95e
permissions -rw-r--r--
Add YXML.parse_file to parse and process big data files
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/General/yxml.ML
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     3
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     4
Efficient text representation of XML trees using extra characters X
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     5
and Y -- no escaping, may nest marked text verbatim.
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     6
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     7
Markup <elem att="val" ...>...body...</elem> is encoded as:
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     8
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     9
  X Y name Y att=val ... X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    10
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    11
  body
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    12
  ...
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    13
  X Y X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    14
*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    15
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    16
signature YXML =
sig
  (*recode ASCII control characters of a string; other text is unchanged*)
  val escape_controls: string -> string
  (*pair of strings that open and close the given markup in YXML notation*)
  val output_markup: Markup.T -> string * string
  (*YXML text of an element, given its name, attributes and the
    already rendered pieces of its body*)
  val element: string -> XML.attributes -> string list -> string
  (*YXML text of a complete XML tree*)
  val string_of: XML.tree -> string
  (*parse YXML text into the sequence of its top-level trees*)
  val parse_body: string -> XML.body
  (*parse YXML text that holds at most one top-level tree*)
  val parse: string -> XML.tree
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    25
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    26
structure YXML: YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    27
struct
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    28
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    29
(** string representation **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    30
38265
cc9fde54311f renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents: 38228
diff changeset
    31
(* idempotent recoding of certain low ASCII control characters *)
34095
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    32
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    33
(*map an ASCII control character to the two-character sequence
  chr 192, chr (128 + code); any other symbol is returned unchanged*)
fun pseudo_utf8 c =
  if not (Symbol.is_ascii_control c) then c
  else
    let val code = ord c
    in chr 192 ^ chr (128 + code) end;
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    37
38265
cc9fde54311f renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents: 38228
diff changeset
    38
(*apply pseudo_utf8 to every character of str; a string without any
  ASCII control character is handed back as is, without translation*)
fun escape_controls str =
  (case exists_string Symbol.is_ascii_control str of
    false => str
  | true => translate_string pseudo_utf8 str);
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    42
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    43
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    44
(* markers *)
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    45
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    46
(*X and Y are the two marker characters (ENQ and ACK);
  XY opens an element header, XYX is the closing sequence of an element*)
val (X, Y) = (Symbol.ENQ, Symbol.ACK);
val XY = X ^ Y;
val XYX = X ^ Y ^ X;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    50
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    51
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    52
(* output *)
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    53
27884
10c927e4abf5 output_markup: check Markup.is_none;
wenzelm
parents: 27798
diff changeset
    54
(*opening and closing text for the given markup: empty markup yields
  Markup.no_output, otherwise the header "X Y name Y att=val ... X"
  paired with the closing sequence "X Y X"*)
fun output_markup (markup as (name, atts)) =
  if Markup.is_empty markup then Markup.no_output
  else
    let
      fun attrib (a, x) = Y ^ a ^ "=" ^ x;
      val header = XY ^ name ^ implode (map attrib atts) ^ X;
    in (header, XYX) end;
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    57
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    58
(*wrap the concatenated body strings into the opening and closing
  text produced by output_markup for (name, atts)*)
fun element name atts body =
  let
    val (bg, en) = output_markup (name, atts);
  in implode ([bg] @ body @ [en]) end;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    61
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    62
(*render an XML tree as YXML text, accumulating the pieces in a Buffer*)
fun string_of t =
  let
    (*one attribute: Y a = x*)
    fun attrib (a, x) buf =
      buf |> Buffer.add Y |> Buffer.add a |> Buffer.add "=" |> Buffer.add x;

    (*text is added verbatim; an element is its header, then all
      children in order, then the closing sequence*)
    fun tree (XML.Text s) buf = Buffer.add s buf
      | tree (XML.Elem ((name, atts), ts)) buf =
          buf
          |> Buffer.add XY
          |> Buffer.add name
          |> fold attrib atts
          |> Buffer.add X
          |> fold tree ts
          |> Buffer.add XYX;
  in Buffer.content (tree t Buffer.empty) end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    73
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    74
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    75
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    76
(** efficient YXML parsing **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    77
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    78
local
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    79
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    80
(* splitting *)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    81
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    82
(*does character c have the same code as the symbol s?*)
fun is_char s c = (Char.ord c = ord s);
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    83
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    84
(*cut the source into its non-empty X-delimited chunks (tokens) and
  each chunk into its Y-delimited fields, which may be empty*)
fun split_string str =
  let
    fun chunk_fields chunk =
      map Substring.string (Substring.fields (is_char Y) chunk);
    val chunks = Substring.tokens (is_char X) (Substring.full str);
  in map chunk_fields chunks end;
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    88
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    89
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    90
(* structural errors *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    91
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    92
(*abort with exception Fail, using the common YXML message prefix*)
fun err msg =
  let val text = "Malformed YXML encoding: " ^ msg
  in raise Fail text end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    93
(*raised via err for an attribute field that lacks "=" or has an empty name*)
fun err_attribute () = err "bad attribute";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    94
(*raised via err when an element is opened with an empty name*)
fun err_element () = err "bad element";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    95
(*report unbalanced markup; the element name is quoted in the message
  unless it is empty*)
fun err_unbalanced name =
  err (if name = "" then "unbalanced element" else "unbalanced element " ^ quote name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    97
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    98
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    99
(* stack operations *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   100
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   101
(*put x in front of the (reversed) body of the topmost stack entry*)
fun add x stack =
  (case stack of
    (elem, body) :: pending => (elem, x :: body) :: pending);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   102
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   103
(*open a new element with empty body on top of the stack;
  an empty element name is rejected*)
fun push name atts pending =
  if name = "" then err_element ()
  else ((name, atts), []) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   105
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   106
(*close the topmost element: its body is put back into document order
  and the finished XML.Elem is added to the entry below; the entry with
  the empty name is the root and must not be closed*)
fun pop ((markup as (name, _), body) :: pending) =
  if name = "" then err_unbalanced ""
  else add (XML.Elem (markup, rev body)) pending;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   108
42330
7a1655920fe8 Add YXML.parse_file to parse and process big data files
noschinl
parents: 38474
diff changeset
   109
(*initial stack: a single root entry with empty name, no attributes, empty body*)
val stack_init = [(("", []), [])];
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   110
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   111
(* parsing *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   112
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   113
(*split "name=value" at the first "=" into a pair; a field without "="
  or with an empty name is a bad attribute*)
fun parse_attrib s =
  (case first_field "=" s of
    SOME (att as (name, _)) => if name = "" then err_attribute () else att
  | NONE => err_attribute ());
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   118
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   119
(*turn the Y-separated fields of one chunk into a stack transformation:
  two empty fields close the current element, a leading empty field
  followed by name and attributes opens a new one, anything else is
  plain text added field by field*)
fun parse_chunk fields =
  (case fields of
    ["", ""] => pop
  | "" :: name :: atts => push name (map parse_attrib atts)
  | texts => fold (fn txt => add (XML.Text txt)) texts);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   122
42330
7a1655920fe8 Add YXML.parse_file to parse and process big data files
noschinl
parents: 38474
diff changeset
   123
(*Incremental chunk parsing for streamed input.

  f: applied to every completed top-level tree
  str: one X-delimited chunk; empty chunks are skipped
  (pending, results): current element stack and the values of f
    collected so far, most recent first

  Whenever the stack is back at the bare root entry and its body is
  non-empty, every tree collected there is passed to f (oldest first)
  and the stack is reset to stack_init.  A top-level text chunk that
  contains Y yields several trees at once; they are flushed together
  instead of staying on the root entry, where they would never reach f
  and would later be reported as unbalanced markup.*)
fun preparse_chunk _ "" x = x
  | preparse_chunk f str (pending, results) =
      (case parse_chunk (String.fields (is_char Y) str) pending of
        [(("", _), body as _ :: _)] =>
          (*body is kept in reverse order: restore document order first*)
          (stack_init, fold (fn result => fn acc => f result :: acc) (rev body) results)
      | stack => (stack, results));
7a1655920fe8 Add YXML.parse_file to parse and process big data files
noschinl
parents: 38474
diff changeset
   128
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   129
in
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   130
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   131
(*parse a complete YXML string into its top-level trees: all chunks are
  folded over the initial stack; only the root entry may remain at the
  end, otherwise the element on top is reported as unbalanced*)
fun parse_body source =
  let
    val chunks = split_string source;
    val final_stack = fold parse_chunk chunks stack_init;
  in
    (case final_stack of
      [(("", _), result)] => rev result
    | ((name, _), _) :: _ => err_unbalanced name)
  end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   135
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   136
(*parse a YXML string as a single tree: empty input is the empty text,
  more than one top-level tree is an error*)
fun parse source =
  (case parse_body source of
    [] => XML.Text ""
  | [result] => result
  | _ :: _ :: _ => err "multiple results");
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   141
42330
7a1655920fe8 Add YXML.parse_file to parse and process big data files
noschinl
parents: 38474
diff changeset
   142
(*parse the YXML content of a file incrementally via preparse_chunk,
  returning the values of f in the order they were produced; markup
  that is still open at the end is reported as unbalanced.
  File.fold_fields is defined elsewhere -- presumably it folds over the
  X-delimited fields of the file at path; confirm against file.ML.
  NOTE(review): signature YXML as shown in this file does not list this
  function, so it is not accessible from outside the structure.*)
fun parse_file' f path =
  let
    val final_state =
      File.fold_fields (is_char X) (preparse_chunk f) path (stack_init, []);
  in
    (case final_state of
      ([(("", _), [])], result) => rev result
    | (((name, _), _) :: _, _) => err_unbalanced name)
  end;
7a1655920fe8 Add YXML.parse_file to parse and process big data files
noschinl
parents: 38474
diff changeset
   147
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   148
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   149
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   150
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   151