src/Pure/PIDE/yxml.ML
author wenzelm
Sun, 24 Mar 2019 17:53:46 +0100
changeset 69968 1a400b14fd3a
parent 69845 d28e8199dcb9
child 70989 64a20b562e63
permissions -rw-r--r--
clarified spell-checking (see also 30233285270a);
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
44698
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     1
(*  Title:      Pure/PIDE/yxml.ML
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     3
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     4
Efficient text representation of XML trees using extra characters X
44698
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     5
and Y -- no escaping, may nest marked text verbatim.  Suitable for
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     6
direct inlining into plain text.
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     7
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     8
Markup <elem att="val" ...>...body...</elem> is encoded as:
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     9
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    10
  X Y name Y att=val ... X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    11
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    12
  body
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    13
  ...
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    14
  X Y X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    15
*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    16
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    17
signature YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    18
sig
43777
22c87ff1b8a9 tuned signature -- less cryptic ASCII names;
wenzelm
parents: 43772
diff changeset
    19
  val X: Symbol.symbol
22c87ff1b8a9 tuned signature -- less cryptic ASCII names;
wenzelm
parents: 43772
diff changeset
    20
  val Y: Symbol.symbol
43772
c825594fd0c1 clarified YXML.embed_controls -- this is idempotent and cannot be nested;
wenzelm
parents: 43731
diff changeset
    21
  val embed_controls: string -> string
43731
70072780e095 inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
wenzelm
parents: 43728
diff changeset
    22
  val detect: string -> bool
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    23
  val output_markup: Markup.T -> string * string
68228
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    24
  val buffer_body: XML.body -> Buffer.T -> Buffer.T
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    25
  val buffer: XML.tree -> Buffer.T -> Buffer.T
69845
d28e8199dcb9 more scalable on 32-bit Poly/ML;
wenzelm
parents: 68228
diff changeset
    26
  val chunks_of_body: XML.body -> string list
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    27
  val string_of_body: XML.body -> string
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    28
  val string_of: XML.tree -> string
49656
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    29
  val output_markup_elem: Markup.T -> (string * string) * string
38266
492d377ecfe2 type XML.body as basic data representation language;
wenzelm
parents: 38265
diff changeset
    30
  val parse_body: string -> XML.body
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    31
  val parse: string -> XML.tree
59433
9da5b2c61049 tuned signature;
wenzelm
parents: 49656
diff changeset
    32
  val content_of: string -> string
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    33
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    34
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    35
structure YXML: YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    36
struct
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    37
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    38
(** string representation **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    39
38265
cc9fde54311f renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents: 38228
diff changeset
    40
(* idempotent recoding of certain low ASCII control characters *)
34095
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    41
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    42
fun pseudo_utf8 c =
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    43
  if Symbol.is_ascii_control c
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    44
  then chr 192 ^ chr (128 + ord c)
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    45
  else c;
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    46
43772
c825594fd0c1 clarified YXML.embed_controls -- this is idempotent and cannot be nested;
wenzelm
parents: 43731
diff changeset
    47
fun embed_controls str =
34095
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    48
  if exists_string Symbol.is_ascii_control str
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    49
  then translate_string pseudo_utf8 str
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    50
  else str;
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    51
c2f176a38448 robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents: 31469
diff changeset
    52
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    53
(* markers *)
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    54
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    55
val X_char = Char.chr 5;
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    56
val Y_char = Char.chr 6;
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    57
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    58
val X = String.str X_char;
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    59
val Y = String.str Y_char;
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    60
val XY = X ^ Y;
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    61
val XYX = XY ^ X;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    62
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    63
fun detect s = Char.contains s X_char orelse Char.contains s Y_char;
43731
70072780e095 inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
wenzelm
parents: 43728
diff changeset
    64
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    65
26547
1112375f6a69 tuned comments;
wenzelm
parents: 26540
diff changeset
    66
(* output *)
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    67
27884
10c927e4abf5 output_markup: check Markup.is_none;
wenzelm
parents: 27798
diff changeset
    68
fun output_markup (markup as (name, atts)) =
38474
e498dc2eb576 uniform Markup.empty/Markup.Empty in ML and Scala;
wenzelm
parents: 38266
diff changeset
    69
  if Markup.is_empty markup then Markup.no_output
27884
10c927e4abf5 output_markup: check Markup.is_none;
wenzelm
parents: 27798
diff changeset
    70
  else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    71
68228
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    72
fun buffer_attrib (a, x) =
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    73
  Buffer.add Y #> Buffer.add a #> Buffer.add "=" #> Buffer.add x;
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    74
68228
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    75
fun buffer_body ts = fold buffer ts
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    76
and buffer (XML.Elem ((name, atts), ts)) =
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    77
      Buffer.add XY #> Buffer.add name #> fold buffer_attrib atts #> Buffer.add X #>
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    78
      buffer_body ts #>
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    79
      Buffer.add XYX
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    80
  | buffer (XML.Text s) = Buffer.add s
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    81
69845
d28e8199dcb9 more scalable on 32-bit Poly/ML;
wenzelm
parents: 68228
diff changeset
    82
fun chunks_of_body body = Buffer.empty |> buffer_body body |> Buffer.chunks;
68228
326f4bcc5abc more scalable;
wenzelm
parents: 64275
diff changeset
    83
fun string_of_body body = Buffer.empty |> buffer_body body |> Buffer.content;
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    84
val string_of = string_of_body o single;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    85
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    86
49656
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    87
(* wrapped elements *)
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    88
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    89
val Z = chr 0;
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    90
val Z_text = [XML.Text Z];
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    91
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    92
fun output_markup_elem markup =
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    93
  let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    94
  in ((bg1, bg2), en) end;
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    95
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    96
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    97
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    98
(** efficient YXML parsing **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    99
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   100
local
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   101
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   102
(* splitting *)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   103
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   104
val split_string =
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   105
  Substring.full #>
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
   106
  Substring.tokens (fn c => c = X_char) #>
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
   107
  map (Substring.fields (fn c => c = Y_char) #> map Substring.string);
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   108
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   109
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   110
(* structural errors *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   111
46832
050e344825c8 tuned message (cf. ML version);
wenzelm
parents: 46831
diff changeset
   112
fun err msg = raise Fail ("Malformed YXML: " ^ msg);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   113
fun err_attribute () = err "bad attribute";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   114
fun err_element () = err "bad element";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   115
fun err_unbalanced "" = err "unbalanced element"
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   116
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   117
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   118
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   119
(* stack operations *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   120
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   121
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   122
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   123
fun push "" _ _ = err_element ()
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   124
  | push name atts pending = ((name, atts), []) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   125
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   126
fun pop ((("", _), _) :: _) = err_unbalanced ""
38228
ada3ab6b9085 simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents: 34095
diff changeset
   127
  | pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   128
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   129
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   130
(* parsing *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   131
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   132
fun parse_attrib s =
28025
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   133
  (case first_field "=" s of
28023
92dd3ad302b7 simplified parse_attrib (find_substring instead of space_explode);
wenzelm
parents: 27932
diff changeset
   134
    NONE => err_attribute ()
28025
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   135
  | SOME ("", _) => err_attribute ()
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   136
  | SOME att => att);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   137
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   138
fun parse_chunk ["", ""] = pop
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   139
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   140
  | parse_chunk txts = fold (add o XML.Text) txts;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   141
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   142
in
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   143
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   144
fun parse_body source =
43615
8e0f6cfa8eb2 reverted ce00462f,b3759dce, 7a165592: unwanted generalisation
noschinl
parents: 42355
diff changeset
   145
  (case fold parse_chunk (split_string source) [(("", []), [])] of
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   146
    [(("", _), result)] => rev result
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   147
  | ((name, _), _) :: _ => err_unbalanced name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   148
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   149
fun parse source =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   150
  (case parse_body source of
27798
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   151
    [result] => result
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   152
  | [] => XML.Text ""
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   153
  | _ => err "multiple results");
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   154
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   155
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   156
59433
9da5b2c61049 tuned signature;
wenzelm
parents: 49656
diff changeset
   157
val content_of = parse_body #> XML.content_of;
9da5b2c61049 tuned signature;
wenzelm
parents: 49656
diff changeset
   158
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   159
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   160