src/Pure/PIDE/yxml.ML
author wenzelm
Thu, 10 Apr 2025 14:12:33 +0200
changeset 82475 0a6d57c4d58b
parent 80861 9de19e3a7231
permissions -rw-r--r--
more explicit build stages;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
44698
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     1
(*  Title:      Pure/PIDE/yxml.ML
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     3
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     4
Efficient text representation of XML trees using extra characters X
44698
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     5
and Y -- no escaping, may nest marked text verbatim.  Suitable for
0385292321a0 moved XML/YXML to src/Pure/PIDE;
wenzelm
parents: 43782
diff changeset
     6
direct inlining into plain text.
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
     7
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     8
Markup <elem att="val" ...>...body...</elem> is encoded as:
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
     9
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    10
  X Y name Y att=val ... X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    11
  ...
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    12
  body
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    13
  ...
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    14
  X Y X
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    15
*)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    16
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    17
signature YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    18
sig
80860
64dc09f7f189 clarified YXML bootstrap;
wenzelm
parents: 80828
diff changeset
    19
  include YXML
70990
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    20
  val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a
73557
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    21
  val tree_size: XML.tree -> int
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    22
  val body_size: XML.body -> int
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    23
  val string_of_body: XML.body -> string
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    24
  val string_of: XML.tree -> string
80563
36da6a3c79d6 more scalable operations;
wenzelm
parents: 80561
diff changeset
    25
  val bytes_of_body: XML.body -> Bytes.T
36da6a3c79d6 more scalable operations;
wenzelm
parents: 80561
diff changeset
    26
  val bytes_of: XML.tree -> Bytes.T
70991
f9f7c34b7dd4 more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents: 70990
diff changeset
    27
  val write_body: Path.T -> XML.body -> unit
80826
7feaa04d332b prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents: 80821
diff changeset
    28
  val output_markup_only: Markup.T -> string
49656
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    29
  val output_markup_elem: Markup.T -> (string * string) * string
38266
492d377ecfe2 type XML.body as basic data representation language;
wenzelm
parents: 38265
diff changeset
    30
  val parse_body: string -> XML.body
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
    31
  val parse_body_bytes: Bytes.T -> XML.body
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    32
  val parse: string -> XML.tree
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
    33
  val parse_bytes: Bytes.T -> XML.tree
71456
09c850e82258 more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents: 70996
diff changeset
    34
  val is_wellformed: string -> bool
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    35
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    36
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    37
structure YXML: YXML =
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    38
struct
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    39
80860
64dc09f7f189 clarified YXML bootstrap;
wenzelm
parents: 80828
diff changeset
    40
open YXML;
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    41
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    42
80860
64dc09f7f189 clarified YXML bootstrap;
wenzelm
parents: 80828
diff changeset
    43
(** string representation **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    44
73557
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    45
(* traverse *)
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    46
70990
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    47
fun traverse string =
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    48
  let
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    49
    fun attrib (a, x) = string Y #> string a #> string "=" #> string x;
73556
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    50
    fun tree (XML.Elem (markup as (name, atts), ts)) =
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    51
          if Markup.is_empty markup then fold tree ts
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    52
          else
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    53
            string XY #> string name #> fold attrib atts #> string X #>
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    54
            fold tree ts #>
192bcee4f8b8 more robust treatment of empty markup: it allows to produce formal chunks;
wenzelm
parents: 71456
diff changeset
    55
            string XYX
70990
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    56
      | tree (XML.Text s) = string s;
e5e34bd28257 clarified signature;
wenzelm
parents: 70989
diff changeset
    57
  in tree end;
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    58
77768
65008644d394 tuned: prefer "build" combinator;
wenzelm
parents: 75626
diff changeset
    59
val tree_size = Integer.build o traverse (Integer.add o size);
65008644d394 tuned: prefer "build" combinator;
wenzelm
parents: 75626
diff changeset
    60
val body_size = Integer.build o fold (Integer.add o tree_size);
73557
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    61
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    62
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    63
(* output *)
225486d9c960 tuned signature;
wenzelm
parents: 73556
diff changeset
    64
74231
b3c65c984210 tuned signature;
wenzelm
parents: 73557
diff changeset
    65
val string_of_body = Buffer.build_content o fold (traverse Buffer.add);
43728
152ce7114396 YXML.string_of_body convenience;
wenzelm
parents: 43615
diff changeset
    66
val string_of = string_of_body o single;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    67
80563
36da6a3c79d6 more scalable operations;
wenzelm
parents: 80561
diff changeset
    68
val bytes_of_body = Bytes.build o fold (traverse Bytes.add);
36da6a3c79d6 more scalable operations;
wenzelm
parents: 80561
diff changeset
    69
val bytes_of = bytes_of_body o single;
36da6a3c79d6 more scalable operations;
wenzelm
parents: 80561
diff changeset
    70
70991
f9f7c34b7dd4 more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents: 70990
diff changeset
    71
fun write_body path body =
75615
4494cd69f97f clarified modules;
wenzelm
parents: 74231
diff changeset
    72
  path |> File_Stream.open_output (fn file =>
4494cd69f97f clarified modules;
wenzelm
parents: 74231
diff changeset
    73
    fold (traverse (fn s => fn () => File_Stream.output file s)) body ());
70991
f9f7c34b7dd4 more scalable protocol_message: use XML.body directly (Output.output hook is not required);
wenzelm
parents: 70990
diff changeset
    74
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    75
70989
wenzelm
parents: 69845
diff changeset
    76
(* markup elements *)
49656
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    77
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    78
val Z = chr 0;
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    79
val Z_text = [XML.Text Z];
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    80
80826
7feaa04d332b prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents: 80821
diff changeset
    81
val output_markup_only = op ^ o output_markup;
7feaa04d332b prefer static YXML.output_markup_only (without print_mode): Output.status is only relevant for PIDE (with print_mode "isabelle_process");
wenzelm
parents: 80821
diff changeset
    82
49656
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    83
fun output_markup_elem markup =
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    84
  let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    85
  in ((bg1, bg2), en) end;
7ff712de5747 treat wrapped markup elements as raw markup delimiters;
wenzelm
parents: 46832
diff changeset
    86
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    87
80860
64dc09f7f189 clarified YXML bootstrap;
wenzelm
parents: 80828
diff changeset
    88
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    89
(** efficient YXML parsing **)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    90
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    91
local
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
    92
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    93
(* splitting *)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    94
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    95
val split_string =
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    96
  Substring.full #>
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    97
  Substring.tokens (fn c => c = X_char) #>
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 59433
diff changeset
    98
  map (Substring.fields (fn c => c = Y_char) #> map Substring.string);
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
    99
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   100
val split_bytes =
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   101
  Bytes.space_explode X
80561
437895863e1d tuned (see also 4879d0021185);
wenzelm
parents: 80506
diff changeset
   102
  #> map_filter (fn "" => NONE | s => SOME (space_explode Y s));
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   103
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   104
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   105
(* structural errors *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   106
46832
050e344825c8 tuned message (cf. ML version);
wenzelm
parents: 46831
diff changeset
   107
fun err msg = raise Fail ("Malformed YXML: " ^ msg);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   108
fun err_attribute () = err "bad attribute";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   109
fun err_element () = err "bad element";
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   110
fun err_unbalanced "" = err "unbalanced element"
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   111
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   112
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   113
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   114
(* stack operations *)
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   115
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   116
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   117
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   118
fun push "" _ _ = err_element ()
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   119
  | push name atts pending = ((name, atts), []) :: pending;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   120
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   121
fun pop ((("", _), _) :: _) = err_unbalanced ""
38228
ada3ab6b9085 simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents: 34095
diff changeset
   122
  | pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   123
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   124
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   125
(* parsing *)
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   126
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   127
fun parse_attrib s =
28025
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   128
  (case first_field "=" s of
28023
92dd3ad302b7 simplified parse_attrib (find_substring instead of space_explode);
wenzelm
parents: 27932
diff changeset
   129
    NONE => err_attribute ()
28025
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   130
  | SOME ("", _) => err_attribute ()
d9fcab768496 replaced find_substring by first_field;
wenzelm
parents: 28023
diff changeset
   131
  | SOME att => att);
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   132
26540
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   133
fun parse_chunk ["", ""] = pop
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   134
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
173d548ce9d2 replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents: 26528
diff changeset
   135
  | parse_chunk txts = fold (add o XML.Text) txts;
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   136
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   137
fun parse_body' chunks =
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   138
  (case fold parse_chunk chunks [(("", []), [])] of
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   139
    [(("", _), result)] => rev result
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   140
  | ((name, _), _) :: _ => err_unbalanced name);
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   141
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   142
fun parse' chunks =
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   143
  (case parse_body' chunks of
27798
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   144
    [result] => result
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   145
  | [] => XML.Text ""
b96c73f11a23 YXML.parse: allow text without markup, potentially empty;
wenzelm
parents: 26684
diff changeset
   146
  | _ => err "multiple results");
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   147
75626
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   148
in
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   149
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   150
val parse_body = parse_body' o split_string;
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   151
val parse_body_bytes = parse_body' o split_bytes;
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   152
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   153
val parse = parse' o split_string;
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   154
val parse_bytes = parse' o split_bytes;
4879d0021185 more operations on Bytes.T;
wenzelm
parents: 75615
diff changeset
   155
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   156
end;
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   157
71456
09c850e82258 more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents: 70996
diff changeset
   158
fun is_wellformed s = string_of_body (parse_body s) = s
09c850e82258 more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents: 70996
diff changeset
   159
  handle Fail _ => false;
09c850e82258 more robust pretty printing of broken YXML, e.g. single "\^E";
wenzelm
parents: 70996
diff changeset
   160
26528
944f9bf26d2d Why XML notation?
wenzelm
parents:
diff changeset
   161
end;