src/Pure/PIDE/yxml.ML
author wenzelm
Fri, 08 Nov 2024 22:52:29 +0100
changeset 81409 07c802837a8c
parent 80861 9de19e3a7231
permissions -rw-r--r--
merged

(*  Title:      Pure/PIDE/yxml.ML
    Author:     Makarius

Efficient text representation of XML trees using extra characters X
and Y -- no escaping, may nest marked text verbatim.  Suitable for
direct inlining into plain text.

Markup <elem att="val" ...>...body...</elem> is encoded as:

  X Y name Y att=val ... X
  ...
  body
  ...
  X Y X
*)

signature YXML =
sig
  include YXML
  val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a
  val tree_size: XML.tree -> int
  val body_size: XML.body -> int
  val string_of_body: XML.body -> string
  val string_of: XML.tree -> string
  val bytes_of_body: XML.body -> Bytes.T
  val bytes_of: XML.tree -> Bytes.T
  val write_body: Path.T -> XML.body -> unit
  val output_markup_only: Markup.T -> string
  val output_markup_elem: Markup.T -> (string * string) * string
  val parse_body: string -> XML.body
  val parse_body_bytes: Bytes.T -> XML.body
  val parse: string -> XML.tree
  val parse_bytes: Bytes.T -> XML.tree
  val is_wellformed: string -> bool
end;

structure YXML: YXML =
struct

open YXML;


(** string representation **)

(* traverse *)

fun traverse string =
  let
    fun attrib (a, x) = string Y #> string a #> string "=" #> string x;
    fun tree (XML.Elem (markup as (name, atts), ts)) =
          if Markup.is_empty markup then fold tree ts
          else
            string XY #> string name #> fold attrib atts #> string X #>
            fold tree ts #>
            string XYX
      | tree (XML.Text s) = string s;
  in tree end;

val tree_size = Integer.build o traverse (Integer.add o size);
val body_size = Integer.build o fold (Integer.add o tree_size);


(* output *)

val string_of_body = Buffer.build_content o fold (traverse Buffer.add);
val string_of = string_of_body o single;

val bytes_of_body = Bytes.build o fold (traverse Bytes.add);
val bytes_of = bytes_of_body o single;

fun write_body path body =
  path |> File_Stream.open_output (fn file =>
    fold (traverse (fn s => fn () => File_Stream.output file s)) body ());


(* markup elements *)

val Z = chr 0;
val Z_text = [XML.Text Z];

val output_markup_only = op ^ o output_markup;

fun output_markup_elem markup =
  let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))
  in ((bg1, bg2), en) end;



(** efficient YXML parsing **)

local

(* splitting *)

val split_string =
  Substring.full #>
  Substring.tokens (fn c => c = X_char) #>
  map (Substring.fields (fn c => c = Y_char) #> map Substring.string);

val split_bytes =
  Bytes.space_explode X
  #> map_filter (fn "" => NONE | s => SOME (space_explode Y s));


(* structural errors *)

fun err msg = raise Fail ("Malformed YXML: " ^ msg);
fun err_attribute () = err "bad attribute";
fun err_element () = err "bad element";
fun err_unbalanced "" = err "unbalanced element"
  | err_unbalanced name = err ("unbalanced element " ^ quote name);


(* stack operations *)

fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;

fun push "" _ _ = err_element ()
  | push name atts pending = ((name, atts), []) :: pending;

fun pop ((("", _), _) :: _) = err_unbalanced ""
  | pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending;


(* parsing *)

fun parse_attrib s =
  (case first_field "=" s of
    NONE => err_attribute ()
  | SOME ("", _) => err_attribute ()
  | SOME att => att);

fun parse_chunk ["", ""] = pop
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
  | parse_chunk txts = fold (add o XML.Text) txts;

fun parse_body' chunks =
  (case fold parse_chunk chunks [(("", []), [])] of
    [(("", _), result)] => rev result
  | ((name, _), _) :: _ => err_unbalanced name);

fun parse' chunks =
  (case parse_body' chunks of
    [result] => result
  | [] => XML.Text ""
  | _ => err "multiple results");

in

val parse_body = parse_body' o split_string;
val parse_body_bytes = parse_body' o split_bytes;

val parse = parse' o split_string;
val parse_bytes = parse' o split_bytes;

end;

fun is_wellformed s = string_of_body (parse_body s) = s
  handle Fail _ => false;

end;