src/Pure/General/bytes.ML
author wenzelm
Tue, 21 Jun 2022 23:36:16 +0200
changeset 75581 29654a8e9374
parent 69446 9cf0b79dfb7f
parent 75577 c51e1cef1eae
child 75592 f72b88f2842c
permissions -rw-r--r--
merged
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/General/bytes.ML
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
     3
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
     4
Scalable byte strings, with incremental construction (add content to the
75572
833f26c3a3af tuned comments;
wenzelm
parents: 75571
diff changeset
     5
end).
833f26c3a3af tuned comments;
wenzelm
parents: 75571
diff changeset
     6
833f26c3a3af tuned comments;
wenzelm
parents: 75571
diff changeset
     7
Note: type string is implicitly limited by String.maxSize (approx. 64 MB on
833f26c3a3af tuned comments;
wenzelm
parents: 75571
diff changeset
     8
64_32 architecture).
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
     9
*)
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    10
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    11
signature BYTES =
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    12
sig
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    13
  val chunk_size: int
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    14
  type T
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    15
  val length: T -> int
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    16
  val contents: T -> string list
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    17
  val contents_blob: T -> XML.body
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    18
  val content: T -> string
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    19
  val is_empty: T -> bool
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    20
  val empty: T
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    21
  val build: (T -> T) -> T
75568
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    22
  val add_substring: substring -> T -> T
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    23
  val add: string -> T -> T
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    24
  val beginning: int -> T -> string
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    25
  val exists_string: (string -> bool) -> T -> bool
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    26
  val forall_string: (string -> bool) -> T -> bool
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    27
  val last_string: T -> string option
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    28
  val trim_line: T -> T
75574
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
    29
  val append: T -> T -> T
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    30
  val string: string -> T
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    31
  val newline: T
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    32
  val buffer: Buffer.T -> T
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    33
  val read_block: int -> BinIO.instream -> string
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    34
  val read_stream: int -> BinIO.instream -> T
75570
ad957ab06f72 tuned signature: more operations;
wenzelm
parents: 75569
diff changeset
    35
  val write_stream: BinIO.outstream -> T -> unit
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    36
  val read: Path.T -> T
75570
ad957ab06f72 tuned signature: more operations;
wenzelm
parents: 75569
diff changeset
    37
  val write: Path.T -> T -> unit
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    38
end;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    39
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    40
structure Bytes: BYTES =
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    41
struct
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    42
75576
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
    43
(* primitive operations *)
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
    44
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    45
val chunk_size = 1024 * 1024;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    46
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    47
abstype T = Bytes of
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    48
  {buffer: string list, chunks: string list, m: int (*buffer size*), n: int (*chunks size*)}
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    49
with
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    50
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    51
fun length (Bytes {m, n, ...}) = m + n;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    52
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    53
val compact = implode o rev;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    54
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    55
fun contents (Bytes {buffer, chunks, ...}) =
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    56
  rev (chunks |> not (null buffer) ? cons (compact buffer));
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    57
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    58
val contents_blob = contents #> XML.blob;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    59
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    60
val content = implode o contents;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    61
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    62
fun is_empty bytes = length bytes = 0;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    63
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    64
val empty = Bytes {buffer = [], chunks = [], m = 0, n = 0};
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    65
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    66
fun build (f: T -> T) = f empty;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    67
75568
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    68
fun add_substring s (bytes as Bytes {buffer, chunks, m, n}) =
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    69
  if Substring.isEmpty s then bytes
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    70
  else
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    71
    let val l = Substring.size s in
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    72
      if l + m < chunk_size
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    73
      then Bytes {buffer = Substring.string s :: buffer, chunks = chunks, m = l + m, n = n}
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    74
      else
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    75
        let
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    76
          val k = chunk_size - m;
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    77
          val chunk = compact (Substring.string (Substring.slice (s, 0, SOME k)) :: buffer);
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    78
          val s' = Substring.slice (s, k, SOME (l - k));
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    79
          val bytes' = Bytes {buffer = [], chunks = chunk :: chunks, m = 0, n = chunk_size + n};
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    80
        in add_substring s' bytes' end
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    81
    end;
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    82
a8539b1c8775 clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents: 75567
diff changeset
    83
val add = add_substring o Substring.full;
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
    84
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    85
fun exists_string pred (Bytes {buffer, chunks, ...}) =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    86
  let val ex = (exists o Library.exists_string) pred
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    87
  in ex buffer orelse ex chunks end;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    88
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    89
fun forall_string pred = not o exists_string (not o pred);
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    90
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    91
fun last_string (Bytes {buffer, chunks, ...}) =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    92
  (case buffer of
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    93
    s :: _ => Library.last_string s
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    94
  | [] =>
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    95
      (case chunks of
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    96
        s :: _ => Library.last_string s
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    97
      | [] => NONE));
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    98
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
    99
fun trim_line (bytes as Bytes {buffer, chunks, ...}) =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   100
  let
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   101
    val is_line =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   102
      (case last_string bytes of
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   103
        SOME s => Symbol.is_ascii_line_terminator s
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   104
      | NONE => false);
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   105
  in
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   106
    if is_line then
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   107
      let
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   108
        val (last_chunk, chunks') =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   109
          (case chunks of
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   110
            [] => ("", [])
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   111
          | c :: cs => (c, cs));
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   112
        val trimed = Library.trim_line (last_chunk ^ compact buffer);
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   113
      in build (fold_rev add chunks' #> add trimed) end
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   114
    else bytes
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   115
  end;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   116
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   117
end;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   118
75576
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   119
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   120
(* derived operations *)
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   121
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   122
fun beginning n bytes =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   123
  let
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   124
    val dots = " ...";
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   125
    val m = (String.maxSize - size dots) div chunk_size;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   126
    val a = implode (take m (contents bytes));
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   127
    val b = String.substring (a, 0, Int.min (n, size a));
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   128
  in if size b < length bytes then b ^ dots else b end;
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   129
75574
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
   130
fun append bytes1 bytes2 =  (*left-associative*)
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
   131
  if is_empty bytes1 then bytes2
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
   132
  else if is_empty bytes2 then bytes1
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
   133
  else bytes1 |> fold add (contents bytes2);
5945c6f5126a clarified signature: more operations;
wenzelm
parents: 75573
diff changeset
   134
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   135
val string = build o add;
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   136
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   137
val newline = string "\n";
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   138
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   139
val buffer = build o fold add o Buffer.contents;
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   140
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   141
fun read_block limit =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   142
  File.input_size (if limit < 0 then chunk_size else Int.min (chunk_size, limit));
75573
4fc8a35579b2 tuned signature;
wenzelm
parents: 75572
diff changeset
   143
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   144
fun read_stream limit stream =
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   145
  let
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   146
    fun read bytes =
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   147
      (case read_block (limit - length bytes) stream of
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   148
        "" => bytes
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   149
      | s => read (add s bytes))
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   150
  in read empty end;
75570
ad957ab06f72 tuned signature: more operations;
wenzelm
parents: 75569
diff changeset
   151
75571
ac5e633ad9b3 tuned signature: more operations;
wenzelm
parents: 75570
diff changeset
   152
fun write_stream stream = File.outputs stream o contents;
75570
ad957ab06f72 tuned signature: more operations;
wenzelm
parents: 75569
diff changeset
   153
75577
c51e1cef1eae more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents: 75576
diff changeset
   154
val read = File.open_input (read_stream ~1);
75570
ad957ab06f72 tuned signature: more operations;
wenzelm
parents: 75569
diff changeset
   155
val write = File.open_output write_stream;
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   156
75576
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   157
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   158
(* ML pretty printing *)
8c5eedb6c983 tuned comments;
wenzelm
parents: 75575
diff changeset
   159
75575
06f8b072f28e clarified ML pretty printing;
wenzelm
parents: 75574
diff changeset
   160
val _ =
06f8b072f28e clarified ML pretty printing;
wenzelm
parents: 75574
diff changeset
   161
  ML_system_pp (fn _ => fn _ => fn bytes =>
06f8b072f28e clarified ML pretty printing;
wenzelm
parents: 75574
diff changeset
   162
    PolyML.PrettyString ("Bytes {length = " ^ string_of_int (length bytes) ^ "}"))
06f8b072f28e clarified ML pretty printing;
wenzelm
parents: 75574
diff changeset
   163
75567
94321fd25c8a support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff changeset
   164
end;