author | wenzelm |
Tue, 21 Jun 2022 23:36:16 +0200 | |
changeset 75581 | 29654a8e9374 |
parent 69446 | 9cf0b79dfb7f |
parent 75577 | c51e1cef1eae |
child 75592 | f72b88f2842c |
permissions | -rw-r--r-- |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
1 |
(* Title: Pure/General/bytes.ML |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
2 |
Author: Makarius |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
3 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
4 |
Scalable byte strings, with incremental construction (add content to the |
75572 | 5 |
end). |
6 |
||
7 |
Note: type string is implicitly limited by String.maxSize (approx. 64 MB on |
|
8 |
64_32 architecture). |
|
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
9 |
*) |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
10 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
11 |
signature BYTES = |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
12 |
sig |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
13 |
val chunk_size: int |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
14 |
type T |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
15 |
val length: T -> int |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
16 |
val contents: T -> string list |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
17 |
val contents_blob: T -> XML.body |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
18 |
val content: T -> string |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
19 |
val is_empty: T -> bool |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
20 |
val empty: T |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
21 |
val build: (T -> T) -> T |
75568
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
22 |
val add_substring: substring -> T -> T |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
23 |
val add: string -> T -> T |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
24 |
val beginning: int -> T -> string |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
25 |
val exists_string: (string -> bool) -> T -> bool |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
26 |
val forall_string: (string -> bool) -> T -> bool |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
27 |
val last_string: T -> string option |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
28 |
val trim_line: T -> T |
75574 | 29 |
val append: T -> T -> T |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
30 |
val string: string -> T |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
31 |
val newline: T |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
32 |
val buffer: Buffer.T -> T |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
33 |
val read_block: int -> BinIO.instream -> string |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
34 |
val read_stream: int -> BinIO.instream -> T |
75570 | 35 |
val write_stream: BinIO.outstream -> T -> unit |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
36 |
val read: Path.T -> T |
75570 | 37 |
val write: Path.T -> T -> unit |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
38 |
end; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
39 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
40 |
structure Bytes: BYTES = |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
41 |
struct |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
42 |
|
75576 | 43 |
(* primitive operations *) |
44 |
||
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
45 |
val chunk_size = 1024 * 1024; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
46 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
47 |
abstype T = Bytes of |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
48 |
{buffer: string list, chunks: string list, m: int (*buffer size*), n: int (*chunks size*)} |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
49 |
with |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
50 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
51 |
fun length (Bytes {m, n, ...}) = m + n; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
52 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
53 |
val compact = implode o rev; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
54 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
55 |
fun contents (Bytes {buffer, chunks, ...}) = |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
56 |
rev (chunks |> not (null buffer) ? cons (compact buffer)); |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
57 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
58 |
val contents_blob = contents #> XML.blob; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
59 |
|
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
60 |
val content = implode o contents; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
61 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
62 |
fun is_empty bytes = length bytes = 0; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
63 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
64 |
val empty = Bytes {buffer = [], chunks = [], m = 0, n = 0}; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
65 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
66 |
fun build (f: T -> T) = f empty; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
67 |
|
75568
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
68 |
fun add_substring s (bytes as Bytes {buffer, chunks, m, n}) = |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
69 |
if Substring.isEmpty s then bytes |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
70 |
else |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
71 |
let val l = Substring.size s in |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
72 |
if l + m < chunk_size |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
73 |
then Bytes {buffer = Substring.string s :: buffer, chunks = chunks, m = l + m, n = n} |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
74 |
else |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
75 |
let |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
76 |
val k = chunk_size - m; |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
77 |
val chunk = compact (Substring.string (Substring.slice (s, 0, SOME k)) :: buffer); |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
78 |
val s' = Substring.slice (s, k, SOME (l - k)); |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
79 |
val bytes' = Bytes {buffer = [], chunks = chunk :: chunks, m = 0, n = chunk_size + n}; |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
80 |
in add_substring s' bytes' end |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
81 |
end; |
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
82 |
|
a8539b1c8775
clarified signature: avoid repeated string copying via Substring.slice;
wenzelm
parents:
75567
diff
changeset
|
83 |
val add = add_substring o Substring.full; |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
84 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
85 |
fun exists_string pred (Bytes {buffer, chunks, ...}) = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
86 |
let val ex = (exists o Library.exists_string) pred |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
87 |
in ex buffer orelse ex chunks end; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
88 |
|
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
89 |
fun forall_string pred = not o exists_string (not o pred); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
90 |
|
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
91 |
fun last_string (Bytes {buffer, chunks, ...}) = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
92 |
(case buffer of |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
93 |
s :: _ => Library.last_string s |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
94 |
| [] => |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
95 |
(case chunks of |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
96 |
s :: _ => Library.last_string s |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
97 |
| [] => NONE)); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
98 |
|
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
99 |
fun trim_line (bytes as Bytes {buffer, chunks, ...}) = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
100 |
let |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
101 |
val is_line = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
102 |
(case last_string bytes of |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
103 |
SOME s => Symbol.is_ascii_line_terminator s |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
104 |
| NONE => false); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
105 |
in |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
106 |
if is_line then |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
107 |
let |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
108 |
val (last_chunk, chunks') = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
109 |
(case chunks of |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
110 |
[] => ("", []) |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
111 |
| c :: cs => (c, cs)); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
112 |
val trimed = Library.trim_line (last_chunk ^ compact buffer); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
113 |
in build (fold_rev add chunks' #> add trimed) end |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
114 |
else bytes |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
115 |
end; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
116 |
|
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
117 |
end; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
118 |
|
75576 | 119 |
|
120 |
(* derived operations *) |
|
121 |
||
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
122 |
fun beginning n bytes = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
123 |
let |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
124 |
val dots = " ..."; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
125 |
val m = (String.maxSize - size dots) div chunk_size; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
126 |
val a = implode (take m (contents bytes)); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
127 |
val b = String.substring (a, 0, Int.min (n, size a)); |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
128 |
in if size b < length bytes then b ^ dots else b end; |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
129 |
|
75574 | 130 |
fun append bytes1 bytes2 = (*left-associative*) |
131 |
if is_empty bytes1 then bytes2 |
|
132 |
else if is_empty bytes2 then bytes1 |
|
133 |
else bytes1 |> fold add (contents bytes2); |
|
134 |
||
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
135 |
val string = build o add; |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
136 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
137 |
val newline = string "\n"; |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
138 |
|
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
139 |
val buffer = build o fold add o Buffer.contents; |
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
140 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
141 |
fun read_block limit = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
142 |
File.input_size (if limit < 0 then chunk_size else Int.min (chunk_size, limit)); |
75573 | 143 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
144 |
fun read_stream limit stream = |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
145 |
let |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
146 |
fun read bytes = |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
147 |
(case read_block (limit - length bytes) stream of |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
148 |
"" => bytes |
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
149 |
| s => read (add s bytes)) |
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
150 |
in read empty end; |
75570 | 151 |
|
75571 | 152 |
fun write_stream stream = File.outputs stream o contents; |
75570 | 153 |
|
75577
c51e1cef1eae
more scalable byte messages, notably for Scala functions in ML;
wenzelm
parents:
75576
diff
changeset
|
154 |
val read = File.open_input (read_stream ~1); |
75570 | 155 |
val write = File.open_output write_stream; |
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
156 |
|
75576 | 157 |
|
158 |
(* ML pretty printing *) |
|
159 |
||
75575 | 160 |
val _ = |
161 |
ML_system_pp (fn _ => fn _ => fn bytes => |
|
162 |
PolyML.PrettyString ("Bytes {length = " ^ string_of_int (length bytes) ^ "}")) |
|
163 |
||
75567
94321fd25c8a
support for scalable byte strings, with incremental construction;
wenzelm
parents:
diff
changeset
|
164 |
end; |