| author | wenzelm | 
| Fri, 28 Jun 2024 13:20:18 +0200 | |
| changeset 80443 | ab0dd21dd0ca | 
| parent 80428 | 5fe811816fa3 | 
| child 80473 | 09afc244bb5e | 
| permissions | -rw-r--r-- | 
| 54439 | 1 | /* Title: Pure/General/bytes.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 4 | Immutable byte vectors versus UTF8 strings. | |
| 5 | */ | |
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 78855 | 10 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
 | 
| 11 | InputStream, OutputStream, File => JFile} | |
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 12 | import java.nio.ByteBuffer | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 13 | import java.nio.channels.FileChannel | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 14 | import java.nio.file.StandardOpenOption | 
| 77712 
dd4bb80dbc3a
tuned performance: much faster low-level operation;
 wenzelm parents: 
77711diff
changeset | 15 | import java.util.Arrays | 
| 76353 | 16 | import org.tukaani.xz | 
| 17 | import com.github.luben.zstd | |
| 54440 | 18 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 19 | import scala.collection.mutable.ArrayBuffer | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 20 | import scala.collection.mutable | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 21 | |
| 54440 | 22 | |
| 75393 | 23 | object Bytes {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 24 | /* internal sizes */ | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 25 | |
| 80385 | 26 | private val array_size: Long = Int.MaxValue - 8 // see java.io.InputStream.MAX_BUFFER_SIZE | 
| 80389 | 27 | private val block_size: Int = 16384 // see java.io.InputStream.DEFAULT_BUFFER_SIZE | 
| 80385 | 28 | private val chunk_size: Long = Space.MiB(100).bytes | 
| 29 | ||
| 30 | private def make_size(chunks: Array[Array[Byte]], buffer: Array[Byte]): Long = | |
| 31 | chunks.foldLeft(buffer.length.toLong)((n, chunk) => n + chunk.length) | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 32 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 33 |   class Too_Large(size: Long) extends IndexOutOfBoundsException {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 34 | override def getMessage: String = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 35 | "Bytes too large for particular operation: " + | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 36 | Space.bytes(size).print + " > " + Space.bytes(array_size).print | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 37 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 38 | |
| 54439 | 39 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 40 | /* main constructors */ | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 41 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 42 | private def reuse_array(bytes: Array[Byte]): Bytes = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 43 | if (bytes.length <= chunk_size) new Bytes(None, bytes, 0L, bytes.length.toLong) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 44 | else apply(bytes) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 45 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 46 | val empty: Bytes = reuse_array(new Array(0)) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 47 | |
| 80373 | 48 | def apply(s: CharSequence): Bytes = | 
| 49 | if (s.isEmpty) empty | |
| 50 |     else Builder.use(hint = s.length) { builder => builder += s }
 | |
| 54440 | 51 | |
| 63779 | 52 | def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length) | 
| 53 | ||
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 54 | def apply(a: Array[Byte], offset: Int, length: Int): Bytes = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 55 |     Builder.use(hint = length) { builder => builder += (a, offset, length) }
 | 
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 56 | |
| 69454 | 57 |   val newline: Bytes = apply("\n")
 | 
| 54440 | 58 | |
| 75579 | 59 | |
| 54440 | 60 | /* read */ | 
| 61 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 62 |   def read_stream(stream: InputStream, limit: Long = -1L, hint: Long = 0L): Bytes = {
 | 
| 64004 | 63 | if (limit == 0) empty | 
| 64 |     else {
 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 65 |       Builder.use(hint = if (limit > 0) limit else hint) { builder =>
 | 
| 80384 | 66 | val buf = new Array[Byte](block_size) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 67 | var m = 0 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 68 | var n = 0L | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 69 |         while ({
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 70 | val l = if (limit > 0) ((limit - n) min buf.length).toInt else buf.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 71 | m = stream.read(buf, 0, l) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 72 |           if (m != -1) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 73 | builder += (buf, 0, m) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 74 | n += m | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 75 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 76 | m != -1 && (limit < 0 || limit > n) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 77 | }) () | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 78 | } | 
| 54440 | 79 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 80 | } | 
| 64001 
7ecb22be8f03
more general read_stream: return actual byte count;
 wenzelm parents: 
63779diff
changeset | 81 | |
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79509diff
changeset | 82 | def read_url(name: String): Bytes = using(Url(name).open_stream())(read_stream(_)) | 
| 77717 | 83 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 84 |   def read_file(path: Path, offset: Long = 0L, limit: Long = -1L): Bytes = {
 | 
| 78956 | 85 | val length = File.size(path) | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 86 | val start = offset.max(0L) | 
| 80384 | 87 | val stop = (length - start).max(0L).min(if (limit < 0) Long.MaxValue else limit) | 
| 88 | if (stop == 0L) empty | |
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 89 |     else {
 | 
| 80384 | 90 |       Builder.use(hint = stop) { builder =>
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 91 |         using(FileChannel.open(path.java_path, StandardOpenOption.READ)) { channel =>
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 92 | channel.position(start) | 
| 80384 | 93 | val buf = ByteBuffer.allocate(block_size) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 94 | var m = 0 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 95 | var n = 0L | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 96 |           while ({
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 97 | m = channel.read(buf) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 98 |             if (m != -1) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 99 | builder += (buf.array(), 0, m) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 100 | buf.clear() | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 101 | n += m | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 102 | } | 
| 80384 | 103 | m != -1 && stop > n | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 104 | }) () | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 105 | } | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 106 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 107 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 108 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 109 | |
| 78953 | 110 | def read(path: Path): Bytes = read_file(path) | 
| 111 | def read(file: JFile): Bytes = read_file(File.path(file)) | |
| 77718 | 112 | |
| 64229 | 113 | |
| 114 | /* write */ | |
| 115 | ||
| 69393 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 116 | def write(file: JFile, bytes: Bytes): Unit = | 
| 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 117 | using(new FileOutputStream(file))(bytes.write_stream(_)) | 
| 64229 | 118 | |
| 119 | def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes) | |
| 78194 | 120 | |
| 121 | ||
| 122 | /* append */ | |
| 123 | ||
| 124 | def append(file: JFile, bytes: Bytes): Unit = | |
| 125 | using(new FileOutputStream(file, true))(bytes.write_stream(_)) | |
| 126 | ||
| 127 | def append(path: Path, bytes: Bytes): Unit = append(path.file, bytes) | |
| 80351 | 128 | |
| 129 | ||
| 80355 | 130 | /* vector of short unsigned integers */ | 
| 80351 | 131 | |
| 132 |   trait Vec {
 | |
| 133 | def size: Long | |
| 80355 | 134 | def apply(i: Long): Char | 
| 80351 | 135 | } | 
| 136 | ||
| 137 |   class Vec_String(string: String) extends Vec {
 | |
| 138 | override def size: Long = string.length.toLong | |
| 80355 | 139 | override def apply(i: Long): Char = | 
| 140 | if (0 <= i && i < size) string(i.toInt) | |
| 80351 | 141 | else throw new IndexOutOfBoundsException | 
| 142 | } | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 143 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 144 | |
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 145 | /* incremental builder: unsynchronized! */ | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 146 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 147 |   object Builder {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 148 |     def use(hint: Long = 0L)(body: Builder => Unit): Bytes = {
 | 
| 80375 | 149 | val builder = new Builder(hint) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 150 | body(builder) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 151 | builder.done() | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 152 | } | 
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 153 | |
| 80394 | 154 |     def use_stream(hint: Long = 0L)(body: OutputStream => Unit): Bytes = {
 | 
| 155 | val stream = new Stream(hint = hint) | |
| 156 | body(stream) | |
| 157 | stream.builder.done() | |
| 158 | } | |
| 159 | ||
| 160 |     private class Stream(hint: Long = 0L) extends OutputStream {
 | |
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 161 | val builder = new Builder(hint) | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 162 | |
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 163 | override def write(b: Int): Unit = | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 164 |         { builder += b.toByte }
 | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 165 | |
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 166 | override def write(array: Array[Byte], offset: Int, length: Int): Unit = | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 167 |         { builder += (array, offset, length) }
 | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 168 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 169 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 170 | |
| 80375 | 171 |   final class Builder private[Bytes](hint: Long) {
 | 
| 172 | private var chunks = | |
| 173 | new ArrayBuffer[Array[Byte]](if (hint <= 0) 16 else (hint / chunk_size).toInt) | |
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 174 | |
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 175 | private var buffer_list: mutable.ListBuffer[Array[Byte]] = null | 
| 80375 | 176 | private var buffer = | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 177 | new Array[Byte](if (hint <= 0) 1024 else (hint min chunk_size min array_size).toInt) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 178 | private var buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 179 | private var buffer_total = 0 | 
| 80370 | 180 | |
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 181 | private def buffer_content(): Array[Byte] = | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 182 |       if (buffer_list != null) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 183 | val array = new Array[Byte](buffer_total) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 184 | var i = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 185 |         for (b <- buffer_list) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 186 | val n = b.length | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 187 | System.arraycopy(b, 0, array, i, n) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 188 | i += n | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 189 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 190 | System.arraycopy(buffer, 0, array, i, buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 191 | array | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 192 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 193 | else if (buffer_index == buffer.length) buffer else Arrays.copyOf(buffer, buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 194 | |
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 195 | private def buffer_check(request: Int = 0): Unit = | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 196 |       if (buffer_index == buffer.length) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 197 |         if (buffer_total == chunk_size) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 198 | chunks += buffer_content() | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 199 | buffer_list = null | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 200 | buffer = new Array[Byte](chunk_size.toInt) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 201 | buffer_total = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 202 | buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 203 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 204 |         else {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 205 |           if (buffer_list == null) { buffer_list = new mutable.ListBuffer }
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 206 | buffer_list += buffer | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 207 | buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 208 | val limit = (chunk_size - buffer_total).toInt | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 209 | buffer = new Array[Byte]((buffer_total max request) min limit) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 210 | } | 
| 80370 | 211 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 212 | |
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 213 |     def += (b: Byte): Unit = {
 | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 214 | buffer(buffer_index) = b | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 215 | buffer_total += 1 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 216 | buffer_index += 1 | 
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 217 | buffer_check() | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 218 | } | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 219 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 220 |     def += (array: Array[Byte], offset: Int, length: Int): Unit = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 221 |       if (offset < 0 || length < 0 || offset.toLong + length.toLong > array.length) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 222 | throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 223 | } | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 224 |       else {
 | 
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 225 | var i = offset | 
| 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 226 | var n = length | 
| 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 227 |         while (n > 0) {
 | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 228 | val l = n min (buffer.length - buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 229 | System.arraycopy(array, i, buffer, buffer_index, l) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 230 | buffer_total += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 231 | buffer_index += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 232 | i += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 233 | n -= l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 234 | buffer_check(request = n) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 235 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 236 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 237 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 238 | |
| 80428 | 239 | def += (s: CharSequence): Unit = | 
| 240 |       if (s.length > 0) { this += UTF8.bytes(s.toString) }
 | |
| 241 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 242 |     def += (array: Array[Byte]): Unit = { this += (array, 0, array.length) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 243 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 244 |     def += (a: Subarray): Unit = { this += (a.array, a.offset, a.length) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 245 | |
| 80394 | 246 |     private def done(): Bytes = {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 247 | val cs = chunks.toArray | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 248 | val b = buffer_content() | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 249 |       val size = cs.foldLeft(b.length.toLong)({ case (n, a) => n + a.length })
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 250 | chunks = null | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 251 | buffer_list = null | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 252 | buffer = null | 
| 80374 | 253 | new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 254 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 255 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 256 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 257 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 258 | /* subarray */ | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 259 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 260 |   object Subarray {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 261 | val empty: Subarray = new Subarray(new Array[Byte](0), 0, 0) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 262 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 263 |     def apply(array: Array[Byte], offset: Int, length: Int): Subarray = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 264 | val n = array.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 265 |       if (0 <= offset && offset < n && 0 <= length && offset + length <= n) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 266 | if (length == 0) empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 267 | else new Subarray(array, offset, length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 268 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 269 | else throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 270 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 271 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 272 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 273 | final class Subarray private( | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 274 | val array: Array[Byte], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 275 | val offset: Int, | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 276 | val length: Int | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 277 |   ) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 278 |     override def toString: String = "Bytes.Subarray(" + Space.bytes(length).print + ")"
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 279 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 280 | def byte_iterator: Iterator[Byte] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 281 | if (length == 0) Iterator.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 282 |       else { for (i <- (offset until (offset + length)).iterator) yield array(i) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 283 | } | 
| 54439 | 284 | } | 
| 285 | ||
| 286 | final class Bytes private( | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 287 | protected val chunks: Option[Array[Array[Byte]]], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 288 | protected val chunk0: Array[Byte], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 289 | protected val offset: Long, | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 290 | val size: Long | 
| 80361 | 291 | ) extends Bytes.Vec {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 292 | assert( | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 293 | (chunks.isEmpty || | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 294 | chunks.get.nonEmpty && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 295 | chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 296 | chunk0.length < Bytes.chunk_size) | 
| 80361 | 297 | |
| 80385 | 298 | def small_size: Int = | 
| 299 | if (size > Bytes.array_size) throw new Bytes.Too_Large(size) | |
| 300 | else size.toInt | |
| 301 | ||
| 80361 | 302 | def is_empty: Boolean = size == 0 | 
| 303 | ||
| 80385 | 304 | def proper: Option[Bytes] = if (is_empty) None else Some(this) | 
| 305 | ||
| 80361 | 306 | def is_sliced: Boolean = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 307 |     offset != 0L || {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 308 |       chunks match {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 309 | case None => size != chunk0.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 310 | case Some(cs) => size != Bytes.make_size(cs, chunk0) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 311 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 312 | } | 
| 80361 | 313 | |
| 314 | override def toString: String = | |
| 315 | if (is_empty) "Bytes.empty" | |
| 316 |     else "Bytes(" + Space.bytes(size).print + if_proper(is_sliced, ", sliced") + ")"
 | |
| 317 | ||
| 318 | ||
| 319 | /* elements: signed Byte or unsigned Char */ | |
| 320 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 321 |   protected def byte_unchecked(i: Long): Byte = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 322 | val a = offset + i | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 323 |     chunks match {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 324 | case None => chunk0(a.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 325 | case Some(cs) => | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 326 | val b = a % Bytes.chunk_size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 327 | val c = a / Bytes.chunk_size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 328 | if (c < cs.length) cs(c.toInt)(b.toInt) else chunk0(b.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 329 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 330 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 331 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 332 | def byte(i: Long): Byte = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 333 | if (0 <= i && i < size) byte_unchecked(i) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 334 | else throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 335 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 336 | def apply(i: Long): Char = (byte(i).toInt & 0xff).toChar | 
| 80361 | 337 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 338 | protected def subarray_iterator: Iterator[Bytes.Subarray] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 339 | if (is_empty) Iterator.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 340 | else if (chunks.isEmpty) Iterator(Bytes.Subarray(chunk0, offset.toInt, size.toInt)) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 341 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 342 | val end_offset = offset + size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 343 |       for ((array, index) <- (chunks.get.iterator ++ Iterator(chunk0)).zipWithIndex) yield {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 344 | val array_start = Bytes.chunk_size * index | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 345 | val array_stop = array_start + array.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 346 |         if (offset < array_stop && array_start < end_offset) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 347 | val i = (array_start max offset) - array_start | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 348 | val j = (array_stop min end_offset) - array_start | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 349 | Bytes.Subarray(array, i.toInt, (j - i).toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 350 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 351 | else Bytes.Subarray.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 352 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 353 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 354 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 355 | def byte_iterator: Iterator[Byte] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 356 |     for {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 357 | a <- subarray_iterator | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 358 | b <- a.byte_iterator | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 359 | } yield b | 
| 80361 | 360 | |
| 361 | ||
| 80385 | 362 | /* slice */ | 
| 363 | ||
| 364 | def slice(i: Long, j: Long): Bytes = | |
| 365 |     if (0 <= i && i <= j && j <= size) {
 | |
| 366 | if (i == j) Bytes.empty | |
| 367 | else new Bytes(chunks, chunk0, offset + i, j - i) | |
| 368 | } | |
| 369 | else throw new IndexOutOfBoundsException | |
| 370 | ||
| 371 | def unslice: Bytes = | |
| 372 |     if (is_sliced) {
 | |
| 373 |       Bytes.Builder.use(hint = size) { builder =>
 | |
| 374 |         for (a <- subarray_iterator) { builder += a }
 | |
| 375 | } | |
| 376 | } | |
| 377 | else this | |
| 378 | ||
| 379 | def trim_line: Bytes = | |
| 380 |     if (size >= 2 && byte_unchecked(size - 2) == 13 && byte_unchecked(size - 1) == 10) {
 | |
| 381 | slice(0, size - 2) | |
| 382 | } | |
| 383 |     else if (size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)) {
 | |
| 384 | slice(0, size - 1) | |
| 385 | } | |
| 386 | else this | |
| 387 | ||
| 388 | ||
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 389 | /* hash and equality */ | 
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 390 | |
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 391 | lazy val sha1_digest: SHA1.Digest = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 392 | if (is_empty) SHA1.digest_empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 393 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 394 |       SHA1.make_digest { sha =>
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 395 |         for (a <- subarray_iterator if a.length > 0) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 396 | sha.update(a.array, a.offset, a.length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 397 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 398 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 399 | } | 
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 400 | |
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 401 | override def hashCode(): Int = sha1_digest.hashCode() | 
| 54439 | 402 | |
| 75393 | 403 |   override def equals(that: Any): Boolean = {
 | 
| 54440 | 404 |     that match {
 | 
| 405 | case other: Bytes => | |
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 406 | if (this.eq(other)) true | 
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 407 | else if (size != other.size) false | 
| 80378 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 408 |         else {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 409 |           if (chunks.isEmpty && other.chunks.isEmpty) {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 410 | Arrays.equals(chunk0, offset.toInt, (offset + size).toInt, | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 411 | other.chunk0, other.offset.toInt, (other.offset + other.size).toInt) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 412 | } | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 413 |           else if (!is_sliced && !other.is_sliced) {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 414 | (subarray_iterator zip other.subarray_iterator) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 415 | .forall((a, b) => Arrays.equals(a.array, b.array)) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 416 | } | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 417 | else sha1_digest == other.sha1_digest | 
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 418 | } | 
| 54440 | 419 | case _ => false | 
| 420 | } | |
| 421 | } | |
| 422 | ||
| 54439 | 423 | |
| 424 | /* content */ | |
| 425 | ||
| 80385 | 426 | def + (other: Bytes): Bytes = | 
| 427 | if (other.is_empty) this | |
| 428 | else if (is_empty) other | |
| 429 |     else {
 | |
| 430 |       Bytes.Builder.use(hint = size + other.size) { builder =>
 | |
| 431 |         for (a <- subarray_iterator ++ other.subarray_iterator) {
 | |
| 432 | builder += a | |
| 433 | } | |
| 434 | } | |
| 435 | } | |
| 436 | ||
| 80368 | 437 |   def make_array: Array[Byte] = {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 438 | val buf = new ByteArrayOutputStream(small_size) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 439 |     for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 440 | buf.toByteArray | 
| 69365 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 441 | } | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 442 | |
| 80352 | 443 | def text: String = | 
| 444 | if (is_empty) "" | |
| 80379 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 445 |     else {
 | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 446 | var i = 0L | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 447 | var utf8 = false | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 448 |       while (i < size && !utf8) {
 | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 449 |         if (byte_unchecked(i) < 0) { utf8 = true }
 | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 450 | i += 1 | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 451 | } | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 452 | utf8 | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 453 | |
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 454 | if (utf8) UTF8.decode_permissive_bytes(this) | 
| 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 455 | else new String(make_array, UTF8.charset) | 
| 80352 | 456 | } | 
| 65279 
fa62e095d8f1
clarified signature (again, see also 3ed43cfc8b14);
 wenzelm parents: 
65070diff
changeset | 457 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 458 | def wellformed_text: Option[String] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 459 |     try {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 460 | val s = text | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 461 | if (this == Bytes(s)) Some(s) else None | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 462 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 463 |     catch { case ERROR(_) => None }
 | 
| 76236 | 464 | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 465 | |
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 466 | /* Base64 data representation */ | 
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 467 | |
| 80394 | 468 | def encode_base64: Bytes = | 
| 469 |     Bytes.Builder.use_stream(hint = (size * 1.5).round) { out =>
 | |
| 470 | using(Base64.encode_stream(out))(write_stream(_)) | |
| 471 | } | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 472 | |
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 473 | def decode_base64: Bytes = | 
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 474 | using(Base64.decode_stream(stream()))(Bytes.read_stream(_, hint = (size / 1.2).round)) | 
| 68094 | 475 | |
| 76236 | 476 | def maybe_encode_base64: (Boolean, String) = | 
| 477 |     wellformed_text match {
 | |
| 478 | case Some(s) => (false, s) | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 479 | case None => (true, encode_base64.text) | 
| 76236 | 480 | } | 
| 68106 | 481 | |
| 54440 | 482 | |
| 64004 | 483 | /* streams */ | 
| 484 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 485 | def stream(): InputStream = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 486 | if (chunks.isEmpty) new ByteArrayInputStream(chunk0, offset.toInt, size.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 487 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 488 |       new InputStream {
 | 
| 80367 | 489 | private var index = 0L | 
| 490 | ||
| 491 |         def read(): Int = {
 | |
| 492 |           if (index < size) {
 | |
| 493 | val res = byte_unchecked(index).toInt & 0xff | |
| 494 | index += 1 | |
| 495 | res | |
| 496 | } | |
| 497 | else -1 | |
| 498 | } | |
| 499 | ||
| 80388 | 500 |         override def read(buffer: Array[Byte], start: Int, length: Int): Int = {
 | 
| 501 | if (length < 16) super.read(buffer, start, length) | |
| 502 |           else {
 | |
| 503 | val index0 = index | |
| 504 | index = (index + length) min size | |
| 505 | val n = (index - index0).toInt | |
| 506 | if (n == 0) -1 | |
| 507 |             else {
 | |
| 508 | var i = start | |
| 509 |               for (a <- slice(index0, index).subarray_iterator) {
 | |
| 510 | val l = a.length | |
| 511 |                 if (l > 0) {
 | |
| 512 | System.arraycopy(a.array, a.offset, buffer, i, l) | |
| 513 | i += l | |
| 514 | } | |
| 515 | } | |
| 516 | n | |
| 517 | } | |
| 518 | } | |
| 519 | } | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 520 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 521 | } | 
| 64004 | 522 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 523 | def write_stream(stream: OutputStream): Unit = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 524 |     for (a <- subarray_iterator if a.length > 0) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 525 | stream.write(a.array, a.offset, a.length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 526 | } | 
| 64004 | 527 | |
| 528 | ||
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 529 | /* XZ / Zstd data compression */ | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 530 | |
| 76358 | 531 | def detect_xz: Boolean = | 
| 80357 
fe123d033e76
clarified signature: pro-forma support for Bytes with size: Long;
 wenzelm parents: 
80356diff
changeset | 532 | size >= 6 && | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 533 | byte_unchecked(0) == 0xFD.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 534 | byte_unchecked(1) == 0x37.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 535 | byte_unchecked(2) == 0x7A.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 536 | byte_unchecked(3) == 0x58.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 537 | byte_unchecked(4) == 0x5A.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 538 | byte_unchecked(5) == 0x00.toByte | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 539 | |
| 76358 | 540 | def detect_zstd: Boolean = | 
| 80357 
fe123d033e76
clarified signature: pro-forma support for Bytes with size: Long;
 wenzelm parents: 
80356diff
changeset | 541 | size >= 4 && | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 542 | byte_unchecked(0) == 0x28.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 543 | byte_unchecked(1) == 0xB5.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 544 | byte_unchecked(2) == 0x2F.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 545 | byte_unchecked(3) == 0xFD.toByte | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 546 | |
| 76358 | 547 | def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 548 | using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = size)) | 
| 76358 | 549 | |
| 550 |   def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
 | |
| 551 | Zstd.init() | |
| 80376 | 552 | using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = size)) | 
| 76358 | 553 | } | 
| 54440 | 554 | |
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 555 | def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes = | 
| 76358 | 556 | if (detect_xz) uncompress_xz(cache = cache) | 
| 557 | else if (detect_zstd) uncompress_zstd(cache = cache) | |
| 558 |     else error("Cannot detect compression scheme")
 | |
| 64004 | 559 | |
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 560 | def compress( | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 561 | options: Compress.Options = Compress.Options(), | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 562 | cache: Compress.Cache = Compress.Cache.none | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 563 |   ): Bytes = {
 | 
| 80394 | 564 |     Bytes.Builder.use_stream(hint = size) { out =>
 | 
| 565 | using( | |
| 566 |         options match {
 | |
| 567 | case options_xz: Compress.Options_XZ => | |
| 568 | new xz.XZOutputStream(out, options_xz.make, cache.for_xz) | |
| 569 | case options_zstd: Compress.Options_Zstd => | |
| 570 | new zstd.ZstdOutputStream(out, cache.for_zstd, options_zstd.level) | |
| 571 | } | |
| 572 |       ) { s => for (a <- subarray_iterator) s.write(a.array, a.offset, a.length) }
 | |
| 573 | } | |
| 64004 | 574 | } | 
| 68167 | 575 | |
| 75393 | 576 | def maybe_compress( | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 577 | options: Compress.Options = Compress.Options(), | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 578 | cache: Compress.Cache = Compress.Cache.none | 
| 75393 | 579 |   ) : (Boolean, Bytes) = {
 | 
| 68167 | 580 | val compressed = compress(options = options, cache = cache) | 
| 80357 
fe123d033e76
clarified signature: pro-forma support for Bytes with size: Long;
 wenzelm parents: 
80356diff
changeset | 581 | if (compressed.size < size) (true, compressed) else (false, this) | 
| 68167 | 582 | } | 
| 54439 | 583 | } |