| author | paulson <lp15@cam.ac.uk> | 
| Tue, 09 Jul 2024 21:13:14 +0100 | |
| changeset 80539 | 34a5ca6fcddd | 
| parent 80514 | 482897a69699 | 
| child 82142 | 508a673c87ac | 
| permissions | -rw-r--r-- | 
| 54439 | 1 | /* Title: Pure/General/bytes.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 80503 | 4 | Scalable byte strings, with incremental construction (via Builder). | 
| 54439 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 78855 | 10 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
 | 
| 80508 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 11 | InputStreamReader, InputStream, OutputStream, File => JFile} | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 12 | import java.nio.ByteBuffer | 
| 80492 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 13 | import java.nio.charset.StandardCharsets.ISO_8859_1 | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 14 | import java.nio.channels.FileChannel | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 15 | import java.nio.file.StandardOpenOption | 
| 77712 
dd4bb80dbc3a
tuned performance: much faster low-level operation;
 wenzelm parents: 
77711diff
changeset | 16 | import java.util.Arrays | 
| 76353 | 17 | import org.tukaani.xz | 
| 18 | import com.github.luben.zstd | |
| 54440 | 19 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 20 | import scala.collection.mutable.ArrayBuffer | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 21 | import scala.collection.mutable | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 22 | |
| 54440 | 23 | |
| 75393 | 24 | object Bytes {
 | 
| 80514 
482897a69699
tuned signature: expose internal limits for testing or add-on implementations;
 wenzelm parents: 
80513diff
changeset | 25 | /* internal limits */ | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 26 | |
| 80514 
482897a69699
tuned signature: expose internal limits for testing or add-on implementations;
 wenzelm parents: 
80513diff
changeset | 27 | val array_size: Long = Int.MaxValue - 8 // see java.io.InputStream.MAX_BUFFER_SIZE | 
| 
482897a69699
tuned signature: expose internal limits for testing or add-on implementations;
 wenzelm parents: 
80513diff
changeset | 28 | val string_size: Long = Int.MaxValue / 2 | 
| 
482897a69699
tuned signature: expose internal limits for testing or add-on implementations;
 wenzelm parents: 
80513diff
changeset | 29 | val block_size: Int = 16384 // see java.io.InputStream.DEFAULT_BUFFER_SIZE | 
| 
482897a69699
tuned signature: expose internal limits for testing or add-on implementations;
 wenzelm parents: 
80513diff
changeset | 30 | val chunk_size: Long = Space.MiB(100).bytes | 
| 80385 | 31 | |
| 80509 | 32 |   class Too_Large(size: Long, limit: Long) extends IndexOutOfBoundsException {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 33 | override def getMessage: String = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 34 | "Bytes too large for particular operation: " + | 
| 80509 | 35 | Space.bytes(size).print + " > " + Space.bytes(limit).print | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 36 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 37 | |
| 54439 | 38 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 39 | /* main constructors */ | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 40 | |
| 80512 | 41 | val empty: Bytes = new Bytes(None, new Array(0), 0L, 0L) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 42 | |
| 80492 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 43 | def raw(s: String): Bytes = | 
| 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 44 | if (s.isEmpty) empty | 
| 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 45 |     else Builder.use(hint = s.length) { builder => builder += s.getBytes(ISO_8859_1) }
 | 
| 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 46 | |
| 80491 | 47 | def apply(s: String): Bytes = | 
| 80373 | 48 | if (s.isEmpty) empty | 
| 49 |     else Builder.use(hint = s.length) { builder => builder += s }
 | |
| 54440 | 50 | |
| 63779 | 51 | def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length) | 
| 52 | ||
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 53 | def apply(a: Array[Byte], offset: Int, length: Int): Bytes = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 54 |     Builder.use(hint = length) { builder => builder += (a, offset, length) }
 | 
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 55 | |
| 69454 | 56 |   val newline: Bytes = apply("\n")
 | 
| 54440 | 57 | |
| 75579 | 58 | |
| 54440 | 59 | /* read */ | 
| 60 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 61 |   def read_stream(stream: InputStream, limit: Long = -1L, hint: Long = 0L): Bytes = {
 | 
| 64004 | 62 | if (limit == 0) empty | 
| 63 |     else {
 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 64 |       Builder.use(hint = if (limit > 0) limit else hint) { builder =>
 | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 65 | val buf_size = block_size | 
| 80384 | 66 | val buf = new Array[Byte](block_size) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 67 | var m = 0 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 68 | var n = 0L | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 69 |         while ({
 | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 70 | val l = if (limit > 0) (limit - n).min(buf_size).toInt else buf_size | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 71 | m = stream.read(buf, 0, l) | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 72 |           if (m > 0) {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 73 | builder += (buf, 0, m) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 74 | n += m | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 75 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 76 | m != -1 && (limit < 0 || limit > n) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 77 | }) () | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 78 | } | 
| 54440 | 79 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 80 | } | 
| 64001 
7ecb22be8f03
more general read_stream: return actual byte count;
 wenzelm parents: 
63779diff
changeset | 81 | |
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79509diff
changeset | 82 | def read_url(name: String): Bytes = using(Url(name).open_stream())(read_stream(_)) | 
| 77717 | 83 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 84 |   def read_file(path: Path, offset: Long = 0L, limit: Long = -1L): Bytes = {
 | 
| 78956 | 85 | val length = File.size(path) | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 86 | val start = offset.max(0L) | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 87 | val len = (length - start).max(0L).min(if (limit < 0) Long.MaxValue else limit) | 
| 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 88 | if (len == 0L) empty | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 89 |     else {
 | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 90 |       Builder.use(hint = len) { builder =>
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 91 |         using(FileChannel.open(path.java_path, StandardOpenOption.READ)) { channel =>
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 92 | channel.position(start) | 
| 80473 
09afc244bb5e
proper limit for read operation (amending ac4d53bc8f6b);
 wenzelm parents: 
80428diff
changeset | 93 | val buf_size = block_size | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 94 | val buf = ByteBuffer.allocate(buf_size) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 95 | var m = 0 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 96 | var n = 0L | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 97 |           while ({
 | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 98 | val l = (len - n).min(buf_size).toInt | 
| 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 99 | buf.limit(l) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 100 | m = channel.read(buf) | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 101 |             if (m > 0) {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 102 | builder += (buf.array(), 0, m) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 103 | buf.clear() | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 104 | n += m | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 105 | } | 
| 80474 
66ebeae3acdc
misc tuning: more uniform read_stream vs. read_file;
 wenzelm parents: 
80473diff
changeset | 106 | m != -1 && len > n | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 107 | }) () | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 108 | } | 
| 77711 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 109 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 110 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 111 | } | 
| 
25fd62cba347
clarified signature: more general operation Bytes.read_slice;
 wenzelm parents: 
76361diff
changeset | 112 | |
| 78953 | 113 | def read(path: Path): Bytes = read_file(path) | 
| 114 | def read(file: JFile): Bytes = read_file(File.path(file)) | |
| 77718 | 115 | |
| 64229 | 116 | |
| 117 | /* write */ | |
| 118 | ||
| 69393 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 119 | def write(file: JFile, bytes: Bytes): Unit = | 
| 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 120 | using(new FileOutputStream(file))(bytes.write_stream(_)) | 
| 64229 | 121 | |
| 122 | def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes) | |
| 78194 | 123 | |
| 124 | ||
| 125 | /* append */ | |
| 126 | ||
| 127 | def append(file: JFile, bytes: Bytes): Unit = | |
| 128 | using(new FileOutputStream(file, true))(bytes.write_stream(_)) | |
| 129 | ||
| 130 | def append(path: Path, bytes: Bytes): Unit = append(path.file, bytes) | |
| 80351 | 131 | |
| 132 | ||
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 133 | /* incremental builder: unsynchronized! */ | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 134 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 135 |   object Builder {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 136 |     def use(hint: Long = 0L)(body: Builder => Unit): Bytes = {
 | 
| 80375 | 137 | val builder = new Builder(hint) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 138 | body(builder) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 139 | builder.done() | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 140 | } | 
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 141 | |
| 80394 | 142 |     def use_stream(hint: Long = 0L)(body: OutputStream => Unit): Bytes = {
 | 
| 143 | val stream = new Stream(hint = hint) | |
| 144 | body(stream) | |
| 145 | stream.builder.done() | |
| 146 | } | |
| 147 | ||
| 148 |     private class Stream(hint: Long = 0L) extends OutputStream {
 | |
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 149 | val builder = new Builder(hint) | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 150 | |
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 151 | override def write(b: Int): Unit = | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 152 |         { builder += b.toByte }
 | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 153 | |
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 154 | override def write(array: Array[Byte], offset: Int, length: Int): Unit = | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 155 |         { builder += (array, offset, length) }
 | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 156 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 157 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 158 | |
| 80375 | 159 |   final class Builder private[Bytes](hint: Long) {
 | 
| 80485 | 160 | builder => | 
| 161 | ||
| 80375 | 162 | private var chunks = | 
| 163 | new ArrayBuffer[Array[Byte]](if (hint <= 0) 16 else (hint / chunk_size).toInt) | |
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 164 | |
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 165 | private var buffer_list: mutable.ListBuffer[Array[Byte]] = null | 
| 80375 | 166 | private var buffer = | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 167 | new Array[Byte](if (hint <= 0) 1024 else (hint min chunk_size min array_size).toInt) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 168 | private var buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 169 | private var buffer_total = 0 | 
| 80370 | 170 | |
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 171 | private def buffer_content(): Array[Byte] = | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 172 |       if (buffer_list != null) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 173 | val array = new Array[Byte](buffer_total) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 174 | var i = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 175 |         for (b <- buffer_list) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 176 | val n = b.length | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 177 | System.arraycopy(b, 0, array, i, n) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 178 | i += n | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 179 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 180 | System.arraycopy(buffer, 0, array, i, buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 181 | array | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 182 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 183 | else if (buffer_index == buffer.length) buffer else Arrays.copyOf(buffer, buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 184 | |
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 185 | private def buffer_check(request: Int = 0): Unit = | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 186 |       if (buffer_index == buffer.length) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 187 |         if (buffer_total == chunk_size) {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 188 | chunks += buffer_content() | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 189 | buffer_list = null | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 190 | buffer = new Array[Byte](chunk_size.toInt) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 191 | buffer_total = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 192 | buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 193 | } | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 194 |         else {
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 195 |           if (buffer_list == null) { buffer_list = new mutable.ListBuffer }
 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 196 | buffer_list += buffer | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 197 | buffer_index = 0 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 198 | val limit = (chunk_size - buffer_total).toInt | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 199 | buffer = new Array[Byte]((buffer_total max request) min limit) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 200 | } | 
| 80370 | 201 | } | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 202 | |
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 203 |     def += (b: Byte): Unit = {
 | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 204 | buffer(buffer_index) = b | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 205 | buffer_total += 1 | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 206 | buffer_index += 1 | 
| 80377 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 207 | buffer_check() | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 208 | } | 
| 
28dd9b91dfe5
more scalable compression, using Bytes.Builder.Stream;
 wenzelm parents: 
80376diff
changeset | 209 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 210 |     def += (array: Array[Byte], offset: Int, length: Int): Unit = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 211 |       if (offset < 0 || length < 0 || offset.toLong + length.toLong > array.length) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 212 | throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 213 | } | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 214 |       else {
 | 
| 80380 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 215 | var i = offset | 
| 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 216 | var n = length | 
| 
94d903234f6b
Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
 wenzelm parents: 
80379diff
changeset | 217 |         while (n > 0) {
 | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 218 | val l = n min (buffer.length - buffer_index) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 219 | System.arraycopy(array, i, buffer, buffer_index, l) | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 220 | buffer_total += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 221 | buffer_index += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 222 | i += l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 223 | n -= l | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 224 | buffer_check(request = n) | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 225 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 226 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 227 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 228 | |
| 80491 | 229 | def += (s: String): Unit = | 
| 230 |       if (s.length > 0) { builder += UTF8.bytes(s) }
 | |
| 80428 | 231 | |
| 80485 | 232 |     def += (array: Array[Byte]): Unit = { builder += (array, 0, array.length) }
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 233 | |
| 80485 | 234 |     def += (a: Subarray): Unit = { builder += (a.array, a.offset, a.length) }
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 235 | |
| 80394 | 236 |     private def done(): Bytes = {
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 237 | val cs = chunks.toArray | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 238 | val b = buffer_content() | 
| 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 239 |       val size = cs.foldLeft(b.length.toLong)({ case (n, a) => n + a.length })
 | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 240 | chunks = null | 
| 80426 
7d2922f0ae2b
performance tuning: multi-stage buffer with fewer array copies;
 wenzelm parents: 
80394diff
changeset | 241 | buffer_list = null | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 242 | buffer = null | 
| 80513 | 243 | if (size == 0) empty | 
| 244 | else new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size) | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 245 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 246 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 247 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 248 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 249 | /* subarray */ | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 250 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 251 |   object Subarray {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 252 | val empty: Subarray = new Subarray(new Array[Byte](0), 0, 0) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 253 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 254 |     def apply(array: Array[Byte], offset: Int, length: Int): Subarray = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 255 | val n = array.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 256 |       if (0 <= offset && offset < n && 0 <= length && offset + length <= n) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 257 | if (length == 0) empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 258 | else new Subarray(array, offset, length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 259 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 260 | else throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 261 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 262 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 263 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 264 | final class Subarray private( | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 265 | val array: Array[Byte], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 266 | val offset: Int, | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 267 | val length: Int | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 268 |   ) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 269 |     override def toString: String = "Bytes.Subarray(" + Space.bytes(length).print + ")"
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 270 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 271 | def byte_iterator: Iterator[Byte] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 272 | if (length == 0) Iterator.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 273 |       else { for (i <- (offset until (offset + length)).iterator) yield array(i) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 274 | } | 
| 54439 | 275 | } | 
| 276 | ||
| 277 | final class Bytes private( | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 278 | protected val chunks: Option[Array[Array[Byte]]], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 279 | protected val chunk0: Array[Byte], | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 280 | protected val offset: Long, | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 281 | val size: Long | 
| 80492 
43323d886ea3
clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
 wenzelm parents: 
80491diff
changeset | 282 | ) extends YXML.Source {
 | 
| 80485 | 283 | bytes => | 
| 284 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 285 | assert( | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 286 | (chunks.isEmpty || | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 287 | chunks.get.nonEmpty && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 288 | chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 289 | chunk0.length < Bytes.chunk_size) | 
| 80361 | 290 | |
| 80480 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 291 | override def is_empty: Boolean = size == 0 | 
| 80361 | 292 | |
| 80485 | 293 | def proper: Option[Bytes] = if (is_empty) None else Some(bytes) | 
| 80385 | 294 | |
| 80361 | 295 | def is_sliced: Boolean = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 296 |     offset != 0L || {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 297 |       chunks match {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 298 | case None => size != chunk0.length | 
| 80511 | 299 | case Some(cs) => | 
| 300 | val physical_size = cs.foldLeft(chunk0.length.toLong)((n, c) => n + c.length) | |
| 301 | size != physical_size | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 302 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 303 | } | 
| 80361 | 304 | |
| 305 | override def toString: String = | |
| 306 | if (is_empty) "Bytes.empty" | |
| 307 |     else "Bytes(" + Space.bytes(size).print + if_proper(is_sliced, ", sliced") + ")"
 | |
| 308 | ||
| 309 | ||
| 310 | /* elements: signed Byte or unsigned Char */ | |
| 311 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 312 |   protected def byte_unchecked(i: Long): Byte = {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 313 | val a = offset + i | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 314 |     chunks match {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 315 | case None => chunk0(a.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 316 | case Some(cs) => | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 317 | val b = a % Bytes.chunk_size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 318 | val c = a / Bytes.chunk_size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 319 | if (c < cs.length) cs(c.toInt)(b.toInt) else chunk0(b.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 320 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 321 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 322 | |
| 80494 | 323 | // signed byte | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 324 | def byte(i: Long): Byte = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 325 | if (0 <= i && i < size) byte_unchecked(i) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 326 | else throw new IndexOutOfBoundsException | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 327 | |
| 80494 | 328 | // unsigned char | 
| 329 | def char(i: Long): Char = (byte(i).toInt & 0xff).toChar | |
| 80361 | 330 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 331 | protected def subarray_iterator: Iterator[Bytes.Subarray] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 332 | if (is_empty) Iterator.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 333 | else if (chunks.isEmpty) Iterator(Bytes.Subarray(chunk0, offset.toInt, size.toInt)) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 334 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 335 | val end_offset = offset + size | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 336 |       for ((array, index) <- (chunks.get.iterator ++ Iterator(chunk0)).zipWithIndex) yield {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 337 | val array_start = Bytes.chunk_size * index | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 338 | val array_stop = array_start + array.length | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 339 |         if (offset < array_stop && array_start < end_offset) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 340 | val i = (array_start max offset) - array_start | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 341 | val j = (array_stop min end_offset) - array_start | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 342 | Bytes.Subarray(array, i.toInt, (j - i).toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 343 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 344 | else Bytes.Subarray.empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 345 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 346 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 347 | |
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 348 | def byte_iterator: Iterator[Byte] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 349 |     for {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 350 | a <- subarray_iterator | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 351 | b <- a.byte_iterator | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 352 | } yield b | 
| 80361 | 353 | |
| 354 | ||
| 80385 | 355 | /* slice */ | 
| 356 | ||
| 357 | def slice(i: Long, j: Long): Bytes = | |
| 358 |     if (0 <= i && i <= j && j <= size) {
 | |
| 359 | if (i == j) Bytes.empty | |
| 360 | else new Bytes(chunks, chunk0, offset + i, j - i) | |
| 361 | } | |
| 362 | else throw new IndexOutOfBoundsException | |
| 363 | ||
| 364 | def unslice: Bytes = | |
| 365 |     if (is_sliced) {
 | |
| 366 |       Bytes.Builder.use(hint = size) { builder =>
 | |
| 367 |         for (a <- subarray_iterator) { builder += a }
 | |
| 368 | } | |
| 369 | } | |
| 80485 | 370 | else bytes | 
| 80385 | 371 | |
| 80493 | 372 | def terminated_line: Boolean = | 
| 373 | size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10) | |
| 374 | ||
| 80385 | 375 | def trim_line: Bytes = | 
| 376 |     if (size >= 2 && byte_unchecked(size - 2) == 13 && byte_unchecked(size - 1) == 10) {
 | |
| 377 | slice(0, size - 2) | |
| 378 | } | |
| 379 |     else if (size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)) {
 | |
| 380 | slice(0, size - 1) | |
| 381 | } | |
| 80485 | 382 | else bytes | 
| 80385 | 383 | |
| 384 | ||
| 80479 | 385 | /* separated chunks */ | 
| 386 | ||
| 387 | def separated_chunks(sep: Byte): Iterator[Bytes] = | |
| 388 |     new Iterator[Bytes] {
 | |
| 389 | private var start: Long = -1 | |
| 390 | private var stop: Long = -1 | |
| 391 | ||
| 392 |       private def end(i: Long): Long = {
 | |
| 393 | var j = i | |
| 80485 | 394 |         while (j < bytes.size && byte_unchecked(j) != sep) { j += 1 }
 | 
| 80479 | 395 | j | 
| 396 | } | |
| 397 | ||
| 398 | // init | |
| 80485 | 399 |       if (!bytes.is_empty) { start = 0; stop = end(0) }
 | 
| 80479 | 400 | |
| 401 | def hasNext: Boolean = | |
| 80485 | 402 | 0 <= start && start <= stop && stop <= bytes.size | 
| 80479 | 403 | |
| 404 | def next(): Bytes = | |
| 405 |         if (hasNext) {
 | |
| 80485 | 406 | val chunk = bytes.slice(start, stop) | 
| 407 |           if (stop < bytes.size) { start = stop + 1; stop = end(start) }
 | |
| 80479 | 408 |           else { start = -1; stop = -1 }
 | 
| 409 | chunk | |
| 410 | } | |
| 411 | else throw new NoSuchElementException | |
| 412 | } | |
| 413 | ||
| 414 | def space_explode(sep: Byte): List[Bytes] = separated_chunks(sep).toList | |
| 415 | ||
| 416 | def split_lines: List[Bytes] = space_explode(10) | |
| 417 | ||
| 80480 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 418 | // YXML.Source operations | 
| 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 419 | override def is_X: Boolean = size == 1 && byte_unchecked(0) == YXML.X_byte | 
| 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 420 | override def is_Y: Boolean = size == 1 && byte_unchecked(0) == YXML.Y_byte | 
| 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 421 | override def iterator_X: Iterator[Bytes] = separated_chunks(YXML.X_byte) | 
| 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 422 | override def iterator_Y: Iterator[Bytes] = separated_chunks(YXML.Y_byte) | 
| 
972f7a4cdc0e
clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
 wenzelm parents: 
80479diff
changeset | 423 | |
| 80479 | 424 | |
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 425 | /* hash and equality */ | 
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 426 | |
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 427 | lazy val sha1_digest: SHA1.Digest = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 428 | if (is_empty) SHA1.digest_empty | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 429 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 430 |       SHA1.make_digest { sha =>
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 431 |         for (a <- subarray_iterator if a.length > 0) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 432 | sha.update(a.array, a.offset, a.length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 433 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 434 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 435 | } | 
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 436 | |
| 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 437 | override def hashCode(): Int = sha1_digest.hashCode() | 
| 54439 | 438 | |
| 75393 | 439 |   override def equals(that: Any): Boolean = {
 | 
| 54440 | 440 |     that match {
 | 
| 441 | case other: Bytes => | |
| 80485 | 442 | if (bytes.eq(other)) true | 
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 443 | else if (size != other.size) false | 
| 80378 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 444 |         else {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 445 |           if (chunks.isEmpty && other.chunks.isEmpty) {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 446 | Arrays.equals(chunk0, offset.toInt, (offset + size).toInt, | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 447 | other.chunk0, other.offset.toInt, (other.offset + other.size).toInt) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 448 | } | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 449 |           else if (!is_sliced && !other.is_sliced) {
 | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 450 | (subarray_iterator zip other.subarray_iterator) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 451 | .forall((a, b) => Arrays.equals(a.array, b.array)) | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 452 | } | 
| 
ab4badc7db7f
more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
 wenzelm parents: 
80377diff
changeset | 453 | else sha1_digest == other.sha1_digest | 
| 80363 
306f273c91ec
clarified hash and equality: depend on sha1 digest to be collision-free;
 wenzelm parents: 
80362diff
changeset | 454 | } | 
| 54440 | 455 | case _ => false | 
| 456 | } | |
| 457 | } | |
| 458 | ||
| 54439 | 459 | |
| 460 | /* content */ | |
| 461 | ||
| 80385 | 462 | def + (other: Bytes): Bytes = | 
| 80485 | 463 | if (other.is_empty) bytes | 
| 80385 | 464 | else if (is_empty) other | 
| 465 |     else {
 | |
| 466 |       Bytes.Builder.use(hint = size + other.size) { builder =>
 | |
| 467 |         for (a <- subarray_iterator ++ other.subarray_iterator) {
 | |
| 468 | builder += a | |
| 469 | } | |
| 470 | } | |
| 471 | } | |
| 472 | ||
| 80368 | 473 |   def make_array: Array[Byte] = {
 | 
| 80509 | 474 | val n = | 
| 475 | if (size <= Bytes.array_size) size.toInt | |
| 476 | else throw new Bytes.Too_Large(size, Bytes.array_size) | |
| 477 | val buf = new ByteArrayOutputStream(n) | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 478 |     for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 479 | buf.toByteArray | 
| 69365 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 480 | } | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 481 | |
| 80508 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 482 | def text: String = | 
| 80352 | 483 | if (is_empty) "" | 
| 80379 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 484 |     else {
 | 
| 80508 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 485 | val reader = new InputStreamReader(stream(), UTF8.charset) | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 486 | val buf = new Array[Char]((size min Bytes.string_size).toInt + 1) | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 487 | var m = 0 | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 488 | var n = 0 | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 489 |       while (m >= 0 && n < buf.length) {
 | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 490 | m = reader.read(buf, n, (buf.length - n) min Bytes.block_size) | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 491 |         if (m > 0) { n += m }
 | 
| 80379 
1f1ce661c71c
notable performance tuning: avoid overhead of higher-order functions;
 wenzelm parents: 
80378diff
changeset | 492 | } | 
| 80508 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 493 | require(m == -1, "Malformed UTF-8 string: overlong result") | 
| 
8585399f26f6
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
 wenzelm parents: 
80503diff
changeset | 494 | new String(buf, 0, n) | 
| 80352 | 495 | } | 
| 65279 
fa62e095d8f1
clarified signature (again, see also 3ed43cfc8b14);
 wenzelm parents: 
65070diff
changeset | 496 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 497 | def wellformed_text: Option[String] = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 498 |     try {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 499 | val s = text | 
| 80485 | 500 | if (bytes == Bytes(s)) Some(s) else None | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 501 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 502 |     catch { case ERROR(_) => None }
 | 
| 76236 | 503 | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 504 | |
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 505 | /* Base64 data representation */ | 
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 506 | |
| 80394 | 507 | def encode_base64: Bytes = | 
| 508 |     Bytes.Builder.use_stream(hint = (size * 1.5).round) { out =>
 | |
| 509 | using(Base64.encode_stream(out))(write_stream(_)) | |
| 510 | } | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 511 | |
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 512 | def decode_base64: Bytes = | 
| 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 513 | using(Base64.decode_stream(stream()))(Bytes.read_stream(_, hint = (size / 1.2).round)) | 
| 68094 | 514 | |
| 76236 | 515 | def maybe_encode_base64: (Boolean, String) = | 
| 516 |     wellformed_text match {
 | |
| 517 | case Some(s) => (false, s) | |
| 80393 
6138c5b803be
Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
 wenzelm parents: 
80389diff
changeset | 518 | case None => (true, encode_base64.text) | 
| 76236 | 519 | } | 
| 68106 | 520 | |
| 54440 | 521 | |
| 64004 | 522 | /* streams */ | 
| 523 | ||
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 524 | def stream(): InputStream = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 525 | if (chunks.isEmpty) new ByteArrayInputStream(chunk0, offset.toInt, size.toInt) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 526 |     else {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 527 |       new InputStream {
 | 
| 80367 | 528 | private var index = 0L | 
| 529 | ||
| 530 |         def read(): Int = {
 | |
| 531 |           if (index < size) {
 | |
| 532 | val res = byte_unchecked(index).toInt & 0xff | |
| 533 | index += 1 | |
| 534 | res | |
| 535 | } | |
| 536 | else -1 | |
| 537 | } | |
| 538 | ||
| 80388 | 539 |         override def read(buffer: Array[Byte], start: Int, length: Int): Int = {
 | 
| 540 | if (length < 16) super.read(buffer, start, length) | |
| 541 |           else {
 | |
| 542 | val index0 = index | |
| 543 | index = (index + length) min size | |
| 544 | val n = (index - index0).toInt | |
| 545 | if (n == 0) -1 | |
| 546 |             else {
 | |
| 547 | var i = start | |
| 548 |               for (a <- slice(index0, index).subarray_iterator) {
 | |
| 549 | val l = a.length | |
| 550 |                 if (l > 0) {
 | |
| 551 | System.arraycopy(a.array, a.offset, buffer, i, l) | |
| 552 | i += l | |
| 553 | } | |
| 554 | } | |
| 555 | n | |
| 556 | } | |
| 557 | } | |
| 558 | } | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 559 | } | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 560 | } | 
| 64004 | 561 | |
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 562 | def write_stream(stream: OutputStream): Unit = | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 563 |     for (a <- subarray_iterator if a.length > 0) {
 | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 564 | stream.write(a.array, a.offset, a.length) | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 565 | } | 
| 64004 | 566 | |
| 567 | ||
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 568 | /* XZ / Zstd data compression */ | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 569 | |
| 76358 | 570 | def detect_xz: Boolean = | 
| 80357 
fe123d033e76
clarified signature: pro-forma support for Bytes with size: Long;
 wenzelm parents: 
80356diff
changeset | 571 | size >= 6 && | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 572 | byte_unchecked(0) == 0xFD.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 573 | byte_unchecked(1) == 0x37.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 574 | byte_unchecked(2) == 0x7A.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 575 | byte_unchecked(3) == 0x58.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 576 | byte_unchecked(4) == 0x5A.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 577 | byte_unchecked(5) == 0x00.toByte | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 578 | |
| 76358 | 579 | def detect_zstd: Boolean = | 
| 80357 
fe123d033e76
clarified signature: pro-forma support for Bytes with size: Long;
 wenzelm parents: 
80356diff
changeset | 580 | size >= 4 && | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 581 | byte_unchecked(0) == 0x28.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 582 | byte_unchecked(1) == 0xB5.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 583 | byte_unchecked(2) == 0x2F.toByte && | 
| 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 584 | byte_unchecked(3) == 0xFD.toByte | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 585 | |
| 76358 | 586 | def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes = | 
| 80366 
ac4d53bc8f6b
support large byte arrays, using multiple "chunks";
 wenzelm parents: 
80364diff
changeset | 587 | using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = size)) | 
| 76358 | 588 | |
| 589 |   def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
 | |
| 590 | Zstd.init() | |
| 80376 | 591 | using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = size)) | 
| 76358 | 592 | } | 
| 54440 | 593 | |
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 594 | def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes = | 
| 76358 | 595 | if (detect_xz) uncompress_xz(cache = cache) | 
| 596 | else if (detect_zstd) uncompress_zstd(cache = cache) | |
| 597 |     else error("Cannot detect compression scheme")
 | |
| 64004 | 598 | |
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 599 | def compress( | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 600 | options: Compress.Options = Compress.Options(), | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 601 | cache: Compress.Cache = Compress.Cache.none | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 602 |   ): Bytes = {
 | 
| 80394 | 603 |     Bytes.Builder.use_stream(hint = size) { out =>
 | 
| 604 | using( | |
| 605 |         options match {
 | |
| 606 | case options_xz: Compress.Options_XZ => | |
| 607 | new xz.XZOutputStream(out, options_xz.make, cache.for_xz) | |
| 608 | case options_zstd: Compress.Options_Zstd => | |
| 609 | new zstd.ZstdOutputStream(out, cache.for_zstd, options_zstd.level) | |
| 610 | } | |
| 611 |       ) { s => for (a <- subarray_iterator) s.write(a.array, a.offset, a.length) }
 | |
| 612 | } | |
| 64004 | 613 | } | 
| 68167 | 614 | |
| 75393 | 615 | def maybe_compress( | 
| 76351 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 616 | options: Compress.Options = Compress.Options(), | 
| 
2cee31cd92f0
generic support for XZ and Zstd compression in Isabelle/Scala;
 wenzelm parents: 
76350diff
changeset | 617 | cache: Compress.Cache = Compress.Cache.none | 
| 75393 | 618 |   ) : (Boolean, Bytes) = {
 | 
| 68167 | 619 | val compressed = compress(options = options, cache = cache) | 
| 80485 | 620 | if (compressed.size < size) (true, compressed) else (false, bytes) | 
| 68167 | 621 | } | 
| 54439 | 622 | } |