| author | paulson | 
| Mon, 08 Nov 2021 09:31:26 +0000 | |
| changeset 74730 | 25f5f1fa31bb | 
| parent 73576 | b50f8cc8c08e | 
| child 75382 | 81673c441ce3 | 
| permissions | -rw-r--r-- | 
| 54439 | 1 | /* Title: Pure/General/bytes.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 4 | Immutable byte vectors versus UTF8 strings. | |
| 5 | */ | |
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 64004 | 10 | import java.io.{File => JFile, ByteArrayOutputStream, ByteArrayInputStream,
 | 
| 64229 | 11 | OutputStream, InputStream, FileInputStream, FileOutputStream} | 
| 65070 | 12 | import java.net.URL | 
| 68094 | 13 | import java.util.Base64 | 
| 64004 | 14 | |
| 15 | import org.tukaani.xz.{XZInputStream, XZOutputStream}
 | |
| 54440 | 16 | |
| 17 | ||
| 54439 | 18 | object Bytes | 
| 19 | {
 | |
| 20 | val empty: Bytes = new Bytes(Array[Byte](), 0, 0) | |
| 21 | ||
| 22 | def apply(s: CharSequence): Bytes = | |
| 23 |   {
 | |
| 24 | val str = s.toString | |
| 25 | if (str.isEmpty) empty | |
| 26 |     else {
 | |
| 62527 | 27 | val b = UTF8.bytes(str) | 
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 28 | new Bytes(b, 0, b.length) | 
| 54439 | 29 | } | 
| 30 | } | |
| 54440 | 31 | |
| 63779 | 32 | def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length) | 
| 33 | ||
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 34 | def apply(a: Array[Byte], offset: Int, length: Int): Bytes = | 
| 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 35 | if (length == 0) empty | 
| 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 36 |     else {
 | 
| 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 37 | val b = new Array[Byte](length) | 
| 55618 | 38 | System.arraycopy(a, offset, b, 0, length) | 
| 54442 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 39 | new Bytes(b, 0, b.length) | 
| 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 40 | } | 
| 
c39972ddd672
more specific Protocol_Output: empty message.body, main content via bytes/text;
 wenzelm parents: 
54440diff
changeset | 41 | |
| 69454 | 42 |   val newline: Bytes = apply("\n")
 | 
| 54440 | 43 | |
| 73576 | 44 | |
| 45 | /* base64 */ | |
| 46 | ||
| 68108 | 47 | def base64(s: String): Bytes = | 
| 48 |   {
 | |
| 49 | val a = Base64.getDecoder.decode(s) | |
| 50 | new Bytes(a, 0, a.length) | |
| 51 | } | |
| 52 | ||
| 73576 | 53 |   object Decode_Base64 extends Scala.Fun("decode_base64")
 | 
| 54 |   {
 | |
| 55 | val here = Scala_Project.here | |
| 56 | def invoke(args: List[Bytes]): List[Bytes] = | |
| 57 | List(base64(Library.the_single(args).text)) | |
| 58 | } | |
| 59 | ||
| 60 |   object Encode_Base64 extends Scala.Fun("encode_base64")
 | |
| 61 |   {
 | |
| 62 | val here = Scala_Project.here | |
| 63 | def invoke(args: List[Bytes]): List[Bytes] = | |
| 64 | List(Bytes(Library.the_single(args).base64)) | |
| 65 | } | |
| 66 | ||
| 68087 | 67 | |
| 54440 | 68 | /* read */ | 
| 69 | ||
| 64005 
f6e965cf1617
clarified magic values (see also java/io/BufferedInputStream.java);
 wenzelm parents: 
64004diff
changeset | 70 | def read_stream(stream: InputStream, limit: Int = Integer.MAX_VALUE, hint: Int = 1024): Bytes = | 
| 64004 | 71 | if (limit == 0) empty | 
| 72 |     else {
 | |
| 73414 | 73 | val out_size = (if (limit == Integer.MAX_VALUE) hint else limit) max 1024 | 
| 74 | val out = new ByteArrayOutputStream(out_size) | |
| 64005 
f6e965cf1617
clarified magic values (see also java/io/BufferedInputStream.java);
 wenzelm parents: 
64004diff
changeset | 75 | val buf = new Array[Byte](8192) | 
| 64004 | 76 | var m = 0 | 
| 54440 | 77 | |
| 78 |       do {
 | |
| 73554 | 79 | m = stream.read(buf, 0, buf.length min (limit - out.size)) | 
| 64004 | 80 | if (m != -1) out.write(buf, 0, m) | 
| 81 | } while (m != -1 && limit > out.size) | |
| 82 | ||
| 83 | new Bytes(out.toByteArray, 0, out.size) | |
| 54440 | 84 | } | 
| 64001 
7ecb22be8f03
more general read_stream: return actual byte count;
 wenzelm parents: 
63779diff
changeset | 85 | |
| 
7ecb22be8f03
more general read_stream: return actual byte count;
 wenzelm parents: 
63779diff
changeset | 86 | def read(file: JFile): Bytes = | 
| 71152 
f2d848a596d1
more robust: file length can be invalid in odd situations;
 wenzelm parents: 
71151diff
changeset | 87 |   {
 | 
| 
f2d848a596d1
more robust: file length can be invalid in odd situations;
 wenzelm parents: 
71151diff
changeset | 88 | val length = file.length | 
| 
f2d848a596d1
more robust: file length can be invalid in odd situations;
 wenzelm parents: 
71151diff
changeset | 89 | val limit = if (length < 0 || length > Integer.MAX_VALUE) Integer.MAX_VALUE else length.toInt | 
| 
f2d848a596d1
more robust: file length can be invalid in odd situations;
 wenzelm parents: 
71151diff
changeset | 90 | using(new FileInputStream(file))(read_stream(_, limit = limit)) | 
| 
f2d848a596d1
more robust: file length can be invalid in odd situations;
 wenzelm parents: 
71151diff
changeset | 91 | } | 
| 64229 | 92 | |
| 93 | def read(path: Path): Bytes = read(path.file) | |
| 94 | ||
| 65070 | 95 | def read(url: URL): Bytes = using(url.openStream)(read_stream(_)) | 
| 96 | ||
| 64229 | 97 | |
| 98 | /* write */ | |
| 99 | ||
| 69393 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 100 | def write(file: JFile, bytes: Bytes): Unit = | 
| 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
69365diff
changeset | 101 | using(new FileOutputStream(file))(bytes.write_stream(_)) | 
| 64229 | 102 | |
| 103 | def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes) | |
| 54439 | 104 | } | 
| 105 | ||
| 106 | final class Bytes private( | |
| 107 | protected val bytes: Array[Byte], | |
| 108 | protected val offset: Int, | |
| 60833 | 109 | val length: Int) extends CharSequence | 
| 54439 | 110 | {
 | 
| 111 | /* equality */ | |
| 112 | ||
| 54440 | 113 | override def equals(that: Any): Boolean = | 
| 114 |   {
 | |
| 115 |     that match {
 | |
| 116 | case other: Bytes => | |
| 117 | if (this eq other) true | |
| 118 | else if (length != other.length) false | |
| 119 | else (0 until length).forall(i => bytes(offset + i) == other.bytes(other.offset + i)) | |
| 120 | case _ => false | |
| 121 | } | |
| 122 | } | |
| 123 | ||
| 54439 | 124 | private lazy val hash: Int = | 
| 125 |   {
 | |
| 126 | var h = 0 | |
| 127 |     for (i <- offset until offset + length) {
 | |
| 128 | val b = bytes(i).asInstanceOf[Int] & 0xFF | |
| 129 | h = 31 * h + b | |
| 130 | } | |
| 131 | h | |
| 132 | } | |
| 133 | ||
| 134 | override def hashCode(): Int = hash | |
| 135 | ||
| 136 | ||
| 137 | /* content */ | |
| 138 | ||
| 54512 | 139 | lazy val sha1_digest: SHA1.Digest = SHA1.digest(bytes) | 
| 54440 | 140 | |
| 69448 | 141 | def is_empty: Boolean = length == 0 | 
| 142 | ||
| 143 | def iterator: Iterator[Byte] = | |
| 144 | for (i <- (offset until (offset + length)).iterator) | |
| 145 | yield bytes(i) | |
| 146 | ||
| 69365 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 147 | def array: Array[Byte] = | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 148 |   {
 | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 149 | val a = new Array[Byte](length) | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 150 | System.arraycopy(bytes, offset, a, 0, length) | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 151 | a | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 152 | } | 
| 
c5b3860d29ef
avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
 wenzelm parents: 
68167diff
changeset | 153 | |
| 73561 
c83152933579
clarified signature: Bytes extends CharSequence already (see d201996f72a8);
 wenzelm parents: 
73559diff
changeset | 154 | def text: String = UTF8.decode_permissive(this) | 
| 65279 
fa62e095d8f1
clarified signature (again, see also 3ed43cfc8b14);
 wenzelm parents: 
65070diff
changeset | 155 | |
| 68094 | 156 | def base64: String = | 
| 157 |   {
 | |
| 158 | val b = | |
| 159 | if (offset == 0 && length == bytes.length) bytes | |
| 160 | else Bytes(bytes, offset, length).bytes | |
| 161 | Base64.getEncoder.encodeToString(b) | |
| 162 | } | |
| 163 | ||
| 68106 | 164 | def maybe_base64: (Boolean, String) = | 
| 165 |   {
 | |
| 166 | val s = text | |
| 167 | if (this == Bytes(s)) (false, s) else (true, base64) | |
| 168 | } | |
| 169 | ||
| 68150 
f0f34cbed539
clarified output: avoid costly operations on huge blobs;
 wenzelm parents: 
68149diff
changeset | 170 |   override def toString: String = "Bytes(" + length + ")"
 | 
| 54439 | 171 | |
| 72885 | 172 | def proper: Option[Bytes] = if (is_empty) None else Some(this) | 
| 173 | def proper_text: Option[String] = if (is_empty) None else Some(text) | |
| 65630 | 174 | |
| 54439 | 175 | def +(other: Bytes): Bytes = | 
| 72885 | 176 | if (other.is_empty) this | 
| 177 | else if (is_empty) other | |
| 54439 | 178 |     else {
 | 
| 179 | val new_bytes = new Array[Byte](length + other.length) | |
| 55618 | 180 | System.arraycopy(bytes, offset, new_bytes, 0, length) | 
| 181 | System.arraycopy(other.bytes, other.offset, new_bytes, length, other.length) | |
| 54439 | 182 | new Bytes(new_bytes, 0, new_bytes.length) | 
| 183 | } | |
| 54440 | 184 | |
| 185 | ||
| 60833 | 186 | /* CharSequence operations */ | 
| 187 | ||
| 188 | def charAt(i: Int): Char = | |
| 189 | if (0 <= i && i < length) (bytes(offset + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] | |
| 190 | else throw new IndexOutOfBoundsException | |
| 191 | ||
| 192 | def subSequence(i: Int, j: Int): Bytes = | |
| 193 |   {
 | |
| 194 | if (0 <= i && i <= j && j <= length) new Bytes(bytes, offset + i, j - i) | |
| 195 | else throw new IndexOutOfBoundsException | |
| 196 | } | |
| 197 | ||
| 69448 | 198 | def trim_line: Bytes = | 
| 199 | if (length >= 2 && charAt(length - 2) == 13 && charAt(length - 1) == 10) | |
| 200 | subSequence(0, length - 2) | |
| 201 | else if (length >= 1 && (charAt(length - 1) == 13 || charAt(length - 1) == 10)) | |
| 202 | subSequence(0, length - 1) | |
| 203 | else this | |
| 204 | ||
| 60833 | 205 | |
| 64004 | 206 | /* streams */ | 
| 207 | ||
| 208 | def stream(): ByteArrayInputStream = new ByteArrayInputStream(bytes, offset, length) | |
| 209 | ||
| 210 | def write_stream(stream: OutputStream): Unit = stream.write(bytes, offset, length) | |
| 211 | ||
| 212 | ||
| 213 | /* XZ data compression */ | |
| 54440 | 214 | |
| 73024 | 215 | def uncompress(cache: XZ.Cache = XZ.Cache()): Bytes = | 
| 68018 | 216 | using(new XZInputStream(stream(), cache))(Bytes.read_stream(_, hint = length)) | 
| 64004 | 217 | |
| 73024 | 218 | def compress(options: XZ.Options = XZ.options(), cache: XZ.Cache = XZ.Cache()): Bytes = | 
| 64004 | 219 |   {
 | 
| 220 | val result = new ByteArrayOutputStream(length) | |
| 68018 | 221 | using(new XZOutputStream(result, options, cache))(write_stream(_)) | 
| 64004 | 222 | new Bytes(result.toByteArray, 0, result.size) | 
| 223 | } | |
| 68167 | 224 | |
| 73024 | 225 | def maybe_compress(options: XZ.Options = XZ.options(), cache: XZ.Cache = XZ.Cache()) | 
| 68167 | 226 | : (Boolean, Bytes) = | 
| 227 |   {
 | |
| 228 | val compressed = compress(options = options, cache = cache) | |
| 229 | if (compressed.length < length) (true, compressed) else (false, this) | |
| 230 | } | |
| 54439 | 231 | } |