isabelle: src/Pure/General/bytes.scala@cff0828c374f (annotated)

54439 621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	1	/* Title: Pure/General/bytes.scala
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	2	Author: Makarius
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	3
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	4	Immutable byte vectors versus UTF8 strings.
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	5	*/
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	6
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	7	package isabelle
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	8
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	9
76351 2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala; wenzelm parents: 76350 diff changeset	10	import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream, InputStream, OutputStream, File as JFile}
65070 1222c010bff7 more operations; wenzelm parents: 64370 diff changeset	11	import java.net.URL
76353 3698d0f3da18 clarified signature; wenzelm parents: 76351 diff changeset	12	import org.tukaani.xz
3698d0f3da18 clarified signature; wenzelm parents: 76351 diff changeset	13	import com.github.luben.zstd
54440 2c4940d2edf7 tuned signature; wenzelm parents: 54439 diff changeset	14
2c4940d2edf7 tuned signature; wenzelm parents: 54439 diff changeset	15
/* Companion of class Bytes: construction, base64 decoding, and reading/writing
   of whole streams, files and URLs. */
object Bytes {
  val empty: Bytes = new Bytes(Array[Byte](), 0, 0)

  /* Bytes of a character sequence, as produced by UTF8.bytes.

     A Bytes argument is returned unchanged: Bytes is itself a CharSequence,
     but its toString is only the summary "Bytes(length)", so encoding that
     string would not reproduce the content. */
  def apply(s: CharSequence): Bytes =
    s match {
      case bytes: Bytes => bytes
      case _ =>
        val str = s.toString
        if (str.isEmpty) empty
        else {
          val b = UTF8.bytes(str)
          new Bytes(b, 0, b.length)
        }
    }

  /* Copy of the whole array. */
  def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length)

  /* Copy of a[offset until offset + length]: the result does not share
     storage with the argument. */
  def apply(a: Array[Byte], offset: Int, length: Int): Bytes =
    if (length == 0) empty
    else {
      val b = new Array[Byte](length)
      System.arraycopy(a, offset, b, 0, length)
      new Bytes(b, 0, b.length)
    }

  val newline: Bytes = apply("\n")


  /* base64 */

  def decode_base64(s: String): Bytes = {
    val a = Base64.decode(s)
    new Bytes(a, 0, a.length)
  }


  /* read */

  /* Read from stream until end-of-stream or until limit bytes have been
     collected, whichever comes first. The stream is not closed here.

     limit: maximum number of bytes in the result (0 yields empty)
     hint: expected size, used as initial buffer capacity when no limit is given */
  def read_stream(stream: InputStream, limit: Int = Integer.MAX_VALUE, hint: Int = 1024): Bytes =
    if (limit == 0) empty
    else {
      // initial capacity: the explicit limit if given, otherwise the hint; never below 1024
      val out_size = (if (limit == Integer.MAX_VALUE) hint else limit) max 1024
      val out = new ByteArrayOutputStream(out_size)
      val buf = new Array[Byte](8192)
      var m = 0

      // do-while loop: the block is the condition, the body is empty
      while ({
        // never request more than what is still allowed by limit
        m = stream.read(buf, 0, buf.length min (limit - out.size))
        if (m != -1) out.write(buf, 0, m)
        m != -1 && limit > out.size
      }) ()

      new Bytes(out.toByteArray, 0, out.size)
    }

  /* Read a file, using its reported length as limit; a length that is
     negative or beyond Int range means "no limit". */
  def read(file: JFile): Bytes = {
    val length = file.length
    val limit = if (length < 0 || length > Integer.MAX_VALUE) Integer.MAX_VALUE else length.toInt
    using(new FileInputStream(file))(read_stream(_, limit = limit))
  }

  def read(path: Path): Bytes = read(path.file)

  def read(url: URL): Bytes = using(url.openStream)(read_stream(_))


  /* write */

  def write(file: JFile, bytes: Bytes): Unit =
    using(new FileOutputStream(file))(bytes.write_stream(_))

  def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes)
}
621a155c7715 immutable byte vectors versus UTF8 strings; wenzelm parents: diff changeset	86
/* Immutable byte vector: a view of length bytes of the backing array,
   starting at offset. The backing array may be shared between several
   values (see subSequence) and may be larger than the view, so every
   operation must respect offset and length. */
final class Bytes private(
  protected val bytes: Array[Byte],
  protected val offset: Int,
  val length: Int) extends CharSequence {
  /* equality */

  override def equals(that: Any): Boolean = {
    that match {
      case other: Bytes =>
        if (this eq other) true
        else if (length != other.length) false
        else {
          // range comparison of the two views, independent of their backing arrays
          java.util.Arrays.equals(
            bytes, offset, offset + length,
            other.bytes, other.offset, other.offset + other.length)
        }
      case _ => false
    }
  }

  // polynomial hash (factor 31) over the unsigned byte values of the view
  private lazy val hash: Int = {
    var h = 0
    for (i <- offset until offset + length) {
      val b = bytes(i) & 0xFF
      h = 31 * h + b
    }
    h
  }

  override def hashCode(): Int = hash


  /* content */

  // Exactly the viewed bytes: the backing array itself if the view covers it
  // completely (no copy), otherwise a fresh copy. The result must not be modified.
  private def exact_array: Array[Byte] =
    if (offset == 0 && length == bytes.length) bytes else array

  // digest of the viewed bytes only, not of the whole backing array
  lazy val sha1_digest: SHA1.Digest = SHA1.digest(exact_array)

  def is_empty: Boolean = length == 0

  def iterator: Iterator[Byte] =
    for (i <- (offset until (offset + length)).iterator)
      yield bytes(i)

  /* Fresh copy of the viewed bytes; the caller may modify it freely. */
  def array: Array[Byte] = {
    val a = new Array[Byte](length)
    System.arraycopy(bytes, offset, a, 0, length)
    a
  }

  def text: String = UTF8.decode_permissive(this)

  /* The decoded text, provided that re-encoding it reproduces these bytes. */
  def wellformed_text: Option[String] = {
    val s = text
    if (this == Bytes(s)) Some(s) else None
  }

  def encode_base64: String = Base64.encode(exact_array)

  /* Plain text if wellformed (flag false), otherwise base64 (flag true). */
  def maybe_encode_base64: (Boolean, String) =
    wellformed_text match {
      case Some(s) => (false, s)
      case None => (true, encode_base64)
    }

  // only a summary, not the content: avoids costly operations on huge blobs
  override def toString: String = "Bytes(" + length + ")"

  def proper: Option[Bytes] = if (is_empty) None else Some(this)
  def proper_text: Option[String] = if (is_empty) None else Some(text)

  /* Concatenation; an empty operand returns the other one without copying. */
  def +(other: Bytes): Bytes =
    if (other.is_empty) this
    else if (is_empty) other
    else {
      val new_bytes = new Array[Byte](length + other.length)
      System.arraycopy(bytes, offset, new_bytes, 0, length)
      System.arraycopy(other.bytes, other.offset, new_bytes, length, other.length)
      new Bytes(new_bytes, 0, new_bytes.length)
    }


  /* CharSequence operations */

  // each byte is presented as one Char in the range 0..255
  def charAt(i: Int): Char =
    if (0 <= i && i < length) (bytes(offset + i) & 0xFF).toChar
    else throw new IndexOutOfBoundsException

  // shares the backing array with this value: no copy
  def subSequence(i: Int, j: Int): Bytes = {
    if (0 <= i && i <= j && j <= length) new Bytes(bytes, offset + i, j - i)
    else throw new IndexOutOfBoundsException
  }

  /* Remove one trailing line terminator: CR LF, or a single CR or LF. */
  def trim_line: Bytes =
    if (length >= 2 && charAt(length - 2) == 13 && charAt(length - 1) == 10)
      subSequence(0, length - 2)
    else if (length >= 1 && (charAt(length - 1) == 13 || charAt(length - 1) == 10))
      subSequence(0, length - 1)
    else this


  /* streams */

  def stream(): ByteArrayInputStream = new ByteArrayInputStream(bytes, offset, length)

  def write_stream(stream: OutputStream): Unit = stream.write(bytes, offset, length)


  /* XZ / Zstd data compression */

  // leading bytes FD 37 7A 58 5A 00
  def detect_xz: Boolean =
    length >= 6 &&
      bytes(offset)     == 0xFD.toByte &&
      bytes(offset + 1) == 0x37.toByte &&
      bytes(offset + 2) == 0x7A.toByte &&
      bytes(offset + 3) == 0x58.toByte &&
      bytes(offset + 4) == 0x5A.toByte &&
      bytes(offset + 5) == 0x00.toByte

  // leading bytes 28 B5 2F FD
  def detect_zstd: Boolean =
    length >= 4 &&
      bytes(offset)     == 0x28.toByte &&
      bytes(offset + 1) == 0xB5.toByte &&
      bytes(offset + 2) == 0x2F.toByte &&
      bytes(offset + 3) == 0xFD.toByte

  def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes =
    using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = length))

  def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
    Zstd.init()
    val n = zstd.Zstd.decompressedSize(bytes, offset, length)
    if (n > 0 && n < Integer.MAX_VALUE) {
      // decompressed size is reported and fits into Int: decompress in one step
      Bytes(zstd.Zstd.decompress(array, n.toInt))
    }
    else {
      // otherwise fall back on streaming decompression
      using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = length))
    }
  }

  /* Uncompress according to the detected leading bytes; error if neither
     XZ nor Zstd is detected. */
  def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes =
    if (detect_xz) uncompress_xz(cache = cache)
    else if (detect_zstd) uncompress_zstd(cache = cache)
    else error("Cannot detect compression scheme")

  def compress(
    options: Compress.Options = Compress.Options(),
    cache: Compress.Cache = Compress.Cache.none
  ): Bytes = {
    options match {
      case options_xz: Compress.Options_XZ =>
        val result = new ByteArrayOutputStream(length)
        // closing the XZ stream via using(...) completes the output in result
        using(new xz.XZOutputStream(result, options_xz.make, cache.for_xz))(write_stream)
        new Bytes(result.toByteArray, 0, result.size)
      case options_zstd: Compress.Options_Zstd =>
        Zstd.init()
        Bytes(zstd.Zstd.compress(array, options_zstd.level))
    }
  }

  /* Compressed form if it is strictly shorter (flag true), otherwise this
     value unchanged (flag false). */
  def maybe_compress(
    options: Compress.Options = Compress.Options(),
    cache: Compress.Cache = Compress.Cache.none
  ) : (Boolean, Bytes) = {
    val compressed = compress(options = options, cache = cache)
    if (compressed.length < length) (true, compressed) else (false, this)
  }
}

author	wenzelm
	Fri, 21 Oct 2022 21:39:38 +0200
changeset 76358	cff0828c374f
parent 76353	3698d0f3da18
child 76361	3b9f36ef7365
permissions	-rw-r--r--