src/Pure/General/bytes.scala
author wenzelm
Tue, 02 Jul 2024 22:38:00 +0200
changeset 80479 762fcf8f9ced
parent 80474 66ebeae3acdc
child 80480 972f7a4cdc0e
permissions -rw-r--r--
more specialized operations;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/bytes.scala
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     3
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     4
Immutable byte vectors versus UTF8 strings.
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     5
*/
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     6
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     7
package isabelle
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     8
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     9
78855
6fdcd6c8c97a prefer old-style import "=>";
wenzelm
parents: 78243
diff changeset
    10
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
6fdcd6c8c97a prefer old-style import "=>";
wenzelm
parents: 78243
diff changeset
    11
  InputStream, OutputStream, File => JFile}
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    12
import java.nio.ByteBuffer
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    13
import java.nio.channels.FileChannel
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    14
import java.nio.file.StandardOpenOption
77712
dd4bb80dbc3a tuned performance: much faster low-level operation;
wenzelm
parents: 77711
diff changeset
    15
import java.util.Arrays
76353
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    16
import org.tukaani.xz
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    17
import com.github.luben.zstd
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    18
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    19
import scala.collection.mutable.ArrayBuffer
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
    20
import scala.collection.mutable
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    21
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    22
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
    23
object Bytes {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    24
  /* internal sizes */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    25
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    26
  private val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
80389
5e8dca75c699 clarified sizes;
wenzelm
parents: 80388
diff changeset
    27
  private val block_size: Int = 16384  // see java.io.InputStream.DEFAULT_BUFFER_SIZE
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    28
  private val chunk_size: Long = Space.MiB(100).bytes
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    29
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    30
  private def make_size(chunks: Array[Array[Byte]], buffer: Array[Byte]): Long =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    31
    chunks.foldLeft(buffer.length.toLong)((n, chunk) => n + chunk.length)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    32
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    33
  class Too_Large(size: Long) extends IndexOutOfBoundsException {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    34
    override def getMessage: String =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    35
      "Bytes too large for particular operation: " +
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    36
        Space.bytes(size).print + " > " + Space.bytes(array_size).print
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    37
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    38
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
    39
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    40
  /* main constructors */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    41
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    42
  private def reuse_array(bytes: Array[Byte]): Bytes =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    43
    if (bytes.length <= chunk_size) new Bytes(None, bytes, 0L, bytes.length.toLong)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    44
    else apply(bytes)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    45
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    46
  val empty: Bytes = reuse_array(new Array(0))
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    47
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    48
  def apply(s: CharSequence): Bytes =
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    49
    if (s.isEmpty) empty
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    50
    else Builder.use(hint = s.length) { builder => builder += s }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    51
63779
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    52
  def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length)
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    53
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    54
  def apply(a: Array[Byte], offset: Int, length: Int): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    55
    Builder.use(hint = length) { builder => builder += (a, offset, length) }
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    56
69454
ef051edd4d10 more uniform multi-language operations;
wenzelm
parents: 69448
diff changeset
    57
  val newline: Bytes = apply("\n")
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    58
75579
3362b6a5d697 support XZ compression in Isabelle/ML;
wenzelm
parents: 75393
diff changeset
    59
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    60
  /* read */
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    61
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    62
  def read_stream(stream: InputStream, limit: Long = -1L, hint: Long = 0L): Bytes = {
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    63
    if (limit == 0) empty
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    64
    else {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    65
      Builder.use(hint = if (limit > 0) limit else hint) { builder =>
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    66
        val buf_size = block_size
80384
061d672570f5 tuned names;
wenzelm
parents: 80383
diff changeset
    67
        val buf = new Array[Byte](block_size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    68
        var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    69
        var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    70
        while ({
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    71
          val l = if (limit > 0) (limit - n).min(buf_size).toInt else buf_size
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    72
          m = stream.read(buf, 0, l)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    73
          if (m > 0) {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    74
            builder += (buf, 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    75
            n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    76
          }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    77
          m != -1 && (limit < 0 || limit > n)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    78
        }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    79
      }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    80
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    81
  }
64001
7ecb22be8f03 more general read_stream: return actual byte count;
wenzelm
parents: 63779
diff changeset
    82
79510
d8330439823a clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents: 79509
diff changeset
    83
  def read_url(name: String): Bytes = using(Url(name).open_stream())(read_stream(_))
77717
6a2daddc238c tuned signature;
wenzelm
parents: 77716
diff changeset
    84
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    85
  def read_file(path: Path, offset: Long = 0L, limit: Long = -1L): Bytes = {
78956
12abaffb0346 tuned signature: more operations;
wenzelm
parents: 78954
diff changeset
    86
    val length = File.size(path)
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    87
    val start = offset.max(0L)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    88
    val len = (length - start).max(0L).min(if (limit < 0) Long.MaxValue else limit)
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    89
    if (len == 0L) empty
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    90
    else {
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    91
      Builder.use(hint = len) { builder =>
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    92
        using(FileChannel.open(path.java_path, StandardOpenOption.READ)) { channel =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    93
          channel.position(start)
80473
09afc244bb5e proper limit for read operation (amending ac4d53bc8f6b);
wenzelm
parents: 80428
diff changeset
    94
          val buf_size = block_size
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    95
          val buf = ByteBuffer.allocate(buf_size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    96
          var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    97
          var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    98
          while ({
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    99
            val l = (len - n).min(buf_size).toInt
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
   100
            buf.limit(l)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   101
            m = channel.read(buf)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
   102
            if (m > 0) {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   103
              builder += (buf.array(), 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   104
              buf.clear()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   105
              n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   106
            }
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
   107
            m != -1 && len > n
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   108
          }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   109
        }
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   110
      }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   111
    }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   112
  }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   113
78953
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   114
  def read(path: Path): Bytes = read_file(path)
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   115
  def read(file: JFile): Bytes = read_file(File.path(file))
77718
6ad3a412ed97 clarified signature;
wenzelm
parents: 77717
diff changeset
   116
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   117
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   118
  /* write */
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   119
69393
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   120
  def write(file: JFile, bytes: Bytes): Unit =
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   121
    using(new FileOutputStream(file))(bytes.write_stream(_))
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   122
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   123
  def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes)
78194
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   124
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   125
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   126
  /* append */
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   127
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   128
  def append(file: JFile, bytes: Bytes): Unit =
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   129
    using(new FileOutputStream(file, true))(bytes.write_stream(_))
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   130
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   131
  def append(path: Path, bytes: Bytes): Unit = append(path.file, bytes)
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   132
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   133
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   134
  /* vector of short unsigned integers */
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   135
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   136
  trait Vec {
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   137
    def size: Long
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   138
    def apply(i: Long): Char
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   139
  }
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   140
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   141
  class Vec_String(string: String) extends Vec {
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   142
    override def size: Long = string.length.toLong
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   143
    override def apply(i: Long): Char =
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   144
      if (0 <= i && i < size) string(i.toInt)
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   145
      else throw new IndexOutOfBoundsException
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   146
  }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   147
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   148
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   149
  /* incremental builder: unsynchronized! */
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   150
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   151
  object Builder {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   152
    def use(hint: Long = 0L)(body: Builder => Unit): Bytes = {
80375
wenzelm
parents: 80374
diff changeset
   153
      val builder = new Builder(hint)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   154
      body(builder)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   155
      builder.done()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   156
    }
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   157
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   158
    def use_stream(hint: Long = 0L)(body: OutputStream => Unit): Bytes = {
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   159
      val stream = new Stream(hint = hint)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   160
      body(stream)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   161
      stream.builder.done()
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   162
    }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   163
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   164
    private class Stream(hint: Long = 0L) extends OutputStream {
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   165
      val builder = new Builder(hint)
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   166
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   167
      override def write(b: Int): Unit =
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   168
        { builder += b.toByte }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   169
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   170
      override def write(array: Array[Byte], offset: Int, length: Int): Unit =
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   171
        { builder += (array, offset, length) }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   172
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   173
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   174
80375
wenzelm
parents: 80374
diff changeset
   175
  final class Builder private[Bytes](hint: Long) {
wenzelm
parents: 80374
diff changeset
   176
    private var chunks =
wenzelm
parents: 80374
diff changeset
   177
      new ArrayBuffer[Array[Byte]](if (hint <= 0) 16 else (hint / chunk_size).toInt)
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   178
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   179
    private var buffer_list: mutable.ListBuffer[Array[Byte]] = null
80375
wenzelm
parents: 80374
diff changeset
   180
    private var buffer =
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   181
      new Array[Byte](if (hint <= 0) 1024 else (hint min chunk_size min array_size).toInt)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   182
    private var buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   183
    private var buffer_total = 0
80370
wenzelm
parents: 80369
diff changeset
   184
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   185
    private def buffer_content(): Array[Byte] =
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   186
      if (buffer_list != null) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   187
        val array = new Array[Byte](buffer_total)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   188
        var i = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   189
        for (b <- buffer_list) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   190
          val n = b.length
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   191
          System.arraycopy(b, 0, array, i, n)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   192
          i += n
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   193
        }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   194
        System.arraycopy(buffer, 0, array, i, buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   195
        array
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   196
      }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   197
      else if (buffer_index == buffer.length) buffer else Arrays.copyOf(buffer, buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   198
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   199
    private def buffer_check(request: Int = 0): Unit =
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   200
      if (buffer_index == buffer.length) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   201
        if (buffer_total == chunk_size) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   202
          chunks += buffer_content()
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   203
          buffer_list = null
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   204
          buffer = new Array[Byte](chunk_size.toInt)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   205
          buffer_total = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   206
          buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   207
        }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   208
        else {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   209
          if (buffer_list == null) { buffer_list = new mutable.ListBuffer }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   210
          buffer_list += buffer
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   211
          buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   212
          val limit = (chunk_size - buffer_total).toInt
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   213
          buffer = new Array[Byte]((buffer_total max request) min limit)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   214
        }
80370
wenzelm
parents: 80369
diff changeset
   215
      }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   216
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   217
    def += (b: Byte): Unit = {
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   218
      buffer(buffer_index) = b
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   219
      buffer_total += 1
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   220
      buffer_index += 1
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   221
      buffer_check()
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   222
    }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   223
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   224
    def += (array: Array[Byte], offset: Int, length: Int): Unit = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   225
      if (offset < 0 || length < 0 || offset.toLong + length.toLong > array.length) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   226
        throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   227
      }
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   228
      else {
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   229
        var i = offset
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   230
        var n = length
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   231
        while (n > 0) {
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   232
          val l = n min (buffer.length - buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   233
          System.arraycopy(array, i, buffer, buffer_index, l)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   234
          buffer_total += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   235
          buffer_index += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   236
          i += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   237
          n -= l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   238
          buffer_check(request = n)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   239
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   240
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   241
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   242
80428
5fe811816fa3 minor performance tuning;
wenzelm
parents: 80426
diff changeset
   243
    def += (s: CharSequence): Unit =
5fe811816fa3 minor performance tuning;
wenzelm
parents: 80426
diff changeset
   244
      if (s.length > 0) { this += UTF8.bytes(s.toString) }
5fe811816fa3 minor performance tuning;
wenzelm
parents: 80426
diff changeset
   245
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   246
    def += (array: Array[Byte]): Unit = { this += (array, 0, array.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   247
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   248
    def += (a: Subarray): Unit = { this += (a.array, a.offset, a.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   249
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   250
    private def done(): Bytes = {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   251
      val cs = chunks.toArray
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   252
      val b = buffer_content()
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   253
      val size = cs.foldLeft(b.length.toLong)({ case (n, a) => n + a.length })
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   254
      chunks = null
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   255
      buffer_list = null
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   256
      buffer = null
80374
4f1374f56c0b minor performance tuning;
wenzelm
parents: 80373
diff changeset
   257
      new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   258
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   259
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   260
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   261
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   262
  /* subarray */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   263
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   264
  object Subarray {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   265
    val empty: Subarray = new Subarray(new Array[Byte](0), 0, 0)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   266
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   267
    def apply(array: Array[Byte], offset: Int, length: Int): Subarray = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   268
      val n = array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   269
      if (0 <= offset && offset < n && 0 <= length && offset + length <= n) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   270
        if (length == 0) empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   271
        else new Subarray(array, offset, length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   272
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   273
      else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   274
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   275
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   276
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   277
  final class Subarray private(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   278
    val array: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   279
    val offset: Int,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   280
    val length: Int
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   281
  ) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   282
    override def toString: String = "Bytes.Subarray(" + Space.bytes(length).print + ")"
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   283
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   284
    def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   285
      if (length == 0) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   286
      else { for (i <- (offset until (offset + length)).iterator) yield array(i) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   287
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   288
}
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   289
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   290
final class Bytes private(
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   291
  protected val chunks: Option[Array[Array[Byte]]],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   292
  protected val chunk0: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   293
  protected val offset: Long,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   294
  val size: Long
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   295
) extends Bytes.Vec {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   296
  assert(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   297
    (chunks.isEmpty ||
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   298
      chunks.get.nonEmpty &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   299
      chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   300
    chunk0.length < Bytes.chunk_size)
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   301
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   302
  def small_size: Int =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   303
    if (size > Bytes.array_size) throw new Bytes.Too_Large(size)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   304
    else size.toInt
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   305
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   306
  def is_empty: Boolean = size == 0
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   307
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   308
  def proper: Option[Bytes] = if (is_empty) None else Some(this)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   309
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   310
  def is_sliced: Boolean =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   311
    offset != 0L || {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   312
      chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   313
        case None => size != chunk0.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   314
        case Some(cs) => size != Bytes.make_size(cs, chunk0)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   315
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   316
    }
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   317
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   318
  override def toString: String =
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   319
    if (is_empty) "Bytes.empty"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   320
    else "Bytes(" + Space.bytes(size).print + if_proper(is_sliced, ", sliced") + ")"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   321
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   322
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   323
  /* elements: signed Byte or unsigned Char */
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   324
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   325
  protected def byte_unchecked(i: Long): Byte = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   326
    val a = offset + i
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   327
    chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   328
      case None => chunk0(a.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   329
      case Some(cs) =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   330
        val b = a % Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   331
        val c = a / Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   332
        if (c < cs.length) cs(c.toInt)(b.toInt) else chunk0(b.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   333
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   334
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   335
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   336
  def byte(i: Long): Byte =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   337
    if (0 <= i && i < size) byte_unchecked(i)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   338
    else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   339
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   340
  def apply(i: Long): Char = (byte(i).toInt & 0xff).toChar
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   341
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   342
  protected def subarray_iterator: Iterator[Bytes.Subarray] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   343
    if (is_empty) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   344
    else if (chunks.isEmpty) Iterator(Bytes.Subarray(chunk0, offset.toInt, size.toInt))
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   345
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   346
      val end_offset = offset + size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   347
      for ((array, index) <- (chunks.get.iterator ++ Iterator(chunk0)).zipWithIndex) yield {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   348
        val array_start = Bytes.chunk_size * index
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   349
        val array_stop = array_start + array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   350
        if (offset < array_stop && array_start < end_offset) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   351
          val i = (array_start max offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   352
          val j = (array_stop min end_offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   353
          Bytes.Subarray(array, i.toInt, (j - i).toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   354
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   355
        else Bytes.Subarray.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   356
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   357
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   358
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   359
  def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   360
    for {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   361
      a <- subarray_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   362
      b <- a.byte_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   363
    } yield b
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   364
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   365
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   366
  /* slice */
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   367
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   368
  def slice(i: Long, j: Long): Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   369
    if (0 <= i && i <= j && j <= size) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   370
      if (i == j) Bytes.empty
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   371
      else new Bytes(chunks, chunk0, offset + i, j - i)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   372
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   373
    else throw new IndexOutOfBoundsException
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   374
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   375
  def unslice: Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   376
    if (is_sliced) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   377
      Bytes.Builder.use(hint = size) { builder =>
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   378
        for (a <- subarray_iterator) { builder += a }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   379
      }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   380
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   381
    else this
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   382
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   383
  def trim_line: Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   384
    if (size >= 2 && byte_unchecked(size - 2) == 13 && byte_unchecked(size - 1) == 10) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   385
      slice(0, size - 2)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   386
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   387
    else if (size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   388
      slice(0, size - 1)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   389
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   390
    else this
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   391
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   392
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   393
  /* separated chunks */
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   394
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   395
  def separated_chunks(sep: Byte): Iterator[Bytes] =
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   396
    new Iterator[Bytes] {
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   397
      private var start: Long = -1
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   398
      private var stop: Long = -1
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   399
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   400
      private def end(i: Long): Long = {
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   401
        var j = i
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   402
        while (j < Bytes.this.size && byte_unchecked(j) != sep) { j += 1 }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   403
        j
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   404
      }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   405
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   406
      // init
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   407
      if (!Bytes.this.is_empty) { start = 0; stop = end(0) }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   408
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   409
      def hasNext: Boolean =
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   410
        0 <= start && start <= stop && stop <= Bytes.this.size
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   411
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   412
      def next(): Bytes =
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   413
        if (hasNext) {
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   414
          val chunk = Bytes.this.slice(start, stop)
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   415
          if (stop < Bytes.this.size) { start = stop + 1; stop = end(start) }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   416
          else { start = -1; stop = -1 }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   417
          chunk
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   418
        }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   419
        else throw new NoSuchElementException
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   420
    }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   421
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   422
  def space_explode(sep: Byte): List[Bytes] = separated_chunks(sep).toList
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   423
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   424
  def split_lines: List[Bytes] = space_explode(10)
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   425
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   426
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   427
  /* hash and equality */
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   428
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   429
  lazy val sha1_digest: SHA1.Digest =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   430
    if (is_empty) SHA1.digest_empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   431
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   432
      SHA1.make_digest { sha =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   433
        for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   434
          sha.update(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   435
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   436
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   437
    }
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   438
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   439
  override def hashCode(): Int = sha1_digest.hashCode()
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   440
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   441
  override def equals(that: Any): Boolean = {
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   442
    that match {
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   443
      case other: Bytes =>
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   444
        if (this.eq(other)) true
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   445
        else if (size != other.size) false
80378
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   446
        else {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   447
          if (chunks.isEmpty && other.chunks.isEmpty) {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   448
            Arrays.equals(chunk0, offset.toInt, (offset + size).toInt,
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   449
              other.chunk0, other.offset.toInt, (other.offset + other.size).toInt)
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   450
          }
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   451
          else if (!is_sliced && !other.is_sliced) {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   452
            (subarray_iterator zip other.subarray_iterator)
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   453
              .forall((a, b) => Arrays.equals(a.array, b.array))
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   454
          }
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   455
          else sha1_digest == other.sha1_digest
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   456
        }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   457
      case _ => false
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   458
    }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   459
  }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   460
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   461
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   462
  /* content */
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   463
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   464
  def + (other: Bytes): Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   465
    if (other.is_empty) this
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   466
    else if (is_empty) other
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   467
    else {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   468
      Bytes.Builder.use(hint = size + other.size) { builder =>
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   469
        for (a <- subarray_iterator ++ other.subarray_iterator) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   470
          builder += a
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   471
        }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   472
      }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   473
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   474
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   475
  def make_array: Array[Byte] = {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   476
    val buf = new ByteArrayOutputStream(small_size)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   477
    for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   478
    buf.toByteArray
69365
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   479
  }
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   480
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   481
  def text: String =
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   482
    if (is_empty) ""
80379
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   483
    else {
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   484
      var i = 0L
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   485
      var utf8 = false
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   486
      while (i < size && !utf8) {
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   487
        if (byte_unchecked(i) < 0) { utf8 = true }
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   488
        i += 1
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   489
      }
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   490
      utf8
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   491
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   492
      if (utf8) UTF8.decode_permissive_bytes(this)
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   493
      else new String(make_array, UTF8.charset)
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   494
    }
65279
fa62e095d8f1 clarified signature (again, see also 3ed43cfc8b14);
wenzelm
parents: 65070
diff changeset
   495
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   496
  def wellformed_text: Option[String] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   497
    try {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   498
      val s = text
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   499
      if (this == Bytes(s)) Some(s) else None
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   500
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   501
    catch { case ERROR(_) => None }
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   502
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   503
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   504
  /* Base64 data representation */
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   505
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   506
  def encode_base64: Bytes =
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   507
    Bytes.Builder.use_stream(hint = (size * 1.5).round) { out =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   508
      using(Base64.encode_stream(out))(write_stream(_))
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   509
    }
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   510
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   511
  def decode_base64: Bytes =
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   512
    using(Base64.decode_stream(stream()))(Bytes.read_stream(_, hint = (size / 1.2).round))
68094
0b66aca9c965 more operations;
wenzelm
parents: 68087
diff changeset
   513
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   514
  def maybe_encode_base64: (Boolean, String) =
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   515
    wellformed_text match {
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   516
      case Some(s) => (false, s)
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   517
      case None => (true, encode_base64.text)
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   518
    }
68106
a514e29db980 return exports as result for Isabelle server;
wenzelm
parents: 68094
diff changeset
   519
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   520
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   521
  /* streams */
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   522
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   523
  def stream(): InputStream =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   524
    if (chunks.isEmpty) new ByteArrayInputStream(chunk0, offset.toInt, size.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   525
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   526
      new InputStream {
80367
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   527
        private var index = 0L
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   528
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   529
        def read(): Int = {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   530
          if (index < size) {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   531
            val res = byte_unchecked(index).toInt & 0xff
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   532
            index += 1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   533
            res
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   534
          }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   535
          else -1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   536
        }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   537
80388
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   538
        override def read(buffer: Array[Byte], start: Int, length: Int): Int = {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   539
          if (length < 16) super.read(buffer, start, length)
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   540
          else {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   541
            val index0 = index
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   542
            index = (index + length) min size
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   543
            val n = (index - index0).toInt
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   544
            if (n == 0) -1
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   545
            else {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   546
              var i = start
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   547
              for (a <- slice(index0, index).subarray_iterator) {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   548
                val l = a.length
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   549
                if (l > 0) {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   550
                  System.arraycopy(a.array, a.offset, buffer, i, l)
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   551
                  i += l
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   552
                }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   553
              }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   554
              n
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   555
            }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   556
          }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   557
        }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   558
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   559
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   560
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   561
  def write_stream(stream: OutputStream): Unit =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   562
    for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   563
      stream.write(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   564
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   565
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   566
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   567
  /* XZ / Zstd data compression */
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   568
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   569
  def detect_xz: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   570
    size >= 6 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   571
      byte_unchecked(0) == 0xFD.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   572
      byte_unchecked(1) == 0x37.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   573
      byte_unchecked(2) == 0x7A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   574
      byte_unchecked(3) == 0x58.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   575
      byte_unchecked(4) == 0x5A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   576
      byte_unchecked(5) == 0x00.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   577
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   578
  def detect_zstd: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   579
    size >= 4 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   580
      byte_unchecked(0) == 0x28.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   581
      byte_unchecked(1) == 0xB5.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   582
      byte_unchecked(2) == 0x2F.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   583
      byte_unchecked(3) == 0xFD.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   584
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   585
  def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   586
    using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   587
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   588
  def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   589
    Zstd.init()
80376
201af0b45e57 tuned: more uniform, less ambitious;
wenzelm
parents: 80375
diff changeset
   590
    using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   591
  }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   592
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   593
  def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes =
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   594
    if (detect_xz) uncompress_xz(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   595
    else if (detect_zstd) uncompress_zstd(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   596
    else error("Cannot detect compression scheme")
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   597
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   598
  def compress(
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   599
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   600
    cache: Compress.Cache = Compress.Cache.none
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   601
  ): Bytes = {
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   602
    Bytes.Builder.use_stream(hint = size) { out =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   603
      using(
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   604
        options match {
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   605
          case options_xz: Compress.Options_XZ =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   606
            new xz.XZOutputStream(out, options_xz.make, cache.for_xz)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   607
          case options_zstd: Compress.Options_Zstd =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   608
            new zstd.ZstdOutputStream(out, cache.for_zstd, options_zstd.level)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   609
        }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   610
      ) { s => for (a <- subarray_iterator) s.write(a.array, a.offset, a.length) }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   611
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   612
  }
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   613
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   614
  def maybe_compress(
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   615
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   616
    cache: Compress.Cache = Compress.Cache.none
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   617
  ) : (Boolean, Bytes) = {
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   618
    val compressed = compress(options = options, cache = cache)
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   619
    if (compressed.size < size) (true, compressed) else (false, this)
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   620
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   621
}