src/Pure/General/bytes.scala
author wenzelm
Sat, 15 Jun 2024 21:52:14 +0200
changeset 80376 201af0b45e57
parent 80375 c33d017fc1a6
child 80377 28dd9b91dfe5
permissions -rw-r--r--
tuned: more uniform, less ambitious;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/bytes.scala
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     3
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     4
Immutable byte vectors versus UTF8 strings.
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     5
*/
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     6
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     7
package isabelle
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     8
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     9
78855
6fdcd6c8c97a prefer old-style import "=>";
wenzelm
parents: 78243
diff changeset
    10
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
6fdcd6c8c97a prefer old-style import "=>";
wenzelm
parents: 78243
diff changeset
    11
  InputStream, OutputStream, File => JFile}
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    12
import java.nio.ByteBuffer
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    13
import java.nio.channels.FileChannel
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    14
import java.nio.file.StandardOpenOption
77712
dd4bb80dbc3a tuned performance: much faster low-level operation;
wenzelm
parents: 77711
diff changeset
    15
import java.util.Arrays
76353
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    16
import org.tukaani.xz
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    17
import com.github.luben.zstd
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    18
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    19
import scala.collection.mutable.ArrayBuffer
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    20
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    21
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
    22
object Bytes {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    23
  /* internal sizes */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    24
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    25
  val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    26
  val chunk_size: Long = Space.MiB(100).bytes
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    27
  val block_size: Int = 8192
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    28
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    29
  class Too_Large(size: Long) extends IndexOutOfBoundsException {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    30
    override def getMessage: String =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    31
      "Bytes too large for particular operation: " +
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    32
        Space.bytes(size).print + " > " + Space.bytes(array_size).print
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    33
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    34
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
    35
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    36
  /* main constructors */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    37
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    38
  private def reuse_array(bytes: Array[Byte]): Bytes =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    39
    if (bytes.length <= chunk_size) new Bytes(None, bytes, 0L, bytes.length.toLong)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    40
    else apply(bytes)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    41
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    42
  val empty: Bytes = reuse_array(new Array(0))
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    43
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    44
  def apply(s: CharSequence): Bytes =
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    45
    if (s.isEmpty) empty
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    46
    else Builder.use(hint = s.length) { builder => builder += s }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    47
63779
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    48
  def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length)
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    49
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    50
  def apply(a: Array[Byte], offset: Int, length: Int): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    51
    Builder.use(hint = length) { builder => builder += (a, offset, length) }
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    52
69454
ef051edd4d10 more uniform multi-language operations;
wenzelm
parents: 69448
diff changeset
    53
  val newline: Bytes = apply("\n")
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    54
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    55
  def decode_base64(s: String): Bytes = Bytes.reuse_array(Base64.decode(s))
68108
2277fe496d78 more operations;
wenzelm
parents: 68106
diff changeset
    56
75579
3362b6a5d697 support XZ compression in Isabelle/ML;
wenzelm
parents: 75393
diff changeset
    57
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    58
  /* read */
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    59
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    60
  def read_stream(stream: InputStream, limit: Long = -1L, hint: Long = 0L): Bytes = {
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    61
    if (limit == 0) empty
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    62
    else {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    63
      Builder.use(hint = if (limit > 0) limit else hint) { builder =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    64
        val buf = new Array[Byte](Bytes.block_size)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    65
        var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    66
        var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    67
        while ({
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    68
          val l = if (limit > 0) ((limit - n) min buf.length).toInt else buf.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    69
          m = stream.read(buf, 0, l)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    70
          if (m != -1) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    71
            builder += (buf, 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    72
            n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    73
          }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    74
          m != -1 && (limit < 0 || limit > n)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    75
        }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    76
      }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    77
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    78
  }
64001
7ecb22be8f03 more general read_stream: return actual byte count;
wenzelm
parents: 63779
diff changeset
    79
79510
d8330439823a clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents: 79509
diff changeset
    80
  def read_url(name: String): Bytes = using(Url(name).open_stream())(read_stream(_))
77717
6a2daddc238c tuned signature;
wenzelm
parents: 77716
diff changeset
    81
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    82
  def read_file(path: Path, offset: Long = 0L, limit: Long = -1L): Bytes = {
78956
12abaffb0346 tuned signature: more operations;
wenzelm
parents: 78954
diff changeset
    83
    val length = File.size(path)
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    84
    val start = offset.max(0L)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    85
    val len = (length - start).max(0L).min(if (limit < 0) Long.MaxValue else limit)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    86
    if (len == 0L) empty
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    87
    else {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    88
      Builder.use(hint = len) { builder =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    89
        using(FileChannel.open(path.java_path, StandardOpenOption.READ)) { channel =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    90
          channel.position(start)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    91
          val buf = ByteBuffer.allocate(Bytes.block_size)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    92
          var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    93
          var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    94
          while ({
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    95
            m = channel.read(buf)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    96
            if (m != -1) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    97
              builder += (buf.array(), 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    98
              buf.clear()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    99
              n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   100
            }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   101
            m != -1 && len > n
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   102
          }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   103
        }
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   104
      }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   105
    }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   106
  }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   107
78953
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   108
  def read(path: Path): Bytes = read_file(path)
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   109
  def read(file: JFile): Bytes = read_file(File.path(file))
77718
6ad3a412ed97 clarified signature;
wenzelm
parents: 77717
diff changeset
   110
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   111
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   112
  /* write */
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   113
69393
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   114
  def write(file: JFile, bytes: Bytes): Unit =
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   115
    using(new FileOutputStream(file))(bytes.write_stream(_))
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   116
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   117
  def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes)
78194
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   118
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   119
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   120
  /* append */
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   121
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   122
  def append(file: JFile, bytes: Bytes): Unit =
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   123
    using(new FileOutputStream(file, true))(bytes.write_stream(_))
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   124
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   125
  def append(path: Path, bytes: Bytes): Unit = append(path.file, bytes)
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   126
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   127
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   128
  /* vector of short unsigned integers */
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   129
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   130
  trait Vec {
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   131
    def size: Long
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   132
    def apply(i: Long): Char
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   133
  }
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   134
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   135
  class Vec_String(string: String) extends Vec {
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   136
    override def size: Long = string.length.toLong
80355
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   137
    override def apply(i: Long): Char =
5a555acad203 clarified signature (again);
wenzelm
parents: 80353
diff changeset
   138
      if (0 <= i && i < size) string(i.toInt)
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   139
      else throw new IndexOutOfBoundsException
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   140
  }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   141
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   142
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   143
  /* incremental builder: synchronized */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   144
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   145
  private def make_size(chunks: Array[Array[Byte]], buffer: Array[Byte]): Long =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   146
    chunks.foldLeft(buffer.length.toLong)((n, chunk) => n + chunk.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   147
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   148
  object Builder {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   149
    def use(hint: Long = 0L)(body: Builder => Unit): Bytes = {
80375
wenzelm
parents: 80374
diff changeset
   150
      val builder = new Builder(hint)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   151
      body(builder)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   152
      builder.done()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   153
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   154
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   155
80375
wenzelm
parents: 80374
diff changeset
   156
  final class Builder private[Bytes](hint: Long) {
80374
4f1374f56c0b minor performance tuning;
wenzelm
parents: 80373
diff changeset
   157
    private var size = 0L
80375
wenzelm
parents: 80374
diff changeset
   158
    private var chunks =
wenzelm
parents: 80374
diff changeset
   159
      new ArrayBuffer[Array[Byte]](if (hint <= 0) 16 else (hint / chunk_size).toInt)
wenzelm
parents: 80374
diff changeset
   160
    private var buffer =
wenzelm
parents: 80374
diff changeset
   161
      new ByteArrayOutputStream(if (hint <= 0) 1024 else (hint min chunk_size min array_size).toInt)
80370
wenzelm
parents: 80369
diff changeset
   162
80369
wenzelm
parents: 80368
diff changeset
   163
    private def buffer_free(): Int = chunk_size.toInt - buffer.size()
80370
wenzelm
parents: 80369
diff changeset
   164
    private def buffer_check(): Unit =
wenzelm
parents: 80369
diff changeset
   165
      if (buffer_free() == 0) {
wenzelm
parents: 80369
diff changeset
   166
        chunks += buffer.toByteArray
wenzelm
parents: 80369
diff changeset
   167
        buffer = new ByteArrayOutputStream
wenzelm
parents: 80369
diff changeset
   168
      }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   169
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   170
    def += (s: CharSequence): Unit = {
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   171
      val n = s.length
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   172
      if (n > 0) {
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   173
        if (UTF8.relevant(s)) { this += UTF8.bytes(s.toString) }
80372
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   174
        else {
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   175
          synchronized {
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   176
            for (i <- 0 until n) {
80374
4f1374f56c0b minor performance tuning;
wenzelm
parents: 80373
diff changeset
   177
              size += 1
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   178
              buffer.write(s.charAt(i).toByte)
80372
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   179
              buffer_check()
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   180
            }
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   181
          }
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   182
        }
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   183
      }
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
   184
    }
80372
6e74f0fc8a52 minor performance tuning;
wenzelm
parents: 80370
diff changeset
   185
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   186
    def += (array: Array[Byte], offset: Int, length: Int): Unit = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   187
      if (offset < 0 || length < 0 || offset.toLong + length.toLong > array.length) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   188
        throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   189
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   190
      else if (length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   191
        synchronized {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   192
          var i = offset
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   193
          var n = length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   194
          while (n > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   195
            val m = buffer_free()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   196
            if (m > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   197
              val l = m min n
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   198
              buffer.write(array, i, l)
80374
4f1374f56c0b minor performance tuning;
wenzelm
parents: 80373
diff changeset
   199
              size += l
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   200
              i += l
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   201
              n -= l
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   202
            }
80370
wenzelm
parents: 80369
diff changeset
   203
            buffer_check()
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   204
          }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   205
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   206
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   207
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   208
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   209
    def += (array: Array[Byte]): Unit = { this += (array, 0, array.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   210
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   211
    def += (a: Subarray): Unit = { this += (a.array, a.offset, a.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   212
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   213
    private def done(): Bytes = synchronized {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   214
      val cs = chunks.toArray
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   215
      val b = buffer.toByteArray
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   216
      chunks = null
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   217
      buffer = null
80374
4f1374f56c0b minor performance tuning;
wenzelm
parents: 80373
diff changeset
   218
      new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   219
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   220
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   221
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   222
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   223
  /* subarray */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   224
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   225
  object Subarray {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   226
    val empty: Subarray = new Subarray(new Array[Byte](0), 0, 0)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   227
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   228
    def apply(array: Array[Byte], offset: Int, length: Int): Subarray = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   229
      val n = array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   230
      if (0 <= offset && offset < n && 0 <= length && offset + length <= n) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   231
        if (length == 0) empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   232
        else new Subarray(array, offset, length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   233
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   234
      else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   235
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   236
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   237
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   238
  final class Subarray private(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   239
    val array: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   240
    val offset: Int,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   241
    val length: Int
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   242
  ) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   243
    override def toString: String = "Bytes.Subarray(" + Space.bytes(length).print + ")"
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   244
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   245
    def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   246
      if (length == 0) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   247
      else { for (i <- (offset until (offset + length)).iterator) yield array(i) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   248
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   249
}
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   250
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   251
final class Bytes private(
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   252
  protected val chunks: Option[Array[Array[Byte]]],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   253
  protected val chunk0: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   254
  protected val offset: Long,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   255
  val size: Long
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   256
) extends Bytes.Vec {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   257
  assert(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   258
    (chunks.isEmpty ||
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   259
      chunks.get.nonEmpty &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   260
      chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   261
    chunk0.length < Bytes.chunk_size)
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   262
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   263
  def is_empty: Boolean = size == 0
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   264
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   265
  def is_sliced: Boolean =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   266
    offset != 0L || {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   267
      chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   268
        case None => size != chunk0.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   269
        case Some(cs) => size != Bytes.make_size(cs, chunk0)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   270
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   271
    }
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   272
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   273
  override def toString: String =
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   274
    if (is_empty) "Bytes.empty"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   275
    else "Bytes(" + Space.bytes(size).print + if_proper(is_sliced, ", sliced") + ")"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   276
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   277
  def small_size: Int =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   278
    if (size > Bytes.array_size) throw new Bytes.Too_Large(size)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   279
    else size.toInt
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   280
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   281
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   282
  /* slice */
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   283
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   284
  def slice(i: Long, j: Long): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   285
    if (0 <= i && i <= j && j <= size) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   286
      if (i == j) Bytes.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   287
      else new Bytes(chunks, chunk0, offset + i, j - i)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   288
    }
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   289
    else throw new IndexOutOfBoundsException
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   290
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   291
  def unslice: Bytes =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   292
    if (is_sliced) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   293
      Bytes.Builder.use(hint = size) { builder =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   294
        for (a <- subarray_iterator) { builder += a }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   295
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   296
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   297
    else this
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   298
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   299
  def trim_line: Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   300
    if (size >= 2 && byte_unchecked(size - 2) == 13 && byte_unchecked(size - 1) == 10) {
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   301
      slice(0, size - 2)
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   302
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   303
    else if (size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)) {
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   304
      slice(0, size - 1)
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   305
    }
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   306
    else this
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   307
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   308
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   309
  /* elements: signed Byte or unsigned Char */
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   310
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   311
  protected def byte_unchecked(i: Long): Byte = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   312
    val a = offset + i
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   313
    chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   314
      case None => chunk0(a.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   315
      case Some(cs) =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   316
        val b = a % Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   317
        val c = a / Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   318
        if (c < cs.length) cs(c.toInt)(b.toInt) else chunk0(b.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   319
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   320
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   321
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   322
  def byte(i: Long): Byte =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   323
    if (0 <= i && i < size) byte_unchecked(i)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   324
    else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   325
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   326
  def apply(i: Long): Char = (byte(i).toInt & 0xff).toChar
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   327
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   328
  protected def subarray_iterator: Iterator[Bytes.Subarray] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   329
    if (is_empty) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   330
    else if (chunks.isEmpty) Iterator(Bytes.Subarray(chunk0, offset.toInt, size.toInt))
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   331
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   332
      val end_offset = offset + size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   333
      for ((array, index) <- (chunks.get.iterator ++ Iterator(chunk0)).zipWithIndex) yield {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   334
        val array_start = Bytes.chunk_size * index
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   335
        val array_stop = array_start + array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   336
        if (offset < array_stop && array_start < end_offset) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   337
          val i = (array_start max offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   338
          val j = (array_stop min end_offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   339
          Bytes.Subarray(array, i.toInt, (j - i).toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   340
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   341
        else Bytes.Subarray.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   342
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   343
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   344
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   345
  def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   346
    for {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   347
      a <- subarray_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   348
      b <- a.byte_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   349
    } yield b
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   350
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   351
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   352
  /* hash and equality */
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   353
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   354
  lazy val sha1_digest: SHA1.Digest =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   355
    if (is_empty) SHA1.digest_empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   356
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   357
      SHA1.make_digest { sha =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   358
        for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   359
          sha.update(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   360
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   361
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   362
    }
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   364
  override def hashCode(): Int = sha1_digest.hashCode()
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   365
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   366
  override def equals(that: Any): Boolean = {
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   367
    that match {
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   368
      case other: Bytes =>
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   369
        if (this.eq(other)) true
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   370
        else if (size != other.size) false
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   371
        else if (chunks.isEmpty && size <= 10 * SHA1.digest_length) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   372
          Arrays.equals(chunk0, offset.toInt, (offset + size).toInt,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   373
            other.chunk0, other.offset.toInt, (other.offset + other.size).toInt)
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   374
        }
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   375
        else sha1_digest == other.sha1_digest
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   376
      case _ => false
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   377
    }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   378
  }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   379
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   380
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   381
  /* content */
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   382
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   383
  def make_array: Array[Byte] = {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   384
    val buf = new ByteArrayOutputStream(small_size)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   385
    for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   386
    buf.toByteArray
69365
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   387
  }
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   388
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   389
  def text: String =
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   390
    if (is_empty) ""
80362
d395b7e14102 clarified signature;
wenzelm
parents: 80361
diff changeset
   391
    else if (byte_iterator.forall(_ >= 0)) {
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   392
      new String(make_array, UTF8.charset)
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   393
    }
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   394
    else UTF8.decode_permissive_bytes(this)
65279
fa62e095d8f1 clarified signature (again, see also 3ed43cfc8b14);
wenzelm
parents: 65070
diff changeset
   395
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   396
  def wellformed_text: Option[String] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   397
    try {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   398
      val s = text
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   399
      if (this == Bytes(s)) Some(s) else None
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   400
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   401
    catch { case ERROR(_) => None }
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   402
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   403
  def encode_base64: String = Base64.encode(make_array)
68094
0b66aca9c965 more operations;
wenzelm
parents: 68087
diff changeset
   404
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   405
  def maybe_encode_base64: (Boolean, String) =
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   406
    wellformed_text match {
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   407
      case Some(s) => (false, s)
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   408
      case None => (true, encode_base64)
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   409
    }
68106
a514e29db980 return exports as result for Isabelle server;
wenzelm
parents: 68094
diff changeset
   410
72885
1b0f81e556a2 accommodate OpenJDK 15;
wenzelm
parents: 71152
diff changeset
   411
  def proper: Option[Bytes] = if (is_empty) None else Some(this)
1b0f81e556a2 accommodate OpenJDK 15;
wenzelm
parents: 71152
diff changeset
   412
  def proper_text: Option[String] = if (is_empty) None else Some(text)
65630
c41bbf657310 more operations;
wenzelm
parents: 65279
diff changeset
   413
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   414
  def +(other: Bytes): Bytes =
72885
1b0f81e556a2 accommodate OpenJDK 15;
wenzelm
parents: 71152
diff changeset
   415
    if (other.is_empty) this
1b0f81e556a2 accommodate OpenJDK 15;
wenzelm
parents: 71152
diff changeset
   416
    else if (is_empty) other
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   417
    else {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   418
      Bytes.Builder.use(hint = size + other.size) { builder =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   419
        for (a <- subarray_iterator ++ other.subarray_iterator) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   420
          builder += a
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   421
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   422
      }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   423
    }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   424
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   425
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   426
  /* streams */
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   427
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   428
  def stream(): InputStream =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   429
    if (chunks.isEmpty) new ByteArrayInputStream(chunk0, offset.toInt, size.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   430
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   431
      new InputStream {
80367
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   432
        private var index = 0L
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   433
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   434
        def read(): Int = {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   435
          if (index < size) {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   436
            val res = byte_unchecked(index).toInt & 0xff
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   437
            index += 1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   438
            res
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   439
          }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   440
          else -1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   441
        }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   442
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   443
        override def readAllBytes(): Array[Byte] = make_array
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   444
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   445
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   446
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   447
  def write_stream(stream: OutputStream): Unit =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   448
    for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   449
      stream.write(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   450
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   451
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   452
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   453
  /* XZ / Zstd data compression */
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   454
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   455
  def detect_xz: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   456
    size >= 6 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   457
      byte_unchecked(0) == 0xFD.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   458
      byte_unchecked(1) == 0x37.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   459
      byte_unchecked(2) == 0x7A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   460
      byte_unchecked(3) == 0x58.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   461
      byte_unchecked(4) == 0x5A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   462
      byte_unchecked(5) == 0x00.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   463
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   464
  def detect_zstd: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   465
    size >= 4 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   466
      byte_unchecked(0) == 0x28.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   467
      byte_unchecked(1) == 0xB5.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   468
      byte_unchecked(2) == 0x2F.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   469
      byte_unchecked(3) == 0xFD.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   470
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   471
  def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   472
    using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   473
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   474
  def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   475
    Zstd.init()
80376
201af0b45e57 tuned: more uniform, less ambitious;
wenzelm
parents: 80375
diff changeset
   476
    using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   477
  }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   478
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   479
  def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes =
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   480
    if (detect_xz) uncompress_xz(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   481
    else if (detect_zstd) uncompress_zstd(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   482
    else error("Cannot detect compression scheme")
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   483
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   484
  def compress(
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   485
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   486
    cache: Compress.Cache = Compress.Cache.none
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   487
  ): Bytes = {
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   488
    options match {
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   489
      case options_xz: Compress.Options_XZ =>
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   490
        val out = new ByteArrayOutputStream((size min Bytes.array_size).toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   491
        using(new xz.XZOutputStream(out, options_xz.make, cache.for_xz))(write_stream)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   492
        Bytes(out.toByteArray)
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   493
      case options_zstd: Compress.Options_Zstd =>
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   494
        Zstd.init()
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   495
        val inp = if (chunks.isEmpty && !is_sliced) chunk0 else make_array
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   496
        Bytes(zstd.Zstd.compress(inp, options_zstd.level))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   497
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   498
  }
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   499
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   500
  def maybe_compress(
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   501
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   502
    cache: Compress.Cache = Compress.Cache.none
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   503
  ) : (Boolean, Bytes) = {
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   504
    val compressed = compress(options = options, cache = cache)
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   505
    if (compressed.size < size) (true, compressed) else (false, this)
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   506
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   507
}