src/Pure/General/bytes.scala
author wenzelm
Sun, 27 Oct 2024 11:02:21 +0100
changeset 81266 8300511f4c45
parent 80514 482897a69699
permissions -rw-r--r--
clarified signature;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/bytes.scala
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     3
80503
d59cc10a6888 tuned comments;
wenzelm
parents: 80494
diff changeset
     4
Scalable byte strings, with incremental construction (via Builder).
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     5
*/
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     6
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     7
package isabelle
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     8
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
     9
78855
6fdcd6c8c97a prefer old-style import "=>";
wenzelm
parents: 78243
diff changeset
    10
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
80508
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
    11
  InputStreamReader, InputStream, OutputStream, File => JFile}
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    12
import java.nio.ByteBuffer
80492
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
    13
import java.nio.charset.StandardCharsets.ISO_8859_1
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    14
import java.nio.channels.FileChannel
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    15
import java.nio.file.StandardOpenOption
77712
dd4bb80dbc3a tuned performance: much faster low-level operation;
wenzelm
parents: 77711
diff changeset
    16
import java.util.Arrays
76353
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    17
import org.tukaani.xz
3698d0f3da18 clarified signature;
wenzelm
parents: 76351
diff changeset
    18
import com.github.luben.zstd
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    19
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    20
import scala.collection.mutable.ArrayBuffer
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
    21
import scala.collection.mutable
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    22
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    23
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
    24
object Bytes {
80514
482897a69699 tuned signature: expose internal limits for testing or add-on implementations;
wenzelm
parents: 80513
diff changeset
    25
  /* internal limits */
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    26
80514
482897a69699 tuned signature: expose internal limits for testing or add-on implementations;
wenzelm
parents: 80513
diff changeset
    27
  val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
482897a69699 tuned signature: expose internal limits for testing or add-on implementations;
wenzelm
parents: 80513
diff changeset
    28
  val string_size: Long = Int.MaxValue / 2
482897a69699 tuned signature: expose internal limits for testing or add-on implementations;
wenzelm
parents: 80513
diff changeset
    29
  val block_size: Int = 16384  // see java.io.InputStream.DEFAULT_BUFFER_SIZE
482897a69699 tuned signature: expose internal limits for testing or add-on implementations;
wenzelm
parents: 80513
diff changeset
    30
  val chunk_size: Long = Space.MiB(100).bytes
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
    31
80509
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
    32
  class Too_Large(size: Long, limit: Long) extends IndexOutOfBoundsException {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    33
    override def getMessage: String =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    34
      "Bytes too large for particular operation: " +
80509
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
    35
        Space.bytes(size).print + " > " + Space.bytes(limit).print
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    36
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    37
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
    38
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    39
  /* main constructors */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    40
80512
wenzelm
parents: 80511
diff changeset
    41
  val empty: Bytes = new Bytes(None, new Array(0), 0L, 0L)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    42
80492
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
    43
  def raw(s: String): Bytes =
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
    44
    if (s.isEmpty) empty
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
    45
    else Builder.use(hint = s.length) { builder => builder += s.getBytes(ISO_8859_1) }
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
    46
80491
b439582efc8a clarified signature;
wenzelm
parents: 80485
diff changeset
    47
  def apply(s: String): Bytes =
80373
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    48
    if (s.isEmpty) empty
dc220d47b85e minor performance tuning;
wenzelm
parents: 80372
diff changeset
    49
    else Builder.use(hint = s.length) { builder => builder += s }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    50
63779
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    51
  def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length)
9da65bc75610 more operations;
wenzelm
parents: 62527
diff changeset
    52
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    53
  def apply(a: Array[Byte], offset: Int, length: Int): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    54
    Builder.use(hint = length) { builder => builder += (a, offset, length) }
54442
c39972ddd672 more specific Protocol_Output: empty message.body, main content via bytes/text;
wenzelm
parents: 54440
diff changeset
    55
69454
ef051edd4d10 more uniform multi-language operations;
wenzelm
parents: 69448
diff changeset
    56
  val newline: Bytes = apply("\n")
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    57
75579
3362b6a5d697 support XZ compression in Isabelle/ML;
wenzelm
parents: 75393
diff changeset
    58
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    59
  /* read */
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    60
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    61
  def read_stream(stream: InputStream, limit: Long = -1L, hint: Long = 0L): Bytes = {
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    62
    if (limit == 0) empty
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
    63
    else {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    64
      Builder.use(hint = if (limit > 0) limit else hint) { builder =>
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    65
        val buf_size = block_size
80384
061d672570f5 tuned names;
wenzelm
parents: 80383
diff changeset
    66
        val buf = new Array[Byte](block_size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    67
        var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    68
        var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    69
        while ({
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    70
          val l = if (limit > 0) (limit - n).min(buf_size).toInt else buf_size
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    71
          m = stream.read(buf, 0, l)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    72
          if (m > 0) {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    73
            builder += (buf, 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    74
            n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    75
          }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    76
          m != -1 && (limit < 0 || limit > n)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    77
        }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    78
      }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
    79
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    80
  }
64001
7ecb22be8f03 more general read_stream: return actual byte count;
wenzelm
parents: 63779
diff changeset
    81
79510
d8330439823a clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents: 79509
diff changeset
    82
  def read_url(name: String): Bytes = using(Url(name).open_stream())(read_stream(_))
77717
6a2daddc238c tuned signature;
wenzelm
parents: 77716
diff changeset
    83
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    84
  def read_file(path: Path, offset: Long = 0L, limit: Long = -1L): Bytes = {
78956
12abaffb0346 tuned signature: more operations;
wenzelm
parents: 78954
diff changeset
    85
    val length = File.size(path)
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    86
    val start = offset.max(0L)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    87
    val len = (length - start).max(0L).min(if (limit < 0) Long.MaxValue else limit)
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    88
    if (len == 0L) empty
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
    89
    else {
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    90
      Builder.use(hint = len) { builder =>
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    91
        using(FileChannel.open(path.java_path, StandardOpenOption.READ)) { channel =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    92
          channel.position(start)
80473
09afc244bb5e proper limit for read operation (amending ac4d53bc8f6b);
wenzelm
parents: 80428
diff changeset
    93
          val buf_size = block_size
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    94
          val buf = ByteBuffer.allocate(buf_size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    95
          var m = 0
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    96
          var n = 0L
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
    97
          while ({
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    98
            val l = (len - n).min(buf_size).toInt
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
    99
            buf.limit(l)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   100
            m = channel.read(buf)
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
   101
            if (m > 0) {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   102
              builder += (buf.array(), 0, m)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   103
              buf.clear()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   104
              n += m
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   105
            }
80474
66ebeae3acdc misc tuning: more uniform read_stream vs. read_file;
wenzelm
parents: 80473
diff changeset
   106
            m != -1 && len > n
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   107
          }) ()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   108
        }
77711
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   109
      }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   110
    }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   111
  }
25fd62cba347 clarified signature: more general operation Bytes.read_slice;
wenzelm
parents: 76361
diff changeset
   112
78953
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   113
  def read(path: Path): Bytes = read_file(path)
b6116a86d2ac clarified signature;
wenzelm
parents: 78855
diff changeset
   114
  def read(file: JFile): Bytes = read_file(File.path(file))
77718
6ad3a412ed97 clarified signature;
wenzelm
parents: 77717
diff changeset
   115
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   116
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   117
  /* write */
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   118
69393
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   119
  def write(file: JFile, bytes: Bytes): Unit =
ed0824ef337e static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
wenzelm
parents: 69365
diff changeset
   120
    using(new FileOutputStream(file))(bytes.write_stream(_))
64229
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   121
12aa3980f65c more operations;
wenzelm
parents: 64224
diff changeset
   122
  def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes)
78194
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   123
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   124
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   125
  /* append */
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   126
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   127
  def append(file: JFile, bytes: Bytes): Unit =
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   128
    using(new FileOutputStream(file, true))(bytes.write_stream(_))
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   129
da721ba809a4 more operations;
wenzelm
parents: 77718
diff changeset
   130
  def append(path: Path, bytes: Bytes): Unit = append(path.file, bytes)
80351
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   131
dbbe26afc319 clarified signature: more accurate types;
wenzelm
parents: 80350
diff changeset
   132
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   133
  /* incremental builder: unsynchronized! */
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   134
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   135
  object Builder {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   136
    def use(hint: Long = 0L)(body: Builder => Unit): Bytes = {
80375
wenzelm
parents: 80374
diff changeset
   137
      val builder = new Builder(hint)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   138
      body(builder)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   139
      builder.done()
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   140
    }
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   141
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   142
    def use_stream(hint: Long = 0L)(body: OutputStream => Unit): Bytes = {
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   143
      val stream = new Stream(hint = hint)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   144
      body(stream)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   145
      stream.builder.done()
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   146
    }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   147
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   148
    private class Stream(hint: Long = 0L) extends OutputStream {
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   149
      val builder = new Builder(hint)
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   150
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   151
      override def write(b: Int): Unit =
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   152
        { builder += b.toByte }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   153
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   154
      override def write(array: Array[Byte], offset: Int, length: Int): Unit =
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   155
        { builder += (array, offset, length) }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   156
    }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   157
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   158
80375
wenzelm
parents: 80374
diff changeset
   159
  final class Builder private[Bytes](hint: Long) {
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   160
    builder =>
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   161
80375
wenzelm
parents: 80374
diff changeset
   162
    private var chunks =
wenzelm
parents: 80374
diff changeset
   163
      new ArrayBuffer[Array[Byte]](if (hint <= 0) 16 else (hint / chunk_size).toInt)
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   164
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   165
    private var buffer_list: mutable.ListBuffer[Array[Byte]] = null
80375
wenzelm
parents: 80374
diff changeset
   166
    private var buffer =
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   167
      new Array[Byte](if (hint <= 0) 1024 else (hint min chunk_size min array_size).toInt)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   168
    private var buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   169
    private var buffer_total = 0
80370
wenzelm
parents: 80369
diff changeset
   170
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   171
    private def buffer_content(): Array[Byte] =
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   172
      if (buffer_list != null) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   173
        val array = new Array[Byte](buffer_total)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   174
        var i = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   175
        for (b <- buffer_list) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   176
          val n = b.length
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   177
          System.arraycopy(b, 0, array, i, n)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   178
          i += n
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   179
        }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   180
        System.arraycopy(buffer, 0, array, i, buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   181
        array
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   182
      }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   183
      else if (buffer_index == buffer.length) buffer else Arrays.copyOf(buffer, buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   184
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   185
    private def buffer_check(request: Int = 0): Unit =
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   186
      if (buffer_index == buffer.length) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   187
        if (buffer_total == chunk_size) {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   188
          chunks += buffer_content()
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   189
          buffer_list = null
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   190
          buffer = new Array[Byte](chunk_size.toInt)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   191
          buffer_total = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   192
          buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   193
        }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   194
        else {
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   195
          if (buffer_list == null) { buffer_list = new mutable.ListBuffer }
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   196
          buffer_list += buffer
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   197
          buffer_index = 0
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   198
          val limit = (chunk_size - buffer_total).toInt
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   199
          buffer = new Array[Byte]((buffer_total max request) min limit)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   200
        }
80370
wenzelm
parents: 80369
diff changeset
   201
      }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   202
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   203
    def += (b: Byte): Unit = {
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   204
      buffer(buffer_index) = b
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   205
      buffer_total += 1
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   206
      buffer_index += 1
80377
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   207
      buffer_check()
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   208
    }
28dd9b91dfe5 more scalable compression, using Bytes.Builder.Stream;
wenzelm
parents: 80376
diff changeset
   209
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   210
    def += (array: Array[Byte], offset: Int, length: Int): Unit = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   211
      if (offset < 0 || length < 0 || offset.toLong + length.toLong > array.length) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   212
        throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   213
      }
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   214
      else {
80380
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   215
        var i = offset
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   216
        var n = length
94d903234f6b Bytes.Builder is unsynchronized, like java.io.OutputBuffer;
wenzelm
parents: 80379
diff changeset
   217
        while (n > 0) {
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   218
          val l = n min (buffer.length - buffer_index)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   219
          System.arraycopy(array, i, buffer, buffer_index, l)
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   220
          buffer_total += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   221
          buffer_index += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   222
          i += l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   223
          n -= l
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   224
          buffer_check(request = n)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   225
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   226
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   227
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   228
80491
b439582efc8a clarified signature;
wenzelm
parents: 80485
diff changeset
   229
    def += (s: String): Unit =
b439582efc8a clarified signature;
wenzelm
parents: 80485
diff changeset
   230
      if (s.length > 0) { builder += UTF8.bytes(s) }
80428
5fe811816fa3 minor performance tuning;
wenzelm
parents: 80426
diff changeset
   231
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   232
    def += (array: Array[Byte]): Unit = { builder += (array, 0, array.length) }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   233
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   234
    def += (a: Subarray): Unit = { builder += (a.array, a.offset, a.length) }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   235
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   236
    private def done(): Bytes = {
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   237
      val cs = chunks.toArray
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   238
      val b = buffer_content()
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   239
      val size = cs.foldLeft(b.length.toLong)({ case (n, a) => n + a.length })
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   240
      chunks = null
80426
7d2922f0ae2b performance tuning: multi-stage buffer with fewer array copies;
wenzelm
parents: 80394
diff changeset
   241
      buffer_list = null
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   242
      buffer = null
80513
wenzelm
parents: 80512
diff changeset
   243
      if (size == 0) empty
wenzelm
parents: 80512
diff changeset
   244
      else new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   245
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   246
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   247
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   248
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   249
  /* subarray */
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   250
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   251
  object Subarray {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   252
    val empty: Subarray = new Subarray(new Array[Byte](0), 0, 0)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   253
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   254
    def apply(array: Array[Byte], offset: Int, length: Int): Subarray = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   255
      val n = array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   256
      if (0 <= offset && offset < n && 0 <= length && offset + length <= n) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   257
        if (length == 0) empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   258
        else new Subarray(array, offset, length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   259
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   260
      else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   261
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   262
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   263
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   264
  final class Subarray private(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   265
    val array: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   266
    val offset: Int,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   267
    val length: Int
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   268
  ) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   269
    override def toString: String = "Bytes.Subarray(" + Space.bytes(length).print + ")"
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   270
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   271
    def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   272
      if (length == 0) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   273
      else { for (i <- (offset until (offset + length)).iterator) yield array(i) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   274
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   275
}
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   276
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   277
final class Bytes private(
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   278
  protected val chunks: Option[Array[Array[Byte]]],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   279
  protected val chunk0: Array[Byte],
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   280
  protected val offset: Long,
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   281
  val size: Long
80492
43323d886ea3 clarified signature: more direct Bytes.raw and subsequent UTF-8 default decoding;
wenzelm
parents: 80491
diff changeset
   282
) extends YXML.Source {
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   283
  bytes =>
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   284
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   285
  assert(
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   286
    (chunks.isEmpty ||
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   287
      chunks.get.nonEmpty &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   288
      chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   289
    chunk0.length < Bytes.chunk_size)
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   290
80480
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   291
  override def is_empty: Boolean = size == 0
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   292
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   293
  def proper: Option[Bytes] = if (is_empty) None else Some(bytes)
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   294
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   295
  def is_sliced: Boolean =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   296
    offset != 0L || {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   297
      chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   298
        case None => size != chunk0.length
80511
wenzelm
parents: 80509
diff changeset
   299
        case Some(cs) =>
wenzelm
parents: 80509
diff changeset
   300
          val physical_size = cs.foldLeft(chunk0.length.toLong)((n, c) => n + c.length)
wenzelm
parents: 80509
diff changeset
   301
          size != physical_size
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   302
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   303
    }
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   304
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   305
  override def toString: String =
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   306
    if (is_empty) "Bytes.empty"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   307
    else "Bytes(" + Space.bytes(size).print + if_proper(is_sliced, ", sliced") + ")"
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   308
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   309
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   310
  /* elements: signed Byte or unsigned Char */
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   311
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   312
  protected def byte_unchecked(i: Long): Byte = {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   313
    val a = offset + i
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   314
    chunks match {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   315
      case None => chunk0(a.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   316
      case Some(cs) =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   317
        val b = a % Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   318
        val c = a / Bytes.chunk_size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   319
        if (c < cs.length) cs(c.toInt)(b.toInt) else chunk0(b.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   320
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   321
  }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   322
80494
d1240adc30ce clarified signature, following 43323d886ea3;
wenzelm
parents: 80493
diff changeset
   323
  // signed byte
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   324
  def byte(i: Long): Byte =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   325
    if (0 <= i && i < size) byte_unchecked(i)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   326
    else throw new IndexOutOfBoundsException
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   327
80494
d1240adc30ce clarified signature, following 43323d886ea3;
wenzelm
parents: 80493
diff changeset
   328
  // unsigned char
d1240adc30ce clarified signature, following 43323d886ea3;
wenzelm
parents: 80493
diff changeset
   329
  def char(i: Long): Char = (byte(i).toInt & 0xff).toChar
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   330
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   331
  protected def subarray_iterator: Iterator[Bytes.Subarray] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   332
    if (is_empty) Iterator.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   333
    else if (chunks.isEmpty) Iterator(Bytes.Subarray(chunk0, offset.toInt, size.toInt))
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   334
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   335
      val end_offset = offset + size
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   336
      for ((array, index) <- (chunks.get.iterator ++ Iterator(chunk0)).zipWithIndex) yield {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   337
        val array_start = Bytes.chunk_size * index
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   338
        val array_stop = array_start + array.length
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   339
        if (offset < array_stop && array_start < end_offset) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   340
          val i = (array_start max offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   341
          val j = (array_stop min end_offset) - array_start
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   342
          Bytes.Subarray(array, i.toInt, (j - i).toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   343
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   344
        else Bytes.Subarray.empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   345
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   346
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   347
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   348
  def byte_iterator: Iterator[Byte] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   349
    for {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   350
      a <- subarray_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   351
      b <- a.byte_iterator
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   352
    } yield b
80361
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   353
54a83e8e447b tuned source structure;
wenzelm
parents: 80360
diff changeset
   354
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   355
  /* slice */
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   356
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   357
  def slice(i: Long, j: Long): Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   358
    if (0 <= i && i <= j && j <= size) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   359
      if (i == j) Bytes.empty
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   360
      else new Bytes(chunks, chunk0, offset + i, j - i)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   361
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   362
    else throw new IndexOutOfBoundsException
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   363
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   364
  def unslice: Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   365
    if (is_sliced) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   366
      Bytes.Builder.use(hint = size) { builder =>
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   367
        for (a <- subarray_iterator) { builder += a }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   368
      }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   369
    }
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   370
    else bytes
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   371
80493
d334f158442b clarified signature: more direct operation;
wenzelm
parents: 80492
diff changeset
   372
  def terminated_line: Boolean =
d334f158442b clarified signature: more direct operation;
wenzelm
parents: 80492
diff changeset
   373
    size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)
d334f158442b clarified signature: more direct operation;
wenzelm
parents: 80492
diff changeset
   374
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   375
  def trim_line: Bytes =
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   376
    if (size >= 2 && byte_unchecked(size - 2) == 13 && byte_unchecked(size - 1) == 10) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   377
      slice(0, size - 2)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   378
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   379
    else if (size >= 1 && (byte_unchecked(size - 1) == 13 || byte_unchecked(size - 1) == 10)) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   380
      slice(0, size - 1)
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   381
    }
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   382
    else bytes
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   383
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   384
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   385
  /* separated chunks */
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   386
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   387
  def separated_chunks(sep: Byte): Iterator[Bytes] =
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   388
    new Iterator[Bytes] {
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   389
      private var start: Long = -1
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   390
      private var stop: Long = -1
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   391
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   392
      private def end(i: Long): Long = {
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   393
        var j = i
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   394
        while (j < bytes.size && byte_unchecked(j) != sep) { j += 1 }
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   395
        j
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   396
      }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   397
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   398
      // init
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   399
      if (!bytes.is_empty) { start = 0; stop = end(0) }
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   400
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   401
      def hasNext: Boolean =
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   402
        0 <= start && start <= stop && stop <= bytes.size
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   403
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   404
      def next(): Bytes =
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   405
        if (hasNext) {
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   406
          val chunk = bytes.slice(start, stop)
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   407
          if (stop < bytes.size) { start = stop + 1; stop = end(start) }
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   408
          else { start = -1; stop = -1 }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   409
          chunk
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   410
        }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   411
        else throw new NoSuchElementException
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   412
    }
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   413
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   414
  def space_explode(sep: Byte): List[Bytes] = separated_chunks(sep).toList
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   415
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   416
  def split_lines: List[Bytes] = space_explode(10)
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   417
80480
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   418
  // YXML.Source operations
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   419
  override def is_X: Boolean = size == 1 && byte_unchecked(0) == YXML.X_byte
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   420
  override def is_Y: Boolean = size == 1 && byte_unchecked(0) == YXML.Y_byte
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   421
  override def iterator_X: Iterator[Bytes] = separated_chunks(YXML.X_byte)
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   422
  override def iterator_Y: Iterator[Bytes] = separated_chunks(YXML.Y_byte)
972f7a4cdc0e clarified YXML.Source: more direct support for String and Bytes, instead of CharSequence;
wenzelm
parents: 80479
diff changeset
   423
80479
762fcf8f9ced more specialized operations;
wenzelm
parents: 80474
diff changeset
   424
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   425
  /* hash and equality */
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   426
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   427
  lazy val sha1_digest: SHA1.Digest =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   428
    if (is_empty) SHA1.digest_empty
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   429
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   430
      SHA1.make_digest { sha =>
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   431
        for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   432
          sha.update(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   433
        }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   434
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   435
    }
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   436
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   437
  override def hashCode(): Int = sha1_digest.hashCode()
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   438
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   439
  override def equals(that: Any): Boolean = {
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   440
    that match {
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   441
      case other: Bytes =>
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   442
        if (bytes.eq(other)) true
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   443
        else if (size != other.size) false
80378
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   444
        else {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   445
          if (chunks.isEmpty && other.chunks.isEmpty) {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   446
            Arrays.equals(chunk0, offset.toInt, (offset + size).toInt,
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   447
              other.chunk0, other.offset.toInt, (other.offset + other.size).toInt)
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   448
          }
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   449
          else if (!is_sliced && !other.is_sliced) {
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   450
            (subarray_iterator zip other.subarray_iterator)
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   451
              .forall((a, b) => Arrays.equals(a.array, b.array))
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   452
          }
ab4badc7db7f more efficient equals: avoid somewhat slow sha1_digest (see also 29b761e290c5, 306f273c91ec);
wenzelm
parents: 80377
diff changeset
   453
          else sha1_digest == other.sha1_digest
80363
306f273c91ec clarified hash and equality: depend on sha1 digest to be collision-free;
wenzelm
parents: 80362
diff changeset
   454
        }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   455
      case _ => false
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   456
    }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   457
  }
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   458
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   459
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   460
  /* content */
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   461
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   462
  def + (other: Bytes): Bytes =
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   463
    if (other.is_empty) bytes
80385
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   464
    else if (is_empty) other
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   465
    else {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   466
      Bytes.Builder.use(hint = size + other.size) { builder =>
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   467
        for (a <- subarray_iterator ++ other.subarray_iterator) {
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   468
          builder += a
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   469
        }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   470
      }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   471
    }
605e19319343 tuned module structure;
wenzelm
parents: 80384
diff changeset
   472
80368
9db395953106 clarified signature;
wenzelm
parents: 80367
diff changeset
   473
  def make_array: Array[Byte] = {
80509
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
   474
    val n =
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
   475
      if (size <= Bytes.array_size) size.toInt
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
   476
      else throw new Bytes.Too_Large(size, Bytes.array_size)
2a9abd6a164e clarified signature;
wenzelm
parents: 80508
diff changeset
   477
    val buf = new ByteArrayOutputStream(n)
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   478
    for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   479
    buf.toByteArray
69365
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   480
  }
c5b3860d29ef avoid loading of font file, to eliminate "Illegal reflective access by com.lowagie.text.pdf.MappedRandomAccessFile$1 (iText-2.1.5.jar) to method java.nio.DirectByteBuffer.cleaner()" -- due to com.lowagie.text.pdf.TrueTypeFont.process() / RandomAccessFileOrArray;
wenzelm
parents: 68167
diff changeset
   481
80508
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   482
  def text: String =
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   483
    if (is_empty) ""
80379
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   484
    else {
80508
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   485
      val reader = new InputStreamReader(stream(), UTF8.charset)
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   486
      val buf = new Array[Char]((size min Bytes.string_size).toInt + 1)
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   487
      var m = 0
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   488
      var n = 0
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   489
      while (m >= 0 && n < buf.length) {
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   490
        m = reader.read(buf, n, (buf.length - n) min Bytes.block_size)
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   491
        if (m > 0) { n += m }
80379
1f1ce661c71c notable performance tuning: avoid overhead of higher-order functions;
wenzelm
parents: 80378
diff changeset
   492
      }
80508
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   493
      require(m == -1, "Malformed UTF-8 string: overlong result")
8585399f26f6 prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
wenzelm
parents: 80503
diff changeset
   494
      new String(buf, 0, n)
80352
7a6cba7c77c9 minor performance tuning;
wenzelm
parents: 80351
diff changeset
   495
    }
65279
fa62e095d8f1 clarified signature (again, see also 3ed43cfc8b14);
wenzelm
parents: 65070
diff changeset
   496
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   497
  def wellformed_text: Option[String] =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   498
    try {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   499
      val s = text
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   500
      if (bytes == Bytes(s)) Some(s) else None
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   501
    }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   502
    catch { case ERROR(_) => None }
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   503
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   504
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   505
  /* Base64 data representation */
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   506
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   507
  def encode_base64: Bytes =
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   508
    Bytes.Builder.use_stream(hint = (size * 1.5).round) { out =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   509
      using(Base64.encode_stream(out))(write_stream(_))
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   510
    }
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   511
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   512
  def decode_base64: Bytes =
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   513
    using(Base64.decode_stream(stream()))(Bytes.read_stream(_, hint = (size / 1.2).round))
68094
0b66aca9c965 more operations;
wenzelm
parents: 68087
diff changeset
   514
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   515
  def maybe_encode_base64: (Boolean, String) =
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   516
    wellformed_text match {
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   517
      case Some(s) => (false, s)
80393
6138c5b803be Base64: proper support for large Bytes, with subtle change of types (Bytes instead of String);
wenzelm
parents: 80389
diff changeset
   518
      case None => (true, encode_base64.text)
76236
03dd2f19f1d7 clarified signature: more operations;
wenzelm
parents: 75709
diff changeset
   519
    }
68106
a514e29db980 return exports as result for Isabelle server;
wenzelm
parents: 68094
diff changeset
   520
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   521
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   522
  /* streams */
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   523
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   524
  def stream(): InputStream =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   525
    if (chunks.isEmpty) new ByteArrayInputStream(chunk0, offset.toInt, size.toInt)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   526
    else {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   527
      new InputStream {
80367
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   528
        private var index = 0L
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   529
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   530
        def read(): Int = {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   531
          if (index < size) {
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   532
            val res = byte_unchecked(index).toInt & 0xff
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   533
            index += 1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   534
            res
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   535
          }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   536
          else -1
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   537
        }
a6c1526600b3 minor performance tuning;
wenzelm
parents: 80366
diff changeset
   538
80388
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   539
        override def read(buffer: Array[Byte], start: Int, length: Int): Int = {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   540
          if (length < 16) super.read(buffer, start, length)
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   541
          else {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   542
            val index0 = index
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   543
            index = (index + length) min size
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   544
            val n = (index - index0).toInt
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   545
            if (n == 0) -1
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   546
            else {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   547
              var i = start
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   548
              for (a <- slice(index0, index).subarray_iterator) {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   549
                val l = a.length
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   550
                if (l > 0) {
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   551
                  System.arraycopy(a.array, a.offset, buffer, i, l)
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   552
                  i += l
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   553
                }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   554
              }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   555
              n
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   556
            }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   557
          }
52f71e3816d8 more scalable stream read operations;
wenzelm
parents: 80387
diff changeset
   558
        }
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   559
      }
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   560
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   561
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   562
  def write_stream(stream: OutputStream): Unit =
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   563
    for (a <- subarray_iterator if a.length > 0) {
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   564
      stream.write(a.array, a.offset, a.length)
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   565
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   566
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   567
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   568
  /* XZ / Zstd data compression */
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   569
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   570
  def detect_xz: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   571
    size >= 6 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   572
      byte_unchecked(0) == 0xFD.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   573
      byte_unchecked(1) == 0x37.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   574
      byte_unchecked(2) == 0x7A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   575
      byte_unchecked(3) == 0x58.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   576
      byte_unchecked(4) == 0x5A.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   577
      byte_unchecked(5) == 0x00.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   578
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   579
  def detect_zstd: Boolean =
80357
fe123d033e76 clarified signature: pro-forma support for Bytes with size: Long;
wenzelm
parents: 80356
diff changeset
   580
    size >= 4 &&
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   581
      byte_unchecked(0) == 0x28.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   582
      byte_unchecked(1) == 0xB5.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   583
      byte_unchecked(2) == 0x2F.toByte &&
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   584
      byte_unchecked(3) == 0xFD.toByte
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   585
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   586
  def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes =
80366
ac4d53bc8f6b support large byte arrays, using multiple "chunks";
wenzelm
parents: 80364
diff changeset
   587
    using(new xz.XZInputStream(stream(), cache.for_xz))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   588
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   589
  def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes = {
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   590
    Zstd.init()
80376
201af0b45e57 tuned: more uniform, less ambitious;
wenzelm
parents: 80375
diff changeset
   591
    using(new zstd.ZstdInputStream(stream(), cache.for_zstd))(Bytes.read_stream(_, hint = size))
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   592
  }
54440
2c4940d2edf7 tuned signature;
wenzelm
parents: 54439
diff changeset
   593
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   594
  def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes =
76358
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   595
    if (detect_xz) uncompress_xz(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   596
    else if (detect_zstd) uncompress_zstd(cache = cache)
cff0828c374f clarified signature;
wenzelm
parents: 76353
diff changeset
   597
    else error("Cannot detect compression scheme")
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   598
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   599
  def compress(
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   600
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   601
    cache: Compress.Cache = Compress.Cache.none
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   602
  ): Bytes = {
80394
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   603
    Bytes.Builder.use_stream(hint = size) { out =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   604
      using(
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   605
        options match {
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   606
          case options_xz: Compress.Options_XZ =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   607
            new xz.XZOutputStream(out, options_xz.make, cache.for_xz)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   608
          case options_zstd: Compress.Options_Zstd =>
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   609
            new zstd.ZstdOutputStream(out, cache.for_zstd, options_zstd.level)
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   610
        }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   611
      ) { s => for (a <- subarray_iterator) s.write(a.array, a.offset, a.length) }
348e10664e0f clarified signature;
wenzelm
parents: 80393
diff changeset
   612
    }
64004
b4ece7a3f2ca clarified stream operations;
wenzelm
parents: 64001
diff changeset
   613
  }
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   614
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   615
  def maybe_compress(
76351
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   616
    options: Compress.Options = Compress.Options(),
2cee31cd92f0 generic support for XZ and Zstd compression in Isabelle/Scala;
wenzelm
parents: 76350
diff changeset
   617
    cache: Compress.Cache = Compress.Cache.none
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 75382
diff changeset
   618
  ) : (Boolean, Bytes) = {
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   619
    val compressed = compress(options = options, cache = cache)
80485
71aaa7e65d20 tuned signature;
wenzelm
parents: 80480
diff changeset
   620
    if (compressed.size < size) (true, compressed) else (false, bytes)
68167
327bb0f5f768 clarified implicit compression;
wenzelm
parents: 68150
diff changeset
   621
  }
54439
621a155c7715 immutable byte vectors versus UTF8 strings;
wenzelm
parents:
diff changeset
   622
}