src/Pure/General/bytes.scala
author wenzelm
Fri May 11 20:05:37 2018 +0200 (13 months ago)
changeset 68150 f0f34cbed539
parent 68149 9a4a6adb95b5
child 68167 327bb0f5f768
permissions -rw-r--r--
clarified output: avoid costly operations on huge blobs;
wenzelm@54439
     1
/*  Title:      Pure/General/bytes.scala
wenzelm@54439
     2
    Author:     Makarius
wenzelm@54439
     3
wenzelm@54439
     4
Immutable byte vectors versus UTF8 strings.
wenzelm@54439
     5
*/
wenzelm@54439
     6
wenzelm@54439
     7
package isabelle
wenzelm@54439
     8
wenzelm@54439
     9
wenzelm@64004
    10
import java.io.{File => JFile, ByteArrayOutputStream, ByteArrayInputStream,
wenzelm@64229
    11
  OutputStream, InputStream, FileInputStream, FileOutputStream}
wenzelm@65070
    12
import java.net.URL
wenzelm@68094
    13
import java.util.Base64
wenzelm@64004
    14
wenzelm@64004
    15
import org.tukaani.xz.{XZInputStream, XZOutputStream}
wenzelm@54440
    16
wenzelm@54440
    17
wenzelm@54439
    18
object Bytes
wenzelm@54439
    19
{
wenzelm@54439
    20
  val empty: Bytes = new Bytes(Array[Byte](), 0, 0)
wenzelm@54439
    21
wenzelm@54439
    22
  def apply(s: CharSequence): Bytes =
wenzelm@54439
    23
  {
wenzelm@54439
    24
    val str = s.toString
wenzelm@54439
    25
    if (str.isEmpty) empty
wenzelm@54439
    26
    else {
wenzelm@62527
    27
      val b = UTF8.bytes(str)
wenzelm@54442
    28
      new Bytes(b, 0, b.length)
wenzelm@54439
    29
    }
wenzelm@54439
    30
  }
wenzelm@54440
    31
wenzelm@63779
    32
  def apply(a: Array[Byte]): Bytes = apply(a, 0, a.length)
wenzelm@63779
    33
wenzelm@54442
    34
  def apply(a: Array[Byte], offset: Int, length: Int): Bytes =
wenzelm@54442
    35
    if (length == 0) empty
wenzelm@54442
    36
    else {
wenzelm@54442
    37
      val b = new Array[Byte](length)
wenzelm@55618
    38
      System.arraycopy(a, offset, b, 0, length)
wenzelm@54442
    39
      new Bytes(b, 0, b.length)
wenzelm@54442
    40
    }
wenzelm@54442
    41
wenzelm@54440
    42
wenzelm@68108
    43
  def base64(s: String): Bytes =
wenzelm@68108
    44
  {
wenzelm@68108
    45
    val a = Base64.getDecoder.decode(s)
wenzelm@68108
    46
    new Bytes(a, 0, a.length)
wenzelm@68108
    47
  }
wenzelm@68108
    48
wenzelm@68087
    49
wenzelm@54440
    50
  /* read */
wenzelm@54440
    51
wenzelm@64005
    52
  def read_stream(stream: InputStream, limit: Int = Integer.MAX_VALUE, hint: Int = 1024): Bytes =
wenzelm@64004
    53
    if (limit == 0) empty
wenzelm@64004
    54
    else {
wenzelm@64005
    55
      val out = new ByteArrayOutputStream(if (limit == Integer.MAX_VALUE) hint else limit)
wenzelm@64005
    56
      val buf = new Array[Byte](8192)
wenzelm@64004
    57
      var m = 0
wenzelm@54440
    58
wenzelm@54440
    59
      do {
wenzelm@64004
    60
        m = stream.read(buf, 0, buf.size min (limit - out.size))
wenzelm@64004
    61
        if (m != -1) out.write(buf, 0, m)
wenzelm@64004
    62
      } while (m != -1 && limit > out.size)
wenzelm@64004
    63
wenzelm@64004
    64
      new Bytes(out.toByteArray, 0, out.size)
wenzelm@54440
    65
    }
wenzelm@64001
    66
wenzelm@67805
    67
  def read_block(stream: InputStream, length: Int): Option[Bytes] =
wenzelm@67805
    68
  {
wenzelm@67805
    69
    val bytes = read_stream(stream, limit = length)
wenzelm@67805
    70
    if (bytes.length == length) Some(bytes) else None
wenzelm@67805
    71
  }
wenzelm@67805
    72
wenzelm@67805
    73
  def read_line(stream: InputStream): Option[Bytes] =
wenzelm@67805
    74
  {
wenzelm@67805
    75
    val out = new ByteArrayOutputStream(100)
wenzelm@67805
    76
    var c = 0
wenzelm@67805
    77
    while ({ c = stream.read; c != -1 && c != 10 }) out.write(c)
wenzelm@67805
    78
wenzelm@67805
    79
    if (c == -1 && out.size == 0) None
wenzelm@67805
    80
    else {
wenzelm@67805
    81
      val a = out.toByteArray
wenzelm@67805
    82
      val n = a.length
wenzelm@67805
    83
      val b = if (n > 0 && a(n - 1) == 13) a.take(n - 1) else a
wenzelm@67805
    84
      Some(new Bytes(b, 0, b.length))
wenzelm@67805
    85
    }
wenzelm@67805
    86
  }
wenzelm@67805
    87
wenzelm@64001
    88
  def read(file: JFile): Bytes =
wenzelm@64004
    89
    using(new FileInputStream(file))(read_stream(_, file.length.toInt))
wenzelm@64229
    90
wenzelm@64229
    91
  def read(path: Path): Bytes = read(path.file)
wenzelm@64229
    92
wenzelm@65070
    93
  def read(url: URL): Bytes = using(url.openStream)(read_stream(_))
wenzelm@65070
    94
wenzelm@64229
    95
wenzelm@64229
    96
  /* write */
wenzelm@64229
    97
wenzelm@64229
    98
  def write(file: JFile, bytes: Bytes)
wenzelm@64229
    99
  {
wenzelm@64229
   100
    val stream = new FileOutputStream(file)
wenzelm@64229
   101
    try { bytes.write_stream(stream) } finally { stream.close }
wenzelm@64229
   102
  }
wenzelm@64229
   103
wenzelm@64229
   104
  def write(path: Path, bytes: Bytes): Unit = write(path.file, bytes)
wenzelm@54439
   105
}
wenzelm@54439
   106
wenzelm@54439
   107
final class Bytes private(
wenzelm@54439
   108
  protected val bytes: Array[Byte],
wenzelm@54439
   109
  protected val offset: Int,
wenzelm@60833
   110
  val length: Int) extends CharSequence
wenzelm@54439
   111
{
wenzelm@54439
   112
  /* equality */
wenzelm@54439
   113
wenzelm@54440
   114
  override def equals(that: Any): Boolean =
wenzelm@54440
   115
  {
wenzelm@54440
   116
    that match {
wenzelm@54440
   117
      case other: Bytes =>
wenzelm@54440
   118
        if (this eq other) true
wenzelm@54440
   119
        else if (length != other.length) false
wenzelm@54440
   120
        else (0 until length).forall(i => bytes(offset + i) == other.bytes(other.offset + i))
wenzelm@54440
   121
      case _ => false
wenzelm@54440
   122
    }
wenzelm@54440
   123
  }
wenzelm@54440
   124
wenzelm@54439
   125
  private lazy val hash: Int =
wenzelm@54439
   126
  {
wenzelm@54439
   127
    var h = 0
wenzelm@54439
   128
    for (i <- offset until offset + length) {
wenzelm@54439
   129
      val b = bytes(i).asInstanceOf[Int] & 0xFF
wenzelm@54439
   130
      h = 31 * h + b
wenzelm@54439
   131
    }
wenzelm@54439
   132
    h
wenzelm@54439
   133
  }
wenzelm@54439
   134
wenzelm@54439
   135
  override def hashCode(): Int = hash
wenzelm@54439
   136
wenzelm@54439
   137
wenzelm@54439
   138
  /* content */
wenzelm@54439
   139
wenzelm@54512
   140
  lazy val sha1_digest: SHA1.Digest = SHA1.digest(bytes)
wenzelm@54440
   141
wenzelm@65279
   142
  def text: String =
wenzelm@65279
   143
    UTF8.decode_chars(s => s, bytes, offset, offset + length).toString
wenzelm@65279
   144
wenzelm@68094
   145
  def base64: String =
wenzelm@68094
   146
  {
wenzelm@68094
   147
    val b =
wenzelm@68094
   148
      if (offset == 0 && length == bytes.length) bytes
wenzelm@68094
   149
      else Bytes(bytes, offset, length).bytes
wenzelm@68094
   150
    Base64.getEncoder.encodeToString(b)
wenzelm@68094
   151
  }
wenzelm@68094
   152
wenzelm@68106
   153
  def maybe_base64: (Boolean, String) =
wenzelm@68106
   154
  {
wenzelm@68106
   155
    val s = text
wenzelm@68106
   156
    if (this == Bytes(s)) (false, s) else (true, base64)
wenzelm@68106
   157
  }
wenzelm@68106
   158
wenzelm@68150
   159
  override def toString: String = "Bytes(" + length + ")"
wenzelm@54439
   160
wenzelm@54440
   161
  def isEmpty: Boolean = length == 0
wenzelm@54439
   162
wenzelm@65630
   163
  def proper: Option[Bytes] = if (isEmpty) None else Some(this)
wenzelm@65630
   164
  def proper_text: Option[String] = if (isEmpty) None else Some(text)
wenzelm@65630
   165
wenzelm@54439
   166
  def +(other: Bytes): Bytes =
wenzelm@54440
   167
    if (other.isEmpty) this
wenzelm@54440
   168
    else if (isEmpty) other
wenzelm@54439
   169
    else {
wenzelm@54439
   170
      val new_bytes = new Array[Byte](length + other.length)
wenzelm@55618
   171
      System.arraycopy(bytes, offset, new_bytes, 0, length)
wenzelm@55618
   172
      System.arraycopy(other.bytes, other.offset, new_bytes, length, other.length)
wenzelm@54439
   173
      new Bytes(new_bytes, 0, new_bytes.length)
wenzelm@54439
   174
    }
wenzelm@54440
   175
wenzelm@54440
   176
wenzelm@60833
   177
  /* CharSequence operations */
wenzelm@60833
   178
wenzelm@60833
   179
  def charAt(i: Int): Char =
wenzelm@60833
   180
    if (0 <= i && i < length) (bytes(offset + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
wenzelm@60833
   181
    else throw new IndexOutOfBoundsException
wenzelm@60833
   182
wenzelm@60833
   183
  def subSequence(i: Int, j: Int): Bytes =
wenzelm@60833
   184
  {
wenzelm@60833
   185
    if (0 <= i && i <= j && j <= length) new Bytes(bytes, offset + i, j - i)
wenzelm@60833
   186
    else throw new IndexOutOfBoundsException
wenzelm@60833
   187
  }
wenzelm@60833
   188
wenzelm@60833
   189
wenzelm@64004
   190
  /* streams */
wenzelm@64004
   191
wenzelm@64004
   192
  def stream(): ByteArrayInputStream = new ByteArrayInputStream(bytes, offset, length)
wenzelm@64004
   193
wenzelm@64004
   194
  def write_stream(stream: OutputStream): Unit = stream.write(bytes, offset, length)
wenzelm@64004
   195
wenzelm@64004
   196
wenzelm@64004
   197
  /* XZ data compression */
wenzelm@54440
   198
wenzelm@68018
   199
  def uncompress(cache: XZ.Cache = XZ.cache()): Bytes =
wenzelm@68018
   200
    using(new XZInputStream(stream(), cache))(Bytes.read_stream(_, hint = length))
wenzelm@64004
   201
wenzelm@68018
   202
  def compress(options: XZ.Options = XZ.options(), cache: XZ.Cache = XZ.cache()): Bytes =
wenzelm@64004
   203
  {
wenzelm@64004
   204
    val result = new ByteArrayOutputStream(length)
wenzelm@68018
   205
    using(new XZOutputStream(result, options, cache))(write_stream(_))
wenzelm@64004
   206
    new Bytes(result.toByteArray, 0, result.size)
wenzelm@64004
   207
  }
wenzelm@54439
   208
}