prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
author wenzelm
Fri, 05 Jul 2024 11:38:21 +0200
changeset 80508 8585399f26f6
parent 80507 8e4731a2a041
child 80509 2a9abd6a164e
prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20);
src/Pure/General/bytes.scala
src/Pure/General/utf8.scala
--- a/src/Pure/General/bytes.scala	Fri Jul 05 10:55:02 2024 +0200
+++ b/src/Pure/General/bytes.scala	Fri Jul 05 11:38:21 2024 +0200
@@ -8,7 +8,7 @@
 
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
-  InputStream, OutputStream, File => JFile}
+  InputStreamReader, InputStream, OutputStream, File => JFile}
 import java.nio.ByteBuffer
 import java.nio.charset.StandardCharsets.ISO_8859_1
 import java.nio.channels.FileChannel
@@ -25,6 +25,7 @@
   /* internal sizes */
 
   private val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
+  private val string_size: Long = Int.MaxValue / 2
   private val block_size: Int = 16384  // see java.io.InputStream.DEFAULT_BUFFER_SIZE
   private val chunk_size: Long = Space.MiB(100).bytes
 
@@ -483,19 +484,19 @@
     buf.toByteArray
   }
 
-  override def text: String =
+  def text: String =
     if (is_empty) ""
     else {
-      var i = 0L
-      var utf8 = false
-      while (i < size && !utf8) {
-        if (byte_unchecked(i) < 0) { utf8 = true }
-        i += 1
+      val reader = new InputStreamReader(stream(), UTF8.charset)
+      val buf = new Array[Char]((size min Bytes.string_size).toInt + 1)
+      var m = 0
+      var n = 0
+      while (m >= 0 && n < buf.length) {
+        m = reader.read(buf, n, (buf.length - n) min Bytes.block_size)
+        if (m > 0) { n += m }
       }
-      utf8
-
-      if (utf8) UTF8.decode_permissive(bytes)
-      else new String(make_array, UTF8.charset)
+      require(m == -1, "Malformed UTF-8 string: overlong result")
+      new String(buf, 0, n)
     }
 
   def wellformed_text: Option[String] =
--- a/src/Pure/General/utf8.scala	Fri Jul 05 10:55:02 2024 +0200
+++ b/src/Pure/General/utf8.scala	Fri Jul 05 11:38:21 2024 +0200
@@ -11,53 +11,7 @@
 
 
 object UTF8 {
-  /* charset */
-
   val charset: Charset = StandardCharsets.UTF_8
 
   def bytes(s: String): Array[Byte] = s.getBytes(charset)
-
-
-  /* permissive UTF-8 decoding */
-
-  // see also https://en.wikipedia.org/wiki/UTF-8#Description
-  // overlong encodings enable byte-stuffing of low-ASCII
-
-  def decode_permissive(bytes: Bytes): String = {
-    val size = bytes.size
-    val buf = new java.lang.StringBuilder((size min Space.GiB(1).bytes).toInt)
-    var code = -1
-    var rest = 0
-    def flush(): Unit = {
-      if (code != -1) {
-        if (rest == 0 && Character.isValidCodePoint(code)) buf.appendCodePoint(code)
-        else buf.append('\uFFFD')
-        code = -1
-        rest = 0
-      }
-    }
-    def init(x: Int, n: Int): Unit = {
-      flush()
-      code = x
-      rest = n
-    }
-    def push(x: Int): Unit = {
-      if (rest <= 0) init(x, -1)
-      else {
-        code <<= 6
-        code += x
-        rest -= 1
-      }
-    }
-    for (i <- 0L until size) {
-      val c = bytes.char(i)
-      if (c < 128) { flush(); buf.append(c) }
-      else if ((c & 0xC0) == 0x80) push(c & 0x3F)
-      else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
-      else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
-      else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
-    }
-    flush()
-    buf.toString
-  }
 }