# HG changeset patch # User wenzelm # Date 1720172301 -7200 # Node ID 8585399f26f6c6b063bb718b479a0bbccabdadde # Parent 8e4731a2a0415dc006c0b8f82a9587c6894c1f1f prefer official UTF-8 decoding (in contrast to 2541de190d92): this is also more efficient (factor 10-20); diff -r 8e4731a2a041 -r 8585399f26f6 src/Pure/General/bytes.scala --- a/src/Pure/General/bytes.scala Fri Jul 05 10:55:02 2024 +0200 +++ b/src/Pure/General/bytes.scala Fri Jul 05 11:38:21 2024 +0200 @@ -8,7 +8,7 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream, - InputStream, OutputStream, File => JFile} + InputStreamReader, InputStream, OutputStream, File => JFile} import java.nio.ByteBuffer import java.nio.charset.StandardCharsets.ISO_8859_1 import java.nio.channels.FileChannel @@ -25,6 +25,7 @@ /* internal sizes */ private val array_size: Long = Int.MaxValue - 8 // see java.io.InputStream.MAX_BUFFER_SIZE + private val string_size: Long = Int.MaxValue / 2 private val block_size: Int = 16384 // see java.io.InputStream.DEFAULT_BUFFER_SIZE private val chunk_size: Long = Space.MiB(100).bytes @@ -483,19 +484,19 @@ buf.toByteArray } - override def text: String = + def text: String = if (is_empty) "" else { - var i = 0L - var utf8 = false - while (i < size && !utf8) { - if (byte_unchecked(i) < 0) { utf8 = true } - i += 1 + val reader = new InputStreamReader(stream(), UTF8.charset) + val buf = new Array[Char]((size min Bytes.string_size).toInt + 1) + var m = 0 + var n = 0 + while (m >= 0 && n < buf.length) { + m = reader.read(buf, n, (buf.length - n) min Bytes.block_size) + if (m > 0) { n += m } } - utf8 - - if (utf8) UTF8.decode_permissive(bytes) - else new String(make_array, UTF8.charset) + require(m == -1, "Malformed UTF-8 string: overlong result") + new String(buf, 0, n) } def wellformed_text: Option[String] = diff -r 8e4731a2a041 -r 8585399f26f6 src/Pure/General/utf8.scala --- a/src/Pure/General/utf8.scala Fri Jul 05 10:55:02 2024 +0200 +++ b/src/Pure/General/utf8.scala Fri Jul 05 11:38:21 2024 +0200 @@ -11,53 +11,7 @@ object UTF8 { - /* charset */ - val charset: Charset = StandardCharsets.UTF_8 def bytes(s: String): Array[Byte] = s.getBytes(charset) - - - /* permissive UTF-8 decoding */ - - // see also https://en.wikipedia.org/wiki/UTF-8#Description - // overlong encodings enable byte-stuffing of low-ASCII - - def decode_permissive(bytes: Bytes): String = { - val size = bytes.size - val buf = new java.lang.StringBuilder((size min Space.GiB(1).bytes).toInt) - var code = -1 - var rest = 0 - def flush(): Unit = { - if (code != -1) { - if (rest == 0 && Character.isValidCodePoint(code)) buf.appendCodePoint(code) - else buf.append('\uFFFD') - code = -1 - rest = 0 - } - } - def init(x: Int, n: Int): Unit = { - flush() - code = x - rest = n - } - def push(x: Int): Unit = { - if (rest <= 0) init(x, -1) - else { - code <<= 6 - code += x - rest -= 1 - } - } - for (i <- 0L until size) { - val c = bytes.char(i) - if (c < 128) { flush(); buf.append(c) } - else if ((c & 0xC0) == 0x80) push(c & 0x3F) - else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) - else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) - else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) - } - flush() - buf.toString - } }