# HG changeset patch # User wenzelm # Date 1720190057 -7200 # Node ID 720849fb1f37192594d07330ac927f68167dbf27 # Parent db89ef6a8a4273a15ae9c4d4f1c2e9d4b0300934# Parent e7e95718e869ee05d4508695ce8123e32e04201e merged diff -r db89ef6a8a42 -r 720849fb1f37 NEWS --- a/NEWS Fri Jul 05 10:38:17 2024 +0200 +++ b/NEWS Fri Jul 05 16:34:17 2024 +0200 @@ -7,6 +7,8 @@ New in this Isabelle version ---------------------------- +** HOL ** + * Theory "HOL.Wellfounded": - Removed lemma wellorder.wfP_less. Use wellorder.wfp_on_less instead. Minor INCOMPATIBILITIES. @@ -43,6 +45,19 @@ ease transition. +*** System *** + +* The Isabelle/Scala type Bytes has become more scalable, with support +for incremental construction via Bytes.Builder. There is no longer an +artificial size limit, in contrast to Java byte arrays (max. 2 GiB). +Bytes operations on files, streams, encode/decode, compress/uncompress, +etc. have become more efficient, and bypass Java strings (max. 1 GiB). +In particular, YXML parsing and printing now works on very large byte +vectors as well: only the individual content nodes are limited by the +built-in string size. + + + New in Isabelle2024 (May 2024) ------------------------------ diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/Admin/isabelle_cronjob.scala --- a/src/Pure/Admin/isabelle_cronjob.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/Admin/isabelle_cronjob.scala Fri Jul 05 16:34:17 2024 +0200 @@ -347,15 +347,15 @@ afp = true, detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("AFP"))), List( - Remote_Build("macOS, quick_and_dirty", "mini2", + Remote_Build("macOS 14 Sonoma, quick_and_dirty", "mini2-sonoma", options = "-m32 -M4 -t quick_and_dirty -p pide_session=false", args = "-a -o quick_and_dirty", detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("quick_and_dirty"), - count = () => 0), - Remote_Build("macOS, skip_proofs", "mini2", + count = () => 1), + Remote_Build("macOS 14 Sonoma, skip_proofs", "mini2-sonoma", options = "-m32 -M4 -t skip_proofs -p pide_session=false", args = "-a -o skip_proofs", detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("skip_proofs"), - count = () => 0)), + count = () => 1)), List( Remote_Build("macOS 13 Ventura (ARM)", "mini3", history_base = "build_history_base_arm", diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/bytes.scala --- a/src/Pure/General/bytes.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/bytes.scala Fri Jul 05 16:34:17 2024 +0200 @@ -1,14 +1,14 @@ /* Title: Pure/General/bytes.scala Author: Makarius -Immutable byte vectors versus UTF8 strings. +Scalable byte strings, with incremental construction (via Builder). */ package isabelle import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream, - InputStream, OutputStream, File => JFile} + InputStreamReader, InputStream, OutputStream, File => JFile} import java.nio.ByteBuffer import java.nio.charset.StandardCharsets.ISO_8859_1 import java.nio.channels.FileChannel @@ -22,29 +22,23 @@ object Bytes { - /* internal sizes */ + /* internal limits */ - private val array_size: Long = Int.MaxValue - 8 // see java.io.InputStream.MAX_BUFFER_SIZE - private val block_size: Int = 16384 // see java.io.InputStream.DEFAULT_BUFFER_SIZE - private val chunk_size: Long = Space.MiB(100).bytes + val array_size: Long = Int.MaxValue - 8 // see java.io.InputStream.MAX_BUFFER_SIZE + val string_size: Long = Int.MaxValue / 2 + val block_size: Int = 16384 // see java.io.InputStream.DEFAULT_BUFFER_SIZE + val chunk_size: Long = Space.MiB(100).bytes - private def make_size(chunks: Array[Array[Byte]], buffer: Array[Byte]): Long = - chunks.foldLeft(buffer.length.toLong)((n, chunk) => n + chunk.length) - - class Too_Large(size: Long) extends IndexOutOfBoundsException { + class Too_Large(size: Long, limit: Long) extends IndexOutOfBoundsException { override def getMessage: String = "Bytes too large for particular operation: " + - Space.bytes(size).print + " > " + Space.bytes(array_size).print + Space.bytes(size).print + " > " + Space.bytes(limit).print } /* main constructors */ - private def reuse_array(bytes: Array[Byte]): Bytes = - if (bytes.length <= chunk_size) new Bytes(None, bytes, 0L, bytes.length.toLong) - else apply(bytes) - - val empty: Bytes = reuse_array(new Array(0)) + val empty: Bytes = new Bytes(None, new Array(0), 0L, 0L) def raw(s: String): Bytes = if (s.isEmpty) empty @@ -246,7 +240,8 @@ chunks = null buffer_list = null buffer = null - new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size) + if (size == 0) empty + else new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size) } } @@ -293,10 +288,6 @@ chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) && chunk0.length < Bytes.chunk_size) - def small_size: Int = - if (size > Bytes.array_size) throw new Bytes.Too_Large(size) - else size.toInt - override def is_empty: Boolean = size == 0 def proper: Option[Bytes] = if (is_empty) None else Some(bytes) @@ -305,7 +296,9 @@ offset != 0L || { chunks match { case None => size != chunk0.length - case Some(cs) => size != Bytes.make_size(cs, chunk0) + case Some(cs) => + val physical_size = cs.foldLeft(chunk0.length.toLong)((n, c) => n + c.length) + size != physical_size } } @@ -478,24 +471,27 @@ } def make_array: Array[Byte] = { - val buf = new ByteArrayOutputStream(small_size) + val n = + if (size <= Bytes.array_size) size.toInt + else throw new Bytes.Too_Large(size, Bytes.array_size) + val buf = new ByteArrayOutputStream(n) for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) } buf.toByteArray } - override def text: String = + def text: String = if (is_empty) "" else { - var i = 0L - var utf8 = false - while (i < size && !utf8) { - if (byte_unchecked(i) < 0) { utf8 = true } - i += 1 + val reader = new InputStreamReader(stream(), UTF8.charset) + val buf = new Array[Char]((size min Bytes.string_size).toInt + 1) + var m = 0 + var n = 0 + while (m >= 0 && n < buf.length) { + m = reader.read(buf, n, (buf.length - n) min Bytes.block_size) + if (m > 0) { n += m } } - utf8 - - if (utf8) UTF8.decode_permissive(bytes) - else new String(make_array, UTF8.charset) + require(m == -1, "Malformed UTF-8 string: overlong result") + new String(buf, 0, n) } def wellformed_text: Option[String] = diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/pretty.ML --- a/src/Pure/General/pretty.ML Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/pretty.ML Fri Jul 05 16:34:17 2024 +0200 @@ -195,7 +195,7 @@ fun enum sep lpar rpar prts = enclose lpar rpar (separate sep prts); val position = - enum "," "{" "}" o map (fn (x, y) => str (x ^ "=" ^ y)) o Position.properties_of; + enum "," "{" "}" o map (str o Properties.print_eq) o Position.properties_of; fun here pos = let diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/properties.ML --- a/src/Pure/General/properties.ML Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/properties.ML Fri Jul 05 16:34:17 2024 +0200 @@ -8,6 +8,7 @@ sig type entry = string * string type T = entry list + val print_eq: entry -> string val entry_ord: entry ord val ord: T ord val defined: T -> string -> bool @@ -25,8 +26,11 @@ struct type entry = string * string; + type T = entry list; +fun print_eq (a, b) = a ^ "=" ^ b; + val entry_ord = prod_ord string_ord string_ord; val ord = list_ord entry_ord; diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/properties.scala --- a/src/Pure/General/properties.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/properties.scala Fri Jul 05 16:34:17 2024 +0200 @@ -21,6 +21,9 @@ val i = str.indexOf('=') if (i <= 0) None else Some((str.substring(0, i), str.substring(i + 1))) } + + def parse(s: java.lang.String): Entry = + unapply(s) getOrElse error("Bad property entry: " + quote(s)) } def defined(props: T, name: java.lang.String): java.lang.Boolean = diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/scan.scala --- a/src/Pure/General/scan.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/scan.scala Fri Jul 05 16:34:17 2024 +0200 @@ -436,7 +436,7 @@ reader.isInstanceOf[Byte_Reader] def reader_decode_utf8(is_utf8: Boolean, s: String): String = - if (is_utf8) UTF8.decode_permissive(Bytes.raw(s)) else s + if (is_utf8) Bytes.raw(s).text else s def reader_decode_utf8(reader: Reader[Char], s: String): String = reader_decode_utf8(reader_is_utf8(reader), s) diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/utf8.scala --- a/src/Pure/General/utf8.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/utf8.scala Fri Jul 05 16:34:17 2024 +0200 @@ -11,53 +11,7 @@ object UTF8 { - /* charset */ - val charset: Charset = StandardCharsets.UTF_8 def bytes(s: String): Array[Byte] = s.getBytes(charset) - - - /* permissive UTF-8 decoding */ - - // see also https://en.wikipedia.org/wiki/UTF-8#Description - // overlong encodings enable byte-stuffing of low-ASCII - - def decode_permissive(bytes: Bytes): String = { - val size = bytes.size - val buf = new java.lang.StringBuilder((size min Space.GiB(1).bytes).toInt) - var code = -1 - var rest = 0 - def flush(): Unit = { - if (code != -1) { - if (rest == 0 && Character.isValidCodePoint(code)) buf.appendCodePoint(code) - else buf.append('\uFFFD') - code = -1 - rest = 0 - } - } - def init(x: Int, n: Int): Unit = { - flush() - code = x - rest = n - } - def push(x: Int): Unit = { - if (rest <= 0) init(x, -1) - else { - code <<= 6 - code += x - rest -= 1 - } - } - for (i <- 0L until size) { - val c = bytes.char(i) - if (c < 128) { flush(); buf.append(c) } - else if ((c & 0xC0) == 0x80) push(c & 0x3F) - else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) - else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) - else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) - } - flush() - buf.toString - } } diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/General/value.scala --- a/src/Pure/General/value.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/General/value.scala Fri Jul 05 16:34:17 2024 +0200 @@ -21,6 +21,9 @@ } object Nat { + def unapply(bs: Bytes): Option[scala.Int] = + if (bs.byte_iterator.forall(b => '0' <= b && b <= '9')) unapply(bs.text) + else None def unapply(s: java.lang.String): Option[scala.Int] = s match { case Int(n) if n >= 0 => Some(n) diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/PIDE/prover.scala --- a/src/Pure/PIDE/prover.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/PIDE/prover.scala Fri Jul 05 16:34:17 2024 +0200 @@ -246,8 +246,13 @@ /* message output */ private def message_output(stream: InputStream): Thread = { - def decode_chunk(chunk: Bytes): XML.Body = - Symbol.decode_yxml_failsafe(chunk.text, cache = cache) + def decode_prop(bytes: Bytes): Properties.Entry = { + val (a, b) = Properties.Eq.parse(bytes.text) + (Symbol.decode(a), Symbol.decode(b)) + } + + def decode_xml(bytes: Bytes): XML.Body = + Symbol.decode_yxml_failsafe(bytes.text, cache = cache) val thread_name = "message_output" Isabelle_Thread.fork(name = thread_name) { @@ -256,13 +261,12 @@ while (!finished) { Byte_Message.read_message(stream) match { case None => finished = true - case Some(header :: chunks) => - decode_chunk(header) match { - case List(XML.Elem(Markup(kind, props), Nil)) => - if (kind == Markup.PROTOCOL) protocol_output(props, chunks) - else output(kind, props, decode_chunk(Prover.the_chunk(chunks, kind))) - case _ => Prover.bad_header(header.toString) - } + case Some(k :: Value.Nat(props_length) :: rest) => + val kind = k.text + val props = rest.take(props_length).map(decode_prop) + val chunks = rest.drop(props_length) + if (kind == Markup.PROTOCOL) protocol_output(props, chunks) + else output(kind, props, chunks.flatMap(decode_xml)) case Some(_) => Prover.bad_chunks() } } diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/PIDE/yxml.ML --- a/src/Pure/PIDE/yxml.ML Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/PIDE/yxml.ML Fri Jul 05 16:34:17 2024 +0200 @@ -18,7 +18,6 @@ sig val X: Symbol.symbol val Y: Symbol.symbol - val embed_controls: string -> string val detect: string -> bool val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a val tree_size: XML.tree -> int @@ -41,19 +40,6 @@ (** string representation **) -(* idempotent recoding of certain low ASCII control characters *) - -fun pseudo_utf8 c = - if Symbol.is_ascii_control c - then chr 192 ^ chr (128 + ord c) - else c; - -fun embed_controls str = - if exists_string Symbol.is_ascii_control str - then translate_string pseudo_utf8 str - else str; - - (* markers *) val X_char = Char.chr 5; @@ -102,7 +88,7 @@ fun output_markup (markup as (name, atts)) = if Markup.is_empty markup then Markup.no_output - else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); + else (XY ^ name ^ implode (map (fn p => Y ^ Properties.print_eq p) atts) ^ X, XYX); fun output_markup_elem markup = let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text))) diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/System/message_channel.ML --- a/src/Pure/System/message_channel.ML Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/System/message_channel.ML Fri Jul 05 16:34:17 2024 +0200 @@ -24,11 +24,12 @@ Mailbox.await_empty mbox; Isabelle_Thread.join thread); -fun message (Message_Channel {mbox, ...}) name raw_props chunks = +fun message (Message_Channel {mbox, ...}) name props chunks = let - val robust_props = map (apply2 YXML.embed_controls) raw_props; - val header = [XML.Elem ((name, robust_props), [])]; - in Mailbox.send mbox (Message (header :: chunks)) end; + val kind = XML.Encode.string name; + val props_length = XML.Encode.int (length props); + val props_chunks = map (XML.Encode.string o Properties.print_eq) props; + in Mailbox.send mbox (Message (kind :: props_length :: props_chunks @ chunks)) end; fun make stream = let diff -r db89ef6a8a42 -r 720849fb1f37 src/Pure/Tools/server.scala --- a/src/Pure/Tools/server.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Pure/Tools/server.scala Fri Jul 05 16:34:17 2024 +0200 @@ -189,7 +189,7 @@ catch { case _: IOException => } def write_line_message(msg: String): Unit = - out_lock.synchronized { Byte_Message.write_line_message(out, Bytes(UTF8.bytes(msg))) } + out_lock.synchronized { Byte_Message.write_line_message(out, Bytes(msg)) } def write_byte_message(chunks: List[Bytes]): Unit = out_lock.synchronized { Byte_Message.write_message(out, chunks) } diff -r db89ef6a8a42 -r 720849fb1f37 src/Tools/jEdit/jedit_main/scala_console.scala --- a/src/Tools/jEdit/jedit_main/scala_console.scala Fri Jul 05 10:38:17 2024 +0200 +++ b/src/Tools/jEdit/jedit_main/scala_console.scala Fri Jul 05 16:34:17 2024 +0200 @@ -50,7 +50,7 @@ override def flush(): Unit = { val s = buf.synchronized { val s = buf.toString; buf.clear(); s } - val str = UTF8.decode_permissive(Bytes.raw(s)) + val str = Bytes.raw(s).text GUI_Thread.later { if (global_out == null) java.lang.System.out.print(str) else global_out.writeAttrs(null, str)