merged
authorwenzelm
Fri, 05 Jul 2024 16:34:17 +0200
changeset 80517 720849fb1f37
parent 80502 db89ef6a8a42 (current diff)
parent 80516 e7e95718e869 (diff)
child 80518 d3b96e19ccc7
child 80520 b0d590e75592
merged
--- a/NEWS	Fri Jul 05 10:38:17 2024 +0200
+++ b/NEWS	Fri Jul 05 16:34:17 2024 +0200
@@ -7,6 +7,8 @@
 New in this Isabelle version
 ----------------------------
 
+** HOL **
+
 * Theory "HOL.Wellfounded":
   - Removed lemma wellorder.wfP_less. Use wellorder.wfp_on_less instead.
     Minor INCOMPATIBILITIES.
@@ -43,6 +45,19 @@
 ease transition.
 
 
+*** System ***
+
+* The Isabelle/Scala type Bytes has become more scalable, with support
+for incremental construction via Bytes.Builder. There is no longer an
+artificial size limit, in contrast to Java byte arrays (max. 2 GiB).
+Bytes operations on files, streams, encode/decode, compress/uncompress,
+etc. have become more efficient, and bypass Java strings (max. 1 GiB).
+In particular, YXML parsing and printing now works on very large byte
+vectors as well: only the individual content nodes are limited by the
+built-in string size.
+
+
+
 New in Isabelle2024 (May 2024)
 ------------------------------
 
--- a/src/Pure/Admin/isabelle_cronjob.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/Admin/isabelle_cronjob.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -347,15 +347,15 @@
           afp = true,
           detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("AFP"))),
       List(
-        Remote_Build("macOS, quick_and_dirty", "mini2",
+        Remote_Build("macOS 14 Sonoma, quick_and_dirty", "mini2-sonoma",
           options = "-m32 -M4 -t quick_and_dirty -p pide_session=false",
           args = "-a -o quick_and_dirty",
           detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("quick_and_dirty"),
-          count = () => 0),
-        Remote_Build("macOS, skip_proofs", "mini2",
+          count = () => 1),
+        Remote_Build("macOS 14 Sonoma, skip_proofs", "mini2-sonoma",
           options = "-m32 -M4 -t skip_proofs -p pide_session=false", args = "-a -o skip_proofs",
           detect = Build_Log.Prop.build_tags.toString + " = " + SQL.string("skip_proofs"),
-          count = () => 0)),
+          count = () => 1)),
       List(
         Remote_Build("macOS 13 Ventura (ARM)", "mini3",
           history_base = "build_history_base_arm",
--- a/src/Pure/General/bytes.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/bytes.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -1,14 +1,14 @@
 /*  Title:      Pure/General/bytes.scala
     Author:     Makarius
 
-Immutable byte vectors versus UTF8 strings.
+Scalable byte strings, with incremental construction (via Builder).
 */
 
 package isabelle
 
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream,
-  InputStream, OutputStream, File => JFile}
+  InputStreamReader, InputStream, OutputStream, File => JFile}
 import java.nio.ByteBuffer
 import java.nio.charset.StandardCharsets.ISO_8859_1
 import java.nio.channels.FileChannel
@@ -22,29 +22,23 @@
 
 
 object Bytes {
-  /* internal sizes */
+  /* internal limits */
 
-  private val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
-  private val block_size: Int = 16384  // see java.io.InputStream.DEFAULT_BUFFER_SIZE
-  private val chunk_size: Long = Space.MiB(100).bytes
+  val array_size: Long = Int.MaxValue - 8  // see java.io.InputStream.MAX_BUFFER_SIZE
+  val string_size: Long = Int.MaxValue / 2
+  val block_size: Int = 16384  // see java.io.InputStream.DEFAULT_BUFFER_SIZE
+  val chunk_size: Long = Space.MiB(100).bytes
 
-  private def make_size(chunks: Array[Array[Byte]], buffer: Array[Byte]): Long =
-    chunks.foldLeft(buffer.length.toLong)((n, chunk) => n + chunk.length)
-
-  class Too_Large(size: Long) extends IndexOutOfBoundsException {
+  class Too_Large(size: Long, limit: Long) extends IndexOutOfBoundsException {
     override def getMessage: String =
       "Bytes too large for particular operation: " +
-        Space.bytes(size).print + " > " + Space.bytes(array_size).print
+        Space.bytes(size).print + " > " + Space.bytes(limit).print
   }
 
 
   /* main constructors */
 
-  private def reuse_array(bytes: Array[Byte]): Bytes =
-    if (bytes.length <= chunk_size) new Bytes(None, bytes, 0L, bytes.length.toLong)
-    else apply(bytes)
-
-  val empty: Bytes = reuse_array(new Array(0))
+  val empty: Bytes = new Bytes(None, new Array(0), 0L, 0L)
 
   def raw(s: String): Bytes =
     if (s.isEmpty) empty
@@ -246,7 +240,8 @@
       chunks = null
       buffer_list = null
       buffer = null
-      new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size)
+      if (size == 0) empty
+      else new Bytes(if (cs.isEmpty) None else Some(cs), b, 0L, size)
     }
   }
 
@@ -293,10 +288,6 @@
       chunks.get.forall(chunk => chunk.length == Bytes.chunk_size)) &&
     chunk0.length < Bytes.chunk_size)
 
-  def small_size: Int =
-    if (size > Bytes.array_size) throw new Bytes.Too_Large(size)
-    else size.toInt
-
   override def is_empty: Boolean = size == 0
 
   def proper: Option[Bytes] = if (is_empty) None else Some(bytes)
@@ -305,7 +296,9 @@
     offset != 0L || {
       chunks match {
         case None => size != chunk0.length
-        case Some(cs) => size != Bytes.make_size(cs, chunk0)
+        case Some(cs) =>
+          val physical_size = cs.foldLeft(chunk0.length.toLong)((n, c) => n + c.length)
+          size != physical_size
       }
     }
 
@@ -478,24 +471,27 @@
     }
 
   def make_array: Array[Byte] = {
-    val buf = new ByteArrayOutputStream(small_size)
+    val n =
+      if (size <= Bytes.array_size) size.toInt
+      else throw new Bytes.Too_Large(size, Bytes.array_size)
+    val buf = new ByteArrayOutputStream(n)
     for (a <- subarray_iterator) { buf.write(a.array, a.offset, a.length) }
     buf.toByteArray
   }
 
-  override def text: String =
+  def text: String =
     if (is_empty) ""
     else {
-      var i = 0L
-      var utf8 = false
-      while (i < size && !utf8) {
-        if (byte_unchecked(i) < 0) { utf8 = true }
-        i += 1
+      val reader = new InputStreamReader(stream(), UTF8.charset)
+      val buf = new Array[Char]((size min Bytes.string_size).toInt + 1)
+      var m = 0
+      var n = 0
+      while (m >= 0 && n < buf.length) {
+        m = reader.read(buf, n, (buf.length - n) min Bytes.block_size)
+        if (m > 0) { n += m }
       }
-      utf8
-
-      if (utf8) UTF8.decode_permissive(bytes)
-      else new String(make_array, UTF8.charset)
+      require(m == -1, "Malformed UTF-8 string: overlong result")
+      new String(buf, 0, n)
     }
 
   def wellformed_text: Option[String] =
--- a/src/Pure/General/pretty.ML	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/pretty.ML	Fri Jul 05 16:34:17 2024 +0200
@@ -195,7 +195,7 @@
 fun enum sep lpar rpar prts = enclose lpar rpar (separate sep prts);
 
 val position =
-  enum "," "{" "}" o map (fn (x, y) => str (x ^ "=" ^ y)) o Position.properties_of;
+  enum "," "{" "}" o map (str o Properties.print_eq) o Position.properties_of;
 
 fun here pos =
   let
--- a/src/Pure/General/properties.ML	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/properties.ML	Fri Jul 05 16:34:17 2024 +0200
@@ -8,6 +8,7 @@
 sig
   type entry = string * string
   type T = entry list
+  val print_eq: entry -> string
   val entry_ord: entry ord
   val ord: T ord
   val defined: T -> string -> bool
@@ -25,8 +26,11 @@
 struct
 
 type entry = string * string;
+
 type T = entry list;
 
+fun print_eq (a, b) = a ^ "=" ^ b;
+
 val entry_ord = prod_ord string_ord string_ord;
 val ord = list_ord entry_ord;
 
--- a/src/Pure/General/properties.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/properties.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -21,6 +21,9 @@
       val i = str.indexOf('=')
       if (i <= 0) None else Some((str.substring(0, i), str.substring(i + 1)))
     }
+
+    def parse(s: java.lang.String): Entry =
+      unapply(s) getOrElse error("Bad property entry: " + quote(s))
   }
 
   def defined(props: T, name: java.lang.String): java.lang.Boolean =
--- a/src/Pure/General/scan.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/scan.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -436,7 +436,7 @@
     reader.isInstanceOf[Byte_Reader]
 
   def reader_decode_utf8(is_utf8: Boolean, s: String): String =
-    if (is_utf8) UTF8.decode_permissive(Bytes.raw(s)) else s
+    if (is_utf8) Bytes.raw(s).text else s
 
   def reader_decode_utf8(reader: Reader[Char], s: String): String =
     reader_decode_utf8(reader_is_utf8(reader), s)
--- a/src/Pure/General/utf8.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/utf8.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -11,53 +11,7 @@
 
 
 object UTF8 {
-  /* charset */
-
   val charset: Charset = StandardCharsets.UTF_8
 
   def bytes(s: String): Array[Byte] = s.getBytes(charset)
-
-
-  /* permissive UTF-8 decoding */
-
-  // see also https://en.wikipedia.org/wiki/UTF-8#Description
-  // overlong encodings enable byte-stuffing of low-ASCII
-
-  def decode_permissive(bytes: Bytes): String = {
-    val size = bytes.size
-    val buf = new java.lang.StringBuilder((size min Space.GiB(1).bytes).toInt)
-    var code = -1
-    var rest = 0
-    def flush(): Unit = {
-      if (code != -1) {
-        if (rest == 0 && Character.isValidCodePoint(code)) buf.appendCodePoint(code)
-        else buf.append('\uFFFD')
-        code = -1
-        rest = 0
-      }
-    }
-    def init(x: Int, n: Int): Unit = {
-      flush()
-      code = x
-      rest = n
-    }
-    def push(x: Int): Unit = {
-      if (rest <= 0) init(x, -1)
-      else {
-        code <<= 6
-        code += x
-        rest -= 1
-      }
-    }
-    for (i <- 0L until size) {
-      val c = bytes.char(i)
-      if (c < 128) { flush(); buf.append(c) }
-      else if ((c & 0xC0) == 0x80) push(c & 0x3F)
-      else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
-      else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
-      else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
-    }
-    flush()
-    buf.toString
-  }
 }
--- a/src/Pure/General/value.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/General/value.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -21,6 +21,9 @@
   }
 
   object Nat {
+    def unapply(bs: Bytes): Option[scala.Int] =
+      if (bs.byte_iterator.forall(b => '0' <= b && b <= '9')) unapply(bs.text)
+      else None
     def unapply(s: java.lang.String): Option[scala.Int] =
       s match {
         case Int(n) if n >= 0 => Some(n)
--- a/src/Pure/PIDE/prover.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/PIDE/prover.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -246,8 +246,13 @@
   /* message output */
 
   private def message_output(stream: InputStream): Thread = {
-    def decode_chunk(chunk: Bytes): XML.Body =
-      Symbol.decode_yxml_failsafe(chunk.text, cache = cache)
+    def decode_prop(bytes: Bytes): Properties.Entry = {
+      val (a, b) = Properties.Eq.parse(bytes.text)
+      (Symbol.decode(a), Symbol.decode(b))
+    }
+
+    def decode_xml(bytes: Bytes): XML.Body =
+      Symbol.decode_yxml_failsafe(bytes.text, cache = cache)
 
     val thread_name = "message_output"
     Isabelle_Thread.fork(name = thread_name) {
@@ -256,13 +261,12 @@
         while (!finished) {
           Byte_Message.read_message(stream) match {
             case None => finished = true
-            case Some(header :: chunks) =>
-              decode_chunk(header) match {
-                case List(XML.Elem(Markup(kind, props), Nil)) =>
-                  if (kind == Markup.PROTOCOL) protocol_output(props, chunks)
-                  else output(kind, props, decode_chunk(Prover.the_chunk(chunks, kind)))
-                case _ => Prover.bad_header(header.toString)
-              }
+            case Some(k :: Value.Nat(props_length) :: rest) =>
+              val kind = k.text
+              val props = rest.take(props_length).map(decode_prop)
+              val chunks = rest.drop(props_length)
+              if (kind == Markup.PROTOCOL) protocol_output(props, chunks)
+              else output(kind, props, chunks.flatMap(decode_xml))
             case Some(_) => Prover.bad_chunks()
           }
         }
--- a/src/Pure/PIDE/yxml.ML	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/PIDE/yxml.ML	Fri Jul 05 16:34:17 2024 +0200
@@ -18,7 +18,6 @@
 sig
   val X: Symbol.symbol
   val Y: Symbol.symbol
-  val embed_controls: string -> string
   val detect: string -> bool
   val traverse: (string -> 'a -> 'a) -> XML.tree -> 'a -> 'a
   val tree_size: XML.tree -> int
@@ -41,19 +40,6 @@
 
 (** string representation **)
 
-(* idempotent recoding of certain low ASCII control characters *)
-
-fun pseudo_utf8 c =
-  if Symbol.is_ascii_control c
-  then chr 192 ^ chr (128 + ord c)
-  else c;
-
-fun embed_controls str =
-  if exists_string Symbol.is_ascii_control str
-  then translate_string pseudo_utf8 str
-  else str;
-
-
 (* markers *)
 
 val X_char = Char.chr 5;
@@ -102,7 +88,7 @@
 
 fun output_markup (markup as (name, atts)) =
   if Markup.is_empty markup then Markup.no_output
-  else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);
+  else (XY ^ name ^ implode (map (fn p => Y ^ Properties.print_eq p) atts) ^ X, XYX);
 
 fun output_markup_elem markup =
   let val [bg1, bg2, en] = space_explode Z (string_of (XML.wrap_elem ((markup, Z_text), Z_text)))
--- a/src/Pure/System/message_channel.ML	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/System/message_channel.ML	Fri Jul 05 16:34:17 2024 +0200
@@ -24,11 +24,12 @@
   Mailbox.await_empty mbox;
   Isabelle_Thread.join thread);
 
-fun message (Message_Channel {mbox, ...}) name raw_props chunks =
+fun message (Message_Channel {mbox, ...}) name props chunks =
   let
-    val robust_props = map (apply2 YXML.embed_controls) raw_props;
-    val header = [XML.Elem ((name, robust_props), [])];
-  in Mailbox.send mbox (Message (header :: chunks)) end;
+    val kind = XML.Encode.string name;
+    val props_length = XML.Encode.int (length props);
+    val props_chunks = map (XML.Encode.string o Properties.print_eq) props;
+  in Mailbox.send mbox (Message (kind :: props_length :: props_chunks @ chunks)) end;
 
 fun make stream =
   let
--- a/src/Pure/Tools/server.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Pure/Tools/server.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -189,7 +189,7 @@
       catch { case _: IOException => }
 
     def write_line_message(msg: String): Unit =
-      out_lock.synchronized { Byte_Message.write_line_message(out, Bytes(UTF8.bytes(msg))) }
+      out_lock.synchronized { Byte_Message.write_line_message(out, Bytes(msg)) }
 
     def write_byte_message(chunks: List[Bytes]): Unit =
       out_lock.synchronized { Byte_Message.write_message(out, chunks) }
--- a/src/Tools/jEdit/jedit_main/scala_console.scala	Fri Jul 05 10:38:17 2024 +0200
+++ b/src/Tools/jEdit/jedit_main/scala_console.scala	Fri Jul 05 16:34:17 2024 +0200
@@ -50,7 +50,7 @@
 
     override def flush(): Unit = {
       val s = buf.synchronized { val s = buf.toString; buf.clear(); s }
-      val str = UTF8.decode_permissive(Bytes.raw(s))
+      val str = Bytes.raw(s).text
       GUI_Thread.later {
         if (global_out == null) java.lang.System.out.print(str)
         else global_out.writeAttrs(null, str)