generic support for XZ and Zstd compression in Isabelle/Scala;
author wenzelm
Fri, 21 Oct 2022 16:39:31 +0200
changeset 76351 2cee31cd92f0
parent 76350 978f7ca3329f
child 76352 f2b98eb6a7a9
generic support for XZ and Zstd compression in Isabelle/Scala; support for Zstd compression in Isabelle/ML;
NEWS
etc/build.props
src/Pure/Admin/build_history.scala
src/Pure/Admin/build_log.scala
src/Pure/Admin/jenkins.scala
src/Pure/General/bytes.scala
src/Pure/General/compress.scala
src/Pure/General/file.scala
src/Pure/General/properties.scala
src/Pure/General/xz.scala
src/Pure/General/zstd.ML
src/Pure/PIDE/xml.scala
src/Pure/ROOT.ML
src/Pure/System/scala.scala
src/Pure/Thy/export.scala
src/Pure/Thy/sessions.scala
src/Pure/term.scala
--- a/NEWS	Fri Oct 21 14:45:13 2022 +0200
+++ b/NEWS	Fri Oct 21 16:39:31 2022 +0200
@@ -50,6 +50,22 @@
   - Added lemma wfP_subset_mset[simp].
 
 
+*** ML ***
+
+* Operations for Zstd compression (via Isabelle/Scala):
+
+  Zstd.compress: Bytes.T -> Bytes.T
+  Zstd.uncompress: Bytes.T -> Bytes.T
+
+
+*** System ***
+
+* Isabelle/Scala provides generic support for XZ and Zstd compression,
+via Compress.Options and Compress.Cache. Bytes.uncompress automatically
+detects the compression scheme.
+
+
+
 New in Isabelle2022 (October 2022)
 ----------------------------------
 
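A minimal usage sketch of the new Scala-side API introduced in this changeset (the
identifiers are those from the diff below; the payload is made up):

  val cache = Compress.Cache.make()
  val data = Bytes("example payload")

  // compress with an explicit scheme and level
  val xz   = data.compress(Compress.Options_XZ(6), cache = cache)
  val zstd = data.compress(Compress.Options_Zstd(3), cache = cache)

  // Bytes.uncompress inspects the magic bytes, so no scheme needs to be given
  assert(xz.uncompress(cache = cache).text == data.text)
  assert(zstd.uncompress(cache = cache).text == data.text)
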
--- a/etc/build.props	Fri Oct 21 14:45:13 2022 +0200
+++ b/etc/build.props	Fri Oct 21 16:39:31 2022 +0200
@@ -63,6 +63,7 @@
   src/Pure/General/codepoint.scala \
   src/Pure/General/comment.scala \
   src/Pure/General/completion.scala \
+  src/Pure/General/compress.scala \
   src/Pure/General/csv.scala \
   src/Pure/General/date.scala \
   src/Pure/General/exn.scala \
@@ -100,7 +101,6 @@
   src/Pure/General/uuid.scala \
   src/Pure/General/value.scala \
   src/Pure/General/word.scala \
-  src/Pure/General/xz.scala \
   src/Pure/General/zstd.scala \
   src/Pure/Isar/document_structure.scala \
   src/Pure/Isar/keyword.scala \
--- a/src/Pure/Admin/build_history.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/Admin/build_history.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -359,7 +359,7 @@
           session_build_info :::
           ml_statistics.map(Protocol.ML_Statistics_Marker.apply) :::
           session_errors.map(Protocol.Error_Message_Marker.apply) :::
-          heap_sizes), XZ.options(6))
+          heap_sizes), Compress.Options_XZ(6))
 
 
       /* next build */
--- a/src/Pure/Admin/build_log.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/Admin/build_log.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -616,7 +616,10 @@
       errors = log_file.filter(Protocol.Error_Message_Marker))
   }
 
-  def compress_errors(errors: List[String], cache: XZ.Cache = XZ.Cache.none): Option[Bytes] =
+  def compress_errors(
+    errors: List[String],
+    cache: Compress.Cache = Compress.Cache.none
+  ): Option[Bytes] =
     if (errors.isEmpty) None
     else {
       Some(Bytes(YXML.string_of_body(XML.Encode.list(XML.Encode.string)(errors))).
@@ -627,7 +630,7 @@
     if (bytes.is_empty) Nil
     else {
       XML.Decode.list(YXML.string_of_body)(
-        YXML.parse_body(bytes.uncompress(cache = cache.xz).text, cache = cache))
+        YXML.parse_body(bytes.uncompress(cache = cache.compress).text, cache = cache))
     }
 
 
@@ -954,7 +957,7 @@
           stmt.double(13) = session.ml_timing.factor
           stmt.long(14) = session.heap_size
           stmt.string(15) = session.status.map(_.toString)
-          stmt.bytes(16) = compress_errors(session.errors, cache = cache.xz)
+          stmt.bytes(16) = compress_errors(session.errors, cache = cache.compress)
           stmt.string(17) = session.sources
           stmt.execute()
         }
@@ -988,7 +991,7 @@
       db.using_statement(db.insert_permissive(table)) { stmt =>
         val ml_stats: List[(String, Option[Bytes])] =
           Par_List.map[(String, Session_Entry), (String, Option[Bytes])](
-            { case (a, b) => (a, Properties.compress(b.ml_statistics, cache = cache.xz).proper) },
+            { case (a, b) => (a, Properties.compress(b.ml_statistics, cache = cache.compress).proper) },
             build_info.sessions.iterator.filter(p => p._2.ml_statistics.nonEmpty).toList)
         val entries = if (ml_stats.nonEmpty) ml_stats else List("" -> None)
         for ((session_name, ml_statistics) <- entries) {
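
Build_Log.compress_errors now takes a Compress.Cache; XML.Cache.make() provides one
via its compress field. A small sketch (the error list is hypothetical):

  val cache = XML.Cache.make()
  val errors = List("error: foo", "error: bar")
  val stored: Option[Bytes] =
    Build_Log.compress_errors(errors, cache = cache.compress)
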
--- a/src/Pure/Admin/jenkins.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/Admin/jenkins.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -112,7 +112,7 @@
         File.write_xz(log_path,
           terminate_lines(Url.read(main_log) ::
             ml_statistics.map(Protocol.ML_Statistics_Marker.apply)),
-          XZ.options(6))
+          Compress.Options_XZ(6))
       }
     }
   }
--- a/src/Pure/General/bytes.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/General/bytes.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -7,10 +7,10 @@
 package isabelle
 
 
-import java.io.{File => JFile, ByteArrayOutputStream, ByteArrayInputStream,
-  OutputStream, InputStream, FileInputStream, FileOutputStream}
+import com.github.luben.zstd.{ZstdInputStream, ZstdOutputStream}
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream, InputStream, OutputStream, File as JFile}
 import java.net.URL
-
 import org.tukaani.xz.{XZInputStream, XZOutputStream}
 
 
@@ -191,20 +191,59 @@
   def write_stream(stream: OutputStream): Unit = stream.write(bytes, offset, length)
 
 
-  /* XZ data compression */
+  /* XZ / Zstd data compression */
+
+  private def detect_xz: Boolean =
+    length >= 6 &&
+      bytes(offset)     == 0xFD.toByte &&
+      bytes(offset + 1) == 0x37.toByte &&
+      bytes(offset + 2) == 0x7A.toByte &&
+      bytes(offset + 3) == 0x58.toByte &&
+      bytes(offset + 4) == 0x5A.toByte &&
+      bytes(offset + 5) == 0x00.toByte
+
+  private def detect_zstd: Boolean =
+    length >= 4 &&
+      bytes(offset)     == 0x28.toByte &&
+      bytes(offset + 1) == 0xB5.toByte &&
+      bytes(offset + 2) == 0x2F.toByte &&
+      bytes(offset + 3) == 0xFD.toByte
+
+  private def detect_error(name: String = ""): Nothing =
+    error("Cannot detect compression scheme" + (if (name.isEmpty) "" else " " + name))
 
-  def uncompress(cache: XZ.Cache = XZ.Cache.none): Bytes =
-    using(new XZInputStream(stream(), cache))(Bytes.read_stream(_, hint = length))
+  def uncompress(cache: Compress.Cache = Compress.Cache.none): Bytes =
+    using(
+      if (detect_xz) new XZInputStream(stream(), cache.xz)
+      else if (detect_zstd) { Zstd.init(); new ZstdInputStream(stream(), cache.zstd) }
+      else detect_error()
+    )(Bytes.read_stream(_, hint = length))
+
+  def uncompress_xz(cache: Compress.Cache = Compress.Cache.none): Bytes =
+    if (detect_xz) uncompress(cache = cache) else detect_error("XZ")
+
+  def uncompress_zstd(cache: Compress.Cache = Compress.Cache.none): Bytes =
+    if (detect_zstd) uncompress(cache = cache) else detect_error("Zstd")
 
-  def compress(options: XZ.Options = XZ.options(), cache: XZ.Cache = XZ.Cache.none): Bytes = {
+  def compress(
+    options: Compress.Options = Compress.Options(),
+    cache: Compress.Cache = Compress.Cache.none
+  ): Bytes = {
     val result = new ByteArrayOutputStream(length)
-    using(new XZOutputStream(result, options, cache))(write_stream(_))
+    using(
+      options match {
+        case options_xz: Compress.Options_XZ =>
+          new XZOutputStream(result, options_xz.make, cache.xz)
+        case options_zstd: Compress.Options_Zstd =>
+          Zstd.init()
+          new ZstdOutputStream(result, cache.zstd, options_zstd.level)
+      })(write_stream)
     new Bytes(result.toByteArray, 0, result.size)
   }
 
   def maybe_compress(
-    options: XZ.Options = XZ.options(),
-    cache: XZ.Cache = XZ.Cache.none
+    options: Compress.Options = Compress.Options(),
+    cache: Compress.Cache = Compress.Cache.none
   ) : (Boolean, Bytes) = {
     val compressed = compress(options = options, cache = cache)
     if (compressed.length < length) (true, compressed) else (false, this)
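
Scheme detection in Bytes relies on the XZ and Zstd magic numbers, so data written
with either scheme uncompresses without further hints. A sketch of the intended
behaviour (the payload is made up):

  val b = Bytes("some text to be stored compressed")
  val z = b.compress(Compress.Options_Zstd())

  z.uncompress()        // detects Zstd via the 28 B5 2F FD header
  z.uncompress_zstd()   // same, but insists on Zstd
  // z.uncompress_xz()  // fails: "Cannot detect compression scheme XZ"

  // maybe_compress keeps the original if compression does not pay off
  val (compressed, stored) = b.maybe_compress(Compress.Options_Zstd())
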
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/General/compress.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -0,0 +1,63 @@
+/*  Title:      Pure/General/compress.scala
+    Author:     Makarius
+
+Support for generic data compression.
+*/
+
+package isabelle
+
+
+import org.tukaani.xz.{LZMA2Options, ArrayCache, BasicArrayCache}
+import com.github.luben.zstd.{BufferPool, NoPool, RecyclingBufferPool,
+  ZstdInputStream, ZstdOutputStream}
+
+
+object Compress {
+  /* options */
+
+  object Options {
+    def apply(): Options = Options_XZ()
+  }
+  sealed abstract class Options
+  case class Options_XZ(level: Int = 3) extends Options {
+    def make: LZMA2Options = {
+      val opts = new LZMA2Options
+      opts.setPreset(level)
+      opts
+    }
+  }
+  case class Options_Zstd(level: Int = 3) extends Options
+
+
+  /* cache */
+
+  class Cache private(val xz: ArrayCache, val zstd: BufferPool)
+
+  object Cache {
+    def none: Cache = { Zstd.init(); new Cache(ArrayCache.getDummyCache(), NoPool.INSTANCE) }
+    def make(): Cache = { Zstd.init(); new Cache(new BasicArrayCache, RecyclingBufferPool.INSTANCE) } // FIXME new pool
+  }
+
+
+  /* Scala functions in ML */
+
+  object XZ_Compress extends Scala.Fun_Bytes("XZ.compress") {
+    val here = Scala_Project.here
+    def apply(arg: Bytes): Bytes = arg.compress(Options_XZ())
+  }
+
+  object XZ_Uncompress extends Scala.Fun_Bytes("XZ.uncompress") {
+    val here = Scala_Project.here
+    def apply(arg: Bytes): Bytes = arg.uncompress_xz()
+  }
+
+  object Zstd_Compress extends Scala.Fun_Bytes("Zstd.compress") {
+    val here = Scala_Project.here
+    def apply(arg: Bytes): Bytes = arg.compress(Options_Zstd())
+  }
+
+  object Zstd_Uncompress extends Scala.Fun_Bytes("Zstd.uncompress") {
+    val here = Scala_Project.here
+    def apply(arg: Bytes): Bytes = arg.uncompress_zstd()
+  }
+}
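
Compress.Options() defaults to XZ (preset 3), so existing call sites keep their
behaviour, and a single Compress.Cache.make() can be shared across many operations.
A minimal sketch (the payloads are made up):

  val cache = Compress.Cache.make()
  val payloads = List(Bytes("a"), Bytes("b"), Bytes("c"))

  val compressed = payloads.map(_.compress(Compress.Options(), cache = cache))
  val restored   = compressed.map(_.uncompress(cache = cache))
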
--- a/src/Pure/General/file.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/General/file.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -249,18 +249,23 @@
     write_file(file, text, (s: OutputStream) => new GZIPOutputStream(new BufferedOutputStream(s)))
   def write_gzip(path: Path, text: String): Unit = write_gzip(path.file, text)
 
-  def write_xz(file: JFile, text: String, options: XZ.Options): Unit =
-    File.write_file(file, text, s => new XZOutputStream(new BufferedOutputStream(s), options))
-  def write_xz(file: JFile, text: String): Unit = write_xz(file, text, XZ.options())
-  def write_xz(path: Path, text: String, options: XZ.Options): Unit =
+  def write_xz(file: JFile, text: String, options: Compress.Options_XZ): Unit =
+    File.write_file(file, text, s => new XZOutputStream(new BufferedOutputStream(s), options.make))
+  def write_xz(file: JFile, text: String): Unit = write_xz(file, text, Compress.Options_XZ())
+  def write_xz(path: Path, text: String, options: Compress.Options_XZ): Unit =
     write_xz(path.file, text, options)
-  def write_xz(path: Path, text: String): Unit = write_xz(path, text, XZ.options())
+  def write_xz(path: Path, text: String): Unit = write_xz(path, text, Compress.Options_XZ())
 
-  def write_zstd(file: JFile, text: String): Unit = {
+  def write_zstd(file: JFile, text: String, options: Compress.Options_Zstd): Unit = {
     Zstd.init()
-    write_file(file, text, (s: OutputStream) => new ZstdOutputStream(new BufferedOutputStream(s)))
+    File.write_file(file, text, s => new ZstdOutputStream(new BufferedOutputStream(s), options.level))
   }
-  def write_zstd(path: Path, text: String): Unit = write_zstd(path.file, text)
+  def write_zstd(file: JFile, text: String): Unit =
+    write_zstd(file, text, Compress.Options_Zstd())
+  def write_zstd(path: Path, text: String, options: Compress.Options_Zstd): Unit =
+    write_zstd(path.file, text, options)
+  def write_zstd(path: Path, text: String): Unit =
+    write_zstd(path, text, Compress.Options_Zstd())
 
   def write_backup(path: Path, text: String): Unit = {
     if (path.is_file) Isabelle_System.move_file(path, path.backup)
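
File.write_xz and File.write_zstd now take the corresponding Compress.Options_*
record. A usage sketch (the path is hypothetical; reading back via Bytes.read is
assumed to be the usual counterpart and is not part of this change):

  val path = Path.explode("$ISABELLE_TMP/output.zst")
  File.write_zstd(path, "line 1\nline 2\n", Compress.Options_Zstd(5))

  val text = Bytes.read(path).uncompress().text   // auto-detects Zstd
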
--- a/src/Pure/General/properties.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/General/properties.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -54,8 +54,8 @@
   }
 
   def compress(ps: List[T],
-    options: XZ.Options = XZ.options(),
-    cache: XZ.Cache = XZ.Cache.none
+    options: Compress.Options = Compress.Options(),
+    cache: Compress.Cache = Compress.Cache.none
   ): Bytes = {
     if (ps.isEmpty) Bytes.empty
     else {
@@ -69,7 +69,7 @@
     else {
       val ps =
         XML.Decode.list(XML.Decode.properties)(
-          YXML.parse_body(bs.uncompress(cache = cache.xz).text))
+          YXML.parse_body(bs.uncompress(cache = cache.compress).text))
       if (cache.no_cache) ps else ps.map(cache.props)
     }
   }
--- a/src/Pure/General/xz.scala	Fri Oct 21 14:45:13 2022 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-/*  Title:      Pure/General/xz.scala
-    Author:     Makarius
-
-Support for XZ data compression.
-*/
-
-package isabelle
-
-
-import org.tukaani.xz.{LZMA2Options, ArrayCache, BasicArrayCache}
-
-
-object XZ {
-  /* options */
-
-  type Options = LZMA2Options
-
-  def options(preset: Int = 3): Options = {
-    val opts = new LZMA2Options
-    opts.setPreset(preset)
-    opts
-  }
-
-
-  /* cache */
-
-  type Cache = ArrayCache
-
-  object Cache {
-    def none: ArrayCache = ArrayCache.getDummyCache()
-    def make(): ArrayCache = new BasicArrayCache
-  }
-
-
-  /* Scala functions in ML */
-
-  object Compress extends Scala.Fun_Bytes("XZ.compress") {
-    val here = Scala_Project.here
-    def apply(arg: Bytes): Bytes = arg.compress()
-  }
-
-  object Uncompress extends Scala.Fun_Bytes("XZ.uncompress") {
-    val here = Scala_Project.here
-    def apply(arg: Bytes): Bytes = arg.uncompress()
-  }
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/General/zstd.ML	Fri Oct 21 16:39:31 2022 +0200
@@ -0,0 +1,19 @@
+(*  Title:      Pure/General/zstd.ML
+    Author:     Makarius
+
+Support for Zstd compression (via Isabelle/Scala).
+*)
+
+signature Zstd =
+sig
+  val compress: Bytes.T -> Bytes.T
+  val uncompress: Bytes.T -> Bytes.T
+end;
+
+structure Zstd: Zstd =
+struct
+
+val compress = \<^scala>\<open>Zstd.compress\<close>;
+val uncompress = \<^scala>\<open>Zstd.uncompress\<close>;
+
+end;
--- a/src/Pure/PIDE/xml.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/PIDE/xml.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -189,15 +189,15 @@
 
   object Cache {
     def make(
-        xz: XZ.Cache = XZ.Cache.make(),
+        compress: Compress.Cache = Compress.Cache.make(),
         max_string: Int = isabelle.Cache.default_max_string,
         initial_size: Int = isabelle.Cache.default_initial_size): Cache =
-      new Cache(xz, max_string, initial_size)
+      new Cache(compress, max_string, initial_size)
 
-    val none: Cache = make(XZ.Cache.none, max_string = 0)
+    val none: Cache = make(Compress.Cache.none, max_string = 0)
   }
 
-  class Cache(val xz: XZ.Cache, max_string: Int, initial_size: Int)
+  class Cache(val compress: Compress.Cache, max_string: Int, initial_size: Int)
   extends isabelle.Cache(max_string, initial_size) {
     protected def cache_props(x: Properties.T): Properties.T = {
       if (x.isEmpty) x
--- a/src/Pure/ROOT.ML	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/ROOT.ML	Fri Oct 21 16:39:31 2022 +0200
@@ -348,6 +348,7 @@
 
 ML_file "General/base64.ML";
 ML_file "General/xz.ML";
+ML_file "General/zstd.ML";
 ML_file "Tools/build.ML";
 ML_file "Tools/named_thms.ML";
 ML_file "Tools/print_operation.ML";
--- a/src/Pure/System/scala.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/System/scala.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -368,8 +368,10 @@
   Scala_Build.Scala_Fun,
   Base64.Decode,
   Base64.Encode,
-  XZ.Compress,
-  XZ.Uncompress,
+  Compress.XZ_Compress,
+  Compress.XZ_Uncompress,
+  Compress.Zstd_Compress,
+  Compress.Zstd_Uncompress,
   Doc.Doc_Names,
   Bibtex.Check_Database,
   Isabelle_System.Make_Directory,
--- a/src/Pure/Thy/export.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/Thy/export.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -134,7 +134,7 @@
 
     def uncompressed: Bytes = {
       val (compressed, bytes) = body.join
-      if (compressed) bytes.uncompress(cache = cache.xz) else bytes
+      if (compressed) bytes.uncompress(cache = cache.compress) else bytes
     }
 
     def uncompressed_yxml: XML.Body =
@@ -183,7 +183,7 @@
     cache: XML.Cache
   ): Entry = {
     val body =
-      if (args.compress) Future.fork(bytes.maybe_compress(cache = cache.xz))
+      if (args.compress) Future.fork(bytes.maybe_compress(cache = cache.compress))
       else Future.value((false, bytes))
     val entry_name = Entry_Name(session = session_name, theory = args.theory_name, name = args.name)
     Entry(entry_name, args.executable, body, cache)
--- a/src/Pure/Thy/sessions.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/Thy/sessions.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -1488,11 +1488,11 @@
         db.using_statement(table.insert()) { stmt =>
           stmt.string(1) = name
           stmt.bytes(2) = Properties.encode(build_log.session_timing)
-          stmt.bytes(3) = Properties.compress(build_log.command_timings, cache = cache.xz)
-          stmt.bytes(4) = Properties.compress(build_log.theory_timings, cache = cache.xz)
-          stmt.bytes(5) = Properties.compress(build_log.ml_statistics, cache = cache.xz)
-          stmt.bytes(6) = Properties.compress(build_log.task_statistics, cache = cache.xz)
-          stmt.bytes(7) = Build_Log.compress_errors(build_log.errors, cache = cache.xz)
+          stmt.bytes(3) = Properties.compress(build_log.command_timings, cache = cache.compress)
+          stmt.bytes(4) = Properties.compress(build_log.theory_timings, cache = cache.compress)
+          stmt.bytes(5) = Properties.compress(build_log.ml_statistics, cache = cache.compress)
+          stmt.bytes(6) = Properties.compress(build_log.task_statistics, cache = cache.compress)
+          stmt.bytes(7) = Build_Log.compress_errors(build_log.errors, cache = cache.compress)
           stmt.string(8) = build.sources
           stmt.string(9) = cat_lines(build.input_heaps)
           stmt.string(10) = build.output_heap getOrElse ""
--- a/src/Pure/term.scala	Fri Oct 21 14:45:13 2022 +0200
+++ b/src/Pure/term.scala	Fri Oct 21 16:39:31 2022 +0200
@@ -177,16 +177,16 @@
 
   object Cache {
     def make(
-        xz: XZ.Cache = XZ.Cache.make(),
+        compress: Compress.Cache = Compress.Cache.make(),
         max_string: Int = isabelle.Cache.default_max_string,
         initial_size: Int = isabelle.Cache.default_initial_size): Cache =
-      new Cache(xz, initial_size, max_string)
+      new Cache(compress, initial_size, max_string)
 
     val none: Cache = make(max_string = 0)
   }
 
-  class Cache(xz: XZ.Cache, max_string: Int, initial_size: Int)
-  extends XML.Cache(xz, max_string, initial_size) {
+  class Cache(compress: Compress.Cache, max_string: Int, initial_size: Int)
+  extends XML.Cache(compress, max_string, initial_size) {
     protected def cache_indexname(x: Indexname): Indexname =
       lookup(x) getOrElse store(Indexname(cache_string(x.name), x.index))