more Mailman content;
authorwenzelm
Fri, 10 Dec 2021 18:56:18 +0100
changeset 74906 9702913db56c
parent 74905 246e22068141
child 74907 af2323593473
more Mailman content;
src/Pure/General/mailman.scala
--- a/src/Pure/General/mailman.scala	Fri Dec 10 16:55:42 2021 +0100
+++ b/src/Pure/General/mailman.scala	Fri Dec 10 18:56:18 2021 +0100
@@ -9,65 +9,89 @@
 
 import java.net.URL
 
+import scala.util.matching.Regex
+
 
 object Mailman
 {
   /* mailing list archives */
 
-  def archive(url: URL, name: String = ""): Archive =
+  def archive(url: URL, msg_format: Msg_Format, name: String = ""): Archive =
   {
-    val html = Url.read(url)
+    val list_url =
+      Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/")
+
+    val html = Url.read(list_url)
     val title =
       """<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(html).map(_.group(1))
     val hrefs_text =
       """href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(html).map(_.group(1)).toList
 
-    val list_url =
-      Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/")
     val list_name =
       (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
-    new Archive(list_url, list_name, hrefs_text)
+    new Archive(list_url, list_name, msg_format, hrefs_text)
   }
 
-  class Archive private[Mailman](val list_url: URL, val list_name: String, hrefs_text: List[String])
+  abstract class Msg_Format
+  {
+    def regex: Regex
+  }
+
+  class Archive private[Mailman](
+    val list_url: URL,
+    val list_name: String,
+    msg_format: Msg_Format,
+    hrefs_text: List[String])
   {
     override def toString: String = list_name
 
-    def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] =
+    private def hrefs_msg: List[String] =
+      (for {
+        href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(Url.read(list_url)).map(_.group(1))
+        html = Url.read(new URL(list_url, href + "/date.html"))
+        msg <- msg_format.regex.findAllMatchIn(html).map(_.group(1))
+      } yield href + "/" + msg).toList
+
+    def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] =
     {
       val dir = target_dir + Path.basic(list_name)
-      Isabelle_System.make_directory(dir)
+      val path = dir + Path.explode(href)
+      val url = new URL(list_url, href)
+      val connection = url.openConnection
+      try {
+        val length = connection.getContentLengthLong
+        val timestamp = connection.getLastModified
+        val file = path.file
+        if (file.isFile && file.length == length && file.lastModified == timestamp) None
+        else {
+          Isabelle_System.make_directory(path.dir)
+          progress.echo("Getting " + url)
+          val bytes =
+            using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024))
+          Bytes.write(file, bytes)
+          file.setLastModified(timestamp)
+          Some(path)
+        }
+      }
+      finally { connection.getInputStream.close() }
+    }
 
-      hrefs_text.flatMap(name =>
-        {
-          val path = dir + Path.basic(name)
-          val url = new URL(list_url, name)
-          val connection = url.openConnection
-          try {
-            val length = connection.getContentLengthLong
-            val timestamp = connection.getLastModified
-            val file = path.file
-            if (file.isFile && file.length == length && file.lastModified == timestamp) None
-            else {
-              progress.echo("Getting " + url)
-              val bytes =
-                using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024))
-              Bytes.write(file, bytes)
-              file.setLastModified(timestamp)
-              Some(path)
-            }
-          }
-          finally { connection.getInputStream.close() }
-        })
-    }
+    def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] =
+      hrefs_text.flatMap(get(target_dir, _, progress = progress))
+
+    def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] =
+      hrefs_msg.flatMap(get(target_dir, _, progress = progress))
   }
 
 
   /* Isabelle mailing lists */
 
   def isabelle_users: Archive =
-    archive(Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"), name = "isabelle-users")
+    archive(Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"), name = "isabelle-users",
+      msg_format =
+        new Msg_Format { val regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r })
 
   def isabelle_dev: Archive =
-    archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev"))
+    archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev"),
+      new Msg_Format { val regex: Regex = """<LI><A HREF="(\d+\.html)">""".r })
 }