src/Pure/General/mailman.scala
author wenzelm
Fri, 10 Dec 2021 19:21:14 +0100
changeset 74907 af2323593473
parent 74906 9702913db56c
child 74921 74655fd58f8e
permissions -rw-r--r--
more Mailman archives;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/mailman.scala
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     3
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     4
Support for Mailman list servers.
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     5
*/
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     6
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     7
package isabelle
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     8
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
     9
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    10
import java.net.URL
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    11
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    12
import scala.util.matching.Regex
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    13
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    14
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    15
object Mailman
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    16
{
72559
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    17
  /* mailing list archives */
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    18
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    19
  def archive(url: URL, msg_format: Msg_Format, name: String = ""): Archive =
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    20
  {
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    21
    val list_url =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    22
      Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/")
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    23
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    24
    val html = Url.read(list_url)
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    25
    val title =
74905
246e22068141 clarified signature;
wenzelm
parents: 73867
diff changeset
    26
      """<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(html).map(_.group(1))
246e22068141 clarified signature;
wenzelm
parents: 73867
diff changeset
    27
    val hrefs_text =
246e22068141 clarified signature;
wenzelm
parents: 73867
diff changeset
    28
      """href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(html).map(_.group(1)).toList
246e22068141 clarified signature;
wenzelm
parents: 73867
diff changeset
    29
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    30
    val list_name =
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    31
      (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    32
    new Archive(list_url, list_name, msg_format, hrefs_text)
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    33
  }
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    34
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    35
  abstract class Msg_Format
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    36
  {
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    37
    def regex: Regex
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    38
  }
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    39
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    40
  class Archive private[Mailman](
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    41
    val list_url: URL,
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    42
    val list_name: String,
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    43
    msg_format: Msg_Format,
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    44
    hrefs_text: List[String])
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    45
  {
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    46
    override def toString: String = list_name
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    47
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    48
    private def hrefs_msg: List[String] =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    49
      (for {
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    50
        href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(Url.read(list_url)).map(_.group(1))
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    51
        html = Url.read(new URL(list_url, href + "/date.html"))
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    52
        msg <- msg_format.regex.findAllMatchIn(html).map(_.group(1))
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    53
      } yield href + "/" + msg).toList
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    54
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    55
    def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] =
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    56
    {
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    57
      val dir = target_dir + Path.basic(list_name)
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    58
      val path = dir + Path.explode(href)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    59
      val url = new URL(list_url, href)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    60
      val connection = url.openConnection
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    61
      try {
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    62
        val length = connection.getContentLengthLong
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    63
        val timestamp = connection.getLastModified
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    64
        val file = path.file
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    65
        if (file.isFile && file.length == length && file.lastModified == timestamp) None
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    66
        else {
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    67
          Isabelle_System.make_directory(path.dir)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    68
          progress.echo("Getting " + url)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    69
          val bytes =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    70
            using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024))
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    71
          Bytes.write(file, bytes)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    72
          file.setLastModified(timestamp)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    73
          Some(path)
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    74
        }
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    75
      }
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    76
      finally { connection.getInputStream.close() }
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    77
    }
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    78
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    79
    def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    80
      hrefs_text.flatMap(get(target_dir, _, progress = progress))
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    81
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    82
    def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    83
      hrefs_msg.flatMap(get(target_dir, _, progress = progress))
74907
af2323593473 more Mailman archives;
wenzelm
parents: 74906
diff changeset
    84
af2323593473 more Mailman archives;
wenzelm
parents: 74906
diff changeset
    85
    def download(target_dir: Path, progress: Progress = new Progress): List[Path] =
af2323593473 more Mailman archives;
wenzelm
parents: 74906
diff changeset
    86
      download_text(target_dir, progress = progress) :::
af2323593473 more Mailman archives;
wenzelm
parents: 74906
diff changeset
    87
      download_msg(target_dir, progress = progress)
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
    88
  }
72559
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    89
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    90
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    91
  /* Isabelle mailing lists */
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    92
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    93
  def isabelle_users: Archive =
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    94
    archive(Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"), name = "isabelle-users",
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    95
      msg_format =
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    96
        new Msg_Format { val regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r })
72559
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    97
274c9986e55b maintain Isabelle mailing list archives;
wenzelm
parents: 72558
diff changeset
    98
  def isabelle_dev: Archive =
74906
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
    99
    archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev"),
9702913db56c more Mailman content;
wenzelm
parents: 74905
diff changeset
   100
      new Msg_Format { val regex: Regex = """<LI><A HREF="(\d+\.html)">""".r })
72558
38ebf696fd0c support for Mailman list servers;
wenzelm
parents:
diff changeset
   101
}