| author | wenzelm | 
| Sat, 11 Feb 2023 23:02:51 +0100 | |
| changeset 77255 | b810e99b5afb | 
| parent 75906 | 2167b9e3157a | 
| child 77368 | 7c57d9586f4c | 
| permissions | -rw-r--r-- | 
| 72558 | 1  | 
/* Title: Pure/General/mailman.scala  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 74942 | 4  | 
Support for Mailman list servers, notably isabelle-users and isabelle-dev.  | 
| 72558 | 5  | 
*/  | 
6  | 
||
7  | 
package isabelle  | 
|
8  | 
||
9  | 
||
10  | 
import java.net.URL  | 
|
11  | 
||
| 74921 | 12  | 
import scala.annotation.tailrec  | 
| 74906 | 13  | 
import scala.util.matching.Regex  | 
| 74921 | 14  | 
import scala.util.matching.Regex.Match  | 
| 74906 | 15  | 
|
| 72558 | 16  | 
|
| 75393 | 17  | 
object Mailman {
 | 
| 74921 | 18  | 
/* mailing list messages */  | 
| 72559 | 19  | 
|
| 74935 | 20  | 
def is_address(s: String): Boolean =  | 
21  | 
    s.contains('@') && s.contains('.') && !s.contains(' ')
 | 
|
22  | 
||
| 74936 | 23  | 
private val standard_name: Map[String, String] =  | 
| 74935 | 24  | 
Map(  | 
| 74941 | 25  | 
"121171528@qq.com" -> "Guo Fan\n121171528@qq.com",  | 
| 74937 | 26  | 
"Aman Pohjola, Johannes (Data61, Kensington NSW)" -> "Johannes Aman Pohjola",  | 
27  | 
"Andrei de AraÃjo Formiga" -> "Andrei de Araujo Formiga",  | 
|
| 74940 | 28  | 
"Benedikt.AHRENS@unice.fr" -> "Benedikt Ahrens\nBenedikt.AHRENS@unice.fr",  | 
29  | 
"Berg, Nils Erik" -> "Nils Erik Berg",  | 
|
| 74937 | 30  | 
"Berger U." -> "Ulrich Berger",  | 
31  | 
"Bisping, Benjamin" -> "Benjamin Bisping",  | 
|
| 74935 | 32  | 
"Blanchette, J.C." -> "Jasmin Christian Blanchette",  | 
33  | 
"Buday Gergely István" -> "Gergely Buday",  | 
|
| 74938 | 34  | 
"CALaF1UJ9Uy0vGCu4WkBmbfuPDxG7nFm8hfeCMP+O3g7_5CQ0Bw@mail.gmail.com" -> "",  | 
| 74937 | 35  | 
"CRACIUN F." -> "Florin Craciun",  | 
36  | 
"Carsten Schuermann" -> "Carsten Schürmann",  | 
|
| 74940 | 37  | 
"Chris" -> "",  | 
| 74935 | 38  | 
"Christoph Lueth" -> "Christoph Lüth",  | 
39  | 
"Claude Marche" -> "Claude Marché",  | 
|
| 74937 | 40  | 
"Daniel StÃwe" -> "Daniel Stüwe",  | 
| 74940 | 41  | 
"Daniel.Matichuk@data61.csiro.au" -> "Daniel Matichuk\nDaniel.Matichuk@data61.csiro.au",  | 
42  | 
"Daniel.Matichuk@nicta.com.au" -> "Daniel Matichuk\nDaniel.Matichuk@nicta.com.au",  | 
|
| 74937 | 43  | 
"David MENTRE" -> "David MENTRÉ",  | 
| 74940 | 44  | 
"Dey, Katie" -> "Katie Dey",  | 
| 74937 | 45  | 
"Dr. Brendan Patrick Mahony" -> "Brendan Mahony",  | 
46  | 
"Farn" -> "Farn Wang",  | 
|
| 74935 | 47  | 
"Farquhar, Colin I" -> "Colin Farquhar",  | 
| 74940 | 48  | 
"Fernandez, Matthew" -> "Matthew Fernandez",  | 
| 74937 | 49  | 
"Filip Maric" -> "Filip Marić",  | 
50  | 
"Filip MariÄ" -> "Filip Marić",  | 
|
51  | 
"Fleury Mathias" -> "Mathias Fleury",  | 
|
| 74940 | 52  | 
"Francisco Jose CHAVES ALONSO" -> "Francisco Jose Chaves Alonso",  | 
53  | 
"Frederic Tuong (Dr)" -> "Frederic Tuong",  | 
|
54  | 
"Fulya" -> "Fulya Horozal",  | 
|
| 74935 | 55  | 
"George K." -> "George Karabotsos",  | 
| 74940 | 56  | 
"Gidon Ernst" -> "Gidon ERNST",  | 
57  | 
"Gransden, Thomas" -> "Thomas Gransden",  | 
|
| 74937 | 58  | 
"Hans-JÃrg Schurr" -> "Hans-Jörg Schurr",  | 
59  | 
"Henri DEBRAT" -> "Henri Debrat",  | 
|
| 74935 | 60  | 
"Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki",  | 
| 74940 | 61  | 
"Häuselmann Rafael" -> "Rafael Häuselmann",  | 
| 74937 | 62  | 
"Isabelle" -> "",  | 
| 74940 | 63  | 
"J. Juhas (TUM)" -> "Jonatan Juhas",  | 
| 74937 | 64  | 
"Jackson, Vincent (Data61, Kensington NSW)" -> "Vincent Jackson",  | 
65  | 
"Janney, Mark-P26816" -> "Mark Janney",  | 
|
| 74940 | 66  | 
"Jean François Molderez" -> "Jean-François Molderez",  | 
67  | 
"Jean-Francois Molderez" -> "Jean-François Molderez",  | 
|
68  | 
"John R Harrison" -> "John Harrison",  | 
|
| 74937 | 69  | 
"Jose DivasÃn" -> "Jose Divasón",  | 
70  | 
"Julian" -> "",  | 
|
71  | 
"Julien" -> "",  | 
|
72  | 
"Klein, Gerwin (Data61, Kensington NSW)" -> "Gerwin Klein",  | 
|
73  | 
"Kobayashi, Hidetsune" -> "Hidetsune Kobayashi",  | 
|
| 74940 | 74  | 
"Kylie Williams (IND)" -> "Kylie Williams",  | 
75  | 
"Laarman, A.W." -> "A.W. Laarman",  | 
|
| 74937 | 76  | 
"Laurent Thery" -> "Laurent Théry",  | 
| 74940 | 77  | 
"Li, Chanjuan" -> "Li Chanjuan",  | 
| 74937 | 78  | 
"Lochbihler Andreas" -> "Andreas Lochbihler",  | 
| 74940 | 79  | 
"Luckhardt, Daniel" -> "Daniel Luckhardt",  | 
| 74937 | 80  | 
"Lutz Schroeder" -> "Lutz Schröder",  | 
81  | 
"Lutz SchrÃder" -> "Lutz Schröder",  | 
|
| 74940 | 82  | 
"MACKENZIE Carlin" -> "Carlin MACKENZIE",  | 
| 74937 | 83  | 
"Makarius" -> "Makarius Wenzel",  | 
84  | 
"Marco" -> "",  | 
|
85  | 
"Mark" -> "",  | 
|
| 74940 | 86  | 
"Markus Mueller-Olm" -> "Markus Müller-Olm",  | 
| 74937 | 87  | 
"Markus" -> "",  | 
| 74940 | 88  | 
"Marmsoler, Diego" -> "Diego Marmsoler",  | 
| 74937 | 89  | 
"Martin Klebermass" -> "Martin Klebermaß",  | 
| 74941 | 90  | 
"Martyn Johnson via RT" -> "",  | 
91  | 
"Mathias.Fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com",  | 
|
| 74940 | 92  | 
"Matthew" -> "",  | 
| 74937 | 93  | 
"Matthews, John R" -> "John Matthews",  | 
| 74940 | 94  | 
"McCarthy, Jim (C3ID)" -> "Jim McCarthy",  | 
95  | 
"McCue, Brian" -> "Brian McCue",  | 
|
| 74937 | 96  | 
"Michael FÃrber" -> "Michael Färber",  | 
| 74940 | 97  | 
"Michel" -> "",  | 
98  | 
"Miranda, Brando" -> "Brando Miranda",  | 
|
99  | 
"Moscato, Mariano M. \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Mariano M. Moscato",  | 
|
100  | 
"Mr Julian Fell" -> "Julian Fell",  | 
|
101  | 
"Mueller Peter" -> "Peter Müller",  | 
|
102  | 
"Munoz, Cesar Augusto (LARC-D320)" -> "Cesar A. Munoz",  | 
|
103  | 
"Nadel, Alexander" -> "Alexander Nadel",  | 
|
104  | 
"Nagashima, Yutaka" -> "Yutaka Nagashima",  | 
|
| 74937 | 105  | 
"Norrish, Michael (Data61, Acton)" -> "Michael Norrish",  | 
| 74940 | 106  | 
"O'Leary, John W" -> "John W O'Leary",  | 
| 74937 | 107  | 
"Omar Montano Rivas" -> "Omar Montaño Rivas",  | 
108  | 
"Omar MontaÃo Rivas" -> "Omar Montaño Rivas",  | 
|
109  | 
"OndÅej KunÄar" -> "Ondřej Kunčar",  | 
|
| 74941 | 110  | 
"PALMER Jake" -> "Jake Palmer",  | 
| 74937 | 111  | 
"PAQUI LUCIO" -> "Paqui Lucio",  | 
| 74940 | 112  | 
"Pal, Abhik" -> "Abhik Pal",  | 
113  | 
"Pasupuleti, Vijay" -> "Vijay Pasupuleti",  | 
|
| 74937 | 114  | 
"Peter Vincent Homeier" -> "Peter V. Homeier",  | 
115  | 
"Peter" -> "",  | 
|
116  | 
"Philipp Ruemmer" -> "Philipp Rümmer",  | 
|
117  | 
"Philipp RÃmmer" -> "Philipp Rümmer",  | 
|
| 74941 | 118  | 
"Piete Brooks via RT" -> "",  | 
| 74940 | 119  | 
"RTA publicity chair" -> "",  | 
| 74937 | 120  | 
"Raamsdonk, F. van" -> "Femke van Raamsdonk",  | 
121  | 
"Raul Gutierrez" -> "Raúl Gutiérrez",  | 
|
122  | 
"Renà Thiemann" -> "René Thiemann",  | 
|
| 74940 | 123  | 
"Ridgway, John V. E." -> "John V. E. Ridgway",  | 
| 74937 | 124  | 
"Roggenbach M." -> "Markus Roggenbach",  | 
| 74940 | 125  | 
"Rosu, Grigore" -> "Grigore Rosu",  | 
| 74937 | 126  | 
"Rozman, Mihaela" -> "Mihaela Rozman",  | 
| 74940 | 127  | 
"Schmaltz, J." -> "Julien Schmaltz",  | 
| 74937 | 128  | 
"Serguei A. Mokhov on behalf of PST-11" -> "Serguei A. Mokhov",  | 
| 74940 | 129  | 
"Serguei Mokhov" -> "Serguei A. Mokhov",  | 
130  | 
"Shumeiko, Igor" -> "Igor Shumeiko",  | 
|
131  | 
"Siek, Jeremy" -> "Jeremy Siek",  | 
|
132  | 
"Silvio.Ranise@loria.fr" -> "Silvio Ranise\nSilvio.Ranise@loria.fr",  | 
|
133  | 
"Siu, Tony" -> "Tony Siu",  | 
|
| 74937 | 134  | 
"Stüber, Sebastian" -> "Sebastian Stüber",  | 
135  | 
"Thiemann, Rene" -> "René Thiemann",  | 
|
136  | 
"Thiemann, René" -> "René Thiemann",  | 
|
| 74940 | 137  | 
"Thomas Arthur Leck Sewell" -> "Thomas Sewell",  | 
| 74937 | 138  | 
"Thomas Goethel" -> "Thomas Göthel",  | 
| 74940 | 139  | 
"Thomas.Sewell@data61.csiro.au" -> "Thomas Sewell\nThomas.Sewell@data61.csiro.au",  | 
| 74941 | 140  | 
"Tjark Weber via RT" -> "Tjark Weber",  | 
| 74940 | 141  | 
"Toby.Murray@data61.csiro.au" -> "Toby Murray\nToby.Murray@data61.csiro.au",  | 
| 74935 | 142  | 
"Urban, Christian" -> "Christian Urban",  | 
| 74940 | 143  | 
"Ursula Eschbach" -> "",  | 
144  | 
"Van Staden Stephan" -> "Stephan van Staden",  | 
|
| 74937 | 145  | 
"Viktor Kuncak" -> "Viktor Kunčak",  | 
146  | 
"Viorel Preoteasaa" -> "Viorel Preoteasa",  | 
|
| 74940 | 147  | 
"Wickerson, John P" -> "John Wickerson",  | 
148  | 
"Wong, Yat" -> "Yat Wong",  | 
|
149  | 
"YAMADA, Akihisa" -> "Akihisa Yamada",  | 
|
| 74937 | 150  | 
"YliÃs Falcone" -> "Yliès Falcone",  | 
| 74940 | 151  | 
"amir mohajeri" -> "Amir Mohajeri",  | 
152  | 
"aniello murano" -> "Aniello Murano",  | 
|
153  | 
"barzan stefania" -> "Stefania Barzan",  | 
|
154  | 
"benhamou" -> "Belaid Benhamou",  | 
|
155  | 
"charmi panchal" -> "Charmi Panchal",  | 
|
156  | 
"chen kun" -> "Chen Kun",  | 
|
157  | 
"chunhan wu" -> "Chunhan Wu",  | 
|
158  | 
"daniel de la concepción sáez" -> "Daniel de la Concepción Sáez",  | 
|
159  | 
"daniel.luckhardt@mathematik.uni-goettingen.de" -> "Logiker@gmx.net",  | 
|
160  | 
"david streader" -> "David Streader",  | 
|
161  | 
"eschbach@in.tum.de" -> "",  | 
|
162  | 
"f.rabe@jacobs-university.de" -> "florian.rabe@fau.de",  | 
|
163  | 
"florian@haftmann-online.de" -> "haftmann@in.tum.de",  | 
|
164  | 
"fredegar@haftmann-online.de" -> "haftmann@in.tum.de",  | 
|
| 74937 | 165  | 
"gallais @ ensl.org" -> "Guillaume Allais",  | 
| 74940 | 166  | 
"geng chen" -> "Geng Chen",  | 
167  | 
"henning.seidler" -> "Henning Seidler",  | 
|
| 74937 | 168  | 
"hkb" -> "Hidetsune Kobayashi",  | 
| 74941 | 169  | 
"jobs-pm@inf.ethz.ch" -> "",  | 
| 74937 | 170  | 
"julien@RadboudUniversity" -> "",  | 
| 74940 | 171  | 
"jun sun" -> "Jun Sun",  | 
172  | 
"jwang whu.edu.cn (jwang)" -> "jwang",  | 
|
173  | 
"kostas pouliasis" -> "Kostas Pouliasis",  | 
|
174  | 
"kristof.teichel@ptb.de" -> "Kristof Teichel\nkristof.teichel@ptb.de",  | 
|
175  | 
"lucas cavalcante" -> "Lucas Cavalcante",  | 
|
176  | 
"mahmoud abdelazim" -> "Mahmoud Abdelazim",  | 
|
177  | 
"manish surolia" -> "Manish Surolia",  | 
|
| 74937 | 178  | 
"mantel" -> "Heiko Mantel",  | 
| 74940 | 179  | 
"marco caminati" -> "Marco Caminati",  | 
| 74941 | 180  | 
"mathias.fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com",  | 
| 74937 | 181  | 
"merz@loria.fr" -> "stephan.merz@loria.fr",  | 
| 74940 | 182  | 
"michel levy" -> "Michel Levy",  | 
183  | 
"michel.levy2009@laposte.net" -> "Michel Levy\nmichel.levy2009@laposte.net",  | 
|
| 74937 | 184  | 
"nemouchi" -> "Yakoub Nemouchi",  | 
| 74941 | 185  | 
"noam neer" -> "Noam Neer",  | 
186  | 
"olfa mraihi" -> "Olfa Mraihi",  | 
|
187  | 
"pathsnottakenworkshop@gmail.com" -> "Leo Freitas\nleo.freitas@newcastle.ac.uk",  | 
|
| 74940 | 188  | 
"patrick barlatier" -> "Patrick Barlatier",  | 
189  | 
"patrick dabou" -> "Patrick Dabou",  | 
|
190  | 
"paul zimmermann" -> "Paul Zimmermann",  | 
|
| 74937 | 191  | 
"popescu2@illinois.edu" -> "Andrei Popescu",  | 
| 74940 | 192  | 
"recruiting" -> "",  | 
| 74941 | 193  | 
"recruiting@mais.informatik.tu-darmstadt.de" -> "",  | 
| 74940 | 194  | 
"roux cody" -> "Cody Roux",  | 
195  | 
"scott constable" -> "Scott Constable",  | 
|
196  | 
"superuser@mattweidner.com" -> "Matthew Weidner\nsuperuser@mattweidner.com",  | 
|
197  | 
"urban@math.lmu.de" -> "Christian Urban\nurban@math.lmu.de",  | 
|
| 74937 | 198  | 
"veronique.cortier@loria.fr" -> "Veronique.Cortier@loria.fr",  | 
| 74940 | 199  | 
"vikram singh" -> "Vikram Singh",  | 
| 74937 | 200  | 
"wenzelm@in.tum.de" -> "makarius@sketis.net",  | 
| 74940 | 201  | 
"werner@lix.polytechnique.fr" -> "Benjamin Werner\nwerner@lix.polytechnique.fr",  | 
202  | 
"wmansky@cs.princeton.edu" -> "William Mansky\nwmansky@cs.princeton.edu",  | 
|
203  | 
"y.nemouchi@ensbiotech.edu.dz" -> "Yakoub Nemouchi\ny.nemouchi@ensbiotech.edu.dz",  | 
|
| 74937 | 204  | 
"ÐÑÐÐÐÑÐÐÐ ÐÐÐÐÐÐÐÑÐÐÐÑ ÐÐÐÑÐÐÐ" -> "",  | 
205  | 
"∀X.Xπ - Tutorials about Proofs" -> "Bruno Woltzenlogel Paleo",  | 
|
| 74935 | 206  | 
).withDefault(identity)  | 
207  | 
||
| 74936 | 208  | 
def standard_author_info(author_info: List[String]): List[String] =  | 
| 74940 | 209  | 
author_info.flatMap(s => split_lines(standard_name.getOrElse(s, s))).distinct  | 
| 74936 | 210  | 
|
| 74921 | 211  | 
sealed case class Message(  | 
212  | 
name: String,  | 
|
213  | 
date: Date,  | 
|
214  | 
title: String,  | 
|
| 74936 | 215  | 
author_info: List[String],  | 
| 74934 | 216  | 
body: String,  | 
| 75393 | 217  | 
tags: List[String]  | 
218  | 
  ) {
 | 
|
| 74936 | 219  | 
    if (author_info.isEmpty || author_info.exists(_.isEmpty)) {
 | 
220  | 
      error("Bad author information in " + quote(name))
 | 
|
| 74935 | 221  | 
}  | 
222  | 
||
| 74925 | 223  | 
override def toString: String = name  | 
| 74935 | 224  | 
}  | 
| 74906 | 225  | 
|
| 75393 | 226  | 
  object Messages {
 | 
| 74935 | 227  | 
type Graph = isabelle.Graph[String, Message]  | 
228  | 
||
| 75393 | 229  | 
    def apply(msgs: List[Message]): Messages = {
 | 
| 74936 | 230  | 
def make_node(g: Graph, node: String, msg: Message): Graph =  | 
231  | 
if (g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g  | 
|
| 74935 | 232  | 
else g.default_node(node, msg)  | 
233  | 
||
| 74936 | 234  | 
def connect_nodes(g: Graph, nodes: List[String]): Graph =  | 
235  | 
        nodes match {
 | 
|
236  | 
case Nil => g  | 
|
237  | 
          case a :: bs => bs.foldLeft(g)({ case (g1, b) => g1.add_edge(a, b).add_edge(b, a) })
 | 
|
238  | 
}  | 
|
239  | 
||
| 74935 | 240  | 
new Messages(msgs.sortBy(_.date)(Date.Ordering),  | 
241  | 
msgs.foldLeft[Graph](Graph.string)(  | 
|
| 74936 | 242  | 
          { case (graph, msg) =>
 | 
243  | 
val nodes = msg.author_info  | 
|
244  | 
connect_nodes(nodes.foldLeft(graph)(make_node(_, _, msg)), nodes)  | 
|
| 74935 | 245  | 
}))  | 
246  | 
}  | 
|
| 74934 | 247  | 
|
| 75393 | 248  | 
    def find(dir: Path): Messages = {
 | 
| 74934 | 249  | 
val msgs =  | 
250  | 
        for {
 | 
|
251  | 
archive <- List(Isabelle_Users, Isabelle_Dev)  | 
|
252  | 
msg <- archive.find_messages(dir + Path.basic(archive.list_name))  | 
|
253  | 
} yield msg  | 
|
254  | 
Messages(msgs)  | 
|
255  | 
}  | 
|
| 74935 | 256  | 
|
| 75393 | 257  | 
    sealed case class Cluster(author_info: List[String]) {
 | 
| 74936 | 258  | 
val (addresses, names) = author_info.partition(is_address)  | 
259  | 
||
| 74935 | 260  | 
val name: String =  | 
261  | 
        names.headOption getOrElse {
 | 
|
262  | 
          addresses match {
 | 
|
263  | 
            case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ')
 | 
|
264  | 
            case Nil => error("Empty cluster")
 | 
|
265  | 
}  | 
|
266  | 
}  | 
|
267  | 
||
| 74940 | 268  | 
val name_lowercase: String = Word.lowercase(name)  | 
269  | 
||
| 74935 | 270  | 
def get_address: Option[String] = addresses.headOption  | 
271  | 
||
272  | 
def unique: Boolean = names.length == 1 && addresses.length == 1  | 
|
273  | 
def multi: Boolean = names.length > 1 || addresses.length > 1  | 
|
274  | 
||
| 75393 | 275  | 
      def print: String = {
 | 
| 74935 | 276  | 
        val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses
 | 
277  | 
        entries.mkString("\n  * ", "\n    ", "")
 | 
|
278  | 
}  | 
|
279  | 
}  | 
|
| 74933 | 280  | 
}  | 
281  | 
||
| 75393 | 282  | 
  class Messages private(val sorted: List[Message], val graph: Messages.Graph) {
 | 
| 74935 | 283  | 
    override def toString: String = "Messages(" + sorted.size + ")"
 | 
| 74925 | 284  | 
|
| 75393 | 285  | 
    object Node_Ordering extends scala.math.Ordering[String] {
 | 
| 74935 | 286  | 
override def compare(a: String, b: String): Int =  | 
287  | 
Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date)  | 
|
288  | 
}  | 
|
289  | 
||
290  | 
def get_cluster(msg: Message): Messages.Cluster =  | 
|
| 74936 | 291  | 
Messages.Cluster(graph.all_succs(msg.author_info).sorted.sorted(Node_Ordering))  | 
| 74935 | 292  | 
|
293  | 
def get_name(msg: Message): String = get_cluster(msg).name  | 
|
| 74925 | 294  | 
|
| 74935 | 295  | 
def get_address(msg: Message): String =  | 
296  | 
      get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name))
 | 
|
297  | 
||
| 75393 | 298  | 
    def check(check_all: Boolean, check_multi: Boolean = false): Unit = {
 | 
| 74940 | 299  | 
val clusters = sorted.map(get_cluster).distinct.sortBy(_.name_lowercase)  | 
| 74935 | 300  | 
|
| 74937 | 301  | 
      if (check_all) {
 | 
302  | 
        Output.writeln(cat_lines("clusters:" :: clusters.map(_.print)))
 | 
|
303  | 
}  | 
|
304  | 
      else {
 | 
|
305  | 
val multi = if (check_multi) clusters.filter(_.multi) else Nil  | 
|
306  | 
        if (multi.nonEmpty) {
 | 
|
307  | 
          Output.writeln(cat_lines("ambiguous clusters:" :: multi.map(_.print)))
 | 
|
308  | 
}  | 
|
| 74925 | 309  | 
}  | 
| 74933 | 310  | 
|
| 74935 | 311  | 
val unknown = clusters.filter(cluster => cluster.get_address.isEmpty)  | 
312  | 
      if (unknown.nonEmpty) {
 | 
|
| 74937 | 313  | 
        Output.writeln(cat_lines("\nunknown mail:" :: unknown.map(_.print)))
 | 
| 74933 | 314  | 
}  | 
| 74925 | 315  | 
}  | 
316  | 
}  | 
|
317  | 
||
318  | 
||
| 74921 | 319  | 
/* mailing list archives */  | 
320  | 
||
321  | 
abstract class Archive(  | 
|
322  | 
url: URL,  | 
|
323  | 
name: String = "",  | 
|
| 75393 | 324  | 
tag: String = ""  | 
325  | 
  ) {
 | 
|
| 74921 | 326  | 
def message_regex: Regex  | 
| 74923 | 327  | 
def message_content(name: String, lines: List[String]): Message  | 
| 74921 | 328  | 
|
| 74924 | 329  | 
def message_match(lines: List[String], re: Regex): Option[Match] =  | 
330  | 
lines.flatMap(re.findFirstMatchIn(_)).headOption  | 
|
331  | 
||
| 75393 | 332  | 
    def make_name(str: String): String = {
 | 
| 74937 | 333  | 
val s =  | 
334  | 
        str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
 | 
|
335  | 
          .replace("mailbroy.informatik.tu-muenchen.de", "in.tum.de")
 | 
|
336  | 
          .replace("informatik.tu-muenchen.de", "in.tum.de")
 | 
|
| 74935 | 337  | 
      if (s.startsWith("=") && s.endsWith("=")) "" else s
 | 
338  | 
}  | 
|
| 74924 | 339  | 
|
340  | 
def make_body(lines: List[String]): String =  | 
|
341  | 
cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1)  | 
|
342  | 
||
| 74921 | 343  | 
private val main_url: URL =  | 
344  | 
Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/")  | 
|
345  | 
||
346  | 
private val main_html = Url.read(main_url)  | 
|
347  | 
||
| 75393 | 348  | 
    val list_name: String = {
 | 
| 74921 | 349  | 
val title =  | 
350  | 
"""<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(main_html).map(_.group(1))  | 
|
351  | 
      (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
 | 
|
352  | 
}  | 
|
| 72558 | 353  | 
override def toString: String = list_name  | 
354  | 
||
| 74923 | 355  | 
def full_name(href: String): String = list_name + "/" + href  | 
356  | 
||
| 74921 | 357  | 
def list_tag: String = proper_string(tag).getOrElse(list_name)  | 
358  | 
||
359  | 
def read_text(href: String): String = Url.read(new URL(main_url, href))  | 
|
360  | 
||
361  | 
def hrefs_text: List[String] =  | 
|
362  | 
"""href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(main_html).map(_.group(1)).toList  | 
|
363  | 
||
364  | 
def hrefs_msg: List[String] =  | 
|
| 74906 | 365  | 
      (for {
 | 
| 74921 | 366  | 
href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(main_html).map(_.group(1))  | 
367  | 
html = read_text(href + "/date.html")  | 
|
368  | 
msg <- message_regex.findAllMatchIn(html).map(_.group(1))  | 
|
| 74906 | 369  | 
} yield href + "/" + msg).toList  | 
370  | 
||
| 75393 | 371  | 
    def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] = {
 | 
| 72558 | 372  | 
val dir = target_dir + Path.basic(list_name)  | 
| 74906 | 373  | 
val path = dir + Path.explode(href)  | 
| 74921 | 374  | 
val url = new URL(main_url, href)  | 
| 74906 | 375  | 
val connection = url.openConnection  | 
376  | 
      try {
 | 
|
377  | 
val length = connection.getContentLengthLong  | 
|
378  | 
val timestamp = connection.getLastModified  | 
|
379  | 
val file = path.file  | 
|
380  | 
if (file.isFile && file.length == length && file.lastModified == timestamp) None  | 
|
381  | 
        else {
 | 
|
382  | 
Isabelle_System.make_directory(path.dir)  | 
|
383  | 
          progress.echo("Getting " + url)
 | 
|
384  | 
val bytes =  | 
|
385  | 
using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024))  | 
|
386  | 
Bytes.write(file, bytes)  | 
|
387  | 
file.setLastModified(timestamp)  | 
|
388  | 
Some(path)  | 
|
389  | 
}  | 
|
390  | 
}  | 
|
391  | 
      finally { connection.getInputStream.close() }
 | 
|
392  | 
}  | 
|
| 72558 | 393  | 
|
| 74906 | 394  | 
def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] =  | 
395  | 
hrefs_text.flatMap(get(target_dir, _, progress = progress))  | 
|
396  | 
||
397  | 
def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] =  | 
|
398  | 
hrefs_msg.flatMap(get(target_dir, _, progress = progress))  | 
|
| 74907 | 399  | 
|
400  | 
def download(target_dir: Path, progress: Progress = new Progress): List[Path] =  | 
|
401  | 
download_text(target_dir, progress = progress) :::  | 
|
402  | 
download_msg(target_dir, progress = progress)  | 
|
| 74921 | 403  | 
|
| 75393 | 404  | 
    def make_title(str: String): String = {
 | 
| 74921 | 405  | 
      val Trim1 = ("""\s*\Q[""" + list_tag + """]\E\s*(.*)""").r
 | 
406  | 
val Trim2 = """(?i:(?:re|fw|fwd)\s*:\s*)(.*)""".r  | 
|
407  | 
val Trim3 = """\[\s*(.*?)\s*\]""".r  | 
|
408  | 
@tailrec def trim(s: String): String =  | 
|
409  | 
        s match {
 | 
|
410  | 
case Trim1(s1) => trim(s1)  | 
|
411  | 
case Trim2(s1) => trim(s1)  | 
|
412  | 
case Trim3(s1) => trim(s1)  | 
|
413  | 
case _ => s  | 
|
414  | 
}  | 
|
415  | 
trim(str)  | 
|
416  | 
}  | 
|
417  | 
||
418  | 
def get_messages(): List[Message] =  | 
|
419  | 
for (href <- hrefs_msg) yield message_content(href, split_lines(read_text(href)))  | 
|
420  | 
||
| 75393 | 421  | 
    def find_messages(dir: Path): List[Message] = {
 | 
| 74921 | 422  | 
      for {
 | 
| 
75906
 
2167b9e3157a
clarified signature: support for adhoc file types;
 
wenzelm 
parents: 
75393 
diff
changeset
 | 
423  | 
file <- File.find_files(dir.file, file => File.is_html(file.getName))  | 
| 74921 | 424  | 
rel_path <- File.relative_path(dir, File.path(file))  | 
| 74923 | 425  | 
}  | 
426  | 
      yield {
 | 
|
427  | 
val name = full_name(rel_path.implode)  | 
|
428  | 
message_content(name, split_lines(File.read(file)))  | 
|
429  | 
}  | 
|
| 74921 | 430  | 
}  | 
| 72558 | 431  | 
}  | 
| 72559 | 432  | 
|
| 75393 | 433  | 
  private class Message_Chunk(bg: String = "", en: String = "") {
 | 
434  | 
    def unapply(lines: List[String]): Option[List[String]] = {
 | 
|
| 74932 | 435  | 
val res1 =  | 
436  | 
if (bg.isEmpty) Some(lines)  | 
|
437  | 
        else {
 | 
|
438  | 
          lines.dropWhile(_ != bg) match {
 | 
|
439  | 
case Nil => None  | 
|
440  | 
case _ :: rest => Some(rest)  | 
|
441  | 
}  | 
|
442  | 
}  | 
|
443  | 
if (en.isEmpty) res1  | 
|
444  | 
      else {
 | 
|
445  | 
        res1 match {
 | 
|
446  | 
case None => None  | 
|
447  | 
case Some(lines1) =>  | 
|
448  | 
val lines2 = lines1.takeWhile(_ != en)  | 
|
449  | 
if (lines1.drop(lines2.length).isEmpty) None else Some(lines2)  | 
|
450  | 
}  | 
|
451  | 
}  | 
|
452  | 
}  | 
|
453  | 
||
454  | 
def get(lines: List[String]): List[String] =  | 
|
455  | 
unapply(lines) getOrElse  | 
|
456  | 
        error("Missing delimiters:" +
 | 
|
457  | 
(if (bg.nonEmpty) " " else "") + bg +  | 
|
458  | 
(if (en.nonEmpty) " " else "") + en)  | 
|
459  | 
}  | 
|
460  | 
||
| 72559 | 461  | 
|
| 74930 | 462  | 
/* isabelle-users mailing list */  | 
| 72559 | 463  | 
|
| 74921 | 464  | 
object Isabelle_Users extends Archive(  | 
465  | 
    Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"),
 | 
|
| 75393 | 466  | 
name = "isabelle-users", tag = "isabelle"  | 
467  | 
  ) {
 | 
|
| 74921 | 468  | 
override def message_regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r  | 
469  | 
||
470  | 
private object Head extends  | 
|
| 74922 | 471  | 
Message_Chunk(bg = "<!--X-Head-of-Message-->", en = "<!--X-Head-of-Message-End-->")  | 
| 74921 | 472  | 
|
473  | 
private object Body extends  | 
|
| 74922 | 474  | 
Message_Chunk(bg = "<!--X-Body-of-Message-->", en = "<!--X-Body-of-Message-End-->")  | 
475  | 
||
| 75393 | 476  | 
    private object Date_Format {
 | 
| 74922 | 477  | 
private val Trim1 = """\w+,\s*(.*)""".r  | 
478  | 
private val Trim2 = """(.*?)\s*\(.*\)""".r  | 
|
479  | 
private val Format =  | 
|
480  | 
Date.Format(  | 
|
481  | 
"d MMM uuuu H:m:s Z",  | 
|
482  | 
"d MMM uuuu H:m:s z",  | 
|
483  | 
"d MMM yy H:m:s Z",  | 
|
484  | 
"d MMM yy H:m:s z")  | 
|
| 75393 | 485  | 
      def unapply(s: String): Option[Date] = {
 | 
| 74922 | 486  | 
        val s0 = s.replaceAll("""\s+""", " ")
 | 
487  | 
val s1 =  | 
|
488  | 
          s0 match {
 | 
|
489  | 
case Trim1(s1) => s1  | 
|
490  | 
case _ => s0  | 
|
491  | 
}  | 
|
492  | 
val s2 =  | 
|
493  | 
          s1 match {
 | 
|
494  | 
case Trim2(s2) => s2  | 
|
495  | 
case _ => s1  | 
|
496  | 
}  | 
|
497  | 
Format.unapply(s2)  | 
|
498  | 
}  | 
|
499  | 
}  | 
|
| 74921 | 500  | 
|
| 75393 | 501  | 
    override def make_name(str: String): String = {
 | 
| 74935 | 502  | 
      val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str))
 | 
| 74925 | 503  | 
if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s  | 
504  | 
}  | 
|
505  | 
||
| 75393 | 506  | 
    object Address {
 | 
| 74924 | 507  | 
private def anchor(s: String): String = """<a href="[^"]*">""" + s + """</a>"""  | 
508  | 
private def angl(s: String): String = """<""" + s + """>"""  | 
|
509  | 
private def quot(s: String): String = """"""" + s + """""""  | 
|
510  | 
      private def paren(s: String): String = """\(""" + s + """\)"""
 | 
|
511  | 
private val adr = """([^<>]*? at [^<>]*?)"""  | 
|
512  | 
private val any = """([^<>]*?)"""  | 
|
513  | 
private val spc = """\s*"""  | 
|
514  | 
||
515  | 
private val Name1 = quot(anchor(any)).r  | 
|
516  | 
private val Name2 = quot(any).r  | 
|
517  | 
private val Name_Adr1 = (quot(anchor(any)) + spc + angl(anchor(adr))).r  | 
|
518  | 
private val Name_Adr2 = (quot(any) + spc + angl(anchor(adr))).r  | 
|
519  | 
private val Name_Adr3 = (anchor(any) + spc + angl(anchor(adr))).r  | 
|
520  | 
private val Name_Adr4 = (any + spc + angl(anchor(adr))).r  | 
|
521  | 
private val Adr_Name1 = (angl(anchor(adr)) + spc + paren(any)).r  | 
|
522  | 
private val Adr_Name2 = (anchor(adr) + spc + paren(any)).r  | 
|
523  | 
private val Adr1 = angl(anchor(adr)).r  | 
|
524  | 
private val Adr2 = anchor(adr).r  | 
|
525  | 
||
| 74936 | 526  | 
def parse(s: String): List[String] =  | 
| 74924 | 527  | 
        s match {
 | 
| 74936 | 528  | 
case Name1(a) => List(a)  | 
529  | 
case Name2(a) => List(a)  | 
|
530  | 
case Name_Adr1(a, b) => List(a, b)  | 
|
531  | 
case Name_Adr2(a, b) => List(a, b)  | 
|
532  | 
case Name_Adr3(a, b) => List(a, b)  | 
|
533  | 
case Name_Adr4(a, b) => List(a, b)  | 
|
534  | 
case Adr_Name1(b, a) => List(a, b)  | 
|
535  | 
case Adr_Name2(b, a) => List(a, b)  | 
|
536  | 
case Adr1(a) => List(a)  | 
|
537  | 
case Adr2(a) => List(a)  | 
|
538  | 
case _ => Nil  | 
|
| 74924 | 539  | 
}  | 
540  | 
}  | 
|
541  | 
||
| 75393 | 542  | 
    override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74921 | 543  | 
def err(msg: String = ""): Nothing =  | 
| 74923 | 544  | 
        error("Malformed message: " + name + (if (msg.isEmpty) "" else "\n" + msg))
 | 
| 74921 | 545  | 
|
546  | 
val (head, body) =  | 
|
| 74924 | 547  | 
        try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74921 | 548  | 
        catch { case ERROR(msg) => err(msg) }
 | 
| 72559 | 549  | 
|
| 74921 | 550  | 
val date =  | 
551  | 
message_match(head, """<li><em>Date</em>:\s*(.*?)\s*</li>""".r)  | 
|
552  | 
.map(m => HTML.input(m.group(1))) match  | 
|
553  | 
        {
 | 
|
554  | 
case Some(Date_Format(d)) => d  | 
|
555  | 
          case Some(s) => err("Malformed Date: " + quote(s))
 | 
|
556  | 
          case None => err("Missing Date")
 | 
|
557  | 
}  | 
|
558  | 
||
559  | 
val title =  | 
|
| 74924 | 560  | 
make_title(  | 
561  | 
HTML.input(message_match(head, """<li><em>Subject</em>:\s*(.*)</li>""".r)  | 
|
562  | 
            .getOrElse(err("Missing Subject")).group(1)))
 | 
|
| 74921 | 563  | 
|
| 74938 | 564  | 
def parse_author_info(re: Regex): List[String] =  | 
565  | 
        message_match(head, re) match {
 | 
|
566  | 
case None => Nil  | 
|
567  | 
case Some(m) => Address.parse(m.group(1)).map(a => HTML.input(make_name(a)))  | 
|
568  | 
}  | 
|
569  | 
||
| 74936 | 570  | 
val author_info =  | 
| 74938 | 571  | 
        (parse_author_info("""<li><em>From</em>:\s*(.*?)\s*</li>""".r) :::
 | 
572  | 
          parse_author_info("""<li><em>Reply-to</em>:\s*(.*?)\s*</li>""".r))
 | 
|
573  | 
.distinct.filter(_.nonEmpty)  | 
|
574  | 
||
575  | 
      if (author_info.isEmpty) err("Malformed author information")
 | 
|
| 74921 | 576  | 
|
| 74934 | 577  | 
val tags = List(list_name)  | 
578  | 
||
| 74936 | 579  | 
Message(name, date, title, standard_author_info(author_info), body, tags)  | 
| 74921 | 580  | 
}  | 
581  | 
}  | 
|
582  | 
||
| 74930 | 583  | 
|
584  | 
/* isabelle-dev mailing list */  | 
|
585  | 
||
| 75393 | 586  | 
  object Isabelle_Dev extends Archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev")) {
 | 
| 74921 | 587  | 
override def message_regex: Regex = """<LI><A HREF="(\d+\.html)">""".r  | 
588  | 
||
| 74922 | 589  | 
private object Head extends Message_Chunk(en = "<!--beginarticle-->")  | 
590  | 
private object Body extends Message_Chunk(bg = "<!--beginarticle-->", en = "<!--endarticle-->")  | 
|
591  | 
||
| 75393 | 592  | 
    private object Date_Format {
 | 
| 74922 | 593  | 
      val Format = Date.Format("E MMM d H:m:s z uuuu")
 | 
594  | 
      def unapply(s: String): Option[Date] = Format.unapply(s.replaceAll("""\s+""", " "))
 | 
|
595  | 
}  | 
|
596  | 
||
| 75393 | 597  | 
    override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74922 | 598  | 
def err(msg: String = ""): Nothing =  | 
| 74923 | 599  | 
        error("Malformed message: " + name + (if (msg.isEmpty) "" else "\n" + msg))
 | 
| 74922 | 600  | 
|
601  | 
val (head, body) =  | 
|
| 74924 | 602  | 
        try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74922 | 603  | 
        catch { case ERROR(msg) => err(msg) }
 | 
604  | 
||
605  | 
val date =  | 
|
606  | 
        message_match(head, """\s*<I>(.*)</I>""".r).map(m => HTML.input(m.group(1))) match {
 | 
|
607  | 
case Some(Date_Format(d)) => d  | 
|
608  | 
          case Some(s) => err("Malformed Date: " + quote(s))
 | 
|
609  | 
          case None => err("Missing Date")
 | 
|
610  | 
}  | 
|
611  | 
||
| 75393 | 612  | 
      val (title, author_address) = {
 | 
| 74922 | 613  | 
        message_match(head, """TITLE="([^"]+)">(.*)""".r) match {
 | 
| 74931 | 614  | 
case Some(m) => (make_title(HTML.input(m.group(1))), make_name(HTML.input(m.group(2))))  | 
| 74922 | 615  | 
          case None => err("Missing TITLE")
 | 
616  | 
}  | 
|
617  | 
}  | 
|
618  | 
||
619  | 
val author_name =  | 
|
620  | 
        message_match(head, """\s*<B>(.*)</B>""".r) match {
 | 
|
621  | 
          case None => err("Missing author")
 | 
|
| 74931 | 622  | 
case Some(m) =>  | 
623  | 
val a = make_name(HTML.input(m.group(1)))  | 
|
624  | 
if (a == author_address) "" else a  | 
|
| 74922 | 625  | 
}  | 
626  | 
||
| 74936 | 627  | 
val author_info = List(author_name, author_address)  | 
| 74934 | 628  | 
val tags = List(list_name)  | 
629  | 
||
| 74936 | 630  | 
Message(name, date, title, standard_author_info(author_info), body, tags)  | 
| 74922 | 631  | 
}  | 
| 74921 | 632  | 
}  | 
| 72558 | 633  | 
}  |