| author | Fabian Huch <huch@in.tum.de> | 
| Fri, 01 Dec 2023 20:32:34 +0100 | |
| changeset 79101 | 4e47b34fbb8e | 
| parent 79044 | 8cc1ae43e12e | 
| child 79510 | d8330439823a | 
| permissions | -rw-r--r-- | 
| 72558 | 1 | /* Title: Pure/General/mailman.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 74942 | 4 | Support for Mailman list servers, notably isabelle-users and isabelle-dev. | 
| 72558 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 10 | import java.net.URL | |
| 11 | ||
| 74921 | 12 | import scala.annotation.tailrec | 
| 74906 | 13 | import scala.util.matching.Regex | 
| 74921 | 14 | import scala.util.matching.Regex.Match | 
| 74906 | 15 | |
| 72558 | 16 | |
| 75393 | 17 | object Mailman {
 | 
| 74921 | 18 | /* mailing list messages */ | 
| 72559 | 19 | |
| 74935 | 20 | def is_address(s: String): Boolean = | 
| 21 |     s.contains('@') && s.contains('.') && !s.contains(' ')
 | |
| 22 | ||
| 74936 | 23 | private val standard_name: Map[String, String] = | 
| 74935 | 24 | Map( | 
| 74941 | 25 | "121171528@qq.com" -> "Guo Fan\n121171528@qq.com", | 
| 74937 | 26 | "Aman Pohjola, Johannes (Data61, Kensington NSW)" -> "Johannes Aman Pohjola", | 
| 27 | "Andrei de AraÃjo Formiga" -> "Andrei de Araujo Formiga", | |
| 74940 | 28 | "Benedikt.AHRENS@unice.fr" -> "Benedikt Ahrens\nBenedikt.AHRENS@unice.fr", | 
| 29 | "Berg, Nils Erik" -> "Nils Erik Berg", | |
| 74937 | 30 | "Berger U." -> "Ulrich Berger", | 
| 31 | "Bisping, Benjamin" -> "Benjamin Bisping", | |
| 74935 | 32 | "Blanchette, J.C." -> "Jasmin Christian Blanchette", | 
| 33 | "Buday Gergely István" -> "Gergely Buday", | |
| 74938 | 34 | "CALaF1UJ9Uy0vGCu4WkBmbfuPDxG7nFm8hfeCMP+O3g7_5CQ0Bw@mail.gmail.com" -> "", | 
| 74937 | 35 | "CRACIUN F." -> "Florin Craciun", | 
| 36 | "Carsten Schuermann" -> "Carsten Schürmann", | |
| 74940 | 37 | "Chris" -> "", | 
| 74935 | 38 | "Christoph Lueth" -> "Christoph Lüth", | 
| 39 | "Claude Marche" -> "Claude Marché", | |
| 74937 | 40 | "Daniel StÃwe" -> "Daniel Stüwe", | 
| 74940 | 41 | "Daniel.Matichuk@data61.csiro.au" -> "Daniel Matichuk\nDaniel.Matichuk@data61.csiro.au", | 
| 42 | "Daniel.Matichuk@nicta.com.au" -> "Daniel Matichuk\nDaniel.Matichuk@nicta.com.au", | |
| 74937 | 43 | "David MENTRE" -> "David MENTRÉ", | 
| 74940 | 44 | "Dey, Katie" -> "Katie Dey", | 
| 74937 | 45 | "Dr. Brendan Patrick Mahony" -> "Brendan Mahony", | 
| 46 | "Farn" -> "Farn Wang", | |
| 74935 | 47 | "Farquhar, Colin I" -> "Colin Farquhar", | 
| 74940 | 48 | "Fernandez, Matthew" -> "Matthew Fernandez", | 
| 74937 | 49 | "Filip Maric" -> "Filip Marić", | 
| 50 | "Filip MariÄ" -> "Filip Marić", | |
| 51 | "Fleury Mathias" -> "Mathias Fleury", | |
| 74940 | 52 | "Francisco Jose CHAVES ALONSO" -> "Francisco Jose Chaves Alonso", | 
| 53 | "Frederic Tuong (Dr)" -> "Frederic Tuong", | |
| 54 | "Fulya" -> "Fulya Horozal", | |
| 74935 | 55 | "George K." -> "George Karabotsos", | 
| 74940 | 56 | "Gidon Ernst" -> "Gidon ERNST", | 
| 57 | "Gransden, Thomas" -> "Thomas Gransden", | |
| 74937 | 58 | "Hans-JÃrg Schurr" -> "Hans-Jörg Schurr", | 
| 59 | "Henri DEBRAT" -> "Henri Debrat", | |
| 74935 | 60 | "Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki", | 
| 74940 | 61 | "Häuselmann Rafael" -> "Rafael Häuselmann", | 
| 74937 | 62 | "Isabelle" -> "", | 
| 74940 | 63 | "J. Juhas (TUM)" -> "Jonatan Juhas", | 
| 74937 | 64 | "Jackson, Vincent (Data61, Kensington NSW)" -> "Vincent Jackson", | 
| 65 | "Janney, Mark-P26816" -> "Mark Janney", | |
| 74940 | 66 | "Jean François Molderez" -> "Jean-François Molderez", | 
| 67 | "Jean-Francois Molderez" -> "Jean-François Molderez", | |
| 68 | "John R Harrison" -> "John Harrison", | |
| 74937 | 69 | "Jose DivasÃn" -> "Jose Divasón", | 
| 70 | "Julian" -> "", | |
| 71 | "Julien" -> "", | |
| 72 | "Klein, Gerwin (Data61, Kensington NSW)" -> "Gerwin Klein", | |
| 73 | "Kobayashi, Hidetsune" -> "Hidetsune Kobayashi", | |
| 74940 | 74 | "Kylie Williams (IND)" -> "Kylie Williams", | 
| 75 | "Laarman, A.W." -> "A.W. Laarman", | |
| 74937 | 76 | "Laurent Thery" -> "Laurent Théry", | 
| 74940 | 77 | "Li, Chanjuan" -> "Li Chanjuan", | 
| 74937 | 78 | "Lochbihler Andreas" -> "Andreas Lochbihler", | 
| 74940 | 79 | "Luckhardt, Daniel" -> "Daniel Luckhardt", | 
| 74937 | 80 | "Lutz Schroeder" -> "Lutz Schröder", | 
| 81 | "Lutz SchrÃder" -> "Lutz Schröder", | |
| 74940 | 82 | "MACKENZIE Carlin" -> "Carlin MACKENZIE", | 
| 74937 | 83 | "Makarius" -> "Makarius Wenzel", | 
| 84 | "Marco" -> "", | |
| 85 | "Mark" -> "", | |
| 74940 | 86 | "Markus Mueller-Olm" -> "Markus Müller-Olm", | 
| 74937 | 87 | "Markus" -> "", | 
| 74940 | 88 | "Marmsoler, Diego" -> "Diego Marmsoler", | 
| 74937 | 89 | "Martin Klebermass" -> "Martin Klebermaß", | 
| 74941 | 90 | "Martyn Johnson via RT" -> "", | 
| 91 | "Mathias.Fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com", | |
| 74940 | 92 | "Matthew" -> "", | 
| 74937 | 93 | "Matthews, John R" -> "John Matthews", | 
| 74940 | 94 | "McCarthy, Jim (C3ID)" -> "Jim McCarthy", | 
| 95 | "McCue, Brian" -> "Brian McCue", | |
| 74937 | 96 | "Michael FÃrber" -> "Michael Färber", | 
| 74940 | 97 | "Michel" -> "", | 
| 98 | "Miranda, Brando" -> "Brando Miranda", | |
| 99 | "Moscato, Mariano M. \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Mariano M. Moscato", | |
| 100 | "Mr Julian Fell" -> "Julian Fell", | |
| 101 | "Mueller Peter" -> "Peter Müller", | |
| 102 | "Munoz, Cesar Augusto (LARC-D320)" -> "Cesar A. Munoz", | |
| 103 | "Nadel, Alexander" -> "Alexander Nadel", | |
| 104 | "Nagashima, Yutaka" -> "Yutaka Nagashima", | |
| 74937 | 105 | "Norrish, Michael (Data61, Acton)" -> "Michael Norrish", | 
| 74940 | 106 | "O'Leary, John W" -> "John W O'Leary", | 
| 74937 | 107 | "Omar Montano Rivas" -> "Omar Montaño Rivas", | 
| 108 | "Omar MontaÃo Rivas" -> "Omar Montaño Rivas", | |
| 109 | "OndÅej KunÄar" -> "Ondřej Kunčar", | |
| 74941 | 110 | "PALMER Jake" -> "Jake Palmer", | 
| 74937 | 111 | "PAQUI LUCIO" -> "Paqui Lucio", | 
| 74940 | 112 | "Pal, Abhik" -> "Abhik Pal", | 
| 113 | "Pasupuleti, Vijay" -> "Vijay Pasupuleti", | |
| 74937 | 114 | "Peter Vincent Homeier" -> "Peter V. Homeier", | 
| 115 | "Peter" -> "", | |
| 116 | "Philipp Ruemmer" -> "Philipp Rümmer", | |
| 117 | "Philipp RÃmmer" -> "Philipp Rümmer", | |
| 74941 | 118 | "Piete Brooks via RT" -> "", | 
| 74940 | 119 | "RTA publicity chair" -> "", | 
| 74937 | 120 | "Raamsdonk, F. van" -> "Femke van Raamsdonk", | 
| 121 | "Raul Gutierrez" -> "Raúl Gutiérrez", | |
| 122 | "Renà Thiemann" -> "René Thiemann", | |
| 74940 | 123 | "Ridgway, John V. E." -> "John V. E. Ridgway", | 
| 74937 | 124 | "Roggenbach M." -> "Markus Roggenbach", | 
| 74940 | 125 | "Rosu, Grigore" -> "Grigore Rosu", | 
| 74937 | 126 | "Rozman, Mihaela" -> "Mihaela Rozman", | 
| 74940 | 127 | "Schmaltz, J." -> "Julien Schmaltz", | 
| 74937 | 128 | "Serguei A. Mokhov on behalf of PST-11" -> "Serguei A. Mokhov", | 
| 74940 | 129 | "Serguei Mokhov" -> "Serguei A. Mokhov", | 
| 130 | "Shumeiko, Igor" -> "Igor Shumeiko", | |
| 131 | "Siek, Jeremy" -> "Jeremy Siek", | |
| 132 | "Silvio.Ranise@loria.fr" -> "Silvio Ranise\nSilvio.Ranise@loria.fr", | |
| 133 | "Siu, Tony" -> "Tony Siu", | |
| 74937 | 134 | "Stüber, Sebastian" -> "Sebastian Stüber", | 
| 135 | "Thiemann, Rene" -> "René Thiemann", | |
| 136 | "Thiemann, René" -> "René Thiemann", | |
| 74940 | 137 | "Thomas Arthur Leck Sewell" -> "Thomas Sewell", | 
| 74937 | 138 | "Thomas Goethel" -> "Thomas Göthel", | 
| 74940 | 139 | "Thomas.Sewell@data61.csiro.au" -> "Thomas Sewell\nThomas.Sewell@data61.csiro.au", | 
| 74941 | 140 | "Tjark Weber via RT" -> "Tjark Weber", | 
| 74940 | 141 | "Toby.Murray@data61.csiro.au" -> "Toby Murray\nToby.Murray@data61.csiro.au", | 
| 74935 | 142 | "Urban, Christian" -> "Christian Urban", | 
| 74940 | 143 | "Ursula Eschbach" -> "", | 
| 144 | "Van Staden Stephan" -> "Stephan van Staden", | |
| 74937 | 145 | "Viktor Kuncak" -> "Viktor Kunčak", | 
| 146 | "Viorel Preoteasaa" -> "Viorel Preoteasa", | |
| 74940 | 147 | "Wickerson, John P" -> "John Wickerson", | 
| 148 | "Wong, Yat" -> "Yat Wong", | |
| 149 | "YAMADA, Akihisa" -> "Akihisa Yamada", | |
| 74937 | 150 | "YliÃs Falcone" -> "Yliès Falcone", | 
| 74940 | 151 | "amir mohajeri" -> "Amir Mohajeri", | 
| 152 | "aniello murano" -> "Aniello Murano", | |
| 153 | "barzan stefania" -> "Stefania Barzan", | |
| 154 | "benhamou" -> "Belaid Benhamou", | |
| 155 | "charmi panchal" -> "Charmi Panchal", | |
| 156 | "chen kun" -> "Chen Kun", | |
| 157 | "chunhan wu" -> "Chunhan Wu", | |
| 158 | "daniel de la concepción sáez" -> "Daniel de la Concepción Sáez", | |
| 159 | "daniel.luckhardt@mathematik.uni-goettingen.de" -> "Logiker@gmx.net", | |
| 160 | "david streader" -> "David Streader", | |
| 161 | "eschbach@in.tum.de" -> "", | |
| 162 | "f.rabe@jacobs-university.de" -> "florian.rabe@fau.de", | |
| 163 | "florian@haftmann-online.de" -> "haftmann@in.tum.de", | |
| 164 | "fredegar@haftmann-online.de" -> "haftmann@in.tum.de", | |
| 74937 | 165 | "gallais @ ensl.org" -> "Guillaume Allais", | 
| 74940 | 166 | "geng chen" -> "Geng Chen", | 
| 167 | "henning.seidler" -> "Henning Seidler", | |
| 74937 | 168 | "hkb" -> "Hidetsune Kobayashi", | 
| 74941 | 169 | "jobs-pm@inf.ethz.ch" -> "", | 
| 74937 | 170 | "julien@RadboudUniversity" -> "", | 
| 74940 | 171 | "jun sun" -> "Jun Sun", | 
| 172 | "jwang whu.edu.cn (jwang)" -> "jwang", | |
| 173 | "kostas pouliasis" -> "Kostas Pouliasis", | |
| 174 | "kristof.teichel@ptb.de" -> "Kristof Teichel\nkristof.teichel@ptb.de", | |
| 175 | "lucas cavalcante" -> "Lucas Cavalcante", | |
| 176 | "mahmoud abdelazim" -> "Mahmoud Abdelazim", | |
| 177 | "manish surolia" -> "Manish Surolia", | |
| 74937 | 178 | "mantel" -> "Heiko Mantel", | 
| 74940 | 179 | "marco caminati" -> "Marco Caminati", | 
| 74941 | 180 | "mathias.fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com", | 
| 74937 | 181 | "merz@loria.fr" -> "stephan.merz@loria.fr", | 
| 74940 | 182 | "michel levy" -> "Michel Levy", | 
| 183 | "michel.levy2009@laposte.net" -> "Michel Levy\nmichel.levy2009@laposte.net", | |
| 74937 | 184 | "nemouchi" -> "Yakoub Nemouchi", | 
| 74941 | 185 | "noam neer" -> "Noam Neer", | 
| 186 | "olfa mraihi" -> "Olfa Mraihi", | |
| 187 | "pathsnottakenworkshop@gmail.com" -> "Leo Freitas\nleo.freitas@newcastle.ac.uk", | |
| 74940 | 188 | "patrick barlatier" -> "Patrick Barlatier", | 
| 189 | "patrick dabou" -> "Patrick Dabou", | |
| 190 | "paul zimmermann" -> "Paul Zimmermann", | |
| 74937 | 191 | "popescu2@illinois.edu" -> "Andrei Popescu", | 
| 74940 | 192 | "recruiting" -> "", | 
| 74941 | 193 | "recruiting@mais.informatik.tu-darmstadt.de" -> "", | 
| 74940 | 194 | "roux cody" -> "Cody Roux", | 
| 195 | "scott constable" -> "Scott Constable", | |
| 196 | "superuser@mattweidner.com" -> "Matthew Weidner\nsuperuser@mattweidner.com", | |
| 197 | "urban@math.lmu.de" -> "Christian Urban\nurban@math.lmu.de", | |
| 74937 | 198 | "veronique.cortier@loria.fr" -> "Veronique.Cortier@loria.fr", | 
| 74940 | 199 | "vikram singh" -> "Vikram Singh", | 
| 74937 | 200 | "wenzelm@in.tum.de" -> "makarius@sketis.net", | 
| 74940 | 201 | "werner@lix.polytechnique.fr" -> "Benjamin Werner\nwerner@lix.polytechnique.fr", | 
| 202 | "wmansky@cs.princeton.edu" -> "William Mansky\nwmansky@cs.princeton.edu", | |
| 203 | "y.nemouchi@ensbiotech.edu.dz" -> "Yakoub Nemouchi\ny.nemouchi@ensbiotech.edu.dz", | |
| 74937 | 204 | "ÐÑÐÐÐÑÐÐÐ ÐÐÐÐÐÐÐÑÐÐÐÑ ÐÐÐÑÐÐÐ" -> "", | 
| 205 | "∀X.Xπ - Tutorials about Proofs" -> "Bruno Woltzenlogel Paleo", | |
| 74935 | 206 | ).withDefault(identity) | 
| 207 | ||
| 74936 | 208 | def standard_author_info(author_info: List[String]): List[String] = | 
| 74940 | 209 | author_info.flatMap(s => split_lines(standard_name.getOrElse(s, s))).distinct | 
| 74936 | 210 | |
| 74921 | 211 | sealed case class Message( | 
| 212 | name: String, | |
| 213 | date: Date, | |
| 214 | title: String, | |
| 74936 | 215 | author_info: List[String], | 
| 74934 | 216 | body: String, | 
| 75393 | 217 | tags: List[String] | 
| 218 |   ) {
 | |
| 74936 | 219 |     if (author_info.isEmpty || author_info.exists(_.isEmpty)) {
 | 
| 220 |       error("Bad author information in " + quote(name))
 | |
| 74935 | 221 | } | 
| 222 | ||
| 74925 | 223 | override def toString: String = name | 
| 74935 | 224 | } | 
| 74906 | 225 | |
| 75393 | 226 |   object Messages {
 | 
| 74935 | 227 | type Graph = isabelle.Graph[String, Message] | 
| 228 | ||
| 75393 | 229 |     def apply(msgs: List[Message]): Messages = {
 | 
| 74936 | 230 | def make_node(g: Graph, node: String, msg: Message): Graph = | 
| 231 | if (g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g | |
| 74935 | 232 | else g.default_node(node, msg) | 
| 233 | ||
| 74936 | 234 | def connect_nodes(g: Graph, nodes: List[String]): Graph = | 
| 235 |         nodes match {
 | |
| 236 | case Nil => g | |
| 237 |           case a :: bs => bs.foldLeft(g)({ case (g1, b) => g1.add_edge(a, b).add_edge(b, a) })
 | |
| 238 | } | |
| 239 | ||
| 74935 | 240 | new Messages(msgs.sortBy(_.date)(Date.Ordering), | 
| 241 | msgs.foldLeft[Graph](Graph.string)( | |
| 74936 | 242 |           { case (graph, msg) =>
 | 
| 243 | val nodes = msg.author_info | |
| 244 | connect_nodes(nodes.foldLeft(graph)(make_node(_, _, msg)), nodes) | |
| 74935 | 245 | })) | 
| 246 | } | |
| 74934 | 247 | |
| 75393 | 248 |     def find(dir: Path): Messages = {
 | 
| 74934 | 249 | val msgs = | 
| 250 |         for {
 | |
| 251 | archive <- List(Isabelle_Users, Isabelle_Dev) | |
| 252 | msg <- archive.find_messages(dir + Path.basic(archive.list_name)) | |
| 253 | } yield msg | |
| 254 | Messages(msgs) | |
| 255 | } | |
| 74935 | 256 | |
| 75393 | 257 |     sealed case class Cluster(author_info: List[String]) {
 | 
| 74936 | 258 | val (addresses, names) = author_info.partition(is_address) | 
| 259 | ||
| 74935 | 260 | val name: String = | 
| 261 |         names.headOption getOrElse {
 | |
| 262 |           addresses match {
 | |
| 263 |             case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ')
 | |
| 264 |             case Nil => error("Empty cluster")
 | |
| 265 | } | |
| 266 | } | |
| 267 | ||
| 74940 | 268 | val name_lowercase: String = Word.lowercase(name) | 
| 269 | ||
| 74935 | 270 | def get_address: Option[String] = addresses.headOption | 
| 271 | ||
| 272 | def unique: Boolean = names.length == 1 && addresses.length == 1 | |
| 273 | def multi: Boolean = names.length > 1 || addresses.length > 1 | |
| 274 | ||
| 75393 | 275 |       def print: String = {
 | 
| 74935 | 276 |         val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses
 | 
| 277 |         entries.mkString("\n  * ", "\n    ", "")
 | |
| 278 | } | |
| 279 | } | |
| 74933 | 280 | } | 
| 281 | ||
| 75393 | 282 |   class Messages private(val sorted: List[Message], val graph: Messages.Graph) {
 | 
| 74935 | 283 |     override def toString: String = "Messages(" + sorted.size + ")"
 | 
| 74925 | 284 | |
| 75393 | 285 |     object Node_Ordering extends scala.math.Ordering[String] {
 | 
| 74935 | 286 | override def compare(a: String, b: String): Int = | 
| 287 | Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date) | |
| 288 | } | |
| 289 | ||
| 290 | def get_cluster(msg: Message): Messages.Cluster = | |
| 74936 | 291 | Messages.Cluster(graph.all_succs(msg.author_info).sorted.sorted(Node_Ordering)) | 
| 74935 | 292 | |
| 293 | def get_name(msg: Message): String = get_cluster(msg).name | |
| 74925 | 294 | |
| 74935 | 295 | def get_address(msg: Message): String = | 
| 296 |       get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name))
 | |
| 297 | ||
| 75393 | 298 |     def check(check_all: Boolean, check_multi: Boolean = false): Unit = {
 | 
| 74940 | 299 | val clusters = sorted.map(get_cluster).distinct.sortBy(_.name_lowercase) | 
| 74935 | 300 | |
| 74937 | 301 |       if (check_all) {
 | 
| 302 |         Output.writeln(cat_lines("clusters:" :: clusters.map(_.print)))
 | |
| 303 | } | |
| 304 |       else {
 | |
| 305 | val multi = if (check_multi) clusters.filter(_.multi) else Nil | |
| 306 |         if (multi.nonEmpty) {
 | |
| 307 |           Output.writeln(cat_lines("ambiguous clusters:" :: multi.map(_.print)))
 | |
| 308 | } | |
| 74925 | 309 | } | 
| 74933 | 310 | |
| 74935 | 311 | val unknown = clusters.filter(cluster => cluster.get_address.isEmpty) | 
| 312 |       if (unknown.nonEmpty) {
 | |
| 74937 | 313 |         Output.writeln(cat_lines("\nunknown mail:" :: unknown.map(_.print)))
 | 
| 74933 | 314 | } | 
| 74925 | 315 | } | 
| 316 | } | |
| 317 | ||
| 318 | ||
| 74921 | 319 | /* mailing list archives */ | 
| 320 | ||
| 321 | abstract class Archive( | |
| 322 | url: URL, | |
| 323 | name: String = "", | |
| 75393 | 324 | tag: String = "" | 
| 325 |   ) {
 | |
| 74921 | 326 | def message_regex: Regex | 
| 74923 | 327 | def message_content(name: String, lines: List[String]): Message | 
| 74921 | 328 | |
| 74924 | 329 | def message_match(lines: List[String], re: Regex): Option[Match] = | 
| 330 | lines.flatMap(re.findFirstMatchIn(_)).headOption | |
| 331 | ||
| 75393 | 332 |     def make_name(str: String): String = {
 | 
| 74937 | 333 | val s = | 
| 334 |         str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
 | |
| 335 |           .replace("mailbroy.informatik.tu-muenchen.de", "in.tum.de")
 | |
| 336 |           .replace("informatik.tu-muenchen.de", "in.tum.de")
 | |
| 74935 | 337 |       if (s.startsWith("=") && s.endsWith("=")) "" else s
 | 
| 338 | } | |
| 74924 | 339 | |
| 340 | def make_body(lines: List[String]): String = | |
| 341 | cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1) | |
| 342 | ||
| 74921 | 343 | private val main_url: URL = | 
| 344 | Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/") | |
| 345 | ||
| 346 | private val main_html = Url.read(main_url) | |
| 347 | ||
| 75393 | 348 |     val list_name: String = {
 | 
| 74921 | 349 | val title = | 
| 350 | """<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(main_html).map(_.group(1)) | |
| 351 |       (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
 | |
| 352 | } | |
| 72558 | 353 | override def toString: String = list_name | 
| 354 | ||
| 74923 | 355 | def full_name(href: String): String = list_name + "/" + href | 
| 356 | ||
| 74921 | 357 | def list_tag: String = proper_string(tag).getOrElse(list_name) | 
| 358 | ||
| 79044 
8cc1ae43e12e
clarified signature: avoid deprecated URL constructors;
 wenzelm parents: 
78958diff
changeset | 359 | def read_text(href: String): String = Url.read(Url.resolve(main_url, href)) | 
| 74921 | 360 | |
| 361 | def hrefs_text: List[String] = | |
| 362 | """href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(main_html).map(_.group(1)).toList | |
| 363 | ||
| 364 | def hrefs_msg: List[String] = | |
| 74906 | 365 |       (for {
 | 
| 74921 | 366 | href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(main_html).map(_.group(1)) | 
| 367 | html = read_text(href + "/date.html") | |
| 368 | msg <- message_regex.findAllMatchIn(html).map(_.group(1)) | |
| 74906 | 369 | } yield href + "/" + msg).toList | 
| 370 | ||
| 75393 | 371 |     def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] = {
 | 
| 72558 | 372 | val dir = target_dir + Path.basic(list_name) | 
| 74906 | 373 | val path = dir + Path.explode(href) | 
| 79044 
8cc1ae43e12e
clarified signature: avoid deprecated URL constructors;
 wenzelm parents: 
78958diff
changeset | 374 | val url = Url.resolve(main_url, href) | 
| 74906 | 375 | val connection = url.openConnection | 
| 376 |       try {
 | |
| 377 | val length = connection.getContentLengthLong | |
| 378 | val timestamp = connection.getLastModified | |
| 78958 | 379 | if (path.is_file && File.size(path) == length && path.file.lastModified == timestamp) None | 
| 74906 | 380 |         else {
 | 
| 381 | Isabelle_System.make_directory(path.dir) | |
| 382 |           progress.echo("Getting " + url)
 | |
| 383 | val bytes = | |
| 384 | using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024)) | |
| 78958 | 385 | Bytes.write(path, bytes) | 
| 386 | path.file.setLastModified(timestamp) | |
| 74906 | 387 | Some(path) | 
| 388 | } | |
| 389 | } | |
| 390 |       finally { connection.getInputStream.close() }
 | |
| 391 | } | |
| 72558 | 392 | |
| 74906 | 393 | def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] = | 
| 394 | hrefs_text.flatMap(get(target_dir, _, progress = progress)) | |
| 395 | ||
| 396 | def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 397 | hrefs_msg.flatMap(get(target_dir, _, progress = progress)) | |
| 74907 | 398 | |
| 399 | def download(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 400 | download_text(target_dir, progress = progress) ::: | |
| 401 | download_msg(target_dir, progress = progress) | |
| 74921 | 402 | |
| 75393 | 403 |     def make_title(str: String): String = {
 | 
| 74921 | 404 |       val Trim1 = ("""\s*\Q[""" + list_tag + """]\E\s*(.*)""").r
 | 
| 405 | val Trim2 = """(?i:(?:re|fw|fwd)\s*:\s*)(.*)""".r | |
| 406 | val Trim3 = """\[\s*(.*?)\s*\]""".r | |
| 407 | @tailrec def trim(s: String): String = | |
| 408 |         s match {
 | |
| 409 | case Trim1(s1) => trim(s1) | |
| 410 | case Trim2(s1) => trim(s1) | |
| 411 | case Trim3(s1) => trim(s1) | |
| 412 | case _ => s | |
| 413 | } | |
| 414 | trim(str) | |
| 415 | } | |
| 416 | ||
| 417 | def get_messages(): List[Message] = | |
| 418 | for (href <- hrefs_msg) yield message_content(href, split_lines(read_text(href))) | |
| 419 | ||
| 75393 | 420 |     def find_messages(dir: Path): List[Message] = {
 | 
| 74921 | 421 |       for {
 | 
| 75906 
2167b9e3157a
clarified signature: support for adhoc file types;
 wenzelm parents: 
75393diff
changeset | 422 | file <- File.find_files(dir.file, file => File.is_html(file.getName)) | 
| 74921 | 423 | rel_path <- File.relative_path(dir, File.path(file)) | 
| 74923 | 424 | } | 
| 425 |       yield {
 | |
| 426 | val name = full_name(rel_path.implode) | |
| 427 | message_content(name, split_lines(File.read(file))) | |
| 428 | } | |
| 74921 | 429 | } | 
| 72558 | 430 | } | 
| 72559 | 431 | |
| 75393 | 432 |   private class Message_Chunk(bg: String = "", en: String = "") {
 | 
| 433 |     def unapply(lines: List[String]): Option[List[String]] = {
 | |
| 74932 | 434 | val res1 = | 
| 435 | if (bg.isEmpty) Some(lines) | |
| 436 |         else {
 | |
| 437 |           lines.dropWhile(_ != bg) match {
 | |
| 438 | case Nil => None | |
| 439 | case _ :: rest => Some(rest) | |
| 440 | } | |
| 441 | } | |
| 442 | if (en.isEmpty) res1 | |
| 443 |       else {
 | |
| 444 |         res1 match {
 | |
| 445 | case None => None | |
| 446 | case Some(lines1) => | |
| 447 | val lines2 = lines1.takeWhile(_ != en) | |
| 448 | if (lines1.drop(lines2.length).isEmpty) None else Some(lines2) | |
| 449 | } | |
| 450 | } | |
| 451 | } | |
| 452 | ||
| 453 | def get(lines: List[String]): List[String] = | |
| 454 | unapply(lines) getOrElse | |
| 77369 | 455 |         error("Missing delimiters:" + if_proper(bg, " ") + bg + if_proper(en, " ") + en)
 | 
| 74932 | 456 | } | 
| 457 | ||
| 72559 | 458 | |
| 74930 | 459 | /* isabelle-users mailing list */ | 
| 72559 | 460 | |
| 74921 | 461 | object Isabelle_Users extends Archive( | 
| 462 |     Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"),
 | |
| 75393 | 463 | name = "isabelle-users", tag = "isabelle" | 
| 464 |   ) {
 | |
| 74921 | 465 | override def message_regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r | 
| 466 | ||
| 467 | private object Head extends | |
| 74922 | 468 | Message_Chunk(bg = "<!--X-Head-of-Message-->", en = "<!--X-Head-of-Message-End-->") | 
| 74921 | 469 | |
| 470 | private object Body extends | |
| 74922 | 471 | Message_Chunk(bg = "<!--X-Body-of-Message-->", en = "<!--X-Body-of-Message-End-->") | 
| 472 | ||
| 75393 | 473 |     private object Date_Format {
 | 
| 74922 | 474 | private val Trim1 = """\w+,\s*(.*)""".r | 
| 475 | private val Trim2 = """(.*?)\s*\(.*\)""".r | |
| 476 | private val Format = | |
| 477 | Date.Format( | |
| 478 | "d MMM uuuu H:m:s Z", | |
| 479 | "d MMM uuuu H:m:s z", | |
| 480 | "d MMM yy H:m:s Z", | |
| 481 | "d MMM yy H:m:s z") | |
| 75393 | 482 |       def unapply(s: String): Option[Date] = {
 | 
| 74922 | 483 |         val s0 = s.replaceAll("""\s+""", " ")
 | 
| 484 | val s1 = | |
| 485 |           s0 match {
 | |
| 486 | case Trim1(s1) => s1 | |
| 487 | case _ => s0 | |
| 488 | } | |
| 489 | val s2 = | |
| 490 |           s1 match {
 | |
| 491 | case Trim2(s2) => s2 | |
| 492 | case _ => s1 | |
| 493 | } | |
| 494 | Format.unapply(s2) | |
| 495 | } | |
| 496 | } | |
| 74921 | 497 | |
| 75393 | 498 |     override def make_name(str: String): String = {
 | 
| 74935 | 499 |       val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str))
 | 
| 74925 | 500 | if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s | 
| 501 | } | |
| 502 | ||
| 75393 | 503 |     object Address {
 | 
| 74924 | 504 | private def anchor(s: String): String = """<a href="[^"]*">""" + s + """</a>""" | 
| 505 | private def angl(s: String): String = """<""" + s + """>""" | |
| 506 | private def quot(s: String): String = """"""" + s + """"""" | |
| 507 |       private def paren(s: String): String = """\(""" + s + """\)"""
 | |
| 508 | private val adr = """([^<>]*? at [^<>]*?)""" | |
| 509 | private val any = """([^<>]*?)""" | |
| 510 | private val spc = """\s*""" | |
| 511 | ||
| 512 | private val Name1 = quot(anchor(any)).r | |
| 513 | private val Name2 = quot(any).r | |
| 514 | private val Name_Adr1 = (quot(anchor(any)) + spc + angl(anchor(adr))).r | |
| 515 | private val Name_Adr2 = (quot(any) + spc + angl(anchor(adr))).r | |
| 516 | private val Name_Adr3 = (anchor(any) + spc + angl(anchor(adr))).r | |
| 517 | private val Name_Adr4 = (any + spc + angl(anchor(adr))).r | |
| 518 | private val Adr_Name1 = (angl(anchor(adr)) + spc + paren(any)).r | |
| 519 | private val Adr_Name2 = (anchor(adr) + spc + paren(any)).r | |
| 520 | private val Adr1 = angl(anchor(adr)).r | |
| 521 | private val Adr2 = anchor(adr).r | |
| 522 | ||
| 74936 | 523 | def parse(s: String): List[String] = | 
| 74924 | 524 |         s match {
 | 
| 74936 | 525 | case Name1(a) => List(a) | 
| 526 | case Name2(a) => List(a) | |
| 527 | case Name_Adr1(a, b) => List(a, b) | |
| 528 | case Name_Adr2(a, b) => List(a, b) | |
| 529 | case Name_Adr3(a, b) => List(a, b) | |
| 530 | case Name_Adr4(a, b) => List(a, b) | |
| 531 | case Adr_Name1(b, a) => List(a, b) | |
| 532 | case Adr_Name2(b, a) => List(a, b) | |
| 533 | case Adr1(a) => List(a) | |
| 534 | case Adr2(a) => List(a) | |
| 535 | case _ => Nil | |
| 74924 | 536 | } | 
| 537 | } | |
| 538 | ||
| 75393 | 539 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74921 | 540 | def err(msg: String = ""): Nothing = | 
| 77368 | 541 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74921 | 542 | |
| 543 | val (head, body) = | |
| 74924 | 544 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74921 | 545 |         catch { case ERROR(msg) => err(msg) }
 | 
| 72559 | 546 | |
| 74921 | 547 | val date = | 
| 548 | message_match(head, """<li><em>Date</em>:\s*(.*?)\s*</li>""".r) | |
| 549 | .map(m => HTML.input(m.group(1))) match | |
| 550 |         {
 | |
| 551 | case Some(Date_Format(d)) => d | |
| 552 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 553 |           case None => err("Missing Date")
 | |
| 554 | } | |
| 555 | ||
| 556 | val title = | |
| 74924 | 557 | make_title( | 
| 558 | HTML.input(message_match(head, """<li><em>Subject</em>:\s*(.*)</li>""".r) | |
| 559 |             .getOrElse(err("Missing Subject")).group(1)))
 | |
| 74921 | 560 | |
| 74938 | 561 | def parse_author_info(re: Regex): List[String] = | 
| 562 |         message_match(head, re) match {
 | |
| 563 | case None => Nil | |
| 564 | case Some(m) => Address.parse(m.group(1)).map(a => HTML.input(make_name(a))) | |
| 565 | } | |
| 566 | ||
| 74936 | 567 | val author_info = | 
| 74938 | 568 |         (parse_author_info("""<li><em>From</em>:\s*(.*?)\s*</li>""".r) :::
 | 
| 569 |           parse_author_info("""<li><em>Reply-to</em>:\s*(.*?)\s*</li>""".r))
 | |
| 570 | .distinct.filter(_.nonEmpty) | |
| 571 | ||
| 572 |       if (author_info.isEmpty) err("Malformed author information")
 | |
| 74921 | 573 | |
| 74934 | 574 | val tags = List(list_name) | 
| 575 | ||
| 74936 | 576 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74921 | 577 | } | 
| 578 | } | |
| 579 | ||
| 74930 | 580 | |
| 581 | /* isabelle-dev mailing list */ | |
| 582 | ||
| 75393 | 583 |   object Isabelle_Dev extends Archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev")) {
 | 
| 74921 | 584 | override def message_regex: Regex = """<LI><A HREF="(\d+\.html)">""".r | 
| 585 | ||
| 74922 | 586 | private object Head extends Message_Chunk(en = "<!--beginarticle-->") | 
| 587 | private object Body extends Message_Chunk(bg = "<!--beginarticle-->", en = "<!--endarticle-->") | |
| 588 | ||
| 75393 | 589 |     private object Date_Format {
 | 
| 74922 | 590 |       val Format = Date.Format("E MMM d H:m:s z uuuu")
 | 
| 591 |       def unapply(s: String): Option[Date] = Format.unapply(s.replaceAll("""\s+""", " "))
 | |
| 592 | } | |
| 593 | ||
| 75393 | 594 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74922 | 595 | def err(msg: String = ""): Nothing = | 
| 77368 | 596 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74922 | 597 | |
| 598 | val (head, body) = | |
| 74924 | 599 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74922 | 600 |         catch { case ERROR(msg) => err(msg) }
 | 
| 601 | ||
| 602 | val date = | |
| 603 |         message_match(head, """\s*<I>(.*)</I>""".r).map(m => HTML.input(m.group(1))) match {
 | |
| 604 | case Some(Date_Format(d)) => d | |
| 605 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 606 |           case None => err("Missing Date")
 | |
| 607 | } | |
| 608 | ||
| 75393 | 609 |       val (title, author_address) = {
 | 
| 74922 | 610 |         message_match(head, """TITLE="([^"]+)">(.*)""".r) match {
 | 
| 74931 | 611 | case Some(m) => (make_title(HTML.input(m.group(1))), make_name(HTML.input(m.group(2)))) | 
| 74922 | 612 |           case None => err("Missing TITLE")
 | 
| 613 | } | |
| 614 | } | |
| 615 | ||
| 616 | val author_name = | |
| 617 |         message_match(head, """\s*<B>(.*)</B>""".r) match {
 | |
| 618 |           case None => err("Missing author")
 | |
| 74931 | 619 | case Some(m) => | 
| 620 | val a = make_name(HTML.input(m.group(1))) | |
| 621 | if (a == author_address) "" else a | |
| 74922 | 622 | } | 
| 623 | ||
| 74936 | 624 | val author_info = List(author_name, author_address) | 
| 74934 | 625 | val tags = List(list_name) | 
| 626 | ||
| 74936 | 627 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74922 | 628 | } | 
| 74921 | 629 | } | 
| 72558 | 630 | } |