| author | Fabian Huch <huch@in.tum.de> | 
| Fri, 15 Mar 2024 17:57:03 +0100 | |
| changeset 79899 | c73a36081b1c | 
| parent 79510 | d8330439823a | 
| child 81627 | 079dee3b117c | 
| permissions | -rw-r--r-- | 
| 72558 | 1 | /* Title: Pure/General/mailman.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 74942 | 4 | Support for Mailman list servers, notably isabelle-users and isabelle-dev. | 
| 72558 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 74921 | 10 | import scala.annotation.tailrec | 
| 74906 | 11 | import scala.util.matching.Regex | 
| 74921 | 12 | import scala.util.matching.Regex.Match | 
| 74906 | 13 | |
| 72558 | 14 | |
| 75393 | 15 | object Mailman {
 | 
| 74921 | 16 | /* mailing list messages */ | 
| 72559 | 17 | |
| 74935 | 18 | def is_address(s: String): Boolean = | 
| 19 |     s.contains('@') && s.contains('.') && !s.contains(' ')
 | |
| 20 | ||
| 74936 | 21 | private val standard_name: Map[String, String] = | 
| 74935 | 22 | Map( | 
| 74941 | 23 | "121171528@qq.com" -> "Guo Fan\n121171528@qq.com", | 
| 74937 | 24 | "Aman Pohjola, Johannes (Data61, Kensington NSW)" -> "Johannes Aman Pohjola", | 
| 25 | "Andrei de AraÃjo Formiga" -> "Andrei de Araujo Formiga", | |
| 74940 | 26 | "Benedikt.AHRENS@unice.fr" -> "Benedikt Ahrens\nBenedikt.AHRENS@unice.fr", | 
| 27 | "Berg, Nils Erik" -> "Nils Erik Berg", | |
| 74937 | 28 | "Berger U." -> "Ulrich Berger", | 
| 29 | "Bisping, Benjamin" -> "Benjamin Bisping", | |
| 74935 | 30 | "Blanchette, J.C." -> "Jasmin Christian Blanchette", | 
| 31 | "Buday Gergely István" -> "Gergely Buday", | |
| 74938 | 32 | "CALaF1UJ9Uy0vGCu4WkBmbfuPDxG7nFm8hfeCMP+O3g7_5CQ0Bw@mail.gmail.com" -> "", | 
| 74937 | 33 | "CRACIUN F." -> "Florin Craciun", | 
| 34 | "Carsten Schuermann" -> "Carsten Schürmann", | |
| 74940 | 35 | "Chris" -> "", | 
| 74935 | 36 | "Christoph Lueth" -> "Christoph Lüth", | 
| 37 | "Claude Marche" -> "Claude Marché", | |
| 74937 | 38 | "Daniel StÃwe" -> "Daniel Stüwe", | 
| 74940 | 39 | "Daniel.Matichuk@data61.csiro.au" -> "Daniel Matichuk\nDaniel.Matichuk@data61.csiro.au", | 
| 40 | "Daniel.Matichuk@nicta.com.au" -> "Daniel Matichuk\nDaniel.Matichuk@nicta.com.au", | |
| 74937 | 41 | "David MENTRE" -> "David MENTRÉ", | 
| 74940 | 42 | "Dey, Katie" -> "Katie Dey", | 
| 74937 | 43 | "Dr. Brendan Patrick Mahony" -> "Brendan Mahony", | 
| 44 | "Farn" -> "Farn Wang", | |
| 74935 | 45 | "Farquhar, Colin I" -> "Colin Farquhar", | 
| 74940 | 46 | "Fernandez, Matthew" -> "Matthew Fernandez", | 
| 74937 | 47 | "Filip Maric" -> "Filip Marić", | 
| 48 | "Filip MariÄ" -> "Filip Marić", | |
| 49 | "Fleury Mathias" -> "Mathias Fleury", | |
| 74940 | 50 | "Francisco Jose CHAVES ALONSO" -> "Francisco Jose Chaves Alonso", | 
| 51 | "Frederic Tuong (Dr)" -> "Frederic Tuong", | |
| 52 | "Fulya" -> "Fulya Horozal", | |
| 74935 | 53 | "George K." -> "George Karabotsos", | 
| 74940 | 54 | "Gidon Ernst" -> "Gidon ERNST", | 
| 55 | "Gransden, Thomas" -> "Thomas Gransden", | |
| 74937 | 56 | "Hans-JÃrg Schurr" -> "Hans-Jörg Schurr", | 
| 57 | "Henri DEBRAT" -> "Henri Debrat", | |
| 74935 | 58 | "Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki", | 
| 74940 | 59 | "Häuselmann Rafael" -> "Rafael Häuselmann", | 
| 74937 | 60 | "Isabelle" -> "", | 
| 74940 | 61 | "J. Juhas (TUM)" -> "Jonatan Juhas", | 
| 74937 | 62 | "Jackson, Vincent (Data61, Kensington NSW)" -> "Vincent Jackson", | 
| 63 | "Janney, Mark-P26816" -> "Mark Janney", | |
| 74940 | 64 | "Jean François Molderez" -> "Jean-François Molderez", | 
| 65 | "Jean-Francois Molderez" -> "Jean-François Molderez", | |
| 66 | "John R Harrison" -> "John Harrison", | |
| 74937 | 67 | "Jose DivasÃn" -> "Jose Divasón", | 
| 68 | "Julian" -> "", | |
| 69 | "Julien" -> "", | |
| 70 | "Klein, Gerwin (Data61, Kensington NSW)" -> "Gerwin Klein", | |
| 71 | "Kobayashi, Hidetsune" -> "Hidetsune Kobayashi", | |
| 74940 | 72 | "Kylie Williams (IND)" -> "Kylie Williams", | 
| 73 | "Laarman, A.W." -> "A.W. Laarman", | |
| 74937 | 74 | "Laurent Thery" -> "Laurent Théry", | 
| 74940 | 75 | "Li, Chanjuan" -> "Li Chanjuan", | 
| 74937 | 76 | "Lochbihler Andreas" -> "Andreas Lochbihler", | 
| 74940 | 77 | "Luckhardt, Daniel" -> "Daniel Luckhardt", | 
| 74937 | 78 | "Lutz Schroeder" -> "Lutz Schröder", | 
| 79 | "Lutz SchrÃder" -> "Lutz Schröder", | |
| 74940 | 80 | "MACKENZIE Carlin" -> "Carlin MACKENZIE", | 
| 74937 | 81 | "Makarius" -> "Makarius Wenzel", | 
| 82 | "Marco" -> "", | |
| 83 | "Mark" -> "", | |
| 74940 | 84 | "Markus Mueller-Olm" -> "Markus Müller-Olm", | 
| 74937 | 85 | "Markus" -> "", | 
| 74940 | 86 | "Marmsoler, Diego" -> "Diego Marmsoler", | 
| 74937 | 87 | "Martin Klebermass" -> "Martin Klebermaß", | 
| 74941 | 88 | "Martyn Johnson via RT" -> "", | 
| 89 | "Mathias.Fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com", | |
| 74940 | 90 | "Matthew" -> "", | 
| 74937 | 91 | "Matthews, John R" -> "John Matthews", | 
| 74940 | 92 | "McCarthy, Jim (C3ID)" -> "Jim McCarthy", | 
| 93 | "McCue, Brian" -> "Brian McCue", | |
| 74937 | 94 | "Michael FÃrber" -> "Michael Färber", | 
| 74940 | 95 | "Michel" -> "", | 
| 96 | "Miranda, Brando" -> "Brando Miranda", | |
| 97 | "Moscato, Mariano M. \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Mariano M. Moscato", | |
| 98 | "Mr Julian Fell" -> "Julian Fell", | |
| 99 | "Mueller Peter" -> "Peter Müller", | |
| 100 | "Munoz, Cesar Augusto (LARC-D320)" -> "Cesar A. Munoz", | |
| 101 | "Nadel, Alexander" -> "Alexander Nadel", | |
| 102 | "Nagashima, Yutaka" -> "Yutaka Nagashima", | |
| 74937 | 103 | "Norrish, Michael (Data61, Acton)" -> "Michael Norrish", | 
| 74940 | 104 | "O'Leary, John W" -> "John W O'Leary", | 
| 74937 | 105 | "Omar Montano Rivas" -> "Omar Montaño Rivas", | 
| 106 | "Omar MontaÃo Rivas" -> "Omar Montaño Rivas", | |
| 107 | "OndÅej KunÄar" -> "Ondřej Kunčar", | |
| 74941 | 108 | "PALMER Jake" -> "Jake Palmer", | 
| 74937 | 109 | "PAQUI LUCIO" -> "Paqui Lucio", | 
| 74940 | 110 | "Pal, Abhik" -> "Abhik Pal", | 
| 111 | "Pasupuleti, Vijay" -> "Vijay Pasupuleti", | |
| 74937 | 112 | "Peter Vincent Homeier" -> "Peter V. Homeier", | 
| 113 | "Peter" -> "", | |
| 114 | "Philipp Ruemmer" -> "Philipp Rümmer", | |
| 115 | "Philipp RÃmmer" -> "Philipp Rümmer", | |
| 74941 | 116 | "Piete Brooks via RT" -> "", | 
| 74940 | 117 | "RTA publicity chair" -> "", | 
| 74937 | 118 | "Raamsdonk, F. van" -> "Femke van Raamsdonk", | 
| 119 | "Raul Gutierrez" -> "Raúl Gutiérrez", | |
| 120 | "Renà Thiemann" -> "René Thiemann", | |
| 74940 | 121 | "Ridgway, John V. E." -> "John V. E. Ridgway", | 
| 74937 | 122 | "Roggenbach M." -> "Markus Roggenbach", | 
| 74940 | 123 | "Rosu, Grigore" -> "Grigore Rosu", | 
| 74937 | 124 | "Rozman, Mihaela" -> "Mihaela Rozman", | 
| 74940 | 125 | "Schmaltz, J." -> "Julien Schmaltz", | 
| 74937 | 126 | "Serguei A. Mokhov on behalf of PST-11" -> "Serguei A. Mokhov", | 
| 74940 | 127 | "Serguei Mokhov" -> "Serguei A. Mokhov", | 
| 128 | "Shumeiko, Igor" -> "Igor Shumeiko", | |
| 129 | "Siek, Jeremy" -> "Jeremy Siek", | |
| 130 | "Silvio.Ranise@loria.fr" -> "Silvio Ranise\nSilvio.Ranise@loria.fr", | |
| 131 | "Siu, Tony" -> "Tony Siu", | |
| 74937 | 132 | "Stüber, Sebastian" -> "Sebastian Stüber", | 
| 133 | "Thiemann, Rene" -> "René Thiemann", | |
| 134 | "Thiemann, René" -> "René Thiemann", | |
| 74940 | 135 | "Thomas Arthur Leck Sewell" -> "Thomas Sewell", | 
| 74937 | 136 | "Thomas Goethel" -> "Thomas Göthel", | 
| 74940 | 137 | "Thomas.Sewell@data61.csiro.au" -> "Thomas Sewell\nThomas.Sewell@data61.csiro.au", | 
| 74941 | 138 | "Tjark Weber via RT" -> "Tjark Weber", | 
| 74940 | 139 | "Toby.Murray@data61.csiro.au" -> "Toby Murray\nToby.Murray@data61.csiro.au", | 
| 74935 | 140 | "Urban, Christian" -> "Christian Urban", | 
| 74940 | 141 | "Ursula Eschbach" -> "", | 
| 142 | "Van Staden Stephan" -> "Stephan van Staden", | |
| 74937 | 143 | "Viktor Kuncak" -> "Viktor Kunčak", | 
| 144 | "Viorel Preoteasaa" -> "Viorel Preoteasa", | |
| 74940 | 145 | "Wickerson, John P" -> "John Wickerson", | 
| 146 | "Wong, Yat" -> "Yat Wong", | |
| 147 | "YAMADA, Akihisa" -> "Akihisa Yamada", | |
| 74937 | 148 | "YliÃs Falcone" -> "Yliès Falcone", | 
| 74940 | 149 | "amir mohajeri" -> "Amir Mohajeri", | 
| 150 | "aniello murano" -> "Aniello Murano", | |
| 151 | "barzan stefania" -> "Stefania Barzan", | |
| 152 | "benhamou" -> "Belaid Benhamou", | |
| 153 | "charmi panchal" -> "Charmi Panchal", | |
| 154 | "chen kun" -> "Chen Kun", | |
| 155 | "chunhan wu" -> "Chunhan Wu", | |
| 156 | "daniel de la concepción sáez" -> "Daniel de la Concepción Sáez", | |
| 157 | "daniel.luckhardt@mathematik.uni-goettingen.de" -> "Logiker@gmx.net", | |
| 158 | "david streader" -> "David Streader", | |
| 159 | "eschbach@in.tum.de" -> "", | |
| 160 | "f.rabe@jacobs-university.de" -> "florian.rabe@fau.de", | |
| 161 | "florian@haftmann-online.de" -> "haftmann@in.tum.de", | |
| 162 | "fredegar@haftmann-online.de" -> "haftmann@in.tum.de", | |
| 74937 | 163 | "gallais @ ensl.org" -> "Guillaume Allais", | 
| 74940 | 164 | "geng chen" -> "Geng Chen", | 
| 165 | "henning.seidler" -> "Henning Seidler", | |
| 74937 | 166 | "hkb" -> "Hidetsune Kobayashi", | 
| 74941 | 167 | "jobs-pm@inf.ethz.ch" -> "", | 
| 74937 | 168 | "julien@RadboudUniversity" -> "", | 
| 74940 | 169 | "jun sun" -> "Jun Sun", | 
| 170 | "jwang whu.edu.cn (jwang)" -> "jwang", | |
| 171 | "kostas pouliasis" -> "Kostas Pouliasis", | |
| 172 | "kristof.teichel@ptb.de" -> "Kristof Teichel\nkristof.teichel@ptb.de", | |
| 173 | "lucas cavalcante" -> "Lucas Cavalcante", | |
| 174 | "mahmoud abdelazim" -> "Mahmoud Abdelazim", | |
| 175 | "manish surolia" -> "Manish Surolia", | |
| 74937 | 176 | "mantel" -> "Heiko Mantel", | 
| 74940 | 177 | "marco caminati" -> "Marco Caminati", | 
| 74941 | 178 | "mathias.fleury@ens-rennes.fr" -> "Mathias Fleury\nmathias.fleury12@gmail.com", | 
| 74937 | 179 | "merz@loria.fr" -> "stephan.merz@loria.fr", | 
| 74940 | 180 | "michel levy" -> "Michel Levy", | 
| 181 | "michel.levy2009@laposte.net" -> "Michel Levy\nmichel.levy2009@laposte.net", | |
| 74937 | 182 | "nemouchi" -> "Yakoub Nemouchi", | 
| 74941 | 183 | "noam neer" -> "Noam Neer", | 
| 184 | "olfa mraihi" -> "Olfa Mraihi", | |
| 185 | "pathsnottakenworkshop@gmail.com" -> "Leo Freitas\nleo.freitas@newcastle.ac.uk", | |
| 74940 | 186 | "patrick barlatier" -> "Patrick Barlatier", | 
| 187 | "patrick dabou" -> "Patrick Dabou", | |
| 188 | "paul zimmermann" -> "Paul Zimmermann", | |
| 74937 | 189 | "popescu2@illinois.edu" -> "Andrei Popescu", | 
| 74940 | 190 | "recruiting" -> "", | 
| 74941 | 191 | "recruiting@mais.informatik.tu-darmstadt.de" -> "", | 
| 74940 | 192 | "roux cody" -> "Cody Roux", | 
| 193 | "scott constable" -> "Scott Constable", | |
| 194 | "superuser@mattweidner.com" -> "Matthew Weidner\nsuperuser@mattweidner.com", | |
| 195 | "urban@math.lmu.de" -> "Christian Urban\nurban@math.lmu.de", | |
| 74937 | 196 | "veronique.cortier@loria.fr" -> "Veronique.Cortier@loria.fr", | 
| 74940 | 197 | "vikram singh" -> "Vikram Singh", | 
| 74937 | 198 | "wenzelm@in.tum.de" -> "makarius@sketis.net", | 
| 74940 | 199 | "werner@lix.polytechnique.fr" -> "Benjamin Werner\nwerner@lix.polytechnique.fr", | 
| 200 | "wmansky@cs.princeton.edu" -> "William Mansky\nwmansky@cs.princeton.edu", | |
| 201 | "y.nemouchi@ensbiotech.edu.dz" -> "Yakoub Nemouchi\ny.nemouchi@ensbiotech.edu.dz", | |
| 74937 | 202 | "ÐÑÐÐÐÑÐÐÐ ÐÐÐÐÐÐÐÑÐÐÐÑ ÐÐÐÑÐÐÐ" -> "", | 
| 203 | "∀X.Xπ - Tutorials about Proofs" -> "Bruno Woltzenlogel Paleo", | |
| 74935 | 204 | ).withDefault(identity) | 
| 205 | ||
| 74936 | 206 | def standard_author_info(author_info: List[String]): List[String] = | 
| 74940 | 207 | author_info.flatMap(s => split_lines(standard_name.getOrElse(s, s))).distinct | 
| 74936 | 208 | |
| 74921 | 209 | sealed case class Message( | 
| 210 | name: String, | |
| 211 | date: Date, | |
| 212 | title: String, | |
| 74936 | 213 | author_info: List[String], | 
| 74934 | 214 | body: String, | 
| 75393 | 215 | tags: List[String] | 
| 216 |   ) {
 | |
| 74936 | 217 |     if (author_info.isEmpty || author_info.exists(_.isEmpty)) {
 | 
| 218 |       error("Bad author information in " + quote(name))
 | |
| 74935 | 219 | } | 
| 220 | ||
| 74925 | 221 | override def toString: String = name | 
| 74935 | 222 | } | 
| 74906 | 223 | |
| 75393 | 224 |   object Messages {
 | 
| 74935 | 225 | type Graph = isabelle.Graph[String, Message] | 
| 226 | ||
| 75393 | 227 |     def apply(msgs: List[Message]): Messages = {
 | 
| 74936 | 228 | def make_node(g: Graph, node: String, msg: Message): Graph = | 
| 229 | if (g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g | |
| 74935 | 230 | else g.default_node(node, msg) | 
| 231 | ||
| 74936 | 232 | def connect_nodes(g: Graph, nodes: List[String]): Graph = | 
| 233 |         nodes match {
 | |
| 234 | case Nil => g | |
| 235 |           case a :: bs => bs.foldLeft(g)({ case (g1, b) => g1.add_edge(a, b).add_edge(b, a) })
 | |
| 236 | } | |
| 237 | ||
| 74935 | 238 | new Messages(msgs.sortBy(_.date)(Date.Ordering), | 
| 239 | msgs.foldLeft[Graph](Graph.string)( | |
| 74936 | 240 |           { case (graph, msg) =>
 | 
| 241 | val nodes = msg.author_info | |
| 242 | connect_nodes(nodes.foldLeft(graph)(make_node(_, _, msg)), nodes) | |
| 74935 | 243 | })) | 
| 244 | } | |
| 74934 | 245 | |
| 75393 | 246 |     def find(dir: Path): Messages = {
 | 
| 74934 | 247 | val msgs = | 
| 248 |         for {
 | |
| 249 | archive <- List(Isabelle_Users, Isabelle_Dev) | |
| 250 | msg <- archive.find_messages(dir + Path.basic(archive.list_name)) | |
| 251 | } yield msg | |
| 252 | Messages(msgs) | |
| 253 | } | |
| 74935 | 254 | |
| 75393 | 255 |     sealed case class Cluster(author_info: List[String]) {
 | 
| 74936 | 256 | val (addresses, names) = author_info.partition(is_address) | 
| 257 | ||
| 74935 | 258 | val name: String = | 
| 259 |         names.headOption getOrElse {
 | |
| 260 |           addresses match {
 | |
| 261 |             case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ')
 | |
| 262 |             case Nil => error("Empty cluster")
 | |
| 263 | } | |
| 264 | } | |
| 265 | ||
| 74940 | 266 | val name_lowercase: String = Word.lowercase(name) | 
| 267 | ||
| 74935 | 268 | def get_address: Option[String] = addresses.headOption | 
| 269 | ||
| 270 | def unique: Boolean = names.length == 1 && addresses.length == 1 | |
| 271 | def multi: Boolean = names.length > 1 || addresses.length > 1 | |
| 272 | ||
| 75393 | 273 |       def print: String = {
 | 
| 74935 | 274 |         val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses
 | 
| 275 |         entries.mkString("\n  * ", "\n    ", "")
 | |
| 276 | } | |
| 277 | } | |
| 74933 | 278 | } | 
| 279 | ||
| 75393 | 280 |   class Messages private(val sorted: List[Message], val graph: Messages.Graph) {
 | 
| 74935 | 281 |     override def toString: String = "Messages(" + sorted.size + ")"
 | 
| 74925 | 282 | |
| 75393 | 283 |     object Node_Ordering extends scala.math.Ordering[String] {
 | 
| 74935 | 284 | override def compare(a: String, b: String): Int = | 
| 285 | Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date) | |
| 286 | } | |
| 287 | ||
| 288 | def get_cluster(msg: Message): Messages.Cluster = | |
| 74936 | 289 | Messages.Cluster(graph.all_succs(msg.author_info).sorted.sorted(Node_Ordering)) | 
| 74935 | 290 | |
| 291 | def get_name(msg: Message): String = get_cluster(msg).name | |
| 74925 | 292 | |
| 74935 | 293 | def get_address(msg: Message): String = | 
| 294 |       get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name))
 | |
| 295 | ||
| 75393 | 296 |     def check(check_all: Boolean, check_multi: Boolean = false): Unit = {
 | 
| 74940 | 297 | val clusters = sorted.map(get_cluster).distinct.sortBy(_.name_lowercase) | 
| 74935 | 298 | |
| 74937 | 299 |       if (check_all) {
 | 
| 300 |         Output.writeln(cat_lines("clusters:" :: clusters.map(_.print)))
 | |
| 301 | } | |
| 302 |       else {
 | |
| 303 | val multi = if (check_multi) clusters.filter(_.multi) else Nil | |
| 304 |         if (multi.nonEmpty) {
 | |
| 305 |           Output.writeln(cat_lines("ambiguous clusters:" :: multi.map(_.print)))
 | |
| 306 | } | |
| 74925 | 307 | } | 
| 74933 | 308 | |
| 74935 | 309 | val unknown = clusters.filter(cluster => cluster.get_address.isEmpty) | 
| 310 |       if (unknown.nonEmpty) {
 | |
| 74937 | 311 |         Output.writeln(cat_lines("\nunknown mail:" :: unknown.map(_.print)))
 | 
| 74933 | 312 | } | 
| 74925 | 313 | } | 
| 314 | } | |
| 315 | ||
| 316 | ||
| 74921 | 317 | /* mailing list archives */ | 
| 318 | ||
| 319 | abstract class Archive( | |
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 320 | url: Url, | 
| 74921 | 321 | name: String = "", | 
| 75393 | 322 | tag: String = "" | 
| 323 |   ) {
 | |
| 74921 | 324 | def message_regex: Regex | 
| 74923 | 325 | def message_content(name: String, lines: List[String]): Message | 
| 74921 | 326 | |
| 74924 | 327 | def message_match(lines: List[String], re: Regex): Option[Match] = | 
| 328 | lines.flatMap(re.findFirstMatchIn(_)).headOption | |
| 329 | ||
| 75393 | 330 |     def make_name(str: String): String = {
 | 
| 74937 | 331 | val s = | 
| 332 |         str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
 | |
| 333 |           .replace("mailbroy.informatik.tu-muenchen.de", "in.tum.de")
 | |
| 334 |           .replace("informatik.tu-muenchen.de", "in.tum.de")
 | |
| 74935 | 335 |       if (s.startsWith("=") && s.endsWith("=")) "" else s
 | 
| 336 | } | |
| 74924 | 337 | |
| 338 | def make_body(lines: List[String]): String = | |
| 339 | cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1) | |
| 340 | ||
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 341 | private val main_url: Url = | 
| 74921 | 342 | Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/") | 
| 343 | ||
| 344 | private val main_html = Url.read(main_url) | |
| 345 | ||
| 75393 | 346 |     val list_name: String = {
 | 
| 74921 | 347 | val title = | 
| 348 | """<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(main_html).map(_.group(1)) | |
| 349 |       (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
 | |
| 350 | } | |
| 72558 | 351 | override def toString: String = list_name | 
| 352 | ||
| 74923 | 353 | def full_name(href: String): String = list_name + "/" + href | 
| 354 | ||
| 74921 | 355 | def list_tag: String = proper_string(tag).getOrElse(list_name) | 
| 356 | ||
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 357 | def read_text(href: String): String = Url.read(main_url.resolve(href)) | 
| 74921 | 358 | |
| 359 | def hrefs_text: List[String] = | |
| 360 | """href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(main_html).map(_.group(1)).toList | |
| 361 | ||
| 362 | def hrefs_msg: List[String] = | |
| 74906 | 363 |       (for {
 | 
| 74921 | 364 | href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(main_html).map(_.group(1)) | 
| 365 | html = read_text(href + "/date.html") | |
| 366 | msg <- message_regex.findAllMatchIn(html).map(_.group(1)) | |
| 74906 | 367 | } yield href + "/" + msg).toList | 
| 368 | ||
| 75393 | 369 |     def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] = {
 | 
| 72558 | 370 | val dir = target_dir + Path.basic(list_name) | 
| 74906 | 371 | val path = dir + Path.explode(href) | 
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 372 | val url = main_url.resolve(href) | 
| 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 373 | val connection = url.open_connection() | 
| 74906 | 374 |       try {
 | 
| 375 | val length = connection.getContentLengthLong | |
| 376 | val timestamp = connection.getLastModified | |
| 78958 | 377 | if (path.is_file && File.size(path) == length && path.file.lastModified == timestamp) None | 
| 74906 | 378 |         else {
 | 
| 379 | Isabelle_System.make_directory(path.dir) | |
| 380 |           progress.echo("Getting " + url)
 | |
| 381 | val bytes = | |
| 382 | using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024)) | |
| 78958 | 383 | Bytes.write(path, bytes) | 
| 384 | path.file.setLastModified(timestamp) | |
| 74906 | 385 | Some(path) | 
| 386 | } | |
| 387 | } | |
| 388 |       finally { connection.getInputStream.close() }
 | |
| 389 | } | |
| 72558 | 390 | |
| 74906 | 391 | def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] = | 
| 392 | hrefs_text.flatMap(get(target_dir, _, progress = progress)) | |
| 393 | ||
| 394 | def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 395 | hrefs_msg.flatMap(get(target_dir, _, progress = progress)) | |
| 74907 | 396 | |
| 397 | def download(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 398 | download_text(target_dir, progress = progress) ::: | |
| 399 | download_msg(target_dir, progress = progress) | |
| 74921 | 400 | |
| 75393 | 401 |     def make_title(str: String): String = {
 | 
| 74921 | 402 |       val Trim1 = ("""\s*\Q[""" + list_tag + """]\E\s*(.*)""").r
 | 
| 403 | val Trim2 = """(?i:(?:re|fw|fwd)\s*:\s*)(.*)""".r | |
| 404 | val Trim3 = """\[\s*(.*?)\s*\]""".r | |
| 405 | @tailrec def trim(s: String): String = | |
| 406 |         s match {
 | |
| 407 | case Trim1(s1) => trim(s1) | |
| 408 | case Trim2(s1) => trim(s1) | |
| 409 | case Trim3(s1) => trim(s1) | |
| 410 | case _ => s | |
| 411 | } | |
| 412 | trim(str) | |
| 413 | } | |
| 414 | ||
| 415 | def get_messages(): List[Message] = | |
| 416 | for (href <- hrefs_msg) yield message_content(href, split_lines(read_text(href))) | |
| 417 | ||
| 75393 | 418 |     def find_messages(dir: Path): List[Message] = {
 | 
| 74921 | 419 |       for {
 | 
| 75906 
2167b9e3157a
clarified signature: support for adhoc file types;
 wenzelm parents: 
75393diff
changeset | 420 | file <- File.find_files(dir.file, file => File.is_html(file.getName)) | 
| 74921 | 421 | rel_path <- File.relative_path(dir, File.path(file)) | 
| 74923 | 422 | } | 
| 423 |       yield {
 | |
| 424 | val name = full_name(rel_path.implode) | |
| 425 | message_content(name, split_lines(File.read(file))) | |
| 426 | } | |
| 74921 | 427 | } | 
| 72558 | 428 | } | 
| 72559 | 429 | |
| 75393 | 430 |   private class Message_Chunk(bg: String = "", en: String = "") {
 | 
| 431 |     def unapply(lines: List[String]): Option[List[String]] = {
 | |
| 74932 | 432 | val res1 = | 
| 433 | if (bg.isEmpty) Some(lines) | |
| 434 |         else {
 | |
| 435 |           lines.dropWhile(_ != bg) match {
 | |
| 436 | case Nil => None | |
| 437 | case _ :: rest => Some(rest) | |
| 438 | } | |
| 439 | } | |
| 440 | if (en.isEmpty) res1 | |
| 441 |       else {
 | |
| 442 |         res1 match {
 | |
| 443 | case None => None | |
| 444 | case Some(lines1) => | |
| 445 | val lines2 = lines1.takeWhile(_ != en) | |
| 446 | if (lines1.drop(lines2.length).isEmpty) None else Some(lines2) | |
| 447 | } | |
| 448 | } | |
| 449 | } | |
| 450 | ||
| 451 | def get(lines: List[String]): List[String] = | |
| 452 | unapply(lines) getOrElse | |
| 77369 | 453 |         error("Missing delimiters:" + if_proper(bg, " ") + bg + if_proper(en, " ") + en)
 | 
| 74932 | 454 | } | 
| 455 | ||
| 72559 | 456 | |
| 74930 | 457 | /* isabelle-users mailing list */ | 
| 72559 | 458 | |
| 74921 | 459 | object Isabelle_Users extends Archive( | 
| 460 |     Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"),
 | |
| 75393 | 461 | name = "isabelle-users", tag = "isabelle" | 
| 462 |   ) {
 | |
| 74921 | 463 | override def message_regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r | 
| 464 | ||
| 465 | private object Head extends | |
| 74922 | 466 | Message_Chunk(bg = "<!--X-Head-of-Message-->", en = "<!--X-Head-of-Message-End-->") | 
| 74921 | 467 | |
| 468 | private object Body extends | |
| 74922 | 469 | Message_Chunk(bg = "<!--X-Body-of-Message-->", en = "<!--X-Body-of-Message-End-->") | 
| 470 | ||
| 75393 | 471 |     private object Date_Format {
 | 
| 74922 | 472 | private val Trim1 = """\w+,\s*(.*)""".r | 
| 473 | private val Trim2 = """(.*?)\s*\(.*\)""".r | |
| 474 | private val Format = | |
| 475 | Date.Format( | |
| 476 | "d MMM uuuu H:m:s Z", | |
| 477 | "d MMM uuuu H:m:s z", | |
| 478 | "d MMM yy H:m:s Z", | |
| 479 | "d MMM yy H:m:s z") | |
| 75393 | 480 |       def unapply(s: String): Option[Date] = {
 | 
| 74922 | 481 |         val s0 = s.replaceAll("""\s+""", " ")
 | 
| 482 | val s1 = | |
| 483 |           s0 match {
 | |
| 484 | case Trim1(s1) => s1 | |
| 485 | case _ => s0 | |
| 486 | } | |
| 487 | val s2 = | |
| 488 |           s1 match {
 | |
| 489 | case Trim2(s2) => s2 | |
| 490 | case _ => s1 | |
| 491 | } | |
| 492 | Format.unapply(s2) | |
| 493 | } | |
| 494 | } | |
| 74921 | 495 | |
| 75393 | 496 |     override def make_name(str: String): String = {
 | 
| 74935 | 497 |       val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str))
 | 
| 74925 | 498 | if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s | 
| 499 | } | |
| 500 | ||
| 75393 | 501 |     object Address {
 | 
| 74924 | 502 | private def anchor(s: String): String = """<a href="[^"]*">""" + s + """</a>""" | 
| 503 | private def angl(s: String): String = """<""" + s + """>""" | |
| 504 | private def quot(s: String): String = """"""" + s + """"""" | |
| 505 |       private def paren(s: String): String = """\(""" + s + """\)"""
 | |
| 506 | private val adr = """([^<>]*? at [^<>]*?)""" | |
| 507 | private val any = """([^<>]*?)""" | |
| 508 | private val spc = """\s*""" | |
| 509 | ||
| 510 | private val Name1 = quot(anchor(any)).r | |
| 511 | private val Name2 = quot(any).r | |
| 512 | private val Name_Adr1 = (quot(anchor(any)) + spc + angl(anchor(adr))).r | |
| 513 | private val Name_Adr2 = (quot(any) + spc + angl(anchor(adr))).r | |
| 514 | private val Name_Adr3 = (anchor(any) + spc + angl(anchor(adr))).r | |
| 515 | private val Name_Adr4 = (any + spc + angl(anchor(adr))).r | |
| 516 | private val Adr_Name1 = (angl(anchor(adr)) + spc + paren(any)).r | |
| 517 | private val Adr_Name2 = (anchor(adr) + spc + paren(any)).r | |
| 518 | private val Adr1 = angl(anchor(adr)).r | |
| 519 | private val Adr2 = anchor(adr).r | |
| 520 | ||
| 74936 | 521 | def parse(s: String): List[String] = | 
| 74924 | 522 |         s match {
 | 
| 74936 | 523 | case Name1(a) => List(a) | 
| 524 | case Name2(a) => List(a) | |
| 525 | case Name_Adr1(a, b) => List(a, b) | |
| 526 | case Name_Adr2(a, b) => List(a, b) | |
| 527 | case Name_Adr3(a, b) => List(a, b) | |
| 528 | case Name_Adr4(a, b) => List(a, b) | |
| 529 | case Adr_Name1(b, a) => List(a, b) | |
| 530 | case Adr_Name2(b, a) => List(a, b) | |
| 531 | case Adr1(a) => List(a) | |
| 532 | case Adr2(a) => List(a) | |
| 533 | case _ => Nil | |
| 74924 | 534 | } | 
| 535 | } | |
| 536 | ||
| 75393 | 537 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74921 | 538 | def err(msg: String = ""): Nothing = | 
| 77368 | 539 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74921 | 540 | |
| 541 | val (head, body) = | |
| 74924 | 542 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74921 | 543 |         catch { case ERROR(msg) => err(msg) }
 | 
| 72559 | 544 | |
| 74921 | 545 | val date = | 
| 546 | message_match(head, """<li><em>Date</em>:\s*(.*?)\s*</li>""".r) | |
| 547 | .map(m => HTML.input(m.group(1))) match | |
| 548 |         {
 | |
| 549 | case Some(Date_Format(d)) => d | |
| 550 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 551 |           case None => err("Missing Date")
 | |
| 552 | } | |
| 553 | ||
| 554 | val title = | |
| 74924 | 555 | make_title( | 
| 556 | HTML.input(message_match(head, """<li><em>Subject</em>:\s*(.*)</li>""".r) | |
| 557 |             .getOrElse(err("Missing Subject")).group(1)))
 | |
| 74921 | 558 | |
| 74938 | 559 | def parse_author_info(re: Regex): List[String] = | 
| 560 |         message_match(head, re) match {
 | |
| 561 | case None => Nil | |
| 562 | case Some(m) => Address.parse(m.group(1)).map(a => HTML.input(make_name(a))) | |
| 563 | } | |
| 564 | ||
| 74936 | 565 | val author_info = | 
| 74938 | 566 |         (parse_author_info("""<li><em>From</em>:\s*(.*?)\s*</li>""".r) :::
 | 
| 567 |           parse_author_info("""<li><em>Reply-to</em>:\s*(.*?)\s*</li>""".r))
 | |
| 568 | .distinct.filter(_.nonEmpty) | |
| 569 | ||
| 570 |       if (author_info.isEmpty) err("Malformed author information")
 | |
| 74921 | 571 | |
| 74934 | 572 | val tags = List(list_name) | 
| 573 | ||
| 74936 | 574 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74921 | 575 | } | 
| 576 | } | |
| 577 | ||
| 74930 | 578 | |
| 579 | /* isabelle-dev mailing list */ | |
| 580 | ||
| 75393 | 581 |   object Isabelle_Dev extends Archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev")) {
 | 
| 74921 | 582 | override def message_regex: Regex = """<LI><A HREF="(\d+\.html)">""".r | 
| 583 | ||
| 74922 | 584 | private object Head extends Message_Chunk(en = "<!--beginarticle-->") | 
| 585 | private object Body extends Message_Chunk(bg = "<!--beginarticle-->", en = "<!--endarticle-->") | |
| 586 | ||
| 75393 | 587 |     private object Date_Format {
 | 
| 74922 | 588 |       val Format = Date.Format("E MMM d H:m:s z uuuu")
 | 
| 589 |       def unapply(s: String): Option[Date] = Format.unapply(s.replaceAll("""\s+""", " "))
 | |
| 590 | } | |
| 591 | ||
| 75393 | 592 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74922 | 593 | def err(msg: String = ""): Nothing = | 
| 77368 | 594 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74922 | 595 | |
| 596 | val (head, body) = | |
| 74924 | 597 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74922 | 598 |         catch { case ERROR(msg) => err(msg) }
 | 
| 599 | ||
| 600 | val date = | |
| 601 |         message_match(head, """\s*<I>(.*)</I>""".r).map(m => HTML.input(m.group(1))) match {
 | |
| 602 | case Some(Date_Format(d)) => d | |
| 603 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 604 |           case None => err("Missing Date")
 | |
| 605 | } | |
| 606 | ||
| 75393 | 607 |       val (title, author_address) = {
 | 
| 74922 | 608 |         message_match(head, """TITLE="([^"]+)">(.*)""".r) match {
 | 
| 74931 | 609 | case Some(m) => (make_title(HTML.input(m.group(1))), make_name(HTML.input(m.group(2)))) | 
| 74922 | 610 |           case None => err("Missing TITLE")
 | 
| 611 | } | |
| 612 | } | |
| 613 | ||
| 614 | val author_name = | |
| 615 |         message_match(head, """\s*<B>(.*)</B>""".r) match {
 | |
| 616 |           case None => err("Missing author")
 | |
| 74931 | 617 | case Some(m) => | 
| 618 | val a = make_name(HTML.input(m.group(1))) | |
| 619 | if (a == author_address) "" else a | |
| 74922 | 620 | } | 
| 621 | ||
| 74936 | 622 | val author_info = List(author_name, author_address) | 
| 74934 | 623 | val tags = List(list_name) | 
| 624 | ||
| 74936 | 625 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74922 | 626 | } | 
| 74921 | 627 | } | 
| 72558 | 628 | } |