| author | wenzelm | 
| Sat, 11 Jan 2025 21:58:47 +0100 | |
| changeset 81766 | ebf113cd6d2c | 
| parent 81627 | 079dee3b117c | 
| permissions | -rw-r--r-- | 
| 72558 | 1 | /* Title: Pure/General/mailman.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 74942 | 4 | Support for Mailman list servers, notably isabelle-users and isabelle-dev. | 
| 72558 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 9 | ||
| 74921 | 10 | import scala.annotation.tailrec | 
| 74906 | 11 | import scala.util.matching.Regex | 
| 74921 | 12 | import scala.util.matching.Regex.Match | 
| 74906 | 13 | |
| 72558 | 14 | |
| 75393 | 15 | object Mailman {
 | 
| 74921 | 16 | /* mailing list messages */ | 
| 72559 | 17 | |
| 74935 | 18 | def is_address(s: String): Boolean = | 
| 19 |     s.contains('@') && s.contains('.') && !s.contains(' ')
 | |
| 20 | ||
| 74936 | 21 | private val standard_name: Map[String, String] = | 
| 74935 | 22 | Map( | 
| 81627 | 23 | "121171528:qq/com" -> "Guo Fan\n121171528:qq/com", | 
| 74937 | 24 | "Aman Pohjola, Johannes (Data61, Kensington NSW)" -> "Johannes Aman Pohjola", | 
| 25 | "Andrei de AraÃjo Formiga" -> "Andrei de Araujo Formiga", | |
| 81627 | 26 | "Benedikt/AHRENS:unice/fr" -> "Benedikt Ahrens\nBenedikt/AHRENS:unice/fr", | 
| 74940 | 27 | "Berg, Nils Erik" -> "Nils Erik Berg", | 
| 81627 | 28 | "Berger U/" -> "Ulrich Berger", | 
| 74937 | 29 | "Bisping, Benjamin" -> "Benjamin Bisping", | 
| 81627 | 30 | "Blanchette, J/C/" -> "Jasmin Christian Blanchette", | 
| 74935 | 31 | "Buday Gergely István" -> "Gergely Buday", | 
| 81627 | 32 | "CALaF1UJ9Uy0vGCu4WkBmbfuPDxG7nFm8hfeCMP+O3g7_5CQ0Bw:mail/gmail/com" -> "", | 
| 33 | "CRACIUN F/" -> "Florin Craciun", | |
| 74937 | 34 | "Carsten Schuermann" -> "Carsten Schürmann", | 
| 74940 | 35 | "Chris" -> "", | 
| 74935 | 36 | "Christoph Lueth" -> "Christoph Lüth", | 
| 37 | "Claude Marche" -> "Claude Marché", | |
| 74937 | 38 | "Daniel StÃwe" -> "Daniel Stüwe", | 
| 81627 | 39 | "Daniel/Matichuk:data61/csiro/au" -> "Daniel Matichuk\nDaniel/Matichuk:data61/csiro/au", | 
| 40 | "Daniel/Matichuk:nicta/com/au" -> "Daniel Matichuk\nDaniel/Matichuk:nicta/com/au", | |
| 74937 | 41 | "David MENTRE" -> "David MENTRÉ", | 
| 74940 | 42 | "Dey, Katie" -> "Katie Dey", | 
| 81627 | 43 | "Dr/ Brendan Patrick Mahony" -> "Brendan Mahony", | 
| 74937 | 44 | "Farn" -> "Farn Wang", | 
| 74935 | 45 | "Farquhar, Colin I" -> "Colin Farquhar", | 
| 74940 | 46 | "Fernandez, Matthew" -> "Matthew Fernandez", | 
| 74937 | 47 | "Filip Maric" -> "Filip Marić", | 
| 48 | "Filip MariÄ" -> "Filip Marić", | |
| 49 | "Fleury Mathias" -> "Mathias Fleury", | |
| 74940 | 50 | "Francisco Jose CHAVES ALONSO" -> "Francisco Jose Chaves Alonso", | 
| 51 | "Frederic Tuong (Dr)" -> "Frederic Tuong", | |
| 52 | "Fulya" -> "Fulya Horozal", | |
| 81627 | 53 | "George K/" -> "George Karabotsos", | 
| 74940 | 54 | "Gidon Ernst" -> "Gidon ERNST", | 
| 55 | "Gransden, Thomas" -> "Thomas Gransden", | |
| 74937 | 56 | "Hans-JÃrg Schurr" -> "Hans-Jörg Schurr", | 
| 57 | "Henri DEBRAT" -> "Henri Debrat", | |
| 74935 | 58 | "Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki", | 
| 74940 | 59 | "Häuselmann Rafael" -> "Rafael Häuselmann", | 
| 74937 | 60 | "Isabelle" -> "", | 
| 81627 | 61 | "J/ Juhas (TUM)" -> "Jonatan Juhas", | 
| 74937 | 62 | "Jackson, Vincent (Data61, Kensington NSW)" -> "Vincent Jackson", | 
| 63 | "Janney, Mark-P26816" -> "Mark Janney", | |
| 74940 | 64 | "Jean François Molderez" -> "Jean-François Molderez", | 
| 65 | "Jean-Francois Molderez" -> "Jean-François Molderez", | |
| 66 | "John R Harrison" -> "John Harrison", | |
| 74937 | 67 | "Jose DivasÃn" -> "Jose Divasón", | 
| 68 | "Julian" -> "", | |
| 69 | "Julien" -> "", | |
| 70 | "Klein, Gerwin (Data61, Kensington NSW)" -> "Gerwin Klein", | |
| 71 | "Kobayashi, Hidetsune" -> "Hidetsune Kobayashi", | |
| 74940 | 72 | "Kylie Williams (IND)" -> "Kylie Williams", | 
| 81627 | 73 | "Laarman, A/W/" -> "A/W/ Laarman", | 
| 74937 | 74 | "Laurent Thery" -> "Laurent Théry", | 
| 74940 | 75 | "Li, Chanjuan" -> "Li Chanjuan", | 
| 74937 | 76 | "Lochbihler Andreas" -> "Andreas Lochbihler", | 
| 74940 | 77 | "Luckhardt, Daniel" -> "Daniel Luckhardt", | 
| 74937 | 78 | "Lutz Schroeder" -> "Lutz Schröder", | 
| 79 | "Lutz SchrÃder" -> "Lutz Schröder", | |
| 74940 | 80 | "MACKENZIE Carlin" -> "Carlin MACKENZIE", | 
| 74937 | 81 | "Makarius" -> "Makarius Wenzel", | 
| 82 | "Marco" -> "", | |
| 83 | "Mark" -> "", | |
| 74940 | 84 | "Markus Mueller-Olm" -> "Markus Müller-Olm", | 
| 74937 | 85 | "Markus" -> "", | 
| 74940 | 86 | "Marmsoler, Diego" -> "Diego Marmsoler", | 
| 74937 | 87 | "Martin Klebermass" -> "Martin Klebermaß", | 
| 74941 | 88 | "Martyn Johnson via RT" -> "", | 
| 81627 | 89 | "Mathias/Fleury:ens-rennes/fr" -> "Mathias Fleury\nmathias/fleury12:gmail/com", | 
| 74940 | 90 | "Matthew" -> "", | 
| 74937 | 91 | "Matthews, John R" -> "John Matthews", | 
| 74940 | 92 | "McCarthy, Jim (C3ID)" -> "Jim McCarthy", | 
| 93 | "McCue, Brian" -> "Brian McCue", | |
| 74937 | 94 | "Michael FÃrber" -> "Michael Färber", | 
| 74940 | 95 | "Michel" -> "", | 
| 96 | "Miranda, Brando" -> "Brando Miranda", | |
| 81627 | 97 | "Moscato, Mariano M/ \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Mariano M/ Moscato", | 
| 74940 | 98 | "Mr Julian Fell" -> "Julian Fell", | 
| 99 | "Mueller Peter" -> "Peter Müller", | |
| 81627 | 100 | "Munoz, Cesar Augusto (LARC-D320)" -> "Cesar A/ Munoz", | 
| 74940 | 101 | "Nadel, Alexander" -> "Alexander Nadel", | 
| 102 | "Nagashima, Yutaka" -> "Yutaka Nagashima", | |
| 74937 | 103 | "Norrish, Michael (Data61, Acton)" -> "Michael Norrish", | 
| 74940 | 104 | "O'Leary, John W" -> "John W O'Leary", | 
| 74937 | 105 | "Omar Montano Rivas" -> "Omar Montaño Rivas", | 
| 106 | "Omar MontaÃo Rivas" -> "Omar Montaño Rivas", | |
| 107 | "OndÅej KunÄar" -> "Ondřej Kunčar", | |
| 74941 | 108 | "PALMER Jake" -> "Jake Palmer", | 
| 74937 | 109 | "PAQUI LUCIO" -> "Paqui Lucio", | 
| 74940 | 110 | "Pal, Abhik" -> "Abhik Pal", | 
| 111 | "Pasupuleti, Vijay" -> "Vijay Pasupuleti", | |
| 81627 | 112 | "Peter Vincent Homeier" -> "Peter V/ Homeier", | 
| 74937 | 113 | "Peter" -> "", | 
| 114 | "Philipp Ruemmer" -> "Philipp Rümmer", | |
| 115 | "Philipp RÃmmer" -> "Philipp Rümmer", | |
| 74941 | 116 | "Piete Brooks via RT" -> "", | 
| 74940 | 117 | "RTA publicity chair" -> "", | 
| 81627 | 118 | "Raamsdonk, F/ van" -> "Femke van Raamsdonk", | 
| 74937 | 119 | "Raul Gutierrez" -> "Raúl Gutiérrez", | 
| 120 | "Renà Thiemann" -> "René Thiemann", | |
| 81627 | 121 | "Ridgway, John V/ E/" -> "John V/ E/ Ridgway", | 
| 122 | "Roggenbach M/" -> "Markus Roggenbach", | |
| 74940 | 123 | "Rosu, Grigore" -> "Grigore Rosu", | 
| 74937 | 124 | "Rozman, Mihaela" -> "Mihaela Rozman", | 
| 81627 | 125 | "Schmaltz, J/" -> "Julien Schmaltz", | 
| 126 | "Serguei A/ Mokhov on behalf of PST-11" -> "Serguei A/ Mokhov", | |
| 127 | "Serguei Mokhov" -> "Serguei A/ Mokhov", | |
| 74940 | 128 | "Shumeiko, Igor" -> "Igor Shumeiko", | 
| 129 | "Siek, Jeremy" -> "Jeremy Siek", | |
| 81627 | 130 | "Silvio/Ranise:loria/fr" -> "Silvio Ranise\nSilvio/Ranise:loria/fr", | 
| 74940 | 131 | "Siu, Tony" -> "Tony Siu", | 
| 74937 | 132 | "Stüber, Sebastian" -> "Sebastian Stüber", | 
| 133 | "Thiemann, Rene" -> "René Thiemann", | |
| 134 | "Thiemann, René" -> "René Thiemann", | |
| 74940 | 135 | "Thomas Arthur Leck Sewell" -> "Thomas Sewell", | 
| 74937 | 136 | "Thomas Goethel" -> "Thomas Göthel", | 
| 81627 | 137 | "Thomas/Sewell:data61/csiro/au" -> "Thomas Sewell\nThomas/Sewell:data61/csiro/au", | 
| 74941 | 138 | "Tjark Weber via RT" -> "Tjark Weber", | 
| 81627 | 139 | "Toby/Murray:data61/csiro/au" -> "Toby Murray\nToby/Murray:data61/csiro/au", | 
| 74935 | 140 | "Urban, Christian" -> "Christian Urban", | 
| 74940 | 141 | "Ursula Eschbach" -> "", | 
| 142 | "Van Staden Stephan" -> "Stephan van Staden", | |
| 74937 | 143 | "Viktor Kuncak" -> "Viktor Kunčak", | 
| 144 | "Viorel Preoteasaa" -> "Viorel Preoteasa", | |
| 74940 | 145 | "Wickerson, John P" -> "John Wickerson", | 
| 146 | "Wong, Yat" -> "Yat Wong", | |
| 147 | "YAMADA, Akihisa" -> "Akihisa Yamada", | |
| 74937 | 148 | "YliÃs Falcone" -> "Yliès Falcone", | 
| 74940 | 149 | "amir mohajeri" -> "Amir Mohajeri", | 
| 150 | "aniello murano" -> "Aniello Murano", | |
| 151 | "barzan stefania" -> "Stefania Barzan", | |
| 152 | "benhamou" -> "Belaid Benhamou", | |
| 153 | "charmi panchal" -> "Charmi Panchal", | |
| 154 | "chen kun" -> "Chen Kun", | |
| 155 | "chunhan wu" -> "Chunhan Wu", | |
| 156 | "daniel de la concepción sáez" -> "Daniel de la Concepción Sáez", | |
| 81627 | 157 | "daniel/luckhardt:mathematik/uni-goettingen/de" -> "Logiker:gmx/net", | 
| 74940 | 158 | "david streader" -> "David Streader", | 
| 81627 | 159 | "eschbach:in/tum/de" -> "", | 
| 160 | "f/rabe:jacobs-university/de" -> "florian/rabe:fau/de", | |
| 161 | "florian:haftmann-online/de" -> "haftmann:in/tum/de", | |
| 162 | "fredegar:haftmann-online/de" -> "haftmann:in/tum/de", | |
| 163 | "gallais : ensl/org" -> "Guillaume Allais", | |
| 74940 | 164 | "geng chen" -> "Geng Chen", | 
| 81627 | 165 | "henning/seidler" -> "Henning Seidler", | 
| 74937 | 166 | "hkb" -> "Hidetsune Kobayashi", | 
| 81627 | 167 | "jobs-pm:inf/ethz/ch" -> "", | 
| 168 | "julien:RadboudUniversity" -> "", | |
| 74940 | 169 | "jun sun" -> "Jun Sun", | 
| 81627 | 170 | "jwang whu/edu/cn (jwang)" -> "jwang", | 
| 74940 | 171 | "kostas pouliasis" -> "Kostas Pouliasis", | 
| 81627 | 172 | "kristof/teichel:ptb/de" -> "Kristof Teichel\nkristof/teichel:ptb/de", | 
| 74940 | 173 | "lucas cavalcante" -> "Lucas Cavalcante", | 
| 174 | "mahmoud abdelazim" -> "Mahmoud Abdelazim", | |
| 175 | "manish surolia" -> "Manish Surolia", | |
| 74937 | 176 | "mantel" -> "Heiko Mantel", | 
| 74940 | 177 | "marco caminati" -> "Marco Caminati", | 
| 81627 | 178 | "mathias/fleury:ens-rennes/fr" -> "Mathias Fleury\nmathias/fleury12:gmail/com", | 
| 179 | "merz:loria/fr" -> "stephan/merz:loria/fr", | |
| 74940 | 180 | "michel levy" -> "Michel Levy", | 
| 81627 | 181 | "michel/levy2009:laposte/net" -> "Michel Levy\nmichel/levy2009:laposte/net", | 
| 74937 | 182 | "nemouchi" -> "Yakoub Nemouchi", | 
| 74941 | 183 | "noam neer" -> "Noam Neer", | 
| 184 | "olfa mraihi" -> "Olfa Mraihi", | |
| 81627 | 185 | "pathsnottakenworkshop:gmail/com" -> "Leo Freitas\nleo/freitas:newcastle/ac/uk", | 
| 74940 | 186 | "patrick barlatier" -> "Patrick Barlatier", | 
| 187 | "patrick dabou" -> "Patrick Dabou", | |
| 188 | "paul zimmermann" -> "Paul Zimmermann", | |
| 81627 | 189 | "popescu2:illinois/edu" -> "Andrei Popescu", | 
| 74940 | 190 | "recruiting" -> "", | 
| 81627 | 191 | "recruiting:mais/informatik/tu-darmstadt/de" -> "", | 
| 74940 | 192 | "roux cody" -> "Cody Roux", | 
| 193 | "scott constable" -> "Scott Constable", | |
| 81627 | 194 | "superuser:mattweidner/com" -> "Matthew Weidner\nsuperuser:mattweidner/com", | 
| 195 | "urban:math/lmu/de" -> "Christian Urban\nurban:math/lmu/de", | |
| 196 | "veronique/cortier:loria/fr" -> "Veronique/Cortier:loria/fr", | |
| 74940 | 197 | "vikram singh" -> "Vikram Singh", | 
| 81627 | 198 | "wenzelm:in/tum/de" -> "makarius:sketis/net", | 
| 199 | "werner:lix/polytechnique/fr" -> "Benjamin Werner\nwerner:lix/polytechnique/fr", | |
| 200 | "wmansky:cs/princeton/edu" -> "William Mansky\nwmansky:cs/princeton/edu", | |
| 201 | "y/nemouchi:ensbiotech/edu/dz" -> "Yakoub Nemouchi\ny/nemouchi:ensbiotech/edu/dz", | |
| 74937 | 202 | "ÐÑÐÐÐÑÐÐÐ ÐÐÐÐÐÐÐÑÐÐÐÑ ÐÐÐÑÐÐÐ" -> "", | 
| 81627 | 203 | "∀X/Xπ - Tutorials about Proofs" -> "Bruno Woltzenlogel Paleo", | 
| 204 | ) | |
| 205 | ||
| 206 | private def tune(s: String): String = | |
| 207 | s.replace(64.toChar, 58.toChar).replace(46.toChar, 47.toChar) | |
| 208 | ||
| 209 | private def untune(s: String): String = | |
| 210 | s.replace(58.toChar, 64.toChar).replace(47.toChar, 46.toChar) | |
| 74935 | 211 | |
| 74936 | 212 | def standard_author_info(author_info: List[String]): List[String] = | 
| 81627 | 213 | author_info.flatMap(s => | 
| 214 | split_lines(standard_name.get(tune(s)).map(untune).getOrElse(s))).distinct | |
| 74936 | 215 | |
| 74921 | 216 | sealed case class Message( | 
| 217 | name: String, | |
| 218 | date: Date, | |
| 219 | title: String, | |
| 74936 | 220 | author_info: List[String], | 
| 74934 | 221 | body: String, | 
| 75393 | 222 | tags: List[String] | 
| 223 |   ) {
 | |
| 74936 | 224 |     if (author_info.isEmpty || author_info.exists(_.isEmpty)) {
 | 
| 225 |       error("Bad author information in " + quote(name))
 | |
| 74935 | 226 | } | 
| 227 | ||
| 74925 | 228 | override def toString: String = name | 
| 74935 | 229 | } | 
| 74906 | 230 | |
| 75393 | 231 |   object Messages {
 | 
| 74935 | 232 | type Graph = isabelle.Graph[String, Message] | 
| 233 | ||
| 75393 | 234 |     def apply(msgs: List[Message]): Messages = {
 | 
| 74936 | 235 | def make_node(g: Graph, node: String, msg: Message): Graph = | 
| 236 | if (g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g | |
| 74935 | 237 | else g.default_node(node, msg) | 
| 238 | ||
| 74936 | 239 | def connect_nodes(g: Graph, nodes: List[String]): Graph = | 
| 240 |         nodes match {
 | |
| 241 | case Nil => g | |
| 242 |           case a :: bs => bs.foldLeft(g)({ case (g1, b) => g1.add_edge(a, b).add_edge(b, a) })
 | |
| 243 | } | |
| 244 | ||
| 74935 | 245 | new Messages(msgs.sortBy(_.date)(Date.Ordering), | 
| 246 | msgs.foldLeft[Graph](Graph.string)( | |
| 74936 | 247 |           { case (graph, msg) =>
 | 
| 248 | val nodes = msg.author_info | |
| 249 | connect_nodes(nodes.foldLeft(graph)(make_node(_, _, msg)), nodes) | |
| 74935 | 250 | })) | 
| 251 | } | |
| 74934 | 252 | |
| 75393 | 253 |     def find(dir: Path): Messages = {
 | 
| 74934 | 254 | val msgs = | 
| 255 |         for {
 | |
| 256 | archive <- List(Isabelle_Users, Isabelle_Dev) | |
| 257 | msg <- archive.find_messages(dir + Path.basic(archive.list_name)) | |
| 258 | } yield msg | |
| 259 | Messages(msgs) | |
| 260 | } | |
| 74935 | 261 | |
| 75393 | 262 |     sealed case class Cluster(author_info: List[String]) {
 | 
| 74936 | 263 | val (addresses, names) = author_info.partition(is_address) | 
| 264 | ||
| 74935 | 265 | val name: String = | 
| 266 |         names.headOption getOrElse {
 | |
| 267 |           addresses match {
 | |
| 268 |             case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ')
 | |
| 269 |             case Nil => error("Empty cluster")
 | |
| 270 | } | |
| 271 | } | |
| 272 | ||
| 74940 | 273 | val name_lowercase: String = Word.lowercase(name) | 
| 274 | ||
| 74935 | 275 | def get_address: Option[String] = addresses.headOption | 
| 276 | ||
| 277 | def unique: Boolean = names.length == 1 && addresses.length == 1 | |
| 278 | def multi: Boolean = names.length > 1 || addresses.length > 1 | |
| 279 | ||
| 75393 | 280 |       def print: String = {
 | 
| 74935 | 281 |         val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses
 | 
| 282 |         entries.mkString("\n  * ", "\n    ", "")
 | |
| 283 | } | |
| 284 | } | |
| 74933 | 285 | } | 
| 286 | ||
| 75393 | 287 |   class Messages private(val sorted: List[Message], val graph: Messages.Graph) {
 | 
| 74935 | 288 |     override def toString: String = "Messages(" + sorted.size + ")"
 | 
| 74925 | 289 | |
| 75393 | 290 |     object Node_Ordering extends scala.math.Ordering[String] {
 | 
| 74935 | 291 | override def compare(a: String, b: String): Int = | 
| 292 | Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date) | |
| 293 | } | |
| 294 | ||
| 295 | def get_cluster(msg: Message): Messages.Cluster = | |
| 74936 | 296 | Messages.Cluster(graph.all_succs(msg.author_info).sorted.sorted(Node_Ordering)) | 
| 74935 | 297 | |
| 298 | def get_name(msg: Message): String = get_cluster(msg).name | |
| 74925 | 299 | |
| 74935 | 300 | def get_address(msg: Message): String = | 
| 301 |       get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name))
 | |
| 302 | ||
| 75393 | 303 |     def check(check_all: Boolean, check_multi: Boolean = false): Unit = {
 | 
| 74940 | 304 | val clusters = sorted.map(get_cluster).distinct.sortBy(_.name_lowercase) | 
| 74935 | 305 | |
| 74937 | 306 |       if (check_all) {
 | 
| 307 |         Output.writeln(cat_lines("clusters:" :: clusters.map(_.print)))
 | |
| 308 | } | |
| 309 |       else {
 | |
| 310 | val multi = if (check_multi) clusters.filter(_.multi) else Nil | |
| 311 |         if (multi.nonEmpty) {
 | |
| 312 |           Output.writeln(cat_lines("ambiguous clusters:" :: multi.map(_.print)))
 | |
| 313 | } | |
| 74925 | 314 | } | 
| 74933 | 315 | |
| 74935 | 316 | val unknown = clusters.filter(cluster => cluster.get_address.isEmpty) | 
| 317 |       if (unknown.nonEmpty) {
 | |
| 74937 | 318 |         Output.writeln(cat_lines("\nunknown mail:" :: unknown.map(_.print)))
 | 
| 74933 | 319 | } | 
| 74925 | 320 | } | 
| 321 | } | |
| 322 | ||
| 323 | ||
| 74921 | 324 | /* mailing list archives */ | 
| 325 | ||
| 326 | abstract class Archive( | |
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 327 | url: Url, | 
| 74921 | 328 | name: String = "", | 
| 75393 | 329 | tag: String = "" | 
| 330 |   ) {
 | |
| 74921 | 331 | def message_regex: Regex | 
| 74923 | 332 | def message_content(name: String, lines: List[String]): Message | 
| 74921 | 333 | |
| 74924 | 334 | def message_match(lines: List[String], re: Regex): Option[Match] = | 
| 335 | lines.flatMap(re.findFirstMatchIn(_)).headOption | |
| 336 | ||
| 75393 | 337 |     def make_name(str: String): String = {
 | 
| 74937 | 338 | val s = | 
| 339 |         str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
 | |
| 340 |           .replace("mailbroy.informatik.tu-muenchen.de", "in.tum.de")
 | |
| 341 |           .replace("informatik.tu-muenchen.de", "in.tum.de")
 | |
| 74935 | 342 |       if (s.startsWith("=") && s.endsWith("=")) "" else s
 | 
| 343 | } | |
| 74924 | 344 | |
| 345 | def make_body(lines: List[String]): String = | |
| 346 | cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1) | |
| 347 | ||
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 348 | private val main_url: Url = | 
| 74921 | 349 | Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/") | 
| 350 | ||
| 351 | private val main_html = Url.read(main_url) | |
| 352 | ||
| 75393 | 353 |     val list_name: String = {
 | 
| 74921 | 354 | val title = | 
| 355 | """<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(main_html).map(_.group(1)) | |
| 356 |       (proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name"))
 | |
| 357 | } | |
| 72558 | 358 | override def toString: String = list_name | 
| 359 | ||
| 74923 | 360 | def full_name(href: String): String = list_name + "/" + href | 
| 361 | ||
| 74921 | 362 | def list_tag: String = proper_string(tag).getOrElse(list_name) | 
| 363 | ||
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 364 | def read_text(href: String): String = Url.read(main_url.resolve(href)) | 
| 74921 | 365 | |
| 366 | def hrefs_text: List[String] = | |
| 367 | """href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(main_html).map(_.group(1)).toList | |
| 368 | ||
| 369 | def hrefs_msg: List[String] = | |
| 74906 | 370 |       (for {
 | 
| 74921 | 371 | href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(main_html).map(_.group(1)) | 
| 372 | html = read_text(href + "/date.html") | |
| 373 | msg <- message_regex.findAllMatchIn(html).map(_.group(1)) | |
| 74906 | 374 | } yield href + "/" + msg).toList | 
| 375 | ||
| 75393 | 376 |     def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] = {
 | 
| 72558 | 377 | val dir = target_dir + Path.basic(list_name) | 
| 74906 | 378 | val path = dir + Path.explode(href) | 
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 379 | val url = main_url.resolve(href) | 
| 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
79044diff
changeset | 380 | val connection = url.open_connection() | 
| 74906 | 381 |       try {
 | 
| 382 | val length = connection.getContentLengthLong | |
| 383 | val timestamp = connection.getLastModified | |
| 78958 | 384 | if (path.is_file && File.size(path) == length && path.file.lastModified == timestamp) None | 
| 74906 | 385 |         else {
 | 
| 386 | Isabelle_System.make_directory(path.dir) | |
| 387 |           progress.echo("Getting " + url)
 | |
| 388 | val bytes = | |
| 389 | using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024)) | |
| 78958 | 390 | Bytes.write(path, bytes) | 
| 391 | path.file.setLastModified(timestamp) | |
| 74906 | 392 | Some(path) | 
| 393 | } | |
| 394 | } | |
| 395 |       finally { connection.getInputStream.close() }
 | |
| 396 | } | |
| 72558 | 397 | |
| 74906 | 398 | def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] = | 
| 399 | hrefs_text.flatMap(get(target_dir, _, progress = progress)) | |
| 400 | ||
| 401 | def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 402 | hrefs_msg.flatMap(get(target_dir, _, progress = progress)) | |
| 74907 | 403 | |
| 404 | def download(target_dir: Path, progress: Progress = new Progress): List[Path] = | |
| 405 | download_text(target_dir, progress = progress) ::: | |
| 406 | download_msg(target_dir, progress = progress) | |
| 74921 | 407 | |
| 75393 | 408 |     def make_title(str: String): String = {
 | 
| 74921 | 409 |       val Trim1 = ("""\s*\Q[""" + list_tag + """]\E\s*(.*)""").r
 | 
| 410 | val Trim2 = """(?i:(?:re|fw|fwd)\s*:\s*)(.*)""".r | |
| 411 | val Trim3 = """\[\s*(.*?)\s*\]""".r | |
| 412 | @tailrec def trim(s: String): String = | |
| 413 |         s match {
 | |
| 414 | case Trim1(s1) => trim(s1) | |
| 415 | case Trim2(s1) => trim(s1) | |
| 416 | case Trim3(s1) => trim(s1) | |
| 417 | case _ => s | |
| 418 | } | |
| 419 | trim(str) | |
| 420 | } | |
| 421 | ||
| 422 | def get_messages(): List[Message] = | |
| 423 | for (href <- hrefs_msg) yield message_content(href, split_lines(read_text(href))) | |
| 424 | ||
| 75393 | 425 |     def find_messages(dir: Path): List[Message] = {
 | 
| 74921 | 426 |       for {
 | 
| 75906 
2167b9e3157a
clarified signature: support for adhoc file types;
 wenzelm parents: 
75393diff
changeset | 427 | file <- File.find_files(dir.file, file => File.is_html(file.getName)) | 
| 74921 | 428 | rel_path <- File.relative_path(dir, File.path(file)) | 
| 74923 | 429 | } | 
| 430 |       yield {
 | |
| 431 | val name = full_name(rel_path.implode) | |
| 432 | message_content(name, split_lines(File.read(file))) | |
| 433 | } | |
| 74921 | 434 | } | 
| 72558 | 435 | } | 
| 72559 | 436 | |
| 75393 | 437 |   private class Message_Chunk(bg: String = "", en: String = "") {
 | 
| 438 |     def unapply(lines: List[String]): Option[List[String]] = {
 | |
| 74932 | 439 | val res1 = | 
| 440 | if (bg.isEmpty) Some(lines) | |
| 441 |         else {
 | |
| 442 |           lines.dropWhile(_ != bg) match {
 | |
| 443 | case Nil => None | |
| 444 | case _ :: rest => Some(rest) | |
| 445 | } | |
| 446 | } | |
| 447 | if (en.isEmpty) res1 | |
| 448 |       else {
 | |
| 449 |         res1 match {
 | |
| 450 | case None => None | |
| 451 | case Some(lines1) => | |
| 452 | val lines2 = lines1.takeWhile(_ != en) | |
| 453 | if (lines1.drop(lines2.length).isEmpty) None else Some(lines2) | |
| 454 | } | |
| 455 | } | |
| 456 | } | |
| 457 | ||
| 458 | def get(lines: List[String]): List[String] = | |
| 459 | unapply(lines) getOrElse | |
| 77369 | 460 |         error("Missing delimiters:" + if_proper(bg, " ") + bg + if_proper(en, " ") + en)
 | 
| 74932 | 461 | } | 
| 462 | ||
| 72559 | 463 | |
| 74930 | 464 | /* isabelle-users mailing list */ | 
| 72559 | 465 | |
| 74921 | 466 | object Isabelle_Users extends Archive( | 
| 467 |     Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"),
 | |
| 75393 | 468 | name = "isabelle-users", tag = "isabelle" | 
| 469 |   ) {
 | |
| 74921 | 470 | override def message_regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r | 
| 471 | ||
| 472 | private object Head extends | |
| 74922 | 473 | Message_Chunk(bg = "<!--X-Head-of-Message-->", en = "<!--X-Head-of-Message-End-->") | 
| 74921 | 474 | |
| 475 | private object Body extends | |
| 74922 | 476 | Message_Chunk(bg = "<!--X-Body-of-Message-->", en = "<!--X-Body-of-Message-End-->") | 
| 477 | ||
| 75393 | 478 |     private object Date_Format {
 | 
| 74922 | 479 | private val Trim1 = """\w+,\s*(.*)""".r | 
| 480 | private val Trim2 = """(.*?)\s*\(.*\)""".r | |
| 481 | private val Format = | |
| 482 | Date.Format( | |
| 483 | "d MMM uuuu H:m:s Z", | |
| 484 | "d MMM uuuu H:m:s z", | |
| 485 | "d MMM yy H:m:s Z", | |
| 486 | "d MMM yy H:m:s z") | |
| 75393 | 487 |       def unapply(s: String): Option[Date] = {
 | 
| 74922 | 488 |         val s0 = s.replaceAll("""\s+""", " ")
 | 
| 489 | val s1 = | |
| 490 |           s0 match {
 | |
| 491 | case Trim1(s1) => s1 | |
| 492 | case _ => s0 | |
| 493 | } | |
| 494 | val s2 = | |
| 495 |           s1 match {
 | |
| 496 | case Trim2(s2) => s2 | |
| 497 | case _ => s1 | |
| 498 | } | |
| 499 | Format.unapply(s2) | |
| 500 | } | |
| 501 | } | |
| 74921 | 502 | |
| 75393 | 503 |     override def make_name(str: String): String = {
 | 
| 74935 | 504 |       val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str))
 | 
| 74925 | 505 | if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s | 
| 506 | } | |
| 507 | ||
| 75393 | 508 |     object Address {
 | 
| 74924 | 509 | private def anchor(s: String): String = """<a href="[^"]*">""" + s + """</a>""" | 
| 510 | private def angl(s: String): String = """<""" + s + """>""" | |
| 511 | private def quot(s: String): String = """"""" + s + """"""" | |
| 512 |       private def paren(s: String): String = """\(""" + s + """\)"""
 | |
| 513 | private val adr = """([^<>]*? at [^<>]*?)""" | |
| 514 | private val any = """([^<>]*?)""" | |
| 515 | private val spc = """\s*""" | |
| 516 | ||
| 517 | private val Name1 = quot(anchor(any)).r | |
| 518 | private val Name2 = quot(any).r | |
| 519 | private val Name_Adr1 = (quot(anchor(any)) + spc + angl(anchor(adr))).r | |
| 520 | private val Name_Adr2 = (quot(any) + spc + angl(anchor(adr))).r | |
| 521 | private val Name_Adr3 = (anchor(any) + spc + angl(anchor(adr))).r | |
| 522 | private val Name_Adr4 = (any + spc + angl(anchor(adr))).r | |
| 523 | private val Adr_Name1 = (angl(anchor(adr)) + spc + paren(any)).r | |
| 524 | private val Adr_Name2 = (anchor(adr) + spc + paren(any)).r | |
| 525 | private val Adr1 = angl(anchor(adr)).r | |
| 526 | private val Adr2 = anchor(adr).r | |
| 527 | ||
| 74936 | 528 | def parse(s: String): List[String] = | 
| 74924 | 529 |         s match {
 | 
| 74936 | 530 | case Name1(a) => List(a) | 
| 531 | case Name2(a) => List(a) | |
| 532 | case Name_Adr1(a, b) => List(a, b) | |
| 533 | case Name_Adr2(a, b) => List(a, b) | |
| 534 | case Name_Adr3(a, b) => List(a, b) | |
| 535 | case Name_Adr4(a, b) => List(a, b) | |
| 536 | case Adr_Name1(b, a) => List(a, b) | |
| 537 | case Adr_Name2(b, a) => List(a, b) | |
| 538 | case Adr1(a) => List(a) | |
| 539 | case Adr2(a) => List(a) | |
| 540 | case _ => Nil | |
| 74924 | 541 | } | 
| 542 | } | |
| 543 | ||
| 75393 | 544 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74921 | 545 | def err(msg: String = ""): Nothing = | 
| 77368 | 546 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74921 | 547 | |
| 548 | val (head, body) = | |
| 74924 | 549 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74921 | 550 |         catch { case ERROR(msg) => err(msg) }
 | 
| 72559 | 551 | |
| 74921 | 552 | val date = | 
| 553 | message_match(head, """<li><em>Date</em>:\s*(.*?)\s*</li>""".r) | |
| 554 | .map(m => HTML.input(m.group(1))) match | |
| 555 |         {
 | |
| 556 | case Some(Date_Format(d)) => d | |
| 557 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 558 |           case None => err("Missing Date")
 | |
| 559 | } | |
| 560 | ||
| 561 | val title = | |
| 74924 | 562 | make_title( | 
| 563 | HTML.input(message_match(head, """<li><em>Subject</em>:\s*(.*)</li>""".r) | |
| 564 |             .getOrElse(err("Missing Subject")).group(1)))
 | |
| 74921 | 565 | |
| 74938 | 566 | def parse_author_info(re: Regex): List[String] = | 
| 567 |         message_match(head, re) match {
 | |
| 568 | case None => Nil | |
| 569 | case Some(m) => Address.parse(m.group(1)).map(a => HTML.input(make_name(a))) | |
| 570 | } | |
| 571 | ||
| 74936 | 572 | val author_info = | 
| 74938 | 573 |         (parse_author_info("""<li><em>From</em>:\s*(.*?)\s*</li>""".r) :::
 | 
| 574 |           parse_author_info("""<li><em>Reply-to</em>:\s*(.*?)\s*</li>""".r))
 | |
| 575 | .distinct.filter(_.nonEmpty) | |
| 576 | ||
| 577 |       if (author_info.isEmpty) err("Malformed author information")
 | |
| 74921 | 578 | |
| 74934 | 579 | val tags = List(list_name) | 
| 580 | ||
| 74936 | 581 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74921 | 582 | } | 
| 583 | } | |
| 584 | ||
| 74930 | 585 | |
| 586 | /* isabelle-dev mailing list */ | |
| 587 | ||
| 75393 | 588 |   object Isabelle_Dev extends Archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev")) {
 | 
| 74921 | 589 | override def message_regex: Regex = """<LI><A HREF="(\d+\.html)">""".r | 
| 590 | ||
| 74922 | 591 | private object Head extends Message_Chunk(en = "<!--beginarticle-->") | 
| 592 | private object Body extends Message_Chunk(bg = "<!--beginarticle-->", en = "<!--endarticle-->") | |
| 593 | ||
| 75393 | 594 |     private object Date_Format {
 | 
| 74922 | 595 |       val Format = Date.Format("E MMM d H:m:s z uuuu")
 | 
| 596 |       def unapply(s: String): Option[Date] = Format.unapply(s.replaceAll("""\s+""", " "))
 | |
| 597 | } | |
| 598 | ||
| 75393 | 599 |     override def message_content(name: String, lines: List[String]): Message = {
 | 
| 74922 | 600 | def err(msg: String = ""): Nothing = | 
| 77368 | 601 |         error("Malformed message: " + name + if_proper(msg, "\n" + msg))
 | 
| 74922 | 602 | |
| 603 | val (head, body) = | |
| 74924 | 604 |         try { (Head.get(lines), make_body(Body.get(lines))) }
 | 
| 74922 | 605 |         catch { case ERROR(msg) => err(msg) }
 | 
| 606 | ||
| 607 | val date = | |
| 608 |         message_match(head, """\s*<I>(.*)</I>""".r).map(m => HTML.input(m.group(1))) match {
 | |
| 609 | case Some(Date_Format(d)) => d | |
| 610 |           case Some(s) => err("Malformed Date: " + quote(s))
 | |
| 611 |           case None => err("Missing Date")
 | |
| 612 | } | |
| 613 | ||
| 75393 | 614 |       val (title, author_address) = {
 | 
| 74922 | 615 |         message_match(head, """TITLE="([^"]+)">(.*)""".r) match {
 | 
| 74931 | 616 | case Some(m) => (make_title(HTML.input(m.group(1))), make_name(HTML.input(m.group(2)))) | 
| 74922 | 617 |           case None => err("Missing TITLE")
 | 
| 618 | } | |
| 619 | } | |
| 620 | ||
| 621 | val author_name = | |
| 622 |         message_match(head, """\s*<B>(.*)</B>""".r) match {
 | |
| 623 |           case None => err("Missing author")
 | |
| 74931 | 624 | case Some(m) => | 
| 625 | val a = make_name(HTML.input(m.group(1))) | |
| 626 | if (a == author_address) "" else a | |
| 74922 | 627 | } | 
| 628 | ||
| 74936 | 629 | val author_info = List(author_name, author_address) | 
| 74934 | 630 | val tags = List(list_name) | 
| 631 | ||
| 74936 | 632 | Message(name, date, title, standard_author_info(author_info), body, tags) | 
| 74922 | 633 | } | 
| 74921 | 634 | } | 
| 72558 | 635 | } |