author | wenzelm |
Wed, 12 Mar 2025 11:39:00 +0100 | |
changeset 82265 | 4b875a4c83b0 |
parent 81627 | 079dee3b117c |
permissions | -rw-r--r-- |
72558 | 1 |
/* Title: Pure/General/mailman.scala |
2 |
Author: Makarius |
|
3 |
||
74942 | 4 |
Support for Mailman list servers, notably isabelle-users and isabelle-dev. |
72558 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
9 |
||
74921 | 10 |
import scala.annotation.tailrec |
74906 | 11 |
import scala.util.matching.Regex |
74921 | 12 |
import scala.util.matching.Regex.Match |
74906 | 13 |
|
72558 | 14 |
|
75393 | 15 |
object Mailman { |
74921 | 16 |
/* mailing list messages */ |
72559 | 17 |
|
74935 | 18 |
def is_address(s: String): Boolean = |
19 |
s.contains('@') && s.contains('.') && !s.contains(' ') |
|
20 |
||
74936 | 21 |
private val standard_name: Map[String, String] = |
74935 | 22 |
Map( |
81627 | 23 |
"121171528:qq/com" -> "Guo Fan\n121171528:qq/com", |
74937 | 24 |
"Aman Pohjola, Johannes (Data61, Kensington NSW)" -> "Johannes Aman Pohjola", |
25 |
"Andrei de AraÃjo Formiga" -> "Andrei de Araujo Formiga", |
|
81627 | 26 |
"Benedikt/AHRENS:unice/fr" -> "Benedikt Ahrens\nBenedikt/AHRENS:unice/fr", |
74940 | 27 |
"Berg, Nils Erik" -> "Nils Erik Berg", |
81627 | 28 |
"Berger U/" -> "Ulrich Berger", |
74937 | 29 |
"Bisping, Benjamin" -> "Benjamin Bisping", |
81627 | 30 |
"Blanchette, J/C/" -> "Jasmin Christian Blanchette", |
74935 | 31 |
"Buday Gergely István" -> "Gergely Buday", |
81627 | 32 |
"CALaF1UJ9Uy0vGCu4WkBmbfuPDxG7nFm8hfeCMP+O3g7_5CQ0Bw:mail/gmail/com" -> "", |
33 |
"CRACIUN F/" -> "Florin Craciun", |
|
74937 | 34 |
"Carsten Schuermann" -> "Carsten Schürmann", |
74940 | 35 |
"Chris" -> "", |
74935 | 36 |
"Christoph Lueth" -> "Christoph Lüth", |
37 |
"Claude Marche" -> "Claude Marché", |
|
74937 | 38 |
"Daniel StÃwe" -> "Daniel Stüwe", |
81627 | 39 |
"Daniel/Matichuk:data61/csiro/au" -> "Daniel Matichuk\nDaniel/Matichuk:data61/csiro/au", |
40 |
"Daniel/Matichuk:nicta/com/au" -> "Daniel Matichuk\nDaniel/Matichuk:nicta/com/au", |
|
74937 | 41 |
"David MENTRE" -> "David MENTRÉ", |
74940 | 42 |
"Dey, Katie" -> "Katie Dey", |
81627 | 43 |
"Dr/ Brendan Patrick Mahony" -> "Brendan Mahony", |
74937 | 44 |
"Farn" -> "Farn Wang", |
74935 | 45 |
"Farquhar, Colin I" -> "Colin Farquhar", |
74940 | 46 |
"Fernandez, Matthew" -> "Matthew Fernandez", |
74937 | 47 |
"Filip Maric" -> "Filip Marić", |
48 |
"Filip MariÄ" -> "Filip Marić", |
|
49 |
"Fleury Mathias" -> "Mathias Fleury", |
|
74940 | 50 |
"Francisco Jose CHAVES ALONSO" -> "Francisco Jose Chaves Alonso", |
51 |
"Frederic Tuong (Dr)" -> "Frederic Tuong", |
|
52 |
"Fulya" -> "Fulya Horozal", |
|
81627 | 53 |
"George K/" -> "George Karabotsos", |
74940 | 54 |
"Gidon Ernst" -> "Gidon ERNST", |
55 |
"Gransden, Thomas" -> "Thomas Gransden", |
|
74937 | 56 |
"Hans-JÃrg Schurr" -> "Hans-Jörg Schurr", |
57 |
"Henri DEBRAT" -> "Henri Debrat", |
|
74935 | 58 |
"Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki", |
74940 | 59 |
"Häuselmann Rafael" -> "Rafael Häuselmann", |
74937 | 60 |
"Isabelle" -> "", |
81627 | 61 |
"J/ Juhas (TUM)" -> "Jonatan Juhas", |
74937 | 62 |
"Jackson, Vincent (Data61, Kensington NSW)" -> "Vincent Jackson", |
63 |
"Janney, Mark-P26816" -> "Mark Janney", |
|
74940 | 64 |
"Jean François Molderez" -> "Jean-François Molderez", |
65 |
"Jean-Francois Molderez" -> "Jean-François Molderez", |
|
66 |
"John R Harrison" -> "John Harrison", |
|
74937 | 67 |
"Jose DivasÃn" -> "Jose Divasón", |
68 |
"Julian" -> "", |
|
69 |
"Julien" -> "", |
|
70 |
"Klein, Gerwin (Data61, Kensington NSW)" -> "Gerwin Klein", |
|
71 |
"Kobayashi, Hidetsune" -> "Hidetsune Kobayashi", |
|
74940 | 72 |
"Kylie Williams (IND)" -> "Kylie Williams", |
81627 | 73 |
"Laarman, A/W/" -> "A/W/ Laarman", |
74937 | 74 |
"Laurent Thery" -> "Laurent Théry", |
74940 | 75 |
"Li, Chanjuan" -> "Li Chanjuan", |
74937 | 76 |
"Lochbihler Andreas" -> "Andreas Lochbihler", |
74940 | 77 |
"Luckhardt, Daniel" -> "Daniel Luckhardt", |
74937 | 78 |
"Lutz Schroeder" -> "Lutz Schröder", |
79 |
"Lutz SchrÃder" -> "Lutz Schröder", |
|
74940 | 80 |
"MACKENZIE Carlin" -> "Carlin MACKENZIE", |
74937 | 81 |
"Makarius" -> "Makarius Wenzel", |
82 |
"Marco" -> "", |
|
83 |
"Mark" -> "", |
|
74940 | 84 |
"Markus Mueller-Olm" -> "Markus Müller-Olm", |
74937 | 85 |
"Markus" -> "", |
74940 | 86 |
"Marmsoler, Diego" -> "Diego Marmsoler", |
74937 | 87 |
"Martin Klebermass" -> "Martin Klebermaß", |
74941 | 88 |
"Martyn Johnson via RT" -> "", |
81627 | 89 |
"Mathias/Fleury:ens-rennes/fr" -> "Mathias Fleury\nmathias/fleury12:gmail/com", |
74940 | 90 |
"Matthew" -> "", |
74937 | 91 |
"Matthews, John R" -> "John Matthews", |
74940 | 92 |
"McCarthy, Jim (C3ID)" -> "Jim McCarthy", |
93 |
"McCue, Brian" -> "Brian McCue", |
|
74937 | 94 |
"Michael FÃrber" -> "Michael Färber", |
74940 | 95 |
"Michel" -> "", |
96 |
"Miranda, Brando" -> "Brando Miranda", |
|
81627 | 97 |
"Moscato, Mariano M/ \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Mariano M/ Moscato", |
74940 | 98 |
"Mr Julian Fell" -> "Julian Fell", |
99 |
"Mueller Peter" -> "Peter Müller", |
|
81627 | 100 |
"Munoz, Cesar Augusto (LARC-D320)" -> "Cesar A/ Munoz", |
74940 | 101 |
"Nadel, Alexander" -> "Alexander Nadel", |
102 |
"Nagashima, Yutaka" -> "Yutaka Nagashima", |
|
74937 | 103 |
"Norrish, Michael (Data61, Acton)" -> "Michael Norrish", |
74940 | 104 |
"O'Leary, John W" -> "John W O'Leary", |
74937 | 105 |
"Omar Montano Rivas" -> "Omar Montaño Rivas", |
106 |
"Omar MontaÃo Rivas" -> "Omar Montaño Rivas", |
|
107 |
"OndÅej KunÄar" -> "Ondřej Kunčar", |
|
74941 | 108 |
"PALMER Jake" -> "Jake Palmer", |
74937 | 109 |
"PAQUI LUCIO" -> "Paqui Lucio", |
74940 | 110 |
"Pal, Abhik" -> "Abhik Pal", |
111 |
"Pasupuleti, Vijay" -> "Vijay Pasupuleti", |
|
81627 | 112 |
"Peter Vincent Homeier" -> "Peter V/ Homeier", |
74937 | 113 |
"Peter" -> "", |
114 |
"Philipp Ruemmer" -> "Philipp Rümmer", |
|
115 |
"Philipp RÃmmer" -> "Philipp Rümmer", |
|
74941 | 116 |
"Piete Brooks via RT" -> "", |
74940 | 117 |
"RTA publicity chair" -> "", |
81627 | 118 |
"Raamsdonk, F/ van" -> "Femke van Raamsdonk", |
74937 | 119 |
"Raul Gutierrez" -> "Raúl Gutiérrez", |
120 |
"Renà Thiemann" -> "René Thiemann", |
|
81627 | 121 |
"Ridgway, John V/ E/" -> "John V/ E/ Ridgway", |
122 |
"Roggenbach M/" -> "Markus Roggenbach", |
|
74940 | 123 |
"Rosu, Grigore" -> "Grigore Rosu", |
74937 | 124 |
"Rozman, Mihaela" -> "Mihaela Rozman", |
81627 | 125 |
"Schmaltz, J/" -> "Julien Schmaltz", |
126 |
"Serguei A/ Mokhov on behalf of PST-11" -> "Serguei A/ Mokhov", |
|
127 |
"Serguei Mokhov" -> "Serguei A/ Mokhov", |
|
74940 | 128 |
"Shumeiko, Igor" -> "Igor Shumeiko", |
129 |
"Siek, Jeremy" -> "Jeremy Siek", |
|
81627 | 130 |
"Silvio/Ranise:loria/fr" -> "Silvio Ranise\nSilvio/Ranise:loria/fr", |
74940 | 131 |
"Siu, Tony" -> "Tony Siu", |
74937 | 132 |
"Stüber, Sebastian" -> "Sebastian Stüber", |
133 |
"Thiemann, Rene" -> "René Thiemann", |
|
134 |
"Thiemann, René" -> "René Thiemann", |
|
74940 | 135 |
"Thomas Arthur Leck Sewell" -> "Thomas Sewell", |
74937 | 136 |
"Thomas Goethel" -> "Thomas Göthel", |
81627 | 137 |
"Thomas/Sewell:data61/csiro/au" -> "Thomas Sewell\nThomas/Sewell:data61/csiro/au", |
74941 | 138 |
"Tjark Weber via RT" -> "Tjark Weber", |
81627 | 139 |
"Toby/Murray:data61/csiro/au" -> "Toby Murray\nToby/Murray:data61/csiro/au", |
74935 | 140 |
"Urban, Christian" -> "Christian Urban", |
74940 | 141 |
"Ursula Eschbach" -> "", |
142 |
"Van Staden Stephan" -> "Stephan van Staden", |
|
74937 | 143 |
"Viktor Kuncak" -> "Viktor Kunčak", |
144 |
"Viorel Preoteasaa" -> "Viorel Preoteasa", |
|
74940 | 145 |
"Wickerson, John P" -> "John Wickerson", |
146 |
"Wong, Yat" -> "Yat Wong", |
|
147 |
"YAMADA, Akihisa" -> "Akihisa Yamada", |
|
74937 | 148 |
"YliÃs Falcone" -> "Yliès Falcone", |
74940 | 149 |
"amir mohajeri" -> "Amir Mohajeri", |
150 |
"aniello murano" -> "Aniello Murano", |
|
151 |
"barzan stefania" -> "Stefania Barzan", |
|
152 |
"benhamou" -> "Belaid Benhamou", |
|
153 |
"charmi panchal" -> "Charmi Panchal", |
|
154 |
"chen kun" -> "Chen Kun", |
|
155 |
"chunhan wu" -> "Chunhan Wu", |
|
156 |
"daniel de la concepción sáez" -> "Daniel de la Concepción Sáez", |
|
81627 | 157 |
"daniel/luckhardt:mathematik/uni-goettingen/de" -> "Logiker:gmx/net", |
74940 | 158 |
"david streader" -> "David Streader", |
81627 | 159 |
"eschbach:in/tum/de" -> "", |
160 |
"f/rabe:jacobs-university/de" -> "florian/rabe:fau/de", |
|
161 |
"florian:haftmann-online/de" -> "haftmann:in/tum/de", |
|
162 |
"fredegar:haftmann-online/de" -> "haftmann:in/tum/de", |
|
163 |
"gallais : ensl/org" -> "Guillaume Allais", |
|
74940 | 164 |
"geng chen" -> "Geng Chen", |
81627 | 165 |
"henning/seidler" -> "Henning Seidler", |
74937 | 166 |
"hkb" -> "Hidetsune Kobayashi", |
81627 | 167 |
"jobs-pm:inf/ethz/ch" -> "", |
168 |
"julien:RadboudUniversity" -> "", |
|
74940 | 169 |
"jun sun" -> "Jun Sun", |
81627 | 170 |
"jwang whu/edu/cn (jwang)" -> "jwang", |
74940 | 171 |
"kostas pouliasis" -> "Kostas Pouliasis", |
81627 | 172 |
"kristof/teichel:ptb/de" -> "Kristof Teichel\nkristof/teichel:ptb/de", |
74940 | 173 |
"lucas cavalcante" -> "Lucas Cavalcante", |
174 |
"mahmoud abdelazim" -> "Mahmoud Abdelazim", |
|
175 |
"manish surolia" -> "Manish Surolia", |
|
74937 | 176 |
"mantel" -> "Heiko Mantel", |
74940 | 177 |
"marco caminati" -> "Marco Caminati", |
81627 | 178 |
"mathias/fleury:ens-rennes/fr" -> "Mathias Fleury\nmathias/fleury12:gmail/com", |
179 |
"merz:loria/fr" -> "stephan/merz:loria/fr", |
|
74940 | 180 |
"michel levy" -> "Michel Levy", |
81627 | 181 |
"michel/levy2009:laposte/net" -> "Michel Levy\nmichel/levy2009:laposte/net", |
74937 | 182 |
"nemouchi" -> "Yakoub Nemouchi", |
74941 | 183 |
"noam neer" -> "Noam Neer", |
184 |
"olfa mraihi" -> "Olfa Mraihi", |
|
81627 | 185 |
"pathsnottakenworkshop:gmail/com" -> "Leo Freitas\nleo/freitas:newcastle/ac/uk", |
74940 | 186 |
"patrick barlatier" -> "Patrick Barlatier", |
187 |
"patrick dabou" -> "Patrick Dabou", |
|
188 |
"paul zimmermann" -> "Paul Zimmermann", |
|
81627 | 189 |
"popescu2:illinois/edu" -> "Andrei Popescu", |
74940 | 190 |
"recruiting" -> "", |
81627 | 191 |
"recruiting:mais/informatik/tu-darmstadt/de" -> "", |
74940 | 192 |
"roux cody" -> "Cody Roux", |
193 |
"scott constable" -> "Scott Constable", |
|
81627 | 194 |
"superuser:mattweidner/com" -> "Matthew Weidner\nsuperuser:mattweidner/com", |
195 |
"urban:math/lmu/de" -> "Christian Urban\nurban:math/lmu/de", |
|
196 |
"veronique/cortier:loria/fr" -> "Veronique/Cortier:loria/fr", |
|
74940 | 197 |
"vikram singh" -> "Vikram Singh", |
81627 | 198 |
"wenzelm:in/tum/de" -> "makarius:sketis/net", |
199 |
"werner:lix/polytechnique/fr" -> "Benjamin Werner\nwerner:lix/polytechnique/fr", |
|
200 |
"wmansky:cs/princeton/edu" -> "William Mansky\nwmansky:cs/princeton/edu", |
|
201 |
"y/nemouchi:ensbiotech/edu/dz" -> "Yakoub Nemouchi\ny/nemouchi:ensbiotech/edu/dz", |
|
74937 | 202 |
"ÐÑÐÐÐÑÐÐÐ ÐÐÐÐÐÐÐÑÐÐÐÑ ÐÐÐÑÐÐÐ" -> "", |
81627 | 203 |
"∀X/Xπ - Tutorials about Proofs" -> "Bruno Woltzenlogel Paleo", |
204 |
) |
|
205 |
||
206 |
private def tune(s: String): String = |
|
207 |
s.replace(64.toChar, 58.toChar).replace(46.toChar, 47.toChar) |
|
208 |
||
209 |
private def untune(s: String): String = |
|
210 |
s.replace(58.toChar, 64.toChar).replace(47.toChar, 46.toChar) |
|
74935 | 211 |
|
74936 | 212 |
def standard_author_info(author_info: List[String]): List[String] = |
81627 | 213 |
author_info.flatMap(s => |
214 |
split_lines(standard_name.get(tune(s)).map(untune).getOrElse(s))).distinct |
|
74936 | 215 |
|
74921 | 216 |
sealed case class Message( |
217 |
name: String, |
|
218 |
date: Date, |
|
219 |
title: String, |
|
74936 | 220 |
author_info: List[String], |
74934 | 221 |
body: String, |
75393 | 222 |
tags: List[String] |
223 |
) { |
|
74936 | 224 |
if (author_info.isEmpty || author_info.exists(_.isEmpty)) { |
225 |
error("Bad author information in " + quote(name)) |
|
74935 | 226 |
} |
227 |
||
74925 | 228 |
override def toString: String = name |
74935 | 229 |
} |
74906 | 230 |
|
75393 | 231 |
object Messages { |
74935 | 232 |
type Graph = isabelle.Graph[String, Message] |
233 |
||
75393 | 234 |
def apply(msgs: List[Message]): Messages = { |
74936 | 235 |
def make_node(g: Graph, node: String, msg: Message): Graph = |
236 |
if (g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g |
|
74935 | 237 |
else g.default_node(node, msg) |
238 |
||
74936 | 239 |
def connect_nodes(g: Graph, nodes: List[String]): Graph = |
240 |
nodes match { |
|
241 |
case Nil => g |
|
242 |
case a :: bs => bs.foldLeft(g)({ case (g1, b) => g1.add_edge(a, b).add_edge(b, a) }) |
|
243 |
} |
|
244 |
||
74935 | 245 |
new Messages(msgs.sortBy(_.date)(Date.Ordering), |
246 |
msgs.foldLeft[Graph](Graph.string)( |
|
74936 | 247 |
{ case (graph, msg) => |
248 |
val nodes = msg.author_info |
|
249 |
connect_nodes(nodes.foldLeft(graph)(make_node(_, _, msg)), nodes) |
|
74935 | 250 |
})) |
251 |
} |
|
74934 | 252 |
|
75393 | 253 |
def find(dir: Path): Messages = { |
74934 | 254 |
val msgs = |
255 |
for { |
|
256 |
archive <- List(Isabelle_Users, Isabelle_Dev) |
|
257 |
msg <- archive.find_messages(dir + Path.basic(archive.list_name)) |
|
258 |
} yield msg |
|
259 |
Messages(msgs) |
|
260 |
} |
|
74935 | 261 |
|
75393 | 262 |
sealed case class Cluster(author_info: List[String]) { |
74936 | 263 |
val (addresses, names) = author_info.partition(is_address) |
264 |
||
74935 | 265 |
val name: String = |
266 |
names.headOption getOrElse { |
|
267 |
addresses match { |
|
268 |
case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ') |
|
269 |
case Nil => error("Empty cluster") |
|
270 |
} |
|
271 |
} |
|
272 |
||
74940 | 273 |
val name_lowercase: String = Word.lowercase(name) |
274 |
||
74935 | 275 |
def get_address: Option[String] = addresses.headOption |
276 |
||
277 |
def unique: Boolean = names.length == 1 && addresses.length == 1 |
|
278 |
def multi: Boolean = names.length > 1 || addresses.length > 1 |
|
279 |
||
75393 | 280 |
def print: String = { |
74935 | 281 |
val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses |
282 |
entries.mkString("\n * ", "\n ", "") |
|
283 |
} |
|
284 |
} |
|
74933 | 285 |
} |
286 |
||
75393 | 287 |
class Messages private(val sorted: List[Message], val graph: Messages.Graph) { |
74935 | 288 |
override def toString: String = "Messages(" + sorted.size + ")" |
74925 | 289 |
|
75393 | 290 |
object Node_Ordering extends scala.math.Ordering[String] { |
74935 | 291 |
override def compare(a: String, b: String): Int = |
292 |
Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date) |
|
293 |
} |
|
294 |
||
295 |
def get_cluster(msg: Message): Messages.Cluster = |
|
74936 | 296 |
Messages.Cluster(graph.all_succs(msg.author_info).sorted.sorted(Node_Ordering)) |
74935 | 297 |
|
298 |
def get_name(msg: Message): String = get_cluster(msg).name |
|
74925 | 299 |
|
74935 | 300 |
def get_address(msg: Message): String = |
301 |
get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name)) |
|
302 |
||
75393 | 303 |
def check(check_all: Boolean, check_multi: Boolean = false): Unit = { |
74940 | 304 |
val clusters = sorted.map(get_cluster).distinct.sortBy(_.name_lowercase) |
74935 | 305 |
|
74937 | 306 |
if (check_all) { |
307 |
Output.writeln(cat_lines("clusters:" :: clusters.map(_.print))) |
|
308 |
} |
|
309 |
else { |
|
310 |
val multi = if (check_multi) clusters.filter(_.multi) else Nil |
|
311 |
if (multi.nonEmpty) { |
|
312 |
Output.writeln(cat_lines("ambiguous clusters:" :: multi.map(_.print))) |
|
313 |
} |
|
74925 | 314 |
} |
74933 | 315 |
|
74935 | 316 |
val unknown = clusters.filter(cluster => cluster.get_address.isEmpty) |
317 |
if (unknown.nonEmpty) { |
|
74937 | 318 |
Output.writeln(cat_lines("\nunknown mail:" :: unknown.map(_.print))) |
74933 | 319 |
} |
74925 | 320 |
} |
321 |
} |
|
322 |
||
323 |
||
74921 | 324 |
/* mailing list archives */ |
325 |
||
326 |
abstract class Archive( |
|
79510
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents:
79044
diff
changeset
|
327 |
url: Url, |
74921 | 328 |
name: String = "", |
75393 | 329 |
tag: String = "" |
330 |
) { |
|
74921 | 331 |
def message_regex: Regex |
74923 | 332 |
def message_content(name: String, lines: List[String]): Message |
74921 | 333 |
|
74924 | 334 |
def message_match(lines: List[String], re: Regex): Option[Match] = |
335 |
lines.flatMap(re.findFirstMatchIn(_)).headOption |
|
336 |
||
75393 | 337 |
def make_name(str: String): String = { |
74937 | 338 |
val s = |
339 |
str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@") |
|
340 |
.replace("mailbroy.informatik.tu-muenchen.de", "in.tum.de") |
|
341 |
.replace("informatik.tu-muenchen.de", "in.tum.de") |
|
74935 | 342 |
if (s.startsWith("=") && s.endsWith("=")) "" else s |
343 |
} |
|
74924 | 344 |
|
345 |
def make_body(lines: List[String]): String = |
|
346 |
cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1) |
|
347 |
||
79510
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents:
79044
diff
changeset
|
348 |
private val main_url: Url = |
74921 | 349 |
Url(Library.take_suffix[Char](_ == '/', Url.trim_index(url).toString.toList)._1.mkString + "/") |
350 |
||
351 |
private val main_html = Url.read(main_url) |
|
352 |
||
75393 | 353 |
val list_name: String = { |
74921 | 354 |
val title = |
355 |
"""<title>The ([^</>]*) Archives</title>""".r.findFirstMatchIn(main_html).map(_.group(1)) |
|
356 |
(proper_string(name) orElse title).getOrElse(error("Failed to determine mailing list name")) |
|
357 |
} |
|
72558 | 358 |
override def toString: String = list_name |
359 |
||
74923 | 360 |
def full_name(href: String): String = list_name + "/" + href |
361 |
||
74921 | 362 |
def list_tag: String = proper_string(tag).getOrElse(list_name) |
363 |
||
79510
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents:
79044
diff
changeset
|
364 |
def read_text(href: String): String = Url.read(main_url.resolve(href)) |
74921 | 365 |
|
366 |
def hrefs_text: List[String] = |
|
367 |
"""href="([^"]+\.txt(?:\.gz)?)"""".r.findAllMatchIn(main_html).map(_.group(1)).toList |
|
368 |
||
369 |
def hrefs_msg: List[String] = |
|
74906 | 370 |
(for { |
74921 | 371 |
href <- """href="([^"]+)/date.html"""".r.findAllMatchIn(main_html).map(_.group(1)) |
372 |
html = read_text(href + "/date.html") |
|
373 |
msg <- message_regex.findAllMatchIn(html).map(_.group(1)) |
|
74906 | 374 |
} yield href + "/" + msg).toList |
375 |
||
75393 | 376 |
def get(target_dir: Path, href: String, progress: Progress = new Progress): Option[Path] = { |
72558 | 377 |
val dir = target_dir + Path.basic(list_name) |
74906 | 378 |
val path = dir + Path.explode(href) |
79510
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents:
79044
diff
changeset
|
379 |
val url = main_url.resolve(href) |
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
wenzelm
parents:
79044
diff
changeset
|
380 |
val connection = url.open_connection() |
74906 | 381 |
try { |
382 |
val length = connection.getContentLengthLong |
|
383 |
val timestamp = connection.getLastModified |
|
78958 | 384 |
if (path.is_file && File.size(path) == length && path.file.lastModified == timestamp) None |
74906 | 385 |
else { |
386 |
Isabelle_System.make_directory(path.dir) |
|
387 |
progress.echo("Getting " + url) |
|
388 |
val bytes = |
|
389 |
using(connection.getInputStream)(Bytes.read_stream(_, hint = length.toInt max 1024)) |
|
78958 | 390 |
Bytes.write(path, bytes) |
391 |
path.file.setLastModified(timestamp) |
|
74906 | 392 |
Some(path) |
393 |
} |
|
394 |
} |
|
395 |
finally { connection.getInputStream.close() } |
|
396 |
} |
|
72558 | 397 |
|
74906 | 398 |
def download_text(target_dir: Path, progress: Progress = new Progress): List[Path] = |
399 |
hrefs_text.flatMap(get(target_dir, _, progress = progress)) |
|
400 |
||
401 |
def download_msg(target_dir: Path, progress: Progress = new Progress): List[Path] = |
|
402 |
hrefs_msg.flatMap(get(target_dir, _, progress = progress)) |
|
74907 | 403 |
|
404 |
def download(target_dir: Path, progress: Progress = new Progress): List[Path] = |
|
405 |
download_text(target_dir, progress = progress) ::: |
|
406 |
download_msg(target_dir, progress = progress) |
|
74921 | 407 |
|
75393 | 408 |
def make_title(str: String): String = { |
74921 | 409 |
val Trim1 = ("""\s*\Q[""" + list_tag + """]\E\s*(.*)""").r |
410 |
val Trim2 = """(?i:(?:re|fw|fwd)\s*:\s*)(.*)""".r |
|
411 |
val Trim3 = """\[\s*(.*?)\s*\]""".r |
|
412 |
@tailrec def trim(s: String): String = |
|
413 |
s match { |
|
414 |
case Trim1(s1) => trim(s1) |
|
415 |
case Trim2(s1) => trim(s1) |
|
416 |
case Trim3(s1) => trim(s1) |
|
417 |
case _ => s |
|
418 |
} |
|
419 |
trim(str) |
|
420 |
} |
|
421 |
||
422 |
def get_messages(): List[Message] = |
|
423 |
for (href <- hrefs_msg) yield message_content(href, split_lines(read_text(href))) |
|
424 |
||
75393 | 425 |
def find_messages(dir: Path): List[Message] = { |
74921 | 426 |
for { |
75906
2167b9e3157a
clarified signature: support for adhoc file types;
wenzelm
parents:
75393
diff
changeset
|
427 |
file <- File.find_files(dir.file, file => File.is_html(file.getName)) |
74921 | 428 |
rel_path <- File.relative_path(dir, File.path(file)) |
74923 | 429 |
} |
430 |
yield { |
|
431 |
val name = full_name(rel_path.implode) |
|
432 |
message_content(name, split_lines(File.read(file))) |
|
433 |
} |
|
74921 | 434 |
} |
72558 | 435 |
} |
72559 | 436 |
|
75393 | 437 |
private class Message_Chunk(bg: String = "", en: String = "") { |
438 |
def unapply(lines: List[String]): Option[List[String]] = { |
|
74932 | 439 |
val res1 = |
440 |
if (bg.isEmpty) Some(lines) |
|
441 |
else { |
|
442 |
lines.dropWhile(_ != bg) match { |
|
443 |
case Nil => None |
|
444 |
case _ :: rest => Some(rest) |
|
445 |
} |
|
446 |
} |
|
447 |
if (en.isEmpty) res1 |
|
448 |
else { |
|
449 |
res1 match { |
|
450 |
case None => None |
|
451 |
case Some(lines1) => |
|
452 |
val lines2 = lines1.takeWhile(_ != en) |
|
453 |
if (lines1.drop(lines2.length).isEmpty) None else Some(lines2) |
|
454 |
} |
|
455 |
} |
|
456 |
} |
|
457 |
||
458 |
def get(lines: List[String]): List[String] = |
|
459 |
unapply(lines) getOrElse |
|
77369 | 460 |
error("Missing delimiters:" + if_proper(bg, " ") + bg + if_proper(en, " ") + en) |
74932 | 461 |
} |
462 |
||
72559 | 463 |
|
74930 | 464 |
/* isabelle-users mailing list */ |
72559 | 465 |
|
74921 | 466 |
object Isabelle_Users extends Archive( |
467 |
Url("https://lists.cam.ac.uk/pipermail/cl-isabelle-users"), |
|
75393 | 468 |
name = "isabelle-users", tag = "isabelle" |
469 |
) { |
|
74921 | 470 |
override def message_regex: Regex = """<li><a name="\d+" href="(msg\d+\.html)">""".r |
471 |
||
472 |
private object Head extends |
|
74922 | 473 |
Message_Chunk(bg = "<!--X-Head-of-Message-->", en = "<!--X-Head-of-Message-End-->") |
74921 | 474 |
|
475 |
private object Body extends |
|
74922 | 476 |
Message_Chunk(bg = "<!--X-Body-of-Message-->", en = "<!--X-Body-of-Message-End-->") |
477 |
||
75393 | 478 |
private object Date_Format { |
74922 | 479 |
private val Trim1 = """\w+,\s*(.*)""".r |
480 |
private val Trim2 = """(.*?)\s*\(.*\)""".r |
|
481 |
private val Format = |
|
482 |
Date.Format( |
|
483 |
"d MMM uuuu H:m:s Z", |
|
484 |
"d MMM uuuu H:m:s z", |
|
485 |
"d MMM yy H:m:s Z", |
|
486 |
"d MMM yy H:m:s z") |
|
75393 | 487 |
def unapply(s: String): Option[Date] = { |
74922 | 488 |
val s0 = s.replaceAll("""\s+""", " ") |
489 |
val s1 = |
|
490 |
s0 match { |
|
491 |
case Trim1(s1) => s1 |
|
492 |
case _ => s0 |
|
493 |
} |
|
494 |
val s2 = |
|
495 |
s1 match { |
|
496 |
case Trim2(s2) => s2 |
|
497 |
case _ => s1 |
|
498 |
} |
|
499 |
Format.unapply(s2) |
|
500 |
} |
|
501 |
} |
|
74921 | 502 |
|
75393 | 503 |
override def make_name(str: String): String = { |
74935 | 504 |
val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str)) |
74925 | 505 |
if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s |
506 |
} |
|
507 |
||
75393 | 508 |
object Address { |
74924 | 509 |
private def anchor(s: String): String = """<a href="[^"]*">""" + s + """</a>""" |
510 |
private def angl(s: String): String = """<""" + s + """>""" |
|
511 |
private def quot(s: String): String = """"""" + s + """"""" |
|
512 |
private def paren(s: String): String = """\(""" + s + """\)""" |
|
513 |
private val adr = """([^<>]*? at [^<>]*?)""" |
|
514 |
private val any = """([^<>]*?)""" |
|
515 |
private val spc = """\s*""" |
|
516 |
||
517 |
private val Name1 = quot(anchor(any)).r |
|
518 |
private val Name2 = quot(any).r |
|
519 |
private val Name_Adr1 = (quot(anchor(any)) + spc + angl(anchor(adr))).r |
|
520 |
private val Name_Adr2 = (quot(any) + spc + angl(anchor(adr))).r |
|
521 |
private val Name_Adr3 = (anchor(any) + spc + angl(anchor(adr))).r |
|
522 |
private val Name_Adr4 = (any + spc + angl(anchor(adr))).r |
|
523 |
private val Adr_Name1 = (angl(anchor(adr)) + spc + paren(any)).r |
|
524 |
private val Adr_Name2 = (anchor(adr) + spc + paren(any)).r |
|
525 |
private val Adr1 = angl(anchor(adr)).r |
|
526 |
private val Adr2 = anchor(adr).r |
|
527 |
||
74936 | 528 |
def parse(s: String): List[String] = |
74924 | 529 |
s match { |
74936 | 530 |
case Name1(a) => List(a) |
531 |
case Name2(a) => List(a) |
|
532 |
case Name_Adr1(a, b) => List(a, b) |
|
533 |
case Name_Adr2(a, b) => List(a, b) |
|
534 |
case Name_Adr3(a, b) => List(a, b) |
|
535 |
case Name_Adr4(a, b) => List(a, b) |
|
536 |
case Adr_Name1(b, a) => List(a, b) |
|
537 |
case Adr_Name2(b, a) => List(a, b) |
|
538 |
case Adr1(a) => List(a) |
|
539 |
case Adr2(a) => List(a) |
|
540 |
case _ => Nil |
|
74924 | 541 |
} |
542 |
} |
|
543 |
||
75393 | 544 |
override def message_content(name: String, lines: List[String]): Message = { |
74921 | 545 |
def err(msg: String = ""): Nothing = |
77368 | 546 |
error("Malformed message: " + name + if_proper(msg, "\n" + msg)) |
74921 | 547 |
|
548 |
val (head, body) = |
|
74924 | 549 |
try { (Head.get(lines), make_body(Body.get(lines))) } |
74921 | 550 |
catch { case ERROR(msg) => err(msg) } |
72559 | 551 |
|
74921 | 552 |
val date = |
553 |
message_match(head, """<li><em>Date</em>:\s*(.*?)\s*</li>""".r) |
|
554 |
.map(m => HTML.input(m.group(1))) match |
|
555 |
{ |
|
556 |
case Some(Date_Format(d)) => d |
|
557 |
case Some(s) => err("Malformed Date: " + quote(s)) |
|
558 |
case None => err("Missing Date") |
|
559 |
} |
|
560 |
||
561 |
val title = |
|
74924 | 562 |
make_title( |
563 |
HTML.input(message_match(head, """<li><em>Subject</em>:\s*(.*)</li>""".r) |
|
564 |
.getOrElse(err("Missing Subject")).group(1))) |
|
74921 | 565 |
|
74938 | 566 |
def parse_author_info(re: Regex): List[String] = |
567 |
message_match(head, re) match { |
|
568 |
case None => Nil |
|
569 |
case Some(m) => Address.parse(m.group(1)).map(a => HTML.input(make_name(a))) |
|
570 |
} |
|
571 |
||
74936 | 572 |
val author_info = |
74938 | 573 |
(parse_author_info("""<li><em>From</em>:\s*(.*?)\s*</li>""".r) ::: |
574 |
parse_author_info("""<li><em>Reply-to</em>:\s*(.*?)\s*</li>""".r)) |
|
575 |
.distinct.filter(_.nonEmpty) |
|
576 |
||
577 |
if (author_info.isEmpty) err("Malformed author information") |
|
74921 | 578 |
|
74934 | 579 |
val tags = List(list_name) |
580 |
||
74936 | 581 |
Message(name, date, title, standard_author_info(author_info), body, tags) |
74921 | 582 |
} |
583 |
} |
|
584 |
||
74930 | 585 |
|
586 |
/* isabelle-dev mailing list */ |
|
587 |
||
75393 | 588 |
object Isabelle_Dev extends Archive(Url("https://mailmanbroy.in.tum.de/pipermail/isabelle-dev")) { |
74921 | 589 |
override def message_regex: Regex = """<LI><A HREF="(\d+\.html)">""".r |
590 |
||
74922 | 591 |
private object Head extends Message_Chunk(en = "<!--beginarticle-->") |
592 |
private object Body extends Message_Chunk(bg = "<!--beginarticle-->", en = "<!--endarticle-->") |
|
593 |
||
75393 | 594 |
private object Date_Format { |
74922 | 595 |
val Format = Date.Format("E MMM d H:m:s z uuuu") |
596 |
def unapply(s: String): Option[Date] = Format.unapply(s.replaceAll("""\s+""", " ")) |
|
597 |
} |
|
598 |
||
75393 | 599 |
override def message_content(name: String, lines: List[String]): Message = { |
74922 | 600 |
def err(msg: String = ""): Nothing = |
77368 | 601 |
error("Malformed message: " + name + if_proper(msg, "\n" + msg)) |
74922 | 602 |
|
603 |
val (head, body) = |
|
74924 | 604 |
try { (Head.get(lines), make_body(Body.get(lines))) } |
74922 | 605 |
catch { case ERROR(msg) => err(msg) } |
606 |
||
607 |
val date = |
|
608 |
message_match(head, """\s*<I>(.*)</I>""".r).map(m => HTML.input(m.group(1))) match { |
|
609 |
case Some(Date_Format(d)) => d |
|
610 |
case Some(s) => err("Malformed Date: " + quote(s)) |
|
611 |
case None => err("Missing Date") |
|
612 |
} |
|
613 |
||
75393 | 614 |
val (title, author_address) = { |
74922 | 615 |
message_match(head, """TITLE="([^"]+)">(.*)""".r) match { |
74931 | 616 |
case Some(m) => (make_title(HTML.input(m.group(1))), make_name(HTML.input(m.group(2)))) |
74922 | 617 |
case None => err("Missing TITLE") |
618 |
} |
|
619 |
} |
|
620 |
||
621 |
val author_name = |
|
622 |
message_match(head, """\s*<B>(.*)</B>""".r) match { |
|
623 |
case None => err("Missing author") |
|
74931 | 624 |
case Some(m) => |
625 |
val a = make_name(HTML.input(m.group(1))) |
|
626 |
if (a == author_address) "" else a |
|
74922 | 627 |
} |
628 |
||
74936 | 629 |
val author_info = List(author_name, author_address) |
74934 | 630 |
val tags = List(list_name) |
631 |
||
74936 | 632 |
Message(name, date, title, standard_author_info(author_info), body, tags) |
74922 | 633 |
} |
74921 | 634 |
} |
72558 | 635 |
} |