more data integrity: name vs. address;
authorwenzelm
Tue, 14 Dec 2021 17:22:40 +0100
changeset 74935 dc62948c6080
parent 74934 f5ad3214bef4
child 74936 4c166d510b65
more data integrity: name vs. address;
src/Pure/General/mailman.scala
--- a/src/Pure/General/mailman.scala	Tue Dec 14 10:19:48 2021 +0100
+++ b/src/Pure/General/mailman.scala	Tue Dec 14 17:22:40 2021 +0100
@@ -18,6 +18,26 @@
 {
   /* mailing list messages */
 
+  def is_address(s: String): Boolean =
+    s.contains('@') && s.contains('.') && !s.contains(' ')
+
+  val standard_name: Map[String, String] =
+    Map(
+      "Blanchette, J.C." -> "Jasmin Christian Blanchette",
+      "Buday Gergely István" -> "Gergely Buday",
+      "Carsten Schuermann" -> "Carsten Schürman",
+      "Christoph Lueth" -> "Christoph Lüth",
+      "Claude Marche" -> "Claude Marché",
+      "Farquhar, Colin I" -> "Colin Farquhar",
+      "George K." -> "George Karabotsos",
+      "Filip MariÄ" -> "Filip Marić",
+      "Hitoshi Ohsaki (RTA publicity chair)" -> "Hitoshi Ohsaki",
+      "Farn" -> "Farn Wang",
+      "Fleury Mathias" -> "Mathias Fleury",
+      "Urban, Christian" -> "Christian Urban",
+      "Moscato, Mariano M. \\(LARC-D320\\)\\[NATIONAL INSTITUTE OF AEROSPACE\\]" -> "Moscato, Mariano M. (LARC-D320)",
+    ).withDefault(identity)
+
   sealed case class Message(
     name: String,
     date: Date,
@@ -27,18 +47,47 @@
     body: String,
     tags: List[String])
   {
+    if (author_name.nonEmpty && is_address(author_name)) {
+      Output.warning("Bad author name, looks like mail address: " +
+        quote(author_name) + " in " + quote(name))
+    }
+
+    if (author_address.nonEmpty && !is_address(author_address)) {
+      error("Bad mail address: " + quote(author_address) + " in " + quote(name))
+    }
+
     override def toString: String = name
+
+    def nodes: List[String] =
+      proper_string(author_name).toList :::
+      proper_string(author_address).toList
+
     def print: String =
       name + "\n" + date + "\n" + title + "\n" +
       quote(author_name) + "\n" + "<" + author_address + ">\n"
-
-    def normal_address: String = Word.lowercase(author_address)
-  }
+   }
 
   object Messages
   {
+    type Graph = isabelle.Graph[String, Message]
+
     def apply(msgs: List[Message]): Messages =
-      new Messages(msgs.sortBy(_.date)(Date.Ordering))
+    {
+      def update_node(g: Graph, node: String, msg: Message): Graph =
+        if (node.isEmpty ||
+            g.defined(node) && Date.Ordering.compare(g.get_node(node).date, msg.date) >= 0) g
+        else g.default_node(node, msg)
+
+      new Messages(msgs.sortBy(_.date)(Date.Ordering),
+        msgs.foldLeft[Graph](Graph.string)(
+          { case (g, msg) =>
+              val a = msg.author_name
+              val b = msg.author_address
+              val g1 = update_node(update_node(g, a, msg), b, msg)
+              if (a.nonEmpty && b.nonEmpty) g1.add_edge(a, b).add_edge(b, a)
+              else g1
+          }))
+    }
 
     def find(dir: Path): Messages =
     {
@@ -49,51 +98,66 @@
         } yield msg
       Messages(msgs)
     }
+
+    sealed case class Cluster(names: List[String], addresses: List[String])
+    {
+      val name: String =
+        names.headOption getOrElse {
+          addresses match {
+            case a :: _ => a.substring(0, a.indexOf('@')).replace('.', ' ')
+            case Nil => error("Empty cluster")
+          }
+        }
+
+      def get_address: Option[String] = addresses.headOption
+
+      def unique: Boolean = names.length == 1 && addresses.length == 1
+      def multi: Boolean = names.length > 1 || addresses.length > 1
+
+      def print: String =
+      {
+        val entries = names ::: (if (addresses.isEmpty) Nil else List("")) ::: addresses
+        entries.mkString("\n  * ", "\n    ", "")
+      }
+    }
   }
 
-  class Messages private(val sorted: List[Message])
+  class Messages private(val sorted: List[Message], val graph: Messages.Graph)
   {
-    override def toString: String = "Messages(" + sorted.length + ")"
+    override def toString: String = "Messages(" + sorted.size + ")"
 
-    def check(): Unit =
+    object Node_Ordering extends scala.math.Ordering[String]
     {
-      val proper = sorted.filter(msg => msg.author_name.nonEmpty && msg.author_address.nonEmpty)
-      val address_name = Multi_Map.from(proper.map(msg => (msg.normal_address, msg.author_name)))
-      val name_address = Multi_Map.from(proper.map(msg => (msg.author_name, msg.normal_address)))
+      override def compare(a: String, b: String): Int =
+        Date.Rev_Ordering.compare(graph.get_node(a).date, graph.get_node(b).date)
+    }
+
+    def get_nodes(msg: Message): List[String] =
+      graph.all_succs(msg.nodes).sorted(Node_Ordering)
 
-      def print_dup(a: String, bs: List[String]): String =
-        "  * " + a + bs.mkString("\n      ", "\n      ", "")
+    def get_cluster(msg: Message): Messages.Cluster =
+    {
+      val (addresses, names) = get_nodes(msg).partition(is_address)
+      Messages.Cluster(names, addresses)
+    }
+
+    def get_name(msg: Message): String = get_cluster(msg).name
 
-      def print_dups(title: String, m: Multi_Map[String, String]): Unit =
-      {
-        val dups = m.iterator_list.filter(p => p._2.length > 1).toList
-        if (dups.nonEmpty) {
-          Output.writeln(cat_lines(title :: dups.map(p => print_dup(p._1, p._2))))
-        }
+    def get_address(msg: Message): String =
+      get_cluster(msg).get_address getOrElse error("No author address for " + quote(msg.name))
+
+    def check(check_multi: Boolean = false): Unit =
+    {
+      val clusters = sorted.map(get_cluster).distinct.sortBy(_.name)
+
+      val multi = if (check_multi) clusters.filter(_.multi) else Nil
+      if (multi.nonEmpty) {
+        Output.writeln(cat_lines("ambiguous:" :: multi.map(_.print)))
       }
 
-      print_dups("\nduplicate names:", address_name)
-      print_dups("\nduplicate addresses:", name_address)
-
-      def get_name(msg: Message): Option[String] =
-        proper_string(msg.author_name) orElse address_name.get(msg.author_address)
-
-      val missing_name =
-        sorted.flatMap(msg =>
-          if (get_name(msg).isEmpty) Some(proper_string(msg.author_address).getOrElse(msg.name))
-          else None).toSet.toList.sorted
-
-      val missing_address =
-        sorted.flatMap(msg =>
-          if (msg.author_address.isEmpty) Some(proper_string(msg.author_name).getOrElse(msg.name))
-          else None).toSet.toList.sorted
-
-      if (missing_name.nonEmpty) {
-        Output.writeln(("missing name:" :: missing_name).mkString("\n", "\n  ", ""))
-      }
-
-      if (missing_address.nonEmpty) {
-        Output.writeln(("missing address:" :: missing_address).mkString("\n", "\n  ", ""))
+      val unknown = clusters.filter(cluster => cluster.get_address.isEmpty)
+      if (unknown.nonEmpty) {
+        Output.writeln(cat_lines("unknown mail:" :: unknown.map(_.print)))
       }
     }
   }
@@ -113,7 +177,10 @@
       lines.flatMap(re.findFirstMatchIn(_)).headOption
 
     def make_name(str: String): String =
-      str.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
+    {
+      val s = str.trim.replaceAll("""\s+""", " ").replaceAll(" at ", "@")
+      if (s.startsWith("=") && s.endsWith("=")) "" else s
+    }
 
     def make_body(lines: List[String]): String =
       cat_lines(Library.take_suffix[String](_.isEmpty, lines)._1)
@@ -286,7 +353,7 @@
 
     override def make_name(str: String): String =
     {
-      val s = Library.perhaps_unsuffix("via Cl-isabelle-users", super.make_name(str))
+      val s = Library.perhaps_unsuffix(" via Cl-isabelle-users", super.make_name(str))
       if (s == "cl-isabelle-users@lists.cam.ac.uk") "" else s
     }
 
@@ -362,7 +429,7 @@
 
       val tags = List(list_name)
 
-      Message(name, date, title, author_name, author_address, body, tags)
+      Message(name, date, title, standard_name(author_name), author_address, body, tags)
     }
   }
 
@@ -384,7 +451,7 @@
 
     override def make_name(str: String): String =
     {
-      val s = Library.perhaps_unsuffix("via RT", super.make_name(str))
+      val s = Library.perhaps_unsuffix(" via RT", super.make_name(str))
       if (s == "sys-admin@cl.cam.ac.uk") "" else s
     }
 
@@ -422,7 +489,7 @@
 
       val tags = List(list_name)
 
-      Message(name, date, title, author_name, author_address, body, tags)
+      Message(name, date, title, standard_name(author_name), author_address, body, tags)
     }
   }
 }