remove inactive (e.g., crashed) hosts from scheduling;
authorFabian Huch <huch@in.tum.de>
Mon, 01 Jul 2024 18:22:33 +0200
changeset 80471 12901c03b416
parent 80470 f2f4b953ead6
child 80472 8f839104d460
remove inactive (e.g., crashed) hosts from scheduling;
etc/options
src/Pure/Build/build_schedule.scala
--- a/etc/options	Mon Jul 01 15:25:27 2024 +0200
+++ b/etc/options	Mon Jul 01 18:22:33 2024 +0200
@@ -231,6 +231,9 @@
 option build_schedule_outdated_delay : real = 300.0
   -- "delay schedule generation loop"
 
+option build_schedule_inactive_delay : real = 300.0
+  -- "delay removing inactive hosts"
+
 
 section "Build Manager"
 
--- a/src/Pure/Build/build_schedule.scala	Mon Jul 01 15:25:27 2024 +0200
+++ b/src/Pure/Build/build_schedule.scala	Mon Jul 01 18:22:33 2024 +0200
@@ -85,6 +85,12 @@
       make(host_infos, build_history, sessions_structure)
     }
 
+    def restrict(full_data: Timing_Data, host_infos: Host_Infos): Timing_Data = {
+      val hostnames = host_infos.hosts.map(_.name).toSet
+      val results = full_data.facet.results.filter(result => hostnames.contains(result.hostname))
+      new Timing_Data(new Facet(results), host_infos, full_data.sessions_structure)
+    }
+
 
     /* data facets */
 
@@ -110,7 +116,7 @@
   }
 
   class Timing_Data private(
-    facet: Timing_Data.Facet,
+    private val facet: Timing_Data.Facet,
     val host_infos: Host_Infos,
     val sessions_structure: Sessions.Structure
   ) {
@@ -372,7 +378,7 @@
     ): Host_Infos = new Host_Infos(build_hosts.map(Host.load(options, _, host_db)))
   }
 
-  class Host_Infos private(val hosts: List[Host]) {
+  case class Host_Infos(hosts: List[Host]) {
     require(hosts.nonEmpty)
 
     private val by_hostname = hosts.map(host => host.name -> host).toMap
@@ -1047,8 +1053,8 @@
       build_cluster.benchmark()
     }
 
-    private val timing_data: Timing_Data = {
-      val cluster_hosts: List[Build_Cluster.Host] =
+    private var _host_infos = {
+      val build_hosts =
         if (!build_context.worker) build_context.build_hosts
         else {
           val local_build_host =
@@ -1056,11 +1062,14 @@
               hostname, jobs = build_context.jobs, numa = build_context.numa_shuffling)
           local_build_host :: build_context.build_hosts
         }
+      Host_Infos.load(build_options, build_hosts, _host_database)
+    }
 
-      val host_infos = Host_Infos.load(build_options, cluster_hosts, _host_database)
-      Timing_Data.load(host_infos, _log_database, build_context.sessions_structure)
+    private val timing_data: Timing_Data = {
+      Timing_Data.load(_host_infos, _log_database, build_context.sessions_structure)
     }
-    private val scheduler = init_scheduler(timing_data)
+
+    private var _scheduler = init_scheduler(timing_data)
 
     def write_build_log(results: Build.Results, state: Build_Process.State.Results): Unit = {
       val sessions =
@@ -1135,14 +1144,36 @@
         val current = state.next_ready.filter(task => is_current(state, task.name))
         if (current.nonEmpty) current.map(_.name)
         else {
-          val start = Time.now()
+          val start = Date.now()
+
+          def completed_since(name: String): Time = {
+            val result = state.results(name)
+            start - (result.start_date + result.process_result.timing.elapsed)
+          }
 
-          val new_schedule = scheduler.schedule(state).update(state)
+          val active_hosts0 = (for ((_, job) <- state.running) yield job.node_info.hostname).toSet
+          val inactive_hosts =
+            (for {
+              host <- _host_infos.hosts
+              if !active_hosts0.contains(host.name)
+              jobs = _schedule.next(host.name, state)
+              ancestors = build_context.sessions_structure.build_requirements(jobs)
+              if ancestors.forall(ancestor =>
+                completed_since(ancestor) > build_options.seconds("build_schedule_inactive_delay"))
+            } yield host).toSet
+
+          val host_infos = Host_Infos(_host_infos.hosts.filterNot(inactive_hosts.contains))
+          if (host_infos != _host_infos) {
+            _host_infos = host_infos
+            _scheduler = init_scheduler(Timing_Data.restrict(timing_data, host_infos))
+          }
+
+          val new_schedule = _scheduler.schedule(state).update(state)
           val schedule =
             if (_schedule.is_empty) new_schedule
             else List(_schedule.update(state), new_schedule).minBy(_.end)(Date.Ordering)
 
-          val elapsed = Time.now() - start
+          val elapsed = Date.now() - start
 
           val timing_msg = if (elapsed.is_relevant) " (took " + elapsed.message + ")" else ""
           progress.echo_if(