# HG changeset patch # User Fabian Huch # Date 1719840244 -7200 # Node ID a3bae6dd73448f9f2301cee61658a6112bcf5773 # Parent 8ae5312032ccf6fd5f7e3c64e3fe126e65b2cb59 add timeout to build manager tasks/jobs (e.g. for cluster builds that don't terminate after error on host); diff -r 8ae5312032cc -r a3bae6dd7344 etc/options --- a/etc/options Mon Jul 01 14:46:51 2024 +0200 +++ b/etc/options Mon Jul 01 15:24:04 2024 +0200 @@ -245,6 +245,9 @@ option build_manager_cluster : string = "cluster.default" +option build_manager_timeout : real = 28800 + -- "timeout for user-submitted tasks (seconds > 0)" + option build_manager_delay : real = 1.0 -- "delay build manager loop" diff -r 8ae5312032cc -r a3bae6dd7344 src/Pure/Build/build_ci.scala --- a/src/Pure/Build/build_ci.scala Mon Jul 01 14:46:51 2024 +0200 +++ b/src/Pure/Build/build_ci.scala Mon Jul 01 15:24:04 2024 +0200 @@ -103,6 +103,7 @@ name: String, description: String, hosts: Hosts, + timeout: Time, afp: Boolean = false, selection: Sessions.Selection = Sessions.Selection.empty, presentation: Boolean = false, @@ -145,6 +146,7 @@ Job("benchmark", description = "runs benchmark and timing sessions", Local("benchmark", jobs = 1, threads = 6, numa_shuffling = false), + Time.hms(2, 0, 0), selection = Sessions.Selection(session_groups = List("timing")), clean_build = true, select_dirs = List(Path.explode("~~/src/Benchmarks")), diff -r 8ae5312032cc -r a3bae6dd7344 src/Pure/Build/build_manager.scala --- a/src/Pure/Build/build_manager.scala Mon Jul 01 14:46:51 2024 +0200 +++ b/src/Pure/Build/build_manager.scala Mon Jul 01 15:24:04 2024 +0200 @@ -43,7 +43,7 @@ CI_Build(job.name, job.hosts.build_cluster, job.components.map(Component(_, "default"))) def task(job: Build_CI.Job): Task = - Task(CI_Build(job), job.hosts.hosts_spec, other_settings = job.other_settings, + Task(CI_Build(job), job.hosts.hosts_spec, job.timeout, other_settings = job.other_settings, isabelle_rev = "default") } @@ -113,6 +113,7 @@ sealed case class Task( build_config: Build_Config, hosts_spec: String, + timeout: Time, other_settings: List[String] = Nil, uuid: UUID.T = UUID.random(), submit_date: Date = Date.now(), @@ -136,6 +137,7 @@ build_cluster: Boolean, hostnames: List[String], components: List[Component], + timeout: Time, start_date: Date = Date.now(), cancelled: Boolean = false ) extends Build { def name: String = Build.name(kind, id) } @@ -331,6 +333,7 @@ val kind = SQL.Column.string("kind") val build_cluster = SQL.Column.bool("build_cluster") val hosts_spec = SQL.Column.string("hosts_spec") + val timeout = SQL.Column.long("timeout") val other_settings = SQL.Column.string("other_settings") val uuid = SQL.Column.string("uuid").make_primary_key val submit_date = SQL.Column.date("submit_date") @@ -354,7 +357,7 @@ val verbose = SQL.Column.bool("verbose") val table = - make_table(List(kind, build_cluster, hosts_spec, other_settings, uuid, submit_date, + make_table(List(kind, build_cluster, hosts_spec, timeout, other_settings, uuid, submit_date, priority, isabelle_rev, components, prefs, requirements, all_sessions, base_sessions, exclude_session_groups, exclude_sessions, session_groups, sessions, build_heap, clean_build, export_files, fresh_build, presentation, verbose), @@ -367,6 +370,7 @@ val kind = res.string(Pending.kind) val build_cluster = res.bool(Pending.build_cluster) val hosts_spec = res.string(Pending.hosts_spec) + val timeout = Time.ms(res.long(Pending.timeout)) val other_settings = split_lines(res.string(Pending.other_settings)) val uuid = res.string(Pending.uuid) val submit_date = res.date(Pending.submit_date) @@ -399,8 +403,8 @@ clean_build, export_files, fresh_build, presentation, verbose) } - val task = Task(build_config, hosts_spec, other_settings, UUID.make(uuid), submit_date, - priority, isabelle_rev) + val task = Task(build_config, hosts_spec, timeout, other_settings, UUID.make(uuid), + submit_date, priority, isabelle_rev) task.name -> task }) @@ -423,12 +427,13 @@ stmt.string(1) = task.kind stmt.bool(2) = task.build_cluster stmt.string(3) = task.hosts_spec - stmt.string(4) = cat_lines(task.other_settings) - stmt.string(5) = task.uuid.toString - stmt.date(6) = task.submit_date - stmt.string(7) = task.priority.toString - stmt.string(8) = task.isabelle_rev - stmt.string(9) = task.components.mkString(",") + stmt.long(4) = task.timeout.ms + stmt.string(5) = cat_lines(task.other_settings) + stmt.string(6) = task.uuid.toString + stmt.date(7) = task.submit_date + stmt.string(8) = task.priority.toString + stmt.string(9) = task.isabelle_rev + stmt.string(10) = task.components.mkString(",") def get[A](f: User_Build => A): Option[A] = task.build_config match { @@ -436,20 +441,20 @@ case _ => None } - stmt.string(10) = get(user_build => user_build.prefs.map(_.print).mkString(",")) - stmt.bool(11) = get(_.requirements) - stmt.bool(12) = get(_.all_sessions) - stmt.string(13) = get(_.base_sessions.mkString(",")) - stmt.string(14) = get(_.exclude_session_groups.mkString(",")) - stmt.string(15) = get(_.exclude_sessions.mkString(",")) - stmt.string(16) = get(_.session_groups.mkString(",")) - stmt.string(17) = get(_.sessions.mkString(",")) - stmt.bool(18) = get(_.build_heap) - stmt.bool(19) = get(_.clean_build) - stmt.bool(20) = get(_.export_files) - stmt.bool(21) = get(_.fresh_build) - stmt.bool(22) = get(_.presentation) - stmt.bool(23) = get(_.verbose) + stmt.string(11) = get(user_build => user_build.prefs.map(_.print).mkString(",")) + stmt.bool(12) = get(_.requirements) + stmt.bool(13) = get(_.all_sessions) + stmt.string(14) = get(_.base_sessions.mkString(",")) + stmt.string(15) = get(_.exclude_session_groups.mkString(",")) + stmt.string(16) = get(_.exclude_sessions.mkString(",")) + stmt.string(17) = get(_.session_groups.mkString(",")) + stmt.string(18) = get(_.sessions.mkString(",")) + stmt.bool(19) = get(_.build_heap) + stmt.bool(20) = get(_.clean_build) + stmt.bool(21) = get(_.export_files) + stmt.bool(22) = get(_.fresh_build) + stmt.bool(23) = get(_.presentation) + stmt.bool(24) = get(_.verbose) }) } @@ -467,11 +472,12 @@ val build_cluster = SQL.Column.bool("build_cluster") val hostnames = SQL.Column.string("hostnames") val components = SQL.Column.string("components") + val timeout = SQL.Column.long("timeout") val start_date = SQL.Column.date("start_date") val cancelled = SQL.Column.bool("cancelled") val table = - make_table(List(uuid, kind, id, isabelle_rev, build_cluster, hostnames, components, + make_table(List(uuid, kind, id, isabelle_rev, build_cluster, hostnames, components, timeout, start_date, cancelled), name = "running") } @@ -486,11 +492,12 @@ val build_cluster = res.bool(Running.build_cluster) val hostnames = space_explode(',', res.string(Running.hostnames)) val components = space_explode(',', res.string(Running.components)).map(Component.parse) + val timeout = Time.ms(res.long(Running.timeout)) val start_date = res.date(Running.start_date) val cancelled = res.bool(Running.cancelled) val job = Job(UUID.make(uuid), kind, id, isabelle_rev, build_cluster, hostnames, - components, start_date, cancelled) + components, timeout, start_date, cancelled) job.name -> job }) @@ -517,8 +524,9 @@ stmt.bool(5) = job.build_cluster stmt.string(6) = job.hostnames.mkString(",") stmt.string(7) = job.components.mkString(",") - stmt.date(8) = job.start_date - stmt.bool(9) = job.cancelled + stmt.long(8) = job.timeout.ms + stmt.date(9) = job.start_date + stmt.bool(10) = job.cancelled }) } update @@ -794,7 +802,7 @@ } Job(task.uuid, task.kind, id, isabelle_rev, task.build_cluster, hostnames, components, - start_date) + task.timeout, start_date) } match { case Exn.Res(job) => _state = _state.add_running(job) @@ -820,6 +828,19 @@ private def stop_cancelled(state: Runner.State): Runner.State = synchronized_database("stop_cancelled") { + val now = Date.now() + for { + name <- state.running + job = _state.running(name) + if now - job.start_date > job.timeout + } { + _state = _state.cancel_running(name) + + val timeout_msg = "Timeout after " + job.timeout.message_hms + new File_Progress(store.log_file(job.kind, job.id)).echo_error_message(timeout_msg) + echo(job.name + ": " + timeout_msg) + } + val cancelled = for (name <- state.running if _state.running(name).cancelled) yield name state.cancel(cancelled) } @@ -1547,12 +1568,13 @@ val afp_rev = if (afp_root.nonEmpty) Some("") else None val hosts_spec = options.string("build_manager_cluster") + val timeout = options.seconds("build_manager_timeout") val paths = Web_Server.paths(store) val build_config = User_Build(afp_rev, prefs, requirements, all_sessions, base_sessions, exclude_session_groups, exclude_sessions, session_groups, sessions, build_heap, clean_build, export_files, fresh_build, presentation, verbose) - val task = Task(build_config, hosts_spec, uuid = uuid, priority = Priority.high) + val task = Task(build_config, hosts_spec, timeout, uuid = uuid, priority = Priority.high) val dir = store.task_dir(task)