From 2f3fc87f12a98f57cc8f8c4156bef83617e05631 Mon Sep 17 00:00:00 2001 From: Ian Fijolek Date: Thu, 19 Oct 2023 12:07:57 -0700 Subject: [PATCH] Consider job status when detecting missing services Prevents false alarms and avoids attempting to restart failed or stopped allocs. --- scripts/nomad_missing_services.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/nomad_missing_services.py b/scripts/nomad_missing_services.py index e4bddb1..97019d4 100755 --- a/scripts/nomad_missing_services.py +++ b/scripts/nomad_missing_services.py @@ -56,6 +56,10 @@ for job in nomad_req("jobs"): if job["Type"] in ("batch", "sysbatch"): continue + if job["Status"] != "running": + print(f"WARNING: job {job['Name']} is {job['Status']}") + continue + job_detail = nomad_req("job", job["ID"]) job_detail = cast(dict[str, Any], job_detail) @@ -83,7 +87,7 @@ for job in nomad_req("jobs"): restart_allocs: set[str] = set() for allocation in nomad_req("job", job_detail["ID"], "allocations"): allocation = cast(dict[str, Any], allocation) - if allocation["TaskGroup"] in restart_groups: + if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups: restart_allocs.add(allocation["ID"]) # Restart allocs associated with missing services