Consider job status when detecting missing services

Prevents false alarms and avoids attempts to restart failed or stopped allocs.
This commit is contained in:
IamTheFij 2023-10-19 12:07:57 -07:00
parent 369802cacc
commit 2f3fc87f12

View File

@@ -56,6 +56,10 @@ for job in nomad_req("jobs"):
if job["Type"] in ("batch", "sysbatch"):
continue
if job["Status"] != "running":
print(f"WARNING: job {job['Name']} is {job['Status']}")
continue
job_detail = nomad_req("job", job["ID"])
job_detail = cast(dict[str, Any], job_detail)
@@ -83,7 +87,7 @@ for job in nomad_req("jobs"):
restart_allocs: set[str] = set()
for allocation in nomad_req("job", job_detail["ID"], "allocations"):
allocation = cast(dict[str, Any], allocation)
if allocation["TaskGroup"] in restart_groups:
if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
restart_allocs.add(allocation["ID"])
# Restart allocs associated with missing services