Consider job status when detecting missing services
Prevents false alarms and avoids attempts to restart failed or stopped allocs.
This commit is contained in:
parent
369802cacc
commit
2f3fc87f12
@ -56,6 +56,10 @@ for job in nomad_req("jobs"):
|
||||
if job["Type"] in ("batch", "sysbatch"):
|
||||
continue
|
||||
|
||||
if job["Status"] != "running":
|
||||
print(f"WARNING: job {job['Name']} is {job['Status']}")
|
||||
continue
|
||||
|
||||
job_detail = nomad_req("job", job["ID"])
|
||||
job_detail = cast(dict[str, Any], job_detail)
|
||||
|
||||
@ -83,7 +87,7 @@ for job in nomad_req("jobs"):
|
||||
restart_allocs: set[str] = set()
|
||||
for allocation in nomad_req("job", job_detail["ID"], "allocations"):
|
||||
allocation = cast(dict[str, Any], allocation)
|
||||
if allocation["TaskGroup"] in restart_groups:
|
||||
if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
|
||||
restart_allocs.add(allocation["ID"])
|
||||
|
||||
# Restart allocs associated with missing services
|
||||
|
Loading…
x
Reference in New Issue
Block a user