Consider job status when detecting missing services
Prevents false alarms and avoids attempting to restart failed or stopped allocs.
This commit is contained in:
parent
369802cacc
commit
2f3fc87f12
@ -56,6 +56,10 @@ for job in nomad_req("jobs"):
|
|||||||
if job["Type"] in ("batch", "sysbatch"):
|
if job["Type"] in ("batch", "sysbatch"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if job["Status"] != "running":
|
||||||
|
print(f"WARNING: job {job['Name']} is {job['Status']}")
|
||||||
|
continue
|
||||||
|
|
||||||
job_detail = nomad_req("job", job["ID"])
|
job_detail = nomad_req("job", job["ID"])
|
||||||
job_detail = cast(dict[str, Any], job_detail)
|
job_detail = cast(dict[str, Any], job_detail)
|
||||||
|
|
||||||
@ -83,7 +87,7 @@ for job in nomad_req("jobs"):
|
|||||||
restart_allocs: set[str] = set()
|
restart_allocs: set[str] = set()
|
||||||
for allocation in nomad_req("job", job_detail["ID"], "allocations"):
|
for allocation in nomad_req("job", job_detail["ID"], "allocations"):
|
||||||
allocation = cast(dict[str, Any], allocation)
|
allocation = cast(dict[str, Any], allocation)
|
||||||
if allocation["TaskGroup"] in restart_groups:
|
if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
|
||||||
restart_allocs.add(allocation["ID"])
|
restart_allocs.add(allocation["ID"])
|
||||||
|
|
||||||
# Restart allocs associated with missing services
|
# Restart allocs associated with missing services
|
||||||
|
Loading…
Reference in New Issue
Block a user