diff --git a/scripts/nomad_missing_services.py b/scripts/nomad_missing_services.py index 79b0aa9..e4bddb1 100755 --- a/scripts/nomad_missing_services.py +++ b/scripts/nomad_missing_services.py @@ -1,4 +1,5 @@ #! /usr/bin/env python3 +from argparse import ArgumentParser from os import environ from typing import Any from typing import cast @@ -31,18 +32,23 @@ def nomad_req( return response.text -def extract_job_services(job: dict[str, Any]) -> set[str]: - services: set[str] = set() +def extract_job_services(job: dict[str, Any]) -> dict[str, str]: + services: dict[str, str] = dict() for group in job["TaskGroups"]: for service in group.get("Services") or []: - services.add(service["Name"]) + services[service["Name"]] = group["Name"] for task in group["Tasks"]: for service in task.get("Services") or []: - services.add(service["Name"]) + services[service["Name"]] = group["Name"] return services exit_code = 0 +parser = ArgumentParser( + description="Checks for missing services and optionally restarts their allocs.", +) +parser.add_argument("-r", "--restart", action="store_true", help="Restart allocs for missing services") +args = parser.parse_args() for job in nomad_req("jobs"): job = cast(dict[str, Any], job) @@ -60,11 +66,30 @@ for job in nomad_req("jobs"): service = cast(dict[str, Any], service) found_services.add(service["ServiceName"]) - missing_services = expected_services - found_services + missing_services = set(expected_services) - found_services + restart_groups: set[str] = set() for missing_service in missing_services: print(f"ERROR: Missing service {missing_service} for job {job_detail['Name']}") - print(job) + # print(job) exit_code = 1 + # Add group associated with missing service to set + restart_groups.add(expected_services[missing_service]) + + if not restart_groups or not args.restart: + continue + + # Get allocs for groups that are missing services + restart_allocs: set[str] = set() + for allocation in nomad_req("job", job_detail["ID"], "allocations"): 
+ allocation = cast(dict[str, Any], allocation) + if allocation["TaskGroup"] in restart_groups: + restart_allocs.add(allocation["ID"]) + + # Restart allocs associated with missing services + for allocation in restart_allocs: + print(f"INFO: Restarting allocation {allocation}") + nomad_req("client", "allocation", allocation, "restart") + exit(exit_code) diff --git a/scripts/nomad_orphan_services.py b/scripts/nomad_orphan_services.py index 9eb7f10..76b4406 100755 --- a/scripts/nomad_orphan_services.py +++ b/scripts/nomad_orphan_services.py @@ -34,7 +34,7 @@ def nomad_req( exit_code = 0 parser = ArgumentParser( - description="Checks for orphaned services and optional deletes them.", + description="Checks for orphaned services and optionally deletes them.", ) parser.add_argument("-d", "--delete", action="store_true", help="Delete orphan services") args = parser.parse_args()