diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..e080b47
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,62 @@
+---
+kind: pipeline
+name: test
+
+steps:
+  - name: check
+    image: iamthefij/drone-pre-commit:personal
+
+---
+kind: pipeline
+name: publish
+
+depends_on:
+  - test
+
+trigger:
+  event:
+    - push
+    - tag
+  refs:
+    - refs/heads/master
+    - refs/tags/v*
+
+steps:
+  - name: push images
+    image: thegeeklab/drone-docker-buildx
+    settings:
+      repo: iamthefij/nomad-service-fixers
+      auto_tag: true
+      platforms:
+        - linux/amd64
+        - linux/arm64
+        - linux/arm
+      username:
+        from_secret: docker_username
+      password:
+        from_secret: docker_password
+
+---
+kind: pipeline
+name: notify
+
+depends_on:
+  - test
+  - publish
+
+trigger:
+  status:
+    - failure
+
+steps:
+
+  - name: notify
+    image: drillster/drone-email
+    settings:
+      host:
+        from_secret: SMTP_HOST  # pragma: whitelist secret
+      username:
+        from_secret: SMTP_USER  # pragma: whitelist secret
+      password:
+        from_secret: SMTP_PASS  # pragma: whitelist secret
+      from: drone@iamthefij.com
diff --git a/.gitignore b/.gitignore
index 5d381cc..489bdd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,3 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..417ad26
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
+---
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.1.0
+    hooks:
+      - id: check-added-large-files
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-merge-conflict
+      - id: debug-statements
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..bd07284
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3-alpine
+
+RUN mkdir /scripts
+WORKDIR /scripts
+
+COPY ./requirements.txt /scripts/
+RUN pip install --no-cache-dir -r /scripts/requirements.txt
+
+COPY ./nomad_missing_services.py ./nomad_orphan_services.py /scripts/
diff --git a/README.md b/README.md
index 83aef20..7c28ea9 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
 # nomad-service-fixers
 
-A few check and fixer scripts to clean up services in my running instances.
\ No newline at end of file
+A few check and fixer scripts to clean up services in my running instances.
+
+These make use of [requests-unixsocket](https://github.com/msabramo/requests-unixsocket) so that they can target the workload API from within a Nomad task.
+
+Included scripts:
+
+* `./nomad_missing_services.py`: Looks for running allocs whose services appear to have disappeared.
+* `./nomad_orphan_services.py`: Looks for services whose allocs appear to have disappeared.
+
+This is on DockerHub as `iamthefij/nomad-service-fixers`.
diff --git a/nomad_missing_services.py b/nomad_missing_services.py
new file mode 100755
index 0000000..e42028e
--- /dev/null
+++ b/nomad_missing_services.py
@@ -0,0 +1,127 @@
+#! /usr/bin/env python3
+"""Report Nomad jobs with missing service registrations.
+
+Compares the services declared in each running job's spec against the
+services currently registered for that job, and can restart the allocations
+of any task group whose services are missing.
+"""
+from argparse import ArgumentParser
+from os import environ
+from typing import Any
+from typing import cast
+
+import requests_unixsocket
+from requests.exceptions import JSONDecodeError
+
+# Deliberately not named ``requests``: this is a Session instance, and shadowing
+# the module name made ``requests.exceptions`` raise AttributeError when reached.
+session = requests_unixsocket.Session()
+
+NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
+NOMAD_TOKEN = environ.get("NOMAD_TOKEN")
+
+
+def nomad_req(
+    *path: str, params: dict[str, Any] | None = None, method="GET"
+) -> list[dict[str, Any]] | dict[str, Any] | str:
+    """Send a request to the Nomad HTTP API and return the decoded body.
+
+    Args:
+        *path: URL segments joined with "/" under ``{NOMAD_ADDR}/v1/``.
+        params: Optional query-string parameters.
+        method: HTTP method to use.
+
+    Returns:
+        The parsed JSON body, or the raw response text if it is not JSON.
+
+    Raises:
+        requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
+    """
+    headers = {}
+    if NOMAD_TOKEN:
+        headers["X-Nomad-Token"] = NOMAD_TOKEN
+
+    response = session.request(
+        method,
+        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
+        params=params,
+        headers=headers,
+    )
+    response.raise_for_status()
+
+    try:
+        return response.json()
+    except JSONDecodeError:
+        return response.text
+
+
+def extract_job_services(job: dict[str, Any]) -> dict[str, str]:
+    """Map each service name declared in a job spec to its task group name.
+
+    Both group-level and task-level services are collected.
+    """
+    services: dict[str, str] = dict()
+    for group in job["TaskGroups"]:
+        for service in group.get("Services") or []:
+            services[service["Name"]] = group["Name"]
+        for task in group["Tasks"]:
+            for service in task.get("Services") or []:
+                services[service["Name"]] = group["Name"]
+
+    return services
+
+
+exit_code = 0
+parser = ArgumentParser(
+    description="Checks for missing services and optionally restarts their allocs.",
+)
+parser.add_argument("-r", "--restart", action="store_true", help="Restart allocs for missing services")
+args = parser.parse_args()
+
+for job in nomad_req("jobs"):
+    job = cast(dict[str, Any], job)
+
+    if job["Type"] in ("batch", "sysbatch"):
+        continue
+
+    if job["Status"] != "running":
+        print(f"WARNING: job {job['Name']} is {job['Status']}")
+        continue
+
+    job_detail = nomad_req("job", job["ID"])
+    job_detail = cast(dict[str, Any], job_detail)
+
+    expected_services = extract_job_services(job_detail)
+
+    found_services: set[str] = set()
+    for service in nomad_req("job", job_detail["ID"], "services"):
+        service = cast(dict[str, Any], service)
+        found_services.add(service["ServiceName"])
+
+    missing_services = set(expected_services) - found_services
+    restart_groups: set[str] = set()
+    for missing_service in missing_services:
+        print(f"ERROR: Missing service {missing_service} for job {job_detail['Name']}")
+        exit_code = 1
+
+        # Add group associated with missing service to set
+        restart_groups.add(expected_services[missing_service])
+
+    if not restart_groups or not args.restart:
+        continue
+
+    # Get allocs for groups that are missing services
+    restart_allocs: set[str] = set()
+    for allocation in nomad_req("job", job_detail["ID"], "allocations"):
+        allocation = cast(dict[str, Any], allocation)
+        if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
+            restart_allocs.add(allocation["ID"])
+
+    # Restart allocs associated with missing services
+    for allocation in restart_allocs:
+        print(f"INFO: Restarting allocation {allocation}")
+        # The restart endpoint only accepts POST/PUT; a GET is rejected.
+        nomad_req("client", "allocation", allocation, "restart", method="POST")
+
+
+raise SystemExit(exit_code)
diff --git a/nomad_orphan_services.py b/nomad_orphan_services.py
new file mode 100755
index 0000000..d673cf4
--- /dev/null
+++ b/nomad_orphan_services.py
@@ -0,0 +1,98 @@
+#! /usr/bin/env python3
+"""Report Nomad service registrations whose allocation no longer exists.
+
+Walks every registered service instance, looks up its allocation, and
+optionally deletes registrations whose allocation lookup returns 404.
+"""
+from argparse import ArgumentParser
+from os import environ
+from typing import Any
+from typing import cast
+
+import requests_unixsocket
+from requests.exceptions import HTTPError
+from requests.exceptions import JSONDecodeError
+
+# Deliberately not named ``requests``: this is a Session instance, and shadowing
+# the module name made ``requests.exceptions`` raise AttributeError when reached.
+session = requests_unixsocket.Session()
+
+
+NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
+NOMAD_TOKEN = environ.get("NOMAD_TOKEN")
+
+
+def nomad_req(
+    *path: str, params: dict[str, Any] | None = None, method="GET"
+) -> list[dict[str, Any]] | dict[str, Any] | str:
+    """Send a request to the Nomad HTTP API and return the decoded body.
+
+    Args:
+        *path: URL segments joined with "/" under ``{NOMAD_ADDR}/v1/``.
+        params: Optional query-string parameters.
+        method: HTTP method to use.
+
+    Returns:
+        The parsed JSON body, or the raw response text if it is not JSON.
+
+    Raises:
+        requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
+    """
+    headers = {}
+    if NOMAD_TOKEN:
+        headers["X-Nomad-Token"] = NOMAD_TOKEN
+
+    response = session.request(
+        method,
+        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
+        params=params,
+        headers=headers,
+    )
+    response.raise_for_status()
+
+    try:
+        return response.json()
+    except JSONDecodeError:
+        return response.text
+
+
+exit_code = 0
+parser = ArgumentParser(
+    description="Checks for orphaned services and optionally deletes them.",
+)
+parser.add_argument("-d", "--delete", action="store_true", help="Delete orphan services")
+args = parser.parse_args()
+
+
+for namespace in nomad_req("services"):
+    namespace = cast(dict[str, Any], namespace)
+    for service in namespace["Services"]:
+        service_name = service["ServiceName"]
+        for service_instance in nomad_req("service", service_name):
+            service_instance = cast(dict[str, Any], service_instance)
+            service_id = service_instance["ID"]
+            alloc_id = service_instance["AllocID"]
+
+            try:
+                nomad_req("allocation", alloc_id)
+            except HTTPError as e:
+                # Only a 404 marks the service as orphaned; any other HTTP
+                # error is unexpected and must not be mistaken for one.
+                if e.response.status_code != 404:
+                    raise
+            else:
+                # The allocation still exists, so the registration is valid.
+                continue
+
+            message = f"alloc {alloc_id} not found for {service_name}."
+            if args.delete:
+                message += f" Deleting {service_id}"
+            print(message)
+
+            if args.delete:
+                nomad_req("service", service_name, service_id, method="DELETE")
+
+
+# NOTE(review): exit_code is never set to a non-zero value, so this script
+# exits 0 even when orphans are found. Confirm whether that is intended.
+raise SystemExit(exit_code)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8c67117
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests==2.31.0
+requests-unixsocket==0.3.0