Secondary initial commit with scripts from other repo

IamTheFij 2023-10-19 12:38:13 -07:00
parent 7cd688adfe
commit e5e43b5021
8 changed files with 266 additions and 2 deletions

62
.drone.yml Normal file

@@ -0,0 +1,62 @@
---
kind: pipeline
name: test

steps:
  - name: check
    image: iamthefij/drone-pre-commit:personal

---
kind: pipeline
name: publish

depends_on:
  - test

trigger:
  event:
    - push
    - tag
  refs:
    - refs/heads/master
    - refs/tags/v*

steps:
  - name: push images
    image: thegeeklab/drone-docker-buildx
    settings:
      repo: iamthefij/nomad-service-fixers
      auto_tag: true
      platforms:
        - linux/amd64
        - linux/arm64
        - linux/arm
      username:
        from_secret: docker_username
      password:
        from_secret: docker_password

---
kind: pipeline
name: notify

depends_on:
  - test
  - publish

trigger:
  status:
    - failure

steps:
  - name: notify
    image: drillster/drone-email
    settings:
      host:
        from_secret: SMTP_HOST  # pragma: whitelist secret
      username:
        from_secret: SMTP_USER  # pragma: whitelist secret
      password:
        from_secret: SMTP_PASS  # pragma: whitelist secret
      from: drone@iamthefij.com

1
.gitignore vendored

@@ -159,4 +159,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

11
.pre-commit-config.yaml Normal file

@@ -0,0 +1,11 @@
---
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: check-added-large-files
      - id: requirements-txt-fixer
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-merge-conflict
      - id: debug-statements

9
Dockerfile Normal file

@@ -0,0 +1,9 @@
FROM python:3-alpine
RUN mkdir /scripts
WORKDIR /scripts
COPY ./requirements.txt /scripts/
RUN pip install --no-cache-dir -r /scripts/requirements.txt
COPY ./nomad_missing_services.py ./nomad_orphan_services.py /scripts/

9
README.md

@@ -1,3 +1,12 @@
# nomad-service-fixers

A few check and fixer scripts to clean up services in my running instances.

These make use of [requests-unixsocket](https://github.com/msabramo/requests-unixsocket) so that they can target the workload API from within a Nomad task.

Included scripts:

* `./nomad_missing_services.py`: Looks for running allocs whose services appear to have disappeared.
* `./nomad_orphan_services.py`: Looks for services whose allocs appear to have disappeared.

This is on DockerHub as `iamthefij/nomad-service-fixers`.
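
As a rough illustration of the unix-socket addressing these scripts rely on, here is a minimal, hypothetical sketch (not part of this repo): it assumes the Nomad API socket is exposed inside the task at `/secrets/api.sock`, which is deployment-specific, and uses the `http+unix://` scheme that requests-unixsocket registers, with the socket path percent-encoded.

```python
# Hypothetical sketch: address the Nomad API over a unix socket instead of TCP.
# The socket path below is an assumption and depends on how the task exposes it.
from urllib.parse import quote

import requests_unixsocket

socket_path = "/secrets/api.sock"  # assumed, cluster-specific location
nomad_addr = f"http+unix://{quote(socket_path, safe='')}"

session = requests_unixsocket.Session()
response = session.get(f"{nomad_addr}/v1/jobs")  # same request shape as nomad_req("jobs")
response.raise_for_status()
print(response.json())
```

Exporting `NOMAD_ADDR` with the same `http+unix://...` value would let the scripts' `nomad_req()` helper route through the socket without any code changes.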

99
nomad_missing_services.py Executable file

@@ -0,0 +1,99 @@
#! /usr/bin/env python3
"""Looks for running Nomad allocs whose registered services appear to have disappeared."""
from argparse import ArgumentParser
from os import environ
from typing import Any
from typing import cast

import requests_unixsocket
from requests import exceptions as requests_exceptions

requests = requests_unixsocket.Session()

NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")


def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method="GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    """Make a request against the Nomad HTTP API and return the decoded response."""
    headers = {}
    if NOMAD_TOKEN:
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except requests_exceptions.JSONDecodeError:
        return response.text


def extract_job_services(job: dict[str, Any]) -> dict[str, str]:
    """Map each service name defined on a job to the task group that provides it."""
    services: dict[str, str] = dict()
    for group in job["TaskGroups"]:
        for service in group.get("Services") or []:
            services[service["Name"]] = group["Name"]
        for task in group["Tasks"]:
            for service in task.get("Services") or []:
                services[service["Name"]] = group["Name"]
    return services


exit_code = 0
parser = ArgumentParser(
    description="Checks for missing services and optionally restarts their allocs.",
)
parser.add_argument("-r", "--restart", action="store_true", help="Restart allocs for missing services")
args = parser.parse_args()

for job in nomad_req("jobs"):
    job = cast(dict[str, Any], job)

    # Skip batch jobs since they are not expected to be running continuously
    if job["Type"] in ("batch", "sysbatch"):
        continue

    if job["Status"] != "running":
        print(f"WARNING: job {job['Name']} is {job['Status']}")
        continue

    job_detail = nomad_req("job", job["ID"])
    job_detail = cast(dict[str, Any], job_detail)

    # Compare services defined on the job against services actually registered
    expected_services = extract_job_services(job_detail)
    found_services: set[str] = set()
    for service in nomad_req("job", job_detail["ID"], "services"):
        service = cast(dict[str, Any], service)
        found_services.add(service["ServiceName"])

    missing_services = set(expected_services) - found_services
    restart_groups: set[str] = set()
    for missing_service in missing_services:
        print(f"ERROR: Missing service {missing_service} for job {job_detail['Name']}")
        exit_code = 1
        # Add group associated with missing service to set
        restart_groups.add(expected_services[missing_service])

    if not restart_groups or not args.restart:
        continue

    # Get allocs for groups that are missing services
    restart_allocs: set[str] = set()
    for allocation in nomad_req("job", job_detail["ID"], "allocations"):
        allocation = cast(dict[str, Any], allocation)
        if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
            restart_allocs.add(allocation["ID"])

    # Restart allocs associated with missing services
    for allocation in restart_allocs:
        print(f"INFO: Restarting allocation {allocation}")
        nomad_req("client", "allocation", allocation, "restart")

exit(exit_code)

73
nomad_orphan_services.py Executable file

@@ -0,0 +1,73 @@
#! /usr/bin/env python3
"""Looks for registered Nomad services whose allocs appear to have disappeared."""
from argparse import ArgumentParser
from os import environ
from typing import Any
from typing import cast

import requests_unixsocket
from requests import exceptions as requests_exceptions

requests = requests_unixsocket.Session()

NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")


def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method="GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    """Make a request against the Nomad HTTP API and return the decoded response."""
    headers = {}
    if NOMAD_TOKEN:
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except requests_exceptions.JSONDecodeError:
        return response.text


exit_code = 0
parser = ArgumentParser(
    description="Checks for orphaned services and optionally deletes them.",
)
parser.add_argument("-d", "--delete", action="store_true", help="Delete orphan services")
args = parser.parse_args()

for namespace in nomad_req("services"):
    namespace = cast(dict[str, Any], namespace)
    for service in namespace["Services"]:
        service_name = service["ServiceName"]
        for service_instance in nomad_req("service", service_name):
            service_instance = cast(dict[str, Any], service_instance)
            service_id = service_instance["ID"]
            alloc_id = service_instance["AllocID"]

            alloc_found = True
            try:
                # If the alloc lookup succeeds, the service is not orphaned
                nomad_req("allocation", alloc_id)
                continue
            except requests_exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    alloc_found = False
                    message = f"alloc {alloc_id} not found for {service_name}."
                    if args.delete:
                        message += f" Deleting {service_id}"
                    print(message)
                else:
                    raise e

            if not alloc_found and args.delete:
                nomad_req("service", service_name, service_id, method="DELETE")

exit(exit_code)

2
requirements.txt Normal file

@@ -0,0 +1,2 @@
requests==2.31.0
requests-unixsocket==0.3.0