Secondary initial commit with scripts from other repo
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent 7cd688adfe
commit e5e43b5021
62 .drone.yml Normal file
@@ -0,0 +1,62 @@
---
kind: pipeline
name: test

steps:
  - name: check
    image: iamthefij/drone-pre-commit:personal

---
kind: pipeline
name: publish

depends_on:
  - test

trigger:
  event:
    - push
    - tag
  refs:
    - refs/heads/master
    - refs/tags/v*

steps:
  - name: push images
    image: thegeeklab/drone-docker-buildx
    settings:
      repo: iamthefij/nomad-service-fixers
      auto_tag: true
      platforms:
        - linux/amd64
        - linux/arm64
        - linux/arm
      username:
        from_secret: docker_username
      password:
        from_secret: docker_password

---
kind: pipeline
name: notify

depends_on:
  - test
  - publish

trigger:
  status:
    - failure

steps:
  - name: notify
    image: drillster/drone-email
    settings:
      host:
        from_secret: SMTP_HOST  # pragma: whitelist secret
      username:
        from_secret: SMTP_USER  # pragma: whitelist secret
      password:
        from_secret: SMTP_PASS  # pragma: whitelist secret
      from: drone@iamthefij.com
1 .gitignore vendored
@@ -159,4 +159,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
11 .pre-commit-config.yaml Normal file
@@ -0,0 +1,11 @@
---
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: check-added-large-files
      - id: requirements-txt-fixer
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-merge-conflict
      - id: debug-statements
9 Dockerfile Normal file
@@ -0,0 +1,9 @@
FROM python:3-alpine

RUN mkdir /scripts
WORKDIR /scripts

COPY ./requirements.txt /scripts/
RUN pip install --no-cache-dir -r /scripts/requirements.txt

COPY ./nomad_missing_services.py ./nomad_orphan_services.py /scripts/
11 README.md
@@ -1,3 +1,12 @@
# nomad-service-fixers

A few check and fixer scripts to clean up services in my running instances.

These make use of [requests-unixsocket](https://github.com/msabramo/requests-unixsocket) so that they can target the workload API from within a Nomad task.

Included scripts:

* `./nomad_missing_services.py`: Looks for running allocs whose services appear to have disappeared.
* `./nomad_orphan_services.py`: Looks for services whose allocs appear to have disappeared.

This is on DockerHub as `iamthefij/nomad-service-fixers`.
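A note on the unix-socket hookup mentioned above (not part of this commit): requests-unixsocket lets a requests Session speak HTTP over a unix socket by percent-encoding the socket path into the host portion of an http+unix:// URL. Since both scripts read NOMAD_ADDR, pointing that variable at Nomad's task API socket (conventionally ${NOMAD_SECRETS_DIR}/api.sock) is presumably how they reach the workload API from inside a task. A minimal sketch, assuming the default socket location; the socket path and URL here are illustrations, not values defined in this repository:

# Sketch only: build an http+unix:// base URL for the Nomad task API socket.
# The path /secrets/api.sock is the conventional ${NOMAD_SECRETS_DIR}/api.sock
# location and is an assumption, not something this commit configures.
import urllib.parse

import requests_unixsocket

socket_path = urllib.parse.quote("/secrets/api.sock", safe="")
base_url = f"http+unix://{socket_path}"

session = requests_unixsocket.Session()
resp = session.get(f"{base_url}/v1/jobs")  # same endpoint nomad_req("jobs") hits below
resp.raise_for_status()
print([job["Name"] for job in resp.json()])

With the scripts as written, the equivalent would be exporting NOMAD_ADDR=http+unix://%2Fsecrets%2Fapi.sock before running them.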
99 nomad_missing_services.py Executable file
@@ -0,0 +1,99 @@
#! /usr/bin/env python3
from argparse import ArgumentParser
from os import environ
from typing import Any
from typing import cast

import requests_unixsocket

# The `requests` name below is bound to a Session instance, which has no
# `.exceptions` attribute, so import the exceptions module separately.
from requests import exceptions as requests_exceptions

requests = requests_unixsocket.Session()

NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")


def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method="GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    headers = {}
    if NOMAD_TOKEN:
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except requests_exceptions.JSONDecodeError:
        return response.text


def extract_job_services(job: dict[str, Any]) -> dict[str, str]:
    # Map every service declared in the job spec (group- and task-level) to its task group
    services: dict[str, str] = dict()
    for group in job["TaskGroups"]:
        for service in group.get("Services") or []:
            services[service["Name"]] = group["Name"]
        for task in group["Tasks"]:
            for service in task.get("Services") or []:
                services[service["Name"]] = group["Name"]

    return services


exit_code = 0
parser = ArgumentParser(
    description="Checks for missing services and optionally restarts their allocs.",
)
parser.add_argument("-r", "--restart", action="store_true", help="Restart allocs for missing services")
args = parser.parse_args()

for job in nomad_req("jobs"):
    job = cast(dict[str, Any], job)

    if job["Type"] in ("batch", "sysbatch"):
        continue

    if job["Status"] != "running":
        print(f"WARNING: job {job['Name']} is {job['Status']}")
        continue

    job_detail = nomad_req("job", job["ID"])
    job_detail = cast(dict[str, Any], job_detail)

    expected_services = extract_job_services(job_detail)

    found_services: set[str] = set()
    for service in nomad_req("job", job_detail["ID"], "services"):
        service = cast(dict[str, Any], service)
        found_services.add(service["ServiceName"])

    missing_services = set(expected_services) - found_services
    restart_groups: set[str] = set()
    for missing_service in missing_services:
        print(f"ERROR: Missing service {missing_service} for job {job_detail['Name']}")
        # print(job)
        exit_code = 1

        # Add group associated with missing service to set
        restart_groups.add(expected_services[missing_service])

    if not restart_groups or not args.restart:
        continue

    # Get allocs for groups that are missing services
    restart_allocs: set[str] = set()
    for allocation in nomad_req("job", job_detail["ID"], "allocations"):
        allocation = cast(dict[str, Any], allocation)
        if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
            restart_allocs.add(allocation["ID"])

    # Restart allocs associated with missing services
    for allocation in restart_allocs:
        print(f"INFO: Restarting allocation {allocation}")
        nomad_req("client", "allocation", allocation, "restart")


exit(exit_code)
73 nomad_orphan_services.py Executable file
@@ -0,0 +1,73 @@
#! /usr/bin/env python3
from argparse import ArgumentParser
from os import environ
from typing import Any
from typing import cast

import requests_unixsocket

# The `requests` name below is bound to a Session instance, which has no
# `.exceptions` attribute, so import the exceptions module separately.
from requests import exceptions as requests_exceptions

requests = requests_unixsocket.Session()


NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")


def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method="GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    headers = {}
    if NOMAD_TOKEN:
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except requests_exceptions.JSONDecodeError:
        return response.text


exit_code = 0
parser = ArgumentParser(
    description="Checks for orphaned services and optionally deletes them.",
)
parser.add_argument("-d", "--delete", action="store_true", help="Delete orphan services")
args = parser.parse_args()


for namespace in nomad_req("services"):
    namespace = cast(dict[str, Any], namespace)
    for service in namespace["Services"]:
        service_name = service["ServiceName"]
        for service_instance in nomad_req("service", service_name):
            service_instance = cast(dict[str, Any], service_instance)
            service_id = service_instance["ID"]
            alloc_id = service_instance["AllocID"]

            alloc_found = True

            try:
                # If the alloc still exists, this service is not orphaned
                nomad_req("allocation", alloc_id)
                continue
            except requests_exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    alloc_found = False
                    message = f"alloc {alloc_id} not found for {service_name}."
                    if args.delete:
                        message += f" Deleting {service_id}"

                    print(message)
                else:
                    raise e

            if not alloc_found and args.delete:
                nomad_req("service", service_name, service_id, method="DELETE")


exit(exit_code)
2 requirements.txt Normal file
@@ -0,0 +1,2 @@
requests==2.31.0
requests-unixsocket==0.3.0