Secondary initial commit with scripts from other repo
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
7cd688adfe
commit
e5e43b5021
62
.drone.yml
Normal file
62
.drone.yml
Normal file
@ -0,0 +1,62 @@
|
||||
---
|
||||
kind: pipeline
|
||||
name: test
|
||||
|
||||
steps:
|
||||
- name: check
|
||||
image: iamthefij/drone-pre-commit:personal
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: publish
|
||||
|
||||
depends_on:
|
||||
- test
|
||||
|
||||
trigger:
|
||||
event:
|
||||
- push
|
||||
- tag
|
||||
refs:
|
||||
- refs/heads/master
|
||||
- refs/tags/v*
|
||||
|
||||
steps:
|
||||
- name: push images
|
||||
image: thegeeklab/drone-docker-buildx
|
||||
settings:
|
||||
repo: iamthefij/nomad-service-fixers
|
||||
auto_tag: true
|
||||
platforms:
|
||||
- linux/amd64
|
||||
- linux/arm64
|
||||
- linux/arm
|
||||
username:
|
||||
from_secret: docker_username
|
||||
password:
|
||||
from_secret: docker_password
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: notify
|
||||
|
||||
depends_on:
|
||||
- test
|
||||
- publish
|
||||
|
||||
trigger:
|
||||
status:
|
||||
- failure
|
||||
|
||||
steps:
|
||||
|
||||
- name: notify
|
||||
image: drillster/drone-email
|
||||
settings:
|
||||
host:
|
||||
from_secret: SMTP_HOST # pragma: whitelist secret
|
||||
username:
|
||||
from_secret: SMTP_USER # pragma: whitelist secret
|
||||
password:
|
||||
from_secret: SMTP_PASS # pragma: whitelist secret
|
||||
from: drone@iamthefij.com
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -159,4 +159,3 @@ cython_debug/
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
|
11
.pre-commit-config.yaml
Normal file
11
.pre-commit-config.yaml
Normal file
@ -0,0 +1,11 @@
|
||||
---
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.1.0
|
||||
hooks:
|
||||
- id: check-added-large-files
|
||||
- id: requirements-txt-fixer
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-merge-conflict
|
||||
- id: debug-statements
|
9
Dockerfile
Normal file
9
Dockerfile
Normal file
@ -0,0 +1,9 @@
|
||||
FROM python:3-alpine
|
||||
|
||||
RUN mkdir /scripts
|
||||
WORKDIR /scripts
|
||||
|
||||
COPY ./requirements.txt /scripts/
|
||||
RUN pip install --no-cache-dir -r /scripts/requirements.txt
|
||||
|
||||
COPY ./nomad_missing_services.py ./nomad_orphan_services.py /scripts/
|
@ -1,3 +1,12 @@
|
||||
# nomad-service-fixers
|
||||
|
||||
A few check and fixer scripts to clean up services in my running instances.
|
||||
|
||||
These make use of [requests-unixsocket](https://github.com/msabramo/requests-unixsocket) so that they can target the workload API from within a Nomad task.
|
||||
|
||||
Included scripts:
|
||||
|
||||
* `./nomad_missing_services.py`: Looks for running allocs whose services appear to have disappeared.
|
||||
* `./nomad_orphan_services.py`: Looks for services whose allocs appear to have disappeared.
|
||||
|
||||
This is on DockerHub as `iamthefij/nomad-service-fixers`.
|
||||
|
99
nomad_missing_services.py
Executable file
99
nomad_missing_services.py
Executable file
@ -0,0 +1,99 @@
|
||||
#! /usr/bin/env python3
|
||||
from argparse import ArgumentParser
|
||||
from os import environ
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import requests_unixsocket
|
||||
requests = requests_unixsocket.Session()
|
||||
|
||||
NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
|
||||
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")
|
||||
|
||||
|
||||
def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method: str = "GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    """Issue a request against the Nomad HTTP API and return the decoded body.

    Args:
        *path: Path segments joined under ``/v1/`` (e.g. ``"job", job_id``).
        params: Optional query-string parameters.
        method: HTTP method to use (default ``"GET"``).

    Returns:
        The JSON-decoded response body (list or dict), or the raw response
        text when the body is not valid JSON.

    Raises:
        HTTPError: from ``raise_for_status()`` on any non-2xx response.
    """
    headers = {}
    if NOMAD_TOKEN:
        # ACL token read from the environment at module load time.
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except ValueError:
        # BUG FIX: `requests` here is a requests_unixsocket.Session()
        # instance, which has no `.exceptions` attribute, so the original
        # `except requests.exceptions.JSONDecodeError` would itself raise
        # AttributeError whenever a non-JSON body arrived. requests'
        # JSONDecodeError subclasses ValueError, so catching ValueError
        # handles the same failure without touching the shadowed name.
        return response.text
|
||||
|
||||
|
||||
def extract_job_services(job: dict[str, Any]) -> dict[str, str]:
    """Map every service name declared by a job to its task group's name.

    Collects services registered at the group level as well as services
    registered on individual tasks inside each group.

    Args:
        job: A Nomad job detail payload (as returned by ``/v1/job/<id>``).

    Returns:
        Dict mapping service name -> name of the task group declaring it.
    """
    service_to_group: dict[str, str] = {}
    for group in job["TaskGroups"]:
        group_name = group["Name"]
        # Group-level services plus each task's services; the "Services"
        # key may be missing or explicitly null in either place.
        declared = list(group.get("Services") or [])
        for task in group["Tasks"]:
            declared.extend(task.get("Services") or [])
        for service in declared:
            service_to_group[service["Name"]] = group_name

    return service_to_group
|
||||
|
||||
# --- Script entry point: scan all non-batch jobs for declared services that
# --- Nomad no longer lists, and optionally restart the affected allocations.
exit_code = 0
parser = ArgumentParser(
    description="Checks for missing services and optionally restarts their allocs.",
)
parser.add_argument(
    "-r",
    "--restart",
    action="store_true",
    help="Restart allocs for missing services",
)
args = parser.parse_args()

for job in nomad_req("jobs"):
    job = cast(dict[str, Any], job)

    # Batch workloads come and go by design, so their services are expected
    # to disappear; skip them.
    if job["Type"] in ("batch", "sysbatch"):
        continue

    if job["Status"] != "running":
        print(f"WARNING: job {job['Name']} is {job['Status']}")
        continue

    job_detail = cast(dict[str, Any], nomad_req("job", job["ID"]))

    # Services the job spec declares, mapped to their task group name.
    expected_services = extract_job_services(job_detail)

    # Services Nomad actually has registered for this job right now.
    found_services: set[str] = set()
    for service in nomad_req("job", job_detail["ID"], "services"):
        service = cast(dict[str, Any], service)
        found_services.add(service["ServiceName"])

    missing_services = set(expected_services) - found_services
    restart_groups: set[str] = set()
    for missing_service in missing_services:
        print(f"ERROR: Missing service {missing_service} for job {job_detail['Name']}")
        exit_code = 1
        # Remember the group that declared the missing service so its
        # allocations can be restarted below.
        restart_groups.add(expected_services[missing_service])

    if not restart_groups or not args.restart:
        continue

    # Collect running allocations for the groups that lost services.
    restart_allocs: set[str] = set()
    for allocation in nomad_req("job", job_detail["ID"], "allocations"):
        allocation = cast(dict[str, Any], allocation)
        if allocation["ClientStatus"] == "running" and allocation["TaskGroup"] in restart_groups:
            restart_allocs.add(allocation["ID"])

    # Restart each affected allocation so its services get re-registered.
    for allocation in restart_allocs:
        print(f"INFO: Restarting allocation {allocation}")
        # NOTE(review): this uses nomad_req's default GET; Nomad's alloc
        # restart endpoint normally expects PUT/POST — confirm against the
        # target cluster before relying on --restart.
        nomad_req("client", "allocation", allocation, "restart")


# Non-zero exit when any service was missing, so cron/CI can alert on it.
raise SystemExit(exit_code)
|
73
nomad_orphan_services.py
Executable file
73
nomad_orphan_services.py
Executable file
@ -0,0 +1,73 @@
|
||||
#! /usr/bin/env python3
|
||||
from argparse import ArgumentParser
|
||||
from os import environ
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import requests_unixsocket
|
||||
requests = requests_unixsocket.Session()
|
||||
|
||||
|
||||
NOMAD_ADDR = environ.get("NOMAD_ADDR", "http://127.0.0.1:4646")
|
||||
NOMAD_TOKEN = environ.get("NOMAD_TOKEN")
|
||||
|
||||
|
||||
def nomad_req(
    *path: str, params: dict[str, Any] | None = None, method: str = "GET"
) -> list[dict[str, Any]] | dict[str, Any] | str:
    """Issue a request against the Nomad HTTP API and return the decoded body.

    Args:
        *path: Path segments joined under ``/v1/`` (e.g. ``"service", name``).
        params: Optional query-string parameters.
        method: HTTP method to use (default ``"GET"``).

    Returns:
        The JSON-decoded response body (list or dict), or the raw response
        text when the body is not valid JSON.

    Raises:
        HTTPError: from ``raise_for_status()`` on any non-2xx response.
    """
    headers = {}
    if NOMAD_TOKEN:
        # ACL token read from the environment at module load time.
        headers["X-Nomad-Token"] = NOMAD_TOKEN

    response = requests.request(
        method,
        f"{NOMAD_ADDR}/v1/{'/'.join(path)}",
        params=params,
        headers=headers,
    )
    response.raise_for_status()

    try:
        return response.json()
    except ValueError:
        # BUG FIX: `requests` here is a requests_unixsocket.Session()
        # instance, which has no `.exceptions` attribute, so the original
        # `except requests.exceptions.JSONDecodeError` would itself raise
        # AttributeError whenever a non-JSON body arrived. requests'
        # JSONDecodeError subclasses ValueError, so catching ValueError
        # handles the same failure without touching the shadowed name.
        return response.text
|
||||
|
||||
|
||||
# --- Script entry point: find service registrations whose allocation no
# --- longer exists and optionally delete those orphaned services.
# BUG FIX: the original handler caught `requests.exceptions.HTTPError`, but
# `requests` is a requests_unixsocket.Session() instance with no
# `.exceptions` attribute — evaluating the except clause would itself raise
# AttributeError. Import the real exception class (requests is already a
# pinned dependency in requirements.txt) instead of the shadowed name.
from requests.exceptions import HTTPError

exit_code = 0
parser = ArgumentParser(
    description="Checks for orphaned services and optionally deletes them.",
)
parser.add_argument(
    "-d",
    "--delete",
    action="store_true",
    help="Delete orphan services",
)
args = parser.parse_args()


for namespace in nomad_req("services"):
    namespace = cast(dict[str, Any], namespace)
    for service in namespace["Services"]:
        service_name = service["ServiceName"]
        for service_instance in nomad_req("service", service_name):
            service_instance = cast(dict[str, Any], service_instance)
            service_id = service_instance["ID"]
            alloc_id = service_instance["AllocID"]

            try:
                # Probe for the owning allocation; only existence matters,
                # so the response body is discarded.
                nomad_req("allocation", alloc_id)
                # Alloc exists, so this service is not orphaned.
                continue
            except HTTPError as e:
                if e.response.status_code != 404:
                    raise
                message = f"alloc {alloc_id} not found for {service_name}."
                if args.delete:
                    message += f" Deleting {service_id}"

                print(message)

            # Only reached on a 404: the allocation is gone, so the service
            # registration is an orphan.
            if args.delete:
                nomad_req("service", service_name, service_id, method="DELETE")


# NOTE(review): exit_code is never set non-zero here; consider flagging
# found orphans the way nomad_missing_services.py does — confirm intent.
raise SystemExit(exit_code)
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
requests==2.31.0
|
||||
requests-unixsocket==0.3.0
|
Loading…
Reference in New Issue
Block a user