Improvements for future restore tests

A new mode to deploy without running automatic backups so that I can test restoration without writing a blank backup.
2025-02-24 09:53:10 -08:00 · 2025-02-24 09:53:10 -08:00 · 480fcf144c
commit 480fcf144c
parent 8907601605
5 changed files with 103 additions and 52 deletions
--- a/backups/backups.tf
+++ b/backups/backups.tf
@ -1,4 +1,5 @@
 resource "nomad_job" "backup" {
  count = var.restoration_mode ? 0 : 1
  jobspec = templatefile("${path.module}/backup.nomad", {
    module_path = path.module,
    batch_node  = null,
--- a/backups/vars.tf
+++ b/backups/vars.tf
@ -1,3 +1,9 @@
 # Module input that gates the scheduled backup job: the nomad_job "backup"
 # resource sets its count to 0 when this is true and to 1 otherwise.
 variable "restoration_mode" {
  type        = bool
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  default     = false
 }
 variable "use_wesher" {
  type        = bool
  description = "Indicates whether or not services should expose themselves on the wesher network"
--- a/main.tf
+++ b/main.tf
@ -29,6 +29,7 @@ module "backups" {
  source = "./backups"
  use_wesher       = var.use_wesher
  restoration_mode = var.restoration_mode
  depends_on = [module.databases, module.services, module.core]
 }
--- a/scripts/nomad_backup_job.py
+++ b/scripts/nomad_backup_job.py
@ -3,7 +3,7 @@ from os import environ
 from time import sleep
 from typing import Any
 from typing import cast
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace
 import requests
@ -65,45 +65,72 @@ def wait_for_eval_status(eval_id: str, status: str):
        eval = cast(dict[str, Any], eval)
-parser = ArgumentParser(
+def restart_job(job_id: str) ->  str|None:
    # Restart a job by reverting it to its highest-numbered stable version.
    #
    # Fetches the job's version list, picks the largest "Version" among the
    # entries flagged "Stable", and POSTs a revert request for that version.
    # Returns the "EvalID" from the revert response, or None (after printing
    # a message) when no entry is flagged stable.
    job_versions = nomad_req("job", job_id, "versions")
    job_versions = cast(dict[str, Any], job_versions)
    # -1 is a sentinel: it is still -1 after the loop if nothing was stable.
    latest_stable_version: int = -1
    for version in job_versions["Versions"]:
        if version["Stable"] and version["Version"] > latest_stable_version:
            latest_stable_version = version["Version"]
    if latest_stable_version == -1:
        print("No stable versions found")
        return None
    print(f"Reverting to version {latest_stable_version}")
    # NOTE(review): Nomad's job revert endpoint documents "JobID" as a payload
    # field alongside "JobVersion"; only "JobVersion" is sent here. Confirm
    # whether nomad_req fills it in or whether it must be added to `data`.
    revert = nomad_req("job", job_id, "revert", data={"JobVersion": latest_stable_version}, method="POST")
    revert = cast(dict[str, Any], revert)
    return revert["EvalID"]
 def parse_arguments() -> Namespace:
    # Build the command-line parser and return the parsed arguments.
    #
    # Accepted arguments:
    #   service_name      positional; the service to back up or restore
    #   -a / --action     "backup" (default) or "restore"
    #   -s / --snapshot   snapshot to restore; defaults to "latest"
    #   -x / --extra-safe flag; stop the target job before acting
    parser = ArgumentParser(
        description="Execute one off backups and restores of services",
-)
+    )
-parser.add_argument("service_name", help="Name of the service to backup or restore")
+    parser.add_argument("service_name", help="Name of the service to backup or restore")
-parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
+    parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
-parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
+    parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
-parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
+    parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
-args = parser.parse_args()
+    args = parser.parse_args()
-service_name = args.service_name
+    return args
 service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
-if not service_info:
+
 def main() -> int:
    args = parse_arguments()
    service_name = args.service_name
    service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
    if not service_info:
        print(f"Could not find service {service_name}")
-    exit(1)
+        return 1
-service_info = cast(list[dict[str, Any]], service_info)
+    service_info = cast(list[dict[str, Any]], service_info)
-node_id = service_info[0]["NodeID"]
+    node_id = service_info[0]["NodeID"]
-job_id = service_info[0]["JobID"]
+    job_id = service_info[0]["JobID"]
-node = nomad_req("node", node_id)
+    node = nomad_req("node", node_id)
-node = cast(dict[str, Any], node)
+    node = cast(dict[str, Any], node)
 node_name = node["Name"]
 backup_job_name = f"backup-oneoff-{node_name}"
-backup_job = nomad_req("job", backup_job_name)
+    node_name = node["Name"]
-if not backup_job:
+    backup_job_name = f"backup-oneoff-{node_name}"
    backup_job = nomad_req("job", backup_job_name)
    if not backup_job:
        print(f"Could not find backup job {backup_job_name} for {service_name}")
-if args.extra_safe:
+    if args.extra_safe:
        print("Stopping job allocs")
-    stop_job = nomad_req("job", job_id, method="DELETE")
+        _ = nomad_req("job", job_id, method="DELETE")
    print(stop_job)
        wait_for_job_alloc_status(job_id, "complete")
-backup_job = cast(dict[str, Any], backup_job)
+    backup_job = cast(dict[str, Any], backup_job)
-backup_job_id = backup_job["ID"]
+    backup_job_id = backup_job["ID"]
-dispatch = nomad_req(
+    dispatch = nomad_req(
        "job",
        backup_job_id,
        "dispatch",
@ -116,13 +143,23 @@ dispatch = nomad_req(
            },
        },
        method="POST",
-)
+    )
-dispatch = cast(dict[str, Any], dispatch)
+    dispatch = cast(dict[str, Any], dispatch)
 print(dispatch)
-if args.extra_safe:
+    if args.extra_safe:
        print(f"Wait for {args.action} to finish")
        wait_for_eval_status(dispatch["EvalID"], "complete")
-    print("Backup complete. Verify success and restart job")
+        print(f"{args.action.capitalize()} complete. Verify success and restart job")
-    # If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
+
        revert_eval_id = restart_job(job_id)
        if revert_eval_id:
            wait_for_eval_status(revert_eval_id, "running")
        else:
            print("No stable versions to revert to")
    return 0
 # Script entry point: main() returns an int (1 when the service cannot be
 # found, 0 at the end of a run) which is passed to exit() as the status.
 if __name__ == "__main__":
    exit(main())
--- a/vars.tf
+++ b/vars.tf
@ -21,3 +21,9 @@ variable "use_wesher" {
  description = "Indicates whether or not services should expose themselves on the wesher network"
  default     = true
 }
 # Root-level switch; main.tf forwards it to the "backups" module as that
 # module's restoration_mode input.
 variable "restoration_mode" {
  type        = bool
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  default     = false
 }