Improvements for future restore tests

Adds a new mode that deploys without running automatic backups, so that I can test restoration without first writing a blank backup.
2025-02-24 09:53:10 -08:00 · 2025-02-24 09:53:10 -08:00 · 480fcf144c
commit 480fcf144c
parent 8907601605
5 changed files with 103 additions and 52 deletions
--- a/backups/backups.tf
+++ b/backups/backups.tf
@ -1,4 +1,5 @@
 resource "nomad_job" "backup" {
+  count = var.restoration_mode ? 0 : 1
  jobspec = templatefile("${path.module}/backup.nomad", {
    module_path = path.module,
    batch_node  = null,
--- a/backups/vars.tf
+++ b/backups/vars.tf
@ -1,3 +1,9 @@
+variable "restoration_mode" {
+  type        = bool
+  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
+  default     = false
+}
+
 variable "use_wesher" {
  type        = bool
  description = "Indicates whether or not services should expose themselves on the wesher network"
--- a/main.tf
+++ b/main.tf
@ -29,6 +29,7 @@ module "backups" {
  source = "./backups"

  use_wesher       = var.use_wesher
+  restoration_mode = var.restoration_mode

  depends_on = [module.databases, module.services, module.core]
 }
--- a/scripts/nomad_backup_job.py
+++ b/scripts/nomad_backup_job.py
@ -3,7 +3,7 @@ from os import environ
 from time import sleep
 from typing import Any
 from typing import cast
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace

 import requests

@ -65,6 +65,27 @@ def wait_for_eval_status(eval_id: str, status: str):
        eval = cast(dict[str, Any], eval)


+def restart_job(job_id: str) ->  str|None:
+    job_versions = nomad_req("job", job_id, "versions")
+    job_versions = cast(dict[str, Any], job_versions)
+
+    latest_stable_version: int = -1
+    for version in job_versions["Versions"]:
+        if version["Stable"] and version["Version"] > latest_stable_version:
+            latest_stable_version = version["Version"]
+
+    if latest_stable_version == -1:
+        print("No stable versions found")
+        return None
+
+    print(f"Reverting to version {latest_stable_version}")
+    revert = nomad_req("job", job_id, "revert", data={"JobVersion": latest_stable_version}, method="POST")
+    revert = cast(dict[str, Any], revert)
+
+    return revert["EvalID"]
+
+
+def parse_arguments() -> Namespace:
    parser = ArgumentParser(
        description="Execute one off backups and restores of services",
    )
@ -74,12 +95,18 @@ parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot
    parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
    args = parser.parse_args()

+    return args
+
+
+def main() -> int:
+    args = parse_arguments()
+
    service_name = args.service_name
    service_info = nomad_req("service", service_name, params={"choose": "1|backups"})

    if not service_info:
        print(f"Could not find service {service_name}")
-    exit(1)
+        return 1

    service_info = cast(list[dict[str, Any]], service_info)
    node_id = service_info[0]["NodeID"]
@ -87,6 +114,7 @@ job_id = service_info[0]["JobID"]

    node = nomad_req("node", node_id)
    node = cast(dict[str, Any], node)
+
    node_name = node["Name"]
    backup_job_name = f"backup-oneoff-{node_name}"

@ -96,8 +124,7 @@ if not backup_job:

    if args.extra_safe:
        print("Stopping job allocs")
-    stop_job = nomad_req("job", job_id, method="DELETE")
-    print(stop_job)
+        _ = nomad_req("job", job_id, method="DELETE")
        wait_for_job_alloc_status(job_id, "complete")

    backup_job = cast(dict[str, Any], backup_job)
@ -118,11 +145,21 @@ dispatch = nomad_req(
        method="POST",
    )
    dispatch = cast(dict[str, Any], dispatch)
-print(dispatch)

    if args.extra_safe:
        print(f"Wait for {args.action} to finish")
        wait_for_eval_status(dispatch["EvalID"], "complete")

-    print("Backup complete. Verify success and restart job")
-    # If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
+        print(f"{args.action.capitalize()} complete. Verify success and restart job")
+
+        revert_eval_id = restart_job(job_id)
+        if revert_eval_id:
+            wait_for_eval_status(revert_eval_id, "running")
+        else:
+            print("No stable versions to revert to")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
--- a/vars.tf
+++ b/vars.tf
@ -21,3 +21,9 @@ variable "use_wesher" {
  description = "Indicates whether or not services should expose themselves on the wesher network"
  default     = true
 }
+
+variable "restoration_mode" {
+  type        = bool
+  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
+  default     = false
+}