Improvements for future restore tests

Add a new mode that deploys without running the automatic backup jobs, so
that restoration can be tested without first writing a blank backup.
This commit is contained in:
IamTheFij 2025-02-24 09:53:10 -08:00
parent 8907601605
commit 480fcf144c
5 changed files with 103 additions and 52 deletions

View File

@ -1,4 +1,5 @@
resource "nomad_job" "backup" { resource "nomad_job" "backup" {
count = var.restoration_mode ? 0 : 1
jobspec = templatefile("${path.module}/backup.nomad", { jobspec = templatefile("${path.module}/backup.nomad", {
module_path = path.module, module_path = path.module,
batch_node = null, batch_node = null,

View File

@ -1,3 +1,9 @@
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}
variable "use_wesher" { variable "use_wesher" {
type = bool type = bool
description = "Indicates whether or not services should expose themselves on the wesher network" description = "Indicates whether or not services should expose themselves on the wesher network"

View File

@ -29,6 +29,7 @@ module "backups" {
source = "./backups" source = "./backups"
use_wesher = var.use_wesher use_wesher = var.use_wesher
restoration_mode = var.restoration_mode
depends_on = [module.databases, module.services, module.core] depends_on = [module.databases, module.services, module.core]
} }

View File

@ -3,7 +3,7 @@ from os import environ
from time import sleep from time import sleep
from typing import Any from typing import Any
from typing import cast from typing import cast
from argparse import ArgumentParser from argparse import ArgumentParser, Namespace
import requests import requests
@ -65,45 +65,72 @@ def wait_for_eval_status(eval_id: str, status: str):
eval = cast(dict[str, Any], eval) eval = cast(dict[str, Any], eval)
parser = ArgumentParser( def restart_job(job_id: str) -> str|None:
job_versions = nomad_req("job", job_id, "versions")
job_versions = cast(dict[str, Any], job_versions)
latest_stable_version: int = -1
for version in job_versions["Versions"]:
if version["Stable"] and version["Version"] > latest_stable_version:
latest_stable_version = version["Version"]
if latest_stable_version == -1:
print("No stable versions found")
return None
print(f"Reverting to version {latest_stable_version}")
revert = nomad_req("job", job_id, "revert", data={"JobVersion": latest_stable_version}, method="POST")
revert = cast(dict[str, Any], revert)
return revert["EvalID"]
def parse_arguments() -> Namespace:
parser = ArgumentParser(
description="Execute one off backups and restores of services", description="Execute one off backups and restores of services",
) )
parser.add_argument("service_name", help="Name of the service to backup or restore") parser.add_argument("service_name", help="Name of the service to backup or restore")
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore") parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action") parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first") parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args() args = parser.parse_args()
service_name = args.service_name return args
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
if not service_info:
def main() -> int:
args = parse_arguments()
service_name = args.service_name
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
if not service_info:
print(f"Could not find service {service_name}") print(f"Could not find service {service_name}")
exit(1) return 1
service_info = cast(list[dict[str, Any]], service_info) service_info = cast(list[dict[str, Any]], service_info)
node_id = service_info[0]["NodeID"] node_id = service_info[0]["NodeID"]
job_id = service_info[0]["JobID"] job_id = service_info[0]["JobID"]
node = nomad_req("node", node_id) node = nomad_req("node", node_id)
node = cast(dict[str, Any], node) node = cast(dict[str, Any], node)
node_name = node["Name"]
backup_job_name = f"backup-oneoff-{node_name}"
backup_job = nomad_req("job", backup_job_name) node_name = node["Name"]
if not backup_job: backup_job_name = f"backup-oneoff-{node_name}"
backup_job = nomad_req("job", backup_job_name)
if not backup_job:
print(f"Could not find backup job {backup_job_name} for {service_name}") print(f"Could not find backup job {backup_job_name} for {service_name}")
if args.extra_safe: if args.extra_safe:
print("Stopping job allocs") print("Stopping job allocs")
stop_job = nomad_req("job", job_id, method="DELETE") _ = nomad_req("job", job_id, method="DELETE")
print(stop_job)
wait_for_job_alloc_status(job_id, "complete") wait_for_job_alloc_status(job_id, "complete")
backup_job = cast(dict[str, Any], backup_job) backup_job = cast(dict[str, Any], backup_job)
backup_job_id = backup_job["ID"] backup_job_id = backup_job["ID"]
dispatch = nomad_req( dispatch = nomad_req(
"job", "job",
backup_job_id, backup_job_id,
"dispatch", "dispatch",
@ -116,13 +143,23 @@ dispatch = nomad_req(
}, },
}, },
method="POST", method="POST",
) )
dispatch = cast(dict[str, Any], dispatch) dispatch = cast(dict[str, Any], dispatch)
print(dispatch)
if args.extra_safe: if args.extra_safe:
print(f"Wait for {args.action} to finish") print(f"Wait for {args.action} to finish")
wait_for_eval_status(dispatch["EvalID"], "complete") wait_for_eval_status(dispatch["EvalID"], "complete")
print("Backup complete. Verify success and restart job") print(f"{args.action.capitalize()} complete. Verify success and restart job")
# To restart automatically, "revert" the job to its most recent stable version, since the newest version (n) will be the one that was just stopped
revert_eval_id = restart_job(job_id)
if revert_eval_id:
wait_for_eval_status(revert_eval_id, "running")
else:
print("No stable versions to revert to")
return 0
if __name__ == "__main__":
exit(main())

View File

@ -21,3 +21,9 @@ variable "use_wesher" {
description = "Indicates whether or not services should expose themselves on the wesher network" description = "Indicates whether or not services should expose themselves on the wesher network"
default = true default = true
} }
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}