Improvements for future restore tests

A new mode to deploy without running automatic backups so that
I can test restoration without writing a blank backup.
This commit is contained in:
IamTheFij 2025-02-24 09:53:10 -08:00
parent 8907601605
commit 480fcf144c
5 changed files with 103 additions and 52 deletions

View File

@ -1,4 +1,5 @@
resource "nomad_job" "backup" { resource "nomad_job" "backup" {
count = var.restoration_mode ? 0 : 1
jobspec = templatefile("${path.module}/backup.nomad", { jobspec = templatefile("${path.module}/backup.nomad", {
module_path = path.module, module_path = path.module,
batch_node = null, batch_node = null,

View File

@ -1,3 +1,9 @@
# Restoration-mode switch. The nomad_job "backup" resource is declared with
# `count = var.restoration_mode ? 0 : 1`, so setting this to true means that
# job is not created. Defaults to false (backup job deployed as before).
# NOTE(review): this appears to be the backups-module copy of the variable — confirm.
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}
variable "use_wesher" { variable "use_wesher" {
type = bool type = bool
description = "Indicates whether or not services should expose themselves on the wesher network" description = "Indicates whether or not services should expose themselves on the wesher network"

View File

@ -28,7 +28,8 @@ module "services" {
module "backups" { module "backups" {
source = "./backups" source = "./backups"
use_wesher = var.use_wesher use_wesher = var.use_wesher
restoration_mode = var.restoration_mode
depends_on = [module.databases, module.services, module.core] depends_on = [module.databases, module.services, module.core]
} }

View File

@ -3,7 +3,7 @@ from os import environ
from time import sleep from time import sleep
from typing import Any from typing import Any
from typing import cast from typing import cast
from argparse import ArgumentParser from argparse import ArgumentParser, Namespace
import requests import requests
@ -65,64 +65,101 @@ def wait_for_eval_status(eval_id: str, status: str):
eval = cast(dict[str, Any], eval) eval = cast(dict[str, Any], eval)
parser = ArgumentParser( def restart_job(job_id: str) -> str|None:
description="Execute one off backups and restores of services", job_versions = nomad_req("job", job_id, "versions")
) job_versions = cast(dict[str, Any], job_versions)
parser.add_argument("service_name", help="Name of the service to backup or restore")
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args()
service_name = args.service_name latest_stable_version: int = -1
service_info = nomad_req("service", service_name, params={"choose": "1|backups"}) for version in job_versions["Versions"]:
if version["Stable"] and version["Version"] > latest_stable_version:
latest_stable_version = version["Version"]
if not service_info: if latest_stable_version == -1:
print(f"Could not find service {service_name}") print("No stable versions found")
exit(1) return None
service_info = cast(list[dict[str, Any]], service_info) print(f"Reverting to version {latest_stable_version}")
node_id = service_info[0]["NodeID"] revert = nomad_req("job", job_id, "revert", data={"JobVersion": latest_stable_version}, method="POST")
job_id = service_info[0]["JobID"] revert = cast(dict[str, Any], revert)
node = nomad_req("node", node_id) return revert["EvalID"]
node = cast(dict[str, Any], node)
node_name = node["Name"]
backup_job_name = f"backup-oneoff-{node_name}"
backup_job = nomad_req("job", backup_job_name)
if not backup_job:
print(f"Could not find backup job {backup_job_name} for {service_name}")
if args.extra_safe: def parse_arguments() -> Namespace:
print("Stopping job allocs") parser = ArgumentParser(
stop_job = nomad_req("job", job_id, method="DELETE") description="Execute one off backups and restores of services",
print(stop_job) )
wait_for_job_alloc_status(job_id, "complete") parser.add_argument("service_name", help="Name of the service to backup or restore")
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args()
backup_job = cast(dict[str, Any], backup_job) return args
backup_job_id = backup_job["ID"]
dispatch = nomad_req(
"job", def main() -> int:
backup_job_id, args = parse_arguments()
"dispatch",
data={ service_name = args.service_name
"Payload": None, service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
"Meta": {
"job_name": service_name, if not service_info:
"task": args.action, print(f"Could not find service {service_name}")
"snapshot": args.snapshot, return 1
service_info = cast(list[dict[str, Any]], service_info)
node_id = service_info[0]["NodeID"]
job_id = service_info[0]["JobID"]
node = nomad_req("node", node_id)
node = cast(dict[str, Any], node)
node_name = node["Name"]
backup_job_name = f"backup-oneoff-{node_name}"
backup_job = nomad_req("job", backup_job_name)
if not backup_job:
print(f"Could not find backup job {backup_job_name} for {service_name}")
if args.extra_safe:
print("Stopping job allocs")
_ = nomad_req("job", job_id, method="DELETE")
wait_for_job_alloc_status(job_id, "complete")
backup_job = cast(dict[str, Any], backup_job)
backup_job_id = backup_job["ID"]
dispatch = nomad_req(
"job",
backup_job_id,
"dispatch",
data={
"Payload": None,
"Meta": {
"job_name": service_name,
"task": args.action,
"snapshot": args.snapshot,
},
}, },
}, method="POST",
method="POST", )
) dispatch = cast(dict[str, Any], dispatch)
dispatch = cast(dict[str, Any], dispatch)
print(dispatch)
if args.extra_safe: if args.extra_safe:
print(f"Wait for {args.action} to finish") print(f"Wait for {args.action} to finish")
wait_for_eval_status(dispatch["EvalID"], "complete") wait_for_eval_status(dispatch["EvalID"], "complete")
print("Backup complete. Verify success and restart job") print(f"{args.action.capitalize()} complete. Verify success and restart job")
# If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
revert_eval_id = restart_job(job_id)
if revert_eval_id:
wait_for_eval_status(revert_eval_id, "running")
else:
print("No stable versions to revert to")
return 0
if __name__ == "__main__":
    # Propagate main()'s integer status as the process exit code. The bare
    # `exit` helper is injected by the `site` module for interactive sessions
    # and is unavailable under `python -S`; raising SystemExit has the same
    # effect without depending on it or on any import.
    raise SystemExit(main())

View File

@ -21,3 +21,9 @@ variable "use_wesher" {
description = "Indicates whether or not services should expose themselves on the wesher network" description = "Indicates whether or not services should expose themselves on the wesher network"
default = true default = true
} }
# Restoration-mode switch, off by default. The "backups" module call receives
# it as `restoration_mode = var.restoration_mode`.
# NOTE(review): presumably this is the root-level declaration that feeds the
# module input — confirm against the file layout.
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}