Improvements for future restore tests

A new mode to deploy without running automatic backups so that
I can test restoration without writing a blank backup.
This commit is contained in:
IamTheFij 2025-02-24 09:53:10 -08:00
parent 8907601605
commit 480fcf144c
5 changed files with 103 additions and 52 deletions

View File

@ -1,4 +1,5 @@
resource "nomad_job" "backup" {
count = var.restoration_mode ? 0 : 1
jobspec = templatefile("${path.module}/backup.nomad", {
module_path = path.module,
batch_node = null,

View File

@ -1,3 +1,9 @@
# Boolean switch for restore testing; disabled unless the caller sets it.
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}
variable "use_wesher" {
type = bool
description = "Indicates whether or not services should expose themselves on the wesher network"

View File

@ -28,7 +28,8 @@ module "services" {
# Backup jobs; deployed only after databases, services and core are in place.
module "backups" {
  source = "./backups"

  use_wesher = var.use_wesher
  # Forwarded so the module can hold back scheduled backup jobs while a
  # restore is being tested (see the variable's description).
  restoration_mode = var.restoration_mode

  depends_on = [module.databases, module.services, module.core]
}

View File

@ -3,7 +3,7 @@ from os import environ
from time import sleep
from typing import Any
from typing import cast
from argparse import ArgumentParser
from argparse import ArgumentParser, Namespace
import requests
@ -65,64 +65,101 @@ def wait_for_eval_status(eval_id: str, status: str):
eval = cast(dict[str, Any], eval)
# Module-level CLI setup: builds the argument parser and parses the command
# line as soon as this code runs.
# NOTE(review): the same parser definition also appears inside
# parse_arguments() further down; this copy looks like the pre-refactor
# version -- confirm which one should remain.
parser = ArgumentParser(
description="Execute one off backups and restores of services",
)
parser.add_argument("service_name", help="Name of the service to backup or restore")
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args()
def restart_job(job_id: str) -> str|None:
    """Restart a job by reverting it to its newest stable version.

    Fetches the job's version history, selects the highest version number
    flagged ``Stable`` and posts a revert request for it.

    Args:
        job_id: ID of the Nomad job to restart.

    Returns:
        The ``EvalID`` from the revert response, or ``None`` when no version
        history or no stable version is available.
    """
    job_versions = nomad_req("job", job_id, "versions")
    if not job_versions:
        # Callers treat a falsy nomad_req result as "not found"; do the same
        # here instead of failing on the subscript below.
        print(f"Could not find versions for job {job_id}")
        return None
    job_versions = cast(dict[str, Any], job_versions)

    stable_versions = [
        version["Version"]
        for version in job_versions["Versions"]
        if version["Stable"]
    ]
    if not stable_versions:
        print("No stable versions found")
        return None

    # The newest entry in the history is presumably the stop itself, so the
    # target is the newest version that was marked stable rather than simply
    # the latest one.
    latest_stable_version: int = max(stable_versions)

    print(f"Reverting to version {latest_stable_version}")
    revert = nomad_req("job", job_id, "revert", data={"JobVersion": latest_stable_version}, method="POST")
    revert = cast(dict[str, Any], revert)

    return revert["EvalID"]
# Look up the one-off backup job by the name derived from the node.
# NOTE(review): these module-level statements repeat steps that also appear
# inside main(); they look like the pre-refactor version -- confirm which
# copy should remain.
backup_job = nomad_req("job", backup_job_name)
if not backup_job:
# NOTE(review): only a message is printed here; nothing visible stops
# execution afterwards -- confirm that is intended.
print(f"Could not find backup job {backup_job_name} for {service_name}")
if args.extra_safe:
# Extra-safe mode: issue a DELETE for the target job, then wait on its
# allocations with the "complete" status before continuing.
print("Stopping job allocs")
stop_job = nomad_req("job", job_id, method="DELETE")
print(stop_job)
wait_for_job_alloc_status(job_id, "complete")
def parse_arguments() -> Namespace:
parser = ArgumentParser(
description="Execute one off backups and restores of services",
)
parser.add_argument("service_name", help="Name of the service to backup or restore")
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args()
backup_job = cast(dict[str, Any], backup_job)
backup_job_id = backup_job["ID"]
return args
dispatch = nomad_req(
"job",
backup_job_id,
"dispatch",
data={
"Payload": None,
"Meta": {
"job_name": service_name,
"task": args.action,
"snapshot": args.snapshot,
def main() -> int:
    """Dispatch a one-off backup or restore for a service.

    Resolves the service to its node and job, finds the one-off backup job
    named after that node and dispatches it with the requested action and
    snapshot. With ``--extra-safe`` the target job is stopped first, the
    dispatch is waited on, and the job is then restarted.

    Returns:
        Exit status: ``0`` on success, ``1`` when the service or its backup
        job cannot be found.
    """
    args = parse_arguments()

    service_name = args.service_name
    service_info = nomad_req("service", service_name, params={"choose": "1|backups"})

    if not service_info:
        print(f"Could not find service {service_name}")
        return 1

    service_info = cast(list[dict[str, Any]], service_info)
    node_id = service_info[0]["NodeID"]
    job_id = service_info[0]["JobID"]

    node = nomad_req("node", node_id)
    node = cast(dict[str, Any], node)
    node_name = node["Name"]

    backup_job_name = f"backup-oneoff-{node_name}"
    backup_job = nomad_req("job", backup_job_name)

    if not backup_job:
        print(f"Could not find backup job {backup_job_name} for {service_name}")
        # Bail out before touching the target job: continuing would stop the
        # service (in extra-safe mode) and then fail on the lookup below.
        return 1

    backup_job = cast(dict[str, Any], backup_job)
    backup_job_id = backup_job["ID"]

    if args.extra_safe:
        print("Stopping job allocs")
        _ = nomad_req("job", job_id, method="DELETE")
        wait_for_job_alloc_status(job_id, "complete")

    dispatch = nomad_req(
        "job",
        backup_job_id,
        "dispatch",
        data={
            "Payload": None,
            "Meta": {
                "job_name": service_name,
                "task": args.action,
                "snapshot": args.snapshot,
            },
        },
        method="POST",
    )
    dispatch = cast(dict[str, Any], dispatch)

    # NOTE(review): the restart is kept under extra-safe because that is the
    # only path that stops the job -- confirm against the intended nesting.
    if args.extra_safe:
        print(f"Wait for {args.action} to finish")
        wait_for_eval_status(dispatch["EvalID"], "complete")

        print(f"{args.action.capitalize()} complete. Verify success and restart job")
        revert_eval_id = restart_job(job_id)
        if revert_eval_id:
            wait_for_eval_status(revert_eval_id, "running")
        else:
            print("No stable versions to revert to")

    return 0
if __name__ == "__main__":
    # Raise SystemExit directly: the bare `exit` helper is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist.
    raise SystemExit(main())

View File

@ -21,3 +21,9 @@ variable "use_wesher" {
description = "Indicates whether or not services should expose themselves on the wesher network"
default = true
}
# Opt-in flag for restore testing; defaults to normal operation (false).
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}