Improvements for future restore tests

A new mode to deploy without running automatic backups so that
I can test restoration without writing a blank backup.
This commit is contained in:
IamTheFij 2025-02-24 09:53:10 -08:00
parent 8907601605
commit 480fcf144c
5 changed files with 103 additions and 52 deletions

View File

@ -1,4 +1,5 @@
resource "nomad_job" "backup" {
count = var.restoration_mode ? 0 : 1
jobspec = templatefile("${path.module}/backup.nomad", {
module_path = path.module,
batch_node = null,

View File

@ -1,3 +1,9 @@
# Toggle for restore testing: when true, scheduled backup jobs are not
# started, so restored data is not overwritten. Off by default.
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}
variable "use_wesher" {
type = bool
description = "Indicates whether or not services should expose themselves on the wesher network"

View File

@ -29,6 +29,7 @@ module "backups" {
source = "./backups"
use_wesher = var.use_wesher
restoration_mode = var.restoration_mode
depends_on = [module.databases, module.services, module.core]
}

View File

@ -3,7 +3,7 @@ from os import environ
from time import sleep
from typing import Any
from typing import cast
from argparse import ArgumentParser
from argparse import ArgumentParser, Namespace
import requests
@ -65,6 +65,27 @@ def wait_for_eval_status(eval_id: str, status: str):
eval = cast(dict[str, Any], eval)
def restart_job(job_id: str) -> str|None:
    """Restart a job by reverting it to its newest stable version.

    Fetches the job's version history from Nomad, selects the highest
    version number that is flagged ``Stable`` and submits a revert request
    for it.

    Args:
        job_id: ID of the Nomad job to restart.

    Returns:
        The ``EvalID`` reported by the revert request, or ``None`` when the
        job has no stable version to revert to.
    """
    job_versions = nomad_req("job", job_id, "versions")
    job_versions = cast(dict[str, Any], job_versions)

    # "Versions" may be absent or null, and an entry may lack "Stable";
    # treat both as "nothing stable" instead of raising.
    stable_versions = [
        version["Version"]
        for version in job_versions.get("Versions") or []
        if version.get("Stable")
    ]
    if not stable_versions:
        print("No stable versions found")
        return None

    latest_stable_version: int = max(stable_versions)
    print(f"Reverting to version {latest_stable_version}")

    # The revert endpoint requires the job ID in the request body as well,
    # and it must match the ID in the URL.
    revert = nomad_req(
        "job",
        job_id,
        "revert",
        data={"JobID": job_id, "JobVersion": latest_stable_version},
        method="POST",
    )
    revert = cast(dict[str, Any], revert)

    return revert["EvalID"]
def parse_arguments() -> Namespace:
parser = ArgumentParser(
description="Execute one off backups and restores of services",
)
@ -74,12 +95,18 @@ parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
args = parser.parse_args()
return args
def main() -> int:
args = parse_arguments()
service_name = args.service_name
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
if not service_info:
print(f"Could not find service {service_name}")
exit(1)
return 1
service_info = cast(list[dict[str, Any]], service_info)
node_id = service_info[0]["NodeID"]
@ -87,6 +114,7 @@ job_id = service_info[0]["JobID"]
node = nomad_req("node", node_id)
node = cast(dict[str, Any], node)
node_name = node["Name"]
backup_job_name = f"backup-oneoff-{node_name}"
@ -96,8 +124,7 @@ if not backup_job:
if args.extra_safe:
print("Stopping job allocs")
stop_job = nomad_req("job", job_id, method="DELETE")
print(stop_job)
_ = nomad_req("job", job_id, method="DELETE")
wait_for_job_alloc_status(job_id, "complete")
backup_job = cast(dict[str, Any], backup_job)
@ -118,11 +145,21 @@ dispatch = nomad_req(
method="POST",
)
dispatch = cast(dict[str, Any], dispatch)
print(dispatch)
if args.extra_safe:
print(f"Wait for {args.action} to finish")
wait_for_eval_status(dispatch["EvalID"], "complete")
print("Backup complete. Verify success and restart job")
# Auto restart: "revert" to the latest stable job version, since the newest version (n) will be the recently stopped one
print(f"{args.action.capitalize()} complete. Verify success and restart job")
revert_eval_id = restart_job(job_id)
if revert_eval_id:
wait_for_eval_status(revert_eval_id, "running")
else:
print("No stable versions to revert to")
return 0
if __name__ == "__main__":
exit(main())

View File

@ -21,3 +21,9 @@ variable "use_wesher" {
description = "Indicates whether or not services should expose themselves on the wesher network"
default = true
}
# When set to true, scheduled backup jobs are not started, which allows a
# restore to be tested without good data being overwritten. Defaults to false.
variable "restoration_mode" {
type = bool
description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
default = false
}