diff --git a/backups/backups.tf b/backups/backups.tf
index 739b819..0b80ed1 100644
--- a/backups/backups.tf
+++ b/backups/backups.tf
@@ -1,4 +1,5 @@
 resource "nomad_job" "backup" {
+  count = var.restoration_mode ? 0 : 1
   jobspec = templatefile("${path.module}/backup.nomad", {
     module_path = path.module,
     batch_node  = null,
diff --git a/backups/vars.tf b/backups/vars.tf
index aa5858a..e4defee 100644
--- a/backups/vars.tf
+++ b/backups/vars.tf
@@ -1,3 +1,9 @@
+variable "restoration_mode" {
+  type        = bool
+  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
+  default     = false
+}
+
 variable "use_wesher" {
   type        = bool
   description = "Indicates whether or not services should expose themselves on the wesher network"
diff --git a/main.tf b/main.tf
index 5d85952..581f501 100644
--- a/main.tf
+++ b/main.tf
@@ -28,7 +28,8 @@ module "services" {
 module "backups" {
   source = "./backups"
 
-  use_wesher = var.use_wesher
+  use_wesher       = var.use_wesher
+  restoration_mode = var.restoration_mode
 
   depends_on = [module.databases, module.services, module.core]
 }
diff --git a/scripts/nomad_backup_job.py b/scripts/nomad_backup_job.py
index 12023ae..2f5842c 100755
--- a/scripts/nomad_backup_job.py
+++ b/scripts/nomad_backup_job.py
@@ -3,7 +3,7 @@ from os import environ
 from time import sleep
 from typing import Any
 from typing import cast
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace
 
 import requests
 
@@ -65,64 +65,125 @@ def wait_for_eval_status(eval_id: str, status: str):
         eval = cast(dict[str, Any], eval)
 
 
-parser = ArgumentParser(
-    description="Execute one off backups and restores of services",
-)
-parser.add_argument("service_name", help="Name of the service to backup or restore")
-parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
-parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
-parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
-args = parser.parse_args()
+def restart_job(job_id: str) -> str|None:
+    """Restart a job by reverting it to its latest stable version.
+
+    Returns the ID of the evaluation created by the revert, or None when the
+    job has no stable version to revert to.
+    """
+    job_versions = nomad_req("job", job_id, "versions")
+    job_versions = cast(dict[str, Any], job_versions)
 
-service_name = args.service_name
-service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
+    latest_stable_version: int = -1
+    for version in job_versions["Versions"]:
+        if version["Stable"] and version["Version"] > latest_stable_version:
+            latest_stable_version = version["Version"]
 
-if not service_info:
-    print(f"Could not find service {service_name}")
-    exit(1)
+    if latest_stable_version == -1:
+        print("No stable versions found")
+        return None
 
-service_info = cast(list[dict[str, Any]], service_info)
-node_id = service_info[0]["NodeID"]
-job_id = service_info[0]["JobID"]
+    print(f"Reverting to version {latest_stable_version}")
+    # Nomad's revert endpoint expects the job ID in the request body in
+    # addition to the URL path.
+    revert = nomad_req(
+        "job",
+        job_id,
+        "revert",
+        data={"JobID": job_id, "JobVersion": latest_stable_version},
+        method="POST",
+    )
+    revert = cast(dict[str, Any], revert)
 
-node = nomad_req("node", node_id)
-node = cast(dict[str, Any], node)
-node_name = node["Name"]
-backup_job_name = f"backup-oneoff-{node_name}"
+    return revert["EvalID"]
 
-backup_job = nomad_req("job", backup_job_name)
-if not backup_job:
-    print(f"Could not find backup job {backup_job_name} for {service_name}")
 
-if args.extra_safe:
-    print("Stopping job allocs")
-    stop_job = nomad_req("job", job_id, method="DELETE")
-    print(stop_job)
-    wait_for_job_alloc_status(job_id, "complete")
+def parse_arguments() -> Namespace:
+    """Parse the command line arguments for a one off backup or restore."""
+    parser = ArgumentParser(
+        description="Execute one off backups and restores of services",
+    )
+    parser.add_argument("service_name", help="Name of the service to backup or restore")
+    parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
+    parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
+    parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stopping target job first")
+    args = parser.parse_args()
 
-backup_job = cast(dict[str, Any], backup_job)
-backup_job_id = backup_job["ID"]
+    return args
 
-dispatch = nomad_req(
-    "job",
-    backup_job_id,
-    "dispatch",
-    data={
-        "Payload": None,
-        "Meta": {
-            "job_name": service_name,
-            "task": args.action,
-            "snapshot": args.snapshot,
+
+def main() -> int:
+    """Dispatch a one off backup or restore job for a service.
+
+    Returns the process exit code: 0 on success, 1 when the service or its
+    backup job cannot be found.
+    """
+    args = parse_arguments()
+
+    service_name = args.service_name
+    service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
+
+    if not service_info:
+        print(f"Could not find service {service_name}")
+        return 1
+
+    service_info = cast(list[dict[str, Any]], service_info)
+    node_id = service_info[0]["NodeID"]
+    job_id = service_info[0]["JobID"]
+
+    node = nomad_req("node", node_id)
+    node = cast(dict[str, Any], node)
+
+    node_name = node["Name"]
+    backup_job_name = f"backup-oneoff-{node_name}"
+
+    backup_job = nomad_req("job", backup_job_name)
+    if not backup_job:
+        print(f"Could not find backup job {backup_job_name} for {service_name}")
+        # Nothing can be dispatched without the backup job, so stop before the
+        # target service is touched.
+        return 1
+
+    if args.extra_safe:
+        print("Stopping job allocs")
+        _ = nomad_req("job", job_id, method="DELETE")
+        wait_for_job_alloc_status(job_id, "complete")
+
+    backup_job = cast(dict[str, Any], backup_job)
+    backup_job_id = backup_job["ID"]
+
+    dispatch = nomad_req(
+        "job",
+        backup_job_id,
+        "dispatch",
+        data={
+            "Payload": None,
+            "Meta": {
+                "job_name": service_name,
+                "task": args.action,
+                "snapshot": args.snapshot,
+            },
         },
-    },
-    method="POST",
-)
-dispatch = cast(dict[str, Any], dispatch)
-print(dispatch)
+        method="POST",
+    )
+    dispatch = cast(dict[str, Any], dispatch)
 
-if args.extra_safe:
-    print(f"Wait for {args.action} to finish")
-    wait_for_eval_status(dispatch["EvalID"], "complete")
+    if args.extra_safe:
+        print(f"Wait for {args.action} to finish")
+        wait_for_eval_status(dispatch["EvalID"], "complete")
 
-    print("Backup complete. Verify success and restart job")
-    # If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
+        print(f"{args.action.capitalize()} complete. Verify success and restart job")
+
+        revert_eval_id = restart_job(job_id)
+        if revert_eval_id:
+            # Nomad evaluations finish as "complete"; they never report
+            # "running", so waiting for that status would not return.
+            wait_for_eval_status(revert_eval_id, "complete")
+        else:
+            print("No stable versions to revert to")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/vars.tf b/vars.tf
index 63f2d6f..af4875c 100644
--- a/vars.tf
+++ b/vars.tf
@@ -21,3 +21,9 @@ variable "use_wesher" {
   description = "Indicates whether or not services should expose themselves on the wesher network"
   default     = true
 }
+
+variable "restoration_mode" {
+  type        = bool
+  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
+  default     = false
+}