Improvements for future restore tests
A new mode to deploy without running automatic backups so that I can test restoration without writing a blank backup.
This commit is contained in:
parent
8907601605
commit
480fcf144c
@ -1,4 +1,5 @@
|
||||
resource "nomad_job" "backup" {
|
||||
count = var.restoration_mode ? 0 : 1
|
||||
jobspec = templatefile("${path.module}/backup.nomad", {
|
||||
module_path = path.module,
|
||||
batch_node = null,
|
||||
|
@ -1,3 +1,9 @@
|
||||
# Restore-testing switch for the backups module; see the description for intent.
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}
|
||||
|
||||
variable "use_wesher" {
|
||||
type = bool
|
||||
description = "Indicates whether or not services should expose themselves on the wesher network"
|
||||
|
1
main.tf
1
main.tf
@ -29,6 +29,7 @@ module "backups" {
|
||||
source = "./backups"
|
||||
|
||||
use_wesher = var.use_wesher
|
||||
restoration_mode = var.restoration_mode
|
||||
|
||||
depends_on = [module.databases, module.services, module.core]
|
||||
}
|
||||
|
@ -3,7 +3,7 @@ from os import environ
|
||||
from time import sleep
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from argparse import ArgumentParser
|
||||
from argparse import ArgumentParser, Namespace
|
||||
|
||||
import requests
|
||||
|
||||
@ -65,6 +65,27 @@ def wait_for_eval_status(eval_id: str, status: str):
|
||||
eval = cast(dict[str, Any], eval)
|
||||
|
||||
|
||||
def restart_job(job_id: str) -> str|None:
    """Restart a job by reverting it to its newest stable version.

    Fetches the job's version list through ``nomad_req``, selects the highest
    ``Version`` whose ``Stable`` flag is set, and posts a revert request for
    that version.

    Args:
        job_id: ID of the Nomad job to revert.

    Returns:
        The ``EvalID`` from the revert response, or ``None`` when the job has
        no stable version to revert to.
    """
    job_versions = nomad_req("job", job_id, "versions")
    job_versions = cast(dict[str, Any], job_versions)

    # Tolerate a missing or null "Versions" entry instead of raising.
    versions = job_versions.get("Versions") or []
    latest_stable_version: int | None = max(
        (version["Version"] for version in versions if version["Stable"]),
        default=None,
    )

    if latest_stable_version is None:
        print("No stable versions found")
        return None

    print(f"Reverting to version {latest_stable_version}")
    # Nomad's revert endpoint documents JobID as a required body field that
    # must match the job ID in the request path, so send it with the version.
    revert = nomad_req(
        "job",
        job_id,
        "revert",
        data={"JobID": job_id, "JobVersion": latest_stable_version},
        method="POST",
    )
    revert = cast(dict[str, Any], revert)

    return revert["EvalID"]
|
||||
|
||||
|
||||
def parse_arguments() -> Namespace:
|
||||
parser = ArgumentParser(
|
||||
description="Execute one off backups and restores of services",
|
||||
)
|
||||
@ -74,12 +95,18 @@ parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot
|
||||
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_arguments()
|
||||
|
||||
service_name = args.service_name
|
||||
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
|
||||
|
||||
if not service_info:
|
||||
print(f"Could not find service {service_name}")
|
||||
exit(1)
|
||||
return 1
|
||||
|
||||
service_info = cast(list[dict[str, Any]], service_info)
|
||||
node_id = service_info[0]["NodeID"]
|
||||
@ -87,6 +114,7 @@ job_id = service_info[0]["JobID"]
|
||||
|
||||
node = nomad_req("node", node_id)
|
||||
node = cast(dict[str, Any], node)
|
||||
|
||||
node_name = node["Name"]
|
||||
backup_job_name = f"backup-oneoff-{node_name}"
|
||||
|
||||
@ -96,8 +124,7 @@ if not backup_job:
|
||||
|
||||
if args.extra_safe:
|
||||
print("Stopping job allocs")
|
||||
stop_job = nomad_req("job", job_id, method="DELETE")
|
||||
print(stop_job)
|
||||
_ = nomad_req("job", job_id, method="DELETE")
|
||||
wait_for_job_alloc_status(job_id, "complete")
|
||||
|
||||
backup_job = cast(dict[str, Any], backup_job)
|
||||
@ -118,11 +145,21 @@ dispatch = nomad_req(
|
||||
method="POST",
|
||||
)
|
||||
dispatch = cast(dict[str, Any], dispatch)
|
||||
print(dispatch)
|
||||
|
||||
if args.extra_safe:
|
||||
print(f"Wait for {args.action} to finish")
|
||||
wait_for_eval_status(dispatch["EvalID"], "complete")
|
||||
|
||||
print("Backup complete. Verify success and restart job")
|
||||
# If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
|
||||
print(f"{args.action.capitalize()} complete. Verify success and restart job")
|
||||
|
||||
revert_eval_id = restart_job(job_id)
|
||||
if revert_eval_id:
|
||||
wait_for_eval_status(revert_eval_id, "running")
|
||||
else:
|
||||
print("No stable versions to revert to")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
|
6
vars.tf
6
vars.tf
@ -21,3 +21,9 @@ variable "use_wesher" {
|
||||
description = "Indicates whether or not services should expose themselves on the wesher network"
|
||||
default = true
|
||||
}
|
||||
|
||||
# Restore-testing switch; see the description for intent.
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user