Improvements for future restore tests
A new mode to deploy without running automatic backups so that I can test restoration without writing a blank backup.
This commit is contained in:
parent
8907601605
commit
480fcf144c
@ -1,4 +1,5 @@
|
|||||||
resource "nomad_job" "backup" {
|
resource "nomad_job" "backup" {
|
||||||
|
count = var.restoration_mode ? 0 : 1
|
||||||
jobspec = templatefile("${path.module}/backup.nomad", {
|
jobspec = templatefile("${path.module}/backup.nomad", {
|
||||||
module_path = path.module,
|
module_path = path.module,
|
||||||
batch_node = null,
|
batch_node = null,
|
||||||
|
@ -1,3 +1,9 @@
|
|||||||
|
# Switch for restore testing. In this commit the backup nomad_job resource
# uses it in its count expression (`var.restoration_mode ? 0 : 1`), so a true
# value means that job is not created.
# NOTE(review): presumably this is the backups module's own vars file; confirm.
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}
|
||||||
|
|
||||||
variable "use_wesher" {
|
variable "use_wesher" {
|
||||||
type = bool
|
type = bool
|
||||||
description = "Indicates whether or not services should expose themselves on the wesher network"
|
description = "Indicates whether or not services should expose themselves on the wesher network"
|
||||||
|
1
main.tf
1
main.tf
@ -29,6 +29,7 @@ module "backups" {
|
|||||||
source = "./backups"
|
source = "./backups"
|
||||||
|
|
||||||
use_wesher = var.use_wesher
|
use_wesher = var.use_wesher
|
||||||
|
restoration_mode = var.restoration_mode
|
||||||
|
|
||||||
depends_on = [module.databases, module.services, module.core]
|
depends_on = [module.databases, module.services, module.core]
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ from os import environ
|
|||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser, Namespace
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@ -65,45 +65,72 @@ def wait_for_eval_status(eval_id: str, status: str):
|
|||||||
eval = cast(dict[str, Any], eval)
|
eval = cast(dict[str, Any], eval)
|
||||||
|
|
||||||
|
|
||||||
parser = ArgumentParser(
|
def restart_job(job_id: str) -> str | None:
    """Restart a job by reverting it to its latest stable version.

    Fetches the job's version history from Nomad, picks the highest version
    number flagged ``Stable`` and POSTs a revert to that version.

    Args:
        job_id: ID of the Nomad job to revert.

    Returns:
        The ``EvalID`` from the revert response, or ``None`` when the version
        history is unavailable or contains no stable version.
    """
    job_versions = nomad_req("job", job_id, "versions")
    if not job_versions:
        # NOTE(review): assumes nomad_req hands back a falsy value when the
        # lookup finds nothing, as the service and backup-job lookups in this
        # script already rely on -- confirm against nomad_req.
        print(f"Could not find versions for job {job_id}")
        return None
    job_versions = cast(dict[str, Any], job_versions)

    # Tolerate a missing or null "Versions" entry instead of crashing on it.
    versions = job_versions.get("Versions") or []
    stable_versions = [
        version["Version"] for version in versions if version["Stable"]
    ]
    if not stable_versions:
        print("No stable versions found")
        return None

    latest_stable_version: int = max(stable_versions)

    print(f"Reverting to version {latest_stable_version}")
    revert = nomad_req(
        "job",
        job_id,
        "revert",
        data={"JobVersion": latest_stable_version},
        method="POST",
    )
    revert = cast(dict[str, Any], revert)

    return revert["EvalID"]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments() -> Namespace:
|
||||||
|
parser = ArgumentParser(
|
||||||
description="Execute one off backups and restores of services",
|
description="Execute one off backups and restores of services",
|
||||||
)
|
)
|
||||||
parser.add_argument("service_name", help="Name of the service to backup or restore")
|
parser.add_argument("service_name", help="Name of the service to backup or restore")
|
||||||
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
|
parser.add_argument("-a", "--action", default="backup", choices=("backup", "restore"), help="Action to take, backup or restore")
|
||||||
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
|
parser.add_argument("-s", "--snapshot", default="latest", help="Backup snapshot to restore, if restore is the chosen action")
|
||||||
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
|
parser.add_argument("-x", "--extra-safe", action="store_true", help="Perform extra safe backup or restore by stoping target job first")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
service_name = args.service_name
|
return args
|
||||||
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
|
|
||||||
|
|
||||||
if not service_info:
|
|
||||||
|
def main() -> int:
|
||||||
|
args = parse_arguments()
|
||||||
|
|
||||||
|
service_name = args.service_name
|
||||||
|
service_info = nomad_req("service", service_name, params={"choose": "1|backups"})
|
||||||
|
|
||||||
|
if not service_info:
|
||||||
print(f"Could not find service {service_name}")
|
print(f"Could not find service {service_name}")
|
||||||
exit(1)
|
return 1
|
||||||
|
|
||||||
service_info = cast(list[dict[str, Any]], service_info)
|
service_info = cast(list[dict[str, Any]], service_info)
|
||||||
node_id = service_info[0]["NodeID"]
|
node_id = service_info[0]["NodeID"]
|
||||||
job_id = service_info[0]["JobID"]
|
job_id = service_info[0]["JobID"]
|
||||||
|
|
||||||
node = nomad_req("node", node_id)
|
node = nomad_req("node", node_id)
|
||||||
node = cast(dict[str, Any], node)
|
node = cast(dict[str, Any], node)
|
||||||
node_name = node["Name"]
|
|
||||||
backup_job_name = f"backup-oneoff-{node_name}"
|
|
||||||
|
|
||||||
backup_job = nomad_req("job", backup_job_name)
|
node_name = node["Name"]
|
||||||
if not backup_job:
|
backup_job_name = f"backup-oneoff-{node_name}"
|
||||||
|
|
||||||
|
backup_job = nomad_req("job", backup_job_name)
|
||||||
|
if not backup_job:
|
||||||
print(f"Could not find backup job {backup_job_name} for {service_name}")
|
print(f"Could not find backup job {backup_job_name} for {service_name}")
|
||||||
|
|
||||||
if args.extra_safe:
|
if args.extra_safe:
|
||||||
print("Stopping job allocs")
|
print("Stopping job allocs")
|
||||||
stop_job = nomad_req("job", job_id, method="DELETE")
|
_ = nomad_req("job", job_id, method="DELETE")
|
||||||
print(stop_job)
|
|
||||||
wait_for_job_alloc_status(job_id, "complete")
|
wait_for_job_alloc_status(job_id, "complete")
|
||||||
|
|
||||||
backup_job = cast(dict[str, Any], backup_job)
|
backup_job = cast(dict[str, Any], backup_job)
|
||||||
backup_job_id = backup_job["ID"]
|
backup_job_id = backup_job["ID"]
|
||||||
|
|
||||||
dispatch = nomad_req(
|
dispatch = nomad_req(
|
||||||
"job",
|
"job",
|
||||||
backup_job_id,
|
backup_job_id,
|
||||||
"dispatch",
|
"dispatch",
|
||||||
@ -116,13 +143,23 @@ dispatch = nomad_req(
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
method="POST",
|
method="POST",
|
||||||
)
|
)
|
||||||
dispatch = cast(dict[str, Any], dispatch)
|
dispatch = cast(dict[str, Any], dispatch)
|
||||||
print(dispatch)
|
|
||||||
|
|
||||||
if args.extra_safe:
|
if args.extra_safe:
|
||||||
print(f"Wait for {args.action} to finish")
|
print(f"Wait for {args.action} to finish")
|
||||||
wait_for_eval_status(dispatch["EvalID"], "complete")
|
wait_for_eval_status(dispatch["EvalID"], "complete")
|
||||||
|
|
||||||
print("Backup complete. Verify success and restart job")
|
print(f"{args.action.capitalize()} complete. Verify success and restart job")
|
||||||
# If auto restarting, get versions and "revert" to version n-1 since n will be the recently stopped version
|
|
||||||
|
revert_eval_id = restart_job(job_id)
|
||||||
|
if revert_eval_id:
|
||||||
|
wait_for_eval_status(revert_eval_id, "running")
|
||||||
|
else:
|
||||||
|
print("No stable versions to revert to")
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use SystemExit directly: the bare `exit` helper is injected by the
    # `site` module and is not available when Python runs without it
    # (e.g. `python -S`). main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||||||
|
6
vars.tf
6
vars.tf
@ -21,3 +21,9 @@ variable "use_wesher" {
|
|||||||
description = "Indicates whether or not services should expose themselves on the wesher network"
|
description = "Indicates whether or not services should expose themselves on the wesher network"
|
||||||
default = true
|
default = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Root-level switch for restore testing. In this commit its value is handed
# to the "backups" module as `restoration_mode = var.restoration_mode`.
variable "restoration_mode" {
  description = "Prevent starting scheduled backup jobs so data can be restored without overwriting good data"
  type        = bool
  default     = false
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user