# homelab-nomad/ansible_playbooks/setup-cluster.yml
---
# Play: configure Consul servers through the ansible-consul role, then make
# sure the service is started and /etc/resolv.conf points at the local
# dnsmasq forwarder.
- name: Build Consul cluster
  hosts: consul_instances
  any_errors_fatal: true

  roles:
    - role: ansible-consul
      vars:
        consul_version: "1.13.3-1"
        consul_install_upgrade: true
        consul_install_from_repo: true
        consul_os_repo_prerequisites: []
        consul_node_role: server
        consul_raft_protocol: 3
        consul_bootstrap_expect: true
        # Number of hosts in this play, capped at 3.
        # `ansible_play_batch` is the documented replacement for the
        # deprecated `play_hosts` magic variable.
        consul_bootstrap_expect_value: "{{ [(ansible_play_batch | length), 3] | min }}"
        consul_user: consul
        consul_manage_user: true
        consul_group: bin
        consul_manage_group: true
        # consul_tls_enable: true
        consul_connect_enabled: true
        consul_ports_grpc: 8502
        consul_client_address: "0.0.0.0"

        # Autopilot
        consul_autopilot_enable: true
        # NOTE(review): the capital "S" is kept on purpose; it is presumably the
        # spelling the role itself uses for this variable. Confirm against the
        # role defaults before "fixing" it.
        consul_autopilot_cleanup_dead_Servers: true

        # Enable metrics
        consul_config_custom:
          telemetry:
            prometheus_retention_time: "2h"

        # DNS forwarding
        consul_dnsmasq_enable: true
        consul_dnsmasq_servers:
          # TODO: use addresses of other nomad nodes?
          # Maybe this can be [] to get the values from dhcp
          - 1.1.1.1
          - 1.0.0.1
        consul_dnsmasq_bind_interfaces: true
        consul_dnsmasq_listen_addresses:
          # Listen only to loopback interface
          - 127.0.0.1
      become: true

  tasks:
    - name: Start Consul
      ansible.builtin.systemd:
        state: started
        name: consul
      become: true

    # If DNS is broken after dnsmasq, then need to set /etc/resolv.conf to something
    # pointing to 127.0.0.1 and possibly restart Docker and Nomad
    - name: Update resolv.conf
      ansible.builtin.lineinfile:
        dest: /etc/resolv.conf
        create: true
        line: "nameserver 127.0.0.1"
      become: true

# Play: install Vault through the ansible-vault role (Consul storage backend),
# initialize it when the health endpoint reports it uninitialized, and unseal.
- name: Setup Vault cluster
  hosts: vault_instances

  roles:
    - role: ansible-vault
      vars:
        vault_version: "1.12.0-1"
        vault_install_hashi_repo: true
        vault_harden_file_perms: true
        # Maybe this should be restricted
        vault_group: bin
        vault_bin_path: /usr/bin
        vault_address: "0.0.0.0"
        vault_backend: consul
      become: true

  tasks:
    # Every status code listed here is accepted so the task does not fail and
    # the JSON body can be inspected by the tasks below.
    - name: Get Vault status
      ansible.builtin.uri:
        url: http://127.0.0.1:8200/v1/sys/health
        method: GET
        status_code:
          - 200
          - 429
          - 472
          - 473
          - 501
          - 503
        body_format: json
        return_content: true
      register: vault_status

    - name: Initialize Vault
      when: not vault_status.json["initialized"]
      block:
        - name: Initialize Vault
          ansible.builtin.command:
            argv:
              - "vault"
              - "operator"
              - "init"
              - "-format=json"
              - "-address=http://127.0.0.1:8200/"
              - "-key-shares={{ vault_init_key_shares|default(3) }}"
              - "-key-threshold={{ vault_init_key_threshold|default(2) }}"
          run_once: true
          # stdout is the init JSON; the unseal task below reads
          # `unseal_keys_hex` from it, so keep it out of the Ansible log.
          no_log: true
          register: vault_init

        - name: Save initialize result
          ansible.builtin.copy:
            content: "{{ vault_init.stdout }}"
            dest: "../vault-keys.json"
            # The file holds the unseal keys: readable by the owner only.
            mode: "0600"
          when: vault_init is succeeded
          delegate_to: localhost
          run_once: true
          no_log: true

        - name: Unseal from init
          no_log: true
          ansible.builtin.command:
            argv:
              - "vault"
              - "operator"
              - "unseal"
              - "-address=http://127.0.0.1:8200/"
              - "{{ item }}"
          loop: "{{ (vault_init.stdout | from_json)['unseal_keys_hex'] }}"
          when: vault_init is succeeded

    # Unseal an already-initialized Vault when the caller supplies
    # `unseal_keys_hex` and the health check reported it as sealed.
    - name: Unseal Vault
      no_log: true
      ansible.builtin.command:
        argv:
          - "vault"
          - "operator"
          - "unseal"
          - "-address=http://127.0.0.1:8200/"
          - "{{ item }}"
      loop: "{{ unseal_keys_hex }}"
      when:
        - unseal_keys_hex is defined
        - vault_status.json["sealed"]

# Play: install Docker on the Nomad nodes via the geerlingguy.docker role and
# remove snapd.
- name: Install Docker
  hosts: nomad_instances
  become: true

  vars:
    # Translate `ansible_architecture` values into the architecture names
    # used for the Docker apt repository.
    docker_architecture_map:
      x86_64: amd64
      armv7l: armhf
      aarch64: arm64
    docker_apt_arch: "{{ docker_architecture_map[ansible_architecture] }}"
    # Same as `ansible_architecture`, except armv7l becomes armv7.
    docker_compose_arch: "{{ (ansible_architecture == 'armv7l') | ternary('armv7', ansible_architecture) }}"

  roles:
    - geerlingguy.docker

  tasks:
    - name: Remove snapd
      ansible.builtin.package:
        name: snapd
        state: absent

# Not on Ubuntu 20.04
# - name: Install Podman
#   hosts: nomad_instances
#   become: true
#
#   tasks:
#     - name: Install Podman
#       package:
#         name: podman
#         state: present

# Play: install the NFS client package and mount the NAS exports from
# 192.168.2.10 under /srv/volumes on every Nomad node.
- name: Create NFS mounts
  hosts: nomad_instances
  become: true

  tasks:
    - name: Install nfs
      ansible.builtin.package:
        state: present
        name: nfs-common

    - name: Create Motioneye NFS mount
      ansible.posix.mount:
        state: mounted
        fstype: nfs4
        src: 192.168.2.10:/Recordings/Motioneye
        path: /srv/volumes/motioneye-recordings
        opts: proto=tcp,port=2049,rw

    # The Multimedia export is mounted twice: read-only here, read-write below.
    - name: Create Media Library RO NFS mount
      ansible.posix.mount:
        state: mounted
        fstype: nfs4
        src: 192.168.2.10:/Multimedia
        path: /srv/volumes/media-read
        opts: proto=tcp,port=2049,ro

    - name: Create Media Library RW NFS mount
      ansible.posix.mount:
        state: mounted
        fstype: nfs4
        src: 192.168.2.10:/Multimedia
        path: /srv/volumes/media-write
        opts: proto=tcp,port=2049,rw

    - name: Create Download RW NFS mount
      ansible.posix.mount:
        state: mounted
        fstype: nfs4
        src: 192.168.2.10:/Download
        path: /srv/volumes/download
        opts: proto=tcp,port=2049,rw

    - name: Create Container NAS RW NFS mount
      ansible.posix.mount:
        state: mounted
        fstype: nfs4
        src: 192.168.2.10:/Container
        path: /srv/volumes/container
        opts: proto=tcp,port=2049,rw

# Play: install and configure Nomad through the ansible-nomad role.
# `shared_host_volumes` lists the host volumes common to all nodes; it is
# concatenated with the optional per-node `nomad_unique_host_volumes`.
- name: Build Nomad cluster
  hosts: nomad_instances
  any_errors_fatal: true
  become: true

  vars:
    shared_host_volumes:
      - name: motioneye-recordings
        path: /srv/volumes/motioneye-recordings
        owner: "root"
        group: "root"
        mode: "0755"
        read_only: false
      # NOTE(review): this read-only volume uses the media-write path, not
      # /srv/volumes/media-read. Confirm whether that is intentional.
      - name: media-read
        path: /srv/volumes/media-write
        read_only: true
      - name: media-write
        path: /srv/volumes/media-write
        owner: "root"
        group: "root"
        mode: "0755"
        read_only: false
      - name: tv-sonarr
        path: "/srv/volumes/media-write/TV Shows"
        owner: 1001
        group: 100
        mode: "0755"
        read_only: false
      - name: download
        path: /srv/volumes/download
        owner: 1001
        group: 100
        mode: "0755"
        read_only: false
      - name: nzbget-data
        path: /srv/volumes/container/nzbget/config
        read_only: false
      - name: gitea-data
        path: /srv/volumes/container/gitea
        read_only: false
      - name: photoprism-media
        path: /srv/volumes/media-write/Photoprism
        read_only: false
      - name: all-volumes
        path: /srv/volumes
        owner: "root"
        group: "root"
        mode: "0755"
        read_only: false

  roles:
    - role: ansible-nomad
      vars:
        nomad_version: "1.4.2-1"
        nomad_install_upgrade: true
        nomad_allow_purge_config: true
        nomad_meta:
          # There are issues with v1.23.0 on arm64
          connect.sidecar_image: envoyproxy/envoy:v1.23.1

        # Where nomad gets installed to
        nomad_bin_dir: /usr/bin
        nomad_install_from_repo: true

        # Number of hosts in this play, capped at 3.
        # `ansible_play_batch` is the documented replacement for the
        # deprecated `play_hosts` magic variable.
        nomad_bootstrap_expect: "{{ [(ansible_play_batch | length), 3] | min }}"
        nomad_raft_protocol: 3
        nomad_autopilot: true
        nomad_encrypt_enable: true
        # nomad_use_consul: true

        # Metrics
        nomad_telemetry: true
        nomad_telemetry_prometheus_metrics: true
        nomad_telemetry_publish_allocation_metrics: true
        nomad_telemetry_publish_node_metrics: true

        # Enable container plugins
        nomad_cni_enable: true
        nomad_cni_version: "1.0.1"
        nomad_docker_enable: true
        nomad_docker_dmsetup: false
        # nomad_podman_enable: true

        # Merge shared host volumes with node volumes
        nomad_host_volumes: "{{ shared_host_volumes + (nomad_unique_host_volumes | default([])) }}"

        # Customize docker plugin
        nomad_plugins:
          docker:
            config:
              allow_privileged: true
              volumes:
                enabled: true
                selinuxlabel: "z"
              # Send logs to journald so we can scrape them for Loki
              logging:
                type: journald
              extra_labels:
                - "job_name"
                - "job_id"
                - "task_group_name"
                - "task_name"
                - "namespace"
                - "node_name"
                - "node_id"

        # Bind nomad
        nomad_bind_address: "0.0.0.0"

        # Default interface for binding tasks
        # nomad_network_interface: lo

        # Create networks for binding task ports
        nomad_host_networks:
          - name: nomad-bridge
            interface: nomad
            reserved_ports: "22"
          - name: loopback
            interface: lo
            reserved_ports: "22"

        # Enable ACLs
        nomad_acl_enabled: true

        # Enable vault integration
        # HACK: Only talk to local Vault for now because it doesn't have HTTPS
        # TODO: Would be really great to have this over https and point to vault.consul.service
        # nomad_vault_address: "https://vault.service.consul:8200"
        # Right now, each node only talks to its local Vault, so if that node is rebooted and
        # that vault is sealed, it will not have access to vault. This is a problem if a node
        # must reboot.
        nomad_vault_address: "http://127.0.0.1:8200"
        # TODO: This fails on first run because the Nomad-Vault integration can't be set up
        # until Nomad has started. Could maybe figure out if ACLs have been set up and leave
        # these out until the later play, maybe just bootstrap the nomad-cluster role in Vault
        # before Nomad is set up
        nomad_vault_create_from_role: "nomad-cluster"
        # TODO: (security) Probably want to restrict this to a narrower scoped token
        # Vault integration is only switched on when `root_token` is supplied.
        nomad_vault_enabled: "{{ root_token is defined }}"
        nomad_vault_token: "{{ root_token | default('') }}"

        nomad_config_custom:
          ui:
            enabled: true
            consul:
              ui_url: "https://consul.thefij.rocks/ui"
            vault:
              ui_url: "https://vault.thefij.rocks/ui"
          consul:
            tags:
              - "traefik.enable=true"
              - "traefik.consulcatalog.connect=true"
              - "traefik.http.routers.nomadclient.entrypoints=websecure"

# Play: once Nomad answers on its HTTP API, bootstrap ACLs, store the
# bootstrap token on the control machine, enable scheduler preemption, apply
# the anonymous policy and run the Terraform project in ../acls.
- name: Bootstrap Nomad ACLs and scheduler
  hosts: nomad_instances

  tasks:
    - name: Start Nomad
      ansible.builtin.systemd:
        state: started
        name: nomad
      # Escalate like the equivalent "Start Consul" task; this play sets no
      # play-level `become`.
      become: true

    # Poll the leader endpoint for up to 6 x 10s before giving up.
    - name: Nomad API reachable?
      ansible.builtin.uri:
        url: "http://127.0.0.1:4646/v1/status/leader"
        method: GET
        status_code: 200
      register: nomad_check_result
      retries: 6
      until: nomad_check_result is succeeded
      delay: 10
      changed_when: false
      run_once: true

    # Best effort: errors are ignored (presumably because bootstrap fails once
    # ACLs already exist). The "Read secret" task below then relies on a
    # ../nomad_bootstrap.json left by an earlier run.
    - name: Bootstrap ACLs
      ansible.builtin.command:
        argv:
          - "nomad"
          - "acl"
          - "bootstrap"
          - "-json"
      run_once: true
      ignore_errors: true
      # stdout carries the bootstrap token JSON (SecretID is read from it
      # later); keep it out of the log.
      no_log: true
      register: bootstrap_result

    - name: Save bootstrap result
      ansible.builtin.copy:
        content: "{{ bootstrap_result.stdout }}"
        dest: "../nomad_bootstrap.json"
        # The file holds the bootstrap token: readable by the owner only.
        mode: "0600"
      when: bootstrap_result is succeeded
      delegate_to: localhost
      run_once: true
      no_log: true

    - name: Read secret
      ansible.builtin.command:
        argv:
          - jq
          - -r
          - .SecretID
          - ../nomad_bootstrap.json
      delegate_to: localhost
      run_once: true
      no_log: true
      changed_when: false
      register: read_secretid

    # `ansible_play_batch` is the documented replacement for the deprecated
    # `play_hosts` magic variable used to pick the first host.
    - name: Enable service scheduler preemption
      ansible.builtin.command:
        argv:
          - nomad
          - operator
          - scheduler
          - set-config
          - -preempt-system-scheduler=true
          - -preempt-service-scheduler=true
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      delegate_to: "{{ ansible_play_batch[0] }}"
      run_once: true

    - name: Look for policy
      ansible.builtin.command:
        argv:
          - nomad
          - acl
          - policy
          - list
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      run_once: true
      register: policies

    - name: Copy policy
      ansible.builtin.copy:
        src: ../acls/nomad-anon-policy.hcl
        dest: /tmp/anonymous.policy.hcl
      delegate_to: "{{ ansible_play_batch[0] }}"
      register: anon_policy
      run_once: true

    # (Re)apply the policy when none exist yet or the policy file changed.
    - name: Create anon-policy
      ansible.builtin.command:
        argv:
          - nomad
          - acl
          - policy
          - apply
          # argv elements are not parsed by a shell, so quotes inside the
          # element would become part of the description.
          - "-description=Anon read only"
          - anonymous
          - /tmp/anonymous.policy.hcl
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      when: policies.stdout == "No policies found" or anon_policy.changed
      delegate_to: "{{ ansible_play_batch[0] }}"
      run_once: true

    # NOTE(review): `root_token` has no default here, unlike in the
    # "Build Nomad cluster" play. Presumably it must be supplied for this step.
    - name: Set up Nomad backend and roles in Vault
      community.general.terraform:
        project_path: ../acls
        force_init: true
        variables:
          consul_address: "{{ ansible_play_batch[0] }}:8500"
          vault_token: "{{ root_token }}"
          nomad_secret_id: "{{ read_secretid.stdout }}"
      delegate_to: localhost
      run_once: true
      notify:
        - Restart Nomad

  handlers:
    - name: Restart Nomad
      ansible.builtin.systemd:
        state: restarted
        name: nomad
      become: true
      retries: 6
      delay: 5