# homelab-nomad/ansible_playbooks/setup-cluster.yml

---
- name: Update DNS for bootstrapping with non-Nomad host
  hosts: consul_instances
  become: true
  gather_facts: false
  vars:
    # DNS server reachable before the cluster's own dnsmasq/Consul DNS is up
    non_nomad_dns: 192.168.2.170

  tasks:
    - name: Add non-nomad bootstrap DNS
      lineinfile:
        dest: /etc/resolv.conf
        create: true
        line: "nameserver {{ non_nomad_dns }}"
- name: Build Consul cluster
  hosts: consul_instances
  any_errors_fatal: true

  roles:
    - role: ansible-consul
      vars:
        consul_version: "1.13.3-1"
        consul_install_upgrade: true
        consul_install_from_repo: true
        consul_os_repo_prerequisites: []

        consul_node_role: server
        consul_raft_protocol: 3
        consul_bootstrap_expect: true
        # Expect a quorum of at most 3, or fewer if the play targets fewer hosts
        consul_bootstrap_expect_value: "{{ [(play_hosts | length), 3] | min }}"

        consul_user: consul
        consul_manage_user: true
        consul_group: bin
        consul_manage_group: true

        # consul_tls_enable: true
        consul_connect_enabled: true
        consul_ports_grpc: 8502
        consul_client_address: "0.0.0.0"

        # Autopilot
        consul_autopilot_enable: false
        # FIX: was mis-cased as consul_autopilot_cleanup_dead_Servers, which the
        # ansible-consul role never reads (its variables are all lowercase).
        consul_autopilot_cleanup_dead_servers: false

        # Enable metrics
        consul_config_custom:
          telemetry:
            prometheus_retention_time: "2h"

        # DNS forwarding
        consul_dnsmasq_enable: true
        consul_dnsmasq_servers:
          # TODO: use addresses of other nomad nodes?
          # Maybe this can be [] to get the values from dhcp
          - 1.1.1.1
          - 1.0.0.1
        consul_dnsmasq_bind_interfaces: true
        consul_dnsmasq_listen_addresses:
          # Listen only to loopback interface
          - 127.0.0.1

      become: true

  tasks:
    - name: Start Consul
      systemd:
        state: started
        name: consul
      become: true

    # If DNS is broken after dnsmasq, then need to set /etc/resolv.conf to something
    # pointing to 127.0.0.1 and possibly restart Docker and Nomad
    # Actually, we should point to our external Nomad address so that Docker uses it
    - name: Update resolv.conf
      lineinfile:
        dest: /etc/resolv.conf
        create: true
        line: "nameserver {{ hostvars[item].ansible_default_ipv4.address }}"
      loop: "{{ ansible_play_hosts }}"
      become: true
- name: Setup Vault cluster
  hosts: vault_instances

  roles:
    - role: ansible-vault
      vars:
        vault_version: "1.12.2-1"
        vault_install_hashi_repo: true
        vault_harden_file_perms: true
        # Maybe this should be restricted
        vault_group: bin
        vault_bin_path: /usr/bin
        vault_address: 0.0.0.0
        vault_backend: consul
      become: true

  tasks:
    # Vault encodes its state in the health endpoint's status code
    # (429 = standby, 472/473 = DR/perf standby, 501 = uninitialized, 503 = sealed)
    - name: Get Vault status
      uri:
        url: http://127.0.0.1:8200/v1/sys/health
        method: GET
        # FIX: pass the acceptable codes as a proper YAML list instead of a
        # single comma-separated string scalar.
        status_code: [200, 429, 472, 473, 501, 503]
        body_format: json
        return_content: true
      register: vault_status

    - name: Initialize Vault
      when: not vault_status.json["initialized"]
      block:
        - name: Initialize Vault
          command:
            argv:
              - "vault"
              - "operator"
              - "init"
              - "-format=json"
              - "-address=http://127.0.0.1:8200/"
              - "-key-shares={{ vault_init_key_shares|default(3) }}"
              - "-key-threshold={{ vault_init_key_threshold|default(2) }}"
          run_once: true
          register: vault_init

        # NOTE(security): this stores the unseal keys and root token in plain
        # text on the control machine; move them somewhere safer after bootstrap.
        - name: Save initialize result
          copy:
            content: "{{ vault_init.stdout }}"
            dest: "../vault-keys.json"
          when: vault_init is succeeded
          delegate_to: localhost
          run_once: true

        - name: Unseal from init
          no_log: true
          command:
            argv:
              - "vault"
              - "operator"
              - "unseal"
              - "-address=http://127.0.0.1:8200/"
              - "{{ item }}"
          loop: "{{ (vault_init.stdout | from_json)['unseal_keys_hex'] }}"
          when: vault_init is succeeded

    # Unseal an already-initialized Vault with externally supplied keys
    - name: Unseal Vault
      no_log: true
      command:
        argv:
          - "vault"
          - "operator"
          - "unseal"
          - "-address=http://127.0.0.1:8200/"
          - "{{ item }}"
      loop: "{{ unseal_keys_hex }}"
      when:
        - unseal_keys_hex is defined
        - vault_status.json["sealed"]
- name: Install Docker
  hosts: nomad_instances
  become: true
  vars:
    # Map kernel architecture names onto the names Docker's apt repo
    # and docker-compose releases use
    docker_architecture_map:
      x86_64: amd64
      armv7l: armhf
      aarch64: arm64
    docker_apt_arch: "{{ docker_architecture_map[ansible_architecture] }}"
    docker_compose_arch: "{{ (ansible_architecture == 'armv7l') | ternary('armv7', ansible_architecture) }}"

  roles:
    - geerlingguy.docker

  tasks:
    - name: Remove snapd
      package:
        name: snapd
        state: absent

# Not on Ubuntu 20.04
# - name: Install Podman
#   hosts: nomad_instances
#   become: true
#
#   tasks:
#     - name: Install Podman
#       package:
#         name: podman
#         state: present
- name: Create NFS mounts
  hosts: nomad_instances
  become: true
  vars:
    # Mounts common to every Nomad node; hosts can add more via `nfs_mounts`
    shared_nfs_mounts:
      - src: 192.168.2.10:/Media
        path: /srv/volumes/media-read
        opts: proto=tcp,port=2049,ro
      - src: 192.168.2.10:/Media
        path: /srv/volumes/media-write
        opts: proto=tcp,port=2049,rw
      - src: 192.168.2.10:/Photos
        path: /srv/volumes/photos
        opts: proto=tcp,port=2049,rw

  tasks:
    - name: Install nfs
      package:
        name: nfs-common
        state: present

    - name: Mount NFS volumes
      ansible.posix.mount:
        src: "{{ item.src }}"
        path: "{{ item.path }}"
        opts: "{{ item.opts }}"
        state: mounted
        fstype: nfs4
      loop: "{{ shared_nfs_mounts + (nfs_mounts | default([])) }}"
- name: Build Nomad cluster
  hosts: nomad_instances
  any_errors_fatal: true
  become: true
  vars:
    # Host volumes exposed on every node; merged below with any per-node
    # `nomad_unique_host_volumes`
    shared_host_volumes:
      # NOTE(review): this exposes the media-write mount read-only rather than
      # the /srv/volumes/media-read mount — confirm that is intentional
      - name: media-read
        path: /srv/volumes/media-write
        read_only: true
      - name: media-write
        path: /srv/volumes/media-write
        owner: "root"
        group: "root"
        mode: "0755"
        read_only: false
      - name: media-downloads
        path: /srv/volumes/media-write/Downloads
        read_only: false
      - name: sabnzbd-config
        path: /srv/volumes/media-write/Downloads/sabnzbd
        read_only: false
      - name: photoprism-media
        path: /srv/volumes/photos/Photoprism
        read_only: false
      # - name: tv-sonarr
      #   path: "/srv/volumes/media-write/TV Shows"
      #   owner: 1001
      #   group: 100
      #   mode: "0755"
      #   read_only: false
      - name: all-volumes
        path: /srv/volumes
        owner: "root"
        group: "root"
        mode: "0755"
        read_only: false

  roles:
    - name: ansible-nomad
      vars:
        nomad_version: "1.4.3-1"
        nomad_install_upgrade: true
        nomad_allow_purge_config: true

        # Where nomad gets installed to
        nomad_bin_dir: /usr/bin
        nomad_install_from_repo: true

        nomad_bootstrap_expect: "{{ [(play_hosts | length), 3] | min }}"
        nomad_raft_protocol: 3
        nomad_autopilot: true
        nomad_encrypt_enable: true
        # nomad_use_consul: true

        # Metrics
        nomad_telemetry: true
        nomad_telemetry_prometheus_metrics: true
        nomad_telemetry_publish_allocation_metrics: true
        nomad_telemetry_publish_node_metrics: true

        # Enable container plugins
        nomad_cni_enable: true
        nomad_cni_version: 1.0.1
        nomad_docker_enable: true
        nomad_docker_dmsetup: false
        # nomad_podman_enable: true

        # Merge shared host volumes with node volumes
        nomad_host_volumes: "{{ shared_host_volumes + (nomad_unique_host_volumes | default([])) }}"

        # Customize docker plugin
        nomad_plugins:
          docker:
            config:
              allow_privileged: true
              cleanup:
                image:
                  delay: "1d"
              volumes:
                enabled: true
                selinuxlabel: "z"
              # Send logs to journald so we can scrape them for Loki
              logging:
                type: journald
                extra_labels:
                  - "job_name"
                  - "job_id"
                  - "task_group_name"
                  - "task_name"
                  - "namespace"
                  - "node_name"
                  - "node_id"

        # Bind nomad
        nomad_bind_address: 0.0.0.0
        # Default interface for binding tasks
        nomad_network_interface: eth0

        # Create networks for binding task ports
        nomad_host_networks:
          - name: nomad-bridge
            interface: nomad
            reserved_ports: "22"
          - name: loopback
            interface: lo
            reserved_ports: "22"

        # Enable ACLs
        nomad_acl_enabled: true

        # Enable vault integration
        # HACK: Only talk to local Vault for now because it doesn't have HTTPS
        # TODO: Would be really great to have this over https and point to vault.consul.service
        # nomad_vault_address: "https://vault.service.consul:8200"
        # Right now, each node only talks to it's local Vault, so if that node is rebooted and
        # that vault is sealed, it will not have access to vault. This is a problem if a node
        # must reboot.
        nomad_vault_address: "http://127.0.0.1:8200"
        # TODO: This fails on first run because the Nomad-Vault integration can't be set up
        # until Nomad has started. Could maybe figure out if ACLs have been set up and leave
        # these out until the later play, maybe just bootstrap the nomad-cluster role in Vault
        # befor Nomad is set up
        nomad_vault_create_from_role: "nomad-cluster"
        # TODO: (security) Probably want to restict this to a narrower scoped token
        nomad_vault_enabled: "{{ root_token is defined }}"
        nomad_vault_token: "{{ root_token | default('') }}"

        nomad_config_custom:
          ui:
            enabled: true
            consul:
              ui_url: "https://consul.thefij.rocks/ui"
            vault:
              ui_url: "https://vault.thefij.rocks/ui"
          consul:
            tags:
              - "traefik.enable=true"
              - "traefik.consulcatalog.connect=true"
              - "traefik.http.routers.nomadclient.entrypoints=websecure"
- name: Bootstrap Nomad ACLs and scheduler
  hosts: nomad_instances

  tasks:
    - name: Start Nomad
      systemd:
        state: started
        name: nomad

    # Wait for a leader to be elected before touching ACLs
    - name: Nomad API reachable?
      uri:
        url: "http://127.0.0.1:4646/v1/status/leader"
        method: GET
        status_code: 200
      register: nomad_check_result
      retries: 6
      until: nomad_check_result is succeeded
      delay: 10
      changed_when: false
      run_once: true

    # Fails (and is ignored) when ACLs were already bootstrapped on an earlier
    # run; in that case the ../nomad_bootstrap.json saved then is reused below.
    - name: Bootstrap ACLs
      command:
        argv:
          - "nomad"
          - "acl"
          - "bootstrap"
          - "-json"
      run_once: true
      ignore_errors: true
      register: bootstrap_result

    - name: Save bootstrap result
      copy:
        content: "{{ bootstrap_result.stdout }}"
        dest: "../nomad_bootstrap.json"
      when: bootstrap_result is succeeded
      delegate_to: localhost
      run_once: true

    - name: Read secret
      command:
        argv:
          - jq
          - -r
          - .SecretID
          - ../nomad_bootstrap.json
      delegate_to: localhost
      run_once: true
      no_log: true
      changed_when: false
      register: read_secretid

    - name: Enable service scheduler preemption
      command:
        argv:
          - nomad
          - operator
          - scheduler
          - set-config
          - -preempt-system-scheduler=true
          - -preempt-service-scheduler=true
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      delegate_to: "{{ play_hosts[0] }}"
      run_once: true

    - name: Look for policy
      command:
        argv:
          - nomad
          - acl
          - policy
          - list
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      run_once: true
      register: policies

    - name: Copy policy
      copy:
        src: ../acls/nomad-anon-policy.hcl
        dest: /tmp/anonymous.policy.hcl
      delegate_to: "{{ play_hosts[0] }}"
      register: anon_policy
      run_once: true

    - name: Create anon-policy
      command:
        argv:
          - nomad
          - acl
          - policy
          - apply
          # FIX: argv passes arguments verbatim (no shell), so the previous
          # -description="Anon read only" stored the literal quote characters
          # as part of the description.
          - "-description=Anon read only"
          - anonymous
          - /tmp/anonymous.policy.hcl
      environment:
        NOMAD_TOKEN: "{{ read_secretid.stdout }}"
      when: policies.stdout == "No policies found" or anon_policy.changed
      delegate_to: "{{ play_hosts[0] }}"
      run_once: true

    # NOTE(review): requires root_token to be supplied (e.g. -e root_token=...);
    # unlike nomad_vault_enabled in the Nomad play there is no guard here —
    # confirm this play is only run with the token available.
    - name: Set up Nomad backend and roles in Vault
      community.general.terraform:
        project_path: ../acls
        force_init: true
        variables:
          consul_address: "{{ play_hosts[0] }}:8500"
          vault_token: "{{ root_token }}"
          nomad_secret_id: "{{ read_secretid.stdout }}"
      delegate_to: localhost
      run_once: true
      notify:
        - Restart Nomad

  handlers:
    - name: Restart Nomad
      systemd:
        state: restarted
        name: nomad
      # FIX: without an `until` condition Ansible forces retries to 1, so the
      # retry/delay settings previously had no effect.
      register: nomad_restart
      until: nomad_restart is succeeded
      retries: 6
      delay: 5