Portal:Toolforge/Admin/Kubernetes/Upgrading Kubernetes/1.26 to 1.27 notes
Working etherpad: https://etherpad.wikimedia.org/p/k8s-1.26-to-1.27-upgrade
Prepare packages
- [x] send and merge a patch similar to https://gerrit.wikimedia.org/r/c/operations/puppet/+/1058560 but for the destination version
- [x] check that the packages show up in https://apt.wikimedia.org/wikimedia/pool/thirdparty/
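To double-check from a cluster node that apt can actually see the new packages (a quick sketch; assumes the node already has the thirdparty component configured):
sudo apt update
apt-cache madison kubeadm kubelet kubectl | grep 1.27.16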
Toolsbeta
- get list of nodes
root@toolsbeta-test-k8s-control-10:~# for node in $(kubectl get nodes -o json | jq '.items[].metadata.name' -r); do echo "* [] $node"; done
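This prints one checklist line per node, ready to paste into the sections below, e.g.:
* [] toolsbeta-test-k8s-control-10
* [] toolsbeta-test-k8s-worker-12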
prep
- [x] run prepare upgrade cookbook
~ $ sudo cookbook wmcs.toolforge.k8s.prepare_upgrade --cluster-name toolsbeta --src-version 1.26.15 --dst-version 1.27.16 --task-id <id_of_task>
- [x] downtime project via https://prometheus-alerts.wmcloud.org/?q=team%3Dwmcs
- [x] update topic on -cloud
control nodes
- run upgrade node cookbook
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.26.15 --dst-version 1.27.16 --cluster-name toolsbeta --hostname <control_node_name>
- check that services start healthy
- depool control-<x> and control-<y> via haproxy, check that control-<z> is still doing ok (see the sketch below)
ssh tools-test-k8s-haproxy-6.toolsbeta.eqiad1.wikimedia.cloud
sudo puppet agent --disable "<user> k8s upgrade"
sudo vim /etc/haproxy/conf.d/k8s-api-servers.cfg
sudo systemctl reload haproxy
check:
echo "show stat" | sudo socat stdio /run/haproxy/haproxy.sock | grep k8s-api
revert:
sudo puppet agent --enable
sudo run-puppet-agent
sudo systemctl reload haproxy
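Putting the depool and check together, a concrete sketch for toolsbeta (assuming k8s-api-servers.cfg has one "server" line per control node; commenting those lines out is one way to depool):
# depool control-11 and control-12, leaving control-10 pooled
sudo sed -i -E 's/^([[:space:]]*server .*control-1[12])/#\1/' /etc/haproxy/conf.d/k8s-api-servers.cfg
sudo systemctl reload haproxy
# control-10 should still report UP
echo "show stat" | sudo socat stdio /run/haproxy/haproxy.sock | grep k8s-api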
toolsbeta-test-k8s-control-10
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-11 and -12 via haproxy, check that control-10 is still doing ok - not done, as all control nodes had to be upgraded at once
toolsbeta-test-k8s-control-11
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-12 and -10 via haproxy, check that control-11 is still doing ok - not done, as all control nodes had to be upgraded at once
toolsbeta-test-k8s-control-12
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-10 and -11 via haproxy, check that control-12 is still doing ok - not done, as all control nodes had to be upgraded at once
worker nodes
- run upgrade node cookbook for each (see the loop sketch after this list)
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.26.15 --dst-version 1.27.16 --cluster-name toolsbeta --hostname <worker_node_name>
- [x] toolsbeta-test-k8s-worker-nfs-5
- [x] toolsbeta-test-k8s-worker-nfs-7
- [x] toolsbeta-test-k8s-worker-nfs-8
- [x] toolsbeta-test-k8s-worker-nfs-9
- [x] toolsbeta-test-k8s-worker-12
- [x] toolsbeta-test-k8s-worker-13
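A serial loop over the workers saves retyping (a sketch; the node names are examples, substitute from the checklist above):
for node in toolsbeta-test-k8s-worker-12 toolsbeta-test-k8s-worker-13; do
  sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> \
    --src-version 1.26.15 --dst-version 1.27.16 --cluster-name toolsbeta --hostname "$node"
done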
ingress nodes
- run upgrade node cookbook for each
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.26.15 --dst-version 1.27.16 --cluster-name toolsbeta --hostname <ingress_node_name>
- [x] toolsbeta-test-k8s-ingress-10
- [x] toolsbeta-test-k8s-ingress-11
- [x] toolsbeta-test-k8s-ingress-9
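Once the last node is done, confirm every kubelet reports the target version (plain kubectl from a control node):
kubectl get nodes -o custom-columns=NAME:.metadata.name,VERSION:.status.nodeInfo.kubeletVersion
# every line should show v1.27.16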
cleanup
- [x] remove downtime
- [x] revert topic change
Tools
- get list of nodes
root@tools-k8s-control-7:~# for node in $(kubectl get nodes -o json | jq '.items[].metadata.name' -r); do echo "* [] $node"; done
prep
- [x] run prepare upgrade cookbook
~ $ sudo cookbook wmcs.toolforge.k8s.prepare_upgrade --cluster-name tools --src-version 1.26.15 --dst-version 1.27.16 --task-id <id_of_task>
- [x] downtime project via https://prometheus-alerts.wmcloud.org/?q=team%3Dwmcs
- [x] update topic on -cloud
control nodes
- run upgrade node cookbook
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.26.15 --dst-version 1.27.16 --cluster-name tools --hostname <control_node_name>
- check that services start healthy (see the sketch below)
- depool control-<x> and <y> via haproxy, check that control-<z> is still doing ok
ssh tools-k8s-haproxy-5.tools.eqiad1.wikimedia.cloud
sudo puppet agent --disable "<user> k8s upgrade"
sudo nano /etc/haproxy/conf.d/k8s-api-servers.cfg
sudo systemctl reload haproxy
check:
echo "show stat" | sudo socat stdio /run/haproxy/haproxy.sock | grep k8s-api
revert:
sudo puppet agent --enable
sudo run-puppet-agent
sudo systemctl reload haproxy
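One way to check that services start healthy after each control node comes back (a sketch using plain kubectl from any control node; assumes the standard kubeadm control-plane label):
kubectl get nodes -l node-role.kubernetes.io/control-plane
kubectl -n kube-system get pods | grep -E 'kube-(apiserver|controller-manager|scheduler)'
# all control nodes Ready, all three static pods Running per node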
tools-k8s-control-7
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-8 and -9 via haproxy, check that control-7 is still doing ok
tools-k8s-control-8
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-7 and -9 via haproxy, check that control-8 is still doing ok
tools-k8s-control-9
- [x] run upgrade node cookbook
- [x] check that services start healthy
- [x] depool control-7 and -8 via haproxy, check that control-9 is still doing ok
worker nodes
- run upgrade node cookbook for each; it's ok to do a couple in parallel (see the sketch after this list)
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.26.15 --dst-version 1.27.16 --cluster-name tools --hostname <worker_node_name>
- [x] tools-k8s-worker-102
- [x] tools-k8s-worker-103
- [x] tools-k8s-worker-105
- [x] tools-k8s-worker-106
- [x] tools-k8s-worker-107
- [x] tools-k8s-worker-108
- [x] tools-k8s-worker-nfs-1
- [x] tools-k8s-worker-nfs-10
- [x] tools-k8s-worker-nfs-11
- [x] tools-k8s-worker-nfs-12
- [x] tools-k8s-worker-nfs-13
- [x] tools-k8s-worker-nfs-14
- [x] tools-k8s-worker-nfs-16
- [x] tools-k8s-worker-nfs-17
- [x] tools-k8s-worker-nfs-19
- [x] tools-k8s-worker-nfs-2
- [x] tools-k8s-worker-nfs-21
- [x] tools-k8s-worker-nfs-22
- [x] tools-k8s-worker-nfs-23
- [x] tools-k8s-worker-nfs-24
- [x] tools-k8s-worker-nfs-26
- [x] tools-k8s-worker-nfs-27
- [x] tools-k8s-worker-nfs-3
- [x] tools-k8s-worker-nfs-32
- [x] tools-k8s-worker-nfs-33
- [x] tools-k8s-worker-nfs-34
- [x] tools-k8s-worker-nfs-35
- [x] tools-k8s-worker-nfs-36
- [x] tools-k8s-worker-nfs-37
- [x] tools-k8s-worker-nfs-38
- [x] tools-k8s-worker-nfs-39
- [x] tools-k8s-worker-nfs-40
- [x] tools-k8s-worker-nfs-41
- [x] tools-k8s-worker-nfs-42
- [x] tools-k8s-worker-nfs-43
- [x] tools-k8s-worker-nfs-44
- [x] tools-k8s-worker-nfs-45
- [x] tools-k8s-worker-nfs-46
- [x] tools-k8s-worker-nfs-47
- [x] tools-k8s-worker-nfs-48
- [x] tools-k8s-worker-nfs-5
- [x] tools-k8s-worker-nfs-50
- [x] tools-k8s-worker-nfs-53
- [x] tools-k8s-worker-nfs-54
- [x] tools-k8s-worker-nfs-55
- [x] tools-k8s-worker-nfs-57
- [x] tools-k8s-worker-nfs-58
- [x] tools-k8s-worker-nfs-61
- [x] tools-k8s-worker-nfs-65
- [x] tools-k8s-worker-nfs-66
- [x] tools-k8s-worker-nfs-67
- [x] tools-k8s-worker-nfs-68
- [x] tools-k8s-worker-nfs-69
- [x] tools-k8s-worker-nfs-7
- [x] tools-k8s-worker-nfs-70
- [x] tools-k8s-worker-nfs-71
- [x] tools-k8s-worker-nfs-72
- [x] tools-k8s-worker-nfs-73
- [x] tools-k8s-worker-nfs-74
- [x] tools-k8s-worker-nfs-75
- [x] tools-k8s-worker-nfs-76
- [x] tools-k8s-worker-nfs-8
- [x] tools-k8s-worker-nfs-9
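A sketch for doing a couple in parallel by backgrounding the cookbook (node names are examples; separate terminal sessions work just as well):
for node in tools-k8s-worker-nfs-1 tools-k8s-worker-nfs-2; do
  sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> \
    --src-version 1.26.15 --dst-version 1.27.16 --cluster-name tools --hostname "$node" &
done
wait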
ingress nodes
- [x] kubectl -n ingress-nginx-gen2 scale deployment ingress-nginx-gen2-controller --replicas=2
- run upgrade node cookbook for each:
- [x] tools-k8s-ingress-7
- [x] tools-k8s-ingress-8
- [x] tools-k8s-ingress-9
- [x] revert afterwards: kubectl -n ingress-nginx-gen2 scale deployment ingress-nginx-gen2-controller --replicas=3
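The scale-down to 2 replicas presumably lets each of the three ingress nodes drain in turn; after the revert, confirm the controller is back at 3/3 (plain kubectl):
kubectl -n ingress-nginx-gen2 get deployment ingress-nginx-gen2-controller
kubectl -n ingress-nginx-gen2 get pods -o wide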
cleanup
- [x] remove downtime
- [x] revert topic change