From 1871f24dc30e8a93dab817c75b03f444f976bc28 Mon Sep 17 00:00:00 2001
From: Alan Janis
Date: Wed, 27 Nov 2024 20:04:34 +0000
Subject: [PATCH 1/3] update docker systemd override to expose socket on tcp,
 fix container systemd file journald configs, set per-os installation for
 vgpu driver so old packages are removed,remove old reboot task

---
Review notes (text between "---" and the diffstat is ignored by git am):

 * tasks/docker-install-nvidia-vgpu.yml: the dnf option is spelled
   "allowerasing"; the original "alowerasing" is not a parameter of
   ansible.builtin.dnf. The matching context line in patch 2/3 was updated
   so the series still applies in order.
 * templates/docker-override.conf.j2: the original line read
   "ExecStart=ExecStart=/usr/bin/dockerd ...". The duplicated key was
   removed and an empty "ExecStart=" was added before it, which is how a
   drop-in resets the command inherited from the vendor unit. The hunk
   header and diffstat were adjusted for the extra line; blob ids on the
   "index" lines were NOT regenerated.
 * SECURITY: "-H tcp://0.0.0.0:2375" publishes the Docker API without TLS
   or authentication on every interface. Left unchanged because it is the
   stated intent of this commit - confirm the port is firewalled, or bind
   to a specific address / enable --tlsverify.
 * "autocleean" and "is sucess" below are intentionally left untouched:
   patches 2/3 and 3/3 remove/fix exactly those lines and must still match.

 tasks/docker-install-nvidia-vgpu.yml | 43 ++++++++++++++++++++++++----
 tasks/install.yml                    | 26 ++++++++---------
 templates/docker-compose.service.j2  |  4 +--
 templates/docker-override.conf.j2    |  7 ++++++-
 4 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/tasks/docker-install-nvidia-vgpu.yml b/tasks/docker-install-nvidia-vgpu.yml
index 786b047..4c2a3ef 100644
--- a/tasks/docker-install-nvidia-vgpu.yml
+++ b/tasks/docker-install-nvidia-vgpu.yml
@@ -39,6 +39,22 @@
       register: cache_updated
       until:
         - cache_updated is success
+    - name: Wait for locks
+      ansible.builtin.include_tasks:
+        file: wait_for_locks.yml
+
+    - name: Install grid video driver
+      ansible.builtin.apt:
+        name:
+          - "{{ docker_vgpu.driver }}"
+        state: present
+        autoremove: true
+        autocleean: true
+      when:
+        - dep_repo_added is sucess
+      register: install_nvidia_drivers
+      until: install_nvidia_drivers is not failed
+      notify: Reload Docker
 
 - name: Add Nvidia Grid Driver repository
   when: ansible_os_family == "RedHat"
@@ -58,23 +74,38 @@
       register: yum_repo_added
       until:
         - yum_repo_added is success
-
-- name: Add Nvidia Grid Driver package and license
-  tags:
-    - docker-install-nvidia
-  block:
     - name: Wait for locks
       ansible.builtin.include_tasks:
         file: wait_for_locks.yml
 
     - name: Install grid video driver
-      ansible.builtin.package:
+      ansible.builtin.dnf:
         name:
           - "{{ docker_vgpu.driver }}"
         state: present
+        allowerasing: true
+        allow_downgrade: true
+      when:
+        - yum_repo_added is success
       register: install_nvidia_drivers
       until: install_nvidia_drivers is not failed
       notify: Reload Docker
+- name: Add Nvidia Grid license
+  tags:
+    - docker-install-nvidia
+  block:
+    # - name: Wait for locks
+    #   ansible.builtin.include_tasks:
+    #     file: wait_for_locks.yml
+
+    # - name: Install grid video driver
+    #   ansible.builtin.package:
+    #     name:
+    #       - "{{ docker_vgpu.driver }}"
+    #     state: present
+    #   register: install_nvidia_drivers
+    #   until: install_nvidia_drivers is not failed
+    #   notify: Reload Docker
 
     - name: Retrieve grid license token
       ansible.builtin.uri:
diff --git a/tasks/install.yml b/tasks/install.yml
index 64a6262..76000e5 100644
--- a/tasks/install.yml
+++ b/tasks/install.yml
@@ -13,20 +13,20 @@
   tags:
     - docker-install-nvidia
 
-- name: Reboot host if necessary
-  tags:
-    - always
-  block:
-    - name: Flush handlers
-      ansible.builtin.meta: flush_handlers
+# - name: Reboot host if necessary
+#   tags:
+#     - always
+#   block:
+#     - name: Flush handlers
+#       ansible.builtin.meta: flush_handlers
 
-    - name: Reboot after docker updates
-      ansible.builtin.reboot:
-        connect_timeout: 10
-        reboot_timeout: 600
-        msg: Rebooting system via Ansible
-      when:
-        - docker_requires_reload|default(False)
+#     - name: Reboot after docker updates
+#       ansible.builtin.reboot:
+#         connect_timeout: 10
+#         reboot_timeout: 600
+#         msg: Rebooting system via Ansible
+#       when:
+#         - docker_requires_reload|default(False)
 
 - name: Enable and start or restart docker socket and service
   block:
diff --git a/templates/docker-compose.service.j2 b/templates/docker-compose.service.j2
index 39a247a..047e7a9 100644
--- a/templates/docker-compose.service.j2
+++ b/templates/docker-compose.service.j2
@@ -39,8 +39,8 @@ RestartSec=1min
 StandardError=journal
 StandardOutput=journal
 LogLevelMax=debug
-LogRateLimitIntervalSec=10000s
-LogRateLimitBurst=30s
+LogRateLimitIntervalSec=30s
+LogRateLimitBurst=10000
 
 [Install]
 WantedBy=multi-user.target {% if shared_storage and (data_mount_root|length > 0) %}{{ data_mount_root|regex_replace('^\\/', '') }}.mount{% else %}{% endif %}
diff --git a/templates/docker-override.conf.j2 b/templates/docker-override.conf.j2
index 7496ad6..577ee85 100644
--- a/templates/docker-override.conf.j2
+++ b/templates/docker-override.conf.j2
@@ -3,4 +3,9 @@ Description=Docker Application Container Engine
 Documentation=https://docs.docker.com
 After=network-online.target docker.socket firewalld.service
 Wants=network-online.target
-Requires=docker.socket
\ No newline at end of file
+Requires=docker.socket
+
+[Service]
+EnvironmentFile=-/etc/default/docker
+ExecStart=
+ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2375 -H unix:///var/run/docker.sock

From 887714def0c67d7fe29c0f95b1e3839c91306c16 Mon Sep 17 00:00:00 2001
From: Alan Janis
Date: Thu, 28 Nov 2024 03:02:29 +0000
Subject: [PATCH 2/3] fix tags, fix conditions for vgpu

---
 tasks/docker-install-nvidia-vgpu.yml | 19 +++++--------------
 tasks/docker-install-nvidia.yml      | 10 ----------
 tasks/docker-nvidia-setup.yml        | 12 +-----------
 tasks/install.yml                    |  3 ++-
 4 files changed, 8 insertions(+), 36 deletions(-)

diff --git a/tasks/docker-install-nvidia-vgpu.yml b/tasks/docker-install-nvidia-vgpu.yml
index 4c2a3ef..195fe54 100644
--- a/tasks/docker-install-nvidia-vgpu.yml
+++ b/tasks/docker-install-nvidia-vgpu.yml
@@ -1,7 +1,5 @@
 ---
 - name: Remove existing Nvidia repositories
-  tags:
-    - docker-install-nvidia
   block:
     - name: Remove old repo files
       ansible.builtin.file:
@@ -13,9 +11,8 @@
         - /etc/yum/repos.d/nvidia-docker.repo
 
 - name: Add Nvidia Grid Driver repository
-  when: ansible_os_family == "Debian"
-  tags:
-    - docker-install-nvidia
+  when:
+    - ansible_os_family == "Debian"
   block:
     - name: Install Nvidia Grid repository
       ansible.builtin.deb822_repository:
@@ -50,16 +47,13 @@
         state: present
         autoremove: true
         autocleean: true
-      when:
-        - dep_repo_added is sucess
       register: install_nvidia_drivers
       until: install_nvidia_drivers is not failed
       notify: Reload Docker
 
 - name: Add Nvidia Grid Driver repository
-  when: ansible_os_family == "RedHat"
-  tags:
-    - docker-install-nvidia
+  when:
+    - ansible_os_family == "RedHat"
   block:
     - name: Install Nvidia Grid Repository
       ansible.builtin.yum_repository:
@@ -85,14 +79,11 @@
         state: present
         allowerasing: true
         allow_downgrade: true
-      when:
-        - yum_repo_added is success
       register: install_nvidia_drivers
       until: install_nvidia_drivers is not failed
       notify: Reload Docker
+
 - name: Add Nvidia Grid license
-  tags:
-    - docker-install-nvidia
   block:
     # - name: Wait for locks
     #   ansible.builtin.include_tasks:
diff --git a/tasks/docker-install-nvidia.yml b/tasks/docker-install-nvidia.yml
index 6248297..7600697 100644
--- a/tasks/docker-install-nvidia.yml
+++ b/tasks/docker-install-nvidia.yml
@@ -1,7 +1,5 @@
 ---
 - name: Remove existing Nvidia repositories
-  tags:
-    - docker-install-nvidia
   block:
     - name: Remove old repo files
       ansible.builtin.file:
@@ -13,8 +11,6 @@
 
 - name: Add nvidia driver repo and packages
   when: ansible_os_family == "Debian"
-  tags:
-    - docker-install-nvidia
   block:
     - name: Add repository for nvidia container toolkit and runtime
       ansible.builtin.deb822_repository:
@@ -41,8 +37,6 @@
 
 - name: Add nvidia driver repo and packages
   when: ansible_os_family == "RedHat"
-  tags:
-    - docker-install-nvidia
   block:
     - name: Wait for locks
       ansible.builtin.include_tasks:
@@ -67,8 +61,6 @@
         state: present
 
 - name: Install Nvidia Docker and Container Runtime packages
-  tags:
-    - docker-install-nvidia
   block:
     - name: Wait for locks
       ansible.builtin.include_tasks:
@@ -99,8 +91,6 @@
       notify: Reload Docker
 
 - name: Patch nvidia drivers to allow more than 2 transcodes
-  tags:
-    - docker-install-nvidia
   block:
     - name: Create nvidia patch directory
       ansible.builtin.file:
diff --git a/tasks/docker-nvidia-setup.yml b/tasks/docker-nvidia-setup.yml
index 3a1131a..3fb7ead 100644
--- a/tasks/docker-nvidia-setup.yml
+++ b/tasks/docker-nvidia-setup.yml
@@ -2,8 +2,6 @@
 - name: Configure kernel modules for nvidia
   when:
     - docker_install_nvidia | default(false)
-  tags:
-    - docker-install-nvidia
   block:
     - name: Blacklist nouveau kernel module when cuda cards are present
       ansible.builtin.copy:
@@ -23,21 +21,13 @@
 - name: Add nvidia driver repo and packages
   ansible.builtin.include_tasks:
     file: docker-install-nvidia.yml
-    apply:
-      tags: docker-install-nvidia
   when:
     - docker_install_nvidia | default(False)
     - not docker_vgpu.enabled | default(False)
-  tags:
-    - docker-install-nvidia
 
-- name: Add nvidia driver repo and packages
+- name: Add nvidia grid driver repo and packages
   ansible.builtin.include_tasks:
     file: docker-install-nvidia-vgpu.yml
-    apply:
-      tags: docker-install-nvidia
   when:
     - docker_install_nvidia | default(False)
     - docker_vgpu.enabled | default(False)
-  tags:
-    - docker-install-nvidia
diff --git a/tasks/install.yml b/tasks/install.yml
index 76000e5..4cb1200 100644
--- a/tasks/install.yml
+++ b/tasks/install.yml
@@ -7,7 +7,8 @@
   ansible.builtin.include_tasks:
     file: docker-nvidia-setup.yml
     apply:
-      tags: docker-install-nvidia
+      tags:
+        - docker-install-nvidia
   when:
     - docker_install_nvidia | default(False)
   tags:

From 7c5ed544618b8de89ae582387d2d70de13ae975d Mon Sep 17 00:00:00 2001
From: Alan Janis
Date: Thu, 28 Nov 2024 03:11:00 +0000
Subject: [PATCH 3/3] fix typo

---
 tasks/docker-install-nvidia-vgpu.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/docker-install-nvidia-vgpu.yml b/tasks/docker-install-nvidia-vgpu.yml
index 195fe54..f45f221 100644
--- a/tasks/docker-install-nvidia-vgpu.yml
+++ b/tasks/docker-install-nvidia-vgpu.yml
@@ -46,7 +46,7 @@
           - "{{ docker_vgpu.driver }}"
         state: present
         autoremove: true
-        autocleean: true
+        autoclean: true
       register: install_nvidia_drivers
       until: install_nvidia_drivers is not failed
       notify: Reload Docker