Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

journal vgpu update #51

Merged
merged 3 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 36 additions & 14 deletions tasks/docker-install-nvidia-vgpu.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
---
- name: Remove existing Nvidia repositories
tags:
- docker-install-nvidia
block:
- name: Remove old repo files
ansible.builtin.file:
Expand All @@ -13,9 +11,8 @@
- /etc/yum/repos.d/nvidia-docker.repo

- name: Add Nvidia Grid Driver repository
when: ansible_os_family == "Debian"
tags:
- docker-install-nvidia
when:
- ansible_os_family == "Debian"
block:
- name: Install Nvidia Grid repository
ansible.builtin.deb822_repository:
Expand All @@ -39,11 +36,24 @@
register: cache_updated
until:
- cache_updated is success
- name: Wait for locks
ansible.builtin.include_tasks:
file: wait_for_locks.yml

- name: Install grid video driver
ansible.builtin.apt:
name:
- "{{ docker_vgpu.driver }}"
state: present
autoremove: true
autoclean: true
register: install_nvidia_drivers
until: install_nvidia_drivers is not failed
notify: Reload Docker

- name: Add Nvidia Grid Driver repository
when: ansible_os_family == "RedHat"
tags:
- docker-install-nvidia
when:
- ansible_os_family == "RedHat"
block:
- name: Install Nvidia Grid Repository
ansible.builtin.yum_repository:
Expand All @@ -58,24 +68,36 @@
register: yum_repo_added
until:
- yum_repo_added is success

- name: Add Nvidia Grid Driver package and license
tags:
- docker-install-nvidia
block:
- name: Wait for locks
ansible.builtin.include_tasks:
file: wait_for_locks.yml

- name: Install grid video driver
ansible.builtin.package:
ansible.builtin.dnf:
name:
- "{{ docker_vgpu.driver }}"
state: present
allowerasing: true
allow_downgrade: true
register: install_nvidia_drivers
until: install_nvidia_drivers is not failed
notify: Reload Docker

- name: Add Nvidia Grid license
block:
# - name: Wait for locks
# ansible.builtin.include_tasks:
# file: wait_for_locks.yml

# - name: Install grid video driver
# ansible.builtin.package:
# name:
# - "{{ docker_vgpu.driver }}"
# state: present
# register: install_nvidia_drivers
# until: install_nvidia_drivers is not failed
# notify: Reload Docker

- name: Retrieve grid license token
ansible.builtin.uri:
url: "{{ docker_vgpu.license_url | trim('/') }}/-/client-token"
Expand Down
10 changes: 0 additions & 10 deletions tasks/docker-install-nvidia.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
---
- name: Remove existing Nvidia repositories
tags:
- docker-install-nvidia
block:
- name: Remove old repo files
ansible.builtin.file:
Expand All @@ -13,8 +11,6 @@

- name: Add nvidia driver repo and packages
when: ansible_os_family == "Debian"
tags:
- docker-install-nvidia
block:
- name: Add repository for nvidia container toolkit and runtime
ansible.builtin.deb822_repository:
Expand All @@ -41,8 +37,6 @@

- name: Add nvidia driver repo and packages
when: ansible_os_family == "RedHat"
tags:
- docker-install-nvidia
block:
- name: Wait for locks
ansible.builtin.include_tasks:
Expand All @@ -67,8 +61,6 @@
state: present

- name: Install Nvidia Docker and Container Runtime packages
tags:
- docker-install-nvidia
block:
- name: Wait for locks
ansible.builtin.include_tasks:
Expand Down Expand Up @@ -99,8 +91,6 @@
notify: Reload Docker

- name: Patch nvidia drivers to allow more than 2 transcodes
tags:
- docker-install-nvidia
block:
- name: Create nvidia patch directory
ansible.builtin.file:
Expand Down
12 changes: 1 addition & 11 deletions tasks/docker-nvidia-setup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
- name: Configure kernel modules for nvidia
when:
- docker_install_nvidia | default(false)
tags:
- docker-install-nvidia
block:
- name: Blacklist nouveau kernel module when cuda cards are present
ansible.builtin.copy:
Expand All @@ -23,21 +21,13 @@
- name: Add nvidia driver repo and packages
ansible.builtin.include_tasks:
file: docker-install-nvidia.yml
apply:
tags: docker-install-nvidia
when:
- docker_install_nvidia | default(False)
- not docker_vgpu.enabled | default(False)
tags:
- docker-install-nvidia

- name: Add nvidia driver repo and packages
- name: Add nvidia grid driver repo and packages
ansible.builtin.include_tasks:
file: docker-install-nvidia-vgpu.yml
apply:
tags: docker-install-nvidia
when:
- docker_install_nvidia | default(False)
- docker_vgpu.enabled | default(False)
tags:
- docker-install-nvidia
29 changes: 15 additions & 14 deletions tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,27 @@
ansible.builtin.include_tasks:
file: docker-nvidia-setup.yml
apply:
tags: docker-install-nvidia
tags:
- docker-install-nvidia
when:
- docker_install_nvidia | default(False)
tags:
- docker-install-nvidia

- name: Reboot host if necessary
tags:
- always
block:
- name: Flush handlers
ansible.builtin.meta: flush_handlers
# - name: Reboot host if necessary
# tags:
# - always
# block:
# - name: Flush handlers
# ansible.builtin.meta: flush_handlers

- name: Reboot after docker updates
ansible.builtin.reboot:
connect_timeout: 10
reboot_timeout: 600
msg: Rebooting system via Ansible
when:
- docker_requires_reload|default(False)
# - name: Reboot after docker updates
# ansible.builtin.reboot:
# connect_timeout: 10
# reboot_timeout: 600
# msg: Rebooting system via Ansible
# when:
# - docker_requires_reload|default(False)

- name: Enable and start or restart docker socket and service
block:
Expand Down
4 changes: 2 additions & 2 deletions templates/docker-compose.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ RestartSec=1min
StandardError=journal
StandardOutput=journal
LogLevelMax=debug
LogRateLimitIntervalSec=10000s
LogRateLimitBurst=30s
LogRateLimitIntervalSec=30s
LogRateLimitBurst=10000

[Install]
WantedBy=multi-user.target {% if shared_storage and (data_mount_root|length > 0) %}{{ data_mount_root|regex_replace('^\\/', '') }}.mount{% else %}{% endif %}
6 changes: 5 additions & 1 deletion templates/docker-override.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,8 @@ Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target docker.socket firewalld.service
Wants=network-online.target
Requires=docker.socket
Requires=docker.socket

[Service]
EnvironmentFile=-/etc/default/docker
ExecStart=
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2375 -H unix:///var/run/docker.sock