From b17d3d74eb5a9e7640d94f98f6b27ce4891b3c26 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 6 Oct 2019 04:37:01 +0200 Subject: Integration of CentOS8 and ipecompute nodes --- docker.yml | 6 ++++++ install.yml | 20 ++++++++++++++++++ inventories/ipe.erb | 17 ++++++++++++++++ rocm.yml | 7 +++++++ roles/common/tasks/main.yml | 13 +++++------- roles/common/tasks/main_dnf.yml | 15 ++++++++++++++ roles/common/tasks/main_yum.yml | 24 ++++++++++++++++++++++ roles/common/tasks/software.yml | 17 ---------------- roles/cuda/vars/centos-8.yml | 4 ++++ roles/cuda/vars/redhat-8.yml | 4 ++++ roles/docker/defaults/main.yml | 8 ++++++++ roles/docker/handlers/main.yml | 4 ++++ roles/docker/tasks/configure_apt.yml | 15 ++++++++++++++ roles/docker/tasks/configure_dnf.yml | 18 +++++++++++++++++ roles/docker/tasks/configure_yum.yml | 38 +++++++++++++++++++++++++++++++++++ roles/docker/tasks/install_docker.yml | 14 +++++++++++++ roles/docker/tasks/install_podman.yml | 12 +++++++++++ roles/docker/tasks/main.yml | 24 ++++++++++++++++++++++ roles/docker/vars/centos-7.yml | 3 +++ roles/docker/vars/centos-8.yml | 11 ++++++++++ roles/rocm/tasks/main.yml | 20 ++++++++++++++++++ roles/storage/defaults/main.yml | 1 + roles/storage/tasks/ipecompute2.yml | 17 ++++++++++++++++ roles/storage/tasks/ipecompute4.yml | 35 ++++++++++++++++++++++++++++++++ roles/storage/tasks/main.yml | 20 ++++++++++++------ roles/storage/tasks/nfs.yml | 12 +++++++++++ 26 files changed, 348 insertions(+), 31 deletions(-) create mode 100644 docker.yml create mode 100644 rocm.yml create mode 100644 roles/common/tasks/main_dnf.yml create mode 100644 roles/common/tasks/main_yum.yml create mode 100644 roles/cuda/vars/centos-8.yml create mode 100644 roles/cuda/vars/redhat-8.yml create mode 100644 roles/docker/defaults/main.yml create mode 100644 roles/docker/handlers/main.yml create mode 100644 roles/docker/tasks/configure_apt.yml create mode 100644 roles/docker/tasks/configure_dnf.yml create mode 100644 roles/docker/tasks/configure_yum.yml create mode 100644 roles/docker/tasks/install_docker.yml create mode 100644 roles/docker/tasks/install_podman.yml create mode 100644 roles/docker/tasks/main.yml create mode 100644 roles/docker/vars/centos-7.yml create mode 100644 roles/docker/vars/centos-8.yml create mode 100644 roles/rocm/tasks/main.yml create mode 100644 roles/storage/defaults/main.yml create mode 100644 roles/storage/tasks/ipecompute2.yml create mode 100644 roles/storage/tasks/ipecompute4.yml create mode 100644 roles/storage/tasks/nfs.yml diff --git a/docker.yml b/docker.yml new file mode 100644 index 0000000..ea91aed --- /dev/null +++ b/docker.yml @@ -0,0 +1,6 @@ +- name: Docker + hosts: all + remote_user: root + roles: + - role: docker + diff --git a/install.yml b/install.yml index 278dac9..f1acdd8 100644 --- a/install.yml +++ b/install.yml @@ -15,9 +15,29 @@ remote_user: root roles: - role: cuda + +# The AMD driver is "surprisingly" crashing +#- name: ROCM +# hosts: rcom +# remote_user: root +# roles: +# - role: rocm + + +- name: Docker + hosts: docker + remote_user: root + roles: + - role: docker - name: Desktop hosts: desktop remote_user: root roles: - role: desktop + +- name: Additional Local and Network Storage + hosts: infra + remote_user: root + roles: + - role: storage diff --git a/inventories/ipe.erb b/inventories/ipe.erb index df62890..20edf72 100644 --- a/inventories/ipe.erb +++ b/inventories/ipe.erb @@ -1,6 +1,9 @@ [ands] 192.168.26.[140:149] +[compute] +192.168.26.[130:139] + [camera] 192.168.26.[80:89] @@ -13,7 +16,21 @@ student [cuda] 192.168.26.[80:84] 192.168.26.[86:89] +192.168.26.[131:133] + +[rocm] +192.168.26.134 + +[docker] +192.168.26.[131:139] [ib] 192.168.26.[60:69] 192.168.26.[80:89] +192.168.26.[130:139] +192.168.26.[140:149] + +[infra] +192.168.26.[80:89] +192.168.26.[130:139] +192.168.26.[140:149] diff --git a/rocm.yml b/rocm.yml new file mode 100644 index 0000000..c76c068 --- /dev/null +++ b/rocm.yml @@ -0,0 +1,7 @@ +- name: Common Software + hosts: all + remote_user: root + roles: + - role: rocm + + diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml index 286a027..9f3cf79 100644 --- a/roles/common/tasks/main.yml +++ b/roles/common/tasks/main.yml @@ -5,14 +5,11 @@ - epel-release when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux' -- name: Add our repository with updates and overrides - yum_repository: name="{{ item.name }}" description= "{{ item.description | default('Ands repository') }}" baseurl="{{ item.url }}" enabled="yes" gpgcheck="no" cost="{{ item.cost | default(1) }}" - with_items: "{{ ands_repositories | default([]) }}" - -# We always update on first install and if requested -- name: Update CentOS - yum: name=* state=latest update_cache=yes - when: (result | changed) or (os_update | default(false)) +- include_tasks: main_yum.yml + when: ansible_pkg_mgr == 'yum' + +- include_tasks: main_dnf.yml + when: ansible_pkg_mgr == 'dnf' - name: Install additional software include_tasks: software.yml diff --git a/roles/common/tasks/main_dnf.yml b/roles/common/tasks/main_dnf.yml new file mode 100644 index 0000000..0572132 --- /dev/null +++ b/roles/common/tasks/main_dnf.yml @@ -0,0 +1,15 @@ +- name: Add our repository with updates and overrides + yum_repository: name="{{ item.name }}" description= "{{ item.description | default('Ands repository') }}" baseurl="{{ item.url }}" enabled="yes" gpgcheck="no" cost="{{ item.cost | default(1) }}" + with_items: "{{ ands_repositories | default([]) }}" + +# We always update on first install and if requested +- name: Update CentOS + dnf: name=* state=latest + when: (result | changed) or (os_update | default(false)) + +- name: Install various ansible requirements + package: name={{item}} state=present + with_items: + - yum-plugin-versionlock + - python-rhsm-certificates +# - iptables-services diff --git a/roles/common/tasks/main_yum.yml b/roles/common/tasks/main_yum.yml new file mode 100644 index 0000000..2b320d5 --- /dev/null +++ b/roles/common/tasks/main_yum.yml @@ -0,0 +1,24 @@ +- name: Add our repository with updates and overrides + yum_repository: name="{{ item.name }}" description= "{{ item.description | default('Ands repository') }}" baseurl="{{ item.url }}" enabled="yes" gpgcheck="no" cost="{{ item.cost | default(1) }}" + with_items: "{{ ands_repositories | default([]) }}" + +# We always update on first install and if requested +- name: Update CentOS + yum: name=* state=latest update_cache=yes + when: (result | changed) or (os_update | default(false)) + +- name: Install various ansible requirements + package: name={{item}} state=present + with_items: + - yum-plugin-versionlock + - libselinux-python + - libsemanage-python + - yamllint + - pyOpenSSL + - python-passlib + - python2-ruamel-yaml + - python2-jmespath + - python-ipaddress + - iptables-services + - PyYAML + - python-rhsm-certificates diff --git a/roles/common/tasks/software.yml b/roles/common/tasks/software.yml index c621ef3..3a1a5c1 100644 --- a/roles/common/tasks/software.yml +++ b/roles/common/tasks/software.yml @@ -1,19 +1,3 @@ -- name: Install various ansible requirements - package: name={{item}} state=present - with_items: - - yum-plugin-versionlock - - libselinux-python - - libsemanage-python - - yamllint - - pyOpenSSL - - python-passlib - - python2-ruamel-yaml - - python2-jmespath - - python-ipaddress - - iptables-services - - PyYAML - - python-rhsm-certificates - - name: Install various administrative tools package: name={{item}} state=present with_items: @@ -21,7 +5,6 @@ - telnet - lsof - strace - - bzr - git - pciutils diff --git a/roles/cuda/vars/centos-8.yml b/roles/cuda/vars/centos-8.yml new file mode 100644 index 0000000..935e84d --- /dev/null +++ b/roles/cuda/vars/centos-8.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: rhel8 + +# vim:ft=ansible: \ No newline at end of file diff --git a/roles/cuda/vars/redhat-8.yml b/roles/cuda/vars/redhat-8.yml new file mode 100644 index 0000000..935e84d --- /dev/null +++ b/roles/cuda/vars/redhat-8.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: rhel8 + +# vim:ft=ansible: \ No newline at end of file diff --git a/roles/docker/defaults/main.yml b/roles/docker/defaults/main.yml new file mode 100644 index 0000000..a5bcb04 --- /dev/null +++ b/roles/docker/defaults/main.yml @@ -0,0 +1,8 @@ +--- +docker_repo_url: "https://download.docker.com/linux" +nvidia_docker_repo_url: "https://nvidia.github.io" + +nvidia_repos: + - libnvidia-container + - nvidia-container-runtime + - nvidia-docker diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml new file mode 100644 index 0000000..3eb0349 --- /dev/null +++ b/roles/docker/handlers/main.yml @@ -0,0 +1,4 @@ +--- +- name: restart docker + systemd: name="docker" daemon_reload="yes" state="restarted" + become: yes diff --git a/roles/docker/tasks/configure_apt.yml b/roles/docker/tasks/configure_apt.yml new file mode 100644 index 0000000..3fd961a --- /dev/null +++ b/roles/docker/tasks/configure_apt.yml @@ -0,0 +1,15 @@ +--- +# tasks file for ansible-role-cuda +- name: Trust packaging key for Nvidia repositories (apt) + apt_key: + data: "{{ lookup('file', 'files/nvidia_docker_packaging_key.asc') }}" + id: "{{ nvidia_docker_packaging_key_id }}" + state: present + +- name: Configure Nvidia repository (apt) + apt_repository: + repo: "deb {{ nvidia_docker_repo_url }}/{{ nvidia_docker_repo_subfolder }} /" + filename: nvidia_docker + state: present + +# vim:ft=ansible: diff --git a/roles/docker/tasks/configure_dnf.yml b/roles/docker/tasks/configure_dnf.yml new file mode 100644 index 0000000..73ecb30 --- /dev/null +++ b/roles/docker/tasks/configure_dnf.yml @@ -0,0 +1,18 @@ +--- +- name: Import NVIDIA Docker repository gpg keys + rpm_key: + key: "{{ nvidia_docker_repo_url }}/{{ item }}/gpgkey" + state: present + with_items: "{{ nvidia_repos }}" + when: "'cuda' in group_names" + +- name: Configure Nvidia repositories (yum) + yum_repository: + name: "{{ item }}" + description: Official {{ item }} repository + baseurl: "{{ nvidia_docker_repo_url }}/{{ item }}/{{ nvidia_docker_repo_subfolder }}/x86_64/" + gpgkey: "{{ nvidia_docker_repo_url }}/{{ item }}/gpgkey" + gpgcheck: no + enabled: yes + with_items: "{{ nvidia_repos }}" + when: "'cuda' in group_names" diff --git a/roles/docker/tasks/configure_yum.yml b/roles/docker/tasks/configure_yum.yml new file mode 100644 index 0000000..99a2743 --- /dev/null +++ b/roles/docker/tasks/configure_yum.yml @@ -0,0 +1,38 @@ +--- +- name: Upload packaging key for docker repositories + copy: + src: docker_packaging_key.asc + dest: "{{ docker_rpm_key_path }}" + mode: 0644 + +- name: Import Docker CE repository gpg key + rpm_key: + key: https://download.docker.com/linux/centos/gpg + state: present + +- name: Import NVIDIA Docker repository gpg keys + rpm_key: + key: "{{ nvidia_docker_repo_url }}/{{ item }}/gpgkey" + state: present + with_items: "{{ nvidia_repos }}" + +- name: Configure docker repositories (yum) + yum_repository: + name: "docker-ce-{{ item }}" + description: Official docker-ce repository + baseurl: "{{ docker_repo_url }}/{{ docker_repo_subfolder }}/x86_64/{{ item }}" + gpgkey: https://download.docker.com/linux/centos/gpg + gpgcheck: yes + enabled: yes + with_items: + - stable + +- name: Configure Nvidia repositories (yum) + yum_repository: + name: "{{ item }}" + description: Official {{ item }} repository + baseurl: "{{ nvidia_docker_repo_url }}/{{ item }}/{{ nvidia_docker_repo_subfolder }}/x86_64/" + gpgkey: "{{ nvidia_docker_repo_url }}/{{ item }}/gpgkey" + gpgcheck: no + enabled: yes + with_items: "{{ nvidia_repos }}" diff --git a/roles/docker/tasks/install_docker.yml b/roles/docker/tasks/install_docker.yml new file mode 100644 index 0000000..9ae0cb9 --- /dev/null +++ b/roles/docker/tasks/install_docker.yml @@ -0,0 +1,14 @@ +- name: Install requirements + package: name="{{ item }}" state=present + with_items: + - lvm2 + +- name: Install nvidia docker + package: name="nvidia-docker2" state=present + notify: + - restart docker + +- name: Install additional packages + package: name="{{ item }}" state=present + with_items: + - docker-compose diff --git a/roles/docker/tasks/install_podman.yml b/roles/docker/tasks/install_podman.yml new file mode 100644 index 0000000..3498aa7 --- /dev/null +++ b/roles/docker/tasks/install_podman.yml @@ -0,0 +1,12 @@ +- name: Install docker packages + package: name="{{ item }}" state=present + with_items: + - podman + - buildah + - skopeo + +- name: Install NVIDIA packages + package: name="{{ item }}" state=present + with_items: + - nvidia-container-runtime + when: "'cuda' in group_names" diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml new file mode 100644 index 0000000..f13f99f --- /dev/null +++ b/roles/docker/tasks/main.yml @@ -0,0 +1,24 @@ +--- +- name: "Gather OS specific variables" + include_vars: "{{ item }}" + with_first_found: + - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version }}.yml" + - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version }}.yml" + - "{{ ansible_distribution|lower }}.yml" + - "{{ ansible_os_family|lower }}.yml" + +- include_tasks: configure_yum.yml + when: ansible_pkg_mgr == 'yum' + +- include_tasks: configure_dnf.yml + when: ansible_pkg_mgr == 'dnf' + +- include_tasks: configure_apt.yml + when: ansible_pkg_mgr == 'apt' + +- include_tasks: install_docker.yml + when: ansible_pkg_mgr == 'yum' or ansible_pkg_mgr == 'apt' + +- include_tasks: install_podman.yml + when: ansible_pkg_mgr == 'dnf' + diff --git a/roles/docker/vars/centos-7.yml b/roles/docker/vars/centos-7.yml new file mode 100644 index 0000000..e681468 --- /dev/null +++ b/roles/docker/vars/centos-7.yml @@ -0,0 +1,3 @@ +--- +nvidia_docker_repo_subfolder: centos7 +docker_repo_subfolder: centos/7 diff --git a/roles/docker/vars/centos-8.yml b/roles/docker/vars/centos-8.yml new file mode 100644 index 0000000..d4d24fe --- /dev/null +++ b/roles/docker/vars/centos-8.yml @@ -0,0 +1,11 @@ +--- +# While we have RHEL8 repo, in fact it references centos7 packages +# https://nvidia.github.io/nvidia-docker/rhel8.0/nvidia-docker.repo +# nvidia_docker_repo_subfolder: rhel8.0 +nvidia_docker_repo_subfolder: centos7 +#docker_repo_subfolder: centos/8 + + +nvidia_repos: + - libnvidia-container + - nvidia-container-runtime diff --git a/roles/rocm/tasks/main.yml b/roles/rocm/tasks/main.yml new file mode 100644 index 0000000..4ae1a87 --- /dev/null +++ b/roles/rocm/tasks/main.yml @@ -0,0 +1,20 @@ +- name: Configure DarkSoft repositories (for packages mangling provides/requires to suit ROCm) + yum_repository: + name: "ands_centos8" + description: Various packages for CentOS8 + baseurl: "http://ufo.kit.edu/ands/repos/centos8/centos8/" + gpgcheck: no + enabled: yes + +- name: Configure ROCm repositories (yum) + yum_repository: + name: "rocm" + description: AMD ROCm Drivers and Infrastructure + baseurl: "http://repo.radeon.com/rocm/yum/rpm/" + gpgcheck: no + enabled: yes + +- name: Install ROCm drivers and packages + package: name="{{ item }}" state=present + with_items: + - rocm-dkms diff --git a/roles/storage/defaults/main.yml b/roles/storage/defaults/main.yml new file mode 100644 index 0000000..ca36e70 --- /dev/null +++ b/roles/storage/defaults/main.yml @@ -0,0 +1 @@ +compute4_ssds: ['sda','sdb','sdc','sdd','sde','sdf','sdg','sdh'] diff --git a/roles/storage/tasks/ipecompute2.yml b/roles/storage/tasks/ipecompute2.yml new file mode 100644 index 0000000..9b2cef8 --- /dev/null +++ b/roles/storage/tasks/ipecompute2.yml @@ -0,0 +1,17 @@ +- name: Delete partitions + parted: device="/dev/sda" label="gpt" number="{{ item }}" state="absent" + with_items: [ 2, 3, 4 ] + +- name: Create partition + parted: + device: "/dev/sda" + label: "gpt" + number: 1 + name: "fast" + state: "present" + +- name: arrays | Creating Array(s) Filesystem + filesystem: dev="/dev/sda1" fstype="xfs" + +- name: arrays | Mounting Array(s) + mount: name="/mnt/fast" src="/dev/sda1" fstype="xfs" state="mounted" diff --git a/roles/storage/tasks/ipecompute4.yml b/roles/storage/tasks/ipecompute4.yml new file mode 100644 index 0000000..5b3a88f --- /dev/null +++ b/roles/storage/tasks/ipecompute4.yml @@ -0,0 +1,35 @@ +--- +#- name: Delete partitions +# parted: device="/dev/{{ item[0] }}" label="gpt" number="{{ item[1] }}" state="absent" +# with_nested: +# - "{{ compute4_ssds }}" +# - [ 2, 3, 4 ] + +- name: Create partition + parted: + device: "/dev/{{ item }}" + label: "gpt" + number: 1 + name: "softraid" + flags: [raid] + state: "present" + failed_when: false + with_items: "{{ compute4_ssds }}" + +- name: arrays | Checking Status Of Array(s) + shell: "cat /proc/mdstat | grep md10" + register: "array_check" + changed_when: false + failed_when: false + check_mode: no + +- name: arrays | Creating Array(s) + shell: "yes | mdadm --create /dev/md10 --level=0 --raid-devices={{ compute4_ssds | count }} {{ compute4_ssds | map('regex_replace', '(.*)', '/dev/\\1') | join ('1 ') }}1" + register: "array_created" + when: array_check.rc != 0 + +- name: arrays | Creating Array(s) Filesystem + filesystem: dev="/dev/md10" fstype="xfs" + +- name: arrays | Mounting Array(s) + mount: name="/mnt/fast" src="/dev/md10" fstype="xfs" state="mounted" diff --git a/roles/storage/tasks/main.yml b/roles/storage/tasks/main.yml index 871e785..014e396 100644 --- a/roles/storage/tasks/main.yml +++ b/roles/storage/tasks/main.yml @@ -1,9 +1,17 @@ --- -- name: Ensure NFS common is installed. - package: name=nfs-utils state=present +- name: Ensure required software is installed. + package: name="{{ item }}" state=present + with_items: [ 'parted', 'mdadm', 'nfs-utils' ] -- name: Create mountable dir - file: path=/mnt/ands state=directory mode=755 owner=root group=root +- debug: msg="{{ inventory_hostname }}" -- name: set mountpoints - mount: name=/mnt/ands src=192.168.26.140:/mnt/ands fstype=nfs4 opts=defaults,minorversion=1,_netdev,nofail,soft,nodiratime,noatime dump=0 passno=0 state=mounted +- name: configure network fs + include_tasks: nfs.yml + +- name: configure ipepdvcompute2 + include_tasks: ipecompute2.yml + when: inventory_hostname == '192.168.26.132' + +- name: configure ipepdvcompute4 + include_tasks: ipecompute4.yml + when: inventory_hostname == '192.168.26.134' diff --git a/roles/storage/tasks/nfs.yml b/roles/storage/tasks/nfs.yml new file mode 100644 index 0000000..9dbd467 --- /dev/null +++ b/roles/storage/tasks/nfs.yml @@ -0,0 +1,12 @@ +--- +- name: Create mountable dir + file: path=/mnt/ands state=directory mode=755 owner=root group=root + +- name: Create mountable dir + file: path=/mnt/pdv state=directory mode=755 owner=root group=root + +- name: set mountpoints + mount: name=/mnt/ands src=192.168.26.140:/mnt/ands fstype=nfs4 opts=defaults,minorversion=1,_netdev,nofail,soft,nodiratime,noatime dump=0 passno=0 state=absent + +- name: set mountpoints + mount: name=/mnt/pdv src=192.168.26.170:/pdv fstype=nfs opts=defaults,_netdev,nofail,soft,nodiratime,noatime dump=0 passno=0 state=mounted -- cgit v1.2.3