Skip to content

Commit

Permalink
#487 working deePDM-py + horovod-hack
Browse files Browse the repository at this point in the history
  • Loading branch information
pavelToman committed Jan 23, 2025
1 parent 342e902 commit 86c81ba
Show file tree
Hide file tree
Showing 6 changed files with 9,107 additions and 11 deletions.
91 changes: 80 additions & 11 deletions 487_DeePMD-kit/deePMD.eb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ easyblock = 'PythonBundle'
# Upstream project is DeePMD-kit (see homepage); the previous 'DeePDM-kit' spelling was a typo.
name = 'DeePMD-kit'
version = '3.0.1'
# versionsuffix = '-CUDA-%(cudaver)s'
# TensorFlow version this build is paired with; it is encoded in the versionsuffix below.
local_tf_version = '2.15.1'
versionsuffix = '-TensorFlow-%s' % local_tf_version

homepage = 'https://github.com/deepmodeling/deepmd-kit/'
description = "A deep learning package for many-body potential energy representation and molecular dynamics."
Expand All @@ -22,8 +24,20 @@ toolchain = {'name': 'foss', 'version': '2023a'}
# ok 'array-api-compat',
# ok dpdata
# parallel training deps:
# horovod
# OK? horovod - horovod_v2_kenneths_hack
# OK mpi4py
# ok dpdata 0.2.21 requires monty, which is not installed.
# ok dargs 0.4.10 requires typeguard, which is not installed.
# OK mendeleev 0.20.1 requires deprecated, which is not installed.
# OK mendeleev 0.20.1 requires pydantic, which is not installed.
# ok mendeleev 0.20.1 requires pyfiglet, which is not installed.
# OK mendeleev 0.20.1 requires sqlalchemy, which is not installed.
# ok monty 2025.1.9 requires ruamel-yaml, which is not installed.
# ok wcmatch 8.5.2 requires bracex, which is not installed.
# ok typeguard 4.3.0 has requirement typing-extensions>=4.10.0, but you have typing-extensions 4.9.0. -> v4.2.0
# ok mendeleev 0.20.1 has requirement pydantic<3.0.0,>=2.9.2, but you have pydantic 2.5.3. -> 0.18.1
# ok mendeleev 0.20.1 has requirement pyfiglet<0.9,>=0.8.post1, but you have pyfiglet 1.0.2.
# ok tensorflow 2.15.1 has requirement wrapt<1.15,>=1.11.0, but you have wrapt 1.15.0.

builddependencies = [
('scikit-build-core', '0.9.3'),
Expand All @@ -41,30 +55,56 @@ dependencies = [
('TensorFlow', '2.15.1'),
('jax', '0.4.25'),
('mpi4py', '3.1.4'),
('protobuf', '24.0'),
# horovod
('Deprecated', '1.2.14'),
# ('pydantic', '2.5.3'),
('SQLAlchemy','2.0.25'),
('ruamel.yaml', '0.17.32'),
('Horovod', '0.28.1', versionsuffix),
('typing-extensions', '4.9.0'),
# ('protobuf', '24.0'),

]

use_pip = True
sanity_pip_check = True

local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '

exts_list = [
('wrapt', '1.14.1', {
'checksums': ['380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d'],
}),
('typeguard', '4.2.0', {
'checksums': ['2aeae510750fca88d0a2ceca3e86de7f71aa43b6c3e6c267737ce1f5effc4b34'],
}),
('dargs', '0.4.10', {
'checksums': ['2b39e0a93dcd323d0affb3f54ee2c11a439084d718934df08f38692dfbadddf8'],
}),
('bracex', '2.5.post1', {
'source_tmpl': '%(name)s-%(version)s-py3-none-any.whl',
'checksums': ['13e5732fec27828d6af308628285ad358047cec36801598368cb28bc631dbaf6'],
}),
('wcmatch', '8.5.2', {
'checksums': ['a70222b86dea82fb382dd87b73278c10756c138bd6f8f714e2183128887b9eb2'],
}),
('mendeleev', '0.20.1', {
'checksums': ['26f27099f0587cab82dd7f84ee31a5c48e994eb478ea2a92756c5fd50764f7da'],
('pyfiglet', '0.8.post1', {
'checksums': ['c6c2321755d09267b438ec7b936825a4910fec696292139e664ca8670e103639'],
}),
('mendeleev', '0.18.1', {
'checksums': ['a5b60bd313a5d2b404a6a250186e643663d5625c8138b3cfba829f1f4384f2a0'],
}),
('array_api_compat', '1.10.0', {
'checksums': ['eb98056fa4993e7e98860b7a1ca73c9ae1c77f1ef95366a5ebd5dec8e6d55bad'],
}),
('monty', '2025.1.9', {
'checksums': ['edb680b01ea1e59225cb666634b0dd2b2393eef07f3d45748445db92e1f1006d'],
}),
('dpdata', '0.2.21', {
'checksums': ['55dcec61bdc8707fb6b3e57406fb7c07b6ccb7a0ac763a1407cc1c3222bf58b1'],
}),
('deepmd_kit', version, {
'modulename': 'deepmd',
'preinstallopts': "module swap protobuf/3.21.9-GCCcore-12.3.0 && ",
'checksums': ['10d4443c6fe31a9a4573ed6eda73b6a669dae572cf2bc43f45e9a63aaae02cff'],
}),
]
Expand All @@ -74,13 +114,42 @@ exts_list = [
# 'dirs': ['lib/python%(pyshortver)s/site-packages'],
# }

# sanity_check_commands = ['%(name)s --help']
sanity_check_commands = ['dp -h']

moduleclass = 'ai'

# E1:
# libtensorflow_cc.so is in $EBROOTTENSORFLOW/lib/python3.11/site-packages/tensorflow/libtensorflow_cc.so.2

# E1: OK
# -> use Kenneth's hack in preinstallopts (module swap protobuf/3.21.9-GCCcore-12.3.0) -> works
# ? but in tensorflow-2.15.1 there is req protobuf: protobuf==4.23.4 not 3.21.9
# same problem as with horovod: https://github.com/vscentrum/vsc-software-stack/issues/390#issuecomment-2304201251
# <- during pip install deepmd-kit:
#error This file was generated by an older version of protoc
#error incompatible with your Protocol Buffer headers
#error regenerate this file with a newer version of protoc.
# <- during pip install deepmd-kit: <- log2.txt
# error This file was generated by an older version of protoc
# error incompatible with your Protocol Buffer headers
# error regenerate this file with a newer version of protoc.
# Building CXX object op/tf/CMakeFiles/deepmd_op.dir/cmake_pch.hxx.gch
# In file included from /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op.h:26,
# from /tmp/vsc47063/easybuild/build/DeePDMkit/3.0.1/foss-2023a/deepmd_kit/deepmd_kit-3.0.1/source/op/tf/custom_op.h:9,
# from /tmp/vsc47063/easybuild/build/DeePDMkit/3.0.1/foss-2023a/deepmd_kit/deepmd_kit-3.0.1/build/py37-none-linux_x86_64/op/tf/CMakeFiles/deepmd_op.dir/cmake_pch.hxx:5,
# from <command-line>:
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/full_type.pb.h:17:2: error: #error This file was generated by an older version of protoc which is
# 17 | #error This file was generated by an older version of protoc which is
# | ^~~~~
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/full_type.pb.h:18:2: error: #error incompatible with your Protocol Buffer headers. Please
# 18 | #error incompatible with your Protocol Buffer headers. Please
# | ^~~~~
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/full_type.pb.h:19:2: error: #error regenerate this file with a newer version of protoc.
# 19 | #error regenerate this file with a newer version of protoc.
# | ^~~~~
# In file included from /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op_def_builder.h:27,
# from /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/full_type_inference_util.h:24,
# from /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op.h:27:
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op_def.pb.h:17:2: error: #error This file was generated by an older version of protoc which is
# 17 | #error This file was generated by an older version of protoc which is
# | ^~~~~
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op_def.pb.h:18:2: error: #error incompatible with your Protocol Buffer headers. Please
# 18 | #error incompatible with your Protocol Buffer headers. Please
# | ^~~~~
# /apps/gent/RHEL8/cascadelake-ib/software/TensorFlow/2.15.1-foss-2023a/lib/python3.11/site-packages/tensorflow/include/tensorflow/core/framework/op_def.pb.h:19:2: error: #error regenerate this file with a newer version of protoc.
# 19 | #error regenerate this file with a newer version of protoc.
85 changes: 85 additions & 0 deletions 487_DeePMD-kit/deePMD_v2-torch.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
easyblock = 'PythonBundle'

# Upstream project is DeePMD-kit (see homepage); the previous 'DeePDM-kit' spelling was a typo.
name = 'DeePMD-kit'
version = '3.0.1'
# versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://github.com/deepmodeling/deepmd-kit/'
description = "A deep learning package for many-body potential energy representation and molecular dynamics."

toolchain = {'name': 'foss', 'version': '2023a'}

# DEPS:
# OK scikit-build-core>=0.5,<0.11,!=0.6.0
# OK 'numpy>=1.21',
# OK 'scipy',
# OK 'pyyaml',
# ok 'dargs >= 0.4.7',
# OK "h5py>=3.6.0,!=3.11.0; platform_system=='Linux' and platform_machine=='aarch64'",
# ok 'wcmatch',
# OK 'ml_dtypes',
# ok 'mendeleev',
# ok 'array-api-compat',
# ok dpdata
# parallel training deps:
# NO horovod -
# OK mpi4py

# Modules needed only while building.
builddependencies = [
    ('scikit-build-core', '0.9.3'),
    ('hatchling', '1.18.0'),
    ('poetry', '1.5.1'),
    ('git', '2.41.0', '-nodocs'),
]

# Runtime modules. In this variant the TensorFlow, jax, protobuf and CUDA
# entries are commented out and PyTorch is listed instead.
dependencies = [
    # ('CUDA', '12.6.0', '', SYSTEM),
    ('Python', '3.11.3'),
    ('SciPy-bundle', '2023.07'),
    ('ml_dtypes', '0.3.2'),
    ('PyYAML', '6.0'),
    ('h5py', '3.9.0'),
    ('PyTorch', '2.1.2'),
    # ('TensorFlow', '2.15.1'),
    # ('jax', '0.4.25'),
    ('mpi4py', '3.1.4'),
    # ('protobuf', '24.0'),
    # horovod
]

use_pip = True
sanity_pip_check = True

# preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
# Shell prefix for the deepmd_kit install command: enables the PyTorch backend,
# disables the TensorFlow backend and points PYTORCH_ROOT at the PyTorch module root.
local_preinstallopts = 'export DP_ENABLE_PYTORCH=1 && export DP_ENABLE_TENSORFLOW=0 && '
local_preinstallopts += 'export PYTORCH_ROOT=$EBROOTPYTORCH && '

# NOTE(review): the TensorFlow variant of this easyconfig records pip-check complaints
# about mendeleev 0.20.1 (pydantic/pyfiglet/sqlalchemy/deprecated) and pins 0.18.1 plus
# extra extensions instead; with sanity_pip_check enabled this recipe may need the same
# treatment - confirm with a test build.
exts_list = [
    ('dargs', '0.4.10', {
        'checksums': ['2b39e0a93dcd323d0affb3f54ee2c11a439084d718934df08f38692dfbadddf8'],
    }),
    ('wcmatch', '8.5.2', {
        'checksums': ['a70222b86dea82fb382dd87b73278c10756c138bd6f8f714e2183128887b9eb2'],
    }),
    ('mendeleev', '0.20.1', {
        'checksums': ['26f27099f0587cab82dd7f84ee31a5c48e994eb478ea2a92756c5fd50764f7da'],
    }),
    ('array_api_compat', '1.10.0', {
        'checksums': ['eb98056fa4993e7e98860b7a1ca73c9ae1c77f1ef95366a5ebd5dec8e6d55bad'],
    }),
    ('dpdata', '0.2.21', {
        'checksums': ['55dcec61bdc8707fb6b3e57406fb7c07b6ccb7a0ac763a1407cc1c3222bf58b1'],
    }),
    ('deepmd_kit', version, {
        # The distribution is named deepmd_kit but the importable package is 'deepmd';
        # without this the extension import check would look for 'deepmd_kit'.
        'modulename': 'deepmd',
        # Reuse the prefix defined above instead of duplicating the same string.
        'preinstallopts': local_preinstallopts,
        'checksums': ['10d4443c6fe31a9a4573ed6eda73b6a669dae572cf2bc43f45e9a63aaae02cff'],
    }),
]

# sanity_check_paths = {
#     'files': ['bin/%(name)s'],
#     'dirs': ['lib/python%(pyshortver)s/site-packages'],
# }

# Smoke-test the installed command-line entry point; the TensorFlow variant of this
# easyconfig uses the same check. ('%(name)s --help' is not a usable command here.)
sanity_check_commands = ['dp -h']

moduleclass = 'ai'
52 changes: 52 additions & 0 deletions 487_DeePMD-kit/horovod_v2_kenneths_hack.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
easyblock = 'PythonBundle'

name = 'Horovod'
version = '0.28.1'

# Pieces of the version suffix. The %(cudaver)s placeholder is kept literally in
# the resulting string; only the TensorFlow version is substituted here.
local_tf_version = '2.15.1'
local_cuda_suffix = '-CUDA-%(cudaver)s'
versionsuffix = local_cuda_suffix + '-TensorFlow-' + local_tf_version

homepage = 'https://github.com/uber/horovod'
description = "Horovod is a distributed training framework for TensorFlow."

toolchain = {
    'name': 'foss',
    'version': '2023a',
}

# Build-time only.
builddependencies = [
    ('CMake', '3.26.3'),
    # ('protobuf', '3.21.9'),
]

# Runtime modules; NCCL and TensorFlow are requested with the CUDA version suffix.
dependencies = [
    ('Python', '3.11.3'),
    ('PyYAML', '6.0'),
    ('CUDA', '12.1.1', '', SYSTEM),
    ('NCCL', '2.18.3', local_cuda_suffix),
    ('TensorFlow', local_tf_version, local_cuda_suffix),
]

use_pip = True
sanity_pip_check = True

# Prefix for the install command: first swap the loaded protobuf module for
# 3.21.9, then set the HOROVOD_* build switches (MPI, NCCL collectives,
# TensorFlow on; PyTorch and MXNet off).
preinstallopts = (
    'module swap protobuf/3.21.9-GCCcore-12.3.0 && HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
    'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
)

# Extensions installed by this bundle, in order: cloudpickle, then horovod itself
# (with the flatbuffers patch; the second checksum belongs to the patch file).
exts_list = [
    ('cloudpickle', '2.2.1', {
        'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
    }),
    ('horovod', version, {
        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
        'checksums': [
            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
        ],
    }),
]

# Post-install checks: the horovodrun launcher and the site-packages directory
# must exist, and the launcher must be able to print its help text.
sanity_check_paths = {
    'files': ['bin/horovodrun'],
    'dirs': ['lib/python%(pyshortver)s/site-packages'],
}

sanity_check_commands = ['horovodrun --help']

moduleclass = 'tools'
52 changes: 52 additions & 0 deletions 487_DeePMD-kit/horovod_v3_NOCUDA_kenneths_hack.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
easyblock = 'PythonBundle'

name = 'Horovod'
version = '0.28.1'

# CPU-only variant: the CUDA part of the suffix is commented out, so the suffix
# carries only the TensorFlow version.
local_tf_version = '2.15.1'
# local_cuda_suffix = '-CUDA-%(cudaver)s'
# versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
versionsuffix = '-TensorFlow-' + local_tf_version

homepage = 'https://github.com/uber/horovod'
description = "Horovod is a distributed training framework for TensorFlow."

toolchain = {
    'name': 'foss',
    'version': '2023a',
}

# Build-time only.
builddependencies = [
    ('CMake', '3.26.3'),
    # ('protobuf', '3.21.9'),
]

# Runtime modules; TensorFlow is requested without a CUDA suffix in this variant.
dependencies = [
    ('Python', '3.11.3'),
    ('PyYAML', '6.0'),
    ('TensorFlow', local_tf_version),
]

use_pip = True
sanity_pip_check = True

# Prefix for the horovod install command: swap the loaded protobuf module for
# 3.21.9, then set the HOROVOD_* build switches (MPI and TensorFlow on;
# PyTorch and MXNet off).
local_preinstallopts = (
    'module swap protobuf/3.21.9-GCCcore-12.3.0 && HOROVOD_WITH_MPI=1 '
    'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
)

# Extensions installed by this bundle, in order. Only horovod receives the
# preinstallopts prefix; the second horovod checksum belongs to the patch file.
exts_list = [
    ('cloudpickle', '2.2.1', {
        'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
    }),
    ('horovod', version, {
        'preinstallopts': local_preinstallopts,
        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
        'checksums': [
            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
        ],
    }),
]

# Post-install checks: the horovodrun launcher and the site-packages directory
# must exist, and the launcher must be able to print its help text.
sanity_check_paths = {
    'files': ['bin/horovodrun'],
    'dirs': ['lib/python%(pyshortver)s/site-packages'],
}

sanity_check_commands = ['horovodrun --help']

moduleclass = 'tools'
Loading

0 comments on commit 86c81ba

Please sign in to comment.