diff --git a/CMakeLists.txt b/CMakeLists.txt index 3633794c..88514b6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ include(FindPkgConfig) # Create project and check C compiler cmake_policy(SET CMP0048 NEW) -project(STARS-H VERSION 0.1.1 LANGUAGES C Fortran) +project(STARS-H VERSION 0.3.0 LANGUAGES C Fortran) message(STATUS "Building ${PROJECT_NAME} ${PROJECT_VERSION}") @@ -69,6 +69,12 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING option(OPENMP "Use OpenMP" ON) option(MPI "Use MPI" ON) option(STARPU "Use StarPU" ON) +# Since KBLAS does not support pkg-config, the user has +# to provide its path by means of +# CFLAGS="-I/path/to/kblas/include -L/path/to/kblas/lib" +option(KBLAS "Use KBLAS" ON) +option(CUDA "Use CUDA" ON) +#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_70,code=sm_70") # Option to force using parallel blas instead of sequential option(USE_PARALLEL_BLAS "Prefer parallel blas libraries" OFF) @@ -109,7 +115,7 @@ endif() set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) # the RPATH to be used when installing #set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") - +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) # Packaging (make package) #set(CPACK_PACKAGE_VERSION ${STARSH_VERSION}) #set(CPACK_GENERATOR "TGZ") @@ -174,6 +180,25 @@ if(STARPU) endif() endif() +# KBLAS depends on CUDA +if(KBLAS) + set(CUDA ON) +endif() + +# Check CUDA option +if(CUDA) + # If CUDA itself is available + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + add_definitions("-DCUDA") + # If it is not available + else() + set(CUDA OFF) + # Also disable dependent KBLAS option + set(KBLAS OFF) + endif() +endif(CUDA) + # Check if GNU Scientific Library is available (for Matern kernel and # Bessel function) if(GSL) @@ -306,6 +331,17 @@ if(BLA_VENDOR MATCHES "Intel") add_definitions("-DMKL") endif() +if(STARPU AND KBLAS) + add_definitions("-DKBLAS") +# find_package(MAGMA) +# if(MAGMA_FOUND) +# include_directories(${MAGMA_INCLUDE_DIRS}) +# 
link_directories(${MAGMA_LIBRARY_DIRS}) +# add_definitions("-DKBLAS") +# else() +# set(KBLAS OFF) +# endif() +endif() ############################################################################### ## PRINT CONFIGURATION ## diff --git a/Data.md b/Data.md new file mode 100644 index 00000000..6d33e696 --- /dev/null +++ b/Data.md @@ -0,0 +1,18 @@ +# Dataset + +## Mesh Deformation Application + +Dataset is available in KAUST repository: https://repository.kaust.edu.sa/handle/10754/664938. + +DOI:10.25781/KAUST-V2EF2 + +## Acoustic Scattering Application + +Dataset is available in KAUST repository: https://repository.kaust.edu.sa/handle/10754/664400. + + +DOI:10.25781/KAUST-I0634 + +For more information on the dataset please refer to the readme files in the data repositories. + + diff --git a/Jenkinsfile b/Jenkinsfile index 3c57966a..b3b68c44 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -117,5 +117,4 @@ pipeline { } } } - } diff --git a/README.md b/README.md index e968b0bf..613ffcdf 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,16 @@ What is STARS-H? ================ -STARS-H is a **high performance parallel open-source** package of **Software -for Testing Accuracy, Reliability and Scalability of Hierarchical -computations**. It -provides a hierarchical matrix market in order to benchmark performance of -various libraries for hierarchical matrix compressions and computations -(including itself). **Why hierarchical matrices?** Because such matrices arise -in many PDEs and use much less memory, while requiring fewer flops for -computations. There are several hierarchical data formats, each one with its -own performance and memory footprint. STARS-H intends to provide a standard for -assessing accuracy and performance of hierarchical matrix libraries on a given -hardware architecture environment. 
STARS-H currently supports the tile low-rank -(TLR) data format for approximation on shared and distributed-memory systems, -using MPI, OpenMP and task-based programming models. +The Software for Testing Accuracy, Reliability and Scalability of Hierarchical (STARS-H) +computations is a parallel library that provides a high performance matrix market of +rank structured matrix operators. STARS-H supports various matrix kernels that are +proxies for many scientific applications, and optionally compresses them by exploiting +their data sparsity. This translates into a lower arithmetic complexity and memory footprint. +STARS-H intends to provide a standard software environment for assessing accuracy and performance +of š“—-matrix libraries on a given hardware architecture. STARS-H currently supports +the tile low-rank (TLR) data format for approximation on shared and distributed-memory systems, +possibly equipped with GPUs, using MPI, OpenMP and task-based programming models. + Vision of STARS-H ================= @@ -49,7 +46,7 @@ Applications in matrix-free form: 3. Electrodynamics (sin(kr)/r and cos(kr)/r) 4. Random synthetic TLR matrix 5. Spatial statistics (exponential, square exponential and matern kernels) -6. Mesh deformation using radial basis function (gaussian, exponential, inverse quadratic, inverse multi-quadratic, CPTS, and Wendland kernels) +6. Mesh deformation using radial basis functions, i.e., Gaussian, exponential, inverse quadratic, inverse multi-quadratic, CPTS, and Wendland kernels. 7. Acoustic scattering @@ -138,5 +135,11 @@ and have additional steps on approximation of corresponding matrices. *Important notice: the approximation phase does not require the entire dense matrix to be stored, since matrix elements are computed on the fly.* +Dataset +======== + +Please see Data.md for information about the dataset. 
+ + ![Handout](docs/STARS-H-final.png) diff --git a/SARS-CoV-2-meshes/GeneratePopulation.py b/SARS-CoV-2-meshes/GeneratePopulation.py index 596b51cb..843a8e66 100644 --- a/SARS-CoV-2-meshes/GeneratePopulation.py +++ b/SARS-CoV-2-meshes/GeneratePopulation.py @@ -1,4 +1,4 @@ -# @version 1.3.0 +# @version 0.3.0 import pandas as pd import numpy as np diff --git a/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py b/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py index 627fef91..eda0d3b0 100644 --- a/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py +++ b/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py @@ -1,4 +1,4 @@ -# @version 1.3.0 +# @version 0.3.0 import pandas as pd import numpy as np diff --git a/VERSION.txt b/VERSION.txt index f0bb29e7..0d91a54c 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.3.0 +0.3.0 diff --git a/docs/STARS-H-final.png b/docs/STARS-H-final.png index b1873204..0e707d57 100644 Binary files a/docs/STARS-H-final.png and b/docs/STARS-H-final.png differ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index aca37ad9..b2bee36c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/examples/approximation/CMakeLists.txt b/examples/approximation/CMakeLists.txt index c9634b2f..c67529b3 100644 --- a/examples/approximation/CMakeLists.txt +++ b/examples/approximation/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/approximation/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/examples/approximation/dense.c b/examples/approximation/dense.c index 6bf6662d..5ed3c777 100644 --- a/examples/approximation/dense.c +++ b/examples/approximation/dense.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * 
@file examples/approximation/dense.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/minimal.c b/examples/approximation/minimal.c index f41b632e..b8edb580 100644 --- a/examples/approximation/minimal.c +++ b/examples/approximation/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/randtlr.c b/examples/approximation/randtlr.c index 4a5504f3..2dd751a3 100644 --- a/examples/approximation/randtlr.c +++ b/examples/approximation/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/spatial.c b/examples/approximation/spatial.c index eb4d95c5..525497fd 100644 --- a/examples/approximation/spatial.c +++ b/examples/approximation/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/CMakeLists.txt b/examples/problem/CMakeLists.txt index da6e3349..bd1df186 100644 --- a/examples/problem/CMakeLists.txt +++ b/examples/problem/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/problem/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/examples/problem/acoustic.c b/examples/problem/acoustic.c index 92d4dc30..a925d84a 100644 --- a/examples/problem/acoustic.c +++ b/examples/problem/acoustic.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/acoustic.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab 
Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/dense.c b/examples/problem/dense.c index 87df8179..3e3e53f4 100644 --- a/examples/problem/dense.c +++ b/examples/problem/dense.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/dense.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/minimal.c b/examples/problem/minimal.c index 18a705f6..53595145 100644 --- a/examples/problem/minimal.c +++ b/examples/problem/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/particles.c b/examples/problem/particles.c index 57e1aa28..2c05482f 100644 --- a/examples/problem/particles.c +++ b/examples/problem/particles.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/particles.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/examples/problem/randtlr.c b/examples/problem/randtlr.c index b20136fc..271f1eaa 100644 --- a/examples/problem/randtlr.c +++ b/examples/problem/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/rbf_cube.c b/examples/problem/rbf_cube.c index c8b2120b..74ca8029 100644 --- a/examples/problem/rbf_cube.c +++ b/examples/problem/rbf_cube.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/rbf_cube.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/rbf_virus.c b/examples/problem/rbf_virus.c index 93c37f75..7910a4b2 100644 --- 
a/examples/problem/rbf_virus.c +++ b/examples/problem/rbf_virus.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/rbf_virus.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/spatial.c b/examples/problem/spatial.c index 63e62421..046ba35d 100644 --- a/examples/problem/spatial.c +++ b/examples/problem/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/spatial_bivariate.c b/examples/problem/spatial_bivariate.c index 5066a120..02e88d0e 100644 --- a/examples/problem/spatial_bivariate.c +++ b/examples/problem/spatial_bivariate.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2020-06-09 * */ diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 3a68910f..c4d21bdd 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Eduardo Gonzalez Fisher # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/include/common.h b/include/common.h index 8d1a37cb..90c4b846 100644 --- a/include/common.h +++ b/include/common.h @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2020-06-09 * */ diff --git a/include/control/init.h b/include/control/init.h index 3682c44e..3908bd99 100644 --- a/include/control/init.h +++ b/include/control/init.h @@ -5,13 +5,13 @@ * University of Science and Technology (KAUST) * * @file src/control/init.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-08-13 * */ //! Set number of backends and default one -#define BACKEND_NUM 6 +#define BACKEND_NUM 9 #define BACKEND_DEFAULT STARSH_BACKEND_SEQUENTIAL #ifdef OPENMP #undef BACKEND_DEFAULT @@ -59,6 +59,21 @@ struct #else {"MPI_STARPU", STARSH_BACKEND_NOTSUPPORTED}, #endif +#if defined(STARPU) && defined(KBLAS) + {"STARPU_KBLAS", STARSH_BACKEND_STARPU_KBLAS}, +#else + {"STARPU_KBLAS", STARSH_BACKEND_NOTSUPPORTED}, +#endif +#if defined(STARPU) && defined(CUDA) + {"STARPU_CUDA", STARSH_BACKEND_STARPU_CUDA}, +#else + {"STARPU_CUDA", STARSH_BACKEND_NOTSUPPORTED}, +#endif +#if defined(STARPU) && defined(MPI) && defined(KBLAS) + {"MPI_STARPU_KBLAS", STARSH_BACKEND_MPI_STARPU_KBLAS}, +#else + {"MPI_STARPU_KBLAS", STARSH_BACKEND_NOTSUPPORTED}, +#endif }; //! Set number of low-rank engines and default one @@ -137,9 +152,40 @@ static STARSH_blrm_approximate *(dlr_starpu_mpi[LRENGINE_NUM]) = #endif }; +//! Array of approximation functions for STARPU_KBLAS backend +static STARSH_blrm_approximate *(dlr_starpu_kblas[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(KBLAS) + starsh_blrm__dsdd_starpu, starsh_blrm__dsdd_starpu, + starsh_blrm__dqp3_starpu, starsh_blrm__drsdd_starpu_kblas2,//3_spatial, + starsh_blrm__drsdd_starpu_kblas3_spatial + #endif +}; + +//! 
Array of approximation functions for STARPU_CUDA backend +static STARSH_blrm_approximate *(dlr_starpu_cuda[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(CUDA) + starsh_blrm__dsdd_starpu, starsh_blrm__dsdd_starpu, + starsh_blrm__dqp3_starpu, starsh_blrm__drsdd_starpu_cuda, + starsh_blrm__drsdd_starpu_cuda + #endif +}; + +//! Array of approximation functions for MPI_STARPU_KBLAS backend +static STARSH_blrm_approximate *(dlr_starpu_mpi_kblas[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(MPI) && defined(KBLAS) + starsh_blrm__dsdd_mpi_starpu, starsh_blrm__dsdd_mpi_starpu, + starsh_blrm__dqp3_mpi_starpu, starsh_blrm__drsdd_mpi_starpu_kblas2, + starsh_blrm__drsdd_mpi_starpu_kblas2 + #endif +}; + //! Array of approximation functions, depending on backend static STARSH_blrm_approximate *(*dlr[BACKEND_NUM]) = { - dlr_seq, dlr_omp, dlr_mpi, dlr_mpi, dlr_starpu, dlr_starpu_mpi + dlr_seq, dlr_omp, dlr_mpi, dlr_mpi, dlr_starpu, dlr_starpu_mpi, + dlr_starpu_kblas, dlr_starpu_cuda, dlr_starpu_mpi_kblas, }; diff --git a/include/starsh-acoustic.h b/include/starsh-acoustic.h index 5eff879d..14fd5eb8 100644 --- a/include/starsh-acoustic.h +++ b/include/starsh-acoustic.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-acoustic.h - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-cauchy.h b/include/starsh-cauchy.h index ba129dc4..442ffa88 100644 --- a/include/starsh-cauchy.h +++ b/include/starsh-cauchy.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-minimal.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-constants.h b/include/starsh-constants.h index 091702f1..919aa643 100644 --- a/include/starsh-constants.h +++ b/include/starsh-constants.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file 
include/starsh-constants.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -30,8 +30,14 @@ enum STARSH_BACKEND //!< Hybrid MPI + OpenMP STARSH_BACKEND_STARPU = 4, //!< StarPU (without MPI) - STARSH_BACKEND_MPI_STARPU = 5 + STARSH_BACKEND_MPI_STARPU = 5, //!< StarPU (with MPI) + STARSH_BACKEND_STARPU_KBLAS = 6, + //!< StarPU+KBLAS (without MPI) + STARSH_BACKEND_STARPU_CUDA = 7, + //!< StarPU+CUDA (without MPI) + STARSH_BACKEND_MPI_STARPU_KBLAS = 8, + //!< MPI+StarPU+KBLAS }; //! Enum for low-rank engine (approximation technique) diff --git a/include/starsh-electrodynamics.h b/include/starsh-electrodynamics.h index d796627d..f3334180 100644 --- a/include/starsh-electrodynamics.h +++ b/include/starsh-electrodynamics.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-electrodynamics.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-electrostatics.h b/include/starsh-electrostatics.h index f6fd0b66..f3321c1a 100644 --- a/include/starsh-electrostatics.h +++ b/include/starsh-electrostatics.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-electrostatics.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-minimal.h b/include/starsh-minimal.h index 3fc66c78..7553a76c 100644 --- a/include/starsh-minimal.h +++ b/include/starsh-minimal.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-minimal.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-mpi-starpu-kblas.h b/include/starsh-mpi-starpu-kblas.h new file mode 100644 index 00000000..b9bf6cf2 --- /dev/null +++ b/include/starsh-mpi-starpu-kblas.h @@ -0,0 +1,67 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). 
All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-mpi-starpu-kblas.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_MPI_STARPU_KBLAS_H__ +#define __STARSH_MPI_STARPU_KBLAS_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU && MPI) + +/*! @addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// MATRIX-MATRIX MULTIPLICATION // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup matmul + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. 
+ +//int starsh_blrm__dmml_mpi_starpu(STARSH_blrm *matrix, int nrhs, double alpha, +// double *A, int lda, double beta, double *B, int ldb); +//int starsh_blrm__dmml_mpi_starpu_tlr(STARSH_blrm *matrix, int nrhs, +// double alpha, double *A, int lda, double beta, double *B, int ldb); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_MPI_STARPU_KBLAS_H__ + diff --git a/include/starsh-mpi-starpu.h b/include/starsh-mpi-starpu.h index 74803072..2ba20ec5 100644 --- a/include/starsh-mpi-starpu.h +++ b/include/starsh-mpi-starpu.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-mpi-starpu.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-mpi.h b/include/starsh-mpi.h index d73de6a2..d8d15465 100644 --- a/include/starsh-mpi.h +++ b/include/starsh-mpi.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-mpi.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-particles.h b/include/starsh-particles.h index a4ebcdf6..a7575fe1 100644 --- a/include/starsh-particles.h +++ b/include/starsh-particles.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-particles.h - * @version 1.3.0 + * @version 0.3.0 * @author Sameh Abdulah * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-randtlr.h b/include/starsh-randtlr.h index b52d512f..d1e723c5 100644 --- a/include/starsh-randtlr.h +++ b/include/starsh-randtlr.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-randtlr.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-rbf.h b/include/starsh-rbf.h index c27a6640..dc658260 100644 --- a/include/starsh-rbf.h +++ b/include/starsh-rbf.h @@ -5,7 +5,7 @@ * University of Science and 
Technology (KAUST) * * @file include/starsh-rbf.h - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-spatial-gsl.h b/include/starsh-spatial-gsl.h index ce404b03..357bdd8c 100644 --- a/include/starsh-spatial-gsl.h +++ b/include/starsh-spatial-gsl.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-spatial-gsl.h - * @version 1.3.0 + * @version 0.3.0 * @author sameh Abdulah * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-spatial.h b/include/starsh-spatial.h index 949ca5f7..df3f4e78 100644 --- a/include/starsh-spatial.h +++ b/include/starsh-spatial.h @@ -275,4 +275,10 @@ void starsh_ssdata_block_parsimonious2_kernel_2d_simd_gcd(int nrows, int ncols, // defined #include "starsh-spatial-gsl.h" +// Add function that copies data to GPU +#ifdef CUDA +void starsh_ssdata_togpu(STARSH_ssdata **dest, STARSH_ssdata *src); +void starsh_ssdata_free_gpu(STARSH_ssdata *data); +#endif // CUDA + #endif // __STARSH_SPATIAL_H__ diff --git a/include/starsh-starpu-cuda.h b/include/starsh-starpu-cuda.h new file mode 100644 index 00000000..c5320156 --- /dev/null +++ b/include/starsh-starpu-cuda.h @@ -0,0 +1,66 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-starpu-cuda.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_STARPU_CUDA_H__ +#define __STARSH_STARPU_CUDA_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU) + +/*! 
@addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_cuda(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// LOW-RANK ROUTINES FOR DENSE // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup lrdense + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//void starsh_dense_dlrsdd_starpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_cuda_cpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_cuda_gpu(void *buffers[], void *cl_arg); +//void starsh_dense_dlrqp3_starpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_cuda_cpu(void *buffers[], void *cl_arg); +//void starsh_dense_dgemm_starpu(void *buffers[], void *cl_arg); +//void starsh_dense_fake_init_starpu(void *buffers[], void *cl_arg); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_STARPU_CUDA_H__ + diff --git a/include/starsh-starpu-kblas.h b/include/starsh-starpu-kblas.h new file mode 100644 index 00000000..a55bd6fa --- /dev/null +++ b/include/starsh-starpu-kblas.h @@ -0,0 +1,74 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-starpu-kblas.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_STARPU_KBLAS_H__ +#define __STARSH_STARPU_KBLAS_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU) + +/*! @addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// LOW-RANK ROUTINES FOR DENSE // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup lrdense + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. 
+ +//void starsh_dense_dlrsdd_starpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas_cpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas_gpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas2_gpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas2_getrank(void *buffers[], void *cl_arg); +//void starsh_dense_dlrqp3_starpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas_cpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas2_cpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas3_gpu(void *buffers[], void *cl_arg); +//void starsh_dense_dgemm_starpu(void *buffers[], void *cl_arg); +//void starsh_dense_fake_init_starpu(void *buffers[], void *cl_arg); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_STARPU_KBLAS_H__ + diff --git a/include/starsh-starpu.h b/include/starsh-starpu.h index 7792e1e1..a943cb3e 100644 --- a/include/starsh-starpu.h +++ b/include/starsh-starpu.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-starpu.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/misc_scripts/code_generation/applications/particles/kernel_nd.py b/misc_scripts/code_generation/applications/particles/kernel_nd.py index 743bcd8c..3140cd83 100644 --- a/misc_scripts/code_generation/applications/particles/kernel_nd.py +++ b/misc_scripts/code_generation/applications/particles/kernel_nd.py @@ -6,7 +6,7 @@ University of Science and Technology (KAUST) @file misc_scripts/code_generation/applications/particles/kernel_nd.py - @version 1.3.0 + @version 0.3.0 @author Aleksandr Mikhalev @date 2017-08-22 """ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fce217b2..4582b2c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file 
src/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 @@ -40,6 +40,14 @@ if(OPENMP) endif(OPENMP) if(STARPU) target_link_libraries(starsh PUBLIC ${STARPU_LIBRARIES}) + if(KBLAS) + target_link_libraries(starsh PUBLIC cublas_static cudart_static + culibos cusparse_static stdc++ kblas-gpu dl rt) + endif(KBLAS) + if(CUDA) + target_link_libraries(starsh PUBLIC cublas_static cudart_static + cusolver_static curand_static culibos stdc++ dl rt) + endif(CUDA) endif(STARPU) if(GSL_FOUND) target_link_libraries(starsh PUBLIC ${GSL_LIBRARIES}) diff --git a/src/applications/CMakeLists.txt b/src/applications/CMakeLists.txt index 08b1f8c2..a9dcfdca 100644 --- a/src/applications/CMakeLists.txt +++ b/src/applications/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/src/applications/cauchy.c b/src/applications/cauchy.c index 98b0e42b..f402e632 100644 --- a/src/applications/cauchy.c +++ b/src/applications/cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/common.c b/src/applications/common.c index e688c19a..6de86173 100644 --- a/src/applications/common.c +++ b/src/applications/common.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/common.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/electrodynamics.c b/src/applications/electrodynamics.c index b8a873b3..0966c226 100644 --- a/src/applications/electrodynamics.c +++ b/src/applications/electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/electrodynamics.c - * @version 1.3.0 + * 
@version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrodynamics/CMakeLists.txt b/src/applications/electrodynamics/CMakeLists.txt index a0f1f485..5cd4b813 100644 --- a/src/applications/electrodynamics/CMakeLists.txt +++ b/src/applications/electrodynamics/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/electrodynamics/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/applications/electrodynamics/kernel_cos.c b/src/applications/electrodynamics/kernel_cos.c index 54062870..8ddc75ee 100644 --- a/src/applications/electrodynamics/kernel_cos.c +++ b/src/applications/electrodynamics/kernel_cos.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/electrodynamics/kernel_cos.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrodynamics/kernel_sin.c b/src/applications/electrodynamics/kernel_sin.c index d8d9114e..86fcddea 100644 --- a/src/applications/electrodynamics/kernel_sin.c +++ b/src/applications/electrodynamics/kernel_sin.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. 
* * @file src/applications/electrodynamics/kernel_sin.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrostatics.c b/src/applications/electrostatics.c index 3158f740..475f221e 100644 --- a/src/applications/electrostatics.c +++ b/src/applications/electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrostatics/CMakeLists.txt b/src/applications/electrostatics/CMakeLists.txt index bbeb1942..88a35453 100644 --- a/src/applications/electrostatics/CMakeLists.txt +++ b/src/applications/electrostatics/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/electrostatics/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/applications/electrostatics/kernel_coulomb_potential.c b/src/applications/electrostatics/kernel_coulomb_potential.c index d5eb6c99..9d1930df 100644 --- a/src/applications/electrostatics/kernel_coulomb_potential.c +++ b/src/applications/electrostatics/kernel_coulomb_potential.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/electrostatics/kernel_coulomb_potential.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/mesh_deformation/cube.c b/src/applications/mesh_deformation/cube.c index 805eaca0..c56c98f0 100644 --- a/src/applications/mesh_deformation/cube.c +++ b/src/applications/mesh_deformation/cube.c @@ -11,7 +11,7 @@ * STARS-H, simply do substitutions yourself. 
* * @file src/applications/mesh_deformation/cube.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @date 2020-06-09 */ diff --git a/src/applications/mesh_deformation/kernels_rbf.c b/src/applications/mesh_deformation/kernels_rbf.c index f8c4927f..9e6d7d0c 100644 --- a/src/applications/mesh_deformation/kernels_rbf.c +++ b/src/applications/mesh_deformation/kernels_rbf.c @@ -11,7 +11,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/mesh_deformation/cube.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @date 2020-06-09 */ diff --git a/src/applications/minimal.c b/src/applications/minimal.c index 9bce1c4d..5768d34c 100644 --- a/src/applications/minimal.c +++ b/src/applications/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/randtlr.c b/src/applications/randtlr.c index c03ac848..039a3440 100644 --- a/src/applications/randtlr.c +++ b/src/applications/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/spatial.c b/src/applications/spatial.c index adf44505..d68ed5be 100644 --- a/src/applications/spatial.c +++ b/src/applications/spatial.c @@ -1828,3 +1828,39 @@ void starsh_ssdata_block_parsimonious2_kernel_2d_simd(int nrows, int ncols, } #endif // GSL +#ifdef CUDA +void starsh_ssdata_togpu(STARSH_ssdata **dest, STARSH_ssdata *src) +{ + void *dest_points; + size_t points_size = sizeof(double) * src->particles.ndim * + src->particles.count; + //printf("COPY to GPU: %zu bytes\n", points_size); + cudaError_t err = cudaSuccess; + err = cudaMalloc(&dest_points, points_size); + if(err != cudaSuccess) + printf("cudaMalloc error\n"); + //printf("points address: %p\n", 
dest_points); + err = cudaMemcpy(dest_points, src->particles.point, points_size, + cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("cudaMemcpy error\n"); + STARSH_ssdata tmp; + tmp = *src; + tmp.particles.point = dest_points; + err = cudaMalloc(dest, sizeof(STARSH_ssdata)); + if(err != cudaSuccess) + printf("cudaMalloc error\n"); + err = cudaMemcpy(*dest, &tmp, sizeof(STARSH_ssdata), cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("cudaMemcpy error\n"); + //printf("Succesfully copied into GPU\n"); +} + +void starsh_ssdata_free_gpu(STARSH_ssdata *data) +{ + STARSH_ssdata tmp; + cudaMemcpy(&tmp, data, sizeof(STARSH_ssdata), cudaMemcpyDeviceToHost); + cudaFree(tmp.particles.point); + cudaFree(data); +} +#endif // CUDA diff --git a/src/applications/spatial/CMakeLists.txt b/src/applications/spatial/CMakeLists.txt index d917cd21..8645296b 100644 --- a/src/applications/spatial/CMakeLists.txt +++ b/src/applications/spatial/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/spatial/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/CMakeLists.txt b/src/backends/CMakeLists.txt index c51fda81..85d56b1a 100644 --- a/src/backends/CMakeLists.txt +++ b/src/backends/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 # Add sequential code @@ -19,11 +19,28 @@ set(BACKENDS_OBJECTS $) # List StarPU sources for docs or build if(STARPU OR DOCS STREQUAL "FULL") add_subdirectory("starpu") + if(KBLAS OR DOCS STREQUAL "FULL") + add_subdirectory("starpu_kblas") + add_subdirectory("starpu_kblas2") + add_subdirectory("starpu_kblas3_spatial") + endif() + if(CUDA OR DOCS STREQUAL "FULL") + add_subdirectory("starpu_cuda") + endif() endif() # Add StarPU backend if(STARPU) list(APPEND BACKENDS_OBJECTS $) 
+ if(KBLAS) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS + $) + endif() + if(CUDA) + list(APPEND BACKENDS_OBJECTS $) + endif() endif() # List OpenMP sources for docs or build @@ -46,7 +63,7 @@ if(MPI) list(APPEND BACKENDS_OBJECTS $) endif() -# List MPI sources for docs or build +# List MPI+STARPU sources for docs or build if((MPI AND STARPU) OR DOCS STREQUAL "FULL") add_subdirectory("mpi_starpu") endif() @@ -56,6 +73,21 @@ if(MPI AND STARPU) list(APPEND BACKENDS_OBJECTS $) endif() +# List MPI+STARPU+KBLAS sources for docs or build +if((MPI AND STARPU AND KBLAS) OR DOCS STREQUAL "FULL") + add_subdirectory("mpi_starpu_kblas") + add_subdirectory("mpi_starpu_kblas2") + add_subdirectory("mpi_starpu_kblas3_spatial") +endif() + +# Add MPI+StarPU+KBLAS backend +if(MPI AND STARPU AND KBLAS) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS + $) +endif() + # Move all selected backends to parent directory set(BACKENDS_OBJECTS ${BACKENDS_OBJECTS} PARENT_SCOPE) diff --git a/src/backends/mpi/CMakeLists.txt b/src/backends/mpi/CMakeLists.txt index cb64cb0c..bc4d26aa 100644 --- a/src/backends/mpi/CMakeLists.txt +++ b/src/backends/mpi/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi/blrm/CMakeLists.txt b/src/backends/mpi/blrm/CMakeLists.txt index 7b433f34..6b3344c3 100644 --- a/src/backends/mpi/blrm/CMakeLists.txt +++ b/src/backends/mpi/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi/blrm/dfe.c b/src/backends/mpi/blrm/dfe.c index 7f6c00e2..d568cdb4 100644 --- a/src/backends/mpi/blrm/dfe.c +++ 
b/src/backends/mpi/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -144,6 +144,8 @@ double starsh_blrm__dfe_mpi(STARSH_blrm *matrix) value[1] *= value[1]; double mpi_value[2] = {0, 0}; MPI_Allreduce(&value, &mpi_value, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); return sqrt(mpi_value[0]/mpi_value[1]); } diff --git a/src/backends/mpi/blrm/dmml.c b/src/backends/mpi/blrm/dmml.c index c0a2d1ca..00da8c81 100644 --- a/src/backends/mpi/blrm/dmml.c +++ b/src/backends/mpi/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dmml.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/dna.c b/src/backends/mpi/blrm/dna.c index e0cb0642..db6550ca 100644 --- a/src/backends/mpi/blrm/dna.c +++ b/src/backends/mpi/blrm/dna.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dna.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/dqp3.c b/src/backends/mpi/blrm/dqp3.c index ed088c8f..7eff81ec 100644 --- a/src/backends/mpi/blrm/dqp3.c +++ b/src/backends/mpi/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/drsdd.c b/src/backends/mpi/blrm/drsdd.c index 3309239c..4a5ee9a1 100644 --- a/src/backends/mpi/blrm/drsdd.c +++ b/src/backends/mpi/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ 
-287,7 +287,7 @@ int starsh_blrm__drsdd_mpi(STARSH_blrm **matrix, STARSH_blrf *format, starsh_blrf_free(F2); } // Compute near-field blocks if needed - if(onfly == 0 && new_nblocks_near > 0) + if(onfly == 0 && new_nblocks_near_local > 0) { STARSH_MALLOC(near_D, new_nblocks_near_local); size_t size_D = 0; diff --git a/src/backends/mpi/blrm/dsdd.c b/src/backends/mpi/blrm/dsdd.c index b2bc8b35..be370e48 100644 --- a/src/backends/mpi/blrm/dsdd.c +++ b/src/backends/mpi/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/CMakeLists.txt b/src/backends/mpi_starpu/CMakeLists.txt index 2f66051e..92531ec6 100644 --- a/src/backends/mpi_starpu/CMakeLists.txt +++ b/src/backends/mpi_starpu/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi_starpu/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi_starpu/blrm/CMakeLists.txt b/src/backends/mpi_starpu/blrm/CMakeLists.txt index 9d3b9845..0b72341b 100644 --- a/src/backends/mpi_starpu/blrm/CMakeLists.txt +++ b/src/backends/mpi_starpu/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi_starpu/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi_starpu/blrm/dmml.c b/src/backends/mpi_starpu/blrm/dmml.c index 3f8d3bdc..0d3a2845 100644 --- a/src/backends/mpi_starpu/blrm/dmml.c +++ b/src/backends/mpi_starpu/blrm/dmml.c @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/blrm/dqp3.c b/src/backends/mpi_starpu/blrm/dqp3.c index d5109645..4ee38d45 100644 --- a/src/backends/mpi_starpu/blrm/dqp3.c +++ b/src/backends/mpi_starpu/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/blrm/drsdd.c b/src/backends/mpi_starpu/blrm/drsdd.c index da761e7c..1e9b82d6 100644 --- a/src/backends/mpi_starpu/blrm/drsdd.c +++ b/src/backends/mpi_starpu/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -283,7 +283,7 @@ int starsh_blrm__drsdd_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, starsh_blrf_free(F2); } // Compute near-field blocks if needed - if(onfly == 0 && new_nblocks_near > 0) + if(onfly == 0 && new_nblocks_near_local > 0) { STARSH_int nbi_value[new_nblocks_near_local]; starpu_data_handle_t D_handle[new_nblocks_near_local]; diff --git a/src/backends/mpi_starpu/blrm/dsdd.c b/src/backends/mpi_starpu/blrm/dsdd.c index c389e5a2..659a07d4 100644 --- a/src/backends/mpi_starpu/blrm/dsdd.c +++ b/src/backends/mpi_starpu/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu_kblas/CMakeLists.txt b/src/backends/mpi_starpu_kblas/CMakeLists.txt new file mode 100644 index 00000000..7bfec5b1 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas PROPERTIES COMPILE_FLAGS + "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas/blrm/drsdd.c b/src/backends/mpi_starpu_kblas/blrm/drsdd.c new file mode 100644 index 00000000..a55560a9 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/blrm/drsdd.c @@ -0,0 +1,682 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/mpi_starpu_kblas/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-mpi-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + //double time0 = MPI_Wtime(); + //cublasCreate(&cublas_handles[id]); + //double time1 = MPI_Wtime(); + kblasCreate(&kblas_handles[id]); + //double timek = MPI_Wtime(); + //printf("CUBLAS: %f, KBLAS: %f\n", time1-time0, timek-time1); + //return; + kblasSetStream(kblas_handles[id], stream); + //double time2 = MPI_Wtime(); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + //double time3 = MPI_Wtime(); + kblasAllocateWorkspace(kblas_handles[id]); + //double time4 = MPI_Wtime(); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + //double time5 = MPI_Wtime(); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + //double time6 = MPI_Wtime(); + work[id] = malloc(nsamples*maxbatch*sizeof(double)); + //double time7 = MPI_Wtime(); + iwork[id] = malloc(maxbatch*sizeof(int)); + //double time8 = MPI_Wtime(); + cudaStreamSynchronize(stream); + //double time9 = MPI_Wtime(); + //printf("KBLAS INIT: %f %f %f %f %f\n", time1-time0, time2-time1, time3-time2, time4-time3, time5-time4); + //printf("KBLAS INIT: %f %f %f %f\n", time6-time5, 
time7-time6, time8-time7, time9-time8); +} + +static void init_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + work[id] = malloc(lwork*sizeof(*work[0])); + iwork[id] = malloc(liwork*sizeof(*iwork[0])); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + double **work; + int **iwork; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + free(work[id]); + free(iwork[id]); +} + +static void deinit_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + free(work[id]); + free(iwork[id]); +} + +static void empty_cpu_func(void *buffer[], void *cl_arg) +{ +} + +void starsh_dense_kernel_mpi_starpu_kblas_cpu_far(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_far[k*2]; + int j = F->block_far[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +void starsh_dense_kernel_mpi_starpu_kblas_cpu_near(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_near[k*2]; + int j = F->block_near[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +int starsh_blrm__drsdd_mpi_starpu_kblas(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. 
+ * @ingroup blrm + * */ +{ + double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + double *work[workers]; + int *iwork[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + double **wwork = 
work; + int **wiwork = iwork; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu, *args_cpu; + size_t args_gpu_size = 0; + size_t args_cpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + int batch_size = 100; + // Ceil number of batches + int nbatches_local = (nblocks_far_local-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + // Get size of temporary arrays + int lwork = nb; + int lwork_sdd = (4*mn+7) * mn; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn*(3*nb+mn+1) + nb*nb; + int liwork = 8 * mn; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_codelet_pack_args(&args_cpu, &args_cpu_size, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &liwork, sizeof(liwork), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(init_starpu_cpu, args_cpu, STARPU_CPU); + MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel_far = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas_cpu_far}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct 
starpu_codelet codelet_kernel_near = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas_cpu_near}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_cpu = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_gpu = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_get_data_back_to_cpu = + { + .cpu_funcs = {empty_cpu_func}, + .nbuffers = 1, + .modes = {STARPU_R}, + }; + // Select if ONLY cpu or gpu + if(getenv("STARSH_KBLAS_CPU")) + codelet_lowrank = codelet_lowrank_cpu; + else if(getenv("STARSH_KBLAS_GPU")) + codelet_lowrank = codelet_lowrank_gpu; + starpu_data_handle_t rank_handle[nbatches_local]; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + 
STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + STARSH_int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_far_local + lbi*batch_size), + this_batch_size, sizeof(*block_far_local)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("REGISTER DATA IN: %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_far, + 
STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(index_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + STARSH_int nbatches_once = nbatches_local; + for(STARSH_int batch_start = 0; batch_start < nbatches_local; + batch_start += nbatches_once) + { + STARSH_int batch_end = batch_start + nbatches_once; + if(batch_end > nbatches_local) + batch_end = nbatches_local; + for(bi = batch_start; bi < batch_end; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far_local - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_R, D_handle[bi], + STARPU_SCRATCH, Dcopy_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + 0); + starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, U_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, V_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, rank_handle[bi], + 0); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + 
starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + starpu_task_wait_for_all(); + } + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = 
nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, 
block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + int shape[] = {nb, nb}; + // For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_near_local + lbi*batch_size), + this_batch_size, sizeof(*block_near_local)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_near, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + 
// Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(deinit_starpu_cpu, args_cpu, STARPU_CPU); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/mpi_starpu_kblas2/CMakeLists.txt b/src/backends/mpi_starpu_kblas2/CMakeLists.txt new file mode 100644 index 00000000..3326eae6 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). 
All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas2 OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas2 PROPERTIES COMPILE_FLAGS + "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas2/blrm/drsdd.c b/src/backends/mpi_starpu_kblas2/blrm/drsdd.c new file mode 100644 index 00000000..f209ebf0 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/blrm/drsdd.c @@ -0,0 +1,600 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/mpi_starpu_kblas/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-mpi-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); +} + +static void starsh_dense_kernel_mpi_starpu_kblas2_cpu_far(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_far[k*2]; + int j = F->block_far[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +static void starsh_dense_kernel_mpi_starpu_kblas2_cpu_near(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_near[k*2]; + int j = F->block_near[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +int starsh_blrm__drsdd_mpi_starpu_kblas2(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. 
+ * @ingroup blrm + * */ +{ + //printf("MPIKBLAS2\n"); + //double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + //printf("MAIN: 
%p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 100; + if(env_var) + batch_size = atoi(env_var); + //printf("MPIKBLAS2: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches_local = (nblocks_far_local-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel_far = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas2_cpu_far}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_kernel_near = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas2_cpu_near}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 4, + .modes = 
{STARPU_R, STARPU_R, STARPU_R, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + starpu_data_handle_t S_handle[nbatches_local]; + starpu_data_handle_t rank_handle[nbatches_local]; + // Init buffers to store low-rank factors of far-field blocks if needed + MPI_Barrier(MPI_COMM_WORLD); + double time0 = MPI_Wtime(); + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + size_t size_D = nblocks_far_local * nb * nb; + size_t size_S = nblocks_far_local * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + starpu_memory_pin(alloc_U, size_U*sizeof(double)); + starpu_memory_pin(alloc_V, size_V*sizeof(double)); + starpu_malloc(&alloc_S, size_S*sizeof(double)); + starpu_malloc(&alloc_D, size_D*sizeof(double)); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + STARSH_int offset_S = lbi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + double *S = alloc_S + offset_S; + STARSH_int offset_D = lbi * batch_size * nb * nb; + double *D = alloc_D + offset_D; + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = 
this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + STARSH_int S_size = this_batch_size * mn; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(double)); + starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_far_local + lbi*batch_size), + this_batch_size, sizeof(*block_far_local)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + starpu_vector_data_register(S_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(S), S_size, sizeof(double)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("MPIKBLAS2: pin memory in %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_far, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[lbi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, 
&cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[lbi], + STARPU_SCRATCH, Dcopy_handle[lbi], + STARPU_W, U_handle[lbi], + STARPU_W, V_handle[lbi], + STARPU_W, S_handle[lbi], + STARPU_PRIORITY, 0, + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(Dcopy_handle[lbi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, U_handle[lbi], + STARPU_R, V_handle[lbi], + STARPU_R, S_handle[lbi], + STARPU_W, rank_handle[lbi], + STARPU_PRIORITY, -1, + 0); + starpu_data_unregister_submit(rank_handle[lbi]); + starpu_data_unregister_submit(U_handle[lbi]); + starpu_data_unregister_submit(V_handle[lbi]); + starpu_data_unregister_submit(S_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + if(nbatches_local > 0) + { + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + starpu_free(alloc_D); + starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + starpu_free(alloc_S); + } + MPI_Barrier(MPI_COMM_WORLD); + //if(mpi_rank == 0) + // printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // MPI_Wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + //far_rank[lbi] = -1; + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work 
normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 
0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + int shape[] = {nb, nb}; + // 
For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_near_local + lbi*batch_size), + this_batch_size, sizeof(*block_near_local)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_near, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + 
STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt b/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt new file mode 100644 index 00000000..bf4bac8b --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas3_spatial OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas3_spatial PROPERTIES + COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c b/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c new file mode 100644 index 00000000..7a363f2e --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c @@ -0,0 +1,630 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * 
+ * STARS-H is a software package, provided by King Abdullah 
+ * University of Science and Technology (KAUST) 
+ * 
+ * @file src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c 
+ * @version 0.3.0 
+ * @author Aleksandr Mikhalev 
+ * @date 2017-11-07 
+ * */ 
+ 
+#include "common.h" 
+#include "starsh.h" 
+#include "starsh-starpu-kblas.h" 
+#include "starsh-mpi-starpu-kblas.h" 
+#include "starsh-spatial.h" 
+#include 
+#include 
+#include 
+#include "batch_rand.h" 
+#include 
+#include 
+ 
+static void init_starpu_kblas(void *args) 
+{ 
+ cublasHandle_t *cublas_handles; 
+ kblasHandle_t *kblas_handles; 
+ kblasRandState_t *kblas_states; 
+ STARSH_ssdata **data_gpu; 
+ STARSH_ssdata *data_cpu; 
+ //double time0 = MPI_Wtime(); 
+ cudaStream_t stream = starpu_cuda_get_local_stream(); 
+ int nb, nsamples, maxbatch; 
+ starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, 
+ &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); 
+ int id = starpu_worker_get_id(); 
+ cublasStatus_t status; 
+ //printf("unpack_args: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ kblasCreate(&kblas_handles[id]); 
+ //printf("kblasCreate: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ kblasSetStream(kblas_handles[id], stream); 
+ kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); 
+ kblasAllocateWorkspace(kblas_handles[id]); 
+ //printf("kblasAllocateWorkspace: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); 
+ kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); 
+ starsh_ssdata_togpu(&data_gpu[id], data_cpu); 
+ cudaStreamSynchronize(stream); 
+ //printf("starsh_ssdata_togpu: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+} 
+ 
+static void deinit_starpu_kblas(void *args) 
+{ 
+ int nb, nsamples, maxbatch; 
+ cublasHandle_t *cublas_handles; 
+ kblasHandle_t *kblas_handles; 
+ kblasRandState_t *kblas_states; 
+ STARSH_ssdata **data_gpu; 
+ STARSH_ssdata *data_cpu; 
+ starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + starsh_ssdata_free_gpu(data_gpu[id]); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); +} + +static void starsh_dense_dlrrsdd_starpu_kblas3_copy(void *buffers[], void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Temporary holder for indexes of tiles + STARSH_int *tile_index = NULL; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + //if(mpi_rank == 0) + // printf("MPIKBLAS3\n"); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + STARSH_ssdata *data_gpu_array[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + STARSH_ssdata **data_gpu = data_gpu_array; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches_local = 
(nblocks_far_local-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &RD, sizeof(RD), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cuda_funcs = {starsh_dense_kernel_starpu_kblas3_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas3_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches_local]; + //starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + 
starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + //starpu_data_handle_t S_handle[nbatches_local]; + starpu_data_handle_t rank_handle[nbatches_local]; + starpu_data_handle_t D_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + // Init buffers to store low-rank factors of far-field blocks if needed + MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: init in %f seconds\n", time0-time_start); + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far_local * nb * nb; + //size_t size_S = nblocks_far_local * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + starpu_malloc(&tile_index, 2*nblocks_far_local*sizeof(*tile_index)); + for(bi = 0; bi < nblocks_far_local; ++bi) + { + STARSH_int ind = block_far_local[bi]; + tile_index[2*bi] = block_far[2*ind]; + tile_index[2*bi+1] = block_far[2*ind+1]; + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + //STARSH_int offset_S = lbi * batch_size * mn; + double *U = alloc_U + offset; + double *V = 
alloc_V + offset; + //double *S = alloc_S + offset_S; + //STARSH_int offset_D = lbi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int S_size = this_batch_size * mn; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + //starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + // (uintptr_t)(D), D_size, sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(tile_index + 2*lbi*batch_size), + 2*this_batch_size, sizeof(*tile_index)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + //starpu_vector_data_register(S_handle+lbi, STARPU_MAIN_RAM, + // (uintptr_t)(S), S_size, sizeof(double)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 
0) + // printf("MPIKBLAS3: Register data in %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi % nworkers_gpu], + STARPU_R, index_handle[lbi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[lbi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[lbi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[lbi % nworkers_gpu], + STARPU_W, tmp_U_handle[lbi % nworkers_gpu], + STARPU_W, tmp_V_handle[lbi % nworkers_gpu], + STARPU_W, tmp_S_handle[lbi % nworkers_gpu], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(D_handle[lbi]); + //starpu_data_unregister_submit(Dcopy_handle[lbi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[lbi % nworkers_gpu], + STARPU_R, tmp_V_handle[lbi % nworkers_gpu], + STARPU_R, tmp_S_handle[lbi % nworkers_gpu], + STARPU_W, rank_handle[lbi], + STARPU_W, U_handle[lbi], + STARPU_W, V_handle[lbi], + STARPU_PRIORITY, -1, + 0); 
+ starpu_data_unregister_submit(rank_handle[lbi]); + starpu_data_unregister_submit(U_handle[lbi]); + starpu_data_unregister_submit(V_handle[lbi]); + //starpu_data_unregister_submit(S_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + if(nbatches_local > 0) + { + //size_t size_U = nblocks_far_local * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + starpu_free(tile_index); + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + } + //MPI_Barrier(MPI_COMM_WORLD); + //if(mpi_rank == 0) + // printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // MPI_Wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + //far_rank[lbi] = -1; + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + 
MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 
0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_malloc(&tile_index, 2*new_nblocks_near_local*sizeof(*tile_index)); + int shape[] = {nb, nb}; + // For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + STARSH_int ind = block_near_local[lbi]; + tile_index[lbi*2] = block_near[2*ind]; + tile_index[lbi*2+1] = block_near[2*ind+1]; + } + for(lbi = 0; 
lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(tile_index + 2*lbi*batch_size), + 2*this_batch_size, sizeof(*tile_index)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + starpu_free(tile_index); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + 
if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //if(mpi_rank == 0) + // printf("FINISH NEAR-FIELD TILES: %f seconds\n", MPI_Wtime()-time0); + //time0 = MPI_Wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: finalize in %f seconds\n", MPI_Wtime()-time0); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/openmp/CMakeLists.txt b/src/backends/openmp/CMakeLists.txt index d7146cb6..85543d42 100644 --- a/src/backends/openmp/CMakeLists.txt +++ b/src/backends/openmp/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/openmp/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/openmp/blrm/CMakeLists.txt b/src/backends/openmp/blrm/CMakeLists.txt index 08ee8d6c..1795d6dd 100644 --- a/src/backends/openmp/blrm/CMakeLists.txt +++ b/src/backends/openmp/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/openmp/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/openmp/blrm/dfe.c b/src/backends/openmp/blrm/dfe.c index 3fb217cd..aae15630 100644 --- a/src/backends/openmp/blrm/dfe.c +++ b/src/backends/openmp/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file 
src/backends/openmp/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/dmml.c b/src/backends/openmp/blrm/dmml.c index 5764d500..2af2e787 100644 --- a/src/backends/openmp/blrm/dmml.c +++ b/src/backends/openmp/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dmml.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/dqp3.c b/src/backends/openmp/blrm/dqp3.c index c0b25d91..6fa05cbf 100644 --- a/src/backends/openmp/blrm/dqp3.c +++ b/src/backends/openmp/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/drsdd.c b/src/backends/openmp/blrm/drsdd.c index cdc47be3..f60a9766 100644 --- a/src/backends/openmp/blrm/drsdd.c +++ b/src/backends/openmp/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -229,7 +229,7 @@ int starsh_blrm__drsdd_omp(STARSH_blrm **matrix, STARSH_blrf *format, } STARSH_MALLOC(alloc_D, size_D); // For each near-field block compute its elements - #pragma omp parallel for schedule(dynamic,1) + //#pragma omp parallel for schedule(dynamic,1) for(bi = 0; bi < new_nblocks_near; bi++) { // Get indexes of corresponding block row and block column diff --git a/src/backends/openmp/blrm/dsdd.c b/src/backends/openmp/blrm/dsdd.c index 8d52d3f9..56c2133b 100644 --- a/src/backends/openmp/blrm/dsdd.c +++ b/src/backends/openmp/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr 
Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/CMakeLists.txt b/src/backends/sequential/CMakeLists.txt index 518a790e..ab0d949d 100644 --- a/src/backends/sequential/CMakeLists.txt +++ b/src/backends/sequential/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/blrm/CMakeLists.txt b/src/backends/sequential/blrm/CMakeLists.txt index 2d7977e1..08146f99 100644 --- a/src/backends/sequential/blrm/CMakeLists.txt +++ b/src/backends/sequential/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/blrm/dca.c b/src/backends/sequential/blrm/dca.c index 3b471b52..b1517859 100644 --- a/src/backends/sequential/blrm/dca.c +++ b/src/backends/sequential/blrm/dca.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dca.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dfe.c b/src/backends/sequential/blrm/dfe.c index c9ab9ba6..56532bce 100644 --- a/src/backends/sequential/blrm/dfe.c +++ b/src/backends/sequential/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dmml.c b/src/backends/sequential/blrm/dmml.c index ab9c3956..289894e4 100644 --- a/src/backends/sequential/blrm/dmml.c +++ b/src/backends/sequential/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dmml.c - * 
@version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dqp3.c b/src/backends/sequential/blrm/dqp3.c index 85da47b5..6ecaa29b 100644 --- a/src/backends/sequential/blrm/dqp3.c +++ b/src/backends/sequential/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/drsdd.c b/src/backends/sequential/blrm/drsdd.c index 33d89fc1..d02cd29e 100644 --- a/src/backends/sequential/blrm/drsdd.c +++ b/src/backends/sequential/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dsdd.c b/src/backends/sequential/blrm/dsdd.c index 1b90d5f0..38827e14 100644 --- a/src/backends/sequential/blrm/dsdd.c +++ b/src/backends/sequential/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/CMakeLists.txt b/src/backends/sequential/dense/CMakeLists.txt index 3d775e68..ec47478e 100644 --- a/src/backends/sequential/dense/CMakeLists.txt +++ b/src/backends/sequential/dense/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/dense/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/dense/dna.c b/src/backends/sequential/dense/dna.c index e9023c9d..f50a1fce 100644 --- a/src/backends/sequential/dense/dna.c +++ b/src/backends/sequential/dense/dna.c @@ -5,7 +5,7 @@ * University of Science and Technology 
(KAUST) * * @file src/backends/sequential/dense/dna.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/dqp3.c b/src/backends/sequential/dense/dqp3.c index 43729a4c..32ba5056 100644 --- a/src/backends/sequential/dense/dqp3.c +++ b/src/backends/sequential/dense/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/drsdd.c b/src/backends/sequential/dense/drsdd.c index 4dfd82ed..6b711be0 100644 --- a/src/backends/sequential/dense/drsdd.c +++ b/src/backends/sequential/dense/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -56,7 +56,7 @@ void starsh_dense_dlrrsdd(int nrows, int ncols, double *D, int ldD, double *U, int svdqr_lwork = lwork-(size_t)mn2*(2*ncols+nrows+mn2+1); int iseed[4] = {0, 0, 0, 1}; // Generate random matrix X - LAPACKE_dlarnv_work(3, iseed, nrows*mn2, X); + LAPACKE_dlarnv_work(3, iseed, ncols*mn2, X); // Multiply by random matrix cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, nrows, mn2, ncols, 1.0, D, ldD, X, ncols, 0.0, Q, nrows); diff --git a/src/backends/sequential/dense/dsdd.c b/src/backends/sequential/dense/dsdd.c index 48e1e25c..72df2952 100644 --- a/src/backends/sequential/dense/dsdd.c +++ b/src/backends/sequential/dense/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/dsvfr.c b/src/backends/sequential/dense/dsvfr.c index a700bd39..ae6f376a 100644 --- a/src/backends/sequential/dense/dsvfr.c +++ 
b/src/backends/sequential/dense/dsvfr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dsvfr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/zrsdd.c b/src/backends/sequential/dense/zrsdd.c index 1f19275a..a11b8bdd 100644 --- a/src/backends/sequential/dense/zrsdd.c +++ b/src/backends/sequential/dense/zrsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/zrsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @author Kadir Akbudak * @author Aleksandr Mikhalev diff --git a/src/backends/starpu/CMakeLists.txt b/src/backends/starpu/CMakeLists.txt index 7cb9a3f8..1c071bcb 100644 --- a/src/backends/starpu/CMakeLists.txt +++ b/src/backends/starpu/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/blrm/CMakeLists.txt b/src/backends/starpu/blrm/CMakeLists.txt index 8b1a57d1..1b2b738e 100644 --- a/src/backends/starpu/blrm/CMakeLists.txt +++ b/src/backends/starpu/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/blrm/dmml.c b/src/backends/starpu/blrm/dmml.c index cccd7ed0..180c1947 100644 --- a/src/backends/starpu/blrm/dmml.c +++ b/src/backends/starpu/blrm/dmml.c @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/blrm/dqp3.c b/src/backends/starpu/blrm/dqp3.c index e9f03a8d..37940b16 100644 --- a/src/backends/starpu/blrm/dqp3.c +++ b/src/backends/starpu/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/blrm/drsdd.c b/src/backends/starpu/blrm/drsdd.c index 9009215c..1d8bb8bc 100644 --- a/src/backends/starpu/blrm/drsdd.c +++ b/src/backends/starpu/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -27,6 +27,7 @@ int starsh_blrm__drsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, * @ingroup blrm * */ { + printf("IN STARPU (NO KBLAS)\n"); STARSH_blrf *F = format; STARSH_problem *P = F->problem; STARSH_kernel *kernel = P->kernel; @@ -278,8 +279,12 @@ int starsh_blrm__drsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, bj++; else { - far_U[bi-bj] = far_U[bi]; - far_V[bi-bj] = far_V[bi]; + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); far_rank[bi-bj] = far_rank[bi]; } } diff --git a/src/backends/starpu/blrm/dsdd.c b/src/backends/starpu/blrm/dsdd.c index dd57d10f..587e08cf 100644 --- a/src/backends/starpu/blrm/dsdd.c +++ b/src/backends/starpu/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git 
a/src/backends/starpu/dense/CMakeLists.txt b/src/backends/starpu/dense/CMakeLists.txt index a53c48f9..a86f5f54 100644 --- a/src/backends/starpu/dense/CMakeLists.txt +++ b/src/backends/starpu/dense/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/dense/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/dense/dgemm.c b/src/backends/starpu/dense/dgemm.c index 88057522..3fc81621 100644 --- a/src/backends/starpu/dense/dgemm.c +++ b/src/backends/starpu/dense/dgemm.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dgemm.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/dqp3.c b/src/backends/starpu/dense/dqp3.c index 358444a5..a83fd86a 100644 --- a/src/backends/starpu/dense/dqp3.c +++ b/src/backends/starpu/dense/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/drsdd.c b/src/backends/starpu/dense/drsdd.c index 7a3f665a..13683c93 100644 --- a/src/backends/starpu/dense/drsdd.c +++ b/src/backends/starpu/dense/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/dsdd.c b/src/backends/starpu/dense/dsdd.c index bc6857fd..9314d6fd 100644 --- a/src/backends/starpu/dense/dsdd.c +++ b/src/backends/starpu/dense/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git 
a/src/backends/starpu/dense/fake_init.c b/src/backends/starpu/dense/fake_init.c index 256011d1..44a23a5a 100644 --- a/src/backends/starpu/dense/fake_init.c +++ b/src/backends/starpu/dense/fake_init.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/fake_init.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/kernel.c b/src/backends/starpu/dense/kernel.c index b36aeb2a..c0b9134a 100644 --- a/src/backends/starpu/dense/kernel.c +++ b/src/backends/starpu/dense/kernel.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/kernel.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu_cuda/CMakeLists.txt b/src/backends/starpu_cuda/CMakeLists.txt new file mode 100644 index 00000000..de2a620d --- /dev/null +++ b/src/backends/starpu_cuda/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND CUDA) + add_library(backends_starpu_cuda OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/blrm/CMakeLists.txt b/src/backends/starpu_cuda/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_cuda/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/blrm/drsdd.c b/src/backends/starpu_cuda/blrm/drsdd.c new file mode 100644 index 00000000..ba5ccaf5 --- /dev/null +++ b/src/backends/starpu_cuda/blrm/drsdd.c @@ -0,0 +1,425 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" +#include +#include +#include +#include +#include + +static void init_starpu_cuda(void *args) +{ + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + int nb, nsamples; + starpu_codelet_unpack_args(args, &cublas_handles, &cusolver_handles, + &curand_handles, &devinfo, &nb, &nsamples); + int id = starpu_worker_get_id(); + cublasStatus_t status; + //printf("CUBLAS init worker %d at %p\n", id, &cublas_handles[id]); + cublasCreate(&cublas_handles[id]); + cusolverDnCreate(&cusolver_handles[id]); + curandCreateGenerator(&curand_handles[id], CURAND_RNG_PSEUDO_MT19937); + curandSetPseudoRandomGeneratorSeed(curand_handles[id], 0ULL); + cudaMalloc((void **)&devinfo[id], sizeof(int)); +} + +static void deinit_starpu_cuda(void *args) +{ + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + starpu_codelet_unpack_args(args, &cublas_handles, &cusolver_handles, + &curand_handles, &devinfo, 0); + int id = starpu_worker_get_id(); + //printf("CUBLAS deinit worker %d at %p\n", id, &cublas_handles[id]); + cublasDestroy(cublas_handles[id]); + cusolverDnDestroy(cusolver_handles[id]); + curandDestroyGenerator(curand_handles[id]); + cudaFree(devinfo[id]); +} + +int starsh_blrm__drsdd_starpu_cuda(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. 
+ * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and CuSolver handles and temp buffers for all workers (but + // they are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + cusolverDnHandle_t cusolver_handles[workers]; + curandGenerator_t curand_handles[workers]; + int *devinfo[workers]; + double singular_values[workers*(maxrank+oversample)]; + cublasHandle_t *cuhandles = cublas_handles; + cusolverDnHandle_t *cuhandles2 = cusolver_handles; + curandGenerator_t *cuhandles3 = curand_handles; + int **devinfo_ptr = devinfo; + double *svhandles = singular_values; + //printf("MAIN: %p, %p, %p\n", cuhandles, cuhandles2, svhandles); + void *args_buffer; + size_t args_buffer_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + starpu_codelet_pack_args(&args_buffer, &args_buffer_size, + 
STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &cuhandles2, sizeof(cuhandles2), + STARPU_VALUE, &cuhandles3, sizeof(cuhandles3), + STARPU_VALUE, &devinfo_ptr, sizeof(devinfo_ptr), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + 0); + starpu_execute_on_each_worker(init_starpu_cuda, args_buffer, STARPU_CUDA); + // Init codelet structs and handles + struct starpu_codelet codelet = + { + //.cpu_funcs = {starsh_dense_dlrrsdd_starpu_cuda_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_cuda_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_W, STARPU_W, STARPU_W, STARPU_SCRATCH, + STARPU_SCRATCH} + }; + struct starpu_codelet codelet2 = + { + .cpu_funcs = {starsh_dense_kernel_starpu_cuda_cpu}, + .nbuffers = 1, + .modes = {STARPU_W} + }; + starpu_data_handle_t rank_handle[nblocks_far]; + starpu_data_handle_t D_handle[nblocks_far]; + starpu_data_handle_t U_handle[nblocks_far]; + starpu_data_handle_t V_handle[nblocks_far]; + starpu_data_handle_t work_handle[nblocks_far]; + starpu_data_handle_t iwork_handle[nblocks_far]; + // Init buffers to store low-rank factors of far-field blocks if needed + if(nblocks_far > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = 0, size_V = 0; + // Simple cycle over all far-field blocks + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Get corresponding sizes and minimum of them + size_U += RC->size[i]; + size_V += CC->size[j]; + //far_rank[bi] = -2; + } + size_U *= maxrank; + size_V *= maxrank; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Get 
corresponding sizes and minimum of them + int nrows = RC->size[i], ncols = CC->size[j]; + int mn = nrows < ncols ? nrows : ncols; + int mn2 = maxrank+oversample; + if(mn2 > mn) + mn2 = mn; + // Get size of temporary arrays + int lwork = ncols, lwork_sdd = (4*mn2+7)*mn2; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + cusolverDnDgesvd_bufferSize(cusolver_handles[0], ncols, mn2, + &lwork_sdd); + //printf("CUSOLVER SVD LWORK=%d\n", lwork_sdd); + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn2*(2*ncols+nrows+2*mn2+1); + int liwork = 8*mn2; + int shape_U[] = {nrows, maxrank}; + int shape_V[] = {ncols, maxrank}; + double *U = alloc_U+offset_U, *V = alloc_V+offset_V; + offset_U += nrows*maxrank; + offset_V += ncols*maxrank; + array_from_buffer(far_U+bi, 2, shape_U, 'd', 'F', U); + array_from_buffer(far_V+bi, 2, shape_V, 'd', 'F', V); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank+bi), 1, sizeof(*far_rank)); + starpu_matrix_data_register(D_handle+bi, -1, 0, nrows, nrows, + ncols, sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_U[bi]->data), nrows*maxrank, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_V[bi]->data), ncols*maxrank, sizeof(*V)); + starpu_vector_data_register(work_handle+bi, -1, 0, lwork, + sizeof(*U)); + starpu_vector_data_register(iwork_handle+bi, -1, 0, liwork, + sizeof(int)); + } + offset_U = 0; + offset_V = 0; + } + // Work variables + int info; + // Simple cycle over all far-field admissible blocks + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Generate matrix + starpu_task_insert(&codelet2, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &i, sizeof(i), + STARPU_VALUE, &j, sizeof(j), + STARPU_W, D_handle[bi], + 0); + // Approximate + starpu_task_insert(&codelet, + STARPU_VALUE, &maxrank, 
sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &cuhandles2, sizeof(cuhandles2), + STARPU_VALUE, &cuhandles3, sizeof(cuhandles3), + STARPU_VALUE, &devinfo_ptr, sizeof(devinfo_ptr), + STARPU_VALUE, &svhandles, sizeof(svhandles), + STARPU_R, D_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + STARPU_SCRATCH, work_handle[bi], + STARPU_SCRATCH, iwork_handle[bi], + 0); + } + starpu_task_wait_for_all(); + for(bi = 0; bi < nblocks_far; bi++) + { + starpu_data_unregister(rank_handle[bi]); + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(U_handle[bi]); + starpu_data_unregister(V_handle[bi]); + starpu_data_unregister(work_handle[bi]); + starpu_data_unregister(iwork_handle[bi]); + } + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = 
F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + starpu_data_handle_t D_handle[new_nblocks_near]; + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = 0; + // Simple cycle over all near-field blocks + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get corresponding sizes and minimum of them + size_t nrows = RC->size[i]; + size_t ncols = CC->size[j]; + // Update size_D + size_D += nrows*ncols; + } + STARSH_MALLOC(alloc_D, size_D); + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get corresponding sizes and minimum of them + int nrows = RC->size[i]; + int ncols = CC->size[j]; + int shape[2] = {nrows, ncols}; + double *D = alloc_D+offset_D; + 
array_from_buffer(near_D+bi, 2, shape, 'd', 'F', D); + offset_D += near_D[bi]->size; + starpu_matrix_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(near_D[bi]->data), nrows, nrows, ncols, + sizeof(*D)); + } + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get matrix + starpu_task_insert(&codelet2, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &i, sizeof(i), + STARPU_VALUE, &j, sizeof(j), + STARPU_W, D_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < new_nblocks_near; bi++) + { + starpu_data_unregister(D_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + far_U[bi-bj] = far_U[bi]; + far_V[bi-bj] = far_V[bi]; + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_cuda, args_buffer, + STARPU_CUDA); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + 
alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_cuda/dense/CMakeLists.txt b/src/backends/starpu_cuda/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_cuda/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/dense/drsdd.c b/src/backends/starpu_cuda/dense/drsdd.c new file mode 100644 index 00000000..478f4a96 --- /dev/null +++ b/src/backends/starpu_cuda/dense/drsdd.c @@ -0,0 +1,153 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" +#include +#include +#include +#include + +void starsh_dense_dlrrsdd_starpu_cuda_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + double *singular_values; + starpu_codelet_unpack_args(cl_arg, &maxrank, &oversample, &tol, + &cublas_handles, &cusolver_handles, &curand_handles, &devinfo, + &singular_values); + //printf("CODELET: %p, %p, %p\n", cublas_handles, cusolver_handles, + // singular_values); + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + int nrows = STARPU_MATRIX_GET_NX(buffer[0]); + int ncols = STARPU_MATRIX_GET_NY(buffer[0]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *work = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int lwork = STARPU_VECTOR_GET_NX(buffer[4]); + int *iwork = (int *)STARPU_VECTOR_GET_PTR(buffer[5]); + starsh_dense_dlrrsdd(nrows, ncols, D, nrows, U, nrows, V, ncols, rank, + maxrank, oversample, tol, work, lwork, iwork); +} + +void starsh_dense_dlrrsdd_starpu_cuda_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + double *singular_values; + starpu_codelet_unpack_args(cl_arg, &maxrank, &oversample, &tol, + &cublas_handles, &cusolver_handles, &curand_handles, &devinfo, + &singular_values); + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + int nrows = STARPU_MATRIX_GET_NX(buffer[0]); + int ncols = STARPU_MATRIX_GET_NY(buffer[0]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *work = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int lwork = STARPU_VECTOR_GET_NX(buffer[4]); + int mn = nrows < ncols ? nrows : ncols; + int mn2 = maxrank+oversample; + if(mn2 > mn) + mn2 = mn; + int id = starpu_worker_get_id(); + cusolverDnHandle_t cusolverhandle = cusolver_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + curandGenerator_t curandhandle = curand_handles[id]; + int *mydevinfo = devinfo[id]; + double *host_S = singular_values+id*(maxrank+oversample); + double *device_X = work; // ncols-by-mn2-by random matrix + double *device_Q = device_X+ncols*mn2; // nrows-by-mn2 matrix + double *device_tau = device_Q+nrows*mn2; // mn2 elements + double *device_S = device_tau; + double *device_U = device_tau+mn2; // ncols-by-mn2 matrix + double *device_V = device_U+ncols*mn2; // mn2-by-mn2 matrix + double *device_rwork = device_V+mn2*mn2; + double *device_work = device_rwork+mn2; + lwork -= (2*ncols+nrows+2*mn2+1)*mn2; + //printf("lwork=%d\n", lwork); + double one = 1.0; + double zero = 0.0; + cusolverStatus_t status; + cublasStatus_t status2; + curandGenerateNormalDouble(curandhandle, device_X, ncols*mn2, zero, one); + status2 = cublasDgemm(cuhandle, CUBLAS_OP_N, CUBLAS_OP_N, nrows, mn2, ncols, + &one, D, nrows, device_X, ncols, &zero, device_Q, nrows); + if(status2) + 
{ + printf("STATUS GEMM=%d\n", status2); + } + cudaMemcpy(host_S, device_Q, sizeof(*device_Q), cudaMemcpyDeviceToHost); + status = cusolverDnDgeqrf(cusolverhandle, nrows, mn2, device_Q, nrows, + device_tau, device_work, lwork, mydevinfo); + if(status) + { + printf("STATUS GEQRF=%d\n", status); + } + status = cusolverDnDorgqr(cusolverhandle, nrows, mn2, mn2, device_Q, nrows, + device_tau, device_work, lwork, mydevinfo); + if(status) + { + printf("STATUS ORGQR=%d\n", status); + } + cublasDgemm(cuhandle, CUBLAS_OP_T, CUBLAS_OP_N, ncols, mn2, nrows, &one, D, + nrows, device_Q, nrows, &zero, device_X, ncols); + status = cusolverDnDgesvd(cusolverhandle, 'S', 'S', ncols, mn2, device_X, + ncols, device_S, device_U, ncols, device_V, mn2, device_work, + lwork, device_rwork, mydevinfo); + if(status) + { + printf("STATUS GESVD=%d\n", status); + } + cudaMemcpy(host_S, device_S, mn2*sizeof(*host_S), cudaMemcpyDeviceToHost); + //printf("SV:"); + //for(int i = 0; i < 5; i++) + // printf(" %f", host_S[i]); + //printf("\n"); + // Get rank, corresponding to given error tolerance + int local_rank = starsh_dense_dsvfr(mn2, host_S, tol); + if(local_rank < mn/2 && local_rank <= maxrank) + { + // Compute right factor of low-rank approximation, using given left + // singular vectors + cublasDgemm(cuhandle, CUBLAS_OP_N, CUBLAS_OP_T, nrows, local_rank, + mn2, &one, device_Q, nrows, device_V, mn2, &zero, U, nrows); + cublasDcopy(cuhandle, ncols*local_rank, device_U, 1, V, 1); + for(int i = 0; i < local_rank; i++) + { + cublasDscal(cuhandle, ncols, &host_S[i], V+i*ncols, 1); + } + } + else + local_rank = -1; + cudaError_t err; + // Write new rank back into device memory + err = cudaMemcpy(rank, &local_rank, sizeof(local_rank), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("ERROR IN CUDAMEMCPY\n"); +} + diff --git a/src/backends/starpu_cuda/dense/kernel.c b/src/backends/starpu_cuda/dense/kernel.c new file mode 100644 index 00000000..e0d7411e --- /dev/null +++ 
b/src/backends/starpu_cuda/dense/kernel.c @@ -0,0 +1,34 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" + +void starsh_dense_kernel_starpu_cuda_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + STARSH_blrf *F; + STARSH_int i, j; + starpu_codelet_unpack_args(cl_arg, &F, &i, &j); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + STARSH_int nrows = RC->size[i]; + STARSH_int ncols = CC->size[j]; + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + kernel(nrows, ncols, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D, nrows); +} + diff --git a/src/backends/starpu_kblas/CMakeLists.txt b/src/backends/starpu_kblas/CMakeLists.txt new file mode 100644 index 00000000..9adf822c --- /dev/null +++ b/src/backends/starpu_kblas/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/blrm/CMakeLists.txt b/src/backends/starpu_kblas/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/blrm/drsdd.c b/src/backends/starpu_kblas/blrm/drsdd.c new file mode 100644 index 00000000..dc3238c1 --- /dev/null +++ b/src/backends/starpu_kblas/blrm/drsdd.c @@ -0,0 +1,531 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + work[id] = malloc(nsamples*maxbatch*sizeof(double)); + iwork[id] = malloc(maxbatch*sizeof(int)); + cudaStreamSynchronize(stream); +} + +static void init_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + work[id] = malloc(lwork*sizeof(*work[0])); + iwork[id] = malloc(liwork*sizeof(*iwork[0])); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + double **work; + int **iwork; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int 
id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + free(work[id]); + free(iwork[id]); +} + +static void deinit_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + free(work[id]); + free(iwork[id]); +} + +static void empty_cpu_func(void *buffer[], void *cl_arg) +{ +} + +int starsh_blrm__drsdd_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + double *work[workers]; + int *iwork[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + double **wwork = work; + int **wiwork = iwork; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu, *args_cpu; + size_t args_gpu_size = 0; + size_t args_cpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + int batch_size = 100; + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + // Get size of temporary arrays + int lwork = nb; + int lwork_sdd = (4*mn+7) * mn; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn*(3*nb+mn+1) + nb*nb; + int liwork = 8 * mn; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, 
&batch_size, sizeof(batch_size), + 0); + starpu_codelet_pack_args(&args_cpu, &args_cpu_size, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &liwork, sizeof(liwork), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(init_starpu_cpu, args_cpu, STARPU_CPU); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cpu_funcs = {starsh_dense_kernel_starpu_kblas_cpu}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_cpu = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_gpu = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_get_data_back_to_cpu = + { + .cpu_funcs = {empty_cpu_func}, + .nbuffers = 1, + .modes = {STARPU_R}, + }; + // Select if ONLY cpu or gpu + if(getenv("STARSH_KBLAS_CPU")) + codelet_lowrank = codelet_lowrank_cpu; + else if(getenv("STARSH_KBLAS_GPU")) + codelet_lowrank = codelet_lowrank_gpu; + starpu_data_handle_t rank_handle[nbatches]; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t 
Dcopy_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + //printf("BATCHSIZE=%d BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + } + // START MEASURING TIME + double time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + STARSH_int offset = bi * batch_size * nb * maxrank; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, 
sizeof(*V)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + double time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + starpu_data_unregister_submit(index_handle[bi]); + } + //starpu_task_wait_for_all(); + //double time1 = omp_get_wtime(); + //printf("COMPUTE MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + STARSH_int nbatches_once = nbatches; + for(STARSH_int batch_start = 0; batch_start < nbatches; + batch_start += nbatches_once) + { + STARSH_int batch_end = batch_start + nbatches_once; + if(batch_end > nbatches) + batch_end = nbatches; + for(bi = batch_start; bi < batch_end; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Run KBLAS_RSVD + //* + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_R, D_handle[bi], + STARPU_SCRATCH, Dcopy_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + 0); + starpu_data_unregister_submit(Dcopy_handle[bi]); + 
starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, U_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, V_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, rank_handle[bi], + 0); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //starpu_task_wait_for_all(); + } + //time1 = omp_get_wtime(); + //printf("COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = 0; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + 
if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 'F', + alloc_D + bi*nb*nb); + } + for(bi = 0; bi < nbatches; ++bi) + { + STARSH_int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + for(bi 
= 0; bi < nbatches; ++bi) + { + STARSH_int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < nbatches; bi++) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(index_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + far_U[bi-bj] = far_U[bi]; + far_V[bi-bj] = far_V[bi]; + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(deinit_starpu_cpu, args_cpu, STARPU_CPU); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + alloc_U, alloc_V, alloc_D, '1'); +} + diff --git 
a/src/backends/starpu_kblas/dense/CMakeLists.txt b/src/backends/starpu_kblas/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_kblas/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/dense/drsdd.c b/src/backends/starpu_kblas/dense/drsdd.c new file mode 100644 index 00000000..ac767f8c --- /dev/null +++ b/src/backends/starpu_kblas/dense/drsdd.c @@ -0,0 +1,119 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include + +void starsh_dense_dlrrsdd_starpu_kblas_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states, &work, + &lwork, &iwork); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[4]); + int id = starpu_worker_get_id(); + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int bi = pool_rank; bi < batch_size; bi += pool_size) + { + starsh_dense_dlrrsdd(nb, nb, D + bi*nb*nb, nb, U + bi*maxrank*nb, nb, + V + bi*maxrank*nb, nb, rank+bi, maxrank, oversample, tol, + work[id], lwork, iwork[id]); + } +} + +void starsh_dense_dlrrsdd_starpu_kblas_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states, &work, + &lwork, &iwork); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[4]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int id = starpu_worker_get_id(); + kblasHandle_t khandle = kblas_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + kblasRandState_t state = kblas_states[id]; + cudaStream_t stream = starpu_cuda_get_local_stream(); + // Create copy of D, since kblas_rsvd spoils it + cublasDcopy(cuhandle, batch_size*nb*nb, D, 1, Dcopy, 1); + // Run randomized SVD, get left singular vectors and singular values + //* + kblasDrsvd_batch_strided(khandle, nb, nb, mn, D, nb, nb*nb, U, mn, state, + batch_size); + cudaMemcpyAsync(work[id], U, mn*batch_size*sizeof(double), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + for(int bi = 0; bi < batch_size; ++bi) + { + int local_rank = starsh_dense_dsvfr(mn, work[id] + bi*mn, tol); + if(local_rank >= nb/2 || local_rank > maxrank) + { + iwork[id][bi] = -1; + //printf("RANK=-1\n"); + } + else + { + double one = 1.0; + double zero = 0.0; + cublasDgemm(cuhandle, CUBLAS_OP_T, CUBLAS_OP_N, nb, local_rank, nb, + &one, Dcopy + bi*nb*nb, nb, D + bi*nb*nb, nb, &zero, + V + bi*maxrank*nb, nb); + cublasDcopy(cuhandle, nb*local_rank, D + bi*nb*nb, 1, + U + bi*maxrank*nb, 1); + iwork[id][bi] = local_rank; + //printf("RANK=%d\n", local_rank); + } + } + cudaMemcpyAsync(rank, iwork[id], batch_size*sizeof(int), + 
cudaMemcpyHostToDevice, stream); + //*/ +} + diff --git a/src/backends/starpu_kblas/dense/kernel.c b/src/backends/starpu_kblas/dense/kernel.c new file mode 100644 index 00000000..cc8d53df --- /dev/null +++ b/src/backends/starpu_kblas/dense/kernel.c @@ -0,0 +1,46 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include + +void starsh_dense_kernel_starpu_kblas_cpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + double time0 = omp_get_wtime(); + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int i = ind[ibatch*2]; + int j = ind[ibatch*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + diff --git a/src/backends/starpu_kblas2/CMakeLists.txt b/src/backends/starpu_kblas2/CMakeLists.txt new file mode 100644 index 00000000..38e9e2f1 --- /dev/null +++ b/src/backends/starpu_kblas2/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King 
Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas2 OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/blrm/CMakeLists.txt b/src/backends/starpu_kblas2/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas2/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/blrm/drsdd.c b/src/backends/starpu_kblas2/blrm/drsdd.c new file mode 100644 index 00000000..e1e8c716 --- /dev/null +++ b/src/backends/starpu_kblas2/blrm/drsdd.c @@ -0,0 +1,567 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); +} + +static void starsh_dense_dlrrsdd_starpu_kblas2_copy(void *buffers[], void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf 
*format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + double time0 = omp_get_wtime(); + //printf("KBLAS2\n"); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with 
equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //printf("KBLAS2: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + int nworkers_cpu = starpu_cpu_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2 finish init\n"); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cpu_funcs = {starsh_dense_kernel_starpu_kblas2_cpu}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches]; + 
starpu_data_handle_t index_handle[nbatches]; + //starpu_data_handle_t Dcopy_handle[nbatches]; + //starpu_data_handle_t tmp_U_handle[nbatches]; + //starpu_data_handle_t tmp_V_handle[nbatches]; + //starpu_data_handle_t tmp_S_handle[nbatches]; + starpu_data_handle_t D_handle[nworkers_cpu]; + starpu_data_handle_t DtoGPU_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + starpu_data_handle_t rank_handle[nbatches]; + //printf("KBLAS2: init in %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + double *tmp_U_alloc = NULL, *tmp_V_alloc = NULL, *tmp_S_alloc = NULL; + //printf("BATCHSIZE=%d BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far * nb * nb; + //size_t size_S = nblocks_far * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + far_rank[bi] = -1; + } + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + //size_t tmp_U_alloc_size = (size_t)nworkers_gpu * batch_size * nb * + // maxrank * sizeof(double); + //size_t tmp_S_alloc_size = (size_t)nworkers_gpu * batch_size * mn 
* + // sizeof(double); + //starpu_malloc(&tmp_U_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_V_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_S_alloc, tmp_S_alloc_size); + starpu_memory_pin(block_far, 2*nblocks_far*sizeof(*block_far)); + //printf("KBLAS2: pin memory in %e seconds\n", omp_get_wtime()-time0); + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + //STARSH_int offset_D = bi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + //STARSH_int D_size = this_batch_size * nb * nb; + //starpu_vector_data_register(DtoGPU_handle+bi, -1, 0, D_size, + // sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + STARSH_int offset = bi * batch_size * nb * maxrank; + //STARSH_int offset_S = bi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + //double *S = alloc_S + offset_S; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int tmp_U_size = batch_size * nb * maxrank; + //STARSH_int tmp_V_size = tmp_U_size; + //STARSH_int tmp_S_size = batch_size * mn; + //starpu_vector_data_register(tmp_U_handle+bi, -1, 0 , tmp_U_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_V_handle+bi, -1, 0 , tmp_V_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_S_handle+bi, -1, 0 , tmp_S_size, + // sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + 
starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + STARSH_MALLOC(alloc_D, nworkers_cpu * D_size * sizeof(*alloc_D)); + for(bi = 0; bi < nworkers_cpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(alloc_D+bi*D_size), D_size, + sizeof(double)); + } + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(DtoGPU_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi % nworkers_cpu], + STARPU_R, index_handle[bi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Copy to pinned memory + starpu_task_insert(&codelet_copy, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_R, D_handle[bi % nworkers_cpu], + STARPU_W, DtoGPU_handle[bi % nworkers_gpu], + 0); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + 
STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, DtoGPU_handle[bi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[bi % nworkers_gpu], + STARPU_W, tmp_U_handle[bi % nworkers_gpu], + STARPU_W, tmp_V_handle[bi % nworkers_gpu], + STARPU_W, tmp_S_handle[bi % nworkers_gpu], + STARPU_PRIORITY, -1, + 0); + //starpu_data_unregister_submit(D_handle[bi]); + //starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[bi % nworkers_gpu], + STARPU_R, tmp_V_handle[bi % nworkers_gpu], + STARPU_R, tmp_S_handle[bi % nworkers_gpu], + STARPU_W, rank_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(tmp_U_handle[bi]); + //starpu_data_unregister_submit(tmp_V_handle[bi]); + //starpu_data_unregister_submit(tmp_S_handle[bi]); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //double time1 = omp_get_wtime(); + //printf("SUBMIT IN: %f seconds\n", time1-time0); + starpu_task_wait_for_all(); + //time1 = omp_get_wtime(); + //printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = omp_get_wtime(); + if(nbatches > 0) + { + //size_t size_U = nblocks_far * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + for(bi = 0; bi < nworkers_cpu; ++bi) + { + 
starpu_data_unregister(D_handle[bi]); + } + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(DtoGPU_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + //starpu_free(tmp_U_alloc); + //starpu_free(tmp_V_alloc); + //starpu_free(tmp_S_alloc); + starpu_memory_unpin(block_far, 2*nblocks_far*sizeof(*block_far)); + free(alloc_D); + alloc_D = NULL; + } + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi 
= 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 'F', + alloc_D + bi*nb*nb); + } + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + 
if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < nbatches; bi++) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(index_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //printf("FINISH NEAR-FIELD TILES: %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2: finalize in 
%f seconds\n", omp_get_wtime()-time0); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_kblas2/dense/CMakeLists.txt b/src/backends/starpu_kblas2/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_kblas2/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/dense/drsdd.c b/src/backends/starpu_kblas2/dense/drsdd.c new file mode 100644 index 00000000..85b7a7bb --- /dev/null +++ b/src/backends/starpu_kblas2/dense/drsdd.c @@ -0,0 +1,105 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include + +void starsh_dense_dlrrsdd_starpu_kblas2_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *W = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *S = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int id = starpu_worker_get_id(); + kblasHandle_t khandle = kblas_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + kblasRandState_t state = kblas_states[id]; + cudaStream_t stream = starpu_cuda_get_local_stream(); + // Create copy of first mn columns of D, since kblas_rsvd spoils it + cublasDcopy(cuhandle, batch_size*nb*nb, D, 1, W, 1); + // Run randomized SVD, get left singular vectors and singular values + kblasDrsvd_batch_strided(khandle, nb, nb, mn, W, nb, nb*nb, S, mn, state, + batch_size); + double one = 1.0; + double zero = 0.0; + for(int bi = 0; bi < batch_size; ++bi) + cublasDcopy(cuhandle, nb*maxrank, W+bi*nb*nb, 1, U+bi*maxrank*nb, 1); + kblasDgemm_batch_strided(khandle, KBLAS_Trans, KBLAS_NoTrans, nb, maxrank, + nb, one, D, nb, nb*nb, U, nb, nb*maxrank, zero, V, nb, + maxrank*nb, batch_size); +} + +void starsh_dense_dlrrsdd_starpu_kblas2_getrank(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol); + double *tmp_U = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *tmp_V = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *tmp_S = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[5]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + size_t stride = maxrank * nb; + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int local_rank = starsh_dense_dsvfr(mn, tmp_S+ibatch*mn, tol); + if(local_rank >= nb/2 || local_rank > maxrank) + rank[ibatch] = -1; + else + { + double *local_U = U + ibatch*stride; + double *local_V = V + ibatch*stride; + double *local_tmp_U = tmp_U + ibatch*stride; + double *local_tmp_V = tmp_V + ibatch*stride; + cblas_dcopy(local_rank*nb, local_tmp_U, 1, local_U, 1); + cblas_dcopy(local_rank*nb, local_tmp_V, 1, local_V, 1); + rank[ibatch] = local_rank; + } + } +} + diff --git a/src/backends/starpu_kblas2/dense/kernel.c b/src/backends/starpu_kblas2/dense/kernel.c new file mode 100644 index 00000000..c09a1424 --- /dev/null +++ b/src/backends/starpu_kblas2/dense/kernel.c @@ -0,0 +1,46 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include + +void starsh_dense_kernel_starpu_kblas2_cpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + //double time0 = omp_get_wtime(); + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int i = ind[ibatch*2]; + int j = ind[ibatch*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + diff --git a/src/backends/starpu_kblas3_spatial/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/CMakeLists.txt new file mode 100644 index 00000000..36cd3eec --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas3_spatial OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/blrm/drsdd.c b/src/backends/starpu_kblas3_spatial/blrm/drsdd.c new file mode 100644 index 00000000..e26017b9 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/blrm/drsdd.c @@ -0,0 +1,588 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu_kblas3_spatial/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-spatial.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + STARSH_ssdata **data_gpu; + STARSH_ssdata *data_cpu; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + starsh_ssdata_togpu(&data_gpu[id], data_cpu); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + STARSH_ssdata **data_gpu; + STARSH_ssdata *data_cpu; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + starsh_ssdata_free_gpu(data_gpu[id]); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); +} + +static void starsh_dense_dlrrsdd_starpu_kblas3_copy(void *buffers[], 
void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + //double time0 = omp_get_wtime(); + //printf("KBLAS3\n"); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + STARSH_ssdata *data_gpu_array[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + STARSH_ssdata **data_gpu = data_gpu_array; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //printf("KBLAS3: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &RD, sizeof(RD), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, 
&nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2 finish init\n"); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cuda_funcs = {starsh_dense_kernel_starpu_kblas3_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas3_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + //starpu_data_handle_t Dcopy_handle[nbatches]; + //starpu_data_handle_t tmp_U_handle[nbatches]; + //starpu_data_handle_t tmp_V_handle[nbatches]; + //starpu_data_handle_t tmp_S_handle[nbatches]; + starpu_data_handle_t D_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + starpu_data_handle_t rank_handle[nbatches]; + //printf("KBLAS3: init in %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + double *tmp_U_alloc = NULL, *tmp_V_alloc = NULL, *tmp_S_alloc = NULL; + //printf("BATCHSIZE=%d 
BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far * nb * nb; + //size_t size_S = nblocks_far * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + far_rank[bi] = -1; + } + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + //size_t tmp_U_alloc_size = (size_t)nworkers_gpu * batch_size * nb * + // maxrank * sizeof(double); + //size_t tmp_S_alloc_size = (size_t)nworkers_gpu * batch_size * mn * + // sizeof(double); + //starpu_malloc(&tmp_U_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_V_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_S_alloc, tmp_S_alloc_size); + starpu_memory_pin(block_far, 2*nblocks_far*sizeof(*block_far)); + //printf("KBLAS3: pin memory in %e seconds\n", omp_get_wtime()-time0); + // START MEASURING TIME + time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + //STARSH_int offset_D = bi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + //STARSH_int D_size = this_batch_size * nb * nb; + 
//starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + // sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + STARSH_int offset = bi * batch_size * nb * maxrank; + //STARSH_int offset_S = bi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + //double *S = alloc_S + offset_S; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int tmp_U_size = batch_size * nb * maxrank; + //STARSH_int tmp_V_size = tmp_U_size; + //STARSH_int tmp_S_size = batch_size * mn; + //starpu_vector_data_register(tmp_U_handle+bi, -1, 0 , tmp_U_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_V_handle+bi, -1, 0 , tmp_V_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_S_handle+bi, -1, 0 , tmp_S_size, + // sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < 
nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi % nworkers_gpu], + STARPU_R, index_handle[bi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[bi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[bi % nworkers_gpu], + STARPU_W, tmp_U_handle[bi % nworkers_gpu], + STARPU_W, tmp_V_handle[bi % nworkers_gpu], + STARPU_W, tmp_S_handle[bi % nworkers_gpu], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(D_handle[bi]); + //starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[bi % nworkers_gpu], + STARPU_R, tmp_V_handle[bi % nworkers_gpu], + STARPU_R, tmp_S_handle[bi % nworkers_gpu], + STARPU_W, rank_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_PRIORITY, -1, + 0); + //starpu_data_unregister_submit(tmp_U_handle[bi]); + //starpu_data_unregister_submit(tmp_V_handle[bi]); + //starpu_data_unregister_submit(tmp_S_handle[bi]); + 
starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //double time1 = omp_get_wtime(); + //printf("SUBMIT IN: %f seconds\n", time1-time0); + starpu_task_wait_for_all(); + //time1 = omp_get_wtime(); + //printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = omp_get_wtime(); + if(nbatches > 0) + { + //size_t size_U = nblocks_far * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + //starpu_free(tmp_U_alloc); + //starpu_free(tmp_V_alloc); + //starpu_free(tmp_S_alloc); + starpu_memory_unpin(block_far, 2*nblocks_far*sizeof(*block_far)); + } + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + 
STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 
'F', + alloc_D + bi*nb*nb); + } + //starpu_memory_pin(block_near, 2*new_nblocks_near*sizeof(*block_near)); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + //double *Dcopy_alloc; + //size_t Dcopy_size = batch_size * nb * nb; + // overwrite old value of number of temporary work space for transfers + // between GPU and CPU + //int nworkers_gpu = starpu_cuda_worker_get_count(); + //starpu_malloc(&Dcopy_alloc, sizeof(*Dcopy_alloc) * nworkers_gpu * + // Dcopy_size); + //for(bi = 0; bi < nworkers_gpu; ++bi) + //{ + // starpu_vector_data_register(Dcopy_handle+bi, STARPU_MAIN_RAM, + // (uintptr_t)(Dcopy_alloc+bi*Dcopy_size), Dcopy_size, + // sizeof(*Dcopy_alloc)); + //} + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix by CPU + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Move tile to CPU from GPU + //starpu_task_insert(&codelet_copy, + // STARPU_VALUE, &nb, sizeof(nb), + // STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + // STARPU_R, Dcopy_handle[bi % nworkers_gpu], + // STARPU_W, D_handle[bi], + // 0); + 
//starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + //for(bi = 0; bi < nworkers_gpu; ++bi) + //{ + // starpu_data_unregister(Dcopy_handle[bi]); + //} + //starpu_free(Dcopy_alloc); + //starpu_memory_unpin(block_near, 2*new_nblocks_near*sizeof(*block_near)); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //printf("FINISH NEAR-FIELD TILES: %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS3: finalize in %f seconds\n", omp_get_wtime()-time0); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + 
alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt new file mode 100644 index 00000000..5a42be3e --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/dense/kernel.cu b/src/backends/starpu_kblas3_spatial/dense/kernel.cu new file mode 100644 index 00000000..31707b65 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/dense/kernel.cu @@ -0,0 +1,82 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.cu + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "starpu.h" +extern "C" +{ +#include +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-spatial.h" +#include + +static __global__ void local_gpu_kernel_for_spatial(STARSH_ssdata *data, + STARSH_int *block_far, int N, double *D, int ldD, int stride) +//! 
Exponential kernel for 2-dimensional spatial statistics problem on GPU +{ + int tile_i = N * block_far[2*blockIdx.x]; + int tile_j = N * block_far[2*blockIdx.x + 1]; + //printf("blockidx=%d\n", blockIdx.x); + // Read parameters + double beta = -data->beta; + double noise = data->noise; + double sigma = data->sigma; + // Get coordinates + STARSH_int count = data->particles.count; + double *x, *y, *z; + x = data->particles.point; + y = x + count; + //z = y + count; + double *buffer = D + (size_t)stride*blockIdx.x; + // Fill column-major matrix + for(int j = threadIdx.x; j < N; j += blockDim.x) + { + int index_j = tile_j + j; + double x_j = x[index_j]; + double y_j = y[index_j]; + //double z_j = z[index_j]; + for(int i = threadIdx.y; i < N; i += blockDim.y) + { + int index_i = tile_i + i; + double dx = x[index_i] - x_j; + double dy = y[index_i] - y_j; + //double dz = z[index_i] - z_j; + //double dist = norm3d(dx, dy, dz) / beta; + double dist = sqrt(dx*dx + dy*dy) / beta; + if(dist == 0) + buffer[j*(size_t)ldD+i] = sigma + noise; + else + buffer[j*(size_t)ldD+i] = sigma * exp(dist); + //printf("A(%d,%d,%d)=%f\n", index_i, index_j, j*ldD+i, buffer[j*ldD+i]); + } + } +} + +void starsh_dense_kernel_starpu_kblas3_gpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + //double time0 = omp_get_wtime(); + STARSH_ssdata **data_gpu; + int batch_size; + int N; + int id = starpu_worker_get_id(); + starpu_codelet_unpack_args(cl_arg, &data_gpu, &N, &batch_size); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + dim3 threads(16, 16); + cudaStream_t stream = starpu_cuda_get_local_stream(); + local_gpu_kernel_for_spatial<<>>(data_gpu[id], + ind, N, D, N, N*N); +} + +} // extern "C" diff --git a/src/control/CMakeLists.txt b/src/control/CMakeLists.txt index b6ab5bad..c67ce97e 100644 --- a/src/control/CMakeLists.txt +++ b/src/control/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/control/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/control/array.c b/src/control/array.c index 4fd39875..c2c31693 100644 --- a/src/control/array.c +++ b/src/control/array.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/array.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/control/blrf.c b/src/control/blrf.c index 3cff7cfc..9256e3a6 100644 --- a/src/control/blrf.c +++ b/src/control/blrf.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/blrf.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -634,6 +634,16 @@ int starsh_blrf_new_tlr_mpi(STARSH_blrf **format, STARSH_problem *problem, MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); int grid_nx = sqrt(mpi_size), grid_ny = grid_nx, grid_x, grid_y; + if(mpi_size == 6) + { + grid_nx = 2; + grid_ny = 3; + } + else if(mpi_size == 2) + { + grid_nx = 1; + grid_ny = 2; + } if(grid_nx*grid_ny != mpi_size) STARSH_ERROR("MPI SIZE MUST BE SQUARE OF INTEGER!"); grid_ny = mpi_size / grid_nx; diff --git 
a/src/control/cluster.c b/src/control/cluster.c index 33dcd722..de2711a3 100644 --- a/src/control/cluster.c +++ b/src/control/cluster.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/cluster.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/control/init.c b/src/control/init.c index 7e3d08ee..087ba45d 100644 --- a/src/control/init.c +++ b/src/control/init.c @@ -14,6 +14,9 @@ #include "starsh-mpi.h" #include "starsh-starpu.h" #include "starsh-mpi-starpu.h" +#include "starsh-starpu-kblas.h" +#include "starsh-starpu-cuda.h" +#include "starsh-mpi-starpu-kblas.h" #include "common.h" #include "control/init.h" diff --git a/src/control/problem.c b/src/control/problem.c index 9b2d76e7..2ffb8d79 100644 --- a/src/control/problem.c +++ b/src/control/problem.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/problem.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/itersolvers/CMakeLists.txt b/src/itersolvers/CMakeLists.txt index 7ef9492c..3718d01a 100644 --- a/src/itersolvers/CMakeLists.txt +++ b/src/itersolvers/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/itersolvers/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/itersolvers/cg.c b/src/itersolvers/cg.c index d0afb6b3..e63747fe 100644 --- a/src/itersolvers/cg.c +++ b/src/itersolvers/cg.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/itersolvers/cg.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 70cc1aba..c79e40b4 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file testing/CMakeLists.txt -# @version 
1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 @@ -61,6 +61,18 @@ if(MPI AND STARPU) ) endif() +if(CUDA) + list(APPEND tests_files + "starpu_spatial_gpu.c" + ) +endif() + +if(CUDA AND MPI) + list(APPEND tests_files + "mpi_starpu_spatial_gpu.c" + ) +endif() + # Uses RUNPATH instead of RPATH SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -69,7 +81,7 @@ foreach(test_src ${tests_files}) add_executable(test_${test_exe} ${test_src}) target_link_libraries(test_${test_exe} starsh ${CBLAS_LIBRARIES} ${LAPACKE_LIBRARIES} ${OpenMP_C_FLAGS}) - if(test_src MATCHES "starpu_*") + if((test_src MATCHES "starpu_*") OR (test_src MATCHES "mpi_starpu_*")) target_link_libraries(test_${test_exe} ${STARPU_LIBRARIES}) endif() set_target_properties(test_${test_exe} PROPERTIES OUTPUT_NAME ${test_exe}) diff --git a/testing/cauchy.c b/testing/cauchy.c index 3aec596d..3d55ef56 100644 --- a/testing/cauchy.c +++ b/testing/cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/electrodynamics.c b/testing/electrodynamics.c index d18269a1..f1b8f16e 100644 --- a/testing/electrodynamics.c +++ b/testing/electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/electrostatics.c b/testing/electrostatics.c index 9b9367a0..c22f3a77 100644 --- a/testing/electrostatics.c +++ b/testing/electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/minimal.c b/testing/minimal.c index 84558e3c..ea4a16d5 100644 --- a/testing/minimal.c +++ b/testing/minimal.c @@ -5,7 +5,7 @@ * University of 
Science and Technology (KAUST) * * @file testing/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_cauchy.c b/testing/mpi_cauchy.c index 3092d354..06f8e9de 100644 --- a/testing/mpi_cauchy.c +++ b/testing/mpi_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_electrodynamics.c b/testing/mpi_electrodynamics.c index bbd11e4e..6f5dbd1a 100644 --- a/testing/mpi_electrodynamics.c +++ b/testing/mpi_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_electrostatics.c b/testing/mpi_electrostatics.c index 521656db..8048783b 100644 --- a/testing/mpi_electrostatics.c +++ b/testing/mpi_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_minimal.c b/testing/mpi_minimal.c index 435974dd..65fd1d2f 100644 --- a/testing/mpi_minimal.c +++ b/testing/mpi_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_spatial.c b/testing/mpi_spatial.c index ae934758..fb20b284 100644 --- a/testing/mpi_spatial.c +++ b/testing/mpi_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_cauchy.c b/testing/mpi_starpu_cauchy.c index 7c15e788..74f6002a 100644 --- 
a/testing/mpi_starpu_cauchy.c +++ b/testing/mpi_starpu_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_electrodynamics.c b/testing/mpi_starpu_electrodynamics.c index f53a3eb5..7ea9e6c6 100644 --- a/testing/mpi_starpu_electrodynamics.c +++ b/testing/mpi_starpu_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_electrostatics.c b/testing/mpi_starpu_electrostatics.c index 74f3654f..39d2ed8f 100644 --- a/testing/mpi_starpu_electrostatics.c +++ b/testing/mpi_starpu_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_minimal.c b/testing/mpi_starpu_minimal.c index 7ebcdc39..622a82e4 100644 --- a/testing/mpi_starpu_minimal.c +++ b/testing/mpi_starpu_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_spatial.c b/testing/mpi_starpu_spatial.c index 7a209495..1ebc6c43 100644 --- a/testing/mpi_starpu_spatial.c +++ b/testing/mpi_starpu_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -157,6 +157,7 @@ int main(int argc, char **argv) MPI_Finalize(); return 1; } + //*/ // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs /* Not performed due to no matvec yet with 
STARPU double *x, *y, *y_tlr; diff --git a/testing/mpi_starpu_spatial_gpu.c b/testing/mpi_starpu_spatial_gpu.c new file mode 100644 index 00000000..1c530a72 --- /dev/null +++ b/testing/mpi_starpu_spatial_gpu.c @@ -0,0 +1,217 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file testing/mpi_starpu_spatial.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifdef MKL + #include +#else + #include + #include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void preheat_cublas() +{ + cublasHandle_t cuhandle; + cublasCreate(&cuhandle); + cublasDestroy(cuhandle); +} + +int main(int argc, char **argv) +{ + MPI_Init(&argc, &argv); + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + if(argc != 10) + { + if(mpi_rank == 0) + { + printf("%d arguments provided, but 9 are needed\n", + argc-1); + printf("mpi_starpu_spatial ndim placement kernel beta nu N " + "block_size maxrank tol\n"); + } + MPI_Finalize(); + return 1; + } + int problem_ndim = atoi(argv[1]); + int place = atoi(argv[2]); + // Possible values can be found in documentation for enum + // STARSH_PARTICLES_PLACEMENT + int kernel_type = atoi(argv[3]); + double beta = atof(argv[4]); + double nu = atof(argv[5]); + int N = atoi(argv[6]); + int block_size = atoi(argv[7]); + int maxrank = atoi(argv[8]); + double tol = atof(argv[9]); + double noise = 0; + int onfly = 0; + char symm = 'N', dtype = 'd'; + int ndim = 2; + STARSH_int shape[2] = {N, N}; + int info; + srand(0); + // Init STARS-H + info = starsh_init(); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + // Generate data for spatial statistics problem + STARSH_ssdata *data; + STARSH_kernel *kernel; + info = 
starsh_application((void **)&data, &kernel, N, dtype, + STARSH_SPATIAL, kernel_type, STARSH_SPATIAL_NDIM, problem_ndim, + STARSH_SPATIAL_BETA, beta, STARSH_SPATIAL_NU, nu, + STARSH_SPATIAL_NOISE, noise, STARSH_SPATIAL_PLACE, place, 0); + if(info != 0) + { + if(mpi_rank == 0) + printf("Problem was NOT generated (wrong parameters)\n"); + MPI_Finalize(); + return 1; + } + // Init problem with given data and kernel and print short info + STARSH_problem *P; + info = starsh_problem_new(&P, ndim, shape, symm, dtype, data, data, + kernel, "Spatial Statistics example"); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_problem_info(P); + // Init plain clusterization and print info + STARSH_cluster *C; + info = starsh_cluster_new_plain(&C, data, N, block_size); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_cluster_info(C); + // Init tlr division into admissible blocks and print short info + STARSH_blrf *F; + STARSH_blrm *M; + info = starsh_blrf_new_tlr_mpi(&F, P, symm, C, C); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_blrf_info(F); + // Init StarPU + (void)starpu_init(NULL); + // Init cublas so that it runs faster next time + starpu_execute_on_each_worker(preheat_cublas, NULL, STARPU_CUDA); + // Approximate each admissible block + MPI_Barrier(MPI_COMM_WORLD); + double time1 = MPI_Wtime(); + info = starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(&M, F, maxrank, tol, onfly); + if(info != 0) + { + if(mpi_rank == 0) + printf("Approximation was NOT computed due to error\n"); + MPI_Finalize(); + return 1; + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + starsh_blrf_info(F); + starsh_blrm_info(M); + } + if(mpi_rank == 0) + printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); + // Measure approximation error + /* + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + double rel_err = 
starsh_blrm__dfe_mpi(M); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", + time1, rel_err); + if(rel_err/tol > 10.) + { + printf("Resulting relative error is too big\n"); + MPI_Finalize(); + return 1; + } + } + if(rel_err/tol > 10.) + { + MPI_Finalize(); + return 1; + } + */ + //*/ + // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs + /* Not performed due to no matvec yet with STARPU + double *x, *y, *y_tlr; + int nrhs = 1; + x = malloc(N*nrhs*sizeof(*x)); + y = malloc(N*nrhs*sizeof(*y)); + y_tlr = malloc(N*nrhs*sizeof(*y_tlr)); + if(mpi_rank == 0) + { + int iseed[4] = {0, 0, 0, 1}; + LAPACKE_dlarnv_work(3, iseed, N*nrhs, x); + cblas_dscal(N*nrhs, 0.0, y, 1); + cblas_dscal(N*nrhs, 0.0, y_tlr, 1); + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml_mpi(M, nrhs, 1.0, x, N, 0.0, y, N); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml_mpi_tlr(M, nrhs, 1.0, x, N, 0.0, y_tlr, N); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + cblas_daxpy(N, -1.0, y, 1, y_tlr, 1); + printf("TIME FOR 10 TLR MATVECS: %e secs\n", time1); + printf("MATVEC DIFF: %e\n", cblas_dnrm2(N, y_tlr, 1) + /cblas_dnrm2(N, y, 1)); + } + */ + MPI_Finalize(); + return 0; +} diff --git a/testing/particles.c b/testing/particles.c index 033eee2e..8b097d1a 100644 --- a/testing/particles.c +++ b/testing/particles.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/particles.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/testing/randtlr.c b/testing/randtlr.c index b4068be6..9252508b 100644 --- a/testing/randtlr.c +++ 
b/testing/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/rndtiled.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/spatial.c b/testing/spatial.c index 9a093be0..ea7f533e 100644 --- a/testing/spatial.c +++ b/testing/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -108,6 +108,7 @@ int main(int argc, char **argv) return 1; } // Measure time for 10 matvecs + /* double *x, *y; x = malloc(N*nrhs*sizeof(*x)); y = malloc(N*nrhs*sizeof(*y)); @@ -119,5 +120,6 @@ int main(int argc, char **argv) starsh_blrm__dmml(M, nrhs, 1.0, x, N, 0.0, y, N); time1 = omp_get_wtime()-time1; printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + */ return 0; } diff --git a/testing/starpu_cauchy.c b/testing/starpu_cauchy.c index 0f7bcc99..041a3bd1 100644 --- a/testing/starpu_cauchy.c +++ b/testing/starpu_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_electrodynamics.c b/testing/starpu_electrodynamics.c index 35f33331..786b43dd 100644 --- a/testing/starpu_electrodynamics.c +++ b/testing/starpu_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_electrostatics.c b/testing/starpu_electrostatics.c index a2a40c58..27266e48 100644 --- a/testing/starpu_electrostatics.c +++ b/testing/starpu_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 
2017-11-07 * */ diff --git a/testing/starpu_minimal.c b/testing/starpu_minimal.c index 7bb9ba2c..05dccaed 100644 --- a/testing/starpu_minimal.c +++ b/testing/starpu_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_spatial.c b/testing/starpu_spatial.c index 50af0990..28c9299c 100644 --- a/testing/starpu_spatial.c +++ b/testing/starpu_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -100,16 +100,18 @@ int main(int argc, char **argv) starsh_blrf_info(F); starsh_blrm_info(M); printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); // Measure approximation error time1 = omp_get_wtime(); - double rel_err = starsh_blrm__dfe(M); + double rel_err = starsh_blrm__dfe_omp(M); time1 = omp_get_wtime()-time1; printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", time1, rel_err); if(rel_err/tol > 10.) { printf("Resulting relative error is too big\n"); - return 1; + return 0; } // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs /* Not performed due to no matvec yet with STARPU @@ -125,7 +127,5 @@ int main(int argc, char **argv) time1 = omp_get_wtime()-time1; printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); */ - // Deinit StarPU - starpu_shutdown(); return 0; } diff --git a/testing/starpu_spatial_gpu.c b/testing/starpu_spatial_gpu.c new file mode 100644 index 00000000..a10bd821 --- /dev/null +++ b/testing/starpu_spatial_gpu.c @@ -0,0 +1,145 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file testing/starpu_spatial_gpu.c + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifdef MKL + #include +#else + #include + #include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void preheat_cublas() +{ + cublasHandle_t cuhandle; + cublasCreate(&cuhandle); + cublasDestroy(cuhandle); +} + +int main(int argc, char **argv) +{ + if(argc != 10) + { + printf("%d arguments provided, but 9 are needed\n", argc-1); + printf("starpu_spatial ndim placement kernel beta nu N block_size " + "maxrank tol\n"); + return 1; + } + int problem_ndim = atoi(argv[1]); + int place = atoi(argv[2]); + // Possible values can be found in documentation for enum + // STARSH_PARTICLES_PLACEMENT + int kernel_type = atoi(argv[3]); + double beta = atof(argv[4]); + double nu = atof(argv[5]); + int N = atoi(argv[6]); + int block_size = atoi(argv[7]); + int maxrank = atoi(argv[8]); + double tol = atof(argv[9]); + double noise = 0; + int onfly = 0; + char symm = 'N', dtype = 'd'; + int ndim = 2; + STARSH_int shape[2] = {N, N}; + int nrhs = 1; + int info; + srand(0); + // Init STARS-H + info = starsh_init(); + if(info != 0) + return info; + // Generate data for spatial statistics problem + STARSH_ssdata *data; + STARSH_kernel *kernel; + info = starsh_application((void **)&data, &kernel, N, dtype, + STARSH_SPATIAL, kernel_type, STARSH_SPATIAL_NDIM, problem_ndim, + STARSH_SPATIAL_BETA, beta, STARSH_SPATIAL_NU, nu, + STARSH_SPATIAL_NOISE, noise, 0); + if(info != 0) + { + printf("Problem was NOT generated (wrong parameters)\n"); + return info; + } + // Init problem with given data and kernel and print short info + STARSH_problem *P; + info = starsh_problem_new(&P, ndim, shape, symm, dtype, data, data, + kernel, "Spatial Statistics example"); + if(info != 0) + return info; + 
starsh_problem_info(P); + // Init tiled cluster for tiled low-rank approximation and print info + STARSH_cluster *C; + info = starsh_cluster_new_plain(&C, data, N, block_size); + if(info != 0) + return info; + starsh_cluster_info(C); + // Init tiled division into admissible blocks and print short info + STARSH_blrf *F; + STARSH_blrm *M; + info = starsh_blrf_new_tlr(&F, P, symm, C, C); + if(info != 0) + return info; + starsh_blrf_info(F); + // Init StarPU + (void)starpu_init(NULL); + // Init cublas so that it runs faster next time + starpu_execute_on_each_worker(preheat_cublas, NULL, STARPU_CUDA); + // Approximate each admissible block + double time1 = omp_get_wtime(); + info = starsh_blrm__drsdd_starpu_kblas3_spatial(&M, F, maxrank, tol, onfly); + if(info != 0) + return info; + time1 = omp_get_wtime()-time1; + // Print info about updated format and approximation + starsh_blrf_info(F); + starsh_blrm_info(M); + printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); + // Measure approximation error + /* + time1 = omp_get_wtime(); + double rel_err = starsh_blrm__dfe_omp(M); + time1 = omp_get_wtime()-time1; + printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", + time1, rel_err); + if(rel_err/tol > 10.) + { + printf("Resulting relative error is too big\n"); + return 0; + } + */ + // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs + /* Not performed due to no matvec yet with STARPU + double *x, *y; + x = malloc(N*nrhs*sizeof(*x)); + y = malloc(N*nrhs*sizeof(*y)); + int iseed[4] = {0, 0, 0, 1}; + LAPACKE_dlarnv_work(3, iseed, N*nrhs, x); + cblas_dscal(N*nrhs, 0.0, y, 1); + time1 = omp_get_wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml(M, nrhs, 1.0, x, N, 0.0, y, N); + time1 = omp_get_wtime()-time1; + printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + */ + return 0; +}