diff --git a/CMakeLists.txt b/CMakeLists.txt index 3633794c..88514b6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ include(FindPkgConfig) # Create project and check C compiler cmake_policy(SET CMP0048 NEW) -project(STARS-H VERSION 0.1.1 LANGUAGES C Fortran) +project(STARS-H VERSION 0.3.0 LANGUAGES C Fortran) message(STATUS "Building ${PROJECT_NAME} ${PROJECT_VERSION}") @@ -69,6 +69,12 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING option(OPENMP "Use OpenMP" ON) option(MPI "Use MPI" ON) option(STARPU "Use StarPU" ON) +# Since KBLAS does not support pkg-config, the user has +# to provide its path by means of +# CFLAGS="-I/path/to/kblas/include -L/path/to/kblas/lib" +option(KBLAS "Use KBLAS" ON) +option(CUDA "Use CUDA" ON) +#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_70,code=sm_70") # Option to force using parallel blas instead of sequential option(USE_PARALLEL_BLAS "Prefer parallel blas libraries" OFF) @@ -109,7 +115,7 @@ endif() set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) # the RPATH to be used when installing #set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") - +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) # Packaging (make package) #set(CPACK_PACKAGE_VERSION ${STARSH_VERSION}) #set(CPACK_GENERATOR "TGZ") @@ -174,6 +180,25 @@ if(STARPU) endif() endif() +# KBLAS depends on CUDA +if(KBLAS) + set(CUDA ON) +endif() + +# Check CUDA option +if(CUDA) + # If CUDA itself is available + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + add_definitions("-DCUDA") + # If it is not available + else() + set(CUDA OFF) + # Also disable dependent KBLAS option + set(KBLAS OFF) + endif() +endif(CUDA) + # Check if GNU Scientific Library is available (for Matern kernel and # Bessel function) if(GSL) @@ -306,6 +331,17 @@ if(BLA_VENDOR MATCHES "Intel") add_definitions("-DMKL") endif() +if(STARPU AND KBLAS) + add_definitions("-DKBLAS") +# find_package(MAGMA) +# if(MAGMA_FOUND) +# include_directories(${MAGMA_INCLUDE_DIRS}) +# 
link_directories(${MAGMA_LIBRARY_DIRS}) +# add_definitions("-DKBLAS") +# else() +# set(KBLAS OFF) +# endif() +endif() ############################################################################### ## PRINT CONFIGURATION ## diff --git a/Data.md b/Data.md new file mode 100644 index 00000000..6d33e696 --- /dev/null +++ b/Data.md @@ -0,0 +1,18 @@ +# Dataset + +## Mesh Deformation Application + +Dataset is available in KAUST repository: https://repository.kaust.edu.sa/handle/10754/664938. + +DOI:10.25781/KAUST-V2EF2 + +## Acoustic Scattering Application + +Dataset is available in KAUST repository: https://repository.kaust.edu.sa/handle/10754/664400. + + +DOI:10.25781/KAUST-I0634 + +For more information on the dataset please refer to the readme files in the data repositories. + + diff --git a/Jenkinsfile b/Jenkinsfile index 3c57966a..b3b68c44 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -117,5 +117,4 @@ pipeline { } } } - } diff --git a/README.md b/README.md index e968b0bf..613ffcdf 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,16 @@ What is STARS-H? ================ -STARS-H is a **high performance parallel open-source** package of **Software -for Testing Accuracy, Reliability and Scalability of Hierarchical -computations**. It -provides a hierarchical matrix market in order to benchmark performance of -various libraries for hierarchical matrix compressions and computations -(including itself). **Why hierarchical matrices?** Because such matrices arise -in many PDEs and use much less memory, while requiring fewer flops for -computations. There are several hierarchical data formats, each one with its -own performance and memory footprint. STARS-H intends to provide a standard for -assessing accuracy and performance of hierarchical matrix libraries on a given -hardware architecture environment. 
STARS-H currently supports the tile low-rank -(TLR) data format for approximation on shared and distributed-memory systems, -using MPI, OpenMP and task-based programming models. +The Software for Testing Accuracy, Reliability and Scalability of Hierarchical (STARS-H) +computations is a parallel library that provides a high performance matrix market of +rank structured matrix operators. STARS-H supports various matrix kernels that are +proxies for many scientific applications, and optionally compresses them by exploiting +their data sparsity. This translates into a lower arithmetic complexity and memory footprint. +STARS-H intends to provide a standard software environment for assessing accuracy and performance +of š“—-matrix libraries on a given hardware architecture. STARS-H currently supports +the tile low-rank (TLR) data format for approximation on shared and distributed-memory systems, +possibly equipped with GPUs, using MPI, OpenMP and task-based programming models. + Vision of STARS-H ================= @@ -49,7 +46,7 @@ Applications in matrix-free form: 3. Electrodynamics (sin(kr)/r and cos(kr)/r) 4. Random synthetic TLR matrix 5. Spatial statistics (exponential, square exponential and matern kernels) -6. Mesh deformation using radial basis function (gaussian, exponential, inverse quadratic, inverse multi-quadratic, CPTS, and Wendland kernels) +6. Mesh deformation using radial basis functions, i.e., Gaussian, exponential, inverse quadratic, inverse multi-quadratic, CPTS, and Wendland kernels. 7. Acoustic scattering @@ -138,5 +135,11 @@ and have additional steps on approximation of corresponding matrices. *Important notice: the approximation phase does not require the entire dense matrix to be stored, since matrix elements are computed on the fly.* +Dataset +======== + +Please see Data.md for information about the dataset. 
+ + ![Handout](docs/STARS-H-final.png) diff --git a/SARS-CoV-2-meshes/GeneratePopulation.py b/SARS-CoV-2-meshes/GeneratePopulation.py index 596b51cb..843a8e66 100644 --- a/SARS-CoV-2-meshes/GeneratePopulation.py +++ b/SARS-CoV-2-meshes/GeneratePopulation.py @@ -1,4 +1,4 @@ -# @version 1.3.0 +# @version 0.3.0 import pandas as pd import numpy as np diff --git a/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py b/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py index 627fef91..eda0d3b0 100644 --- a/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py +++ b/SARS-CoV-2-meshes/HierarchicalPopulationCluster.py @@ -1,4 +1,4 @@ -# @version 1.3.0 +# @version 0.3.0 import pandas as pd import numpy as np diff --git a/VERSION.txt b/VERSION.txt index f0bb29e7..0d91a54c 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.3.0 +0.3.0 diff --git a/docs/STARS-H-final.png b/docs/STARS-H-final.png index b1873204..0e707d57 100644 Binary files a/docs/STARS-H-final.png and b/docs/STARS-H-final.png differ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index aca37ad9..b2bee36c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/examples/approximation/CMakeLists.txt b/examples/approximation/CMakeLists.txt index c9634b2f..c67529b3 100644 --- a/examples/approximation/CMakeLists.txt +++ b/examples/approximation/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/approximation/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/examples/approximation/dense.c b/examples/approximation/dense.c index 6bf6662d..5ed3c777 100644 --- a/examples/approximation/dense.c +++ b/examples/approximation/dense.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * 
@file examples/approximation/dense.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/minimal.c b/examples/approximation/minimal.c index f41b632e..b8edb580 100644 --- a/examples/approximation/minimal.c +++ b/examples/approximation/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/randtlr.c b/examples/approximation/randtlr.c index 4a5504f3..2dd751a3 100644 --- a/examples/approximation/randtlr.c +++ b/examples/approximation/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/approximation/spatial.c b/examples/approximation/spatial.c index eb4d95c5..525497fd 100644 --- a/examples/approximation/spatial.c +++ b/examples/approximation/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/approximation/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/CMakeLists.txt b/examples/problem/CMakeLists.txt index da6e3349..bd1df186 100644 --- a/examples/problem/CMakeLists.txt +++ b/examples/problem/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file examples/problem/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/examples/problem/acoustic.c b/examples/problem/acoustic.c index 92d4dc30..a925d84a 100644 --- a/examples/problem/acoustic.c +++ b/examples/problem/acoustic.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/acoustic.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab 
Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/dense.c b/examples/problem/dense.c index 87df8179..3e3e53f4 100644 --- a/examples/problem/dense.c +++ b/examples/problem/dense.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/dense.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/minimal.c b/examples/problem/minimal.c index 18a705f6..53595145 100644 --- a/examples/problem/minimal.c +++ b/examples/problem/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/particles.c b/examples/problem/particles.c index 57e1aa28..2c05482f 100644 --- a/examples/problem/particles.c +++ b/examples/problem/particles.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/particles.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/examples/problem/randtlr.c b/examples/problem/randtlr.c index b20136fc..271f1eaa 100644 --- a/examples/problem/randtlr.c +++ b/examples/problem/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/rbf_cube.c b/examples/problem/rbf_cube.c index c8b2120b..74ca8029 100644 --- a/examples/problem/rbf_cube.c +++ b/examples/problem/rbf_cube.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/rbf_cube.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/rbf_virus.c b/examples/problem/rbf_virus.c index 93c37f75..7910a4b2 100644 --- 
a/examples/problem/rbf_virus.c +++ b/examples/problem/rbf_virus.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/rbf_virus.c - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/examples/problem/spatial.c b/examples/problem/spatial.c index 63e62421..046ba35d 100644 --- a/examples/problem/spatial.c +++ b/examples/problem/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/examples/problem/spatial_bivariate.c b/examples/problem/spatial_bivariate.c index 5066a120..02e88d0e 100644 --- a/examples/problem/spatial_bivariate.c +++ b/examples/problem/spatial_bivariate.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file examples/problem/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2020-06-09 * */ diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 3a68910f..c4d21bdd 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Eduardo Gonzalez Fisher # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/include/common.h b/include/common.h index 8d1a37cb..90c4b846 100644 --- a/include/common.h +++ b/include/common.h @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2020-06-09 * */ diff --git a/include/control/init.h b/include/control/init.h index 3682c44e..3908bd99 100644 --- a/include/control/init.h +++ b/include/control/init.h @@ -5,13 +5,13 @@ * University of Science and Technology (KAUST) * * @file src/control/init.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-08-13 * */ //! Set number of backends and default one -#define BACKEND_NUM 6 +#define BACKEND_NUM 9 #define BACKEND_DEFAULT STARSH_BACKEND_SEQUENTIAL #ifdef OPENMP #undef BACKEND_DEFAULT @@ -59,6 +59,21 @@ struct #else {"MPI_STARPU", STARSH_BACKEND_NOTSUPPORTED}, #endif +#if defined(STARPU) && defined(KBLAS) + {"STARPU_KBLAS", STARSH_BACKEND_STARPU_KBLAS}, +#else + {"STARPU_KBLAS", STARSH_BACKEND_NOTSUPPORTED}, +#endif +#if defined(STARPU) && defined(CUDA) + {"STARPU_CUDA", STARSH_BACKEND_STARPU_CUDA}, +#else + {"STARPU_CUDA", STARSH_BACKEND_NOTSUPPORTED}, +#endif +#if defined(STARPU) && defined(MPI) && defined(KBLAS) + {"MPI_STARPU_KBLAS", STARSH_BACKEND_MPI_STARPU_KBLAS}, +#else + {"MPI_STARPU_KBLAS", STARSH_BACKEND_NOTSUPPORTED}, +#endif }; //! Set number of low-rank engines and default one @@ -137,9 +152,40 @@ static STARSH_blrm_approximate *(dlr_starpu_mpi[LRENGINE_NUM]) = #endif }; +//! Array of approximation functions for STARPU_KBLAS backend +static STARSH_blrm_approximate *(dlr_starpu_kblas[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(KBLAS) + starsh_blrm__dsdd_starpu, starsh_blrm__dsdd_starpu, + starsh_blrm__dqp3_starpu, starsh_blrm__drsdd_starpu_kblas2,//3_spatial, + starsh_blrm__drsdd_starpu_kblas3_spatial + #endif +}; + +//! 
Array of approximation functions for STARPU_CUDA backend +static STARSH_blrm_approximate *(dlr_starpu_cuda[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(CUDA) + starsh_blrm__dsdd_starpu, starsh_blrm__dsdd_starpu, + starsh_blrm__dqp3_starpu, starsh_blrm__drsdd_starpu_cuda, + starsh_blrm__drsdd_starpu_cuda + #endif +}; + +//! Array of approximation functions for MPI_STARPU_KBLAS backend +static STARSH_blrm_approximate *(dlr_starpu_mpi_kblas[LRENGINE_NUM]) = +{ + #if defined(STARPU) && defined(MPI) && defined(KBLAS) + starsh_blrm__dsdd_mpi_starpu, starsh_blrm__dsdd_mpi_starpu, + starsh_blrm__dqp3_mpi_starpu, starsh_blrm__drsdd_mpi_starpu_kblas2, + starsh_blrm__drsdd_mpi_starpu_kblas2 + #endif +}; + //! Array of approximation functions, depending on backend static STARSH_blrm_approximate *(*dlr[BACKEND_NUM]) = { - dlr_seq, dlr_omp, dlr_mpi, dlr_mpi, dlr_starpu, dlr_starpu_mpi + dlr_seq, dlr_omp, dlr_mpi, dlr_mpi, dlr_starpu, dlr_starpu_mpi, + dlr_starpu_kblas, dlr_starpu_cuda, dlr_starpu_mpi_kblas, }; diff --git a/include/starsh-acoustic.h b/include/starsh-acoustic.h index 5eff879d..14fd5eb8 100644 --- a/include/starsh-acoustic.h +++ b/include/starsh-acoustic.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-acoustic.h - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-cauchy.h b/include/starsh-cauchy.h index ba129dc4..442ffa88 100644 --- a/include/starsh-cauchy.h +++ b/include/starsh-cauchy.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-minimal.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-constants.h b/include/starsh-constants.h index 091702f1..919aa643 100644 --- a/include/starsh-constants.h +++ b/include/starsh-constants.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file 
include/starsh-constants.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -30,8 +30,14 @@ enum STARSH_BACKEND //!< Hybrid MPI + OpenMP STARSH_BACKEND_STARPU = 4, //!< StarPU (without MPI) - STARSH_BACKEND_MPI_STARPU = 5 + STARSH_BACKEND_MPI_STARPU = 5, //!< StarPU (with MPI) + STARSH_BACKEND_STARPU_KBLAS = 6, + //!< StarPU+KBLAS (without MPI) + STARSH_BACKEND_STARPU_CUDA = 7, + //!< StarPU+CUDA (without MPI) + STARSH_BACKEND_MPI_STARPU_KBLAS = 8, + //!< MPI+StarPU+KBLAS }; //! Enum for low-rank engine (approximation technique) diff --git a/include/starsh-electrodynamics.h b/include/starsh-electrodynamics.h index d796627d..f3334180 100644 --- a/include/starsh-electrodynamics.h +++ b/include/starsh-electrodynamics.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-electrodynamics.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-electrostatics.h b/include/starsh-electrostatics.h index f6fd0b66..f3321c1a 100644 --- a/include/starsh-electrostatics.h +++ b/include/starsh-electrostatics.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-electrostatics.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-minimal.h b/include/starsh-minimal.h index 3fc66c78..7553a76c 100644 --- a/include/starsh-minimal.h +++ b/include/starsh-minimal.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-minimal.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-mpi-starpu-kblas.h b/include/starsh-mpi-starpu-kblas.h new file mode 100644 index 00000000..b9bf6cf2 --- /dev/null +++ b/include/starsh-mpi-starpu-kblas.h @@ -0,0 +1,67 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). 
All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-mpi-starpu-kblas.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_MPI_STARPU_KBLAS_H__ +#define __STARSH_MPI_STARPU_KBLAS_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU && MPI) + +/*! @addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// MATRIX-MATRIX MULTIPLICATION // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup matmul + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. 
+ +//int starsh_blrm__dmml_mpi_starpu(STARSH_blrm *matrix, int nrhs, double alpha, +// double *A, int lda, double beta, double *B, int ldb); +//int starsh_blrm__dmml_mpi_starpu_tlr(STARSH_blrm *matrix, int nrhs, +// double alpha, double *A, int lda, double beta, double *B, int ldb); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_MPI_STARPU_KBLAS_H__ + diff --git a/include/starsh-mpi-starpu.h b/include/starsh-mpi-starpu.h index 74803072..2ba20ec5 100644 --- a/include/starsh-mpi-starpu.h +++ b/include/starsh-mpi-starpu.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-mpi-starpu.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-mpi.h b/include/starsh-mpi.h index d73de6a2..d8d15465 100644 --- a/include/starsh-mpi.h +++ b/include/starsh-mpi.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-mpi.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-particles.h b/include/starsh-particles.h index a4ebcdf6..a7575fe1 100644 --- a/include/starsh-particles.h +++ b/include/starsh-particles.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-particles.h - * @version 1.3.0 + * @version 0.3.0 * @author Sameh Abdulah * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-randtlr.h b/include/starsh-randtlr.h index b52d512f..d1e723c5 100644 --- a/include/starsh-randtlr.h +++ b/include/starsh-randtlr.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-randtlr.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/include/starsh-rbf.h b/include/starsh-rbf.h index c27a6640..dc658260 100644 --- a/include/starsh-rbf.h +++ b/include/starsh-rbf.h @@ -5,7 +5,7 @@ * University of Science and 
Technology (KAUST) * * @file include/starsh-rbf.h - * @version 1.3.0 + * @version 0.3.0 * @auther Rabab Alomairy * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-spatial-gsl.h b/include/starsh-spatial-gsl.h index ce404b03..357bdd8c 100644 --- a/include/starsh-spatial-gsl.h +++ b/include/starsh-spatial-gsl.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-spatial-gsl.h - * @version 1.3.0 + * @version 0.3.0 * @author sameh Abdulah * @author Aleksandr Mikhalev * @date 2020-06-09 diff --git a/include/starsh-spatial.h b/include/starsh-spatial.h index 949ca5f7..df3f4e78 100644 --- a/include/starsh-spatial.h +++ b/include/starsh-spatial.h @@ -275,4 +275,10 @@ void starsh_ssdata_block_parsimonious2_kernel_2d_simd_gcd(int nrows, int ncols, // defined #include "starsh-spatial-gsl.h" +// Add function that copies data to GPU +#ifdef CUDA +void starsh_ssdata_togpu(STARSH_ssdata **dest, STARSH_ssdata *src); +void starsh_ssdata_free_gpu(STARSH_ssdata *data); +#endif // CUDA + #endif // __STARSH_SPATIAL_H__ diff --git a/include/starsh-starpu-cuda.h b/include/starsh-starpu-cuda.h new file mode 100644 index 00000000..c5320156 --- /dev/null +++ b/include/starsh-starpu-cuda.h @@ -0,0 +1,66 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-starpu-cuda.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_STARPU_CUDA_H__ +#define __STARSH_STARPU_CUDA_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU) + +/*! 
@addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_cuda(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// LOW-RANK ROUTINES FOR DENSE // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup lrdense + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//void starsh_dense_dlrsdd_starpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_cuda_cpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_cuda_gpu(void *buffers[], void *cl_arg); +//void starsh_dense_dlrqp3_starpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_cuda_cpu(void *buffers[], void *cl_arg); +//void starsh_dense_dgemm_starpu(void *buffers[], void *cl_arg); +//void starsh_dense_fake_init_starpu(void *buffers[], void *cl_arg); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_STARPU_CUDA_H__ + diff --git a/include/starsh-starpu-kblas.h b/include/starsh-starpu-kblas.h new file mode 100644 index 00000000..a55bd6fa --- /dev/null +++ b/include/starsh-starpu-kblas.h @@ -0,0 +1,74 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file include/starsh-starpu-kblas.h + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifndef __STARSH_STARPU_KBLAS_H__ +#define __STARSH_STARPU_KBLAS_H__ + + +/////////////////////////////////////////////////////////////////////////////// +// APPROXIMATIONS // +/////////////////////////////////////////////////////////////////////////////// + +// Check if this is enabled in Doxygen +//! @cond (STARPU) + +/*! @addtogroup approximations + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. + +//int starsh_blrm__dsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +int starsh_blrm__drsdd_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly); +//int starsh_blrm__dqp3_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); +//int starsh_blrm__dna_starpu(STARSH_blrm **matrix, STARSH_blrf *format, +// int maxrank, double tol, int onfly); + +//! @} +// End of group + + +/////////////////////////////////////////////////////////////////////////////// +// LOW-RANK ROUTINES FOR DENSE // +/////////////////////////////////////////////////////////////////////////////// + +/*! @addtogroup lrdense + * @{ + * */ +// This will automatically include all entities between @{ and @} into group. 
+ +//void starsh_dense_dlrsdd_starpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas_cpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas_gpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas2_gpu(void *buffers[], void *cl_arg); +void starsh_dense_dlrrsdd_starpu_kblas2_getrank(void *buffers[], void *cl_arg); +//void starsh_dense_dlrqp3_starpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas_cpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas2_cpu(void *buffers[], void *cl_arg); +void starsh_dense_kernel_starpu_kblas3_gpu(void *buffers[], void *cl_arg); +//void starsh_dense_dgemm_starpu(void *buffers[], void *cl_arg); +//void starsh_dense_fake_init_starpu(void *buffers[], void *cl_arg); + +//! @} +// End of group + +//! @endcond +// End of condition + +#endif // __STARSH_STARPU_KBLAS_H__ + diff --git a/include/starsh-starpu.h b/include/starsh-starpu.h index 7792e1e1..a943cb3e 100644 --- a/include/starsh-starpu.h +++ b/include/starsh-starpu.h @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file include/starsh-starpu.h - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/misc_scripts/code_generation/applications/particles/kernel_nd.py b/misc_scripts/code_generation/applications/particles/kernel_nd.py index 743bcd8c..3140cd83 100644 --- a/misc_scripts/code_generation/applications/particles/kernel_nd.py +++ b/misc_scripts/code_generation/applications/particles/kernel_nd.py @@ -6,7 +6,7 @@ University of Science and Technology (KAUST) @file misc_scripts/code_generation/applications/particles/kernel_nd.py - @version 1.3.0 + @version 0.3.0 @author Aleksandr Mikhalev @date 2017-08-22 """ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fce217b2..4582b2c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file 
src/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 @@ -40,6 +40,14 @@ if(OPENMP) endif(OPENMP) if(STARPU) target_link_libraries(starsh PUBLIC ${STARPU_LIBRARIES}) + if(KBLAS) + target_link_libraries(starsh PUBLIC cublas_static cudart_static + culibos cusparse_static stdc++ kblas-gpu dl rt) + endif(KBLAS) + if(CUDA) + target_link_libraries(starsh PUBLIC cublas_static cudart_static + cusolver_static curand_static culibos stdc++ dl rt) + endif(CUDA) endif(STARPU) if(GSL_FOUND) target_link_libraries(starsh PUBLIC ${GSL_LIBRARIES}) diff --git a/src/applications/CMakeLists.txt b/src/applications/CMakeLists.txt index 08b1f8c2..a9dcfdca 100644 --- a/src/applications/CMakeLists.txt +++ b/src/applications/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2020-06-09 diff --git a/src/applications/cauchy.c b/src/applications/cauchy.c index 98b0e42b..f402e632 100644 --- a/src/applications/cauchy.c +++ b/src/applications/cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/common.c b/src/applications/common.c index e688c19a..6de86173 100644 --- a/src/applications/common.c +++ b/src/applications/common.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/common.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/electrodynamics.c b/src/applications/electrodynamics.c index b8a873b3..0966c226 100644 --- a/src/applications/electrodynamics.c +++ b/src/applications/electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/electrodynamics.c - * @version 1.3.0 + * 
@version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrodynamics/CMakeLists.txt b/src/applications/electrodynamics/CMakeLists.txt index a0f1f485..5cd4b813 100644 --- a/src/applications/electrodynamics/CMakeLists.txt +++ b/src/applications/electrodynamics/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/electrodynamics/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/applications/electrodynamics/kernel_cos.c b/src/applications/electrodynamics/kernel_cos.c index 54062870..8ddc75ee 100644 --- a/src/applications/electrodynamics/kernel_cos.c +++ b/src/applications/electrodynamics/kernel_cos.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/electrodynamics/kernel_cos.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrodynamics/kernel_sin.c b/src/applications/electrodynamics/kernel_sin.c index d8d9114e..86fcddea 100644 --- a/src/applications/electrodynamics/kernel_sin.c +++ b/src/applications/electrodynamics/kernel_sin.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. 
* * @file src/applications/electrodynamics/kernel_sin.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrostatics.c b/src/applications/electrostatics.c index 3158f740..475f221e 100644 --- a/src/applications/electrostatics.c +++ b/src/applications/electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/electrostatics/CMakeLists.txt b/src/applications/electrostatics/CMakeLists.txt index bbeb1942..88a35453 100644 --- a/src/applications/electrostatics/CMakeLists.txt +++ b/src/applications/electrostatics/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/electrostatics/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/applications/electrostatics/kernel_coulomb_potential.c b/src/applications/electrostatics/kernel_coulomb_potential.c index d5eb6c99..9d1930df 100644 --- a/src/applications/electrostatics/kernel_coulomb_potential.c +++ b/src/applications/electrostatics/kernel_coulomb_potential.c @@ -13,7 +13,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/electrostatics/kernel_coulomb_potential.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/src/applications/mesh_deformation/cube.c b/src/applications/mesh_deformation/cube.c index 805eaca0..c56c98f0 100644 --- a/src/applications/mesh_deformation/cube.c +++ b/src/applications/mesh_deformation/cube.c @@ -11,7 +11,7 @@ * STARS-H, simply do substitutions yourself. 
* * @file src/applications/mesh_deformation/cube.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @date 2020-06-09 */ diff --git a/src/applications/mesh_deformation/kernels_rbf.c b/src/applications/mesh_deformation/kernels_rbf.c index f8c4927f..9e6d7d0c 100644 --- a/src/applications/mesh_deformation/kernels_rbf.c +++ b/src/applications/mesh_deformation/kernels_rbf.c @@ -11,7 +11,7 @@ * STARS-H, simply do substitutions yourself. * * @file src/applications/mesh_deformation/cube.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @date 2020-06-09 */ diff --git a/src/applications/minimal.c b/src/applications/minimal.c index 9bce1c4d..5768d34c 100644 --- a/src/applications/minimal.c +++ b/src/applications/minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/randtlr.c b/src/applications/randtlr.c index c03ac848..039a3440 100644 --- a/src/applications/randtlr.c +++ b/src/applications/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/applications/randtlr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/applications/spatial.c b/src/applications/spatial.c index adf44505..d68ed5be 100644 --- a/src/applications/spatial.c +++ b/src/applications/spatial.c @@ -1828,3 +1828,39 @@ void starsh_ssdata_block_parsimonious2_kernel_2d_simd(int nrows, int ncols, } #endif // GSL +#ifdef CUDA +void starsh_ssdata_togpu(STARSH_ssdata **dest, STARSH_ssdata *src) +{ + void *dest_points; + size_t points_size = sizeof(double) * src->particles.ndim * + src->particles.count; + //printf("COPY to GPU: %zu bytes\n", points_size); + cudaError_t err = cudaSuccess; + err = cudaMalloc(&dest_points, points_size); + if(err != cudaSuccess) + printf("cudaMalloc error\n"); + //printf("points address: %p\n", 
dest_points); + err = cudaMemcpy(dest_points, src->particles.point, points_size, + cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("cudaMemcpy error\n"); + STARSH_ssdata tmp; + tmp = *src; + tmp.particles.point = dest_points; + err = cudaMalloc(dest, sizeof(STARSH_ssdata)); + if(err != cudaSuccess) + printf("cudaMalloc error\n"); + err = cudaMemcpy(*dest, &tmp, sizeof(STARSH_ssdata), cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("cudaMemcpy error\n"); + //printf("Succesfully copied into GPU\n"); +} + +void starsh_ssdata_free_gpu(STARSH_ssdata *data) +{ + STARSH_ssdata tmp; + cudaMemcpy(&tmp, data, sizeof(STARSH_ssdata), cudaMemcpyDeviceToHost); + cudaFree(tmp.particles.point); + cudaFree(data); +} +#endif // CUDA diff --git a/src/applications/spatial/CMakeLists.txt b/src/applications/spatial/CMakeLists.txt index d917cd21..8645296b 100644 --- a/src/applications/spatial/CMakeLists.txt +++ b/src/applications/spatial/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/applications/spatial/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/CMakeLists.txt b/src/backends/CMakeLists.txt index c51fda81..85d56b1a 100644 --- a/src/backends/CMakeLists.txt +++ b/src/backends/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 # Add sequential code @@ -19,11 +19,28 @@ set(BACKENDS_OBJECTS $) # List StarPU sources for docs or build if(STARPU OR DOCS STREQUAL "FULL") add_subdirectory("starpu") + if(KBLAS OR DOCS STREQUAL "FULL") + add_subdirectory("starpu_kblas") + add_subdirectory("starpu_kblas2") + add_subdirectory("starpu_kblas3_spatial") + endif() + if(CUDA OR DOCS STREQUAL "FULL") + add_subdirectory("starpu_cuda") + endif() endif() # Add StarPU backend if(STARPU) list(APPEND BACKENDS_OBJECTS $) 
+ if(KBLAS) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS + $) + endif() + if(CUDA) + list(APPEND BACKENDS_OBJECTS $) + endif() endif() # List OpenMP sources for docs or build @@ -46,7 +63,7 @@ if(MPI) list(APPEND BACKENDS_OBJECTS $) endif() -# List MPI sources for docs or build +# List MPI+STARPU sources for docs or build if((MPI AND STARPU) OR DOCS STREQUAL "FULL") add_subdirectory("mpi_starpu") endif() @@ -56,6 +73,21 @@ if(MPI AND STARPU) list(APPEND BACKENDS_OBJECTS $) endif() +# List MPI+STARPU+KBLAS sources for docs or build +if((MPI AND STARPU AND KBLAS) OR DOCS STREQUAL "FULL") + add_subdirectory("mpi_starpu_kblas") + add_subdirectory("mpi_starpu_kblas2") + add_subdirectory("mpi_starpu_kblas3_spatial") +endif() + +# Add MPI+StarPU+KBLAS backend +if(MPI AND STARPU AND KBLAS) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS $) + list(APPEND BACKENDS_OBJECTS + $) +endif() + # Move all selected backends to parent directory set(BACKENDS_OBJECTS ${BACKENDS_OBJECTS} PARENT_SCOPE) diff --git a/src/backends/mpi/CMakeLists.txt b/src/backends/mpi/CMakeLists.txt index cb64cb0c..bc4d26aa 100644 --- a/src/backends/mpi/CMakeLists.txt +++ b/src/backends/mpi/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi/blrm/CMakeLists.txt b/src/backends/mpi/blrm/CMakeLists.txt index 7b433f34..6b3344c3 100644 --- a/src/backends/mpi/blrm/CMakeLists.txt +++ b/src/backends/mpi/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi/blrm/dfe.c b/src/backends/mpi/blrm/dfe.c index 7f6c00e2..d568cdb4 100644 --- a/src/backends/mpi/blrm/dfe.c +++ 
b/src/backends/mpi/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -144,6 +144,8 @@ double starsh_blrm__dfe_mpi(STARSH_blrm *matrix) value[1] *= value[1]; double mpi_value[2] = {0, 0}; MPI_Allreduce(&value, &mpi_value, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); return sqrt(mpi_value[0]/mpi_value[1]); } diff --git a/src/backends/mpi/blrm/dmml.c b/src/backends/mpi/blrm/dmml.c index c0a2d1ca..00da8c81 100644 --- a/src/backends/mpi/blrm/dmml.c +++ b/src/backends/mpi/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dmml.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/dna.c b/src/backends/mpi/blrm/dna.c index e0cb0642..db6550ca 100644 --- a/src/backends/mpi/blrm/dna.c +++ b/src/backends/mpi/blrm/dna.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dna.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/dqp3.c b/src/backends/mpi/blrm/dqp3.c index ed088c8f..7eff81ec 100644 --- a/src/backends/mpi/blrm/dqp3.c +++ b/src/backends/mpi/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi/blrm/drsdd.c b/src/backends/mpi/blrm/drsdd.c index 3309239c..4a5ee9a1 100644 --- a/src/backends/mpi/blrm/drsdd.c +++ b/src/backends/mpi/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ 
-287,7 +287,7 @@ int starsh_blrm__drsdd_mpi(STARSH_blrm **matrix, STARSH_blrf *format, starsh_blrf_free(F2); } // Compute near-field blocks if needed - if(onfly == 0 && new_nblocks_near > 0) + if(onfly == 0 && new_nblocks_near_local > 0) { STARSH_MALLOC(near_D, new_nblocks_near_local); size_t size_D = 0; diff --git a/src/backends/mpi/blrm/dsdd.c b/src/backends/mpi/blrm/dsdd.c index b2bc8b35..be370e48 100644 --- a/src/backends/mpi/blrm/dsdd.c +++ b/src/backends/mpi/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/CMakeLists.txt b/src/backends/mpi_starpu/CMakeLists.txt index 2f66051e..92531ec6 100644 --- a/src/backends/mpi_starpu/CMakeLists.txt +++ b/src/backends/mpi_starpu/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi_starpu/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi_starpu/blrm/CMakeLists.txt b/src/backends/mpi_starpu/blrm/CMakeLists.txt index 9d3b9845..0b72341b 100644 --- a/src/backends/mpi_starpu/blrm/CMakeLists.txt +++ b/src/backends/mpi_starpu/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/mpi_starpu/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/mpi_starpu/blrm/dmml.c b/src/backends/mpi_starpu/blrm/dmml.c index 3f8d3bdc..0d3a2845 100644 --- a/src/backends/mpi_starpu/blrm/dmml.c +++ b/src/backends/mpi_starpu/blrm/dmml.c @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/blrm/dqp3.c b/src/backends/mpi_starpu/blrm/dqp3.c index d5109645..4ee38d45 100644 --- a/src/backends/mpi_starpu/blrm/dqp3.c +++ b/src/backends/mpi_starpu/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu/blrm/drsdd.c b/src/backends/mpi_starpu/blrm/drsdd.c index da761e7c..1e9b82d6 100644 --- a/src/backends/mpi_starpu/blrm/drsdd.c +++ b/src/backends/mpi_starpu/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -283,7 +283,7 @@ int starsh_blrm__drsdd_mpi_starpu(STARSH_blrm **matrix, STARSH_blrf *format, starsh_blrf_free(F2); } // Compute near-field blocks if needed - if(onfly == 0 && new_nblocks_near > 0) + if(onfly == 0 && new_nblocks_near_local > 0) { STARSH_int nbi_value[new_nblocks_near_local]; starpu_data_handle_t D_handle[new_nblocks_near_local]; diff --git a/src/backends/mpi_starpu/blrm/dsdd.c b/src/backends/mpi_starpu/blrm/dsdd.c index c389e5a2..659a07d4 100644 --- a/src/backends/mpi_starpu/blrm/dsdd.c +++ b/src/backends/mpi_starpu/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/mpi_starpu/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/mpi_starpu_kblas/CMakeLists.txt b/src/backends/mpi_starpu_kblas/CMakeLists.txt new file mode 100644 index 00000000..7bfec5b1 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas PROPERTIES COMPILE_FLAGS + "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas/blrm/drsdd.c b/src/backends/mpi_starpu_kblas/blrm/drsdd.c new file mode 100644 index 00000000..a55560a9 --- /dev/null +++ b/src/backends/mpi_starpu_kblas/blrm/drsdd.c @@ -0,0 +1,682 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/mpi_starpu_kblas/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-mpi-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + //double time0 = MPI_Wtime(); + //cublasCreate(&cublas_handles[id]); + //double time1 = MPI_Wtime(); + kblasCreate(&kblas_handles[id]); + //double timek = MPI_Wtime(); + //printf("CUBLAS: %f, KBLAS: %f\n", time1-time0, timek-time1); + //return; + kblasSetStream(kblas_handles[id], stream); + //double time2 = MPI_Wtime(); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + //double time3 = MPI_Wtime(); + kblasAllocateWorkspace(kblas_handles[id]); + //double time4 = MPI_Wtime(); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + //double time5 = MPI_Wtime(); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + //double time6 = MPI_Wtime(); + work[id] = malloc(nsamples*maxbatch*sizeof(double)); + //double time7 = MPI_Wtime(); + iwork[id] = malloc(maxbatch*sizeof(int)); + //double time8 = MPI_Wtime(); + cudaStreamSynchronize(stream); + //double time9 = MPI_Wtime(); + //printf("KBLAS INIT: %f %f %f %f %f\n", time1-time0, time2-time1, time3-time2, time4-time3, time5-time4); + //printf("KBLAS INIT: %f %f %f %f\n", time6-time5, 
time7-time6, time8-time7, time9-time8); +} + +static void init_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + work[id] = malloc(lwork*sizeof(*work[0])); + iwork[id] = malloc(liwork*sizeof(*iwork[0])); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + double **work; + int **iwork; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + free(work[id]); + free(iwork[id]); +} + +static void deinit_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + free(work[id]); + free(iwork[id]); +} + +static void empty_cpu_func(void *buffer[], void *cl_arg) +{ +} + +void starsh_dense_kernel_mpi_starpu_kblas_cpu_far(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_far[k*2]; + int j = F->block_far[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +void starsh_dense_kernel_mpi_starpu_kblas_cpu_near(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_near[k*2]; + int j = F->block_near[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +int starsh_blrm__drsdd_mpi_starpu_kblas(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. 
+ * @ingroup blrm + * */ +{ + double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + double *work[workers]; + int *iwork[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + double **wwork = 
work; + int **wiwork = iwork; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu, *args_cpu; + size_t args_gpu_size = 0; + size_t args_cpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + int batch_size = 100; + // Ceil number of batches + int nbatches_local = (nblocks_far_local-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + // Get size of temporary arrays + int lwork = nb; + int lwork_sdd = (4*mn+7) * mn; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn*(3*nb+mn+1) + nb*nb; + int liwork = 8 * mn; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_codelet_pack_args(&args_cpu, &args_cpu_size, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &liwork, sizeof(liwork), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(init_starpu_cpu, args_cpu, STARPU_CPU); + MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel_far = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas_cpu_far}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct 
starpu_codelet codelet_kernel_near = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas_cpu_near}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_cpu = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_gpu = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_get_data_back_to_cpu = + { + .cpu_funcs = {empty_cpu_func}, + .nbuffers = 1, + .modes = {STARPU_R}, + }; + // Select if ONLY cpu or gpu + if(getenv("STARSH_KBLAS_CPU")) + codelet_lowrank = codelet_lowrank_cpu; + else if(getenv("STARSH_KBLAS_GPU")) + codelet_lowrank = codelet_lowrank_gpu; + starpu_data_handle_t rank_handle[nbatches_local]; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + 
STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + STARSH_int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_far_local + lbi*batch_size), + this_batch_size, sizeof(*block_far_local)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("REGISTER DATA IN: %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_far, + 
STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(index_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + STARSH_int nbatches_once = nbatches_local; + for(STARSH_int batch_start = 0; batch_start < nbatches_local; + batch_start += nbatches_once) + { + STARSH_int batch_end = batch_start + nbatches_once; + if(batch_end > nbatches_local) + batch_end = nbatches_local; + for(bi = batch_start; bi < batch_end; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far_local - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_R, D_handle[bi], + STARPU_SCRATCH, Dcopy_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + 0); + starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, U_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, V_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, rank_handle[bi], + 0); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + 
starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + starpu_task_wait_for_all(); + } + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = 
nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, 
block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + int shape[] = {nb, nb}; + // For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_near_local + lbi*batch_size), + this_batch_size, sizeof(*block_near_local)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_near, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + 
// Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(deinit_starpu_cpu, args_cpu, STARPU_CPU); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/mpi_starpu_kblas2/CMakeLists.txt b/src/backends/mpi_starpu_kblas2/CMakeLists.txt new file mode 100644 index 00000000..3326eae6 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). 
All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas2 OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas2 PROPERTIES COMPILE_FLAGS + "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas2/blrm/drsdd.c b/src/backends/mpi_starpu_kblas2/blrm/drsdd.c new file mode 100644 index 00000000..f209ebf0 --- /dev/null +++ b/src/backends/mpi_starpu_kblas2/blrm/drsdd.c @@ -0,0 +1,600 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/mpi_starpu_kblas/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-mpi-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); +} + +static void starsh_dense_kernel_mpi_starpu_kblas2_cpu_far(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_far[k*2]; + int j = F->block_far[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +static void starsh_dense_kernel_mpi_starpu_kblas2_cpu_near(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int k = ind[ibatch]; + int i = F->block_near[k*2]; + int j = F->block_near[k*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + +int starsh_blrm__drsdd_mpi_starpu_kblas2(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. 
+ * @ingroup blrm + * */ +{ + //printf("MPIKBLAS2\n"); + //double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + //printf("MAIN: 
%p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 100; + if(env_var) + batch_size = atoi(env_var); + //printf("MPIKBLAS2: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches_local = (nblocks_far_local-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel_far = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas2_cpu_far}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_kernel_near = + { + .cpu_funcs = {starsh_dense_kernel_mpi_starpu_kblas2_cpu_near}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 4, + .modes = 
{STARPU_R, STARPU_R, STARPU_R, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + starpu_data_handle_t S_handle[nbatches_local]; + starpu_data_handle_t rank_handle[nbatches_local]; + // Init buffers to store low-rank factors of far-field blocks if needed + MPI_Barrier(MPI_COMM_WORLD); + double time0 = MPI_Wtime(); + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + size_t size_D = nblocks_far_local * nb * nb; + size_t size_S = nblocks_far_local * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + starpu_memory_pin(alloc_U, size_U*sizeof(double)); + starpu_memory_pin(alloc_V, size_V*sizeof(double)); + starpu_malloc(&alloc_S, size_S*sizeof(double)); + starpu_malloc(&alloc_D, size_D*sizeof(double)); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + STARSH_int offset_S = lbi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + double *S = alloc_S + offset_S; + STARSH_int offset_D = lbi * batch_size * nb * nb; + double *D = alloc_D + offset_D; + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = 
this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + STARSH_int S_size = this_batch_size * mn; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(double)); + starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_far_local + lbi*batch_size), + this_batch_size, sizeof(*block_far_local)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + starpu_vector_data_register(S_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(S), S_size, sizeof(double)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("MPIKBLAS2: pin memory in %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_far, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[lbi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, 
&cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[lbi], + STARPU_SCRATCH, Dcopy_handle[lbi], + STARPU_W, U_handle[lbi], + STARPU_W, V_handle[lbi], + STARPU_W, S_handle[lbi], + STARPU_PRIORITY, 0, + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(Dcopy_handle[lbi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, U_handle[lbi], + STARPU_R, V_handle[lbi], + STARPU_R, S_handle[lbi], + STARPU_W, rank_handle[lbi], + STARPU_PRIORITY, -1, + 0); + starpu_data_unregister_submit(rank_handle[lbi]); + starpu_data_unregister_submit(U_handle[lbi]); + starpu_data_unregister_submit(V_handle[lbi]); + starpu_data_unregister_submit(S_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + if(nbatches_local > 0) + { + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + starpu_free(alloc_D); + starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + starpu_free(alloc_S); + } + MPI_Barrier(MPI_COMM_WORLD); + //if(mpi_rank == 0) + // printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // MPI_Wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + //far_rank[lbi] = -1; + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work 
normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 
0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + int shape[] = {nb, nb}; + // 
For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(block_near_local + lbi*batch_size), + this_batch_size, sizeof(*block_near_local)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel_near, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + 
STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt b/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt new file mode 100644 index 00000000..bf4bac8b --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/CMakeLists.txt @@ -0,0 +1,25 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu_kblas/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") + +# If compilation is requried +if(MPI AND STARPU AND KBLAS) + add_library(backends_mpi_starpu_kblas3_spatial OBJECT ${SRC}) + set_target_properties(backends_mpi_starpu_kblas3_spatial PROPERTIES + COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt b/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt new file mode 100644 index 00000000..5d890a91 --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/blrm/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/mpi_starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dfe.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dna.c" + PARENT_SCOPE) diff --git a/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c b/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c new file mode 100644 index 00000000..7a363f2e --- /dev/null +++ b/src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c @@ -0,0 +1,630 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * 
+ * STARS-H is a software package, provided by King Abdullah 
+ * University of Science and Technology (KAUST) 
+ * 
+ * @file src/backends/mpi_starpu_kblas3_spatial/blrm/drsdd.c 
+ * @version 0.3.0 
+ * @author Aleksandr Mikhalev 
+ * @date 2017-11-07 
+ * */ 
+ 
+#include "common.h" 
+#include "starsh.h" 
+#include "starsh-starpu-kblas.h" 
+#include "starsh-mpi-starpu-kblas.h" 
+#include "starsh-spatial.h" 
+#include 
+#include 
+#include 
+#include "batch_rand.h" 
+#include 
+#include 
+ 
+static void init_starpu_kblas(void *args) 
+{ 
+ cublasHandle_t *cublas_handles; 
+ kblasHandle_t *kblas_handles; 
+ kblasRandState_t *kblas_states; 
+ STARSH_ssdata **data_gpu; 
+ STARSH_ssdata *data_cpu; 
+ //double time0 = MPI_Wtime(); 
+ cudaStream_t stream = starpu_cuda_get_local_stream(); 
+ int nb, nsamples, maxbatch; 
+ starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, 
+ &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); 
+ int id = starpu_worker_get_id(); 
+ cublasStatus_t status; 
+ //printf("unpack_args: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ kblasCreate(&kblas_handles[id]); 
+ //printf("kblasCreate: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ kblasSetStream(kblas_handles[id], stream); 
+ kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); 
+ kblasAllocateWorkspace(kblas_handles[id]); 
+ //printf("kblasAllocateWorkspace: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+ cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); 
+ kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); 
+ starsh_ssdata_togpu(&data_gpu[id], data_cpu); 
+ cudaStreamSynchronize(stream); 
+ //printf("starsh_ssdata_togpu: %f seconds\n", MPI_Wtime()-time0); 
+ //time0 = MPI_Wtime(); 
+} 
+ 
+static void deinit_starpu_kblas(void *args) 
+{ 
+ int nb, nsamples, maxbatch; 
+ cublasHandle_t *cublas_handles; 
+ kblasHandle_t *kblas_handles; 
+ kblasRandState_t *kblas_states; 
+ STARSH_ssdata **data_gpu; 
+ STARSH_ssdata *data_cpu; 
+ starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + starsh_ssdata_free_gpu(data_gpu[id]); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); +} + +static void starsh_dense_dlrrsdd_starpu_kblas3_copy(void *buffers[], void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(STARSH_blrm **matrix, + STARSH_blrf *format, int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + double time_start = MPI_Wtime(); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + STARSH_int nblocks_far_local = F->nblocks_far_local; + STARSH_int nblocks_near_local = F->nblocks_near_local; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = F->nblocks_far; + STARSH_int new_nblocks_near = F->nblocks_near; + STARSH_int new_nblocks_far_local = F->nblocks_far_local; + STARSH_int new_nblocks_near_local = F->nblocks_near_local; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + STARSH_int *block_far_local = F->block_far_local; + STARSH_int *block_near_local = F->block_near_local; + // Temporary holder for indexes of tiles + STARSH_int *tile_index = NULL; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int lbi, lbj, bi, bj = 0; + const int oversample = starsh_params.oversample; + // MPI + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + //if(mpi_rank == 0) + // printf("MPIKBLAS3\n"); + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + STARSH_ssdata *data_gpu_array[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + STARSH_ssdata **data_gpu = data_gpu_array; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches_local = 
(nblocks_far_local-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &RD, sizeof(RD), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("CUBLAS + WORKSPACE ALLOCATION: %f seconds\n", time0-time_start); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cuda_funcs = {starsh_dense_kernel_starpu_kblas3_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas3_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches_local]; + //starpu_data_handle_t Dcopy_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + 
starpu_data_handle_t U_handle[nbatches_local]; + starpu_data_handle_t V_handle[nbatches_local]; + //starpu_data_handle_t S_handle[nbatches_local]; + starpu_data_handle_t rank_handle[nbatches_local]; + starpu_data_handle_t D_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + // Init buffers to store low-rank factors of far-field blocks if needed + MPI_Barrier(MPI_COMM_WORLD); + //double time0 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: init in %f seconds\n", time0-time_start); + if(nbatches_local > 0) + { + STARSH_MALLOC(far_U, nblocks_far_local); + STARSH_MALLOC(far_V, nblocks_far_local); + STARSH_MALLOC(far_rank, nblocks_far_local); + size_t size_U = nblocks_far_local * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far_local * nb * nb; + //size_t size_S = nblocks_far_local * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + int shape[] = {nb, maxrank}; + for(lbi = 0; lbi < nblocks_far_local; ++lbi) + { + STARSH_int offset = lbi * nb * maxrank; + array_from_buffer(far_U+lbi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+lbi, 2, shape, 'd', 'F', alloc_V+offset); + } + starpu_malloc(&tile_index, 2*nblocks_far_local*sizeof(*tile_index)); + for(bi = 0; bi < nblocks_far_local; ++bi) + { + STARSH_int ind = block_far_local[bi]; + tile_index[2*bi] = block_far[2*ind]; + tile_index[2*bi+1] = block_far[2*ind+1]; + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + STARSH_int offset = lbi * batch_size * nb * maxrank; + //STARSH_int offset_S = lbi * batch_size * mn; + double *U = alloc_U + offset; + double *V = 
alloc_V + offset; + //double *S = alloc_S + offset_S; + //STARSH_int offset_D = lbi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int S_size = this_batch_size * mn; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + lbi*batch_size), this_batch_size, + sizeof(*far_rank)); + //starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + // (uintptr_t)(D), D_size, sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+lbi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(tile_index + 2*lbi*batch_size), + 2*this_batch_size, sizeof(*tile_index)); + starpu_vector_data_register(U_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + //starpu_vector_data_register(S_handle+lbi, STARPU_MAIN_RAM, + // (uintptr_t)(S), S_size, sizeof(double)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + } + MPI_Barrier(MPI_COMM_WORLD); + //double time1 = MPI_Wtime(); + //if(mpi_rank == 
0) + // printf("MPIKBLAS3: Register data in %f seconds\n", time1-time0); + //time0 = time1; + // Work variables + int info; + // START MEASURING TIME + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far_local - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi % nworkers_gpu], + STARPU_R, index_handle[lbi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[lbi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[lbi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[lbi % nworkers_gpu], + STARPU_W, tmp_U_handle[lbi % nworkers_gpu], + STARPU_W, tmp_V_handle[lbi % nworkers_gpu], + STARPU_W, tmp_S_handle[lbi % nworkers_gpu], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(D_handle[lbi]); + //starpu_data_unregister_submit(Dcopy_handle[lbi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[lbi % nworkers_gpu], + STARPU_R, tmp_V_handle[lbi % nworkers_gpu], + STARPU_R, tmp_S_handle[lbi % nworkers_gpu], + STARPU_W, rank_handle[lbi], + STARPU_W, U_handle[lbi], + STARPU_W, V_handle[lbi], + STARPU_PRIORITY, -1, + 0); 
+ starpu_data_unregister_submit(rank_handle[lbi]); + starpu_data_unregister_submit(U_handle[lbi]); + starpu_data_unregister_submit(V_handle[lbi]); + //starpu_data_unregister_submit(S_handle[lbi]); + } + starpu_task_wait_for_all(); + MPI_Barrier(MPI_COMM_WORLD); + //time1 = MPI_Wtime(); + //if(mpi_rank == 0) + // printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + if(nbatches_local > 0) + { + //size_t size_U = nblocks_far_local * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + starpu_free(tile_index); + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + } + //MPI_Barrier(MPI_COMM_WORLD); + //if(mpi_rank == 0) + // printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // MPI_Wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far_local = 0; + STARSH_int *false_far_local = NULL; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + //far_rank[lbi] = -1; + if(far_rank[lbi] == -1) + nblocks_false_far_local++; + } + if(nblocks_false_far_local > 0) + { + // IMPORTANT: `false_far` and `false_far_local` must be in + // ascending order for later code to work normally + STARSH_MALLOC(false_far_local, nblocks_false_far_local); + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + if(far_rank[lbi] == -1) + false_far_local[lbj++] = block_far_local[lbi]; + } + // Sync list of all false far-field blocks + STARSH_int nblocks_false_far = 0; + int int_nblocks_false_far_local = nblocks_false_far_local; + int *mpi_recvcount, *mpi_offset; + STARSH_MALLOC(mpi_recvcount, mpi_size); + STARSH_MALLOC(mpi_offset, mpi_size); + 
MPI_Allgather(&int_nblocks_false_far_local, 1, MPI_INT, mpi_recvcount, + 1, MPI_INT, MPI_COMM_WORLD); + for(bi = 0; bi < mpi_size; bi++) + nblocks_false_far += mpi_recvcount[bi]; + mpi_offset[0] = 0; + for(bi = 1; bi < mpi_size; bi++) + mpi_offset[bi] = mpi_offset[bi-1]+mpi_recvcount[bi-1]; + STARSH_int *false_far = NULL; + if(nblocks_false_far > 0) + STARSH_MALLOC(false_far, nblocks_false_far); + MPI_Allgatherv(false_far_local, nblocks_false_far_local, my_MPI_SIZE_T, + false_far, mpi_recvcount, mpi_offset, my_MPI_SIZE_T, + MPI_COMM_WORLD); + free(mpi_recvcount); + free(mpi_offset); + // Make false_far be in ascending order + qsort(false_far, nblocks_false_far, sizeof(*false_far), cmp_size_t); + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + new_nblocks_near_local = nblocks_near_local+nblocks_false_far_local; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + if(new_nblocks_near_local > 0) + STARSH_MALLOC(block_near_local, new_nblocks_near_local); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + for(lbi = 0; lbi < nblocks_near_local; lbi++) + block_near_local[lbi] = F->block_near_local[lbi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + bi = 0; + for(lbi = 0; lbi < nblocks_false_far_local; lbi++) + { + lbj = false_far_local[lbi]; + while(bi < nblocks_false_far && false_far[bi] < lbj) + bi++; + block_near_local[nblocks_near_local+lbi] = nblocks_near+bi; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + new_nblocks_far_local = nblocks_far_local-nblocks_false_far_local; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + if(new_nblocks_far_local > 
0) + STARSH_MALLOC(block_far_local, new_nblocks_far_local); + bj = 0; + lbi = 0; + lbj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + if(nblocks_false_far_local > lbj && + false_far_local[lbj] == bi) + { + lbi++; + lbj++; + } + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + if(nblocks_far_local > lbi && + F->block_far_local[lbi] == bi) + { + block_far_local[lbi-lbj] = bi-bj; + lbi++; + } + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo_mpi(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_far_local, + block_far_local, new_nblocks_near, block_near, + new_nblocks_near_local, block_near_local, F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + if(mpi_rank == 0) + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near_local > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near_local); + size_t size_D = new_nblocks_near_local * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches_local = (new_nblocks_near_local-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches_local]; + starpu_data_handle_t index_handle[nbatches_local]; + starpu_malloc(&tile_index, 2*new_nblocks_near_local*sizeof(*tile_index)); + int shape[] = {nb, nb}; + // For each local near-field block compute its elements + for(lbi = 0; lbi < new_nblocks_near_local; ++lbi) + { + // Get indexes of corresponding block row and block column + array_from_buffer(near_D+lbi, 2, shape, 'd', 'F', + alloc_D + lbi*nb*nb); + STARSH_int ind = block_near_local[lbi]; + tile_index[lbi*2] = block_near[2*ind]; + tile_index[lbi*2+1] = block_near[2*ind+1]; + } + for(lbi = 0; 
lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + lbi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+lbi, STARPU_MAIN_RAM, + (uintptr_t)(tile_index + 2*lbi*batch_size), + 2*this_batch_size, sizeof(*tile_index)); + } + for(lbi = 0; lbi < nbatches_local; ++lbi) + { + int this_batch_size = new_nblocks_near_local + - lbi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[lbi], + STARPU_R, index_handle[lbi], + 0); + starpu_data_unregister_submit(D_handle[lbi]); + starpu_data_unregister_submit(index_handle[lbi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + starpu_free(tile_index); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + lbj = 0; + for(lbi = 0; lbi < nblocks_far_local; lbi++) + { + if(far_rank[lbi] == -1) + lbj++; + else + { + int shape_U[2] = {far_U[lbi]->shape[0], far_rank[lbi]}; + int shape_V[2] = {far_V[lbi]->shape[0], far_rank[lbi]}; + array_from_buffer(far_U+lbi-lbj, 2, shape_U, 'd', 'F', + far_U[lbi]->data); + array_from_buffer(far_V+lbi-lbj, 2, shape_V, 'd', 'F', + far_V[lbi]->data); + far_rank[lbi-lbj] = far_rank[lbi]; + } + } + if(nblocks_false_far_local > 0 && new_nblocks_far_local > 0) + { + STARSH_REALLOC(far_rank, new_nblocks_far_local); + STARSH_REALLOC(far_U, new_nblocks_far_local); + STARSH_REALLOC(far_V, new_nblocks_far_local); + } + // If all far-field blocks are false, then dealloc buffers + 
if(new_nblocks_far_local == 0 && nblocks_far_local > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + if(nblocks_false_far_local > 0) + free(false_far_local); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //if(mpi_rank == 0) + // printf("FINISH NEAR-FIELD TILES: %f seconds\n", MPI_Wtime()-time0); + //time0 = MPI_Wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //if(mpi_rank == 0) + // printf("MPIKBLAS3: finalize in %f seconds\n", MPI_Wtime()-time0); + return starsh_blrm_new_mpi(matrix, F, far_rank, far_U, far_V, onfly, + near_D, alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/openmp/CMakeLists.txt b/src/backends/openmp/CMakeLists.txt index d7146cb6..85543d42 100644 --- a/src/backends/openmp/CMakeLists.txt +++ b/src/backends/openmp/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/openmp/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/openmp/blrm/CMakeLists.txt b/src/backends/openmp/blrm/CMakeLists.txt index 08ee8d6c..1795d6dd 100644 --- a/src/backends/openmp/blrm/CMakeLists.txt +++ b/src/backends/openmp/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/openmp/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/openmp/blrm/dfe.c b/src/backends/openmp/blrm/dfe.c index 3fb217cd..aae15630 100644 --- a/src/backends/openmp/blrm/dfe.c +++ b/src/backends/openmp/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file 
src/backends/openmp/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/dmml.c b/src/backends/openmp/blrm/dmml.c index 5764d500..2af2e787 100644 --- a/src/backends/openmp/blrm/dmml.c +++ b/src/backends/openmp/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dmml.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/dqp3.c b/src/backends/openmp/blrm/dqp3.c index c0b25d91..6fa05cbf 100644 --- a/src/backends/openmp/blrm/dqp3.c +++ b/src/backends/openmp/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/openmp/blrm/drsdd.c b/src/backends/openmp/blrm/drsdd.c index cdc47be3..f60a9766 100644 --- a/src/backends/openmp/blrm/drsdd.c +++ b/src/backends/openmp/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -229,7 +229,7 @@ int starsh_blrm__drsdd_omp(STARSH_blrm **matrix, STARSH_blrf *format, } STARSH_MALLOC(alloc_D, size_D); // For each near-field block compute its elements - #pragma omp parallel for schedule(dynamic,1) + //#pragma omp parallel for schedule(dynamic,1) for(bi = 0; bi < new_nblocks_near; bi++) { // Get indexes of corresponding block row and block column diff --git a/src/backends/openmp/blrm/dsdd.c b/src/backends/openmp/blrm/dsdd.c index 8d52d3f9..56c2133b 100644 --- a/src/backends/openmp/blrm/dsdd.c +++ b/src/backends/openmp/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/openmp/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr 
Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/CMakeLists.txt b/src/backends/sequential/CMakeLists.txt index 518a790e..ab0d949d 100644 --- a/src/backends/sequential/CMakeLists.txt +++ b/src/backends/sequential/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/blrm/CMakeLists.txt b/src/backends/sequential/blrm/CMakeLists.txt index 2d7977e1..08146f99 100644 --- a/src/backends/sequential/blrm/CMakeLists.txt +++ b/src/backends/sequential/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/blrm/dca.c b/src/backends/sequential/blrm/dca.c index 3b471b52..b1517859 100644 --- a/src/backends/sequential/blrm/dca.c +++ b/src/backends/sequential/blrm/dca.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dca.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dfe.c b/src/backends/sequential/blrm/dfe.c index c9ab9ba6..56532bce 100644 --- a/src/backends/sequential/blrm/dfe.c +++ b/src/backends/sequential/blrm/dfe.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dfe.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dmml.c b/src/backends/sequential/blrm/dmml.c index ab9c3956..289894e4 100644 --- a/src/backends/sequential/blrm/dmml.c +++ b/src/backends/sequential/blrm/dmml.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dmml.c - * 
@version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dqp3.c b/src/backends/sequential/blrm/dqp3.c index 85da47b5..6ecaa29b 100644 --- a/src/backends/sequential/blrm/dqp3.c +++ b/src/backends/sequential/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/drsdd.c b/src/backends/sequential/blrm/drsdd.c index 33d89fc1..d02cd29e 100644 --- a/src/backends/sequential/blrm/drsdd.c +++ b/src/backends/sequential/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/blrm/dsdd.c b/src/backends/sequential/blrm/dsdd.c index 1b90d5f0..38827e14 100644 --- a/src/backends/sequential/blrm/dsdd.c +++ b/src/backends/sequential/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/CMakeLists.txt b/src/backends/sequential/dense/CMakeLists.txt index 3d775e68..ec47478e 100644 --- a/src/backends/sequential/dense/CMakeLists.txt +++ b/src/backends/sequential/dense/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/sequential/dense/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/sequential/dense/dna.c b/src/backends/sequential/dense/dna.c index e9023c9d..f50a1fce 100644 --- a/src/backends/sequential/dense/dna.c +++ b/src/backends/sequential/dense/dna.c @@ -5,7 +5,7 @@ * University of Science and Technology 
(KAUST) * * @file src/backends/sequential/dense/dna.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/dqp3.c b/src/backends/sequential/dense/dqp3.c index 43729a4c..32ba5056 100644 --- a/src/backends/sequential/dense/dqp3.c +++ b/src/backends/sequential/dense/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/drsdd.c b/src/backends/sequential/dense/drsdd.c index 4dfd82ed..6b711be0 100644 --- a/src/backends/sequential/dense/drsdd.c +++ b/src/backends/sequential/dense/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -56,7 +56,7 @@ void starsh_dense_dlrrsdd(int nrows, int ncols, double *D, int ldD, double *U, int svdqr_lwork = lwork-(size_t)mn2*(2*ncols+nrows+mn2+1); int iseed[4] = {0, 0, 0, 1}; // Generate random matrix X - LAPACKE_dlarnv_work(3, iseed, nrows*mn2, X); + LAPACKE_dlarnv_work(3, iseed, ncols*mn2, X); // Multiply by random matrix cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, nrows, mn2, ncols, 1.0, D, ldD, X, ncols, 0.0, Q, nrows); diff --git a/src/backends/sequential/dense/dsdd.c b/src/backends/sequential/dense/dsdd.c index 48e1e25c..72df2952 100644 --- a/src/backends/sequential/dense/dsdd.c +++ b/src/backends/sequential/dense/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/dsvfr.c b/src/backends/sequential/dense/dsvfr.c index a700bd39..ae6f376a 100644 --- a/src/backends/sequential/dense/dsvfr.c +++ 
b/src/backends/sequential/dense/dsvfr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/dsvfr.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/sequential/dense/zrsdd.c b/src/backends/sequential/dense/zrsdd.c index 1f19275a..a11b8bdd 100644 --- a/src/backends/sequential/dense/zrsdd.c +++ b/src/backends/sequential/dense/zrsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/sequential/dense/zrsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Rabab Alomairy * @author Kadir Akbudak * @author Aleksandr Mikhalev diff --git a/src/backends/starpu/CMakeLists.txt b/src/backends/starpu/CMakeLists.txt index 7cb9a3f8..1c071bcb 100644 --- a/src/backends/starpu/CMakeLists.txt +++ b/src/backends/starpu/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/blrm/CMakeLists.txt b/src/backends/starpu/blrm/CMakeLists.txt index 8b1a57d1..1b2b738e 100644 --- a/src/backends/starpu/blrm/CMakeLists.txt +++ b/src/backends/starpu/blrm/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/blrm/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/blrm/dmml.c b/src/backends/starpu/blrm/dmml.c index cccd7ed0..180c1947 100644 --- a/src/backends/starpu/blrm/dmml.c +++ b/src/backends/starpu/blrm/dmml.c @@ -9,7 +9,7 @@ * @cond * This command in pair with endcond will prevent file from being documented. 
* - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/blrm/dqp3.c b/src/backends/starpu/blrm/dqp3.c index e9f03a8d..37940b16 100644 --- a/src/backends/starpu/blrm/dqp3.c +++ b/src/backends/starpu/blrm/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/blrm/drsdd.c b/src/backends/starpu/blrm/drsdd.c index 9009215c..1d8bb8bc 100644 --- a/src/backends/starpu/blrm/drsdd.c +++ b/src/backends/starpu/blrm/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -27,6 +27,7 @@ int starsh_blrm__drsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, * @ingroup blrm * */ { + printf("IN STARPU (NO KBLAS)\n"); STARSH_blrf *F = format; STARSH_problem *P = F->problem; STARSH_kernel *kernel = P->kernel; @@ -278,8 +279,12 @@ int starsh_blrm__drsdd_starpu(STARSH_blrm **matrix, STARSH_blrf *format, bj++; else { - far_U[bi-bj] = far_U[bi]; - far_V[bi-bj] = far_V[bi]; + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); far_rank[bi-bj] = far_rank[bi]; } } diff --git a/src/backends/starpu/blrm/dsdd.c b/src/backends/starpu/blrm/dsdd.c index dd57d10f..587e08cf 100644 --- a/src/backends/starpu/blrm/dsdd.c +++ b/src/backends/starpu/blrm/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/blrm/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git 
a/src/backends/starpu/dense/CMakeLists.txt b/src/backends/starpu/dense/CMakeLists.txt index a53c48f9..a86f5f54 100644 --- a/src/backends/starpu/dense/CMakeLists.txt +++ b/src/backends/starpu/dense/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/backends/starpu/dense/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/backends/starpu/dense/dgemm.c b/src/backends/starpu/dense/dgemm.c index 88057522..3fc81621 100644 --- a/src/backends/starpu/dense/dgemm.c +++ b/src/backends/starpu/dense/dgemm.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dgemm.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/dqp3.c b/src/backends/starpu/dense/dqp3.c index 358444a5..a83fd86a 100644 --- a/src/backends/starpu/dense/dqp3.c +++ b/src/backends/starpu/dense/dqp3.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dqp3.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/drsdd.c b/src/backends/starpu/dense/drsdd.c index 7a3f665a..13683c93 100644 --- a/src/backends/starpu/dense/drsdd.c +++ b/src/backends/starpu/dense/drsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/drsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/dsdd.c b/src/backends/starpu/dense/dsdd.c index bc6857fd..9314d6fd 100644 --- a/src/backends/starpu/dense/dsdd.c +++ b/src/backends/starpu/dense/dsdd.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/dsdd.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git 
a/src/backends/starpu/dense/fake_init.c b/src/backends/starpu/dense/fake_init.c index 256011d1..44a23a5a 100644 --- a/src/backends/starpu/dense/fake_init.c +++ b/src/backends/starpu/dense/fake_init.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/fake_init.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu/dense/kernel.c b/src/backends/starpu/dense/kernel.c index b36aeb2a..c0b9134a 100644 --- a/src/backends/starpu/dense/kernel.c +++ b/src/backends/starpu/dense/kernel.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/backends/starpu/dense/kernel.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/backends/starpu_cuda/CMakeLists.txt b/src/backends/starpu_cuda/CMakeLists.txt new file mode 100644 index 00000000..de2a620d --- /dev/null +++ b/src/backends/starpu_cuda/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND CUDA) + add_library(backends_starpu_cuda OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/blrm/CMakeLists.txt b/src/backends/starpu_cuda/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_cuda/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/blrm/drsdd.c b/src/backends/starpu_cuda/blrm/drsdd.c new file mode 100644 index 00000000..ba5ccaf5 --- /dev/null +++ b/src/backends/starpu_cuda/blrm/drsdd.c @@ -0,0 +1,425 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" +#include +#include +#include +#include +#include + +static void init_starpu_cuda(void *args) +{ + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + int nb, nsamples; + starpu_codelet_unpack_args(args, &cublas_handles, &cusolver_handles, + &curand_handles, &devinfo, &nb, &nsamples); + int id = starpu_worker_get_id(); + cublasStatus_t status; + //printf("CUBLAS init worker %d at %p\n", id, &cublas_handles[id]); + cublasCreate(&cublas_handles[id]); + cusolverDnCreate(&cusolver_handles[id]); + curandCreateGenerator(&curand_handles[id], CURAND_RNG_PSEUDO_MT19937); + curandSetPseudoRandomGeneratorSeed(curand_handles[id], 0ULL); + cudaMalloc((void **)&devinfo[id], sizeof(int)); +} + +static void deinit_starpu_cuda(void *args) +{ + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + starpu_codelet_unpack_args(args, &cublas_handles, &cusolver_handles, + &curand_handles, &devinfo, 0); + int id = starpu_worker_get_id(); + //printf("CUBLAS deinit worker %d at %p\n", id, &cublas_handles[id]); + cublasDestroy(cublas_handles[id]); + cusolverDnDestroy(cusolver_handles[id]); + curandDestroyGenerator(curand_handles[id]); + cudaFree(devinfo[id]); +} + +int starsh_blrm__drsdd_starpu_cuda(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. 
+ * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + size_t offset_U = 0, offset_V = 0, offset_D = 0; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and CuSolver handles and temp buffers for all workers (but + // they are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + cusolverDnHandle_t cusolver_handles[workers]; + curandGenerator_t curand_handles[workers]; + int *devinfo[workers]; + double singular_values[workers*(maxrank+oversample)]; + cublasHandle_t *cuhandles = cublas_handles; + cusolverDnHandle_t *cuhandles2 = cusolver_handles; + curandGenerator_t *cuhandles3 = curand_handles; + int **devinfo_ptr = devinfo; + double *svhandles = singular_values; + //printf("MAIN: %p, %p, %p\n", cuhandles, cuhandles2, svhandles); + void *args_buffer; + size_t args_buffer_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + starpu_codelet_pack_args(&args_buffer, &args_buffer_size, + 
STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &cuhandles2, sizeof(cuhandles2), + STARPU_VALUE, &cuhandles3, sizeof(cuhandles3), + STARPU_VALUE, &devinfo_ptr, sizeof(devinfo_ptr), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + 0); + starpu_execute_on_each_worker(init_starpu_cuda, args_buffer, STARPU_CUDA); + // Init codelet structs and handles + struct starpu_codelet codelet = + { + //.cpu_funcs = {starsh_dense_dlrrsdd_starpu_cuda_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_cuda_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_W, STARPU_W, STARPU_W, STARPU_SCRATCH, + STARPU_SCRATCH} + }; + struct starpu_codelet codelet2 = + { + .cpu_funcs = {starsh_dense_kernel_starpu_cuda_cpu}, + .nbuffers = 1, + .modes = {STARPU_W} + }; + starpu_data_handle_t rank_handle[nblocks_far]; + starpu_data_handle_t D_handle[nblocks_far]; + starpu_data_handle_t U_handle[nblocks_far]; + starpu_data_handle_t V_handle[nblocks_far]; + starpu_data_handle_t work_handle[nblocks_far]; + starpu_data_handle_t iwork_handle[nblocks_far]; + // Init buffers to store low-rank factors of far-field blocks if needed + if(nblocks_far > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = 0, size_V = 0; + // Simple cycle over all far-field blocks + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Get corresponding sizes and minimum of them + size_U += RC->size[i]; + size_V += CC->size[j]; + //far_rank[bi] = -2; + } + size_U *= maxrank; + size_V *= maxrank; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Get 
corresponding sizes and minimum of them + int nrows = RC->size[i], ncols = CC->size[j]; + int mn = nrows < ncols ? nrows : ncols; + int mn2 = maxrank+oversample; + if(mn2 > mn) + mn2 = mn; + // Get size of temporary arrays + int lwork = ncols, lwork_sdd = (4*mn2+7)*mn2; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + cusolverDnDgesvd_bufferSize(cusolver_handles[0], ncols, mn2, + &lwork_sdd); + //printf("CUSOLVER SVD LWORK=%d\n", lwork_sdd); + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn2*(2*ncols+nrows+2*mn2+1); + int liwork = 8*mn2; + int shape_U[] = {nrows, maxrank}; + int shape_V[] = {ncols, maxrank}; + double *U = alloc_U+offset_U, *V = alloc_V+offset_V; + offset_U += nrows*maxrank; + offset_V += ncols*maxrank; + array_from_buffer(far_U+bi, 2, shape_U, 'd', 'F', U); + array_from_buffer(far_V+bi, 2, shape_V, 'd', 'F', V); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank+bi), 1, sizeof(*far_rank)); + starpu_matrix_data_register(D_handle+bi, -1, 0, nrows, nrows, + ncols, sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_U[bi]->data), nrows*maxrank, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_V[bi]->data), ncols*maxrank, sizeof(*V)); + starpu_vector_data_register(work_handle+bi, -1, 0, lwork, + sizeof(*U)); + starpu_vector_data_register(iwork_handle+bi, -1, 0, liwork, + sizeof(int)); + } + offset_U = 0; + offset_V = 0; + } + // Work variables + int info; + // Simple cycle over all far-field admissible blocks + for(bi = 0; bi < nblocks_far; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_far[2*bi]; + STARSH_int j = block_far[2*bi+1]; + // Generate matrix + starpu_task_insert(&codelet2, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &i, sizeof(i), + STARPU_VALUE, &j, sizeof(j), + STARPU_W, D_handle[bi], + 0); + // Approximate + starpu_task_insert(&codelet, + STARPU_VALUE, &maxrank, 
sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &cuhandles2, sizeof(cuhandles2), + STARPU_VALUE, &cuhandles3, sizeof(cuhandles3), + STARPU_VALUE, &devinfo_ptr, sizeof(devinfo_ptr), + STARPU_VALUE, &svhandles, sizeof(svhandles), + STARPU_R, D_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + STARPU_SCRATCH, work_handle[bi], + STARPU_SCRATCH, iwork_handle[bi], + 0); + } + starpu_task_wait_for_all(); + for(bi = 0; bi < nblocks_far; bi++) + { + starpu_data_unregister(rank_handle[bi]); + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(U_handle[bi]); + starpu_data_unregister(V_handle[bi]); + starpu_data_unregister(work_handle[bi]); + starpu_data_unregister(iwork_handle[bi]); + } + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = 
F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + starpu_data_handle_t D_handle[new_nblocks_near]; + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = 0; + // Simple cycle over all near-field blocks + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get corresponding sizes and minimum of them + size_t nrows = RC->size[i]; + size_t ncols = CC->size[j]; + // Update size_D + size_D += nrows*ncols; + } + STARSH_MALLOC(alloc_D, size_D); + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get corresponding sizes and minimum of them + int nrows = RC->size[i]; + int ncols = CC->size[j]; + int shape[2] = {nrows, ncols}; + double *D = alloc_D+offset_D; + 
array_from_buffer(near_D+bi, 2, shape, 'd', 'F', D); + offset_D += near_D[bi]->size; + starpu_matrix_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(near_D[bi]->data), nrows, nrows, ncols, + sizeof(*D)); + } + for(bi = 0; bi < new_nblocks_near; bi++) + { + // Get indexes of corresponding block row and block column + STARSH_int i = block_near[2*bi]; + STARSH_int j = block_near[2*bi+1]; + // Get matrix + starpu_task_insert(&codelet2, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &i, sizeof(i), + STARPU_VALUE, &j, sizeof(j), + STARPU_W, D_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < new_nblocks_near; bi++) + { + starpu_data_unregister(D_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + far_U[bi-bj] = far_U[bi]; + far_V[bi-bj] = far_V[bi]; + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_cuda, args_buffer, + STARPU_CUDA); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + 
alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_cuda/dense/CMakeLists.txt b/src/backends/starpu_cuda/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_cuda/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_cuda/dense/drsdd.c b/src/backends/starpu_cuda/dense/drsdd.c new file mode 100644 index 00000000..478f4a96 --- /dev/null +++ b/src/backends/starpu_cuda/dense/drsdd.c @@ -0,0 +1,153 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" +#include +#include +#include +#include + +void starsh_dense_dlrrsdd_starpu_cuda_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + double *singular_values; + starpu_codelet_unpack_args(cl_arg, &maxrank, &oversample, &tol, + &cublas_handles, &cusolver_handles, &curand_handles, &devinfo, + &singular_values); + //printf("CODELET: %p, %p, %p\n", cublas_handles, cusolver_handles, + // singular_values); + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + int nrows = STARPU_MATRIX_GET_NX(buffer[0]); + int ncols = STARPU_MATRIX_GET_NY(buffer[0]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *work = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int lwork = STARPU_VECTOR_GET_NX(buffer[4]); + int *iwork = (int *)STARPU_VECTOR_GET_PTR(buffer[5]); + starsh_dense_dlrrsdd(nrows, ncols, D, nrows, U, nrows, V, ncols, rank, + maxrank, oversample, tol, work, lwork, iwork); +} + +void starsh_dense_dlrrsdd_starpu_cuda_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + cusolverDnHandle_t *cusolver_handles; + curandGenerator_t *curand_handles; + int **devinfo; + double *singular_values; + starpu_codelet_unpack_args(cl_arg, &maxrank, &oversample, &tol, + &cublas_handles, &cusolver_handles, &curand_handles, &devinfo, + &singular_values); + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + int nrows = STARPU_MATRIX_GET_NX(buffer[0]); + int ncols = STARPU_MATRIX_GET_NY(buffer[0]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *work = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int lwork = STARPU_VECTOR_GET_NX(buffer[4]); + int mn = nrows < ncols ? nrows : ncols; + int mn2 = maxrank+oversample; + if(mn2 > mn) + mn2 = mn; + int id = starpu_worker_get_id(); + cusolverDnHandle_t cusolverhandle = cusolver_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + curandGenerator_t curandhandle = curand_handles[id]; + int *mydevinfo = devinfo[id]; + double *host_S = singular_values+id*(maxrank+oversample); + double *device_X = work; // ncols-by-mn2-by random matrix + double *device_Q = device_X+ncols*mn2; // nrows-by-mn2 matrix + double *device_tau = device_Q+nrows*mn2; // mn2 elements + double *device_S = device_tau; + double *device_U = device_tau+mn2; // ncols-by-mn2 matrix + double *device_V = device_U+ncols*mn2; // mn2-by-mn2 matrix + double *device_rwork = device_V+mn2*mn2; + double *device_work = device_rwork+mn2; + lwork -= (2*ncols+nrows+2*mn2+1)*mn2; + //printf("lwork=%d\n", lwork); + double one = 1.0; + double zero = 0.0; + cusolverStatus_t status; + cublasStatus_t status2; + curandGenerateNormalDouble(curandhandle, device_X, ncols*mn2, zero, one); + status2 = cublasDgemm(cuhandle, CUBLAS_OP_N, CUBLAS_OP_N, nrows, mn2, ncols, + &one, D, nrows, device_X, ncols, &zero, device_Q, nrows); + if(status2) + 
{ + printf("STATUS GEMM=%d\n", status2); + } + cudaMemcpy(host_S, device_Q, sizeof(*device_Q), cudaMemcpyDeviceToHost); + status = cusolverDnDgeqrf(cusolverhandle, nrows, mn2, device_Q, nrows, + device_tau, device_work, lwork, mydevinfo); + if(status) + { + printf("STATUS GEQRF=%d\n", status); + } + status = cusolverDnDorgqr(cusolverhandle, nrows, mn2, mn2, device_Q, nrows, + device_tau, device_work, lwork, mydevinfo); + if(status) + { + printf("STATUS ORGQR=%d\n", status); + } + cublasDgemm(cuhandle, CUBLAS_OP_T, CUBLAS_OP_N, ncols, mn2, nrows, &one, D, + nrows, device_Q, nrows, &zero, device_X, ncols); + status = cusolverDnDgesvd(cusolverhandle, 'S', 'S', ncols, mn2, device_X, + ncols, device_S, device_U, ncols, device_V, mn2, device_work, + lwork, device_rwork, mydevinfo); + if(status) + { + printf("STATUS GESVD=%d\n", status); + } + cudaMemcpy(host_S, device_S, mn2*sizeof(*host_S), cudaMemcpyDeviceToHost); + //printf("SV:"); + //for(int i = 0; i < 5; i++) + // printf(" %f", host_S[i]); + //printf("\n"); + // Get rank, corresponding to given error tolerance + int local_rank = starsh_dense_dsvfr(mn2, host_S, tol); + if(local_rank < mn/2 && local_rank <= maxrank) + { + // Compute right factor of low-rank approximation, using given left + // singular vectors + cublasDgemm(cuhandle, CUBLAS_OP_N, CUBLAS_OP_T, nrows, local_rank, + mn2, &one, device_Q, nrows, device_V, mn2, &zero, U, nrows); + cublasDcopy(cuhandle, ncols*local_rank, device_U, 1, V, 1); + for(int i = 0; i < local_rank; i++) + { + cublasDscal(cuhandle, ncols, &host_S[i], V+i*ncols, 1); + } + } + else + local_rank = -1; + cudaError_t err; + // Write new rank back into device memory + err = cudaMemcpy(rank, &local_rank, sizeof(local_rank), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) + printf("ERROR IN CUDAMEMCPY\n"); +} + diff --git a/src/backends/starpu_cuda/dense/kernel.c b/src/backends/starpu_cuda/dense/kernel.c new file mode 100644 index 00000000..e0d7411e --- /dev/null +++ 
b/src/backends/starpu_cuda/dense/kernel.c @@ -0,0 +1,34 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-cuda.h" + +void starsh_dense_kernel_starpu_cuda_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + STARSH_blrf *F; + STARSH_int i, j; + starpu_codelet_unpack_args(cl_arg, &F, &i, &j); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + STARSH_int nrows = RC->size[i]; + STARSH_int ncols = CC->size[j]; + double *D = (double *)STARPU_MATRIX_GET_PTR(buffer[0]); + kernel(nrows, ncols, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D, nrows); +} + diff --git a/src/backends/starpu_kblas/CMakeLists.txt b/src/backends/starpu_kblas/CMakeLists.txt new file mode 100644 index 00000000..9adf822c --- /dev/null +++ b/src/backends/starpu_kblas/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/blrm/CMakeLists.txt b/src/backends/starpu_kblas/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/blrm/drsdd.c b/src/backends/starpu_kblas/blrm/drsdd.c new file mode 100644 index 00000000..dc3238c1 --- /dev/null +++ b/src/backends/starpu_kblas/blrm/drsdd.c @@ -0,0 +1,531 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + work[id] = malloc(nsamples*maxbatch*sizeof(double)); + iwork[id] = malloc(maxbatch*sizeof(int)); + cudaStreamSynchronize(stream); +} + +static void init_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + work[id] = malloc(lwork*sizeof(*work[0])); + iwork[id] = malloc(liwork*sizeof(*iwork[0])); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + double **work; + int **iwork; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &work, &iwork, &nb, &nsamples, &maxbatch); + int 
id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + free(work[id]); + free(iwork[id]); +} + +static void deinit_starpu_cpu(void *args) +{ + int nb, nsamples; + int lwork, liwork; + double **work; + int **iwork; + starpu_codelet_unpack_args(args, &nb, &nsamples, &work, &lwork, &iwork, + &liwork); + int id = starpu_worker_get_id(); + free(work[id]); + free(iwork[id]); +} + +static void empty_cpu_func(void *buffer[], void *cl_arg) +{ +} + +int starsh_blrm__drsdd_starpu_kblas(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + double *work[workers]; + int *iwork[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + double **wwork = work; + int **wiwork = iwork; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu, *args_cpu; + size_t args_gpu_size = 0; + size_t args_cpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + int batch_size = 100; + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + // Get size of temporary arrays + int lwork = nb; + int lwork_sdd = (4*mn+7) * mn; + if(lwork_sdd > lwork) + lwork = lwork_sdd; + lwork += mn*(3*nb+mn+1) + nb*nb; + int liwork = 8 * mn; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, 
&batch_size, sizeof(batch_size), + 0); + starpu_codelet_pack_args(&args_cpu, &args_cpu_size, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_VALUE, &liwork, sizeof(liwork), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(init_starpu_cpu, args_cpu, STARPU_CPU); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cpu_funcs = {starsh_dense_kernel_starpu_kblas_cpu}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_cpu = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas_cpu}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + .type = STARPU_SPMD, + .max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank_gpu = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_get_data_back_to_cpu = + { + .cpu_funcs = {empty_cpu_func}, + .nbuffers = 1, + .modes = {STARPU_R}, + }; + // Select if ONLY cpu or gpu + if(getenv("STARSH_KBLAS_CPU")) + codelet_lowrank = codelet_lowrank_cpu; + else if(getenv("STARSH_KBLAS_GPU")) + codelet_lowrank = codelet_lowrank_gpu; + starpu_data_handle_t rank_handle[nbatches]; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t 
Dcopy_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + //printf("BATCHSIZE=%d BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + } + // START MEASURING TIME + double time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + STARSH_int offset = bi * batch_size * nb * maxrank; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, 
sizeof(*V)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + double time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + starpu_data_unregister_submit(index_handle[bi]); + } + //starpu_task_wait_for_all(); + //double time1 = omp_get_wtime(); + //printf("COMPUTE MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + STARSH_int nbatches_once = nbatches; + for(STARSH_int batch_start = 0; batch_start < nbatches; + batch_start += nbatches_once) + { + STARSH_int batch_end = batch_start + nbatches_once; + if(batch_end > nbatches) + batch_end = nbatches; + for(bi = batch_start; bi < batch_end; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + STARSH_int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Run KBLAS_RSVD + //* + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &wwork, sizeof(wwork), + STARPU_VALUE, &lwork, sizeof(lwork), + STARPU_VALUE, &wiwork, sizeof(wiwork), + STARPU_R, D_handle[bi], + STARPU_SCRATCH, Dcopy_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_W, rank_handle[bi], + 0); + starpu_data_unregister_submit(Dcopy_handle[bi]); + 
starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, U_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, V_handle[bi], + 0); + starpu_task_insert(&codelet_get_data_back_to_cpu, + STARPU_R, rank_handle[bi], + 0); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //starpu_task_wait_for_all(); + } + //time1 = omp_get_wtime(); + //printf("COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = time1; + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = 0; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + 
if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 'F', + alloc_D + bi*nb*nb); + } + for(bi = 0; bi < nbatches; ++bi) + { + STARSH_int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + for(bi 
= 0; bi < nbatches; ++bi) + { + STARSH_int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < nbatches; bi++) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(index_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + far_U[bi-bj] = far_U[bi]; + far_V[bi-bj] = far_V[bi]; + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + starpu_execute_on_each_worker(deinit_starpu_cpu, args_cpu, STARPU_CPU); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + alloc_U, alloc_V, alloc_D, '1'); +} + diff --git 
a/src/backends/starpu_kblas/dense/CMakeLists.txt b/src/backends/starpu_kblas/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_kblas/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas/dense/drsdd.c b/src/backends/starpu_kblas/dense/drsdd.c new file mode 100644 index 00000000..ac767f8c --- /dev/null +++ b/src/backends/starpu_kblas/dense/drsdd.c @@ -0,0 +1,119 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include + +void starsh_dense_dlrrsdd_starpu_kblas_cpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + double tol; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states, &work, + &lwork, &iwork); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[4]); + int id = starpu_worker_get_id(); + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int bi = pool_rank; bi < batch_size; bi += pool_size) + { + starsh_dense_dlrrsdd(nb, nb, D + bi*nb*nb, nb, U + bi*maxrank*nb, nb, + V + bi*maxrank*nb, nb, rank+bi, maxrank, oversample, tol, + work[id], lwork, iwork[id]); + } +} + +void starsh_dense_dlrrsdd_starpu_kblas_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states, &work, + &lwork, &iwork); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[4]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int id = starpu_worker_get_id(); + kblasHandle_t khandle = kblas_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + kblasRandState_t state = kblas_states[id]; + cudaStream_t stream = starpu_cuda_get_local_stream(); + // Create copy of D, since kblas_rsvd spoils it + cublasDcopy(cuhandle, batch_size*nb*nb, D, 1, Dcopy, 1); + // Run randomized SVD, get left singular vectors and singular values + //* + kblasDrsvd_batch_strided(khandle, nb, nb, mn, D, nb, nb*nb, U, mn, state, + batch_size); + cudaMemcpyAsync(work[id], U, mn*batch_size*sizeof(double), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + for(int bi = 0; bi < batch_size; ++bi) + { + int local_rank = starsh_dense_dsvfr(mn, work[id] + bi*mn, tol); + if(local_rank >= nb/2 || local_rank > maxrank) + { + iwork[id][bi] = -1; + //printf("RANK=-1\n"); + } + else + { + double one = 1.0; + double zero = 0.0; + cublasDgemm(cuhandle, CUBLAS_OP_T, CUBLAS_OP_N, nb, local_rank, nb, + &one, Dcopy + bi*nb*nb, nb, D + bi*nb*nb, nb, &zero, + V + bi*maxrank*nb, nb); + cublasDcopy(cuhandle, nb*local_rank, D + bi*nb*nb, 1, + U + bi*maxrank*nb, 1); + iwork[id][bi] = local_rank; + //printf("RANK=%d\n", local_rank); + } + } + cudaMemcpyAsync(rank, iwork[id], batch_size*sizeof(int), + 
cudaMemcpyHostToDevice, stream); + //*/ +} + diff --git a/src/backends/starpu_kblas/dense/kernel.c b/src/backends/starpu_kblas/dense/kernel.c new file mode 100644 index 00000000..cc8d53df --- /dev/null +++ b/src/backends/starpu_kblas/dense/kernel.c @@ -0,0 +1,46 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include + +void starsh_dense_kernel_starpu_kblas_cpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + double time0 = omp_get_wtime(); + STARSH_blrf *F; + STARSH_int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int i = ind[ibatch*2]; + int j = ind[ibatch*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + diff --git a/src/backends/starpu_kblas2/CMakeLists.txt b/src/backends/starpu_kblas2/CMakeLists.txt new file mode 100644 index 00000000..38e9e2f1 --- /dev/null +++ b/src/backends/starpu_kblas2/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King 
Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas2 OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/blrm/CMakeLists.txt b/src/backends/starpu_kblas2/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas2/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/blrm/drsdd.c b/src/backends/starpu_kblas2/blrm/drsdd.c new file mode 100644 index 00000000..e1e8c716 --- /dev/null +++ b/src/backends/starpu_kblas2/blrm/drsdd.c @@ -0,0 +1,567 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); +} + +static void starsh_dense_dlrrsdd_starpu_kblas2_copy(void *buffers[], void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_starpu_kblas2(STARSH_blrm **matrix, STARSH_blrf 
*format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + double time0 = omp_get_wtime(); + //printf("KBLAS2\n"); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. + STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with 
equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //printf("KBLAS2: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + int nworkers_cpu = starpu_cpu_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2 finish init\n"); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cpu_funcs = {starsh_dense_kernel_starpu_kblas2_cpu}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches]; + 
starpu_data_handle_t index_handle[nbatches]; + //starpu_data_handle_t Dcopy_handle[nbatches]; + //starpu_data_handle_t tmp_U_handle[nbatches]; + //starpu_data_handle_t tmp_V_handle[nbatches]; + //starpu_data_handle_t tmp_S_handle[nbatches]; + starpu_data_handle_t D_handle[nworkers_cpu]; + starpu_data_handle_t DtoGPU_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + starpu_data_handle_t rank_handle[nbatches]; + //printf("KBLAS2: init in %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + double *tmp_U_alloc = NULL, *tmp_V_alloc = NULL, *tmp_S_alloc = NULL; + //printf("BATCHSIZE=%d BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far * nb * nb; + //size_t size_S = nblocks_far * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + far_rank[bi] = -1; + } + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + //size_t tmp_U_alloc_size = (size_t)nworkers_gpu * batch_size * nb * + // maxrank * sizeof(double); + //size_t tmp_S_alloc_size = (size_t)nworkers_gpu * batch_size * mn 
* + // sizeof(double); + //starpu_malloc(&tmp_U_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_V_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_S_alloc, tmp_S_alloc_size); + starpu_memory_pin(block_far, 2*nblocks_far*sizeof(*block_far)); + //printf("KBLAS2: pin memory in %e seconds\n", omp_get_wtime()-time0); + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + //STARSH_int offset_D = bi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + //STARSH_int D_size = this_batch_size * nb * nb; + //starpu_vector_data_register(DtoGPU_handle+bi, -1, 0, D_size, + // sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + STARSH_int offset = bi * batch_size * nb * maxrank; + //STARSH_int offset_S = bi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + //double *S = alloc_S + offset_S; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int tmp_U_size = batch_size * nb * maxrank; + //STARSH_int tmp_V_size = tmp_U_size; + //STARSH_int tmp_S_size = batch_size * mn; + //starpu_vector_data_register(tmp_U_handle+bi, -1, 0 , tmp_U_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_V_handle+bi, -1, 0 , tmp_V_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_S_handle+bi, -1, 0 , tmp_S_size, + // sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + 
starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + STARSH_MALLOC(alloc_D, nworkers_cpu * D_size * sizeof(*alloc_D)); + for(bi = 0; bi < nworkers_cpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(alloc_D+bi*D_size), D_size, + sizeof(double)); + } + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(DtoGPU_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi % nworkers_cpu], + STARPU_R, index_handle[bi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Copy to pinned memory + starpu_task_insert(&codelet_copy, + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_R, D_handle[bi % nworkers_cpu], + STARPU_W, DtoGPU_handle[bi % nworkers_gpu], + 0); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + 
STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, DtoGPU_handle[bi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[bi % nworkers_gpu], + STARPU_W, tmp_U_handle[bi % nworkers_gpu], + STARPU_W, tmp_V_handle[bi % nworkers_gpu], + STARPU_W, tmp_S_handle[bi % nworkers_gpu], + STARPU_PRIORITY, -1, + 0); + //starpu_data_unregister_submit(D_handle[bi]); + //starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[bi % nworkers_gpu], + STARPU_R, tmp_V_handle[bi % nworkers_gpu], + STARPU_R, tmp_S_handle[bi % nworkers_gpu], + STARPU_W, rank_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(tmp_U_handle[bi]); + //starpu_data_unregister_submit(tmp_V_handle[bi]); + //starpu_data_unregister_submit(tmp_S_handle[bi]); + starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //double time1 = omp_get_wtime(); + //printf("SUBMIT IN: %f seconds\n", time1-time0); + starpu_task_wait_for_all(); + //time1 = omp_get_wtime(); + //printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = omp_get_wtime(); + if(nbatches > 0) + { + //size_t size_U = nblocks_far * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + for(bi = 0; bi < nworkers_cpu; ++bi) + { + 
starpu_data_unregister(D_handle[bi]); + } + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(DtoGPU_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + //starpu_free(tmp_U_alloc); + //starpu_free(tmp_V_alloc); + //starpu_free(tmp_S_alloc); + starpu_memory_unpin(block_far, 2*nblocks_far*sizeof(*block_far)); + free(alloc_D); + alloc_D = NULL; + } + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi 
= 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 'F', + alloc_D + bi*nb*nb); + } + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(*D)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + 
if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &F, sizeof(F), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + // Unregister data + for(bi = 0; bi < nbatches; bi++) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(index_handle[bi]); + } + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //printf("FINISH NEAR-FIELD TILES: %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2: finalize in 
%f seconds\n", omp_get_wtime()-time0); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_kblas2/dense/CMakeLists.txt b/src/backends/starpu_kblas2/dense/CMakeLists.txt new file mode 100644 index 00000000..0ae6c8ec --- /dev/null +++ b/src/backends/starpu_kblas2/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas2/dense/drsdd.c b/src/backends/starpu_kblas2/dense/drsdd.c new file mode 100644 index 00000000..85b7a7bb --- /dev/null +++ b/src/backends/starpu_kblas2/dense/drsdd.c @@ -0,0 +1,105 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include +#include +#include +#include "batch_rand.h" +#include + +void starsh_dense_dlrrsdd_starpu_kblas2_gpu(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + double **work; + int lwork; + int **iwork; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol, &cublas_handles, &kblas_handles, &kblas_states); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *W = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *S = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int id = starpu_worker_get_id(); + kblasHandle_t khandle = kblas_handles[id]; + cublasHandle_t cuhandle = cublas_handles[id]; + kblasRandState_t state = kblas_states[id]; + cudaStream_t stream = starpu_cuda_get_local_stream(); + // Create copy of first mn columns of D, since kblas_rsvd spoils it + cublasDcopy(cuhandle, batch_size*nb*nb, D, 1, W, 1); + // Run randomized SVD, get left singular vectors and singular values + kblasDrsvd_batch_strided(khandle, nb, nb, mn, W, nb, nb*nb, S, mn, state, + batch_size); + double one = 1.0; + double zero = 0.0; + for(int bi = 0; bi < batch_size; ++bi) + cublasDcopy(cuhandle, nb*maxrank, W+bi*nb*nb, 1, U+bi*maxrank*nb, 1); + kblasDgemm_batch_strided(khandle, KBLAS_Trans, KBLAS_NoTrans, nb, maxrank, + nb, one, D, nb, nb*nb, U, nb, nb*maxrank, zero, V, nb, + maxrank*nb, batch_size); +} + +void starsh_dense_dlrrsdd_starpu_kblas2_getrank(void *buffer[], void *cl_arg) +//! STARPU kernel for 1-way randomized SVD on a tile. 
+{ + int batch_size; + int nb; + int maxrank; + int oversample; + double tol; + starpu_codelet_unpack_args(cl_arg, &batch_size, &nb, &maxrank, &oversample, + &tol); + double *tmp_U = (double *)STARPU_VECTOR_GET_PTR(buffer[0]); + double *tmp_V = (double *)STARPU_VECTOR_GET_PTR(buffer[1]); + double *tmp_S = (double *)STARPU_VECTOR_GET_PTR(buffer[2]); + int *rank = (int *)STARPU_VECTOR_GET_PTR(buffer[3]); + double *U = (double *)STARPU_VECTOR_GET_PTR(buffer[4]); + double *V = (double *)STARPU_VECTOR_GET_PTR(buffer[5]); + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + size_t stride = maxrank * nb; + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int local_rank = starsh_dense_dsvfr(mn, tmp_S+ibatch*mn, tol); + if(local_rank >= nb/2 || local_rank > maxrank) + rank[ibatch] = -1; + else + { + double *local_U = U + ibatch*stride; + double *local_V = V + ibatch*stride; + double *local_tmp_U = tmp_U + ibatch*stride; + double *local_tmp_V = tmp_V + ibatch*stride; + cblas_dcopy(local_rank*nb, local_tmp_U, 1, local_U, 1); + cblas_dcopy(local_rank*nb, local_tmp_V, 1, local_V, 1); + rank[ibatch] = local_rank; + } + } +} + diff --git a/src/backends/starpu_kblas2/dense/kernel.c b/src/backends/starpu_kblas2/dense/kernel.c new file mode 100644 index 00000000..c09a1424 --- /dev/null +++ b/src/backends/starpu_kblas2/dense/kernel.c @@ -0,0 +1,46 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include + +void starsh_dense_kernel_starpu_kblas2_cpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. +{ + //double time0 = omp_get_wtime(); + STARSH_blrf *F; + int batch_size; + starpu_codelet_unpack_args(cl_arg, &F, &batch_size); + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster, *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + // This works only for equal square tiles + STARSH_int N = RC->size[0]; + STARSH_int stride = N*N; + int pool_size = starpu_combined_worker_get_size(); + int pool_rank = starpu_combined_worker_get_rank(); + for(STARSH_int ibatch = pool_rank; ibatch < batch_size; + ibatch += pool_size) + { + int i = ind[ibatch*2]; + int j = ind[ibatch*2+1]; + kernel(N, N, RC->pivot+RC->start[i], CC->pivot+CC->start[j], + RD, CD, D + ibatch*stride, N); + } +} + diff --git a/src/backends/starpu_kblas3_spatial/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/CMakeLists.txt new file mode 100644 index 00000000..36cd3eec --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/CMakeLists.txt @@ -0,0 +1,26 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. 
+# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +set(DOXYGEN_INPUT ${DOXYGEN_INPUT}) + +# Collect sources for documentation and compilation +set(SRC) +add_subdirectory("blrm") +add_subdirectory("dense") + +# If compilation is requried +if(STARPU AND KBLAS) + add_library(backends_starpu_kblas3_spatial OBJECT ${SRC}) +endif() + +# Put doxygen input to parent scope +set(DOXYGEN_INPUT ${DOXYGEN_INPUT} ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt new file mode 100644 index 00000000..dc39d390 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/blrm/CMakeLists.txt @@ -0,0 +1,19 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/blrm/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + "${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dmml.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/blrm/drsdd.c b/src/backends/starpu_kblas3_spatial/blrm/drsdd.c new file mode 100644 index 00000000..e26017b9 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/blrm/drsdd.c @@ -0,0 +1,588 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu_kblas3_spatial/blrm/drsdd.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-spatial.h" +#include +#include +#include +#include "batch_rand.h" +#include +#include + +static void init_starpu_kblas(void *args) +{ + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + STARSH_ssdata **data_gpu; + STARSH_ssdata *data_cpu; + cudaStream_t stream = starpu_cuda_get_local_stream(); + int nb, nsamples, maxbatch; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + cublasStatus_t status; + kblasCreate(&kblas_handles[id]); + kblasSetStream(kblas_handles[id], stream); + kblasDrsvd_batch_wsquery(kblas_handles[id], nb, nb, nsamples, maxbatch); + kblasAllocateWorkspace(kblas_handles[id]); + cublas_handles[id] = kblasGetCublasHandle(kblas_handles[id]); + kblasInitRandState(kblas_handles[id], &kblas_states[id], 16384*2, 0); + starsh_ssdata_togpu(&data_gpu[id], data_cpu); + cudaStreamSynchronize(stream); +} + +static void deinit_starpu_kblas(void *args) +{ + int nb, nsamples, maxbatch; + cublasHandle_t *cublas_handles; + kblasHandle_t *kblas_handles; + kblasRandState_t *kblas_states; + STARSH_ssdata **data_gpu; + STARSH_ssdata *data_cpu; + starpu_codelet_unpack_args(args, &cublas_handles, &kblas_handles, + &kblas_states, &data_gpu, &data_cpu, &nb, &nsamples, &maxbatch); + int id = starpu_worker_get_id(); + kblasDestroyRandState(kblas_states[id]); + kblasDestroy(&kblas_handles[id]); + starsh_ssdata_free_gpu(data_gpu[id]); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); +} + +static void starsh_dense_dlrrsdd_starpu_kblas3_copy(void *buffers[], 
void *cl_arg) +{ + int N, batch_size; + starpu_codelet_unpack_args(cl_arg, &N, &batch_size); + double *Dcopy = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[1]); + cblas_dcopy(N*N*batch_size, Dcopy, 1, D, 1); +} + +int starsh_blrm__drsdd_starpu_kblas3_spatial(STARSH_blrm **matrix, STARSH_blrf *format, + int maxrank, double tol, int onfly) +//! Approximate each tile by randomized SVD. +/*! + * @param[out] matrix: Address of pointer to @ref STARSH_blrm object. + * @param[in] format: Block low-rank format. + * @param[in] maxrank: Maximum possible rank. + * @param[in] tol: Relative error tolerance. + * @param[in] onfly: Whether not to store dense blocks. + * @return Error code @ref STARSH_ERRNO. + * @ingroup blrm + * */ +{ + //double time0 = omp_get_wtime(); + //printf("KBLAS3\n"); + STARSH_blrf *F = format; + STARSH_problem *P = F->problem; + STARSH_kernel *kernel = P->kernel; + STARSH_int nblocks_far = F->nblocks_far; + STARSH_int nblocks_near = F->nblocks_near; + // Shortcuts to information about clusters + STARSH_cluster *RC = F->row_cluster; + STARSH_cluster *CC = F->col_cluster; + void *RD = RC->data, *CD = CC->data; + // Following values default to given block low-rank format F, but they are + // changed when there are false far-field blocks. 
+ STARSH_int new_nblocks_far = nblocks_far; + STARSH_int new_nblocks_near = nblocks_near; + STARSH_int *block_far = F->block_far; + STARSH_int *block_near = F->block_near; + // Places to store low-rank factors, dense blocks and ranks + Array **far_U = NULL, **far_V = NULL, **near_D = NULL; + int *far_rank = NULL; + double *alloc_U = NULL, *alloc_V = NULL, *alloc_D = NULL, *alloc_S = NULL; + STARSH_int bi, bj = 0; + const int oversample = starsh_params.oversample; + // Init CuBLAS and KBLAS handles and temp buffers for all workers (but they + // are used only in GPU codelets) + int workers = starpu_worker_get_count(); + cublasHandle_t cublas_handles[workers]; + kblasHandle_t kblas_handles[workers]; + kblasRandState_t kblas_states[workers]; + STARSH_ssdata *data_gpu_array[workers]; + cublasHandle_t *cuhandles = cublas_handles; + kblasHandle_t *khandles = kblas_handles; + kblasRandState_t *kstates = kblas_states; + STARSH_ssdata **data_gpu = data_gpu_array; + //printf("MAIN: %p, %p, %p\n", cuhandles, khandles, svhandles); + void *args_gpu; + size_t args_gpu_size = 0; + // This works only for TLR with equal tiles + int nb = RC->size[0]; + int nsamples = maxrank+oversample; + // Set size of batch + char *env_var = getenv("STARSH_KBLAS_BATCH"); + int batch_size = 300; + if(env_var) + batch_size = atoi(env_var); + //printf("KBLAS3: batch_size=%d\n", batch_size); + // Ceil number of batches + int nbatches = (nblocks_far-1)/batch_size + 1; + // Get number of temporary buffers for CPU-GPU transfers + int nworkers_gpu = 3 * starpu_cuda_worker_get_count(); + // Get corresponding sizes and minimum of them + int mn = maxrank+oversample; + if(mn > nb) + mn = nb; + starpu_codelet_pack_args(&args_gpu, &args_gpu_size, + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &RD, sizeof(RD), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, 
&nsamples, sizeof(nsamples), + STARPU_VALUE, &batch_size, sizeof(batch_size), + 0); + starpu_execute_on_each_worker(init_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS2 finish init\n"); + // Init codelet structs and handles + struct starpu_codelet codelet_kernel = + { + .cuda_funcs = {starsh_dense_kernel_starpu_kblas3_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 2, + .modes = {STARPU_W, STARPU_R}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_lowrank = + { + .cuda_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_gpu}, + .cuda_flags = {STARPU_CUDA_ASYNC}, + .nbuffers = 5, + .modes = {STARPU_R, STARPU_SCRATCH, STARPU_W, STARPU_W, STARPU_W}, + }; + struct starpu_codelet codelet_getrank = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas2_getrank}, + .nbuffers = 6, + .modes = {STARPU_R, STARPU_R, STARPU_R, STARPU_W, STARPU_W, STARPU_W}, + //.type = STARPU_SPMD, + //.max_parallelism = INT_MAX, + }; + struct starpu_codelet codelet_copy = + { + .cpu_funcs = {starsh_dense_dlrrsdd_starpu_kblas3_copy}, + .nbuffers = 2, + .modes = {STARPU_R, STARPU_W}, + }; + //starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t index_handle[nbatches]; + //starpu_data_handle_t Dcopy_handle[nbatches]; + //starpu_data_handle_t tmp_U_handle[nbatches]; + //starpu_data_handle_t tmp_V_handle[nbatches]; + //starpu_data_handle_t tmp_S_handle[nbatches]; + starpu_data_handle_t D_handle[nworkers_gpu]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t tmp_U_handle[nworkers_gpu]; + starpu_data_handle_t tmp_V_handle[nworkers_gpu]; + starpu_data_handle_t tmp_S_handle[nworkers_gpu]; + starpu_data_handle_t U_handle[nbatches]; + starpu_data_handle_t V_handle[nbatches]; + starpu_data_handle_t rank_handle[nbatches]; + //printf("KBLAS3: init in %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + double *tmp_U_alloc = NULL, *tmp_V_alloc = NULL, *tmp_S_alloc = NULL; + //printf("BATCHSIZE=%d 
BATCHCOUNT=%d\n", batch_size, nbatches); + // Init buffers to store low-rank factors of far-field blocks if needed + if(nbatches > 0) + { + STARSH_MALLOC(far_U, nblocks_far); + STARSH_MALLOC(far_V, nblocks_far); + STARSH_MALLOC(far_rank, nblocks_far); + size_t size_U = nblocks_far * nb * maxrank; + size_t size_V = size_U; + //size_t size_D = nblocks_far * nb * nb; + //size_t size_S = nblocks_far * mn; + STARSH_MALLOC(alloc_U, size_U); + STARSH_MALLOC(alloc_V, size_V); + //starpu_memory_pin(alloc_U, size_U*sizeof(double)); + //starpu_memory_pin(alloc_V, size_V*sizeof(double)); + //starpu_malloc(&alloc_S, size_S*sizeof(double)); + int shape[] = {nb, maxrank}; + for(bi = 0; bi < nblocks_far; ++bi) + { + STARSH_int offset = bi * nb * maxrank; + array_from_buffer(far_U+bi, 2, shape, 'd', 'F', alloc_U+offset); + array_from_buffer(far_V+bi, 2, shape, 'd', 'F', alloc_V+offset); + far_rank[bi] = -1; + } + //starpu_malloc(&alloc_D, size_D*sizeof(double)); + //size_t tmp_U_alloc_size = (size_t)nworkers_gpu * batch_size * nb * + // maxrank * sizeof(double); + //size_t tmp_S_alloc_size = (size_t)nworkers_gpu * batch_size * mn * + // sizeof(double); + //starpu_malloc(&tmp_U_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_V_alloc, tmp_U_alloc_size); + //starpu_malloc(&tmp_S_alloc, tmp_S_alloc_size); + starpu_memory_pin(block_far, 2*nblocks_far*sizeof(*block_far)); + //printf("KBLAS3: pin memory in %e seconds\n", omp_get_wtime()-time0); + // START MEASURING TIME + time0 = omp_get_wtime(); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + //printf("THIS BATCH SIZE=%d\n", this_batch_size); + starpu_vector_data_register(rank_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(far_rank + bi*batch_size), this_batch_size, + sizeof(*far_rank)); + //STARSH_int offset_D = bi * batch_size * nb * nb; + //double *D = alloc_D + offset_D; + //STARSH_int D_size = this_batch_size * nb * nb; + 
//starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + // sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_far + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_far)); + STARSH_int offset = bi * batch_size * nb * maxrank; + //STARSH_int offset_S = bi * batch_size * mn; + double *U = alloc_U + offset; + double *V = alloc_V + offset; + //double *S = alloc_S + offset_S; + STARSH_int U_size = this_batch_size * nb * maxrank; + STARSH_int V_size = U_size; + //STARSH_int tmp_U_size = batch_size * nb * maxrank; + //STARSH_int tmp_V_size = tmp_U_size; + //STARSH_int tmp_S_size = batch_size * mn; + //starpu_vector_data_register(tmp_U_handle+bi, -1, 0 , tmp_U_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_V_handle+bi, -1, 0 , tmp_V_size, + // sizeof(double)); + //starpu_vector_data_register(tmp_S_handle+bi, -1, 0 , tmp_S_size, + // sizeof(double)); + starpu_vector_data_register(U_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(U), U_size, sizeof(*U)); + starpu_vector_data_register(V_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(V), V_size, sizeof(*V)); + } + STARSH_int D_size = batch_size * nb * nb; + STARSH_int tmp_U_size = batch_size * nb * maxrank; + STARSH_int tmp_S_size = batch_size * mn; + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_vector_data_register(D_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + sizeof(double)); + starpu_vector_data_register(tmp_U_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_V_handle+bi, -1, 0, tmp_U_size, + sizeof(double)); + starpu_vector_data_register(tmp_S_handle+bi, -1, 0, tmp_S_size, + sizeof(double)); + } + //printf("REGISTER DATA IN: %f seconds\n", omp_get_wtime()-time0); + } + // Work variables + int info; + // START MEASURING TIME + //time0 = omp_get_wtime(); + for(bi = 0; bi < 
nbatches; ++bi) + { + //printf("RUNNING BATCH=%d\n", bi); + int this_batch_size = nblocks_far - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi % nworkers_gpu], + STARPU_R, index_handle[bi], + STARPU_PRIORITY, -2, + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Run KBLAS_RSVD + starpu_task_insert(&codelet_lowrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_VALUE, &cuhandles, sizeof(cuhandles), + STARPU_VALUE, &khandles, sizeof(khandles), + STARPU_VALUE, &kstates, sizeof(kstates), + STARPU_R, D_handle[bi % nworkers_gpu], + STARPU_SCRATCH, Dcopy_handle[bi % nworkers_gpu], + STARPU_W, tmp_U_handle[bi % nworkers_gpu], + STARPU_W, tmp_V_handle[bi % nworkers_gpu], + STARPU_W, tmp_S_handle[bi % nworkers_gpu], + STARPU_PRIORITY, 0, + 0); + //starpu_data_unregister_submit(D_handle[bi]); + //starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_task_insert(&codelet_getrank, + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &maxrank, sizeof(maxrank), + STARPU_VALUE, &oversample, sizeof(oversample), + STARPU_VALUE, &tol, sizeof(tol), + STARPU_R, tmp_U_handle[bi % nworkers_gpu], + STARPU_R, tmp_V_handle[bi % nworkers_gpu], + STARPU_R, tmp_S_handle[bi % nworkers_gpu], + STARPU_W, rank_handle[bi], + STARPU_W, U_handle[bi], + STARPU_W, V_handle[bi], + STARPU_PRIORITY, -1, + 0); + //starpu_data_unregister_submit(tmp_U_handle[bi]); + //starpu_data_unregister_submit(tmp_V_handle[bi]); + //starpu_data_unregister_submit(tmp_S_handle[bi]); + 
starpu_data_unregister_submit(rank_handle[bi]); + starpu_data_unregister_submit(U_handle[bi]); + starpu_data_unregister_submit(V_handle[bi]); + } + //double time1 = omp_get_wtime(); + //printf("SUBMIT IN: %f seconds\n", time1-time0); + starpu_task_wait_for_all(); + //time1 = omp_get_wtime(); + //printf("COMPUTE+COMPRESS MATRIX IN: %f seconds\n", time1-time0); + //time0 = omp_get_wtime(); + if(nbatches > 0) + { + //size_t size_U = nblocks_far * nb * maxrank; + //size_t size_V = size_U; + //starpu_free(alloc_D); + //starpu_memory_unpin(alloc_U, size_U*sizeof(double)); + //starpu_memory_unpin(alloc_V, size_V*sizeof(double)); + //starpu_free(alloc_S); + for(bi = 0; bi < nworkers_gpu; ++bi) + { + starpu_data_unregister(D_handle[bi]); + starpu_data_unregister(Dcopy_handle[bi]); + starpu_data_unregister(tmp_U_handle[bi]); + starpu_data_unregister(tmp_V_handle[bi]); + starpu_data_unregister(tmp_S_handle[bi]); + } + //starpu_free(tmp_U_alloc); + //starpu_free(tmp_V_alloc); + //starpu_free(tmp_S_alloc); + starpu_memory_unpin(block_far, 2*nblocks_far*sizeof(*block_far)); + } + //printf("FINISH FIRST PASS AND UNREGISTER IN: %f seconds\n", + // omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + // Get number of false far-field blocks + STARSH_int nblocks_false_far = 0; + STARSH_int *false_far = NULL; + for(bi = 0; bi < nblocks_far; bi++) + { + //printf("FAR_RANK[%zu]=%d\n", bi, far_rank[bi]); + //far_rank[bi] = -1; + if(far_rank[bi] == -1) + nblocks_false_far++; + } + if(nblocks_false_far > 0) + { + // IMPORTANT: `false_far` must to be in ascending order for later code + // to work normally + STARSH_MALLOC(false_far, nblocks_false_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + if(far_rank[bi] == -1) + false_far[bj++] = bi; + } + // Update lists of far-field and near-field blocks using previously + // generated list of false far-field blocks + if(nblocks_false_far > 0) + { + // Update list of near-field blocks + new_nblocks_near = nblocks_near+nblocks_false_far; + 
STARSH_MALLOC(block_near, 2*new_nblocks_near); + // At first get all near-field blocks, assumed to be dense + for(bi = 0; bi < 2*nblocks_near; bi++) + block_near[bi] = F->block_near[bi]; + // Add false far-field blocks + for(bi = 0; bi < nblocks_false_far; bi++) + { + STARSH_int bj = false_far[bi]; + block_near[2*(bi+nblocks_near)] = F->block_far[2*bj]; + block_near[2*(bi+nblocks_near)+1] = F->block_far[2*bj+1]; + } + // Update list of far-field blocks + new_nblocks_far = nblocks_far-nblocks_false_far; + if(new_nblocks_far > 0) + { + STARSH_MALLOC(block_far, 2*new_nblocks_far); + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + // `false_far` must be in ascending order for this to work + if(bj < nblocks_false_far && false_far[bj] == bi) + { + bj++; + } + else + { + block_far[2*(bi-bj)] = F->block_far[2*bi]; + block_far[2*(bi-bj)+1] = F->block_far[2*bi+1]; + } + } + } + // Update format by creating new format + STARSH_blrf *F2; + info = starsh_blrf_new_from_coo(&F2, P, F->symm, RC, CC, + new_nblocks_far, block_far, new_nblocks_near, block_near, + F->type); + // Swap internal data of formats and free unnecessary data + STARSH_blrf tmp_blrf = *F; + *F = *F2; + *F2 = tmp_blrf; + STARSH_WARNING("`F` was modified due to false far-field blocks"); + starsh_blrf_free(F2); + } + // Compute near-field blocks if needed + if(onfly == 0 && new_nblocks_near > 0) + { + STARSH_MALLOC(near_D, new_nblocks_near); + size_t size_D = new_nblocks_near * nb * nb; + STARSH_MALLOC(alloc_D, size_D); + nbatches = (new_nblocks_near-1)/batch_size + 1; + starpu_data_handle_t D_handle[nbatches]; + starpu_data_handle_t Dcopy_handle[nworkers_gpu]; + starpu_data_handle_t index_handle[nbatches]; + int shape[] = {nb, nb}; + // For each near-field block compute its elements + for(bi = 0; bi < new_nblocks_near; ++bi) + { + // Get indexes of corresponding block row and block column + //STARSH_int i = block_near[2*bi]; + //STARSH_int j = block_near[2*bi+1]; + array_from_buffer(near_D+bi, 2, shape, 'd', 
'F', + alloc_D + bi*nb*nb); + } + //starpu_memory_pin(block_near, 2*new_nblocks_near*sizeof(*block_near)); + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + STARSH_int D_size = this_batch_size * nb * nb; + double *D = alloc_D + bi*batch_size*nb*nb; + starpu_vector_data_register(D_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(D), D_size, sizeof(double)); + //starpu_vector_data_register(Dcopy_handle+bi, -1, 0, D_size, + // sizeof(double)); + starpu_vector_data_register(index_handle+bi, STARPU_MAIN_RAM, + (uintptr_t)(block_near + 2*bi*batch_size), + 2*this_batch_size, sizeof(*block_near)); + } + //double *Dcopy_alloc; + //size_t Dcopy_size = batch_size * nb * nb; + // overwrite old value of number of temporary work space for transfers + // between GPU and CPU + //int nworkers_gpu = starpu_cuda_worker_get_count(); + //starpu_malloc(&Dcopy_alloc, sizeof(*Dcopy_alloc) * nworkers_gpu * + // Dcopy_size); + //for(bi = 0; bi < nworkers_gpu; ++bi) + //{ + // starpu_vector_data_register(Dcopy_handle+bi, STARPU_MAIN_RAM, + // (uintptr_t)(Dcopy_alloc+bi*Dcopy_size), Dcopy_size, + // sizeof(*Dcopy_alloc)); + //} + for(bi = 0; bi < nbatches; ++bi) + { + int this_batch_size = new_nblocks_near - bi*batch_size; + if(this_batch_size > batch_size) + this_batch_size = batch_size; + // Generate matrix by CPU + starpu_task_insert(&codelet_kernel, + STARPU_VALUE, &data_gpu, sizeof(data_gpu), + STARPU_VALUE, &nb, sizeof(nb), + STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + STARPU_W, D_handle[bi], + STARPU_R, index_handle[bi], + 0); + starpu_data_unregister_submit(index_handle[bi]); + // Move tile to CPU from GPU + //starpu_task_insert(&codelet_copy, + // STARPU_VALUE, &nb, sizeof(nb), + // STARPU_VALUE, &this_batch_size, sizeof(this_batch_size), + // STARPU_R, Dcopy_handle[bi % nworkers_gpu], + // STARPU_W, D_handle[bi], + // 0); + 
//starpu_data_unregister_submit(Dcopy_handle[bi]); + starpu_data_unregister_submit(D_handle[bi]); + } + // Wait in this scope, because all handles are not visible outside + starpu_task_wait_for_all(); + //for(bi = 0; bi < nworkers_gpu; ++bi) + //{ + // starpu_data_unregister(Dcopy_handle[bi]); + //} + //starpu_free(Dcopy_alloc); + //starpu_memory_unpin(block_near, 2*new_nblocks_near*sizeof(*block_near)); + } + // Change sizes of far_rank, far_U and far_V if there were false + // far-field blocks + if(nblocks_false_far > 0 && new_nblocks_far > 0) + { + bj = 0; + for(bi = 0; bi < nblocks_far; bi++) + { + if(far_rank[bi] == -1) + bj++; + else + { + int shape_U[2] = {far_U[bi]->shape[0], far_rank[bi]}; + int shape_V[2] = {far_V[bi]->shape[0], far_rank[bi]}; + array_from_buffer(far_U+bi-bj, 2, shape_U, 'd', 'F', + far_U[bi]->data); + array_from_buffer(far_V+bi-bj, 2, shape_V, 'd', 'F', + far_V[bi]->data); + far_rank[bi-bj] = far_rank[bi]; + } + } + STARSH_REALLOC(far_rank, new_nblocks_far); + STARSH_REALLOC(far_U, new_nblocks_far); + STARSH_REALLOC(far_V, new_nblocks_far); + //STARSH_REALLOC(alloc_U, offset_U); + //STARSH_REALLOC(alloc_V, offset_V); + } + // If all far-field blocks are false, then dealloc buffers + if(new_nblocks_far == 0 && nblocks_far > 0) + { + block_far = NULL; + free(far_rank); + far_rank = NULL; + free(far_U); + far_U = NULL; + free(far_V); + far_V = NULL; + free(alloc_U); + alloc_U = NULL; + free(alloc_V); + alloc_V = NULL; + } + // Dealloc list of false far-field blocks if it is not empty + if(nblocks_false_far > 0) + free(false_far); + // Finish with creating instance of Block Low-Rank Matrix with given + // buffers + //printf("FINISH NEAR-FIELD TILES: %f seconds\n", omp_get_wtime()-time0); + //time0 = omp_get_wtime(); + starpu_execute_on_each_worker(deinit_starpu_kblas, args_gpu, STARPU_CUDA); + //printf("KBLAS3: finalize in %f seconds\n", omp_get_wtime()-time0); + return starsh_blrm_new(matrix, F, far_rank, far_U, far_V, onfly, near_D, + 
alloc_U, alloc_V, alloc_D, '1'); +} + diff --git a/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt b/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt new file mode 100644 index 00000000..5a42be3e --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/dense/CMakeLists.txt @@ -0,0 +1,21 @@ +# @copyright (c) 2017 King Abdullah University of Science and +# Technology (KAUST). All rights reserved. +# +# STARS-H is a software package, provided by King Abdullah +# University of Science and Technology (KAUST) +# +# @file src/backends/starpu/dense/CMakeLists.txt +# @version 0.1.0 +# @author Aleksandr Mikhalev +# @date 2017-11-07 + + +# set the values of the variable in the parent scope +set(SRC + #"${CMAKE_CURRENT_SOURCE_DIR}/dqp3.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/drsdd.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/dsdd.c" + "${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu" + #"${CMAKE_CURRENT_SOURCE_DIR}/dgemm.c" + #"${CMAKE_CURRENT_SOURCE_DIR}/fake_init.c" + ${SRC} PARENT_SCOPE) diff --git a/src/backends/starpu_kblas3_spatial/dense/kernel.cu b/src/backends/starpu_kblas3_spatial/dense/kernel.cu new file mode 100644 index 00000000..31707b65 --- /dev/null +++ b/src/backends/starpu_kblas3_spatial/dense/kernel.cu @@ -0,0 +1,82 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file src/backends/starpu/dense/kernel.cu + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#include "starpu.h" +extern "C" +{ +#include +#include "common.h" +#include "starsh.h" +#include "starsh-starpu-kblas.h" +#include "starsh-spatial.h" +#include + +static __global__ void local_gpu_kernel_for_spatial(STARSH_ssdata *data, + STARSH_int *block_far, int N, double *D, int ldD, int stride) +//! 
Exponential kernel for 2-dimensional spatial statistics problem on GPU +{ + int tile_i = N * block_far[2*blockIdx.x]; + int tile_j = N * block_far[2*blockIdx.x + 1]; + //printf("blockidx=%d\n", blockIdx.x); + // Read parameters + double beta = -data->beta; + double noise = data->noise; + double sigma = data->sigma; + // Get coordinates + STARSH_int count = data->particles.count; + double *x, *y, *z; + x = data->particles.point; + y = x + count; + //z = y + count; + double *buffer = D + (size_t)stride*blockIdx.x; + // Fill column-major matrix + for(int j = threadIdx.x; j < N; j += blockDim.x) + { + int index_j = tile_j + j; + double x_j = x[index_j]; + double y_j = y[index_j]; + //double z_j = z[index_j]; + for(int i = threadIdx.y; i < N; i += blockDim.y) + { + int index_i = tile_i + i; + double dx = x[index_i] - x_j; + double dy = y[index_i] - y_j; + //double dz = z[index_i] - z_j; + //double dist = norm3d(dx, dy, dz) / beta; + double dist = sqrt(dx*dx + dy*dy) / beta; + if(dist == 0) + buffer[j*(size_t)ldD+i] = sigma + noise; + else + buffer[j*(size_t)ldD+i] = sigma * exp(dist); + //printf("A(%d,%d,%d)=%f\n", index_i, index_j, j*ldD+i, buffer[j*ldD+i]); + } + } +} + +void starsh_dense_kernel_starpu_kblas3_gpu(void *buffers[], void *cl_arg) +//! STARPU kernel for matrix kernel. 
+{ + //double time0 = omp_get_wtime(); + STARSH_ssdata **data_gpu; + int batch_size; + int N; + int id = starpu_worker_get_id(); + starpu_codelet_unpack_args(cl_arg, &data_gpu, &N, &batch_size); + double *D = (double *)STARPU_VECTOR_GET_PTR(buffers[0]); + STARSH_int *ind = (STARSH_int *)STARPU_VECTOR_GET_PTR(buffers[1]); + dim3 threads(16, 16); + cudaStream_t stream = starpu_cuda_get_local_stream(); + local_gpu_kernel_for_spatial<<>>(data_gpu[id], + ind, N, D, N, N*N); +} + +} // extern "C" diff --git a/src/control/CMakeLists.txt b/src/control/CMakeLists.txt index b6ab5bad..c67ce97e 100644 --- a/src/control/CMakeLists.txt +++ b/src/control/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/control/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/control/array.c b/src/control/array.c index 4fd39875..c2c31693 100644 --- a/src/control/array.c +++ b/src/control/array.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/array.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/control/blrf.c b/src/control/blrf.c index 3cff7cfc..9256e3a6 100644 --- a/src/control/blrf.c +++ b/src/control/blrf.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/blrf.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -634,6 +634,16 @@ int starsh_blrf_new_tlr_mpi(STARSH_blrf **format, STARSH_problem *problem, MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); int grid_nx = sqrt(mpi_size), grid_ny = grid_nx, grid_x, grid_y; + if(mpi_size == 6) + { + grid_nx = 2; + grid_ny = 3; + } + else if(mpi_size == 2) + { + grid_nx = 1; + grid_ny = 2; + } if(grid_nx*grid_ny != mpi_size) STARSH_ERROR("MPI SIZE MUST BE SQUARE OF INTEGER!"); grid_ny = mpi_size / grid_nx; diff --git 
a/src/control/cluster.c b/src/control/cluster.c index 33dcd722..de2711a3 100644 --- a/src/control/cluster.c +++ b/src/control/cluster.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/cluster.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/control/init.c b/src/control/init.c index 7e3d08ee..087ba45d 100644 --- a/src/control/init.c +++ b/src/control/init.c @@ -14,6 +14,9 @@ #include "starsh-mpi.h" #include "starsh-starpu.h" #include "starsh-mpi-starpu.h" +#include "starsh-starpu-kblas.h" +#include "starsh-starpu-cuda.h" +#include "starsh-mpi-starpu-kblas.h" #include "common.h" #include "control/init.h" diff --git a/src/control/problem.c b/src/control/problem.c index 9b2d76e7..2ffb8d79 100644 --- a/src/control/problem.c +++ b/src/control/problem.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/control/problem.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/src/itersolvers/CMakeLists.txt b/src/itersolvers/CMakeLists.txt index 7ef9492c..3718d01a 100644 --- a/src/itersolvers/CMakeLists.txt +++ b/src/itersolvers/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file src/itersolvers/CMakeLists.txt -# @version 1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 diff --git a/src/itersolvers/cg.c b/src/itersolvers/cg.c index d0afb6b3..e63747fe 100644 --- a/src/itersolvers/cg.c +++ b/src/itersolvers/cg.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file src/itersolvers/cg.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 70cc1aba..c79e40b4 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -5,7 +5,7 @@ # University of Science and Technology (KAUST) # # @file testing/CMakeLists.txt -# @version 
1.3.0 +# @version 0.3.0 # @author Aleksandr Mikhalev # @date 2017-11-07 @@ -61,6 +61,18 @@ if(MPI AND STARPU) ) endif() +if(CUDA) + list(APPEND tests_files + "starpu_spatial_gpu.c" + ) +endif() + +if(CUDA AND MPI) + list(APPEND tests_files + "mpi_starpu_spatial_gpu.c" + ) +endif() + # Uses RUNPATH instead of RPATH SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -69,7 +81,7 @@ foreach(test_src ${tests_files}) add_executable(test_${test_exe} ${test_src}) target_link_libraries(test_${test_exe} starsh ${CBLAS_LIBRARIES} ${LAPACKE_LIBRARIES} ${OpenMP_C_FLAGS}) - if(test_src MATCHES "starpu_*") + if((test_src MATCHES "starpu_*") OR (test_src MATCHES "mpi_starpu_*")) target_link_libraries(test_${test_exe} ${STARPU_LIBRARIES}) endif() set_target_properties(test_${test_exe} PROPERTIES OUTPUT_NAME ${test_exe}) diff --git a/testing/cauchy.c b/testing/cauchy.c index 3aec596d..3d55ef56 100644 --- a/testing/cauchy.c +++ b/testing/cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/electrodynamics.c b/testing/electrodynamics.c index d18269a1..f1b8f16e 100644 --- a/testing/electrodynamics.c +++ b/testing/electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/electrostatics.c b/testing/electrostatics.c index 9b9367a0..c22f3a77 100644 --- a/testing/electrostatics.c +++ b/testing/electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/minimal.c b/testing/minimal.c index 84558e3c..ea4a16d5 100644 --- a/testing/minimal.c +++ b/testing/minimal.c @@ -5,7 +5,7 @@ * University of 
Science and Technology (KAUST) * * @file testing/minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_cauchy.c b/testing/mpi_cauchy.c index 3092d354..06f8e9de 100644 --- a/testing/mpi_cauchy.c +++ b/testing/mpi_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_electrodynamics.c b/testing/mpi_electrodynamics.c index bbd11e4e..6f5dbd1a 100644 --- a/testing/mpi_electrodynamics.c +++ b/testing/mpi_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_electrostatics.c b/testing/mpi_electrostatics.c index 521656db..8048783b 100644 --- a/testing/mpi_electrostatics.c +++ b/testing/mpi_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_minimal.c b/testing/mpi_minimal.c index 435974dd..65fd1d2f 100644 --- a/testing/mpi_minimal.c +++ b/testing/mpi_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_spatial.c b/testing/mpi_spatial.c index ae934758..fb20b284 100644 --- a/testing/mpi_spatial.c +++ b/testing/mpi_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_cauchy.c b/testing/mpi_starpu_cauchy.c index 7c15e788..74f6002a 100644 --- 
a/testing/mpi_starpu_cauchy.c +++ b/testing/mpi_starpu_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_electrodynamics.c b/testing/mpi_starpu_electrodynamics.c index f53a3eb5..7ea9e6c6 100644 --- a/testing/mpi_starpu_electrodynamics.c +++ b/testing/mpi_starpu_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_electrostatics.c b/testing/mpi_starpu_electrostatics.c index 74f3654f..39d2ed8f 100644 --- a/testing/mpi_starpu_electrostatics.c +++ b/testing/mpi_starpu_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_minimal.c b/testing/mpi_starpu_minimal.c index 7ebcdc39..622a82e4 100644 --- a/testing/mpi_starpu_minimal.c +++ b/testing/mpi_starpu_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/mpi_starpu_spatial.c b/testing/mpi_starpu_spatial.c index 7a209495..1ebc6c43 100644 --- a/testing/mpi_starpu_spatial.c +++ b/testing/mpi_starpu_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/mpi_starpu_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -157,6 +157,7 @@ int main(int argc, char **argv) MPI_Finalize(); return 1; } + //*/ // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs /* Not performed due to no matvec yet with 
STARPU double *x, *y, *y_tlr; diff --git a/testing/mpi_starpu_spatial_gpu.c b/testing/mpi_starpu_spatial_gpu.c new file mode 100644 index 00000000..1c530a72 --- /dev/null +++ b/testing/mpi_starpu_spatial_gpu.c @@ -0,0 +1,217 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. + * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file testing/mpi_starpu_spatial.c + * @version 0.1.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifdef MKL + #include +#else + #include + #include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void preheat_cublas() +{ + cublasHandle_t cuhandle; + cublasCreate(&cuhandle); + cublasDestroy(cuhandle); +} + +int main(int argc, char **argv) +{ + MPI_Init(&argc, &argv); + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + if(argc != 10) + { + if(mpi_rank == 0) + { + printf("%d arguments provided, but 9 are needed\n", + argc-1); + printf("mpi_starpu_spatial ndim placement kernel beta nu N " + "block_size maxrank tol\n"); + } + MPI_Finalize(); + return 1; + } + int problem_ndim = atoi(argv[1]); + int place = atoi(argv[2]); + // Possible values can be found in documentation for enum + // STARSH_PARTICLES_PLACEMENT + int kernel_type = atoi(argv[3]); + double beta = atof(argv[4]); + double nu = atof(argv[5]); + int N = atoi(argv[6]); + int block_size = atoi(argv[7]); + int maxrank = atoi(argv[8]); + double tol = atof(argv[9]); + double noise = 0; + int onfly = 0; + char symm = 'N', dtype = 'd'; + int ndim = 2; + STARSH_int shape[2] = {N, N}; + int info; + srand(0); + // Init STARS-H + info = starsh_init(); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + // Generate data for spatial statistics problem + STARSH_ssdata *data; + STARSH_kernel *kernel; + info = 
starsh_application((void **)&data, &kernel, N, dtype, + STARSH_SPATIAL, kernel_type, STARSH_SPATIAL_NDIM, problem_ndim, + STARSH_SPATIAL_BETA, beta, STARSH_SPATIAL_NU, nu, + STARSH_SPATIAL_NOISE, noise, STARSH_SPATIAL_PLACE, place, 0); + if(info != 0) + { + if(mpi_rank == 0) + printf("Problem was NOT generated (wrong parameters)\n"); + MPI_Finalize(); + return 1; + } + // Init problem with given data and kernel and print short info + STARSH_problem *P; + info = starsh_problem_new(&P, ndim, shape, symm, dtype, data, data, + kernel, "Spatial Statistics example"); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_problem_info(P); + // Init plain clusterization and print info + STARSH_cluster *C; + info = starsh_cluster_new_plain(&C, data, N, block_size); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_cluster_info(C); + // Init tlr division into admissible blocks and print short info + STARSH_blrf *F; + STARSH_blrm *M; + info = starsh_blrf_new_tlr_mpi(&F, P, symm, C, C); + if(info != 0) + { + MPI_Finalize(); + return 1; + } + if(mpi_rank == 0) + starsh_blrf_info(F); + // Init StarPU + (void)starpu_init(NULL); + // Init cublas so that it runs faster next time + starpu_execute_on_each_worker(preheat_cublas, NULL, STARPU_CUDA); + // Approximate each admissible block + MPI_Barrier(MPI_COMM_WORLD); + double time1 = MPI_Wtime(); + info = starsh_blrm__drsdd_mpi_starpu_kblas3_spatial(&M, F, maxrank, tol, onfly); + if(info != 0) + { + if(mpi_rank == 0) + printf("Approximation was NOT computed due to error\n"); + MPI_Finalize(); + return 1; + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + starsh_blrf_info(F); + starsh_blrm_info(M); + } + if(mpi_rank == 0) + printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); + // Measure approximation error + /* + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + double rel_err = 
starsh_blrm__dfe_mpi(M); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", + time1, rel_err); + if(rel_err/tol > 10.) + { + printf("Resulting relative error is too big\n"); + MPI_Finalize(); + return 1; + } + } + if(rel_err/tol > 10.) + { + MPI_Finalize(); + return 1; + } + */ + //*/ + // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs + /* Not performed due to no matvec yet with STARPU + double *x, *y, *y_tlr; + int nrhs = 1; + x = malloc(N*nrhs*sizeof(*x)); + y = malloc(N*nrhs*sizeof(*y)); + y_tlr = malloc(N*nrhs*sizeof(*y_tlr)); + if(mpi_rank == 0) + { + int iseed[4] = {0, 0, 0, 1}; + LAPACKE_dlarnv_work(3, iseed, N*nrhs, x); + cblas_dscal(N*nrhs, 0.0, y, 1); + cblas_dscal(N*nrhs, 0.0, y_tlr, 1); + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml_mpi(M, nrhs, 1.0, x, N, 0.0, y, N); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + } + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml_mpi_tlr(M, nrhs, 1.0, x, N, 0.0, y_tlr, N); + MPI_Barrier(MPI_COMM_WORLD); + time1 = MPI_Wtime()-time1; + if(mpi_rank == 0) + { + cblas_daxpy(N, -1.0, y, 1, y_tlr, 1); + printf("TIME FOR 10 TLR MATVECS: %e secs\n", time1); + printf("MATVEC DIFF: %e\n", cblas_dnrm2(N, y_tlr, 1) + /cblas_dnrm2(N, y, 1)); + } + */ + MPI_Finalize(); + return 0; +} diff --git a/testing/particles.c b/testing/particles.c index 033eee2e..8b097d1a 100644 --- a/testing/particles.c +++ b/testing/particles.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/particles.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 */ diff --git a/testing/randtlr.c b/testing/randtlr.c index b4068be6..9252508b 100644 --- a/testing/randtlr.c +++ 
b/testing/randtlr.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/rndtiled.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/spatial.c b/testing/spatial.c index 9a093be0..ea7f533e 100644 --- a/testing/spatial.c +++ b/testing/spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -108,6 +108,7 @@ int main(int argc, char **argv) return 1; } // Measure time for 10 matvecs + /* double *x, *y; x = malloc(N*nrhs*sizeof(*x)); y = malloc(N*nrhs*sizeof(*y)); @@ -119,5 +120,6 @@ int main(int argc, char **argv) starsh_blrm__dmml(M, nrhs, 1.0, x, N, 0.0, y, N); time1 = omp_get_wtime()-time1; printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + */ return 0; } diff --git a/testing/starpu_cauchy.c b/testing/starpu_cauchy.c index 0f7bcc99..041a3bd1 100644 --- a/testing/starpu_cauchy.c +++ b/testing/starpu_cauchy.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_cauchy.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_electrodynamics.c b/testing/starpu_electrodynamics.c index 35f33331..786b43dd 100644 --- a/testing/starpu_electrodynamics.c +++ b/testing/starpu_electrodynamics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_electrodynamics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_electrostatics.c b/testing/starpu_electrostatics.c index a2a40c58..27266e48 100644 --- a/testing/starpu_electrostatics.c +++ b/testing/starpu_electrostatics.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_electrostatics.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 
2017-11-07 * */ diff --git a/testing/starpu_minimal.c b/testing/starpu_minimal.c index 7bb9ba2c..05dccaed 100644 --- a/testing/starpu_minimal.c +++ b/testing/starpu_minimal.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_minimal.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ diff --git a/testing/starpu_spatial.c b/testing/starpu_spatial.c index 50af0990..28c9299c 100644 --- a/testing/starpu_spatial.c +++ b/testing/starpu_spatial.c @@ -5,7 +5,7 @@ * University of Science and Technology (KAUST) * * @file testing/starpu_spatial.c - * @version 1.3.0 + * @version 0.3.0 * @author Aleksandr Mikhalev * @date 2017-11-07 * */ @@ -100,16 +100,18 @@ int main(int argc, char **argv) starsh_blrf_info(F); starsh_blrm_info(M); printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); // Measure approximation error time1 = omp_get_wtime(); - double rel_err = starsh_blrm__dfe(M); + double rel_err = starsh_blrm__dfe_omp(M); time1 = omp_get_wtime()-time1; printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", time1, rel_err); if(rel_err/tol > 10.) { printf("Resulting relative error is too big\n"); - return 1; + return 0; } // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs /* Not performed due to no matvec yet with STARPU @@ -125,7 +127,5 @@ int main(int argc, char **argv) time1 = omp_get_wtime()-time1; printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); */ - // Deinit StarPU - starpu_shutdown(); return 0; } diff --git a/testing/starpu_spatial_gpu.c b/testing/starpu_spatial_gpu.c new file mode 100644 index 00000000..a10bd821 --- /dev/null +++ b/testing/starpu_spatial_gpu.c @@ -0,0 +1,145 @@ +/*! @copyright (c) 2017 King Abdullah University of Science and + * Technology (KAUST). All rights reserved. 
+ * + * STARS-H is a software package, provided by King Abdullah + * University of Science and Technology (KAUST) + * + * @file testing/starpu_spatial_gpu.c + * @version 0.3.0 + * @author Aleksandr Mikhalev + * @date 2017-11-07 + * */ + +#ifdef MKL + #include +#else + #include + #include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void preheat_cublas() +{ + cublasHandle_t cuhandle; + cublasCreate(&cuhandle); + cublasDestroy(cuhandle); +} + +int main(int argc, char **argv) +{ + if(argc != 10) + { + printf("%d arguments provided, but 9 are needed\n", argc-1); + printf("starpu_spatial ndim placement kernel beta nu N block_size " + "maxrank tol\n"); + return 1; + } + int problem_ndim = atoi(argv[1]); + int place = atoi(argv[2]); + // Possible values can be found in documentation for enum + // STARSH_PARTICLES_PLACEMENT + int kernel_type = atoi(argv[3]); + double beta = atof(argv[4]); + double nu = atof(argv[5]); + int N = atoi(argv[6]); + int block_size = atoi(argv[7]); + int maxrank = atoi(argv[8]); + double tol = atof(argv[9]); + double noise = 0; + int onfly = 0; + char symm = 'N', dtype = 'd'; + int ndim = 2; + STARSH_int shape[2] = {N, N}; + int nrhs = 1; + int info; + srand(0); + // Init STARS-H + info = starsh_init(); + if(info != 0) + return info; + // Generate data for spatial statistics problem + STARSH_ssdata *data; + STARSH_kernel *kernel; + info = starsh_application((void **)&data, &kernel, N, dtype, + STARSH_SPATIAL, kernel_type, STARSH_SPATIAL_NDIM, problem_ndim, + STARSH_SPATIAL_BETA, beta, STARSH_SPATIAL_NU, nu, + STARSH_SPATIAL_NOISE, noise, 0); + if(info != 0) + { + printf("Problem was NOT generated (wrong parameters)\n"); + return info; + } + // Init problem with given data and kernel and print short info + STARSH_problem *P; + info = starsh_problem_new(&P, ndim, shape, symm, dtype, data, data, + kernel, "Spatial Statistics example"); + if(info != 0) + return info; + 
starsh_problem_info(P); + // Init tiled cluster for tiled low-rank approximation and print info + STARSH_cluster *C; + info = starsh_cluster_new_plain(&C, data, N, block_size); + if(info != 0) + return info; + starsh_cluster_info(C); + // Init tiled division into admissible blocks and print short info + STARSH_blrf *F; + STARSH_blrm *M; + info = starsh_blrf_new_tlr(&F, P, symm, C, C); + if(info != 0) + return info; + starsh_blrf_info(F); + // Init StarPU + (void)starpu_init(NULL); + // Init cublas so that it runs faster next time + starpu_execute_on_each_worker(preheat_cublas, NULL, STARPU_CUDA); + // Approximate each admissible block + double time1 = omp_get_wtime(); + info = starsh_blrm__drsdd_starpu_kblas3_spatial(&M, F, maxrank, tol, onfly); + if(info != 0) + return info; + time1 = omp_get_wtime()-time1; + // Print info about updated format and approximation + starsh_blrf_info(F); + starsh_blrm_info(M); + printf("TIME TO APPROXIMATE: %e secs\n", time1); + // Deinit StarPU + starpu_shutdown(); + // Measure approximation error + /* + time1 = omp_get_wtime(); + double rel_err = starsh_blrm__dfe_omp(M); + time1 = omp_get_wtime()-time1; + printf("TIME TO MEASURE ERROR: %e secs\nRELATIVE ERROR: %e\n", + time1, rel_err); + if(rel_err/tol > 10.) + { + printf("Resulting relative error is too big\n"); + return 0; + } + */ + // Measure time for 10 BLRM matvecs and for 10 BLRM TLR matvecs + /* Not performed due to no matvec yet with STARPU + double *x, *y; + x = malloc(N*nrhs*sizeof(*x)); + y = malloc(N*nrhs*sizeof(*y)); + int iseed[4] = {0, 0, 0, 1}; + LAPACKE_dlarnv_work(3, iseed, N*nrhs, x); + cblas_dscal(N*nrhs, 0.0, y, 1); + time1 = omp_get_wtime(); + for(int i = 0; i < 10; i++) + starsh_blrm__dmml(M, nrhs, 1.0, x, N, 0.0, y, N); + time1 = omp_get_wtime()-time1; + printf("TIME FOR 10 BLRM MATVECS: %e secs\n", time1); + */ + return 0; +}