diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu index 4aa41e1c..135d3cde 100644 --- a/cuda_rasterizer/backward.cu +++ b/cuda_rasterizer/backward.cu @@ -406,9 +406,11 @@ renderCUDA( const float2* __restrict__ points_xy_image, const float4* __restrict__ conic_opacity, const float* __restrict__ colors, + const float* __restrict__ depths, const float* __restrict__ final_Ts, const uint32_t* __restrict__ n_contrib, const float* __restrict__ dL_dpixels, + const float* __restrict__ dL_depths, float3* __restrict__ dL_dmean2D, float4* __restrict__ dL_dconic2D, float* __restrict__ dL_dopacity, @@ -435,6 +437,7 @@ renderCUDA( __shared__ float2 collected_xy[BLOCK_SIZE]; __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; __shared__ float collected_colors[C * BLOCK_SIZE]; + __shared__ float collected_depths[BLOCK_SIZE]; // In the forward, we stored the final value for T, the // product of all (1 - alpha) factors. @@ -448,12 +451,17 @@ renderCUDA( float accum_rec[C] = { 0 }; float dL_dpixel[C]; - if (inside) + float dL_depth; + float accum_depth_rec = 0; + if (inside){ for (int i = 0; i < C; i++) dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; + dL_depth = dL_depths[pix_id]; + } float last_alpha = 0; float last_color[C] = { 0 }; + float last_depth = 0; // Gradient of pixel coordinate w.r.t. normalized // screen-space viewport corrdinates (-1 to 1) @@ -475,6 +483,7 @@ renderCUDA( collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; for (int i = 0; i < C; i++) collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; + collected_depths[block.thread_rank()] = depths[coll_id]; } block.sync(); @@ -522,6 +531,10 @@ renderCUDA( // many that were affected by this Gaussian. atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel); } + const float c_d = collected_depths[j]; + accum_depth_rec = last_alpha * last_depth + (1.f - last_alpha) * accum_depth_rec; + last_depth = c_d; + dL_dalpha += (c_d - accum_depth_rec) * dL_depth; dL_dalpha *= T; // Update last alpha (to be used in the next iteration) last_alpha = alpha; @@ -630,9 +643,11 @@ void BACKWARD::render( const float2* means2D, const float4* conic_opacity, const float* colors, + const float* depths, const float* final_Ts, const uint32_t* n_contrib, const float* dL_dpixels, + const float* dL_depths, float3* dL_dmean2D, float4* dL_dconic2D, float* dL_dopacity, @@ -646,12 +661,14 @@ void BACKWARD::render( means2D, conic_opacity, colors, + depths, final_Ts, n_contrib, dL_dpixels, + dL_depths, dL_dmean2D, dL_dconic2D, dL_dopacity, dL_dcolors ); -} \ No newline at end of file +} diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h index 93dd2e4b..6853c1ea 100644 --- a/cuda_rasterizer/backward.h +++ b/cuda_rasterizer/backward.h @@ -29,9 +29,11 @@ namespace BACKWARD const float2* means2D, const float4* conic_opacity, const float* colors, + const float* depths, const float* final_Ts, const uint32_t* n_contrib, const float* dL_dpixels, + const float* dL_depths, float3* dL_dmean2D, float4* dL_dconic2D, float* dL_dopacity, @@ -62,4 +64,4 @@ namespace BACKWARD glm::vec4* dL_drot); } -#endif \ No newline at end of file +#endif diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index c419a328..7ec44380 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -266,11 +266,13 @@ renderCUDA( int W, int H, const float2* __restrict__ points_xy_image, const float* __restrict__ features, + const float* __restrict__ depths, const float4* __restrict__ conic_opacity, float* __restrict__ final_T, uint32_t* __restrict__ n_contrib, const float* __restrict__ bg_color, - float* __restrict__ out_color) + float* __restrict__ out_color, + float* __restrict__ out_depth) { // Identify current tile and associated min/max pixel range. auto block = cg::this_thread_block(); @@ -301,6 +303,7 @@ renderCUDA( uint32_t contributor = 0; uint32_t last_contributor = 0; float C[CHANNELS] = { 0 }; + float D = { 0 }; // Iterate over batches until all done or range is complete for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) @@ -353,6 +356,7 @@ renderCUDA( // Eq. (3) from 3D Gaussian splatting paper. for (int ch = 0; ch < CHANNELS; ch++) C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; + D += depths[collected_id[j]] * alpha * T; T = test_T; @@ -370,6 +374,7 @@ renderCUDA( n_contrib[pix_id] = last_contributor; for (int ch = 0; ch < CHANNELS; ch++) out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; + out_depth[pix_id] = D; } } @@ -380,11 +385,13 @@ void FORWARD::render( int W, int H, const float2* means2D, const float* colors, + const float* depths, const float4* conic_opacity, float* final_T, uint32_t* n_contrib, const float* bg_color, - float* out_color) + float* out_color, + float* out_depth) { renderCUDA << > > ( ranges, @@ -392,11 +399,13 @@ void FORWARD::render( W, H, means2D, colors, + depths, conic_opacity, final_T, n_contrib, bg_color, - out_color); + out_color, + out_depth); } void FORWARD::preprocess(int P, int D, int M, @@ -452,4 +461,4 @@ void FORWARD::preprocess(int P, int D, int M, tiles_touched, prefiltered ); -} \ No newline at end of file +} diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 3c11cb91..fc19f20d 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -55,12 +55,14 @@ namespace FORWARD int W, int H, const float2* points_xy_image, const float* features, + const float* depths, const float4* conic_opacity, float* final_T, uint32_t* n_contrib, const float* bg_color, - float* out_color); + float* out_color, + float* out_depth); } -#endif \ No newline at end of file +#endif diff --git a/cuda_rasterizer/rasterizer.h b/cuda_rasterizer/rasterizer.h index 81544ef6..3d2fd598 100644 --- a/cuda_rasterizer/rasterizer.h +++ b/cuda_rasterizer/rasterizer.h @@ -49,6 +49,7 @@ namespace CudaRasterizer const float tan_fovx, float tan_fovy, const bool prefiltered, float* out_color, + float* out_depth, int* radii = nullptr, bool debug = false); @@ -72,6 +73,7 @@ namespace CudaRasterizer char* binning_buffer, char* image_buffer, const float* dL_dpix, + const float* dL_depths, float* dL_dmean2D, float* dL_dconic, float* dL_dopacity, @@ -85,4 +87,4 @@ namespace CudaRasterizer }; }; -#endif \ No newline at end of file +#endif diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index f8782ac4..5a1bb288 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -216,6 +216,7 @@ int CudaRasterizer::Rasterizer::forward( const float tan_fovx, float tan_fovy, const bool prefiltered, float* out_color, + float* out_depth, int* radii, bool debug) { @@ -326,11 +327,13 @@ int CudaRasterizer::Rasterizer::forward( width, height, geomState.means2D, feature_ptr, + geomState.depths, geomState.conic_opacity, imgState.accum_alpha, imgState.n_contrib, background, - out_color), debug) + out_color, + out_depth), debug) return num_rendered; } @@ -357,6 +360,7 @@ void CudaRasterizer::Rasterizer::backward( char* binning_buffer, char* img_buffer, const float* dL_dpix, + const float* dL_depths, float* dL_dmean2D, float* dL_dconic, float* dL_dopacity, @@ -387,6 +391,7 @@ void CudaRasterizer::Rasterizer::backward( // opacity and RGB of Gaussians from per-pixel loss gradients. // If we were given precomputed colors and not SHs, use them. const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb; + const float* depth_ptr = geomState.depths; CHECK_CUDA(BACKWARD::render( tile_grid, block, @@ -397,9 +402,11 @@ void CudaRasterizer::Rasterizer::backward( geomState.means2D, geomState.conic_opacity, color_ptr, + depth_ptr, imgState.accum_alpha, imgState.n_contrib, dL_dpix, + dL_depths, (float3*)dL_dmean2D, (float4*)dL_dconic, dL_dopacity, @@ -431,4 +438,4 @@ void CudaRasterizer::Rasterizer::backward( dL_dsh, (glm::vec3*)dL_dscale, (glm::vec4*)dL_drot), debug) -} \ No newline at end of file +} diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index bbef37d1..60da4f4c 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -83,22 +83,22 @@ def forward( if raster_settings.debug: cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted try: - num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) + num_rendered, color, depth, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) except Exception as ex: torch.save(cpu_args, "snapshot_fw.dump") print("\nAn error occured in forward. Please forward snapshot_fw.dump for debugging.") raise ex else: - num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) + num_rendered, color, depth, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args) # Keep relevant tensors for backward ctx.raster_settings = raster_settings ctx.num_rendered = num_rendered ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer) - return color, radii + return color, radii, depth @staticmethod - def backward(ctx, grad_out_color, _): + def backward(ctx, grad_out_color, grad_radii, grad_depth): # Restore necessary values from context num_rendered = ctx.num_rendered @@ -118,7 +118,8 @@ def backward(ctx, grad_out_color, _): raster_settings.projmatrix, raster_settings.tanfovx, raster_settings.tanfovy, - grad_out_color, + grad_out_color, + grad_depth, sh, raster_settings.sh_degree, raster_settings.campos, diff --git a/rasterize_points.cu b/rasterize_points.cu index ddc5cf8b..f05babb7 100644 --- a/rasterize_points.cu +++ b/rasterize_points.cu @@ -32,7 +32,7 @@ std::function resizeFunctional(torch::Tensor& t) { return lambda; } -std::tuple +std::tuple RasterizeGaussiansCUDA( const torch::Tensor& background, const torch::Tensor& means3D, @@ -66,6 +66,7 @@ RasterizeGaussiansCUDA( auto float_opts = means3D.options().dtype(torch::kFloat32); torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts); + torch::Tensor out_depth = torch::full({1, H, W}, 0.0, float_opts); torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); torch::Device device(torch::kCUDA); @@ -108,10 +109,11 @@ RasterizeGaussiansCUDA( tan_fovy, prefiltered, out_color.contiguous().data(), + out_depth.contiguous().data(), radii.contiguous().data(), debug); } - return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer); + return std::make_tuple(rendered, out_color, out_depth, radii, geomBuffer, binningBuffer, imgBuffer); } std::tuple @@ -129,6 +131,7 @@ std::tuple(binningBuffer.contiguous().data_ptr()), reinterpret_cast(imageBuffer.contiguous().data_ptr()), dL_dout_color.contiguous().data(), + dL_dout_depth.contiguous().data(), dL_dmeans2D.contiguous().data(), dL_dconic.contiguous().data(), dL_dopacity.contiguous().data(), @@ -214,4 +218,4 @@ torch::Tensor markVisible( } return present; -} \ No newline at end of file +} diff --git a/rasterize_points.h b/rasterize_points.h index 9023d994..bd6ad90e 100644 --- a/rasterize_points.h +++ b/rasterize_points.h @@ -15,7 +15,7 @@ #include #include -std::tuple +std::tuple RasterizeGaussiansCUDA( const torch::Tensor& background, const torch::Tensor& means3D, @@ -52,6 +52,7 @@ std::tuple