Skip to content

Commit

Permalink
1/N Batchify Rasterization Kernel
Browse files Browse the repository at this point in the history
Summary:
# Problem

In CT / State Encoding, we expect a scenario where we would like to render a batch of topologies where each of them would have different number of vertices and triangles. Currently the only way to support this with DRTK is to iterate over the batch in a for loop for each topology and render it.

In a series of diffs we would like to solve this issue by making drtk consume a batch of triangles as opposed to just 1 set of triangles. **However, we would like to achieve this behavior without affecting the most common single topology case by a lot**.

# How do we pass in multiple topologies in a single batch?
- We will provide a `TopologyBatch` structure in xrcia/lib/graphics/structures where we will provide functionality to create a `Batch x MaxTriangles x 3` and `Batch x MaxVertices x 3`.
- Padded vertices will be 0s and padded triangles will have MaxVertices - 1 as their value. But these will discarded as degenerate in rasterization / rendering.

# In this diff
- Extend `rasterize_kernel` and `rasterize_lines_kernel` to support a batch dimension as default.
- `rasterize` will now unsqueeze the batch dimension when using a single topo
- We access the vertex indices of triangles by walking an additional `batch stride * n` in the triangles data pointer.
- Add an extra condition to check to see if the triangles are degenerate; this happens when padding the batch.
- We show that the we don't cause too much overhead in GPU by introducing these 3 extra operations (Same profiling as in D68194200)

Differential Revision: D68388659

fbshipit-source-id: b4f8a7daab8b133b8538f7e5db4f730f70b71deb
  • Loading branch information
kvenkateshan-meta authored and facebook-github-bot committed Jan 22, 2025
1 parent 9683df2 commit e027471
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 18 deletions.
8 changes: 7 additions & 1 deletion drtk/rasterize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def rasterize(
vi (th.Tensor): face vertex index list tensor. The most significant nibble of vi is
reserved for controlling visibility of the edges in wireframe mode. In non-wireframe
mode, content of the most significant nibble of vi will be ignored.
V x 3
F x 3 or N x F x 3
height (int): height of the image in pixels.
Expand All @@ -56,6 +56,9 @@ def rasterize(
This function is not differentiable. The gradients should be computed with
:func:`edge_grad_estimator` instead.
"""
if vi.ndim == 2:
vi = vi[None].expand(v.shape[0], -1, -1)

_, index_img = th.ops.rasterize_ext.rasterize(v, vi, height, width, wireframe)
return index_img

Expand Down Expand Up @@ -89,6 +92,9 @@ def rasterize_with_depth(
[N, H, W]. Values in of pixels in the depth image not covered by any pixel are 0.
"""
if vi.ndim == 2:
vi = vi[None].expand(v.shape[0], -1, -1)

depth_img, index_img = th.ops.rasterize_ext.rasterize(
v, vi, height, width, wireframe
)
Expand Down
50 changes: 33 additions & 17 deletions src/rasterize/rasterize_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ __global__ void rasterize_kernel(
const index_t H = packed_index_depth_img.sizes[1];
const index_t W = packed_index_depth_img.sizes[2];
const index_t V = v.sizes[1];
const index_t n_prim = vi.sizes[0];
const index_t n_prim = vi.sizes[1];

const index_t index_sN = packed_index_depth_img.strides[0];
const index_t index_sH = packed_index_depth_img.strides[1];
Expand All @@ -39,20 +39,24 @@ __global__ void rasterize_kernel(
const index_t v_sV = v.strides[1];
const index_t v_sC = v.strides[2];

const index_t vi_sF = vi.strides[0];
const index_t vi_sI = vi.strides[1];
const index_t vi_sN = vi.strides[0];
const index_t vi_sF = vi.strides[1];
const index_t vi_sI = vi.strides[2];

CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
const index_t n = index / n_prim;
const index_t id = index % n_prim;

const int32_t* __restrict vi_ptr = vi.data + vi_sF * id;
const int32_t* __restrict vi_ptr = vi.data + vi_sN * n + vi_sF * id;
const int32_t vi_0 = (int32_t)(((uint32_t)vi_ptr[vi_sI * 0]) & 0x0FFFFFFFU);
const int32_t vi_1 = vi_ptr[vi_sI * 1];
const int32_t vi_2 = vi_ptr[vi_sI * 2];

assert(vi_0 < V && vi_1 < V && vi_2 < V);

// Skip degenerate triangles. Useful for padding of vi buffer for batched rasterization
bool triangle_is_degenerate = (vi_0 == vi_1) && (vi_1 == vi_2);

const scalar_t* __restrict v_ptr = v.data + n * v_sN;
const scalar2_t p_0 = {v_ptr[v_sV * vi_0 + v_sC * 0], v_ptr[v_sV * vi_0 + v_sC * 1]};
const scalar2_t p_1 = {v_ptr[v_sV * vi_1 + v_sC * 0], v_ptr[v_sV * vi_1 + v_sC * 1]};
Expand All @@ -70,7 +74,7 @@ __global__ void rasterize_kernel(
const bool in_canvas = math::all_less_or_eq(min_p, {(scalar_t)(W - 1), (scalar_t)(H - 1)}) &&
math::all_greater(max_p, {0.f, 0.f});

if (all_z_greater_0 && in_canvas) {
if (all_z_greater_0 && in_canvas && ~triangle_is_degenerate) {
const scalar2_t v_01 = p_1 - p_0;
const scalar2_t v_02 = p_2 - p_0;
const scalar2_t v_12 = p_2 - p_1;
Expand Down Expand Up @@ -247,7 +251,7 @@ __global__ void rasterize_lines_kernel(
const index_t H = packed_index_depth_img.sizes[1];
const index_t W = packed_index_depth_img.sizes[2];
const index_t V = v.sizes[1];
const index_t n_prim = vi.sizes[0];
const index_t n_prim = vi.sizes[1];

const index_t index_sN = packed_index_depth_img.strides[0];
const index_t index_sH = packed_index_depth_img.strides[1];
Expand All @@ -257,18 +261,23 @@ __global__ void rasterize_lines_kernel(
const index_t v_sV = v.strides[1];
const index_t v_sC = v.strides[2];

const index_t vi_sF = vi.strides[0];
const index_t vi_sI = vi.strides[1];
const index_t vi_sN = vi.strides[0];
const index_t vi_sF = vi.strides[1];
const index_t vi_sI = vi.strides[2];

CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
const index_t n = index / n_prim;
const index_t id = index % n_prim;

const int32_t* __restrict vi_ptr = vi.data + vi_sF * id;
const int32_t* __restrict vi_ptr = vi.data + vi_sN * n + vi_sF * id;
const int32_t flag = (int32_t)((((uint32_t)vi_ptr[vi_sI * 0] & 0xF0000000U)) >> 28U);
const int32_t vi_0 = (int32_t)(((uint32_t)vi_ptr[vi_sI * 0]) & 0x0FFFFFFFU);
const int32_t vi_1 = vi_ptr[vi_sI * 1];
const int32_t vi_2 = vi_ptr[vi_sI * 2];

// Skip degenerate triangles. Useful for padding of vi buffer for batched rasterization
bool triangle_is_degenerate = (vi_0 == vi_1) && (vi_1 == vi_2);

const bool edge_0_visible = (flag & 0b00000001) != 0;
const bool edge_1_visible = (flag & 0b00000010) != 0;
const bool edge_2_visible = (flag & 0b00000100) != 0;
Expand All @@ -292,7 +301,7 @@ __global__ void rasterize_lines_kernel(
const bool in_canvas = math::all_less_or_eq(min_p, {(scalar_t)(W - 1), (scalar_t)(H - 1)}) &&
math::all_greater(max_p, {0.f, 0.f});

if (all_z_greater_0 && in_canvas) {
if (all_z_greater_0 && in_canvas && ~triangle_is_degenerate) {
const scalar2_t v_01 = p_1 - p_0;
const scalar2_t v_02 = p_2 - p_0;
const scalar2_t v_12 = p_2 - p_1;
Expand Down Expand Up @@ -412,19 +421,26 @@ std::vector<torch::Tensor> rasterize_cuda(
v.layout() == torch::kStrided && vi.layout() == torch::kStrided,
"rasterize(): expected all inputs to have torch.strided layout");
TORCH_CHECK(
(v.dim() == 3) && (vi.dim() == 2),
"rasterize(): expected v.ndim == 3, vi.ndim == 2, "
(v.dim() == 3) && (vi.dim() == 3),
"rasterize(): expected v.ndim == 3, vi.ndim == 3, "
"but got v with sizes ",
v.sizes(),
" and vi with sizes ",
vi.sizes());
TORCH_CHECK(
v.size(2) == 3 && vi.size(1) == 3,
"rasterize(): expected third dim of v to be of size 3, and second dim of vi to be of size 3, but got ",
v.size(2) == 3 && vi.size(2) == 3,
"rasterize(): expected third dim of v to be of size 3, and last dim of vi to be of size 3, but got ",
v.size(2),
" in the third dim of v, and ",
vi.size(1),
" in the second dim of vi");
vi.size(2),
" in the last dim of vi");
TORCH_CHECK(
(vi.size(0) == v.size(0)),
"rasterize(): expected first dim of vi to match first dim of v, but got ",
v.size(0),
" in first dim of v, and ",
vi.size(0),
" in the first dim of vi");
TORCH_CHECK(
v.size(1) < 0x10000000U,
"rasterize(): expected second dim of v to be less or eual to 268435456, but got ",
Expand All @@ -440,7 +456,7 @@ std::vector<torch::Tensor> rasterize_cuda(
auto stream = at::cuda::getCurrentCUDAStream();

auto N = v.size(0);
auto T = vi.size(0);
auto T = vi.size(1);
auto H = height;
auto W = width;
const auto count_rasterize = N * T;
Expand Down

0 comments on commit e027471

Please sign in to comment.