-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: parallel scan extension for CPU (#17)
* draft: scan extension on cpu * include cpp extension in setup.py * fix: remove extra arg * fix: compile errors * use dev versioning * load library file when being imported * test equivalence to numba version * refactor: return tensor instead of void function * refactor: rename RecurrenceCUDA to Recurrence to cover CPU device * refactor: update functions to use Recurrence for CPU and CUDA devices * refactor: remove contiguous check besides output tensor * refactor: add warning for missing _C*.so file and check extension loading in Recurrence * ci: add workflow step to build CPP extension and copy shared objects * apply suggestions and remove comments * refactor: apply google style format
- Loading branch information
1 parent
eb7b778
commit 5801748
Showing
8 changed files
with
226 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import torch | ||
import torch.nn.functional as F | ||
import pytest | ||
from torchlpc.core import lpc_np | ||
|
||
|
||
from .test_grad import create_test_inputs | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"samples", | ||
[64, 4097], | ||
) | ||
@pytest.mark.parametrize( | ||
"cmplx", | ||
[True, False], | ||
) | ||
def test_scan_cpu_equiv(samples: int, cmplx: bool): | ||
batch_size = 4 | ||
x = torch.randn( | ||
batch_size, samples, dtype=torch.float32 if not cmplx else torch.complex64 | ||
) | ||
A = torch.rand_like(x) * 1.8 - 0.9 | ||
zi = torch.randn(batch_size, dtype=x.dtype) | ||
|
||
numba_y = torch.from_numpy( | ||
lpc_np( | ||
x.cpu().numpy(), | ||
-A.cpu().unsqueeze(2).numpy(), | ||
zi.cpu().unsqueeze(1).numpy(), | ||
) | ||
) | ||
ext_y = torch.ops.torchlpc.scan_cpu(x, A, zi) | ||
|
||
assert torch.allclose(numba_y, ext_y) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#include <torch/script.h> | ||
#include <torch/torch.h> | ||
|
||
#include <algorithm> | ||
#include <utility> | ||
#include <vector> | ||
|
||
template <typename scalar_t> | ||
void scan_cpu(const at::Tensor &input, const at::Tensor &weights, | ||
const at::Tensor &initials, const at::Tensor &output) { | ||
TORCH_CHECK(input.dim() == 2, "Input must be 2D"); | ||
TORCH_CHECK(initials.dim() == 1, "Initials must be 1D"); | ||
TORCH_CHECK(weights.sizes() == input.sizes(), | ||
"Weights must have the same size as input"); | ||
TORCH_CHECK(output.sizes() == input.sizes(), | ||
"Output must have the same size as input"); | ||
TORCH_CHECK(initials.size(0) == input.size(0), | ||
"The first dimension of initials must be the same as the first " | ||
"dimension of input"); | ||
TORCH_INTERNAL_ASSERT(input.device().is_cpu(), "Input must be on CPU"); | ||
TORCH_INTERNAL_ASSERT(initials.device().is_cpu(), | ||
"Initials must be on CPU"); | ||
TORCH_INTERNAL_ASSERT(weights.device().is_cpu(), "Weights must be on CPU"); | ||
TORCH_INTERNAL_ASSERT(output.device().is_cpu(), "Output must be on CPU"); | ||
TORCH_INTERNAL_ASSERT(output.is_contiguous(), "Output must be contiguous"); | ||
|
||
auto input_contiguous = input.contiguous(); | ||
auto weights_contiguous = weights.contiguous(); | ||
auto initials_contiguous = initials.contiguous(); | ||
|
||
auto n_batch = input.size(0); | ||
auto T = input.size(1); | ||
auto total_size = input.numel(); | ||
|
||
std::pair<scalar_t, scalar_t> buffer[total_size]; | ||
|
||
const scalar_t *input_ptr = input_contiguous.data_ptr<scalar_t>(); | ||
const scalar_t *initials_ptr = initials_contiguous.data_ptr<scalar_t>(); | ||
const scalar_t *weights_ptr = weights_contiguous.data_ptr<scalar_t>(); | ||
scalar_t *output_ptr = output.data_ptr<scalar_t>(); | ||
|
||
std::transform(weights_ptr, weights_ptr + total_size, input_ptr, buffer, | ||
[](const scalar_t &a, const scalar_t &b) { | ||
return std::make_pair(a, b); | ||
}); | ||
|
||
at::parallel_for(0, n_batch, 1, [&](int64_t start, int64_t end) { | ||
for (auto b = start; b < end; b++) { | ||
std::inclusive_scan( | ||
buffer + b * T, buffer + (b + 1) * T, buffer + b * T, | ||
[](const std::pair<scalar_t, scalar_t> &a, | ||
const std::pair<scalar_t, scalar_t> &b) { | ||
return std::make_pair(a.first * b.first, | ||
a.second * b.first + b.second); | ||
}, | ||
std::make_pair((scalar_t)1.0, initials_ptr[b])); | ||
} | ||
}); | ||
|
||
std::transform( | ||
buffer, buffer + total_size, output_ptr, | ||
[](const std::pair<scalar_t, scalar_t> &a) { return a.second; }); | ||
} | ||
|
||
at::Tensor scan_cpu_wrapper(const at::Tensor &input, const at::Tensor &weights, | ||
const at::Tensor &initials) { | ||
TORCH_CHECK(input.is_floating_point() || input.is_complex(), | ||
"Input must be floating point or complex"); | ||
TORCH_CHECK(initials.scalar_type() == input.scalar_type(), | ||
"Initials must have the same scalar type as input"); | ||
TORCH_CHECK(weights.scalar_type() == input.scalar_type(), | ||
"Weights must have the same scalar type as input"); | ||
|
||
auto output = at::empty_like(input); | ||
|
||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( | ||
input.scalar_type(), "scan_cpu", | ||
[&] { scan_cpu<scalar_t>(input, weights, initials, output); }); | ||
return output; | ||
} | ||
|
||
TORCH_LIBRARY(torchlpc, m) { | ||
m.def("torchlpc::scan_cpu(Tensor a, Tensor b, Tensor c) -> Tensor"); | ||
} | ||
|
||
TORCH_LIBRARY_IMPL(torchlpc, CPU, m) { m.impl("scan_cpu", &scan_cpu_wrapper); } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters