From cde0b157e9d60c73d51eb8ebe487c0e6c51bd4c1 Mon Sep 17 00:00:00 2001
From: Liu Liu
Date: Fri, 16 Aug 2024 13:58:01 -0400
Subject: [PATCH] Test some shapes.

---
 bin/nnc/adversarial_shape_test.cpp | 275 +++++++++++++++++++++++++++++
 bin/nnc/makefile                   |   5 +-
 2 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 bin/nnc/adversarial_shape_test.cpp

diff --git a/bin/nnc/adversarial_shape_test.cpp b/bin/nnc/adversarial_shape_test.cpp
new file mode 100644
index 000000000..758a63078
--- /dev/null
+++ b/bin/nnc/adversarial_shape_test.cpp
@@ -0,0 +1,275 @@
+extern "C" {
+#include <ccv.h>
+#include <ccv_internal.h>
+#include <nnc/ccv_nnc.h>
+#include <nnc/ccv_nnc_easy.h>
+}
+#include "nnc/mfa/v2/ShaderCache.hpp"
+#include "nnc/mfa/v2/GEMMDescriptor.hpp"
+#include "nnc/mfa/v2/GEMMKernelDescriptor.hpp"
+#include "nnc/mfa/v2/GEMMKernel.hpp"
+#include "3rdparty/dsfmt/dSFMT.h"
+#include <iostream>
+
+ShaderCache shaderCache;
+
+std::pair<int, int> profileProblemSize(GEMMDescriptor descriptor)
+{
+  const int problemSize1 = descriptor.matrixDimensions[0];
+  const int problemSize2 = descriptor.matrixDimensions[1];
+  const int problemSize3 = descriptor.matrixDimensions[2];
+
+  // Allocate FP32 memory for the operands.
+  float* A = (float*)ccmalloc(sizeof(float) * problemSize1 * problemSize3);
+  float* B = (float*)ccmalloc(sizeof(float) * problemSize2 * problemSize3);
+  float* C = (float*)ccmalloc(sizeof(float) * problemSize1 * problemSize2);
+  float* bias = (float*)ccmalloc(sizeof(float) * problemSize2);
+
+  dsfmt_t dsfmt;
+  dsfmt_init_gen_rand(&dsfmt, 1);
+
+  // Initialize A to random numbers.
+  int i, j;
+  for (i = 0; i < problemSize3; i++)
+    for (j = 0; j < problemSize1; j++)
+      A[i * problemSize1 + j] = dsfmt_genrand_open_close(&dsfmt);
+  // Initialize B to random numbers.
+  for (int rowID = 0; rowID < problemSize2; rowID++)
+  {
+    for (int columnID = 0; columnID < problemSize3; columnID++)
+    {
+      const int address = rowID * problemSize3 + columnID;
+      B[address] = dsfmt_genrand_open_close(&dsfmt);
+    }
+  }
+
+  // Initialize the bias to random numbers.
+  for (int rowID = 0; rowID < problemSize2; rowID++)
+  {
+    bias[rowID] = dsfmt_genrand_open_close(&dsfmt);
+  }
+  void* A_storage = nullptr;
+  if (descriptor.memoryPrecisions.A == GEMMOperandPrecision::FP16)
+  {
+    A_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize1 * problemSize3);
+    ccv_float_to_half_precision(A, (uint16_t*)A_storage, problemSize1 * problemSize3);
+    void* t = A_storage; // Swap pointers so A refers to the converted buffer; the FP32 copy is freed at the end.
+    A_storage = A;
+    A = (float*)t;
+  } else if (descriptor.memoryPrecisions.A == GEMMOperandPrecision::BF16) {
+    A_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize1 * problemSize3);
+    for (int i = 0; i < problemSize1 * problemSize3; i++)
+      ((uint16_t*)A_storage)[i] = ((uint16_t*)A)[i * 2 + 1]; // Keep the high 16 bits of each FP32 value (BF16 truncation, little-endian).
+    void* t = A_storage;
+    A_storage = A;
+    A = (float*)t;
+  }
+  void* B_storage = nullptr;
+  if (descriptor.memoryPrecisions.B == GEMMOperandPrecision::FP16)
+  {
+    B_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2 * problemSize3);
+    ccv_float_to_half_precision(B, (uint16_t*)B_storage, problemSize2 * problemSize3);
+    void* t = B_storage;
+    B_storage = B;
+    B = (float*)t;
+  } else if (descriptor.memoryPrecisions.B == GEMMOperandPrecision::BF16) {
+    B_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2 * problemSize3);
+    for (int i = 0; i < problemSize2 * problemSize3; i++)
+      ((uint16_t*)B_storage)[i] = ((uint16_t*)B)[i * 2 + 1];
+    void* t = B_storage;
+    B_storage = B;
+    B = (float*)t;
+  }
+  void* bias_storage = nullptr;
+  if (descriptor.memoryPrecisions.bias == GEMMOperandPrecision::FP16)
+  {
+    bias_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2);
+    ccv_float_to_half_precision(bias, (uint16_t*)bias_storage, problemSize2);
+    void* t = bias_storage;
+    bias_storage = bias;
+    bias = (float*)t;
+  } else if (descriptor.memoryPrecisions.bias == GEMMOperandPrecision::BF16) {
+    bias_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2);
+    for (int i = 0; i < problemSize2; i++)
+      ((uint16_t*)bias_storage)[i] = ((uint16_t*)bias)[i * 2 + 1];
+    void* t = bias_storage;
+    bias_storage = bias;
+    bias = (float*)t;
+  }
+
+  // Multiply A with B.
+  int maxGFLOPS = 0;
+  int occupancy = 0;
+  DeviceProperties dprops;
+  NS::SharedPtr<MTL::Device> device = NS::TransferPtr(MTL::CreateSystemDefaultDevice());
+  NS::SharedPtr<MTL::CommandQueue> queue = NS::TransferPtr(device->newCommandQueue());
+  {
+    // Generate the kernel.
+    auto pipelineValue = shaderCache.findKernel(descriptor, device.get(), dprops);
+    occupancy = pipelineValue->pipeline->maxTotalThreadsPerThreadgroup();
+    NS::SharedPtr<MTL::Buffer> bufferA = NS::TransferPtr(device->newBuffer(A, descriptor.memoryPrecisions.A.size() * problemSize1 * problemSize3, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    NS::SharedPtr<MTL::Buffer> bufferB = NS::TransferPtr(device->newBuffer(B, descriptor.memoryPrecisions.B.size() * problemSize2 * problemSize3, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    NS::SharedPtr<MTL::Buffer> bufferC = NS::TransferPtr(device->newBuffer(C, descriptor.memoryPrecisions.C.size() * problemSize1 * problemSize2, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    NS::SharedPtr<MTL::Buffer> bufferBias = NS::TransferPtr(device->newBuffer(bias, descriptor.memoryPrecisions.bias.size() * problemSize2, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    for (int i = 0; i < 15; i++) // Best-of-15 measurement rounds.
+    {
+      const int duplicatedCommandCount = 20; // Encode 20 identical GEMM dispatches per command buffer to amortize measurement overhead.
+      NS::SharedPtr<MTL::CommandBuffer> commandBuffer = NS::TransferPtr(queue->commandBuffer());
+      NS::SharedPtr<MTL::ComputeCommandEncoder> encoder = NS::TransferPtr(commandBuffer->computeCommandEncoder());
+      encoder->setComputePipelineState(pipelineValue->pipeline.get());
+      encoder->setThreadgroupMemoryLength(pipelineValue->kernel->threadgroupMemoryAllocation, 0);
+      encoder->setBuffer(bufferA.get(), 0, 0);
+      encoder->setBuffer(bufferB.get(), 0, 1);
+      encoder->setBuffer(bufferC.get(), 0, 2);
+      encoder->useResource(bufferA.get(), MTL::ResourceUsageRead);
+      encoder->useResource(bufferB.get(), MTL::ResourceUsageRead);
+      encoder->useResource(bufferC.get(), MTL::ResourceUsageWrite);
+      if (descriptor.useBias)
+      {
+        encoder->setBuffer(bufferBias.get(), 0, 3);
+        encoder->useResource(bufferBias.get(), MTL::ResourceUsageRead);
+      }
+      for (int j = 0; j < duplicatedCommandCount; j++)
+      {
+        auto ceilDivide =
+          [=](int64_t target, uint16_t granularity) -> int64_t {
+            return (target + int64_t(granularity) - 1) / int64_t(granularity);
+          };
+        MTL::Size gridSize = MTL::Size(ceilDivide(problemSize2, pipelineValue->kernel->blockDimensions[1]), ceilDivide(problemSize1, pipelineValue->kernel->blockDimensions[0]), 1);
+        MTL::Size groupSize = MTL::Size(pipelineValue->kernel->threadgroupSize, 1, 1);
+        encoder->dispatchThreadgroups(gridSize, groupSize);
+      }
+      encoder->endEncoding();
+      commandBuffer->commit();
+      commandBuffer->waitUntilCompleted();
+      auto start = commandBuffer->GPUStartTime();
+      auto end = commandBuffer->GPUEndTime();
+      auto latency = end - start;
+
+      // Determine the amount of work done.
+      auto operations = (int64_t)2 * problemSize1 * problemSize2 * problemSize3;
+      operations = operations * duplicatedCommandCount;
+      auto gflops = (int)((double)operations / (double)latency / 1e9);
+
+      // Report the results.
+      // let latencyMicroseconds = Int(latency / 1e-6)
+      // print(latencyMicroseconds, "μs", gflops, "GFLOPS")
+      maxGFLOPS = std::max(maxGFLOPS, gflops);
+    }
+  }
+
+  ccfree(A);
+  ccfree(B);
+  ccfree(C);
+  ccfree(bias);
+  if (A_storage != nullptr)
+    ccfree(A_storage);
+  if (B_storage != nullptr)
+    ccfree(B_storage);
+  if (bias_storage != nullptr)
+    ccfree(bias_storage);
+  return std::make_pair(maxGFLOPS, occupancy);
+}
+
+struct TestDescriptor {
+  GEMMOperandPrecision precision;
+  int problemSize[3];
+  bool transposeState[2];
+  bool useBias;
+};
+
+void runTest(TestDescriptor descriptor)
+{
+  // Set up the kernel.
+  GEMMDescriptor gemmDesc = GEMMDescriptor();
+  auto precision = descriptor.precision;
+  unsigned int m = (unsigned int)descriptor.problemSize[0];
+  unsigned int n = (unsigned int)descriptor.problemSize[1];
+  unsigned int k = (unsigned int)descriptor.problemSize[2];
+  gemmDesc.matrixDimensions = simd::uint3 { m, n, k };
+  gemmDesc.memoryPrecisions = {
+    .A = precision, .B = precision, .C = precision, .bias = precision
+  };
+  gemmDesc.transposeState = simd::uchar3 { descriptor.transposeState[0], descriptor.transposeState[1], descriptor.transposeState[0] };
+  gemmDesc.useBias = descriptor.useBias;
+
+  // Test the kernel.
+  auto statistic = profileProblemSize(gemmDesc);
+
+  // Report the results.
+  std::cout << "problemSize = " << descriptor.problemSize[0] << "x" << descriptor.problemSize[1] << "x" << descriptor.problemSize[2] << " | ";
+  if (descriptor.transposeState[0])
+  {
+    std::cout << "A^T ";
+  } else {
+    std::cout << "A ";
+  }
+  if (descriptor.transposeState[1])
+  {
+    std::cout << "B^T ";
+  } else {
+    std::cout << "B ";
+  }
+  if (descriptor.useBias)
+  {
+    std::cout << "+ BIAS | ";
+  } else {
+    std::cout << " | ";
+  }
+
+  std::cout << statistic.first << " GFLOPS " << statistic.second << " threads/core | " << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+  ccv_nnc_init();
+  {
+    bool transposeStates[] = {
+      false, false,
+      false, true,
+      // true, false,
+      // true, true,
+      false, false,
+      false, true,
+      // true, false,
+      // true, true,
+    };
+    bool useBias[] = {
+      false,
+      false,
+      // false,
+      // false,
+      true,
+      true,
+      // true,
+      // true
+    };
+    int problemSizes[] = {
+      4608 * 2, 3072, 3072 * 4,
+      4608 * 2, 3072 * 4, 3072,
+      4608 * 2, 3072, 3072,
+      // 4608, 3072, 3072 * 3,
+      // 4608, 3072 * 3, 3072,
+    };
+
+    printf("\nPerformance tests:\n");
+    for (int i = 0; i < sizeof(problemSizes) / (sizeof(int) * 3); i++)
+    // for (int problemSize = 7936; problemSize <= 3072 * 4; problemSize += 128)
+    {
+      for (int j = 0; j < sizeof(transposeStates) / (sizeof(bool) * 2); j++)
+      {
+        TestDescriptor testDescriptor = TestDescriptor();
+        testDescriptor.precision = GEMMOperandPrecision::FP16;
+        testDescriptor.problemSize[0] = problemSizes[i * 3];
+        testDescriptor.problemSize[1] = problemSizes[i * 3 + 1];
+        testDescriptor.problemSize[2] = problemSizes[i * 3 + 2];
+        testDescriptor.transposeState[0] = transposeStates[j * 2];
+        testDescriptor.transposeState[1] = transposeStates[j * 2 + 1];
+        testDescriptor.useBias = useBias[j];
+        runTest(testDescriptor);
+      }
+    }
+  }
+  return 0;
+}
diff --git a/bin/nnc/makefile b/bin/nnc/makefile
index 422743656..47a9e58a4 100644
--- a/bin/nnc/makefile
+++ b/bin/nnc/makefile
@@ -4,7 +4,7 @@ LDFLAGS := -L"../../lib" -lccv $(LDFLAGS)
 CFLAGS := -O3 -Wall -I"../../lib" $(CFLAGS)
 NVFLAGS := -O3 -I"../../lib" -lineinfo $(NVFLAGS)
 
-TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm laplacian_test
+TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm laplacian_test adversarial_shape_test
 
 FUZZ_TARGETS = csv_fuzz
 
@@ -40,6 +40,9 @@ libccv.a:
 laplacian_test.o: laplacian_test.cpp
 	$(CC) $< -o $@ -c $(CFLAGS) -std=c++17
 
+adversarial_shape_test.o: adversarial_shape_test.cpp
+	$(CC) $< -o $@ -c $(CFLAGS) -std=c++17
+
 .gitignore:
 	echo $(TARGETS) | tr ' ' '\n' > .gitignore