add cuda ipynb

Sister19 · Mar 17, 2023 · 09fa4d1 · 09fa4d1
1 parent 5ad849e
commit 09fa4d1
Showing 1 changed file with 277 additions and 0 deletions.
diff --git a/src/cuda/cuda_colab.ipynb b/src/cuda/cuda_colab.ipynb
@@ -0,0 +1,277 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "GY5LBx4UuxZl"
+      },
+      "source": [
+        "# Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PExlVrs4uv4m"
+      },
+      "outputs": [],
+      "source": [
+        "# https://www.geeksforgeeks.org/how-to-run-cuda-c-c-on-jupyter-notebook-in-google-colaboratory/"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "M-AIy-7PsXXp"
+      },
+      "outputs": [],
+      "source": [
+        "!apt-get --purge remove cuda nvidia* libnvidia-*\n",
+        "!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge\n",
+        "!apt-get remove cuda-*\n",
+        "!apt autoremove\n",
+        "!apt-get update"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ZNBZ95nauZ3u"
+      },
+      "outputs": [],
+      "source": [
+        "!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb\n",
+        "!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb\n",
+        "!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub\n",
+        "!apt-get update\n",
+        "!apt-get install cuda-9.2"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KhW3T1YGu6Ew"
+      },
+      "source": [
+        "# Kode CUDA"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Co9T1Xpo3AnO",
+        "outputId": "b97898d0-db3c-42a8-ffb2-288b9773276d"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Overwriting cuda.cu\n"
+          ]
+        }
+      ],
+      "source": [
+        "#include <stdio.h>\n",
+        "#include <stdlib.h>\n",
+        "#include <math.h>\n",
+        "#include <cuda_runtime.h>\n",
+        "#include <cuComplex.h>\n",
+        "#define MAX_N 512\n",
+        "#define BLOCK_SIZE 16\n",
+        "\n",
+        "struct Matrix {\n",
+        "    int size;\n",
+        "    double mat[MAX_N][MAX_N];\n",
+        "};\n",
+        "\n",
+        "struct FreqMatrix {\n",
+        "    int size;\n",
+        "    cuDoubleComplex mat[MAX_N][MAX_N];\n",
+        "};\n",
+        "\n",
+        "__device__ cuDoubleComplex cuCexp(cuDoubleComplex x)\n",
+        "{\n",
+        "    double factor = exp(x.x);\n",
+        "    return make_cuDoubleComplex(factor * cos(x.y), factor * sin(x.y));\n",
+        "}\n",
+        "\n",
+        "__global__ void dft_kernel(struct Matrix *mat, struct FreqMatrix *freq_domain)\n",
+        "{\n",
+        "  // Implement shared memory\n",
+        "    __shared__ double shared_mat[BLOCK_SIZE][BLOCK_SIZE];\n",
+        "\n",
+        "    int k = blockIdx.x;\n",
+        "    int l = threadIdx.x;\n",
+        "\n",
+        "    cuDoubleComplex element = make_cuDoubleComplex(0.0, 0.0);\n",
+        "\n",
+        "    for (int i = 0; i < mat->size; i += BLOCK_SIZE) {\n",
+        "        for (int j = 0; j < mat->size; j += BLOCK_SIZE) {\n",
+        "            // Load a block of input matrix into shared memory\n",
+        "            shared_mat[l][k] = mat->mat[i + l][j + k];\n",
+        "\n",
+        "            __syncthreads();\n",
+        "\n",
+        "            for (int m = 0; m < BLOCK_SIZE; m++) {\n",
+        "                for (int n = 0; n < BLOCK_SIZE; n++) {\n",
+        "                    cuDoubleComplex arg      = make_cuDoubleComplex((i + m) * k / (double) mat->size + (j + n) * l / (double) mat->size, 0.0);\n",
+        "                    cuDoubleComplex exponent = cuCexp(make_cuDoubleComplex(0.0, -2.0 * M_PI * arg.x));\n",
+        "                    cuDoubleComplex value    = make_cuDoubleComplex(shared_mat[m][k], 0.0);\n",
+        "                    element = cuCadd(element, cuCmul(value, exponent));\n",
+        "                }\n",
+        "            }\n",
+        "\n",
+        "            __syncthreads();\n",
+        "        }\n",
+        "    }\n",
+        "\n",
+        "    element = cuCdiv(element, make_cuDoubleComplex(mat->size*mat->size, 0.0));\n",
+        "    freq_domain->mat[k][l] = element;\n",
+        "}\n",
+        "\n",
+        "void readMatrix(struct Matrix *m)\n",
+        "{\n",
+        "    scanf(\"%d\", &(m->size));\n",
+        "    for (int i = 0; i < m->size; i++)\n",
+        "        for (int j = 0; j < m->size; j++)\n",
+        "            scanf(\"%lf\", &(m->mat[i][j]));\n",
+        "}\n",
+        "\n",
+        "int main(void)\n",
+        "{\n",
+        "    struct Matrix source;\n",
+        "    struct FreqMatrix freq_domain;\n",
+        "    readMatrix(&source);\n",
+        "    freq_domain.size = source.size;\n",
+        "\n",
+        "    // Allocate device memory\n",
+        "    struct Matrix *d_source;\n",
+        "    cudaMalloc(&d_source, sizeof(struct Matrix));\n",
+        "    cudaMemcpy(d_source, &source, sizeof(struct Matrix), cudaMemcpyHostToDevice);\n",
+        "\n",
+        "    struct FreqMatrix *d_freq_domain;\n",
+        "    cudaMalloc(&d_freq_domain, sizeof(struct FreqMatrix));\n",
+        "    cudaMemcpy(d_freq_domain, &freq_domain, sizeof(struct FreqMatrix), cudaMemcpyHostToDevice);\n",
+        "\n",
+        "    // Launch kernel\n",
+        "    dft_kernel<<<source.size, source.size>>>(d_source, d_freq_domain);\n",
+        "\n",
+        "    // Copy results back to host\n",
+        "    cudaMemcpy(&freq_domain, d_freq_domain, sizeof(struct FreqMatrix), cudaMemcpyDeviceToHost);\n",
+        "\n",
+        "    // Free device memory\n",
+        "    cudaFree(d_source);\n",
+        "    cudaFree(d_freq_domain);\n",
+        "\n",
+        "    cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n",
+        "    for (int k = 0; k < source.size; k++) {\n",
+        "        for (int l = 0; l < source.size; l++) {\n",
+        "            cuDoubleComplex el = freq_domain.mat[k][l];\n",
+        "            printf(\"(%lf, %lf) \", el.x, el.y);\n",
+        "            sum = cuCadd(sum, el);\n",
+        "        }\n",
+        "        printf(\"\\n\");\n",
+        "    }\n",
+        "    return 0;\n",
+        "};"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Write CUDA"
+      ],
+      "metadata": {
+        "id": "9dCGWHXiPUsX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "h6eFuOC1qTid"
+      },
+      "outputs": [],
+      "source": [
+        "!nvcc cuda.cu -o cuda"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Test CUDA with Testcase"
+      ],
+      "metadata": {
+        "id": "QzqypJcXPKap"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!time ./cuda < 512.txt > output.txt"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "-RHIFjITEX7r",
+        "outputId": "455d4455-e1e4-48d8-ed63-1bdb314ab965"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "real\t0m2.798s\n",
+            "user\t0m1.109s\n",
+            "sys\t0m0.743s\n"
+          ]
+        }
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [
+        "F-XhfXxnh1PH",
+        "VRjJkOLWh9Zh",
+        "0OniSKRHisBx",
+        "jTEFIvG_i4dZ",
+        "2pXi7bvNkE-S",
+        "nhR97Rz4x9A0"
+      ],
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.6"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}