chapel-lang · e-kayrakli · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024 · Jul 1, 2024
diff --git a/test/gpu/native/examples/blog-data/NUMLOCALES b/test/gpu/native/examples/blog-data/NUMLOCALES
@@ -0,0 +1 @@
+4
diff --git a/test/gpu/native/examples/blog-data/README.md b/test/gpu/native/examples/blog-data/README.md
@@ -0,0 +1,3 @@
+This directory contains the example programs from a Chapel blog post (article link, then its pull request):
+  https://chapel-lang.org/blog/posts/gpu-data-movement/
+  https://github.com/chapel-lang/chapel-blog/pull/75
diff --git a/test/gpu/native/examples/blog-data/allocation.chpl b/test/gpu/native/examples/blog-data/allocation.chpl
@@ -0,0 +1,10 @@
+// Host-side array: declared outside any `on` block, then filled with 1s.
+var hostVals: [1..5] int;
+hostVals = 1;                // whole-array assignment, executed by the CPU
+on here.gpus[0] {
+  var devVals: [1..5] int;   // declared inside the GPU `on` block: device memory
+  devVals += 1;              // whole-array increment, runs on the GPU as a kernel
+  writeln(devVals);          // prints "1 1 1 1 1"
+}
+
+writeln(hostVals);           // prints "1 1 1 1 1"
diff --git a/test/gpu/native/examples/blog-data/allocation.good b/test/gpu/native/examples/blog-data/allocation.good
@@ -0,0 +1,2 @@
+1 1 1 1 1
+1 1 1 1 1
diff --git a/test/gpu/native/examples/blog-data/distributed.chpl b/test/gpu/native/examples/blog-data/distributed.chpl
@@ -0,0 +1,28 @@
+import RangeChunk.chunks;
+
+config const n = 32;         // problem size; settable with `--n`
+config const sliceSize = 4;  // number of elements per host<->device copy
+
+var hostData: [1..n] int;    // allocated on the host
+hostData = 1;                // filled by the [multicore] CPU
+
+// Level 1: one task per locale, each handed one contiguous piece of 1..n.
+coforall (node, nodeRange) in zip(Locales, chunks(1..n, numLocales)) do on node {
+  // Level 2: one task per GPU of this locale, splitting the locale's piece.
+  coforall (dev, devRange) in zip(here.gpus, chunks(nodeRange, here.gpus.size)) do on dev {
+    // Level 3: one task per slice. Integer division: this assumes that
+    // sliceSize evenly divides the piece owned by this GPU.
+    const sliceCount = devRange.size/sliceSize;
+
+    coforall slice in chunks(devRange, sliceCount) {
+      var devData: [slice] int;       // allocated on the device, one per task
+
+      devData = hostData[slice];      // copy a slice from host to device
+      devData += 1;                   // runs on the GPU as a kernel
+      hostData[slice] = devData;      // copy from device back to the host slice
+    }
+  }
+}
+
+// All tasks have joined by the time the outermost coforall completes.
+writeln(hostData);  // prints "2 2 2 2 2 ..."
diff --git a/test/gpu/native/examples/blog-data/distributed.good b/test/gpu/native/examples/blog-data/distributed.good
@@ -0,0 +1 @@
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
diff --git a/test/gpu/native/examples/blog-data/movement.chpl b/test/gpu/native/examples/blog-data/movement.chpl
@@ -0,0 +1,12 @@
+var hostData: [1..5] int;   // allocated on the host
+hostData = 1;               // filled by the [multicore] CPU
+
+// Round trip through the first GPU: copy in, increment, copy out.
+on here.gpus[0] {
+  var devData: [1..5] int;  // allocated on the device
+  devData = hostData;       // host -> device copy
+  devData += 1;             // runs on the GPU as a kernel
+  hostData = devData;       // device -> host copy
+}
+
+writeln(hostData);  // prints "2 2 2 2 2"
diff --git a/test/gpu/native/examples/blog-data/movement.good b/test/gpu/native/examples/blog-data/movement.good
@@ -0,0 +1 @@
+2 2 2 2 2
diff --git a/test/gpu/native/examples/blog-data/overlap.chpl b/test/gpu/native/examples/blog-data/overlap.chpl
@@ -0,0 +1,21 @@
+import RangeChunk.chunks;
+
+config const n = 32;         // problem size; settable with `--n`
+config const sliceSize = 4;  // number of elements per host<->device copy
+
+var hostData: [1..n] int;    // allocated on the host
+hostData = 1;                // filled by the [multicore] CPU
+
+// Integer division: assumes sliceSize evenly divides n.
+const sliceCount = n/sliceSize;
+
+// One task per slice, all of them targeting the first GPU.
+on here.gpus[0] do coforall slice in chunks(1..n, sliceCount) {
+  var devData: [slice] int;     // allocated on the device, private to this task
+
+  devData = hostData[slice];    // copy a slice from host to device
+  devData += 1;                 // runs on the GPU as a kernel
+  hostData[slice] = devData;    // copy from device back to the host slice
+}
+
+writeln(hostData);  // prints "2 2 2 2 2 ..."
diff --git a/test/gpu/native/examples/blog-data/overlap.good b/test/gpu/native/examples/blog-data/overlap.good
@@ -0,0 +1 @@
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
diff --git a/test/gpu/native/examples/blog-data/parallel.chpl b/test/gpu/native/examples/blog-data/parallel.chpl
@@ -0,0 +1,24 @@
+import RangeChunk.chunks;
+
+config const n = 32;         // problem size; settable with `--n`
+config const sliceSize = 4;  // number of elements per host<->device copy
+
+var hostData: [1..n] int;    // allocated on the host
+hostData = 1;                // filled by the [multicore] CPU
+
+// One task per GPU on this locale; each GPU gets one contiguous piece of 1..n.
+coforall (dev, devRange) in zip(here.gpus, chunks(1..n, here.gpus.size)) do on dev {
+  // Integer division: assumes sliceSize evenly divides this GPU's piece.
+  const sliceCount = devRange.size/sliceSize;
+
+  // One task per slice, each with its own device array.
+  coforall slice in chunks(devRange, sliceCount) {
+    var devData: [slice] int;     // allocated on the device
+
+    devData = hostData[slice];    // copy a slice from host to device
+    devData += 1;                 // runs on the GPU as a kernel
+    hostData[slice] = devData;    // copy from device back to the host slice
+  }
+}
+
+writeln(hostData);  // prints "2 2 2 2 2 ..."
diff --git a/test/gpu/native/examples/blog-data/parallel.good b/test/gpu/native/examples/blog-data/parallel.good
@@ -0,0 +1 @@
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
diff --git a/test/gpu/native/examples/blog-data/slices.chpl b/test/gpu/native/examples/blog-data/slices.chpl
@@ -0,0 +1,21 @@
+import RangeChunk.chunks;
+
+config const n = 32;         // now, our application has `--n` to set this!
+config const sliceSize = 4;  // number of elements per slice
+
+const numSlices = n/sliceSize;  // assume divisibility for simplicity
+
+var HostArr: [1..n] int;  // allocated on the host
+HostArr = 1;              // executes on [multicore] CPU
+
+on here.gpus[0] {
+  // One slice-sized buffer, reused by every iteration: whole-array assignment
+  // needs both sides to have the same number of elements as `chunk`.
+  var DevArr: [1..sliceSize] int;  // allocated on the device
+  for chunk in chunks(1..n, numSlices) {
+    DevArr = HostArr[chunk];  // copy a slice from host to device
+    DevArr += 1;              // executes on GPU as a kernel
+    HostArr[chunk] = DevArr;  // copy from device to a slice on host
+  }
+}
+writeln(HostArr);          // prints "2 2 2 2 2 ..."
diff --git a/test/gpu/native/examples/blog-data/slices.good b/test/gpu/native/examples/blog-data/slices.good
@@ -0,0 +1 @@
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2