From 82a9389d62d82f60f5c85599712f5439f5591426 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 16 Feb 2024 10:44:20 -0800 Subject: [PATCH 1/4] Add a directory of tests for an upcoming blog post Signed-off-by: Engin Kayraklioglu --- test/gpu/native/examples/blog-data/README.md | 2 ++ .../native/examples/blog-data/allocation.chpl | 10 +++++++ .../native/examples/blog-data/allocation.good | 2 ++ .../examples/blog-data/distributed.chpl | 28 +++++++++++++++++++ .../examples/blog-data/distributed.good | 1 + .../native/examples/blog-data/movement.chpl | 12 ++++++++ .../native/examples/blog-data/movement.good | 1 + .../native/examples/blog-data/overlap.chpl | 21 ++++++++++++++ .../native/examples/blog-data/overlap.good | 1 + .../native/examples/blog-data/parallel.chpl | 24 ++++++++++++++++ .../native/examples/blog-data/parallel.good | 1 + .../gpu/native/examples/blog-data/slices.chpl | 21 ++++++++++++++ .../gpu/native/examples/blog-data/slices.good | 1 + 13 files changed, 125 insertions(+) create mode 100644 test/gpu/native/examples/blog-data/README.md create mode 100644 test/gpu/native/examples/blog-data/allocation.chpl create mode 100644 test/gpu/native/examples/blog-data/allocation.good create mode 100644 test/gpu/native/examples/blog-data/distributed.chpl create mode 100644 test/gpu/native/examples/blog-data/distributed.good create mode 100644 test/gpu/native/examples/blog-data/movement.chpl create mode 100644 test/gpu/native/examples/blog-data/movement.good create mode 100644 test/gpu/native/examples/blog-data/overlap.chpl create mode 100644 test/gpu/native/examples/blog-data/overlap.good create mode 100644 test/gpu/native/examples/blog-data/parallel.chpl create mode 100644 test/gpu/native/examples/blog-data/parallel.good create mode 100644 test/gpu/native/examples/blog-data/slices.chpl create mode 100644 test/gpu/native/examples/blog-data/slices.good diff --git a/test/gpu/native/examples/blog-data/README.md b/test/gpu/native/examples/blog-data/README.md 
new file mode 100644 index 000000000000..b169c3e45dd7 --- /dev/null +++ b/test/gpu/native/examples/blog-data/README.md @@ -0,0 +1,2 @@ +This directory contains examples from the blog post whose draft is in + https://github.com/chapel-lang/chapel-blog/pull/75 diff --git a/test/gpu/native/examples/blog-data/allocation.chpl b/test/gpu/native/examples/blog-data/allocation.chpl new file mode 100644 index 000000000000..af5e7274e272 --- /dev/null +++ b/test/gpu/native/examples/blog-data/allocation.chpl @@ -0,0 +1,10 @@ +var HostArr: [1..5] int; // allocated on the host +HostArr = 1; // executes on [multicore] CPU + +on here.gpus[0] { + var DevArr: [1..5] int; // allocated on the device + DevArr += 1; // executes on GPU as a kernel + writeln(DevArr); // prints "1 1 1 1 1" +} + +writeln(HostArr); // prints "1 1 1 1 1" diff --git a/test/gpu/native/examples/blog-data/allocation.good b/test/gpu/native/examples/blog-data/allocation.good new file mode 100644 index 000000000000..ac144a1d28ee --- /dev/null +++ b/test/gpu/native/examples/blog-data/allocation.good @@ -0,0 +1,2 @@ +1 1 1 1 1 +1 1 1 1 1 diff --git a/test/gpu/native/examples/blog-data/distributed.chpl b/test/gpu/native/examples/blog-data/distributed.chpl new file mode 100644 index 000000000000..af2ff901a180 --- /dev/null +++ b/test/gpu/native/examples/blog-data/distributed.chpl @@ -0,0 +1,28 @@ +import RangeChunk.chunks; + +config const n = 10; // now, our application has `--n` to set this! 
+config const sliceSize = 5; // number of elements per slice + +var HostArr: [1..n] int; // allocated on the host +HostArr = 1; // executes on [multicore] CPU + +coforall (loc, locChunk) in zip(Locales, chunks(1..n, numLocales)) { + on loc { + const numGpus = here.gpus.size; + coforall (gpu, gpuChunk) in zip(here.gpus, chunks(locChunk, numGpus)) { + on gpu { + const numSlices = gpuChunk.size/sliceSize; // assume divisibility + + coforall chunk in chunks(gpuChunk, numSlices) { + var DevArr: [chunk] int; // allocated per device + + DevArr = HostArr[chunk]; // copy a slice from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr[chunk] = DevArr; // copy from device to a slice on host + } + } + } + } +} + +writeln(HostArr); // prints "2 2 2 2 2 ..." diff --git a/test/gpu/native/examples/blog-data/distributed.good b/test/gpu/native/examples/blog-data/distributed.good new file mode 100644 index 000000000000..35ba99cb515e --- /dev/null +++ b/test/gpu/native/examples/blog-data/distributed.good @@ -0,0 +1 @@ +2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/movement.chpl b/test/gpu/native/examples/blog-data/movement.chpl new file mode 100644 index 000000000000..2bf1671f6a03 --- /dev/null +++ b/test/gpu/native/examples/blog-data/movement.chpl @@ -0,0 +1,12 @@ +var HostArr: [1..5] int; // allocated on the host +HostArr += 1; // executes on [multicore] CPU + +on here.gpus[0] { + var DevArr: [1..5] int; // allocated on the device + + DevArr = HostArr; // copy from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr = DevArr; // copy from device to host +} + +writeln(HostArr); // prints "2 2 2 2 2" diff --git a/test/gpu/native/examples/blog-data/movement.good b/test/gpu/native/examples/blog-data/movement.good new file mode 100644 index 000000000000..fd4deaa8d583 --- /dev/null +++ b/test/gpu/native/examples/blog-data/movement.good @@ -0,0 +1 @@ +2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/overlap.chpl 
b/test/gpu/native/examples/blog-data/overlap.chpl new file mode 100644 index 000000000000..7671d3bd2ad8 --- /dev/null +++ b/test/gpu/native/examples/blog-data/overlap.chpl @@ -0,0 +1,21 @@ +import RangeChunk.chunks; + +config const n = 10; // now, our application has `--n` to set this! +config const sliceSize = 5; // number of elements per slice + +const numSlices = n/sliceSize; // assume divisibility for simplicity + +var HostArr: [1..n] int; // allocated on the host +HostArr = 1; // executes on [multicore] CPU + +on here.gpus[0] { + coforall chunk in chunks(1..n, numSlices) { + var DevArr: [chunk] int; // allocated on the device *per task* + + DevArr = HostArr[chunk]; // copy a slice from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr[chunk] = DevArr; // copy from device to a slice on host + } +} + +writeln(HostArr); // prints "2 2 2 2 2 ..." diff --git a/test/gpu/native/examples/blog-data/overlap.good b/test/gpu/native/examples/blog-data/overlap.good new file mode 100644 index 000000000000..35ba99cb515e --- /dev/null +++ b/test/gpu/native/examples/blog-data/overlap.good @@ -0,0 +1 @@ +2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/parallel.chpl b/test/gpu/native/examples/blog-data/parallel.chpl new file mode 100644 index 000000000000..031dfb49d9a1 --- /dev/null +++ b/test/gpu/native/examples/blog-data/parallel.chpl @@ -0,0 +1,24 @@ +import RangeChunk.chunks; + +config const n = 10; // now, our application has `--n` to set this! 
+config const sliceSize = 5; // number of elements per slice + +var HostArr: [1..n] int; // allocated on the host +HostArr = 1; // executes on [multicore] CPU + +const numGpus = here.gpus.size; // number of GPUs on the locale +coforall (gpu, gpuChunk) in zip(here.gpus, chunks(1..n, numGpus)) { + on gpu { + const numSlices = gpuChunk.size/sliceSize; // assume divisibility + + coforall chunk in chunks(gpuChunk, numSlices) { + var DevArr: [chunk] int; // allocated on the device + + DevArr = HostArr[chunk]; // copy a slice from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr[chunk] = DevArr; // copy from device to a slice on host + } + } +} + +writeln(HostArr); // prints "2 2 2 2 2 ..." diff --git a/test/gpu/native/examples/blog-data/parallel.good b/test/gpu/native/examples/blog-data/parallel.good new file mode 100644 index 000000000000..35ba99cb515e --- /dev/null +++ b/test/gpu/native/examples/blog-data/parallel.good @@ -0,0 +1 @@ +2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/slices.chpl b/test/gpu/native/examples/blog-data/slices.chpl new file mode 100644 index 000000000000..ddd971d44e0e --- /dev/null +++ b/test/gpu/native/examples/blog-data/slices.chpl @@ -0,0 +1,21 @@ +import RangeChunk; + +config const n = 10; // now, our application has `--n` to set this! +config const sliceSize = 5; // number of elements per slice + +const numSlices = n/sliceSize; // assume divisibility for simplicity + +var HostArr: [1..n] int; // allocated on the host +HostArr = 1; // executes on [multicore] CPU + +on here.gpus[0] { + var DevArr: [1..sliceSize] int; // allocated on the device + + for chunk in RangeChunk.chunks(1..n, numSlices) { + DevArr = HostArr[chunk]; // copy a slice from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr[chunk] = DevArr; // copy from device to a slice on host + } +} + +writeln(HostArr); // prints "2 2 2 2 2 ..." 
diff --git a/test/gpu/native/examples/blog-data/slices.good b/test/gpu/native/examples/blog-data/slices.good new file mode 100644 index 000000000000..35ba99cb515e --- /dev/null +++ b/test/gpu/native/examples/blog-data/slices.good @@ -0,0 +1 @@ +2 2 2 2 2 2 2 2 2 2 From 94a9ea0c27b3e8d433d7189d467cd68cd490df6e Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 16 Feb 2024 11:02:22 -0800 Subject: [PATCH 2/4] Small adjustments for multilocale, multigpu Signed-off-by: Engin Kayraklioglu --- test/gpu/native/examples/blog-data/NUMLOCALES | 1 + test/gpu/native/examples/blog-data/distributed.chpl | 4 ++-- test/gpu/native/examples/blog-data/distributed.good | 2 +- test/gpu/native/examples/blog-data/overlap.chpl | 4 ++-- test/gpu/native/examples/blog-data/overlap.good | 2 +- test/gpu/native/examples/blog-data/parallel.chpl | 4 ++-- test/gpu/native/examples/blog-data/parallel.good | 2 +- test/gpu/native/examples/blog-data/slices.chpl | 8 ++++---- test/gpu/native/examples/blog-data/slices.good | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) create mode 100644 test/gpu/native/examples/blog-data/NUMLOCALES diff --git a/test/gpu/native/examples/blog-data/NUMLOCALES b/test/gpu/native/examples/blog-data/NUMLOCALES new file mode 100644 index 000000000000..b8626c4cff28 --- /dev/null +++ b/test/gpu/native/examples/blog-data/NUMLOCALES @@ -0,0 +1 @@ +4 diff --git a/test/gpu/native/examples/blog-data/distributed.chpl b/test/gpu/native/examples/blog-data/distributed.chpl index af2ff901a180..d9f7330c95b9 100644 --- a/test/gpu/native/examples/blog-data/distributed.chpl +++ b/test/gpu/native/examples/blog-data/distributed.chpl @@ -1,7 +1,7 @@ import RangeChunk.chunks; -config const n = 10; // now, our application has `--n` to set this! -config const sliceSize = 5; // number of elements per slice +config const n = 32; // now, our application has `--n` to set this! 
+config const sliceSize = 4; // number of elements per slice var HostArr: [1..n] int; // allocated on the host HostArr = 1; // executes on [multicore] CPU diff --git a/test/gpu/native/examples/blog-data/distributed.good b/test/gpu/native/examples/blog-data/distributed.good index 35ba99cb515e..3a8b696ad930 100644 --- a/test/gpu/native/examples/blog-data/distributed.good +++ b/test/gpu/native/examples/blog-data/distributed.good @@ -1 +1 @@ -2 2 2 2 2 2 2 2 2 2 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/overlap.chpl b/test/gpu/native/examples/blog-data/overlap.chpl index 7671d3bd2ad8..13b4301e6d18 100644 --- a/test/gpu/native/examples/blog-data/overlap.chpl +++ b/test/gpu/native/examples/blog-data/overlap.chpl @@ -1,7 +1,7 @@ import RangeChunk.chunks; -config const n = 10; // now, our application has `--n` to set this! -config const sliceSize = 5; // number of elements per slice +config const n = 32; // now, our application has `--n` to set this! +config const sliceSize = 4; // number of elements per slice const numSlices = n/sliceSize; // assume divisibility for simplicity diff --git a/test/gpu/native/examples/blog-data/overlap.good b/test/gpu/native/examples/blog-data/overlap.good index 35ba99cb515e..3a8b696ad930 100644 --- a/test/gpu/native/examples/blog-data/overlap.good +++ b/test/gpu/native/examples/blog-data/overlap.good @@ -1 +1 @@ -2 2 2 2 2 2 2 2 2 2 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/parallel.chpl b/test/gpu/native/examples/blog-data/parallel.chpl index 031dfb49d9a1..25ebff8ad7f1 100644 --- a/test/gpu/native/examples/blog-data/parallel.chpl +++ b/test/gpu/native/examples/blog-data/parallel.chpl @@ -1,7 +1,7 @@ import RangeChunk.chunks; -config const n = 10; // now, our application has `--n` to set this! 
-config const sliceSize = 5; // number of elements per slice +config const n = 32; // now, our application has `--n` to set this! +config const sliceSize = 4; // number of elements per slice var HostArr: [1..n] int; // allocated on the host HostArr = 1; // executes on [multicore] CPU diff --git a/test/gpu/native/examples/blog-data/parallel.good b/test/gpu/native/examples/blog-data/parallel.good index 35ba99cb515e..3a8b696ad930 100644 --- a/test/gpu/native/examples/blog-data/parallel.good +++ b/test/gpu/native/examples/blog-data/parallel.good @@ -1 +1 @@ -2 2 2 2 2 2 2 2 2 2 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 diff --git a/test/gpu/native/examples/blog-data/slices.chpl b/test/gpu/native/examples/blog-data/slices.chpl index ddd971d44e0e..07faaf8d2d1f 100644 --- a/test/gpu/native/examples/blog-data/slices.chpl +++ b/test/gpu/native/examples/blog-data/slices.chpl @@ -1,7 +1,7 @@ -import RangeChunk; +import RangeChunk.chunks; -config const n = 10; // now, our application has `--n` to set this! -config const sliceSize = 5; // number of elements per slice +config const n = 32; // now, our application has `--n` to set this! 
+config const sliceSize = 4; // number of elements per slice const numSlices = n/sliceSize; // assume divisibility for simplicity @@ -11,7 +11,7 @@ HostArr = 1; // executes on [multicore] CPU on here.gpus[0] { var DevArr: [1..sliceSize] int; // allocated on the device - for chunk in RangeChunk.chunks(1..n, numSlices) { + for chunk in chunks(1..n, numSlices) { DevArr = HostArr[chunk]; // copy a slice from host to device DevArr += 1; // executes on GPU as a kernel HostArr[chunk] = DevArr; // copy from device to a slice on host diff --git a/test/gpu/native/examples/blog-data/slices.good b/test/gpu/native/examples/blog-data/slices.good index 35ba99cb515e..3a8b696ad930 100644 --- a/test/gpu/native/examples/blog-data/slices.good +++ b/test/gpu/native/examples/blog-data/slices.good @@ -1 +1 @@ -2 2 2 2 2 2 2 2 2 2 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 From 5ac12fb6f653715b53c8d1e5866aff2c89fd5b99 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 16 Feb 2024 11:06:57 -0800 Subject: [PATCH 3/4] Minor adjustments for consistency Signed-off-by: Engin Kayraklioglu --- test/gpu/native/examples/blog-data/allocation.chpl | 2 +- test/gpu/native/examples/blog-data/distributed.chpl | 2 +- test/gpu/native/examples/blog-data/movement.chpl | 2 +- test/gpu/native/examples/blog-data/overlap.chpl | 2 +- test/gpu/native/examples/blog-data/slices.chpl | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/gpu/native/examples/blog-data/allocation.chpl b/test/gpu/native/examples/blog-data/allocation.chpl index af5e7274e272..185fdc7636cc 100644 --- a/test/gpu/native/examples/blog-data/allocation.chpl +++ b/test/gpu/native/examples/blog-data/allocation.chpl @@ -7,4 +7,4 @@ on here.gpus[0] { writeln(DevArr); // prints "1 1 1 1 1" } -writeln(HostArr); // prints "1 1 1 1 1" +writeln(HostArr); // prints "2 2 2 2 2" diff --git a/test/gpu/native/examples/blog-data/distributed.chpl b/test/gpu/native/examples/blog-data/distributed.chpl index 
d9f7330c95b9..384ef9ca2bdf 100644 --- a/test/gpu/native/examples/blog-data/distributed.chpl +++ b/test/gpu/native/examples/blog-data/distributed.chpl @@ -14,7 +14,7 @@ coforall (loc, locChunk) in zip(Locales, chunks(1..n, numLocales)) { const numSlices = gpuChunk.size/sliceSize; // assume divisibility coforall chunk in chunks(gpuChunk, numSlices) { - var DevArr: [chunk] int; // allocated per device + var DevArr: [chunk] int; // allocated on the device DevArr = HostArr[chunk]; // copy a slice from host to device DevArr += 1; // executes on GPU as a kernel diff --git a/test/gpu/native/examples/blog-data/movement.chpl b/test/gpu/native/examples/blog-data/movement.chpl index 2bf1671f6a03..7a674a168b2b 100644 --- a/test/gpu/native/examples/blog-data/movement.chpl +++ b/test/gpu/native/examples/blog-data/movement.chpl @@ -1,5 +1,5 @@ var HostArr: [1..5] int; // allocated on the host -HostArr += 1; // executes on [multicore] CPU +HostArr = 1; // executes on [multicore] CPU on here.gpus[0] { var DevArr: [1..5] int; // allocated on the device diff --git a/test/gpu/native/examples/blog-data/overlap.chpl b/test/gpu/native/examples/blog-data/overlap.chpl index 13b4301e6d18..48e1ae32c5f5 100644 --- a/test/gpu/native/examples/blog-data/overlap.chpl +++ b/test/gpu/native/examples/blog-data/overlap.chpl @@ -3,7 +3,7 @@ import RangeChunk.chunks; config const n = 32; // now, our application has `--n` to set this! 
config const sliceSize = 4; // number of elements per slice -const numSlices = n/sliceSize; // assume divisibility for simplicity +const numSlices = n/sliceSize; // assume divisibility for simplicity var HostArr: [1..n] int; // allocated on the host HostArr = 1; // executes on [multicore] CPU diff --git a/test/gpu/native/examples/blog-data/slices.chpl b/test/gpu/native/examples/blog-data/slices.chpl index 07faaf8d2d1f..d1055ca82197 100644 --- a/test/gpu/native/examples/blog-data/slices.chpl +++ b/test/gpu/native/examples/blog-data/slices.chpl @@ -1,9 +1,9 @@ import RangeChunk.chunks; -config const n = 32; // now, our application has `--n` to set this! -config const sliceSize = 4; // number of elements per slice +config const n = 32; // now, our application has `--n` to set this! +config const sliceSize = 4; // number of elements per slice -const numSlices = n/sliceSize; // assume divisibility for simplicity +const numSlices = n/sliceSize; // assume divisibility for simplicity var HostArr: [1..n] int; // allocated on the host HostArr = 1; // executes on [multicore] CPU @@ -12,9 +12,9 @@ on here.gpus[0] { var DevArr: [1..sliceSize] int; // allocated on the device for chunk in chunks(1..n, numSlices) { - DevArr = HostArr[chunk]; // copy a slice from host to device - DevArr += 1; // executes on GPU as a kernel - HostArr[chunk] = DevArr; // copy from device to a slice on host + DevArr = HostArr[chunk]; // copy a slice from host to device + DevArr += 1; // executes on GPU as a kernel + HostArr[chunk] = DevArr; // copy from device to a slice on host } } From c940b496592a3aa7fdef58d2c8372b715a5814e9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 1 Jul 2024 16:34:32 -0700 Subject: [PATCH 4/4] Update README and a comment Signed-off-by: Engin Kayraklioglu --- test/gpu/native/examples/blog-data/README.md | 3 ++- test/gpu/native/examples/blog-data/allocation.chpl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/test/gpu/native/examples/blog-data/README.md b/test/gpu/native/examples/blog-data/README.md index b169c3e45dd7..d04576cc38d9 100644 --- a/test/gpu/native/examples/blog-data/README.md +++ b/test/gpu/native/examples/blog-data/README.md @@ -1,2 +1,3 @@ -This directory contains examples from the blog post whose draft is in +This directory contains examples from a blog post + https://chapel-lang.org/blog/posts/gpu-data-movement/ https://github.com/chapel-lang/chapel-blog/pull/75 diff --git a/test/gpu/native/examples/blog-data/allocation.chpl b/test/gpu/native/examples/blog-data/allocation.chpl index 185fdc7636cc..af5e7274e272 100644 --- a/test/gpu/native/examples/blog-data/allocation.chpl +++ b/test/gpu/native/examples/blog-data/allocation.chpl @@ -7,4 +7,4 @@ on here.gpus[0] { writeln(DevArr); // prints "1 1 1 1 1" } -writeln(HostArr); // prints "2 2 2 2 2" +writeln(HostArr); // prints "1 1 1 1 1"