From 5c26eb7ae2a7248cf612d5968d1f96777d1c2d7a Mon Sep 17 00:00:00 2001 From: leightonw-cb Date: Tue, 10 Dec 2024 12:52:03 -0800 Subject: [PATCH] update the examples for SDK version 1.3.0 --- README.rst | 23 +- RELEASE-NOTES.rst | 13 + .../{commands.sh => commands_wse2.sh} | 0 .../README.rst | 4 +- .../cmd_parser.py | 8 +- .../commands_wse2.sh} | 2 +- benchmarks/7pt-stencil-spmv/commands_wse3.sh | 11 + benchmarks/7pt-stencil-spmv/run.appliance.py | 438 ++++++++++ .../run.py | 44 +- .../src}/kernel.csl | 4 +- .../src}/layout.csl | 4 +- .../{powerMethod => 7pt-stencil-spmv}/util.py | 0 .../FFT/{commands.sh => commands_wse2.sh} | 0 benchmarks/FFT/commands_wse3.sh | 12 + .../README.rst | 0 .../bw_cmd_parser.py | 8 +- .../commands_wse2.sh} | 2 +- benchmarks/bandwidth-test/commands_wse3.sh | 10 + benchmarks/bandwidth-test/run.appliance.py | 439 ++++++++++ .../{bandwidthTest => bandwidth-test}/run.py | 40 +- .../src}/bw_sync_kernel.csl | 0 .../src}/bw_sync_layout.csl | 0 .../src}/sync/layout.csl | 0 .../src}/sync/pe.csl | 0 benchmarks/bicgstab/cmd_parser.py | 3 +- .../{commands.sh => commands_wse2.sh} | 0 benchmarks/bicgstab/commands_wse3.sh | 11 + benchmarks/bicgstab/{ => src}/blas.csl | 0 benchmarks/bicgstab/{ => src}/kernel.csl | 4 +- .../bicgstab/{ => src}/kernel_bicgstab.csl | 4 +- benchmarks/bicgstab/{ => src}/layout.csl | 4 +- .../bicgstab/{ => src}/layout_bicgstab.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 benchmarks/cholesky/commands_wse3.sh | 8 + .../README.rst | 0 .../cg.py | 0 .../cmd_parser.py | 3 +- .../commands_wse2.sh} | 0 .../conjugate-gradient/commands_wse3.sh | 11 + .../run.py | 0 .../run_cg.py | 0 .../src}/blas.csl | 0 .../src}/kernel.csl | 4 +- .../src}/kernel_cg.csl | 4 +- .../src}/layout.csl | 4 +- .../src}/layout_cg.csl | 4 +- .../util.py | 0 benchmarks/csl-libs/stencil_3d_7pts/pe.csl | 4 +- benchmarks/game-of-life/README.rst | 36 + benchmarks/game-of-life/commands_wse2.sh | 7 + benchmarks/game-of-life/commands_wse3.sh | 7 + benchmarks/game-of-life/layout.csl | 129 +++ benchmarks/game-of-life/pe_program.csl | 318 +++++++ benchmarks/game-of-life/run.py | 214 +++++ .../{commands.sh => commands_wse2.sh} | 0 .../gemm-collectives_2d/commands_wse3.sh | 8 + .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh | 11 + .../{commands.sh => commands_wse2.sh} | 0 .../gemv-collectives_2d/commands_wse3.sh | 8 + .../{commands.sh => commands_wse2.sh} | 0 .../{commands.sh => commands_wse2.sh} | 0 benchmarks/mandelbrot/commands_wse3.sh | 8 + .../{powerMethod => power-method}/README.rst | 0 .../cmd_parser.py | 3 +- .../commands_wse2.sh} | 0 benchmarks/power-method/commands_wse3.sh | 11 + .../power_method.py | 0 .../{powerMethod => power-method}/run.py | 0 .../run_power.py | 0 .../src}/blas.csl | 0 .../src}/kernel.csl | 4 +- .../src}/kernel_power.csl | 4 +- .../src}/layout.csl | 4 +- .../src}/layout_power.csl | 4 +- .../{stencil-3d-7pts => power-method}/util.py | 0 .../README.rst | 0 .../cmd_parser.py | 3 +- .../commands_wse2.sh} | 0 .../commands_wse3.sh | 11 + .../pcg.py | 0 .../run.py | 0 .../run_pcg.py | 0 .../src}/blas.csl | 0 .../src}/kernel.csl | 4 +- .../src}/kernel_pcg.csl | 4 +- .../src}/layout.csl | 4 +- .../src}/layout_pcg.csl | 4 +- .../util.py | 0 .../{commands.sh => commands_wse2.sh} | 0 benchmarks/residual/commands_wse3.sh | 9 + benchmarks/row-col-broadcast/README.rst | 29 + benchmarks/row-col-broadcast/cmd_parser.py | 92 ++ benchmarks/row-col-broadcast/commands_wse2.sh | 9 + benchmarks/row-col-broadcast/commands_wse3.sh | 9 + benchmarks/row-col-broadcast/compile.py | 147 ++++ benchmarks/row-col-broadcast/run.py | 341 ++++++++ benchmarks/row-col-broadcast/src/kernel.csl | 142 +++ benchmarks/row-col-broadcast/src/layout.csl | 95 ++ .../row-col-broadcast/src/sync/layout.csl | 79 ++ benchmarks/row-col-broadcast/src/sync/pe.csl | 289 +++++++ .../{commands.sh => commands_wse2.sh} | 2 +- .../single-tile-matvec/commands_wse3.sh | 9 + .../single-tile-matvec/compile.appliance.py | 37 + .../single-tile-matvec/run.appliance.py | 333 +++++++ benchmarks/single-tile-matvec/run.py | 4 + .../{ => src}/layout_matvec.csl | 0 .../{ => src}/pe_matvec.csl | 0 benchmarks/spmv-hypersparse/README.rst | 2 +- benchmarks/spmv-hypersparse/cmd_parser.py | 7 +- .../{commands.sh => commands_wse2.sh} | 2 +- benchmarks/spmv-hypersparse/run.appliance.py | 814 ++++++++++++++++++ benchmarks/spmv-hypersparse/run.py | 80 +- .../{ => src}/allreduce2R1E/layout.csl | 0 .../{ => src}/allreduce2R1E/pe.csl | 0 .../{ => src}/hypersparse_spmv/layout.csl | 0 .../{ => src}/hypersparse_spmv/pe.csl | 0 .../spmv-hypersparse/{ => src}/kernel.csl | 0 .../spmv-hypersparse/{ => src}/layout.csl | 0 benchmarks/stencil-v2/README.rst | 6 - benchmarks/stencil-v2/cmd_parser.py | 112 --- benchmarks/stencil-v2/code_memcpy.csl | 248 ------ benchmarks/stencil-v2/commands.sh | 10 - benchmarks/stencil-v2/consts.csl | 107 --- benchmarks/stencil-v2/ic.py | 53 -- benchmarks/stencil-v2/nop.csl | 14 - benchmarks/stencil-v2/oned_exch.csl | 166 ---- benchmarks/stencil-v2/routes.csl | 126 --- benchmarks/stencil-v2/run.py | 382 -------- benchmarks/stencil-v2/switches.csl | 80 -- benchmarks/stencil-v2/task_memcpy.csl | 804 ----------------- benchmarks/stencil-v2/util.csl | 52 -- .../{commands.sh => commands_wse2.sh} | 0 .../wide-multiplication/commands_wse3.sh | 9 + .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 2 +- tutorials/gemv-02-memory-dsds/README.rst | 12 +- .../{commands.sh => commands_wse2.sh} | 0 .../gemv-02-memory-dsds/commands_wse3.sh | 7 + tutorials/gemv-02-memory-dsds/pe_program.csl | 12 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-03-memcpy/commands_wse3.sh | 7 + tutorials/gemv-03-memcpy/pe_program.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-04-params/commands_wse3.sh | 7 + tutorials/gemv-04-params/pe_program.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 .../gemv-05-multiple-pes/commands_wse3.sh | 7 + tutorials/gemv-05-multiple-pes/pe_program.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-06-routes-1/commands_wse3.sh | 7 + tutorials/gemv-06-routes-1/pe_program.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-07-routes-2/commands_wse3.sh | 7 + tutorials/gemv-07-routes-2/pe_program.csl | 6 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-08-routes-3/commands_wse3.sh | 8 + tutorials/gemv-08-routes-3/pe_program.csl | 6 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/gemv-09-streaming/commands_wse3.sh | 11 + tutorials/gemv-09-streaming/pe_program.csl | 4 +- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 6 +- .../{commands.sh => commands_wse2.sh} | 0 tutorials/pipeline-02-fifo/commands_wse3.sh | 10 + .../{commands.sh => commands_wse2.sh} | 0 .../pipeline-03-multiple/commands_wse3.sh | 10 + .../pipeline-03-multiple/memcpyEdge/d2h.csl | 61 -- .../pipeline-03-multiple/memcpyEdge/east.csl | 35 - .../pipeline-03-multiple/memcpyEdge/h2d.csl | 94 -- .../memcpyEdge/memcpy_edge.csl | 102 --- .../pipeline-03-multiple/memcpyEdge/north.csl | 35 - .../pipeline-03-multiple/memcpyEdge/south.csl | 35 - .../pipeline-03-multiple/memcpyEdge/west.csl | 35 - .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 3 +- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 3 +- .../README.rst | 17 - .../layout.csl | 53 -- .../pe_program.csl | 40 - .../topic-02-streaming-wavelet-data/run.py | 78 -- tutorials/topic-03-sparse-tensors/README.rst | 15 - tutorials/topic-03-sparse-tensors/layout.csl | 53 -- .../topic-03-sparse-tensors/pe_program.csl | 40 - tutorials/topic-03-sparse-tensors/run.py | 70 -- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 2 +- tutorials/topic-04-sentinels/README.rst | 19 - tutorials/topic-04-sentinels/layout.csl | 128 --- .../topic-04-sentinels/memcpyEdge/d2h.csl | 61 -- .../topic-04-sentinels/memcpyEdge/east.csl | 35 - .../topic-04-sentinels/memcpyEdge/h2d.csl | 93 -- .../memcpyEdge/memcpy_edge.csl | 102 --- .../topic-04-sentinels/memcpyEdge/north.csl | 35 - .../topic-04-sentinels/memcpyEdge/south.csl | 35 - .../topic-04-sentinels/memcpyEdge/west.csl | 35 - tutorials/topic-04-sentinels/pe_program.csl | 47 - tutorials/topic-04-sentinels/run.py | 95 -- tutorials/topic-04-sentinels/sentinel.csl | 81 -- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 2 +- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 4 +- tutorials/topic-05-switches/README.rst | 18 - tutorials/topic-05-switches/empty.csl | 25 - tutorials/topic-05-switches/layout.csl | 140 --- .../topic-05-switches/memcpyEdge/d2h.csl | 61 -- .../topic-05-switches/memcpyEdge/east.csl | 35 - .../topic-05-switches/memcpyEdge/h2d.csl | 93 -- .../memcpyEdge/memcpy_edge.csl | 102 --- .../topic-05-switches/memcpyEdge/north.csl | 35 - .../topic-05-switches/memcpyEdge/south.csl | 35 - .../topic-05-switches/memcpyEdge/west.csl | 35 - tutorials/topic-05-switches/recv.csl | 54 -- tutorials/topic-05-switches/run.py | 74 -- tutorials/topic-05-switches/send.csl | 116 --- tutorials/topic-06-libraries/README.rst | 12 - tutorials/topic-06-libraries/layout.csl | 57 -- tutorials/topic-06-libraries/pe_program.csl | 124 --- tutorials/topic-06-libraries/run.py | 82 -- .../{commands.sh => commands_wse2.sh} | 0 tutorials/topic-06-switches/commands_wse3.sh | 7 + tutorials/topic-07-filters/README.rst | 10 - tutorials/topic-07-filters/commands.sh | 9 - tutorials/topic-07-filters/layout.csl | 109 --- tutorials/topic-07-filters/memcpyEdge/d2h.csl | 61 -- .../topic-07-filters/memcpyEdge/east.csl | 35 - tutorials/topic-07-filters/memcpyEdge/h2d.csl | 93 -- .../memcpyEdge/memcpy_edge.csl | 102 --- .../topic-07-filters/memcpyEdge/north.csl | 35 - .../topic-07-filters/memcpyEdge/south.csl | 35 - .../topic-07-filters/memcpyEdge/west.csl | 35 - tutorials/topic-07-filters/recv.csl | 80 -- tutorials/topic-07-filters/run.py | 57 -- tutorials/topic-07-filters/send.csl | 103 --- .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh | 7 + tutorials/topic-08-fifos/README.rst | 23 - tutorials/topic-08-fifos/buffer.csl | 78 -- tutorials/topic-08-fifos/layout.csl | 88 -- tutorials/topic-08-fifos/memcpyEdge/d2h.csl | 61 -- tutorials/topic-08-fifos/memcpyEdge/east.csl | 35 - tutorials/topic-08-fifos/memcpyEdge/h2d.csl | 93 -- .../topic-08-fifos/memcpyEdge/memcpy_edge.csl | 102 --- tutorials/topic-08-fifos/memcpyEdge/north.csl | 35 - tutorials/topic-08-fifos/memcpyEdge/south.csl | 35 - tutorials/topic-08-fifos/memcpyEdge/west.csl | 35 - tutorials/topic-08-fifos/run.py | 87 -- .../{commands.sh => commands_wse2.sh} | 0 tutorials/topic-08-filters/commands_wse3.sh | 7 + .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 8 +- tutorials/topic-09-map-builtin/README.rst | 26 - tutorials/topic-09-map-builtin/layout.csl | 62 -- tutorials/topic-09-map-builtin/pe_program.csl | 85 -- tutorials/topic-09-map-builtin/run.py | 102 --- tutorials/topic-10-collectives/README.rst | 24 - tutorials/topic-10-collectives/layout.csl | 86 -- tutorials/topic-10-collectives/pe_program.csl | 147 ---- tutorials/topic-10-collectives/run.py | 120 --- .../{commands.sh => commands_wse2.sh} | 0 .../topic-10-map-builtin/commands_wse3.sh | 8 + .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 2 +- tutorials/topic-11-debug-library/README.rst | 43 - tutorials/topic-11-debug-library/commands.sh | 10 - tutorials/topic-11-debug-library/layout.csl | 90 -- .../topic-11-debug-library/pe_program.csl | 116 --- tutorials/topic-11-debug-library/run.py | 104 --- .../{commands.sh => commands_wse2.sh} | 0 .../topic-12-debug-library/commands_wse3.sh | 8 + tutorials/topic-12-wse3-features/README.rst | 41 - tutorials/topic-12-wse3-features/layout.csl | 42 - tutorials/topic-12-wse3-features/left_pe.csl | 54 -- tutorials/topic-12-wse3-features/right_pe.csl | 54 -- tutorials/topic-12-wse3-features/run.py | 61 -- .../{commands.sh => commands_wse2.sh} | 0 tutorials/topic-13-simprint/commands_wse3.sh | 8 + .../{commands.sh => commands_wse2.sh} | 0 .../commands_wse3.sh} | 0 281 files changed, 4507 insertions(+), 7677 deletions(-) rename benchmarks/25-pt-stencil/{commands.sh => commands_wse2.sh} (100%) rename benchmarks/{stencil-3d-7pts => 7pt-stencil-spmv}/README.rst (97%) rename benchmarks/{stencil-3d-7pts => 7pt-stencil-spmv}/cmd_parser.py (93%) rename benchmarks/{stencil-3d-7pts/commands.sh => 7pt-stencil-spmv/commands_wse2.sh} (85%) create mode 100755 benchmarks/7pt-stencil-spmv/commands_wse3.sh create mode 100644 benchmarks/7pt-stencil-spmv/run.appliance.py rename benchmarks/{stencil-3d-7pts => 7pt-stencil-spmv}/run.py (92%) rename benchmarks/{stencil-3d-7pts => 7pt-stencil-spmv/src}/kernel.csl (95%) rename benchmarks/{stencil-3d-7pts => 7pt-stencil-spmv/src}/layout.csl (96%) rename benchmarks/{powerMethod => 7pt-stencil-spmv}/util.py (100%) rename benchmarks/FFT/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/FFT/commands_wse3.sh rename benchmarks/{bandwidthTest => bandwidth-test}/README.rst (100%) rename benchmarks/{bandwidthTest => bandwidth-test}/bw_cmd_parser.py (93%) rename benchmarks/{bandwidthTest/commands.sh => bandwidth-test/commands_wse2.sh} (80%) create mode 100755 benchmarks/bandwidth-test/commands_wse3.sh create mode 100644 benchmarks/bandwidth-test/run.appliance.py rename benchmarks/{bandwidthTest => bandwidth-test}/run.py (93%) rename benchmarks/{bandwidthTest => bandwidth-test/src}/bw_sync_kernel.csl (100%) rename benchmarks/{bandwidthTest => bandwidth-test/src}/bw_sync_layout.csl (100%) rename benchmarks/{bandwidthTest => bandwidth-test/src}/sync/layout.csl (100%) rename benchmarks/{bandwidthTest => bandwidth-test/src}/sync/pe.csl (100%) rename benchmarks/bicgstab/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/bicgstab/commands_wse3.sh rename benchmarks/bicgstab/{ => src}/blas.csl (100%) rename benchmarks/bicgstab/{ => src}/kernel.csl (97%) rename benchmarks/bicgstab/{ => src}/kernel_bicgstab.csl (98%) rename benchmarks/bicgstab/{ => src}/layout.csl (97%) rename benchmarks/bicgstab/{ => src}/layout_bicgstab.csl (97%) rename benchmarks/cholesky/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/cholesky/commands_wse3.sh rename benchmarks/{conjugateGradient => conjugate-gradient}/README.rst (100%) rename benchmarks/{conjugateGradient => conjugate-gradient}/cg.py (100%) rename benchmarks/{conjugateGradient => conjugate-gradient}/cmd_parser.py (97%) rename benchmarks/{conjugateGradient/commands.sh => conjugate-gradient/commands_wse2.sh} (100%) create mode 100755 benchmarks/conjugate-gradient/commands_wse3.sh rename benchmarks/{conjugateGradient => conjugate-gradient}/run.py (100%) rename benchmarks/{conjugateGradient => conjugate-gradient}/run_cg.py (100%) rename benchmarks/{conjugateGradient => conjugate-gradient/src}/blas.csl (100%) rename benchmarks/{conjugateGradient => conjugate-gradient/src}/kernel.csl (97%) rename benchmarks/{conjugateGradient => conjugate-gradient/src}/kernel_cg.csl (98%) rename benchmarks/{conjugateGradient => conjugate-gradient/src}/layout.csl (97%) rename benchmarks/{conjugateGradient => conjugate-gradient/src}/layout_cg.csl (97%) rename benchmarks/{conjugateGradient => conjugate-gradient}/util.py (100%) create mode 100644 benchmarks/game-of-life/README.rst create mode 100755 benchmarks/game-of-life/commands_wse2.sh create mode 100755 benchmarks/game-of-life/commands_wse3.sh create mode 100644 benchmarks/game-of-life/layout.csl create mode 100644 benchmarks/game-of-life/pe_program.csl create mode 100644 benchmarks/game-of-life/run.py rename benchmarks/gemm-collectives_2d/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/gemm-collectives_2d/commands_wse3.sh rename benchmarks/gemv-checkerboard-pattern/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/gemv-checkerboard-pattern/commands_wse3.sh rename benchmarks/gemv-collectives_2d/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/gemv-collectives_2d/commands_wse3.sh rename benchmarks/histogram-torus/{commands.sh => commands_wse2.sh} (100%) rename benchmarks/mandelbrot/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/mandelbrot/commands_wse3.sh rename benchmarks/{powerMethod => power-method}/README.rst (100%) rename benchmarks/{preconditionedConjugateGradient => power-method}/cmd_parser.py (97%) rename benchmarks/{powerMethod/commands.sh => power-method/commands_wse2.sh} (100%) create mode 100755 benchmarks/power-method/commands_wse3.sh rename benchmarks/{powerMethod => power-method}/power_method.py (100%) rename benchmarks/{powerMethod => power-method}/run.py (100%) rename benchmarks/{powerMethod => power-method}/run_power.py (100%) rename benchmarks/{powerMethod => power-method/src}/blas.csl (100%) rename benchmarks/{powerMethod => power-method/src}/kernel.csl (96%) rename benchmarks/{powerMethod => power-method/src}/kernel_power.csl (97%) rename benchmarks/{powerMethod => power-method/src}/layout.csl (97%) rename benchmarks/{powerMethod => power-method/src}/layout_power.csl (97%) rename benchmarks/{stencil-3d-7pts => power-method}/util.py (100%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient}/README.rst (100%) rename benchmarks/{powerMethod => preconditioned-conjugate-gradient}/cmd_parser.py (97%) rename benchmarks/{preconditionedConjugateGradient/commands.sh => preconditioned-conjugate-gradient/commands_wse2.sh} (100%) create mode 100755 benchmarks/preconditioned-conjugate-gradient/commands_wse3.sh rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient}/pcg.py (100%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient}/run.py (100%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient}/run_pcg.py (100%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient/src}/blas.csl (100%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient/src}/kernel.csl (97%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient/src}/kernel_pcg.csl (98%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient/src}/layout.csl (97%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient/src}/layout_pcg.csl (97%) rename benchmarks/{preconditionedConjugateGradient => preconditioned-conjugate-gradient}/util.py (100%) rename benchmarks/residual/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/residual/commands_wse3.sh create mode 100644 benchmarks/row-col-broadcast/README.rst create mode 100644 benchmarks/row-col-broadcast/cmd_parser.py create mode 100755 benchmarks/row-col-broadcast/commands_wse2.sh create mode 100755 benchmarks/row-col-broadcast/commands_wse3.sh create mode 100644 benchmarks/row-col-broadcast/compile.py create mode 100644 benchmarks/row-col-broadcast/run.py create mode 100644 benchmarks/row-col-broadcast/src/kernel.csl create mode 100644 benchmarks/row-col-broadcast/src/layout.csl create mode 100644 benchmarks/row-col-broadcast/src/sync/layout.csl create mode 100644 benchmarks/row-col-broadcast/src/sync/pe.csl rename benchmarks/single-tile-matvec/{commands.sh => commands_wse2.sh} (73%) create mode 100755 benchmarks/single-tile-matvec/commands_wse3.sh create mode 100644 benchmarks/single-tile-matvec/compile.appliance.py create mode 100644 benchmarks/single-tile-matvec/run.appliance.py rename benchmarks/single-tile-matvec/{ => src}/layout_matvec.csl (100%) rename benchmarks/single-tile-matvec/{ => src}/pe_matvec.csl (100%) rename benchmarks/spmv-hypersparse/{commands.sh => commands_wse2.sh} (87%) create mode 100644 benchmarks/spmv-hypersparse/run.appliance.py rename benchmarks/spmv-hypersparse/{ => src}/allreduce2R1E/layout.csl (100%) rename benchmarks/spmv-hypersparse/{ => src}/allreduce2R1E/pe.csl (100%) rename benchmarks/spmv-hypersparse/{ => src}/hypersparse_spmv/layout.csl (100%) rename benchmarks/spmv-hypersparse/{ => src}/hypersparse_spmv/pe.csl (100%) rename benchmarks/spmv-hypersparse/{ => src}/kernel.csl (100%) rename benchmarks/spmv-hypersparse/{ => src}/layout.csl (100%) delete mode 100644 benchmarks/stencil-v2/README.rst delete mode 100644 benchmarks/stencil-v2/cmd_parser.py delete mode 100644 benchmarks/stencil-v2/code_memcpy.csl delete mode 100755 benchmarks/stencil-v2/commands.sh delete mode 100644 benchmarks/stencil-v2/consts.csl delete mode 100644 benchmarks/stencil-v2/ic.py delete mode 100644 benchmarks/stencil-v2/nop.csl delete mode 100644 benchmarks/stencil-v2/oned_exch.csl delete mode 100644 benchmarks/stencil-v2/routes.csl delete mode 100644 benchmarks/stencil-v2/run.py delete mode 100644 benchmarks/stencil-v2/switches.csl delete mode 100644 benchmarks/stencil-v2/task_memcpy.csl delete mode 100644 benchmarks/stencil-v2/util.csl rename benchmarks/wide-multiplication/{commands.sh => commands_wse2.sh} (100%) create mode 100755 benchmarks/wide-multiplication/commands_wse3.sh rename tutorials/gemv-01-complete-program/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-15-wse3-microthreads/commands.sh => gemv-01-complete-program/commands_wse3.sh} (67%) rename tutorials/gemv-02-memory-dsds/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-02-memory-dsds/commands_wse3.sh rename tutorials/gemv-03-memcpy/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-03-memcpy/commands_wse3.sh rename tutorials/gemv-04-params/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-04-params/commands_wse3.sh rename tutorials/gemv-05-multiple-pes/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-05-multiple-pes/commands_wse3.sh rename tutorials/gemv-06-routes-1/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-06-routes-1/commands_wse3.sh rename tutorials/gemv-07-routes-2/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-07-routes-2/commands_wse3.sh rename tutorials/gemv-08-routes-3/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-08-routes-3/commands_wse3.sh rename tutorials/gemv-09-streaming/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/gemv-09-streaming/commands_wse3.sh rename tutorials/pipeline-01-basic/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-09-map-builtin/commands.sh => pipeline-01-basic/commands_wse3.sh} (65%) rename tutorials/pipeline-02-fifo/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/pipeline-02-fifo/commands_wse3.sh rename tutorials/pipeline-03-multiple/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/pipeline-03-multiple/commands_wse3.sh delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/d2h.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/east.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/h2d.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/memcpy_edge.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/north.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/south.csl delete mode 100644 tutorials/pipeline-03-multiple/memcpyEdge/west.csl rename tutorials/topic-01-arrays-and-pointers/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-05-switches/commands.sh => topic-01-arrays-and-pointers/commands_wse3.sh} (53%) rename tutorials/topic-02-libraries/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-06-libraries/commands.sh => topic-02-libraries/commands_wse3.sh} (64%) delete mode 100644 tutorials/topic-02-streaming-wavelet-data/README.rst delete mode 100644 tutorials/topic-02-streaming-wavelet-data/layout.csl delete mode 100644 tutorials/topic-02-streaming-wavelet-data/pe_program.csl delete mode 100644 tutorials/topic-02-streaming-wavelet-data/run.py delete mode 100644 tutorials/topic-03-sparse-tensors/README.rst delete mode 100644 tutorials/topic-03-sparse-tensors/layout.csl delete mode 100644 tutorials/topic-03-sparse-tensors/pe_program.csl delete mode 100644 tutorials/topic-03-sparse-tensors/run.py rename tutorials/topic-03-streaming-wavelet-data/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-03-sparse-tensors/commands.sh => topic-03-streaming-wavelet-data/commands_wse3.sh} (80%) delete mode 100644 tutorials/topic-04-sentinels/README.rst delete mode 100644 tutorials/topic-04-sentinels/layout.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/d2h.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/east.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/h2d.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/memcpy_edge.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/north.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/south.csl delete mode 100644 tutorials/topic-04-sentinels/memcpyEdge/west.csl delete mode 100644 tutorials/topic-04-sentinels/pe_program.csl delete mode 100644 tutorials/topic-04-sentinels/run.py delete mode 100644 tutorials/topic-04-sentinels/sentinel.csl rename tutorials/topic-04-sparse-tensors/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-02-streaming-wavelet-data/commands.sh => topic-04-sparse-tensors/commands_wse3.sh} (80%) rename tutorials/topic-05-sentinels/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-04-sentinels/commands.sh => topic-05-sentinels/commands_wse3.sh} (77%) delete mode 100644 tutorials/topic-05-switches/README.rst delete mode 100644 tutorials/topic-05-switches/empty.csl delete mode 100644 tutorials/topic-05-switches/layout.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/d2h.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/east.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/h2d.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/memcpy_edge.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/north.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/south.csl delete mode 100644 tutorials/topic-05-switches/memcpyEdge/west.csl delete mode 100644 tutorials/topic-05-switches/recv.csl delete mode 100644 tutorials/topic-05-switches/run.py delete mode 100644 tutorials/topic-05-switches/send.csl delete mode 100644 tutorials/topic-06-libraries/README.rst delete mode 100644 tutorials/topic-06-libraries/layout.csl delete mode 100644 tutorials/topic-06-libraries/pe_program.csl delete mode 100644 tutorials/topic-06-libraries/run.py rename tutorials/topic-06-switches/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-06-switches/commands_wse3.sh delete mode 100644 tutorials/topic-07-filters/README.rst delete mode 100755 tutorials/topic-07-filters/commands.sh delete mode 100644 tutorials/topic-07-filters/layout.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/d2h.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/east.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/h2d.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/memcpy_edge.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/north.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/south.csl delete mode 100644 tutorials/topic-07-filters/memcpyEdge/west.csl delete mode 100644 tutorials/topic-07-filters/recv.csl delete mode 100644 tutorials/topic-07-filters/run.py delete mode 100644 tutorials/topic-07-filters/send.csl rename tutorials/topic-07-switches-entrypt/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-07-switches-entrypt/commands_wse3.sh delete mode 100644 tutorials/topic-08-fifos/README.rst delete mode 100644 tutorials/topic-08-fifos/buffer.csl delete mode 100644 tutorials/topic-08-fifos/layout.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/d2h.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/east.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/h2d.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/memcpy_edge.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/north.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/south.csl delete mode 100644 tutorials/topic-08-fifos/memcpyEdge/west.csl delete mode 100644 tutorials/topic-08-fifos/run.py rename tutorials/topic-08-filters/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-08-filters/commands_wse3.sh rename tutorials/topic-09-fifos/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-08-fifos/commands.sh => topic-09-fifos/commands_wse3.sh} (63%) delete mode 100644 tutorials/topic-09-map-builtin/README.rst delete mode 100644 tutorials/topic-09-map-builtin/layout.csl delete mode 100644 tutorials/topic-09-map-builtin/pe_program.csl delete mode 100644 tutorials/topic-09-map-builtin/run.py delete mode 100644 tutorials/topic-10-collectives/README.rst delete mode 100644 tutorials/topic-10-collectives/layout.csl delete mode 100644 tutorials/topic-10-collectives/pe_program.csl delete mode 100644 tutorials/topic-10-collectives/run.py rename tutorials/topic-10-map-builtin/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-10-map-builtin/commands_wse3.sh rename tutorials/topic-11-collectives/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-10-collectives/commands.sh => topic-11-collectives/commands_wse3.sh} (68%) delete mode 100644 tutorials/topic-11-debug-library/README.rst delete mode 100755 tutorials/topic-11-debug-library/commands.sh delete mode 100644 tutorials/topic-11-debug-library/layout.csl delete mode 100644 tutorials/topic-11-debug-library/pe_program.csl delete mode 100644 tutorials/topic-11-debug-library/run.py rename tutorials/topic-12-debug-library/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-12-debug-library/commands_wse3.sh delete mode 100644 tutorials/topic-12-wse3-features/README.rst delete mode 100644 tutorials/topic-12-wse3-features/layout.csl delete mode 100644 tutorials/topic-12-wse3-features/left_pe.csl delete mode 100644 tutorials/topic-12-wse3-features/right_pe.csl delete mode 100644 tutorials/topic-12-wse3-features/run.py rename tutorials/topic-13-simprint/{commands.sh => commands_wse2.sh} (100%) create mode 100755 tutorials/topic-13-simprint/commands_wse3.sh rename tutorials/topic-14-color-swap/{commands.sh => commands_wse2.sh} (100%) rename tutorials/{topic-12-wse3-features/commands.sh => topic-15-wse3-microthreads/commands_wse3.sh} (100%) diff --git a/README.rst b/README.rst index 14f6723..9bf8467 100644 --- a/README.rst +++ b/README.rst @@ -57,20 +57,20 @@ The sample applications available are: (GEMM) using the ``collectives`` library. * ``residual``: Computes the norm of the residual of a matrix-vector multiplication. Builds on the ``gemv-checkerboard-pattern`` example. -* ``stencil-v2``: A 3D 25-point stencil finite difference code for solving a +* ``25-pt-stencil``: A 3D 25-point stencil finite difference code for solving a wave equation with a source perturbation. -* ``bandwidthTest``: Benchmarks the bandwidth of data transfers between host +* ``bandwidth-test``: Benchmarks the bandwidth of data transfers between host and device using the ``memcpy`` framework and the ``SdkRuntime`` host API. * ``spmv-hypersparse``: Computes a sparse matrix-vector product using a hypersparse matrix. -* ``stencil-3d-7pts``: Computes a sparse matrix-vector product using a matrix - generated by a 7-point stencil. -* ``powerMethod``: Implements the Power method to compute the eigenvector +* ``7pt-stencil-spmv``: Computes a sparse matrix-vector product using a matrix + generated by a 3D 7-point stencil. +* ``power-method``: Implements the Power method to compute the eigenvector of the largest eigenvalue of a matrix generated by a 7-point stencil. -* ``conjugateGradient``: Implements the Conjugate Gradient (CG) method to +* ``conjugate-gradient``: Implements the Conjugate Gradient (CG) method to approximate the solution to a system of linear equations ``A*x = b``, where ``A`` is a matrix generated by a 7-point stencil. -* ``preconditionedConjugateGradient``: Implements the Preconditioned Conjugate +* ``preconditioned-conjugate-gradient``: Implements the Preconditioned Conjugate Gradient method (PCG) to approximate the solution to a system of linear equations ``A*x = b``, where ``A`` is a matrix generated by a 7-point stencil. @@ -89,15 +89,20 @@ The sample applications available are: * ``FFT``: Implements 1D and 2D Discrete Fourier Transforms (DFT). * ``single-tile-matvec``: Implements highly optimized ``N x N`` matrix-vector products, in which each PE performs the same matrix-vector computation. +* ``row-col-broadcast``: Benchmarks the bandwidth of data transfers between + host and device, where data is broadcast across a row or column of PEs, + using ``memcpy_h2d_colbcast`` and ``memcpy_h2d_rowbcast``. +* ``game-of-life``: Implements Conway's Game of Life, where each PE is treated + as a single cell. Branches -------- For each release of the SDK, there is a corresponding release tag in this repository which contains a version of the CSL examples which are compatible -with that SDK release. For example, the tag ``rel-sdk-1.2.0`` in this +with that SDK release. For example, the tag ``rel-sdk-1.3.0`` in this repository contains a version of the CSL examples which will work (compile and -simulate) with the SDK 1.2.0 release. The ``master`` branch is identical to the +simulate) with the SDK 1.3.0 release. The ``master`` branch is identical to the newest release. Full backward compatibility of the SDK is not guaranteed. diff --git a/RELEASE-NOTES.rst b/RELEASE-NOTES.rst index fe83768..c51d1fc 100644 --- a/RELEASE-NOTES.rst +++ b/RELEASE-NOTES.rst @@ -4,6 +4,19 @@ Release Notes The following are the release notes for the CSL Examples repository, ``csl-examples``. +Version 1.3.0 +------------- + +- The examples are improved and updated to comply with the SDK version 1.3.0. + +- A new example program ``row-col-broadcast`` has been introduced which + benchmarks the bandwidth of data transfers between host and device, + where data is broadcast across a row or column of PEs, + using the new ``memcpy_h2d_colbcast`` and ``memcpy_h2d_rowbcast`` APIs. + +- A new example program ``game-of-life`` has been introduced which implements + Conway's Game of Life, where each PE is treated as a single cell. + Version 1.2.0 ------------- diff --git a/benchmarks/25-pt-stencil/commands.sh b/benchmarks/25-pt-stencil/commands_wse2.sh similarity index 100% rename from benchmarks/25-pt-stencil/commands.sh rename to benchmarks/25-pt-stencil/commands_wse2.sh diff --git a/benchmarks/stencil-3d-7pts/README.rst b/benchmarks/7pt-stencil-spmv/README.rst similarity index 97% rename from benchmarks/stencil-3d-7pts/README.rst rename to benchmarks/7pt-stencil-spmv/README.rst index 8d338e0..e8bf48b 100644 --- a/benchmarks/stencil-3d-7pts/README.rst +++ b/benchmarks/7pt-stencil-spmv/README.rst @@ -1,5 +1,5 @@ -stencil-3d-7pts -=============== +3D 7-Point Stencil SpMV +======================= This example evaluates the performance of 7-point stencil. The kernel records the ``start`` and ``end`` of ``spmv`` by tsc counter. In addition the tsc diff --git a/benchmarks/stencil-3d-7pts/cmd_parser.py b/benchmarks/7pt-stencil-spmv/cmd_parser.py similarity index 93% rename from benchmarks/stencil-3d-7pts/cmd_parser.py rename to benchmarks/7pt-stencil-spmv/cmd_parser.py index 7fab1ad..7a72006 100644 --- a/benchmarks/stencil-3d-7pts/cmd_parser.py +++ b/benchmarks/7pt-stencil-spmv/cmd_parser.py @@ -47,6 +47,8 @@ def parse_args(): "-n", default=1, type=int, help="number of columns") + parser.add_argument("--simulator", action="store_true", + help="Runs on simulator") parser.add_argument( "-k", default=1, type=int, @@ -74,10 +76,9 @@ def parse_args(): parser.add_argument( "--run-only", help="Run only", action="store_true") - # arch = wse1 or wse2 parser.add_argument( "--arch", - help="wse1 or wse2. Default is wse1 when not supplied.") + help="wse2 or wse3. Default is wse2 when not supplied.") parser.add_argument( "--width-west-buf", default=0, type=int, @@ -108,4 +109,7 @@ def parse_args(): print(f"create {logs_dir} to store log files") os.mkdir(logs_dir) + if args.cmaddr is None: + args.simulator = False + return args, logs_dir diff --git a/benchmarks/stencil-3d-7pts/commands.sh b/benchmarks/7pt-stencil-spmv/commands_wse2.sh similarity index 85% rename from benchmarks/stencil-3d-7pts/commands.sh rename to benchmarks/7pt-stencil-spmv/commands_wse2.sh index 25a43c2..17d2c53 100755 --- a/benchmarks/stencil-3d-7pts/commands.sh +++ b/benchmarks/7pt-stencil-spmv/commands_wse2.sh @@ -2,7 +2,7 @@ set -e -cslc ./layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ +cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ diff --git a/benchmarks/7pt-stencil-spmv/commands_wse3.sh b/benchmarks/7pt-stencil-spmv/commands_wse3.sh new file mode 100755 index 0000000..8517be9 --- /dev/null +++ b/benchmarks/7pt-stencil-spmv/commands_wse3.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -e + +cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ +--params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ +--params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ +--params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ +--memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 +cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ +--width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only diff --git a/benchmarks/7pt-stencil-spmv/run.appliance.py b/benchmarks/7pt-stencil-spmv/run.appliance.py new file mode 100644 index 0000000..1de0716 --- /dev/null +++ b/benchmarks/7pt-stencil-spmv/run.appliance.py @@ -0,0 +1,438 @@ +# Copyright 2024 Cerebras Systems. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" test 7-point stencil + + The Laplacian operator L on 3-dimensional domain can be represented by 7-point + stencil based on the standard 2nd order Finite Difference Method. The operator form + with Dirichlet boundary conditions can be written by + L[u](i,j,k) = u(i+1, j, k ) + u(i-1, j, k ) + + u(i, j+1,k ) + u(i, j-1, k ) + + u(i, j, k+1) + u(i, j, k-1) + + -6*u(i, j, k) + In general the coefficients of those 7 points can vary. To minimize the memory + consumption, this example assumes the coefficients are independent of index k and + whole vector u(i,j,:) is placed in one PE (px=j, py=i). + The above formula can be re-written by + c_west * x[i-1][j ][k ] + c_east * x[i+1][j ][k ] + + c_south * x[i ][j-1][k ] + c_north * x[i ][j+1][k ] + + c_bot * x[i ][j ][k-1] + c_top * x[i ][j ][k+1] + + c_center * x[i][j][k] + Each PE only holds 7 coefficients organized by c_west, c_east, c_south, c_north, + c_bot, c_top and c_center. + + This example provides two modules, one is allreduce and the other is stencil_3d_7pts. + "allreduce" module can synchronize all PEs to form a reference clock. + "stencil_3d_7pts" module can compute y = A*x where A is the matrix from 7-point stencil + + The framework is + --- + sync() // synchronize all PEs to sample the reference clock + tic() // record start time + spmv(zdim) // compute y = A*x + toc() // record end time + --- + + The tic() samples "time_start" and toc() samples "time_end". The sync() samples + "time_ref" which is used to shift "time_start" and "time_end". + The elapsed time is measured by + cycles_send = max(time_end) - min(time_start) + + The overall runtime is computed via the following formula + time_send = (cycles_send / 0.85) *1.e-3 us + where a PE runs with clock speed 850MHz + + Each PE needs to gather six f32 from six neighbors, the cost of the communication is + 6*h*w*zDim*4 bytes + where w-by-h is the core rectangle and zDim is the length of local vector. + + Here is the list of parameters: + -m= is the height of the core + -n= is the width of the core + -k= is size of x and y allocated in the core + --zDim= is the number of f32 per PE, computed by y = A*x + zDim must be not greater than k + --channels= specifies the number of I/O channels, no bigger than 16 +""" + + +import struct +import os +from typing import Optional +from pathlib import Path +import shutil +import subprocess +import random +import json + +import numpy as np + + +from cmd_parser import parse_args + + +from util import ( + hwl_2_oned_colmajor, + oned_to_hwl_colmajor, + laplacian, +) +from cerebras.sdk.client import ( + SdkCompiler, + SdkRuntime, +) + +from cerebras.appliance.pb.sdk.sdk_common_pb2 import ( + MemcpyDataType, + MemcpyOrder, +) + +hash_filename = "hash.json" + + +def float_to_hex(f): + return hex(struct.unpack('= 1, "number of I/O channels must be at least 1" + + print(f"width_west_buf = {width_west_buf}") + print(f"width_east_buf = {width_east_buf}") + print(f"channels = {channels}") + + height = args.m + width = args.n + pe_length = args.k + zDim = args.zDim + blockSize = args.blockSize + + print(f"width = {width}, height = {height}, pe_length={pe_length}, zDim={zDim}, blockSize={blockSize}") + assert pe_length >= 2, "the maximum size of z must be greater than 1" + assert zDim <= pe_length, "[0, zDim) cannot exceed the storage" + + np.random.seed(2) + # A is h-by-w-by-l + x = np.arange(height*width*pe_length).reshape(height, width, pe_length).astype(np.float32) + 100 + + x_1d = hwl_2_oned_colmajor(height, width, pe_length, x, np.float32) + + # stencil coefficients has the following order + # {c_west, c_east, c_south, c_north, c_bottom, c_top, c_center} + stencil_coeff = np.zeros((height, width, 7), dtype = np.float32) + for i in range(height): + for j in range(width): + stencil_coeff[(i, j, 0)] = -1 # west + stencil_coeff[(i, j, 1)] = -2 # east + stencil_coeff[(i, j, 2)] = -3 # south + stencil_coeff[(i, j, 3)] = -4 # north + stencil_coeff[(i, j, 4)] = -5 # bottom + stencil_coeff[(i, j, 5)] = -6 # top + stencil_coeff[(i, j, 6)] = 6 # center + + stencil_coeff_1d = hwl_2_oned_colmajor(height, width, 7, stencil_coeff, np.float32) + + y_ref = np.zeros((height, width, pe_length), dtype=np.float32) + + laplacian(stencil_coeff, zDim, x, y_ref) + + # fabric-offsets = 1,1 + fabric_offset_x = 1 + fabric_offset_y = 1 + # starting point of the core rectangle = (core_fabric_offset_x, core_fabric_offset_y) + # memcpy framework requires 3 columns at the west of the core rectangle + # memcpy framework requires 2 columns at the east of the core rectangle + core_fabric_offset_x = fabric_offset_x + 3 + width_west_buf + core_fabric_offset_y = fabric_offset_y + # (min_fabric_width, min_fabric_height) is the minimal dimension to run the app + min_fabric_width = (core_fabric_offset_x + width + 2 + 1 + width_east_buf) + min_fabric_height = (core_fabric_offset_y + height + 1) + + fabric_width = 0 + fabric_height = 0 + if args.fabric_dims: + w_str, h_str = args.fabric_dims.split(",") + fabric_width = int(w_str) + fabric_height = int(h_str) + + if fabric_width == 0 or fabric_height == 0: + fabric_width = min_fabric_width + fabric_height = min_fabric_height + + assert fabric_width >= min_fabric_width + assert fabric_height >= min_fabric_height + + # prepare the simulation + print('store ELFs and log files in the folder ', dirname) + + # layout of a rectangle + code_csl = "layout.csl" + + C0 = 0 + C1 = 1 + C2 = 2 + C3 = 3 + C4 = 4 + C5 = 5 + C6 = 6 + C7 = 7 + C8 = 8 + + csl_path = "./src" + + if args.compile_only: + print("WARNING: compile the code, don't run SdkRuntime because the server is down after the compilation"); + hashstr = csl_compile_core( + csl_path, + width, + height, + pe_length, + blockSize, + code_csl, + dirname, + fabric_width, + fabric_height, + core_fabric_offset_x, + core_fabric_offset_y, + args.arch, + C0, + C1, + C2, + C3, + C4, + C5, + C6, + C7, + C8, + channels, + width_west_buf, + width_east_buf + ) + print(f"dump artifact name to file {hash_filename}") + with open(hash_filename, "w") as write_file: + json.dump(hashstr, write_file) + print("COMPILE ONLY: EXIT") + return + + print(f"load artifact name from file {hash_filename}") + with open(hash_filename, "r") as f: + hashstr = json.load(f) + + memcpy_dtype = MemcpyDataType.MEMCPY_32BIT + with SdkRuntime(hashstr, simulator=args.simulator) as runner: + + symbol_x = runner.get_id("x") + symbol_y = runner.get_id("y") + symbol_time_memcpy = runner.get_id("time_memcpy") + symbol_stencil_coeff = runner.get_id("stencil_coeff") + symbol_time_buf_u16 = runner.get_id("time_buf_u16") + symbol_time_ref = runner.get_id("time_ref") + + # load() and run() are called by client.Sdkruntime.__enter__ + #runner.load() + #runner.run() + + print(f"copy vector x of type f32") + # the size of x per PE is pe_length + runner.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) + + print(f"copy coefficients of type f32") + # each PE holds 7 coefficients + runner.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 0, 0, width, height, 7,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) + + print("step 1: sync all PEs") + runner.launch("f_sync", np.int16(1), nonblock=False) + + print("step 2: tic() records time_start") + runner.launch("f_tic", nonblock=True) + + print(f"step 3: compute y = A*x with zDim = {zDim}") + # positive zDim can be smaller than pe_length + runner.launch("f_spmv", np.int16(zDim), nonblock=False) + + print("step 4: toc() records time_end") + runner.launch("f_toc", nonblock=False) + + print("step 5: prepare (time_start, time_end)") + runner.launch("f_memcpy_timestamps", nonblock=False) + + print("step 6: D2H (time_start, time_end)") + # time_start/time_end is of type u16[3] + time_memcpy_hwl_1d = np.zeros(height*width*6, np.uint32) + runner.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ + streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) + time_memcpy_hwl = oned_to_hwl_colmajor(height, width, 6, time_memcpy_hwl_1d, np.uint16) + + print("step 7: D2H y of type f32") + y_1d = np.zeros(height*width*pe_length, np.float32) + runner.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False) + y_wse = np.reshape(y_1d, (height, width, pe_length), order='F') + + print("step 8: prepare reference clock") + runner.launch("f_reference_timestamps", nonblock=False) + + print("step 9: D2H reference clock") + time_ref_1d = np.zeros(height*width*3, np.uint32) + runner.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ + streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) + time_ref_hwl = oned_to_hwl_colmajor(height, width, 3, time_ref_1d, np.uint16) + + # stop() is called by client.Sdkruntime.__exit__ + #runner.stop() + + # time_start = start time of spmv + time_start = np.zeros((height, width)).astype(int) + # time_end = end time of spmv + time_end = np.zeros((height, width)).astype(int) + word = np.zeros(3).astype(np.uint16) + for w in range(width): + for h in range(height): + word[0] = time_memcpy_hwl[(h, w, 0)] + word[1] = time_memcpy_hwl[(h, w, 1)] + word[2] = time_memcpy_hwl[(h, w, 2)] + time_start[(h,w)] = make_u48(word) + word[0] = time_memcpy_hwl[(h, w, 3)] + word[1] = time_memcpy_hwl[(h, w, 4)] + word[2] = time_memcpy_hwl[(h, w, 5)] + time_end[(h,w)] = make_u48(word) + + # time_ref = reference clock + time_ref = np.zeros((height, width)).astype(int) + word = np.zeros(3).astype(np.uint16) + for w in range(width): + for h in range(height): + word[0] = time_ref_hwl[(h, w, 0)] + word[1] = time_ref_hwl[(h, w, 1)] + word[2] = time_ref_hwl[(h, w, 2)] + time_ref[(h, w)] = make_u48(word) + + # adjust the reference clock by the propagation delay + # the right-bottom PE signals other PEs, the propagation delay is + # (h-1) - py + (w-1) - px + for py in range(height): + for px in range(width): + time_ref[(py, px)] = time_ref[(py, px)] - ((width+height-2)-(px + py)) + + # shift time_start and time_end by time_ref + time_start = time_start - time_ref + time_end = time_end - time_ref + + # cycles_send = time_end[(h,w)] - time_start[(h,w)] + # 850MHz --> 1 cycle = (1/0.85) ns = (1/0.85)*1.e-3 us + # time_send = (cycles_send / 0.85) *1.e-3 us + # + # each PE needs to gather six f32 from six neighbors, the cost of the communication is + # 6*h*w*zDim*4 bytes + # + # bandwidth = (((wvlts-1) * 4)/time_send) MBS + wvlts = 6*height*width*zDim + min_time_start = time_start.min() + max_time_end = time_end.max() + cycles_send = max_time_end - min_time_start + time_send = (cycles_send / 0.85) *1.e-3 + bandwidth = ((wvlts * 4)/time_send) + print(f"cycles_send = {cycles_send} cycles") + print(f"time_send = {time_send} us") + print(f"bandwidth = {bandwidth} MB/S ") + + z = y_ref.ravel() - y_wse.ravel() + nrm_z = np.linalg.norm(z, np.inf) + print(f"|y_ref - y_wes| = {nrm_z}") + np.testing.assert_allclose(y_ref.ravel(), y_wse.ravel(), 1.e-5) + print("\nSUCCESS!") + +if __name__ == "__main__": + main() diff --git a/benchmarks/stencil-3d-7pts/run.py b/benchmarks/7pt-stencil-spmv/run.py similarity index 92% rename from benchmarks/stencil-3d-7pts/run.py rename to benchmarks/7pt-stencil-spmv/run.py index e7c0a11..0b15730 100644 --- a/benchmarks/stencil-3d-7pts/run.py +++ b/benchmarks/7pt-stencil-spmv/run.py @@ -251,7 +251,7 @@ def main(): sim_log = os.path.join(dirname, "sim.log") # layout of a rectangle - code_csl = "layout.csl" + code_csl = "src/layout.csl" C0 = 0 C1 = 1 @@ -295,67 +295,67 @@ def main(): return memcpy_dtype = MemcpyDataType.MEMCPY_32BIT - simulator = SdkRuntime(dirname, cmaddr=args.cmaddr) + runner = SdkRuntime(dirname, cmaddr=args.cmaddr) - symbol_x = simulator.get_id("x") - symbol_y = simulator.get_id("y") - symbol_stencil_coeff = simulator.get_id("stencil_coeff") - symbol_time_buf_u16 = simulator.get_id("time_buf_u16") - symbol_time_ref = simulator.get_id("time_ref") + symbol_x = runner.get_id("x") + symbol_y = runner.get_id("y") + symbol_stencil_coeff = runner.get_id("stencil_coeff") + symbol_time_buf_u16 = runner.get_id("time_buf_u16") + symbol_time_ref = runner.get_id("time_ref") - simulator.load() - simulator.run() + runner.load() + runner.run() print(f"copy vector x of type f32") # the size of x per PE is pe_length - simulator.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ + runner.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) print(f"copy coefficients of type f32") # each PE holds 7 coefficients - simulator.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 0, 0, width, height, 7,\ + runner.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 0, 0, width, height, 7,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) print("step 1: sync all PEs") - simulator.launch("f_sync", np.int16(1), nonblock=False) + runner.launch("f_sync", np.int16(1), nonblock=False) print("step 2: tic() records time_start") - simulator.launch("f_tic", nonblock=True) + runner.launch("f_tic", nonblock=True) print(f"step 3: compute y = A*x with zDim = {zDim}") # positive zDim can be smaller than pe_length - simulator.launch("f_spmv", np.int16(zDim), nonblock=False) + runner.launch("f_spmv", np.int16(zDim), nonblock=False) print("step 4: toc() records time_end") - simulator.launch("f_toc", nonblock=False) + runner.launch("f_toc", nonblock=False) print("step 5: prepare (time_start, time_end)") - simulator.launch("f_memcpy_timestamps", nonblock=False) + runner.launch("f_memcpy_timestamps", nonblock=False) print("step 6: D2H (time_start, time_end)") time_memcpy_hwl_1d = np.zeros(height*width*6, np.uint32) - simulator.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ + runner.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) time_memcpy_hwl = oned_to_hwl_colmajor(height, width, 6, time_memcpy_hwl_1d, np.uint16) print("step 7: D2H y of type f32") y_1d = np.zeros(height*width*pe_length, np.float32) - simulator.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ + runner.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False) y_wse = np.reshape(y_1d, (height, width, pe_length), order='F') print("step 8: prepare reference clock") - simulator.launch("f_reference_timestamps", nonblock=False) + runner.launch("f_reference_timestamps", nonblock=False) print("step 9: D2H reference clock") time_ref_1d = np.zeros(height*width*3, np.uint32) - simulator.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ + runner.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) time_ref_hwl = oned_to_hwl_colmajor(height, width, 3, time_ref_1d, np.uint16) - simulator.stop() + runner.stop() - if args.cmaddr is None: + if args.simulator: # move simulation log and core dump to the given folder dst_log = Path(f"{dirname}/sim.log") src_log = Path("sim.log") diff --git a/benchmarks/stencil-3d-7pts/kernel.csl b/benchmarks/7pt-stencil-spmv/src/kernel.csl similarity index 95% rename from benchmarks/stencil-3d-7pts/kernel.csl rename to benchmarks/7pt-stencil-spmv/src/kernel.csl index 1adcebc..ff57614 100644 --- a/benchmarks/stencil-3d-7pts/kernel.csl +++ b/benchmarks/7pt-stencil-spmv/src/kernel.csl @@ -32,7 +32,7 @@ const timestamp = @import_module("