diff --git a/README.rst b/README.rst index 14f6723..9bf8467 100644 --- a/README.rst +++ b/README.rst @@ -57,20 +57,20 @@ The sample applications available are: (GEMM) using the ``collectives`` library. * ``residual``: Computes the norm of the residual of a matrix-vector multiplication. Builds on the ``gemv-checkerboard-pattern`` example. -* ``stencil-v2``: A 3D 25-point stencil finite difference code for solving a +* ``25-pt-stencil``: A 3D 25-point stencil finite difference code for solving a wave equation with a source perturbation. -* ``bandwidthTest``: Benchmarks the bandwidth of data transfers between host +* ``bandwidth-test``: Benchmarks the bandwidth of data transfers between host and device using the ``memcpy`` framework and the ``SdkRuntime`` host API. * ``spmv-hypersparse``: Computes a sparse matrix-vector product using a hypersparse matrix. -* ``stencil-3d-7pts``: Computes a sparse matrix-vector product using a matrix - generated by a 7-point stencil. -* ``powerMethod``: Implements the Power method to compute the eigenvector +* ``7pt-stencil-spmv``: Computes a sparse matrix-vector product using a matrix + generated by a 3D 7-point stencil. +* ``power-method``: Implements the Power method to compute the eigenvector of the largest eigenvalue of a matrix generated by a 7-point stencil. -* ``conjugateGradient``: Implements the Conjugate Gradient (CG) method to +* ``conjugate-gradient``: Implements the Conjugate Gradient (CG) method to approximate the solution to a system of linear equations ``A*x = b``, where ``A`` is a matrix generated by a 7-point stencil. -* ``preconditionedConjugateGradient``: Implements the Preconditioned Conjugate +* ``preconditioned-conjugate-gradient``: Implements the Preconditioned Conjugate Gradient method (PCG) to approximate the solution to a system of linear equations ``A*x = b``, where ``A`` is a matrix generated by a 7-point stencil. 
@@ -89,15 +89,20 @@ The sample applications available are: * ``FFT``: Implements 1D and 2D Discrete Fourier Transforms (DFT). * ``single-tile-matvec``: Implements highly optimized ``N x N`` matrix-vector products, in which each PE performs the same matrix-vector computation. +* ``row-col-broadcast``: Benchmarks the bandwidth of data transfers between + host and device, where data is broadcast across a row or column of PEs, + using ``memcpy_h2d_colbcast`` and ``memcpy_h2d_rowbcast``. +* ``game-of-life``: Implements Conway's Game of Life, where each PE is treated + as a single cell. Branches -------- For each release of the SDK, there is a corresponding release tag in this repository which contains a version of the CSL examples which are compatible -with that SDK release. For example, the tag ``rel-sdk-1.2.0`` in this +with that SDK release. For example, the tag ``rel-sdk-1.3.0`` in this repository contains a version of the CSL examples which will work (compile and -simulate) with the SDK 1.2.0 release. The ``master`` branch is identical to the +simulate) with the SDK 1.3.0 release. The ``master`` branch is identical to the newest release. Full backward compatibility of the SDK is not guaranteed. diff --git a/RELEASE-NOTES.rst b/RELEASE-NOTES.rst index fe83768..c51d1fc 100644 --- a/RELEASE-NOTES.rst +++ b/RELEASE-NOTES.rst @@ -4,6 +4,19 @@ Release Notes The following are the release notes for the CSL Examples repository, ``csl-examples``. +Version 1.3.0 +------------- + +- The examples are improved and updated to comply with the SDK version 1.3.0. + +- A new example program ``row-col-broadcast`` has been introduced which + benchmarks the bandwidth of data transfers between host and device, + where data is broadcast across a row or column of PEs, + using the new ``memcpy_h2d_colbcast`` and ``memcpy_h2d_rowbcast`` APIs. + +- A new example program ``game-of-life`` has been introduced which implements + Conway's Game of Life, where each PE is treated as a single cell. 
+ Version 1.2.0 ------------- diff --git a/benchmarks/25-pt-stencil/commands.sh b/benchmarks/25-pt-stencil/commands_wse2.sh similarity index 100% rename from benchmarks/25-pt-stencil/commands.sh rename to benchmarks/25-pt-stencil/commands_wse2.sh diff --git a/benchmarks/stencil-3d-7pts/README.rst b/benchmarks/7pt-stencil-spmv/README.rst similarity index 97% rename from benchmarks/stencil-3d-7pts/README.rst rename to benchmarks/7pt-stencil-spmv/README.rst index 8d338e0..e8bf48b 100644 --- a/benchmarks/stencil-3d-7pts/README.rst +++ b/benchmarks/7pt-stencil-spmv/README.rst @@ -1,5 +1,5 @@ -stencil-3d-7pts -=============== +3D 7-Point Stencil SpMV +======================= This example evaluates the performance of 7-point stencil. The kernel records the ``start`` and ``end`` of ``spmv`` by tsc counter. In addition the tsc diff --git a/benchmarks/stencil-3d-7pts/cmd_parser.py b/benchmarks/7pt-stencil-spmv/cmd_parser.py similarity index 93% rename from benchmarks/stencil-3d-7pts/cmd_parser.py rename to benchmarks/7pt-stencil-spmv/cmd_parser.py index 7fab1ad..7a72006 100644 --- a/benchmarks/stencil-3d-7pts/cmd_parser.py +++ b/benchmarks/7pt-stencil-spmv/cmd_parser.py @@ -47,6 +47,8 @@ def parse_args(): "-n", default=1, type=int, help="number of columns") + parser.add_argument("--simulator", action="store_true", + help="Runs on simulator") parser.add_argument( "-k", default=1, type=int, @@ -74,10 +76,9 @@ def parse_args(): parser.add_argument( "--run-only", help="Run only", action="store_true") - # arch = wse1 or wse2 parser.add_argument( "--arch", - help="wse1 or wse2. Default is wse1 when not supplied.") + help="wse2 or wse3. 
Default is wse2 when not supplied.") parser.add_argument( "--width-west-buf", default=0, type=int, @@ -108,4 +109,7 @@ def parse_args(): print(f"create {logs_dir} to store log files") os.mkdir(logs_dir) + if args.cmaddr is None: + args.simulator = False + return args, logs_dir diff --git a/benchmarks/stencil-3d-7pts/commands.sh b/benchmarks/7pt-stencil-spmv/commands_wse2.sh similarity index 85% rename from benchmarks/stencil-3d-7pts/commands.sh rename to benchmarks/7pt-stencil-spmv/commands_wse2.sh index 25a43c2..17d2c53 100755 --- a/benchmarks/stencil-3d-7pts/commands.sh +++ b/benchmarks/7pt-stencil-spmv/commands_wse2.sh @@ -2,7 +2,7 @@ set -e -cslc ./layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ +cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ diff --git a/benchmarks/7pt-stencil-spmv/commands_wse3.sh b/benchmarks/7pt-stencil-spmv/commands_wse3.sh new file mode 100755 index 0000000..8517be9 --- /dev/null +++ b/benchmarks/7pt-stencil-spmv/commands_wse3.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -e + +cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ +--params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ +--params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ +--params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ +--memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 +cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ +--width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only diff --git a/benchmarks/7pt-stencil-spmv/run.appliance.py b/benchmarks/7pt-stencil-spmv/run.appliance.py new file mode 100644 index 0000000..1de0716 --- /dev/null +++ b/benchmarks/7pt-stencil-spmv/run.appliance.py 
@@ -0,0 +1,438 @@ +# Copyright 2024 Cerebras Systems. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" test 7-point stencil + + The Laplacian operator L on 3-dimensional domain can be represented by 7-point + stencil based on the standard 2nd order Finite Difference Method. The operator form + with Dirichlet boundary conditions can be written by + L[u](i,j,k) = u(i+1, j, k ) + u(i-1, j, k ) + + u(i, j+1,k ) + u(i, j-1, k ) + + u(i, j, k+1) + u(i, j, k-1) + + -6*u(i, j, k) + In general the coefficients of those 7 points can vary. To minimize the memory + consumption, this example assumes the coefficients are independent of index k and + whole vector u(i,j,:) is placed in one PE (px=j, py=i). + The above formula can be re-written by + c_west * x[i-1][j ][k ] + c_east * x[i+1][j ][k ] + + c_south * x[i ][j-1][k ] + c_north * x[i ][j+1][k ] + + c_bot * x[i ][j ][k-1] + c_top * x[i ][j ][k+1] + + c_center * x[i][j][k] + Each PE only holds 7 coefficients organized by c_west, c_east, c_south, c_north, + c_bot, c_top and c_center. + + This example provides two modules, one is allreduce and the other is stencil_3d_7pts. + "allreduce" module can synchronize all PEs to form a reference clock. 
+ "stencil_3d_7pts" module can compute y = A*x where A is the matrix from the 7-point stencil + + The framework is + --- + sync() // synchronize all PEs to sample the reference clock + tic() // record start time + spmv(zdim) // compute y = A*x + toc() // record end time + --- + + The tic() samples "time_start" and toc() samples "time_end". The sync() samples + "time_ref" which is used to shift "time_start" and "time_end". + The elapsed time is measured by + cycles_send = max(time_end) - min(time_start) + + The overall runtime is computed via the following formula + time_send = (cycles_send / 0.85) *1.e-3 us + where a PE runs with clock speed 850MHz + + Each PE needs to gather six f32 from six neighbors, the cost of the communication is + 6*h*w*zDim*4 bytes + where w-by-h is the core rectangle and zDim is the length of the local vector. + + Here is the list of parameters: + -m= is the height of the core + -n= is the width of the core + -k= is the size of x and y allocated in the core + --zDim= is the number of f32 per PE, computed by y = A*x + zDim must not be greater than k + --channels= specifies the number of I/O channels, no bigger than 16 +""" + + +import struct +import os +from typing import Optional +from pathlib import Path +import shutil +import subprocess +import random +import json + +import numpy as np + + +from cmd_parser import parse_args + + +from util import ( + hwl_2_oned_colmajor, + oned_to_hwl_colmajor, + laplacian, +) +from cerebras.sdk.client import ( + SdkCompiler, + SdkRuntime, +) + +from cerebras.appliance.pb.sdk.sdk_common_pb2 import ( + MemcpyDataType, + MemcpyOrder, +) + +hash_filename = "hash.json" + + +def float_to_hex(f): + return hex(struct.unpack('= 1, "number of I/O channels must be at least 1" + + print(f"width_west_buf = {width_west_buf}") + print(f"width_east_buf = {width_east_buf}") + print(f"channels = {channels}") + + height = args.m + width = args.n + pe_length = args.k + zDim = args.zDim + blockSize = args.blockSize + + print(f"width = 
{width}, height = {height}, pe_length={pe_length}, zDim={zDim}, blockSize={blockSize}") + assert pe_length >= 2, "the maximum size of z must be greater than 1" + assert zDim <= pe_length, "[0, zDim) cannot exceed the storage" + + np.random.seed(2) + # A is h-by-w-by-l + x = np.arange(height*width*pe_length).reshape(height, width, pe_length).astype(np.float32) + 100 + + x_1d = hwl_2_oned_colmajor(height, width, pe_length, x, np.float32) + + # stencil coefficients has the following order + # {c_west, c_east, c_south, c_north, c_bottom, c_top, c_center} + stencil_coeff = np.zeros((height, width, 7), dtype = np.float32) + for i in range(height): + for j in range(width): + stencil_coeff[(i, j, 0)] = -1 # west + stencil_coeff[(i, j, 1)] = -2 # east + stencil_coeff[(i, j, 2)] = -3 # south + stencil_coeff[(i, j, 3)] = -4 # north + stencil_coeff[(i, j, 4)] = -5 # bottom + stencil_coeff[(i, j, 5)] = -6 # top + stencil_coeff[(i, j, 6)] = 6 # center + + stencil_coeff_1d = hwl_2_oned_colmajor(height, width, 7, stencil_coeff, np.float32) + + y_ref = np.zeros((height, width, pe_length), dtype=np.float32) + + laplacian(stencil_coeff, zDim, x, y_ref) + + # fabric-offsets = 1,1 + fabric_offset_x = 1 + fabric_offset_y = 1 + # starting point of the core rectangle = (core_fabric_offset_x, core_fabric_offset_y) + # memcpy framework requires 3 columns at the west of the core rectangle + # memcpy framework requires 2 columns at the east of the core rectangle + core_fabric_offset_x = fabric_offset_x + 3 + width_west_buf + core_fabric_offset_y = fabric_offset_y + # (min_fabric_width, min_fabric_height) is the minimal dimension to run the app + min_fabric_width = (core_fabric_offset_x + width + 2 + 1 + width_east_buf) + min_fabric_height = (core_fabric_offset_y + height + 1) + + fabric_width = 0 + fabric_height = 0 + if args.fabric_dims: + w_str, h_str = args.fabric_dims.split(",") + fabric_width = int(w_str) + fabric_height = int(h_str) + + if fabric_width == 0 or fabric_height == 0: + 
fabric_width = min_fabric_width + fabric_height = min_fabric_height + + assert fabric_width >= min_fabric_width + assert fabric_height >= min_fabric_height + + # prepare the simulation + print('store ELFs and log files in the folder ', dirname) + + # layout of a rectangle + code_csl = "layout.csl" + + C0 = 0 + C1 = 1 + C2 = 2 + C3 = 3 + C4 = 4 + C5 = 5 + C6 = 6 + C7 = 7 + C8 = 8 + + csl_path = "./src" + + if args.compile_only: + print("WARNING: compile the code, don't run SdkRuntime because the server is down after the compilation"); + hashstr = csl_compile_core( + csl_path, + width, + height, + pe_length, + blockSize, + code_csl, + dirname, + fabric_width, + fabric_height, + core_fabric_offset_x, + core_fabric_offset_y, + args.arch, + C0, + C1, + C2, + C3, + C4, + C5, + C6, + C7, + C8, + channels, + width_west_buf, + width_east_buf + ) + print(f"dump artifact name to file {hash_filename}") + with open(hash_filename, "w") as write_file: + json.dump(hashstr, write_file) + print("COMPILE ONLY: EXIT") + return + + print(f"load artifact name from file {hash_filename}") + with open(hash_filename, "r") as f: + hashstr = json.load(f) + + memcpy_dtype = MemcpyDataType.MEMCPY_32BIT + with SdkRuntime(hashstr, simulator=args.simulator) as runner: + + symbol_x = runner.get_id("x") + symbol_y = runner.get_id("y") + symbol_time_memcpy = runner.get_id("time_memcpy") + symbol_stencil_coeff = runner.get_id("stencil_coeff") + symbol_time_buf_u16 = runner.get_id("time_buf_u16") + symbol_time_ref = runner.get_id("time_ref") + + # load() and run() are called by client.Sdkruntime.__enter__ + #runner.load() + #runner.run() + + print(f"copy vector x of type f32") + # the size of x per PE is pe_length + runner.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) + + print(f"copy coefficients of type f32") + # each PE holds 7 coefficients + runner.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 
0, 0, width, height, 7,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) + + print("step 1: sync all PEs") + runner.launch("f_sync", np.int16(1), nonblock=False) + + print("step 2: tic() records time_start") + runner.launch("f_tic", nonblock=True) + + print(f"step 3: compute y = A*x with zDim = {zDim}") + # positive zDim can be smaller than pe_length + runner.launch("f_spmv", np.int16(zDim), nonblock=False) + + print("step 4: toc() records time_end") + runner.launch("f_toc", nonblock=False) + + print("step 5: prepare (time_start, time_end)") + runner.launch("f_memcpy_timestamps", nonblock=False) + + print("step 6: D2H (time_start, time_end)") + # time_start/time_end is of type u16[3] + time_memcpy_hwl_1d = np.zeros(height*width*6, np.uint32) + runner.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ + streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) + time_memcpy_hwl = oned_to_hwl_colmajor(height, width, 6, time_memcpy_hwl_1d, np.uint16) + + print("step 7: D2H y of type f32") + y_1d = np.zeros(height*width*pe_length, np.float32) + runner.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ + streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False) + y_wse = np.reshape(y_1d, (height, width, pe_length), order='F') + + print("step 8: prepare reference clock") + runner.launch("f_reference_timestamps", nonblock=False) + + print("step 9: D2H reference clock") + time_ref_1d = np.zeros(height*width*3, np.uint32) + runner.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ + streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) + time_ref_hwl = oned_to_hwl_colmajor(height, width, 3, time_ref_1d, np.uint16) + + # stop() is called by client.Sdkruntime.__exit__ + #runner.stop() + + # time_start = start time of spmv + time_start = np.zeros((height, 
width)).astype(int) + # time_end = end time of spmv + time_end = np.zeros((height, width)).astype(int) + word = np.zeros(3).astype(np.uint16) + for w in range(width): + for h in range(height): + word[0] = time_memcpy_hwl[(h, w, 0)] + word[1] = time_memcpy_hwl[(h, w, 1)] + word[2] = time_memcpy_hwl[(h, w, 2)] + time_start[(h,w)] = make_u48(word) + word[0] = time_memcpy_hwl[(h, w, 3)] + word[1] = time_memcpy_hwl[(h, w, 4)] + word[2] = time_memcpy_hwl[(h, w, 5)] + time_end[(h,w)] = make_u48(word) + + # time_ref = reference clock + time_ref = np.zeros((height, width)).astype(int) + word = np.zeros(3).astype(np.uint16) + for w in range(width): + for h in range(height): + word[0] = time_ref_hwl[(h, w, 0)] + word[1] = time_ref_hwl[(h, w, 1)] + word[2] = time_ref_hwl[(h, w, 2)] + time_ref[(h, w)] = make_u48(word) + + # adjust the reference clock by the propagation delay + # the right-bottom PE signals other PEs, the propagation delay is + # (h-1) - py + (w-1) - px + for py in range(height): + for px in range(width): + time_ref[(py, px)] = time_ref[(py, px)] - ((width+height-2)-(px + py)) + + # shift time_start and time_end by time_ref + time_start = time_start - time_ref + time_end = time_end - time_ref + + # cycles_send = max(time_end) - min(time_start) + # 850MHz --> 1 cycle = (1/0.85) ns = (1/0.85)*1.e-3 us + # time_send = (cycles_send / 0.85) *1.e-3 us + # + # each PE needs to gather six f32 from six neighbors, the cost of the communication is + # 6*h*w*zDim*4 bytes + # + # bandwidth = ((wvlts * 4)/time_send) MB/s + wvlts = 6*height*width*zDim + min_time_start = time_start.min() + max_time_end = time_end.max() + cycles_send = max_time_end - min_time_start + time_send = (cycles_send / 0.85) *1.e-3 + bandwidth = ((wvlts * 4)/time_send) + print(f"cycles_send = {cycles_send} cycles") + print(f"time_send = {time_send} us") + print(f"bandwidth = {bandwidth} MB/S ") + + z = y_ref.ravel() - y_wse.ravel() + nrm_z = np.linalg.norm(z, np.inf) + print(f"|y_ref - y_wes| = 
{nrm_z}") + np.testing.assert_allclose(y_ref.ravel(), y_wse.ravel(), 1.e-5) + print("\nSUCCESS!") + +if __name__ == "__main__": + main() diff --git a/benchmarks/stencil-3d-7pts/run.py b/benchmarks/7pt-stencil-spmv/run.py similarity index 92% rename from benchmarks/stencil-3d-7pts/run.py rename to benchmarks/7pt-stencil-spmv/run.py index e7c0a11..0b15730 100644 --- a/benchmarks/stencil-3d-7pts/run.py +++ b/benchmarks/7pt-stencil-spmv/run.py @@ -251,7 +251,7 @@ def main(): sim_log = os.path.join(dirname, "sim.log") # layout of a rectangle - code_csl = "layout.csl" + code_csl = "src/layout.csl" C0 = 0 C1 = 1 @@ -295,67 +295,67 @@ def main(): return memcpy_dtype = MemcpyDataType.MEMCPY_32BIT - simulator = SdkRuntime(dirname, cmaddr=args.cmaddr) + runner = SdkRuntime(dirname, cmaddr=args.cmaddr) - symbol_x = simulator.get_id("x") - symbol_y = simulator.get_id("y") - symbol_stencil_coeff = simulator.get_id("stencil_coeff") - symbol_time_buf_u16 = simulator.get_id("time_buf_u16") - symbol_time_ref = simulator.get_id("time_ref") + symbol_x = runner.get_id("x") + symbol_y = runner.get_id("y") + symbol_stencil_coeff = runner.get_id("stencil_coeff") + symbol_time_buf_u16 = runner.get_id("time_buf_u16") + symbol_time_ref = runner.get_id("time_ref") - simulator.load() - simulator.run() + runner.load() + runner.run() print(f"copy vector x of type f32") # the size of x per PE is pe_length - simulator.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ + runner.memcpy_h2d(symbol_x, x_1d, 0, 0, width, height, pe_length,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) print(f"copy coefficients of type f32") # each PE holds 7 coefficients - simulator.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 0, 0, width, height, 7,\ + runner.memcpy_h2d(symbol_stencil_coeff, stencil_coeff_1d, 0, 0, width, height, 7,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=True) print("step 1: sync all PEs") - 
simulator.launch("f_sync", np.int16(1), nonblock=False) + runner.launch("f_sync", np.int16(1), nonblock=False) print("step 2: tic() records time_start") - simulator.launch("f_tic", nonblock=True) + runner.launch("f_tic", nonblock=True) print(f"step 3: compute y = A*x with zDim = {zDim}") # positive zDim can be smaller than pe_length - simulator.launch("f_spmv", np.int16(zDim), nonblock=False) + runner.launch("f_spmv", np.int16(zDim), nonblock=False) print("step 4: toc() records time_end") - simulator.launch("f_toc", nonblock=False) + runner.launch("f_toc", nonblock=False) print("step 5: prepare (time_start, time_end)") - simulator.launch("f_memcpy_timestamps", nonblock=False) + runner.launch("f_memcpy_timestamps", nonblock=False) print("step 6: D2H (time_start, time_end)") time_memcpy_hwl_1d = np.zeros(height*width*6, np.uint32) - simulator.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ + runner.memcpy_d2h(time_memcpy_hwl_1d, symbol_time_buf_u16, 0, 0, width, height, 6,\ streaming=False, data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) time_memcpy_hwl = oned_to_hwl_colmajor(height, width, 6, time_memcpy_hwl_1d, np.uint16) print("step 7: D2H y of type f32") y_1d = np.zeros(height*width*pe_length, np.float32) - simulator.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ + runner.memcpy_d2h(y_1d, symbol_y, 0, 0, width, height, pe_length,\ streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False) y_wse = np.reshape(y_1d, (height, width, pe_length), order='F') print("step 8: prepare reference clock") - simulator.launch("f_reference_timestamps", nonblock=False) + runner.launch("f_reference_timestamps", nonblock=False) print("step 9: D2H reference clock") time_ref_1d = np.zeros(height*width*3, np.uint32) - simulator.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ + runner.memcpy_d2h(time_ref_1d, symbol_time_ref, 0, 0, width, height, 3,\ streaming=False, 
data_type=MemcpyDataType.MEMCPY_16BIT, order=MemcpyOrder.COL_MAJOR, nonblock=False) time_ref_hwl = oned_to_hwl_colmajor(height, width, 3, time_ref_1d, np.uint16) - simulator.stop() + runner.stop() - if args.cmaddr is None: + if args.simulator: # move simulation log and core dump to the given folder dst_log = Path(f"{dirname}/sim.log") src_log = Path("sim.log") diff --git a/benchmarks/stencil-3d-7pts/kernel.csl b/benchmarks/7pt-stencil-spmv/src/kernel.csl similarity index 95% rename from benchmarks/stencil-3d-7pts/kernel.csl rename to benchmarks/7pt-stencil-spmv/src/kernel.csl index 1adcebc..ff57614 100644 --- a/benchmarks/stencil-3d-7pts/kernel.csl +++ b/benchmarks/7pt-stencil-spmv/src/kernel.csl @@ -32,7 +32,7 @@ const timestamp = @import_module("