More loop variants. (#33)
* Add loop directive variants
* Add tests
* Add a way to skip a test only for device=1
* Remove old comments
* Process schedule_clause and dist_schedule_clause from the grammar
DrTodd13 authored Oct 1, 2024
1 parent fb08a5d commit e86d157
Showing 2 changed files with 91 additions and 54 deletions.
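As a quick illustration of what this commit enables, here is a minimal usage sketch mirroring the tests added below. It is not part of the commit: the import lines are assumptions based on the usual pyomp style (the test file uses openmp and njit directly), and the function name is illustrative.

    import numpy as np
    from numba import njit                                 # assumed import path
    from numba.openmp import openmp_context as openmp      # assumed alias, as in typical pyomp examples

    @njit
    def vec_add(n):
        a = np.ones((n, n))
        b = np.ones((n, n))
        c = np.zeros((n, n))
        with openmp("target map(tofrom: a, b, c)"):
            with openmp("teams"):
                # New in this commit: a bare "loop" nested under teams is lowered to a
                # DISTRIBUTE.PARALLEL.LOOP-style schedule (see loop_directive in the diff).
                with openmp("loop collapse(2)"):
                    for i in range(n):
                        for j in range(n):
                            c[i, j] = a[i, j] + b[i, j]
        return c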
105 changes: 51 additions & 54 deletions numba/openmp.py
@@ -515,7 +515,7 @@ def replace_vars_inner(self, var_dict):

def add_to_usedef_set(self, use_set, def_set, start):
assert start==True or start==False
if config.DEBUG_OPENMP >= 1:
if config.DEBUG_OPENMP >= 3:
print("add_to_usedef_set", start, self.name, "is_dsa=", is_dsa(self.name))

def add_arg(arg, the_set):
@@ -3360,6 +3360,7 @@ def get_loops_in_region(all_loops):
collapse_tags = get_tags_of_type(clauses, "QUAL.OMP.COLLAPSE")
new_stmts_for_iterspace = []
collapse_iterspace_block = set()
iterspace_vars = []
if len(collapse_tags) > 0:
# Limit all_loops to just loops within the openmp region.
all_loops = get_loops_in_region(all_loops)
@@ -3469,7 +3470,6 @@ def get_loops_in_region(all_loops):
new_var_scope = last_loop_entry_block.body[0].target.scope

# -------- Add vars to remember cumulative product of iteration space sizes.
iterspace_vars = []
new_iterspace_var = new_var_scope.redefine("new_iterspace0", self.loc)
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name))
iterspace_vars.append(new_iterspace_var)
@@ -3873,8 +3873,7 @@ def _get_loop_kind(func_var, call_table):
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_start_var.name))
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_lb_var.name))
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_ub_var.name))
tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
#tags_for_enclosing = [omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name, get_itercount_var.name] + [x.name for x in iterspace_vars]
tags_for_enclosing = [openmp_tag("QUAL.OMP.PRIVATE", x) for x in tags_for_enclosing]
# Don't blindly copy code here...this isn't doing what the other spots are doing with privatization.
#self.add_private_to_enclosing(replace_vardict, tags_for_enclosing)
@@ -3891,15 +3890,6 @@ def some_for_directive(self, args, main_start_tag, main_end_tag, first_clause, g
start_tags = [openmp_tag(main_start_tag)]
end_tags = [openmp_tag(main_end_tag)]
clauses = self.some_data_clause_directive(args, start_tags, end_tags, first_clause, has_loop=True)
#sblk = self.blocks[self.blk_start]
#scope = sblk.scope
#eblk = self.blocks[self.blk_end]
#clauses, default_shared = self.flatten(args[first_clause:], sblk)

#if config.DEBUG_OPENMP >= 1:
# print("visit", main_start_tag, args, type(args), default_shared)
# for clause in clauses:
# print("post-process clauses:", clause)

if "PARALLEL" in main_start_tag:
# ---- Back propagate THREAD_LIMIT to enclosed target region. ----
@@ -3969,6 +3959,18 @@ def for_simd_clause(self, args):
args, type(args), args[0])
return args[0]

def schedule_clause(self, args):
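        # New clause visitor added in this commit: return the parsed clause unchanged
        # so the enclosing directive handler processes it; dist_schedule_clause below
        # behaves the same way.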
if config.DEBUG_OPENMP >= 1:
print("visit schedule_clause",
args, type(args), args[0])
return args[0]

def dist_schedule_clause(self, args):
if config.DEBUG_OPENMP >= 1:
print("visit dist_schedule_clause",
args, type(args), args[0])
return args[0]

# Don't need a rule for parallel_for_simd_construct.

def parallel_for_simd_directive(self, args):
@@ -4071,7 +4073,7 @@ def map_clause(self, args):
assert(len(args) == 2)
else:
map_type = "TOFROM" # is this default right? FIX ME
var_list = args
var_list = args[0]
ret = []
for var in var_list:
ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var))
@@ -4267,7 +4269,7 @@ def teams_back_prop(self, clauses):
def check_distribute_nesting(self, dir_tag):
if "DISTRIBUTE" in dir_tag and "TEAMS" not in dir_tag:
enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[0].tags[0].name:
if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[-1].tags[0].name:
raise NotImplementedError("DISTRIBUTE must be nested under or combined with TEAMS.")

def teams_directive(self, args):
@@ -4330,10 +4332,11 @@ def target_teams_directive(self, args):
def target_teams_distribute_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE", 3, has_loop=True)

def target_loop_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)

def target_teams_loop_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True)
#self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP.SIMD", 3, has_loop=True)
#self.some_target_directive(args, "TARGET.TEAMS.LOOP", 3, has_loop=True)

def target_teams_distribute_parallel_for_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 5, has_loop=True)
@@ -4415,6 +4418,26 @@ def teams_distribute_directive(self, args):
def teams_distribute_simd_directive(self, args):
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.SIMD", 3, has_loop=True)

def teams_loop_directive(self, args):
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)

def loop_directive(self, args):
        # TODO: Add error checking for the clauses that the parser accepts here if we
        # find that the loop directive can even take clauses, which we are not sure it can.
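        # Dispatch a bare "loop" directive based on the innermost enclosing region:
        #   no enclosing OpenMP region        -> standalone PARALLEL.LOOP
        #   enclosing DISTRIBUTE              -> PARALLEL.LOOP
        #   enclosing TEAMS                   -> DISTRIBUTE.PARALLEL.LOOP
        #   enclosing TARGET (without TEAMS)  -> TEAMS.DISTRIBUTE.PARALLEL.LOOP
        #   any other enclosing region        -> PARALLEL.LOOP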
enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
if not enclosing_regions or len(enclosing_regions) < 1:
self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)
else:
if "DISTRIBUTE" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "PARALLEL.LOOP", 1, has_loop=True)
elif "TEAMS" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
else:
if "TARGET" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
else:
self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)

def distribute_directive(self, args):
self.some_distribute_directive(args, "DISTRIBUTE", 1, has_loop=True)

@@ -4453,8 +4476,6 @@ def some_distribute_directive(self, args, dir_tag, lexer_count, has_loop=False):
start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
self.teams_back_prop(clauses)
elif "PARALLEL" in dir_tag:
if len(self.get_clauses_by_name(clauses, "QUAL.OMP.THREAD_LIMIT")) == 0:
start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
self.parallel_back_prop(clauses)

if config.DEBUG_OPENMP >= 1:
@@ -4796,13 +4817,6 @@ def target_teams_distribute_parallel_for_clause(self, args):
print(args[0][0])
return args[0]

def target_teams_loop_clause(self, args):
if config.DEBUG_OPENMP >= 1:
print("visit target_teams_loop_clause", args, type(args), args[0])
if isinstance(args[0], list):
print(args[0][0])
return args[0]

# Don't need a rule for target_update_construct.

def target_update_directive(self, args):
@@ -5514,12 +5528,15 @@ def NUMBER(self, args):
| teams_distribute_simd_construct
| teams_distribute_parallel_for_construct
| teams_distribute_parallel_for_simd_construct
| loop_construct
| teams_loop_construct
| target_construct
| target_teams_construct
| target_teams_distribute_construct
| target_teams_distribute_simd_construct
| target_teams_distribute_parallel_for_simd_construct
| target_teams_distribute_parallel_for_construct
| target_loop_construct
| target_teams_loop_construct
| target_enter_data_construct
| target_exit_data_construct
@@ -5539,8 +5556,6 @@ def NUMBER(self, args):
| parallel_sections_construct
| master_construct
| ordered_construct
//teams_distribute_parallel_for_simd_clause: target_clause
// | teams_distribute_parallel_for_simd_clause
for_simd_construct: for_simd_directive
for_simd_directive: FOR SIMD [for_simd_clause*]
for_simd_clause: for_clause
@@ -5735,6 +5750,9 @@ def NUMBER(self, args):
target_teams_distribute_parallel_for_construct: target_teams_distribute_parallel_for_directive
teams_distribute_parallel_for_construct: teams_distribute_parallel_for_directive
teams_distribute_parallel_for_simd_construct: teams_distribute_parallel_for_simd_directive
loop_construct: loop_directive
teams_loop_construct: teams_loop_directive
target_loop_construct: target_loop_directive
target_teams_loop_construct: target_teams_loop_directive
target_teams_construct: target_teams_directive
target_teams_distribute_construct: target_teams_distribute_directive
@@ -5903,30 +5921,10 @@ def NUMBER(self, args):
ompx_attribute: OMPX_ATTRIBUTE "(" PYTHON_NAME "(" number_list ")" ")"
OMPX_ATTRIBUTE: "ompx_attribute"
//target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_loop_clause*]
target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_simd_clause*]
target_teams_loop_clause: if_clause
| device_clause
| private_clause
| firstprivate_clause
// | in_reduction_clause
| map_clause
| is_device_ptr_clause
// | defaultmap_clause
| NOWAIT
| allocate_clause
| depend_with_modifier_clause
// | uses_allocators_clause
| num_teams_clause
| thread_limit_clause
| data_default_clause
| data_sharing_clause
// | reduction_default_only_clause
// | bind_clause
| collapse_clause
| ORDERED
| lastprivate_clause
| ompx_attribute
loop_directive: LOOP [teams_distribute_parallel_for_clause*]
teams_loop_directive: TEAMS LOOP [teams_distribute_parallel_for_clause*]
target_loop_directive: TARGET LOOP [target_teams_distribute_parallel_for_clause*]
target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_clause*]
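        // The loop-family directives above reuse the existing clause lists from the
        // corresponding (target_)teams_distribute_parallel_for rules rather than
        // defining dedicated clause sets.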
target_teams_directive: TARGET TEAMS [target_teams_clause*]
target_teams_clause: if_clause
@@ -6149,8 +6147,7 @@ def NUMBER(self, args):
for_directive: FOR [for_clause*]
for_clause: unique_for_clause | data_clause | NOWAIT
unique_for_clause: ORDERED
| sched_no_expr
| sched_expr
| schedule_clause
| collapse_clause
LINEAR: "linear"
linear_clause: LINEAR "(" var_list ":" const_num_or_var ")"
40 changes: 40 additions & 0 deletions numba/tests/test_openmp.py
@@ -2953,6 +2953,10 @@ class TestOpenmpTarget(TestOpenmpBase):
def __init__(self, *args):
TestOpenmpBase.__init__(self, *args)

@classmethod
def is_testing_cpu(cls):
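        # Device number 1 denotes the host CPU target in this test suite; per the
        # commit message, this lets individual tests be skipped only when running
        # on device 1.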
return 1 in cls.devices

# How to check for nowait?
# Currently checks only compilation.
# Numba optimizes the whole target away? This runs too fast.
@@ -4409,6 +4413,25 @@ def test_impl(n):
c = test_impl(n)
np.testing.assert_array_equal(c, np.full((n,n), 2))

def target_nest_teams_nest_loop_collapse(self, device):
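        # Added in this commit: exercises a bare "loop" directive with collapse(2)
        # nested inside explicit target and teams regions.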
target_pragma = f"""target device({device}) map(tofrom: a, b, c)"""
@njit
def test_impl(n):
a = np.ones((n,n))
b = np.ones((n,n))
c = np.zeros((n,n))
with openmp(target_pragma):
with openmp("teams"):
with openmp("loop collapse(2)"):
for i in range(n):
for j in range(n):
c[i,j] = a[i,j] + b[i,j]
return c

n = 10
c = test_impl(n)
np.testing.assert_array_equal(c, np.full((n,n), 2))


for memberName in dir(TestOpenmpTarget):
if memberName.startswith("target"):
@@ -4462,6 +4485,23 @@ def test_impl(num_steps):

self.check(test_impl, 100000)

def test_pi_loop_directive(self):
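        # Added in this commit: exercises the standalone "loop" directive with a
        # reduction and the newly parsed schedule clause.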
def test_impl(num_steps):
step = 1.0 / num_steps

the_sum = 0.0
omp_set_num_threads(4)

with openmp("loop reduction(+:the_sum) schedule(static)"):
for j in range(num_steps):
x = ((j-1) - 0.5) * step
the_sum += 4.0 / (1.0 + x * x)

pi = step * the_sum
return pi

self.check(test_impl, 100000)

def test_pi_spmd(self):
def test_impl(num_steps):
step = 1.0 / num_steps