diff --git a/numba/openmp.py b/numba/openmp.py
index ef2633c46..f9d165dce 100644
--- a/numba/openmp.py
+++ b/numba/openmp.py
@@ -515,7 +515,7 @@ def replace_vars_inner(self, var_dict):

     def add_to_usedef_set(self, use_set, def_set, start):
         assert start==True or start==False
-        if config.DEBUG_OPENMP >= 1:
+        if config.DEBUG_OPENMP >= 3:
             print("add_to_usedef_set", start, self.name, "is_dsa=", is_dsa(self.name))

         def add_arg(arg, the_set):
@@ -3360,6 +3360,7 @@ def get_loops_in_region(all_loops):
         collapse_tags = get_tags_of_type(clauses, "QUAL.OMP.COLLAPSE")
         new_stmts_for_iterspace = []
         collapse_iterspace_block = set()
+        iterspace_vars = []
         if len(collapse_tags) > 0:
             # Limit all_loops to just loops within the openmp region.
             all_loops = get_loops_in_region(all_loops)
@@ -3469,7 +3470,6 @@ def get_loops_in_region(all_loops):
             new_var_scope = last_loop_entry_block.body[0].target.scope

             # -------- Add vars to remember cumulative product of iteration space sizes.
-            iterspace_vars = []
             new_iterspace_var = new_var_scope.redefine("new_iterspace0", self.loc)
             start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name))
             iterspace_vars.append(new_iterspace_var)
@@ -3873,8 +3873,7 @@ def _get_loop_kind(func_var, call_table):
             start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_start_var.name))
             start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_lb_var.name))
             start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_ub_var.name))
-            tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
-            #tags_for_enclosing = [omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
+            tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name, get_itercount_var.name] + [x.name for x in iterspace_vars]
             tags_for_enclosing = [openmp_tag("QUAL.OMP.PRIVATE", x) for x in tags_for_enclosing]
             # Don't blindly copy code here...this isn't doing what the other spots are doing with privatization.
             #self.add_private_to_enclosing(replace_vardict, tags_for_enclosing)
@@ -3891,15 +3890,6 @@ def some_for_directive(self, args, main_start_tag, main_end_tag, first_clause, g
         start_tags = [openmp_tag(main_start_tag)]
         end_tags = [openmp_tag(main_end_tag)]
         clauses = self.some_data_clause_directive(args, start_tags, end_tags, first_clause, has_loop=True)
-        #sblk = self.blocks[self.blk_start]
-        #scope = sblk.scope
-        #eblk = self.blocks[self.blk_end]
-        #clauses, default_shared = self.flatten(args[first_clause:], sblk)
-
-        #if config.DEBUG_OPENMP >= 1:
-        #    print("visit", main_start_tag, args, type(args), default_shared)
-        #    for clause in clauses:
-        #        print("post-process clauses:", clause)

         if "PARALLEL" in main_start_tag:
             # ---- Back propagate THREAD_LIMIT to enclosed target region. ----
@@ -3969,6 +3959,18 @@ def for_simd_clause(self, args):
                   args, type(args), args[0])
         return args[0]

+    def schedule_clause(self, args):
+        if config.DEBUG_OPENMP >= 1:
+            print("visit schedule_clause",
+                  args, type(args), args[0])
+        return args[0]
+
+    def dist_schedule_clause(self, args):
+        if config.DEBUG_OPENMP >= 1:
+            print("visit dist_schedule_clause",
+                  args, type(args), args[0])
+        return args[0]
+
     # Don't need a rule for parallel_for_simd_construct.
     def parallel_for_simd_directive(self, args):
@@ -4071,7 +4073,7 @@ def map_clause(self, args):
             assert(len(args) == 2)
         else:
             map_type = "TOFROM" # is this default right? FIX ME
-        var_list = args
+        var_list = args[0]
         ret = []
         for var in var_list:
             ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var))
@@ -4267,7 +4269,7 @@ def teams_back_prop(self, clauses):
     def check_distribute_nesting(self, dir_tag):
         if "DISTRIBUTE" in dir_tag and "TEAMS" not in dir_tag:
             enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
-            if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[0].tags[0].name:
+            if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[-1].tags[0].name:
                 raise NotImplementedError("DISTRIBUTE must be nested under or combined with TEAMS.")

     def teams_directive(self, args):
@@ -4330,10 +4332,11 @@ def target_teams_directive(self, args):
     def target_teams_distribute_directive(self, args):
         self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE", 3, has_loop=True)

+    def target_loop_directive(self, args):
+        self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)
+
     def target_teams_loop_directive(self, args):
         self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True)
-        #self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP.SIMD", 3, has_loop=True)
-        #self.some_target_directive(args, "TARGET.TEAMS.LOOP", 3, has_loop=True)

     def target_teams_distribute_parallel_for_directive(self, args):
         self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 5, has_loop=True)
@@ -4415,6 +4418,26 @@ def teams_distribute_directive(self, args):
     def teams_distribute_simd_directive(self, args):
         self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.SIMD", 3, has_loop=True)

+    def teams_loop_directive(self, args):
+        self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)
+
+    def loop_directive(self, args):
+        # TODO: Add error checking of the clauses the parser accepts here, if we find
+        # that loop can even take clauses, which we're not sure that it can.
+        enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
+        if not enclosing_regions or len(enclosing_regions) < 1:
+            self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)
+        else:
+            if "DISTRIBUTE" in enclosing_regions[-1].tags[0].name:
+                self.some_distribute_directive(args, "PARALLEL.LOOP", 1, has_loop=True)
+            elif "TEAMS" in enclosing_regions[-1].tags[0].name:
+                self.some_distribute_directive(args, "DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
+            else:
+                if "TARGET" in enclosing_regions[-1].tags[0].name:
+                    self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
+                else:
+                    self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)
+
     def distribute_directive(self, args):
         self.some_distribute_directive(args, "DISTRIBUTE", 1, has_loop=True)
@@ -4453,8 +4476,6 @@ def some_distribute_directive(self, args, dir_tag, lexer_count, has_loop=False):
                 start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
             self.teams_back_prop(clauses)
         elif "PARALLEL" in dir_tag:
-            if len(self.get_clauses_by_name(clauses, "QUAL.OMP.THREAD_LIMIT")) == 0:
-                start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
             self.parallel_back_prop(clauses)

         if config.DEBUG_OPENMP >= 1:
@@ -4796,13 +4817,6 @@ def target_teams_distribute_parallel_for_clause(self, args):
                 print(args[0][0])
         return args[0]

-    def target_teams_loop_clause(self, args):
-        if config.DEBUG_OPENMP >= 1:
-            print("visit target_teams_loop_clause", args, type(args), args[0])
-            if isinstance(args[0], list):
-                print(args[0][0])
-        return args[0]
-
     # Don't need a rule for target_update_construct.

     def target_update_directive(self, args):
@@ -5514,12 +5528,15 @@ def NUMBER(self, args):
                      | teams_distribute_simd_construct
                      | teams_distribute_parallel_for_construct
                      | teams_distribute_parallel_for_simd_construct
+                     | loop_construct
+                     | teams_loop_construct
                      | target_construct
                      | target_teams_construct
                      | target_teams_distribute_construct
                      | target_teams_distribute_simd_construct
                      | target_teams_distribute_parallel_for_simd_construct
                      | target_teams_distribute_parallel_for_construct
+                     | target_loop_construct
                      | target_teams_loop_construct
                      | target_enter_data_construct
                      | target_exit_data_construct
@@ -5539,8 +5556,6 @@ def NUMBER(self, args):
                      | parallel_sections_construct
                      | master_construct
                      | ordered_construct
-    //teams_distribute_parallel_for_simd_clause: target_clause
-    //                     | teams_distribute_parallel_for_simd_clause
     for_simd_construct: for_simd_directive
     for_simd_directive: FOR SIMD [for_simd_clause*]
     for_simd_clause: for_clause
@@ -5735,6 +5750,9 @@ def NUMBER(self, args):
     target_teams_distribute_parallel_for_construct: target_teams_distribute_parallel_for_directive
     teams_distribute_parallel_for_construct: teams_distribute_parallel_for_directive
     teams_distribute_parallel_for_simd_construct: teams_distribute_parallel_for_simd_directive
+    loop_construct: loop_directive
+    teams_loop_construct: teams_loop_directive
+    target_loop_construct: target_loop_directive
     target_teams_loop_construct: target_teams_loop_directive
     target_teams_construct: target_teams_directive
     target_teams_distribute_construct: target_teams_distribute_directive
@@ -5903,30 +5921,10 @@ def NUMBER(self, args):
     ompx_attribute: OMPX_ATTRIBUTE "(" PYTHON_NAME "(" number_list ")" ")"
     OMPX_ATTRIBUTE: "ompx_attribute"
-    //target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_loop_clause*]
-    target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_simd_clause*]
-    target_teams_loop_clause: if_clause
-                     | device_clause
-                     | private_clause
-                     | firstprivate_clause
-                    // | in_reduction_clause
-                     | map_clause
-                     | is_device_ptr_clause
-                    // | defaultmap_clause
-                     | NOWAIT
-                     | allocate_clause
-                     | depend_with_modifier_clause
-                    // | uses_allocators_clause
-                     | num_teams_clause
-                     | thread_limit_clause
-                     | data_default_clause
-                     | data_sharing_clause
-                    // | reduction_default_only_clause
-                    // | bind_clause
-                     | collapse_clause
-                     | ORDERED
-                     | lastprivate_clause
-                     | ompx_attribute
+    loop_directive: LOOP [teams_distribute_parallel_for_clause*]
+    teams_loop_directive: TEAMS LOOP [teams_distribute_parallel_for_clause*]
+    target_loop_directive: TARGET LOOP [target_teams_distribute_parallel_for_clause*]
+    target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_clause*]

     target_teams_directive: TARGET TEAMS [target_teams_clause*]
     target_teams_clause: if_clause
@@ -6149,8 +6147,7 @@ def NUMBER(self, args):
     for_directive: FOR [for_clause*]
     for_clause: unique_for_clause | data_clause | NOWAIT
     unique_for_clause: ORDERED
-                     | sched_no_expr
-                     | sched_expr
+                     | schedule_clause
                      | collapse_clause
     LINEAR: "linear"
     linear_clause: LINEAR "(" var_list ":" const_num_or_var ")"
diff --git a/numba/tests/test_openmp.py b/numba/tests/test_openmp.py
index d66a8a514..316381cb5 100644
--- a/numba/tests/test_openmp.py
+++ b/numba/tests/test_openmp.py
@@ -2953,6 +2953,10 @@ class TestOpenmpTarget(TestOpenmpBase):
     def __init__(self, *args):
         TestOpenmpBase.__init__(self, *args)

+    @classmethod
+    def is_testing_cpu(cls):
+        return 1 in cls.devices
+
     # How to check for nowait?
     # Currently checks only compilation.
     # Numba optimizes the whole target away? This runs too fast.
@@ -4409,6 +4413,25 @@ def test_impl(n):
         c = test_impl(n)
         np.testing.assert_array_equal(c, np.full((n,n), 2))

+    def target_nest_teams_nest_loop_collapse(self, device):
+        target_pragma = f"""target device({device}) map(tofrom: a, b, c)"""
+        @njit
+        def test_impl(n):
+            a = np.ones((n,n))
+            b = np.ones((n,n))
+            c = np.zeros((n,n))
+            with openmp(target_pragma):
+                with openmp("teams"):
+                    with openmp("loop collapse(2)"):
+                        for i in range(n):
+                            for j in range(n):
+                                c[i,j] = a[i,j] + b[i,j]
+            return c
+
+        n = 10
+        c = test_impl(n)
+        np.testing.assert_array_equal(c, np.full((n,n), 2))
+

 for memberName in dir(TestOpenmpTarget):
     if memberName.startswith("target"):
@@ -4462,6 +4485,23 @@ def test_impl(num_steps):

         self.check(test_impl, 100000)

+    def test_pi_loop_directive(self):
+        def test_impl(num_steps):
+            step = 1.0 / num_steps
+
+            the_sum = 0.0
+            omp_set_num_threads(4)
+
+            with openmp("loop reduction(+:the_sum) schedule(static)"):
+                for j in range(num_steps):
+                    x = ((j-1) - 0.5) * step
+                    the_sum += 4.0 / (1.0 + x * x)
+
+            pi = step * the_sum
+            return pi
+
+        self.check(test_impl, 100000)
+
     def test_pi_spmd(self):
         def test_impl(num_steps):
             step = 1.0 / num_steps
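
For context, below is a minimal usage sketch of the loop-family directives this patch teaches the parser, modeled on the new target_nest_teams_nest_loop_collapse test above; the import style and the function name add_2d are illustrative, not part of the patch.

    import numpy as np
    from numba import njit
    from numba.openmp import openmp_context as openmp

    @njit
    def add_2d(n):
        a = np.ones((n, n))
        b = np.ones((n, n))
        c = np.zeros((n, n))
        # With an enclosing teams region, loop_directive lowers "loop" to
        # DISTRIBUTE.PARALLEL.LOOP; with no enclosing OpenMP region it falls
        # back to DIR.OMP.PARALLEL.LOOP (a plain parallel loop).
        with openmp("target map(tofrom: a, b, c)"):
            with openmp("teams"):
                with openmp("loop collapse(2)"):
                    for i in range(n):
                        for j in range(n):
                            c[i, j] = a[i, j] + b[i, j]
        return c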