More loop variants. (#33)
* Add loop directive variants
* Add tests
* Add a way to skip a test only for device=1
* Remove old comments
* Process schedule_clause and dist_schedule_clause from the grammar
DrTodd13 authored Oct 1, 2024
1 parent fb08a5d commit e86d157
Showing 2 changed files with 91 additions and 54 deletions.
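As a quick illustration of what this commit enables, here is a minimal usage sketch mirroring the tests added below. It is not part of the commit: the import lines are assumptions based on the usual pyomp style (the test file uses openmp and njit directly), and the function name is illustrative.

    import numpy as np
    from numba import njit                                 # assumed import path
    from numba.openmp import openmp_context as openmp      # assumed alias, as in typical pyomp examples

    @njit
    def vec_add(n):
        a = np.ones((n, n))
        b = np.ones((n, n))
        c = np.zeros((n, n))
        with openmp("target map(tofrom: a, b, c)"):
            with openmp("teams"):
                # New in this commit: a bare "loop" nested under teams is lowered to a
                # DISTRIBUTE.PARALLEL.LOOP-style schedule (see loop_directive in the diff).
                with openmp("loop collapse(2)"):
                    for i in range(n):
                        for j in range(n):
                            c[i, j] = a[i, j] + b[i, j]
        return c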
105 changes: 51 additions & 54 deletions numba/openmp.py
@@ -515,7 +515,7 @@ def replace_vars_inner(self, var_dict):

def add_to_usedef_set(self, use_set, def_set, start):
assert start==True or start==False
if config.DEBUG_OPENMP >= 1:
if config.DEBUG_OPENMP >= 3:
print("add_to_usedef_set", start, self.name, "is_dsa=", is_dsa(self.name))

def add_arg(arg, the_set):
@@ -3360,6 +3360,7 @@ def get_loops_in_region(all_loops):
collapse_tags = get_tags_of_type(clauses, "QUAL.OMP.COLLAPSE")
new_stmts_for_iterspace = []
collapse_iterspace_block = set()
iterspace_vars = []
if len(collapse_tags) > 0:
# Limit all_loops to just loops within the openmp region.
all_loops = get_loops_in_region(all_loops)
@@ -3469,7 +3470,6 @@ def get_loops_in_region(all_loops):
new_var_scope = last_loop_entry_block.body[0].target.scope

# -------- Add vars to remember cumulative product of iteration space sizes.
iterspace_vars = []
new_iterspace_var = new_var_scope.redefine("new_iterspace0", self.loc)
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name))
iterspace_vars.append(new_iterspace_var)
@@ -3873,8 +3873,7 @@ def _get_loop_kind(func_var, call_table):
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_start_var.name))
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_lb_var.name))
start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_ub_var.name))
tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
#tags_for_enclosing = [omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name]
tags_for_enclosing = [cmp_var.name, omp_lb_var.name, omp_start_var.name, omp_iv_var.name, types_mod_var.name, int64_var.name, itercount_var.name, omp_ub_var.name, const1_var.name, const1_latch_var.name, get_itercount_var.name] + [x.name for x in iterspace_vars]
tags_for_enclosing = [openmp_tag("QUAL.OMP.PRIVATE", x) for x in tags_for_enclosing]
# Don't blindly copy code here...this isn't doing what the other spots are doing with privatization.
#self.add_private_to_enclosing(replace_vardict, tags_for_enclosing)
@@ -3891,15 +3890,6 @@ def some_for_directive(self, args, main_start_tag, main_end_tag, first_clause, g
start_tags = [openmp_tag(main_start_tag)]
end_tags = [openmp_tag(main_end_tag)]
clauses = self.some_data_clause_directive(args, start_tags, end_tags, first_clause, has_loop=True)
#sblk = self.blocks[self.blk_start]
#scope = sblk.scope
#eblk = self.blocks[self.blk_end]
#clauses, default_shared = self.flatten(args[first_clause:], sblk)

#if config.DEBUG_OPENMP >= 1:
# print("visit", main_start_tag, args, type(args), default_shared)
# for clause in clauses:
# print("post-process clauses:", clause)

if "PARALLEL" in main_start_tag:
# ---- Back propagate THREAD_LIMIT to enclosed target region. ----
@@ -3969,6 +3959,18 @@ def for_simd_clause(self, args):
args, type(args), args[0])
return args[0]

def schedule_clause(self, args):
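        # New clause visitor added in this commit: return the parsed clause unchanged
        # so the enclosing directive handler processes it; dist_schedule_clause below
        # behaves the same way.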
if config.DEBUG_OPENMP >= 1:
print("visit schedule_clause",
args, type(args), args[0])
return args[0]

def dist_schedule_clause(self, args):
if config.DEBUG_OPENMP >= 1:
print("visit dist_schedule_clause",
args, type(args), args[0])
return args[0]

# Don't need a rule for parallel_for_simd_construct.

def parallel_for_simd_directive(self, args):
@@ -4071,7 +4073,7 @@ def map_clause(self, args):
assert(len(args) == 2)
else:
map_type = "TOFROM" # is this default right? FIX ME
var_list = args
var_list = args[0]
ret = []
for var in var_list:
ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var))
@@ -4267,7 +4269,7 @@ def teams_back_prop(self, clauses):
def check_distribute_nesting(self, dir_tag):
if "DISTRIBUTE" in dir_tag and "TEAMS" not in dir_tag:
enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[0].tags[0].name:
if len(enclosing_regions) < 1 or "TEAMS" not in enclosing_regions[-1].tags[0].name:
raise NotImplementedError("DISTRIBUTE must be nested under or combined with TEAMS.")

def teams_directive(self, args):
@@ -4330,10 +4332,11 @@ def target_teams_directive(self, args):
def target_teams_distribute_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE", 3, has_loop=True)

def target_loop_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)

def target_teams_loop_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True)
#self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP.SIMD", 3, has_loop=True)
#self.some_target_directive(args, "TARGET.TEAMS.LOOP", 3, has_loop=True)

def target_teams_distribute_parallel_for_directive(self, args):
self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 5, has_loop=True)
@@ -4415,6 +4418,26 @@ def teams_distribute_directive(self, args):
def teams_distribute_simd_directive(self, args):
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.SIMD", 3, has_loop=True)

def teams_loop_directive(self, args):
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True)

def loop_directive(self, args):
        # TODO: Add error checking for the clauses that the parser accepts here if we
        # find that the loop directive can even take clauses, which we are not sure it can.
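        # Dispatch a bare "loop" directive based on the innermost enclosing region:
        #   no enclosing OpenMP region        -> standalone PARALLEL.LOOP
        #   enclosing DISTRIBUTE              -> PARALLEL.LOOP
        #   enclosing TEAMS                   -> DISTRIBUTE.PARALLEL.LOOP
        #   enclosing TARGET (without TEAMS)  -> TEAMS.DISTRIBUTE.PARALLEL.LOOP
        #   any other enclosing region        -> PARALLEL.LOOP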
enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start)
if not enclosing_regions or len(enclosing_regions) < 1:
self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)
else:
if "DISTRIBUTE" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "PARALLEL.LOOP", 1, has_loop=True)
elif "TEAMS" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
else:
if "TARGET" in enclosing_regions[-1].tags[0].name:
self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True)
else:
self.some_for_directive(args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True)

def distribute_directive(self, args):
self.some_distribute_directive(args, "DISTRIBUTE", 1, has_loop=True)

@@ -4453,8 +4476,6 @@ def some_distribute_directive(self, args, dir_tag, lexer_count, has_loop=False):
start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
self.teams_back_prop(clauses)
elif "PARALLEL" in dir_tag:
if len(self.get_clauses_by_name(clauses, "QUAL.OMP.THREAD_LIMIT")) == 0:
start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0))
self.parallel_back_prop(clauses)

if config.DEBUG_OPENMP >= 1:
@@ -4796,13 +4817,6 @@ def target_teams_distribute_parallel_for_clause(self, args):
print(args[0][0])
return args[0]

def target_teams_loop_clause(self, args):
if config.DEBUG_OPENMP >= 1:
print("visit target_teams_loop_clause", args, type(args), args[0])
if isinstance(args[0], list):
print(args[0][0])
return args[0]

# Don't need a rule for target_update_construct.

def target_update_directive(self, args):
@@ -5514,12 +5528,15 @@ def NUMBER(self, args):
| teams_distribute_simd_construct
| teams_distribute_parallel_for_construct
| teams_distribute_parallel_for_simd_construct
| loop_construct
| teams_loop_construct
| target_construct
| target_teams_construct
| target_teams_distribute_construct
| target_teams_distribute_simd_construct
| target_teams_distribute_parallel_for_simd_construct
| target_teams_distribute_parallel_for_construct
| target_loop_construct
| target_teams_loop_construct
| target_enter_data_construct
| target_exit_data_construct
@@ -5539,8 +5556,6 @@ def NUMBER(self, args):
| parallel_sections_construct
| master_construct
| ordered_construct
//teams_distribute_parallel_for_simd_clause: target_clause
// | teams_distribute_parallel_for_simd_clause
for_simd_construct: for_simd_directive
for_simd_directive: FOR SIMD [for_simd_clause*]
for_simd_clause: for_clause
@@ -5735,6 +5750,9 @@ def NUMBER(self, args):
target_teams_distribute_parallel_for_construct: target_teams_distribute_parallel_for_directive
teams_distribute_parallel_for_construct: teams_distribute_parallel_for_directive
teams_distribute_parallel_for_simd_construct: teams_distribute_parallel_for_simd_directive
loop_construct: loop_directive
teams_loop_construct: teams_loop_directive
target_loop_construct: target_loop_directive
target_teams_loop_construct: target_teams_loop_directive
target_teams_construct: target_teams_directive
target_teams_distribute_construct: target_teams_distribute_directive
@@ -5903,30 +5921,10 @@ def NUMBER(self, args):
ompx_attribute: OMPX_ATTRIBUTE "(" PYTHON_NAME "(" number_list ")" ")"
OMPX_ATTRIBUTE: "ompx_attribute"
//target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_loop_clause*]
target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_simd_clause*]
target_teams_loop_clause: if_clause
| device_clause
| private_clause
| firstprivate_clause
// | in_reduction_clause
| map_clause
| is_device_ptr_clause
// | defaultmap_clause
| NOWAIT
| allocate_clause
| depend_with_modifier_clause
// | uses_allocators_clause
| num_teams_clause
| thread_limit_clause
| data_default_clause
| data_sharing_clause
// | reduction_default_only_clause
// | bind_clause
| collapse_clause
| ORDERED
| lastprivate_clause
| ompx_attribute
loop_directive: LOOP [teams_distribute_parallel_for_clause*]
teams_loop_directive: TEAMS LOOP [teams_distribute_parallel_for_clause*]
target_loop_directive: TARGET LOOP [target_teams_distribute_parallel_for_clause*]
target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_clause*]
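        // The loop-family directives above reuse the existing clause lists from the
        // corresponding (target_)teams_distribute_parallel_for rules rather than
        // defining dedicated clause sets.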
target_teams_directive: TARGET TEAMS [target_teams_clause*]
target_teams_clause: if_clause
@@ -6149,8 +6147,7 @@ def NUMBER(self, args):
for_directive: FOR [for_clause*]
for_clause: unique_for_clause | data_clause | NOWAIT
unique_for_clause: ORDERED
| sched_no_expr
| sched_expr
| schedule_clause
| collapse_clause
LINEAR: "linear"
linear_clause: LINEAR "(" var_list ":" const_num_or_var ")"
40 changes: 40 additions & 0 deletions numba/tests/test_openmp.py
@@ -2953,6 +2953,10 @@ class TestOpenmpTarget(TestOpenmpBase):
def __init__(self, *args):
TestOpenmpBase.__init__(self, *args)

@classmethod
def is_testing_cpu(cls):
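        # Device number 1 denotes the host CPU target in this test suite; per the
        # commit message, this lets individual tests be skipped only when running
        # on device 1.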
return 1 in cls.devices

# How to check for nowait?
# Currently checks only compilation.
# Numba optimizes the whole target away? This runs too fast.
@@ -4409,6 +4413,25 @@ def test_impl(n):
c = test_impl(n)
np.testing.assert_array_equal(c, np.full((n,n), 2))

def target_nest_teams_nest_loop_collapse(self, device):
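        # Added in this commit: exercises a bare "loop" directive with collapse(2)
        # nested inside explicit target and teams regions.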
target_pragma = f"""target device({device}) map(tofrom: a, b, c)"""
@njit
def test_impl(n):
a = np.ones((n,n))
b = np.ones((n,n))
c = np.zeros((n,n))
with openmp(target_pragma):
with openmp("teams"):
with openmp("loop collapse(2)"):
for i in range(n):
for j in range(n):
c[i,j] = a[i,j] + b[i,j]
return c

n = 10
c = test_impl(n)
np.testing.assert_array_equal(c, np.full((n,n), 2))


for memberName in dir(TestOpenmpTarget):
if memberName.startswith("target"):
@@ -4462,6 +4485,23 @@ def test_impl(num_steps):

self.check(test_impl, 100000)

def test_pi_loop_directive(self):
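        # Added in this commit: exercises the standalone "loop" directive with a
        # reduction and the newly parsed schedule clause.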
def test_impl(num_steps):
step = 1.0 / num_steps

the_sum = 0.0
omp_set_num_threads(4)

with openmp("loop reduction(+:the_sum) schedule(static)"):
for j in range(num_steps):
x = ((j-1) - 0.5) * step
the_sum += 4.0 / (1.0 + x * x)

pi = step * the_sum
return pi

self.check(test_impl, 100000)

def test_pi_spmd(self):
def test_impl(num_steps):
step = 1.0 / num_steps