From d0918dce3567bcfe6f007085a2d5b1781dd24b10 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 14:23:02 -0700 Subject: [PATCH 1/6] Fix logical accessible CPU set debug message Don't crash if the logical accessible CPU set for a locale isn't set. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 192ecd32428c..92925bcde09b 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -704,7 +704,11 @@ static void partitionResources(void) { if (debug) { for (int i = 0; i < numLocalesOnNode; i++) { char buf[1024]; - hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); + if (logAccSets[i] != NULL) { + hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); + } else { + strncpy(buf, "unknown", sizeof(buf)); + } _DBG_P("logAccSets[%d]: %s", i, buf); } } From ef3f0a1bc5bd55207d3bcf60deeaf684caa81ead Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 14:40:07 -0700 Subject: [PATCH 2/6] Dump distance matrix to debugging output Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 92925bcde09b..e14a11681a9a 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1324,6 +1324,15 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, } } } +#ifdef DEBUG + printf("distances:\n"); + for (int i = 0; i < numLocales; i++) { + for (int j = 0; j < numObjs; j++) { + printf("%02d ", distances[i][j]); + } + printf("\n"); + } +#endif } From 840e56dbcf26cf739e1990bbd6aa142404358faa Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 15:06:58 -0700 Subject: [PATCH 3/6] Minimum distance might be the maximum Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index e14a11681a9a..d0c777e098d2 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1439,7 +1439,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, for (int j = 0; j < numNics; j++) { _DBG_P("used[%d] = %d, distances[%d][%d] = %d", j, used[j], i, j, distances[i][j]); - if ((!used[j]) && (distances[i][j] < minimum)) { + if ((!used[j]) && (distances[i][j] <= minimum)) { minimum = distances[i][j]; minLoc = i; minNic = j; @@ -1512,6 +1512,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int owners[numDevs]; // locale that owns each device hwloc_obj_t objs[numDevs]; // the device objects int devsPerLocale = numDevs / numLocales; + _DBG_P("devsPerLocale = %d", devsPerLocale); int owned[numLocales]; // number of devices each locale owns for (int i = 0; i < numDevs; i++) { @@ -1565,7 +1566,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, for (int i = 0; i < numLocales; i++) { if (owned[i] < devsPerLocale) { for (int j = 0; j < numDevs; j++) { - if ((owners[j] == -1) && (distances[i][j] < minimum)) { + if ((owners[j] == -1) && (distances[i][j] <= minimum)) { minimum = distances[i][j]; minLoc = i; minDev = j; From b49c7457aea49c583edb75f270a0aae04b193f50 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 15:12:41 -0700 Subject: [PATCH 4/6] Partition devices based on number of co-locales, not locales If we are oversubscribed then the locales should share the devices, instead of treating them as co-locales and partitioning the devices Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index d0c777e098d2..4a8c6a772f43 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1507,20 +1507,21 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int numLocales = chpl_get_num_locales_on_node(); _DBG_P("count = %d", *count); _DBG_P("numLocales = %d", numLocales); - if (numLocales > 1) { + int numColocales = chpl_env_rt_get_int("LOCALES_PER_NODE", 0); + if (numColocales > 1) { int numDevs = *count; int owners[numDevs]; // locale that owns each device hwloc_obj_t objs[numDevs]; // the device objects - int devsPerLocale = numDevs / numLocales; + int devsPerLocale = numDevs / numColocales; _DBG_P("devsPerLocale = %d", devsPerLocale); - int owned[numLocales]; // number of devices each locale owns + int owned[numColocales]; // number of devices each co-locale owns for (int i = 0; i < numDevs; i++) { owners[i] = -1; objs[i] = NULL; } - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numColocales; i++) { owned[i] = 0; } @@ -1589,6 +1590,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, assert(j == devsPerLocale); *count = devsPerLocale; } else { + // No co-locales, use all the devices. for (int i = 0; i < *count; i++) { outAddrs[i] = inAddrs[i]; } From 7a25877568a33a182228e41698a36bdfbfaf0da3 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 8 Oct 2024 12:35:30 -0700 Subject: [PATCH 5/6] Allocate resources based on partitions instead of co-locales If the number of nodes does not evenly divide the number of locales there will be a "remainder node" that has fewer co-locales than the other nodes. Previously, there was some special-casing to deal with the remainder node which was clunky and error-prone. This commit introduces the "partition" abstraction in which the number of partitions on each node is the expected number of co-locales on the node. All nodes, including the remainder node, allocate resources based on partitions, then assign co-locales to partitions. On the remainder node this means that some partitions (and therefore resources) go unused, but this is what we want because all locales should have the same amount of resources. This greatly cleans up the code. In addition, oversubscription handling is cleaner. If there are locales on the node, but the expected number of co-locales is zero, the node is oversubscribed and all locales share all resources. Also added some remainder node and oversubsciption tests. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 239 +++++++++++++----------- test/runtime/jhh/colocales/colocales.c | 38 +++- test/runtime/jhh/colocales/colocales.py | 49 ++++- 3 files changed, 207 insertions(+), 119 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 4a8c6a772f43..cbce3cfa6bfb 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -101,6 +101,11 @@ static hwloc_nodeset_t numaSet = NULL; static hwloc_obj_t myRoot = NULL; +// This is the number of partitions that the resources should be divided into. +// Typically this is the number of co-locales per node, except for the last +// node which might have fewer than that. +static int numPartitions = 1; + // Logical CPU sets for all locales on this node. Entries are NULL if // we don't have that info. static hwloc_cpuset_t *logAccSets = NULL; @@ -272,13 +277,14 @@ void chpl_topo_exit(void) { } if (logAccSets != NULL) { - for (int i = 0; i < chpl_get_num_locales_on_node(); i++) { + for (int i = 0; i < numPartitions; i++) { if (logAccSets[i] != NULL) { hwloc_bitmap_free(logAccSets[i]); } } sys_free(logAccSets); logAccSets = NULL; + numPartitions = 1; } hwloc_topology_destroy(topology); } @@ -531,7 +537,20 @@ static const char *objTypeString(hwloc_obj_type_t t) { } // -// Partitions resources when running with co-locales. +// Partition resources when running with co-locales. This is complicated a bit +// by oversubscription and that the number of locales might not be evenly +// divisable by the number of nodes. If the number of colocales is zero, then +// we are oversubscribed and each locale uses all of the resources available +// to it. Otherwise, the number of locales on the node might be less than the +// expected number of co-locales because the "remainder" node might not have +// its full complement of co-locales. To deal with this, the resources are +// partitioned based on the expected number of co-locales, but then assigned +// to locales based on the number of co-locales that actually exist. This +// ensures that all co-locales on all nodes have the same amount of +// resources. If there are more locales than expected co-locales then the +// user has launched the program manually; just treat the system as +// oversubscribed as it isn't clear how to partition resources in this +// situation. // static void partitionResources(void) { @@ -576,8 +595,18 @@ static void partitionResources(void) { if (numLocalesOnNode > 1) { oversubscribed = true; } - logAccSets = sys_calloc(numLocalesOnNode, sizeof(hwloc_cpuset_t)); - if (numColocales > 0) { + if ((numColocales > 0) && (numLocalesOnNode <= numColocales)){ + numPartitions = numColocales; + } + logAccSets = sys_calloc(numPartitions, sizeof(hwloc_cpuset_t)); + if ((numColocales > 0) && (numLocalesOnNode > numColocales)) { + char msg[200]; + snprintf(msg, sizeof(msg), + "The node has more locales (%d) than co-locales (%d).\n" + "Considering the node oversubscribed.", + numLocalesOnNode, numColocales); + chpl_warning(msg, 0, 0); + } else if (numColocales > 0) { // We get our own socket/NUMA/cache/core object if we have exclusive // access to the node, we know our local rank, and the number of locales // on the node is less than or equal to the number of objects. It is an @@ -601,7 +630,7 @@ static void partitionResources(void) { HWLOC_OBJ_TYPE_MAX}; for (int i = 0; rootTypes[i] != HWLOC_OBJ_TYPE_MAX; i++) { int numObjs = hwloc_get_nbobjs_by_type(topology, rootTypes[i]); - if (numObjs == numColocales) { + if (numObjs == numPartitions) { myRootType = rootTypes[i]; break; } @@ -611,15 +640,15 @@ static void partitionResources(void) { _DBG_P("myRootType: %s", objTypeString(myRootType)); int numCores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); int numObjs = hwloc_get_nbobjs_by_type(topology, myRootType); - if (numObjs < numLocalesOnNode) { + if (numObjs < numPartitions) { char msg[200]; snprintf(msg, sizeof(msg), "Node only has %d %s(s)", numObjs, objTypeString(myRootType)); chpl_error(msg, 0, 0); } - if (numObjs > numLocalesOnNode) { - int coresPerLocale = numCores / numObjs; - unusedCores = (numObjs - numLocalesOnNode) * coresPerLocale; + if (numObjs > numPartitions) { + int coresPerPartition = numCores / numObjs; + unusedCores = (numObjs - numPartitions) * coresPerPartition; } // Use the object whose logical index corresponds to our local rank. @@ -628,11 +657,11 @@ static void partitionResources(void) { _DBG_P("confining ourself to %s %d", objTypeString(myRootType), rank); - // Compute the accessible PUs for all locales on this node based on - // the object each occupies. This is used to determine which NIC each - // locale should use. + // Compute the accessible PUs for all partitions based on the object + // each occupies. This is used to determine which NIC each locale + // should use. - for (int i = 0; i < numLocalesOnNode; i++) { + for (int i = 0; i < numPartitions; i++) { hwloc_obj_t obj; CHK_ERR(obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, myRootType, i)); @@ -642,21 +671,21 @@ static void partitionResources(void) { } } else { // Cores not tied to a root object - int coresPerLocale = numCPUsPhysAcc / numLocalesOnNode; - if (coresPerLocale < 1) { + int coresPerPartition = numCPUsPhysAcc / numPartitions; + if (coresPerPartition < 1) { char msg[200]; snprintf(msg, sizeof(msg), "Cannot run %d co-locales on %d cores.", - numLocalesOnNode, numCPUsPhysAcc); + numPartitions, numCPUsPhysAcc); chpl_error(msg, 0, 0); } - unusedCores = numCPUsPhysAcc % numLocalesOnNode; + unusedCores = numCPUsPhysAcc % numPartitions; int count = 0; int locale = -1; int id; hwloc_bitmap_foreach_begin(id, physAccSet) { if (count == 0) { locale++; - if (locale == numLocalesOnNode) { + if (locale == numPartitions) { break; } CHK_ERR_ERRNO(logAccSets[locale] = hwloc_bitmap_alloc()); @@ -668,7 +697,7 @@ static void partitionResources(void) { HWLOC_OBJ_CORE, pu)); hwloc_bitmap_or(logAccSets[locale], logAccSets[locale], core->cpuset); - count = (count + 1) % coresPerLocale; + count = (count + 1) % coresPerPartition; } hwloc_bitmap_foreach_end(); } if (unusedCores != 0) { @@ -697,12 +726,13 @@ static void partitionResources(void) { oversubscribed = false; } } else { - // We don't know which PUs other locales on the same node are using, - // so just set our own. + // The node is oversubscribed. We will use all accessible PUs, and we + // don't know which PUs other locales on the same node are using, so just + // set our own. logAccSets[0] = hwloc_bitmap_dup(logAccSet); } if (debug) { - for (int i = 0; i < numLocalesOnNode; i++) { + for (int i = 0; i < numPartitions; i++) { char buf[1024]; if (logAccSets[i] != NULL) { hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); @@ -1288,12 +1318,11 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, // Build a distance matrix between locales and objects. - int numLocales = chpl_get_num_locales_on_node(); - _DBG_P("numLocales = %d numObjs = %d", numLocales, numObjs); + _DBG_P("numPartitions = %d numObjs = %d", numPartitions, numObjs); - hwloc_obj_t locales[numLocales]; + hwloc_obj_t locales[numPartitions]; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { if (logAccSets[i] != NULL) { CHK_ERR(locales[i] = hwloc_get_obj_covering_cpuset(topology, logAccSets[i])); @@ -1315,7 +1344,7 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, // is NULL then we don't know which PUs that locale is using, so // we ignore it by setting its distances to infinite. - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { for (int j = 0; j < numObjs; j++) { if (locales[i] != NULL) { distances[i][j] = distance(topology, objs[j], locales[i]); @@ -1326,7 +1355,7 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, } #ifdef DEBUG printf("distances:\n"); - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { for (int j = 0; j < numObjs; j++) { printf("%02d ", distances[i][j]); } @@ -1355,13 +1384,12 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, hwloc_obj_t nic = NULL; chpl_topo_pci_addr_t *result = NULL; - int numLocales = chpl_get_num_locales_on_node(); struct hwloc_pcidev_attr_s *nicAttr; int localRank = chpl_get_local_rank(); _DBG_P("chpl_topo_selectNicByType: %04x:%02x:%02x.%x", inAddr->domain, inAddr->bus, inAddr->device, inAddr->function); - _DBG_P("numLocales %d rank %d", numLocales, localRank); + _DBG_P("numPartitions %d rank %d", numPartitions, localRank); // find the PCI object corresponding to the specified NIC nic = hwloc_get_pcidev_by_busid(topology, (unsigned) inAddr->domain, @@ -1369,7 +1397,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, (unsigned) inAddr->device, (unsigned) inAddr->function); if (nic != NULL) { - if ((numLocales > 1) && (localRank >= 0)) { + if ((numPartitions > 1) && (localRank >= 0)) { // Find all the NICS with the same vendor and device as the specified NIC. nicAttr = &(nic->attr->pcidev); int maxNics = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE); @@ -1395,7 +1423,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // qsort(nics, numNics, sizeof(*nics), comparePCIObjs); - int distances[numLocales][numNics]; + int distances[numPartitions][numNics]; fillDistanceMatrix(numNics, nics, distances); @@ -1406,15 +1434,15 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // share NICs, so mark all NICs as unassigned and repeat the // process. - hwloc_obj_t assigned[numLocales]; // NIC assigned to the locale + hwloc_obj_t assigned[numPartitions]; // NIC assigned to the locale int numAssigned = 0; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { assigned[i] = NULL; } chpl_bool finished = false; - while (!finished && (numAssigned < numLocales)) { + while (!finished && (numAssigned < numPartitions)) { _DBG_P("outer loop: numAssigned %d", numAssigned); // The used array keeps track of NICs that have been assigned in @@ -1427,12 +1455,12 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // assigned a NIC and a NIC that hasn't been assigned in this // iteration ("used") and assign that NIC to that locale. int numAvail = numNics; - while((numAvail > 0) && (numAssigned < numLocales)) { + while((numAvail > 0) && (numAssigned < numPartitions)) { _DBG_P("inner loop: numAssigned %d numAvail %d", numAssigned, numAvail); int minimum = INT32_MAX; int minNic = -1; int minLoc = -1; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { _DBG_P("assigned[%d] = %p", i, assigned[i]); _DBG_P("minimum = %d", minimum); if (!assigned[i]) { @@ -1481,18 +1509,21 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, } // Given the PCI bus addresses of a set of devices, determine which of those -// devices the calling locale should use. Each co-locale is assigned the same -// number of devices and each device is assigned to at most one locale. This -// function uses a greedy algorithm to assign devices to locales. The -// distance matrix records the distance between each device and the locale's -// CPU set. The device/locale pair with the minimum distance are assigned to -// each other and the device is removed from consideration. The process then -// repeats until all co-locales have been assigned the proper number of -// devices. +// devices the calling locale should use. Devices are assigned to partitions, +// where the number of partitions is equal to the expected number of +// co-locales on the device (there may be fewer if the number of nodes +// doesn't evenly divide the number of locales). Each partition is assigned +// the same number of devices and each device is assigned to at most one +// partition. This function uses a greedy algorithm to assign devices to +// partition. The distance matrix records the distance between each device +// and the partition's CPU set. The device/partition pair with the minimum +// distance are assigned to each other and the device is removed from +// consideration. The process then repeats until all partitions have been +// assigned the proper number of devices. // -// Note that cores are assigned to co-locales during initialization of the +// Note that cores are assigned to partitions during initialization of the // topology layer before this function is called. As a result, the assignment -// of cores and devices to co-locales may not be optimal, especially if the +// of cores and devices to paratitions may not be optimal, especially if the // machine topology is asymmetric. For example, if there are two co-locales // on a machine with four NUMA domains, one co-locale will be assigned cores // in the first two NUMA domains and the other the second two domains. If @@ -1504,81 +1535,77 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int *count) { int result = 0; - int numLocales = chpl_get_num_locales_on_node(); _DBG_P("count = %d", *count); - _DBG_P("numLocales = %d", numLocales); - int numColocales = chpl_env_rt_get_int("LOCALES_PER_NODE", 0); - if (numColocales > 1) { + _DBG_P("numPartitions = %d", numPartitions); + int rank = chpl_get_local_rank(); + if ((numPartitions > 1) && (rank >= 0)) { int numDevs = *count; - int owners[numDevs]; // locale that owns each device + int owners[numDevs]; // partition to which the device belongs hwloc_obj_t objs[numDevs]; // the device objects - int devsPerLocale = numDevs / numColocales; - _DBG_P("devsPerLocale = %d", devsPerLocale); - int owned[numColocales]; // number of devices each co-locale owns + int devsPerPartition = numDevs / numPartitions; + _DBG_P("devsPerPartition = %d", devsPerPartition); + int owned[numPartitions]; // number of devices in each partition for (int i = 0; i < numDevs; i++) { owners[i] = -1; objs[i] = NULL; } - for (int i = 0; i < numColocales; i++) { + for (int i = 0; i < numPartitions; i++) { owned[i] = 0; } - int rank = chpl_get_local_rank(); - if (rank >= 0) { - for (int i = 0; i < numDevs; i++) { - hwloc_obj_t obj; - // find the PCI object corresponding to the specified bus address - obj = hwloc_get_pcidev_by_busid(topology, - (unsigned) inAddrs[i].domain, - (unsigned) inAddrs[i].bus, - (unsigned) inAddrs[i].device, - (unsigned) inAddrs[i].function); - if (obj == NULL) { - _DBG_P("Could not find PCI %04x:%02x:%02x.%x", inAddrs[i].domain, - inAddrs[i].bus, inAddrs[i].device, inAddrs[i].function); - if (debug) { - _DBG_P("PCI devices:"); - for (hwloc_obj_t obj = hwloc_get_next_pcidev(topology, NULL); - obj != NULL; - obj = hwloc_get_next_pcidev(topology, obj)) { - _DBG_P("%04x:%02x:%02x.%x", obj->attr->pcidev.domain, - obj->attr->pcidev.bus, obj->attr->pcidev.dev, - obj->attr->pcidev.func); - } + for (int i = 0; i < numDevs; i++) { + hwloc_obj_t obj; + // find the PCI object corresponding to the specified bus address + obj = hwloc_get_pcidev_by_busid(topology, + (unsigned) inAddrs[i].domain, + (unsigned) inAddrs[i].bus, + (unsigned) inAddrs[i].device, + (unsigned) inAddrs[i].function); + if (obj == NULL) { + _DBG_P("Could not find PCI %04x:%02x:%02x.%x", inAddrs[i].domain, + inAddrs[i].bus, inAddrs[i].device, inAddrs[i].function); + if (debug) { + _DBG_P("PCI devices:"); + for (hwloc_obj_t obj = hwloc_get_next_pcidev(topology, NULL); + obj != NULL; + obj = hwloc_get_next_pcidev(topology, obj)) { + _DBG_P("%04x:%02x:%02x.%x", obj->attr->pcidev.domain, + obj->attr->pcidev.bus, obj->attr->pcidev.dev, + obj->attr->pcidev.func); } - result = 1; - goto done; } - objs[i] = obj; + result = 1; + goto done; } - int distances[numLocales][numDevs]; - fillDistanceMatrix(numDevs, objs, distances); - while (owned[rank] < devsPerLocale) { - - // Find the minimum distance between a locale that needs more devices - // and a device that doesn't have an owner and assign that device to - // that locale. - - int minimum = INT32_MAX; - int minDev = -1; - int minLoc = -1; - for (int i = 0; i < numLocales; i++) { - if (owned[i] < devsPerLocale) { - for (int j = 0; j < numDevs; j++) { - if ((owners[j] == -1) && (distances[i][j] <= minimum)) { - minimum = distances[i][j]; - minLoc = i; - minDev = j; - } + objs[i] = obj; + } + int distances[numPartitions][numDevs]; + fillDistanceMatrix(numDevs, objs, distances); + while (owned[rank] < devsPerPartition) { + + // Find the minimum distance between a partition that needs more devices + // and a device that doesn't have a partition and assign that device to + // that partition. + + int minimum = INT32_MAX; + int minDev = -1; + int minPart = -1; + for (int i = 0; i < numPartitions; i++) { + if (owned[i] < devsPerPartition) { + for (int j = 0; j < numDevs; j++) { + if ((owners[j] == -1) && (distances[i][j] <= minimum)) { + minimum = distances[i][j]; + minPart = i; + minDev = j; } } } - assert((minDev >= 0) && (minLoc >= 0)); - owners[minDev] = minLoc; - owned[minLoc]++; } + assert((minDev >= 0) && (minPart >= 0)); + owners[minDev] = minPart; + owned[minPart]++; } // Return the addresses of our devices int j = 0; @@ -1587,10 +1614,10 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, outAddrs[j++] = inAddrs[i]; } } - assert(j == devsPerLocale); - *count = devsPerLocale; + assert(j == devsPerPartition); + *count = devsPerPartition; } else { - // No co-locales, use all the devices. + // Use all the devices. for (int i = 0; i < *count; i++) { outAddrs[i] = inAddrs[i]; } diff --git a/test/runtime/jhh/colocales/colocales.c b/test/runtime/jhh/colocales/colocales.c index 5e32e210d159..68a3fbf50978 100644 --- a/test/runtime/jhh/colocales/colocales.c +++ b/test/runtime/jhh/colocales/colocales.c @@ -19,7 +19,8 @@ void Usage(char *name) { fprintf(stderr, "Usage: %s [-m mask] [-N nic] [-n numColocales] [-r rank]\n", name); fprintf(stderr, "\t-m \tMask off accessible PUs\n"); fprintf(stderr, "\t-N \tNIC bus address\n"); - fprintf(stderr, "\t-n \tNumber of locales on node\n"); + fprintf(stderr, "\t-n \tExpected number of co-locales on node\n"); + fprintf(stderr, "\t-a \tActual number of locales on node\n"); fprintf(stderr, "\t-r \tLocal rank\n"); fprintf(stderr, "\t-h\t\tPrint this message\n"); } @@ -31,18 +32,22 @@ int main(int argc, char* argv[]) { int *cpus = NULL; char *mask = NULL; int rank = -1; - int numLocales = 1; - char *numLocalesStr = "1"; + int numCoLocales = -1; + char *numCoLocalesStr = NULL; + int numLocales = -1; char *nic = NULL; - while ((opt = getopt(argc, argv, "m:N:n:r:v")) != -1) { + while ((opt = getopt(argc, argv, "a:m:N:n:r:v")) != -1) { switch(opt) { + case 'a': + numLocales = atoi(optarg); + break; case 'm': mask = optarg; break; case 'n': - numLocales = atoi(optarg); - numLocalesStr = optarg; + numCoLocales = atoi(optarg); + numCoLocalesStr = optarg; break; case 'r': rank = atoi(optarg); @@ -68,11 +73,28 @@ int main(int argc, char* argv[]) { exit(1); } + if (numCoLocales == -1) { + numCoLocales = 1; + numCoLocalesStr = "1"; + } + + if (numLocales == -1) { + numLocales = numCoLocales; + } + + if (numLocales < 1) { + fprintf(stderr, "There must be > 0 locales on the node\n"); + exit(1); + } if (rank >= numLocales) { - fprintf(stderr, "Rank must be less than number of locales on node.\n"); + fprintf(stderr, + "Rank (%d) must be less than number of locales on node (%d).\n", + rank, numLocales); exit(1); } - setenv("CHPL_RT_LOCALES_PER_NODE", numLocalesStr, 1); + if (numCoLocales > 0) { + setenv("CHPL_RT_LOCALES_PER_NODE", numCoLocalesStr, 1); + } chpl__init_colocales(0, 0); // unsure why this is needed chpl_topo_pre_comm_init(mask); chpl_comm_init(NULL, NULL); diff --git a/test/runtime/jhh/colocales/colocales.py b/test/runtime/jhh/colocales/colocales.py index cd93c4855890..884e7afbf292 100755 --- a/test/runtime/jhh/colocales/colocales.py +++ b/test/runtime/jhh/colocales/colocales.py @@ -326,6 +326,32 @@ def test_12_leftover_cores(self): output = self.runCmd("./colocales -r 0 -n 17") self.assertIn("warning: 9 cores are unused\n", output); + def test_13_remainder_node(self): + """ + Remainder node has fewer co-locales. + However, they should not get more than their allotment of cores. + """ + for i in range(0, 3): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n %d -a 3 -N %s" % + (i, 16, self.nics[0])) + cpus = stringify([i*8+j for j in range(0, 8)]) + self.assertIn("Physical CPUs: %s\n" % cpus, output) + + def test_14_oversubscribed(self): + """ + All locales get all cores when oversubscribed. + """ + for i in range(0, 4): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -a 4 -N %s" % + (i, self.nics[0])) + cpus = stringify(self.getSocketCores("all")) + self.assertIn("Physical CPUs: %s\n" % cpus, output) + cpus = stringify(self.getSocketThreads("all")) + self.assertIn("Logical CPUs: %s\n" % cpus, output) + + class Ex3Tests(TestCases.TestCase): """ HPE Cray EX. One sockets, four NUMA domains per socket, 64 cores per @@ -366,17 +392,30 @@ def test_11_two_colocales(self): Each co-locale gets the two GPUs closest to it """ for i in range(0, 2): - output = self.runCmd("./colocales -r %d -n 2" % i); - x = i * 2 - self.assertIn("GPUS: " + ' '.join(self.gpus[x:x+1]), output) + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n 2" % i); + x = i * 2 + self.assertIn("GPUS: " + ' '.join(self.gpus[x:x+1]), output) def test_12_four_colocales(self): """ Each co-locale gets the GPU closest to it """ for i in range(0, 4): - output = self.runCmd("./colocales -r %d -n 4" % i); - self.assertIn("GPUS: " + self.gpus[i], output) + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n 4" % i); + self.assertIn("GPUS: " + self.gpus[i], output) + + def test_13_oversubscribed(self): + """ + All locales get all GPUs when oversubscribed. + """ + for i in range(0, 4): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -a 4" % i); + self.assertIn("GPUS: " + ' '.join(self.gpus), output) + + class Hpc6aTests(TestCases.TestCase): """ From a20458117a2db6fbf87548f9cdb9bdcff3caae90 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 9 Oct 2024 13:44:35 -0700 Subject: [PATCH 6/6] Fixed typo. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index cbce3cfa6bfb..7195deaa1ccc 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1523,7 +1523,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // // Note that cores are assigned to partitions during initialization of the // topology layer before this function is called. As a result, the assignment -// of cores and devices to paratitions may not be optimal, especially if the +// of cores and devices to partitions may not be optimal, especially if the // machine topology is asymmetric. For example, if there are two co-locales // on a machine with four NUMA domains, one co-locale will be assigned cores // in the first two NUMA domains and the other the second two domains. If