Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update with nccl-2.23.4-1 #164

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 39 additions & 23 deletions include/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,44 +50,60 @@

#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
#define SYSCHECK(statement, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)

#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
WARN("Call to " name " failed: %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)

#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
#define SYSCHECKSYNC(statement, name, retval) do { \
retval = (statement); \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)

#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
#define SYSCHECKGOTO(statement, name, RES, label) do { \
int retval; \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed: %s", strerror(errno)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0);
} while (0)

// Pthread calls don't set errno and never return EINTR.
// PTHREADCHECK(statement, name): evaluates `statement` exactly once and treats
// any non-zero result as an error number. On failure the error is logged
// through WARN using strerror(retval) (the returned value, not errno) and the
// enclosing function returns ncclSystemError. No retry is attempted.
//   statement - expression yielding an int status (0 means success).
//   name      - string literal naming the call; it is spliced into the format
//               string by adjacent-literal concatenation, so it cannot be a
//               runtime char pointer.
// Note: the expansion declares a local `retval`, so a `statement` that refers
// to a caller variable of that name would see the macro's own variable.
// The expansion does not end in a semicolon; the call site supplies it.
#define PTHREADCHECK(statement, name) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
return ncclSystemError; \
} \
} while (0)

// PTHREADCHECKGOTO(statement, name, RES, label): goto-cleanup variant of the
// pthread check. Evaluates `statement` exactly once; if the result is non-zero
// it is logged through WARN using strerror(retval) (the returned value, not
// errno), RES is assigned ncclSystemError and control jumps to `label`.
// On success RES is left untouched.
//   statement - expression yielding an int status (0 means success).
//   name      - string literal naming the call; concatenated into the format
//               string at compile time.
//   RES       - lvalue that receives ncclSystemError on failure.
//   label     - label in the enclosing function to jump to on failure.
// Note: the expansion declares a local `retval`, so a `statement` that refers
// to a caller variable of that name would see the macro's own variable.
// The expansion does not end in a semicolon; the call site supplies it.
#define PTHREADCHECKGOTO(statement, name, RES, label) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0)


#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)

#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
Expand All @@ -96,15 +112,15 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)

#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)

#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
Expand All @@ -113,7 +129,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)

// Propagate errors up
#define NCCLCHECK(call) do { \
Expand All @@ -122,15 +138,15 @@
/* Print the back trace*/ \
return RES; \
} \
} while (0);
} while (0)

#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
goto label; \
} \
} while (0);
} while (0)

#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
Expand All @@ -139,7 +155,7 @@
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
} while (!(cond))

#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
Expand All @@ -148,7 +164,7 @@
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
} while (!(cond))

#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
Expand Down
8 changes: 8 additions & 0 deletions include/p2p_plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ struct ncclIbMergedDev {
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
int speed;
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no
} __attribute__((aligned(64)));

// Statistics counters kept per IB device: an instance is embedded in
// ncclIbDev as the `stats` member, and ncclIbStatsInit() takes a pointer to one.
struct ncclIbStats {
// NOTE(review): presumably the number of fatal errors observed for the device;
// the code that updates it is not visible here — confirm what counts as fatal.
int fatalErrorCount;
};

struct ncclIbRequest {
struct ncclIbNetCommBase* base;
int type;
Expand Down Expand Up @@ -108,6 +113,7 @@ typedef struct ncclIbDev {
struct ncclIbMrCache mrCache;
int ar; // ADAPTIVE_ROUTING
struct ibv_port_attr portAttr;
struct ncclIbStats stats;
} __attribute__((aligned(64))) ncclIbDev;


Expand Down Expand Up @@ -144,4 +150,6 @@ int ncclIbRelaxedOrderingCapable(void);

nccl_p2p_plugin_t nccl_p2p_get_plugin_type();

// Takes a pointer to a struct ncclIbStats and returns an ncclResult_t status.
// NOTE(review): going by the name this initializes the counters in `stat`; the
// definition is not visible here — confirm the initial values and whether a
// NULL `stat` is tolerated.
ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat);

#endif
14 changes: 7 additions & 7 deletions include/timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ static double startTimes[8];
#define TIME_START(index) do { \
counts[index]++; \
startTimes[index] = gettime(); \
} while (0);
} while (0)

#define TIME_STOP(index) do { \
times[index] += gettime() - startTimes[index]; \
} while (0);
} while (0)

#define TIME_CANCEL(index) do { \
counts[index]--; \
} while (0);
} while (0)

#define TIME_PRINT(name) do { \
printf("%s stats", name); \
Expand All @@ -50,11 +50,11 @@ static double startTimes[8];
counts[i] = 0; \
} \
printf("\n"); \
} while (0);
} while (0)
#else
#define TIME_START(index) do {} while(0);
#define TIME_STOP(index) do {} while(0);
#define TIME_CANCEL(index) do {} while(0);
#define TIME_START(index) do {} while(0)
#define TIME_STOP(index) do {} while(0)
#define TIME_CANCEL(index) do {} while(0)
#define TIME_PRINT(name)
#endif
#endif
Loading
Loading