diff --git a/.ci/artifacts.sh b/.ci/artifacts.sh index ba256d570..d1ea70a02 100755 --- a/.ci/artifacts.sh +++ b/.ci/artifacts.sh @@ -1,12 +1,9 @@ #!/bin/bash -xl if [ -d jenkins ]; then - gzip -f ./jenkins/*.tar 2>/dev/null || true - cd ./jenkins/ ; - for f in *.tar.gz ; do [ -e "$f" ] && mv "$f" "${flags}/arch-${name}-$f" ; done ; - cd .. - cd ./jenkins/${flags}; - for f in *.tap ; do [ -e "$f" ] && mv "$f" "${flags}-${name}-$f" ; done ; - for f in *.xml ; do [ -e "$f" ] && mv "$f" "${flags}-${name}-$f" ; done ; - cd ../.. + pushd ./jenkins/${flags}; + gzip -f *.tar 2>/dev/null || true + for f in *.tar.gz ; do mv -f "$f" "arch-${name}-$f" ; done; + for f in *.{tap,xml} ; do mv -f "$f" "${flags}-${name}-$f" ; done ; + popd fi diff --git a/.ci/dockerfiles/Dockerfile.rhel8.6 b/.ci/dockerfiles/Dockerfile.rhel8.6 new file mode 100644 index 000000000..679896211 --- /dev/null +++ b/.ci/dockerfiles/Dockerfile.rhel8.6 @@ -0,0 +1,26 @@ +FROM harbor.mellanox.com/hpcx/x86_64/rhel8.6/core:latest +ARG _UID=6213 +ARG _GID=101 +ARG _LOGIN=swx-jenkins +ARG _HOME=/var/home/$_LOGIN + +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm \ + && yum install -y cppcheck \ + && yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm \ + && yum install -y csbuild clang-tools-extra sudo curl autoconf automake make libtool \ + libnl3-devel libnl3 rdma-core-devel rdma-core bc \ + && yum clean all + +RUN pip3 install -U pip --no-cache-dir \ + && pip3 install compiledb --no-cache-dir + +RUN echo "${_LOGIN} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + echo "root ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + mkdir -p ${_HOME} && \ + groupadd -f -g "$_GID" "$_LOGIN" && \ + useradd -u "$_UID" -g "$_GID" -s /bin/bash -m -d ${_HOME} "${_LOGIN}" && \ + chown -R ${_LOGIN} ${_HOME} && \ + mkdir /build && chown -R ${_LOGIN} /build + +USER "$_LOGIN" +ENTRYPOINT [ "/bin/bash", "--login", "--rcfile", "/etc/bashrc", "-c" ] diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index cd7970a70..5455da234 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -4,16 +4,20 @@ job: LIBXLIO step_allow_single_selector: false registry_host: harbor.mellanox.com -registry_auth: swx-storage +registry_auth: 1daaea28-800e-425f-a91f-3bd3e9136eea +registry_path: /swx-infra/media kubernetes: privileged: false cloud: swx-k8s-spray nodeSelector: 'beta.kubernetes.io/os=linux' - + namespace: xlio-ci limits: '{memory: 8Gi, cpu: 7000m}' requests: '{memory: 8Gi, cpu: 7000m}' +credentials: + - {credentialsId: '925b0900-e273-4042-bc7c-facaefae0727', usernameVariable: 'XLIO_COV_USER', passwordVariable: 'XLIO_COV_PASSWORD'} + volumes: - {mountPath: /hpc/local/bin, hostPath: /hpc/local/bin} - {mountPath: /hpc/local/oss, hostPath: /hpc/local/oss} @@ -29,9 +33,6 @@ volumes: # User profile for release - {mountPath: /var/home/swx-jenkins, hostPath: /labhome/swx-jenkins} -env: - build_dockers: false - runs_on_dockers: # mofed - {name: 'ub20.04-mofed-x86_64', url: 'harbor.mellanox.com/swx-infra/x86_64/ubuntu20.04/builder:mofed-5.2-2.2.0.0', category: 'base', arch: 'x86_64'} @@ -40,9 +41,34 @@ runs_on_dockers: - {name: 'rhel8.6-mofed-x86_64', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.6/builder:mofed-5.6-0.4.5.0', category: 'base', arch: 'x86_64'} # - {name: 'oracle8.6-mofed-x86_64', url: 'harbor.mellanox.com/rivermax/base_oraclelinux8.6:mofed-5.9-0.3.4.0', category: 'base', arch: 'x86_64'} # tool - - {name: 'toolbox', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.3/builder:inbox', 
category: 'tool', arch: 'x86_64'} + - {name: 'toolbox', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.6/builder:inbox', category: 'tool', arch: 'x86_64'} - {name: 'blackduck', url: 'harbor.mellanox.com/toolbox/ngci-centos:7.9.2009.2', category: 'tool', arch: 'x86_64'} - {name: 'header-check', url: 'harbor.mellanox.com/toolbox/header_check:0.0.14', category: 'tool', arch: 'x86_64', tag: '0.0.14'} +# static tests + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.cppcheck', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.csbuild', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.tidy', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } runs_on_agents: - {nodeLabel: 'beni09', category: 'base'} @@ -51,7 +77,6 @@ matrix: axes: flags: - default - - dpcp arch: - x86_64 - aarch64 @@ -196,6 +221,7 @@ steps: - name: Coverity enable: ${do_coverity} + credentialsId: '925b0900-e273-4042-bc7c-facaefae0727' containerSelector: - "{name: 'toolbox', category: 'tool'}" agentSelector: @@ -215,9 +241,9 @@ steps: - name: Cppcheck enable: ${do_cppcheck} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.cppcheck', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_cppcheck}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_cppcheck=${action} ./contrib/test_jenkins.sh @@ -232,9 +258,9 @@ steps: - name: Csbuild enable: ${do_csbuild} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.csbuild', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_csbuild}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_csbuild=${action} ./contrib/test_jenkins.sh @@ -249,9 +275,9 @@ steps: - name: Tidy enable: ${do_tidy} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.tidy', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_tidy}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_tidy=${action} ./contrib/test_jenkins.sh @@ -268,7 +294,7 @@ steps: containerSelector: - "{name: 'skip-container'}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'beni09'}" run: | [ "x${do_test}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_run=${action} ./contrib/test_jenkins.sh diff --git a/.ci/opensource_jjb.yaml b/.ci/opensource_jjb.yaml index de5934f8e..c6989764b 100644 --- a/.ci/opensource_jjb.yaml +++ b/.ci/opensource_jjb.yaml @@ -74,6 +74,10 @@ name: "do_coverity" default: true description: "Launch coverity verification." 
+    - bool:
+        name: "do_coverity_snapshot"
+        default: false
+        description: "Submit Coverity Static Analysis as a snapshot (normally it should be checked only for the master branch after proper defect review)"
     - bool:
         name: "do_test"
         default: true
diff --git a/CHANGES b/CHANGES
index 25a1862b4..53d4f6c84 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,65 @@
+Version 3.30.5-1:
+Date + Time 2024-04-09
+=============================================================
+Fixed:
+ - RM #3795997 OCI degradation IO operations with 8k conn, block size of 256k
+ - RM #3855390 http CPS test with IPv6 is reporting XLIO warning
+ - RM #3788164 Nginx Degradation 10MB with a low number of workers.
+
+Version 3.30.4-1:
+Date + Time 2024-04-04
+=============================================================
+Fixed:
+ - RM #3792798 Do code cleanup for new storage API
+ - RM #3829626 Nginx http CPS tests report error
+ - RM #3818038 I/O errors during FIO
+
+Version 3.30.3-1:
+Date + Time 2024-03-20
+=============================================================
+Added:
+ - RM #3788369 New Storage API
+ - RM #3777348 Improve sockinfo cache utilization
+
+Fixed:
+ - RM #3829626 Nginx http CPS tests report XLIO ERROR
+ - RM #3808935 SNAP4 static build error
+
+Version 3.30.2-1:
+Date + Time 2024-03-11
+=============================================================
+Added:
+ - RM #3792777 provide new storage API headers
+ - RM #3792789 provide new storage API implementation, integration level
+ - RM #3770816 Modernize C++ source code
+ - RM #3813802 Fix warnings from newer cppcheck version
+ - RM #3795922 Remove pbuf_split_64k() and refused_data
+
+Fixed:
+ - RM #3781322 higher CPU loads when loaded with Nginx responding to http requests of high payloads.
+ - RM #3792731 False positive Walloc-size-larger-than warning
+
+Version 3.30.1-1:
+Date + Time 2024-02-22
+=============================================================
+Fixed:
+ - RM #3786434 C++ or C23 feature in xlio_extra.h breaks compilation of some C programs
+
+Version 3.30.0-1:
+Date + Time 2024-02-12
+=============================================================
+Added:
+ - RM #3724170 Static compilation with LTO and PGO support
+ - RM #3668182 productize LwIP express data path
+ - RM #3664594 Backport TCP_KEEPALIVE from VMA
+ - RM #3514044 XLIO DPCP Only - Remove legacy code and legacy flows
+
+Fixed:
+ - RM #3771283 Fix function pointer check
+ - RM #3690535 Remove leftover after Multi Packet RQ removal
+ - RM #3678579 Fix last_unacked and last_unsent
+ - RM #3704820 XLIO error when enabling UDP listen socket
+
 Version 3.21.2-1:
 Date + Time 2024-01-11
 =============================================================
diff --git a/README b/README
index b3f19bc79..65617d8db 100644
--- a/README
+++ b/README
@@ -153,6 +153,7 @@ Example:
  XLIO DETAILS: Mem Allocation type Huge pages [XLIO_MEM_ALLOC_TYPE]
  XLIO DETAILS: Memory limit 2 GB [XLIO_MEMORY_LIMIT]
  XLIO DETAILS: Memory limit (user allocator) 0 [XLIO_MEMORY_LIMIT_USER]
+ XLIO DETAILS: Hugepage size 0 [XLIO_HUGEPAGE_SIZE]
  XLIO DETAILS: Num of UC ARPs 3 [XLIO_NEIGH_UC_ARP_QUATA]
  XLIO DETAILS: UC ARP delay (msec) 10000 [XLIO_NEIGH_UC_ARP_DELAY_MSEC]
  XLIO DETAILS: Num of neigh restart retries 1 [XLIO_NEIGH_NUM_ERR_RETRIES]
@@ -963,6 +964,12 @@ provided with XLIO extra API.
 Default value 0 makes XLIO use XLIO_MEMORY_LIMIT value for user allocations.
 Default value is 0
 
+XLIO_HUGEPAGE_SIZE
+Force specific hugepage size for XLIO internal memory allocations. Value 0 allows
+to use any supported and available hugepages.
The size may be specified with +suffixes such as KB, MB, GB. +Default value is 0 + XLIO_NEIGH_UC_ARP_QUATA XLIO will send UC ARP in case neigh state is NUD_STALE. In case that neigh state is still NUD_STALE XLIO will try diff --git a/README.md b/README.md index 5726648da..13a7d6671 100644 --- a/README.md +++ b/README.md @@ -64,17 +64,6 @@ $ make -j $ make install ``` -### Building XLIO without dpcp - -```sh -$ ./autogen.sh -$ ./configure --prefix=/where/to/install -$ make -j -$ make install -``` - -Advanced HW features are not enabled for this build type. - ### Usage Examples #### Sockperf diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 index 050ae60b1..0103dd4fe 100644 --- a/config/m4/compiler.m4 +++ b/config/m4/compiler.m4 @@ -71,107 +71,12 @@ AC_DEFUN([CHECK_COMPILER_ATTRIBUTE], [ # Usage: CHECK_COMPILER_CXX([standard], [option], [definition]) # Note: # - [definition] can be omitted if it is equal to attribute -# -AC_DEFUN([CHECK_COMPILER_CXX], [ - case "$1" in - 11) -m4_define([_prj_cv_compiler_body_11], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201103L -#error This is not a C++11 compiler -#else -#include -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - /* decltype */ - int a = 5; - decltype(a) b = a; - return (b - a); -} -#endif // __cplusplus >= 201103L -]]) - ;; - 14) -m4_define([_prj_cv_compiler_body_14], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201402L -#error This is not a C++14 compiler -#else -#include -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - /* Binary integer literals */ - constexpr auto i = 0b0000000000101010; - static_assert(i == 42, "wrong value"); - return 0; -} -#endif // __cplusplus >= 201402L -]]) - ;; - 17) -m4_define([_prj_cv_compiler_body_17], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201703L -#error This is not a C++17 compiler -#else -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - // Check constexpr lambda - auto identity = [](int n) constexpr { return n; }; - static_assert(identity(123) == 123); - return 0; -} -#endif // __cplusplus >= 201703L -]]) - ;; - *) - AC_MSG_ERROR([invalid first argument as [$1] to [$0]]) - ;; - esac - case "$2" in - std) - prj_cv_option=-std=c++$1 - ;; - gnu) - prj_cv_option=-std=gnu++$1 - ;; - *) - AC_MSG_ERROR([invalid first argument as [$2] to [$0]]) - ;; - esac - - AC_CACHE_VAL(prj_cv_compiler_cxx_[$1], [ - prj_cv_compiler_save_CXXFLAGS="$CXXFLAGS" - CXXFLAGS="$prj_cv_option $CXXFLAGS" - - # - # Try to compile using the C++ compiler - # - AC_LANG_PUSH(C++) - AC_COMPILE_IFELSE([AC_LANG_SOURCE(_prj_cv_compiler_body_[$1])], - [prj_cv_compiler_cxx_$1=yes], - [prj_cv_compiler_cxx_$1=no]) - AC_LANG_POP(C++) - - CXXFLAGS="$prj_cv_compiler_save_CXXFLAGS" - ]) - AC_MSG_CHECKING([for compiler c++ [$1]]) - AC_MSG_RESULT([$prj_cv_compiler_cxx_$1]) - AS_IF([test "x$prj_cv_compiler_cxx_[$1]" = "xyes"], - [CXXFLAGS="$prj_cv_option $CXXFLAGS"], - [AC_MSG_ERROR([A compiler with support for C++[$1] language features is required])] - ) -]) - +saved_cxxflags="$CXXFLAGS" +CXXFLAGS="-Werror -std=c++14" +AC_MSG_CHECKING([whether CXX supports -std=c++14]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], [AC_MSG_RESULT([yes])], + [AC_MSG_ERROR([C++14 is unsupported])]) +CXXFLAGS="-std=c++14 $saved_cxxflags" ########################## # Configure compiler capabilities @@ -256,6 +161,4 @@ else AC_MSG_CHECKING([for symbols visibility]) AC_MSG_RESULT([yes]) fi - 
-CHECK_COMPILER_CXX([14], [std], []) ]) diff --git a/config/m4/dpcp.m4 b/config/m4/dpcp.m4 index 2c57680bc..5ac2a7aab 100644 --- a/config/m4/dpcp.m4 +++ b/config/m4/dpcp.m4 @@ -56,64 +56,61 @@ get_version_number() get_min_supported_version() { - echo 10130 + echo 10143 } AC_ARG_WITH([dpcp], - AS_HELP_STRING([--with-dpcp(=DIR)], - [Search for dpcp headers and libraries in DIR (default NO)]), + AS_HELP_STRING([--with-dpcp@<:@=DIR@:>@], + [Search for dpcp headers and libraries in DIR @<:@default: /usr@:>@]), [], - [with_dpcp=no] + [] ) -if test "x$prj_cv_directverbs" != x3 && test "x$with_dpcp" != xno; then - AC_MSG_ERROR([dpcp can be used under RDMA-core subsystem only]) +if test "x$prj_cv_directverbs" != x3; then + AC_MSG_ERROR([RDMA-core subsystem required]) fi prj_cv_dpcp=0 -AS_IF([test "x$with_dpcp" == xno], - [], - [ - if test -z "$with_dpcp" || test "$with_dpcp" = "yes"; then - with_dpcp=/usr - fi +if test -z "$with_dpcp" || test "$with_dpcp" = "yes"; then + with_dpcp=/usr +fi - FUNC_CHECK_WITHDIR([dpcp], [$with_dpcp], [include/mellanox/dpcp.h]) +FUNC_CHECK_WITHDIR([dpcp], [$with_dpcp], [include/mellanox/dpcp.h]) - prj_cv_dpcp_save_CPPFLAGS="$CPPFLAGS" - prj_cv_dpcp_save_CXXFLAGS="$CXXFLAGS" - prj_cv_dpcp_save_CFLAGS="$CFLAGS" - prj_cv_dpcp_save_LDFLAGS="$LDFLAGS" - prj_cv_dpcp_save_LIBS="$LIBS" +prj_cv_dpcp_save_CPPFLAGS="$CPPFLAGS" +prj_cv_dpcp_save_CXXFLAGS="$CXXFLAGS" +prj_cv_dpcp_save_CFLAGS="$CFLAGS" +prj_cv_dpcp_save_LDFLAGS="$LDFLAGS" +prj_cv_dpcp_save_LIBS="$LIBS" - prj_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" - prj_cv_dpcp_LIBS="-ldpcp -lmlx5" - prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" - if test -d "$with_dpcp/lib64"; then - prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" - fi +prj_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" +prj_cv_dpcp_LIBS="-ldpcp -lmlx5 -libverbs -lgcov" +prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" +if test -d "$with_dpcp/lib64"; then + prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" +fi - CPPFLAGS="$prj_cv_dpcp_CPPFLAGS $CPPFLAGS" - CXXFLAGS="-std=c++11 $CXXFLAGS" - LDFLAGS="$prj_cv_dpcp_LDFLAGS $LDFLAGS" - LIBS="$prj_cv_dpcp_LIBS $LIBS" - - AC_LANG_PUSH([C++]) - AC_CHECK_HEADER( - [mellanox/dpcp.h], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], - [[dpcp::provider *provider; - dpcp::provider::get_instance(provider);]])], - [prj_cv_dpcp=1]) - ]) - AC_LANG_POP() - - CPPFLAGS="$prj_cv_dpcp_save_CPPFLAGS" - CXXFLAGS="$prj_cv_dpcp_save_CXXFLAGS" - CFLAGS="$prj_cv_dpcp_save_CFLAGS" - LDFLAGS="$prj_cv_dpcp_save_LDFLAGS" - LIBS="$prj_cv_dpcp_save_LIBS" +CPPFLAGS="$prj_cv_dpcp_CPPFLAGS $CPPFLAGS" +CXXFLAGS="-std=c++11 $CXXFLAGS" +LDFLAGS="$prj_cv_dpcp_LDFLAGS $LDFLAGS" +LIBS="$prj_cv_dpcp_LIBS $LIBS" + +AC_LANG_PUSH([C++]) +AC_CHECK_HEADER( + [mellanox/dpcp.h], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[dpcp::provider *provider; + dpcp::provider::get_instance(provider);]])], + [prj_cv_dpcp=1]) ]) +AC_LANG_POP() + +CPPFLAGS="$prj_cv_dpcp_save_CPPFLAGS" +CXXFLAGS="$prj_cv_dpcp_save_CXXFLAGS" +CFLAGS="$prj_cv_dpcp_save_CFLAGS" +LDFLAGS="$prj_cv_dpcp_save_LDFLAGS" +LIBS="$prj_cv_dpcp_save_LIBS" + AC_MSG_CHECKING([for dpcp support]) if test "$prj_cv_dpcp" -ne 0; then @@ -124,16 +121,12 @@ if test "$prj_cv_dpcp" -ne 0; then min_supported_version=($(get_min_supported_version)) if test "$dpcp_version_number" -ge "$min_supported_version"; then - AC_DEFINE_UNQUOTED([DEFINED_DPCP], [$dpcp_version_number], [Define to DPCP version number (major * 10000 + minor * 100 + patch)]) 
AC_DEFINE_UNQUOTED([DEFINED_DPCP_MIN], [$min_supported_version], [Define to DPCP version number (major * 10000 + minor * 100 + patch)]) AC_MSG_RESULT([yes]) else - AC_MSG_RESULT([no]) AC_MSG_ERROR([found incompatible dpcp version $dpcp_version_number (min supported version $min_supported_version) ]) fi else - AS_IF([test "x$with_dpcp" == xno], - [AC_MSG_RESULT([no])], - [AC_MSG_ERROR([dpcp support requested but not present])]) + AC_MSG_ERROR([dpcp support requested but not present]) fi ]) diff --git a/config/m4/linking_optimization.m4 b/config/m4/linking_optimization.m4 new file mode 100644 index 000000000..2264c8b1f --- /dev/null +++ b/config/m4/linking_optimization.m4 @@ -0,0 +1,121 @@ +# +# Copyright © 2001-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + +AC_PROG_CC +AC_PROG_CXX + +AC_MSG_CHECKING([for LTO]) +AC_ARG_ENABLE(lto, AS_HELP_STRING([--enable-lto], [Enable Link Time Optimization]), + [ + enable_lto=$enableval + ], [enable_lto=yes]) + +AS_IF([test "x$enable_lto" = "xyes"], + [ + case $CC in + gcc*|g++*) + AC_SUBST([XLIO_LTO], ["-flto=auto"]) + ;; + clang*|clang++*) + AC_SUBST([XLIO_LTO], ["-flto=thin"]) + ;; + *) + AC_MSG_ERROR([Compiler doesn't support link time optimization]) + ;; + esac + AC_MSG_RESULT([yes]) + ], + [ + AC_SUBST([XLIO_LTO], [""]) + AC_MSG_RESULT([no]) + ] +) + +AC_MSG_CHECKING([for PGO generate]) +AC_ARG_WITH([profile-generate], + [AS_HELP_STRING([--with-profile-generate=DIR], [Path to store profiles for Profile Guided Optimization])], + [ + COMMON_FLAGS="" + case $CC in + gcc*|g++*) + COMMON_FLAGS+="-fprofile-generate -fprofile-correction -Wno-error=missing-profile" + COMMON_FLAGS+=" -fprofile-partial-training -fprofile-dir=$withval" + ;; + clang*|clang++*) + COMMON_FLAGS+="-fprofile-generate=$withval" + ;; + *) + AC_MSG_ERROR([Compiler doesn't support profile guided optimization]) + ;; + esac + AC_CHECK_LIB([gcov], [__gcov_init], [], [AC_MSG_ERROR([libgcov not found])]) + AC_MSG_RESULT([$withval yes]) + profile_generate=yes + AC_SUBST([XLIO_PROFILE], ["$COMMON_FLAGS"]) + AC_SUBST([XLIO_GCOV], ["-lgcov"]) + ], + [ + profile_generate=no + AC_MSG_RESULT([no]) + ] +) + +AC_MSG_CHECKING([for PGO use]) +AC_ARG_WITH([profile-use], + [AS_HELP_STRING([--with-profile-use=DIR], [Path to read profiles for Profile Guided Optimization])], + [ + COMMON_FLAGS="" + case $CC in + gcc*|g++*) + COMMON_FLAGS+="-fprofile-use -fprofile-correction -Wno-error=missing-profile" + COMMON_FLAGS+=" -fprofile-partial-training -fprofile-dir=$withval" + ;; + clang*|clang++*) + COMMON_FLAGS+="-fprofile-use=$withval" + ;; + *) + AC_MSG_ERROR([Compiler doesn't support profile guided optimization]) + ;; + esac + AC_MSG_RESULT([$withval yes]) + profile_use=yes + AC_SUBST([XLIO_PROFILE], ["$COMMON_FLAGS"]) + ], + [ + profile_use=no + AC_MSG_RESULT([no]) + ] +) + +AS_IF([test "x$profile_use" = "xyes" && test "x$profile_generate" = "xyes"], [ + AC_MSG_ERROR([** Cannot use both --with-profile-generate and --with-profile-use]) +]) diff --git a/config/m4/verbs.m4 b/config/m4/verbs.m4 index 7c2feb870..62ad5a15a 100644 --- a/config/m4/verbs.m4 +++ b/config/m4/verbs.m4 @@ -139,7 +139,6 @@ CHECK_VERBS_ATTRIBUTE([IBV_QPT_RAW_PACKET], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_WC_WITH_VLAN], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_DEVICE_RAW_IP_CSUM], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_SEND_IP_CSUM], [infiniband/verbs.h]) -CHECK_VERBS_ATTRIBUTE([IBV_FLOW_SPEC_ACTION_TAG], [infiniband/verbs.h], [IBV_FLOW_TAG]) CHECK_VERBS_ATTRIBUTE([IBV_WC_EX_WITH_COMPLETION_TIMESTAMP], [infiniband/verbs.h], [IBV_CQ_TIMESTAMP]) CHECK_VERBS_MEMBER([struct ibv_device_attr_ex.orig_attr], [infiniband/verbs.h], [IBV_DEVICE_ATTR_EX]) CHECK_VERBS_MEMBER([struct ibv_alloc_dm_attr.length], [infiniband/verbs.h], [IBV_DM]) diff --git a/configure.ac b/configure.ac index 60a87715b..25a5dc8fc 100644 --- a/configure.ac +++ b/configure.ac @@ -13,8 +13,8 @@ dnl===-----------------------------------------------------------------------=== # Update version number here: # define([prj_ver_major], 3) -define([prj_ver_minor], 21) -define([prj_ver_revision], 2) +define([prj_ver_minor], 30) +define([prj_ver_revision], 5) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) @@ -105,9 +105,22 @@ show_section_title "Configure build tools" : ${CFLAGS=""} : 
${CXXFLAGS=""} +m4_include([config/m4/linking_optimization.m4]) + # Find compiler, libtools, etc # LT_INIT([disable-static]) + +# LT_INIT exposes the ability to configure --enable-static +if test "x$enable_static" = "xyes"; then + AC_SUBST([XLIO_STATIC_BUILD], ["-DXLIO_STATIC_BUILD"]) + if test "x$enable_shared" = "xyes"; then + AC_MSG_ERROR([Please add --disable-shared or --enable-shared=no]) + fi +else + AC_SUBST([XLIO_STATIC_BUILD], [""]) +fi + AC_PROG_CC AC_PROG_CXX diff --git a/contrib/build_pkg.sh b/contrib/build_pkg.sh index 4f65a71e1..1d8b26c1c 100755 --- a/contrib/build_pkg.sh +++ b/contrib/build_pkg.sh @@ -31,6 +31,10 @@ while test "$1" != ""; do arg_rpm="${arg_deb/=/ }" opt_exports="$opt_exports :$arg_deb"; opt_defines="$opt_defines --define='$arg_rpm'"; + if [[ $arg_deb =~ ^configure_options[[:blank:]]*= ]]; then + shopt -s extglob + opt_conf_val="${arg_deb##configure_options*([[:blank:]])=}" + fi shift ;; *) @@ -110,7 +114,7 @@ cd ${pkg_dir} if [ "$rc" -eq 0 ]; then echo ${pkg_label} "Running ./configure ..." - ${pkg_indir}/configure >> ${pkg_log} 2>&1 + ${pkg_indir}/configure $opt_conf_val >> ${pkg_log} 2>&1 rc=$((rc + $?)) fi diff --git a/contrib/jenkins_tests/build.sh b/contrib/jenkins_tests/build.sh index 24c4e153e..0fa64f4e2 100755 --- a/contrib/jenkins_tests/build.sh +++ b/contrib/jenkins_tests/build.sh @@ -13,19 +13,20 @@ cd ${build_dir} # Set symbolic links to default build and install ln -s "${build_dir}/0/install" "${install_dir}" -build_list="\ -debug:--enable-opt-log=no --enable-debug \ -nginx-off:--enable-nginx=no \ -envoy-on:--enable-nginx=yes \ -default: " - +declare -A build_list +build_list['debug']="--enable-opt-log=no --enable-debug" +build_list['nginx-off']="--enable-nginx=no" +build_list['envoy-on']="--enable-nginx=yes" +build_list['static-on']="--enable-static --disable-shared" +build_list['default']="" build_tap=${WORKSPACE}/${prefix}/build.tap -echo "1..$(echo $build_list | tr " " "\n" | wc -l)" > $build_tap +echo "1..${#build_list[@]}" > $build_tap test_id=0 -for build in $build_list; do - IFS=':' read build_name build_option <<< "$build" + +for build_name in "${!build_list[@]}"; do + build_option="${build_list[$build_name]}" mkdir -p ${build_dir}/${test_id} cd ${build_dir}/${test_id} test_exec='${WORKSPACE}/configure --prefix=${build_dir}/${test_id}/install $build_option $jenkins_test_custom_configure && make $make_opt install' diff --git a/contrib/jenkins_tests/compiler.sh b/contrib/jenkins_tests/compiler.sh index c37503141..f5d66bcff 100755 --- a/contrib/jenkins_tests/compiler.sh +++ b/contrib/jenkins_tests/compiler.sh @@ -25,7 +25,7 @@ for compiler in $compiler_list; do echo "======================================================" $cc --version echo - test_exec='${WORKSPACE}/configure --prefix=$compiler_dir-$cc CC=$cc CXX=$cxx $jenkins_test_custom_configure && make $make_opt all' + test_exec='${WORKSPACE}/configure --prefix=$compiler_dir-$cc CC=$cc CXX=$cxx --disable-lto $jenkins_test_custom_configure && make $make_opt all' do_check_result "$test_exec" "$test_id" "$test_name" "$compiler_tap" "${compiler_dir}/compiler-${test_id}" [ ! -z "$module" ] && module unload "$module" cd ${compiler_dir} diff --git a/contrib/jenkins_tests/copyrights.sh b/contrib/jenkins_tests/copyrights.sh index 4e7eba0fd..2ebb0db5f 100755 --- a/contrib/jenkins_tests/copyrights.sh +++ b/contrib/jenkins_tests/copyrights.sh @@ -12,7 +12,7 @@ if [ ! 
-d "$WORKSPACE" ]; then exit 1 fi -cpp_files=' "extensions": [".c", ".cc", ".cpp", "c++", ".h", ".hpp", ".cs", ".inl", ".l", ".y"],' +cpp_files=' "extensions": [".c", ".cc", ".cpp", "c++", ".h", ".hpp", ".cs", ".l", ".y"],' sed -i "s/.*\"extensions\": \[\"\.c\".*/$cpp_files/g" /opt/nvidia/ProjectConfig/header-types.json cat /opt/nvidia/ProjectConfig/header-types.json diff --git a/contrib/jenkins_tests/cov.sh b/contrib/jenkins_tests/cov.sh index 30bc72778..95c792539 100755 --- a/contrib/jenkins_tests/cov.sh +++ b/contrib/jenkins_tests/cov.sh @@ -46,6 +46,16 @@ eval "cov-analyze --config ${cov_dir}/coverity_config.xml \ --dir ${cov_build}" rc=$(($rc+$?)) +if [[ "${do_coverity_snapshot}" == true ]]; then + cov-commit-defects --ssl --on-new-cert trust \ + --url https://coverity.mellanox.com:8443 \ + --user "${XLIO_COV_USER}" --password "${XLIO_COV_PASSWORD}" \ + --dir "${cov_build}" \ + --stream libxlio-main \ + --strip-path "${WORKSPACE}" + rc=$(($rc+$?)) +fi + set -eE # Excluded files for the local report generated by command "cov-format-errors": diff --git a/contrib/jenkins_tests/cppcheck.sh b/contrib/jenkins_tests/cppcheck.sh index f79cba6ab..625e20485 100755 --- a/contrib/jenkins_tests/cppcheck.sh +++ b/contrib/jenkins_tests/cppcheck.sh @@ -7,26 +7,10 @@ echo "Checking for cppcheck ..." tool_app=cppcheck # This unit requires cppcheck so check for existence -if [ $(command -v ${tool_app} >/dev/null 2>&1 || echo $?) ]; then - set +e - eval "timeout -s SIGKILL 20s https://github.com/danmar/cppcheck.git cppcheck " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - eval "cd cppcheck && checkout 2.1 " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - eval "make $make_opt FILESDIR=$PWD HAVE_RULES=yes " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - tool_app=$PWD/cppcheck - fi - fi - cd .. - fi - set -e - if [ $(command -v ${tool_app} >/dev/null 2>&1 || echo $?) 
]; then echo "[SKIP] cppcheck tool does not exist" exit 1 fi -fi echo $(${tool_app} --version) @@ -39,7 +23,7 @@ cd $cppcheck_dir ${WORKSPACE}/configure $jenkins_test_custom_configure > "${cppcheck_dir}/cppcheck.log" 2>&1 set +eE -eval "find ${WORKSPACE}/src -name '*.h' -o -name '*.cpp' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl' | \ +eval "find ${WORKSPACE}/src -name '*.h' -o -name '*.cpp' -o -name '*.c' -o -name '*.hpp' | \ ${tool_app} --std=c++11 --language=c++ --force --enable=information \ -I${WORKSPACE}/src \ -I${WORKSPACE}/src/stats \ diff --git a/contrib/jenkins_tests/gtest.sh b/contrib/jenkins_tests/gtest.sh index 8bf54f5f3..31bcf41eb 100755 --- a/contrib/jenkins_tests/gtest.sh +++ b/contrib/jenkins_tests/gtest.sh @@ -64,8 +64,8 @@ gtest_opt_ipv6="--addr=$(do_get_addrs 'inet6' ${opt2}) -r fdff:ffff:ffff:ffff:ff set +eE if [[ -z "${MANUAL_RUN}" ]]; then - ${WORKSPACE}/configure --prefix=$install_dir - make -C tests/gtest + ${WORKSPACE}/configure --prefix=$install_dir $jenkins_test_custom_configure + make $make_opt -C tests/gtest rc=$(($rc+$?)) fi @@ -73,24 +73,24 @@ eval "${sudo_cmd} pkill -9 ${prj_service} 2>/dev/null || true" eval "${sudo_cmd} ${install_dir}/sbin/${prj_service} --console -v5 &" # Exclude EXTRA API tests -eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio_*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic.xml" +eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio_* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic.xml" rc=$(($rc+$?)) # Exclude EXTRA API tests IPv6 -eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio_*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic-ipv6.xml" +eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio_* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic-ipv6.xml" rc=$(($rc+$?)) # Verify Delegated TCP Timers tests -eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate.xml" +eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate.xml" rc=$(($rc+$?)) # Verify Delegated TCP Timers tests IPv6 -eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate-ipv6.xml" +eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate-ipv6.xml" rc=$(($rc+$?)) if [[ -z "${MANUAL_RUN}" ]]; then make -C tests/gtest clean - make -C tests/gtest CPPFLAGS="-DEXTRA_API_ENABLED=1" + make $make_opt -C tests/gtest CPPFLAGS="-DEXTRA_API_ENABLED=1" rc=$(($rc+$?)) fi diff --git a/contrib/jenkins_tests/rpm.sh b/contrib/jenkins_tests/rpm.sh index e22952329..78014bd12 100755 --- 
a/contrib/jenkins_tests/rpm.sh
+++ b/contrib/jenkins_tests/rpm.sh
@@ -48,7 +48,7 @@ if [ $opt_tarball -eq 1 ]; then
     if [ -n "$(automake --version | grep 'automake (GNU automake) 1.10.1')" ]; then
         test_exec='make $make_opt dist'
     else
-        test_exec='make $make_opt dist && make $make_opt distcheck'
+        test_exec='make $make_opt dist && DISTCHECK_CONFIGURE_FLAGS='"'"$jenkins_test_custom_configure"'"' make $make_opt distcheck'
     fi
 
     do_check_result "$test_exec" "$test_id" "tarball" "$rpm_tap" "${rpm_dir}/rpm-${test_id}"
@@ -73,9 +73,9 @@ fi
 
 if [ $opt_binrpm -eq 1 ]; then
     if [ $opt_rpm -eq 1 ]; then
-        test_exec="env RPM_BUILD_NCPUS=${NPROC} rpmbuild -bb $rpmmacros $rpmopts $rpmspec"
+        test_exec="env RPM_BUILD_NCPUS=${NPROC} rpmbuild -bb --define='configure_options $jenkins_test_custom_configure' $rpmmacros $rpmopts $rpmspec"
     else
-        test_exec="dpkg-buildpackage -us -uc -b"
+        test_exec="env configure_options=\"$jenkins_test_custom_configure\" dpkg-buildpackage -us -uc -b"
     fi
     do_check_result "$test_exec" "$test_id" "binrpm" "$rpm_tap" "${rpm_dir}/rpm-${test_id}"
     test_id=$((test_id+1))
@@ -88,5 +88,40 @@ if [ $opt_checkpkg -eq 1 ]; then
     test_id=$((test_id+1))
 fi
 
+
+# Check whether the rpm/deb package metadata Maintainer field contains emails of individual users
+packages_location="$rpm_dir"/dist-pkg/packages
+email_log_file="$rpm_dir"/dist-pkg/email_scan.log
+
+if [ $opt_rpm -eq 1 ]; then
+    search_filter="*.rpm"
+    test_info_exec="rpm -qpi --changelog"
+else
+    search_filter="*.deb"
+    test_info_exec="apt info"
+fi
+
+# Iterate over all packages and extract the metadata into the output file
+find "$packages_location" -type f -name "$search_filter" -exec $test_info_exec {} \; | tee -a "$email_log_file"
+
+do_archive "$email_log_file"
+
+set +e
+# Grep for email strings, excluding the allowed address networking-support@nvidia.com
+test_output=$(grep -E -o "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b" "$email_log_file" | grep -v "networking-support")
+test_rc=$?
+# Check rc - grep returns 0 if it found such an email and 1 if not
+if [[ $test_rc -eq 0 ]]; then
+    # grep returned 0: an individual email address was found
+    echo "ERROR: found bad email address $test_output"
+    rc=$((rc + 1))
+elif [[ -n "$test_output" ]]; then
+    # Non-zero rc with non-empty output means something else failed
+    echo "ERROR: could not find bad email but something else failed: $test_output"
+    rc=$((rc + 1))
+fi
+
+set -e
+
 echo "[${0##*/}]..................exit code = $rc"
 exit $rc
diff --git a/contrib/jenkins_tests/style.sh b/contrib/jenkins_tests/style.sh
index 6905177e2..90fd026f1 100755
--- a/contrib/jenkins_tests/style.sh
+++ b/contrib/jenkins_tests/style.sh
@@ -24,9 +24,9 @@ rm -rf $style_tap
 
 ln -sf $WORKSPACE/contrib/jenkins_tests/style.conf $WORKSPACE/.clang-format
 
-check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -name '*.cc' \))"
-check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -iname '*.cc' \))"
-check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! \( -name 'tap.h' -o -name 'gtest.h' -o -name 'gtest-all.cc' \) -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -iname '*.cc' \))"
+check_files="$(find $WORKSPACE/src/ !
-name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -name '*.cc' \))" +check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.cc' \))" +check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! \( -name 'tap.h' -o -name 'gtest.h' -o -name 'gtest-all.cc' \) -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.cc' \))" i=0 nerrors=0 diff --git a/contrib/jenkins_tests/tidy.sh b/contrib/jenkins_tests/tidy.sh index 4eba3f681..94b74d6d4 100755 --- a/contrib/jenkins_tests/tidy.sh +++ b/contrib/jenkins_tests/tidy.sh @@ -13,6 +13,8 @@ source $(dirname $0)/globals.sh echo "Checking for tidy ..." +git config --global --add safe.directory $WORKSPACE + cd $WORKSPACE rm -rf $tidy_dir @@ -58,9 +60,9 @@ if [ ! -e $WORKSPACE/.clang-format ]; then ln -sf $WORKSPACE/contrib/jenkins_tests/style.conf $WORKSPACE/.clang-format fi -check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" -check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" -check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! -name 'tap.h' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.cc' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! -name 'tap.h' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.cc' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" i=0 nerrors=0 diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index c00ffb42d..1d79c29e3 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -178,6 +178,8 @@ fi %files devel %dir %{_includedir}/mellanox %{_includedir}/mellanox/xlio_extra.h +%{_includedir}/mellanox/xlio_types.h +%{_includedir}/mellanox/xlio.h %if %{use_rel} > 0 %{_libdir}/%{name}-debug.so %endif @@ -187,7 +189,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Thu Jan 11 2024 NVIDIA CORPORATION 3.21.2-1 -- Bump version to 3.21.2 +* Tue Apr 9 2024 NVIDIA CORPORATION 3.30.5-1 +- Bump version to 3.30.5 - Please refer to CHANGES for full changelog. diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index ea8209144..60fd956b3 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -100,16 +100,15 @@ do_check_env # set predefined configuration settings and extra options # that depend on environment # + TARGET=${TARGET:=all} i=0 if [ "$TARGET" == "all" -o "$TARGET" == "default" ]; then - target_list[$i]="default: --disable-nginx" - i=$((i+1)) -fi -if [ "$TARGET" == "all" -o "$TARGET" == "dpcp" ]; then + export jenkins_target="default" + export prefix=${jenkins_test_custom_prefix}/${jenkins_target} do_check_dpcp opt_value if [ ! 
-z "${opt_value}" ]; then - target_list[$i]="dpcp: --enable-nginx --with-dpcp=${opt_value}" + target_list[$i]="default: --enable-nginx --with-dpcp=${opt_value}" i=$((i+1)) else echo "Requested dpcp support can not be executed" @@ -130,9 +129,11 @@ for target_v in "${target_list[@]}"; do ret=0 IFS=':' read target_name target_option <<< "$target_v" + export jenkins_target="${target_name}" + export prefix=${jenkins_test_custom_prefix}/${jenkins_target} export jenkins_test_artifacts="${WORKSPACE}/${prefix}/xlio-${BUILD_NUMBER}-${HOSTNAME}-${target_name}" export jenkins_test_custom_configure="${jenkins_test_custom_configure} ${target_option}" - export jenkins_target="${target_name}" + set +x echo "======================================================" echo " Checking for [${jenkins_target}] target" diff --git a/debian/copyright b/debian/copyright index 88bb20923..72d3f06dd 100644 --- a/debian/copyright +++ b/debian/copyright @@ -51,8 +51,6 @@ License: GPLv2 and 2BSD Files: src/core/lwip/def.h src/core/lwip/err.h - src/core/lwip/init.c - src/core/lwip/init.h src/core/lwip/ip_addr.h src/core/lwip/opt.h src/core/lwip/pbuf.c diff --git a/debian/libxlio-dev.install b/debian/libxlio-dev.install index 04f2925df..e52df90cc 100644 --- a/debian/libxlio-dev.install +++ b/debian/libxlio-dev.install @@ -1,2 +1,4 @@ usr/include/mellanox/xlio_extra.h +usr/include/mellanox/xlio_types.h +usr/include/mellanox/xlio.h libxlio-debug.so usr/lib diff --git a/src/core/Makefile.am b/src/core/Makefile.am index f19605b4d..996ac4a99 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -12,7 +12,7 @@ LEX_OUTPUT_ROOT=lex.libxlio_yy # as built) because we don't want it to be created by old version of flex/yacc # on some machines that will generate gcc warmings. # in case you change the *.l or *.y in the future - than change the commenting in the following 3 lines -#----- +# #BUILT_SOURCES += config_scanner.c config_parser.h config_parser.c #libconfig_parser_la_SOURCES += util/config_scanner.l util/config_parser.y libconfig_parser_la_SOURCES += config_scanner.c config_parser.c @@ -25,32 +25,38 @@ dist-hook: SUBDIRS = infra netlink EXTRA_DIST = \ - dev/cq_mgr.inl \ - dev/cq_mgr_mlx5.inl \ util/libxlio.conf sysconf_DATA = util/libxlio.conf otherincludedir = $(includedir)/mellanox -otherinclude_HEADERS = xlio_extra.h +otherinclude_HEADERS = \ + xlio.h \ + xlio_extra.h \ + xlio_types.h install-exec-hook: rm -f $(DESTDIR)$(libdir)/libxlio.la - rm -f $(DESTDIR)$(libdir)/libxlio.a rm -f $(DESTDIR)$(bindir)/state_machine_test rm -f $(DESTDIR)$(bindir)/vlogger_test + uninstall-hook: rm -f $(DESTDIR)$(libdir)/libxlio.so* + rm -f $(DESTDIR)$(libdir)/libxlio.a lib_LTLIBRARIES = libxlio.la AM_CPPFLAGS := \ -I$(top_srcdir)/src ${LIBNL_CFLAGS} -libxlio_la_LDFLAGS := -no-undefined -version-number @PRJ_LIBRARY_MAJOR@:@PRJ_LIBRARY_MINOR@:@PRJ_LIBRARY_REVISION@ +libxlio_la_CFLAGS = $(XLIO_STATIC_BUILD) $(XLIO_LTO) $(XLIO_PROFILE) +libxlio_la_CXXFLAGS = $(XLIO_STATIC_BUILD) $(XLIO_LTO) $(XLIO_PROFILE) + +libxlio_la_LDFLAGS := $(XLIO_LTO) $(XLIO_PROFILE) -no-undefined \ + -version-number @PRJ_LIBRARY_MAJOR@:@PRJ_LIBRARY_MINOR@:@PRJ_LIBRARY_REVISION@ libxlio_la_LIBADD = \ - -lrt -ldl -lpthread $(LIBNL_LIBS) $(VERBS_LIBS) $(DPCP_LIBS) \ + -lrt -ldl -lpthread $(LIBNL_LIBS) $(VERBS_LIBS) $(DPCP_LIBS) $(XLIO_GCOV) \ $(top_builddir)/src/utils/libutils.la \ $(top_builddir)/src/vlogger/libvlogger.la \ $(top_builddir)/src/state_machine/libstate_machine.la \ @@ -62,20 +68,19 @@ libxlio_la_LIBADD = \ libxlio_la_SOURCES := \ dev/allocator.cpp \ 
dev/buffer_pool.cpp \ - dev/cq_mgr.cpp \ - dev/cq_mgr_mlx5.cpp \ - dev/cq_mgr_mlx5_strq.cpp \ + dev/cq_mgr_rx.cpp \ + dev/cq_mgr_rx_regrq.cpp \ + dev/cq_mgr_rx_strq.cpp \ + dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ - dev/qp_mgr.cpp \ - dev/qp_mgr_eth_mlx5.cpp \ - dev/qp_mgr_eth_mlx5_dpcp.cpp \ + dev/hw_queue_tx.cpp \ + dev/hw_queue_rx.cpp \ dev/gro_mgr.cpp \ dev/rfs.cpp \ dev/rfs_uc.cpp \ dev/rfs_uc_tcp_gro.cpp \ dev/rfs_mc.cpp \ - dev/rfs_rule_ibv.cpp \ - dev/rfs_rule_dpcp.cpp \ + dev/rfs_rule.cpp \ dev/time_converter.cpp \ dev/time_converter_ptp.cpp \ dev/time_converter_rtc.cpp \ @@ -96,7 +101,8 @@ libxlio_la_SOURCES := \ \ event/delta_timer.cpp \ event/event_handler_manager.cpp \ - event/thread_local_event_handler.cpp \ + event/event_handler_manager_local.cpp \ + event/poll_group.cpp \ event/vlogger_timer_handler.cpp \ event/netlink_event.cpp \ \ @@ -118,7 +124,6 @@ libxlio_la_SOURCES := \ lwip/cc_lwip.c \ lwip/cc_cubic.c \ lwip/cc_none.c \ - lwip/init.c \ \ proto/ip_frag.cpp \ proto/flow_tuple.cpp \ @@ -142,14 +147,12 @@ libxlio_la_SOURCES := \ proto/header.cpp \ proto/arp.cpp \ \ + sock/sock_stats.cpp \ sock/sockinfo.cpp \ sock/sockinfo_udp.cpp \ sock/sockinfo_ulp.cpp \ sock/sockinfo_tcp.cpp \ - sock/tcp_seg_pool.cpp \ sock/fd_collection.cpp \ - sock/pipeinfo.cpp \ - sock/socket_fd_api.cpp \ sock/sock-redirect.cpp \ sock/sock-app.cpp \ sock/sock-extra.cpp \ @@ -171,9 +174,11 @@ libxlio_la_SOURCES := \ \ dev/allocator.h \ dev/buffer_pool.h \ - dev/cq_mgr.h \ - dev/cq_mgr_mlx5.h \ - dev/cq_mgr_mlx5_strq.h \ + dev/cq_mgr_rx.h \ + dev/cq_mgr_rx_inl.h \ + dev/cq_mgr_rx_regrq.h \ + dev/cq_mgr_rx_strq.h \ + dev/cq_mgr_tx.h \ dev/dm_mgr.h \ dev/gro_mgr.h \ dev/ib_ctx_handler_collection.h \ @@ -185,16 +190,13 @@ libxlio_la_SOURCES := \ dev/net_device_entry.h \ dev/net_device_table_mgr.h \ dev/net_device_val.h \ - dev/qp_mgr.h \ - dev/qp_mgr_eth_mlx5.h \ - dev/qp_mgr_eth_mlx5_dpcp.h \ + dev/hw_queue_rx.h \ + dev/hw_queue_tx.h \ dev/rfs.h \ dev/rfs_mc.h \ dev/rfs_uc.h \ dev/rfs_uc_tcp_gro.h \ dev/rfs_rule.h \ - dev/rfs_rule_ibv.h \ - dev/rfs_rule_dpcp.h \ dev/src_addr_selector.h \ dev/ring.h \ dev/ring_bond.h \ @@ -203,17 +205,18 @@ libxlio_la_SOURCES := \ dev/ring_tap.h \ dev/ring_allocation_logic.h \ dev/wqe_send_handler.h \ + dev/xlio_ti.h \ \ event/command.h \ event/delta_timer.h \ event/event.h \ event/event_handler_ibverbs.h \ event/event_handler_manager.h \ - event/thread_local_event_handler.h \ + event/event_handler_manager_local.h \ event/event_handler_rdma_cm.h \ event/netlink_event.h \ + event/poll_group.h \ event/timer_handler.h \ - event/timers_group.h \ event/vlogger_timer_handler.h \ \ ib/base/verbs_extra.h \ @@ -233,7 +236,6 @@ libxlio_la_SOURCES := \ lwip/cc.h \ lwip/def.h \ lwip/err.h \ - lwip/init.h \ lwip/ip_addr.h \ lwip/opt.h \ lwip/pbuf.h \ @@ -273,15 +275,11 @@ libxlio_la_SOURCES := \ \ sock/cleanable_obj.h \ sock/fd_collection.h \ - sock/pipeinfo.h \ - sock/pkt_rcvr_sink.h \ - sock/pkt_sndr_source.h \ - sock/socket_fd_api.h \ + sock/sock_stats.h \ sock/sockinfo.h \ sock/sockinfo_tcp.h \ sock/sockinfo_udp.h \ sock/sockinfo_ulp.h \ - sock/tcp_seg_pool.h \ sock/sock-redirect.h \ sock/sock-redirect-internal.h \ sock/sock-app.h \ @@ -295,6 +293,7 @@ libxlio_la_SOURCES := \ util/instrumentation.h \ util/libxlio.h \ util/list.h \ + util/cached_obj_pool.h \ util/sg_array.h \ util/ip_address.h \ util/sock_addr.h \ @@ -314,7 +313,9 @@ libxlio_la_SOURCES := \ \ config_parser.h \ main.h \ - xlio_extra.h + xlio.h \ + xlio_extra.h \ + xlio_types.h libxlio_la_DEPENDENCIES = \ 
$(top_builddir)/src/vlogger/libvlogger.la \ diff --git a/src/core/dev/allocator.cpp b/src/core/dev/allocator.cpp index 5696464b3..9af6613b9 100644 --- a/src/core/dev/allocator.cpp +++ b/src/core/dev/allocator.cpp @@ -40,9 +40,13 @@ #include "ib_ctx_handler_collection.h" #include "util/hugepage_mgr.h" #include "util/vtypes.h" +#include "xlio.h" #define MODULE_NAME "allocator" +// See description at the xlio_memory_cb_t definition. +xlio_memory_cb_t g_user_memory_cb = nullptr; + xlio_allocator::xlio_allocator() : xlio_allocator(nullptr, nullptr) { @@ -62,6 +66,7 @@ xlio_allocator::xlio_allocator(alloc_t alloc_func, free_t free_func) m_type = static_cast(safe_mce_sys().mem_alloc_type); m_data = nullptr; m_size = 0; + m_page_size = 0; m_memalloc = alloc_func; m_memfree = free_func; if (m_memalloc) { @@ -151,7 +156,7 @@ void *xlio_allocator::alloc_huge(size_t size) __log_info_dbg("Allocating %zu bytes in huge tlb using mmap", size); size_t actual_size = size; - m_data = g_hugepage_mgr.alloc_hugepages(actual_size); + m_data = g_hugepage_mgr.alloc_hugepages(actual_size, m_page_size); if (!m_data && g_hugepage_mgr.get_default_hugepage() && m_type == ALLOC_TYPE_HUGEPAGES) { // Print a warning message on allocation error if hugepages are supported // and this is not a fallback from a different allocation method. @@ -286,7 +291,7 @@ bool xlio_registrator::register_memory(void *data, size_t size, ib_ctx_handler * return lkey != LKEY_ERROR; } - // Path for all ib contextes + // Path for all ib contexts ib_context_map_t *ib_ctx_map = g_p_ib_ctx_handler_collection->get_ib_cxt_list(); if (likely(ib_ctx_map)) { for (const auto &ib_ctx_key_val : *ib_ctx_map) { @@ -500,6 +505,10 @@ bool xlio_heap::expand(size_t size /*=0*/) m_blocks.push_back(block); m_latest_offset = 0; + if (m_b_hw && g_user_memory_cb) { + g_user_memory_cb(data, size, block->page_size()); + } + return true; error: diff --git a/src/core/dev/allocator.h b/src/core/dev/allocator.h index 39a749947..9e27693cf 100644 --- a/src/core/dev/allocator.h +++ b/src/core/dev/allocator.h @@ -61,8 +61,9 @@ class xlio_allocator { void dealloc(); - inline size_t size() { return m_size; } - inline void *data() { return m_data; } + size_t size() { return m_size; } + size_t page_size() { return m_page_size; } + void *data() { return m_data; } private: void print_hugepages_warning(size_t requested_size); @@ -71,6 +72,7 @@ class xlio_allocator { alloc_mode_t m_type; void *m_data; size_t m_size; + size_t m_page_size; private: alloc_t m_memalloc; diff --git a/src/core/dev/buffer_pool.cpp b/src/core/dev/buffer_pool.cpp index 5b276bd32..6176c32ea 100644 --- a/src/core/dev/buffer_pool.cpp +++ b/src/core/dev/buffer_pool.cpp @@ -47,22 +47,22 @@ // When Striding RQ is on, it points to g_buffer_pool_rx_stride since the upper layers work with // strides. When Striding RQ is off, it points to g_buffer_pool_rx_rwqe since the upper layers work // with RWQEs buffers themselves. -buffer_pool *g_buffer_pool_rx_ptr = NULL; +buffer_pool *g_buffer_pool_rx_ptr = nullptr; // This buffer-pool holds buffer descriptors which represent strides in strided RWQEs. // These buffers descriptos do not actually own a buffer. // Each such descriptor points into a portion of a buffer of a g_buffer_pool_rx_rwqe descriptor. -buffer_pool *g_buffer_pool_rx_stride = NULL; +buffer_pool *g_buffer_pool_rx_stride = nullptr; // This buffer-pool holds the actual buffers for receive WQEs. 
-buffer_pool *g_buffer_pool_rx_rwqe = NULL; +buffer_pool *g_buffer_pool_rx_rwqe = nullptr; // This buffer-pool holds the actual buffers for send WQEs. -buffer_pool *g_buffer_pool_tx = NULL; +buffer_pool *g_buffer_pool_tx = nullptr; // This buffer-pool holds buffer descriptors for zero copy TX. // These buffer descriptors do not actually own a buffer. -buffer_pool *g_buffer_pool_zc = NULL; +buffer_pool *g_buffer_pool_zc = nullptr; // inlining a function only help in case it come before using it... inline void buffer_pool::put_buffer_helper(mem_buf_desc_t *buff) @@ -74,15 +74,15 @@ inline void buffer_pool::put_buffer_helper(mem_buf_desc_t *buff) } #endif - if (buff->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_STRIDE) { - mem_buf_desc_t *rwqe = reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + if (buff->lwip_pbuf.desc.attr == PBUF_DESC_STRIDE) { + mem_buf_desc_t *rwqe = reinterpret_cast(buff->lwip_pbuf.desc.mdesc); if (buff->rx.strides_num == rwqe->add_ref_count(-buff->rx.strides_num)) { // Is last stride. g_buffer_pool_rx_rwqe->put_buffers_thread_safe(rwqe); } } buff->p_next_desc = m_p_head; - assert(buff->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY || this == g_buffer_pool_zc || + assert(buff->lwip_pbuf.type != PBUF_ZEROCOPY || this == g_buffer_pool_zc || g_buffer_pool_zc == NULL); free_lwip_pbuf(&buff->lwip_pbuf); m_p_head = buff; @@ -281,7 +281,7 @@ bool buffer_pool::get_buffers_thread_safe(descq_t &pDeque, ring_slave *desc_owne // Remove from list head = m_p_head; m_p_head = m_p_head->p_next_desc; - head->p_next_desc = NULL; + head->p_next_desc = nullptr; // Init head->lkey = lkey; @@ -490,7 +490,7 @@ void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) std::lock_guard lock(m_lock); while (!pDeque->empty()) { mem_buf_desc_t *list = pDeque->get_and_pop_front(); - if (list->dec_ref_count() <= 1 && (list->lwip_pbuf.pbuf.ref-- <= 1)) { + if (likely(list) && list->dec_ref_count() <= 1 && (list->lwip_pbuf.ref-- <= 1)) { put_buffers(list); } } @@ -498,7 +498,7 @@ void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) void buffer_pool::put_buffer_after_deref_thread_safe(mem_buf_desc_t *buff) { - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { std::lock_guard lock(m_lock); put_buffers(buff); } diff --git a/src/core/dev/buffer_pool.h b/src/core/dev/buffer_pool.h index eba576d7d..2f0a133d4 100644 --- a/src/core/dev/buffer_pool.h +++ b/src/core/dev/buffer_pool.h @@ -49,26 +49,22 @@ enum buffer_pool_type { BUFFER_POOL_TX, }; -inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) +inline static void free_lwip_pbuf(struct pbuf *lwip_pbuf) { - mem_buf_desc_t *p_desc = (mem_buf_desc_t *)pbuf_custom; + mem_buf_desc_t *p_desc = reinterpret_cast(lwip_pbuf); - if (pbuf_custom->pbuf.desc.attr == PBUF_DESC_MDESC || - pbuf_custom->pbuf.desc.attr == PBUF_DESC_NVME_TX) { - mem_desc *mdesc = (mem_desc *)pbuf_custom->pbuf.desc.mdesc; + if (lwip_pbuf->desc.attr == PBUF_DESC_MDESC || lwip_pbuf->desc.attr == PBUF_DESC_NVME_TX) { + mem_desc *mdesc = reinterpret_cast(lwip_pbuf->desc.mdesc); mdesc->put(); - } else if ((pbuf_custom->pbuf.type == PBUF_ZEROCOPY) && - (pbuf_custom->pbuf.desc.attr == PBUF_DESC_MAP)) { - mapping_t *mapping = (mapping_t *)pbuf_custom->pbuf.desc.map; - mapping->put(); } - if (p_desc->m_flags & mem_buf_desc_t::ZCOPY) { + if (p_desc->m_flags & mem_buf_desc_t::CALLBACK) { p_desc->tx.zc.callback(p_desc); } - pbuf_custom->pbuf.flags = 0; - pbuf_custom->pbuf.ref = 0; - 
pbuf_custom->pbuf.desc.attr = PBUF_DESC_NONE; + p_desc->m_flags = 0; + lwip_pbuf->flags = 0; + lwip_pbuf->ref = 0; + lwip_pbuf->desc.attr = PBUF_DESC_NONE; } /** diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr.cpp deleted file mode 100644 index baf8b8f90..000000000 --- a/src/core/dev/cq_mgr.cpp +++ /dev/null @@ -1,990 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "cq_mgr.h" -#include "cq_mgr.inl" -#include -#include -#include -#include - -#include "utils/bullseye.h" -#include -#include -#include "util/instrumentation.h" -#include -#include -#include "ib/base/verbs_extra.h" - -#include "buffer_pool.h" -#include "qp_mgr.h" -#include "ring_simple.h" - -#define MODULE_NAME "cqm" - -#define cq_logpanic __log_info_panic -#define cq_logerr __log_info_err -#define cq_logwarn __log_info_warn -#define cq_loginfo __log_info_info -#define cq_logdbg __log_info_dbg -#define cq_logfunc __log_info_func -#define cq_logfuncall __log_info_funcall - -#define cq_logdbg_no_funcname(log_fmt, log_args...) 
\ - do { \ - if (g_vlogger_level >= VLOG_DEBUG) \ - vlog_printf(VLOG_DEBUG, MODULE_NAME "[%p]:%d: " log_fmt "\n", __INFO__, __LINE__, \ - ##log_args); \ - } while (0) - -atomic_t cq_mgr::m_n_cq_id_counter = ATOMIC_INIT(1); - -uint64_t cq_mgr::m_n_global_sn = 0; - -cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config) - : m_p_ibv_cq(NULL) - , m_b_is_rx(is_rx) - , m_cq_id(0) - , m_n_cq_poll_sn(0) - , m_p_ring(p_ring) - , m_n_wce_counter(0) - , m_b_was_drained(false) - , m_b_is_rx_hw_csum_on(false) - , m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) - , m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) - , m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default (on rx cq get shared - // memory stats) - , m_transport_type(m_p_ring->get_transport_type()) - , m_p_next_rx_desc_poll(NULL) - , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) - , m_n_sysvar_rx_prefetch_bytes(safe_mce_sys().rx_prefetch_bytes) - , m_sz_transport_header(0) - , m_p_ib_ctx_handler(p_ib_ctx_handler) - , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) - , m_rx_buffs_rdy_for_free_head(NULL) - , m_rx_buffs_rdy_for_free_tail(NULL) - , m_comp_event_channel(p_comp_event_channel) - , m_b_notification_armed(false) - , m_n_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) - , m_rx_lkey(g_buffer_pool_rx_rwqe->find_lkey_by_ib_ctx_thread_safe(m_p_ib_ctx_handler)) - , m_b_sysvar_cq_keep_qp_full(safe_mce_sys().cq_keep_qp_full) - , m_n_out_of_free_bufs_warning(0) -{ - BULLSEYE_EXCLUDE_BLOCK_START - if (m_rx_lkey == 0) { - __log_info_panic("invalid lkey found %u", m_rx_lkey); - } - BULLSEYE_EXCLUDE_BLOCK_END - - memset(&m_cq_stat_static, 0, sizeof(m_cq_stat_static)); - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); - m_rx_queue.set_id("cq_mgr (%p) : m_rx_queue", this); - m_rx_pool.set_id("cq_mgr (%p) : m_rx_pool", this); - m_cq_id = atomic_fetch_and_inc(&m_n_cq_id_counter); // cq id is nonzero - if (config) { - configure(cq_size); - } -} - -void cq_mgr::configure(int cq_size) -{ - xlio_ibv_cq_init_attr attr; - memset(&attr, 0, sizeof(attr)); - - prep_ibv_cq(attr); - - struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); - int comp_vector = 0; -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - /* - * For some scenario with forking usage we may want to distribute CQs across multiple - * CPUs to improve CPS in case of multiple processes. 
- */ - if (safe_mce_sys().app.distribute_cq_interrupts && g_p_app->get_worker_id() >= 0) { - comp_vector = g_p_app->get_worker_id() % context->num_comp_vectors; - } -#endif - m_p_ibv_cq = xlio_ibv_create_cq(context, cq_size - 1, (void *)this, m_comp_event_channel, - comp_vector, &attr); - BULLSEYE_EXCLUDE_BLOCK_START - if (!m_p_ibv_cq) { - throw_xlio_exception("ibv_create_cq failed"); - } - BULLSEYE_EXCLUDE_BLOCK_END - VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); - switch (m_transport_type) { - case XLIO_TRANSPORT_ETH: - m_sz_transport_header = ETH_HDR_LEN; - break; - BULLSEYE_EXCLUDE_BLOCK_START - default: - cq_logpanic("Unknown transport type: %d", m_transport_type); - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - if (m_b_is_rx) { - xlio_stats_instance_create_cq_block(m_p_cq_stat); - } - - if (m_b_is_rx) { - m_b_is_rx_hw_csum_on = - xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); - } - - cq_logdbg("Created CQ as %s with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", - (m_b_is_rx ? "Rx" : "Tx"), get_channel_fd(), cq_size, m_p_ibv_cq); -} - -void cq_mgr::prep_ibv_cq(xlio_ibv_cq_init_attr &attr) const -{ - if (m_p_ib_ctx_handler->get_ctx_time_converter_status()) { - xlio_ibv_cq_init_ts_attr(&attr); - } -} - -uint32_t cq_mgr::clean_cq() -{ - uint32_t ret_total = 0; - int ret = 0; - uint64_t cq_poll_sn = 0; - mem_buf_desc_t *buff = NULL; - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { - for (int i = 0; i < ret; i++) { - if (m_b_is_rx) { - buff = cqe_process_rx(&wce[i]); - } else { - buff = cqe_log_and_get_buf_tx(&wce[i]); - } - if (buff) { - m_rx_queue.push_back(buff); - } - } - ret_total += ret; - } - - return ret_total; -} - -cq_mgr::~cq_mgr() -{ - cq_logfunc(""); - cq_logdbg("destroying CQ as %s", (m_b_is_rx ? 
"Rx" : "Tx")); - - if (m_rx_buffs_rdy_for_free_head) { - reclaim_recv_buffers(m_rx_buffs_rdy_for_free_head); - } - - m_b_was_drained = true; - if (m_rx_queue.size() + m_rx_pool.size()) { - cq_logdbg("Returning %lu buffers to global Rx pool (ready queue %lu, free pool %lu))", - m_rx_queue.size() + m_rx_pool.size(), m_rx_queue.size(), m_rx_pool.size()); - - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_queue, m_rx_queue.size()); - m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - } - - cq_logfunc("destroying ibv_cq"); - IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_p_ibv_cq), EIO) - { - cq_logdbg("destroy cq failed (errno=%d %m)", errno); - } - ENDIF_VERBS_FAILURE; - VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); - - statistics_print(); - if (m_b_is_rx) { - xlio_stats_instance_remove_cq_block(m_p_cq_stat); - } - - cq_logdbg("done"); -} - -void cq_mgr::statistics_print() -{ - if (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || - m_p_cq_stat->n_rx_drained_at_once_max || m_p_cq_stat->n_buffer_pool_len) { - cq_logdbg_no_funcname("Packets dropped: %12llu", - (unsigned long long int)m_p_cq_stat->n_rx_pkt_drop); - cq_logdbg_no_funcname("Drained max: %17u", m_p_cq_stat->n_rx_drained_at_once_max); - cq_logdbg_no_funcname("CQE errors: %18llu", - (unsigned long long int)m_p_cq_stat->n_rx_cqe_error); - } -} - -ibv_cq *cq_mgr::get_ibv_cq_hndl() -{ - return m_p_ibv_cq; -} - -int cq_mgr::get_channel_fd() -{ - return m_comp_event_channel->fd; -} - -void cq_mgr::add_qp_rx(qp_mgr *qp) -{ - cq_logdbg("qp_mgr=%p", qp); - descq_t temp_desc_list; - temp_desc_list.set_id("cq_mgr (%p) : temp_desc_list", this); - - m_p_cq_stat->n_rx_drained_at_once_max = 0; - - /* return_extra_buffers(); */ // todo?? - - // Initial fill of receiver work requests - uint32_t qp_rx_wr_num = qp->get_rx_max_wr_num(); - cq_logdbg("Trying to push %d WRE to allocated qp (%p)", qp_rx_wr_num, qp); - while (qp_rx_wr_num) { - uint32_t n_num_mem_bufs = m_n_sysvar_rx_num_wr_to_post_recv; - if (n_num_mem_bufs > qp_rx_wr_num) { - n_num_mem_bufs = qp_rx_wr_num; - } - bool res = g_buffer_pool_rx_rwqe->get_buffers_thread_safe(temp_desc_list, m_p_ring, - n_num_mem_bufs, m_rx_lkey); - if (!res) { - VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( - VLOG_WARNING, VLOG_DEBUG, - "WARNING Out of mem_buf_desc from Rx buffer pool for qp_mgr qp_mgr initialization " - "(qp=%p),\n" - "\tThis might happen due to wrong setting of XLIO_RX_BUFS and XLIO_RX_WRE. 
Please " - "refer to README.txt for more info", - qp); - break; - } - - qp->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); - if (!temp_desc_list.empty()) { - cq_logdbg("qp post recv is already full (push=%d, planned=%d)", - qp->get_rx_max_wr_num() - qp_rx_wr_num, qp->get_rx_max_wr_num()); - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&temp_desc_list, temp_desc_list.size()); - break; - } - qp_rx_wr_num -= n_num_mem_bufs; - } - cq_logdbg("Successfully post_recv qp with %d new Rx buffers (planned=%d)", - qp->get_rx_max_wr_num() - qp_rx_wr_num, qp->get_rx_max_wr_num()); - - // Add qp_mgr to map - m_qp_rec.qp = qp; - m_qp_rec.debt = 0; -} - -void cq_mgr::del_qp_rx(qp_mgr *qp) -{ - BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp_rec.qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp_rec.qp=%p", qp, m_qp_rec.qp); - return; - } - BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp_rec.qp); - return_extra_buffers(); - - clean_cq(); - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); -} - -void cq_mgr::add_qp_tx(qp_mgr *qp) -{ - // Assume locked! - cq_logdbg("qp_mgr=%p", qp); - m_qp_rec.qp = qp; - m_qp_rec.debt = 0; -} - -void cq_mgr::del_qp_tx(qp_mgr *qp) -{ - BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp_rec.qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp_rec.qp=%p", qp, m_qp_rec.qp); - return; - } - BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp_rec.qp); - - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); -} - -bool cq_mgr::request_more_buffers() -{ - cq_logfuncall("Allocating additional %d buffers for internal use", - m_n_sysvar_qp_compensation_level); - - // Assume locked! - // Add an additional free buffer descs to RX cq mgr - bool res = g_buffer_pool_rx_rwqe->get_buffers_thread_safe( - m_rx_pool, m_p_ring, m_n_sysvar_qp_compensation_level, m_rx_lkey); - if (!res) { - cq_logfunc("Out of mem_buf_desc from RX free pool for internal object pool"); - return false; - }; - - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - return true; -} - -void cq_mgr::return_extra_buffers() -{ - if (m_rx_pool.size() < m_n_sysvar_qp_compensation_level * 2) { - return; - } - int buff_to_rel = m_rx_pool.size() - m_n_sysvar_qp_compensation_level; - - cq_logfunc("releasing %d buffers to global rx pool", buff_to_rel); - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); -} - -int cq_mgr::poll(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! 
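// Illustrative sketch (not from the original patch): add_qp_rx() above fills the receive queue
// in batches of m_n_sysvar_rx_num_wr_to_post_recv, stopping early when the global buffer pool
// runs dry or the QP refuses more WRs. A minimal self-contained version of that batch-posting
// loop; the pool/post callbacks and descriptor handling are hypothetical stand-ins, not the
// real buffer_pool/qp_mgr API.
#include <algorithm>
#include <cstdint>
#include <deque>
#include <functional>

static uint32_t initial_rx_fill(uint32_t max_wr, uint32_t post_batch,
                                std::function<bool(std::deque<void *> &, uint32_t)> get_buffers,
                                std::function<void(std::deque<void *> &)> post_recv)
{
    uint32_t remaining = max_wr;
    while (remaining) {
        uint32_t batch = std::min(post_batch, remaining);
        std::deque<void *> tmp;
        if (!get_buffers(tmp, batch)) {
            break; // global pool exhausted, keep whatever was posted so far
        }
        post_recv(tmp); // consumes what it can, leaves the rest in tmp
        if (!tmp.empty()) {
            break; // receive queue already full, leftovers go back to the pool
        }
        remaining -= batch;
    }
    return max_wr - remaining; // WRs actually posted
}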
- cq_logfuncall(""); - - int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); - if (ret <= 0) { - // Zero polled wce OR ibv_poll_cq() has driver specific errors - // so we can't really do anything with them - *p_cq_poll_sn = m_n_global_sn; - return 0; - } - - if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { - for (int i = 0; i < ret; i++) { - cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " - "byte_len=%d, imm_data=%x", - i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), - p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); - cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " - "dlid_path_bits=%x", - p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), - p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); - } - } - - // spoil the global sn if we have packets ready - union __attribute__((packed)) { - uint64_t global_sn; - struct { - uint32_t cq_id; - uint32_t cq_sn; - } bundle; - } next_sn; - next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; - - *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; - - return ret; -} - -void cq_mgr::process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce) -{ - BULLSEYE_EXCLUDE_BLOCK_START - // wce with bad status value - if (p_wce->status == IBV_WC_SUCCESS) { - cq_logdbg("wce: wr_id=%#lx, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, - p_wce->status, p_wce->vendor_err, p_wce->qp_num); - if (m_b_is_rx_hw_csum_on && !xlio_wc_rx_hw_csum_ok(*p_wce)) { - cq_logdbg("wce: bad rx_csum"); - } - cq_logdbg("wce: opcode=%#x, byte_len=%u, src_qp=%#x, wc_flags=%#lx", xlio_wc_opcode(*p_wce), - p_wce->byte_len, p_wce->src_qp, (unsigned long)xlio_wc_flags(*p_wce)); - cq_logdbg("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", - p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, - p_wce->imm_data); - if (p_mem_buf_desc) { - cq_logdbg("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%lu", p_mem_buf_desc->lkey, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); - } - } else if (p_wce->status != IBV_WC_WR_FLUSH_ERR) { - cq_logwarn("wce: wr_id=%#lx, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, - p_wce->status, p_wce->vendor_err, p_wce->qp_num); - cq_loginfo("wce: opcode=%#x, byte_len=%u, src_qp=%#x, wc_flags=%#lx", - xlio_wc_opcode(*p_wce), p_wce->byte_len, p_wce->src_qp, - (unsigned long)xlio_wc_flags(*p_wce)); - cq_loginfo("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", - p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, - p_wce->imm_data); - - m_p_cq_stat->n_rx_cqe_error++; - if (p_mem_buf_desc) { - cq_logwarn("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%lu", p_mem_buf_desc->lkey, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); - } - } - BULLSEYE_EXCLUDE_BLOCK_END - - cq_logfunc("wce error status '%s' [%d] (wr_id=%p, qp_num=%x)", - priv_ibv_wc_status_str(p_wce->status), p_wce->status, p_wce->wr_id, p_wce->qp_num); -} - -mem_buf_desc_t *cq_mgr::cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce) -{ - // Assume locked!!! - cq_logfuncall(""); - - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)(uintptr_t)p_wce->wr_id; - if (unlikely(p_wce->status != IBV_WC_SUCCESS)) { - process_cq_element_log_helper(p_mem_buf_desc, p_wce); - } - return p_mem_buf_desc; -} - -mem_buf_desc_t *cq_mgr::cqe_process_rx(xlio_ibv_wc *p_wce) -{ - // Assume locked!!! 
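// Illustrative sketch (not from the original patch): poll() above advertises progress by
// packing the CQ id and a per-CQ poll counter into a single 64-bit "global" sequence number,
// so request_notification() can detect pending completions with one comparison. The same
// encoding, written out independently of the cq_mgr types:
#include <cstdint>

union poll_sn_t {
    uint64_t global_sn;
    struct {
        uint32_t cq_id;
        uint32_t cq_sn;
    } bundle;
};

static uint64_t encode_poll_sn(uint32_t cq_id, uint32_t cq_sn)
{
    poll_sn_t sn;
    sn.bundle.cq_id = cq_id;
    sn.bundle.cq_sn = cq_sn;
    return sn.global_sn; // "anything new?" later reduces to (stored_sn != current global_sn)
}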
- cq_logfuncall(""); - - // Get related mem_buf_desc pointer from the wr_id - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)(uintptr_t)p_wce->wr_id; - - bool bad_wce = p_wce->status != IBV_WC_SUCCESS; - - if (unlikely(bad_wce || p_mem_buf_desc == NULL)) { - if (p_mem_buf_desc == NULL) { - m_p_next_rx_desc_poll = NULL; - cq_logdbg("wce->wr_id = 0!!! When status == IBV_WC_SUCCESS"); - return NULL; - } - - process_cq_element_log_helper(p_mem_buf_desc, p_wce); - - m_p_next_rx_desc_poll = NULL; - - if (p_mem_buf_desc == NULL) { - cq_logdbg("wce->wr_id = 0!!! When status != IBV_WC_SUCCESS"); - return NULL; - } - if (p_mem_buf_desc->p_desc_owner) { - reclaim_recv_buffer_helper(p_mem_buf_desc); - return NULL; - } - // AlexR: can this wce have a valid mem_buf_desc pointer? - // AlexR: are we throwing away a data buffer and a mem_buf_desc element? - cq_logdbg("no desc_owner(wr_id=%lu, qp_num=%x)", p_wce->wr_id, p_wce->qp_num); - return NULL; - } - - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - /*for debug: - if (m_p_next_rx_desc_poll && m_p_next_rx_desc_poll != p_mem_buf_desc) { - cq_logerr("prefetched wrong buffer"); - }*/ - m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; - p_mem_buf_desc->p_prev_desc = NULL; - } - - p_mem_buf_desc->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && xlio_wc_rx_hw_csum_ok(*p_wce)); - - if (likely(xlio_wc_opcode(*p_wce) & XLIO_IBV_WC_RECV)) { - // Save recevied total bytes - p_mem_buf_desc->sz_data = p_wce->byte_len; - - // we use context to verify that on reclaim rx buffer path we return the buffer to the right - // CQ - p_mem_buf_desc->rx.is_xlio_thr = false; - p_mem_buf_desc->rx.context = this; - - // this is not a deadcode if timestamping is defined in verbs API - // coverity[dead_error_condition] - if (xlio_wc_flags(*p_wce) & XLIO_IBV_WC_WITH_TIMESTAMP) { - p_mem_buf_desc->rx.timestamps.hw_raw = xlio_wc_timestamp(*p_wce); - } - - VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); - - prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, - std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, - (size_t)m_n_sysvar_rx_prefetch_bytes)); - // prefetch((uint8_t*)p_mem_buf_desc->p_buffer + m_sz_transport_header); - } - - return p_mem_buf_desc; -} - -bool cq_mgr::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) -{ - // Assume locked!!! - // Compensate QP for all completions that we found - if (m_rx_pool.size() || request_more_buffers()) { - size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); - m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); - m_qp_rec.debt -= buffers; - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - } else if (m_b_sysvar_cq_keep_qp_full || - m_qp_rec.debt + MCE_MAX_CQ_POLL_BATCH > (int)m_qp_rec.qp->m_rx_num_wr) { - m_p_cq_stat->n_rx_pkt_drop++; - m_qp_rec.qp->post_recv_buffer(buff_cur); - --m_qp_rec.debt; - return true; - } - - return false; -} - -void cq_mgr::compensate_qp_poll_failed() -{ - // Assume locked!!! 
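// Illustrative sketch (not from the original patch): cqe_process_rx() above relies on the
// convention that the 64-bit wr_id posted with each receive WQE is simply the address of the
// owning descriptor, so a completion maps back to its buffer with a cast and no lookup table.
// Tiny round-trip check; rx_desc is a stand-in for mem_buf_desc_t.
#include <cassert>
#include <cstddef>
#include <cstdint>

struct rx_desc {
    char *payload;
    size_t len;
};

static uint64_t to_wr_id(rx_desc *d) { return (uint64_t)(uintptr_t)d; }
static rx_desc *from_wr_id(uint64_t wr_id) { return (rx_desc *)(uintptr_t)wr_id; }

static void wr_id_roundtrip_check()
{
    rx_desc d {nullptr, 0};
    assert(from_wr_id(to_wr_id(&d)) == &d); // the invariant poll()/cqe_process_rx() depend on
}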
- // Compensate QP for all completions debt - if (m_qp_rec.debt) { - if (likely(m_rx_pool.size() || request_more_buffers())) { - size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); - m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); - m_qp_rec.debt -= buffers; - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - } - } -} - -void cq_mgr::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) -{ - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { - if (likely(buff->p_desc_owner == m_p_ring)) { - mem_buf_desc_t *temp = NULL; - while (buff) { - VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; - temp = buff; - assert(temp->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY); - buff = temp->p_next_desc; - temp->clear_transport_data(); - temp->p_next_desc = NULL; - temp->p_prev_desc = NULL; - temp->reset_ref_count(); - free_lwip_pbuf(&temp->lwip_pbuf); - m_rx_pool.push_back(temp); - } - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - } else { - cq_logfunc("Buffer returned to wrong CQ"); - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(buff); - } - } -} - -void cq_mgr::process_tx_buffer_list(mem_buf_desc_t *p_mem_buf_desc) -{ - // Assume locked!!! - BULLSEYE_EXCLUDE_BLOCK_START - if (p_mem_buf_desc && - (p_mem_buf_desc->p_desc_owner == - m_p_ring /*|| m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)*/)) { - m_p_ring->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); - /* if decided to free buffers of another ring here, need to modify return_to_owner to check - * owner and return to gpool. */ - } else if (p_mem_buf_desc && m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)) { - cq_logerr("got buffer of wrong owner, high-availability event? buf=%p, owner=%p", - p_mem_buf_desc, p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); - /* if decided to free buffers here, remember its a list and need to deref members. */ - // p_mem_buf_desc->p_desc_owner->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); /* this - // can cause a deadlock between rings, use trylock? */ - } else { - cq_logerr("got buffer of wrong owner, buf=%p, owner=%p", p_mem_buf_desc, - p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); - } - BULLSEYE_EXCLUDE_BLOCK_END -} - -// This method is called when ring release returns unposted buffers. -void cq_mgr::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array /*=NULL*/) -{ - cq_logfuncall(""); - NOT_IN_USE(pv_fd_ready_array); - cq_mgr::reclaim_recv_buffer_helper(p_mem_buf_desc); -} - -int cq_mgr::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) -{ - // Assume locked!!! 
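// Illustrative sketch (not from the original patch): reclaim_recv_buffer_helper() above
// recycles an entire chain of descriptors linked through p_next_desc once both reference
// counts drop, resetting each element and parking it in the local m_rx_pool instead of the
// global pool. A simplified version of that walk-reset-return loop with stand-in types:
#include <vector>

struct chain_desc {
    chain_desc *next = nullptr;
    void reset() { next = nullptr; /* clear transport data, ref counts, pbuf state, ... */ }
};

static void reclaim_chain(chain_desc *head, std::vector<chain_desc *> &local_pool)
{
    while (head) {
        chain_desc *cur = head;
        head = cur->next;
        cur->reset();
        local_pool.push_back(cur); // later re-posted to the RQ without touching the global pool
    }
}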
- cq_logfuncall(""); - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - - int ret; - uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); - if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - return ret_rx_processed; - } - - if (m_p_next_rx_desc_poll) { - prefetch_range((uint8_t *)m_p_next_rx_desc_poll->p_buffer, - m_n_sysvar_rx_prefetch_bytes_before_poll); - } - - ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); - if (ret > 0) { - m_n_wce_counter += ret; - if (ret < (int)m_n_sysvar_cq_poll_batch_max) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_process_rx((&wce[i])); - if (buff) { - if (xlio_wc_opcode(wce[i]) & XLIO_IBV_WC_RECV) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, pv_fd_ready_array); - } - } - } - } - ret_rx_processed += ret; - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - } else { - compensate_qp_poll_failed(); - } - - return ret_rx_processed; -} - -int cq_mgr::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! - cq_logfuncall(""); - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - int ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); - if (ret > 0) { - m_n_wce_counter += ret; - if (ret < (int)m_n_sysvar_cq_poll_batch_max) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_log_and_get_buf_tx((&wce[i])); - if (buff) { - process_tx_buffer_list(buff); - } - } - } - - return ret; -} - -bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) -{ - if (m_rx_buffs_rdy_for_free_head) { - reclaim_recv_buffer_helper(m_rx_buffs_rdy_for_free_head); - m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = NULL; - } - reclaim_recv_buffer_helper(rx_reuse_lst); - return_extra_buffers(); - - return true; -} - -bool cq_mgr::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) -{ - if (likely(rx_reuse_lst)) { - reclaim_recv_buffer_helper(rx_reuse_lst); - return true; - } - return false; -} - -int cq_mgr::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) -{ - int ret_val = 0; - - ret_val = rx_reuse->lwip_pbuf_dec_ref_count(); - if ((ret_val == 0) && (rx_reuse->get_ref_count() <= 0)) { - /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { - m_lock_ring_rx.lock(); - }*/ - if (!m_rx_buffs_rdy_for_free_head) { - m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = rx_reuse; - } else { - m_rx_buffs_rdy_for_free_tail->p_next_desc = rx_reuse; - m_rx_buffs_rdy_for_free_tail = rx_reuse; - } - m_rx_buffs_rdy_for_free_tail->p_next_desc = NULL; - /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { - m_lock_ring_rx.lock(); - }*/ - } - return ret_val; -} - -bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) -{ - cq_logfuncall(""); - // Called from outside cq_mgr context which is not locked!! - while (!rx_reuse->empty()) { - mem_buf_desc_t *buff = rx_reuse->get_and_pop_front(); - reclaim_recv_buffer_helper(buff); - } - return_extra_buffers(); - - return true; -} - -// -// @OUT: p_recycle_buffers_last_wr_id Returns the final WR_ID handled. When set, this indicates -// this is a CQE drain flow. -// @OUT: returns total number of processes CQE's -// - -int cq_mgr::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) -{ - cq_logfuncall("cq was %s drained. 
%d processed wce since last check. %d strides in m_rx_queue", - (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); - - // CQ polling loop until max wce limit is reached for this interval or CQ is drained - uint32_t ret_total = 0; - uint64_t cq_poll_sn = 0; - - /* drain_and_proccess() is mainly called in following cases as - * Internal thread: - * Frequency of real polling can be controlled by - * XLIO_PROGRESS_ENGINE_INTERVAL and XLIO_PROGRESS_ENGINE_WCE_MAX. - * socketxtreme: - * User does socketxtreme_poll() - * Cleanup: - * QP down logic to release rx buffers should force polling to do this. - * Not null argument indicates one. - */ - while (((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && (!m_b_was_drained)) || - (p_recycle_buffers_last_wr_id)) { - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - int ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn); - if (ret <= 0) { - m_b_was_drained = true; - m_p_ring->m_gro_mgr.flush_all(NULL); - return ret_total; - } - - m_n_wce_counter += ret; - if (ret < MCE_MAX_CQ_POLL_BATCH) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_process_rx(&wce[i]); - if (buff) { - if (p_recycle_buffers_last_wr_id) { - m_p_cq_stat->n_rx_pkt_drop++; - reclaim_recv_buffer_helper(buff); - } else { - bool procces_now = false; - if (m_transport_type == XLIO_TRANSPORT_ETH) { - procces_now = is_eth_tcp_frame(buff); - } - // We process immediately all non udp/ip traffic.. - if (procces_now) { - buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, NULL); - } - } else { // udp/ip traffic we just put in the cq's rx queue - m_rx_queue.push_back(buff); - mem_buf_desc_t *buff_cur = m_rx_queue.get_and_pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff_cur)) { - m_rx_queue.push_front(buff_cur); - } - } - } - } - if (p_recycle_buffers_last_wr_id) { - *p_recycle_buffers_last_wr_id = (uintptr_t)wce[i].wr_id; - } - } - ret_total += ret; - } - m_p_ring->m_gro_mgr.flush_all(NULL); - - m_n_wce_counter = 0; - m_b_was_drained = false; - - // Update cq statistics - m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - m_p_cq_stat->n_rx_drained_at_once_max = - std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); - - return ret_total; -} - -// 1 -> busy -// 0 -> ok -// -1 -> error -int cq_mgr::ack_and_request_notification() -{ - int res, cq_ev_count = 0; - ibv_cq *ib_cq; - void *cq_context; - do { - res = ibv_get_cq_event(m_comp_event_channel, &ib_cq, &cq_context); - if (res == 0) { - ++cq_ev_count; - } - } while (res == 0); - if (errno != EAGAIN) { - return -1; - } - if (cq_ev_count > 0) { - get_cq_event(cq_ev_count); - ibv_ack_cq_events(m_p_ibv_cq, cq_ev_count); - return 1; - } - IF_VERBS_FAILURE(req_notify_cq()) - { - cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); - return -1; - } - ENDIF_VERBS_FAILURE - return 0; -} - -int cq_mgr::request_notification(uint64_t poll_sn) -{ - int ret = -1; - - cq_logfuncall(""); - - if ((m_n_global_sn > 0 && poll_sn != m_n_global_sn)) { - // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) - cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn); - return 1; - } - - if (m_b_notification_armed == false) { - - cq_logfunc("arming cq_mgr notification channel"); - - // 
Arm the CQ notification channel - IF_VERBS_FAILURE(req_notify_cq()) - { - cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); - } - else - { - ret = 0; - m_b_notification_armed = true; - } - ENDIF_VERBS_FAILURE; - } else { - // cq_mgr notification channel already armed - ret = 0; - } - - cq_logfuncall("returning with %d", ret); - return ret; -} - -int cq_mgr::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array) -{ - int ret = -1; - - cq_logfunc(""); - - if (m_b_notification_armed) { - cq_mgr *p_cq_mgr_context = NULL; - struct ibv_cq *p_cq_hndl = NULL; - void *p; // deal with compiler warnings - - // Block on the cq_mgr's notification event channel - IF_VERBS_FAILURE(ibv_get_cq_event(m_comp_event_channel, &p_cq_hndl, &p)) - { - cq_logfunc("waiting on cq_mgr event returned with error (errno=%d %m)", errno); - } - else - { - get_cq_event(); - p_cq_mgr_context = (cq_mgr *)p; - if (p_cq_mgr_context != this) { - cq_logerr("mismatch with cq_mgr returned from new event (event->cq_mgr->%p)", - p_cq_mgr_context); - // this can be if we are using a single channel for several/all cq_mgrs - // in this case we need to deliver the event to the correct cq_mgr - } - - // Ack event - ibv_ack_cq_events(m_p_ibv_cq, 1); - - // Clear flag - m_b_notification_armed = false; - - // Now try processing the ready element - if (m_b_is_rx) { - ret = poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array); - } else { - ret = poll_and_process_element_tx(p_cq_poll_sn); - } - } - ENDIF_VERBS_FAILURE; - } else { - cq_logfunc("notification channel is not armed"); - errno = EAGAIN; - } - - return ret; -} - -cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel) -{ - cq_mgr *p_cq_mgr = NULL; - struct ibv_cq *p_cq_hndl = NULL; - void *p_context; // deal with compiler warnings - - // read & ack the CQ event - IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) - { - vlog_printf(VLOG_INFO, - MODULE_NAME ":%d: waiting on cq_mgr event returned with error (errno=%d %m)\n", - __LINE__, errno); - } - else - { - p_cq_mgr = (cq_mgr *)p_context; // Save the cq_mgr - p_cq_mgr->get_cq_event(); - ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event - } - ENDIF_VERBS_FAILURE; - - return p_cq_mgr; -} diff --git a/src/core/dev/cq_mgr_mlx5.cpp b/src/core/dev/cq_mgr_mlx5.cpp deleted file mode 100644 index af825c32d..000000000 --- a/src/core/dev/cq_mgr_mlx5.cpp +++ /dev/null @@ -1,637 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "cq_mgr_mlx5.h" - -#if defined(DEFINED_DIRECT_VERBS) - -#include -#include "cq_mgr.inl" -#include "cq_mgr_mlx5.inl" -#include "qp_mgr.h" -#include "qp_mgr_eth_mlx5.h" -#include "ring_simple.h" - -#include - -#define MODULE_NAME "cqm_mlx5" - -#define cq_logfunc __log_info_func -#define cq_logdbg __log_info_dbg -#define cq_logwarn __log_info_warn -#define cq_logerr __log_info_err -#define cq_logpanic __log_info_panic -#define cq_logfuncall __log_info_funcall - -cq_mgr_mlx5::cq_mgr_mlx5(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, - bool call_configure) - : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) - , m_qp(NULL) - , m_rx_hot_buffer(NULL) -{ - cq_logfunc(""); - - memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); -} - -uint32_t cq_mgr_mlx5::clean_cq() -{ - uint32_t ret_total = 0; - uint64_t cq_poll_sn = 0; - mem_buf_desc_t *buff; - - if (m_b_is_rx) { - /* Sanity check for cq: initialization of tx and rx cq has difference: - * tx - is done in qp_mgr::configure() - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ - if (NULL == m_qp) { - return 0; - } - - buff_status_e status = BS_OK; - while ((buff = poll(status))) { - if (cqe_process_rx(buff, status)) { - m_rx_queue.push_back(buff); - } - ++ret_total; - } - update_global_sn(cq_poll_sn, ret_total); - } else { // Tx - int ret = 0; - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = cq_mgr::poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { - for (int i = 0; i < ret; i++) { - buff = cqe_log_and_get_buf_tx(&wce[i]); - if (buff) { - m_p_ring->mem_buf_desc_return_single_to_owner_tx(buff); - } - } - ret_total += ret; - } - } - - return ret_total; -} - -cq_mgr_mlx5::~cq_mgr_mlx5() -{ - cq_logfunc(""); - cq_logdbg("destroying CQ as %s", (m_b_is_rx ? 
"Rx" : "Tx")); -} - -mem_buf_desc_t *cq_mgr_mlx5::poll(enum buff_status_e &status) -{ - mem_buf_desc_t *buff = NULL; - - if (unlikely(NULL == m_rx_hot_buffer)) { - if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); - m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; - m_qp->m_rq_wqe_idx_to_wrid[index] = 0; - prefetch((void *)m_rx_hot_buffer); - prefetch((uint8_t *)m_mlx5_cq.cq_buf + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); - } else { - /* If rq_tail and rq_head are pointing to the same wqe, - * the wq is empty and there is no cqe to be received */ - return NULL; - } - } - xlio_mlx5_cqe *cqe = check_cqe(); - if (likely(cqe)) { - /* Update the consumer index */ - ++m_mlx5_cq.cq_ci; - rmb(); - cqe_to_mem_buff_desc(cqe, m_rx_hot_buffer, status); - - ++m_qp->m_mlx5_qp.rq.tail; - *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); - - buff = m_rx_hot_buffer; - m_rx_hot_buffer = NULL; - } else { - prefetch((void *)m_rx_hot_buffer); - } - - prefetch((uint8_t *)m_mlx5_cq.cq_buf + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); - - return buff; -} - -void cq_mgr_mlx5::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, - enum buff_status_e &status) -{ - struct mlx5_err_cqe *ecqe; - ecqe = (struct mlx5_err_cqe *)cqe; - - switch (MLX5_CQE_OPCODE(cqe->op_own)) { - case MLX5_CQE_RESP_WR_IMM: - cq_logerr("IBV_WC_RECV_RDMA_WITH_IMM is not supported"); - status = BS_CQE_RESP_WR_IMM_NOT_SUPPORTED; - break; - case MLX5_CQE_RESP_SEND: - case MLX5_CQE_RESP_SEND_IMM: - case MLX5_CQE_RESP_SEND_INV: { - status = BS_OK; - p_rx_wc_buf_desc->sz_data = ntohl(cqe->byte_cnt); -#ifdef DEFINED_UTLS - p_rx_wc_buf_desc->rx.tls_decrypted = (cqe->pkt_info >> 3) & 0x3; -#endif /* DEFINED_UTLS */ - p_rx_wc_buf_desc->rx.timestamps.hw_raw = ntohll(cqe->timestamp); - p_rx_wc_buf_desc->rx.flow_tag_id = xlio_get_flow_tag(cqe); - p_rx_wc_buf_desc->rx.is_sw_csum_need = - !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && - (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); - if (cqe->lro_num_seg > 1) { - lro_update_hdr(cqe, p_rx_wc_buf_desc); - m_p_cq_stat->n_rx_lro_packets++; - m_p_cq_stat->n_rx_lro_bytes += p_rx_wc_buf_desc->sz_data; - } - return; - } - case MLX5_CQE_INVALID: /* No cqe!*/ - { - cq_logerr("We should no receive a buffer without a cqe\n"); - status = BS_CQE_INVALID; - break; - } - case MLX5_CQE_REQ: - case MLX5_CQE_REQ_ERR: - case MLX5_CQE_RESP_ERR: - default: { - if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR == ecqe->syndrome) { - status = BS_IBV_WC_WR_FLUSH_ERR; - } else { - status = BS_GENERAL_ERR; - } - /* - IB compliant completion with error syndrome: - 0x1: Local_Length_Error - 0x2: Local_QP_Operation_Error - 0x4: Local_Protection_Error - 0x5: Work_Request_Flushed_Error - 0x6: Memory_Window_Bind_Error - 0x10: Bad_Response_Error - 0x11: Local_Access_Error - 0x12: Remote_Invalid_Request_Error - 0x13: Remote_Access_Error - 0x14: Remote_Operation_Error - 0x15: Transport_Retry_Counter_Exceeded - 0x16: RNR_Retry_Counter_Exceeded - 0x22: Aborted_Error - other: Reserved - */ - break; - } - } - - // increase cqe error counter should be done once, here (regular flow) - switch (MLX5_CQE_OPCODE(cqe->op_own)) { - case MLX5_CQE_INVALID: - case MLX5_CQE_REQ_ERR: - case MLX5_CQE_RESP_ERR: - m_p_cq_stat->n_rx_cqe_error++; - break; - } -} - -int cq_mgr_mlx5::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, - 
uintptr_t *p_recycle_buffers_last_wr_id) -{ - ++m_n_wce_counter; - if (cqe_process_rx(buff, status)) { - if (p_recycle_buffers_last_wr_id) { - m_p_cq_stat->n_rx_pkt_drop++; - reclaim_recv_buffer_helper(buff); - } else { - bool procces_now = - (m_transport_type == XLIO_TRANSPORT_ETH ? is_eth_tcp_frame(buff) : false); - - if (procces_now) { // We process immediately all non udp/ip traffic.. - buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, nullptr); - } - } else { // udp/ip traffic we just put in the cq's rx queue - m_rx_queue.push_back(buff); - mem_buf_desc_t *buff_cur = m_rx_queue.get_and_pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff_cur)) { - m_rx_queue.push_front(buff_cur); - } - } - } - } - - if (p_recycle_buffers_last_wr_id) { - *p_recycle_buffers_last_wr_id = (uintptr_t)buff; - } - - return 1; -} - -int cq_mgr_mlx5::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) -{ - cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", - (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); - - /* CQ polling loop until max wce limit is reached for this interval or CQ is drained */ - uint32_t ret_total = 0; - uint64_t cq_poll_sn = 0; - - /* drain_and_proccess() is mainly called in following cases as - * Internal thread: - * Frequency of real polling can be controlled by - * PROGRESS_ENGINE_INTERVAL and PROGRESS_ENGINE_WCE_MAX. - * socketxtreme: - * User does socketxtreme_poll() - * Cleanup: - * QP down logic to release rx buffers should force polling to do this. - * Not null argument indicates one. - */ - - while (((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && (!m_b_was_drained)) || - (p_recycle_buffers_last_wr_id)) { - buff_status_e status = BS_OK; - mem_buf_desc_t *buff = poll(status); - if (NULL == buff) { - update_global_sn(cq_poll_sn, ret_total); - m_b_was_drained = true; - m_p_ring->m_gro_mgr.flush_all(NULL); - return ret_total; - } - - ++m_n_wce_counter; - - if (cqe_process_rx(buff, status)) { - if (p_recycle_buffers_last_wr_id) { - m_p_cq_stat->n_rx_pkt_drop++; - reclaim_recv_buffer_helper(buff); - } else { - bool procces_now = false; - if (m_transport_type == XLIO_TRANSPORT_ETH) { - procces_now = is_eth_tcp_frame(buff); - } - /* We process immediately all non udp/ip traffic.. 
*/ - if (procces_now) { - buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, NULL); - } - } else { /* udp/ip traffic we just put in the cq's rx queue */ - m_rx_queue.push_back(buff); - mem_buf_desc_t *buff_cur = m_rx_queue.front(); - m_rx_queue.pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff_cur)) { - m_rx_queue.push_front(buff_cur); - } - } - } - } - - if (p_recycle_buffers_last_wr_id) { - *p_recycle_buffers_last_wr_id = (uintptr_t)buff; - } - - ++ret_total; - } - - update_global_sn(cq_poll_sn, ret_total); - - m_p_ring->m_gro_mgr.flush_all(NULL); - - m_n_wce_counter = 0; - m_b_was_drained = false; - - // Update cq statistics - m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - m_p_cq_stat->n_rx_drained_at_once_max = - std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); - - return ret_total; -} - -mem_buf_desc_t *cq_mgr_mlx5::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) -{ - /* Assume locked!!! */ - cq_logfuncall(""); - - /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ - */ - p_mem_buf_desc->rx.is_xlio_thr = false; - p_mem_buf_desc->rx.context = NULL; - - if (unlikely(status != BS_OK)) { - m_p_next_rx_desc_poll = NULL; - reclaim_recv_buffer_helper(p_mem_buf_desc); - return NULL; - } - - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; - p_mem_buf_desc->p_prev_desc = NULL; - } - - VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); - - prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, - std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, - (size_t)m_n_sysvar_rx_prefetch_bytes)); - - return p_mem_buf_desc; -} - -mem_buf_desc_t *cq_mgr_mlx5::poll_and_process_socketxtreme() -{ - buff_status_e status = BS_OK; - mem_buf_desc_t *buff_wqe = poll(status); - - if (buff_wqe) { - if (cqe_process_rx(buff_wqe, status)) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff_wqe)) { - return buff_wqe; - } - } else if (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { - compensate_qp_poll_failed(); - } - } else { - compensate_qp_poll_failed(); - } - - return nullptr; -} - -int cq_mgr_mlx5::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) -{ - /* Assume locked!!! 
*/ - cq_logfuncall(""); - - uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); - if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - return ret_rx_processed; - } - - if (m_p_next_rx_desc_poll) { - prefetch_range((uint8_t *)m_p_next_rx_desc_poll->p_buffer, - m_n_sysvar_rx_prefetch_bytes_before_poll); - } - - buff_status_e status = BS_OK; - uint32_t ret = 0; - while (ret < m_n_sysvar_cq_poll_batch_max) { - mem_buf_desc_t *buff = poll(status); - if (buff) { - ++ret; - if (cqe_process_rx(buff, status)) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, pv_fd_ready_array); - } - } else { - m_p_cq_stat->n_rx_pkt_drop++; - if (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { - compensate_qp_poll_failed(); - } - } - } else { - m_b_was_drained = true; - break; - } - } - - update_global_sn(*p_cq_poll_sn, ret); - - if (likely(ret > 0)) { - ret_rx_processed += ret; - m_n_wce_counter += ret; - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - } else { - compensate_qp_poll_failed(); - } - - return ret_rx_processed; -} - -void cq_mgr_mlx5::log_cqe_error(struct xlio_mlx5_cqe *cqe) -{ - struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; - - /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove - * prefetch and memset of the next WQE there. Credit system will guarantee that we don't - * reuse the WQE at this point. - */ - - if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { - cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " - "wqe_counter=0x%x", - ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), - *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), - ntohs(ecqe->wqe_counter)); - } -} - -void cq_mgr_mlx5::handle_sq_wqe_prop(unsigned index) -{ - sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; - sq_wqe_prop *prev; - unsigned credits = 0; - - /* - * TX completions can be signalled for a set of WQEs as an optimization. - * Therefore, for every TX completion we may need to handle multiple - * WQEs. Since every WQE can have various size and the WQE index is - * wrapped around, we build a linked list to simplify things. Each - * element of the linked list represents properties of a previously - * posted WQE. - * - * We keep index of the last completed WQE and stop processing the list - * when we reach the index. This condition is checked in - * is_sq_wqe_prop_valid(). 
- */ - - do { - if (p->buf) { - m_p_ring->mem_buf_desc_return_single_locked(p->buf); - } - if (p->ti) { - xlio_ti *ti = p->ti; - if (ti->m_callback) { - ti->m_callback(ti->m_callback_arg); - } - - ti->put(); - if (unlikely(ti->m_released && ti->m_ref == 0)) { - m_qp->ti_released(ti); - } - } - credits += p->credits; - - prev = p; - p = p->next; - } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); - - m_p_ring->return_tx_pool_to_global_pool(); - m_qp->credits_return(credits); - m_qp->m_sq_wqe_prop_last_signalled = index; -} - -int cq_mgr_mlx5::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) -{ - cq_logfuncall(""); - - static auto is_error_opcode = [&](uint8_t opcode) { - return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR; - }; - - int ret = 0; - uint32_t num_polled_cqes = 0; - xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); - - if (likely(cqe)) { - unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); - - // All error opcodes have the most significant bit set. - if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { - m_p_cq_stat->n_rx_cqe_error++; - log_cqe_error(cqe); - } - - handle_sq_wqe_prop(index); - ret = 1; - } - update_global_sn(*p_cq_poll_sn, num_polled_cqes); - - return ret; -} - -void cq_mgr_mlx5::set_qp_rq(qp_mgr *qp) -{ - m_qp = static_cast(qp); - - m_qp->m_rq_wqe_counter = 0; // In case of bonded qp, wqe_counter must be reset to zero - m_rx_hot_buffer = NULL; - - if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { - cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); - } - VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, - m_mlx5_cq.cq_buf); -} - -void cq_mgr_mlx5::add_qp_rx(qp_mgr *qp) -{ - cq_logfunc(""); - set_qp_rq(qp); - cq_mgr::add_qp_rx(qp); -} - -void cq_mgr_mlx5::add_qp_tx(qp_mgr *qp) -{ - // Assume locked! - cq_mgr::add_qp_tx(qp); - m_qp = static_cast(qp); - - if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { - cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); - } - - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, - m_mlx5_cq.cq_buf); -} - -void cq_mgr_mlx5::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) -{ - struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); - struct tcphdr *p_tcp_h; - size_t transport_header_len = ETH_HDR_LEN; - - if (p_eth_h->h_proto == htons(ETH_P_8021Q)) { - transport_header_len = ETH_VLAN_HDR_LEN; - } - - if (0x02 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)) { - // CQE indicates IPv4 in the l3_hdr_type field - struct iphdr *p_ip_h = (struct iphdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); - - assert(p_ip_h->version == IPV4_VERSION); - assert(p_ip_h->protocol == IPPROTO_TCP); - - p_ip_h->ttl = cqe->lro_min_ttl; - p_ip_h->tot_len = htons(ntohl(cqe->byte_cnt) - transport_header_len); - p_ip_h->check = 0; // Ignore. - - p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + (int)(p_ip_h->ihl) * 4); - } else { - // Assume LRO can happen for either IPv4 or IPv6 L3 protocol. Skip checking l3_hdr_type. - struct ip6_hdr *p_ip6_h = - (struct ip6_hdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); - - assert(0x01 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)); // IPv6 L3 header. 
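// Illustrative sketch (not from the original patch): poll_and_process_element_tx() above
// decodes the mlx5 op_own byte, where the opcode sits in the high nibble and all error opcodes
// have the most significant bit set; that allows the cheap (op_own & 0x80) pre-check before
// comparing against MLX5_CQE_REQ_ERR/MLX5_CQE_RESP_ERR. The numeric values below are my
// reading of the mlx5 CQE opcode space and are illustrative only.
#include <cstdint>

constexpr uint8_t CQE_REQ_ERR = 0xd;
constexpr uint8_t CQE_RESP_ERR = 0xe;

static inline uint8_t cqe_opcode(uint8_t op_own) { return op_own >> 4; }

static inline bool cqe_is_tx_error(uint8_t op_own)
{
    uint8_t op = cqe_opcode(op_own);
    return (op_own & 0x80) && (op == CQE_REQ_ERR || op == CQE_RESP_ERR);
}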
- assert(ip_header_version(p_ip6_h) == IPV6); - assert(p_ip6_h->ip6_nxt == IPPROTO_TCP); - assert(ntohl(cqe->byte_cnt) >= transport_header_len + IPV6_HLEN); - - p_ip6_h->ip6_hlim = cqe->lro_min_ttl; - // Payload length doesn't include main header. - p_ip6_h->ip6_plen = htons(ntohl(cqe->byte_cnt) - transport_header_len - IPV6_HLEN); - - // LRO doesn't create a session for packets with extension headers, so IPv6 header is 40b. - p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip6_h + IPV6_HLEN); - } - - p_tcp_h->psh = !!(cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_TCP_PUSH_MASK); - - /* TCP packet flag is set, and packet carries no data or - * TCP packet flag is set, and packet carries data - */ - if ((0x03 == ((cqe->l4_hdr_type_etc >> 4) & 0x7)) || - (0x04 == ((cqe->l4_hdr_type_etc >> 4) & 0x7))) { - p_tcp_h->ack = 1; - p_tcp_h->ack_seq = cqe->lro_ack_seq_num; - p_tcp_h->window = cqe->lro_tcp_win; - p_tcp_h->check = 0; // Ignore. - } -} - -#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/dev/cq_mgr_mlx5.inl b/src/core/dev/cq_mgr_mlx5.inl deleted file mode 100644 index 39b00a36f..000000000 --- a/src/core/dev/cq_mgr_mlx5.inl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CQ_MGR_MLX5_INL_H -#define CQ_MGR_MLX5_INL_H - -#include "dev/cq_mgr_mlx5.h" - -#if defined(DEFINED_DIRECT_VERBS) - -/**/ -/** inlining functions can only help if they are implemented before their usage **/ -/**/ -inline struct xlio_mlx5_cqe *cq_mgr_mlx5::check_cqe(void) -{ - struct xlio_mlx5_cqe *cqe = - (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - /* - * CQE ownership is defined by Owner bit in the CQE. - * The value indicating SW ownership is flipped every - * time CQ wraps around. 
- * */ - if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && - !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { - return cqe; - } - - return NULL; -} - -#endif /* DEFINED_DIRECT_VERBS */ -#endif // CQ_MGR_MLX5_INL_H diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp new file mode 100644 index 000000000..452ba5176 --- /dev/null +++ b/src/core/dev/cq_mgr_rx.cpp @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cq_mgr_rx.h" +#include "cq_mgr_rx_inl.h" +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include +#include +#include "util/instrumentation.h" +#include +#include +#include "ib/base/verbs_extra.h" + +#include "buffer_pool.h" +#include "hw_queue_rx.h" +#include "ring_simple.h" + +#define MODULE_NAME "cq_mgr_rx" + +#define cq_logpanic __log_info_panic +#define cq_logerr __log_info_err +#define cq_logwarn __log_info_warn +#define cq_loginfo __log_info_info +#define cq_logdbg __log_info_dbg +#define cq_logfunc __log_info_func +#define cq_logfuncall __log_info_funcall + +#define cq_logdbg_no_funcname(log_fmt, log_args...) 
\ + do { \ + if (g_vlogger_level >= VLOG_DEBUG) \ + vlog_printf(VLOG_DEBUG, MODULE_NAME "[%p]:%d: " log_fmt "\n", __INFO__, __LINE__, \ + ##log_args); \ + } while (0) + +atomic_t cq_mgr_rx::m_n_cq_id_counter_rx = ATOMIC_INIT(1); + +uint64_t cq_mgr_rx::m_n_global_sn_rx = 0; + +cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + struct ibv_comp_channel *p_comp_event_channel) + : m_p_ring(p_ring) + , m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) + , m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) + , m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default + , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) + , m_n_sysvar_rx_prefetch_bytes(safe_mce_sys().rx_prefetch_bytes) + , m_p_ib_ctx_handler(p_ib_ctx_handler) + , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) + , m_comp_event_channel(p_comp_event_channel) + , m_n_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) + , m_rx_lkey(g_buffer_pool_rx_rwqe->find_lkey_by_ib_ctx_thread_safe(m_p_ib_ctx_handler)) + , m_b_sysvar_cq_keep_qp_full(safe_mce_sys().cq_keep_qp_full) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_rx_lkey == LKEY_ERROR) { + __log_info_panic("invalid lkey found %u", m_rx_lkey); + } + BULLSEYE_EXCLUDE_BLOCK_END + + memset(&m_cq_stat_static, 0, sizeof(m_cq_stat_static)); + + m_rx_queue.set_id("cq_mgr_rx (%p) : m_rx_queue", this); + m_rx_pool.set_id("cq_mgr_rx (%p) : m_rx_pool", this); + m_cq_id_rx = atomic_fetch_and_inc(&m_n_cq_id_counter_rx); // cq id is nonzero + configure(cq_size); + + memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); +} + +void cq_mgr_rx::configure(int cq_size) +{ + xlio_ibv_cq_init_attr attr; + memset(&attr, 0, sizeof(attr)); + + struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); + int comp_vector = 0; +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + /* + * For some scenario with forking usage we may want to distribute CQs across multiple + * CPUs to improve CPS in case of multiple processes. 
+ */ + if (safe_mce_sys().app.distribute_cq_interrupts && g_p_app->get_worker_id() >= 0) { + comp_vector = g_p_app->get_worker_id() % context->num_comp_vectors; + } +#endif + m_p_ibv_cq = xlio_ibv_create_cq(context, cq_size - 1, (void *)this, m_comp_event_channel, + comp_vector, &attr); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_ibv_cq) { + cq_logerr("Failed to create CQ, this: %p, ctx: %p size: %d compch: %p", this, context, + cq_size - 1, m_comp_event_channel); + throw_xlio_exception("ibv_create_cq failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + + xlio_stats_instance_create_cq_block(m_p_cq_stat); + + m_b_is_rx_hw_csum_on = xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); + + cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); + + cq_logdbg("Created CQ as Rx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", + get_channel_fd(), cq_size, m_p_ibv_cq); +} + +cq_mgr_rx::~cq_mgr_rx() +{ + cq_logdbg("Destroying Rx CQ"); + + if (m_rx_buffs_rdy_for_free_head) { + reclaim_recv_buffers(m_rx_buffs_rdy_for_free_head); + } + + m_b_was_drained = true; + if (m_rx_queue.size() + m_rx_pool.size()) { + cq_logdbg("Returning %lu buffers to global Rx pool (ready queue %lu, free pool %lu))", + m_rx_queue.size() + m_rx_pool.size(), m_rx_queue.size(), m_rx_pool.size()); + + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_queue, m_rx_queue.size()); + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + + cq_logfunc("destroying ibv_cq"); + IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_p_ibv_cq), EIO) + { + cq_logdbg("destroy cq failed (errno=%d %m)", errno); + } + ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + + statistics_print(); + xlio_stats_instance_remove_cq_block(m_p_cq_stat); + + cq_logdbg("Destroying Rx CQ done"); +} + +void cq_mgr_rx::statistics_print() +{ + if (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || + m_p_cq_stat->n_rx_drained_at_once_max || m_p_cq_stat->n_buffer_pool_len) { + cq_logdbg_no_funcname("Packets dropped: %12llu", + (unsigned long long int)m_p_cq_stat->n_rx_pkt_drop); + cq_logdbg_no_funcname("Drained max: %17u", m_p_cq_stat->n_rx_drained_at_once_max); + cq_logdbg_no_funcname("CQE errors: %18llu", + (unsigned long long int)m_p_cq_stat->n_rx_cqe_error); + } +} + +void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) +{ + m_hqrx_ptr = hqrx_ptr; + m_hqrx_ptr->m_rq_wqe_counter = 0; // In case of bonded hqrx, wqe_counter must be reset to zero + m_rx_hot_buffer = nullptr; + + if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + + VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); + cq_logfunc("hqrx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", hqrx_ptr, m_mlx5_cq.dbrec, + m_mlx5_cq.cq_buf); + + descq_t temp_desc_list; + temp_desc_list.set_id("cq_mgr_rx (%p) : temp_desc_list", this); + + m_p_cq_stat->n_rx_drained_at_once_max = 0; + + /* return_extra_buffers(); */ // todo?? 
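// Illustrative sketch (not from the original patch): configure() above spreads CQ interrupts
// across the device's completion vectors by hashing the per-process worker id, so forked
// workers (the nginx/envoy case) don't all share one interrupt vector. The mapping is a plain
// modulo; a small sketch with the guard conditions made explicit:

static int pick_comp_vector(int worker_id, int num_comp_vectors)
{
    // e.g. with 4 completion vectors, workers 0..5 map to vectors 0, 1, 2, 3, 0, 1
    return (worker_id >= 0 && num_comp_vectors > 0) ? worker_id % num_comp_vectors : 0;
}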
+ + // Initial fill of receiver work requests + uint32_t hqrx_wr_num = hqrx_ptr->get_rx_max_wr_num(); + cq_logdbg("Trying to push %d WRE to allocated hqrx (%p)", hqrx_wr_num, hqrx_ptr); + while (hqrx_wr_num) { + uint32_t n_num_mem_bufs = m_n_sysvar_rx_num_wr_to_post_recv; + if (n_num_mem_bufs > hqrx_wr_num) { + n_num_mem_bufs = hqrx_wr_num; + } + bool res = g_buffer_pool_rx_rwqe->get_buffers_thread_safe(temp_desc_list, m_p_ring, + n_num_mem_bufs, m_rx_lkey); + if (!res) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( + VLOG_WARNING, VLOG_DEBUG, + "WARNING Out of mem_buf_desc from Rx buffer pool for hqrx initialization " + "(hqrx_ptr=%p),\n" + "\tThis might happen due to wrong setting of XLIO_RX_BUFS and XLIO_RX_WRE. Please " + "refer to README.txt for more info", + hqrx_ptr); + break; + } + + hqrx_ptr->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); + if (!temp_desc_list.empty()) { + cq_logdbg("hqrx_ptr post recv is already full (push=%d, planned=%d)", + hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&temp_desc_list, temp_desc_list.size()); + break; + } + hqrx_wr_num -= n_num_mem_bufs; + } + + cq_logdbg("Successfully post_recv hqrx with %d new Rx buffers (planned=%d)", + hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); + + m_debt = 0; +} + +void cq_mgr_rx::del_hqrx(hw_queue_rx *hqrx_ptr) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_hqrx_ptr != hqrx_ptr) { + cq_logdbg("wrong hqrx_ptr=%p != m_hqrx_ptr=%p", hqrx_ptr, m_hqrx_ptr); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + cq_logdbg("m_hqrx_ptr=%p", m_hqrx_ptr); + return_extra_buffers(); + + clean_cq(); + m_hqrx_ptr = nullptr; + m_debt = 0; +} + +void cq_mgr_rx::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) +{ + struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); + struct tcphdr *p_tcp_h; + size_t transport_header_len = ETH_HDR_LEN; + + if (p_eth_h->h_proto == htons(ETH_P_8021Q)) { + transport_header_len = ETH_VLAN_HDR_LEN; + } + + if (0x02 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)) { + // CQE indicates IPv4 in the l3_hdr_type field + struct iphdr *p_ip_h = (struct iphdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + + assert(p_ip_h->version == IPV4_VERSION); + assert(p_ip_h->protocol == IPPROTO_TCP); + + p_ip_h->ttl = cqe->lro_min_ttl; + p_ip_h->tot_len = htons(ntohl(cqe->byte_cnt) - transport_header_len); + p_ip_h->check = 0; // Ignore. + + p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + (int)(p_ip_h->ihl) * 4); + } else { + // Assume LRO can happen for either IPv4 or IPv6 L3 protocol. Skip checking l3_hdr_type. + struct ip6_hdr *p_ip6_h = + (struct ip6_hdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + + assert(0x01 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)); // IPv6 L3 header. + assert(p_ip6_h->ip6_nxt == IPPROTO_TCP); + assert(ntohl(cqe->byte_cnt) >= transport_header_len + IPV6_HLEN); + + p_ip6_h->ip6_hlim = cqe->lro_min_ttl; + // Payload length doesn't include main header. + p_ip6_h->ip6_plen = htons(ntohl(cqe->byte_cnt) - transport_header_len - IPV6_HLEN); + + // LRO doesn't create a session for packets with extension headers, so IPv6 header is 40b. 
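// Illustrative worked example (not from the original patch): the length rewrite above works
// directly from the CQE byte count of the aggregated LRO burst. Assuming a plain Ethernet
// frame (ETH_HDR_LEN = 14 bytes) and an aggregated byte_cnt of 9014:
//   IPv4:  tot_len  = 9014 - 14      = 9000 (covers the IP header, TCP header and payload)
//   IPv6:  ip6_plen = 9014 - 14 - 40 = 8960 (payload length excludes the fixed 40-byte header)
// which is why only the IPv6 branch subtracts IPV6_HLEN.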
+ p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip6_h + IPV6_HLEN); + } + + p_tcp_h->psh = !!(cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_TCP_PUSH_MASK); + + /* TCP packet flag is set, and packet carries no data or + * TCP packet flag is set, and packet carries data + */ + if ((0x03 == ((cqe->l4_hdr_type_etc >> 4) & 0x7)) || + (0x04 == ((cqe->l4_hdr_type_etc >> 4) & 0x7))) { + p_tcp_h->ack = 1; + p_tcp_h->ack_seq = cqe->lro_ack_seq_num; + p_tcp_h->window = cqe->lro_tcp_win; + p_tcp_h->check = 0; // Ignore. + } +} + +bool cq_mgr_rx::request_more_buffers() +{ + cq_logfuncall("Allocating additional %d buffers for internal use", + m_n_sysvar_qp_compensation_level); + + // Assume locked! + // Add an additional free buffer descs to RX cq mgr + bool res = g_buffer_pool_rx_rwqe->get_buffers_thread_safe( + m_rx_pool, m_p_ring, m_n_sysvar_qp_compensation_level, m_rx_lkey); + if (!res) { + cq_logfunc("Out of mem_buf_desc from RX free pool for internal object pool"); + return false; + }; + + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + return true; +} + +void cq_mgr_rx::return_extra_buffers() +{ + if (m_rx_pool.size() < m_n_sysvar_qp_compensation_level * 2) { + return; + } + int buff_to_rel = m_rx_pool.size() - m_n_sysvar_qp_compensation_level; + + cq_logfunc("releasing %d buffers to global rx pool", buff_to_rel); + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); +} + +mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) +{ + /* Assume locked!!! */ + cq_logfuncall(""); + + /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ + */ + p_mem_buf_desc->rx.is_xlio_thr = false; + p_mem_buf_desc->rx.context = nullptr; + + if (unlikely(status != BS_OK)) { + m_p_next_rx_desc_poll = nullptr; + reclaim_recv_buffer_helper(p_mem_buf_desc); + return nullptr; + } + + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; + p_mem_buf_desc->p_prev_desc = nullptr; + } + + VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); + + prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, + std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, + (size_t)m_n_sysvar_rx_prefetch_bytes)); + + return p_mem_buf_desc; +} + +bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) +{ + // Assume locked!!! + // Compensate QP for all completions that we found + if (m_rx_pool.size() || request_more_buffers()) { + size_t buffers = std::min(m_debt, m_rx_pool.size()); + m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); + m_debt -= buffers; + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } else if (m_b_sysvar_cq_keep_qp_full || + m_debt + MCE_MAX_CQ_POLL_BATCH > (int)m_hqrx_ptr->m_rx_num_wr) { + m_p_cq_stat->n_rx_pkt_drop++; + m_hqrx_ptr->post_recv_buffer(buff_cur); + --m_debt; + return true; + } + + return false; +} + +void cq_mgr_rx::compensate_qp_poll_failed() +{ + // Assume locked!!! 
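+    // m_debt counts CQEs consumed without a matching post_recv; the code below reposts
+    // buffers from the local pool (refilled from the global pool on demand) to keep the
+    // receive queue populated.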
+ // Compensate QP for all completions debt + if (m_debt) { + if (likely(m_rx_pool.size() || request_more_buffers())) { + size_t buffers = std::min(m_debt, m_rx_pool.size()); + m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); + m_debt -= buffers; + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + } +} + +void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) +{ + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { + if (likely(buff->p_desc_owner == m_p_ring)) { + mem_buf_desc_t *temp = nullptr; + while (buff) { + VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; + temp = buff; + assert(temp->lwip_pbuf.type != PBUF_ZEROCOPY); + buff = temp->p_next_desc; + temp->clear_transport_data(); + temp->p_next_desc = nullptr; + temp->p_prev_desc = nullptr; + temp->reset_ref_count(); + free_lwip_pbuf(&temp->lwip_pbuf); + m_rx_pool.push_back(temp); + } + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } else { + cq_logfunc("Buffer returned to wrong CQ"); + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(buff); + } + } +} + +// This method is called when ring release returns unposted buffers. +void cq_mgr_rx::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, + void *pv_fd_ready_array /*=NULL*/) +{ + cq_logfuncall(""); + NOT_IN_USE(pv_fd_ready_array); + cq_mgr_rx::reclaim_recv_buffer_helper(p_mem_buf_desc); +} + +bool cq_mgr_rx::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) +{ + if (m_rx_buffs_rdy_for_free_head) { + reclaim_recv_buffer_helper(m_rx_buffs_rdy_for_free_head); + m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = nullptr; + } + reclaim_recv_buffer_helper(rx_reuse_lst); + return_extra_buffers(); + + return true; +} + +bool cq_mgr_rx::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) +{ + if (likely(rx_reuse_lst)) { + reclaim_recv_buffer_helper(rx_reuse_lst); + return true; + } + return false; +} + +int cq_mgr_rx::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) +{ + int ret_val = 0; + + ret_val = rx_reuse->lwip_pbuf_dec_ref_count(); + if ((ret_val == 0) && (rx_reuse->get_ref_count() <= 0)) { + /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { + m_lock_ring_rx.lock(); + }*/ + if (!m_rx_buffs_rdy_for_free_head) { + m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = rx_reuse; + } else { + m_rx_buffs_rdy_for_free_tail->p_next_desc = rx_reuse; + m_rx_buffs_rdy_for_free_tail = rx_reuse; + } + m_rx_buffs_rdy_for_free_tail->p_next_desc = nullptr; + /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { + m_lock_ring_rx.lock(); + }*/ + } + return ret_val; +} + +bool cq_mgr_rx::reclaim_recv_buffers(descq_t *rx_reuse) +{ + cq_logfuncall(""); + // Called from outside cq_mgr_rx context which is not locked!! 
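+    // Drain the reuse queue back into the internal Rx pool, then trim the pool if it grew
+    // past the compensation threshold (see return_extra_buffers()).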
+    while (!rx_reuse->empty()) {
+        mem_buf_desc_t *buff = rx_reuse->get_and_pop_front();
+        reclaim_recv_buffer_helper(buff);
+    }
+    return_extra_buffers();
+
+    return true;
+}
+
+int cq_mgr_rx::request_notification(uint64_t poll_sn)
+{
+    int ret = -1;
+
+    cq_logfuncall("");
+
+    if ((m_n_global_sn_rx > 0 && poll_sn != m_n_global_sn_rx)) {
+        // The cq_mgr_rx has received packets pending processing (or they were processed since
+        // cq_poll_sn)
+        cq_logfunc("mismatched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_rx);
+        return 1;
+    }
+
+    if (m_b_notification_armed == false) {
+
+        cq_logfunc("arming cq_mgr_rx notification channel");
+
+        // Arm the CQ notification channel
+        IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0))
+        {
+            cq_logerr("Failure arming the RX notification channel (errno=%d %m)", errno);
+        }
+        else
+        {
+            ret = 0;
+            m_b_notification_armed = true;
+        }
+        ENDIF_VERBS_FAILURE;
+    } else {
+        // cq_mgr_rx notification channel already armed
+        ret = 0;
+    }
+
+    cq_logfuncall("returning with %d", ret);
+    return ret;
+}
+
+int cq_mgr_rx::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn,
+                                                         void *pv_fd_ready_array)
+{
+    int ret = -1;
+
+    cq_logfunc("");
+
+    if (m_b_notification_armed) {
+        cq_mgr_rx *p_cq_mgr_context = nullptr;
+        struct ibv_cq *p_cq_hndl = nullptr;
+        void *p; // deal with compiler warnings
+
+        // Block on the cq_mgr_rx's notification event channel
+        IF_VERBS_FAILURE(ibv_get_cq_event(m_comp_event_channel, &p_cq_hndl, &p))
+        {
+            cq_logfunc("waiting on cq_mgr_rx event returned with error (errno=%d %m)", errno);
+        }
+        else
+        {
+            get_cq_event();
+            p_cq_mgr_context = (cq_mgr_rx *)p;
+            if (p_cq_mgr_context != this) {
+                cq_logerr("mismatch with cq_mgr_rx returned from new event (event->cq_mgr_rx->%p)",
+                          p_cq_mgr_context);
+                // This can happen if we are using a single channel for several/all cq_mgrs;
+                // in this case we need to deliver the event to the correct cq_mgr_rx
+            }
+
+            // Ack event
+            ibv_ack_cq_events(m_p_ibv_cq, 1);
+
+            // Clear flag
+            m_b_notification_armed = false;
+
+            // Now try processing the ready element
+            ret = poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array);
+        }
+        ENDIF_VERBS_FAILURE;
+    } else {
+        cq_logfunc("notification channel is not armed");
+        errno = EAGAIN;
+    }
+
+    return ret;
+}
diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr_rx.h
similarity index 61%
rename from src/core/dev/cq_mgr.h
rename to src/core/dev/cq_mgr_rx.h
index 1e10900bc..0d23ae750 100644
--- a/src/core/dev/cq_mgr.h
+++ b/src/core/dev/cq_mgr_rx.h
@@ -30,12 +30,11 @@
  * SOFTWARE.
  */
-#ifndef CQ_MGR_H
-#define CQ_MGR_H
+#ifndef CQ_MGR_RX_H
+#define CQ_MGR_RX_H
 #include "ib/base/verbs_extra.h"
 #include "utils/atomic.h"
-#include "dev/qp_mgr.h"
 #include "dev/ib_ctx_handler.h"
 #include "util/sys_vars.h"
 #include "util/xlio_stats.h"
@@ -56,55 +55,39 @@ class net_device_mgr;
 class ring;
-class qp_mgr;
+class hw_queue_rx;
 class ring_simple;
-#define LOCAL_IF_INFO_INVALID \
-    (local_if_info_t) { 0, 0 }
+/* Get CQE opcode. */
+#define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4)
-struct cq_request_info_t {
-    struct ibv_device *p_ibv_device;
-    struct ibv_context *p_ibv_context;
-    int n_port;
-    qp_mgr *p_qp_mgr;
-};
-
-struct buff_lst_info_t {
-    mem_buf_desc_t *buff_lst;
-    uint32_t n_buff_num;
-};
+/* Get CQE owner bit.
*/ +#define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) -typedef std::pair local_if_info_key_t; - -typedef struct local_if_info_t { - in_addr_t addr; - uint32_t attached_grp_ref_cnt; -} local_if_info_t; - -struct qp_rec { - qp_mgr *qp; - int debt; -}; - -// Class cq_mgr -// -class cq_mgr { - friend class ring; // need to expose the m_n_global_sn only to ring - friend class ring_simple; // need to expose the m_n_global_sn only to ring - friend class ring_bond; // need to expose the m_n_global_sn only to ring +class cq_mgr_rx { + friend class ring; // need to expose the m_n_global_sn_rx only to ring + friend class ring_simple; // need to expose the m_n_global_sn_rx only to ring + friend class ring_bond; // need to expose the m_n_global_sn_rx only to ring friend class rfs_uc_tcp_gro; // need for stats public: - cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config = true); - virtual ~cq_mgr(); + enum buff_status_e { + BS_OK, + BS_CQE_RESP_WR_IMM_NOT_SUPPORTED, + BS_IBV_WC_WR_FLUSH_ERR, + BS_CQE_INVALID, + BS_GENERAL_ERR + }; + + cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + struct ibv_comp_channel *p_comp_event_channel); + virtual ~cq_mgr_rx(); void configure(int cq_size); - ibv_cq *get_ibv_cq_hndl(); - int get_channel_fd(); - // ack events and rearm CQ - int ack_and_request_notification(); + ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } + int get_channel_fd() { return m_comp_event_channel ? m_comp_event_channel->fd : 0; } + /** * Arm the managed CQ's notification channel * Calling this more then once without get_event() will return without @@ -126,7 +109,7 @@ class cq_mgr { * (on non-blocked channel) (some other thread beat you to it) */ int wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); /** * This will poll n_num_poll time on the cq or stop early if it gets @@ -135,8 +118,8 @@ class cq_mgr { * @return >=0 number of wce processed * < 0 error */ - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); - virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = nullptr) = 0; virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; /** @@ -145,34 +128,26 @@ class cq_mgr { * @return >=0 number of wce processed * < 0 error */ - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = nullptr) = 0; // CQ implements the Rx mem_buf_desc_owner. 
// These callbacks will be called for each Rx buffer that passed processed completion - // Rx completion handling at the cq_mgr level is forwarding the packet to the ib_comm_mgr layer + // Rx completion handling at the cq_mgr_rx level is forwarding the packet to the ib_comm_mgr + // layer void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array = NULL); - - virtual void add_qp_rx(qp_mgr *qp); - virtual void del_qp_rx(qp_mgr *qp); + void *pv_fd_ready_array = nullptr); - virtual void add_qp_tx(qp_mgr *qp); - virtual void del_qp_tx(qp_mgr *qp); + virtual void add_hqrx(hw_queue_rx *hqrx_ptr); + virtual void del_hqrx(hw_queue_rx *hqrx_ptr); - virtual uint32_t clean_cq(); + virtual uint32_t clean_cq() = 0; bool reclaim_recv_buffers(descq_t *rx_reuse); bool reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst); bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst); int reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse); - // maps between qpn and vlan id to the local interface - void map_vlan_and_qpn_to_local_if(int qp_num, uint16_t vlan_id, in_addr_t local_if); - - // unmaps the qpn and vlan id - void unmap_vlan_and_qpn(int qp_num, uint16_t vlan_id); - - virtual void get_cq_event(int count = 1) { NOT_IN_USE(count); }; + void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; protected: /** @@ -182,50 +157,45 @@ class cq_mgr { * @p_cq_poll_sn global unique wce id that maps last wce polled * @return Number of successfully polled wce */ - int poll(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); void compensate_qp_poll_failed(); - inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); - - /* Process a WCE... meaning... - * - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc - * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr - * on Rx path) - * - for Tx wce the data buffers will be released to the associated ring before the mem_buf_desc - * are returned - */ - mem_buf_desc_t *cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce); - mem_buf_desc_t *cqe_process_rx(xlio_ibv_wc *p_wce); + void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); + inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = nullptr); + + inline void update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t rettotal); + + inline struct xlio_mlx5_cqe *check_cqe(void); + + mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); + virtual void reclaim_recv_buffer_helper(mem_buf_desc_t *buff); // Returns true if the given buffer was used, // false if the given buffer was not used. bool compensate_qp_poll_success(mem_buf_desc_t *buff); - inline uint32_t process_recv_queue(void *pv_fd_ready_array = NULL); + inline uint32_t process_recv_queue(void *pv_fd_ready_array = nullptr); virtual void statistics_print(); - virtual void prep_ibv_cq(xlio_ibv_cq_init_attr &attr) const; - // returns list of buffers to the owner. 
- void process_tx_buffer_list(mem_buf_desc_t *p_mem_buf_desc); - struct ibv_cq *m_p_ibv_cq; - bool m_b_is_rx; + xlio_ib_mlx5_cq_t m_mlx5_cq; + hw_queue_rx *m_hqrx_ptr = nullptr; + mem_buf_desc_t *m_rx_hot_buffer = nullptr; + struct ibv_cq *m_p_ibv_cq = nullptr; descq_t m_rx_queue; - static uint64_t m_n_global_sn; - uint32_t m_cq_id; - uint32_t m_n_cq_poll_sn; + static uint64_t m_n_global_sn_rx; + uint32_t m_cq_id_rx = 0U; + uint32_t m_n_cq_poll_sn_rx = 0U; ring_simple *m_p_ring; - uint32_t m_n_wce_counter; - bool m_b_was_drained; - bool m_b_is_rx_hw_csum_on; - qp_rec m_qp_rec; + uint32_t m_n_wce_counter = 0U; + bool m_b_was_drained = false; + bool m_b_is_rx_hw_csum_on = false; + int m_debt = 0; const uint32_t m_n_sysvar_cq_poll_batch_max; const uint32_t m_n_sysvar_progress_engine_wce_max; cq_stats_t *m_p_cq_stat; - transport_type_t m_transport_type; - mem_buf_desc_t *m_p_next_rx_desc_poll; + mem_buf_desc_t *m_p_next_rx_desc_poll = nullptr; uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; const uint32_t m_n_sysvar_rx_prefetch_bytes; - size_t m_sz_transport_header; + size_t m_sz_transport_header = ETH_HDR_LEN; ib_ctx_handler *m_p_ib_ctx_handler; const uint32_t m_n_sysvar_rx_num_wr_to_post_recv; descq_t m_rx_pool; @@ -234,40 +204,60 @@ class cq_mgr { * represented as struct xlio_buff_t * from user application by special XLIO extended API */ - mem_buf_desc_t *m_rx_buffs_rdy_for_free_head; - mem_buf_desc_t *m_rx_buffs_rdy_for_free_tail; + mem_buf_desc_t *m_rx_buffs_rdy_for_free_head = nullptr; + mem_buf_desc_t *m_rx_buffs_rdy_for_free_tail = nullptr; private: struct ibv_comp_channel *m_comp_event_channel; - bool m_b_notification_armed; + bool m_b_notification_armed = false; const uint32_t m_n_sysvar_qp_compensation_level; const uint32_t m_rx_lkey; const bool m_b_sysvar_cq_keep_qp_full; - int32_t m_n_out_of_free_bufs_warning; cq_stats_t m_cq_stat_static; - static atomic_t m_n_cq_id_counter; - - void handle_tcp_ctl_packets(uint32_t rx_processed, void *pv_fd_ready_array); + static atomic_t m_n_cq_id_counter_rx; // requests safe_mce_sys().qp_compensation_level buffers from global pool bool request_more_buffers() __attribute__((noinline)); // returns safe_mce_sys().qp_compensation_level buffers to global pool void return_extra_buffers() __attribute__((noinline)); - - // Finds and sets the local if to which the buff is addressed (according to qpn and vlan id). - inline void find_buff_dest_local_if(mem_buf_desc_t *buff); - - // Finds and sets the xlio if to which the buff is addressed (according to qpn). 
- inline void find_buff_dest_xlio_if_ctx(mem_buf_desc_t *buff); - - void process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce); - - virtual int req_notify_cq() { return ibv_req_notify_cq(m_p_ibv_cq, 0); }; }; -// Helper gunction to extract the Tx cq_mgr from the CQ event, -// Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object -cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); +inline void cq_mgr_rx::update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +{ + if (num_polled_cqes > 0) { + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + m_n_cq_poll_sn_rx += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn_rx; + next_sn.bundle.cq_id = m_cq_id_rx; + + m_n_global_sn_rx = next_sn.global_sn; + } + + cq_poll_sn = m_n_global_sn_rx; +} + +inline struct xlio_mlx5_cqe *cq_mgr_rx::check_cqe(void) +{ + struct xlio_mlx5_cqe *cqe = + (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + // CQE ownership is defined by Owner bit in the CQE. + // The value indicating SW ownership is flipped every time CQ wraps around. + if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && + !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { + return cqe; + } + + return nullptr; +} #endif // CQ_MGR_H diff --git a/src/core/dev/cq_mgr.inl b/src/core/dev/cq_mgr_rx_inl.h similarity index 93% rename from src/core/dev/cq_mgr.inl rename to src/core/dev/cq_mgr_rx_inl.h index dc8670188..a27ef15be 100644 --- a/src/core/dev/cq_mgr.inl +++ b/src/core/dev/cq_mgr_rx_inl.h @@ -33,15 +33,17 @@ #ifndef CQ_MGR_INL_H #define CQ_MGR_INL_H -#include "cq_mgr.h" +#include "cq_mgr_rx.h" #include "ring_simple.h" #include "util/utils.h" +#include +#include /**/ /** inlining functions can only help if they are implemented before their usage **/ /**/ -inline void cq_mgr::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array) +inline void cq_mgr_rx::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array) { // Assume locked!!! @@ -52,7 +54,7 @@ inline void cq_mgr::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv } } -inline uint32_t cq_mgr::process_recv_queue(void *pv_fd_ready_array) +inline uint32_t cq_mgr_rx::process_recv_queue(void *pv_fd_ready_array) { // Assume locked!!! // If we have packets in the queue, dequeue one and process it @@ -78,7 +80,7 @@ inline bool is_eth_tcp_frame(mem_buf_desc_t *buff) uint16_t h_proto = p_eth_h->h_proto; size_t transport_header_len = ETH_HDR_LEN; - struct vlanhdr *p_vlan_hdr = NULL; + struct vlanhdr *p_vlan_hdr = nullptr; if (h_proto == htons(ETH_P_8021Q)) { p_vlan_hdr = (struct vlanhdr *)((uint8_t *)p_eth_h + transport_header_len); transport_header_len = ETH_VLAN_HDR_LEN; diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp new file mode 100644 index 000000000..85d6e2fc5 --- /dev/null +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cq_mgr_rx_regrq.h" + +#if defined(DEFINED_DIRECT_VERBS) + +#include +#include "cq_mgr_rx_inl.h" +#include "hw_queue_rx.h" +#include "ring_simple.h" + +#include + +#define MODULE_NAME "cq_mgr_rx_regrq" + +#define cq_logfunc __log_info_func +#define cq_logdbg __log_info_dbg +#define cq_logwarn __log_info_warn +#define cq_logerr __log_info_err +#define cq_logpanic __log_info_panic +#define cq_logfuncall __log_info_funcall + +cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) +{ + cq_logfunc(""); +} + +uint32_t cq_mgr_rx_regrq::clean_cq() +{ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + mem_buf_desc_t *buff; + + if (!m_hqrx_ptr) { // Sanity check + return 0; + } + + buff_status_e status = BS_OK; + while ((buff = poll(status))) { + if (cqe_process_rx(buff, status)) { + m_rx_queue.push_back(buff); + } + ++ret_total; + } + update_global_sn_rx(cq_poll_sn, ret_total); + + return ret_total; +} + +cq_mgr_rx_regrq::~cq_mgr_rx_regrq() +{ + cq_logdbg("Destroying CQ REGRQ"); +} + +mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) +{ + mem_buf_desc_t *buff = nullptr; + + if (unlikely(!m_rx_hot_buffer)) { + if (likely(m_hqrx_ptr->m_rq_data.tail != (m_hqrx_ptr->m_rq_data.head))) { + uint32_t index = m_hqrx_ptr->m_rq_data.tail & (m_hqrx_ptr->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t *)m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index]; + m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index] = 0; + prefetch((void *)m_rx_hot_buffer); + prefetch((uint8_t *)m_mlx5_cq.cq_buf + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + } else { + /* If rq_tail and rq_head are pointing to the same wqe, + * the wq is empty and there is no cqe to be received */ + return nullptr; + } + } + xlio_mlx5_cqe *cqe = check_cqe(); + if (likely(cqe)) { + /* Update the consumer index */ + ++m_mlx5_cq.cq_ci; + rmb(); + cqe_to_mem_buff_desc(cqe, m_rx_hot_buffer, status); + + ++m_hqrx_ptr->m_rq_data.tail; + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); + + buff = m_rx_hot_buffer; + m_rx_hot_buffer = nullptr; + } else { + prefetch((void *)m_rx_hot_buffer); + } + + prefetch((uint8_t 
*)m_mlx5_cq.cq_buf + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + + return buff; +} + +void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, + mem_buf_desc_t *p_rx_wc_buf_desc, + enum buff_status_e &status) +{ + struct mlx5_err_cqe *ecqe; + ecqe = (struct mlx5_err_cqe *)cqe; + + switch (MLX5_CQE_OPCODE(cqe->op_own)) { + case MLX5_CQE_RESP_WR_IMM: + cq_logerr("IBV_WC_RECV_RDMA_WITH_IMM is not supported"); + status = BS_CQE_RESP_WR_IMM_NOT_SUPPORTED; + break; + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: { + status = BS_OK; + p_rx_wc_buf_desc->sz_data = ntohl(cqe->byte_cnt); +#ifdef DEFINED_UTLS + p_rx_wc_buf_desc->rx.tls_decrypted = (cqe->pkt_info >> 3) & 0x3; +#endif /* DEFINED_UTLS */ + p_rx_wc_buf_desc->rx.timestamps.hw_raw = ntohll(cqe->timestamp); + p_rx_wc_buf_desc->rx.flow_tag_id = ntohl((uint32_t)(cqe->sop_drop_qpn)); + p_rx_wc_buf_desc->rx.is_sw_csum_need = + !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && + (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); + if (cqe->lro_num_seg > 1) { + lro_update_hdr(cqe, p_rx_wc_buf_desc); + m_p_cq_stat->n_rx_lro_packets++; + m_p_cq_stat->n_rx_lro_bytes += p_rx_wc_buf_desc->sz_data; + } + return; + } + case MLX5_CQE_INVALID: /* No cqe!*/ + { + cq_logerr("We should no receive a buffer without a cqe\n"); + status = BS_CQE_INVALID; + break; + } + case MLX5_CQE_REQ: + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + default: { + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR == ecqe->syndrome) { + status = BS_IBV_WC_WR_FLUSH_ERR; + } else { + status = BS_GENERAL_ERR; + } + /* + IB compliant completion with error syndrome: + 0x1: Local_Length_Error + 0x2: Local_QP_Operation_Error + 0x4: Local_Protection_Error + 0x5: Work_Request_Flushed_Error + 0x6: Memory_Window_Bind_Error + 0x10: Bad_Response_Error + 0x11: Local_Access_Error + 0x12: Remote_Invalid_Request_Error + 0x13: Remote_Access_Error + 0x14: Remote_Operation_Error + 0x15: Transport_Retry_Counter_Exceeded + 0x16: RNR_Retry_Counter_Exceeded + 0x22: Aborted_Error + other: Reserved + */ + break; + } + } + + // increase cqe error counter should be done once, here (regular flow) + switch (MLX5_CQE_OPCODE(cqe->op_own)) { + case MLX5_CQE_INVALID: + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + m_p_cq_stat->n_rx_cqe_error++; + break; + } +} + +int cq_mgr_rx_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id) +{ + ++m_n_wce_counter; + if (cqe_process_rx(buff, status)) { + if (p_recycle_buffers_last_wr_id) { + m_p_cq_stat->n_rx_pkt_drop++; + reclaim_recv_buffer_helper(buff); + } else { + bool procces_now = is_eth_tcp_frame(buff); + + if (procces_now) { // We process immediately all non udp/ip traffic.. + buff->rx.is_xlio_thr = true; + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, nullptr); + } + } else { // udp/ip traffic we just put in the cq's rx queue + m_rx_queue.push_back(buff); + mem_buf_desc_t *buff_cur = m_rx_queue.get_and_pop_front(); + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_cur)) { + m_rx_queue.push_front(buff_cur); + } + } + } + } + + if (p_recycle_buffers_last_wr_id) { + *p_recycle_buffers_last_wr_id = (uintptr_t)buff; + } + + return 1; +} + +int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) +{ + cq_logfuncall("cq was %s drained. 
%d processed wce since last check. %d wce in m_rx_queue", + (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); + + /* CQ polling loop until max wce limit is reached for this interval or CQ is drained */ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + + /* drain_and_proccess() is mainly called in following cases as + * Internal thread: + * Frequency of real polling can be controlled by + * PROGRESS_ENGINE_INTERVAL and PROGRESS_ENGINE_WCE_MAX. + * socketxtreme: + * User does socketxtreme_poll() + * Cleanup: + * QP down logic to release rx buffers should force polling to do this. + * Not null argument indicates one. + */ + + while (((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && (!m_b_was_drained)) || + (p_recycle_buffers_last_wr_id)) { + buff_status_e status = BS_OK; + mem_buf_desc_t *buff = poll(status); + if (!buff) { + update_global_sn_rx(cq_poll_sn, ret_total); + m_b_was_drained = true; + m_p_ring->m_gro_mgr.flush_all(nullptr); + return ret_total; + } + + ++m_n_wce_counter; + + if (cqe_process_rx(buff, status)) { + if (p_recycle_buffers_last_wr_id) { + m_p_cq_stat->n_rx_pkt_drop++; + reclaim_recv_buffer_helper(buff); + } else { + bool procces_now = is_eth_tcp_frame(buff); + + /* We process immediately all non udp/ip traffic.. */ + if (procces_now) { + buff->rx.is_xlio_thr = true; + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, NULL); + } + } else { /* udp/ip traffic we just put in the cq's rx queue */ + m_rx_queue.push_back(buff); + mem_buf_desc_t *buff_cur = m_rx_queue.front(); + m_rx_queue.pop_front(); + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_cur)) { + m_rx_queue.push_front(buff_cur); + } + } + } + } + + if (p_recycle_buffers_last_wr_id) { + *p_recycle_buffers_last_wr_id = (uintptr_t)buff; + } + + ++ret_total; + } + + update_global_sn_rx(cq_poll_sn, ret_total); + + m_p_ring->m_gro_mgr.flush_all(nullptr); + + m_n_wce_counter = 0; + m_b_was_drained = false; + + // Update cq statistics + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + m_p_cq_stat->n_rx_drained_at_once_max = + std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); + + return ret_total; +} + +mem_buf_desc_t *cq_mgr_rx_regrq::poll_and_process_socketxtreme() +{ + buff_status_e status = BS_OK; + mem_buf_desc_t *buff_wqe = poll(status); + + if (buff_wqe) { + if (cqe_process_rx(buff_wqe, status)) { + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_wqe)) { + return buff_wqe; + } + } else if (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { + compensate_qp_poll_failed(); + } + } else { + compensate_qp_poll_failed(); + } + + return nullptr; +} + +int cq_mgr_rx_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +{ + /* Assume locked!!! 
*/ + cq_logfuncall(""); + + uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); + if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + return ret_rx_processed; + } + + if (m_p_next_rx_desc_poll) { + prefetch_range((uint8_t *)m_p_next_rx_desc_poll->p_buffer, + m_n_sysvar_rx_prefetch_bytes_before_poll); + } + + buff_status_e status = BS_OK; + uint32_t ret = 0; + while (ret < m_n_sysvar_cq_poll_batch_max) { + mem_buf_desc_t *buff = poll(status); + if (buff) { + ++ret; + if (cqe_process_rx(buff, status)) { + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, pv_fd_ready_array); + } + } else { + m_p_cq_stat->n_rx_pkt_drop++; + if (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { + compensate_qp_poll_failed(); + } + } + } else { + m_b_was_drained = true; + break; + } + } + + update_global_sn_rx(*p_cq_poll_sn, ret); + + if (likely(ret > 0)) { + ret_rx_processed += ret; + m_n_wce_counter += ret; + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + } else { + compensate_qp_poll_failed(); + } + + return ret_rx_processed; +} + +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/sock/tcp_seg_pool.h b/src/core/dev/cq_mgr_rx_regrq.h similarity index 57% rename from src/core/sock/tcp_seg_pool.h rename to src/core/dev/cq_mgr_rx_regrq.h index b7c9852eb..8a02f77a0 100644 --- a/src/core/sock/tcp_seg_pool.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -30,39 +30,33 @@ * SOFTWARE. */ -#ifndef TCP_SEG_POOL_H -#define TCP_SEG_POOL_H +#ifndef CQ_MGR_REGRQ_H +#define CQ_MGR_REGRQ_H -#include -#include "dev/allocator.h" -#include "utils/lock_wrapper.h" -#include "lwip/tcp_impl.h" +#include "cq_mgr_rx.h" -class tcp_seg_pool : lock_spin { +class cq_mgr_rx_regrq : public cq_mgr_rx { public: - tcp_seg_pool(); - virtual ~tcp_seg_pool(); + cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + struct ibv_comp_channel *p_comp_event_channel); - std::pair get_tcp_seg_list(uint32_t amount); - tcp_seg *get_tcp_segs(uint32_t amount); - void put_tcp_segs(tcp_seg *seg_list); + virtual ~cq_mgr_rx_regrq() override; - static tcp_seg *split_tcp_segs(uint32_t count, tcp_seg *&tcp_seg_list, uint32_t &total_count); + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = nullptr) override; + virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = nullptr) override; -private: - bool expand(); - void print_report(vlog_levels_t log_level = VLOG_DEBUG); + virtual uint32_t clean_cq() override; - tcp_seg *m_p_head; - xlio_allocator_heap m_allocator; +protected: + mem_buf_desc_t *poll(enum buff_status_e &status); + inline void cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, + enum buff_status_e &status); - struct { - unsigned total_segs; - unsigned allocations; - unsigned expands; - } m_stats; +private: + int drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id); }; -extern tcp_seg_pool *g_tcp_seg_pool; - -#endif +#endif // CQ_MGR_MLX5_H diff --git a/src/core/dev/cq_mgr_mlx5_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp similarity index 81% rename from src/core/dev/cq_mgr_mlx5_strq.cpp rename to src/core/dev/cq_mgr_rx_strq.cpp index d18f2b34b..041364177 100644 --- a/src/core/dev/cq_mgr_mlx5_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ 
-30,19 +30,17 @@ * SOFTWARE. */ -#include "cq_mgr_mlx5_strq.h" +#include "cq_mgr_rx_strq.h" #if defined(DEFINED_DIRECT_VERBS) #include -#include "cq_mgr.inl" -#include "cq_mgr_mlx5.inl" -#include "qp_mgr.h" -#include "qp_mgr_eth_mlx5.h" +#include "cq_mgr_rx_inl.h" +#include "hw_queue_rx.h" #include "ring_simple.h" #include -#define MODULE_NAME "cq_mgr_mlx5_strq" +#define MODULE_NAME "cq_mgr_rx_strq" #define cq_logfunc __log_info_func #define cq_logdbg __log_info_dbg @@ -56,12 +54,10 @@ ##log_args); \ } while (0) -cq_mgr_mlx5_strq::cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, uint32_t stride_size_bytes, - uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, - bool call_configure) - : cq_mgr_mlx5(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) +cq_mgr_rx_strq::cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) @@ -74,7 +70,7 @@ cq_mgr_mlx5_strq::cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx return_stride(next_stride()); // Fill _stride_cache } -cq_mgr_mlx5_strq::~cq_mgr_mlx5_strq() +cq_mgr_rx_strq::~cq_mgr_rx_strq() { cq_logfunc(""); cq_logdbg("destroying CQ STRQ"); @@ -88,7 +84,10 @@ cq_mgr_mlx5_strq::~cq_mgr_mlx5_strq() cq_logdbg("Clearing %zu stride objects)", m_rx_queue.size()); while (!m_rx_queue.empty()) { - reclaim_recv_buffer_helper(m_rx_queue.get_and_pop_front()); + mem_buf_desc_t *buff = m_rx_queue.get_and_pop_front(); + if (likely(buff)) { + reclaim_recv_buffer_helper(buff); + } } m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); @@ -101,7 +100,7 @@ cq_mgr_mlx5_strq::~cq_mgr_mlx5_strq() g_buffer_pool_rx_stride->put_buffers_thread_safe(&_stride_cache, _stride_cache.size()); } -mem_buf_desc_t *cq_mgr_mlx5_strq::next_stride() +mem_buf_desc_t *cq_mgr_rx_strq::next_stride() { if (unlikely(_stride_cache.size() <= 0U)) { if (!g_buffer_pool_rx_stride->get_buffers_thread_safe( @@ -117,7 +116,7 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::next_stride() return _stride_cache.get_and_pop_back(); } -void cq_mgr_mlx5_strq::return_stride(mem_buf_desc_t *desc) +void cq_mgr_rx_strq::return_stride(mem_buf_desc_t *desc) { _stride_cache.push_back(desc); @@ -127,16 +126,12 @@ void cq_mgr_mlx5_strq::return_stride(mem_buf_desc_t *desc) } } -uint32_t cq_mgr_mlx5_strq::clean_cq() +uint32_t cq_mgr_rx_strq::clean_cq() { uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; - /* Sanity check for cq: initialization of tx and rx cq has difference: - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ - if (NULL == m_qp) { + if (!m_hqrx_ptr) { // Sanity check return 0; } @@ -151,18 +146,18 @@ uint32_t cq_mgr_mlx5_strq::clean_cq() stride_buf = nullptr; } - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); return ret_total; } -bool cq_mgr_mlx5_strq::set_current_hot_buffer() +bool cq_mgr_rx_strq::set_current_hot_buffer() { - if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); - m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; + if (likely(m_hqrx_ptr->m_rq_data.tail != (m_hqrx_ptr->m_rq_data.head))) { + uint32_t index = 
m_hqrx_ptr->m_rq_data.tail & (m_hqrx_ptr->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t *)m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index]; m_rx_hot_buffer->set_ref_count(_strides_num); - m_qp->m_rq_wqe_idx_to_wrid[index] = 0; + m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index] = 0; return true; } @@ -171,13 +166,13 @@ bool cq_mgr_mlx5_strq::set_current_hot_buffer() return false; } -mem_buf_desc_t *cq_mgr_mlx5_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) +mem_buf_desc_t *cq_mgr_rx_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) { - mem_buf_desc_t *buff = NULL; + mem_buf_desc_t *buff = nullptr; if (unlikely(!m_rx_hot_buffer)) { if (!set_current_hot_buffer()) { - return NULL; + return nullptr; } } @@ -199,9 +194,9 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::poll(enum buff_status_e &status, mem_buf_desc_ bool is_wqe_complete = strq_cqe_to_mem_buff_desc(cqe, status, is_filler); if (is_wqe_complete) { - ++m_qp->m_mlx5_qp.rq.tail; + ++m_hqrx_ptr->m_rq_data.tail; buff = m_rx_hot_buffer; - m_rx_hot_buffer = NULL; + m_rx_hot_buffer = nullptr; if (likely(status == BS_OK)) { ++m_p_cq_stat->n_rx_consumed_rwqe_count; } @@ -228,8 +223,8 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::poll(enum buff_status_e &status, mem_buf_desc_ return buff; } -inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, - enum buff_status_e &status, bool &is_filler) +inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, + enum buff_status_e &status, bool &is_filler) { struct mlx5_err_cqe *ecqe; ecqe = (struct mlx5_err_cqe *)cqe; @@ -245,8 +240,8 @@ inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cq case MLX5_CQE_RESP_SEND_INV: { status = BS_OK; _hot_buffer_stride->rx.strides_num = ((host_byte_cnt >> 16) & 0x00003FFF); - _hot_buffer_stride->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_STRIDE; - _hot_buffer_stride->lwip_pbuf.pbuf.desc.mdesc = m_rx_hot_buffer; + _hot_buffer_stride->lwip_pbuf.desc.attr = PBUF_DESC_STRIDE; + _hot_buffer_stride->lwip_pbuf.desc.mdesc = m_rx_hot_buffer; is_filler = (host_byte_cnt >> 31 != 0U ? 
true : false); _hot_buffer_stride->sz_data = @@ -257,7 +252,7 @@ inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cq _current_wqe_consumed_bytes += _hot_buffer_stride->sz_buffer; _hot_buffer_stride->rx.timestamps.hw_raw = ntohll(cqe->timestamp); - _hot_buffer_stride->rx.flow_tag_id = xlio_get_flow_tag(cqe); + _hot_buffer_stride->rx.flow_tag_id = ntohl((uint32_t)(cqe->sop_drop_qpn)); _hot_buffer_stride->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); @@ -282,8 +277,8 @@ inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cq case MLX5_CQE_RESP_ERR: default: { _hot_buffer_stride->rx.strides_num = ((host_byte_cnt >> 16) & 0x00003FFF); - _hot_buffer_stride->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_STRIDE; - _hot_buffer_stride->lwip_pbuf.pbuf.desc.mdesc = m_rx_hot_buffer; + _hot_buffer_stride->lwip_pbuf.desc.attr = PBUF_DESC_STRIDE; + _hot_buffer_stride->lwip_pbuf.desc.mdesc = m_rx_hot_buffer; is_filler = true; _current_wqe_consumed_bytes = _wqe_buff_size_bytes; _hot_buffer_stride->sz_data = 0U; @@ -338,12 +333,12 @@ inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cq return false; } -int cq_mgr_mlx5_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, - buff_status_e status, - uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_rx_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, + buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id) { int ret_total = 0; - if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && + if (buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && !p_recycle_buffers_last_wr_id) { compensate_qp_poll_failed(); // Reuse this method as success. } @@ -357,8 +352,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_de m_p_cq_stat->n_rx_pkt_drop++; reclaim_recv_buffer_helper(buff); } else { - bool procces_now = - (m_transport_type == XLIO_TRANSPORT_ETH ? is_eth_tcp_frame(buff) : false); + bool procces_now = is_eth_tcp_frame(buff); // We process immediately all non udp/ip traffic.. if (procces_now) { @@ -378,7 +372,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_de return ret_total; } -int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_rx_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) { cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); @@ -401,7 +395,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); if (!buff && !buff_wqe) { - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_b_was_drained = true; m_p_ring->m_gro_mgr.flush_all(nullptr); return ret_total; @@ -411,7 +405,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id drain_and_proccess_helper(buff, buff_wqe, status, p_recycle_buffers_last_wr_id); } - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_p_ring->m_gro_mgr.flush_all(nullptr); m_n_wce_counter = 0; // Actually strides count. 
@@ -425,8 +419,8 @@ int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id return ret_total; } -mem_buf_desc_t *cq_mgr_mlx5_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) +mem_buf_desc_t *cq_mgr_rx_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, + enum buff_status_e status) { /* Assume locked!!! */ cq_logfuncall(""); @@ -450,20 +444,20 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_m return p_mem_buf_desc; } -mem_buf_desc_t *cq_mgr_mlx5_strq::poll_and_process_socketxtreme() +mem_buf_desc_t *cq_mgr_rx_strq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); - if ((buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) || !buff) { + if ((buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) || !buff) { compensate_qp_poll_failed(); // Reuse this method as success. } return (buff && cqe_process_rx(buff, status) ? buff : nullptr); } -int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +int cq_mgr_rx_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { /* Assume locked!!! */ cq_logfuncall(""); @@ -485,7 +479,7 @@ int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void * mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); - if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { + if (buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { compensate_qp_poll_failed(); // Reuse this method as success. } @@ -501,7 +495,7 @@ int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void * } } - update_global_sn(*p_cq_poll_sn, ret); + update_global_sn_rx(*p_cq_poll_sn, ret); if (likely(ret > 0)) { m_n_wce_counter += ret; // Actually strides count. 
@@ -513,18 +507,17 @@ int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void * return ret_rx_processed; } -void cq_mgr_mlx5_strq::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx_strq::add_hqrx(hw_queue_rx *hqrx) { cq_logfunc(""); - set_qp_rq(qp); _hot_buffer_stride = nullptr; _current_wqe_consumed_bytes = 0U; - cq_mgr::add_qp_rx(qp); + cq_mgr_rx::add_hqrx(hqrx); } -void cq_mgr_mlx5_strq::statistics_print() +void cq_mgr_rx_strq::statistics_print() { - cq_mgr::statistics_print(); + cq_mgr_rx::statistics_print(); cq_logdbg_no_funcname("RWQE consumed: %12" PRIu64, m_p_cq_stat->n_rx_consumed_rwqe_count); cq_logdbg_no_funcname("Packets count: %12" PRIu64, m_p_cq_stat->n_rx_packet_count); cq_logdbg_no_funcname("Max Strides per Packet: %12" PRIu16, @@ -534,13 +527,13 @@ void cq_mgr_mlx5_strq::statistics_print() cq_logdbg_no_funcname("LRO bytes: %12" PRIu64, m_p_cq_stat->n_rx_lro_bytes); } -void cq_mgr_mlx5_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) +void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { mem_buf_desc_t *temp = nullptr; while (buff) { - if (unlikely(buff->lwip_pbuf.pbuf.desc.attr != PBUF_DESC_STRIDE)) { + if (unlikely(buff->lwip_pbuf.desc.attr != PBUF_DESC_STRIDE)) { __log_info_err("CQ STRQ reclaim_recv_buffer_helper with incompatible " "mem_buf_desc_t object"); // We cannot continue iterating over a broken buffer object. @@ -548,15 +541,15 @@ void cq_mgr_mlx5_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) } mem_buf_desc_t *rwqe = - reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + reinterpret_cast(buff->lwip_pbuf.desc.mdesc); if (buff->rx.strides_num == rwqe->add_ref_count(-buff->rx.strides_num)) { // Is last stride. - cq_mgr::reclaim_recv_buffer_helper(rwqe); + cq_mgr_rx::reclaim_recv_buffer_helper(rwqe); } VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; temp = buff; - assert(temp->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY); + assert(temp->lwip_pbuf.type != PBUF_ZEROCOPY); buff = temp->p_next_desc; temp->clear_transport_data(); temp->p_next_desc = nullptr; diff --git a/src/core/dev/cq_mgr_mlx5_strq.h b/src/core/dev/cq_mgr_rx_strq.h similarity index 83% rename from src/core/dev/cq_mgr_mlx5_strq.h rename to src/core/dev/cq_mgr_rx_strq.h index 8a00de614..3852465f2 100644 --- a/src/core/dev/cq_mgr_mlx5_strq.h +++ b/src/core/dev/cq_mgr_rx_strq.h @@ -30,33 +30,33 @@ * SOFTWARE. 
*/ -#ifndef CQ_MGR_MLX5_STRQ_H -#define CQ_MGR_MLX5_STRQ_H +#ifndef CQ_MGR_STRQ_H +#define CQ_MGR_STRQ_H #include #include -#include "cq_mgr_mlx5.h" +#include "cq_mgr_rx.h" -class cq_mgr_mlx5_strq : public cq_mgr_mlx5 { +class cq_mgr_rx_strq : public cq_mgr_rx { public: - cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - uint32_t stride_size_bytes, uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, bool call_configure = true); + cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + uint32_t stride_size_bytes, uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel); - virtual ~cq_mgr_mlx5_strq() override; + virtual ~cq_mgr_rx_strq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) override; - virtual void add_qp_rx(qp_mgr *qp) override; + virtual void add_hqrx(hw_queue_rx *qp) override; virtual uint32_t clean_cq() override; protected: virtual void statistics_print() override; virtual void reclaim_recv_buffer_helper(mem_buf_desc_t *buff) override; - inline mem_buf_desc_t *poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride); + mem_buf_desc_t *poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride); private: mem_buf_desc_t *next_stride(); diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp new file mode 100644 index 000000000..17fceb582 --- /dev/null +++ b/src/core/dev/cq_mgr_tx.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "dev/cq_mgr_tx.h" +#include +#include +#include +#include "ring_simple.h" +#include "hw_queue_tx.h" + +#define MODULE_NAME "cq_mgr_tx" + +#define cq_logpanic __log_info_panic +#define cq_logerr __log_info_err +#define cq_logwarn __log_info_warn +#define cq_loginfo __log_info_info +#define cq_logdbg __log_info_dbg +#define cq_logfunc __log_info_func +#define cq_logfuncall __log_info_funcall + +atomic_t cq_mgr_tx::m_n_cq_id_counter_tx = ATOMIC_INIT(1); + +uint64_t cq_mgr_tx::m_n_global_sn_tx = 0U; + +cq_mgr_tx::cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + ibv_comp_channel *p_comp_event_channel) + : m_p_ring(p_ring) + , m_p_ib_ctx_handler(p_ib_ctx_handler) + , m_comp_event_channel(p_comp_event_channel) +{ + m_cq_id_tx = atomic_fetch_and_inc(&m_n_cq_id_counter_tx); // cq id is nonzero + configure(cq_size); + + memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); +} + +cq_mgr_tx::~cq_mgr_tx() +{ + cq_logdbg("Destroying CQ as Tx"); + + IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_p_ibv_cq), EIO) + { + cq_logdbg("destroy cq failed (errno=%d %m)", errno); + } + ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + cq_logdbg("Destroying CQ as Tx done"); +} + +uint32_t cq_mgr_tx::clean_cq() +{ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + mem_buf_desc_t *buff; + + int ret = 0; + /* coverity[stack_use_local_overflow] */ + xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + while ((ret = clean_cq_poll_tx(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { + for (int i = 0; i < ret; i++) { + buff = (mem_buf_desc_t *)(uintptr_t)(wce[i].wr_id); + if (buff) { + m_p_ring->mem_buf_desc_return_single_to_owner_tx(buff); + } + } + ret_total += ret; + } + + return ret_total; +} + +int cq_mgr_tx::clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) +{ + // Assume locked!!! + cq_logfuncall(""); + + int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); + if (ret <= 0) { + // Zero polled wce OR ibv_poll_cq() has driver specific errors + // so we can't really do anything with them + *p_cq_poll_sn = m_n_global_sn_tx; + return 0; + } + + if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { + for (int i = 0; i < ret; i++) { + cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " + "byte_len=%d, imm_data=%x", + i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), + p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); + cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " + "dlid_path_bits=%x", + p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), + p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); + } + } + + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn_tx; + next_sn.bundle.cq_id = m_cq_id_tx; + + *p_cq_poll_sn = m_n_global_sn_tx = next_sn.global_sn; + + return ret; +} + +void cq_mgr_tx::configure(int cq_size) +{ + xlio_ibv_cq_init_attr attr; + memset(&attr, 0, sizeof(attr)); + + struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); + int comp_vector = 0; +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + /* + * For some scenario with forking usage we may want to distribute CQs across multiple + * CPUs to improve CPS in case of multiple processes. 
+ */
+    if (safe_mce_sys().app.distribute_cq_interrupts && g_p_app->get_worker_id() >= 0) {
+        comp_vector = g_p_app->get_worker_id() % context->num_comp_vectors;
+    }
+#endif
+    m_p_ibv_cq = xlio_ibv_create_cq(context, cq_size - 1, (void *)this, m_comp_event_channel,
+                                    comp_vector, &attr);
+    BULLSEYE_EXCLUDE_BLOCK_START
+    if (!m_p_ibv_cq) {
+        throw_xlio_exception("ibv_create_cq failed");
+    }
+    BULLSEYE_EXCLUDE_BLOCK_END
+    VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq));
+
+    cq_logdbg("Created CQ as Tx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)",
+              get_channel_fd(), cq_size, m_p_ibv_cq);
+}
+
+void cq_mgr_tx::add_qp_tx(hw_queue_tx *hqtx_ptr)
+{
+    // Assume locked!
+    cq_logdbg("hqtx_ptr=%p", hqtx_ptr);
+    m_hqtx_ptr = hqtx_ptr;
+
+    if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) {
+        cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno);
+    }
+
+    cq_logfunc("hqtx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_hqtx_ptr, m_mlx5_cq.dbrec,
+               m_mlx5_cq.cq_buf);
+}
+
+void cq_mgr_tx::del_qp_tx(hw_queue_tx *hqtx_ptr)
+{
+    BULLSEYE_EXCLUDE_BLOCK_START
+    if (m_hqtx_ptr != hqtx_ptr) {
+        cq_logdbg("wrong hqtx_ptr=%p != m_hqtx_ptr=%p", hqtx_ptr, m_hqtx_ptr);
+        return;
+    }
+    BULLSEYE_EXCLUDE_BLOCK_END
+    cq_logdbg("m_hqtx_ptr=%p", m_hqtx_ptr);
+    m_hqtx_ptr = nullptr;
+}
+
+int cq_mgr_tx::request_notification(uint64_t poll_sn)
+{
+    int ret = -1;
+
+    cq_logfuncall("");
+
+    if ((m_n_global_sn_tx > 0 && poll_sn != m_n_global_sn_tx)) {
+        // The cq_mgr_tx has completions pending processing (or they were processed since
+        // cq_poll_sn)
+        cq_logfunc("mismatched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_tx);
+        return 1;
+    }
+
+    if (m_b_notification_armed == false) {
+
+        cq_logfunc("arming cq_mgr_tx notification channel");
+
+        // Arm the CQ notification channel
+        IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0))
+        {
+            cq_logerr("Failure arming the TX notification channel (errno=%d %m)", errno);
+        }
+        else
+        {
+            ret = 0;
+            m_b_notification_armed = true;
+        }
+        ENDIF_VERBS_FAILURE;
+    } else {
+        // cq_mgr_tx notification channel already armed
+        ret = 0;
+    }
+
+    cq_logfuncall("returning with %d", ret);
+    return ret;
+}
+
+cq_mgr_tx *cq_mgr_tx::get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel)
+{
+    cq_mgr_tx *p_cq_mgr = nullptr;
+    struct ibv_cq *p_cq_hndl = nullptr;
+    void *p_context; // deal with compiler warnings
+
+    // read & ack the CQ event
+    IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context))
+    {
+        vlog_printf(VLOG_INFO,
+                    MODULE_NAME
+                    ":%d: waiting on cq_mgr_tx event returned with error (errno=%d %m)\n",
+                    __LINE__, errno);
+    }
+    else
+    {
+        p_cq_mgr = (cq_mgr_tx *)p_context; // Save the cq_mgr_tx
+        p_cq_mgr->get_cq_event();
+        ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event
+    }
+    ENDIF_VERBS_FAILURE;
+
+    return p_cq_mgr;
+}
+
+int cq_mgr_tx::poll_and_process_element_tx(uint64_t *p_cq_poll_sn)
+{
+    cq_logfuncall("");
+
+    static auto is_error_opcode = [&](uint8_t opcode) {
+        return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR;
+    };
+
+    int ret = 0;
+    uint32_t num_polled_cqes = 0;
+    xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes);
+
+    if (likely(cqe)) {
+        unsigned index = ntohs(cqe->wqe_counter) & (m_hqtx_ptr->m_tx_num_wr - 1);
+
+        // All error opcodes have the most significant bit set.
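+        // op_own keeps the opcode in its high nibble (MLX5_CQE_OPCODE) and the ownership
+        // bit in its low bits (MLX5_CQE_OWNER), so the cheap (op_own & 0x80) test filters
+        // the common success path before is_error_opcode() checks the exact opcode.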
+ if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { + // m_p_cq_stat->n_tx_cqe_error++; Future counter + log_cqe_error(cqe); + } + + handle_sq_wqe_prop(index); + ret = 1; + } + update_global_sn_tx(*p_cq_poll_sn, num_polled_cqes); + + return ret; +} + +void cq_mgr_tx::log_cqe_error(struct xlio_mlx5_cqe *cqe) +{ + struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; + + /* TODO We can also ask hw_queue_tx to log WQE fields from SQ. But at first, we need to remove + * prefetch and memset of the next WQE there. Credit system will guarantee that we don't + * reuse the WQE at this point. + */ + + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { + cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " + "wqe_counter=0x%x", + ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), + *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), + ntohs(ecqe->wqe_counter)); + } +} + +void cq_mgr_tx::handle_sq_wqe_prop(unsigned index) +{ + sq_wqe_prop *p = &m_hqtx_ptr->m_sq_wqe_idx_to_prop[index]; + sq_wqe_prop *prev; + unsigned credits = 0; + + /* + * TX completions can be signalled for a set of WQEs as an optimization. + * Therefore, for every TX completion we may need to handle multiple + * WQEs. Since every WQE can have various size and the WQE index is + * wrapped around, we build a linked list to simplify things. Each + * element of the linked list represents properties of a previously + * posted WQE. + * + * We keep index of the last completed WQE and stop processing the list + * when we reach the index. This condition is checked in + * is_sq_wqe_prop_valid(). + */ + + do { + if (p->buf) { + m_p_ring->mem_buf_desc_return_single_locked(p->buf); + } + if (p->ti) { + xlio_ti *ti = p->ti; + if (ti->m_callback) { + ti->m_callback(ti->m_callback_arg); + } + + ti->put(); + if (unlikely(ti->m_released && ti->m_ref == 0)) { + ti->ti_released(); + } + } + credits += p->credits; + + prev = p; + p = p->next; + } while (p && m_hqtx_ptr->is_sq_wqe_prop_valid(p, prev)); + + m_p_ring->return_tx_pool_to_global_pool(); + m_hqtx_ptr->credits_return(credits); + m_hqtx_ptr->m_sq_wqe_prop_last_signalled = index; +} diff --git a/src/core/dev/cq_mgr_mlx5.h b/src/core/dev/cq_mgr_tx.h similarity index 55% rename from src/core/dev/cq_mgr_mlx5.h rename to src/core/dev/cq_mgr_tx.h index ff656fe0b..f5b5b7fec 100644 --- a/src/core/dev/cq_mgr_mlx5.h +++ b/src/core/dev/cq_mgr_tx.h @@ -30,74 +30,75 @@ * SOFTWARE. 
*/ -#ifndef CQ_MGR_MLX5_H -#define CQ_MGR_MLX5_H +#ifndef CQ_MGR_TX_H +#define CQ_MGR_TX_H -#include "cq_mgr.h" -#include "qp_mgr_eth_mlx5.h" +#include "dev/ib_ctx_handler.h" -#if defined(DEFINED_DIRECT_VERBS) +class hw_queue_tx; +class ring_simple; -class qp_mgr_eth_mlx5; +class cq_mgr_tx { +public: + cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + ibv_comp_channel *p_comp_event_channel); + ~cq_mgr_tx(); + + // Helper gunction to extract the cq_mgr_tx from the CQ event, + // Since we have a single TX CQ comp channel for all cq_mgr_tx's, it might not be the active_cq + // object + static cq_mgr_tx *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); + + ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } + int get_channel_fd() { return m_comp_event_channel->fd; } + + void configure(int cq_size); + void add_qp_tx(hw_queue_tx *hqtx_ptr); + void del_qp_tx(hw_queue_tx *hqtx_ptr); + + uint32_t clean_cq(); + + /** + * Arm the managed CQ's notification channel + * Calling this more then once without get_event() will return without + * doing anything (arm flag is changed to true on first call). + * This call will also check if a wce was processes between the + * last poll and this arm request - if true it will not arm the CQ + * @return ==0 cq is armed + * ==1 cq not armed (cq poll_sn out of sync) + * < 0 on error + */ + int request_notification(uint64_t poll_sn); -/* Get CQE opcode. */ -#define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) + int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); -/* Get CQE owner bit. */ -#define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) + void reset_notification_armed() { m_b_notification_armed = false; } -class cq_mgr_mlx5 : public cq_mgr { -public: - enum buff_status_e { - BS_OK, - BS_CQE_RESP_WR_IMM_NOT_SUPPORTED, - BS_IBV_WC_WR_FLUSH_ERR, - BS_CQE_INVALID, - BS_GENERAL_ERR - }; - - cq_mgr_mlx5(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, - bool call_configure = true); - virtual ~cq_mgr_mlx5(); - - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); - virtual mem_buf_desc_t *poll_and_process_socketxtreme(); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); - virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); - - mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); - virtual void add_qp_rx(qp_mgr *qp); - void set_qp_rq(qp_mgr *qp); - virtual void add_qp_tx(qp_mgr *qp); - virtual uint32_t clean_cq(); - virtual void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; - -protected: - qp_mgr_eth_mlx5 *m_qp; - xlio_ib_mlx5_cq_t m_mlx5_cq; - mem_buf_desc_t *m_rx_hot_buffer; +private: + void log_cqe_error(struct xlio_mlx5_cqe *cqe); + void handle_sq_wqe_prop(unsigned index); + int clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); - inline struct xlio_mlx5_cqe *check_cqe(void); - mem_buf_desc_t *poll(enum buff_status_e &status); + void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; + inline void update_global_sn_tx(uint64_t &cq_poll_sn, uint32_t rettotal); inline struct xlio_mlx5_cqe *get_cqe_tx(uint32_t &num_polled_cqes); - void log_cqe_error(struct xlio_mlx5_cqe *cqe); - inline void cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, - enum buff_status_e &status); - inline void 
update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); - void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); -private: - void handle_sq_wqe_prop(unsigned index); - int drain_and_proccess_socketxtreme(uintptr_t *p_recycle_buffers_last_wr_id); - int drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, - uintptr_t *p_recycle_buffers_last_wr_id); + static atomic_t m_n_cq_id_counter_tx; + static uint64_t m_n_global_sn_tx; - virtual int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; + xlio_ib_mlx5_cq_t m_mlx5_cq; + ring_simple *m_p_ring; + ib_ctx_handler *m_p_ib_ctx_handler; + ibv_comp_channel *m_comp_event_channel; + hw_queue_tx *m_hqtx_ptr = nullptr; + struct ibv_cq *m_p_ibv_cq = nullptr; + uint32_t m_cq_id_tx = 0U; + uint32_t m_n_cq_poll_sn_tx = 0U; + bool m_b_notification_armed = false; }; -inline void cq_mgr_mlx5::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +inline void cq_mgr_tx::update_global_sn_tx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) { if (num_polled_cqes > 0) { // spoil the global sn if we have packets ready @@ -108,17 +109,17 @@ inline void cq_mgr_mlx5::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_pol uint32_t cq_sn; } bundle; } next_sn; - m_n_cq_poll_sn += num_polled_cqes; - next_sn.bundle.cq_sn = m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; + m_n_cq_poll_sn_tx += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn_tx; + next_sn.bundle.cq_id = m_cq_id_tx; - m_n_global_sn = next_sn.global_sn; + m_n_global_sn_tx = next_sn.global_sn; } - cq_poll_sn = m_n_global_sn; + cq_poll_sn = m_n_global_sn_tx; } -inline struct xlio_mlx5_cqe *cq_mgr_mlx5::get_cqe_tx(uint32_t &num_polled_cqes) +inline struct xlio_mlx5_cqe *cq_mgr_tx::get_cqe_tx(uint32_t &num_polled_cqes) { struct xlio_mlx5_cqe *cqe_ret = nullptr; struct xlio_mlx5_cqe *cqe = @@ -150,5 +151,4 @@ inline struct xlio_mlx5_cqe *cq_mgr_mlx5::get_cqe_tx(uint32_t &num_polled_cqes) return cqe_ret; } -#endif /* DEFINED_DIRECT_VERBS */ -#endif // CQ_MGR_MLX5_H +#endif // CQ_MGR_TX_H diff --git a/src/core/dev/dm_mgr.cpp b/src/core/dev/dm_mgr.cpp index 826d6b89a..3afb547b8 100644 --- a/src/core/dev/dm_mgr.cpp +++ b/src/core/dev/dm_mgr.cpp @@ -53,9 +53,9 @@ #define dm_logfunc __log_info_func dm_mgr::dm_mgr() - : m_p_dm_mr(NULL) - , m_p_ibv_dm(NULL) - , m_p_ring_stat(NULL) + : m_p_dm_mr(nullptr) + , m_p_ibv_dm(nullptr) + , m_p_ring_stat(nullptr) , m_allocation(0) , m_used(0) , m_head(0) {}; @@ -106,7 +106,7 @@ bool dm_mgr::allocate_resources(ib_ctx_handler *ib_ctx, ring_stats_t *ring_stats m_p_dm_mr = xlio_ibv_reg_dm_mr(&mr_in); if (!m_p_dm_mr) { xlio_ibv_free_dm(m_p_ibv_dm); - m_p_ibv_dm = NULL; + m_p_ibv_dm = nullptr; dm_logerr("ibv_free_dm error - dm_mr registration failed, %d %m", errno); return false; } @@ -132,7 +132,7 @@ void dm_mgr::release_resources() } else { dm_logdbg("ibv_dereg_mr success"); } - m_p_dm_mr = NULL; + m_p_dm_mr = nullptr; } if (m_p_ibv_dm) { @@ -141,10 +141,10 @@ void dm_mgr::release_resources() } else { dm_logdbg("ibv_free_dm success"); } - m_p_ibv_dm = NULL; + m_p_ibv_dm = nullptr; } - m_p_ring_stat = NULL; + m_p_ring_stat = nullptr; dm_logdbg("Device memory release completed!"); } diff --git a/src/core/dev/dm_mgr.h b/src/core/dev/dm_mgr.h index 924b5cb5b..adaf52eda 100644 --- a/src/core/dev/dm_mgr.h +++ b/src/core/dev/dm_mgr.h @@ -52,7 +52,10 @@ class dm_mgr { bool copy_data(struct mlx5_wqe_data_seg *seg, uint8_t *src, uint32_t length, mem_buf_desc_t *buff); void release_data(mem_buf_desc_t *buff); - 
inline bool is_completion_need() { return m_allocation - m_used < DM_COMPLETION_THRESHOLD; }; + inline bool is_completion_need() const + { + return m_allocation - m_used < DM_COMPLETION_THRESHOLD; + }; private: struct ibv_mr *m_p_dm_mr; @@ -85,7 +88,7 @@ class dm_mgr { return false; }; inline void release_data(mem_buf_desc_t *buff) { NOT_IN_USE(buff); }; - inline bool is_completion_need() { return false; }; + inline bool is_completion_need() const { return false; }; }; #endif /* DEFINED_IBV_DM */ diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp new file mode 100644 index 000000000..6ed6856bf --- /dev/null +++ b/src/core/dev/hw_queue_rx.cpp @@ -0,0 +1,637 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "dev/hw_queue_rx.h" +#include "dev/buffer_pool.h" +#include "dev/ring_simple.h" +#include "dev/rfs_rule.h" +#include "dev/cq_mgr_rx_regrq.h" +#include "dev/cq_mgr_rx_strq.h" + +#undef MODULE_NAME +#define MODULE_NAME "hw_queue_rx" + +#define hwqrx_logpanic __log_info_panic +#define hwqrx_logerr __log_info_err +#define hwqrx_logwarn __log_info_warn +#define hwqrx_loginfo __log_info_info +#define hwqrx_logdbg __log_info_dbg +#define hwqrx_logfunc __log_info_func +#define hwqrx_logfuncall __log_info_funcall + +#define ALIGN_WR_DOWN(_num_wr_) (std::max(32, ((_num_wr_) & ~(0xf)))) + +hw_queue_rx::hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, + ibv_comp_channel *rx_comp_event_channel, uint16_t vlan) + : m_p_ring(ring) + , m_p_ib_ctx_handler(ib_ctx) + , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) + , m_rx_num_wr(align32pow2(safe_mce_sys().rx_num_wr)) + , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) + , m_vlan(vlan) +{ + hwqrx_logfunc(""); + + if (!configure_rq(rx_comp_event_channel)) { + throw_xlio_exception("Failed to create RQ"); + } +} + +hw_queue_rx::~hw_queue_rx() +{ + hwqrx_logfunc(""); + + m_rq.reset(nullptr); // Must be destroyed before RX CQ. 
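+ // (Ordering rationale: the dpcp RQ is bound to the CQ it was created with,
+ // so it is torn down first, before the cq_mgr_rx object is deleted below.)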
+ + if (m_rq_wqe_idx_to_wrid) { + if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { + hwqrx_logerr( + "Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + } + m_rq_wqe_idx_to_wrid = nullptr; + } + + if (m_p_cq_mgr_rx) { + delete m_p_cq_mgr_rx; + m_p_cq_mgr_rx = nullptr; + } + + delete[] m_ibv_rx_sg_array; + delete[] m_ibv_rx_wr_array; + + hwqrx_logdbg("Rx buffer poll: %ld free global buffers available", + g_buffer_pool_rx_rwqe->get_free_count()); +} + +bool hw_queue_rx::configure_rq(ibv_comp_channel *rx_comp_event_channel) +{ + // Check device capabilities for max QP work requests + /*uint32_t max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr - 1); + if (m_rx_num_wr > max_qp_wr) { + hwqrx_logwarn("Allocating only %d Rx work requests while user " + "requested %s=%d for RX on <%p>", + max_qp_wr, SYS_VAR_RX_NUM_WRE, m_rx_num_wr, m_p_ib_ctx_handler); + m_rx_num_wr = max_qp_wr; + }*/ + + // Create associated cq_mgr_tx + BULLSEYE_EXCLUDE_BLOCK_START + m_p_cq_mgr_rx = init_rx_cq_mgr(rx_comp_event_channel); + if (!m_p_cq_mgr_rx) { + hwqrx_logerr("Failed allocating m_p_cq_mgr_rx (errno=%d %m)", errno); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Modify the cq_mgr_rx to use a non-blocking event channel + set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); + + m_curr_rx_wr = 0; + + xlio_ib_mlx5_cq_t mlx5_cq; + memset(&mlx5_cq, 0, sizeof(mlx5_cq)); + xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq); + + hwqrx_logdbg( + "Creating RQ of transport type '%s' on ibv device '%s' [%p], cq: %p(%u), wre: %d, sge: %d", + priv_xlio_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_p_cq_mgr_rx, + mlx5_cq.cq_num, m_rx_num_wr, m_rx_sge); + + if (safe_mce_sys().enable_striding_rq) { + m_rx_sge = 2U; // Striding-RQ needs a reserved segment. + m_strq_wqe_reserved_seg = 1U; + } + + m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; + m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_rx_sge]; + + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx * m_rx_sge]; + m_ibv_rx_wr_array[wr_idx].num_sge = m_rx_sge; + m_ibv_rx_wr_array[wr_idx].next = &m_ibv_rx_wr_array[wr_idx + 1]; + } + + m_ibv_rx_wr_array[m_n_sysvar_rx_num_wr_to_post_recv - 1].next = nullptr; + + if (safe_mce_sys().enable_striding_rq) { + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); + // To bypass a check inside xlio_ib_mlx5_post_recv. 
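+ // (Presumably a zero-length first SGE would otherwise be dropped when the
+ // receive WQE scatter list is built, as with the !length skip in
+ // xlio_raw_post_recv() below.)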
+ m_ibv_rx_wr_array[wr_idx].sg_list[0].length = 1U; + } + } + + // Create the QP + if (!prepare_rq(mlx5_cq.cq_num)) { + return false; + } + + return true; +} + +void hw_queue_rx::up() +{ + m_tir.reset(create_tir()); + if (!m_tir) { + hwqrx_logpanic("TIR creation for hw_queue_rx failed (errno=%d %m)", errno); + } + + release_rx_buffers(); // We might have old flushed cqe's in our CQ still from previous HA event + + modify_queue_to_ready_state(); + + m_p_cq_mgr_rx->add_hqrx(this); +} + +void hw_queue_rx::down() +{ + m_tir.reset(nullptr); + + modify_queue_to_error_state(); + + // let the QP drain all wqe's to flushed cqe's now that we moved + // it to error state and post_sent final trigger for completion + usleep(1000); + + release_rx_buffers(); + m_p_cq_mgr_rx->del_hqrx(this); +} + +void hw_queue_rx::release_rx_buffers() +{ + int total_ret = m_curr_rx_wr; + if (m_curr_rx_wr) { + hwqrx_logdbg("Returning %d pending post_recv buffers to CQ owner", m_curr_rx_wr); + while (m_curr_rx_wr) { + // Cleaning unposted buffers. Unposted buffers are not attached to any strides. + --m_curr_rx_wr; + mem_buf_desc_t *p_mem_buf_desc = + (mem_buf_desc_t *)(uintptr_t)m_ibv_rx_wr_array[m_curr_rx_wr].wr_id; + if (p_mem_buf_desc && p_mem_buf_desc->p_desc_owner) { + m_p_ring->mem_buf_desc_return_to_owner_rx(p_mem_buf_desc); + } else { + g_buffer_pool_rx_rwqe->put_buffers_thread_safe(p_mem_buf_desc); + } + } + } + // Wait for all FLUSHed WQE on Rx CQ + hwqrx_logdbg("draining cq_mgr_rx %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx, + m_last_posted_rx_wr_id); + uintptr_t last_polled_rx_wr_id = 0; + while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO && + !is_rq_empty() && !m_p_ib_ctx_handler->is_removed()) { + + // Process the FLUSH'ed WQE's + int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); + hwqrx_logdbg("draining completed on cq_mgr_rx (%d wce) last_polled_rx_wr_id = %lu", ret, + last_polled_rx_wr_id); + + total_ret += ret; + + if (!ret) { + // Query context for ib_verbs events (especially for IBV_EVENT_DEVICE_FATAL) + g_p_event_handler_manager->query_for_ibverbs_event( + m_p_ib_ctx_handler->get_ibv_context()->async_fd); + } + + // Add short delay (500 usec) to allow for WQE's to be flushed to CQ every poll cycle + const struct timespec short_sleep = {0, 500000}; // 500 usec + nanosleep(&short_sleep, nullptr); + } + m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ + hwqrx_logdbg("draining completed with a total of %d wce's on cq_mgr_rx", total_ret); + NOT_IN_USE(total_ret); // Suppress --enable-opt-log=high warning +} + +void hw_queue_rx::post_recv_buffers(descq_t *p_buffers, size_t count) +{ + hwqrx_logfuncall(""); + // Called from cq_mgr_rx context under cq_mgr_rx::LOCK! + while (count--) { + post_recv_buffer(p_buffers->get_and_pop_front()); + } +} + +void hw_queue_rx::modify_queue_to_ready_state() +{ + hwqrx_logdbg(""); + dpcp::status rc = m_rq->modify_state(dpcp::RQ_RDY); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32, + static_cast(rc), m_rq_data.rqn); + } +} + +void hw_queue_rx::modify_queue_to_error_state() +{ + hwqrx_logdbg(""); + + m_p_cq_mgr_rx->clean_cq(); + + dpcp::status rc = m_rq->modify_state(dpcp::RQ_ERR); + + /* During plugout theres is possibility that kernel + * remove device resources before working process complete + * removing process. As a result ibv api function can + * return EIO=5 errno code. 
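+ * In practice this means an EIO from the modify call during such a teardown
+ * is treated as benign below, while any other failure is still logged.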
+ */ + if (dpcp::DPCP_OK != rc && errno != EIO) { + hwqrx_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32, + static_cast(rc), m_rq_data.rqn); + } +} + +rfs_rule *hw_queue_rx::create_rfs_rule(dpcp::match_params &match_value, + dpcp::match_params &match_mask, uint16_t priority, + uint32_t flow_tag, xlio_tir *tir_ext) +{ + if (m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { + // TLS RX uses tir_ext. + dpcp::tir *dpcp_tir = (tir_ext ? xlio_tir_to_dpcp_tir(tir_ext) : m_tir.get()); + + std::unique_ptr new_rule(new rfs_rule()); + if (dpcp_tir && + new_rule->create(match_value, match_mask, *dpcp_tir, priority, flow_tag, + *m_p_ib_ctx_handler->get_dpcp_adapter())) { + return new_rule.release(); + } + } + + return nullptr; +} + +void hw_queue_rx::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) +{ + uint32_t index = (m_curr_rx_wr * m_rx_sge) + m_strq_wqe_reserved_seg; + m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; + + post_recv_buffer_rq(p_mem_buf_desc); +} + +void hw_queue_rx::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) +{ + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + if (m_p_prev_rx_desc_pushed) { + m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; + } + m_p_prev_rx_desc_pushed = p_mem_buf_desc; + } + + m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + + if (m_rq_wqe_idx_to_wrid) { + uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); + m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; + ++m_rq_wqe_counter; + } + + if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + + m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_prev_rx_desc_pushed = nullptr; + p_mem_buf_desc->p_prev_desc = nullptr; + + m_curr_rx_wr = 0; + struct ibv_recv_wr *bad_wr = nullptr; + if (xlio_raw_post_recv(&bad_wr) < 0) { + uint32_t n_pos_bad_rx_wr = + ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); + hwqrx_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); + hwqrx_logerr( + "bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", + n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); + hwqrx_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", + bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, + bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); + + // Fix broken linked list of rx_wr + if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { + m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; + } + throw_xlio_exception("Failed to post a WQE to RQ"); + } + hwqrx_logfunc("Successful buffer post to RQ"); + } else { + m_curr_rx_wr++; + } +} + +int hw_queue_rx::xlio_raw_post_recv(struct ibv_recv_wr **bad_wr) +{ + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq = 0; + int i, j; + int ind = m_rq_data.head & (m_rq_data.wqe_cnt - 1); + + struct ibv_recv_wr *wr = m_ibv_rx_wr_array; + for (; wr; ++nreq, wr = wr->next) { + if (unlikely((int)m_rq_data.head - (int)m_rq_data.tail + nreq >= (int)m_rx_num_wr)) { + errno = ENOMEM; + err = -1; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > (int)m_rx_sge)) { + errno = EINVAL; + err = -1; + *bad_wr = wr; + goto out; + } + + scat = + (struct mlx5_wqe_data_seg *)((uint8_t *)m_rq_data.buf + (ind << m_rq_data.wqe_shift)); + + for (i = 0, j = 0; i < wr->num_sge; ++i) { 
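+ // Skip empty SGEs, then copy each used entry into the WQE data segment in
+ // big-endian form as the HW expects; any unused trailing slot is terminated
+ // below with a zero-length entry carrying MLX5_INVALID_LKEY.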
+ if (unlikely(!wr->sg_list[i].length)) { + continue; + } + + scat[j].byte_count = htonl(wr->sg_list[i].length); + scat[j].lkey = htonl(wr->sg_list[i].lkey); + scat[j].addr = htonll(wr->sg_list[i].addr); + j++; + } + + if (j < (int)m_rx_sge) { + scat[j].byte_count = 0; + scat[j].lkey = htonl(MLX5_INVALID_LKEY); + scat[j].addr = 0; + } + + ind = (ind + 1) & (m_rq_data.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + m_rq_data.head += nreq; + + wmb(); // Make sure that descriptors are written before doorbell record. + + // Buffers are posted only after the RQ is in ready state. OK to update doorbell. + *m_rq_data.dbrec = htonl(m_rq_data.head & 0xffff); + } + + return err; +} + +bool hw_queue_rx::init_rx_cq_mgr_prepare() +{ + m_rq_wqe_idx_to_wrid = + (uint64_t *)mmap(nullptr, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { + hwqrx_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + return false; + } + + return true; +} + +cq_mgr_rx *hw_queue_rx::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +{ + if (!init_rx_cq_mgr_prepare()) { + return nullptr; + } + + if (safe_mce_sys().enable_striding_rq) { + return new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, + safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, + safe_mce_sys().strq_stride_size_bytes, + safe_mce_sys().strq_stride_num_per_rwqe, p_rx_comp_event_channel); + } + + return new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, p_rx_comp_event_channel); +} + +#if defined(DEFINED_UTLS) +xlio_tir *hw_queue_rx::tls_create_tir(bool cached) +{ + xlio_tir *tir = NULL; + + if (cached && !m_tls_tir_cache.empty()) { + tir = m_tls_tir_cache.back(); + m_tls_tir_cache.pop_back(); + } else if (!cached) { + dpcp::tir *new_tir = create_tir(true); + + if (new_tir != NULL) { + tir = new xlio_tir(this, new_tir, xlio_ti::ti_type::TLS_TIR); + } + if (unlikely(tir == NULL && new_tir != NULL)) { + delete new_tir; + } + } + return tir; +} + +void hw_queue_rx::tls_release_tir(xlio_tir *tir) +{ + /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ + + assert(tir && tir->m_type == xlio_ti::ti_type::TLS_TIR); + tir->m_released = true; + tir->assign_callback(NULL, NULL); + if (tir->m_ref == 0) { + put_tls_tir_in_cache(tir); + } +} + +void hw_queue_rx::put_tls_tir_in_cache(xlio_tir *tir) +{ + // Because the absense of TIR flush command, reusing a TIR + // may result in undefined behaviour. + // Until a flush command is available the TIR cache is disabled. + // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. 
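+ // (With the cache disabled, cached lookups in tls_create_tir() find nothing
+ // and a released TLS TIR is simply destroyed here.)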
+ // m_tls_tir_cache.push_back(tir); + + delete tir; +} + +void hw_queue_rx::ti_released(xlio_ti *ti) +{ + assert(ti->m_released); + assert(ti->m_ref == 0); + if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { + put_tls_tir_in_cache(static_cast(ti)); + } +} +#else /* DEFINED_UTLS */ +void hw_queue_rx::ti_released(xlio_ti *) {}; +#endif /* defined(DEFINED_UTLS) */ + +dpcp::tir *hw_queue_rx::create_tir(bool is_tls /*=false*/) +{ + dpcp::tir *tir_obj = nullptr; + dpcp::status status = dpcp::DPCP_OK; + dpcp::tir::attr tir_attr; + + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; + tir_attr.inline_rqn = m_rq_data.rqn; + tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); + + if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { + tir_attr.flags |= dpcp::TIR_ATTR_LRO; + tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; + tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support + tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; + } + + if (is_tls) { + tir_attr.flags |= dpcp::TIR_ATTR_TLS; + tir_attr.tls_en = 1; + } + + status = m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); + + if (dpcp::DPCP_OK != status) { + hwqrx_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); + return nullptr; + } + + hwqrx_logdbg("TIR: %p created", tir_obj); + + return tir_obj; +} + +bool hw_queue_rx::prepare_rq(uint32_t cqn) +{ + hwqrx_logdbg(""); + + dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + if (!dpcp_adapter) { + hwqrx_logerr("Failed to get dpcp::adapter for prepare_rq"); + return false; + } + + // user_index Unused. + dpcp::rq_attr rqattrs; + memset(&rqattrs, 0, sizeof(rqattrs)); + rqattrs.cqn = cqn; + rqattrs.wqe_num = m_rx_num_wr; + rqattrs.wqe_sz = m_rx_sge; + + if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { + hwqrx_logdbg("Enabled RTC timestamp format for RQ"); + rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME; + } + + std::unique_ptr new_rq; + dpcp::status rc = dpcp::DPCP_OK; + + if (safe_mce_sys().enable_striding_rq) { + rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes; + rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe; + + // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). + // In this case the WQE minimum size is 2 * 16, and the first segment is reserved. + rqattrs.wqe_sz = m_rx_sge * 16U; + + dpcp::striding_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } else { + dpcp::regular_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } + + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); + return false; + } + + if (!store_rq_mlx5_params(*new_rq)) { + hwqrx_logerr( + "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, + static_cast(rc), new_rq.get(), cqn); + return false; + } + + m_rq = std::move(new_rq); + + // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. + // At RDY state without a TIR, Work Requests can be submitted to the RQ. 
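+ // The TIR itself is created later, in up(); until then no steering rule can
+ // reference this RQ, so buffers may be posted but no traffic arrives yet.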
+ modify_queue_to_ready_state(); + + hwqrx_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_rq_data.rqn, + cqn); + + return true; +} + +bool hw_queue_rx::store_rq_mlx5_params(dpcp::basic_rq &new_rq) +{ + uint32_t *dbrec_tmp = nullptr; + dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + m_rq_data.dbrec = dbrec_tmp; + + rc = new_rq.get_wq_buf(m_rq_data.buf); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + + rc = new_rq.get_id(m_rq_data.rqn); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + + new_rq.get_wqe_num(m_rq_data.wqe_cnt); + new_rq.get_wq_stride_sz(m_rq_data.stride); + if (safe_mce_sys().enable_striding_rq) { + m_rq_data.stride /= 16U; + } + + m_rq_data.wqe_shift = ilog_2(m_rq_data.stride); + m_rq_data.head = 0; + m_rq_data.tail = 0; + + return true; +} diff --git a/src/core/dev/hw_queue_rx.h b/src/core/dev/hw_queue_rx.h new file mode 100644 index 000000000..7aaec82a2 --- /dev/null +++ b/src/core/dev/hw_queue_rx.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HW_QUEUE_RX_H +#define HW_QUEUE_RX_H + +#include +#include "dev/xlio_ti.h" +#include "dev/ib_ctx_handler.h" +#include "dev/rfs_rule.h" +#include "dev/cq_mgr_rx.h" +#include "proto/mem_buf_desc.h" +#include "util/sg_array.h" + +class ring_simple; + +// @class hw_queue_rx +// Object to manages the SQ operations. This object is used for Rx. +// Once created it requests from the system a CQ to work with. 
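+// The queue managed here is the receive queue (RQ); the send side is handled
+// by hw_queue_tx.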
+class hw_queue_rx : public xlio_ti_owner { + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; + friend class cq_mgr_rx_strq; + +public: + hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, ibv_comp_channel *rx_comp_event_channel, + uint16_t vlan); + virtual ~hw_queue_rx(); + + virtual void ti_released(xlio_ti *ti) override; + + void up(); + void down(); + + // Post for receive single mem_buf_desc + void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); + + // Post for receive a list of mem_buf_desc + void post_recv_buffers(descq_t *p_buffers, size_t count); + + cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } + uint32_t get_rx_max_wr_num() const { return m_rx_num_wr; } + uint16_t get_vlan() const { return m_vlan; }; + void modify_queue_to_ready_state(); + void modify_queue_to_error_state(); + void release_rx_buffers(); + + rfs_rule *create_rfs_rule(dpcp::match_params &match_value, dpcp::match_params &match_mask, + uint16_t priority, uint32_t flow_tag, xlio_tir *tir_ext); + +#ifdef DEFINED_UTLS + xlio_tir *tls_create_tir(bool cached); + void tls_release_tir(xlio_tir *tir); +#endif /* DEFINED_UTLS */ + +private: + cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); + + bool init_rx_cq_mgr_prepare(); + void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); + void put_tls_tir_in_cache(xlio_tir *tir); + bool prepare_rq(uint32_t cqn); + bool configure_rq(ibv_comp_channel *rx_comp_event_channel); + bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); + int xlio_raw_post_recv(struct ibv_recv_wr **bad_wr); + bool is_rq_empty() const { return (m_rq_data.head == m_rq_data.tail); } + + dpcp::tir *create_tir(bool is_tls = false); + dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir) { return tir->m_p_tir.get(); } + + struct { + volatile uint32_t *dbrec; + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + uint32_t wqe_shift; + uint32_t rqn; + unsigned head; + unsigned tail; + } m_rq_data; + + std::vector m_tls_tir_cache; + std::unique_ptr m_tir = {nullptr}; + std::unique_ptr m_rq = {nullptr}; + ring_simple *m_p_ring; + cq_mgr_rx *m_p_cq_mgr_rx = nullptr; + ib_ctx_handler *m_p_ib_ctx_handler; + ibv_sge *m_ibv_rx_sg_array; + ibv_recv_wr *m_ibv_rx_wr_array; + uintptr_t m_last_posted_rx_wr_id = 0U; // Remember so in case we flush RQ we know to wait until + // this WR_ID is received + mem_buf_desc_t *m_p_prev_rx_desc_pushed = nullptr; + uint64_t *m_rq_wqe_idx_to_wrid = nullptr; + uint64_t m_rq_wqe_counter = 0U; + uint32_t m_curr_rx_wr = 0U; + uint32_t m_strq_wqe_reserved_seg = 0U; + uint32_t m_n_sysvar_rx_num_wr_to_post_recv; + uint32_t m_rx_num_wr; + uint32_t m_rx_sge = MCE_DEFAULT_RX_NUM_SGE; + const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; + uint16_t m_vlan; +}; + +#endif // HW_QUEUE_RX_H diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/hw_queue_tx.cpp similarity index 54% rename from src/core/dev/qp_mgr_eth_mlx5.cpp rename to src/core/dev/hw_queue_tx.cpp index ddfe5cd35..41ba5b68a 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -29,20 +29,26 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "qp_mgr_eth_mlx5.h" -#if defined(DEFINED_DIRECT_VERBS) - -#include +#include #include -#include "cq_mgr_mlx5.h" +#include +#include "dev/hw_queue_tx.h" +#include "dev/ring_simple.h" +#include "dev/cq_mgr_rx_regrq.h" #include "proto/tls.h" -#include "util/utils.h" -#include "vlogger/vlogger.h" -#include "ring_simple.h" +#include "util/valgrind.h" #undef MODULE_NAME -#define MODULE_NAME "qpm_mlx5" +#define MODULE_NAME "hw_queue_tx" + +#define hwqtx_logpanic __log_info_panic +#define hwqtx_logerr __log_info_err +#define hwqtx_logwarn __log_info_warn +#define hwqtx_loginfo __log_info_info +#define hwqtx_logdbg __log_info_dbg +#define hwqtx_logfunc __log_info_func +#define hwqtx_logfuncall __log_info_funcall #if !defined(MLX5_ETH_INLINE_HEADER_SIZE) #define MLX5_ETH_INLINE_HEADER_SIZE 18 @@ -51,13 +57,13 @@ #define OCTOWORD 16 #define WQEBB 64 -//#define DBG_DUMP_WQE 1 +//#define DBG_DUMP_WQE 1 #ifdef DBG_DUMP_WQE #define dbg_dump_wqe(_addr, _size) \ { \ uint32_t *_wqe = _addr; \ - qp_logfunc("Dumping %d bytes from %p", _size, _wqe); \ + hwqtx_logfunc("Dumping %d bytes from %p", _size, _wqe); \ for (int i = 0; i < (int)_size / 4; i += 4) { \ qp_logfunc("%08x %08x %08x %08x", ntohl(_wqe[i + 0]), ntohl(_wqe[i + 1]), \ ntohl(_wqe[i + 2]), ntohl(_wqe[i + 3])); \ @@ -88,22 +94,17 @@ static bool is_bf(struct ibv_context *ib_ctx) env = getenv("MLX5_SHUT_UP_BF"); if (!env || !strcmp(env, "0")) { -#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 3) && \ - defined(MLX5DV_UAR_ALLOC_TYPE_BF) struct mlx5dv_devx_uar *uar = mlx5dv_devx_alloc_uar(ib_ctx, MLX5DV_UAR_ALLOC_TYPE_BF); if (uar) { mlx5dv_devx_free_uar(uar); return true; } -#else - NOT_IN_USE(ib_ctx); -#endif /* DEFINED_DIRECT_VERBS */ } + return false; } -//! Maps xlio_ibv_wr_opcode to real MLX5 opcode. -// +// Maps xlio_ibv_wr_opcode to real MLX5 opcode. static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) { switch (verbs_opcode) { @@ -118,276 +119,411 @@ static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) } } -qp_mgr_eth_mlx5::qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, - const uint16_t vlan, bool call_configure) - : qp_mgr_eth(desc, tx_num_wr, vlan, false) - , m_sq_wqe_idx_to_prop(NULL) - , m_sq_wqe_prop_last(NULL) - , m_sq_wqe_prop_last_signalled(0) - , m_sq_free_credits(0) - , m_rq_wqe_counter(0) - , m_sq_wqes(NULL) - , m_sq_wqe_hot(NULL) - , m_sq_wqes_end(NULL) - , m_sq_wqe_hot_index(0) - , m_sq_wqe_counter(0) - , m_b_fence_needed(false) - , m_dm_enabled(false) +hw_queue_tx::hw_queue_tx(ring_simple *ring, const slave_data_t *slave, const uint32_t tx_num_wr) + : m_p_ring(ring) + , m_p_ib_ctx_handler(slave->p_ib_ctx) + , m_n_sysvar_tx_num_wr_to_signal(safe_mce_sys().tx_num_wr_to_signal) + , m_tx_num_wr(tx_num_wr) + , m_port_num(slave->port_num) { - // Check device capabilities for dummy send support - m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - - if (call_configure && configure(desc)) { - throw_xlio_exception("failed creating qp_mgr_eth"); - } + hwqtx_logfunc(""); memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); - m_db_method = - (is_bf(((ib_ctx_handler *)desc->slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF - : MLX5_DB_METHOD_DB); - qp_logdbg("m_db_method=%d", m_db_method); -} + m_mlx5_qp.cap.max_inline_data = safe_mce_sys().tx_max_inline; + m_mlx5_qp.cap.max_send_sge = + (m_p_ring->is_tso() ? 
m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge + : MCE_DEFAULT_TX_NUM_SGE); -void qp_mgr_eth_mlx5::init_qp() -{ - if (0 != xlio_ib_mlx5_get_qp(m_qp, &m_mlx5_qp)) { - qp_logpanic("xlio_ib_mlx5_get_qp failed (errno=%d %m)", errno); - } + memset(&m_rate_limit, 0, sizeof(struct xlio_rate_limit_t)); - m_sq_wqes = (struct mlx5_eth_wqe(*)[])(uintptr_t)m_mlx5_qp.sq.buf; - m_sq_wqe_hot = &(*m_sq_wqes)[0]; - m_sq_wqes_end = - (uint8_t *)((uintptr_t)m_mlx5_qp.sq.buf + m_mlx5_qp.sq.wqe_cnt * m_mlx5_qp.sq.stride); - m_sq_wqe_counter = 0; + // Check device capabilities for dummy send support + m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - m_sq_wqe_hot_index = 0; + if (configure(slave)) { + throw_xlio_exception("Failed to configure"); + } +} - uint32_t old_wr_val = m_tx_num_wr; - m_tx_num_wr = (m_sq_wqes_end - (uint8_t *)m_sq_wqe_hot) / WQEBB; +hw_queue_tx::~hw_queue_tx() +{ + hwqtx_logfunc(""); - // We use the min between CQ size and the QP size (that might be increases by ibv creation). - m_sq_free_credits = std::min(m_tx_num_wr, old_wr_val); + if (m_sq_wqe_idx_to_prop) { + if (0 != munmap(m_sq_wqe_idx_to_prop, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop))) { + hwqtx_logerr( + "Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", errno); + } + m_sq_wqe_idx_to_prop = nullptr; + } - /* Maximum BF inlining consists of: - * - CTRL: - * - 1st WQEBB is mostly used for CTRL and ETH segment (where ETH header is inlined) - * - 4 bytes for size of inline data - * - DATA: - * - 1 OCTOWORD from 1st WQEBB is used for data inlining, except for - * the 4 bytes used for stating the inline data size - * - 3 WQEBB are fully availabie for data inlining - */ - m_qp_cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; + destroy_tis_cache(); - if (m_sq_wqe_idx_to_prop == NULL) { - m_sq_wqe_idx_to_prop = - (sq_wqe_prop *)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), - PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (m_sq_wqe_idx_to_prop == MAP_FAILED) { - qp_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); - return; + hwqtx_logdbg("calling ibv_destroy_qp(qp=%p)", m_mlx5_qp.qp); + if (m_mlx5_qp.qp) { + IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_mlx5_qp.qp), EIO) + { + hwqtx_logdbg("QP destroy failure (errno = %d %m)", -errno); } - m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; - m_sq_wqe_prop_last = NULL; + ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); + m_mlx5_qp.qp = nullptr; } - qp_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, - get_max_inline_data(), m_sq_wqe_idx_to_prop); + if (m_p_cq_mgr_tx) { + delete m_p_cq_mgr_tx; + m_p_cq_mgr_tx = nullptr; + } - memset((void *)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_eth_wqe)); - m_sq_wqe_hot->ctrl.data[0] = htonl(MLX5_OPCODE_SEND); - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | 4); - m_sq_wqe_hot->ctrl.data[2] = 0; - m_sq_wqe_hot->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); - m_sq_wqe_hot->eseg.cs_flags = XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM; + if (m_p_cq_mgr_rx_unused) { + delete m_p_cq_mgr_rx_unused; + m_p_cq_mgr_rx_unused = nullptr; + } - qp_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " - "BlueFlame: %p buf_size: %d offset: %d", - m_qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, m_mlx5_qp.bf.reg, - m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); + hwqtx_logdbg("Destructor hw_queue_tx end"); } -void 
qp_mgr_eth_mlx5::init_device_memory() +int hw_queue_tx::configure(const slave_data_t *slave) { - /* This limitation is done because of a observation - * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) - */ - if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0) { - if (m_db_method == MLX5_DB_METHOD_BF) { - m_dm_enabled = - m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); + hwqtx_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", + priv_xlio_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), + m_port_num); + hwqtx_logdbg("HW Dummy send support for QP = %d", m_hw_dummy_send_support); + + // Create associated cq_mgr_tx and unused cq_mgr_rx_regrq just for QP sake. + BULLSEYE_EXCLUDE_BLOCK_START + m_p_cq_mgr_tx = init_tx_cq_mgr(); + if (!m_p_cq_mgr_tx) { + hwqtx_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); + return -1; + } + m_p_cq_mgr_rx_unused = new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, 2, nullptr); + if (!m_p_cq_mgr_rx_unused) { + hwqtx_logerr("Failed allocating m_p_cq_mgr_rx_unused (errno=%d %m)", errno); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Modify the cq_mgr_tx to use a non-blocking event channel + set_fd_block_mode(m_p_cq_mgr_tx->get_channel_fd(), false); + hwqtx_logdbg("cq tx: %p", m_p_cq_mgr_tx); + + // Create QP + xlio_ibv_qp_init_attr qp_init_attr; + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_mlx5_qp.cap + // and assigned as a result of ibv_query_qp() + m_mlx5_qp.cap.max_send_wr = m_tx_num_wr; + m_mlx5_qp.cap.max_recv_wr = 1; + m_mlx5_qp.cap.max_recv_sge = 1; + + memcpy(&qp_init_attr.cap, &m_mlx5_qp.cap, sizeof(qp_init_attr.cap)); + qp_init_attr.recv_cq = m_p_cq_mgr_rx_unused->get_ibv_cq_hndl(); + qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); + qp_init_attr.sq_sig_all = 0; + + // In case of enabled TSO we need to take into account amount of SGE together with header inline + // Per PRM maximum of CTRL + ETH + ETH_HEADER_INLINE+DATA_PTR*NUM_SGE+MAX_INLINE+INLINE_SIZE + // MLX5 return 32678 WQEBBs at max so minimal number + int max_wqe_sz = + 16 + 14 + 16 * qp_init_attr.cap.max_send_sge + qp_init_attr.cap.max_inline_data + 4; + max_wqe_sz += (m_p_ring->is_tso() ? 
m_p_ring->m_tso.max_header_sz : 94); + int num_wr = 32678 * 64 / max_wqe_sz; + hwqtx_logdbg("calculated max_wqe_sz=%d num_wr=%d", max_wqe_sz, num_wr); + if (num_wr < (signed)m_tx_num_wr) { + qp_init_attr.cap.max_send_wr = + num_wr; // force min for create_qp or you will have error of memory allocation + } - } else { -#if defined(DEFINED_IBV_DM) - VLOG_PRINTF_ONCE_THEN_DEBUG( - VLOG_WARNING, - "Device Memory functionality is not used on devices w/o Blue Flame support\n"); -#endif /* DEFINED_IBV_DM */ + hwqtx_logdbg("Requested QP parameters: wre: tx = %d sge: tx = %d inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_send_sge, + qp_init_attr.cap.max_inline_data); + + // Create the HW Queue + if (prepare_queue(qp_init_attr)) { + return -1; + } + + hwqtx_logdbg("Configured QP parameters: wre: tx = %d sge: tx = %d inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_send_sge, + qp_init_attr.cap.max_inline_data); + + /* Check initial parameters with actual */ + enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; + struct ibv_qp_attr tmp_ibv_qp_attr; + struct ibv_qp_init_attr tmp_ibv_qp_init_attr; + IF_VERBS_FAILURE(ibv_query_qp(m_mlx5_qp.qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr)) + { + hwqtx_logerr("ibv_query_qp failed (errno=%d %m)", errno); + return -1; + } + ENDIF_VERBS_FAILURE; + m_mlx5_qp.cap.max_send_wr = + std::min(tmp_ibv_qp_attr.cap.max_send_wr, m_mlx5_qp.cap.max_send_wr); + m_mlx5_qp.cap.max_send_sge = + std::min(tmp_ibv_qp_attr.cap.max_send_sge, m_mlx5_qp.cap.max_send_sge); + m_mlx5_qp.cap.max_inline_data = + std::min(tmp_ibv_qp_attr.cap.max_inline_data, m_mlx5_qp.cap.max_inline_data); + + hwqtx_logdbg("Used QP (num=%d) wre: tx = %d sge: tx = %d inline: %d", m_mlx5_qp.qp->qp_num, + m_mlx5_qp.cap.max_send_wr, m_mlx5_qp.cap.max_send_sge, + m_mlx5_qp.cap.max_inline_data); + +#if defined(DEFINED_ROCE_LAG) + if (slave && slave->lag_tx_port_affinity > 0) { + struct mlx5dv_context attr_out; + + memset(&attr_out, 0, sizeof(attr_out)); + attr_out.comp_mask |= MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS; + if (!mlx5dv_query_device(slave->p_ib_ctx->get_ibv_context(), &attr_out)) { + hwqtx_logdbg("QP ROCE LAG port: %d of %d", slave->lag_tx_port_affinity, + attr_out.num_lag_ports); + + if (!mlx5dv_modify_qp_lag_port(m_mlx5_qp.qp, slave->lag_tx_port_affinity)) { + uint8_t current_port_num = 0; + uint8_t active_port_num = 0; + + if (!mlx5dv_query_qp_lag_port(m_mlx5_qp.qp, ¤t_port_num, &active_port_num)) { + hwqtx_logdbg("QP ROCE LAG port affinity: %d => %d", current_port_num, + active_port_num); + } + } } } +#endif /* DEFINED_ROCE_LAG */ + NOT_IN_USE(slave); + return 0; } -void qp_mgr_eth_mlx5::up() +void hw_queue_tx::up() { - init_qp(); - qp_mgr::up(); + init_queue(); + + // Add buffers + hwqtx_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); + + m_p_cq_mgr_tx->add_qp_tx(this); + + release_tx_buffers(); + + modify_queue_to_ready_state(); + init_device_memory(); } -void qp_mgr_eth_mlx5::down() +void hw_queue_tx::down() { if (m_dm_enabled) { m_dm_mgr.release_resources(); } - qp_mgr::down(); + hwqtx_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); + modify_queue_to_error_state(); + + // free buffers from current active resource iterator + trigger_completion_for_all_sent_packets(); + + // let the QP drain all wqe's to flushed cqe's now that we moved + // it to error state and post_sent final trigger for completion + usleep(1000); + + release_tx_buffers(); + m_p_cq_mgr_tx->del_qp_tx(this); } -#if defined(DEFINED_UTLS) -void 
qp_mgr_eth_mlx5::destroy_tis_cache(void) +void hw_queue_tx::release_tx_buffers() { - while (!m_tls_tis_cache.empty()) { - xlio_tis *tis = m_tls_tis_cache.back(); - m_tls_tis_cache.pop_back(); - delete tis; + int ret; + uint64_t poll_sn = 0; + hwqtx_logdbg("draining cq_mgr_tx %p", m_p_cq_mgr_tx); + while (m_p_cq_mgr_tx && m_mlx5_qp.qp && + ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && + (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { + hwqtx_logdbg("draining completed on cq_mgr_tx (%d wce)", ret); } + NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning } -#endif /* defined(DEFINED_UTLS) */ -void qp_mgr_eth_mlx5::update_next_wqe_hot() +void hw_queue_tx::send_wqe(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, + unsigned credits) { - // Preparing next WQE as Ethernet send WQE and index: - m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; - m_sq_wqe_hot_index = m_sq_wqe_counter & (m_tx_num_wr - 1); - memset(m_sq_wqe_hot, 0, sizeof(mlx5_eth_wqe)); - - // Fill Ethernet segment with header inline: - struct mlx5_wqe_eth_seg *eth_seg = - (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg)); - eth_seg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)p_send_wqe->wr_id; + /* Control tx completions: + * - URGENT packets should notify application as soon as possible to + * confirm one that user buffers are free to reuse. So force completion + * signal for such work requests. + */ + bool request_comp = (p_mem_buf_desc->m_flags & mem_buf_desc_t::URGENT); + bool skip_db = !request_comp && (p_mem_buf_desc->lwip_pbuf.desc.attr == PBUF_DESC_EXPRESS); + + hwqtx_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); + + send_to_wire(p_send_wqe, attr, request_comp, skip_db, tis, credits); + + if (request_comp || is_signal_requested_for_last_wqe()) { + uint64_t dummy_poll_sn = 0; + int ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + hwqtx_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); + } + BULLSEYE_EXCLUDE_BLOCK_END + hwqtx_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); + } } -//! 
Cleanup resources QP itself will be freed by base class DTOR -qp_mgr_eth_mlx5::~qp_mgr_eth_mlx5() +void hw_queue_tx::modify_queue_to_ready_state() { - if (m_rq_wqe_idx_to_wrid) { - if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { - qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", - errno); + hwqtx_logdbg(""); + int ret = 0; + int qp_state = priv_ibv_query_qp_state(m_mlx5_qp.qp); + if (qp_state != IBV_QPS_INIT) { + BULLSEYE_EXCLUDE_BLOCK_START + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { + hwqtx_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); } - m_rq_wqe_idx_to_wrid = NULL; + BULLSEYE_EXCLUDE_BLOCK_END } - if (m_sq_wqe_idx_to_prop) { - if (0 != munmap(m_sq_wqe_idx_to_prop, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop))) { - qp_logerr("Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", - errno); - } - m_sq_wqe_idx_to_prop = NULL; + + BULLSEYE_EXCLUDE_BLOCK_START + if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_mlx5_qp.qp)) != 0) { + hwqtx_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); } - destroy_tis_cache(); + + BULLSEYE_EXCLUDE_BLOCK_END } -void qp_mgr_eth_mlx5::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) +void hw_queue_tx::modify_queue_to_error_state() { - m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; + hwqtx_logdbg(""); - post_recv_buffer_rq(p_mem_buf_desc); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_ibv_modify_qp_to_err(m_mlx5_qp.qp)) { + hwqtx_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END } -void qp_mgr_eth_mlx5::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) +int hw_queue_tx::prepare_queue(xlio_ibv_qp_init_attr &qp_init_attr) { - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - if (m_p_prev_rx_desc_pushed) { - m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; - } - m_p_prev_rx_desc_pushed = p_mem_buf_desc; + hwqtx_logdbg(""); + int ret = 0; + + qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; + xlio_ibv_qp_init_attr_comp_mask(m_p_ib_ctx_handler->get_ibv_pd(), qp_init_attr); + + if (m_p_ring->is_tso()) { + xlio_ibv_qp_init_attr_tso(qp_init_attr, m_p_ring->get_max_header_sz()); + hwqtx_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); } - m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + m_mlx5_qp.qp = xlio_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); - if (m_rq_wqe_idx_to_wrid) { - uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); - m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; - ++m_rq_wqe_counter; + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_mlx5_qp.qp) { + hwqtx_logerr("ibv_create_qp failed (errno=%d %m)", errno); + return -1; + } + VALGRIND_MAKE_MEM_DEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { + hwqtx_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); + return ret; } + BULLSEYE_EXCLUDE_BLOCK_END - if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + return 0; +} - m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; +void hw_queue_tx::init_queue() +{ + if (0 != xlio_ib_mlx5_get_qp_tx(&m_mlx5_qp)) { + hwqtx_logpanic("xlio_ib_mlx5_get_qp_tx failed (errno=%d %m)", errno); + } - 
m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; + m_sq_wqes = (struct mlx5_eth_wqe(*)[])(uintptr_t)m_mlx5_qp.sq.buf; + m_sq_wqes_end = + (uint8_t *)((uintptr_t)m_mlx5_qp.sq.buf + m_mlx5_qp.sq.wqe_cnt * m_mlx5_qp.sq.stride); + m_sq_wqe_last = &(*m_sq_wqes)[0]; + m_sq_wqe_last_index = 0; + m_sq_wqe_counter = 0; - m_curr_rx_wr = 0; - struct ibv_recv_wr *bad_wr = NULL; - IF_VERBS_FAILURE(xlio_ib_mlx5_post_recv(&m_mlx5_qp, &m_ibv_rx_wr_array[0], &bad_wr)) - { - uint32_t n_pos_bad_rx_wr = - ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); - qp_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); - qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", - n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); - qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", - bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, - bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); - qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); - - // Fix broken linked list of rx_wr - if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { - m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; - } - throw; + uint32_t old_wr_val = m_tx_num_wr; + m_tx_num_wr = (m_sq_wqes_end - (uint8_t *)m_sq_wqe_last) / WQEBB; + + // We use the min between CQ size and the QP size (that might be increases by ibv creation). + m_sq_free_credits = std::min(m_tx_num_wr, old_wr_val); + + /* Maximum BF inlining consists of: + * - CTRL: + * - 1st WQEBB is mostly used for CTRL and ETH segment (where ETH header is inlined) + * - 4 bytes for size of inline data + * - DATA: + * - 1 OCTOWORD from 1st WQEBB is used for data inlining, except for + * the 4 bytes used for stating the inline data size + * - 3 WQEBB are fully availabie for data inlining + */ + m_mlx5_qp.cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; + + if (!m_sq_wqe_idx_to_prop) { + m_sq_wqe_idx_to_prop = + (sq_wqe_prop *)mmap(nullptr, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_sq_wqe_idx_to_prop == MAP_FAILED) { + hwqtx_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); + return; } - ENDIF_VERBS_FAILURE; - qp_logfunc("Successful ibv_post_recv"); - } else { - m_curr_rx_wr++; + m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; + m_sq_wqe_prop_last = nullptr; } + + hwqtx_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, + get_max_inline_data(), m_sq_wqe_idx_to_prop); + + hwqtx_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " + "BlueFlame: %p", + m_mlx5_qp.qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, + m_mlx5_qp.bf.reg); } -bool qp_mgr_eth_mlx5::init_rx_cq_mgr_prepare() +void hw_queue_tx::init_device_memory() { - m_rx_num_wr = align32pow2(m_rx_num_wr); - - m_rq_wqe_idx_to_wrid = - (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { - qp_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); - return false; + /* This limitation is done because of a observation + * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) + */ + if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0 && + is_bf(m_p_ib_ctx_handler->get_ibv_context())) { + m_dm_enabled = + 
m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); } - - return true; } -cq_mgr *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +void hw_queue_tx::update_wqe_last() { - return (!init_rx_cq_mgr_prepare() ? NULL - : new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, - p_rx_comp_event_channel, true)); + m_sq_wqe_last_index = m_sq_wqe_counter & (m_tx_num_wr - 1); + m_sq_wqe_last = &(*m_sq_wqes)[m_sq_wqe_last_index]; } -cq_mgr *qp_mgr_eth_mlx5::init_tx_cq_mgr() +cq_mgr_tx *hw_queue_tx::init_tx_cq_mgr() { m_tx_num_wr = align32pow2(m_tx_num_wr); - return new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, - m_p_ring->get_tx_comp_event_channel(), false); + return new cq_mgr_tx(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, + m_p_ring->get_tx_comp_event_channel()); } -inline void qp_mgr_eth_mlx5::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, - bool skip_comp /*=false*/) +inline void hw_queue_tx::ring_doorbell(int num_wqebb, bool skip_comp /*=false*/, + bool skip_db /*=false*/) { - uint64_t *dst = (uint64_t *)((uint8_t *)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); - uint64_t *src = reinterpret_cast(m_sq_wqe_hot); + uint64_t *dst = (uint64_t *)m_mlx5_qp.bf.reg; + uint64_t *src = reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *ctrl = reinterpret_cast(src); /* TODO Refactor m_n_unsignedled_count, is_completion_need(), set_unsignaled_count(): * Some logic is hidden inside the methods and in one branch the field is changed directly. */ - if (!skip_comp && is_completion_need()) { + if (is_completion_need() && !skip_comp) { + skip_db = false; // Follow the TX completion batching scheme ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; } if (ctrl->fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE) { @@ -400,184 +536,166 @@ inline void qp_mgr_eth_mlx5::ring_doorbell(int db_method, int num_wqebb, int num m_b_fence_needed = false; } - m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; - - // Make sure that descriptors are written before - // updating doorbell record and ringing the doorbell - wmb(); - *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb) & 0xFFFF; - // This wc_wmb ensures ordering between DB record and BF copy - wc_wmb(); - if (likely(db_method == MLX5_DB_METHOD_BF)) { - /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs - * Avoid using memcpy() to copy to BlueFlame page, since memcpy() - * implementations may use move-string-buffer assembler instructions, - * which do not guarantee order of copying. + m_b_db_needed = skip_db; + if (!skip_db) { + /* Make sure that descriptors are written before updating doorbell record and + * ringing the doorbell. */ - while (num_wqebb--) { - COPY_64B_NT(dst, src); - } - src = (uint64_t *)m_sq_wqes; - while (num_wqebb_top--) { - COPY_64B_NT(dst, src); - } - } else { + wmb(); + *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + + // This wc_wmb ensures ordering between DB record and BF copy. + wc_wmb(); *dst = *src; - } - /* Use wc_wmb() to ensure write combining buffers are flushed out - * of the running CPU. - * sfence instruction affects only the WC buffers of the CPU that executes it - */ - wc_wmb(); - m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; + /* Use wc_wmb() to ensure write combining buffers are flushed out of the running CPU. + * sfence instruction affects only the WC buffers of the CPU that executes it. 
+ */ + wc_wmb(); + } } -inline int qp_mgr_eth_mlx5::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, - int max_inline_len, int inline_len) +inline int hw_queue_tx::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, + int max_inline_len, int inline_len) { int wqe_inline_size = 0; - while ((data_addr != NULL) && inline_len) { + while ((data_addr) && inline_len) { dbg_dump_wqe((uint32_t *)data_addr, inline_len); memcpy(cur_seg, data_addr, inline_len); wqe_inline_size += inline_len; cur_seg += inline_len; inline_len = max_inline_len - wqe_inline_size; data_addr = sga.get_data(&inline_len); - qp_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, - cur_seg, inline_len, wqe_inline_size); + hwqtx_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, + cur_seg, inline_len, wqe_inline_size); } return wqe_inline_size; } -//! Fill WQE dynamically, based on amount of free WQEBB in SQ -inline int qp_mgr_eth_mlx5::fill_wqe(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe_inline(xlio_ibv_send_wr *pswr) { - // control segment is mostly filled by preset after previous packet - // we always inline ETH header sg_array sga(pswr->sg_list, pswr->num_sge); - uint8_t *cur_seg = (uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg); + uint8_t *cur_seg = (uint8_t *)m_sq_wqe_last + sizeof(struct mlx5_wqe_ctrl_seg); int inline_len = MLX5_ETH_INLINE_HEADER_SIZE; int data_len = sga.length(); int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; int max_inline_len = get_max_inline_data(); - // assume packet is full inline - if (likely(data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND)) { - uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + + m_sq_wqe_last->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + + data_len -= inline_len; + hwqtx_logfunc( + "wqe_last:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", + m_sq_wqe_last, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); + + // Fill Ethernet segment with header inline, static data + // were populated in preset after previous packet send + memcpy(cur_seg + offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, + MLX5_ETH_INLINE_HEADER_SIZE); + cur_seg += sizeof(struct mlx5_wqe_eth_seg); + wqe_size += sizeof(struct mlx5_wqe_eth_seg) / OCTOWORD; + + max_inline_len = data_len; + // Filling inline data segment + // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first + // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 + int rest_space = std::min((int)(m_sq_wqes_end - cur_seg - 4), (3 * WQEBB + OCTOWORD - 4)); + // Filling till the end of inline WQE segment or + // to end of WQEs + if (likely(max_inline_len <= rest_space)) { + inline_len = max_inline_len; + hwqtx_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + data_addr, cur_seg, rest_space, inline_len, wqe_size); + // bypass inline size and fill inline data segment + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, max_inline_len, inline_len); + + // store inline data size and mark the data as inlined + *(uint32_t *)((uint8_t *)m_sq_wqe_last + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); + rest_space = 
align_to_octoword_up(inline_len + 4); // align to OCTOWORDs + wqe_size += rest_space / OCTOWORD; + // assert((data_len-inline_len)==0); + // configuring control + m_sq_wqe_last->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + rest_space = align_to_WQEBB_up(wqe_size) / 4; + hwqtx_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, + inline_len, wqe_size, rest_space); + return rest_space; + } else { + // wrap around case, first filling till the end of m_sq_wqes + int wrap_up_size = max_inline_len - rest_space; + inline_len = rest_space; + hwqtx_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " + "wqe_size: %d", + wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); + + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, rest_space, inline_len); data_len -= inline_len; - qp_logfunc( - "wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", - m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); - - // Fill Ethernet segment with header inline, static data - // were populated in preset after previous packet send - memcpy(cur_seg + offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, - MLX5_ETH_INLINE_HEADER_SIZE); - cur_seg += sizeof(struct mlx5_wqe_eth_seg); - wqe_size += sizeof(struct mlx5_wqe_eth_seg) / OCTOWORD; - - max_inline_len = data_len; - // Filling inline data segment - // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first - // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 - int rest_space = std::min((int)(m_sq_wqes_end - cur_seg - 4), (3 * WQEBB + OCTOWORD - 4)); - // Filling till the end of inline WQE segment or - // to end of WQEs - if (likely(max_inline_len <= rest_space)) { - inline_len = max_inline_len; - qp_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", - data_addr, cur_seg, rest_space, inline_len, wqe_size); - // bypass inline size and fill inline data segment - data_addr = sga.get_data(&inline_len); - inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, max_inline_len, inline_len); - - // store inline data size and mark the data as inlined - *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); - rest_space = align_to_octoword_up(inline_len + 4); // align to OCTOWORDs - wqe_size += rest_space / OCTOWORD; - // assert((data_len-inline_len)==0); - // configuring control - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - rest_space = align_to_WQEBB_up(wqe_size) / 4; - qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, - inline_len, wqe_size, rest_space); - ring_doorbell(m_db_method, rest_space); - return rest_space; - } else { - // wrap around case, first filling till the end of m_sq_wqes - int wrap_up_size = max_inline_len - rest_space; - inline_len = rest_space; - qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " - "wqe_size: %d", - wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); - - data_addr = sga.get_data(&inline_len); - inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, rest_space, inline_len); - data_len -= inline_len; - rest_space = align_to_octoword_up(inline_len + 4); - wqe_size += rest_space / OCTOWORD; - rest_space = - align_to_WQEBB_up(rest_space / OCTOWORD) / 4; // size of 1st 
chunk at the end - - qp_logfunc( - "END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", - data_addr, data_len, inline_len, wqe_size, rest_space); - // Wrap around - // - cur_seg = (uint8_t *)m_sq_wqes; - data_addr = sga.get_data(&wrap_up_size); - - wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); - inline_len += wrap_up_size; - max_inline_len = align_to_octoword_up(wrap_up_size); - wqe_size += max_inline_len / OCTOWORD; - max_inline_len = align_to_WQEBB_up(max_inline_len / OCTOWORD) / 4; - // store inline data size - *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); - qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " - "end_wqebbs: %d wqebbs: %d", - data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, - rest_space, max_inline_len); - // assert((data_len-wrap_up_size)==0); - // configuring control - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, rest_space * 4 * 16); - dbg_dump_wqe((uint32_t *)m_sq_wqes, max_inline_len * 4 * 16); - - ring_doorbell(m_db_method, rest_space, max_inline_len); - return rest_space + max_inline_len; - } + rest_space = align_to_octoword_up(inline_len + 4); + wqe_size += rest_space / OCTOWORD; + rest_space = align_to_WQEBB_up(rest_space / OCTOWORD) / 4; // size of 1st chunk at the end + + hwqtx_logfunc("END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_addr, data_len, inline_len, wqe_size, rest_space); + // Wrap around + // + cur_seg = (uint8_t *)m_sq_wqes; + data_addr = sga.get_data(&wrap_up_size); + + wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); + inline_len += wrap_up_size; + max_inline_len = align_to_octoword_up(wrap_up_size); + wqe_size += max_inline_len / OCTOWORD; + max_inline_len = align_to_WQEBB_up(max_inline_len / OCTOWORD) / 4; + // store inline data size + *(uint32_t *)((uint8_t *)m_sq_wqe_last + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); + hwqtx_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " + "end_wqebbs: %d wqebbs: %d", + data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, + rest_space, max_inline_len); + // assert((data_len-wrap_up_size)==0); + // configuring control + m_sq_wqe_last->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + + dbg_dump_wqe((uint32_t *)m_sq_wqe_last, rest_space * 4 * 16); + dbg_dump_wqe((uint32_t *)m_sq_wqes, max_inline_len * 4 * 16); + + return rest_space + max_inline_len; + } +} + +//! 
Fill WQE dynamically, based on amount of free WQEBB in SQ +inline int hw_queue_tx::fill_wqe(xlio_ibv_send_wr *pswr) +{ + if (pswr->num_sge == 1 && pswr->sg_list[0].length <= get_max_inline_data() && + xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND) { + // Packet is fully inline + return fill_wqe_inline(pswr); } else { if (xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND) { - /* data is bigger than max to inline we inlined only ETH header + uint from IP (18 - * bytes) the rest will be in data pointer segment adding data seg with pointer if there - * still data to transfer - */ - wqe_size = fill_wqe_send(pswr); - return wqe_size; + // Data is bigger than max to inline + return fill_wqe_send(pswr); } else { - /* Support XLIO_IBV_WR_SEND_TSO operation - */ - wqe_size = fill_wqe_lso(pswr); - return wqe_size; + // Support XLIO_IBV_WR_SEND_TSO operation + return fill_wqe_lso(pswr); } } - return 1; } -inline int qp_mgr_eth_mlx5::fill_wqe_send(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe_send(xlio_ibv_send_wr *pswr) { struct mlx5_wqe_eth_seg *eseg; struct mlx5_wqe_data_seg *dseg; int wqe_size = sizeof(mlx5_wqe_ctrl_seg) / OCTOWORD; - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(mlx5_wqe_ctrl_seg)); - eseg->inline_hdr_sz = 0; + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_last + sizeof(mlx5_wqe_ctrl_seg)); /* Unlike Linux kernel, rdma-core defines mlx5_wqe_eth_seg as 32 bytes, because it contains * 18 bytes of inline header. We don't want to inline partial header to avoid an extra copy @@ -605,21 +723,19 @@ inline int qp_mgr_eth_mlx5::fill_wqe_send(xlio_ibv_send_wr *pswr) } } - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + m_sq_wqe_last->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); int wqebbs = align_to_WQEBB_up(wqe_size) / 4; - /* TODO FIXME Split into top and bottom parts */ - ring_doorbell(m_db_method, wqebbs); return wqebbs; } //! 
Filling wqe for LSO -inline int qp_mgr_eth_mlx5::fill_wqe_lso(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) { - struct mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; - struct mlx5_wqe_data_seg *dpseg = NULL; - uint8_t *cur_seg = NULL; + struct mlx5_wqe_ctrl_seg *ctrl = nullptr; + struct mlx5_wqe_eth_seg *eseg = nullptr; + struct mlx5_wqe_data_seg *dpseg = nullptr; + uint8_t *cur_seg = nullptr; uint8_t *p_hdr = (uint8_t *)pswr->tso.hdr; int inl_hdr_size = pswr->tso.hdr_sz; int inl_hdr_copy_size = 0; @@ -629,7 +745,7 @@ inline int qp_mgr_eth_mlx5::fill_wqe_lso(xlio_ibv_send_wr *pswr) int rest = 0; int i = 0; - ctrl = (struct mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; + ctrl = (struct mlx5_wqe_ctrl_seg *)m_sq_wqe_last; /* Do usual send operation in case payload less than mss */ if (0 == pswr->tso.mss) { @@ -637,7 +753,7 @@ inline int qp_mgr_eth_mlx5::fill_wqe_lso(xlio_ibv_send_wr *pswr) htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(XLIO_IBV_WR_SEND) & 0xff)); } - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_last + sizeof(*ctrl)); eseg->mss = htons(pswr->tso.mss); eseg->inline_hdr_sz = htons(inl_hdr_size); @@ -659,77 +775,65 @@ inline int qp_mgr_eth_mlx5::fill_wqe_lso(xlio_ibv_send_wr *pswr) max_inline_len = align_to_octoword_up(inl_hdr_copy_size); cur_seg = (uint8_t *)m_sq_wqes + max_inline_len; wqe_size += rest / OCTOWORD; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; } wqe_size += max_inline_len / OCTOWORD; - qp_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, - max_inline_len, inl_hdr_size, rest); + hwqtx_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, + max_inline_len, inl_hdr_size, rest); // Filling data pointer segments with payload by scatter-gather list elements dpseg = (struct mlx5_wqe_data_seg *)cur_seg; for (i = 0; i < pswr->num_sge; i++) { if (unlikely((uintptr_t)dpseg >= (uintptr_t)m_sq_wqes_end)) { dpseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; } dpseg->addr = htonll((uint64_t)pswr->sg_list[i].addr); dpseg->lkey = htonl(pswr->sg_list[i].lkey); dpseg->byte_count = htonl(pswr->sg_list[i].length); - qp_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", - pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); + hwqtx_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", + pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); dpseg++; wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; } - inl_hdr_size = align_to_WQEBB_up(wqe_size) / 4; - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - - // sending by BlueFlame or DoorBell covering wrap around - // TODO Make a single doorbell call - if (likely(inl_hdr_size <= 4)) { - if (likely(inl_hdr_copy_size == 0)) { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_copy_size, inl_hdr_size - inl_hdr_copy_size); - } - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } + m_sq_wqe_last->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + return align_to_WQEBB_up(wqe_size) / 4; } -void qp_mgr_eth_mlx5::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) +void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) { 
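/*
 * Illustrative sketch only, not part of this patch: a simplified model of the
 * m_sq_wqe_idx_to_prop bookkeeping that store_current_wqe_prop() maintains.
 * Each posted WQE records, at its SQ slot, the buffer it owns, its credit cost
 * and a link to the previously posted entry, so that one signalled CQE can
 * release the whole chain of unsignalled WQEs behind it. All names below
 * (Prop, SqProps, ...) are invented for the example.
 */
#include <cstddef>
#include <vector>

struct Prop {
    void *buf = nullptr;   // nullptr for control WQEs (UMR, NOP, DUMP, ...)
    unsigned credits = 0;  // WQEBBs this WQE consumed
    Prop *next = nullptr;  // previously posted, not yet completed WQE
};

class SqProps {
public:
    explicit SqProps(size_t slots) : m_props(slots) {}

    // Called when posting a WQE at producer slot `slot`.
    void store(size_t slot, void *buf, unsigned credits)
    {
        m_props[slot] = Prop {buf, credits, m_last};
        m_last = &m_props[slot];
    }

    // A CQE for `slot` releases that WQE and everything chained behind it;
    // returns the credits that become available again.
    unsigned complete(size_t slot)
    {
        unsigned freed = 0;
        for (Prop *p = &m_props[slot]; p;) {
            freed += p->credits;
            Prop *prev = p->next;
            *p = Prop {}; // mark as released
            p = prev;
        }
        if (m_last == &m_props[slot]) {
            m_last = nullptr;
        }
        return freed;
    }

private:
    std::vector<Prop> m_props;
    Prop *m_last = nullptr;
};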
- m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index] = sq_wqe_prop { + m_sq_wqe_idx_to_prop[m_sq_wqe_last_index] = sq_wqe_prop { .buf = buf, .credits = credits, .ti = ti, .next = m_sq_wqe_prop_last, }; - m_sq_wqe_prop_last = &m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index]; - if (ti != NULL) { + m_sq_wqe_prop_last = &m_sq_wqe_idx_to_prop[m_sq_wqe_last_index]; + if (ti) { ti->get(); } } -//! Send one RAW packet by MLX5 BlueFlame -// -int qp_mgr_eth_mlx5::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) +//! Send one RAW packet +void hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, + bool request_comp, bool skip_db, xlio_tis *tis, unsigned credits) { - struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; + struct xlio_mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_wqe_eth_seg *eseg; uint32_t tisn = tis ? tis->get_tisn() : 0; - ctrl = (struct xlio_mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); + update_wqe_last(); + memset(m_sq_wqe_last, 0, sizeof(*m_sq_wqe_last)); + + ctrl = (struct xlio_mlx5_wqe_ctrl_seg *)m_sq_wqe_last; + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_last + sizeof(*ctrl)); /* Configure ctrl segment * qpn_ds or ctrl.data[1] is set inside fill_wqe() */ ctrl->opmod_idx_opcode = htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(xlio_send_wr_opcode(*p_send_wqe)) & 0xff)); - m_sq_wqe_hot->ctrl.data[2] = 0; + m_sq_wqe_last->ctrl.data[2] = 0; ctrl->fm_ce_se = (request_comp ? (uint8_t)MLX5_WQE_CTRL_CQ_UPDATE : 0); ctrl->tis_tir_num = htobe32(tisn << 8); @@ -747,22 +851,138 @@ int qp_mgr_eth_mlx5::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packe /* Complete WQE */ int wqebbs = fill_wqe(p_send_wqe); assert(wqebbs > 0 && (unsigned)wqebbs <= credits); - NOT_IN_USE(wqebbs); + ring_doorbell(wqebbs, false, skip_db); - update_next_wqe_hot(); + hwqtx_logfunc( + "m_sq_wqe_last: %p m_sq_wqe_last_index: %d wqe_counter: %d new_last_index: %d wr_id: %llx", + m_sq_wqe_last, m_sq_wqe_last_index, m_sq_wqe_counter, + (m_sq_wqe_counter & (m_tx_num_wr - 1)), p_send_wqe->wr_id); +} - qp_logfunc( - "m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", - m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter & (m_tx_num_wr - 1)), - p_send_wqe->wr_id); +std::unique_ptr hw_queue_tx::create_tis(uint32_t flags) +{ + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; + if (unlikely(!adapter || (is_tls && is_nvme))) { + return nullptr; + } - return 0; + dpcp::tis::attr tis_attr = { + .flags = flags, + .tls_en = is_tls, + .nvmeotcp = is_nvme, + .transport_domain = adapter->get_td(), + .pd = adapter->get_pd(), + }; + + dpcp::tis *dpcp_tis = nullptr; + if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { + hwqtx_logerr("Failed to create TIS with NVME enabled"); + return nullptr; + } + + auto tis_type = is_tls ? 
xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; + return std::make_unique(this, std::unique_ptr(dpcp_tis), tis_type); +} + +static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, + xlio_mlx5_wqe_umr_ctrl_seg *ucseg, + uint32_t producer_index, uint32_t qpn, + uint32_t tisn, uint8_t fence_flags) +{ + memset(cseg, 0, sizeof(*cseg)); + memset(ucseg, 0, sizeof(*ucseg)); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); + size_t num_wqe_ds = 12U; + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); + cseg->fm_ce_se = fence_flags; + cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); +} + +static inline void nvme_fill_static_params_transport_params( + mlx5_wqe_transport_static_params_seg *params, uint32_t config) + +{ + memset(params, 0, sizeof(*params)); + void *ctx = params->ctx; + + DEVX_SET(transport_static_params, ctx, const_1, 1); + DEVX_SET(transport_static_params, ctx, const_2, 2); + DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); + DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); + DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); + DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, ddgst_offload_en, + bool(config & XLIO_NVME_DDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, hdgst_offload_en, + bool(config & XLIO_NVME_HDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, ti, MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); + DEVX_SET(transport_static_params, ctx, const1, 1); + DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); } -#ifdef DEFINED_UTLS +static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, + uint32_t producer_index, uint32_t qpn, uint32_t tisn, + uint32_t tcp_seqno, uint8_t fence_flags) +{ + memset(wqe, 0, sizeof(*wqe)); + auto cseg = &wqe->ctrl.ctrl; + + size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); + cseg->fm_ce_se = fence_flags; + + mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; + params->tir_num = htobe32(tisn); + void *ctx = params->ctx; -std::unique_ptr qp_mgr_eth_mlx5::get_new_tls_dek(const void *key, - uint32_t key_size_bytes) + DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); + DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); + /* if (is_tx) offloading state == 0*/ + DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); +} + +void hw_queue_tx::nvme_set_static_context(xlio_tis *tis, uint32_t config) +{ + update_wqe_last(); + + auto *cseg = wqebb_get(0U); + auto *ucseg = wqebb_get(0U, sizeof(*cseg)); + + nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), + 0); + memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); + + auto *params = wqebb_get(2U); 
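/*
 * Illustrative sketch only, not part of this patch: how wqebb_get()-style
 * indexing wraps inside a power-of-two SQ ring when a multi-WQEBB WQE (such as
 * the NVMe static-params UMR above) is laid out relative to the producer
 * counter. Names (RingView, WQEBB_SIZE) are invented for the example; the real
 * code masks with (m_tx_num_wr - 1) precisely because the WQE count is always
 * rounded up to a power of two.
 */
#include <cassert>
#include <cstddef>
#include <cstdint>

constexpr size_t WQEBB_SIZE = 64; // one WQE basic block

struct RingView {
    uint8_t *base;     // start of the SQ buffer
    uint32_t wqe_cnt;  // number of WQEBBs, power of two
    uint16_t producer; // free-running producer counter

    // Address of the WQEBB `n` slots ahead of the producer, with wrap-around.
    void *wqebb(uint32_t n, size_t offset_in_wqebb = 0) const
    {
        assert((wqe_cnt & (wqe_cnt - 1)) == 0 && "wqe_cnt must be a power of two");
        uint32_t idx = (producer + n) & (wqe_cnt - 1);
        return base + idx * WQEBB_SIZE + offset_in_wqebb;
    }
};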
+ nvme_fill_static_params_transport_params(params, config); + store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); + ring_doorbell(MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); +} + +void hw_queue_tx::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) +{ + update_wqe_last(); + + auto *wqe = reinterpret_cast(m_sq_wqe_last); + nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, + MLX5_FENCE_MODE_INITIATOR_SMALL); + store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); + ring_doorbell(MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); +} + +#if defined(DEFINED_UTLS) +std::unique_ptr hw_queue_tx::get_new_tls_dek(const void *key, + uint32_t key_size_bytes) { dpcp::tls_dek *_dek = nullptr; dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); @@ -776,7 +996,7 @@ std::unique_ptr qp_mgr_eth_mlx5::get_new_tls_dek(const void *key, dek_attr.pd_id = adapter->get_pd(); status = adapter->create_tls_dek(dek_attr, _dek); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to create new DEK, status: %d", status); + hwqtx_logwarn("Failed to create new DEK, status: %d", status); if (_dek) { delete _dek; _dek = nullptr; @@ -787,8 +1007,7 @@ std::unique_ptr qp_mgr_eth_mlx5::get_new_tls_dek(const void *key, return std::unique_ptr(_dek); } -std::unique_ptr qp_mgr_eth_mlx5::get_tls_dek(const void *key, - uint32_t key_size_bytes) +std::unique_ptr hw_queue_tx::get_tls_dek(const void *key, uint32_t key_size_bytes) { dpcp::status status; dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); @@ -810,12 +1029,12 @@ std::unique_ptr qp_mgr_eth_mlx5::get_tls_dek(const void *key, } if (unlikely(m_tls_dek_get_cache.empty())) { - qp_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. Put-Cache size: %zu", - m_tls_dek_put_cache.size()); + hwqtx_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. Put-Cache size: %zu", + m_tls_dek_put_cache.size()); status = adapter->sync_crypto_tls(); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to flush DEK HW cache, status: %d", status); + hwqtx_logwarn("Failed to flush DEK HW cache, status: %d", status); return get_new_tls_dek(key, key_size_bytes); } @@ -833,16 +1052,16 @@ std::unique_ptr qp_mgr_eth_mlx5::get_tls_dek(const void *key, dek_attr.pd_id = adapter->get_pd(); status = out_dek->modify(dek_attr); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to modify DEK, status: %d", status); + hwqtx_logwarn("Failed to modify DEK, status: %d", status); out_dek.reset(nullptr); } return out_dek; } -void qp_mgr_eth_mlx5::put_tls_dek(std::unique_ptr &&tls_dek_obj) +void hw_queue_tx::put_tls_dek(std::unique_ptr &&tls_dek_obj) { - if (tls_dek_obj == nullptr) { + if (!tls_dek_obj) { return; } // We don't allow unlimited DEK cache to avoid system DEK starvation. 
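/*
 * Illustrative sketch only, not part of this patch: the two-list DEK reuse
 * scheme behind get_tls_dek()/put_tls_dek() above, modelled with a dummy key
 * type. Released keys go to a "put" list; when the "get" list runs dry, the
 * lists are swapped only after a (here simulated) synchronous crypto flush, so
 * a key is never handed out before the HW is done with its previous state.
 * All names below are invented for the example.
 */
#include <list>
#include <memory>

struct DummyDek {
    unsigned id;
};

class DekCache {
public:
    std::unique_ptr<DummyDek> get()
    {
        if (m_get.empty()) {
            if (m_put.empty() || !sync_crypto()) {
                return nullptr; // caller falls back to creating a brand new DEK
            }
            m_get.swap(m_put); // recycled keys become usable only after the flush
        }
        auto dek = std::move(m_get.front());
        m_get.pop_front();
        return dek;
    }

    void put(std::unique_ptr<DummyDek> &&dek, size_t limit = 1024)
    {
        if (dek && m_put.size() < limit) { // bound the cache to avoid DEK starvation
            m_put.push_back(std::move(dek));
        }
    }

private:
    bool sync_crypto() { return true; } // stands in for adapter->sync_crypto_tls()

    std::list<std::unique_ptr<DummyDek>> m_get;
    std::list<std::unique_ptr<DummyDek>> m_put;
};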
@@ -852,12 +1071,12 @@ void qp_mgr_eth_mlx5::put_tls_dek(std::unique_ptr &&tls_dek_obj) } } -xlio_tis *qp_mgr_eth_mlx5::tls_context_setup_tx(const xlio_tls_info *info) +xlio_tis *hw_queue_tx::tls_context_setup_tx(const xlio_tls_info *info) { std::unique_ptr tis; if (m_tls_tis_cache.empty()) { tis = create_tis(DPCP_TIS_FLAGS | dpcp::TIS_ATTR_TLS); - if (unlikely(tis == nullptr)) { + if (unlikely(!tis)) { return nullptr; } } else { @@ -884,8 +1103,7 @@ xlio_tis *qp_mgr_eth_mlx5::tls_context_setup_tx(const xlio_tls_info *info) return tis.release(); } -void qp_mgr_eth_mlx5::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, - bool skip_static) +void hw_queue_tx::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) { uint32_t tisn = tis->get_tisn(); @@ -896,29 +1114,9 @@ void qp_mgr_eth_mlx5::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis m_b_fence_needed = true; } -xlio_tir *qp_mgr_eth_mlx5::tls_create_tir(bool cached) -{ - xlio_tir *tir = NULL; - - if (cached && !m_tls_tir_cache.empty()) { - tir = m_tls_tir_cache.back(); - m_tls_tir_cache.pop_back(); - } else if (!cached) { - dpcp::tir *_tir = create_tir(true); - - if (_tir != NULL) { - tir = new xlio_tir(_tir, xlio_ti::ti_type::TLS_TIR); - } - if (unlikely(tir == NULL && _tir != NULL)) { - delete _tir; - } - } - return tir; -} - -int qp_mgr_eth_mlx5::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, - void *callback_arg) +int hw_queue_tx::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, + uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, + void *callback_arg) { uint32_t tirn; dpcp::tls_dek *_dek; @@ -933,7 +1131,7 @@ int qp_mgr_eth_mlx5::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *in dek_attr.pd_id = adapter->get_pd(); status = adapter->create_tls_dek(dek_attr, _dek); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logerr("Failed to create DEK, status: %d", status); + hwqtx_logerr("Failed to create DEK, status: %d", status); return -1; } tir->assign_dek(_dek); @@ -948,14 +1146,13 @@ int qp_mgr_eth_mlx5::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *in return 0; } -void qp_mgr_eth_mlx5::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t hw_resync_tcp_sn) +void hw_queue_tx::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) { tls_post_static_params_wqe(tir, info, tir->get_tirn(), tir->get_dek_id(), hw_resync_tcp_sn, false, false); } -void qp_mgr_eth_mlx5::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) +void hw_queue_tx::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) { /* Address must be aligned by 64. 
*/ assert((uintptr_t)buf == ((uintptr_t)buf >> 6U << 6U)); @@ -963,9 +1160,9 @@ void qp_mgr_eth_mlx5::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint3 tls_get_progress_params_wqe(tir, tir->get_tirn(), buf, lkey); } -inline void qp_mgr_eth_mlx5::tls_fill_static_params_wqe( - struct mlx5_wqe_tls_static_params_seg *params, const struct xlio_tls_info *info, - uint32_t key_id, uint32_t resync_tcp_sn) +inline void hw_queue_tx::tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, + const struct xlio_tls_info *info, + uint32_t key_id, uint32_t resync_tcp_sn) { unsigned char *initial_rn, *iv; uint8_t tls_version; @@ -994,14 +1191,14 @@ inline void qp_mgr_eth_mlx5::tls_fill_static_params_wqe( DEVX_SET(tls_static_params, ctx, dek_index, key_id); } -inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, - const struct xlio_tls_info *info, - uint32_t tis_tir_number, uint32_t key_id, - uint32_t resync_tcp_sn, bool fence, - bool is_tx) +inline void hw_queue_tx::tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, + uint32_t tis_tir_number, uint32_t key_id, + uint32_t resync_tcp_sn, bool fence, bool is_tx) { + update_wqe_last(); + struct mlx5_set_tls_static_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); + reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; xlio_mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; struct mlx5_mkey_seg *mkcseg = &wqe->mkc; @@ -1029,22 +1226,21 @@ inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, * * There are 3 cases: * 1. There is enough room in the SQ for 3 WQEBBs: - * 3 WQEBBs posted from m_sq_wqe_hot current location. + * 3 WQEBBs posted from m_sq_wqe_last current location. * 2. There is enough room in the SQ for 2 WQEBBs: - * 2 WQEBBs posted from m_sq_wqe_hot current location till m_sq_wqes_end. + * 2 WQEBBs posted from m_sq_wqe_last current location till m_sq_wqes_end. * 1 WQEBB posted from m_sq_wqes beginning. * 3. There is enough room in the SQ for 1 WQEBB: - * 1 WQEBB posted from m_sq_wqe_hot current location till m_sq_wqes_end. + * 1 WQEBB posted from m_sq_wqe_last current location till m_sq_wqes_end. * 2 WQEBBs posted from m_sq_wqes beginning. - * The case of 0 WQEBBs room left in the SQ shouldn't happen, m_sq_wqe_hot wrap around handling - * done when setting next m_sq_wqe_hot. + * The case of 0 WQEBBs room left in the SQ shouldn't happen, m_sq_wqe_last wrap around handling + * done when setting next m_sq_wqe_last. * * In all the 3 cases, no need to change cseg and ucseg pointers, since they fit to * one WQEBB and will be posted before m_sq_wqes_end. */ - // XXX: We set inline_hdr_sz for every new hot wqe. This corrupts UMR WQE without memset(). 
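/*
 * Illustrative sketch only, not part of this patch: the three wrap-around
 * cases described in the comment above for a 3-WQEBB static-params WQE. Given
 * how many WQEBBs remain until the end of the SQ buffer, the WQE splits into a
 * bottom part (written at the current position) and a top part (written from
 * the beginning of the ring). Names are invented for the example.
 */
#include <algorithm>
#include <cassert>

struct WqeSplit {
    int bottom; // WQEBBs written from the current position up to the ring end
    int top;    // WQEBBs written from the start of the ring
};

inline WqeSplit split_wqe(int wqe_wqebbs, int room_left_to_end)
{
    assert(room_left_to_end >= 1); // 0 cannot happen: wrap is handled when advancing
    int bottom = std::min(wqe_wqebbs, room_left_to_end);
    return WqeSplit {bottom, wqe_wqebbs - bottom};
}

// split_wqe(3, 3) -> {3, 0}   case 1: the whole WQE fits before the end
// split_wqe(3, 2) -> {2, 1}   case 2: the last segment wraps
// split_wqe(3, 1) -> {1, 2}   case 3: two segments wrap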
- memset(m_sq_wqe_hot, 0, sizeof(*m_sq_wqe_hot)); + memset(m_sq_wqe_last, 0, sizeof(*m_sq_wqe_last)); cseg->opmod_idx_opcode = htobe32(((m_sq_wqe_counter & 0xffff) << 8) | MLX5_OPCODE_UMR | (opmod << 24)); cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | STATIC_PARAMS_DS_CNT); @@ -1054,8 +1250,6 @@ inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, ucseg->flags = MLX5_UMR_INLINE; ucseg->bsf_octowords = htobe16(DEVX_ST_SZ_BYTES(tls_static_params) / 16); - int num_wqebbs = TLS_SET_STATIC_PARAMS_WQEBBS; - int num_wqebbs_top = 0; int sq_wqebbs_room_left = (static_cast(m_sq_wqes_end - reinterpret_cast(cseg)) / MLX5_SEND_WQE_BB); @@ -1067,14 +1261,10 @@ inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, if (unlikely(sq_wqebbs_room_left == 2)) { // Case 2: Change tspseg pointer: tspseg = reinterpret_cast(m_sq_wqes); - num_wqebbs = 2; - num_wqebbs_top = 1; } else if (unlikely(sq_wqebbs_room_left == 1)) { // Case 3: Change mkcseg and tspseg pointers: mkcseg = reinterpret_cast(m_sq_wqes); tspseg = reinterpret_cast( reinterpret_cast(m_sq_wqes) + sizeof(*mkcseg)); - num_wqebbs = 1; - num_wqebbs_top = 2; } memset(mkcseg, 0, sizeof(*mkcseg)); @@ -1083,13 +1273,11 @@ inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, tls_fill_static_params_wqe(tspseg, info, key_id, resync_tcp_sn); store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, num_wqebbs_top, true); - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_static_params_wqe)); - - update_next_wqe_hot(); + ring_doorbell(TLS_SET_STATIC_PARAMS_WQEBBS, true); + dbg_dump_wqe((uint32_t *)m_sq_wqe_last, sizeof(mlx5_set_tls_static_params_wqe)); } -inline void qp_mgr_eth_mlx5::tls_fill_progress_params_wqe( +inline void hw_queue_tx::tls_fill_progress_params_wqe( struct mlx5_wqe_tls_progress_params_seg *params, uint32_t tis_tir_number, uint32_t next_record_tcp_sn) { @@ -1103,14 +1291,14 @@ inline void qp_mgr_eth_mlx5::tls_fill_progress_params_wqe( DEVX_SET(tls_progress_params, ctx, auth_state, MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); } -inline void qp_mgr_eth_mlx5::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn, bool fence, - bool is_tx) +inline void hw_queue_tx::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, + uint32_t next_record_tcp_sn, bool fence, + bool is_tx) { - uint16_t num_wqebbs = TLS_SET_PROGRESS_PARAMS_WQEBBS; + update_wqe_last(); struct mlx5_set_tls_progress_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); + reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; uint8_t opmod = is_tx ? 
MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS : MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; @@ -1129,19 +1317,17 @@ inline void qp_mgr_eth_mlx5::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tls_fill_progress_params_wqe(&wqe->params, tis_tir_number, next_record_tcp_sn); store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_progress_params_wqe)); - - update_next_wqe_hot(); + ring_doorbell(TLS_SET_PROGRESS_PARAMS_WQEBBS); + dbg_dump_wqe((uint32_t *)m_sq_wqe_last, sizeof(mlx5_set_tls_progress_params_wqe)); } -inline void qp_mgr_eth_mlx5::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, - uint32_t lkey) +inline void hw_queue_tx::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, + uint32_t lkey) { - uint16_t num_wqebbs = TLS_GET_PROGRESS_WQEBBS; + update_wqe_last(); struct mlx5_get_tls_progress_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); + reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; struct xlio_mlx5_seg_get_psv *psv = &wqe->psv; uint8_t opmod = MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; @@ -1162,204 +1348,60 @@ inline void qp_mgr_eth_mlx5::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t t store_current_wqe_prop(nullptr, SQ_CREDITS_GET_PSV, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); - - update_next_wqe_hot(); + ring_doorbell(TLS_GET_PROGRESS_WQEBBS); } -void qp_mgr_eth_mlx5::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) +void hw_queue_tx::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, + bool first) { post_dump_wqe(tis, addr, len, lkey, first); } -void qp_mgr_eth_mlx5::tls_release_tis(xlio_tis *tis) +void hw_queue_tx::tls_release_tis(xlio_tis *tis) { - assert(tis != nullptr && tis->m_type == xlio_ti::ti_type::TLS_TIS); + assert(tis && tis->m_type == xlio_ti::ti_type::TLS_TIS); tis->m_released = true; if (tis->m_ref == 0) { put_tls_tis_in_cache(tis); } } -void qp_mgr_eth_mlx5::tls_release_tir(xlio_tir *tir) -{ - /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ - - assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); - tir->m_released = true; - tir->assign_callback(NULL, NULL); - if (tir->m_ref == 0) { - put_tls_tir_in_cache(tir); - } -} - -dpcp::tir *qp_mgr_eth_mlx5::xlio_tir_to_dpcp_tir(xlio_tir *tir) -{ - return tir->m_p_tir.get(); -} -#else /* DEFINED_UTLS */ -void qp_mgr_eth_mlx5::ti_released(xlio_ti *) {}; -void qp_mgr_eth_mlx5::destroy_tis_cache(void) {}; -#endif /* DEFINED_UTLS */ - -#ifdef DEFINED_DPCP -std::unique_ptr qp_mgr_eth_mlx5::create_tis(uint32_t flags) const -{ - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; - if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { - return nullptr; - } - - dpcp::tis::attr tis_attr = { - .flags = flags, - .tls_en = is_tls, - .nvmeotcp = is_nvme, - .transport_domain = adapter->get_td(), - .pd = adapter->get_pd(), - }; - - dpcp::tis *dpcp_tis = nullptr; - if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { - qp_logerr("Failed to create TIS with NVME enabled"); - return nullptr; - } - - auto tis_type = is_tls ? 
xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; - return std::make_unique(std::unique_ptr(dpcp_tis), tis_type); -} - -static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, - xlio_mlx5_wqe_umr_ctrl_seg *ucseg, - uint32_t producer_index, uint32_t qpn, - uint32_t tisn, uint8_t fence_flags) -{ - memset(cseg, 0, sizeof(*cseg)); - memset(ucseg, 0, sizeof(*ucseg)); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | - (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); - size_t num_wqe_ds = 12U; - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); - cseg->fm_ce_se = fence_flags; - cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); - - ucseg->flags = MLX5_UMR_INLINE; - ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); -} - -static inline void nvme_fill_static_params_transport_params( - mlx5_wqe_transport_static_params_seg *params, uint32_t config) - +void hw_queue_tx::put_tls_tis_in_cache(xlio_tis *tis) { - memset(params, 0, sizeof(*params)); - void *ctx = params->ctx; - - DEVX_SET(transport_static_params, ctx, const_1, 1); - DEVX_SET(transport_static_params, ctx, const_2, 2); - DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); - DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); - DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); - DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, ddgst_offload_en, - bool(config & XLIO_NVME_DDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, hdgst_offload_en, - bool(config & XLIO_NVME_HDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, ti, MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); - DEVX_SET(transport_static_params, ctx, const1, 1); - DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); -} - -static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, - uint32_t producer_index, uint32_t qpn, uint32_t tisn, - uint32_t tcp_seqno, uint8_t fence_flags) -{ - memset(wqe, 0, sizeof(*wqe)); - auto cseg = &wqe->ctrl.ctrl; - - size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | - (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); - cseg->fm_ce_se = fence_flags; - - mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; - params->tir_num = htobe32(tisn); - void *ctx = params->ctx; - - DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); - DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, - MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); - /* if (is_tx) offloading state == 0*/ - DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); -} - -void qp_mgr_eth_mlx5::nvme_set_static_context(xlio_tis *tis, uint32_t config) -{ - auto *cseg = wqebb_get(0U); - auto *ucseg = wqebb_get(0U, sizeof(*cseg)); - - nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), - 0); - memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); - - auto *params = wqebb_get(2U); - nvme_fill_static_params_transport_params(params, config); - 
store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); - update_next_wqe_hot(); -} + std::unique_ptr dek = tis->release_dek(); + assert(dynamic_cast(dek.get())); -void qp_mgr_eth_mlx5::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) -{ - auto *wqe = reinterpret_cast(m_sq_wqe_hot); - nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, - MLX5_FENCE_MODE_INITIATOR_SMALL); - store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); - update_next_wqe_hot(); + put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); + m_tls_tis_cache.push_back(tis); } -#endif /* DEFINED_DPCP */ -#if defined(DEFINED_UTLS) -void qp_mgr_eth_mlx5::ti_released(xlio_ti *ti) +void hw_queue_tx::ti_released(xlio_ti *ti) { assert(ti->m_released); assert(ti->m_ref == 0); if (ti->m_type == xlio_ti::ti_type::TLS_TIS) { put_tls_tis_in_cache(static_cast(ti)); - } else if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { - put_tls_tir_in_cache(static_cast(ti)); } } -void qp_mgr_eth_mlx5::put_tls_tis_in_cache(xlio_tis *tis) -{ - std::unique_ptr dek = tis->release_dek(); - assert(dynamic_cast(dek.get()) != nullptr); - - put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); - m_tls_tis_cache.push_back(tis); -} - -void qp_mgr_eth_mlx5::put_tls_tir_in_cache(xlio_tir *tir) +void hw_queue_tx::destroy_tis_cache(void) { - // Because the absense of TIR flush command, reusing a TIR - // may result in undefined behaviour. - // Until a flush command is available the TIR cache is disabled. - // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. - // m_tls_tir_cache.push_back(tir); - - delete tir; + while (!m_tls_tis_cache.empty()) { + xlio_tis *tis = m_tls_tis_cache.back(); + m_tls_tis_cache.pop_back(); + delete tis; + } } +#else /* DEFINED_UTLS */ +void hw_queue_tx::ti_released(xlio_ti *) {}; +void hw_queue_tx::destroy_tis_cache(void) {}; #endif /* defined(DEFINED_UTLS) */ -void qp_mgr_eth_mlx5::post_nop_fence(void) +void hw_queue_tx::post_nop_fence(void) { - struct mlx5_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); + update_wqe_last(); + + struct mlx5_wqe *wqe = reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; memset(wqe, 0, sizeof(*wqe)); @@ -1368,21 +1410,20 @@ void qp_mgr_eth_mlx5::post_nop_fence(void) cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | 0x01); cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; - store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, NULL); + store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, nullptr); - ring_doorbell(MLX5_DB_METHOD_DB, 1); - - update_next_wqe_hot(); + ring_doorbell(1); } -void qp_mgr_eth_mlx5::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool is_first) +void hw_queue_tx::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, + bool is_first) { - struct mlx5_dump_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); + update_wqe_last(); + + struct mlx5_dump_wqe *wqe = reinterpret_cast(m_sq_wqe_last); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; struct mlx5_wqe_data_seg *dseg = &wqe->data; uint32_t tisn = tis ? 
tis->get_tisn() : 0; - uint16_t num_wqebbs = XLIO_DUMP_WQEBBS; uint16_t ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; memset(wqe, 0, sizeof(*wqe)); @@ -1398,27 +1439,25 @@ void qp_mgr_eth_mlx5::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uin store_current_wqe_prop(nullptr, SQ_CREDITS_DUMP, tis); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, 0, true); - - update_next_wqe_hot(); + ring_doorbell(XLIO_DUMP_WQEBBS, true); } //! Handle releasing of Tx buffers // Single post send with SIGNAL of a dummy packet // NOTE: Since the QP is in ERROR state no packets will be sent on the wire! // So we can post_send anything we want :) -void qp_mgr_eth_mlx5::trigger_completion_for_all_sent_packets() +void hw_queue_tx::trigger_completion_for_all_sent_packets() { - qp_logfunc("unsignaled count=%d", m_n_unsignaled_count); + hwqtx_logfunc("unsignaled count=%d", m_n_unsignaled_count); if (!is_signal_requested_for_last_wqe()) { // Post a dummy WQE and request a signal to complete all the unsignaled WQEs in SQ - qp_logdbg("Need to send closing tx wr..."); + hwqtx_logdbg("Need to send closing tx wr..."); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); // Align Tx buffer accounting since we will be bypassing the normal send calls m_p_ring->m_missing_buf_ref_count--; if (!p_mem_buf_desc) { - qp_logerr("no buffer in pool"); + hwqtx_logerr("no buffer in pool"); return; } @@ -1442,10 +1481,10 @@ void qp_mgr_eth_mlx5::trigger_completion_for_all_sent_packets() memset(&send_wr, 0, sizeof(send_wr)); send_wr.wr_id = (uintptr_t)p_mem_buf_desc; - send_wr.wr.ud.ah = NULL; + send_wr.wr.ud.ah = nullptr; send_wr.sg_list = sge; send_wr.num_sge = 1; - send_wr.next = NULL; + send_wr.next = nullptr; xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; unsigned credits = credits_calculate(&send_wr); @@ -1453,17 +1492,17 @@ void qp_mgr_eth_mlx5::trigger_completion_for_all_sent_packets() // TODO Wait for available space in SQ to post the WQE. This method mustn't fail, // because we may want to wait until all the WQEs are completed and we need to post // something and request signal. 
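/*
 * Illustrative sketch only, not part of this patch: the completion batching
 * rule behind is_completion_need()/is_signal_requested_for_last_wqe() used by
 * trigger_completion_for_all_sent_packets() above. Only every Nth WQE requests
 * a CQE; on teardown, if the last posted WQE was unsignalled, one closing
 * signalled dummy WQE is posted so a single completion releases everything
 * still in flight. Names (SignalPolicy, batch) are invented for the example.
 */
struct SignalPolicy {
    unsigned batch;      // e.g. the tx_num_wr_to_signal setting
    unsigned unsignaled; // countdown since the last signalled WQE

    explicit SignalPolicy(unsigned b) : batch(b), unsignaled(b - 1) {}

    // Returns true if the WQE being posted must request a completion.
    bool post_one()
    {
        if (unsignaled == 0) {
            unsignaled = batch - 1; // reset the countdown and request a CQE
            return true;
        }
        --unsignaled;
        return false;
    }

    // True if the most recently posted WQE already requested a completion.
    bool last_was_signalled() const { return unsignaled == batch - 1; }

    // On teardown: do we need a closing signalled dummy WQE?
    bool need_flush_wqe() const { return !last_was_signalled(); }
};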
- qp_logdbg("No space in SQ to trigger completions with a post operation"); + hwqtx_logdbg("No space in SQ to trigger completions with a post operation"); return; } send_to_wire(&send_wr, (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM), - true, nullptr, credits); + true, false, nullptr, credits); } } -void qp_mgr_eth_mlx5::reset_inflight_zc_buffers_ctx(void *ctx) +void hw_queue_tx::reset_inflight_zc_buffers_ctx(void *ctx) { sq_wqe_prop *p = m_sq_wqe_prop_last; sq_wqe_prop *prev; @@ -1483,4 +1522,33 @@ void qp_mgr_eth_mlx5::reset_inflight_zc_buffers_ctx(void *ctx) } } -#endif /* DEFINED_DIRECT_VERBS */ +uint32_t hw_queue_tx::is_ratelimit_change(struct xlio_rate_limit_t &rate_limit) +{ + uint32_t rl_changes = 0; + + if (m_rate_limit.rate != rate_limit.rate) { + rl_changes |= RL_RATE; + } + if (m_rate_limit.max_burst_sz != rate_limit.max_burst_sz) { + rl_changes |= RL_BURST_SIZE; + } + if (m_rate_limit.typical_pkt_sz != rate_limit.typical_pkt_sz) { + rl_changes |= RL_PKT_SIZE; + } + + return rl_changes; +} + +int hw_queue_tx::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes) +{ + int ret; + + ret = priv_ibv_modify_qp_ratelimit(m_mlx5_qp.qp, rate_limit, rl_changes); + if (ret) { + hwqtx_logdbg("failed to modify qp ratelimit ret %d (errno=%d %m)", ret, errno); + return -1; + } + + m_rate_limit = rate_limit; + return 0; +} diff --git a/src/core/dev/hw_queue_tx.h b/src/core/dev/hw_queue_tx.h new file mode 100644 index 000000000..bb7929aee --- /dev/null +++ b/src/core/dev/hw_queue_tx.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef HW_QUEUE_TX_H +#define HW_QUEUE_TX_H + +#include +#include +#include "dev/xlio_ti.h" +#include "dev/cq_mgr_tx.h" +#include "dev/cq_mgr_rx.h" +#include "dev/dm_mgr.h" +#include "proto/mem_buf_desc.h" +#include "proto/xlio_lwip.h" +#include "util/sg_array.h" + +#ifndef MAX_SUPPORTED_IB_INLINE_SIZE +#define MAX_SUPPORTED_IB_INLINE_SIZE 884 +#endif + +struct slave_data_t; +struct xlio_tls_info; + +enum { + SQ_CREDITS_UMR = 3U, + SQ_CREDITS_SET_PSV = 1U, + SQ_CREDITS_GET_PSV = 1U, + SQ_CREDITS_DUMP = 1U, + SQ_CREDITS_NOP = 1U, + SQ_CREDITS_TLS_TX_CONTEXT = SQ_CREDITS_UMR + SQ_CREDITS_SET_PSV, + SQ_CREDITS_TLS_RX_CONTEXT = SQ_CREDITS_UMR + SQ_CREDITS_SET_PSV, + SQ_CREDITS_TLS_RX_RESYNC = SQ_CREDITS_UMR, + SQ_CREDITS_TLS_RX_GET_PSV = SQ_CREDITS_GET_PSV, +}; + +/* WQE properties description. */ +struct sq_wqe_prop { + /* A buffer held by the WQE. This is NULL for control WQEs. */ + mem_buf_desc_t *buf; + /* Number of credits (usually number of WQEBBs). */ + unsigned credits; + /* Transport interface (TIS/TIR) current WQE holds reference to. */ + xlio_ti *ti; + struct sq_wqe_prop *next; +}; + +// @class hw_queue_tx +// Object to manages the SQ operations. This object is used for Tx. +// Once created it requests from the system a CQ to work with. +class hw_queue_tx : public xlio_ti_owner { + friend class cq_mgr_tx; + +public: + hw_queue_tx(ring_simple *ring, const slave_data_t *slave, const uint32_t tx_num_wr); + virtual ~hw_queue_tx(); + + virtual void ti_released(xlio_ti *ti) override; + + void up(); + void down(); + + void send_wqe(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, + unsigned credits); + + void ring_delayed_doorbell() + { + if (m_b_db_needed) { + struct xlio_mlx5_wqe_ctrl_seg *ctrl = &m_sq_wqe_last->ctrl.ctrl; + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + m_b_db_needed = false; + set_unsignaled_count(); + + wmb(); + *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + wc_wmb(); + *(uint64_t *)m_mlx5_qp.bf.reg = *(uint64_t *)m_sq_wqe_last; + wc_wmb(); + } + } + + struct ibv_qp *get_ibv_qp() const { return m_mlx5_qp.qp; }; + + // This function can be replaced with a parameter during ring creation. + // chain of calls may serve as cache warm for dummy send feature. 
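/*
 * Illustrative sketch only, not part of this patch: the ordering contract that
 * ring_delayed_doorbell() above relies on, expressed with standard C++ fences
 * instead of the arch-specific wmb()/wc_wmb() macros the patch uses. The WQE
 * stores must be visible before the doorbell record update, and the doorbell
 * record before the register write. Names and types are invented; on real
 * hardware the BlueFlame/doorbell register is an MMIO mapping.
 */
#include <arpa/inet.h>
#include <atomic>
#include <cstdint>

struct FakeDoorbell {
    volatile uint32_t *dbrec;  // doorbell record in host memory
    volatile uint64_t *bf_reg; // doorbell/BlueFlame register (MMIO in reality)
};

inline void ring(FakeDoorbell &db, uint16_t producer, const uint64_t *first_ctrl_qword)
{
    // 1. All WQE stores must be globally visible before the doorbell record.
    std::atomic_thread_fence(std::memory_order_release);
    *db.dbrec = htonl(producer); // big-endian producer counter

    // 2. The doorbell record must be visible before the register write that
    //    tells the HW to fetch the new WQE.
    std::atomic_thread_fence(std::memory_order_seq_cst);
    *db.bf_reg = *first_ctrl_qword;

    // 3. Flush write-combining buffers so the register write is not delayed.
    std::atomic_thread_fence(std::memory_order_seq_cst);
}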
+    bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; }
+    cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; }
+    uint32_t get_max_inline_data() const { return m_mlx5_qp.cap.max_inline_data; }
+    uint32_t get_max_send_sge() const { return m_mlx5_qp.cap.max_send_sge; }
+
+    void modify_queue_to_ready_state();
+    void modify_queue_to_error_state();
+    void release_tx_buffers();
+    uint32_t is_ratelimit_change(struct xlio_rate_limit_t &rate_limit);
+    int modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes);
+    void dm_release_data(mem_buf_desc_t *buff) { m_dm_mgr.release_data(buff); }
+
+#ifdef DEFINED_UTLS
+    xlio_tis *tls_context_setup_tx(const xlio_tls_info *info);
+    xlio_tir *tls_create_tir(bool cached);
+    int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn,
+                             xlio_comp_cb_t callback, void *callback_arg);
+    void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static);
+    void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn);
+    void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey);
+    void tls_release_tis(xlio_tis *tis);
+    void tls_release_tir(xlio_tir *tir);
+    void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first);
+#endif /* DEFINED_UTLS */
+
+#define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD)
+#define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP)
+    std::unique_ptr<xlio_tis> create_tis(uint32_t flags);
+    void nvme_set_static_context(xlio_tis *tis, uint32_t config);
+    void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno);
+
+    /* Get a memory address inside a WQEBB at a wqebb_num offset from the current position
+     * (m_sq_wqe_counter) and account for wrap-around. Use offset_in_wqebb for the offset inside
+     * the WQEBB. Use the template parameter to cast the resulting address to the required
+     * pointer type. */
+    template <typename T>
+    constexpr inline T wqebb_get(size_t wqebb_num, size_t offset_in_wqebb = 0U)
+    {
+        return reinterpret_cast<T>(
+            reinterpret_cast<uint8_t *>(
+                &(*m_sq_wqes)[(m_sq_wqe_counter + wqebb_num) & (m_tx_num_wr - 1)]) +
+            offset_in_wqebb);
+    }
+
+    void post_nop_fence();
+    void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first);
+
+#if defined(DEFINED_UTLS)
+    std::unique_ptr<dpcp::tls_dek> get_new_tls_dek(const void *key, uint32_t key_size_bytes);
+    std::unique_ptr<dpcp::tls_dek> get_tls_dek(const void *key, uint32_t key_size_bytes);
+    void put_tls_dek(std::unique_ptr<dpcp::tls_dek> &&dek_obj);
+#endif
+
+    void reset_inflight_zc_buffers_ctx(void *ctx);
+
+    void credits_return(unsigned credits) { m_sq_free_credits += credits; }
+
+    bool credits_get(unsigned credits)
+    {
+        if (m_sq_free_credits >= credits) {
+            m_sq_free_credits -= credits;
+            return true;
+        }
+        return false;
+    }
+
+    unsigned credits_calculate(xlio_ibv_send_wr *p_send_wqe)
+    {
+        /* Credit is a logical value which is opaque for users. Only hw_queue_tx can interpret the
+         * value and, currently, one credit equals one WQEBB in the SQ.
+         *
+         * This method makes a best effort to predict how many WQEBBs will be used to send
+         * p_send_wqe in send_to_wire(). The predicted value may be higher than the actual one,
+         * but must not be lower.
+         *
+         * There are 3 branches in this order:
+         *  1. Full non-TSO packet inline
+         *  2. Non-TSO packet with scatter-gather elements and no inline data
+         *  3. TSO packet with inline headers
+         *
+         * Formula details:
+         *  1. A WQEBB is 64 bytes; the 1st WQEBB contains the ctrl segment, the eth segment and
+         *     18 bytes of inline data. So, we take the 1st WQEBB plus the number of WQEBBs for
+         *     the packet length minus 18 bytes.
+         *  2. The data segment for each scatter-gather element is 16 bytes, so a WQEBB can hold
+         *     up to 4 data segments. The 1st element fits into the 1st WQEBB after the eth
+         *     segment. So, we take the 1st WQEBB plus the number of WQEBBs for the remaining
+         *     scatter-gather elements.
+         *  3. The inline header starts at offset 46 in the WQE (2 bytes before 16-byte alignment).
+         *     Decrease the inline header size by 2 to align it to a 16-byte boundary at the right
+         *     edge; this compensates for the data segment alignment. Add the 2 bytes back and the
+         *     length of the scatter-gather elements. Take into account that 18 bytes go to the
+         *     1st WQEBB and add the 1st WQEBB to the result.
+         */
+        if (xlio_send_wr_opcode(*p_send_wqe) != XLIO_IBV_WR_TSO) {
+            if (p_send_wqe->num_sge == 1 && p_send_wqe->sg_list->length <= 204) {
+                return (p_send_wqe->sg_list->length + 63U - 18U) / 64U + 1U;
+            } else {
+                return (p_send_wqe->num_sge + 3U - 1U) / 4U + 1U;
+            }
+        } else {
+            return (((p_send_wqe->tso.hdr_sz + 15U - 2U) & ~15U) + 2U + p_send_wqe->num_sge * 16U -
+                    18U + 63U) /
+                       64U +
+                1U;
+        }
+    }
+
+private:
+    cq_mgr_tx *init_tx_cq_mgr();
+
+    int configure(const slave_data_t *slave);
+    int prepare_queue(xlio_ibv_qp_init_attr &qp_init_attr);
+    void init_queue();
+    void init_device_memory();
+    void trigger_completion_for_all_sent_packets();
+    void update_wqe_last();
+    void destroy_tis_cache();
+    void put_tls_tis_in_cache(xlio_tis *tis);
+
+    void send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp,
+                      bool skip_db, xlio_tis *tis, unsigned credits);
+
+    void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; }
+
+    bool is_completion_need() const
+    {
+        return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need());
+    }
+
+    bool is_signal_requested_for_last_wqe()
+    {
+        return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1;
+    }
+
+    void dec_unsignaled_count(void)
+    {
+        if (m_n_unsignaled_count > 0) {
+            --m_n_unsignaled_count;
+        }
+    }
+
+    bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev)
+    {
+        unsigned p_i = p - m_sq_wqe_idx_to_prop;
+        unsigned prev_i = prev - m_sq_wqe_idx_to_prop;
+        return (p_i != m_sq_wqe_prop_last_signalled) &&
+            ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr <
+             (m_tx_num_wr + prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr);
+    }
+
+#if defined(DEFINED_UTLS)
+    inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params,
+                                           const struct xlio_tls_info *info, uint32_t key_id,
+                                           uint32_t resync_tcp_sn);
+    inline void tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info,
+                                           uint32_t tis_tir_number, uint32_t key_id,
+                                           uint32_t resync_tcp_sn, bool fence, bool is_tx);
+    inline void tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params,
+                                             uint32_t tis_tir_number, uint32_t next_record_tcp_sn);
+    inline void tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number,
+                                             uint32_t next_record_tcp_sn, bool fence, bool is_tx);
+    inline void tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, uint32_t lkey);
+#endif /* DEFINED_UTLS */
+
+    inline void store_current_wqe_prop(mem_buf_desc_t *wr_id, unsigned credits, xlio_ti *ti);
+    inline int fill_wqe(xlio_ibv_send_wr *p_send_wqe);
+    inline int fill_wqe_inline(xlio_ibv_send_wr *pswr);
+    inline int fill_wqe_send(xlio_ibv_send_wr *pswr);
+    inline int fill_wqe_lso(xlio_ibv_send_wr *pswr);
+    inline int
fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, + int max_inline_len, int inline_len); + inline void ring_doorbell(int num_wqebb, bool skip_comp = false, bool skip_db = false); + + struct xlio_rate_limit_t m_rate_limit; + xlio_ib_mlx5_qp_t m_mlx5_qp; + ring_simple *m_p_ring; + cq_mgr_tx *m_p_cq_mgr_tx; + cq_mgr_rx *m_p_cq_mgr_rx_unused; + ib_ctx_handler *m_p_ib_ctx_handler; + sq_wqe_prop *m_sq_wqe_idx_to_prop = nullptr; + sq_wqe_prop *m_sq_wqe_prop_last = nullptr; + + struct mlx5_eth_wqe (*m_sq_wqes)[] = nullptr; + struct mlx5_eth_wqe *m_sq_wqe_last = nullptr; + uint8_t *m_sq_wqes_end = nullptr; + + const uint32_t m_n_sysvar_tx_num_wr_to_signal; + uint32_t m_tx_num_wr; + unsigned m_sq_wqe_prop_last_signalled = 0U; + unsigned m_sq_free_credits = 0U; + uint32_t m_n_unsignaled_count = 0U; + int m_sq_wqe_last_index = 0; + uint16_t m_sq_wqe_counter = 0U; + uint8_t m_port_num; + bool m_b_fence_needed = false; + bool m_b_db_needed = false; + bool m_dm_enabled = false; + bool m_hw_dummy_send_support = false; + dm_mgr m_dm_mgr; + + // TIS cache. Protected by ring tx lock. TODO Move to ring. + std::vector m_tls_tis_cache; + +#if defined(DEFINED_UTLS) + std::list> m_tls_dek_get_cache; + std::list> m_tls_dek_put_cache; +#endif +}; + +#endif // HW_QUEUE_TX_H diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index bf9bd40a2..cced43b2c 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -60,48 +60,27 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) , m_on_device_memory(0) , m_removed(false) , m_lock_umr("spin_lock_umr") - , m_p_ctx_time_converter(NULL) + , m_p_ctx_time_converter(nullptr) { - if (NULL == desc) { + if (!desc) { ibch_logpanic("Invalid ib_ctx_handler"); } m_p_ibv_device = desc->device; - if (m_p_ibv_device == NULL) { + if (!m_p_ibv_device) { ibch_logpanic("m_p_ibv_device is invalid"); } - m_p_ibv_context = NULL; -#ifdef DEFINED_DPCP m_p_adapter = set_dpcp_adapter(); - if (NULL == m_p_adapter) -#endif /* DEFINED_DPCP */ - { -#if defined(DEFINED_ROCE_LAG) - struct mlx5dv_context_attr dv_attr; - - memset(&dv_attr, 0, sizeof(dv_attr)); - dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX; - m_p_ibv_context = mlx5dv_open_device(m_p_ibv_device, &dv_attr); -#endif /* DEFINED_ROCE_LAG */ - if (m_p_ibv_context == NULL) { - m_p_ibv_context = ibv_open_device(m_p_ibv_device); - } - if (m_p_ibv_context == NULL) { - ibch_logpanic("m_p_ibv_context is invalid"); - } - // Create pd for this device - m_p_ibv_pd = ibv_alloc_pd(m_p_ibv_context); - if (m_p_ibv_pd == NULL) { - ibch_logpanic("ibv device %p pd allocation failure (ibv context %p) (errno=%d %m)", - m_p_ibv_device, m_p_ibv_context, errno); - } + if (!m_p_adapter) { + ibch_logpanic("ibv device %p adapter allocation failure (errno=%d %m)", m_p_ibv_device, + errno); } VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); m_p_ibv_device_attr = new xlio_ibv_device_attr_ex(); - if (m_p_ibv_device_attr == NULL) { + if (!m_p_ibv_device_attr) { ibch_logpanic("ibv device %p attr allocation failure (ibv context %p) (errno=%d %m)", m_p_ibv_device, m_p_ibv_context, errno); } @@ -138,15 +117,9 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) ibv_dealloc_pd(m_p_ibv_pd); } -#ifdef DEFINED_DPCP if (m_p_adapter) { delete m_p_adapter; - m_p_ibv_context = NULL; - } -#endif /* DEFINED_DPCP */ - if (m_p_ibv_context) { - ibv_close_device(m_p_ibv_context); - m_p_ibv_context = NULL; + m_p_ibv_context = nullptr; } } @@ -171,7 +144,7 @@ 
ib_ctx_handler::~ib_ctx_handler() } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); - m_p_ibv_pd = NULL; + m_p_ibv_pd = nullptr; } if (m_p_ctx_time_converter) { @@ -179,15 +152,9 @@ ib_ctx_handler::~ib_ctx_handler() } delete m_p_ibv_device_attr; -#ifdef DEFINED_DPCP if (m_p_adapter) { delete m_p_adapter; - m_p_ibv_context = NULL; - } -#endif /* DEFINED_DPCP */ - if (m_p_ibv_context) { - ibv_close_device(m_p_ibv_context); - m_p_ibv_context = NULL; + m_p_ibv_context = nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -235,8 +202,6 @@ void ib_ctx_handler::print_val() ibch_logdbg("%s", m_str); } -#ifdef DEFINED_DPCP - int parse_dpcp_version(const char *dpcp_ver) { static const std::string s_delimiter("."); @@ -257,15 +222,15 @@ int parse_dpcp_version(const char *dpcp_ver) dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() { dpcp::status status = dpcp::DPCP_ERR_NO_SUPPORT; - dpcp::provider *p_provider = NULL; - dpcp::adapter_info *dpcp_lst = NULL; + dpcp::provider *p_provider = nullptr; + dpcp::adapter_info *dpcp_lst = nullptr; size_t adapters_num = 0; size_t i = 0; int dpcp_ver = 0; - m_p_adapter = NULL; + m_p_adapter = nullptr; if (!m_p_ibv_device) { - return NULL; + return nullptr; } status = dpcp::provider::get_instance(p_provider); @@ -286,13 +251,13 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() * 0 arguments along with DPCP_ERR_OUT_OF_RANGE error. On success, the * number of actual adapters is not set, so we need a separate call here. */ - status = p_provider->get_adapter_info_lst(NULL, adapters_num); + status = p_provider->get_adapter_info_lst(nullptr, adapters_num); if (dpcp::DPCP_ERR_OUT_OF_RANGE != status || 0 == adapters_num) { ibch_logdbg("found no adapters status = %d", status); goto err; } - dpcp_lst = new (std::nothrow) dpcp::adapter_info[adapters_num]; + dpcp_lst = new (std::nothrow) dpcp::adapter_info[static_cast(adapters_num)]; if (!dpcp_lst) { ibch_logerr("failed allocating memory for devices"); goto err; @@ -306,13 +271,13 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() for (i = 0; i < adapters_num; i++) { if (dpcp_lst[i].name == m_p_ibv_device->name) { - dpcp::adapter *adapter = NULL; + dpcp::adapter *adapter = nullptr; status = p_provider->open_adapter(dpcp_lst[i].name, adapter); if ((dpcp::DPCP_OK == status) && (adapter)) { int ret = 0; - struct ibv_context *ctx = NULL; - struct ibv_pd *pd = NULL; + struct ibv_context *ctx = nullptr; + struct ibv_pd *pd = nullptr; mlx5dv_obj mlx5_obj; ctx = (ibv_context *)adapter->get_ibv_context(); @@ -355,6 +320,7 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() m_p_adapter = adapter; m_p_ibv_context = ctx; m_p_ibv_pd = pd; + check_capabilities(); ibch_logdbg("dpcp adapter: %s is up", adapter->get_name().c_str()); } @@ -369,11 +335,20 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() return m_p_adapter; } -#endif /* DEFINED_DPCP */ + +void ib_ctx_handler::check_capabilities() +{ + dpcp::adapter_hca_capabilities caps; + dpcp::status rc = m_p_adapter->get_hca_capabilities(caps); + if (rc == dpcp::DPCP_OK) { + set_flow_tag_capability(caps.flow_table_caps.receive.is_flow_action_tag_supported); + ibch_logdbg("Flow Tag Support: %s", get_flow_tag_capability() ? "Yes" : "No"); + } +} void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode) { - if (m_p_ctx_time_converter != NULL) { + if (m_p_ctx_time_converter) { /* * Don't override time_converter object. 
Current method may be * called more than once if multiple slaves point to the same @@ -439,20 +414,14 @@ void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversi #endif // DEFINED_IBV_CQ_TIMESTAMP } -ts_conversion_mode_t ib_ctx_handler::get_ctx_time_converter_status() -{ - return m_p_ctx_time_converter ? m_p_ctx_time_converter->get_converter_status() - : TS_CONVERSION_MODE_DISABLE; -} - uint32_t ib_ctx_handler::mem_reg(void *addr, size_t length, uint64_t access) { - struct ibv_mr *mr = NULL; + struct ibv_mr *mr = nullptr; uint32_t lkey = LKEY_ERROR; mr = ibv_reg_mr(m_p_ibv_pd, addr, length, access); VALGRIND_MAKE_MEM_DEFINED(mr, sizeof(ibv_mr)); - if (NULL == mr) { + if (!mr) { print_warning_rlimit_memlock(length, errno); } else { m_mr_map_lkey[mr->lkey] = mr; @@ -491,7 +460,7 @@ struct ibv_mr *ib_ctx_handler::get_mem_reg(uint32_t lkey) return iter->second; } - return NULL; + return nullptr; } uint32_t ib_ctx_handler::user_mem_reg(void *addr, size_t length, uint64_t access) @@ -578,6 +547,6 @@ void ib_ctx_handler::handle_event_device_fatal() g_p_event_handler_manager->unregister_ibverbs_event(m_p_ibv_context->async_fd, this); if (m_p_ctx_time_converter) { m_p_ctx_time_converter->clean_obj(); - m_p_ctx_time_converter = NULL; + m_p_ctx_time_converter = nullptr; } } diff --git a/src/core/dev/ib_ctx_handler.h b/src/core/dev/ib_ctx_handler.h index 8c97ac3bf..dd9c36d16 100644 --- a/src/core/dev/ib_ctx_handler.h +++ b/src/core/dev/ib_ctx_handler.h @@ -40,10 +40,7 @@ #include "dev/time_converter.h" #include "ib/base/verbs_extra.h" #include "utils/lock_wrapper.h" - -#ifdef DEFINED_DPCP #include -#endif /* DEFINED_DPCP */ typedef std::unordered_map mr_map_lkey_t; @@ -78,10 +75,9 @@ class ib_ctx_handler : public event_handler_ibverbs { ibv_device *get_ibv_device() { return m_p_ibv_device; } inline char *get_ibname() { return (m_p_ibv_device ? 
m_p_ibv_device->name : (char *)""); } struct ibv_context *get_ibv_context() { return m_p_ibv_context; } -#ifdef DEFINED_DPCP dpcp::adapter *set_dpcp_adapter(); dpcp::adapter *get_dpcp_adapter() { return m_p_adapter; } -#endif /* DEFINED_DPCP */ + void check_capabilities(); xlio_ibv_device_attr *get_ibv_device_attr() { return xlio_get_device_orig_attr(m_p_ibv_device_attr); @@ -93,7 +89,6 @@ class ib_ctx_handler : public event_handler_ibverbs { uint32_t user_mem_reg(void *addr, size_t length, uint64_t access); bool is_removed() { return m_removed; } void set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode); - ts_conversion_mode_t get_ctx_time_converter_status(); void set_flow_tag_capability(bool flow_tag_capability); bool get_flow_tag_capability() { return m_flow_tag_enabled; } // m_flow_tag_capability void set_burst_capability(bool burst); @@ -116,10 +111,8 @@ class ib_ctx_handler : public event_handler_ibverbs { private: void handle_event_device_fatal(); ibv_device *m_p_ibv_device; // HCA handle - struct ibv_context *m_p_ibv_context; -#ifdef DEFINED_DPCP + struct ibv_context *m_p_ibv_context = nullptr; dpcp::adapter *m_p_adapter; -#endif /* DEFINED_DPCP */ xlio_ibv_device_attr_ex *m_p_ibv_device_attr; ibv_pd *m_p_ibv_pd; bool m_flow_tag_enabled; diff --git a/src/core/dev/ib_ctx_handler_collection.cpp b/src/core/dev/ib_ctx_handler_collection.cpp index 7d11a4561..467483e59 100644 --- a/src/core/dev/ib_ctx_handler_collection.cpp +++ b/src/core/dev/ib_ctx_handler_collection.cpp @@ -50,7 +50,7 @@ #define ibchc_logfunc __log_info_func #define ibchc_logfuncall __log_info_funcall -ib_ctx_handler_collection *g_p_ib_ctx_handler_collection = NULL; +ib_ctx_handler_collection *g_p_ib_ctx_handler_collection = nullptr; void check_flow_steering_log_num_mgm_entry_size() { @@ -66,7 +66,8 @@ void check_flow_steering_log_num_mgm_entry_size() vlog_printf( VLOG_DEBUG, "Flow steering option for mlx4 driver does not exist in current OFED version\n"); - } else if (flow_steering_val[0] != '-' || (strtol(&flow_steering_val[1], NULL, 0) % 2) == 0) { + } else if (flow_steering_val[0] != '-' || + (strtol(&flow_steering_val[1], nullptr, 0) % 2) == 0) { char module_info[3] = {0}; if (!run_and_retreive_system_command("modinfo mlx4_core > /dev/null 2>&1 ; echo $?", module_info, sizeof(module_info)) && @@ -146,8 +147,8 @@ ib_ctx_handler_collection::~ib_ctx_handler_collection() void ib_ctx_handler_collection::update_tbl(const char *ifa_name) { - struct ibv_device **dev_list = NULL; - ib_ctx_handler *p_ib_ctx_handler = NULL; + struct ibv_device **dev_list = nullptr; + ib_ctx_handler *p_ib_ctx_handler = nullptr; int num_devices = 0; int i; @@ -216,7 +217,7 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) if (check_netvsc_device_exist(ifa_name)) { if (!get_netvsc_slave(ifa_name, active_slave, slave_flags)) { - return NULL; + return nullptr; } ifa_name = (const char *)active_slave; } else if (check_bond_device_exist(ifa_name)) { @@ -228,11 +229,11 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) /* active/active: return the first slave */ if (!get_bond_slaves_name_list(ifa_name, slaves, sizeof(slaves))) { - return NULL; + return nullptr; } slave_name = strtok_r(slaves, " ", &save_ptr); - if (NULL == slave_name) { - return NULL; + if (!slave_name) { + return nullptr; } save_ptr = strchr(slave_name, '\n'); if (save_ptr) { @@ -248,7 +249,7 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) } } - return NULL; + return nullptr; } 
void ib_ctx_handler_collection::del_ib_ctx(ib_ctx_handler *ib_ctx) diff --git a/src/core/dev/ib_ctx_handler_collection.h b/src/core/dev/ib_ctx_handler_collection.h index 9958c18eb..c60415b23 100644 --- a/src/core/dev/ib_ctx_handler_collection.h +++ b/src/core/dev/ib_ctx_handler_collection.h @@ -45,7 +45,7 @@ class ib_ctx_handler_collection { ib_ctx_handler_collection(); ~ib_ctx_handler_collection(); - void update_tbl(const char *ifa_name = NULL); + void update_tbl(const char *ifa_name = nullptr); void print_val_tbl(); inline ib_context_map_t *get_ib_cxt_list() diff --git a/src/core/dev/net_device_entry.cpp b/src/core/dev/net_device_entry.cpp index 49724a1d6..0167707a6 100644 --- a/src/core/dev/net_device_entry.cpp +++ b/src/core/dev/net_device_entry.cpp @@ -51,7 +51,7 @@ net_device_entry::net_device_entry(int if_index, net_device_val *ndv) m_val = ndv; m_is_valid = false; m_cma_id_bind_trial_count = 0; - m_timer_handle = NULL; + m_timer_handle = nullptr; timer_count = -1; m_bond = net_device_val::NO_BOND; @@ -78,7 +78,7 @@ net_device_entry::~net_device_entry() { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } net_device_val *p_ndv = dynamic_cast(m_val); if (p_ndv && p_ndv->get_is_bond() == net_device_val::LAG_8023ad) { diff --git a/src/core/dev/net_device_table_mgr.cpp b/src/core/dev/net_device_table_mgr.cpp index 2a68a7acd..ba0debaae 100644 --- a/src/core/dev/net_device_table_mgr.cpp +++ b/src/core/dev/net_device_table_mgr.cpp @@ -61,7 +61,7 @@ #define ndtm_logfunc __log_info_func #define ndtm_logfuncall __log_info_funcall -net_device_table_mgr *g_p_net_device_table_mgr = NULL; +net_device_table_mgr *g_p_net_device_table_mgr = nullptr; enum net_device_table_mgr_timers { RING_PROGRESS_ENGINE_TIMER, RING_ADAPT_CQ_MODERATION_TIMER }; @@ -76,7 +76,7 @@ net_device_table_mgr::net_device_table_mgr() ndtm_logdbg(""); - m_global_ring_epfd = orig_os_api.epoll_create(48); + m_global_ring_epfd = SYSCALL(epoll_create, 48); BULLSEYE_EXCLUDE_BLOCK_START if (m_global_ring_epfd == -1) { @@ -85,12 +85,12 @@ net_device_table_mgr::net_device_table_mgr() throw_xlio_exception("epoll_create failed"); } - if (orig_os_api.pipe(m_global_ring_pipe_fds)) { + if (SYSCALL(pipe, m_global_ring_pipe_fds)) { ndtm_logerr("pipe create failed. (errno=%d %m)", errno); free_ndtm_resources(); throw_xlio_exception("pipe create failed"); } - if (orig_os_api.write(m_global_ring_pipe_fds[1], "#", 1) != 1) { + if (SYSCALL(write, m_global_ring_pipe_fds[1], "#", 1) != 1) { ndtm_logerr("pipe write failed. (errno=%d %m)", errno); free_ndtm_resources(); throw_xlio_exception("pipe write failed"); @@ -103,7 +103,7 @@ net_device_table_mgr::net_device_table_mgr() /* throw exception if there are no supported devices. 
*/ if (m_net_device_map_index.empty()) { int num_devices = 0; - struct ibv_device **dev_list = NULL; + struct ibv_device **dev_list = nullptr; dev_list = xlio_ibv_get_device_list(&num_devices); if (dev_list && num_devices == 0) { ibv_free_device_list(dev_list); @@ -151,12 +151,12 @@ void net_device_table_mgr::free_ndtm_resources() m_lock.lock(); if (m_global_ring_epfd > 0) { - orig_os_api.close(m_global_ring_epfd); + SYSCALL(close, m_global_ring_epfd); m_global_ring_epfd = 0; } - orig_os_api.close(m_global_ring_pipe_fds[1]); - orig_os_api.close(m_global_ring_pipe_fds[0]); + SYSCALL(close, m_global_ring_pipe_fds[1]); + SYSCALL(close, m_global_ring_pipe_fds[0]); net_device_map_index_t::iterator itr; while ((itr = m_net_device_map_index.begin()) != m_net_device_map_index.end()) { @@ -191,7 +191,7 @@ void net_device_table_mgr::update_tbl() net_device_val *p_net_device_val; /* Set up the netlink socket */ - fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd < 0) { ndtm_logerr("netlink socket() creation"); return; @@ -210,7 +210,7 @@ void net_device_table_mgr::update_tbl() nl_req.infomsg.ifi_change = 0xffffffff; /* Send the netlink request */ - rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + rc = SYSCALL(send, fd, &nl_req, nl_req.hdr.nlmsg_len, 0); if (rc < 0) { ndtm_logerr("netlink send() operation"); goto ret; @@ -220,7 +220,7 @@ void net_device_table_mgr::update_tbl() do { /* Receive the netlink reply */ - rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + rc = SYSCALL(recv, fd, nl_res, sizeof(nl_res), 0); if (rc < 0) { ndtm_logerr("netlink recv() operation"); goto ret; @@ -296,7 +296,7 @@ void net_device_table_mgr::update_tbl() ndtm_logdbg("Check completed. 
Found %ld offload capable network interfaces", m_net_device_map_index.size()); - orig_os_api.close(fd); + SYSCALL(close, fd); } void net_device_table_mgr::print_val_tbl() @@ -323,18 +323,18 @@ net_device_val *net_device_table_mgr::get_net_device_val(const ip_addr &if_addr) ndtm_logdbg("Found %s for addr: %s", net_dev->to_str().c_str(), if_addr.to_str().c_str()); if (net_dev->get_state() == net_device_val::INVALID) { ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); - return NULL; + return nullptr; } return iter->second; } ndtm_logdbg("Can't find net_device for addr: %s", if_addr.to_str().c_str()); - return NULL; + return nullptr; } net_device_val *net_device_table_mgr::get_net_device_val(int if_index) { net_device_map_index_t::iterator iter; - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; std::lock_guard lock(m_lock); @@ -362,9 +362,9 @@ net_device_val *net_device_table_mgr::get_net_device_val(int if_index) net_dev->get_ifname()); if (ret > 0 && (size_t)ret < sizeof(sys_path)) { ret = errno; /* to suppress errno */ - int fd = open(sys_path, O_RDONLY); + int fd = SYSCALL(open, sys_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); goto out; } errno = ret; @@ -374,14 +374,14 @@ net_device_val *net_device_table_mgr::get_net_device_val(int if_index) } ndtm_logdbg("Can't find net_device for index: %d", if_index); - return NULL; + return nullptr; out: ndtm_logdbg("Found %s for index: %d", net_dev->to_str().c_str(), if_index); if (net_dev->get_state() == net_device_val::INVALID) { ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); - return NULL; + return nullptr; } return net_dev; } @@ -396,7 +396,7 @@ net_device_entry *net_device_table_mgr::create_new_entry(int if_index, const obs if (p_ndv) { return new net_device_entry(if_index, p_ndv); } - return NULL; + return nullptr; } void net_device_table_mgr::get_ip_list(local_ip_list_t &ip_list, sa_family_t family, int if_index) @@ -420,7 +420,8 @@ void net_device_table_mgr::get_ip_list(local_ip_list_t &ip_list, sa_family_t fam m_lock.unlock(); } -int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_sn, +int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, + uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /*= NULL*/) { ndtm_logfunc(""); @@ -429,8 +430,8 @@ int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_ net_device_map_index_t::iterator net_dev_iter; for (net_dev_iter = m_net_device_map_index.begin(); net_dev_iter != m_net_device_map_index.end(); net_dev_iter++) { - int ret = net_dev_iter->second->global_ring_poll_and_process_element(p_poll_sn, - pv_fd_ready_array); + int ret = net_dev_iter->second->global_ring_poll_and_process_element( + p_poll_sn_rx, p_poll_sn_tx, pv_fd_ready_array); if (ret < 0) { ndtm_logdbg("Error in net_device_val[%p]->poll_and_process_element() (errno=%d %m)", net_dev_iter->second, errno); @@ -446,14 +447,14 @@ int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_ return ret_total; } -int net_device_table_mgr::global_ring_request_notification(uint64_t poll_sn) +int net_device_table_mgr::global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { ndtm_logfunc(""); int ret_total = 0; net_device_map_index_t::iterator net_dev_iter; for (net_dev_iter = m_net_device_map_index.begin(); m_net_device_map_index.end() != net_dev_iter; net_dev_iter++) { - int ret = net_dev_iter->second->global_ring_request_notification(poll_sn); + int ret 
= net_dev_iter->second->global_ring_request_notification(poll_sn_rx, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { ndtm_logerr("Error in net_device_val[%p]->request_notification() (errno=%d %m)", @@ -479,7 +480,7 @@ int net_device_table_mgr::global_ring_wait_for_notification_and_process_element( int max_fd = 16; struct epoll_event events[max_fd]; - int res = orig_os_api.epoll_wait(global_ring_epfd_get(), events, max_fd, 0); + int res = SYSCALL(epoll_wait, global_ring_epfd_get(), events, max_fd, 0); if (res > 0) { for (int event_idx = 0; event_idx < res; ++event_idx) { int fd = events[event_idx].data.fd; // This is the Rx cq channel fd @@ -512,8 +513,8 @@ int net_device_table_mgr::global_ring_wait_for_notification_and_process_element( } else { ndtm_logdbg("removing wakeup fd from epfd"); BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_DEL, - m_global_ring_pipe_fds[0], NULL)) && + if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_DEL, + m_global_ring_pipe_fds[0], nullptr)) && (!(errno == ENOENT || errno == EBADF))) { ndtm_logerr("failed to del pipe channel fd from internal epfd (errno=%d %m)", errno); @@ -582,14 +583,13 @@ void net_device_table_mgr::handle_timer_expired(void *user_data) void net_device_table_mgr::global_ring_wakeup() { ndtm_logdbg(""); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; - ev.data.ptr = NULL; + ev.data.ptr = nullptr; int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_ADD, m_global_ring_pipe_fds[0], - &ev)) && + if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_ADD, m_global_ring_pipe_fds[0], &ev)) && (errno != EEXIST)) { ndtm_logerr("failed to add pipe channel fd to internal epfd (errno=%d %m)", errno); } @@ -617,7 +617,7 @@ void net_device_table_mgr::del_link_event(const netlink_link_info *info) * resources correctly. */ if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; int if_index = info->ifindex; ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, @@ -642,7 +642,7 @@ void net_device_table_mgr::new_link_event(const netlink_link_info *info) * DOWN state (see RTM_DELLINK). */ if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; int if_index = info->ifindex; ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, diff --git a/src/core/dev/net_device_table_mgr.h b/src/core/dev/net_device_table_mgr.h index b0a0d1425..9c5614f38 100644 --- a/src/core/dev/net_device_table_mgr.h +++ b/src/core/dev/net_device_table_mgr.h @@ -74,7 +74,8 @@ class net_device_table_mgr : public cache_table_mgr, publ * channel. 
If race condition case occures then that CQ is polled and processed (and the CQ * notification is armed) Returns >=0 the total number of wce processed < 0 on error */ - int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); + int global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = nullptr); /** * This will poll one time on the ALL the managed CQ's @@ -83,9 +84,9 @@ class net_device_table_mgr : public cache_table_mgr, publ * < 0 error */ int global_ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); - int global_ring_request_notification(uint64_t poll_sn); + int global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); /** * This will poll one time on the ALL the managed CQ's diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index d65cb3eea..d59b48a2f 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -72,30 +72,25 @@ ring_alloc_logic_attr::ring_alloc_logic_attr() : m_ring_alloc_logic(RING_LOGIC_PER_INTERFACE) - , m_user_id_key(0) , m_use_locks(true) + , m_user_id_key(0) { - m_mem_desc.iov_base = NULL; - m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(ring_logic_t ring_logic, bool use_locks) : m_ring_alloc_logic(ring_logic) - , m_user_id_key(0) , m_use_locks(use_locks) + , m_user_id_key(0) { - m_mem_desc.iov_base = NULL; - m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(const ring_alloc_logic_attr &other) : m_hash(other.m_hash) , m_ring_alloc_logic(other.m_ring_alloc_logic) - , m_user_id_key(other.m_user_id_key) - , m_mem_desc(other.m_mem_desc) , m_use_locks(other.m_use_locks) + , m_user_id_key(other.m_user_id_key) { } @@ -118,8 +113,6 @@ void ring_alloc_logic_attr::init() HASH_ITER(m_ring_alloc_logic, size_t); HASH_ITER(m_user_id_key, uint64_t); - HASH_ITER(m_mem_desc.iov_base, uintptr_t); - HASH_ITER(m_mem_desc.iov_len, size_t); HASH_ITER(m_use_locks, bool); m_hash = h; @@ -134,14 +127,6 @@ void ring_alloc_logic_attr::set_ring_alloc_logic(ring_logic_t logic) } } -void ring_alloc_logic_attr::set_memory_descriptor(iovec &mem_desc) -{ - if (m_mem_desc.iov_base != mem_desc.iov_base || m_mem_desc.iov_len != mem_desc.iov_len) { - m_mem_desc = mem_desc; - init(); - } -} - void ring_alloc_logic_attr::set_user_id_key(uint64_t user_id_key) { if (m_user_id_key != user_id_key) { @@ -162,8 +147,7 @@ const std::string ring_alloc_logic_attr::to_str() const { std::stringstream ss; - ss << "allocation logic " << m_ring_alloc_logic << " key " << m_user_id_key << " user address " - << m_mem_desc.iov_base << " user length " << m_mem_desc.iov_len << " use locks " + ss << "allocation logic " << m_ring_alloc_logic << " key " << m_user_id_key << " use locks " << !!m_use_locks; return ss.str(); @@ -174,8 +158,8 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) { bool valid = false; ib_ctx_handler *ib_ctx; - struct nlmsghdr *nl_msg = NULL; - struct ifinfomsg *nl_msgdata = NULL; + struct nlmsghdr *nl_msg = nullptr; + struct ifinfomsg *nl_msgdata = nullptr; int nl_attrlen; struct rtattr *nl_attr; @@ -185,15 +169,15 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) m_flags = 0; m_mtu = 0; m_state = INVALID; - m_p_L2_addr = NULL; - m_p_br_addr = NULL; + m_p_L2_addr = nullptr; + m_p_br_addr = nullptr; m_bond = NO_BOND; m_if_active = 0; 
m_bond_xmit_hash_policy = XHP_LAYER_2; m_bond_fail_over_mac = 0; m_transport_type = XLIO_TRANSPORT_UNKNOWN; - if (NULL == desc) { + if (!desc) { nd_logerr("Invalid net_device_val name=%s", "NA"); m_state = INVALID; return; @@ -332,12 +316,12 @@ net_device_val::~net_device_val() } if (m_p_br_addr) { delete m_p_br_addr; - m_p_br_addr = NULL; + m_p_br_addr = nullptr; } if (m_p_L2_addr) { delete m_p_L2_addr; - m_p_L2_addr = NULL; + m_p_L2_addr = nullptr; } slave_data_vector_t::iterator slave = m_slaves.begin(); @@ -360,7 +344,7 @@ void net_device_val::set_ip_array() static int _seq = 0; /* Set up the netlink socket */ - fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd < 0) { nd_logerr("netlink socket() creation"); return; @@ -377,7 +361,7 @@ void net_device_val::set_ip_array() nl_req.addrmsg.ifa_index = m_if_idx; /* Send the netlink request */ - rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + rc = SYSCALL(send, fd, &nl_req, nl_req.hdr.nlmsg_len, 0); if (rc < 0) { nd_logerr("netlink send() operation"); goto ret; @@ -385,7 +369,7 @@ void net_device_val::set_ip_array() do { /* Receive the netlink reply */ - rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + rc = SYSCALL(recv, fd, nl_res, sizeof(nl_res), 0); if (rc < 0) { nd_logerr("netlink recv() operation"); goto ret; @@ -442,7 +426,7 @@ void net_device_val::set_ip_array() } while (1); ret: - orig_os_api.close(fd); + SYSCALL(close, fd); print_ips(); } @@ -594,7 +578,7 @@ void net_device_val::set_slave_array() nd_logdbg(""); if (m_bond == NETVSC) { - slave_data_t *s = NULL; + slave_data_t *s = nullptr; unsigned int slave_flags = 0; if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) { if ((slave_flags & IFF_UP) && verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) { @@ -620,7 +604,7 @@ void net_device_val::set_slave_array() slave_data_t *s = new slave_data_t(if_nametoindex(slave)); m_slaves.push_back(s); - slave = strtok(NULL, " "); + slave = strtok(nullptr, " "); } } @@ -708,7 +692,7 @@ const slave_data_t *net_device_val::get_slave(int if_index) return cur_slave; } } - return NULL; + return nullptr; } void net_device_val::verify_bonding_mode() @@ -726,7 +710,7 @@ void net_device_val::verify_bonding_mode() sprintf(bond_failover_mac_param_file, BONDING_FAILOVER_MAC_PARAM_FILE, get_ifname_link()); if (priv_safe_read_file(bond_mode_param_file, bond_mode_file_content, FILENAME_MAX) > 0) { - char *bond_mode = NULL; + char *bond_mode = nullptr; bond_mode = strtok(bond_mode_file_content, " "); if (bond_mode) { if (!strcmp(bond_mode, "active-backup")) { @@ -752,16 +736,16 @@ void net_device_val::verify_bonding_mode() get_ifname_link()); if (priv_safe_try_read_file(bond_xmit_hash_policy_param_file, bond_xmit_hash_policy_file_content, FILENAME_MAX) > 0) { - char *bond_xhp = NULL; - char *saveptr = NULL; + char *bond_xhp = nullptr; + char *saveptr = nullptr; bond_xhp = strtok_r(bond_xmit_hash_policy_file_content, " ", &saveptr); - if (NULL == bond_xhp) { + if (!bond_xhp) { nd_logdbg("could not parse bond xmit hash policy, staying with default (L2)\n"); } else { - bond_xhp = strtok_r(NULL, " ", &saveptr); + bond_xhp = strtok_r(nullptr, " ", &saveptr); if (bond_xhp) { - m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, NULL, 10); + m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, nullptr, 10); if (m_bond_xmit_hash_policy < XHP_LAYER_2 || m_bond_xmit_hash_policy > XHP_ENCAP_3_4) { vlog_printf(VLOG_WARNING, @@ 
-949,9 +933,9 @@ bool net_device_val::update_active_slaves() void net_device_val::update_netvsc_slaves(int if_index, int if_flags) { - slave_data_t *s = NULL; + slave_data_t *s = nullptr; bool found = false; - ib_ctx_handler *ib_ctx = NULL, *up_ib_ctx = NULL; + ib_ctx_handler *ib_ctx = nullptr, *up_ib_ctx = nullptr; char if_name[IFNAMSIZ] = {0}; m_lock.lock(); @@ -1017,7 +1001,7 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) nd_logfunc(""); std::lock_guard lock(m_lock); key = ring_key_redirection_reserve(key); - ring *the_ring = NULL; + ring *the_ring = nullptr; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(key); if (m_h_ring_map.end() == ring_iter) { @@ -1026,11 +1010,11 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) resource_allocation_key *new_key = new resource_allocation_key(*key); the_ring = create_ring(new_key); if (!the_ring) { - return NULL; + return nullptr; } m_h_ring_map[new_key] = std::make_pair(the_ring, 0); // each ring is born with ref_count = 0 ring_iter = m_h_ring_map.find(new_key); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; size_t num_ring_rx_fds; int *ring_rx_fds_array = the_ring->get_rx_channel_fds(num_ring_rx_fds); ev.events = EPOLLIN; @@ -1038,8 +1022,8 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) int cq_ch_fd = ring_rx_fds_array[i]; ev.data.fd = cq_ch_fd; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), - EPOLL_CTL_ADD, cq_ch_fd, &ev))) { + if (unlikely(SYSCALL(epoll_ctl, g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_ADD, cq_ch_fd, &ev))) { nd_logerr( "Failed to add RING notification fd to global_table_mgr_epfd (errno=%d %s)", errno, strerror(errno)); @@ -1071,7 +1055,7 @@ int net_device_val::release_ring(resource_allocation_key *key) std::lock_guard lock(m_lock); red_key = get_ring_key_redirection(key); - ring *the_ring = NULL; + ring *the_ring = nullptr; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(red_key); if (m_h_ring_map.end() != ring_iter) { @@ -1091,10 +1075,9 @@ int net_device_val::release_ring(resource_allocation_key *key) for (size_t i = 0; i < num_ring_rx_fds; i++) { int cq_ch_fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely( - (orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), - EPOLL_CTL_DEL, cq_ch_fd, NULL)) && - (!(errno == ENOENT || errno == EBADF)))) { + if (unlikely((SYSCALL(epoll_ctl, g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_DEL, cq_ch_fd, nullptr)) && + (!(errno == ENOENT || errno == EBADF)))) { nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd " "(errno=%d %s)", errno, strerror(errno)); @@ -1190,7 +1173,8 @@ void net_device_val::ring_key_redirection_release(resource_allocation_key *key) } } -int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, +int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, + uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /*=NULL*/) { nd_logfuncall(""); @@ -1198,7 +1182,7 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, std::lock_guard lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { - int ret = THE_RING->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + int ret = THE_RING->poll_and_process_element_rx(p_poll_sn_rx, pv_fd_ready_array); BULLSEYE_EXCLUDE_BLOCK_START 
if (ret < 0 && errno != EAGAIN) { nd_logerr("Error in RX ring->poll_and_process_element() of %p (errno=%d %s)", THE_RING, @@ -1207,11 +1191,11 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); + nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn_rx); ret_total += ret; } #if defined(DEFINED_FORCE_TX_POLLING) - ret = THE_RING->poll_and_process_element_tx(p_poll_sn); + ret = THE_RING->poll_and_process_element_tx(p_poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { nd_logerr("Error in TX ring->poll_and_process_element() of %p (errno=%d %m)", THE_RING, @@ -1220,7 +1204,7 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); + nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn_tx); ret_total += ret; } #endif /* DEFINED_FORCE_TX_POLLING */ @@ -1228,13 +1212,13 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, return ret_total; } -int net_device_val::global_ring_request_notification(uint64_t poll_sn) +int net_device_val::global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { int ret_total = 0; std::lock_guard lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { - int ret = THE_RING->request_notification(CQT_RX, poll_sn); + int ret = THE_RING->request_notification(CQT_RX, poll_sn_rx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { nd_logerr("Error RX ring[%p]->request_notification() (errno=%d %s)", THE_RING, errno, @@ -1242,17 +1226,17 @@ int net_device_val::global_ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); + nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn_rx); ret_total += ret; #if defined(DEFINED_FORCE_TX_POLLING) - ret = THE_RING->request_notification(CQT_TX, poll_sn); + ret = THE_RING->request_notification(CQT_TX, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { nd_logerr("Error TX ring[%p]->request_notification() (errno=%d %m)", THE_RING, errno); return ret; } BULLSEYE_EXCLUDE_BLOCK_END - nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); + nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn_tx); ret_total += ret; #endif /* DEFINED_FORCE_TX_POLLING */ } @@ -1336,7 +1320,7 @@ void net_device_val_eth::configure() m_p_L2_addr = create_L2_address(get_ifname()); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_L2_addr == NULL) { + if (!m_p_L2_addr) { nd_logpanic("m_p_L2_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1380,7 +1364,7 @@ uint32_t net_device_val::get_priority_by_tc_class(uint32_t tc_class) void net_device_val_eth::parse_prio_egress_map() { int len, ret; - nl_cache *cache = NULL; + nl_cache *cache = nullptr; rtnl_link *link; vlan_map *map; @@ -1424,7 +1408,7 @@ void net_device_val_eth::parse_prio_egress_map() ring *net_device_val_eth::create_ring(resource_allocation_key *key) { - ring *ring = NULL; + ring *ring = nullptr; try { switch (m_bond) { @@ -1454,7 +1438,7 @@ L2_address *net_device_val_eth::create_L2_address(const char *ifname) { if (m_p_L2_addr) { delete 
m_p_L2_addr; - m_p_L2_addr = NULL; + m_p_L2_addr = nullptr; } unsigned char hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, false); @@ -1465,14 +1449,14 @@ void net_device_val_eth::create_br_address(const char *ifname) { if (m_p_br_addr) { delete m_p_br_addr; - m_p_br_addr = NULL; + m_p_br_addr = nullptr; } uint8_t hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, true); m_p_br_addr = new ETH_addr(hw_addr); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_br_addr == NULL) { + if (!m_p_br_addr) { nd_logpanic("m_p_br_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1504,7 +1488,7 @@ bool net_device_val::verify_bond_or_eth_qp_creation() char *slave_name; char *save_ptr; slave_name = strtok_r(slaves, " ", &save_ptr); - while (slave_name != NULL) { + while (slave_name) { char *p = strchr(slave_name, '\n'); if (p) { *p = '\0'; // Remove the tailing 'new line" char @@ -1513,7 +1497,7 @@ bool net_device_val::verify_bond_or_eth_qp_creation() // check all slaves but print only once for bond bond_ok = false; } - slave_name = strtok_r(NULL, " ", &save_ptr); + slave_name = strtok_r(nullptr, " ", &save_ptr); } if (!bond_ok) { vlog_printf(VLOG_WARNING, @@ -1571,9 +1555,9 @@ bool net_device_val::verify_qp_creation(const char *ifname, enum ibv_qp_type qp_ { bool success = false; char bond_roce_lag_path[256] = {0}; - struct ibv_cq *cq = NULL; - struct ibv_comp_channel *channel = NULL; - struct ibv_qp *qp = NULL; + struct ibv_cq *cq = nullptr; + struct ibv_comp_channel *channel = nullptr; + struct ibv_qp *qp = nullptr; struct ibv_context *context; int comp_vector = 0; @@ -1646,15 +1630,6 @@ bool net_device_val::verify_qp_creation(const char *ifname, enum ibv_qp_type qp_ qp = xlio_ibv_create_qp(p_ib_ctx->get_ibv_pd(), &qp_init_attr); if (qp) { success = true; - - if (qp_type == IBV_QPT_RAW_PACKET && - !priv_ibv_query_flow_tag_supported(qp, port_num, AF_INET) && - !priv_ibv_query_flow_tag_supported(qp, port_num, AF_INET6)) { - p_ib_ctx->set_flow_tag_capability(true); - } - nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, - p_ib_ctx->get_flow_tag_capability() ? 
"enabled" : "disabled"); - if (qp_type == IBV_QPT_RAW_PACKET && p_ib_ctx->is_packet_pacing_supported() && !priv_ibv_query_burst_supported(qp, port_num)) { p_ib_ctx->set_burst_capability(true); diff --git a/src/core/dev/net_device_val.h b/src/core/dev/net_device_val.h index 34d9aae84..658f7ba0e 100644 --- a/src/core/dev/net_device_val.h +++ b/src/core/dev/net_device_val.h @@ -60,21 +60,17 @@ class ring_alloc_logic_attr { ring_alloc_logic_attr(ring_logic_t ring_logic, bool use_locks); ring_alloc_logic_attr(const ring_alloc_logic_attr &other); void set_ring_alloc_logic(ring_logic_t logic); - void set_memory_descriptor(iovec &mem_desc); void set_user_id_key(uint64_t user_id_key); void set_use_locks(bool use_locks); const std::string to_str() const; inline ring_logic_t get_ring_alloc_logic() { return m_ring_alloc_logic; } - inline iovec *get_memory_descriptor() { return &m_mem_desc; } inline uint64_t get_user_id_key() { return m_user_id_key; } inline bool get_use_locks() { return m_use_locks; } bool operator==(const ring_alloc_logic_attr &other) const { return (m_ring_alloc_logic == other.m_ring_alloc_logic && - m_user_id_key == other.m_user_id_key && - m_mem_desc.iov_base == other.m_mem_desc.iov_base && - m_mem_desc.iov_len == other.m_mem_desc.iov_len && m_use_locks == other.m_use_locks); + m_user_id_key == other.m_user_id_key && m_use_locks == other.m_use_locks); } bool operator!=(const ring_alloc_logic_attr &other) const { return !(*this == other); } @@ -85,8 +81,6 @@ class ring_alloc_logic_attr { m_ring_alloc_logic = other.m_ring_alloc_logic; m_user_id_key = other.m_user_id_key; m_hash = other.m_hash; - m_mem_desc.iov_base = other.m_mem_desc.iov_base; - m_mem_desc.iov_len = other.m_mem_desc.iov_len; m_use_locks = other.m_use_locks; } return *this; @@ -101,12 +95,11 @@ class ring_alloc_logic_attr { private: size_t m_hash; - /* ring allocation logic , per thread per fd ... */ + /* Ring allocation logic: per thread, per interface, etc */ ring_logic_t m_ring_alloc_logic; - /* either user_idx or key as defined in ring_logic_t */ - uint64_t m_user_id_key; - iovec m_mem_desc; bool m_use_locks; + /* Either user_idx or key as defined in ring_logic_t */ + uint64_t m_user_id_key; void init(); }; @@ -129,14 +122,14 @@ typedef std::unordered_map slave_data_vector_t; @@ -247,8 +240,9 @@ class net_device_val { transport_type_t get_transport_type() const { return m_transport_type; } bool update_active_backup_slaves(); - int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); - int global_ring_request_notification(uint64_t poll_sn); + int global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = NULL); + int global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); int ring_drain_and_proccess(); void ring_adapt_cq_moderation(); L2_address *get_l2_address() { return m_p_L2_addr; }; diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp deleted file mode 100644 index 21d5d91ca..000000000 --- a/src/core/dev/qp_mgr.cpp +++ /dev/null @@ -1,713 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "qp_mgr.h" -#include "utils/bullseye.h" -#include "util/utils.h" -#include "util/valgrind.h" -#include "util/instrumentation.h" -#include "iomux/io_mux_call.h" -#include "buffer_pool.h" -#include "cq_mgr.h" -#include "ring_simple.h" -#include "util/valgrind.h" -#include "dev/rfs_rule_ibv.h" -#include - -#undef MODULE_NAME -#define MODULE_NAME "qpm" - -#define qp_logpanic __log_info_panic -#define qp_logerr __log_info_err -#define qp_logwarn __log_info_warn -#define qp_loginfo __log_info_info -#define qp_logdbg __log_info_dbg -#define qp_logfunc __log_info_func -#define qp_logfuncall __log_info_funcall - -//#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) -#define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_) & ~(0xf)))) - -#define FICTIVE_REMOTE_QPN 0x48 -#define FICTIVE_REMOTE_QKEY 0x01234567 -#define FICTIVE_AH_SL 5 -#define FICTIVE_AH_DLID 0x3 - -#define MAX_UPSTREAM_CQ_MSHV_SIZE 8192 - -qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr) - : m_qp(NULL) - , m_rq_wqe_idx_to_wrid(NULL) - , m_p_ring((ring_simple *)desc->ring) - , m_port_num((uint8_t)desc->slave->port_num) - , m_p_ib_ctx_handler((ib_ctx_handler *)desc->slave->p_ib_ctx) - , m_max_qp_wr(0) - , m_p_cq_mgr_rx(NULL) - , m_p_cq_mgr_tx(NULL) - , m_rx_num_wr(safe_mce_sys().rx_num_wr) - , m_tx_num_wr(tx_num_wr) - , m_hw_dummy_send_support(false) - , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) - , m_n_sysvar_tx_num_wr_to_signal(safe_mce_sys().tx_num_wr_to_signal) - , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) - , m_curr_rx_wr(0) - , m_last_posted_rx_wr_id(0) - , m_n_unsignaled_count(0) - , m_p_prev_rx_desc_pushed(NULL) - , m_n_ip_id_base(0) - , m_n_ip_id_offset(0) -{ - memset(&m_qp_cap, 0, sizeof(m_qp_cap)); - m_qp_cap.max_inline_data = safe_mce_sys().tx_max_inline; - m_qp_cap.max_send_sge = (m_p_ring->is_tso() ? m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge - : MCE_DEFAULT_TX_NUM_SGE); - m_qp_cap.max_recv_sge = (m_p_ring->is_socketxtreme()) ? 
1 : MCE_DEFAULT_RX_NUM_SGE; - - m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv]; - m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; - - memset(&m_rate_limit, 0, sizeof(struct xlio_rate_limit_t)); - - qp_logfunc(""); -} - -qp_mgr::~qp_mgr() -{ - qp_logfunc(""); - - qp_logdbg("calling ibv_destroy_qp(qp=%p)", m_qp); - if (m_qp) { - IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_qp), EIO) - { - qp_logdbg("QP destroy failure (errno = %d %m)", -errno); - } - ENDIF_VERBS_FAILURE; - VALGRIND_MAKE_MEM_UNDEFINED(m_qp, sizeof(ibv_qp)); - } - m_qp = NULL; - - if (m_p_cq_mgr_tx) { - delete m_p_cq_mgr_tx; - m_p_cq_mgr_tx = NULL; - } - if (m_p_cq_mgr_rx) { - delete m_p_cq_mgr_rx; - m_p_cq_mgr_rx = NULL; - } - - delete[] m_ibv_rx_sg_array; - delete[] m_ibv_rx_wr_array; - - qp_logdbg("Rx buffer poll: %ld free global buffers available", - g_buffer_pool_rx_rwqe->get_free_count()); - qp_logdbg("delete done"); -} - -cq_mgr *qp_mgr::handle_cq_initialization(uint32_t *num_wr, - struct ibv_comp_channel *comp_event_channel, bool is_rx) -{ - qp_logfunc(""); - cq_mgr *cq = NULL; - - try { - cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); - } catch (xlio_exception &e) { - // This is a workaround for an issue with cq creation of mlx4 devices on - // upstream-driver VMs over Windows Hypervisor. - if (safe_mce_sys().hypervisor == mce_sys_var::HYPER_MSHV && m_p_ib_ctx_handler->is_mlx4() && - *num_wr > MAX_UPSTREAM_CQ_MSHV_SIZE) { - qp_logdbg("cq creation failed with cq_size of %d. retrying with size of %d", *num_wr, - MAX_UPSTREAM_CQ_MSHV_SIZE); - *num_wr = MAX_UPSTREAM_CQ_MSHV_SIZE; - try { - cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); - } catch (xlio_exception &) { - } - } - - if (!cq) { - qp_logerr("%s", e.message); - } - } - - return cq; -} - -cq_mgr *qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - return handle_cq_initialization(&m_rx_num_wr, p_rx_comp_event_channel, true); -} - -cq_mgr *qp_mgr::init_tx_cq_mgr() -{ - return handle_cq_initialization(&m_tx_num_wr, m_p_ring->get_tx_comp_event_channel(), false); -} - -int qp_mgr::configure(struct qp_mgr_desc *desc) -{ - qp_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", - priv_xlio_transport_type_str(m_p_ring->get_transport_type()), - m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); - - // Check device capabilities for max QP work requests - m_max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr - 1); - if (m_rx_num_wr > m_max_qp_wr) { - qp_logwarn("Allocating only %d Rx QP work requests while user " - "requested %s=%d for QP on <%p, %d>", - m_max_qp_wr, SYS_VAR_RX_NUM_WRE, m_rx_num_wr, m_p_ib_ctx_handler, m_port_num); - m_rx_num_wr = m_max_qp_wr; - } - - qp_logdbg("HW Dummy send support for QP = %d", m_hw_dummy_send_support); - - // Create associated Tx & Rx cq_mgrs - m_p_cq_mgr_tx = init_tx_cq_mgr(); - BULLSEYE_EXCLUDE_BLOCK_START - if (!m_p_cq_mgr_tx) { - qp_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); - return -1; - } - m_p_cq_mgr_rx = init_rx_cq_mgr(desc->rx_comp_event_channel); - if (!m_p_cq_mgr_rx) { - qp_logerr("Failed allocating m_p_cq_mgr_rx (errno=%d %m)", errno); - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_END - - // Modify the Rx and Tx cq_mgr to use a non-blocking event channel - set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); - set_fd_block_mode(m_p_cq_mgr_tx->get_channel_fd(), false); - - 
qp_logdbg("cq tx: %p rx: %p", m_p_cq_mgr_tx, m_p_cq_mgr_rx); - - // Create QP - xlio_ibv_qp_init_attr qp_init_attr; - memset(&qp_init_attr, 0, sizeof(qp_init_attr)); - - // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_qp_cap - // and assigned as a result of ibv_query_qp() - m_qp_cap.max_send_wr = m_tx_num_wr; - m_qp_cap.max_recv_wr = m_rx_num_wr; - - memcpy(&qp_init_attr.cap, &m_qp_cap, sizeof(qp_init_attr.cap)); - qp_init_attr.recv_cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); - qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); - qp_init_attr.sq_sig_all = 0; - - // In case of enabled TSO we need to take into account amount of SGE together with header inline - // Per PRM maximum of CTRL + ETH + ETH_HEADER_INLINE+DATA_PTR*NUM_SGE+MAX_INLINE+INLINE_SIZE - // MLX5 return 32678 WQEBBs at max so minimal number - int max_wqe_sz = - 16 + 14 + 16 * qp_init_attr.cap.max_send_sge + qp_init_attr.cap.max_inline_data + 4; - max_wqe_sz += (m_p_ring->is_tso() ? m_p_ring->m_tso.max_header_sz : 94); - int num_wr = 32678 * 64 / max_wqe_sz; - qp_logdbg("calculated max_wqe_sz=%d num_wr=%d", max_wqe_sz, num_wr); - if (num_wr < (signed)m_tx_num_wr) { - qp_init_attr.cap.max_send_wr = - num_wr; // force min for create_qp or you will have error of memory allocation - } - - qp_logdbg("Requested QP parameters: " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, - qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, - qp_init_attr.cap.max_inline_data); - - // Create the QP - if (prepare_ibv_qp(qp_init_attr)) { - return -1; - } - - qp_logdbg("Configured QP parameters: " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, - qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, - qp_init_attr.cap.max_inline_data); - - /* Check initial parameters with actual */ - enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; - struct ibv_qp_attr tmp_ibv_qp_attr; - struct ibv_qp_init_attr tmp_ibv_qp_init_attr; - IF_VERBS_FAILURE(ibv_query_qp(m_qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr)) - { - qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); - return -1; - } - ENDIF_VERBS_FAILURE; - m_qp_cap.max_send_wr = min(tmp_ibv_qp_attr.cap.max_send_wr, m_qp_cap.max_send_wr); - m_qp_cap.max_recv_wr = min(tmp_ibv_qp_attr.cap.max_recv_wr, m_qp_cap.max_recv_wr); - m_qp_cap.max_send_sge = min(tmp_ibv_qp_attr.cap.max_send_sge, m_qp_cap.max_send_sge); - m_qp_cap.max_recv_sge = min(tmp_ibv_qp_attr.cap.max_recv_sge, m_qp_cap.max_recv_sge); - m_qp_cap.max_inline_data = min(tmp_ibv_qp_attr.cap.max_inline_data, m_qp_cap.max_inline_data); - - qp_logdbg("Used QP (num=%d) " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - m_qp->qp_num, m_qp_cap.max_send_wr, m_qp_cap.max_recv_wr, m_qp_cap.max_send_sge, - m_qp_cap.max_recv_sge, m_qp_cap.max_inline_data); - -#if defined(DEFINED_ROCE_LAG) - if (desc->slave && desc->slave->lag_tx_port_affinity > 0) { - const slave_data_t *p_slave = desc->slave; - struct mlx5dv_context attr_out; - - memset(&attr_out, 0, sizeof(attr_out)); - attr_out.comp_mask |= MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS; - if (!mlx5dv_query_device(p_slave->p_ib_ctx->get_ibv_context(), &attr_out)) { - qp_logdbg("QP ROCE LAG port: %d of %d", p_slave->lag_tx_port_affinity, - attr_out.num_lag_ports); - - if (!mlx5dv_modify_qp_lag_port(m_qp, p_slave->lag_tx_port_affinity)) { - uint8_t current_port_num = 0; - uint8_t active_port_num = 0; - - if 
(!mlx5dv_query_qp_lag_port(m_qp, &current_port_num, &active_port_num)) {
-                    qp_logdbg("QP ROCE LAG port affinity: %d => %d", current_port_num,
-                              active_port_num);
-                }
-            }
-        }
-    }
-#endif /* DEFINED_ROCE_LAG */
-    // All buffers will be allocated from this qp_mgr buffer pool so we can already set the Rx & Tx
-    // lkeys
-    for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) {
-        m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx];
-        m_ibv_rx_wr_array[wr_idx].num_sge = 1;
-        m_ibv_rx_wr_array[wr_idx].next =
-            (wr_idx < (m_n_sysvar_rx_num_wr_to_post_recv - 1) ? &m_ibv_rx_wr_array[wr_idx + 1]
-                                                              : NULL); // pre-define the linked list
-    }
-
-    m_curr_rx_wr = 0;
-
-    return 0;
-}
-
-void qp_mgr::up()
-{
-    // Add buffers
-    qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp));
-
-    m_p_cq_mgr_tx->add_qp_tx(this);
-
-    release_rx_buffers(); // We might have old flushed cqe's in our CQ still from previous HA event
-    release_tx_buffers();
-
-    modify_qp_to_ready_state();
-
-    m_p_cq_mgr_rx->add_qp_rx(this);
-}
-
-void qp_mgr::down()
-{
-    qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp));
-    modify_qp_to_error_state();
-
-    // free buffers from current active resource iterator
-    trigger_completion_for_all_sent_packets();
-
-    // let the QP drain all wqe's to flushed cqe's now that we moved
-    // it to error state and post_sent final trigger for completion
-    usleep(1000);
-
-    release_tx_buffers();
-    release_rx_buffers();
-    m_p_cq_mgr_tx->del_qp_tx(this);
-    m_p_cq_mgr_rx->del_qp_rx(this);
-}
-
-void qp_mgr::modify_qp_to_error_state()
-{
-    qp_logdbg("");
-
-    BULLSEYE_EXCLUDE_BLOCK_START
-    if (priv_ibv_modify_qp_to_err(m_qp)) {
-        qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno);
-    }
-    BULLSEYE_EXCLUDE_BLOCK_END
-}
-
-void qp_mgr::release_rx_buffers()
-{
-    int total_ret = m_curr_rx_wr;
-    if (m_curr_rx_wr) {
-        qp_logdbg("Returning %d pending post_recv buffers to CQ owner", m_curr_rx_wr);
-        while (m_curr_rx_wr) {
-            // Cleaning unposted buffers. Unposted buffers are not attached to any strides.
- --m_curr_rx_wr; - mem_buf_desc_t *p_mem_buf_desc = - (mem_buf_desc_t *)(uintptr_t)m_ibv_rx_wr_array[m_curr_rx_wr].wr_id; - if (p_mem_buf_desc && p_mem_buf_desc->p_desc_owner) { - m_p_ring->mem_buf_desc_return_to_owner_rx(p_mem_buf_desc); - } else { - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(p_mem_buf_desc); - } - } - } - // Wait for all FLUSHed WQE on Rx CQ - qp_logdbg("draining rx cq_mgr %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx, - m_last_posted_rx_wr_id); - uintptr_t last_polled_rx_wr_id = 0; - while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO && - !m_p_ib_ctx_handler->is_removed() && !is_rq_empty() && !g_b_exit) { - - // Process the FLUSH'ed WQE's - int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); - qp_logdbg("draining completed on rx cq_mgr (%d wce) last_polled_rx_wr_id = %lu", ret, - last_polled_rx_wr_id); - - total_ret += ret; - - if (!ret) { - // Query context for ib_verbs events (especially for IBV_EVENT_DEVICE_FATAL) - g_p_event_handler_manager->query_for_ibverbs_event( - m_p_ib_ctx_handler->get_ibv_context()->async_fd); - } - - // Add short delay (500 usec) to allow for WQE's to be flushed to CQ every poll cycle - const struct timespec short_sleep = {0, 500000}; // 500 usec - nanosleep(&short_sleep, NULL); - } - m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ - qp_logdbg("draining completed with a total of %d wce's on rx cq_mgr", total_ret); - NOT_IN_USE(total_ret); // Suppress --enable-opt-log=high warning -} - -void qp_mgr::release_tx_buffers() -{ - int ret; - uint64_t poll_sn = 0; - qp_logdbg("draining tx cq_mgr %p", m_p_cq_mgr_tx); - while (m_p_cq_mgr_tx && m_qp && - ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && - (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { - qp_logdbg("draining completed on tx cq_mgr (%d wce)", ret); - } - NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning -} - -void qp_mgr::trigger_completion_for_all_sent_packets() -{ - xlio_ibv_send_wr send_wr; - ibv_sge sge[1]; - - // Handle releasing of Tx buffers - // Single post send with SIGNAL of a dummy packet - - // NOTE: Since the QP is in ERROR state no packets will be sent on the wire! - // So we can post_send anything we want :) - - qp_logdbg("unsignaled count=%d", m_n_unsignaled_count); - if (!is_signal_requested_for_last_wqe()) { - qp_logdbg("Need to send closing tx wr..."); - mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); - m_p_ring->m_missing_buf_ref_count--; // Align Tx buffer accounting since we will be - // bypassing the normal send calls - if (!p_mem_buf_desc) { - qp_logerr("no buffer in pool"); - return; - } - - // Prepare dummy packet: zeroed payload ('0000'). - // For ETH it replaces the MAC header!! (Nothing is going on the wire, QP in error state) - // For IB it replaces the IPoIB header. 
- - /* need to send at least eth+ip, since libmlx5 will drop just eth header */ - ethhdr *p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; - memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); - p_buffer_ethhdr->h_proto = htons(ETH_P_IP); - iphdr *p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); - memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); - sge[0].length = sizeof(ethhdr) + sizeof(iphdr); - sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); - sge[0].lkey = m_p_ring->m_tx_lkey; - - // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) - // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. - - memset(&send_wr, 0, sizeof(send_wr)); - send_wr.wr_id = (uintptr_t)p_mem_buf_desc; - send_wr.sg_list = sge; - send_wr.num_sge = 1; - send_wr.next = NULL; - xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; - qp_logdbg("IBV_SEND_SIGNALED"); - - // Close the Tx unsignaled send list - set_unsignaled_count(); - - // We don't check for available space in SQ, because this is legacy code. - send_to_wire(&send_wr, - (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM), - true, NULL, 0); - } -} - -uint32_t qp_mgr::get_rx_max_wr_num() -{ - return m_rx_num_wr; -} - -void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - if (m_p_prev_rx_desc_pushed) { - m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; - } - m_p_prev_rx_desc_pushed = p_mem_buf_desc; - } - - m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; - m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; - - if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { - - m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; - - m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; - - m_curr_rx_wr = 0; - struct ibv_recv_wr *bad_wr = NULL; - IF_VERBS_FAILURE(ibv_post_recv(m_qp, &m_ibv_rx_wr_array[0], &bad_wr)) - { - uint32_t n_pos_bad_rx_wr = - ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); - qp_logerr("failed posting list (errno=%d %m)", errno); - qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", - n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); - qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", - bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, - bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); - qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); - - // Fix broken linked list of rx_wr - if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { - m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; - } - throw; - } - ENDIF_VERBS_FAILURE; - qp_logfunc("Successful ibv_post_recv"); - } else { - m_curr_rx_wr++; - } -} - -void qp_mgr::post_recv_buffers(descq_t *p_buffers, size_t count) -{ - qp_logfuncall(""); - // Called from cq_mgr context under cq_mgr::LOCK! 
- while (count--) { - post_recv_buffer(p_buffers->get_and_pop_front()); - } -} - -inline int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) -{ - NOT_IN_USE(attr); - NOT_IN_USE(tis); - NOT_IN_USE(credits); - int ret = 0; - xlio_ibv_send_wr *bad_wr = NULL; - - if (request_comp) { - xlio_send_wr_send_flags(*p_send_wqe) = - (xlio_ibv_send_flags)(xlio_send_wr_send_flags(*p_send_wqe) | XLIO_IBV_SEND_SIGNALED); - } - - IF_VERBS_FAILURE(xlio_ibv_post_send(m_qp, p_send_wqe, &bad_wr)) - { - qp_logerr( - "failed post_send%s (errno=%d %m)\n", - ((xlio_send_wr_send_flags(*p_send_wqe) & XLIO_IBV_SEND_INLINE) ? "(+inline)" : ""), - errno); - if (bad_wr) { - qp_logerr("bad_wr info: wr_id=%#lx, send_flags=%#lx, addr=%#lx, length=%d, lkey=%#x, " - "max_inline_data=%d", - bad_wr->wr_id, (unsigned long)xlio_send_wr_send_flags(*bad_wr), - bad_wr->sg_list[0].addr, bad_wr->sg_list[0].length, bad_wr->sg_list[0].lkey, - get_max_inline_data()); - } - ret = -1; - } - ENDIF_VERBS_FAILURE; - - // Clear the SINGAL request - xlio_send_wr_send_flags(*p_send_wqe) = - (xlio_ibv_send_flags)(xlio_send_wr_send_flags(*p_send_wqe) & ~XLIO_IBV_SEND_SIGNALED); - - return ret; -} - -int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, - unsigned credits) -{ - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)p_send_wqe->wr_id; - /* Control tx completions: - * - XLIO_TX_WRE_BATCHING - The number of Tx Work Request Elements used - * until a completion signal is requested. - * - ZCOPY packets should notify application as soon as possible to - * confirm one that user buffers are free to reuse. So force completion - * signal for such work requests. - * - First call of send() should do completion. It means that - * m_n_unsignaled_count must be zero for this time. 
- */ - bool request_comp = (p_mem_buf_desc->m_flags & mem_buf_desc_t::ZCOPY); - - qp_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); - - // TODO send_to_wire() and send() can return void after removing ibverbs support - if (send_to_wire(p_send_wqe, attr, request_comp, tis, credits)) { - return -1; - } - - if (request_comp || is_signal_requested_for_last_wqe()) { - uint64_t dummy_poll_sn = 0; - int ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); - BULLSEYE_EXCLUDE_BLOCK_START - if (ret < 0) { - qp_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); - } - BULLSEYE_EXCLUDE_BLOCK_END - qp_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); - } - - return 0; -} - -void qp_mgr_eth::modify_qp_to_ready_state() -{ - qp_logdbg(""); - int ret = 0; - int qp_state = priv_ibv_query_qp_state(m_qp); - if (qp_state != IBV_QPS_INIT) { - BULLSEYE_EXCLUDE_BLOCK_START - if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { - qp_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); - } - BULLSEYE_EXCLUDE_BLOCK_END - } - - BULLSEYE_EXCLUDE_BLOCK_START - if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_qp)) != 0) { - qp_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); - } - - BULLSEYE_EXCLUDE_BLOCK_END -} - -int qp_mgr_eth::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) -{ - qp_logdbg(""); - int ret = 0; - - qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; - xlio_ibv_qp_init_attr_comp_mask(m_p_ib_ctx_handler->get_ibv_pd(), qp_init_attr); - - if (m_p_ring->is_tso()) { - xlio_ibv_qp_init_attr_tso(qp_init_attr, m_p_ring->get_max_header_sz()); - qp_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); - } - - m_qp = xlio_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); - - BULLSEYE_EXCLUDE_BLOCK_START - if (!m_qp) { - qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); - return -1; - } - VALGRIND_MAKE_MEM_DEFINED(m_qp, sizeof(ibv_qp)); - if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { - qp_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); - return ret; - } - BULLSEYE_EXCLUDE_BLOCK_END - - return 0; -} - -uint32_t qp_mgr::is_ratelimit_change(struct xlio_rate_limit_t &rate_limit) -{ - uint32_t rl_changes = 0; - - if (m_rate_limit.rate != rate_limit.rate) { - rl_changes |= RL_RATE; - } - if (m_rate_limit.max_burst_sz != rate_limit.max_burst_sz) { - rl_changes |= RL_BURST_SIZE; - } - if (m_rate_limit.typical_pkt_sz != rate_limit.typical_pkt_sz) { - rl_changes |= RL_PKT_SIZE; - } - - return rl_changes; -} - -int qp_mgr::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes) -{ - int ret; - - ret = priv_ibv_modify_qp_ratelimit(m_qp, rate_limit, rl_changes); - if (ret) { - qp_logdbg("failed to modify qp ratelimit ret %d (errno=%d %m)", ret, errno); - return -1; - } - - m_rate_limit = rate_limit; - return 0; -} - -rfs_rule *qp_mgr::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) -{ - if (unlikely(tir_ext != NULL)) { - qp_logwarn("Requested steering rule cannot be created. 
Consider " - "building XLIO with DPCP support or disabling legacy RQ mode."); - return nullptr; - } - - unique_ptr new_rule(new rfs_rule_ibv()); - if (new_rule->create(attrs, this->get_ibv_qp())) { - return new_rule.release(); - } - - return nullptr; -} diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h deleted file mode 100644 index 57b816757..000000000 --- a/src/core/dev/qp_mgr.h +++ /dev/null @@ -1,505 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef QP_MGR_H -#define QP_MGR_H - -#include -#include -#include - -#include "ib/base/verbs_extra.h" -#include "proto/xlio_lwip.h" -#include "vlogger/vlogger.h" -#include "utils/atomic.h" -#include "util/vtypes.h" -#include "util/sys_vars.h" -#include "util/libxlio.h" -#include "util/if.h" -#include "lwip/opt.h" -#include "proto/mem_buf_desc.h" -#include "infra/sender.h" -#include "dev/ib_ctx_handler.h" -#include "dev/cq_mgr.h" -#include "dev/rfs_rule.h" - -/* Forward declarations */ -struct xlio_tls_info; -class xlio_tis; -class xlio_tir; -class buffer_pool; -class cq_mgr; -struct slave_data; -class ring; -class ring_simple; -class ring_eth_cb; - -#ifndef MAX_SUPPORTED_IB_INLINE_SIZE -#define MAX_SUPPORTED_IB_INLINE_SIZE 884 -#endif - -enum { - SQ_CREDITS_UMR = 3U, - SQ_CREDITS_SET_PSV = 1U, - SQ_CREDITS_GET_PSV = 1U, - SQ_CREDITS_DUMP = 1U, - SQ_CREDITS_NOP = 1U, - SQ_CREDITS_TLS_TX_CONTEXT = SQ_CREDITS_UMR + SQ_CREDITS_SET_PSV, - SQ_CREDITS_TLS_RX_CONTEXT = SQ_CREDITS_UMR + SQ_CREDITS_SET_PSV, - SQ_CREDITS_TLS_RX_RESYNC = SQ_CREDITS_UMR, - SQ_CREDITS_TLS_RX_GET_PSV = SQ_CREDITS_GET_PSV, -}; - -struct qp_mgr_desc { - ring_simple *ring; - const struct slave_data *slave; - struct ibv_comp_channel *rx_comp_event_channel; -}; - -/* Work request completion callback */ -/* TODO Add argument for completion status to handle errors. 
*/ -typedef void (*xlio_comp_cb_t)(void *); - -class xlio_ti { -public: - enum ti_type : uint8_t { UNKNOWN, TLS_TIS, TLS_TIR, NVME_TIS, NVME_TIR }; - - xlio_ti(ti_type type = UNKNOWN) - : m_type(type) - , m_released(false) - , m_ref(0) - , m_callback(nullptr) - , m_callback_arg(nullptr) - { - } - virtual ~xlio_ti() {}; - - inline void assign_callback(xlio_comp_cb_t callback, void *callback_arg) - { - m_callback = callback; - m_callback_arg = callback_arg; - } - - /* - * Reference counting. m_ref must be protected by ring tx lock. Device - * layer (QP, CQ) is responsible for the reference counting. - */ - - inline void get(void) - { - ++m_ref; - assert(m_ref > 0); - } - - inline uint32_t put(void) - { - assert(m_ref > 0); - return --m_ref; - } - - ti_type m_type; - bool m_released; - uint32_t m_ref; - - xlio_comp_cb_t m_callback; - void *m_callback_arg; -}; - -/** - * @class qp_mgr - * - * Object to manages the QP operation - * This object is used for Rx & Tx at the same time - * Once created it requests from the system a CQ to work with (for Rx & Tx separately) - * - * The qp_mgr object will manage the memory data buffers to be used for Rx & Tx. - * A descriptor (mem_buf_desc_t) is used to point to each memory data buffers which is also menaged - * by the qm_mgr. - * - * NOTE: - * The idea here is to use the rdma_cma_id object to manage the QP - * all we need is to rdma_resolve_addr() so we have the correct pkey in the cma_id object - * the rest is a simple transition of the QP states that is hidden inside the rdma_cm - * - */ -class qp_mgr { - friend class cq_mgr; - friend class cq_mgr_mlx5; - friend class cq_mgr_mlx5_strq; - friend class cq_mgr_mp; - -public: - qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr); - virtual ~qp_mgr(); - - virtual void up(); - virtual void down(); - - // Post for receive single mem_buf_desc - virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); - // Post for receive a list of mem_buf_desc - void post_recv_buffers(descq_t *p_buffers, size_t count); - int send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, - unsigned credits); - - inline uint32_t get_max_inline_data() const { return m_qp_cap.max_inline_data; } - inline uint32_t get_max_send_sge() const { return m_qp_cap.max_send_sge; } - int get_port_num() const { return m_port_num; } - virtual uint16_t get_partiton() const { return 0; }; - struct ibv_qp *get_ibv_qp() const { return m_qp; }; - class cq_mgr *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } - class cq_mgr *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } - virtual uint32_t get_rx_max_wr_num(); - // This function can be replaced with a parameter during ring creation. - // chain of calls may serve as cache warm for dummy send feature. 
- inline bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } - - virtual void modify_qp_to_ready_state() = 0; - virtual void modify_qp_to_error_state(); - - void release_rx_buffers(); - void release_tx_buffers(); - virtual void trigger_completion_for_all_sent_packets(); - uint32_t is_ratelimit_change(struct xlio_rate_limit_t &rate_limit); - int modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes); - virtual void dm_release_data(mem_buf_desc_t *buff) { NOT_IN_USE(buff); } - - virtual rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); - -#ifdef DEFINED_UTLS - virtual xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) - { - NOT_IN_USE(info); - return NULL; - } - virtual xlio_tir *tls_create_tir(bool cached) - { - NOT_IN_USE(cached); - return NULL; - } - virtual int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, - void *callback_arg) - { - NOT_IN_USE(tir); - NOT_IN_USE(info); - NOT_IN_USE(next_record_tcp_sn); - NOT_IN_USE(callback); - NOT_IN_USE(callback_arg); - return -1; - } - virtual void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) - { - NOT_IN_USE(info); - NOT_IN_USE(tis); - NOT_IN_USE(skip_static); - } - virtual void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) - { - NOT_IN_USE(tir); - NOT_IN_USE(info); - NOT_IN_USE(hw_resync_tcp_sn); - } - virtual void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) - { - NOT_IN_USE(tir); - NOT_IN_USE(buf); - NOT_IN_USE(lkey); - } - virtual void tls_release_tis(xlio_tis *tis) { NOT_IN_USE(tis); } - virtual void tls_release_tir(xlio_tir *tir) { NOT_IN_USE(tir); } - virtual void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) - { - NOT_IN_USE(tis); - NOT_IN_USE(addr); - NOT_IN_USE(len); - NOT_IN_USE(lkey); - NOT_IN_USE(first); - } -#endif /* DEFINED_UTLS */ -#if defined(DEFINED_DPCP) - virtual std::unique_ptr create_tis(uint32_t) const { return nullptr; }; -#endif /* defined(DEFINED_DPCP) */ - virtual void nvme_set_static_context(xlio_tis *tis, uint32_t config) - { - NOT_IN_USE(tis); - NOT_IN_USE(config); - }; - virtual void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) - { - NOT_IN_USE(tis); - NOT_IN_USE(tcp_seqno); - }; - virtual void post_nop_fence(void) {} - virtual void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) - { - NOT_IN_USE(tis); - NOT_IN_USE(addr); - NOT_IN_USE(len); - NOT_IN_USE(lkey); - NOT_IN_USE(first); - } - - virtual void reset_inflight_zc_buffers_ctx(void *ctx) { NOT_IN_USE(ctx); } - virtual bool credits_get(unsigned credits) - { - NOT_IN_USE(credits); - return true; - } - virtual void credits_return(unsigned credits) { NOT_IN_USE(credits); } - inline unsigned credits_calculate(xlio_ibv_send_wr *p_send_wqe) - { - /* Credit is a logical value which is opaque for users. Only qp_mgr can interpret the - * value and currently, one credit equals to one WQEBB in the SQ. - * - * Current method does best effort to predict how many WQEBBs will be used to send - * p_send_wqe in send_to_wire(). The predicted value may be higher than actual, but - * mustn't be lower. - * - * There are 3 branches in this order: - * 1. Full non-TSO packet inline - * 2. Non-TSO packet with scatter-gather elements and no inline data - * 3. TSO packet with inline headers - * - * Formulas details: - * 1. 
WQEBB is 64 bytes, the 1st WQEBB contains ctrl segment, eth segment and 18 bytes of - * inline data. So, we take the 1st WQEBB and number of WQEBBs for the packet minus 18 - * bytes. - * 2. Data segment for each scatter-gather element is 16 bytes. Therefore, WQEBB can hold - * up to 4 data segments. The 1st element fits into the 1st WQEBB after the eth segment. - * So, we take the 1st WQEBB and number of WQEBBs for scatter-gather elements minus 1. - * 3. Inline header starts from offset 46 in WQE (2 bytes before 16 bytes alignment). - * Decrease inline header size by 2 to align it to 16 bytes boundary at the right edge. - * This compensates data segments alignment. Add the 2 bytes back and length of - * scatter-gather elements. Take into account that 18 bytes goes to the 1st WQEBB and - * add the 1st WQEBB to the result. - */ - if (xlio_send_wr_opcode(*p_send_wqe) != XLIO_IBV_WR_TSO) { - if (p_send_wqe->num_sge == 1 && p_send_wqe->sg_list->length <= 204) { - return (p_send_wqe->sg_list->length + 63U - 18U) / 64U + 1U; - } else { - return (p_send_wqe->num_sge + 3U - 1U) / 4U + 1U; - } - } else { - return (((p_send_wqe->tso.hdr_sz + 15U - 2U) & ~15U) + 2U + p_send_wqe->num_sge * 16U - - 18U + 63U) / - 64U + - 1U; - } - } - -protected: - struct ibv_qp *m_qp; - uint64_t *m_rq_wqe_idx_to_wrid; - - ring_simple *m_p_ring; - uint8_t m_port_num; - ib_ctx_handler *m_p_ib_ctx_handler; - - struct ibv_qp_cap m_qp_cap; - uint32_t m_max_qp_wr; - - cq_mgr *m_p_cq_mgr_rx; - cq_mgr *m_p_cq_mgr_tx; - - uint32_t m_rx_num_wr; - uint32_t m_tx_num_wr; - - bool m_hw_dummy_send_support; - - uint32_t m_n_sysvar_rx_num_wr_to_post_recv; - const uint32_t m_n_sysvar_tx_num_wr_to_signal; - const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; - - // recv_wr - ibv_sge *m_ibv_rx_sg_array; - ibv_recv_wr *m_ibv_rx_wr_array; - uint32_t m_curr_rx_wr; - uintptr_t m_last_posted_rx_wr_id; // Remember so in case we flush RQ we know to wait until this - // WR_ID is received - - // send wr - uint32_t m_n_unsignaled_count; - - mem_buf_desc_t *m_p_prev_rx_desc_pushed; - - // generating packet IDs - uint16_t m_n_ip_id_base; - uint16_t m_n_ip_id_offset; - struct xlio_rate_limit_t m_rate_limit; - - int configure(struct qp_mgr_desc *desc); - virtual int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) = 0; - inline void set_unsignaled_count(void) - { - m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; - } - inline void dec_unsignaled_count(void) - { - if (m_n_unsignaled_count > 0) { - --m_n_unsignaled_count; - } - } - inline bool is_signal_requested_for_last_wqe() - { - return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; - } - - virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); - virtual cq_mgr *init_tx_cq_mgr(void); - - cq_mgr *handle_cq_initialization(uint32_t *num_wr, struct ibv_comp_channel *comp_event_channel, - bool is_rx); - - virtual int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits); - virtual bool is_completion_need() { return !m_n_unsignaled_count; } - virtual bool is_rq_empty() const { return false; } -}; - -class qp_mgr_eth : public qp_mgr { -public: - qp_mgr_eth(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, - bool call_configure = true) - : qp_mgr(desc, tx_num_wr) - , m_vlan(vlan) - { - if (call_configure && configure(desc)) { - throw_xlio_exception("failed creating qp"); - } - }; - - virtual ~qp_mgr_eth() {} - - virtual void modify_qp_to_ready_state(); - 
virtual uint16_t get_partiton() const { return m_vlan; };
-
-protected:
-    virtual int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr);
-
-private:
-    const uint16_t m_vlan;
-};
-
-#if defined(DEFINED_UTLS) || defined(DEFINED_DPCP)
-class xlio_tis : public xlio_ti {
-public:
-    xlio_tis(std::unique_ptr<dpcp::tis> _tis, xlio_ti::ti_type type)
-        : xlio_ti(type)
-        , m_dek()
-        , m_p_tis(std::move(_tis))
-        , m_tisn(0U)
-        , m_dek_id(0U)
-    {
-        dpcp::status ret = m_p_tis->get_tisn(m_tisn);
-        assert(ret == dpcp::DPCP_OK);
-        (void)ret;
-    }
-
-    ~xlio_tis() = default;
-
-    inline std::unique_ptr<dpcp::dek> release_dek()
-    {
-        assert(m_ref == 0);
-        m_released = false;
-        return std::move(m_dek);
-    }
-
-    inline uint32_t get_tisn(void) noexcept { return m_tisn; }
-
-    inline void assign_dek(std::unique_ptr<dpcp::dek> &&dek_ptr)
-    {
-        m_dek = std::move(dek_ptr);
-        m_dek_id = m_dek->get_key_id();
-    }
-
-    inline uint32_t get_dek_id(void) noexcept { return m_dek_id; }
-
-private:
-    std::unique_ptr<dpcp::dek> m_dek;
-    std::unique_ptr<dpcp::tis> m_p_tis;
-    uint32_t m_tisn;
-    uint32_t m_dek_id;
-};
-
-class xlio_tir : public xlio_ti {
-public:
-    xlio_tir(dpcp::tir *_tir, xlio_ti::ti_type type)
-        : xlio_ti(type)
-    {
-        m_p_tir.reset(_tir);
-        m_dek = NULL;
-        m_tirn = 0;
-        m_dek_id = 0;
-
-        /* Cache the tir number. Mustn't fail for a valid TIR object. */
-        m_tirn = m_p_tir->get_tirn();
-        assert(m_tirn != 0);
-    }
-
-    ~xlio_tir() = default;
-
-    inline std::unique_ptr<dpcp::dek> release_dek()
-    {
-        assert(m_ref == 0);
-        m_released = false;
-        return std::move(m_dek);
-    }
-
-    inline uint32_t get_tirn(void) { return m_tirn; }
-
-    inline void assign_dek(void *dek_ptr)
-    {
-        m_dek.reset(reinterpret_cast<dpcp::dek *>(dek_ptr));
-        m_dek_id = m_dek->get_key_id();
-    }
-
-    inline uint32_t get_dek_id(void) { return m_dek_id; }
-
-    std::unique_ptr<dpcp::tir> m_p_tir;
-
-private:
-    std::unique_ptr<dpcp::dek> m_dek;
-    uint32_t m_tirn;
-    uint32_t m_dek_id;
-};
-#else /* DEFINED_UTLS or DEFINED_DPCP */
-/* A stub classes to compile without uTLS support. */
-class xlio_tis : public xlio_ti {
-public:
-    inline uint32_t get_tisn(void) noexcept { return 0; }
-};
-class xlio_tir : public xlio_ti {
-};
-#endif /* DEFINED_UTLS or DEFINED_DPCP */
-#endif
diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h
deleted file mode 100644
index 1bc9a20bb..000000000
--- a/src/core/dev/qp_mgr_eth_mlx5.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- *   copyright notice, this list of conditions and the following
- *   disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- *   copyright notice, this list of conditions and the following
- *   disclaimer in the documentation and/or other materials
- *   provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef QP_MGR_ETH_MLX5_H -#define QP_MGR_ETH_MLX5_H - -#include "qp_mgr.h" -#include "util/sg_array.h" -#include "dev/dm_mgr.h" -#include -#include - -#if defined(DEFINED_DIRECT_VERBS) - -#define qp_logpanic __log_info_panic -#define qp_logerr __log_info_err -#define qp_logwarn __log_info_warn -#define qp_loginfo __log_info_info -#define qp_logdbg __log_info_dbg -#define qp_logfunc __log_info_func -#define qp_logfuncall __log_info_funcall - -/* WQE properties description. */ -struct sq_wqe_prop { - /* A buffer held by the WQE. This is NULL for control WQEs. */ - mem_buf_desc_t *buf; - /* Number of credits (usually number of WQEBBs). */ - unsigned credits; - /* Transport interface (TIS/TIR) current WQE holds reference to. */ - xlio_ti *ti; - struct sq_wqe_prop *next; -}; -typedef struct sq_wqe_prop sq_wqe_prop; - -class qp_mgr_eth_mlx5 : public qp_mgr_eth { - friend class cq_mgr_mlx5; - -public: - qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, - bool call_configure = true); - virtual ~qp_mgr_eth_mlx5(); - void up() override; - void down() override; - void post_recv_buffer( - mem_buf_desc_t *p_mem_buf_desc) override; // Post for receive single mem_buf_desc - xlio_ib_mlx5_qp_t m_mlx5_qp; - -#ifdef DEFINED_UTLS - xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; - xlio_tir *tls_create_tir(bool cached) override; - int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, - xlio_comp_cb_t callback, void *callback_arg) override; - void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override; - void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t hw_resync_tcp_sn) override; - void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override; - void tls_release_tis(xlio_tis *tis) override; - void tls_release_tir(xlio_tir *tir) override; - void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) override; -#endif /* DEFINED_UTLS */ -#ifdef DEFINED_DPCP -#define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) -#define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP) - std::unique_ptr create_tis(uint32_t flags) const override; - void nvme_set_static_context(xlio_tis *tis, uint32_t config) override; - void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) override; -#else -#define DPCP_TIS_FLAGS (0U) -#define DPCP_TIS_NVME_FLAG (0U) -#endif /* DEFINED_DPCP */ - /* Get a memory inside a wqebb at a wqebb_num offset from the m_sq_wqe_hot and account for - * m_sq_wqe_counter wrap-around. Use offset_in_wqebb to for the internal address. 
Use the - * template parameter to cast the resulting address to the required pointer type */ - template - constexpr inline T wqebb_get(size_t wqebb_num, size_t offset_in_wqebb = 0U) - { - return reinterpret_cast( - reinterpret_cast( - &(*m_sq_wqes)[(m_sq_wqe_counter + wqebb_num) & (m_tx_num_wr - 1)]) + - offset_in_wqebb); - } - - void post_nop_fence(void) override; - void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override; - -#if defined(DEFINED_UTLS) - std::unique_ptr get_new_tls_dek(const void *key, uint32_t key_size_bytes); - std::unique_ptr get_tls_dek(const void *key, uint32_t key_size_bytes); - void put_tls_dek(std::unique_ptr &&dek_obj); -#endif - - void reset_inflight_zc_buffers_ctx(void *ctx) override; - // TODO Make credits API inline. - bool credits_get(unsigned credits) override - { - if (m_sq_free_credits >= credits) { - m_sq_free_credits -= credits; - return true; - } - return false; - } - void credits_return(unsigned credits) override { m_sq_free_credits += credits; } - -protected: - void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); - void trigger_completion_for_all_sent_packets() override; - bool init_rx_cq_mgr_prepare(); - void init_qp(); - void init_device_memory(); - cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; - cq_mgr *init_tx_cq_mgr(void) override; - - void put_tls_tir_in_cache(xlio_tir *tir); - void put_tls_tis_in_cache(xlio_tis *tis); - void ti_released(xlio_ti *ti); - - virtual bool is_rq_empty() const override { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } - - inline bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev) - { - unsigned p_i = p - m_sq_wqe_idx_to_prop; - unsigned prev_i = prev - m_sq_wqe_idx_to_prop; - return (p_i != m_sq_wqe_prop_last_signalled) && - ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr < - (m_tx_num_wr + prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr); - } - - sq_wqe_prop *m_sq_wqe_idx_to_prop; - sq_wqe_prop *m_sq_wqe_prop_last; - unsigned m_sq_wqe_prop_last_signalled; - unsigned m_sq_free_credits; - uint64_t m_rq_wqe_counter; - -private: - void update_next_wqe_hot(); - - bool is_completion_need() override - { - return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); - }; - void dm_release_data(mem_buf_desc_t *buff) override { m_dm_mgr.release_data(buff); } - - int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, - xlio_tis *tis, unsigned credits) override; - inline int fill_wqe(xlio_ibv_send_wr *p_send_wqe); - inline void store_current_wqe_prop(mem_buf_desc_t *wr_id, unsigned credits, xlio_ti *ti); - void destroy_tis_cache(void); -#if defined(DEFINED_UTLS) - inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, - const struct xlio_tls_info *info, uint32_t key_id, - uint32_t resync_tcp_sn); - inline void tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, - uint32_t tis_tir_number, uint32_t key_id, - uint32_t resync_tcp_sn, bool fence, bool is_tx); - inline void tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params, - uint32_t tis_tir_number, uint32_t next_record_tcp_sn); - inline void tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn, bool fence, bool is_tx); - inline void tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, uint32_t lkey); - -protected: - dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir); - virtual 
dpcp::tir *create_tir(bool is_tls = false) - { - NOT_IN_USE(is_tls); - return NULL; - } - -private: -#endif /* DEFINED_UTLS */ - inline int fill_wqe_send(xlio_ibv_send_wr *pswr); - inline int fill_wqe_lso(xlio_ibv_send_wr *pswr); - inline void ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top = 0, - bool skip_comp = false); - inline int fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, - int max_inline_len, int inline_len); - - struct mlx5_eth_wqe (*m_sq_wqes)[]; - struct mlx5_eth_wqe *m_sq_wqe_hot; - uint8_t *m_sq_wqes_end; - enum { MLX5_DB_METHOD_BF, MLX5_DB_METHOD_DB } m_db_method; - - int m_sq_wqe_hot_index; - uint16_t m_sq_wqe_counter; - - bool m_b_fence_needed; - - bool m_dm_enabled; - dm_mgr m_dm_mgr; - /* - * TIS cache. Protected by ring tx lock. - * TODO Move to ring. - */ - std::vector m_tls_tis_cache; - std::vector m_tls_tir_cache; - -#if defined(DEFINED_UTLS) - std::list> m_tls_dek_get_cache; - std::list> m_tls_dek_put_cache; -#endif -}; -#endif // defined(DEFINED_DIRECT_VERBS) -#endif // QP_MGR_ETH_MLX5_H diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp deleted file mode 100644 index 3c4b2cb5a..000000000 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#include "qp_mgr_eth_mlx5_dpcp.h"
-
-#if defined(DEFINED_DPCP)
-
-#include
-#include "ring_simple.h"
-#include "rfs_rule_dpcp.h"
-#include "cq_mgr_mlx5_strq.h"
-
-#define MODULE_NAME "qp_mgr_eth_mlx5_dpcp"
-
-qp_mgr_eth_mlx5_dpcp::qp_mgr_eth_mlx5_dpcp(struct qp_mgr_desc *desc, uint32_t tx_num_wr,
-                                           uint16_t vlan)
-    : qp_mgr_eth_mlx5(desc, tx_num_wr, vlan, false)
-{
-    if (configure(desc)) {
-        throw_xlio_exception("Failed creating qp_mgr_eth_mlx5_dpcp");
-    }
-
-    if (!configure_rq_dpcp()) {
-        throw_xlio_exception("Failed to create qp_mgr_eth_mlx5_dpcp");
-    }
-}
-
-bool qp_mgr_eth_mlx5_dpcp::configure_rq_dpcp()
-{
-    qp_logdbg("Creating RQ of transport type '%s' on ibv device '%s' [%p] on port %d",
-              priv_xlio_transport_type_str(m_p_ring->get_transport_type()),
-              m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num);
-
-    m_qp_cap.max_recv_wr = m_rx_num_wr;
-
-    qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_qp_cap.max_recv_wr,
-              m_qp_cap.max_recv_sge);
-
-    xlio_ib_mlx5_cq_t mlx5_cq;
-    memset(&mlx5_cq, 0, sizeof(mlx5_cq));
-    xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq);
-
-    qp_logdbg("Configuring dpcp RQ, cq-rx: %p, cqn-rx: %u", m_p_cq_mgr_rx,
-              static_cast<uint32_t>(mlx5_cq.cq_num));
-
-    if (safe_mce_sys().enable_striding_rq) {
-        m_qp_cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment.
-        _strq_wqe_reserved_seg = 1U;
-
-        delete[] m_ibv_rx_sg_array;
-        m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_qp_cap.max_recv_sge];
-        for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) {
-            m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx * m_qp_cap.max_recv_sge];
-            m_ibv_rx_wr_array[wr_idx].num_sge = m_qp_cap.max_recv_sge;
-            memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge));
-            m_ibv_rx_wr_array[wr_idx].sg_list[0].length =
-                1U; // To bypass a check inside xlio_ib_mlx5_post_recv.
-        }
-    }
-
-    // Create the QP
-    if (!prepare_rq(mlx5_cq.cq_num)) {
-        return false;
-    }
-
-    return true;
-}
-
-bool qp_mgr_eth_mlx5_dpcp::prepare_rq(uint32_t cqn)
-{
-    qp_logdbg("");
-
-    dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter();
-    if (!dpcp_adapter) {
-        qp_logerr("Failed to get dpcp::adapter for prepare_rq");
-        return false;
-    }
-
-    // user_index Unused.
-    dpcp::rq_attr rqattrs;
-    memset(&rqattrs, 0, sizeof(rqattrs));
-    rqattrs.cqn = cqn;
-    rqattrs.wqe_num = m_qp_cap.max_recv_wr;
-    rqattrs.wqe_sz = m_qp_cap.max_recv_sge;
-
-    if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) {
-        qp_logdbg("Enabled RTC timestamp format for RQ");
-        rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME;
-    }
-
-    std::unique_ptr<dpcp::basic_rq> new_rq;
-    dpcp::status rc = dpcp::DPCP_OK;
-
-    if (safe_mce_sys().enable_striding_rq) {
-        rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes;
-        rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe;
-
-        // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type).
-        // In this case the WQE minimum size is 2 * 16, and the first segment is reserved.
- rqattrs.wqe_sz = m_qp_cap.max_recv_sge * 16U; - - dpcp::striding_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); - } else { - dpcp::regular_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); - } - - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); - return false; - } - - memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); - if (!store_rq_mlx5_params(*new_rq)) { - qp_logerr( - "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, - static_cast(rc), new_rq.get(), cqn); - return false; - } - - _rq = std::move(new_rq); - - // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. - // At RDY state without a TIR, Work Requests can be submitted to the RQ. - modify_rq_to_ready_state(); - - qp_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_mlx5_qp.rqn, cqn); - - return true; -} - -bool qp_mgr_eth_mlx5_dpcp::store_rq_mlx5_params(dpcp::basic_rq &new_rq) -{ - uint32_t *dbrec_tmp = nullptr; - dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } - m_mlx5_qp.rq.dbrec = dbrec_tmp; - - rc = new_rq.get_wq_buf(m_mlx5_qp.rq.buf); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", - static_cast(rc), &new_rq); - return false; - } - - rc = new_rq.get_id(m_mlx5_qp.rqn); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } - - new_rq.get_wqe_num(m_mlx5_qp.rq.wqe_cnt); - new_rq.get_wq_stride_sz(m_mlx5_qp.rq.stride); - if (safe_mce_sys().enable_striding_rq) { - m_mlx5_qp.rq.stride /= 16U; - } - - m_mlx5_qp.rq.wqe_shift = ilog_2(m_mlx5_qp.rq.stride); - m_mlx5_qp.rq.head = 0; - m_mlx5_qp.rq.tail = 0; - m_mlx5_qp.cap.max_recv_wr = m_qp_cap.max_recv_wr; - m_mlx5_qp.cap.max_recv_sge = m_qp_cap.max_recv_sge; - m_mlx5_qp.tirn = 0U; - - return true; -} - -void qp_mgr_eth_mlx5_dpcp::init_tir_rq() -{ - if (_rq && !store_rq_mlx5_params(*_rq)) { - qp_logpanic("Failed to retrieve DPCP RQ parameters (errno=%d %m)", errno); - } - - _tir.reset(create_tir()); - if (!_tir) { - qp_logpanic("TIR creation for qp_mgr_eth_mlx5_dpcp failed (errno=%d %m)", errno); - } -} - -void qp_mgr_eth_mlx5_dpcp::up() -{ - qp_mgr_eth_mlx5::init_qp(); - init_tir_rq(); - qp_mgr::up(); - init_device_memory(); -} - -void qp_mgr_eth_mlx5_dpcp::down() -{ - _tir.reset(nullptr); - - qp_mgr_eth_mlx5::down(); -} - -rfs_rule *qp_mgr_eth_mlx5_dpcp::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) -{ - // TODO Remove copypaste. 
-#ifdef DEFINED_UTLS - if (tir_ext && m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { - std::unique_ptr new_rule(new rfs_rule_dpcp()); - if (new_rule->create(attrs, *xlio_tir_to_dpcp_tir(tir_ext), - *m_p_ib_ctx_handler->get_dpcp_adapter())) { - return new_rule.release(); - } - } else -#endif /* DEFINED_UTLS */ - if (_tir && m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { - std::unique_ptr new_rule(new rfs_rule_dpcp()); - if (new_rule->create(attrs, *_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { - return new_rule.release(); - } - } - - NOT_IN_USE(tir_ext); - return nullptr; -} - -void qp_mgr_eth_mlx5_dpcp::modify_qp_to_ready_state() -{ - qp_mgr_eth_mlx5::modify_qp_to_ready_state(); - modify_rq_to_ready_state(); -} - -void qp_mgr_eth_mlx5_dpcp::modify_qp_to_error_state() -{ - m_p_cq_mgr_rx->clean_cq(); - - qp_mgr_eth_mlx5::modify_qp_to_error_state(); - - dpcp::status rc = _rq->modify_state(dpcp::RQ_ERR); - - /* During plugout theres is possibility that kernel - * remove device resources before working process complete - * removing process. As a result ibv api function can - * return EIO=5 errno code. - */ - if (dpcp::DPCP_OK != rc && errno != EIO) { - qp_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } -} - -void qp_mgr_eth_mlx5_dpcp::modify_rq_to_ready_state() -{ - dpcp::status rc = _rq->modify_state(dpcp::RQ_RDY); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } -} - -cq_mgr *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - if (unlikely(!safe_mce_sys().enable_striding_rq)) { - return qp_mgr_eth_mlx5::init_rx_cq_mgr(p_rx_comp_event_channel); - } - - return (!init_rx_cq_mgr_prepare() - ? 
nullptr - : new cq_mgr_mlx5_strq(m_p_ring, m_p_ib_ctx_handler, - safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, - safe_mce_sys().strq_stride_size_bytes, - safe_mce_sys().strq_stride_num_per_rwqe, - p_rx_comp_event_channel, true)); -} - -void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - uint32_t index = (m_curr_rx_wr * m_qp_cap.max_recv_sge) + _strq_wqe_reserved_seg; - m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; - - post_recv_buffer_rq(p_mem_buf_desc); -} - -dpcp::tir *qp_mgr_eth_mlx5_dpcp::create_tir(bool is_tls /*=false*/) -{ - dpcp::tir *tir_obj = nullptr; - dpcp::status status = dpcp::DPCP_OK; - dpcp::tir::attr tir_attr; - - memset(&tir_attr, 0, sizeof(tir_attr)); - tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; - tir_attr.inline_rqn = m_mlx5_qp.rqn; - tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); - - if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { - tir_attr.flags |= dpcp::TIR_ATTR_LRO; - tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; - tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support - tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; - } - - if (is_tls) { - tir_attr.flags |= dpcp::TIR_ATTR_TLS; - tir_attr.tls_en = 1; - } - - status = m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); - - if (dpcp::DPCP_OK != status) { - qp_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); - return nullptr; - } - - qp_logdbg("TIR: %p created", tir_obj); - - return tir_obj; -} - -#endif // defined(DEFINED_DPCP) diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index c5088a475..68693b3d0 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -32,7 +32,6 @@ #include "utils/bullseye.h" #include "dev/rfs.h" -#include "dev/qp_mgr.h" #include "dev/ring_simple.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" @@ -40,6 +39,14 @@ #define MODULE_NAME "rfs" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + /**/ /** inlining functions can only help if they are implemented before their usage **/ /**/ @@ -68,14 +75,11 @@ inline void rfs::filter_keep_attached(rule_filter_map_t::iterator &filter_iter) return; } - // save all ibv_flow rules only for filter - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - filter_iter->second.rfs_rule_vec.push_back(m_attach_flow_data_vector[i]->rfs_flow); - rfs_logdbg("filter_keep_attached copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Index: %zu, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), i, - m_attach_flow_data_vector[i]->rfs_flow, filter_iter->second.counter); - } + // save ibv_flow rule only for filter + filter_iter->second.rfs_rule_holder = m_rfs_flow; + rfs_logdbg( + "filter_keep_attached copying rfs_flow, Tag: %" PRIu32 ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_rfs_flow, filter_iter->second.counter); } inline void rfs::prepare_filter_detach(int &filter_counter, bool decrease_counter) @@ -103,45 +107,38 @@ inline void rfs::prepare_filter_detach(int &filter_counter, bool decrease_counte 
filter_counter = filter_iter->second.counter; // if we do not need to destroy rfs_rule, still mark this rfs as detached m_b_tmp_is_attached = (filter_counter == 0) && m_b_tmp_is_attached; - if (filter_counter != 0 || filter_iter->second.rfs_rule_vec.empty()) { + if (filter_counter != 0) { return; } BULLSEYE_EXCLUDE_BLOCK_START - if (m_attach_flow_data_vector.size() != filter_iter->second.rfs_rule_vec.size()) { - // sanity check for having the same number of qps on all rfs objects - rfs_logerr("all rfs objects in the ring should have the same number of elements"); + if (m_rfs_flow && m_rfs_flow != filter_iter->second.rfs_rule_holder) { + rfs_logerr("our assumption that there should be only one rule for filter group is wrong"); + } else if (filter_iter->second.rfs_rule_holder) { + m_rfs_flow = filter_iter->second.rfs_rule_holder; + rfs_logdbg("prepare_filter_detach copying rfs_flow, Tag: %" PRIu32 + ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_rfs_flow, + filter_iter->second.counter); } BULLSEYE_EXCLUDE_BLOCK_END - - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - BULLSEYE_EXCLUDE_BLOCK_START - if (m_attach_flow_data_vector[i]->rfs_flow && - m_attach_flow_data_vector[i]->rfs_flow != filter_iter->second.rfs_rule_vec[i]) { - rfs_logerr( - "our assumption that there should be only one rule for filter group is wrong"); - } else if (filter_iter->second.rfs_rule_vec[i]) { - m_attach_flow_data_vector[i]->rfs_flow = filter_iter->second.rfs_rule_vec[i]; - rfs_logdbg("prepare_filter_detach copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Index: %zu, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), i, - m_attach_flow_data_vector[i]->rfs_flow, filter_iter->second.counter); - } - BULLSEYE_EXCLUDE_BLOCK_END - } } rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter /*= NULL*/, uint32_t flow_tag_id /*=0*/) : m_flow_tuple(rule_filter ? 
rule_filter->m_flow_tuple : *flow_spec_5t) , m_p_ring(p_ring) + , m_p_ring_simple(dynamic_cast(p_ring)) , m_p_rule_filter(rule_filter) , m_n_sinks_list_entries(0) , m_n_sinks_list_max_length(RFS_SINKS_LIST_DEFAULT_LEN) , m_flow_tag_id(flow_tag_id) , m_b_tmp_is_attached(false) { - m_sinks_list = new pkt_rcvr_sink *[m_n_sinks_list_max_length]; + memset(&m_match_value, 0, sizeof(m_match_value)); + memset(&m_match_mask, 0, sizeof(m_match_mask)); + + m_sinks_list = new sockinfo *[m_n_sinks_list_max_length]; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app->type != APP_NONE && g_p_app->get_worker_id() >= 0) { @@ -150,12 +147,12 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil #endif BULLSEYE_EXCLUDE_BLOCK_START - if (m_sinks_list == NULL) { + if (!m_sinks_list) { rfs_logpanic("sinks list allocation failed!"); } BULLSEYE_EXCLUDE_BLOCK_END - memset(m_sinks_list, 0, sizeof(pkt_rcvr_sink *) * m_n_sinks_list_max_length); + memset(m_sinks_list, 0, sizeof(sockinfo *) * m_n_sinks_list_max_length); } rfs::~rfs() @@ -179,23 +176,12 @@ rfs::~rfs() if (m_p_rule_filter) { delete m_p_rule_filter; - m_p_rule_filter = NULL; + m_p_rule_filter = nullptr; } delete[] m_sinks_list; - - while (m_attach_flow_data_vector.size() > 0) { - attach_flow_data_t *flow_data = m_attach_flow_data_vector.back(); - if (reinterpret_cast(&flow_data->ibv_flow_attr)->eth.val.ether_type == - htons(ETH_P_IP)) { - delete reinterpret_cast(flow_data); - } else { - delete reinterpret_cast(flow_data); - } - m_attach_flow_data_vector.pop_back(); - } } -bool rfs::add_sink(pkt_rcvr_sink *p_sink) +bool rfs::add_sink(sockinfo *p_sink) { uint32_t i; @@ -219,16 +205,16 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) if (m_n_sinks_list_entries == m_n_sinks_list_max_length) { // Sinks list array is full // Reallocate a new array with double size uint32_t tmp_sinks_list_length = 2 * m_n_sinks_list_max_length; - pkt_rcvr_sink **tmp_sinks_list = new pkt_rcvr_sink *[tmp_sinks_list_length]; + sockinfo **tmp_sinks_list = new sockinfo *[tmp_sinks_list_length]; BULLSEYE_EXCLUDE_BLOCK_START - if (tmp_sinks_list == NULL) { + if (!tmp_sinks_list) { rfs_logerr("sinks list allocation failed!"); return false; } BULLSEYE_EXCLUDE_BLOCK_END - memcpy(tmp_sinks_list, m_sinks_list, sizeof(pkt_rcvr_sink *) * m_n_sinks_list_max_length); + memcpy(tmp_sinks_list, m_sinks_list, sizeof(sockinfo *) * m_n_sinks_list_max_length); delete[] m_sinks_list; m_sinks_list = tmp_sinks_list; m_n_sinks_list_max_length = tmp_sinks_list_length; @@ -241,7 +227,7 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) return true; } -bool rfs::del_sink(pkt_rcvr_sink *p_sink) +bool rfs::del_sink(sockinfo *p_sink) { uint32_t i; @@ -256,7 +242,7 @@ bool rfs::del_sink(pkt_rcvr_sink *p_sink) for (/*continue i*/; i < (m_n_sinks_list_entries - 1); ++i) { m_sinks_list[i] = m_sinks_list[i + 1]; } - m_sinks_list[i] = NULL; + m_sinks_list[i] = nullptr; m_n_sinks_list_entries--; rfs_logdbg("Removed sink (%p), num of sinks is now: %d", p_sink, @@ -272,7 +258,7 @@ bool rfs::del_sink(pkt_rcvr_sink *p_sink) return false; } -bool rfs::attach_flow(pkt_rcvr_sink *sink) +bool rfs::attach_flow(sockinfo *sink) { bool ret; int filter_counter = 1; @@ -289,7 +275,7 @@ bool rfs::attach_flow(pkt_rcvr_sink *sink) } else { rfs_logdbg("rfs: Joining existing flow"); #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app->type != APP_NONE && g_p_app->add_second_4t_rule) { + if (g_p_app->type != APP_NONE && m_p_ring->is_simple() && g_p_app->add_second_4t_rule) { // This is 
second 4 tuple rule for the same worker (when number // of workers is not power of two) create_flow(); @@ -308,7 +294,7 @@ bool rfs::attach_flow(pkt_rcvr_sink *sink) return ret; } -bool rfs::detach_flow(pkt_rcvr_sink *sink) +bool rfs::detach_flow(sockinfo *sink) { bool ret = false; int filter_counter = 0; @@ -333,60 +319,55 @@ bool rfs::detach_flow(pkt_rcvr_sink *sink) #ifdef DEFINED_UTLS -template -rfs_rule *create_rule_T(xlio_tir *tir, const flow_tuple &flow_spec, attach_flow_data_t *iter, - bool is5T) +rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) { - auto *p_attr = - reinterpret_cast(&iter->ibv_flow_attr); - - if (unlikely(p_attr->eth.type != XLIO_IBV_FLOW_SPEC_ETH)) { - // We support only ETH rules for now - return NULL; + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); } - auto flow_attr(*p_attr); - if (!is5T) { + auto *hqrx = m_p_ring_simple->m_hqrx; + + dpcp::match_params match_value_tmp; + dpcp::match_params match_mask_tmp; + memcpy(&match_value_tmp, &m_match_value, sizeof(m_match_value)); + memcpy(&match_mask_tmp, &m_match_mask, sizeof(m_match_mask)); + + if (!m_flow_tuple.is_5_tuple()) { // For UTLS, We need the most specific 5T rule (in case the current rule is 3T). - ibv_flow_spec_set_single_ip(flow_attr.ip.val.src_ip, flow_attr.ip.mask.src_ip, - flow_spec.get_src_ip()); - flow_attr.tcp_udp.val.src_port = flow_spec.get_src_port(); - flow_attr.tcp_udp.mask.src_port = FS_MASK_ON_16; - } - // The highest priority to override TCP rule - flow_attr.attr.priority = 0; - return iter->p_qp_mgr->create_rfs_rule(flow_attr.attr, tir); -} -rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) -{ - if (m_attach_flow_data_vector.size() == 1) { - if (m_flow_tuple.get_family() == AF_INET) { - return create_rule_T( - tir, flow_spec, m_attach_flow_data_vector[0], m_flow_tuple.is_5_tuple()); + if (match_value_tmp.ethertype == ETH_P_IP) { + match_mask_tmp.src.ipv4 = flow_spec.get_src_ip().is_anyaddr() ? 0U : 0xFFFFFFFFU; + match_value_tmp.src.ipv4 = ntohl(flow_spec.get_src_ip().get_in4_addr().s_addr); + } else { + memset(match_mask_tmp.src.ipv6, flow_spec.get_src_ip().is_anyaddr() ? 
0U : 0xFFU, + sizeof(match_mask_tmp.src.ipv6)); + memcpy(match_value_tmp.src.ipv6, &flow_spec.get_src_ip().get_in6_addr(), + sizeof(match_value_tmp.src.ipv6)); } - return create_rule_T( - tir, flow_spec, m_attach_flow_data_vector[0], m_flow_tuple.is_5_tuple()); + match_mask_tmp.src_port = 0xFFFFU; + match_value_tmp.src_port = ntohs(flow_spec.get_src_port()); } - return nullptr; + // The highest priority to override TCP rule + return hqrx->create_rfs_rule(match_value_tmp, match_mask_tmp, 0, m_flow_tag_id, tir); } #endif /* DEFINED_UTLS */ bool rfs::create_flow() { - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - attach_flow_data_t *iter = m_attach_flow_data_vector[i]; - iter->rfs_flow = iter->p_qp_mgr->create_rfs_rule(iter->ibv_flow_attr, NULL); - if (!iter->rfs_flow) { - rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 - ", errno: %d - %m", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), iter->ibv_flow_attr.priority, - errno); // TODO ALEXR - Add info about QP, spec into log msg - return false; - } + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); + } + + m_rfs_flow = m_p_ring_simple->m_hqrx->create_rfs_rule(m_match_value, m_match_mask, m_priority, + m_flow_tag_id, nullptr); + if (!m_rfs_flow) { + rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 + ", errno: %d - %m", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_priority, errno); + return false; } m_b_tmp_is_attached = true; @@ -398,19 +379,14 @@ bool rfs::create_flow() bool rfs::destroy_flow() { - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - attach_flow_data_t *iter = m_attach_flow_data_vector[i]; - if (unlikely(!iter->rfs_flow)) { - rfs_logdbg( - "Destroy RFS flow failed, RFS flow was not created. " - "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 - ", Flow: %s, Priority: %" PRIu16, - m_flow_tag_id, m_flow_tuple.to_str().c_str(), - iter->ibv_flow_attr.priority); // TODO ALEXR - Add info about QP, spec into log msg - } else { - delete iter->rfs_flow; - iter->rfs_flow = nullptr; - } + if (unlikely(!m_rfs_flow)) { + rfs_logdbg("Destroy RFS flow failed, RFS flow was not created. " + "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 + ", Flow: %s, Priority: %" PRIu16, + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_priority); + } else { + delete m_rfs_flow; + m_rfs_flow = nullptr; } m_b_tmp_is_attached = false; @@ -419,3 +395,46 @@ bool rfs::destroy_flow() return true; } + +void rfs::prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip) +{ + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); + } + + m_match_value.vlan_id = m_p_ring_simple->m_hqrx->get_vlan() & VLAN_VID_MASK; + m_match_mask.vlan_id = (m_p_ring_simple->m_hqrx->get_vlan() ? VLAN_VID_MASK : 0); + + bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); + if (is_ipv4) { + m_match_mask.dst.ipv4 = dst_ip.is_anyaddr() ? 0U : 0xFFFFFFFFU; + m_match_value.dst.ipv4 = ntohl(dst_ip.get_in4_addr().s_addr); + m_match_mask.src.ipv4 = src_ip.is_anyaddr() ? 0U : 0xFFFFFFFFU; + m_match_value.src.ipv4 = ntohl(src_ip.get_in4_addr().s_addr); + m_match_mask.ip_version = 0xF; + m_match_value.ip_version = 4U; + m_match_mask.ethertype = 0xFFFFU; + m_match_value.ethertype = ETH_P_IP; + } else { + memset(m_match_mask.dst.ipv6, dst_ip.is_anyaddr() ? 
0U : 0xFFU, + sizeof(m_match_mask.dst.ipv6)); + memcpy(m_match_value.dst.ipv6, &dst_ip.get_in6_addr(), sizeof(m_match_value.dst.ipv6)); + memset(m_match_mask.src.ipv6, src_ip.is_anyaddr() ? 0U : 0xFFU, + sizeof(m_match_mask.src.ipv6)); + memcpy(m_match_value.src.ipv6, &src_ip.get_in6_addr(), sizeof(m_match_value.src.ipv6)); + m_match_mask.ip_version = 0xF; + m_match_value.ip_version = 6U; + m_match_mask.ethertype = 0xFFFFU; + m_match_value.ethertype = ETH_P_IPV6; + } +} + +void rfs::prepare_flow_spec_tcp_udp() +{ + m_match_mask.dst_port = (m_flow_tuple.get_dst_port() ? 0xFFFFU : 0U); + m_match_value.dst_port = ntohs(m_flow_tuple.get_dst_port()); + m_match_mask.src_port = (m_flow_tuple.get_src_port() ? 0xFFFFU : 0U); + m_match_value.src_port = ntohs(m_flow_tuple.get_src_port()); + m_match_mask.protocol = 0xFF; + m_match_value.protocol = (m_flow_tuple.get_protocol() == PROTO_TCP ? IPPROTO_TCP : IPPROTO_UDP); +} diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index a5cf28efe..316f97b62 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -34,7 +34,7 @@ #define RFS_H #include - +#include #include "ib/base/verbs_extra.h" #include "util/vtypes.h" #include "dev/ring_simple.h" @@ -43,8 +43,8 @@ #define RFS_SINKS_LIST_DEFAULT_LEN 32 -class qp_mgr; -class pkt_rcvr_sink; +class hw_queue_rx; +class sockinfo; /* * Priority description: @@ -56,58 +56,6 @@ class pkt_rcvr_sink; * shadow for socket reuse feature. */ -/* ETHERNET - */ - -typedef struct ibv_flow_attr_eth { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; -} ibv_flow_attr_eth; - -template struct attach_flow_data_eth_ip_tcp_udp_t { - rfs_rule *rfs_flow; - qp_mgr *p_qp_mgr; - struct ibv_flow_attr_eth_ip_tcp_udp : public ibv_flow_attr_eth { - T ip; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; // must be the last as struct can be used without it - - ibv_flow_attr_eth_ip_tcp_udp(uint8_t port) - { - memset(this, 0, sizeof(*this)); - attr.size = sizeof(T) - sizeof(flow_tag); - attr.num_of_specs = 3; - attr.type = XLIO_IBV_FLOW_ATTR_NORMAL; - attr.priority = 2; // almost highest priority, 1 is used for 5-tuple later - attr.port = port; - } - inline void add_flow_tag_spec(void) - { - attr.num_of_specs++; - attr.size += sizeof(flow_tag); - } - } ibv_flow_attr; - attach_flow_data_eth_ip_tcp_udp_t(qp_mgr *qp_mgr) - : rfs_flow(NULL) - , p_qp_mgr(qp_mgr) - , ibv_flow_attr(qp_mgr->get_port_num()) - { - } -}; - -typedef attach_flow_data_eth_ip_tcp_udp_t - attach_flow_data_eth_ipv4_tcp_udp_t; -typedef attach_flow_data_eth_ip_tcp_udp_t - attach_flow_data_eth_ipv6_tcp_udp_t; - -typedef struct attach_flow_data_t { - rfs_rule *rfs_flow; - qp_mgr *p_qp_mgr; - xlio_ibv_flow_attr ibv_flow_attr; -} attach_flow_data_t; - -typedef std::vector attach_flow_data_vector_t; - class rfs_rule_filter { public: rfs_rule_filter(rule_filter_map_t &map, const sock_addr &key, flow_tuple &flow_tuple) @@ -131,21 +79,21 @@ class rfs_rule_filter { class rfs { public: - rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = nullptr, uint32_t flow_tag_id = 0); virtual ~rfs(); /** * Register/Unregister a sink with this rfs object - * Get notifications about incoming packets using the pkt_rcvr_sink callback api + * Get notifications about incoming packets using the sockinfo callback api * The rfs will call ibv_attach on the QP once when at least one receiver sink is registered * An ibv_detach is called when the last 
receiver sink is deleted from the registered list * */ - bool attach_flow(pkt_rcvr_sink *sink); // Add a sink. If this is the first sink --> map the sink - // and attach flow to QP - bool detach_flow(pkt_rcvr_sink *sink); // Delete a sink. If this is the last sink --> delete it - // and detach flow from QP + bool attach_flow(sockinfo *sink); // Add a sink. If this is the first sink --> map the sink + // and attach flow to QP + bool detach_flow(sockinfo *sink); // Delete a sink. If this is the last sink --> delete it + // and detach flow from QP #ifdef DEFINED_UTLS rfs_rule *create_rule(xlio_tir *tir, const flow_tuple &flow_spec); // Create a duplicate rule which points to @@ -158,20 +106,27 @@ class rfs { protected: flow_tuple m_flow_tuple; ring_slave *m_p_ring; + ring_simple *m_p_ring_simple; rfs_rule_filter *m_p_rule_filter; - attach_flow_data_vector_t m_attach_flow_data_vector; - pkt_rcvr_sink **m_sinks_list; + rfs_rule *m_rfs_flow = nullptr; + sockinfo **m_sinks_list; uint32_t m_n_sinks_list_entries; // Number of actual sinks in the array (we shrink the array if // a sink is removed) uint32_t m_n_sinks_list_max_length; uint32_t m_flow_tag_id; // Associated with this rule, set by attach_flow() + uint16_t m_priority = 2U; // Almost highest priority, 1 is used for 5-tuple later bool m_b_tmp_is_attached; // Only temporary, while ibcm calls attach_flow with no sinks... + dpcp::match_params m_match_value; + dpcp::match_params m_match_mask; + bool create_flow(); // Attach flow to all queues bool destroy_flow(); // Detach flow from all queues - bool add_sink(pkt_rcvr_sink *p_sink); - bool del_sink(pkt_rcvr_sink *p_sink); - virtual bool prepare_flow_spec() = 0; + bool add_sink(sockinfo *p_sink); + bool del_sink(sockinfo *p_sink); + void prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip); + void prepare_flow_spec_tcp_udp(); + virtual void prepare_flow_spec() = 0; private: rfs(); // I don't want anyone to use the default constructor diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 96e5a3079..14f7fa064 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -34,9 +34,18 @@ #include "util/utils.h" #include "dev/rfs_mc.h" #include "dev/ring_simple.h" +#include "sock/sockinfo.h" #define MODULE_NAME "rfs_mc" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter /*= NULL*/, int flow_tag_id /*=0*/) : rfs(flow_spec_5t, p_ring, rule_filter, flow_tag_id) @@ -47,69 +56,33 @@ rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple() && !prepare_flow_spec()) { - throw_xlio_exception("IB multicast offload is not supported"); + if (m_p_ring->is_simple()) { + prepare_flow_spec(); } } -bool rfs_mc::prepare_flow_spec() +void rfs_mc::prepare_flow_spec() { - ring_simple *p_ring = dynamic_cast(m_p_ring); - - if (!p_ring) { - rfs_logpanic("Incompatible ring type"); - } - - transport_type_t type = p_ring->get_transport_type(); - - /* - * todo note that ring is not locked here. - * we touch members that should not change during the ring life. - * the ring will not be deleted as we increased refcnt. - * if one of these assumptions change, we must lock. 
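A note on the dpcp::match_params value/mask pair that replaces the old verbs flow specs throughout this change: each field of m_match_mask selects which bits of m_match_value the steering rule actually compares, so a zero mask behaves as a wildcard (any address, any port) while an all-ones mask demands an exact match. A minimal self-contained sketch of that semantic, using a hypothetical struct rather than the real dpcp::match_params layout:

#include <cstdint>

// Illustrative only: a rule matches when (packet & mask) == (value & mask), field by field.
struct match_sketch {
    uint16_t ethertype;
    uint16_t dst_port;
    uint32_t dst_ipv4;
};

static bool field_match(uint32_t pkt, uint32_t val, uint32_t msk)
{
    return (pkt & msk) == (val & msk); // msk == 0 -> wildcard, msk all-ones -> exact match
}

static bool rule_matches(const match_sketch &pkt, const match_sketch &val, const match_sketch &msk)
{
    return field_match(pkt.ethertype, val.ethertype, msk.ethertype) &&
           field_match(pkt.dst_port, val.dst_port, msk.dst_port) &&
           field_match(pkt.dst_ipv4, val.dst_ipv4, msk.dst_ipv4);
}

This is why prepare_flow_spec_eth_ip() writes 0 into the mask for an any-address and 0xFFFFFFFFU (or 0xFF per IPv6 byte) for a concrete one.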
- */ - attach_flow_data_t *p_attach_flow_data = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; + const ip_address &dst_ip = + (safe_mce_sys().eth_mc_l2_only_rules ? ip_address::any_addr() : m_flow_tuple.get_dst_ip()); - switch (type) { - case XLIO_TRANSPORT_ETH: { - bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); - if (is_ipv4) { - prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); - } else { - prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); - } + prepare_flow_spec_eth_ip(dst_ip, ip_address::any_addr()); - if (!p_attach_flow_data) { - return false; - } + uint8_t dst_mac[6]; + create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_family()); - uint8_t dst_mac[6]; - create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_family()); - ibv_flow_spec_eth_set(p_eth, dst_mac, htons(p_ring->m_p_qp_mgr->get_partiton()), is_ipv4); + memset(&m_match_mask.dst_mac, 0xFF, sizeof(m_match_mask.dst_mac)); + memcpy(&m_match_value.dst_mac, dst_mac, sizeof(dst_mac)); - if (safe_mce_sys().eth_mc_l2_only_rules) { - ibv_flow_spec_tcp_udp_set(p_tcp_udp, 0, 0, 0); - } else { - ibv_flow_spec_tcp_udp_set(p_tcp_udp, (m_flow_tuple.get_protocol() == PROTO_TCP), - m_flow_tuple.get_dst_port(), m_flow_tuple.get_src_port()); - } + if (safe_mce_sys().eth_mc_l2_only_rules) { + m_match_mask.dst_port = m_match_value.dst_port = m_match_mask.src_port = + m_match_value.src_port = 0U; - break; + m_match_mask.protocol = 0xFF; + m_match_value.protocol = IPPROTO_UDP; + } else { + prepare_flow_spec_tcp_udp(); } - BULLSEYE_EXCLUDE_BLOCK_START - default: - rfs_logpanic("Incompatible transport type = %d", type); - return false; - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - m_attach_flow_data_vector.push_back(p_attach_flow_data); - return true; } bool rfs_mc::rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array) diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index 87c514a47..a708467cd 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -35,8 +35,6 @@ #include "dev/rfs.h" -#define MODULE_NAME "rfs_mc" - /** * @class rfs_mc * @@ -47,47 +45,14 @@ class rfs_mc : public rfs { public: - rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = nullptr, int32_t flow_tag_id = 0); - virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); + virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, + void *pv_fd_ready_array) override; protected: - virtual bool prepare_flow_spec(); - - template - void prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); + void prepare_flow_spec() override; }; -template -void rfs_mc::prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) -{ - T *attach_flow_data_eth = new (std::nothrow) T(qp_mgr); - if (!attach_flow_data_eth) { - return; - } - - p_eth = &(attach_flow_data_eth->ibv_flow_attr.eth); - p_tcp_udp = &(attach_flow_data_eth->ibv_flow_attr.tcp_udp); - p_attach_flow_data = reinterpret_cast(attach_flow_data_eth); - - const ip_address &dst_ip = - (safe_mce_sys().eth_mc_l2_only_rules ? 
ip_address::any_addr() : m_flow_tuple.get_dst_ip()); - - ibv_flow_spec_ip_set(&(attach_flow_data_eth->ibv_flow_attr.ip), dst_ip, ip_address::any_addr()); - - if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 - ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); - attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - rfs_logdbg("Adding flow_tag spec to MC rule, num_of_specs: %d flow_tag_id: %d", - attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, m_flow_tag_id); - } -} - -#undef MODULE_NAME - #endif /* RFS_MC_H */ diff --git a/src/core/dev/rfs_rule.cpp b/src/core/dev/rfs_rule.cpp new file mode 100644 index 000000000..da5199a49 --- /dev/null +++ b/src/core/dev/rfs_rule.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "dev/rfs_rule.h" + +#include +#include "dev/rfs.h" + +#define MODULE_NAME "rfs_rule" + +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + +bool rfs_rule::create(dpcp::match_params &match_value, dpcp::match_params &match_mask, + dpcp::tir &in_tir, uint16_t priority, uint32_t flow_tag, + dpcp::adapter &in_adapter) +{ + rfs_logdbg("Creating flow dpcp_adpater::create_flow_rule(), priority %" PRIu16 + ", flow_tag: %" PRIu32, + priority, flow_tag); + rfs_logdbg("match_mask:\n" + "ethertype: 0x%04" PRIx16 ", vlan_id: 0x%04" PRIx16 ", protocol: 0x%02" PRIx8 + ", ip_version: 0x%02" PRIx8 "\n" + "dst_port: 0x%04" PRIx16 ", src_ports: 0x%04" PRIx16 "\n" + "src_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_mac: 0x%016" PRIx64, + match_mask.ethertype, match_mask.vlan_id, match_mask.protocol, match_mask.ip_version, + match_mask.dst_port, match_mask.src_port, match_mask.src.ipv4, + *reinterpret_cast(match_mask.src.ipv6 + 8), + *reinterpret_cast(match_mask.src.ipv6), match_mask.dst.ipv4, + *reinterpret_cast(match_mask.dst.ipv6 + 8), + *reinterpret_cast(match_mask.dst.ipv6), + *reinterpret_cast(match_mask.dst_mac)); + rfs_logdbg("match_value:\n" + "ethertype: 0x%04" PRIx16 ", vlan_id: %" PRIu16 ", protocol: %" PRIu8 + ", ip_version: %" PRIu8 "\n" + "dst_port: %" PRIu16 ", src_ports: %" PRIu16 "\n" + "src_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_mac: 0x%016" PRIx64, + match_value.ethertype, match_value.vlan_id, match_value.protocol, + match_value.ip_version, match_value.dst_port, match_value.src_port, + match_value.src.ipv4, *reinterpret_cast(match_value.src.ipv6 + 8), + *reinterpret_cast(match_value.src.ipv6), match_value.dst.ipv4, + *reinterpret_cast(match_value.dst.ipv6 + 8), + *reinterpret_cast(match_value.dst.ipv6), + *reinterpret_cast(match_value.dst_mac)); + + dpcp::flow_rule *new_rule = nullptr; + dpcp::status status_out = in_adapter.create_flow_rule(priority, match_mask, new_rule); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_adpater::create_flow_rule(), Priority %" PRIu16 ", Status: %d", + priority, static_cast(status_out)); + return false; + } + + rfs_logdbg("Succeeded dpcp_adpater::create_flow_rule(), Priority %" PRIu16 + ", rfs_rule %p, dpcp_flow: %p", + priority, this, new_rule); + + _dpcp_flow.reset(new_rule); + + status_out = _dpcp_flow->set_match_value(match_value); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::set_match_value(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + status_out = _dpcp_flow->add_dest_tir(&in_tir); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::add_dest_tir(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + uint32_t tirn = 0U; + in_tir.get_id(tirn); + rfs_logdbg("Added dpcp_flow_rule::add_dest_tir() TIR %" PRIu32 ", dpcp_flow: %p", tirn, + new_rule); + + if (flow_tag) { + rfs_logdbg("Setting flow tag dpcp_adpater::set_flow_id(), Tag: %" PRIu32 ", dpcp_flow: %p", + flow_tag, new_rule); + + status_out = _dpcp_flow->set_flow_id(flow_tag); + if (status_out != dpcp::DPCP_OK) { + 
rfs_logerr("Failed dpcp_flow_rule::set_flow_id(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + } + + status_out = _dpcp_flow->apply_settings(); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::apply_settings(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + return true; +} diff --git a/src/core/dev/rfs_rule.h b/src/core/dev/rfs_rule.h index 9cd2eb813..0ce39394e 100644 --- a/src/core/dev/rfs_rule.h +++ b/src/core/dev/rfs_rule.h @@ -33,19 +33,17 @@ #ifndef RFS_RULE_H #define RFS_RULE_H -#include - -#define rfs_logpanic __log_info_panic -#define rfs_logerr __log_info_err -#define rfs_logwarn __log_info_warn -#define rfs_loginfo __log_info_info -#define rfs_logdbg __log_info_dbg -#define rfs_logfunc __log_info_func -#define rfs_logfuncall __log_info_funcall +#include +#include "ib/base/verbs_extra.h" +#include class rfs_rule { public: - virtual ~rfs_rule() {} + bool create(dpcp::match_params &match_value, dpcp::match_params &match_mask, dpcp::tir &in_tir, + uint16_t priority, uint32_t flow_tag, dpcp::adapter &in_adapter); + +private: + std::unique_ptr _dpcp_flow; }; #endif diff --git a/src/core/dev/rfs_rule_dpcp.cpp b/src/core/dev/rfs_rule_dpcp.cpp deleted file mode 100644 index 5abf6f9ba..000000000 --- a/src/core/dev/rfs_rule_dpcp.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "dev/rfs_rule_dpcp.h" - -#if defined(DEFINED_DPCP) - -#include -#include "dev/rfs.h" - -#define MODULE_NAME "rfs_rule_dpcp" - -rfs_rule_dpcp::~rfs_rule_dpcp() -{ -} - -bool rfs_rule_dpcp::create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, - dpcp::adapter &in_adapter) -{ - const ibv_flow_attr_eth &attrs_eth(reinterpret_cast(attrs)); - dpcp::match_params mp; - dpcp::match_params match_msk; - - memset(&mp, 0, sizeof(mp)); - memset(&match_msk, 0, sizeof(match_msk)); - - memset(&match_msk.dst_mac, 0xFF, sizeof(match_msk.dst_mac)); - memcpy(&mp.dst_mac, attrs_eth.eth.val.dst_mac, - min(sizeof(mp.dst_mac), sizeof(attrs_eth.eth.val.dst_mac))); - - match_msk.ethertype = htons(attrs_eth.eth.mask.ether_type); - mp.ethertype = htons(attrs_eth.eth.val.ether_type); - match_msk.vlan_id = ntohs(attrs_eth.eth.mask.vlan_tag); - mp.vlan_id = ntohs(attrs_eth.eth.val.vlan_tag); - - const xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - const xlio_ibv_flow_spec_action_tag *p_flow_tag = nullptr; - - if (attrs_eth.eth.val.ether_type == htons(ETH_P_IP)) { - const auto &attrs_tcpudp( - reinterpret_cast< - const attach_flow_data_eth_ipv4_tcp_udp_t::ibv_flow_attr_eth_ip_tcp_udp &>(attrs)); - - p_tcp_udp = &(attrs_tcpudp.tcp_udp); - p_flow_tag = &(attrs_tcpudp.flow_tag); - - match_msk.dst.ipv4 = ntohl(attrs_tcpudp.ip.mask.dst_ip); - mp.dst.ipv4 = ntohl(attrs_tcpudp.ip.val.dst_ip); - match_msk.src.ipv4 = ntohl(attrs_tcpudp.ip.mask.src_ip); - mp.src.ipv4 = ntohl(attrs_tcpudp.ip.val.src_ip); - mp.ip_version = 4U; - } else { - const auto &attrs_tcpudp( - reinterpret_cast< - const attach_flow_data_eth_ipv6_tcp_udp_t::ibv_flow_attr_eth_ip_tcp_udp &>(attrs)); - - p_tcp_udp = &(attrs_tcpudp.tcp_udp); - p_flow_tag = &(attrs_tcpudp.flow_tag); - - memcpy(match_msk.dst.ipv6, attrs_tcpudp.ip.mask.dst_ip, sizeof(match_msk.dst.ipv6)); - memcpy(mp.dst.ipv6, attrs_tcpudp.ip.val.dst_ip, sizeof(mp.dst.ipv6)); - memcpy(match_msk.src.ipv6, attrs_tcpudp.ip.mask.src_ip, sizeof(match_msk.src.ipv6)); - memcpy(mp.src.ipv6, attrs_tcpudp.ip.val.src_ip, sizeof(mp.src.ipv6)); - mp.ip_version = 6U; - } - - match_msk.dst_port = ntohs(p_tcp_udp->mask.dst_port); - mp.dst_port = ntohs(p_tcp_udp->val.dst_port); - match_msk.src_port = ntohs(p_tcp_udp->mask.src_port); - mp.src_port = ntohs(p_tcp_udp->val.src_port); - match_msk.protocol = 0xFF; - mp.protocol = (p_tcp_udp->type == XLIO_IBV_FLOW_SPEC_TCP ? 
IPPROTO_TCP : IPPROTO_UDP); - match_msk.ip_version = 0xF; - - dpcp::flow_rule *new_rule = nullptr; - dpcp::status status_out = in_adapter.create_flow_rule(attrs.priority, match_msk, new_rule); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr( - "Failed dpcp_adpater::create_flow_rule(), Type: %u, Priority %" PRIu16 ", Status: %d", - static_cast(attrs.type), attrs.priority, static_cast(status_out)); - return false; - } - - rfs_logdbg("Succeeded dpcp_adpater::create_flow_rule(), Type: %u, Priority %" PRIu16 - ", rfs_rule_dpcp %p, dpcp_flow: %p", - static_cast(attrs.type), attrs.priority, this, new_rule); - - _dpcp_flow.reset(new_rule); - - status_out = _dpcp_flow->set_match_value(mp); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::set_match_value(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - status_out = _dpcp_flow->add_dest_tir(&in_tir); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::add_dest_tir(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - uint32_t tirn = 0U; - in_tir.get_id(tirn); - rfs_logdbg("Added dpcp_flow_rule::add_dest_tir() TIR %" PRIu32 ", dpcp_flow: %p", tirn, - new_rule); - - if (p_flow_tag->type == XLIO_IBV_FLOW_SPEC_ACTION_TAG) { - rfs_logdbg("Setting flow tag dpcp_adpater::set_flow_id(), Tag: %" PRIu32 ", dpcp_flow: %p", - p_flow_tag->tag_id, new_rule); - - status_out = _dpcp_flow->set_flow_id(p_flow_tag->tag_id); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::set_flow_id(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - } - - status_out = _dpcp_flow->apply_settings(); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::apply_settings(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - return true; -} - -#endif // defined(DEFINED_DPCP) diff --git a/src/core/dev/rfs_rule_ibv.h b/src/core/dev/rfs_rule_ibv.h deleted file mode 100644 index 1be41454a..000000000 --- a/src/core/dev/rfs_rule_ibv.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef RFS_RULE_IBV_H -#define RFS_RULE_IBV_H - -#include -#include "util/utils.h" -#include "ib/base/verbs_extra.h" -#include "dev/rfs_rule.h" - -using namespace std; - -template using deleter_func = void (*)(T *); - -template using unique_ptr_delfunc = std::unique_ptr>; - -class rfs_rule_ibv : public rfs_rule { -public: - virtual ~rfs_rule_ibv(); - - bool create(xlio_ibv_flow_attr &attrs, ibv_qp *qp); - -private: - static void destory_ibv_flow(xlio_ibv_flow *flow); - - unique_ptr_delfunc _ibv_flow {nullptr, destory_ibv_flow}; -}; - -#endif diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index f7c3565bc..277492957 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -37,9 +37,18 @@ #include "util/instrumentation.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" +#include "sock/sockinfo.h" #define MODULE_NAME "rfs_uc" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter, uint32_t flow_tag_id) : rfs(flow_spec_5t, p_ring, rule_filter, flow_tag_id) @@ -50,63 +59,28 @@ rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *ru } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple() && !prepare_flow_spec()) { - throw_xlio_exception("rfs_uc: Incompatible transport type"); + if (m_p_ring->is_simple()) { + prepare_flow_spec(); } } -bool rfs_uc::prepare_flow_spec() +void rfs_uc::prepare_flow_spec() { - ring_simple *p_ring = dynamic_cast(m_p_ring); - - if (!p_ring) { + if (!m_p_ring_simple) { rfs_logpanic("Incompatible ring type"); } - /* - * todo note that ring is not locked here. - * we touch members that should not change during the ring life. - * the ring will not be deleted as we increased refcnt. - * if one of these assumptions change, we must lock. 
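For orientation, the priority handling that used to live inside the verbs attribute is now the plain m_priority member: it defaults to 2 for 3-tuple rules, rfs_uc lowers it to 1 once a source port or source IP makes the rule a 5-tuple, and the UTLS path passes 0 so its rule overrides the existing TCP rule. A short sketch of that selection, assuming (as the code comments state) that a lower number means a higher steering priority; the helper itself is hypothetical:

#include <cstdint>

// Illustrative helper, not part of the sources: more specific rules get numerically
// lower (i.e. stronger) priorities so they shadow the generic ones.
static uint16_t choose_rule_priority(bool has_src_ip_or_port, bool utls_override)
{
    if (utls_override) {
        return 0; // must win over the already-installed TCP rule
    }
    return has_src_ip_or_port ? 1 : 2; // 5-tuple beats 3-tuple
}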
- */ - attach_flow_data_t *p_attach_flow_data = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - - switch (p_ring->get_transport_type()) { - case XLIO_TRANSPORT_ETH: { - bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); - if (is_ipv4) { - prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); - } else { - prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); - } - - if (!p_attach_flow_data) { - return false; - } - - ibv_flow_spec_eth_set(p_eth, p_ring->m_p_l2_addr->get_address(), - htons(p_ring->m_p_qp_mgr->get_partiton()), is_ipv4); + prepare_flow_spec_eth_ip(m_flow_tuple.get_dst_ip(), m_flow_tuple.get_src_ip()); + prepare_flow_spec_tcp_udp(); - break; - } - BULLSEYE_EXCLUDE_BLOCK_START - default: - return false; - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - ibv_flow_spec_tcp_udp_set(p_tcp_udp, (m_flow_tuple.get_protocol() == PROTO_TCP), - m_flow_tuple.get_dst_port(), m_flow_tuple.get_src_port()); + memset(&m_match_mask.dst_mac, 0xFF, sizeof(m_match_mask.dst_mac)); + memcpy(&m_match_value.dst_mac, m_p_ring_simple->m_p_l2_addr->get_address(), + sizeof(m_match_value.dst_mac)); if (m_flow_tuple.get_src_port() || !m_flow_tuple.get_src_ip().is_anyaddr()) { - // set priority of 5-tuple to be higher than 3-tuple - // to make sure 5-tuple have higher priority on ConnectX-4 - p_attach_flow_data->ibv_flow_attr.priority = 1; + // Set priority of 5-tuple to be higher than 3-tuple + // to make sure 5-tuple have higher priority. + m_priority = 1; } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) else if (g_p_app->type != APP_NONE && g_p_app->get_worker_id() >= 0) { @@ -123,25 +97,24 @@ bool rfs_uc::prepare_flow_spec() } else { src_port = g_p_app->get_worker_id(); } - p_tcp_udp->val.src_port = htons((uint16_t)src_port * g_p_app->src_port_stride); - p_tcp_udp->mask.src_port = - htons((uint16_t)((g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2)); - p_attach_flow_data->ibv_flow_attr.priority = 1; + + m_match_mask.src_port = + static_cast((g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2); + m_match_value.src_port = static_cast(src_port * g_p_app->src_port_stride); + + m_priority = 1; rfs_logdbg("src_port_stride: %d workers_num %d \n", g_p_app->src_port_stride, g_p_app->workers_num); rfs_logdbg("sp_tcp_udp->val.src_port: %d p_tcp_udp->mask.src_port %d \n", - ntohs(p_tcp_udp->val.src_port), ntohs(p_tcp_udp->mask.src_port)); - m_flow_tuple.set_src_port(p_tcp_udp->val.src_port); + m_match_value.src_port, m_match_mask.src_port); + + m_flow_tuple.set_src_port(m_match_value.src_port); } } #endif - rfs_logfunc("transport type: %d, num_of_specs: %d flow_tag_id: %d", - p_ring->get_transport_type(), p_attach_flow_data->ibv_flow_attr.num_of_specs, + rfs_logfunc("Transport type: %d, flow_tag_id: %d", m_p_ring_simple->get_transport_type(), m_flow_tag_id); - - m_attach_flow_data_vector.push_back(p_attach_flow_data); - return true; } bool rfs_uc::rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array) diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index e5937a765..b6d3ff529 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -35,8 +35,6 @@ #include "dev/rfs.h" -#define MODULE_NAME "rfs_uc" - /** * @class rfs_uc * @@ -47,45 +45,14 @@ class rfs_uc : public rfs { public: - rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter 
*rule_filter = nullptr, uint32_t flow_tag_id = 0); - virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); + virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, + void *pv_fd_ready_array) override; protected: - virtual bool prepare_flow_spec(); - - template - void prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); + virtual void prepare_flow_spec() override; }; -template -void rfs_uc::prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) -{ - T *attach_flow_data_eth = new (std::nothrow) T(qp_mgr); - if (!attach_flow_data_eth) { - return; - } - - decltype(T::ibv_flow_attr_eth_ip_tcp_udp::ip) *p_ip = &(attach_flow_data_eth->ibv_flow_attr.ip); - p_eth = &(attach_flow_data_eth->ibv_flow_attr.eth); - p_tcp_udp = &(attach_flow_data_eth->ibv_flow_attr.tcp_udp); - p_attach_flow_data = reinterpret_cast(attach_flow_data_eth); - - ibv_flow_spec_ip_set(p_ip, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_src_ip()); - - if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 - ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); - attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - rfs_logdbg("Adding flow_tag spec to rule, num_of_specs: %d flow_tag_id: %d", - attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, m_flow_tag_id); - } -} - -#undef MODULE_NAME - #endif /* RFS_UC_H */ diff --git a/src/core/dev/rfs_uc_tcp_gro.cpp b/src/core/dev/rfs_uc_tcp_gro.cpp index 4571007b3..522a6ff63 100644 --- a/src/core/dev/rfs_uc_tcp_gro.cpp +++ b/src/core/dev/rfs_uc_tcp_gro.cpp @@ -39,6 +39,8 @@ #define MODULE_NAME "rfs_uc_tcp_gro" +#define rfs_logpanic __log_info_panic + #define TCP_H_LEN_NO_OPTIONS 5 #define TCP_H_LEN_TIMESTAMP 8 @@ -157,7 +159,7 @@ bool rfs_uc_tcp_gro::rx_dispatch_packet(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_in cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; cq_stats.n_rx_gro_packets++; cq_stats.n_rx_gro_frags += 1; - cq_stats.n_rx_gro_bytes += p_rx_pkt_mem_buf_desc_info->lwip_pbuf.pbuf.tot_len; + cq_stats.n_rx_gro_bytes += p_rx_pkt_mem_buf_desc_info->lwip_pbuf.tot_len; return rfs_uc::rx_dispatch_packet(p_rx_pkt_mem_buf_desc_info, pv_fd_ready_array); } @@ -185,14 +187,13 @@ bool rfs_uc_tcp_gro::add_packet(mem_buf_desc_t *mem_buf_desc, void *payload_ptr, mem_buf_desc->reset_ref_count(); - mem_buf_desc->lwip_pbuf.pbuf.len = mem_buf_desc->lwip_pbuf.pbuf.tot_len = - mem_buf_desc->rx.sz_payload; - mem_buf_desc->lwip_pbuf.pbuf.ref = 1; - mem_buf_desc->lwip_pbuf.pbuf.next = NULL; - mem_buf_desc->lwip_pbuf.pbuf.payload = payload_ptr; + mem_buf_desc->lwip_pbuf.len = mem_buf_desc->lwip_pbuf.tot_len = mem_buf_desc->rx.sz_payload; + mem_buf_desc->lwip_pbuf.ref = 1; + mem_buf_desc->lwip_pbuf.next = nullptr; + mem_buf_desc->lwip_pbuf.payload = payload_ptr; - m_gro_desc.p_last->lwip_pbuf.pbuf.next = &(mem_buf_desc->lwip_pbuf.pbuf); - m_gro_desc.p_last->p_next_desc = NULL; + m_gro_desc.p_last->lwip_pbuf.next = &mem_buf_desc->lwip_pbuf; + m_gro_desc.p_last->p_next_desc = nullptr; mem_buf_desc->p_prev_desc = m_gro_desc.p_last; m_gro_desc.p_last = mem_buf_desc; @@ -230,18 +231,18 @@ void rfs_uc_tcp_gro::flush_gro_desc(void *pv_fd_ready_array) p_tcp_ts_h->popts[2] = m_gro_desc.tsecr; } - m_gro_desc.p_first->lwip_pbuf.pbuf.gro = 1; + m_gro_desc.p_first->lwip_pbuf.gro = 1; - 
m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.pbuf.len = + m_gro_desc.p_first->lwip_pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.len = (m_gro_desc.p_first->sz_data - m_gro_desc.p_first->rx.n_transport_header_len); - m_gro_desc.p_first->lwip_pbuf.pbuf.ref = 1; - m_gro_desc.p_first->lwip_pbuf.pbuf.payload = + m_gro_desc.p_first->lwip_pbuf.ref = 1; + m_gro_desc.p_first->lwip_pbuf.payload = (u8_t *)(m_gro_desc.p_first->p_buffer + m_gro_desc.p_first->rx.n_transport_header_len); m_gro_desc.p_first->rx.is_xlio_thr = m_gro_desc.p_last->rx.is_xlio_thr; for (mem_buf_desc_t *p_desc = m_gro_desc.p_last; p_desc != m_gro_desc.p_first; p_desc = p_desc->p_prev_desc) { - p_desc->p_prev_desc->lwip_pbuf.pbuf.tot_len += p_desc->lwip_pbuf.pbuf.tot_len; + p_desc->p_prev_desc->lwip_pbuf.tot_len += p_desc->lwip_pbuf.tot_len; } } @@ -257,7 +258,7 @@ void rfs_uc_tcp_gro::flush_gro_desc(void *pv_fd_ready_array) cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; cq_stats.n_rx_gro_packets++; cq_stats.n_rx_gro_frags += m_gro_desc.buf_count; - cq_stats.n_rx_gro_bytes += m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len; + cq_stats.n_rx_gro_bytes += m_gro_desc.p_first->lwip_pbuf.tot_len; if (!rfs_uc::rx_dispatch_packet(m_gro_desc.p_first, pv_fd_ready_array)) { m_p_ring_simple->reclaim_recv_buffers_no_lock(m_gro_desc.p_first); diff --git a/src/core/dev/rfs_uc_tcp_gro.h b/src/core/dev/rfs_uc_tcp_gro.h index 4df456d31..7150b8c9a 100644 --- a/src/core/dev/rfs_uc_tcp_gro.h +++ b/src/core/dev/rfs_uc_tcp_gro.h @@ -68,7 +68,7 @@ class gro_mgr; class rfs_uc_tcp_gro : public rfs_uc { public: rfs_uc_tcp_gro(flow_tuple *flow_spec_5t, ring_slave *p_ring, - rfs_rule_filter *rule_filter = NULL, uint32_t flow_tag_id = 0); + rfs_rule_filter *rule_filter = nullptr, uint32_t flow_tag_id = 0); virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index 9f8870990..661e2aaa8 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -31,50 +31,52 @@ */ #include "ring.h" +#include "event/poll_group.h" #include "proto/route_table_mgr.h" -#include "sock/tcp_seg_pool.h" +#include "sock/sockinfo.h" #undef MODULE_NAME #define MODULE_NAME "ring" #undef MODULE_HDR #define MODULE_HDR MODULE_NAME "%d:%s() " +tcp_seg_pool *g_tcp_seg_pool = nullptr; +socketxtreme_ec_pool *g_socketxtreme_ec_pool = nullptr; + ring::ring() - : m_p_n_rx_channel_fds(NULL) - , m_parent(NULL) - , m_tcp_seg_list(nullptr) - , m_tcp_seg_count(0U) { - m_if_index = 0; print_val(); } ring::~ring() { if (m_tcp_seg_list) { - g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); + g_tcp_seg_pool->put_objs(m_tcp_seg_list); + } + + if (m_socketxtreme_ec_list) { + g_socketxtreme_ec_pool->put_objs(m_socketxtreme_ec_list); } } -// Assumed num > 0. 
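The GRO hunks above only drop the nested .pbuf member, but the invariant they maintain is worth spelling out: len is the payload of a single buffer, while tot_len must equal len plus the tot_len of every buffer chained after it, which is effectively what the backwards accumulation loop in flush_gro_desc() restores. A standalone sketch of that invariant with a simplified struct (not lwip's real pbuf):

// Recompute tot_len over a singly linked chain so that
// tot_len(p) == len(p) + tot_len(p->next), matching the lwip convention.
struct pbuf_sketch {
    pbuf_sketch *next;
    unsigned len;
    unsigned tot_len;
};

static void recompute_tot_len(pbuf_sketch *p)
{
    if (!p) {
        return;
    }
    recompute_tot_len(p->next); // fix the tail first, then accumulate on the way back
    p->tot_len = p->len + (p->next ? p->next->tot_len : 0);
}

For a three-buffer chain with lengths {100, 1448, 1448} the head ends up with tot_len 2996 while the tail keeps 1448.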
-tcp_seg *ring::get_tcp_segs(uint32_t num) +template +static inline T *get_obj_list(cached_obj_pool *obj_pool, uint32_t num, T *&obj_list_from, + uint32_t &obj_count, uint32_t batch_size) { - std::lock_guard lock(m_tcp_seg_lock); - - if (unlikely(num > m_tcp_seg_count)) { - uint32_t getsize = std::max(safe_mce_sys().tx_segs_ring_batch_tcp, num - m_tcp_seg_count); - auto seg_list = g_tcp_seg_pool->get_tcp_seg_list(getsize); - if (!seg_list.first) { + if (unlikely(num > obj_count)) { + uint32_t getsize = std::max(batch_size, num - obj_count); + auto obj_list = obj_pool->get_obj_list(getsize); + if (!obj_list.first) { return nullptr; } - seg_list.second->next = m_tcp_seg_list; - m_tcp_seg_list = seg_list.first; - m_tcp_seg_count += getsize; + obj_list.second->next = obj_list_from; + obj_list_from = obj_list.first; + obj_count += getsize; } - tcp_seg *head = m_tcp_seg_list; - tcp_seg *last = head; - m_tcp_seg_count -= num; + T *head = obj_list_from; + T *last = head; + obj_count -= num; // For non-batching, improves branch prediction. For batching, we do not get here often. if (unlikely(num > 1U)) { @@ -83,12 +85,51 @@ tcp_seg *ring::get_tcp_segs(uint32_t num) } } - m_tcp_seg_list = last->next; + obj_list_from = last->next; last->next = nullptr; return head; } +// Assumed num > 0. +tcp_seg *ring::get_tcp_segs(uint32_t num) +{ + std::lock_guard lock(m_tcp_seg_lock); + + return get_obj_list(g_tcp_seg_pool, num, m_tcp_seg_list, m_tcp_seg_count, + safe_mce_sys().tx_segs_ring_batch_tcp); +} + +// Assumed num > 0. +ring_ec *ring::socketxtreme_get_ecs(uint32_t num) +{ + std::lock_guard lock(m_ec_lock); + + return get_obj_list(g_socketxtreme_ec_pool, num, m_socketxtreme_ec_list, + m_socketxtreme_ec_count, 256U); +} + +template +static inline void put_obj_list(cached_obj_pool *obj_pool, T *&obj_list_to, T *&obj_list_from, + uint32_t &obj_count, uint32_t return_treshold) +{ + T *obj_temp = obj_list_to; + obj_list_to = obj_list_from; + + // For non-batching, improves branch prediction. For batching, we do not get here often. + if (unlikely(obj_list_from->next)) { + while (likely(obj_list_from->next)) { + obj_list_from = obj_list_from->next; + ++obj_count; // Count all except the first. + } + } + + obj_list_from->next = obj_temp; + if (unlikely(++obj_count > return_treshold)) { + obj_pool->put_objs(obj_pool->split_obj_list(obj_count / 2, obj_list_to, obj_count)); + } +} + // Assumed seg is not nullptr void ring::put_tcp_segs(tcp_seg *seg) { @@ -96,26 +137,117 @@ void ring::put_tcp_segs(tcp_seg *seg) std::lock_guard lock(m_tcp_seg_lock); - tcp_seg *seg_temp = m_tcp_seg_list; - m_tcp_seg_list = seg; + put_obj_list(g_tcp_seg_pool, m_tcp_seg_list, seg, m_tcp_seg_count, return_treshold); +} - // For non-batching, improves branch prediction. For batching, we do not get here often. - if (unlikely(seg->next)) { - while (likely(seg->next)) { - seg = seg->next; - ++m_tcp_seg_count; // Count all except the first. 
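The two helper templates factored out here implement a classic two-level free list: serve requests from a per-ring cache, refill it from the shared pool in whole batches when it runs dry, and hand half of it back once it grows past a return threshold, so the global pool is touched only occasionally. A compact sketch of the same idea, with std::vector standing in for the intrusive lists and the cached_obj_pool used by the real code:

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative two-level cache: `local` is the per-consumer list, `shared` the global pool.
template <typename T>
T *cached_get(std::vector<T *> &local, std::vector<T *> &shared, size_t batch)
{
    if (local.empty()) {
        size_t take = std::min(batch, shared.size()); // refill in batches, not one by one
        local.insert(local.end(), shared.end() - take, shared.end());
        shared.resize(shared.size() - take);
    }
    if (local.empty()) {
        return nullptr; // shared pool exhausted
    }
    T *obj = local.back();
    local.pop_back();
    return obj;
}

template <typename T>
void cached_put(std::vector<T *> &local, std::vector<T *> &shared, T *obj, size_t threshold)
{
    local.push_back(obj);
    if (local.size() > threshold) {
        size_t give = local.size() / 2; // return half, keep the rest cached
        shared.insert(shared.end(), local.end() - give, local.end());
        local.resize(local.size() - give);
    }
}

In the real code the batch size comes from safe_mce_sys().tx_segs_ring_batch_tcp for TCP segments and is a constant 256 for the socketxtreme completion objects.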
+// Assumed ec is not nullptr +void ring::socketxtreme_put_ecs(ring_ec *ec) +{ + static const uint32_t return_treshold = 256 * 2U; + + std::lock_guard lock(m_ec_lock); + + put_obj_list(g_socketxtreme_ec_pool, m_socketxtreme_ec_list, ec, m_socketxtreme_ec_count, + return_treshold); +} + +void ring::socketxtreme_ec_sock_list_add(sockinfo *sock) +{ + sock->set_ec_ring_list_next(nullptr); + if (likely(m_socketxtreme.ec_sock_list_end)) { + m_socketxtreme.ec_sock_list_end->set_ec_ring_list_next(sock); + m_socketxtreme.ec_sock_list_end = sock; + } else { + m_socketxtreme.ec_sock_list_end = m_socketxtreme.ec_sock_list_start = sock; + } +} + +xlio_socketxtreme_completion_t &ring::socketxtreme_start_ec_operation(sockinfo *sock, + bool always_new) +{ + m_socketxtreme.lock_ec_list.lock(); + if (likely(!sock->get_last_ec())) { + socketxtreme_ec_sock_list_add(sock); + always_new = true; + } + + if (always_new) { + sock->add_ec(socketxtreme_get_ecs(1U)); + } + + return sock->get_last_ec()->completion; +} + +void ring::socketxtreme_end_ec_operation() +{ + m_socketxtreme.lock_ec_list.unlock(); +} + +bool ring::socketxtreme_ec_pop_completion(xlio_socketxtreme_completion_t *completion) +{ + struct ring_ec *ec = nullptr; + + m_socketxtreme.lock_ec_list.lock(); + if (m_socketxtreme.ec_sock_list_start) { + ec = m_socketxtreme.ec_sock_list_start->pop_next_ec(); + + ring_logfunc( + "tid: %d completion %p: events:%lu, ud:%lu, b:%p, %p\n", gettid(), ec, + ec->completion.events, ec->completion.user_data, ec->completion.packet.buff_lst, + ec->completion.packet.buff_lst ? ec->completion.packet.buff_lst->next : nullptr); + + memcpy(completion, &ec->completion, sizeof(ec->completion)); + ec->next = nullptr; + socketxtreme_put_ecs(ec); + if (!m_socketxtreme.ec_sock_list_start + ->has_next_ec()) { // Last ec of the socket was popped. + // Remove socket from ready list. + sockinfo *temp = m_socketxtreme.ec_sock_list_start; + m_socketxtreme.ec_sock_list_start = temp->get_ec_ring_list_next(); + if (!m_socketxtreme.ec_sock_list_start) { + m_socketxtreme.ec_sock_list_end = nullptr; + } + temp->set_ec_ring_list_next(nullptr); } } + m_socketxtreme.lock_ec_list.unlock(); + return (ec != nullptr); +} + +void ring::socketxtreme_ec_clear_sock(sockinfo *sock) +{ + m_socketxtreme.lock_ec_list.lock(); + + ring_ec *ecs = sock->clear_ecs(); + if (ecs) { + socketxtreme_put_ecs(ecs); + sockinfo *temp = m_socketxtreme.ec_sock_list_start; + sockinfo *prev = nullptr; + while (temp && temp != sock) { + prev = temp; + temp = temp->get_ec_ring_list_next(); + } - seg->next = seg_temp; - if (unlikely(++m_tcp_seg_count > return_treshold)) { - g_tcp_seg_pool->put_tcp_segs( - tcp_seg_pool::split_tcp_segs(m_tcp_seg_count / 2, m_tcp_seg_list, m_tcp_seg_count)); + if (prev) { + prev->set_ec_ring_list_next(sock->get_ec_ring_list_next()); + } + + if (sock == m_socketxtreme.ec_sock_list_start) { + m_socketxtreme.ec_sock_list_start = sock->get_ec_ring_list_next(); + } + + if (sock == m_socketxtreme.ec_sock_list_end) { + m_socketxtreme.ec_sock_list_end = prev; + } + + sock->set_ec_ring_list_next(nullptr); } + + m_socketxtreme.lock_ec_list.unlock(); } void ring::print_val() { ring_logdbg("%d: %p: parent %p", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent)); + ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent)); } diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 4db195b61..a92c3e8f5 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -35,14 +35,19 @@ #include #include "ib/base/verbs_extra.h" +#include "dev/buffer_pool.h" +#include "dev/xlio_ti.h" #include "proto/flow_tuple.h" -#include "sock/socket_fd_api.h" +#include "proto/xlio_lwip.h" +#include "proto/L2_address.h" +#include "util/cached_obj_pool.h" +#include "lwip/tcp_impl.h" /* Forward declarations */ struct xlio_tls_info; -class xlio_tis; -class pkt_rcvr_sink; -typedef void (*xlio_comp_cb_t)(void *); // Copied from qp_mgr.h +class sockinfo; +class rfs_rule; +class poll_group; #define ring_logpanic __log_info_panic #define ring_logerr __log_info_err @@ -57,20 +62,18 @@ typedef enum { CQT_RX, CQT_TX } cq_type_t; typedef size_t ring_user_id_t; -/* Ring event completion */ +// Socketxtreme completion struct ring_ec { - struct list_head list; struct xlio_socketxtreme_completion_t completion; - struct xlio_buff_t *last_buff_lst; - - inline void clear() - { - INIT_LIST_HEAD(&list); - memset(&completion, 0, sizeof(completion)); - last_buff_lst = NULL; - } + ring_ec *next; }; +typedef cached_obj_pool tcp_seg_pool; +typedef cached_obj_pool socketxtreme_ec_pool; + +extern tcp_seg_pool *g_tcp_seg_pool; +extern socketxtreme_ec_pool *g_socketxtreme_ec_pool; + class ring { public: ring(); @@ -79,13 +82,11 @@ class ring { virtual void print_val(); - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, - bool force_5t = false) = 0; - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) = 0; + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false) = 0; + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) = 0; virtual void restart() = 0; - // Funcs taken from qp_mgr.h // Get/Release memory buffer descriptor with a linked data memory buffer virtual mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1) = 0; @@ -93,31 +94,32 @@ class ring { bool trylock = false) = 0; virtual void mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) { - buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf.pbuf); - }; + buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf); + } virtual void send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr) = 0; virtual int send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis) = 0; - // Funcs taken from cq_mgr.h + virtual void ring_delayed_doorbell() {} + virtual int get_num_resources() const = 0; virtual int *get_rx_channel_fds(size_t &length) const { length = 1; return m_p_n_rx_channel_fds; - }; - virtual int get_tx_channel_fd() const { return -1; }; + } + virtual int get_tx_channel_fd() const { return -1; } virtual bool get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe) = 0; virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn) = 0; virtual bool reclaim_recv_buffers(descq_t *rx_reuse) = 0; virtual bool reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) = 0; - virtual bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *) { return false; }; + virtual bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *) { return false; } virtual int drain_and_proccess() = 0; virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) = 0; + void 
*pv_fd_ready_array = nullptr) = 0; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) = 0; + void *pv_fd_ready_array = nullptr) = 0; virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) = 0; virtual void adapt_cq_moderation() = 0; virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc) = 0; @@ -127,14 +129,14 @@ class ring { virtual void inc_tx_retransmissions_stats(ring_user_id_t id) = 0; virtual bool is_member(ring_slave *rng) = 0; virtual bool is_active_member(ring_slave *rng, ring_user_id_t id) = 0; - ring *get_parent() { return m_parent; }; - ring_user_id_t generate_id() { return 0; }; + ring *get_parent() { return m_parent; } + ring_user_id_t generate_id() { return 0; } virtual ring_user_id_t generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, const ip_address &src_ip, const ip_address &dst_ip, uint16_t src_port, uint16_t dst_port) = 0; virtual int modify_ratelimit(struct xlio_rate_limit_t &rate_limit) = 0; - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) = 0; + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) = 0; virtual uint32_t get_max_inline_data() = 0; virtual uint32_t get_max_send_sge(void) = 0; virtual uint32_t get_max_payload_sz(void) = 0; @@ -146,10 +148,6 @@ class ring { virtual int socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_completions, unsigned int ncompletions, int flags) = 0; - virtual bool is_socketxtreme(void) = 0; - virtual void put_ec(struct ring_ec *ec) = 0; - virtual void del_ec(struct ring_ec *ec) = 0; - inline int get_if_index() { return m_if_index; } #ifdef DEFINED_UTLS @@ -221,12 +219,12 @@ class ring { { NOT_IN_USE(tis); NOT_IN_USE(config); - }; + } virtual void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) { NOT_IN_USE(tis); NOT_IN_USE(tcp_seqno); - }; + } enum { NVME_CRC_TX = 1 << 0, @@ -262,18 +260,40 @@ class ring { struct tcp_seg *get_tcp_segs(uint32_t num); void put_tcp_segs(struct tcp_seg *seg); + ring_ec *socketxtreme_get_ecs(uint32_t num); + void socketxtreme_put_ecs(struct ring_ec *ec); + + void socketxtreme_ec_clear_sock(sockinfo *sock); + void socketxtreme_ec_sock_list_add(sockinfo *sock); + bool socketxtreme_ec_pop_completion(xlio_socketxtreme_completion_t *completion); + void socketxtreme_end_ec_operation(); + xlio_socketxtreme_completion_t &socketxtreme_start_ec_operation(sockinfo *sock, + bool always_new); + protected: inline void set_parent(ring *parent) { m_parent = (parent ? parent : this); } inline void set_if_index(int if_index) { m_if_index = if_index; } - int *m_p_n_rx_channel_fds; - ring *m_parent; + int *m_p_n_rx_channel_fds = nullptr; + ring *m_parent = nullptr; - struct tcp_seg *m_tcp_seg_list; - uint32_t m_tcp_seg_count; + struct tcp_seg *m_tcp_seg_list = nullptr; + ring_ec *m_socketxtreme_ec_list = nullptr; + uint32_t m_tcp_seg_count = 0U; + uint32_t m_socketxtreme_ec_count = 0U; lock_spin_recursive m_tcp_seg_lock; + lock_spin_recursive m_ec_lock; + + struct { + // Queue of ready sockets. Each socket can be added only once to this queue. + sockinfo *ec_sock_list_start = nullptr; + sockinfo *ec_sock_list_end = nullptr; + + // Thread-safety lock for get/put operations under the queue. 
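// Aside (illustrative sketch with simplified types, not the XLIO classes): the m_socketxtreme
// block keeps a FIFO of "ready" sockets, and each queued socket owns its own chain of
// completions. Popping a completion drains the head socket first and unlinks it from the FIFO
// once its chain becomes empty, which is the behaviour socketxtreme_ec_pop_completion()
// implements under lock_ec_list.
struct completion_sketch {
    completion_sketch *next;
    int event;
};
struct socket_sketch {
    socket_sketch *ring_next;       // link inside the ring's ready FIFO
    completion_sketch *completions; // pending completions (never empty while the socket is queued)
};
struct ready_fifo_sketch {
    socket_sketch *head = nullptr;
    socket_sketch *tail = nullptr;

    void push(socket_sketch *s) // call only when the socket is not already queued
    {
        s->ring_next = nullptr;
        if (tail) {
            tail->ring_next = s;
        } else {
            head = s;
        }
        tail = s;
    }

    completion_sketch *pop_one()
    {
        if (!head) {
            return nullptr;
        }
        completion_sketch *c = head->completions;
        head->completions = c->next;
        if (!head->completions) { // last completion: drop the socket from the FIFO
            socket_sketch *done = head;
            head = done->ring_next;
            if (!head) {
                tail = nullptr;
            }
            done->ring_next = nullptr;
        }
        return c;
    }
};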
+ lock_spin lock_ec_list; + } m_socketxtreme; - int m_if_index; /* Interface index */ + int m_if_index = 0; /* Interface index */ }; #endif /* RING_H */ diff --git a/src/core/dev/ring_allocation_logic.cpp b/src/core/dev/ring_allocation_logic.cpp index 0546f064d..2a5660f6c 100644 --- a/src/core/dev/ring_allocation_logic.cpp +++ b/src/core/dev/ring_allocation_logic.cpp @@ -48,35 +48,27 @@ #define ral_logfuncall __log_info_funcall ring_allocation_logic::ring_allocation_logic() - : m_owner(NULL) - , m_ring_migration_ratio(0) - , m_source(-1) + : m_ring_migration_ratio(-1) , m_migration_try_count(0) + , m_source(-1) , m_migration_candidate(0) - , m_active(true) , m_res_key() { - m_type = ""; } ring_allocation_logic::ring_allocation_logic(ring_logic_t allocation_logic, int ring_migration_ratio, source_t source, resource_allocation_key &ring_profile) - : m_owner(NULL) - , m_ring_migration_ratio(ring_migration_ratio) - , m_source(source) + : m_ring_migration_ratio(ring_migration_ratio) , m_migration_try_count(ring_migration_ratio) + , m_source(source) { - m_type = ""; - if (ring_profile.get_ring_alloc_logic() == RING_LOGIC_PER_INTERFACE) { ring_profile.set_ring_alloc_logic(allocation_logic); } m_res_key = resource_allocation_key(ring_profile); m_migration_candidate = 0; m_res_key.set_user_id_key(calc_res_key_by_logic()); - - m_active = true; } /** @@ -118,7 +110,7 @@ uint64_t ring_allocation_logic::calc_res_key_by_logic() break; default: // not suppose to get here - ral_logdbg("non-valid ring logic = %d", m_res_key.get_ring_alloc_logic()); + ral_logdbg("Non-valid ring logic = %d", m_res_key.get_ring_alloc_logic()); break; BULLSEYE_EXCLUDE_BLOCK_END } @@ -150,9 +142,10 @@ resource_allocation_key *ring_allocation_logic::create_new_key(const ip_address */ bool ring_allocation_logic::should_migrate_ring() { - ral_logfuncall("currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); + ral_logfuncall("Currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); - if (false == m_active) { + if (m_ring_migration_ratio < 0) { + // Ring migration is disabled return false; } @@ -175,10 +168,7 @@ bool ring_allocation_logic::should_migrate_ring() m_migration_try_count = 0; if (!m_migration_candidate) { - // save current used allocation key - // no need to save profile, and allocation logic uint64_t curr_id = m_res_key.get_user_id_key(); - // calc new key uint64_t new_id = calc_res_key_by_logic(); if (new_id == curr_id || g_n_internal_thread_id == curr_id) { return false; @@ -187,7 +177,7 @@ bool ring_allocation_logic::should_migrate_ring() return false; } - ral_logdbg("migrating from ring of id=%s to ring of id=%lu", m_res_key.to_str().c_str(), + ral_logdbg("Migrating from ring of id=%s to ring of id=%lu", m_res_key.to_str().c_str(), m_migration_candidate); m_migration_candidate = 0; @@ -198,11 +188,17 @@ const std::string ring_allocation_logic::to_str() const { std::stringstream ss; - ss << '[' << m_type << '=' << m_owner << ']'; + ss << '[' << this << ']'; return ss.str(); } +void ring_allocation_logic::debug_print_type(const char *type) +{ + ral_logdbg("Type %s", type); + NOT_IN_USE(type); // Suppress --enable-opt-log=high warning +} + cpu_manager g_cpu_manager; __thread int g_n_thread_cpu_core = NO_CPU; @@ -239,7 +235,7 @@ int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO int avail_cpus = CPU_COUNT(&cpu_set); if (avail_cpus == 0) { unlock(); - __log_err("no cpu available for tid=%lu", tid); + __log_err("No cpu available for tid=%lu", tid); 
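// Aside (hedged, simplified sketch of the policy in ring_allocation_logic): a negative
// m_ring_migration_ratio now means "migration disabled" (the old m_active flag is gone), and
// should_migrate_ring() effectively rate-limits the check and reports a migration only when the
// freshly computed ring id differs from the one in use; the real code adds extra guards such as
// the internal-thread-id check and the m_migration_candidate double-check.
#include <cstdint>

static bool should_migrate_sketch(int ratio, int &tries, uint64_t current_id, uint64_t desired_id)
{
    if (ratio < 0) {
        return false; // migration disabled
    }
    if (++tries < ratio) {
        return false; // re-evaluate only once every `ratio` calls
    }
    tries = 0;
    return desired_id != current_id;
}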
return -1; } @@ -264,7 +260,7 @@ int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO } CPU_ZERO(&cpu_set); CPU_SET(cpu, &cpu_set); - __log_dbg("attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu); + __log_dbg("Attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu); ret = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpu_set); if (ret) { unlock(); diff --git a/src/core/dev/ring_allocation_logic.h b/src/core/dev/ring_allocation_logic.h index 203b632aa..af15f1cb7 100644 --- a/src/core/dev/ring_allocation_logic.h +++ b/src/core/dev/ring_allocation_logic.h @@ -84,6 +84,8 @@ class ring_allocation_logic { ring_allocation_logic(ring_logic_t ring_allocation_logic, int ring_migration_ratio, source_t source, resource_allocation_key &ring_profile); + void debug_print_type(const char *type); + public: /* careful, you'll lose the previous key !! */ resource_allocation_key *create_new_key(const ip_address &addr, int suggested_cpu = NO_CPU); @@ -93,24 +95,21 @@ class ring_allocation_logic { bool should_migrate_ring(); bool is_logic_support_migration() { - return m_res_key.get_ring_alloc_logic() >= RING_LOGIC_PER_THREAD && - m_res_key.get_ring_alloc_logic() < RING_LOGIC_PER_OBJECT && m_ring_migration_ratio > 0; + return m_ring_migration_ratio > 0 && + m_res_key.get_ring_alloc_logic() >= RING_LOGIC_PER_THREAD && + m_res_key.get_ring_alloc_logic() < RING_LOGIC_PER_OBJECT; } uint64_t calc_res_key_by_logic(); inline ring_logic_t get_alloc_logic_type() { return m_res_key.get_ring_alloc_logic(); } - inline void enable_migration(bool active) { m_active = active; } - const std::string to_str() const; + inline void disable_migration() { m_ring_migration_ratio = -1; } -protected: - const char *m_type; - const void *m_owner; + const std::string to_str() const; private: int m_ring_migration_ratio; - source_t m_source; int m_migration_try_count; + source_t m_source; uint64_t m_migration_candidate; - bool m_active; resource_allocation_key m_res_key; }; @@ -119,14 +118,13 @@ class ring_allocation_logic_rx : public ring_allocation_logic { ring_allocation_logic_rx() : ring_allocation_logic() { + debug_print_type("Rx"); } - ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile, - const void *owner) + ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_rx, safe_mce_sys().ring_migration_ratio_rx, source, ring_profile) { - m_type = "Rx"; - m_owner = owner; + debug_print_type("Rx"); } }; @@ -135,14 +133,13 @@ class ring_allocation_logic_tx : public ring_allocation_logic { ring_allocation_logic_tx() : ring_allocation_logic() { + debug_print_type("Tx"); } - ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile, - const void *owner) + ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_tx, safe_mce_sys().ring_migration_ratio_tx, source, ring_profile) { - m_type = "Tx"; - m_owner = owner; + debug_print_type("Tx"); } }; diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index 7ba89555c..ce0401008 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -49,7 +49,7 @@ ring_bond::ring_bond(int if_index) , m_lock_ring_rx("ring_bond:lock_rx") , m_lock_ring_tx("ring_bond:lock_tx") { - net_device_val *p_ndev = NULL; + net_device_val *p_ndev = nullptr; /* Configure ring() fields */ set_parent(this); @@ -57,7 
+57,7 @@ ring_bond::ring_bond(int if_index) /* Sanity check */ p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Invalid if_index = %d", if_index); } @@ -89,17 +89,17 @@ ring_bond::~ring_bond() if (m_p_n_rx_channel_fds) { delete[] m_p_n_rx_channel_fds; - m_p_n_rx_channel_fds = NULL; + m_p_n_rx_channel_fds = nullptr; } } void ring_bond::print_val() { ring_logdbg("%d: %p: parent %p type %s", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent), "bond"); + ((uintptr_t)this == (uintptr_t)m_parent ? nullptr : m_parent), "bond"); } -bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { bool ret = true; struct flow_sink_t value = {flow_spec_5t, sink}; @@ -117,7 +117,7 @@ bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool return ret; } -bool ring_bond::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_bond::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { bool ret = true; struct flow_sink_t value = {flow_spec_5t, sink}; @@ -145,7 +145,7 @@ void ring_bond::restart() { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { return; } const slave_data_vector_t &slaves = p_ndev->get_slave_array(); @@ -161,7 +161,7 @@ void ring_bond::restart() ring_tap *p_ring_tap = dynamic_cast(p_ring_bond_netvsc->m_tap_ring); if (p_ring_tap) { size_t num_ring_rx_fds = 0; - int *ring_rx_fds_array = NULL; + int *ring_rx_fds_array = nullptr; int epfd = -1; int fd = -1; int rc = 0; @@ -175,7 +175,7 @@ void ring_bond::restart() epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, nullptr); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -186,14 +186,14 @@ void ring_bond::restart() epfd = si->get_rx_epfd(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } epfd = si->get_epoll_context_fd(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -204,8 +204,8 @@ void ring_bond::restart() p_ring_tap->inc_vf_plugouts(); p_ring_bond_netvsc->slave_destroy( p_ring_bond_netvsc->m_vf_ring->get_if_index()); - p_ring_bond_netvsc->m_vf_ring = NULL; - p_ring_tap->set_vf_ring(NULL); + p_ring_bond_netvsc->m_vf_ring = nullptr; + p_ring_tap->set_vf_ring(nullptr); } else { for (i = 0; i < slaves.size(); i++) { if (slaves[i]->if_index != p_ring_tap->get_if_index()) { @@ -219,11 +219,11 @@ void ring_bond::restart() for (k = 0; k < num_ring_rx_fds; k++) { epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (epfd > 0) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; fd = ring_rx_fds_array[k]; ev.events = EPOLLIN; ev.data.fd = fd; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d 
errno=%d", fd, epfd, rc, errno); } @@ -239,18 +239,18 @@ void ring_bond::restart() fd = ring_rx_fds_array[k]; ev.events = EPOLLIN; ev.data.fd = fd; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } epfd = si->get_epoll_context_fd(); if (epfd > 0) { -#define CQ_FD_MARK 0xabcd /* see socket_fd_api */ +#define CQ_FD_MARK 0xabcd /* see sockinfo */ epoll_event ev = {0, {0}}; fd = ring_rx_fds_array[k]; ev.events = EPOLLIN | EPOLLPRI; ev.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -291,13 +291,18 @@ void ring_bond::restart() if (slaves[j]->active) { ring_logdbg("ring %d active", i); if (slaves[j]->lag_tx_port_affinity != 1) { - tmp_ring->start_active_qp_mgr(); + tmp_ring->start_active_queue_tx(); + /* coverity[sleep] */ + tmp_ring->start_active_queue_rx(); } m_bond_rings[i]->m_active = true; } else { ring_logdbg("ring %d not active", i); if (slaves[j]->lag_tx_port_affinity != 1) { - tmp_ring->stop_active_qp_mgr(); + /* coverity[sleep] */ + tmp_ring->stop_active_queue_tx(); + /* coverity[sleep] */ + tmp_ring->stop_active_queue_rx(); } m_bond_rings[i]->m_active = false; } @@ -307,14 +312,14 @@ void ring_bond::restart() popup_xmit_rings(); int ret = 0; - uint64_t poll_sn = cq_mgr::m_n_global_sn; + uint64_t poll_sn = cq_mgr_rx::m_n_global_sn_rx; ret = request_notification(CQT_RX, poll_sn); if (ret < 0) { - ring_logdbg("failed arming rx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed arming cq_mgr_rx (errno=%d %m)", errno); } ret = request_notification(CQT_TX, poll_sn); if (ret < 0) { - ring_logdbg("failed arming tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed arming cq_mgr_tx (errno=%d %m)", errno); } if (m_type == net_device_val::ACTIVE_BACKUP) { @@ -362,7 +367,7 @@ void ring_bond::adapt_cq_moderation() mem_buf_desc_t *ring_bond::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs /* default = 1 */) { - mem_buf_desc_t *ret = NULL; + mem_buf_desc_t *ret = nullptr; std::lock_guard lock(m_lock_ring_tx); ret = m_xmit_rings[id]->mem_buf_tx_get(id, b_block, type, n_num_mem_bufs); @@ -401,7 +406,7 @@ void ring_bond::mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) } } if (i == m_bond_rings.size()) { - buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf.pbuf); + buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf); } } @@ -427,7 +432,7 @@ void ring_bond::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe } else { ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", m_xmit_rings[id], p_mem_buf_desc); - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; if (likely(p_mem_buf_desc->p_desc_owner == m_bond_rings[id])) { m_bond_rings[id]->mem_buf_tx_release(p_mem_buf_desc, true); } else { @@ -449,7 +454,7 @@ int ring_bond::send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", m_xmit_rings[id], p_mem_buf_desc); - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; /* no need to free the buffer here, as for lwip buffers we have 2 ref counts, */ /* one for caller, and one for completion. for completion, we ref count in */ /* send_lwip_buffer(). 
Since we are not going in, the caller will free the */ @@ -670,7 +675,7 @@ bool ring_bond::reclaim_recv_buffers(mem_buf_desc_t *) void ring_bond::update_cap(ring_slave *slave) { - if (NULL == slave) { + if (!slave) { m_max_inline_data = (uint32_t)(-1); m_max_send_sge = (uint32_t)(-1); return; @@ -743,7 +748,7 @@ int ring_bond::devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, } } temp = head->p_next_desc; - head->p_next_desc = NULL; + head->p_next_desc = nullptr; if (i == m_bond_rings.size()) { // handle no owner ring_logdbg("No matching ring %p to return buffer", current->p_desc_owner); @@ -759,7 +764,7 @@ int ring_bond::devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, void ring_bond::popup_xmit_rings() { - ring_slave *cur_slave = NULL; + ring_slave *cur_slave = nullptr; size_t i, j; m_xmit_rings.clear(); @@ -792,7 +797,7 @@ void ring_bond::popup_recv_rings() net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); m_recv_rings.clear(); - if (NULL == p_ndev) { + if (!p_ndev) { return; } const slave_data_vector_t &slaves = p_ndev->get_slave_array(); @@ -825,7 +830,7 @@ void ring_bond::update_rx_channel_fds() { if (m_p_n_rx_channel_fds) { delete[] m_p_n_rx_channel_fds; - m_p_n_rx_channel_fds = NULL; + m_p_n_rx_channel_fds = nullptr; } if (m_recv_rings.size() == 0) { return; @@ -955,7 +960,7 @@ int ring_bond::socketxtreme_poll(struct xlio_socketxtreme_completion_t *, unsign void ring_bond::slave_destroy(int if_index) { - ring_slave *cur_slave = NULL; + ring_slave *cur_slave = nullptr; ring_slave_vector_t::iterator iter; for (iter = m_bond_rings.begin(); iter != m_bond_rings.end(); iter++) { @@ -976,7 +981,7 @@ void ring_bond_eth::slave_create(int if_index) ring_slave *cur_slave; cur_slave = new ring_eth(if_index, this); - if (cur_slave == NULL) { + if (!cur_slave) { ring_logpanic("Error creating bond ring: memory allocation error"); } @@ -995,11 +1000,11 @@ void ring_bond_eth::slave_create(int if_index) void ring_bond_netvsc::slave_create(int if_index) { - ring_slave *cur_slave = NULL; - net_device_val *p_ndev = NULL; + ring_slave *cur_slave = nullptr; + net_device_val *p_ndev = nullptr; p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Error creating bond ring"); } diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index e38bbe025..888efe901 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -42,7 +42,7 @@ typedef std::vector ring_slave_vector_t; struct flow_sink_t { flow_tuple flow; - pkt_rcvr_sink *sink; + sockinfo *sink; }; class ring_bond : public ring { @@ -59,7 +59,8 @@ class ring_bond : public ring { return m_p_n_rx_channel_fds; }; virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = nullptr); virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); virtual void adapt_cq_moderation(); virtual bool reclaim_recv_buffers(descq_t *rx_reuse); @@ -67,10 +68,10 @@ class ring_bond : public ring { virtual void mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc); virtual int drain_and_proccess(); virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual int get_num_resources() const { 
return m_bond_rings.size(); }; - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); virtual void restart(); virtual mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1); @@ -92,9 +93,9 @@ class ring_bond : public ring { virtual bool get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe); virtual int modify_ratelimit(struct xlio_rate_limit_t &rate_limit); /* XXX TODO We have to support ring_bond for zerocopy. */ - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) { - NOT_IN_USE(p_mapping), NOT_IN_USE(addr); + NOT_IN_USE(addr); NOT_IN_USE(length); return LKEY_ERROR; } @@ -116,7 +117,7 @@ class ring_bond : public ring { } protected: - void update_cap(ring_slave *slave = NULL); + void update_cap(ring_slave *slave = nullptr); void update_rx_channel_fds(); /* Fill m_xmit_rings array */ @@ -130,10 +131,6 @@ class ring_bond : public ring { int devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, mem_buf_desc_t **buffer_per_ring); - bool is_socketxtreme(void) { return false; } - void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - protected: /* Array of all aggregated rings * Every ring can be Active or Backup @@ -196,8 +193,8 @@ class ring_bond_netvsc : public ring_bond { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - m_vf_ring = NULL; - m_tap_ring = NULL; + m_vf_ring = nullptr; + m_tap_ring = nullptr; if (p_ndev) { const slave_data_vector_t &slaves = p_ndev->get_slave_array(); update_cap(); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 334272788..c0826132a 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -36,10 +36,6 @@ #include "util/valgrind.h" #include "util/sg_array.h" #include "sock/fd_collection.h" -#if defined(DEFINED_DIRECT_VERBS) -#include "dev/qp_mgr_eth_mlx5.h" -#include "dev/qp_mgr_eth_mlx5_dpcp.h" -#endif #undef MODULE_NAME #define MODULE_NAME "ring_simple" @@ -88,35 +84,19 @@ inline void ring_simple::send_status_handler(int ret, xlio_ibv_send_wr *p_send_w BULLSEYE_EXCLUDE_BLOCK_END } -qp_mgr *ring_eth::create_qp_mgr(struct qp_mgr_desc *desc) -{ -#if defined(DEFINED_DPCP) - if (safe_mce_sys().enable_dpcp_rq) { - return new qp_mgr_eth_mlx5_dpcp(desc, get_tx_num_wr(), m_partition); - } -#endif - return new qp_mgr_eth_mlx5(desc, get_tx_num_wr(), m_partition); -} - ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_locks) : ring_slave(if_index, parent, type, use_locks) - , m_p_ib_ctx(NULL) - , m_p_qp_mgr(NULL) - , m_p_cq_mgr_rx(NULL) - , m_p_cq_mgr_tx(NULL) , m_lock_ring_tx_buf_wait("ring:lock_tx_buf_wait") - , m_tx_num_bufs(0) - , m_zc_num_bufs(0) - , m_tx_num_wr(0) - , m_missing_buf_ref_count(0) - , m_tx_lkey(0) , m_gro_mgr(safe_mce_sys().gro_streams_max, MAX_GRO_BUFS) - , m_up(false) - , m_p_rx_comp_event_channel(NULL) - , m_p_tx_comp_event_channel(NULL) - , m_p_l2_addr(NULL) { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ndev) { + // Coverity 
warning suppression + throw_xlio_exception("Cannot find netdev for a ring"); + } + BULLSEYE_EXCLUDE_BLOCK_END + const slave_data_t *p_slave = p_ndev->get_slave(get_if_index()); ring_logdbg("new ring_simple()"); @@ -126,7 +106,7 @@ ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_ */ BULLSEYE_EXCLUDE_BLOCK_START m_p_ib_ctx = p_slave->p_ib_ctx; - if (m_p_ib_ctx == NULL) { + if (!m_p_ib_ctx) { ring_logpanic("m_p_ib_ctx = NULL. It can be related to wrong bonding configuration"); } @@ -145,8 +125,6 @@ ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_ memset(&m_tls, 0, sizeof(m_tls)); #endif /* DEFINED_UTLS */ memset(&m_lro, 0, sizeof(m_lro)); - - INIT_LIST_HEAD(&m_socketxtreme.ec_list); } ring_simple::~ring_simple() @@ -163,12 +141,20 @@ ring_simple::~ring_simple() // Was done in order to allow iperf's FIN packet to be sent. usleep(25000); - if (m_p_qp_mgr) { - stop_active_qp_mgr(); + if (m_hqtx) { + stop_active_queue_tx(); // Release QP/CQ resources - delete m_p_qp_mgr; - m_p_qp_mgr = NULL; + delete m_hqtx; + m_hqtx = nullptr; + } + + if (m_hqrx) { + stop_active_queue_rx(); + + // Release QP/CQ resources + delete m_hqrx; + m_hqrx = nullptr; } /* coverity[double_lock] TODO: RM#1049980 */ @@ -217,34 +203,32 @@ ring_simple::~ring_simple() } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(m_p_tx_comp_event_channel, sizeof(struct ibv_comp_channel)); - m_p_tx_comp_event_channel = NULL; + m_p_tx_comp_event_channel = nullptr; } /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_rx.unlock(); - ring_logdbg("queue of event completion elements is %s", - (list_empty(&m_socketxtreme.ec_list) ? "empty" : "not empty")); - while (!list_empty(&m_socketxtreme.ec_list)) { - struct ring_ec *ec = NULL; - ec = get_ec(); - if (ec) { - del_ec(ec); - } - } - ring_logdbg("delete ring_simple() completed"); } void ring_simple::create_resources() { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ndev) { + // Coverity warning suppression + throw_xlio_exception("Cannot find netdev for a ring"); + } + BULLSEYE_EXCLUDE_BLOCK_END + const slave_data_t *p_slave = p_ndev->get_slave(get_if_index()); save_l2_address(p_slave->p_L2_addr); m_p_tx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); - if (m_p_tx_comp_event_channel == NULL) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_tx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_ERROR, VLOG_DEBUG, "ibv_create_comp_channel for tx failed. m_p_tx_comp_event_channel = %p (errno=%d %m)", @@ -277,6 +261,11 @@ void ring_simple::create_resources() const xlio_ibv_tso_caps *caps = &xlio_get_tso_caps(m_p_ib_ctx->get_ibv_device_attr_ex()); if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) { + if (caps->max_tso && (caps->max_tso > MCE_DEFAULT_MAX_TSO_SIZE)) { + ring_logwarn("max_tso cap (=%u) is higher than default TSO size (=%u). 
" + "Increase XLIO_MAX_TSO_SIZE to get full TSO potential.", + caps->max_tso, MCE_DEFAULT_MAX_TSO_SIZE); + } m_tso.max_payload_sz = caps->max_tso; /* ETH(14) + IP(20) + TCP(20) + TCP OPTIONS(40) */ m_tso.max_header_sz = 94; @@ -291,7 +280,6 @@ void ring_simple::create_resources() memset(&m_lro, 0, sizeof(m_lro)); if ((safe_mce_sys().enable_lro == option_3::ON) || ((safe_mce_sys().enable_lro == option_3::AUTO) && (1 == validate_lro(get_if_index())))) { -#if defined(DEFINED_DPCP) dpcp::adapter_hca_capabilities caps; if (m_p_ib_ctx->get_dpcp_adapter() && @@ -317,7 +305,6 @@ void ring_simple::create_resources() m_lro.max_payload_sz = std::min(actual_buf_size, XLIO_MLX5_PARAMS_LRO_PAYLOAD_SIZE) / 256U * 256U; } -#endif /* DEFINED_DPCP */ } ring_logdbg("ring attributes: m_lro = %d", m_lro.cap); ring_logdbg("ring attributes: m_lro:psh_flag = %d", m_lro.psh_flag); @@ -352,11 +339,9 @@ void ring_simple::create_resources() #endif ring_logdbg("ring attributes: m_flow_tag_enabled = %d", m_flow_tag_enabled); - m_p_rx_comp_event_channel = ibv_create_comp_channel( - m_p_ib_ctx->get_ibv_context()); // ODED TODO: Adjust the ibv_context to be the exact one in - // case of different devices + m_p_rx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_rx_comp_event_channel == NULL) { + if (!m_p_rx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_ERROR, VLOG_DEBUG, "ibv_create_comp_channel for rx failed. p_rx_comp_event_channel = %p (errno=%d %m)", @@ -379,22 +364,22 @@ void ring_simple::create_resources() g_p_fd_collection->add_cq_channel_fd(m_p_tx_comp_event_channel->fd, this); } - struct qp_mgr_desc desc; - memset(&desc, 0, sizeof(desc)); - desc.ring = this; - desc.slave = p_slave; - desc.rx_comp_event_channel = m_p_rx_comp_event_channel; - m_p_qp_mgr = create_qp_mgr(&desc); + std::unique_ptr temp_hqtx(new hw_queue_tx(this, p_slave, get_tx_num_wr())); + std::unique_ptr temp_hqrx( + new hw_queue_rx(this, p_slave->p_ib_ctx, m_p_rx_comp_event_channel, m_vlan)); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_qp_mgr == NULL) { - ring_logerr("Failed to allocate qp_mgr!"); - throw_xlio_exception("create qp failed"); + if (!temp_hqtx || !temp_hqrx) { + ring_logerr("Failed to allocate hw_queue_tx/hw_queue_rx!"); + throw_xlio_exception("Create hw_queue_tx/hw_queue_rx failed"); } BULLSEYE_EXCLUDE_BLOCK_END - // save cq_mgr pointers - m_p_cq_mgr_rx = m_p_qp_mgr->get_rx_cq_mgr(); - m_p_cq_mgr_tx = m_p_qp_mgr->get_tx_cq_mgr(); + m_hqtx = temp_hqtx.release(); + m_hqrx = temp_hqrx.release(); + + // save pointers + m_p_cq_mgr_rx = m_hqrx->get_rx_cq_mgr(); + m_p_cq_mgr_tx = m_hqtx->get_tx_cq_mgr(); init_tx_buffers(RING_TX_BUFS_COMPENSATE); @@ -408,7 +393,8 @@ void ring_simple::create_resources() * even if slave is not active */ if (p_slave->active || (p_slave->lag_tx_port_affinity == 1)) { - start_active_qp_mgr(); + start_active_queue_tx(); + start_active_queue_rx(); } ring_logdbg("new ring_simple() completed"); @@ -455,7 +441,7 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c bool do_poll = true; if (likely(xlio_completions) && ncompletions) { - if ((flags & SOCKETXTREME_POLL_TX) && list_empty(&m_socketxtreme.ec_list)) { + if ((flags & SOCKETXTREME_POLL_TX) && !m_socketxtreme.ec_sock_list_start) { uint64_t poll_sn = 0; const std::lock_guard lock(m_lock_ring_tx); m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); @@ -463,11 +449,8 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c const 
std::lock_guard lock(m_lock_ring_rx); while (!g_b_exit && (i < (int)ncompletions)) { - if (!list_empty(&m_socketxtreme.ec_list)) { - ring_ec *ec = get_ec(); - if (ec) { - memcpy(xlio_completions, &ec->completion, sizeof(ec->completion)); - ec->clear(); + if (m_socketxtreme.ec_sock_list_start) { + if (socketxtreme_ec_pop_completion(xlio_completions)) { xlio_completions++; i++; } @@ -481,11 +464,11 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c // completions than ncompletions, what is not optimal for performance. // Not each packet results in a real completion but this check is good enough. if (++pkts >= ncompletions) { - m_gro_mgr.flush_all(NULL); + m_gro_mgr.flush_all(nullptr); pkts = 0U; } } else { - m_gro_mgr.flush_all(NULL); + m_gro_mgr.flush_all(nullptr); do_poll = false; } } else { @@ -505,7 +488,7 @@ int ring_simple::wait_for_notification_and_process_element(int cq_channel_fd, void *pv_fd_ready_array /*NULL*/) { int ret = -1; - if (m_p_cq_mgr_rx != NULL) { + if (m_p_cq_mgr_rx) { RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->wait_for_notification_and_process_element( p_cq_poll_sn, pv_fd_ready_array); @@ -571,8 +554,7 @@ void ring_simple::mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf std::lock_guard lock(m_lock_ring_tx); - p_mem_buf_desc->lwip_pbuf.pbuf.ref -= - std::min(p_mem_buf_desc->lwip_pbuf.pbuf.ref, ref - 1); + p_mem_buf_desc->lwip_pbuf.ref -= std::min(p_mem_buf_desc->lwip_pbuf.ref, ref - 1); put_tx_single_buffer(p_mem_buf_desc); } @@ -594,7 +576,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu { NOT_IN_USE(id); int ret = 0; - mem_buf_desc_t *buff_list = NULL; + mem_buf_desc_t *buff_list = nullptr; uint64_t poll_sn = 0; ring_logfuncall("n_num_mem_bufs=%d", n_num_mem_bufs); @@ -606,16 +588,16 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Try to poll once in the hope that we get a few freed tx mem_buf_desc ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, ret); + ring_logdbg("failed polling on cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (ret=%d %m)", m_hqtx, + m_p_cq_mgr_tx, ret); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - return NULL; + return nullptr; } else if (ret > 0) { - ring_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); + ring_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); buff_list = get_tx_buffers(type, n_num_mem_bufs); } else if (b_block) { // (ret == 0) - // Arm & Block on tx cq_mgr notification channel + // Arm & Block on tx cq_mgr_tx notification channel // until we get a few freed tx mem_buf_desc & data buffers // Only a single thread should block on next Tx cqe event, hence the dedicated lock! 
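/*
 * Illustrative sketch (not part of the patch): the arm-then-block pattern that the
 * surrounding blocking Tx path implements through cq_mgr_tx, expressed with plain
 * libibverbs calls. The helper name is hypothetical; the 100 ms poll timeout mirrors
 * the SYSCALL(poll, ...) call in the code below.
 */
#include <infiniband/verbs.h>
#include <poll.h>

static int wait_for_tx_cq_event(struct ibv_cq *tx_cq, struct ibv_comp_channel *channel)
{
    /* Arm the CQ so the next completion raises an event on the channel fd. */
    if (ibv_req_notify_cq(tx_cq, 0)) {
        return -1;
    }
    /* A completion may have slipped in before the arm, so the real code re-polls the
     * CQ once here (cq_mgr_tx::poll_and_process_element_tx()) before blocking. */

    /* Block on the completion channel fd until the event (or timeout) arrives. */
    struct pollfd pfd = {channel->fd, POLLIN, 0};
    int rc = poll(&pfd, 1, 100 /* ms */);
    if (rc <= 0) {
        return rc; /* 0 = timeout, negative = error */
    }
    /* Consume and acknowledge the event; the caller then polls the CQ again. */
    struct ibv_cq *ev_cq = NULL;
    void *ev_ctx = NULL;
    if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
        return -1;
    }
    ibv_ack_cq_events(ev_cq, 1);
    return 1;
}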
@@ -632,8 +614,8 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + ring_logdbg("failed arming cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (errno=%d %m)", + m_hqtx, m_p_cq_mgr_tx, errno); } else if (ret == 0) { // prepare to block @@ -646,7 +628,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu /* coverity[double_unlock] coverity[unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - ret = orig_os_api.poll(&poll_fd, 1, 100); + ret = SYSCALL(poll, &poll_fd, 1, 100); if (ret == 0) { m_lock_ring_tx_buf_wait.unlock(); /* coverity[double_lock] TODO: RM#1049980 */ @@ -654,34 +636,35 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu buff_list = get_tx_buffers(type, n_num_mem_bufs); continue; } else if (ret < 0) { - ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); - return NULL; + return nullptr; } /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); - // Find the correct Tx cq_mgr from the CQ event, + // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp - // channel for all cq_mgr's - cq_mgr *p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + // channel for all cq_mgr_tx's + cq_mgr_tx *p_cq_mgr_tx = + cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now - p_cq_mgr_tx->m_b_notification_armed = false; + p_cq_mgr_tx->reset_notification_armed(); // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (hqtx=%p, " "cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + m_hqtx, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_tx_buf_wait.unlock(); - return NULL; + return nullptr; } - ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", + ring_logfunc("polling/blocking succeeded on cq_mgr_tx (we got %d wce)", ret); } } @@ -695,7 +678,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu } else { // get out on non blocked socket m_lock_ring_tx.unlock(); - return NULL; + return nullptr; } } @@ -730,7 +713,7 @@ int ring_simple::mem_buf_tx_release(mem_buf_desc_t *p_mem_buf_desc_list, bool b_ void ring_simple::mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) { - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; reclaim_recv_buffers(p_mem_buf_desc); } @@ -739,11 +722,11 @@ inline int ring_simple::send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_pac xlio_tis *tis) { int ret = 0; - unsigned credits = m_p_qp_mgr->credits_calculate(p_send_wqe); + unsigned credits = m_hqtx->credits_calculate(p_send_wqe); - if (likely(m_p_qp_mgr->credits_get(credits)) || + if (likely(m_hqtx->credits_get(credits)) || is_available_qp_wr(is_set(attr, XLIO_TX_PACKET_BLOCK), credits)) { - ret = m_p_qp_mgr->send(p_send_wqe, attr, tis, credits); + 
m_hqtx->send_wqe(p_send_wqe, attr, tis, credits); } else { ring_logdbg("Silent packet drop, SQ is full!"); ret = -1; @@ -758,7 +741,7 @@ bool ring_simple::get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr NOT_IN_USE(id); NOT_IN_USE(p_send_wqe); - return m_p_qp_mgr->get_hw_dummy_send_support(); + return m_hqtx->get_hw_dummy_send_support(); } void ring_simple::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, @@ -773,7 +756,7 @@ void ring_simple::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_w } std::lock_guard lock(m_lock_ring_tx); - int ret = send_buffer(p_send_wqe, attr, 0); + int ret = send_buffer(p_send_wqe, attr, nullptr); send_status_handler(ret, p_send_wqe); } @@ -802,18 +785,18 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Try to poll once in the hope that we get space in SQ ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, ret); + ring_logdbg("failed polling on cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (ret=%d %m)", m_hqtx, + m_p_cq_mgr_tx, ret); /* coverity[missing_unlock] */ return false; } - granted = m_p_qp_mgr->credits_get(credits); + granted = m_hqtx->credits_get(credits); if (granted) { break; } if (b_block) { - // Arm & Block on tx cq_mgr notification channel until we get space in SQ + // Arm & Block on cq_mgr_tx notification channel until we get space in SQ // Only a single thread should block on next Tx cqe event, hence the dedicated lock! /* coverity[double_unlock] TODO: RM#1049980 */ @@ -826,8 +809,8 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + ring_logdbg("failed arming cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (errno=%d %m)", m_hqtx, + m_p_cq_mgr_tx, errno); } else if (ret == 0) { // prepare to block // CQ is armed, block on the CQ's Tx event channel (fd) @@ -839,9 +822,9 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - ret = orig_os_api.poll(&poll_fd, 1, -1); + ret = SYSCALL(poll, &poll_fd, 1, -1); if (ret <= 0) { - ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); @@ -851,21 +834,22 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); - // Find the correct Tx cq_mgr from the CQ event, + // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp - // channel for all cq_mgr's - cq_mgr *p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + // channel for all cq_mgr_tx's + cq_mgr_tx *p_cq_mgr_tx = + cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now - p_cq_mgr_tx->m_b_notification_armed = false; + p_cq_mgr_tx->reset_notification_armed(); // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - 
ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (hqtx=%p " "cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + m_hqtx, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_tx_buf_wait.unlock(); @@ -873,7 +857,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) m_lock_ring_tx.lock(); return false; } - ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", ret); + ring_logfunc("polling/blocking succeeded on cq_mgr_tx (we got %d wce)", ret); } } @@ -923,26 +907,26 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b } if (unlikely(pool.size() < n_num_mem_bufs)) { - return NULL; + return nullptr; } } head = pool.get_and_pop_back(); - head->lwip_pbuf.pbuf.ref = 1; - assert(head->lwip_pbuf.pbuf.type == type); - head->lwip_pbuf.pbuf.type = type; + head->lwip_pbuf.ref = 1; + assert(head->lwip_pbuf.type == type); + head->lwip_pbuf.type = type; n_num_mem_bufs--; mem_buf_desc_t *next = head; while (n_num_mem_bufs) { next->p_next_desc = pool.get_and_pop_back(); next = next->p_next_desc; - next->lwip_pbuf.pbuf.ref = 1; - assert(head->lwip_pbuf.pbuf.type == type); - next->lwip_pbuf.pbuf.type = type; + next->lwip_pbuf.ref = 1; + assert(head->lwip_pbuf.type == type); + next->lwip_pbuf.type = type; n_num_mem_bufs--; } - next->p_next_desc = NULL; + next->p_next_desc = nullptr; return head; } @@ -971,19 +955,19 @@ void ring_simple::return_tx_pool_to_global_pool() int ring_simple::put_tx_buffer_helper(mem_buf_desc_t *buff) { if (buff->tx.dev_mem_length) { - m_p_qp_mgr->dm_release_data(buff); + m_hqtx->dm_release_data(buff); } // Potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(buff->lwip_pbuf.pbuf.ref)) { - buff->lwip_pbuf.pbuf.ref--; + if (likely(buff->lwip_pbuf.ref)) { + buff->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", buff); } - if (buff->lwip_pbuf.pbuf.ref == 0) { - descq_t &pool = buff->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY ? m_zc_pool : m_tx_pool; + if (buff->lwip_pbuf.ref == 0) { + descq_t &pool = buff->lwip_pbuf.type == PBUF_ZEROCOPY ? 
m_zc_pool : m_tx_pool; buff->p_next_desc = nullptr; free_lwip_pbuf(&buff->lwip_pbuf); pool.push_back(buff); @@ -1106,37 +1090,56 @@ void ring_simple::adapt_cq_moderation() m_lock_ring_rx.unlock(); } -void ring_simple::start_active_qp_mgr() +void ring_simple::start_active_queue_tx() { - m_lock_ring_rx.lock(); m_lock_ring_tx.lock(); - if (!m_up) { + if (!m_up_tx) { /* TODO: consider avoid using sleep */ /* coverity[sleep] */ - m_p_qp_mgr->up(); - m_up = true; + m_hqtx->up(); + m_up_tx = true; } m_lock_ring_tx.unlock(); - m_lock_ring_rx.unlock(); } -void ring_simple::stop_active_qp_mgr() +void ring_simple::start_active_queue_rx() { m_lock_ring_rx.lock(); + if (!m_up_rx) { + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_hqrx->up(); + m_up_rx = true; + } + m_lock_ring_rx.unlock(); +} + +void ring_simple::stop_active_queue_tx() +{ m_lock_ring_tx.lock(); - if (m_up) { - m_up = false; + if (m_up_tx) { + m_up_tx = false; /* TODO: consider avoid using sleep */ /* coverity[sleep] */ - m_p_qp_mgr->down(); + m_hqtx->down(); } m_lock_ring_tx.unlock(); +} +void ring_simple::stop_active_queue_rx() +{ + m_lock_ring_rx.lock(); + if (m_up_rx) { + m_up_rx = false; + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_hqrx->down(); + } m_lock_ring_rx.unlock(); } bool ring_simple::is_up() { - return m_up; + return m_up_tx && m_up_rx; } int ring_simple::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) @@ -1152,58 +1155,48 @@ int ring_simple::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) return -1; } - uint32_t rl_changes = m_p_qp_mgr->is_ratelimit_change(rate_limit); + uint32_t rl_changes = m_hqtx->is_ratelimit_change(rate_limit); - if (m_up && rl_changes) { - return m_p_qp_mgr->modify_qp_ratelimit(rate_limit, rl_changes); + if (m_up_tx && rl_changes) { + return m_hqtx->modify_qp_ratelimit(rate_limit, rl_changes); } return 0; } -uint32_t ring_simple::get_tx_user_lkey(void *addr, size_t length, void *p_mapping /*=NULL*/) +uint32_t ring_simple::get_tx_user_lkey(void *addr, size_t length) { uint32_t lkey; /* - * Current implementation supports 2 modes: - * 1. Per ring registration cache where addr is the key - * 2. Proxy query to an external mapping object + * Current implementation supports a ring registration cache where addr is the key. * - * The 1st mode is used for send zerocopy and the 2nd made is used for - * sendfile offload. These 2 modes are differentiated by p_mapping - * value. It is NULL in the 1st case. + * The mode is used for send zerocopy. * - * TODO In the 1st mode we don't support memory deregistration. + * TODO The mode doesnn't support memory deregistration. 
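 *
 * Illustrative sketch (not part of the patch; `pd` and `cache` are hypothetical
 * stand-ins for the device's protection domain and m_user_lkey_map) of the
 * lookup-or-register flow this cache performs, in plain libibverbs terms:
 *
 *   auto it = cache.find(addr);                  // addr -> lkey hit?
 *   if (it != cache.end()) return it->second;    // reuse the existing registration
 *   struct ibv_mr *mr = ibv_reg_mr(pd, addr, length, IBV_ACCESS_LOCAL_WRITE);
 *   if (!mr) return LKEY_ERROR;                  // registration failed
 *   cache[addr] = mr->lkey;                      // remember it for later sends
 *   return mr->lkey;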
*/ - if (p_mapping == NULL) { - auto iter = m_user_lkey_map.find(addr); - if (iter != m_user_lkey_map.end()) { - lkey = iter->second; + auto iter = m_user_lkey_map.find(addr); + if (iter != m_user_lkey_map.end()) { + lkey = iter->second; + } else { + lkey = m_p_ib_ctx->user_mem_reg(addr, length, XLIO_IBV_ACCESS_LOCAL_WRITE); + if (lkey == LKEY_ERROR) { + ring_logerr("Can't register user memory addr %p len %lx", addr, length); } else { - lkey = m_p_ib_ctx->user_mem_reg(addr, length, XLIO_IBV_ACCESS_LOCAL_WRITE); - if (lkey == LKEY_ERROR) { - ring_logerr("Can't register user memory addr %p len %lx", addr, length); - } else { - m_user_lkey_map[addr] = lkey; - } + m_user_lkey_map[addr] = lkey; } - } else { - mapping_t *mapping = (mapping_t *)p_mapping; - lkey = mapping->get_lkey(NULL, m_p_ib_ctx, addr, length); } - return lkey; } uint32_t ring_simple::get_max_inline_data() { - return m_p_qp_mgr->get_max_inline_data(); + return m_hqtx->get_max_inline_data(); } uint32_t ring_simple::get_max_send_sge(void) { - return m_p_qp_mgr->get_max_send_sge(); + return m_hqtx->get_max_send_sge(); } uint32_t ring_simple::get_max_payload_sz(void) diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index cf96e4377..080b11a01 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -39,7 +39,8 @@ #include #include "dev/gro_mgr.h" -#include "dev/qp_mgr.h" +#include "dev/hw_queue_tx.h" +#include "dev/hw_queue_rx.h" #include "dev/net_device_table_mgr.h" struct cq_moderation_info { @@ -66,7 +67,7 @@ class ring_simple : public ring_slave { int request_notification(cq_type_t cq_type, uint64_t poll_sn) override; int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) override; + void *pv_fd_ready_array = nullptr) override; int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) override; void adapt_cq_moderation() override; bool reclaim_recv_buffers(descq_t *rx_reuse) override; @@ -78,15 +79,17 @@ class ring_simple : public ring_slave { unsigned int ncompletions, int flags) override; int drain_and_proccess() override; int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) override; + void *pv_fd_ready_array = nullptr) override; void mem_buf_desc_return_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc); void mem_buf_desc_return_to_owner_rx(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); inline int send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis); bool is_up() override; - void start_active_qp_mgr(); - void stop_active_qp_mgr(); + void start_active_queue_tx(); + void start_active_queue_rx(); + void stop_active_queue_tx(); + void stop_active_queue_rx(); mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1) override; int mem_buf_tx_release(mem_buf_desc_t *p_mem_buf_desc_list, bool b_accounting, @@ -95,6 +98,13 @@ class ring_simple : public ring_slave { xlio_wr_tx_packet_attr attr) override; int send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis) override; + + void ring_delayed_doorbell() override + { + std::lock_guard lock(m_lock_ring_tx); + m_hqtx->ring_delayed_doorbell(); + } + void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc) override; void mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf_desc, unsigned ref) override; @@ -110,7 +120,7 @@ class 
ring_simple : public ring_slave { { return m_p_tx_comp_event_channel ? m_p_tx_comp_event_channel->fd : -1; } - uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) override; + uint32_t get_tx_user_lkey(void *addr, size_t length) override; uint32_t get_max_inline_data() override; ib_ctx_handler *get_ctx(ring_user_id_t id) override { @@ -138,7 +148,7 @@ class ring_simple : public ring_slave { { std::lock_guard lock(m_lock_ring_tx); - xlio_tis *tis = m_p_qp_mgr->tls_context_setup_tx(info); + xlio_tis *tis = m_hqtx->tls_context_setup_tx(info); if (likely(tis != NULL)) { ++m_p_ring_stat->n_tx_tls_contexts; } @@ -156,7 +166,7 @@ class ring_simple : public ring_slave { * Locking is required for TX ring with cached=true. */ std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->tls_create_tir(cached); + return m_hqrx->tls_create_tir(cached); } int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, void *callback_arg) override @@ -165,7 +175,7 @@ class ring_simple : public ring_slave { std::lock_guard lock(m_lock_ring_tx); int rc = - m_p_qp_mgr->tls_context_setup_rx(tir, info, next_record_tcp_sn, callback, callback_arg); + m_hqtx->tls_context_setup_rx(tir, info, next_record_tcp_sn, callback, callback_arg); if (likely(rc == 0)) { ++m_p_ring_stat->n_rx_tls_contexts; } @@ -179,7 +189,7 @@ class ring_simple : public ring_slave { void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_context_resync_tx(info, tis, skip_static); + m_hqtx->tls_context_resync_tx(info, tis, skip_static); uint64_t dummy_poll_sn = 0; m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); @@ -187,7 +197,7 @@ class ring_simple : public ring_slave { void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_resync_rx(tir, info, hw_resync_tcp_sn); + m_hqtx->tls_resync_rx(tir, info, hw_resync_tcp_sn); } void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override { @@ -195,7 +205,7 @@ class ring_simple : public ring_slave { if (lkey == LKEY_TX_DEFAULT) { lkey = m_tx_lkey; } - m_p_qp_mgr->tls_get_progress_params_rx(tir, buf, lkey); + m_hqtx->tls_get_progress_params_rx(tir, buf, lkey); /* Do polling to speedup handling of the completion. 
*/ uint64_t dummy_poll_sn = 0; m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); @@ -203,13 +213,13 @@ class ring_simple : public ring_slave { void tls_release_tis(xlio_tis *tis) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_release_tis(tis); + m_hqtx->tls_release_tis(tis); } void tls_release_tir(xlio_tir *tir) override { /* TIR objects are protected with TX lock */ std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_release_tir(tir); + m_hqrx->tls_release_tir(tir); } void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override @@ -218,21 +228,21 @@ class ring_simple : public ring_slave { if (lkey == LKEY_TX_DEFAULT) { lkey = m_tx_lkey; } - m_p_qp_mgr->tls_tx_post_dump_wqe(tis, addr, len, lkey, first); + m_hqtx->tls_tx_post_dump_wqe(tis, addr, len, lkey, first); } #endif /* DEFINED_UTLS */ -#ifdef DEFINED_DPCP + std::unique_ptr create_tis(uint32_t flags) const override { std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->create_tis(flags); + return m_hqtx->create_tis(flags); } int get_supported_nvme_feature_mask() const override { dpcp::adapter_hca_capabilities caps {}; auto adapter = m_p_ib_ctx->get_dpcp_adapter(); - if (adapter == nullptr || (dpcp::DPCP_OK != adapter->get_hca_capabilities(caps)) || + if (!adapter || (dpcp::DPCP_OK != adapter->get_hca_capabilities(caps)) || !caps.nvmeotcp_caps.enabled) { return 0; } @@ -244,54 +254,52 @@ class ring_simple : public ring_slave { void nvme_set_static_context(xlio_tis *tis, uint32_t config) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->nvme_set_static_context(tis, config); + m_hqtx->nvme_set_static_context(tis, config); } void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->nvme_set_progress_context(tis, tcp_seqno); + m_hqtx->nvme_set_progress_context(tis, tcp_seqno); } -#endif /* DEFINED_DPCP */ void post_nop_fence(void) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->post_nop_fence(); + m_hqtx->post_nop_fence(); } void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool is_first) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->post_dump_wqe(tis, addr, len, lkey, is_first); + m_hqtx->post_dump_wqe(tis, addr, len, lkey, is_first); } void reset_inflight_zc_buffers_ctx(ring_user_id_t id, void *ctx) override { std::lock_guard lock(m_lock_ring_tx); NOT_IN_USE(id); - m_p_qp_mgr->reset_inflight_zc_buffers_ctx(ctx); + m_hqtx->reset_inflight_zc_buffers_ctx(ctx); } bool credits_get(unsigned credits) override { std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->credits_get(credits); + return m_hqtx->credits_get(credits); } void credits_return(unsigned credits) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->credits_return(credits); + m_hqtx->credits_return(credits); } - friend class cq_mgr; - friend class cq_mgr_mlx5; - friend class cq_mgr_mlx5_strq; - friend class qp_mgr; - friend class qp_mgr_eth_mlx5; - friend class qp_mgr_eth_mlx5_dpcp; + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; + friend class cq_mgr_rx_strq; + friend class hw_queue_tx; + friend class hw_queue_rx; friend class rfs; friend class rfs_uc; friend class rfs_uc_tcp_gro; @@ -299,7 +307,6 @@ class ring_simple : public ring_slave { friend class ring_bond; protected: - virtual qp_mgr *create_qp_mgr(struct qp_mgr_desc *desc) = 0; void create_resources(); virtual void init_tx_buffers(uint32_t count); void 
inc_cq_moderation_stats(size_t sz_data) override; @@ -308,35 +315,6 @@ class ring_simple : public ring_slave { inline uint32_t get_mtu() { return m_mtu; } private: - bool is_socketxtreme(void) override { return safe_mce_sys().enable_socketxtreme; } - - void put_ec(struct ring_ec *ec) override - { - m_socketxtreme.lock_ec_list.lock(); - list_add_tail(&ec->list, &m_socketxtreme.ec_list); - m_socketxtreme.lock_ec_list.unlock(); - } - - void del_ec(struct ring_ec *ec) override - { - m_socketxtreme.lock_ec_list.lock(); - list_del_init(&ec->list); - ec->clear(); - m_socketxtreme.lock_ec_list.unlock(); - } - - inline ring_ec *get_ec(void) - { - struct ring_ec *ec = NULL; - - m_socketxtreme.lock_ec_list.lock(); - if (!list_empty(&m_socketxtreme.ec_list)) { - ec = list_entry(m_socketxtreme.ec_list.next, struct ring_ec, list); - list_del_init(&ec->list); - } - m_socketxtreme.lock_ec_list.unlock(); - return ec; - } inline void send_status_handler(int ret, xlio_ibv_send_wr *p_send_wqe); inline mem_buf_desc_t *get_tx_buffers(pbuf_type type, uint32_t n_num_mem_bufs); inline int put_tx_buffer_helper(mem_buf_desc_t *buff); @@ -354,42 +332,32 @@ class ring_simple : public ring_slave { if (m_p_l2_addr) { delete m_p_l2_addr; } - m_p_l2_addr = NULL; + m_p_l2_addr = nullptr; }; protected: ib_ctx_handler *m_p_ib_ctx; - qp_mgr *m_p_qp_mgr; + hw_queue_tx *m_hqtx = nullptr; + hw_queue_rx *m_hqrx = nullptr; struct cq_moderation_info m_cq_moderation_info; - cq_mgr *m_p_cq_mgr_rx; - cq_mgr *m_p_cq_mgr_tx; + cq_mgr_rx *m_p_cq_mgr_rx = nullptr; + cq_mgr_tx *m_p_cq_mgr_tx = nullptr; std::unordered_map m_user_lkey_map; private: - struct { - /* queue of event completion elements - * this queue is stored events related different sockinfo (sockets) - * In current implementation every sockinfo (socket) can have single event - * in this queue - */ - struct list_head ec_list; - - /* Thread-safety lock for get/put operations under the queue */ - lock_spin lock_ec_list; - } m_socketxtreme; - lock_mutex m_lock_ring_tx_buf_wait; - uint32_t m_tx_num_bufs; - uint32_t m_zc_num_bufs; - uint32_t m_tx_num_wr; - uint32_t m_missing_buf_ref_count; - uint32_t m_tx_lkey; // this is the registered memory lkey for a given specific device for the - // buffer pool use + uint32_t m_tx_num_bufs = 0U; + uint32_t m_zc_num_bufs = 0U; + uint32_t m_tx_num_wr = 0U; + uint32_t m_missing_buf_ref_count = 0U; + uint32_t m_tx_lkey = 0U; // this is the registered memory lkey for a given specific device for + // the buffer pool use gro_mgr m_gro_mgr; - bool m_up; - struct ibv_comp_channel *m_p_rx_comp_event_channel; - struct ibv_comp_channel *m_p_tx_comp_event_channel; - L2_address *m_p_l2_addr; + bool m_up_tx = false; + bool m_up_rx = false; + struct ibv_comp_channel *m_p_rx_comp_event_channel = nullptr; + struct ibv_comp_channel *m_p_tx_comp_event_channel = nullptr; + L2_address *m_p_l2_addr = nullptr; uint32_t m_mtu; struct { @@ -440,28 +408,20 @@ class ring_simple : public ring_slave { class ring_eth : public ring_simple { public: - ring_eth(int if_index, ring *parent = NULL, ring_type_t type = RING_ETH, + ring_eth(int if_index, ring *parent = nullptr, ring_type_t type = RING_ETH, bool call_create_res = true, bool use_locks = true) : ring_simple(if_index, parent, type, use_locks) { net_device_val_eth *p_ndev = dynamic_cast( g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index())); if (p_ndev) { - m_partition = p_ndev->get_vlan(); + m_vlan = p_ndev->get_vlan(); - /* Do resource initialization for - * ring_eth_direct, ring_eth_cb inside 
related - * constructors because - * they use own create_qp_mgr() methods - */ if (call_create_res) { create_resources(); } } } - -protected: - qp_mgr *create_qp_mgr(struct qp_mgr_desc *desc) override; }; #endif // RING_SIMPLE_H diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 7e5e598f1..c6bb055b0 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -47,11 +47,13 @@ // AF_INET address 0.0.0.0:0, used for 3T flow spec keys. static const sock_addr s_sock_addrany; +static thread_local lock_dummy t_lock_dummy_ring; + static lock_base *get_new_lock(const char *name, bool real_lock) { return (real_lock ? static_cast(multilock::create_new_lock(MULTILOCK_RECURSIVE, name)) - : static_cast(new lock_dummy())); + : static_cast(&t_lock_dummy_ring)); } ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_locks) @@ -61,14 +63,14 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo , m_lock_ring_rx(get_new_lock("ring_slave:lock_rx", use_locks)) , m_lock_ring_tx(get_new_lock("ring_slave:lock_tx", use_locks)) , m_p_ring_stat(new ring_stats_t) - , m_partition(0) + , m_vlan(0) , m_flow_tag_enabled(false) , m_b_sysvar_eth_mc_l2_only_rules(safe_mce_sys().eth_mc_l2_only_rules) , m_b_sysvar_mc_force_flowtag(safe_mce_sys().mc_force_flowtag) , m_type(type) { - net_device_val *p_ndev = NULL; - const slave_data_t *p_slave = NULL; + net_device_val *p_ndev = nullptr; + const slave_data_t *p_slave = nullptr; /* Configure ring() fields */ set_parent(parent); @@ -76,7 +78,7 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo /* Sanity check */ p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Invalid if_index = %d", if_index); } @@ -122,7 +124,8 @@ ring_slave::~ring_slave() void ring_slave::print_val() { ring_logdbg("%d: %p: parent %p type %s", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent), ring_type_str[m_type]); + ((uintptr_t)this == (uintptr_t)m_parent ? nullptr : m_parent), + ring_type_str[m_type]); } void ring_slave::restart() @@ -157,14 +160,14 @@ void ring_slave::inc_tx_retransmissions_stats(ring_user_id_t) } template -bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, +bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { rfs *p_rfs; - rfs *p_tmp_rfs = NULL; + rfs *p_tmp_rfs = nullptr; sockinfo *si = static_cast(sink); - if (si == NULL) { + if (!si) { return false; } @@ -184,7 +187,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); sock_addr rule_key(flow_spec_5t.get_family(), &flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); - rfs_rule_filter *dst_port_filter = NULL; + rfs_rule_filter *dst_port_filter = nullptr; if (safe_mce_sys().udp_3t_rules) { auto dst_port_iter = m_ring.m_udp_uc_dst_port_attach_map.find(rule_key); if (dst_port_iter == m_ring.m_udp_uc_dst_port_attach_map.end()) { @@ -220,7 +223,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return false; } BULLSEYE_EXCLUDE_BLOCK_START - if (p_tmp_rfs == NULL) { + if (!p_tmp_rfs) { ring_logerr("Failed to allocate rfs!"); return false; } @@ -257,7 +260,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, // It means that for every MC group, even if we have sockets with different ports - only one // rule in the HW. 
So the hash map below keeps track of the number of sockets per rule so we // know when to call ibv_attach and ibv_detach - rfs_rule_filter *l2_mc_ip_filter = NULL; + rfs_rule_filter *l2_mc_ip_filter = nullptr; if (m_ring.m_b_sysvar_eth_mc_l2_only_rules) { auto l2_mc_iter = m_ring.m_l2_mc_ip_attach_map.find(rule_key); // It means that this is the first time attach called with this MC ip @@ -297,7 +300,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); sock_addr rule_key(flow_spec_5t.get_family(), &flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); - rfs_rule_filter *dst_port_filter = NULL; + rfs_rule_filter *dst_port_filter = nullptr; if (safe_mce_sys().tcp_3t_rules) { auto dst_port_iter = m_ring.m_tcp_dst_port_attach_map.find(rule_key); if (dst_port_iter == m_ring.m_tcp_dst_port_attach_map.end()) { @@ -339,14 +342,14 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return false; } BULLSEYE_EXCLUDE_BLOCK_START - if (p_tmp_rfs == NULL) { + if (!p_tmp_rfs) { ring_logerr("Failed to allocate rfs!"); return false; } BULLSEYE_EXCLUDE_BLOCK_END p_rfs = p_tmp_rfs; - si->rfs_ptr = p_rfs; + si->set_rfs_ptr(p_rfs); #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app->type == APP_NONE || !g_p_app->add_second_4t_rule) #endif @@ -386,7 +389,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return ret; } -bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { std::lock_guard lock(m_lock_ring_rx); @@ -396,9 +399,9 @@ bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool } template -bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { - rfs *p_rfs = NULL; + rfs *p_rfs = nullptr; ring_logdbg("flow: %s, with sink (%p)", flow_spec_5t.to_str().c_str(), sink); @@ -520,7 +523,7 @@ bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, return true; } -bool ring_slave::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_slave::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { std::lock_guard lock(m_lock_ring_rx); @@ -592,14 +595,14 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd if (likely(m_flow_tag_enabled && p_rx_wc_buf_desc->rx.flow_tag_id && p_rx_wc_buf_desc->rx.flow_tag_id != FLOW_TAG_MASK && !p_rx_wc_buf_desc->rx.is_sw_csum_need)) { - sockinfo *si = NULL; + sockinfo *si = nullptr; // trying to get sockinfo per flow_tag_id-1 as it was incremented at attach // to allow mapping sockfd=0 assert(g_p_fd_collection); si = static_cast( g_p_fd_collection->get_sockfd(p_rx_wc_buf_desc->rx.flow_tag_id - 1)); - if (likely((si != NULL) && si->flow_tag_enabled())) { + if (likely(si)) { // will process packets with set flow_tag_id and enabled for the socket if (p_eth_h->h_proto == NET_ETH_P_8021Q) { // Handle VLAN header as next protocol @@ -643,10 +646,6 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd if (likely(protocol == IPPROTO_TCP)) { struct tcphdr *p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + ip_hdr_len); - // Update the L3 and L4 info - p_rx_wc_buf_desc->rx.src.set_ip_port(family, saddr, p_tcp_h->source); - p_rx_wc_buf_desc->rx.dst.set_ip_port(family, daddr, p_tcp_h->dest); - // Update packet descriptor with datagram base address and 
length p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t *)p_tcp_h + sizeof(struct tcphdr); p_rx_wc_buf_desc->rx.frag.iov_len = ip_payload_len - sizeof(struct tcphdr); @@ -665,7 +664,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd p_tcp_h->fin ? "F" : "", ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window), p_rx_wc_buf_desc->rx.sz_payload); - return si->rfs_ptr->rx_dispatch_packet(p_rx_wc_buf_desc, pv_fd_ready_array); + return si->get_rfs_ptr()->rx_dispatch_packet(p_rx_wc_buf_desc, pv_fd_ready_array); } if (likely(protocol == IPPROTO_UDP)) { @@ -706,7 +705,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd ETH_HW_ADDR_PRINT_ADDR(p_eth_h->h_source), htons(h_proto)); // Handle VLAN header as next protocol - struct vlanhdr *p_vlan_hdr = NULL; + struct vlanhdr *p_vlan_hdr = nullptr; uint16_t packet_vlan = 0; if (h_proto == NET_ETH_P_8021Q) { p_vlan_hdr = (struct vlanhdr *)((uint8_t *)p_eth_h + ETH_HDR_LEN); @@ -721,9 +720,9 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd // TODO: Remove this code when handling vlan in flow steering will be available. Change this // code if vlan stripping is performed. - if ((m_partition & VLAN_VID_MASK) != packet_vlan) { + if ((m_vlan & VLAN_VID_MASK) != packet_vlan) { ring_logfunc("Rx buffer dropped- Mismatched vlan. Packet vlan = %d, Local vlan = %d", - packet_vlan, m_partition & VLAN_VID_MASK); + packet_vlan, m_vlan & VLAN_VID_MASK); return false; } @@ -965,7 +964,7 @@ bool steering_handler::rx_process_buffer_no_flow_id( p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - hdr_data.ip_hdr_len; // Add ip fragment packet to out fragment manager - mem_buf_desc_t *new_buf = NULL; + mem_buf_desc_t *new_buf = nullptr; int ret = -1; if (g_p_ip_frag_manager) { ret = g_p_ip_frag_manager->add_frag(p_ip_h, p_rx_wc_buf_desc, &new_buf); diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 80c802db3..439e972b6 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -233,7 +233,7 @@ inline bool operator==(flow_spec_4t_key_ipv6 const &key1, flow_spec_4t_key_ipv6 struct counter_and_ibv_flows { int counter; - std::vector rfs_rule_vec; + rfs_rule *rfs_rule_holder = nullptr; }; typedef std::unordered_map rule_filter_map_t; @@ -247,8 +247,8 @@ template class steering_handler { { } - bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); inline bool rx_process_buffer_no_flow_id(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array, HDR *p_ip_h); @@ -292,8 +292,8 @@ class ring_slave : public ring { virtual int reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) = 0; virtual void inc_cq_moderation_stats(size_t sz_data) = 0; - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); #ifdef DEFINED_UTLS /* Call this method in an RX ring. 
*/ @@ -328,7 +328,7 @@ class ring_slave : public ring { descq_t m_zc_pool; transport_type_t m_transport_type; /* transport ETH/IB */ std::unique_ptr m_p_ring_stat; - uint16_t m_partition; + uint16_t m_vlan; bool m_flow_tag_enabled; const bool m_b_sysvar_eth_mc_l2_only_rules; const bool m_b_sysvar_mc_force_flowtag; diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp index 66c395951..377372c1c 100644 --- a/src/core/dev/ring_tap.cpp +++ b/src/core/dev/ring_tap.cpp @@ -45,7 +45,7 @@ ring_tap::ring_tap(int if_index, ring *parent) : ring_slave(if_index, parent, RING_TAP, true) , m_tap_fd(-1) - , m_vf_ring(NULL) + , m_vf_ring(nullptr) , m_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) , m_tap_data_available(false) { @@ -120,7 +120,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) unsigned char hw_addr[ETH_ALEN]; /* Open TAP device */ - if ((m_tap_fd = orig_os_api.open("/dev/net/tun", O_RDWR)) < 0) { + if ((m_tap_fd = SYSCALL(open, "/dev/net/tun", O_RDWR)) < 0) { ring_logerr("FAILED to open tap %m"); rc = -errno; goto error; @@ -146,14 +146,14 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Setting TAP attributes */ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE; - if ((rc = orig_os_api.ioctl(m_tap_fd, TUNSETIFF, (void *)&ifr)) < 0) { + if ((rc = SYSCALL(ioctl, m_tap_fd, TUNSETIFF, (void *)&ifr)) < 0) { ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); rc = -errno; goto error; } /* Set TAP fd nonblocking */ - if ((rc = orig_os_api.fcntl(m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { + if ((rc = SYSCALL(fcntl, m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); rc = -errno; goto error; @@ -168,7 +168,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) } /* Create socket */ - if ((ioctl_sock = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + if ((ioctl_sock = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0)) < 0) { ring_logerr("FAILED to open socket"); rc = -errno; goto error; @@ -178,7 +178,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) ifr.ifr_hwaddr.sa_family = AF_LOCAL; get_local_ll_addr(p_ndev->get_ifname_link(), hw_addr, ETH_ALEN, false); memcpy(ifr.ifr_hwaddr.sa_data, hw_addr, ETH_ALEN); - if ((rc = orig_os_api.ioctl(ioctl_sock, SIOCSIFHWADDR, &ifr)) < 0) { + if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFHWADDR, &ifr)) < 0) { ring_logerr("ioctl SIOCSIFHWADDR failed %d %m, %s", rc, tap_name); rc = -errno; goto error; @@ -186,7 +186,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Set link UP */ ifr.ifr_flags |= (IFF_UP | IFF_SLAVE); - if ((rc = orig_os_api.ioctl(ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { + if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { ring_logerr("ioctl SIOCGIFFLAGS failed %d %m, %s", rc, tap_name); rc = -errno; goto error; @@ -203,7 +203,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Update if_index on ring class */ set_if_index(tap_if_index); - orig_os_api.close(ioctl_sock); + SYSCALL(close, ioctl_sock); ring_logdbg("Tap device %d: %s [fd=%d] was created successfully", tap_if_index, ifr.ifr_name, m_tap_fd); @@ -214,11 +214,11 @@ void ring_tap::tap_create(net_device_val *p_ndev) ring_logerr("Tap device creation failed %d, %m", rc); if (ioctl_sock >= 0) { - orig_os_api.close(ioctl_sock); + SYSCALL(close, ioctl_sock); } if (m_tap_fd >= 0) { - orig_os_api.close(m_tap_fd); + SYSCALL(close, m_tap_fd); } m_tap_fd = -1; @@ -227,13 +227,13 @@ void ring_tap::tap_create(net_device_val *p_ndev) void ring_tap::tap_destroy() { if (m_tap_fd >= 0) { - 
orig_os_api.close(m_tap_fd); + SYSCALL(close, m_tap_fd); m_tap_fd = -1; } } -bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { std::lock_guard lock(m_lock_ring_rx); bool ret = ring_slave::attach_flow(flow_spec_5t, sink, force_5t); @@ -254,7 +254,7 @@ bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool f return ret; } -bool ring_tap::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_tap::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { std::lock_guard lock(m_lock_ring_rx); bool ret = ring_slave::detach_flow(flow_spec_5t, sink); @@ -286,7 +286,7 @@ int ring_tap::wait_for_notification_and_process_element(int, uint64_t *, void *p int ring_tap::drain_and_proccess() { - return process_element_rx(NULL); + return process_element_rx(nullptr); } bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) @@ -309,14 +309,14 @@ bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) bool ring_tap::reclaim_recv_buffers(mem_buf_desc_t *buff) { if (buff && (buff->dec_ref_count() <= 1)) { - mem_buf_desc_t *temp = NULL; + mem_buf_desc_t *temp = nullptr; while (buff) { if (buff->lwip_pbuf_dec_ref_count() <= 0) { temp = buff; buff = temp->p_next_desc; temp->clear_transport_data(); - temp->p_next_desc = NULL; - temp->p_prev_desc = NULL; + temp->p_next_desc = nullptr; + temp->p_prev_desc = nullptr; temp->reset_ref_count(); free_lwip_pbuf(&temp->lwip_pbuf); m_rx_pool.push_back(temp); @@ -425,7 +425,7 @@ int ring_tap::process_element_rx(void *pv_fd_ready_array) std::lock_guard lock(m_lock_ring_rx); if (m_rx_pool.size() || request_more_rx_buffers()) { mem_buf_desc_t *buff = m_rx_pool.get_and_pop_front(); - ret = orig_os_api.read(m_tap_fd, buff->p_buffer, buff->sz_buffer); + ret = SYSCALL(read, m_tap_fd, buff->p_buffer, buff->sz_buffer); if (ret > 0) { /* Data was read and processed successfully */ buff->sz_data = ret; @@ -469,7 +469,7 @@ bool ring_tap::request_more_rx_buffers() mem_buf_desc_t *ring_tap::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs) { - mem_buf_desc_t *head = NULL; + mem_buf_desc_t *head = nullptr; NOT_IN_USE(id); NOT_IN_USE(b_block); @@ -489,14 +489,14 @@ mem_buf_desc_t *ring_tap::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_t } head = m_tx_pool.get_and_pop_back(); - head->lwip_pbuf.pbuf.ref = 1; + head->lwip_pbuf.ref = 1; n_num_mem_bufs--; mem_buf_desc_t *next = head; while (n_num_mem_bufs) { next->p_next_desc = m_tx_pool.get_and_pop_back(); next = next->p_next_desc; - next->lwip_pbuf.pbuf.ref = 1; + next->lwip_pbuf.ref = 1; n_num_mem_bufs--; } @@ -520,15 +520,15 @@ void ring_tap::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_ if (likely(p_mem_buf_desc)) { // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(p_mem_buf_desc->lwip_pbuf.pbuf.ref)) { - p_mem_buf_desc->lwip_pbuf.pbuf.ref--; + if (likely(p_mem_buf_desc->lwip_pbuf.ref)) { + p_mem_buf_desc->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", p_mem_buf_desc); } - if (p_mem_buf_desc->lwip_pbuf.pbuf.ref == 0) { - p_mem_buf_desc->p_next_desc = NULL; - if (unlikely(p_mem_buf_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY)) { + if (p_mem_buf_desc->lwip_pbuf.ref == 0) { + p_mem_buf_desc->p_next_desc = nullptr; + if (unlikely(p_mem_buf_desc->lwip_pbuf.type == PBUF_ZEROCOPY)) { 
g_buffer_pool_zc->put_buffers_thread_safe(p_mem_buf_desc); return; } @@ -547,8 +547,7 @@ void ring_tap::mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf_de } m_lock_ring_tx.lock(); - p_mem_buf_desc->lwip_pbuf.pbuf.ref -= - std::min(p_mem_buf_desc->lwip_pbuf.pbuf.ref, ref - 1); + p_mem_buf_desc->lwip_pbuf.ref -= std::min(p_mem_buf_desc->lwip_pbuf.ref, ref - 1); m_lock_ring_tx.unlock(); mem_buf_desc_return_single_to_owner_tx(p_mem_buf_desc); } @@ -568,17 +567,17 @@ int ring_tap::mem_buf_tx_release(mem_buf_desc_t *buff_list, bool b_accounting, b while (buff_list) { next = buff_list->p_next_desc; - buff_list->p_next_desc = NULL; + buff_list->p_next_desc = nullptr; // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(buff_list->lwip_pbuf.pbuf.ref)) { - buff_list->lwip_pbuf.pbuf.ref--; + if (likely(buff_list->lwip_pbuf.ref)) { + buff_list->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", buff_list); } - if (buff_list->lwip_pbuf.pbuf.ref == 0) { + if (buff_list->lwip_pbuf.ref == 0) { free_lwip_pbuf(&buff_list->lwip_pbuf); m_tx_pool.push_back(buff_list); freed++; @@ -607,7 +606,7 @@ int ring_tap::send_buffer(xlio_ibv_send_wr *wr, xlio_wr_tx_packet_attr attr) iovec[i].iov_len = wr->sg_list[i].length; } - ret = orig_os_api.writev(m_tap_fd, iovec, wr->num_sge); + ret = SYSCALL(writev, m_tap_fd, iovec, wr->num_sge); if (ret < 0) { ring_logdbg("writev: tap_fd %d, errno: %d\n", m_tap_fd, errno); } diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index 9077ce26a..b358e292f 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -42,8 +42,8 @@ class ring_tap : public ring_slave { virtual ~ring_tap(); virtual bool is_up() { return (m_vf_ring || m_active); } - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) { @@ -51,7 +51,7 @@ class ring_tap : public ring_slave { return 0; } virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual int drain_and_proccess(); virtual bool reclaim_recv_buffers(descq_t *rx_reuse); virtual bool reclaim_recv_buffers(mem_buf_desc_t *buff); @@ -99,9 +99,8 @@ class ring_tap : public ring_slave { return 0; } void inc_cq_moderation_stats(size_t sz_data) { NOT_IN_USE(sz_data); } - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) { - NOT_IN_USE(p_mapping); NOT_IN_USE(addr); NOT_IN_USE(length); return LKEY_ERROR; @@ -110,7 +109,7 @@ class ring_tap : public ring_slave { ib_ctx_handler *get_ctx(ring_user_id_t id) { NOT_IN_USE(id); - return NULL; + return nullptr; } virtual uint32_t get_max_send_sge(void) { return 1; } virtual uint32_t get_max_payload_sz(void) { return 0; } @@ -137,10 +136,6 @@ class ring_tap : public ring_slave { void tap_create(net_device_val *p_ndev); void tap_destroy(); - bool is_socketxtreme(void) { return false; } - void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } 
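// Illustrative sketch only (not part of this patch): the ring_tap hunks above switch from
// direct orig_os_api.<fn>(...) calls to a SYSCALL(<fn>, ...) macro. The real definitions live
// in core/sock/sock-redirect.h; the struct and macro below are simplified assumptions that
// show the forwarding idea without any of the library's interception machinery.
#include <fcntl.h>
#include <unistd.h>

struct os_api_sketch {
    int (*open)(const char *pathname, int flags, ...);
    int (*close)(int fd);
};
static os_api_sketch orig_os_api_sketch = {::open, ::close};

// Forward the named call to the saved OS entry point, bypassing any interception.
#define SYSCALL(__name, ...) orig_os_api_sketch.__name(__VA_ARGS__)

static int syscall_macro_demo()
{
    int fd = SYSCALL(open, "/dev/null", O_RDONLY); // expands to orig_os_api_sketch.open(...)
    return (fd >= 0) ? SYSCALL(close, fd) : -1;
}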
- void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - /* These fields are NETVSC mode specific */ int m_tap_fd; /* file descriptor of tap device */ ring_slave *m_vf_ring; diff --git a/src/core/dev/time_converter.cpp b/src/core/dev/time_converter.cpp index e2d3991a2..3d1da4ca1 100644 --- a/src/core/dev/time_converter.cpp +++ b/src/core/dev/time_converter.cpp @@ -188,7 +188,7 @@ void time_converter::clean_obj() } set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; if (g_p_event_handler_manager->is_running()) { g_p_event_handler_manager->unregister_timers_event_and_delete(this); } else { diff --git a/src/core/dev/time_converter_ib_ctx.cpp b/src/core/dev/time_converter_ib_ctx.cpp index 378e95a45..1a72eaf41 100644 --- a/src/core/dev/time_converter_ib_ctx.cpp +++ b/src/core/dev/time_converter_ib_ctx.cpp @@ -71,11 +71,11 @@ time_converter_ib_ctx::time_converter_ib_ctx(struct ibv_context *ctx, m_converter_status = TS_CONVERSION_MODE_SYNC; g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_FIRST_ONESHOT_MS, - this, ONE_SHOT_TIMER, 0); + this, ONE_SHOT_TIMER, nullptr); g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_SECOND_ONESHOT_MS, - this, ONE_SHOT_TIMER, 0); + this, ONE_SHOT_TIMER, nullptr); m_timer_handle = g_p_event_handler_manager->register_timer_event( - UPDATE_HW_TIMER_PERIOD_MS, this, PERIODIC_TIMER, 0); + UPDATE_HW_TIMER_PERIOD_MS, this, PERIODIC_TIMER, nullptr); } } } diff --git a/src/core/dev/time_converter_ptp.cpp b/src/core/dev/time_converter_ptp.cpp index fdb1de1c3..517ff8377 100644 --- a/src/core/dev/time_converter_ptp.cpp +++ b/src/core/dev/time_converter_ptp.cpp @@ -65,7 +65,7 @@ time_converter_ptp::time_converter_ptp(struct ibv_context *ctx) } m_timer_handle = g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_PTP_PERIOD_MS, - this, PERIODIC_TIMER, 0); + this, PERIODIC_TIMER, nullptr); m_converter_status = TS_CONVERSION_MODE_PTP; } diff --git a/src/core/dev/time_converter_rtc.cpp b/src/core/dev/time_converter_rtc.cpp index c0f873dc7..45edd95e2 100644 --- a/src/core/dev/time_converter_rtc.cpp +++ b/src/core/dev/time_converter_rtc.cpp @@ -58,7 +58,7 @@ void time_converter_rtc::handle_timer_expired(void *) void time_converter_rtc::convert_hw_time_to_system_time(uint64_t hwtime, struct timespec *systime) { hwtime &= 0x7FFFFFFFFFFFFFFF; - systime->tv_nsec = (uint32_t)(hwtime & ~(0x3 << 30)); + systime->tv_nsec = (uint32_t)(hwtime & ~(0x3UL << 30)); systime->tv_sec = (uint32_t)(hwtime >> 32); ibchtc_logfine("hwtime: %09ld", hwtime); diff --git a/src/core/dev/wqe_send_handler.cpp b/src/core/dev/wqe_send_handler.cpp index be06efc30..e30ecce3d 100644 --- a/src/core/dev/wqe_send_handler.cpp +++ b/src/core/dev/wqe_send_handler.cpp @@ -60,7 +60,7 @@ void wqe_send_handler::init_wqe(xlio_ibv_send_wr &wqe_to_init, struct ibv_sge *s wqe_to_init.num_sge = num_sge; xlio_send_wr_opcode(wqe_to_init) = XLIO_IBV_WR_SEND; - wqe_to_init.next = NULL; + wqe_to_init.next = nullptr; wqe_to_init.sg_list = sge_list; wqe_to_init.wr_id = 0; } diff --git a/src/core/dev/xlio_ti.h b/src/core/dev/xlio_ti.h new file mode 100644 index 000000000..e977c87de --- /dev/null +++ b/src/core/dev/xlio_ti.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef XLIO_TI_H +#define XLIO_TI_H + +#include +#include +#include +#include + +/* Work request completion callback */ +/* TODO Add argument for completion status to handle errors. */ +typedef void (*xlio_comp_cb_t)(void *); + +class xlio_ti; + +class xlio_ti_owner { +public: + virtual void ti_released(xlio_ti *ti) = 0; +}; + +class xlio_ti { +public: + enum ti_type : uint8_t { UNKNOWN, TLS_TIS, TLS_TIR, NVME_TIS, NVME_TIR }; + + xlio_ti(xlio_ti_owner *ti_owner, ti_type type = UNKNOWN) + : m_ti_owner(ti_owner) + , m_type(type) + , m_released(false) + , m_ref(0) + , m_callback(nullptr) + , m_callback_arg(nullptr) + { + } + + virtual ~xlio_ti() {}; + + void assign_callback(xlio_comp_cb_t callback, void *callback_arg) + { + m_callback = callback; + m_callback_arg = callback_arg; + } + + /* + * Reference counting. m_ref must be protected by ring tx lock. Device + * layer (QP, CQ) is responsible for the reference counting. 
+ */ + + void get() + { + ++m_ref; + assert(m_ref > 0); + } + + uint32_t put() + { + assert(m_ref > 0); + return --m_ref; + } + + void ti_released() { m_ti_owner->ti_released(this); } + + xlio_ti_owner *const m_ti_owner; + ti_type m_type; + bool m_released; + uint32_t m_ref; + + xlio_comp_cb_t m_callback; + void *m_callback_arg; +}; + +class xlio_tis : public xlio_ti { +public: + xlio_tis(xlio_ti_owner *ti_owner, std::unique_ptr _tis, xlio_ti::ti_type type) + : xlio_ti(ti_owner, type) + , m_dek() + , m_p_tis(std::move(_tis)) + , m_tisn(0U) + , m_dek_id(0U) + { + dpcp::status ret = m_p_tis->get_tisn(m_tisn); + assert(ret == dpcp::DPCP_OK); + (void)ret; + } + + ~xlio_tis() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tisn() noexcept { return m_tisn; } + + void assign_dek(std::unique_ptr &&dek_ptr) + { + m_dek = std::move(dek_ptr); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() noexcept { return m_dek_id; } + +private: + std::unique_ptr m_dek; + std::unique_ptr m_p_tis; + uint32_t m_tisn; + uint32_t m_dek_id; +}; + +class xlio_tir : public xlio_ti { +public: + xlio_tir(xlio_ti_owner *ti_owner, dpcp::tir *dpcp_tir, xlio_ti::ti_type type) + : xlio_ti(ti_owner, type) + { + m_p_tir.reset(dpcp_tir); + m_dek = NULL; + m_tirn = 0; + m_dek_id = 0; + + /* Cache the tir number. Mustn't fail for a valid TIR object. */ + m_tirn = m_p_tir->get_tirn(); + assert(m_tirn != 0); + } + + ~xlio_tir() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tirn() { return m_tirn; } + + void assign_dek(void *dek_ptr) + { + m_dek.reset(reinterpret_cast(dek_ptr)); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() { return m_dek_id; } + + std::unique_ptr m_p_tir; + +private: + std::unique_ptr m_dek; + uint32_t m_tirn; + uint32_t m_dek_id; +}; + +#endif // XLIO_TI_H diff --git a/src/core/event/delta_timer.cpp b/src/core/event/delta_timer.cpp index ecf8b6097..bf5590020 100644 --- a/src/core/event/delta_timer.cpp +++ b/src/core/event/delta_timer.cpp @@ -48,7 +48,7 @@ #define tmr_loginfo __log_info #define tmr_logdbg __log_dbg #define tmr_logfunc __log_func -//#define tmr_logfuncall __log_funcall +//#define tmr_logfuncall __log_funcall #define tmr_logfuncall(fmt, ...) 
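// Minimal usage sketch for the xlio_ti reference counting introduced in xlio_ti.h above.
// "sketch_ti_owner" and the flow below are illustrative assumptions; in the library the
// device layer (QP/CQ) drives get()/put() under the ring TX lock.
#include "core/dev/xlio_ti.h"

class sketch_ti_owner : public xlio_ti_owner {
public:
    void ti_released(xlio_ti *ti) override { delete ti; } // owner decides how to recycle it
};

static void ti_refcount_demo()
{
    static sketch_ti_owner owner;
    xlio_ti *ti = new xlio_ti(&owner);

    ti->get();              // a posted work request holds a reference
    ti->m_released = true;  // upper layer no longer needs the object
    if (ti->put() == 0U && ti->m_released) {
        ti->ti_released();  // last reference dropped: hand the object back to its owner
    }
}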
#define IS_NODE_INVALID(_node_) \ @@ -56,16 +56,16 @@ timer::timer() { - m_list_head = NULL; + m_list_head = nullptr; gettime(&m_ts_last); } timer::~timer() { timer_node_t *iter = m_list_head; - timer_node_t *to_free = NULL; + timer_node_t *to_free = nullptr; tmr_logfunc(""); - m_list_head = NULL; + m_list_head = nullptr; // free all the list while (iter) { to_free = iter; @@ -133,7 +133,7 @@ void timer::remove_timer(timer_node_t *node, timer_handler *handler) BULLSEYE_EXCLUDE_BLOCK_END // Invalidate node before freeing it - node->handler = NULL; + node->handler = nullptr; node->req_type = INVALID_TIMER; // Remove & Free node @@ -145,7 +145,7 @@ void timer::remove_timer(timer_node_t *node, timer_handler *handler) void timer::remove_all_timers(timer_handler *handler) { timer_node_t *node = m_list_head; - timer_node_t *node_tmp = NULL; + timer_node_t *node_tmp = nullptr; // Look for handler in the list if node wasen't indicated while (node) { @@ -160,12 +160,12 @@ void timer::remove_all_timers(timer_handler *handler) } BULLSEYE_EXCLUDE_BLOCK_END // Invalidate node before freeing it - node_tmp->handler = NULL; + node_tmp->handler = nullptr; node_tmp->req_type = INVALID_TIMER; remove_from_list(node_tmp); // Remove & Free node free(node_tmp); - node_tmp = NULL; + node_tmp = nullptr; } else { node = node->next; } @@ -177,7 +177,7 @@ void timer::remove_all_timers(timer_handler *handler) int timer::update_timeout() { int ret = 0, delta_msec = 0; - timer_node_t *list_tmp = NULL; + timer_node_t *list_tmp = nullptr; struct timespec ts_now, ts_delta; ret = gettime(&ts_now); @@ -248,7 +248,7 @@ void timer::process_registered_timers() case PERIODIC_TIMER: // re-insert remove_from_list(iter); - iter->prev = iter->next = NULL; + iter->prev = iter->next = nullptr; insert_to_list(iter); break; @@ -308,8 +308,8 @@ void timer::insert_to_list(timer_node_t *new_node) if (!m_list_head) { // first node in the list new_node->delta_time_msec = new_node->orig_time_msec; // time from now - new_node->next = NULL; - new_node->prev = NULL; + new_node->next = nullptr; + new_node->prev = nullptr; m_list_head = new_node; tmr_logfuncall("insert first node to list (handler %p, timer %d, delta time %d)", new_node->handler, new_node->orig_time_msec, new_node->delta_time_msec); @@ -318,7 +318,7 @@ void timer::insert_to_list(timer_node_t *new_node) // else: need to find the correct place in the list tmp_delta = new_node->orig_time_msec; iter = m_list_head; - prev = NULL; + prev = nullptr; while (iter && tmp_delta >= iter->delta_time_msec) { tmp_delta = tmp_delta - iter->delta_time_msec; diff --git a/src/core/event/delta_timer.h b/src/core/event/delta_timer.h index c1dc228f2..311351fcc 100644 --- a/src/core/event/delta_timer.h +++ b/src/core/event/delta_timer.h @@ -39,7 +39,6 @@ #define INFINITE_TIMEOUT (-1) class timer_handler; -class timers_group; enum timer_req_type_t { // reregister itself every after timer expires. 
(the client doesn't need to reregister) @@ -65,7 +64,6 @@ struct timer_node_t { /* link to the context registered */ timer_handler *handler; void *user_data; - timers_group *group; timer_req_type_t req_type; struct timer_node_t *next; struct timer_node_t *prev; diff --git a/src/core/event/event.h b/src/core/event/event.h index 74a4d20f0..856b7cbff 100644 --- a/src/core/event/event.h +++ b/src/core/event/event.h @@ -40,7 +40,7 @@ class event { public: - event(void *notifier = NULL) + event(void *notifier = nullptr) : m_notifier(notifier) { } diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 71fd86a58..984aaa96f 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -37,12 +37,12 @@ #include #include "core/dev/ring_allocation_logic.h" #include "core/sock/fd_collection.h" -#include "core/sock/sock-redirect.h" // calling orig_os_api.epoll() +#include "core/sock/sock-redirect.h" // calling SYSCALL(epoll)() #include "core/proto/route_table_mgr.h" #include "timer_handler.h" #include "event_handler_ibverbs.h" #include "event_handler_rdma_cm.h" - +#include "core/sock/sockinfo_tcp.h" #include "core/util/instrumentation.h" #define MODULE_NAME "evh:" @@ -81,20 +81,19 @@ #define INITIAL_EVENTS_NUM 64 -event_handler_manager *g_p_event_handler_manager = NULL; +event_handler_manager *g_p_event_handler_manager = nullptr; pthread_t g_n_internal_thread_id = 0; void *event_handler_manager::register_timer_event(int timeout_msec, timer_handler *handler, - timer_req_type_t req_type, void *user_data, - timers_group *group /* = NULL */) + timer_req_type_t req_type, void *user_data) { evh_logdbg("timer handler '%p' registered %s timer for %d msec (user data: %p)", handler, timer_req_type_str(req_type), timeout_msec, user_data); BULLSEYE_EXCLUDE_BLOCK_START if (!handler || (req_type < 0 || req_type >= INVALID_TIMER)) { evh_logwarn("bad timer type (%d) or handler (%p)", req_type, handler); - return NULL; + return nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -115,7 +114,6 @@ void *event_handler_manager::register_timer_event(int timeout_msec, timer_handle reg_action.type = REGISTER_TIMER; reg_action.info.timer.handler = handler; reg_action.info.timer.user_data = user_data; - reg_action.info.timer.group = group; reg_action.info.timer.node = node; reg_action.info.timer.timeout_msec = timeout_msec; reg_action.info.timer.req_type = req_type; @@ -123,6 +121,16 @@ void *event_handler_manager::register_timer_event(int timeout_msec, timer_handle return node; } +void event_handler_manager::register_socket_timer_event(sockinfo_tcp *sock_tcp) +{ + evh_logdbg("Registering TCP socket timer: %p", sock_tcp); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_TCP_SOCKET_TIMER; + reg_action.info.timer.user_data = sock_tcp; + post_new_reg_action(reg_action); +} + void event_handler_manager::wakeup_timer_event(timer_handler *handler, void *node) { evh_logdbg("timer handler '%p'", handler); @@ -138,7 +146,6 @@ void event_handler_manager::wakeup_timer_event(timer_handler *handler, void *nod reg_action.info.timer.handler = handler; reg_action.info.timer.node = node; post_new_reg_action(reg_action); - return; } void event_handler_manager::unregister_timer_event(timer_handler *handler, void *node) @@ -175,6 +182,16 @@ void event_handler_manager::unregister_timers_event_and_delete(timer_handler *ha post_new_reg_action(reg_action); } +void 
event_handler_manager::unregister_socket_timer_and_delete(sockinfo_tcp *sock_tcp) +{ + evh_logdbg("Unregistering TCP socket timer: %p", sock_tcp); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE; + reg_action.info.timer.user_data = sock_tcp; + post_new_reg_action(reg_action); +} + void event_handler_manager::register_ibverbs_event(int fd, event_handler_ibverbs *handler, void *channel, void *user_data) { @@ -249,7 +266,7 @@ event_handler_manager::event_handler_manager(bool internal_thread_mode) return; } - m_epfd = orig_os_api.epoll_create(INITIAL_EVENTS_NUM); + m_epfd = SYSCALL(epoll_create, INITIAL_EVENTS_NUM); BULLSEYE_EXCLUDE_BLOCK_START if (m_epfd == -1) { evh_logdbg("epoll_create failed on ibv device collection (errno=%d %m)", errno); @@ -290,7 +307,7 @@ void *event_handler_thread(void *_p_tgtObject) tasks_file += "/tasks"; FILE *fp = fopen(tasks_file.c_str(), "w"); BULLSEYE_EXCLUDE_BLOCK_START - if (fp == NULL) { + if (!fp) { evh_logpanic("Failed to open %s for writing", tasks_file.c_str()); } if (fprintf(fp, "%d", gettid()) <= 0) { @@ -313,7 +330,6 @@ void *event_handler_thread(void *_p_tgtObject) } else { evh_logdbg("Internal thread affinity not set."); } - /* cppcheck-suppress resourceLeak */ } void *ret = p_tgtObject->thread_loop(); @@ -390,7 +406,7 @@ void event_handler_manager::stop_thread() // Wait for thread exit if (m_event_handler_tid) { - pthread_join(m_event_handler_tid, 0); + pthread_join(m_event_handler_tid, nullptr); evh_logdbg("event handler thread stopped"); } else { evh_logdbg("event handler thread not running"); @@ -399,13 +415,13 @@ void event_handler_manager::stop_thread() m_event_handler_tid = 0; // Close main epfd and signaling socket - orig_os_api.close(m_epfd); + SYSCALL(close, m_epfd); m_epfd = -1; } void event_handler_manager::update_epfd(int fd, int operation, int events) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; if (m_epfd < 0) { return; @@ -414,7 +430,7 @@ void event_handler_manager::update_epfd(int fd, int operation, int events) ev.events = events; ev.data.fd = fd; BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, operation, fd, &ev) < 0) && + if ((SYSCALL(epoll_ctl, m_epfd, operation, fd, &ev) < 0) && (!(errno == ENOENT || errno == EBADF))) { const char *operation_str[] = {"", "ADD", "DEL", "MOD"}; evh_logerr("epoll_ctl(%d, %s, fd=%d) failed (errno=%d %m)", m_epfd, @@ -426,6 +442,10 @@ void event_handler_manager::update_epfd(int fd, int operation, int events) const char *event_handler_manager::reg_action_str(event_action_type_e reg_action_type) { switch (reg_action_type) { + case REGISTER_TCP_SOCKET_TIMER: + return "REGISTER_TCP_SOCKET_TIMER"; + case UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE: + return "UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE"; case REGISTER_TIMER: return "REGISTER_TIMER"; case UNREGISTER_TIMER: @@ -476,30 +496,18 @@ void event_handler_manager::post_new_reg_action(reg_action_t ®_action) void event_handler_manager::priv_register_timer_handler(timer_reg_info_t &info) { - if (info.group) { - info.group->add_new_timer((timer_node_t *)info.node, info.handler, info.user_data); - } else { - m_timer.add_new_timer(info.timeout_msec, (timer_node_t *)info.node, info.handler, - info.user_data, info.req_type); - } + m_timer.add_new_timer(info.timeout_msec, (timer_node_t *)info.node, info.handler, + info.user_data, info.req_type); } void event_handler_manager::priv_wakeup_timer_handler(timer_reg_info_t &info) { - timer_node_t *node = 
(timer_node_t *)info.node; - if (node && !node->group) { - m_timer.wakeup_timer(node); - } + m_timer.wakeup_timer((timer_node_t *)info.node); } void event_handler_manager::priv_unregister_timer_handler(timer_reg_info_t &info) { - timer_node_t *node = (timer_node_t *)info.node; - if (node && node->group) { - node->group->remove_timer((timer_node_t *)info.node); - } else { - m_timer.remove_timer(node, info.handler); - } + m_timer.remove_timer((timer_node_t *)info.node, info.handler); } void event_handler_manager::priv_unregister_all_handler_timers(timer_reg_info_t &info) @@ -525,7 +533,7 @@ void event_handler_manager::priv_prepare_ibverbs_async_event_queue(event_handler set_fd_block_mode(poll_fd.fd, false); // empty the async event queue - while (orig_os_api.poll(&poll_fd, 1, 0) > 0) { + while (SYSCALL(poll, &poll_fd, 1, 0) > 0) { process_ibverbs_event(i); cnt++; } @@ -538,14 +546,12 @@ void event_handler_manager::priv_register_ibverbs_events(ibverbs_reg_info_t &inf event_handler_map_t::iterator i; i = m_event_handler_map.find(info.fd); if (i == m_event_handler_map.end()) { - event_data_t v; + event_data_t v = {}; v.type = EV_IBVERBS; v.ibverbs_ev.fd = info.fd; v.ibverbs_ev.channel = info.channel; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = v; i = m_event_handler_map.find(info.fd); @@ -626,15 +632,13 @@ void event_handler_manager::priv_register_rdma_cm_events(rdma_cm_reg_info_t &inf event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); if (iter_fd == m_event_handler_map.end()) { evh_logdbg("Adding new channel (fd %d, id %p, handler %p)", info.fd, info.id, info.handler); - event_data_t map_value; + event_data_t map_value = {}; map_value.type = EV_RDMA_CM; map_value.rdma_cm_ev.n_ref_count = 1; map_value.rdma_cm_ev.map_rdma_cm_id[info.id] = info.handler; map_value.rdma_cm_ev.cma_channel = info.cma_channel; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = map_value; update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); @@ -703,13 +707,11 @@ void event_handler_manager::priv_register_command_events(command_reg_info_t &inf event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); if (iter_fd == m_event_handler_map.end()) { evh_logdbg("Adding new channel (fd %d)", info.fd); - event_data_t map_value; + event_data_t map_value = {}; map_value.type = EV_COMMAND; map_value.command_ev.cmd = info.cmd; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = map_value; update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); } @@ -736,7 +738,17 @@ void event_handler_manager::handle_registration_action(reg_action_t ®_action) } evh_logfunc("event action %d", reg_action.type); + sockinfo_tcp *sock; switch (reg_action.type) { + case REGISTER_TCP_SOCKET_TIMER: + sock = reinterpret_cast(reg_action.info.timer.user_data); + sock->get_tcp_timer_collection()->add_new_timer(sock); + break; + case UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE: + sock = reinterpret_cast(reg_action.info.timer.user_data); + sock->get_tcp_timer_collection()->remove_timer(sock); + delete sock; + break; case REGISTER_TIMER: priv_register_timer_handler(reg_action.info.timer); break; @@ -767,7 +779,7 @@ void event_handler_manager::handle_registration_action(reg_action_t ®_action) case UNREGISTER_TIMERS_AND_DELETE: priv_unregister_all_handler_timers(reg_action.info.timer); delete reg_action.info.timer.handler; - 
reg_action.info.timer.handler = NULL; + reg_action.info.timer.handler = nullptr; break; BULLSEYE_EXCLUDE_BLOCK_START default: @@ -795,7 +807,7 @@ void event_handler_manager::query_for_ibverbs_event(int async_fd) } // Check for ready events - if (orig_os_api.poll(&poll_fd, 1, 0) <= 0) { + if (SYSCALL(poll, &poll_fd, 1, 0) <= 0) { return; } @@ -874,7 +886,7 @@ void event_handler_manager::process_rdma_cm_event(event_handler_map_t::iterator // Read the notification event channel struct rdma_event_channel *cma_channel = (struct rdma_event_channel *)iter_fd->second.rdma_cm_ev.cma_channel; - struct rdma_cm_event *p_tmp_cm_event = NULL; + struct rdma_cm_event *p_tmp_cm_event = nullptr; struct rdma_cm_event cma_event; evh_logfunc_entry("cma_channel %p (fd = %d)", cma_channel, cma_channel->fd); @@ -907,7 +919,7 @@ void event_handler_manager::process_rdma_cm_event(event_handler_map_t::iterator } // Find registered event handler - if (cma_id != NULL) { + if (cma_id) { event_handler_rdma_cm_map_t::iterator iter_id = iter_fd->second.rdma_cm_ev.map_rdma_cm_id.find(cma_id); if (iter_id != iter_fd->second.rdma_cm_ev.map_rdma_cm_id.end()) { @@ -965,20 +977,24 @@ void *event_handler_manager::thread_loop() g_p_net_device_table_mgr) { m_cq_epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (m_cq_epfd > 0) { - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; evt.events = EPOLLIN | EPOLLPRI; evt.data.fd = m_cq_epfd; - orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); + SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); } } - uint64_t poll_sn = 0; + uint64_t poll_sn_rx = 0; + uint64_t poll_sn_tx = 0; if (m_b_sysvar_internal_thread_arm_cq_enabled && m_cq_epfd > 0 && g_p_net_device_table_mgr) { - g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); - int ret = g_p_net_device_table_mgr->global_ring_request_notification(poll_sn); + g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn_rx, &poll_sn_tx, + nullptr); + int ret = + g_p_net_device_table_mgr->global_ring_request_notification(poll_sn_rx, poll_sn_tx); if (ret > 0) { - g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); + g_p_net_device_table_mgr->global_ring_poll_and_process_element( + &poll_sn_rx, &poll_sn_tx, nullptr); } } @@ -989,20 +1005,20 @@ void *event_handler_manager::thread_loop() } } - evh_logfuncall("calling orig_os_api.epoll with %d msec timeout", timeout_msec); - int ret = orig_os_api.epoll_wait(m_epfd, p_events, maxevents, timeout_msec); + evh_logfuncall("calling SYSCALL(epoll) with %d msec timeout", timeout_msec); + int ret = SYSCALL(epoll_wait, m_epfd, p_events, maxevents, timeout_msec); if (ret < 0) { evh_logfunc("epoll returned with error, errno=%d %m)", errno); continue; } - evh_logfuncall("orig_os_api.epoll found %d ready fds", ret); + evh_logfuncall("SYSCALL(epoll) found %d ready fds", ret); // check pipe for (int idx = 0; (idx < ret) && (m_b_continue_running); ++idx) { if (m_b_sysvar_internal_thread_arm_cq_enabled && p_events[idx].data.fd == m_cq_epfd && g_p_net_device_table_mgr) { g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element( - &poll_sn, NULL); + &poll_sn_rx, nullptr); } else if (is_wakeup_fd(p_events[idx].data.fd)) { // a request for registration was sent m_reg_action_q_lock.lock(); @@ -1058,7 +1074,7 @@ void *event_handler_manager::thread_loop() case EV_RDMA_CM: int result; poll_fd.fd = fd; - result = orig_os_api.poll(&poll_fd, 1, 0); + result = SYSCALL(poll, &poll_fd, 1, 0); if 
(result == 0) { evh_logdbg("error in fd %d", fd); break; @@ -1096,5 +1112,5 @@ void *event_handler_manager::thread_loop() free(p_events); - return 0; + return nullptr; } diff --git a/src/core/event/event_handler_manager.h b/src/core/event/event_handler_manager.h index 2904bcc19..c9e2ba69f 100644 --- a/src/core/event/event_handler_manager.h +++ b/src/core/event/event_handler_manager.h @@ -42,17 +42,19 @@ #include "core/infra/subject_observer.h" #include "core/event/command.h" #include "core/event/delta_timer.h" -#include "core/event/timers_group.h" #include "core/util/xlio_stats.h" class timer_handler; class event_handler_ibverbs; class event_handler_rdma_cm; +class sockinfo_tcp; typedef std::map event_handler_rdma_cm_map_t; typedef enum { + REGISTER_TCP_SOCKET_TIMER, + UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE, REGISTER_TIMER, WAKEUP_TIMER, /* NOT AVAILABLE FOR GROUPED TIMERS */ UNREGISTER_TIMER, @@ -94,7 +96,6 @@ struct timer_reg_info_t { void *node; unsigned int timeout_msec; void *user_data; - timers_group *group; timer_req_type_t req_type; }; @@ -158,11 +159,14 @@ class event_handler_manager : public wakeup_pipe { ~event_handler_manager(); void *register_timer_event(int timeout_msec, timer_handler *handler, timer_req_type_t req_type, - void *user_data, timers_group *group = NULL); + void *user_data); void wakeup_timer_event(timer_handler *handler, void *node); void unregister_timer_event(timer_handler *handler, void *node); void unregister_timers_event_and_delete(timer_handler *handler); + void register_socket_timer_event(sockinfo_tcp *sock_tcp); + void unregister_socket_timer_and_delete(sockinfo_tcp *sock_tcp); + void register_ibverbs_event(int fd, event_handler_ibverbs *handler, void *channel, void *user_data); void unregister_ibverbs_event(int fd, event_handler_ibverbs *handler); diff --git a/src/core/event/thread_local_event_handler.cpp b/src/core/event/event_handler_manager_local.cpp similarity index 75% rename from src/core/event/thread_local_event_handler.cpp rename to src/core/event/event_handler_manager_local.cpp index 5100f4bb9..1595d5b54 100644 --- a/src/core/event/thread_local_event_handler.cpp +++ b/src/core/event/event_handler_manager_local.cpp @@ -30,28 +30,29 @@ * SOFTWARE. */ -#include "thread_local_event_handler.h" +#include "event_handler_manager_local.h" #include "util/sys_vars.h" -thread_local thread_local_event_handler g_thread_local_event_handler; +using namespace std::chrono; -thread_local_event_handler::thread_local_event_handler() +thread_local event_handler_manager_local g_event_handler_manager_local; + +event_handler_manager_local::event_handler_manager_local() : event_handler_manager(false) { } -void thread_local_event_handler::post_new_reg_action(reg_action_t ®_action) +void event_handler_manager_local::post_new_reg_action(reg_action_t ®_action) { // For thread local event handler registration can be immediate. 
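// Caller-side sketch (an assumption, not code from this patch) of the new TCP socket timer
// registration added above. The caller only posts a typed action; the internal event handler
// thread later executes REGISTER_TCP_SOCKET_TIMER / UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE in
// handle_registration_action(), where it touches the socket's tcp timer collection.
#include "core/event/event_handler_manager.h"

void sketch_register_tcp_socket(sockinfo_tcp *si)
{
    // Queued to the internal thread; ends up as get_tcp_timer_collection()->add_new_timer(si).
    g_p_event_handler_manager->register_socket_timer_event(si);
}

void sketch_destroy_tcp_socket(sockinfo_tcp *si)
{
    // Also queued; the internal thread removes the timer and then deletes the socket,
    // so the caller must not touch 'si' after this call.
    g_p_event_handler_manager->unregister_socket_timer_and_delete(si);
}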
handle_registration_action(reg_action); } -void thread_local_event_handler::do_tasks() +void event_handler_manager_local::do_tasks() { - auto curr_time = chrono::steady_clock::now(); - if (likely( - safe_mce_sys().tcp_timer_resolution_msec > - chrono::duration_cast(curr_time - _last_run_time).count())) { + auto curr_time = steady_clock::now(); + if (likely(safe_mce_sys().tcp_timer_resolution_msec > + duration_cast(curr_time - _last_run_time).count())) { return; } @@ -60,7 +61,7 @@ void thread_local_event_handler::do_tasks() do_tasks_for_thread_local(); } -void thread_local_event_handler::do_tasks_for_thread_local() +void event_handler_manager_local::do_tasks_for_thread_local() { m_timer.process_registered_timers_uncond(); } diff --git a/src/core/event/thread_local_event_handler.h b/src/core/event/event_handler_manager_local.h similarity index 87% rename from src/core/event/thread_local_event_handler.h rename to src/core/event/event_handler_manager_local.h index 1db1b7bd9..40199caef 100644 --- a/src/core/event/thread_local_event_handler.h +++ b/src/core/event/event_handler_manager_local.h @@ -37,11 +37,9 @@ #include "event_handler_manager.h" -using namespace std; - -class thread_local_event_handler : public event_handler_manager { +class event_handler_manager_local : public event_handler_manager { public: - thread_local_event_handler(); + event_handler_manager_local(); void do_tasks(); @@ -51,9 +49,9 @@ class thread_local_event_handler : public event_handler_manager { private: void do_tasks_for_thread_local(); - chrono::steady_clock::time_point _last_run_time; + std::chrono::steady_clock::time_point _last_run_time; }; -extern thread_local thread_local_event_handler g_thread_local_event_handler; +extern thread_local event_handler_manager_local g_event_handler_manager_local; #endif diff --git a/src/core/event/netlink_event.cpp b/src/core/event/netlink_event.cpp index 198078194..aa6ab0502 100644 --- a/src/core/event/netlink_event.cpp +++ b/src/core/event/netlink_event.cpp @@ -32,9 +32,10 @@ #include "netlink_event.h" #include "vlogger/vlogger.h" + #include #include -#include "stdio.h" +#include #define TOSTR_MAX_SIZE 4096 @@ -91,7 +92,7 @@ const std::string route_nl_event::to_str() const neigh_nl_event::neigh_nl_event(struct nlmsghdr *hdr, struct rtnl_neigh *neigh, void *notifier) : netlink_event(hdr, notifier) - , m_neigh_info(NULL) + , m_neigh_info(nullptr) { m_neigh_info = new netlink_neigh_info(neigh); if ((!hdr) && (neigh)) { @@ -108,7 +109,7 @@ neigh_nl_event::~neigh_nl_event() route_nl_event::route_nl_event(struct nlmsghdr *hdr, struct rtnl_route *route, void *notifier) : netlink_event(hdr, notifier) - , m_route_info(NULL) + , m_route_info(nullptr) { m_route_info = new netlink_route_info(route); } diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp new file mode 100644 index 000000000..7a7b81110 --- /dev/null +++ b/src/core/event/poll_group.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" +#include "poll_group.h" + +#include "dev/net_device_table_mgr.h" +#include "dev/net_device_val.h" +#include "dev/ring.h" +#include "event/event_handler_manager_local.h" +#include "sock/sockinfo_tcp.h" + +#define MODULE_NAME "group:" + +#define grp_logpanic __log_panic +#define grp_logerr __log_err +#define grp_logwarn __log_warn +#define grp_loginfo __log_info +#define grp_logdbg __log_dbg + +/* + * Collection of the groups to destroy leftovers in the library destructor. + * Groups are likely pre-initialized in a small number (up to the number of CPU cores) + * and destroyed at exit. Therefore, a simple collection data structure is enough. + */ +static std::vector s_poll_groups; +static lock_spin s_poll_groups_lock; + +poll_group::poll_group(const struct xlio_poll_group_attr *attr) + : m_socket_event_cb(attr->socket_event_cb) + , m_socket_comp_cb(attr->socket_comp_cb) + , m_socket_rx_cb(attr->socket_rx_cb) + , m_group_flags(attr->flags) +{ + /* + * In the best case, we expect a single ring per group. Reserve two elements for a scenario + * with two network interfaces and when the both interfaces are used by the sockets. + * More complex scenarios will be covered with re-allocation. 
+ */ + m_rings.reserve(2); + + m_event_handler = std::make_unique(); + m_tcp_timers = std::make_unique(1U); + m_tcp_timers->set_group(this); + + s_poll_groups_lock.lock(); + s_poll_groups.push_back(this); + s_poll_groups_lock.unlock(); + + grp_logdbg("Polling group %p created", this); +} + +poll_group::~poll_group() +{ + s_poll_groups_lock.lock(); + auto iter = std::find(s_poll_groups.begin(), s_poll_groups.end(), this); + if (iter != std::end(s_poll_groups)) { + s_poll_groups.erase(iter); + } + s_poll_groups_lock.unlock(); + + while (!m_sockets_list.empty()) { + sockinfo_tcp *si = dynamic_cast(m_sockets_list.front()); + if (likely(si)) { + close_socket(si, true); + } + } + + // Release references to the rings that we take in add_ring() + for (auto &item : m_rings_ref) { + item.second->release_ring(item.first.get()); + } + m_rings_ref.clear(); + + grp_logdbg("Polling group %p destroyed", this); +} + +/*static*/ +void poll_group::destroy_all_groups() +{ + s_poll_groups_lock.lock(); + std::vector groups(std::move(s_poll_groups)); + s_poll_groups_lock.unlock(); + for (poll_group *grp : groups) { + delete grp; + } +} + +void poll_group::poll() +{ + for (ring *rng : m_rings) { + uint64_t sn; + rng->poll_and_process_element_tx(&sn); + sn = 0; + rng->poll_and_process_element_rx(&sn); + } + m_event_handler->do_tasks(); +} + +void poll_group::add_dirty_socket(sockinfo_tcp *si) +{ + if (m_group_flags & XLIO_GROUP_FLAG_DIRTY) { + m_dirty_sockets.push_back(si); + } +} + +void poll_group::flush() +{ + for (auto si : m_dirty_sockets) { + si->flush(); + } + m_dirty_sockets.clear(); + for (ring *rng : m_rings) { + rng->ring_delayed_doorbell(); + } +} + +void poll_group::add_ring(ring *rng, ring_alloc_logic_attr *attr) +{ + if (std::find(m_rings.begin(), m_rings.end(), rng) == std::end(m_rings)) { + grp_logdbg("New ring %p in group %p", rng, this); + m_rings.push_back(rng); + + /* + * Take reference to the ring. This avoids a race between socket destruction and buffer + * return to the group. Socket destruction can lead to the ring destruction. But user + * may return a buffer outside of the socket lifecycle. + * This also avoids extra ring destruction in a scenario when application closes all + * the sockets multiple times in runtime. + */ + net_device_val *nd = g_p_net_device_table_mgr->get_net_device_val(rng->get_if_index()); + if (nd) { + ring *reserved = nd->reserve_ring(attr); + if (reserved != rng) { + grp_logerr("Cannot reserve ring %p (reserved=%p)", rng, reserved); + if (reserved) { + nd->release_ring(attr); + } + } else { + m_rings_ref.push_back( + std::make_pair(std::make_unique(*attr), nd)); + } + } + } +} + +void poll_group::add_socket(sockinfo_tcp *si) +{ + m_sockets_list.push_back(si); +} + +void poll_group::close_socket(sockinfo_tcp *si, bool force /*=false*/) +{ + m_sockets_list.erase(si); + + bool closed = si->prepare_to_close(force); + if (closed) { + /* + * Current implementation forces TCP reset, so the socket is expected to be closable. + * Do a polling iteration to increase the chance that all the relevant WQEs are completed + * and XLIO emitted all the TX completion before the XLIO_SOCKET_EVENT_TERMINATED event. + * + * TODO Implement more reliable mechanism of deferred socket destruction if there are + * not completed TX operations. 
+ */ + poll(); + si->clean_socket_obj(); + } +} diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h b/src/core/event/poll_group.h similarity index 50% rename from src/core/dev/qp_mgr_eth_mlx5_dpcp.h rename to src/core/event/poll_group.h index 3fcce4281..3fbbf59c6 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h +++ b/src/core/event/poll_group.h @@ -30,51 +30,58 @@ * SOFTWARE. */ -#ifndef QP_MGR_ETH_MLX5_DPCP_H -#define QP_MGR_ETH_MLX5_DPCP_H +#ifndef XLIO_GROUP_H +#define XLIO_GROUP_H -#include - -#if defined(DEFINED_DPCP) -#include #include -#include "dev/qp_mgr_eth_mlx5.h" +#include + +#include "sock/fd_collection.h" +#include "xlio.h" -class qp_mgr_eth_mlx5_dpcp : public qp_mgr_eth_mlx5 { +/* Forward declarations */ +struct xlio_poll_group_attr; +class event_handler_manager_local; +class ring; +class ring_alloc_logic_attr; +class sockinfo_tcp; +class tcp_timers_collection; + +class poll_group { public: - qp_mgr_eth_mlx5_dpcp(struct qp_mgr_desc *desc, uint32_t tx_num_wr, uint16_t vlan); + poll_group(const struct xlio_poll_group_attr *attr); + ~poll_group(); + static void destroy_all_groups(); + + void poll(); - virtual ~qp_mgr_eth_mlx5_dpcp() override {} + void add_dirty_socket(sockinfo_tcp *si); + void flush(); - virtual void up() override; - virtual void down() override; + void add_ring(ring *rng, ring_alloc_logic_attr *attr); - virtual rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) override; - virtual void modify_qp_to_ready_state() override; - virtual void modify_qp_to_error_state() override; - virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) override; + void add_socket(sockinfo_tcp *si); + void close_socket(sockinfo_tcp *si, bool force = false); -protected: - virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; + unsigned get_flags() const { return m_group_flags; } + event_handler_manager_local *get_event_handler() const { return m_event_handler.get(); } + tcp_timers_collection *get_tcp_timers() const { return m_tcp_timers.get(); } + +public: + xlio_socket_event_cb_t m_socket_event_cb; + xlio_socket_comp_cb_t m_socket_comp_cb; + xlio_socket_rx_cb_t m_socket_rx_cb; private: -#ifdef DEFINED_UTLS - // TODO: Move UTLS related code to this class and remove qp_mgr_eth_mlx5::create_tir() - dpcp::tir *create_tir(bool is_tls = false) override; -#else - dpcp::tir *create_tir(bool is_tls = false); -#endif - bool configure_rq_dpcp(); - bool prepare_rq(uint32_t cqn); - bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); - void modify_rq_to_ready_state(); - void init_tir_rq(); + std::vector m_rings; + std::unique_ptr m_event_handler; + std::unique_ptr m_tcp_timers; - std::unique_ptr _tir = {nullptr}; - std::unique_ptr _rq = {nullptr}; - uint32_t _strq_wqe_reserved_seg = 0U; -}; + unsigned m_group_flags; + std::vector m_dirty_sockets; -#endif // defined(DEFINED_DPCP) + sock_fd_api_list_t m_sockets_list; + std::vector, net_device_val *>> m_rings_ref; +}; -#endif +#endif /* XLIO_GROUP_H */ diff --git a/src/core/event/timers_group.h b/src/core/event/timers_group.h deleted file mode 100644 index aa67a383a..000000000 --- a/src/core/event/timers_group.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TIMERS_GROUP_H -#define TIMERS_GROUP_H - -/* - * This is an API for batching timers into groups. - * Instead of registering each timer separately into the internal thread, the group is registered - * once, and the timers are registered to the group. The registration to the group is still done - * through the internal thread. The group must be deleted through the internal thread (must - * implement clean_obj interface). Registering to group must be used with register_timer_event() and - * unregister_timer_event() only. - */ -class timers_group : public timer_handler { -public: - virtual ~timers_group() {}; - // execute all the timers registered to the group - // according to the internal group logic. - virtual void handle_timer_expired(void *user_data) = 0; - -protected: - friend class event_handler_manager; - // add a new timer - virtual void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) = 0; - - // remove timer from list and free it. 
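// Hedged sketch of driving a polling group with the methods added in poll_group.cpp above;
// the timers_group batching being removed here appears to be superseded by the group-local
// event handler and tcp_timers_collection owned by poll_group. The enclosing event_loop()
// function, its stop condition and the include paths are illustrative assumptions.
#include "event/poll_group.h"
#include "sock/sockinfo_tcp.h"

void sketch_event_loop(poll_group *grp, sockinfo_tcp *si, volatile bool &keep_running)
{
    grp->add_socket(si);
    while (keep_running) {
        grp->poll();  // TX/RX poll on every ring in the group, then local timers via do_tasks()
        grp->flush(); // flush dirty sockets and ring the delayed doorbells
    }
    grp->close_socket(si); // detach and destroy the socket once it can be closed
}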
- // called for stopping (unregistering) a timer - virtual void remove_timer(timer_node_t *node) = 0; -}; - -#endif diff --git a/src/core/event/vlogger_timer_handler.cpp b/src/core/event/vlogger_timer_handler.cpp index c515b9686..9107a39c6 100644 --- a/src/core/event/vlogger_timer_handler.cpp +++ b/src/core/event/vlogger_timer_handler.cpp @@ -37,16 +37,16 @@ #include "timer_handler.h" #include "event_handler_manager.h" -vlogger_timer_handler *g_p_vlogger_timer_handler = NULL; +vlogger_timer_handler *g_p_vlogger_timer_handler = nullptr; vlogger_timer_handler::vlogger_timer_handler() - : m_timer_handle(NULL) + : m_timer_handle(nullptr) { if (g_p_event_handler_manager) { /* failure in allocating m_timer_handle will result in throwing an exception by called * methods */ m_timer_handle = g_p_event_handler_manager->register_timer_event( - UPDATE_VLOGGER_LEVELS_INTERVAL, this, PERIODIC_TIMER, 0); + UPDATE_VLOGGER_LEVELS_INTERVAL, this, PERIODIC_TIMER, nullptr); } } @@ -54,7 +54,7 @@ vlogger_timer_handler::~vlogger_timer_handler() { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } diff --git a/src/core/ib/base/verbs_extra.cpp b/src/core/ib/base/verbs_extra.cpp index dc7cf746e..65a661793 100644 --- a/src/core/ib/base/verbs_extra.cpp +++ b/src/core/ib/base/verbs_extra.cpp @@ -261,89 +261,6 @@ int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num) return -1; } -int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num, sa_family_t family) -{ - NOT_IN_USE(qp); - NOT_IN_USE(port_num); - int res = -1; - -#ifdef DEFINED_IBV_FLOW_TAG - - // Create - struct { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; - xlio_ibv_flow_spec_ipv4 ipv4; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; - } ft_attr_ipv4; - - struct { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; - xlio_ibv_flow_spec_ipv6 ipv6; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; - } ft_attr_ipv6; - - xlio_ibv_flow_attr *p_attr = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - xlio_ibv_flow_spec_action_tag *p_flow_tag = nullptr; - - // Initialize - if (family == AF_INET) { - memset(&ft_attr_ipv4, 0, sizeof(ft_attr_ipv4)); - p_attr = &(ft_attr_ipv4.attr); - p_eth = &(ft_attr_ipv4.eth); - p_tcp_udp = &(ft_attr_ipv4.tcp_udp); - p_flow_tag = &(ft_attr_ipv4.flow_tag); - p_attr->size = sizeof(ft_attr_ipv4); - } else { - memset(&ft_attr_ipv6, 0, sizeof(ft_attr_ipv6)); - p_attr = &(ft_attr_ipv6.attr); - p_eth = &(ft_attr_ipv6.eth); - p_tcp_udp = &(ft_attr_ipv6.tcp_udp); - p_flow_tag = &(ft_attr_ipv6.flow_tag); - p_attr->size = sizeof(ft_attr_ipv6); - } - - p_attr->num_of_specs = 4; - p_attr->type = XLIO_IBV_FLOW_ATTR_NORMAL; - p_attr->priority = 2; // almost highest priority, 1 is used for 5-tuple later - p_attr->port = port_num; - - // Set filters - uint8_t mac_0[ETH_ALEN] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - uint8_t mac_f[ETH_ALEN] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; - - bool is_ipv4 = (family == AF_INET); - ibv_flow_spec_eth_set(p_eth, mac_0, 0, is_ipv4); // L2 filter - memcpy(p_eth->val.src_mac, mac_f, ETH_ALEN); - memset(p_eth->mask.src_mac, FS_MASK_ON_8, ETH_ALEN); - - if (is_ipv4) { - ibv_flow_spec_ip_set(&ft_attr_ipv4.ipv4, ip_address::loopback4_addr(), - ip_address::loopback4_addr()); // L3 filter - } else { - ibv_flow_spec_ip_set(&ft_attr_ipv6.ipv6, 
ip_address::loopback6_addr(), - ip_address::loopback6_addr()); // L3 filter - } - - ibv_flow_spec_tcp_udp_set(p_tcp_udp, true, 0, 0); // L4 filter - ibv_flow_spec_flow_tag_set(p_flow_tag, FLOW_TAG_MASK - 1); // enable flow tag - - // Create flow - xlio_ibv_flow *ibv_flow = xlio_ibv_create_flow(qp, p_attr); - if (ibv_flow) { - res = 0; - xlio_ibv_destroy_flow(ibv_flow); - } -#endif // DEFINED_IBV_FLOW_TAG - - return res; -} - int xlio_rdma_lib_reset() { #ifdef HAVE_RDMA_LIB_RESET diff --git a/src/core/ib/base/verbs_extra.h b/src/core/ib/base/verbs_extra.h index 7471c8b18..344ae02dd 100644 --- a/src/core/ib/base/verbs_extra.h +++ b/src/core/ib/base/verbs_extra.h @@ -105,7 +105,6 @@ void priv_ibv_modify_cq_moderation(struct ibv_cq *cq, uint32_t period, uint32_t #define FS_MASK_ON_64 (0xffffffffffffffff) #define FLOW_TAG_MASK ((1 << 20) - 1) -int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num, sa_family_t family); int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num); /* DEFINED_VERBS_VERSION: @@ -169,13 +168,6 @@ typedef int xlio_ibv_cq_init_attr; ibv_create_cq(context, cqe, cq_context, channel, comp_vector) // rx hw timestamp -#define XLIO_IBV_WC_WITH_TIMESTAMP 0 -#define xlio_wc_timestamp(wc) 0 -#define xlio_ibv_cq_init_ts_attr(attr) \ - { \ - NOT_IN_USE(attr); \ - } - #ifdef DEFINED_IBV_CQ_TIMESTAMP #define XLIO_IBV_DEVICE_ATTR_HCA_CORE_CLOCK 0 #define XLIO_IBV_VALUES_MASK_RAW_CLOCK IBV_VALUES_MASK_RAW_CLOCK @@ -185,14 +177,12 @@ typedef struct ibv_values_ex xlio_ts_values; #endif // ibv_post_send -#define XLIO_IBV_SEND_SIGNALED IBV_SEND_SIGNALED -#define XLIO_IBV_SEND_INLINE IBV_SEND_INLINE +#define XLIO_IBV_SEND_INLINE IBV_SEND_INLINE #ifdef DEFINED_IBV_SEND_IP_CSUM #define XLIO_IBV_SEND_IP_CSUM (IBV_SEND_IP_CSUM) #else #define DEFINED_SW_CSUM #endif -#define xlio_ibv_send_flags ibv_send_flags #define xlio_send_wr_send_flags(wr) (wr).send_flags #define XLIO_IBV_WR_SEND IBV_WR_SEND #define xlio_ibv_wr_opcode ibv_wr_opcode @@ -218,38 +208,9 @@ typedef struct ibv_tso_caps xlio_ibv_tso_caps; (xlio_ibv_wr_opcode)(0) // Use 0 as "default" opcode when NOP is not defined. 
#endif -#define xlio_ibv_post_send(qp, wr, bad_wr) ibv_post_send(qp, wr, bad_wr) typedef struct ibv_send_wr xlio_ibv_send_wr; // ibv_reg_mr #define XLIO_IBV_ACCESS_LOCAL_WRITE IBV_ACCESS_LOCAL_WRITE -// flow steering -#define XLIO_IBV_FLOW_ATTR_NORMAL IBV_FLOW_ATTR_NORMAL -#define XLIO_IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK -#define XLIO_IBV_FLOW_SPEC_ETH IBV_FLOW_SPEC_ETH -#define XLIO_IBV_FLOW_SPEC_IPV4 IBV_FLOW_SPEC_IPV4 -#define XLIO_IBV_FLOW_SPEC_IPV6 IBV_FLOW_SPEC_IPV6 -#define XLIO_IBV_FLOW_SPEC_TCP IBV_FLOW_SPEC_TCP -#define XLIO_IBV_FLOW_SPEC_UDP IBV_FLOW_SPEC_UDP -#define xlio_ibv_create_flow(qp, flow) ibv_create_flow(qp, flow) -#define xlio_ibv_destroy_flow(flow_id) ibv_destroy_flow(flow_id) -typedef struct ibv_flow xlio_ibv_flow; -typedef struct ibv_flow_attr xlio_ibv_flow_attr; -typedef struct ibv_flow_spec_ib xlio_ibv_flow_spec_ib; -typedef struct ibv_flow_spec_eth xlio_ibv_flow_spec_eth; -typedef struct ibv_flow_spec_ipv4 xlio_ibv_flow_spec_ipv4; -typedef struct ibv_flow_spec_ipv6 xlio_ibv_flow_spec_ipv6; -typedef struct ibv_flow_spec_tcp_udp xlio_ibv_flow_spec_tcp_udp; - -// Flow tag -#ifdef DEFINED_IBV_FLOW_TAG -#define XLIO_IBV_FLOW_SPEC_ACTION_TAG IBV_FLOW_SPEC_ACTION_TAG -typedef struct ibv_flow_spec_action_tag xlio_ibv_flow_spec_action_tag; -#define xlio_get_flow_tag(cqe) ntohl((uint32_t)(cqe->sop_drop_qpn)) -#else -typedef struct ibv_flow_spec_action_tag_dummy { -} xlio_ibv_flow_spec_action_tag; -#define xlio_get_flow_tag(cqe) 0 -#endif // DEFINED_IBV_FLOW_TAG #ifdef DEFINED_IBV_CQ_ATTR_MODERATE typedef struct ibv_modify_cq_attr xlio_ibv_cq_attr; @@ -351,93 +312,4 @@ typedef enum { int xlio_rdma_lib_reset(); -static inline void ibv_flow_spec_eth_set(xlio_ibv_flow_spec_eth *eth, uint8_t *dst_mac, - uint16_t vlan_tag, bool is_ipv4) -{ - eth->type = XLIO_IBV_FLOW_SPEC_ETH; - eth->size = sizeof(xlio_ibv_flow_spec_eth); - eth->val.ether_type = ntohs(is_ipv4 ? ETH_P_IP : ETH_P_IPV6); - eth->mask.ether_type = FS_MASK_ON_16; - memcpy(eth->val.dst_mac, dst_mac, ETH_ALEN); - memset(eth->mask.dst_mac, FS_MASK_ON_8, ETH_ALEN); - eth->val.vlan_tag = vlan_tag & htons(VLAN_VID_MASK); - eth->mask.vlan_tag = - eth->val.vlan_tag ? htons(VLAN_VID_MASK) : 0; // we do not support vlan options -} - -template -static inline void ibv_flow_spec_set_single_ip(T &spec_ip_val, T &spec_ip_mask, - const ip_address &src_ip) -{ -} - -typedef decltype(ibv_flow_ipv4_filter::src_ip) spec_ipv4_type; -template <> -inline void ibv_flow_spec_set_single_ip(spec_ipv4_type &spec_ip_val, spec_ipv4_type &spec_ip_mask, - const ip_address &in_ip) -{ - memcpy(&spec_ip_val, &in_ip.get_in4_addr(), sizeof(spec_ipv4_type)); - spec_ip_mask = (!in_ip.is_anyaddr() ? FS_MASK_ON_32 : 0U); -} - -typedef decltype(ibv_flow_ipv6_filter::src_ip) spec_ipv6_type; -template <> -inline void ibv_flow_spec_set_single_ip(spec_ipv6_type &spec_ip_val, spec_ipv6_type &spec_ip_mask, - const ip_address &in_ip) -{ - memcpy(&spec_ip_val, &in_ip.get_in6_addr(), sizeof(spec_ipv6_type)); - memset(&spec_ip_mask, in_ip.is_anyaddr() ? 
0 : 0xff, sizeof(spec_ipv6_type)); -} - -static inline void ibv_flow_spec_ip_set(xlio_ibv_flow_spec_ipv4 *ipv4, const ip_address &dst_ip, - const ip_address &src_ip) -{ - ipv4->type = XLIO_IBV_FLOW_SPEC_IPV4; - ipv4->size = sizeof(xlio_ibv_flow_spec_ipv4); - ibv_flow_spec_set_single_ip(ipv4->val.src_ip, ipv4->mask.src_ip, src_ip); - ibv_flow_spec_set_single_ip(ipv4->val.dst_ip, ipv4->mask.dst_ip, dst_ip); -} - -static inline void ibv_flow_spec_ip_set(xlio_ibv_flow_spec_ipv6 *ipv6, const ip_address &dst_ip, - const ip_address &src_ip) -{ - ipv6->type = XLIO_IBV_FLOW_SPEC_IPV6; - ipv6->size = sizeof(xlio_ibv_flow_spec_ipv6); - ibv_flow_spec_set_single_ip(ipv6->val.src_ip, ipv6->mask.src_ip, src_ip); - ibv_flow_spec_set_single_ip(ipv6->val.dst_ip, ipv6->mask.dst_ip, dst_ip); - ipv6->val.flow_label = ipv6->mask.flow_label = 0U; - ipv6->val.next_hdr = ipv6->mask.next_hdr = 0U; - ipv6->val.traffic_class = ipv6->mask.traffic_class = 0U; - ipv6->val.hop_limit = ipv6->mask.hop_limit = 0U; -} - -static inline void ibv_flow_spec_tcp_udp_set(xlio_ibv_flow_spec_tcp_udp *tcp_udp, bool is_tcp, - uint16_t dst_port, uint16_t src_port) -{ - tcp_udp->type = is_tcp ? XLIO_IBV_FLOW_SPEC_TCP : XLIO_IBV_FLOW_SPEC_UDP; - tcp_udp->size = sizeof(xlio_ibv_flow_spec_tcp_udp); - tcp_udp->val.src_port = src_port; - if (tcp_udp->val.src_port) { - tcp_udp->mask.src_port = FS_MASK_ON_16; - } - tcp_udp->val.dst_port = dst_port; - if (tcp_udp->val.dst_port) { - tcp_udp->mask.dst_port = FS_MASK_ON_16; - } -} - -static inline void ibv_flow_spec_flow_tag_set(xlio_ibv_flow_spec_action_tag *flow_tag, - uint32_t tag_id) -{ - NOT_IN_USE(tag_id); - if (flow_tag == NULL) { - return; - } -#ifdef DEFINED_IBV_FLOW_TAG - flow_tag->type = XLIO_IBV_FLOW_SPEC_ACTION_TAG; - flow_tag->size = sizeof(xlio_ibv_flow_spec_action_tag); - flow_tag->tag_id = tag_id; -#endif // DEFINED_IBV_FLOW_TAG -} - #endif diff --git a/src/core/ib/mlx5/ib_mlx5.cpp b/src/core/ib/mlx5/ib_mlx5.cpp index 3f38f6c42..3fc4da36c 100644 --- a/src/core/ib/mlx5/ib_mlx5.cpp +++ b/src/core/ib/mlx5/ib_mlx5.cpp @@ -40,7 +40,7 @@ #include "util/utils.h" #include "ib/mlx5/ib_mlx5.h" -int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t flags) +int xlio_ib_mlx5_get_qp_tx(xlio_ib_mlx5_qp_t *mlx5_qp) { int ret = 0; struct mlx5dv_obj obj; @@ -52,7 +52,7 @@ int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t memset(&obj, 0, sizeof(obj)); memset(&dqp, 0, sizeof(dqp)); - obj.qp.in = qp; + obj.qp.in = mlx5_qp->qp; obj.qp.out = &dqp; #if defined(DEFINED_DV_RAW_QP_HANDLES) dqp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES; @@ -62,42 +62,26 @@ int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t goto out; } - memset(mlx5_qp, 0, sizeof(*mlx5_qp)); VALGRIND_MAKE_MEM_DEFINED(&dqp, sizeof(dqp)); - mlx5_qp->qp = qp; - mlx5_qp->qpn = qp->qp_num; - mlx5_qp->flags = flags; + mlx5_qp->qpn = mlx5_qp->qp->qp_num; mlx5_qp->sq.dbrec = &dqp.dbrec[MLX5_SND_DBR]; mlx5_qp->sq.buf = dqp.sq.buf; mlx5_qp->sq.wqe_cnt = dqp.sq.wqe_cnt; mlx5_qp->sq.stride = dqp.sq.stride; - mlx5_qp->rq.dbrec = &dqp.dbrec[MLX5_RCV_DBR]; - mlx5_qp->rq.buf = dqp.rq.buf; - mlx5_qp->rq.wqe_cnt = dqp.rq.wqe_cnt; - mlx5_qp->rq.stride = dqp.rq.stride; - mlx5_qp->rq.wqe_shift = ilog_2(dqp.rq.stride); - mlx5_qp->rq.head = 0; - mlx5_qp->rq.tail = 0; mlx5_qp->bf.reg = dqp.bf.reg; - mlx5_qp->bf.size = dqp.bf.size; - mlx5_qp->bf.offset = 0; #if defined(DEFINED_DV_RAW_QP_HANDLES) - mlx5_qp->tirn = dqp.tirn; mlx5_qp->tisn = dqp.tisn; - mlx5_qp->rqn = dqp.rqn; 
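For context on the xlio_ib_mlx5_get_qp_tx() rework in this hunk: the helper now only extracts send-queue state from the direct-verbs object. A minimal, self-contained sketch of that pattern against the public rdma-core mlx5dv API follows; the query_sq_info() function and sq_info struct are illustrative placeholders, not XLIO code.

#include <infiniband/mlx5dv.h>
#include <string.h>

// Illustrative container for the fields a TX-only path keeps around.
struct sq_info {
    void *buf;                /* send-queue buffer */
    uint32_t wqe_cnt;         /* number of WQEs in the SQ */
    uint32_t stride;          /* WQE stride in bytes */
    volatile uint32_t *dbrec; /* send doorbell record */
    void *bf_reg;             /* blue-flame / doorbell register */
};

// Sketch: expose the SQ layout of an mlx5 QP via mlx5dv_init_obj().
static int query_sq_info(struct ibv_qp *qp, struct sq_info *out)
{
    struct mlx5dv_obj obj;
    struct mlx5dv_qp dqp;

    memset(&obj, 0, sizeof(obj));
    memset(&dqp, 0, sizeof(dqp));
    obj.qp.in = qp;
    obj.qp.out = &dqp;
    if (mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP)) {
        return -1;
    }
    out->buf = dqp.sq.buf;
    out->wqe_cnt = dqp.sq.wqe_cnt;
    out->stride = dqp.sq.stride;
    out->dbrec = &dqp.dbrec[1]; /* index 1 is the send doorbell (MLX5_SND_DBR convention) */
    out->bf_reg = dqp.bf.reg;
    return 0;
}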
mlx5_qp->sqn = dqp.sqn; #endif /* DEFINED_DV_RAW_QP_HANDLES */ - ret = ibv_query_qp(qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr); + ret = ibv_query_qp(mlx5_qp->qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr); if (ret != 0) { goto out; } VALGRIND_MAKE_MEM_DEFINED(&tmp_ibv_qp_attr, sizeof(tmp_ibv_qp_attr)); mlx5_qp->cap.max_send_wr = tmp_ibv_qp_attr.cap.max_send_wr; - mlx5_qp->cap.max_recv_wr = tmp_ibv_qp_attr.cap.max_recv_wr; mlx5_qp->cap.max_send_sge = tmp_ibv_qp_attr.cap.max_send_sge; - mlx5_qp->cap.max_recv_sge = tmp_ibv_qp_attr.cap.max_recv_sge; mlx5_qp->cap.max_inline_data = tmp_ibv_qp_attr.cap.max_inline_data; out: @@ -117,7 +101,7 @@ int xlio_ib_mlx5_get_cq(struct ibv_cq *cq, xlio_ib_mlx5_cq_t *mlx5_cq) * from ERROR state to RESET so cq_ci or cq_sn should not be * updated */ - if (mlx5_cq == NULL || mlx5_cq->cq == cq) { + if (!mlx5_cq || mlx5_cq->cq == cq) { return 0; } @@ -149,85 +133,4 @@ int xlio_ib_mlx5_get_cq(struct ibv_cq *cq, xlio_ib_mlx5_cq_t *mlx5_cq) return 0; } -int xlio_ib_mlx5_post_recv(xlio_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad_wr) -{ - struct mlx5_wqe_data_seg *scat; - int err = 0; - int nreq; - int ind; - int i, j; - - ind = mlx5_qp->rq.head & (mlx5_qp->rq.wqe_cnt - 1); - *bad_wr = NULL; - - for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (unlikely((int)mlx5_qp->rq.head - (int)mlx5_qp->rq.tail + nreq >= - (int)mlx5_qp->cap.max_recv_wr)) { - errno = ENOMEM; - err = -errno; - *bad_wr = wr; - goto out; - } - - if (unlikely(wr->num_sge > (int)mlx5_qp->cap.max_recv_sge)) { - errno = EINVAL; - err = -errno; - *bad_wr = wr; - goto out; - } - - scat = (struct mlx5_wqe_data_seg *)((uint8_t *)mlx5_qp->rq.buf + - (ind << mlx5_qp->rq.wqe_shift)); - - for (i = 0, j = 0; i < wr->num_sge; ++i) { - if (unlikely(!wr->sg_list[i].length)) { - continue; - } - - scat[j].byte_count = htonl(wr->sg_list[i].length); - scat[j].lkey = htonl(wr->sg_list[i].lkey); - scat[j].addr = htonll(wr->sg_list[i].addr); - j++; - } - - if (j < (int)mlx5_qp->cap.max_recv_sge) { - scat[j].byte_count = 0; - scat[j].lkey = htonl(MLX5_INVALID_LKEY); - scat[j].addr = 0; - } - - ind = (ind + 1) & (mlx5_qp->rq.wqe_cnt - 1); - } - -out: - if (likely(nreq)) { - mlx5_qp->rq.head += nreq; - - /* - * Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - /* - * For Raw Packet QP, avoid updating the doorbell record - * as long as the QP isn't in RTR state, to avoid receiving - * packets in illegal states. - * This is only for Raw Packet QPs since they are represented - * differently in the hardware. - * For DPCP RQ, the RQ state is switched along with the QP-unused-rq, - * and in such case if RQ.State == RST, doorbells are not processed anyway - * and for RDY state without a TIR incomming messages never reach RQ (PRM 8.14.1). 
- */ - if (likely(!((mlx5_qp->qp->qp_type == IBV_QPT_RAW_PACKET || - mlx5_qp->flags & XLIO_IB_MLX5_QP_FLAGS_USE_UNDERLAY) && - mlx5_qp->qp->state < IBV_QPS_RTR))) { - *mlx5_qp->rq.dbrec = htonl(mlx5_qp->rq.head & 0xffff); - } - } - - return err; -} - #endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/ib/mlx5/ib_mlx5.h b/src/core/ib/mlx5/ib_mlx5.h index 948e98517..1ecf5a5a7 100644 --- a/src/core/ib/mlx5/ib_mlx5.h +++ b/src/core/ib/mlx5/ib_mlx5.h @@ -65,15 +65,12 @@ extern "C" { */ int xlio_ib_mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t type); -enum { XLIO_IB_MLX5_QP_FLAGS_USE_UNDERLAY = 0x01 }; - enum { XLIO_IB_MLX5_CQ_SET_CI = 0, XLIO_IB_MLX5_CQ_ARM_DB = 1 }; /* Queue pair */ typedef struct xlio_ib_mlx5_qp { struct ibv_qp *qp; uint32_t qpn; - uint32_t flags; struct ibv_qp_cap cap; struct { volatile uint32_t *dbrec; @@ -81,23 +78,10 @@ typedef struct xlio_ib_mlx5_qp { uint32_t wqe_cnt; uint32_t stride; } sq; - struct { - volatile uint32_t *dbrec; - void *buf; - uint32_t wqe_cnt; - uint32_t stride; - uint32_t wqe_shift; - unsigned head; - unsigned tail; - } rq; struct { void *reg; - uint32_t size; - uint32_t offset; } bf; - uint32_t tirn; uint32_t tisn; - uint32_t rqn; uint32_t sqn; } xlio_ib_mlx5_qp_t; @@ -482,7 +466,7 @@ enum { /* * Interfaces */ -int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t flags = 0); +int xlio_ib_mlx5_get_qp_tx(xlio_ib_mlx5_qp_t *mlx5_qp); int xlio_ib_mlx5_post_recv(xlio_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); diff --git a/src/core/infra/DemoObserver.cpp b/src/core/infra/DemoObserver.cpp index bc92f02e8..e5178e06b 100644 --- a/src/core/infra/DemoObserver.cpp +++ b/src/core/infra/DemoObserver.cpp @@ -51,8 +51,8 @@ void Demo_Observer::notify_cb() void Demo_Observer::register_to_subjects(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2) { - Demo_Subject1 *s1 = NULL; - Demo_Subject2 *s2 = NULL; + Demo_Subject1 *s1 = nullptr; + Demo_Subject2 *s2 = nullptr; key_class c('a'); key_class i(1); char ch = 'a'; diff --git a/src/core/infra/DemoSubject.h b/src/core/infra/DemoSubject.h index fb9d17c37..1320c7726 100644 --- a/src/core/infra/DemoSubject.h +++ b/src/core/infra/DemoSubject.h @@ -48,7 +48,6 @@ template class key_class : public tostr { const std::string to_str() const { char s[20]; - /* cppcheck-suppress wrongPrintfScanfArgNum */ snprintf(s, sizeof(s), "%d.%d.%d.%d", NIPQUAD(m_key)); return (std::string(s)); } diff --git a/src/core/infra/cache_subject_observer.h b/src/core/infra/cache_subject_observer.h index 5d504babe..ed5397000 100644 --- a/src/core/infra/cache_subject_observer.h +++ b/src/core/infra/cache_subject_observer.h @@ -115,7 +115,7 @@ template class cache_table_mgr : public tostr, publ public: cache_table_mgr(const char *lock_name = "lock(cache_table_mgr)") : m_lock(lock_name) - , m_timer_handle(NULL) {}; + , m_timer_handle(nullptr) {}; virtual ~cache_table_mgr(); /* Returns pointer to the subject */ @@ -215,7 +215,7 @@ void cache_table_mgr::start_garbage_collector(int timeout_msec) m_timer_handle = g_p_event_handler_manager->register_timer_event(timeout_msec, this, PERIODIC_TIMER, NULL); - if (m_timer_handle == NULL) { + if (!m_timer_handle) { __log_warn("Failed to start garbage_collector"); } } @@ -224,7 +224,7 @@ template void cache_table_mgr::stop_garba { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } diff --git a/src/core/infra/subject_observer.h 
b/src/core/infra/subject_observer.h index 44d333c9b..5d14930b6 100644 --- a/src/core/infra/subject_observer.h +++ b/src/core/infra/subject_observer.h @@ -60,7 +60,7 @@ class subject { virtual ~subject() {}; virtual bool register_observer(IN const observer *const new_observer); bool unregister_observer(IN const observer *const old_observer); - void notify_observers(event *ev = NULL); + void notify_observers(event *ev = nullptr); protected: lock_mutex_recursive m_lock; diff --git a/src/core/iomux/epfd_info.cpp b/src/core/iomux/epfd_info.cpp index a2d21ba5e..dae4861dc 100644 --- a/src/core/iomux/epfd_info.cpp +++ b/src/core/iomux/epfd_info.cpp @@ -45,7 +45,7 @@ int epfd_info::remove_fd_from_epoll_os(int fd) { - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, fd, NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, nullptr); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove fd=%d from os epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -99,7 +99,7 @@ epfd_info::epfd_info(int epfd, int size) epfd_info::~epfd_info() { __log_funcall(""); - socket_fd_api *sock_fd; + sockinfo *sock_fd; // Meny: going over all handled fds and removing epoll context. @@ -146,13 +146,13 @@ int epfd_info::ctl(int op, int fd, epoll_event *event) { int ret; epoll_event event_dummy; - if (event == NULL) { + if (!event) { memset(&event_dummy, 0, sizeof(event_dummy)); event = &event_dummy; } // YossiE TODO make "event table" - and add index in that table instead - // of real event (in orig_os_api.epoll_ctl). must have this because fd's can + // of real event (in SYSCALL(epoll_ctl)). must have this because fd's can // be added after the cq. lock(); @@ -201,13 +201,13 @@ int epfd_info::add_fd(int fd, epoll_event *event) { int ret; epoll_fd_rec fd_rec; - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; bool is_offloaded = false; __log_funcall("fd=%d", fd); - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && temp_sock_fd_api->get_type() == FD_TYPE_SOCKET) { is_offloaded = true; } @@ -237,7 +237,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) evt.events = event->events; evt.data.u64 = 0; // zero all data evt.data.fd = fd; - ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to add fd=%d to epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -294,7 +294,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) // if the socket is ready, add it to ready events uint32_t events = 0; int errors; - if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(nullptr, nullptr)) { events |= EPOLLIN; } if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { @@ -337,11 +337,11 @@ void epfd_info::increase_ring_ref_count(ring *ring) size_t num_ring_rx_fds; int *ring_rx_fds_array = ring->get_rx_channel_fds(num_ring_rx_fds); for (size_t i = 0; i < num_ring_rx_fds; i++) { - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; evt.events = EPOLLIN | EPOLLPRI; int fd = ring_rx_fds_array[i]; evt.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to add cq fd=%d 
to epoll epfd=%d (errno=%d %m)", fd, m_epfd, @@ -378,7 +378,7 @@ void epfd_info::decrease_ring_ref_count(ring *ring) int *ring_rx_fds_array = ring->get_rx_channel_fds(num_ring_rx_fds); for (size_t i = 0; i < num_ring_rx_fds; i++) { // delete cq fd from epfd - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], nullptr); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove cq fd=%d from epfd=%d (errno=%d %m)", @@ -403,7 +403,7 @@ int epfd_info::del_fd(int fd, bool passthrough) __log_funcall("fd=%d", fd); epoll_fd_rec *fi; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && temp_sock_fd_api->skip_os_select()) { __log_dbg("fd=%d must be skipped from os epoll()", fd); } else if (!passthrough) { @@ -448,7 +448,7 @@ int epfd_info::del_fd(int fd, bool passthrough) // remove fd and replace by last fd m_p_offloaded_fds[fi->offloaded_index - 1] = m_p_offloaded_fds[m_n_offloaded_fds - 1]; - socket_fd_api *last_socket = + sockinfo *last_socket = fd_collection_get_sockfd(m_p_offloaded_fds[m_n_offloaded_fds - 1]); if (last_socket && last_socket->get_epoll_context_fd() == m_epfd) { last_socket->m_fd_rec.offloaded_index = fi->offloaded_index; @@ -485,7 +485,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) return -1; } - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); // check if fd is offloaded that new event mask is OK if (temp_sock_fd_api && temp_sock_fd_api->m_fd_rec.offloaded_index > 0) { if (m_log_invalid_events && (event->events & ~SUPPORTED_EPOLL_EVENTS)) { @@ -503,7 +503,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) evt.events = event->events; evt.data.u64 = 0; // zero all data evt.data.fd = fd; - ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_MOD, fd, &evt); + ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_MOD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("failed to modify fd=%d in epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -521,7 +521,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) uint32_t events = 0; if (is_offloaded) { // if the socket is ready, add it to ready events - if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(nullptr, nullptr)) { events |= EPOLLIN; } if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { @@ -549,8 +549,8 @@ int epfd_info::mod_fd(int fd, epoll_event *event) epoll_fd_rec *epfd_info::get_fd_rec(int fd) { - epoll_fd_rec *fd_rec = NULL; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + epoll_fd_rec *fd_rec = nullptr; + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); lock(); if (temp_sock_fd_api && temp_sock_fd_api->get_epoll_context_fd() == m_epfd) { @@ -575,7 +575,7 @@ void epfd_info::fd_closed(int fd, bool passthrough) unlock(); } -void epfd_info::insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_flags) +void epfd_info::insert_epoll_event_cb(sockinfo *sock_fd, uint32_t event_flags) { lock(); // EPOLLHUP | EPOLLERR are reported without user request @@ -585,7 +585,7 @@ void epfd_info::insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_fla unlock(); } -void epfd_info::insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +void 
epfd_info::insert_epoll_event(sockinfo *sock_fd, uint32_t event_flags) { // assumed lock if (sock_fd->ep_ready_fd_node.is_list_member()) { @@ -598,7 +598,7 @@ void epfd_info::insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) do_wakeup(); } -void epfd_info::remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +void epfd_info::remove_epoll_event(sockinfo *sock_fd, uint32_t event_flags) { sock_fd->m_epoll_event_flags &= ~event_flags; if (sock_fd->m_epoll_event_flags == 0) { @@ -611,7 +611,7 @@ epoll_stats_t *epfd_info::stats() return m_stats; } -int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, +int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /* = NULL*/) { __log_func(""); @@ -625,7 +625,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, m_ring_map_lock.lock(); for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { - int ret = iter->first->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + int ret = iter->first->poll_and_process_element_rx(p_poll_sn_rx, pv_fd_ready_array); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { __log_err("Error in RX ring->poll_and_process_element() of %p (errno=%d %m)", @@ -635,11 +635,11 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn); + __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn_rx); ret_total += ret; } #if defined(DEFINED_FORCE_TX_POLLING) - ret = iter->first->poll_and_process_element_tx(p_poll_sn); + ret = iter->first->poll_and_process_element_tx(p_poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { __log_err("Error in TX ring->poll_and_process_element() of %p (errno=%d %m)", @@ -649,7 +649,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn); + __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn_tx); ret_total += ret; } #endif /* DEFINED_FORCE_TX_POLLING */ @@ -669,7 +669,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, return ret_total; } -int epfd_info::ring_request_notification(uint64_t poll_sn) +int epfd_info::ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { __log_func(""); int ret_total = 0; @@ -681,7 +681,7 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) m_ring_map_lock.lock(); for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { - int ret = iter->first->request_notification(CQT_RX, poll_sn); + int ret = iter->first->request_notification(CQT_RX, poll_sn_rx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("Error RX ring[%p]->request_notification() (errno=%d %m)", iter->first, @@ -690,10 +690,10 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, poll_sn); + __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, poll_sn_rx); ret_total += ret; #if defined(DEFINED_FORCE_TX_POLLING) - ret = iter->first->request_notification(CQT_TX, poll_sn); + ret = iter->first->request_notification(CQT_TX, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { 
__log_err("Error TX ring[%p]->request_notification() (errno=%d %m)", iter->first, @@ -702,7 +702,7 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, poll_sn); + __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, poll_sn_tx); ret_total += ret; #endif /* DEFINED_FORCE_TX_POLLING */ } @@ -754,7 +754,7 @@ int epfd_info::ring_wait_for_notification_and_process_element(uint64_t *p_poll_s } else { __log_dbg("failed to find channel fd. removing cq fd=%d from epfd=%d", fd, m_epfd); BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, fd, NULL)) && + if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, nullptr)) && (!(errno == ENOENT || errno == EBADF))) { __log_err("failed to del cq channel fd=%d from os epfd=%d (errno=%d %m)", fd, m_epfd, errno); diff --git a/src/core/iomux/epfd_info.h b/src/core/iomux/epfd_info.h index a58d092ea..245d4f196 100644 --- a/src/core/iomux/epfd_info.h +++ b/src/core/iomux/epfd_info.h @@ -37,8 +37,8 @@ #include #include -typedef xlio_list_t ep_ready_fd_list_t; -typedef xlio_list_t fd_info_list_t; +typedef xlio_list_t ep_ready_fd_list_t; +typedef xlio_list_t fd_info_list_t; typedef std::unordered_map fd_info_map_t; typedef std::unordered_map ring_map_t; typedef std::deque ready_cq_fd_q_t; @@ -87,12 +87,13 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake */ epoll_stats_t *stats(); - int ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); + int ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = nullptr); - int ring_request_notification(uint64_t poll_sn); + int ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); int ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual void clean_obj(); @@ -120,9 +121,9 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake int remove_fd_from_epoll_os(int fd); inline size_t get_fd_non_offloaded_size() { return m_fd_non_offloaded_map.size(); } inline size_t get_fd_offloaded_size() { return m_fd_offloaded_list.size(); } - void insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_flags); - void insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); - void remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); + void insert_epoll_event_cb(sockinfo *sock_fd, uint32_t event_flags); + void insert_epoll_event(sockinfo *sock_fd, uint32_t event_flags); + void remove_epoll_event(sockinfo *sock_fd, uint32_t event_flags); void increase_ring_ref_count(ring *ring); void decrease_ring_ref_count(ring *ring); diff --git a/src/core/iomux/epoll_wait_call.cpp b/src/core/iomux/epoll_wait_call.cpp index 2a5765356..563a569c2 100644 --- a/src/core/iomux/epoll_wait_call.cpp +++ b/src/core/iomux/epoll_wait_call.cpp @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include "epfd_info.h" @@ -46,7 +46,7 @@ epoll_wait_call::epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, int epfd, epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask /* = NULL */) - : io_mux_call(NULL, off_modes_buffer, 0, sigmask) + : io_mux_call(nullptr, off_modes_buffer, 0, sigmask) , // TODO: rethink on these arguments m_epfd(epfd) , m_events(events) @@ -85,11 +85,11 @@ int 
epoll_wait_call::get_current_events() return m_n_all_ready_fds; } - xlio_list_t socket_fd_list; + xlio_list_t socket_fd_list; lock(); int i, ready_rfds = 0, ready_wfds = 0; i = m_n_all_ready_fds; - socket_fd_api *p_socket_object; + sockinfo *p_socket_object; ep_ready_fd_list_t::iterator iter = m_epfd_info->m_ready_fds.begin(); while (iter != m_epfd_info->m_ready_fds.end() && i < m_maxevents) { p_socket_object = *iter; @@ -111,7 +111,7 @@ int epoll_wait_call::get_current_events() } if (mutual_events & EPOLLIN) { - if (handle_epoll_event(p_socket_object->is_readable(NULL), EPOLLIN, p_socket_object, + if (handle_epoll_event(p_socket_object->is_readable(nullptr), EPOLLIN, p_socket_object, i)) { ready_rfds++; got_event = true; @@ -165,7 +165,7 @@ int epoll_wait_call::get_current_events() * see RM task 212058 */ while (!socket_fd_list.empty()) { - socket_fd_api *sockfd = socket_fd_list.get_and_pop_front(); + sockinfo *sockfd = socket_fd_list.get_and_pop_front(); sockfd->consider_rings_migration_rx(); } @@ -200,10 +200,9 @@ bool epoll_wait_call::_wait(int timeout) } if (m_sigmask) { - ready_fds = - orig_os_api.epoll_pwait(m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask); + ready_fds = SYSCALL(epoll_pwait, m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask); } else { - ready_fds = orig_os_api.epoll_wait(m_epfd, m_p_ready_events, m_maxevents, timeout); + ready_fds = SYSCALL(epoll_wait, m_epfd, m_p_ready_events, m_maxevents, timeout); } if (timeout) { @@ -236,7 +235,7 @@ bool epoll_wait_call::_wait(int timeout) } if (m_p_ready_events[i].events & EPOLLIN) { - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api) { // Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on // recvfrom(), after iomux returned a shadow fd as ready (only for non-blocking @@ -344,8 +343,8 @@ bool epoll_wait_call::immidiate_return(int &poll_os_countdown) return false; } -bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, - socket_fd_api *socket_object, int index) +bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, sockinfo *socket_object, + int index) { if (is_ready) { epoll_fd_rec &fd_rec = socket_object->m_fd_rec; @@ -385,7 +384,7 @@ bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) if (cq_ready) { // This will empty the cqepfd // (most likely in case of a wakeup and probably only under epoll_wait (Not select/poll)) - ring_wait_for_notification_and_process_element(NULL); + ring_wait_for_notification_and_process_element(nullptr); } /* Before we exit with ready OS fd's we'll check the CQs once more and exit * below after calling check_all_offloaded_sockets(); @@ -405,16 +404,16 @@ bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) int epoll_wait_call::ring_poll_and_process_element() { - return m_epfd_info->ring_poll_and_process_element(&m_poll_sn, NULL); + return m_epfd_info->ring_poll_and_process_element(&m_poll_sn_rx, &m_poll_sn_tx, nullptr); } int epoll_wait_call::ring_request_notification() { - return m_epfd_info->ring_request_notification(m_poll_sn); + return m_epfd_info->ring_request_notification(m_poll_sn_rx, m_poll_sn_tx); } int epoll_wait_call::ring_wait_for_notification_and_process_element(void *pv_fd_ready_array) { - return m_epfd_info->ring_wait_for_notification_and_process_element(&m_poll_sn, + return m_epfd_info->ring_wait_for_notification_and_process_element(&m_poll_sn_rx, pv_fd_ready_array); } diff 
--git a/src/core/iomux/epoll_wait_call.h b/src/core/iomux/epoll_wait_call.h index be4505648..57d11b5e9 100644 --- a/src/core/iomux/epoll_wait_call.h +++ b/src/core/iomux/epoll_wait_call.h @@ -56,7 +56,7 @@ class epoll_wait_call : public io_mux_call { */ epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, int epfd, epoll_event *events, int maxevents, int timeout, - const sigset_t *sigmask = NULL); + const sigset_t *sigmask = nullptr); virtual ~epoll_wait_call(); /// @override @@ -100,8 +100,7 @@ class epoll_wait_call : public io_mux_call { int get_current_events(); - bool handle_epoll_event(bool is_ready, uint32_t events, socket_fd_api *socket_object, - int index); + bool handle_epoll_event(bool is_ready, uint32_t events, sockinfo *socket_object, int index); protected: virtual int ring_poll_and_process_element(); diff --git a/src/core/iomux/io_mux_call.cpp b/src/core/iomux/io_mux_call.cpp index e8e6ff4c4..d2f7a6c92 100644 --- a/src/core/iomux/io_mux_call.cpp +++ b/src/core/iomux/io_mux_call.cpp @@ -92,7 +92,7 @@ inline void io_mux_call::check_offloaded_wsockets() if (m_p_offloaded_modes[offloaded_index] & OFF_WRITE) { int fd = m_p_all_offloaded_fds[offloaded_index]; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); if (!p_socket_object) { // If we can't find this previously mapped offloaded socket // then it was probably closed. We need to get out with error code @@ -113,7 +113,7 @@ inline void io_mux_call::check_offloaded_esockets() for (int offloaded_index = 0; offloaded_index < *m_p_num_all_offloaded_fds; ++offloaded_index) { if (m_p_offloaded_modes[offloaded_index] & OFF_RDWR) { int fd = m_p_all_offloaded_fds[offloaded_index]; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); if (!p_socket_object) { // If we can't find this previously mapped offloaded socket // then it was probably closed. We need to get out with error code @@ -177,8 +177,9 @@ io_mux_call::io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer , m_p_offloaded_modes(off_modes_buffer) , m_num_all_offloaded_fds(0) , m_cqepfd(-1) - , m_poll_sn(0) - , m_p_stats(NULL) + , m_poll_sn_rx(0) + , m_poll_sn_tx(0) + , m_p_stats(nullptr) , m_n_all_ready_fds(0) , m_n_ready_rfds(0) , m_n_ready_wfds(0) @@ -204,7 +205,7 @@ void io_mux_call::check_offloaded_rsockets() { int fd, offloaded_index, num_all_offloaded_fds; fd_array_t fd_ready_array; - socket_fd_api *p_socket_object; + sockinfo *p_socket_object; fd_ready_array.fd_max = FD_ARRAY_MAX; @@ -229,7 +230,7 @@ void io_mux_call::check_offloaded_rsockets() fd_ready_array.fd_count = 0; // Poll the socket object - if (p_socket_object->is_readable(&m_poll_sn, &fd_ready_array)) { + if (p_socket_object->is_readable(&m_poll_sn_rx, &fd_ready_array)) { set_offloaded_rfd_ready(offloaded_index); // We have offloaded traffic. 
Don't sample the OS immediately p_socket_object->unset_immediate_os_sample(); @@ -259,7 +260,7 @@ bool io_mux_call::handle_os_countdown(int &poll_os_countdown) // This will empty the cqepfd // (most likely in case of a wakeup and probably only under epoll_wait (Not // select/poll)) - ring_wait_for_notification_and_process_element(NULL); + ring_wait_for_notification_and_process_element(nullptr); } /* Before we exit with ready OS fd's we'll check the CQs once more and exit * below after calling check_all_offloaded_sockets(); @@ -423,7 +424,7 @@ void io_mux_call::blocking_loops() woke_up_non_valid = false; ret = ring_request_notification(); - __log_func("arming cq with poll_sn=%lx ret=%d", m_poll_sn, ret); + __log_func("arming cq with poll_sn=%lx ret=%d", m_poll_sn_rx, ret); if (ret < 0) { xlio_throw_object(io_mux_call::io_error); } else if (ret > 0) { @@ -549,18 +550,19 @@ bool io_mux_call::immidiate_return(int &poll_os_countdown) int io_mux_call::ring_poll_and_process_element() { // TODO: (select, poll) this access all CQs, it is better to check only relevant ones - return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn, NULL); + return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn_rx, + &m_poll_sn_tx, nullptr); } int io_mux_call::ring_request_notification() { - return g_p_net_device_table_mgr->global_ring_request_notification(m_poll_sn); + return g_p_net_device_table_mgr->global_ring_request_notification(m_poll_sn_rx, m_poll_sn_tx); } int io_mux_call::ring_wait_for_notification_and_process_element(void *pv_fd_ready_array) { return g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element( - &m_poll_sn, pv_fd_ready_array); + &m_poll_sn_rx, pv_fd_ready_array); } bool io_mux_call::is_sig_pending() diff --git a/src/core/iomux/io_mux_call.h b/src/core/iomux/io_mux_call.h index b26daccad..0672d8f87 100644 --- a/src/core/iomux/io_mux_call.h +++ b/src/core/iomux/io_mux_call.h @@ -38,7 +38,7 @@ #include #include -#include +#include #include // from sigset.h @@ -77,7 +77,7 @@ class io_mux_call { * @param fds_buffer Pointer to a buffer large enough to hold all fds. */ io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int num_fds = 0, - const sigset_t *sigmask = NULL); // = 0 is only temp + const sigset_t *sigmask = nullptr); // = 0 is only temp virtual ~io_mux_call() {}; /** @@ -264,7 +264,8 @@ class io_mux_call { int m_cqepfd; /// poll sn - uint64_t m_poll_sn; + uint64_t m_poll_sn_rx; + uint64_t m_poll_sn_tx; /// xlio statistics. each implementation must initialize this. 
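The replacement of the single m_poll_sn member with m_poll_sn_rx and m_poll_sn_tx just above follows a simple pattern: each completion-queue direction keeps its own arming serial number, so requesting a notification for one direction never consumes the progress counter of the other. A rough sketch of the idea, with a placeholder ring_iface type that is not the XLIO ring class:

#include <cstdint>

// Placeholder interface standing in for a ring / CQ pair.
struct ring_iface {
    virtual int poll_rx(uint64_t *sn) = 0;                 // advances the RX serial number
    virtual int poll_tx(uint64_t *sn) = 0;                 // advances the TX serial number
    virtual int request_notification_rx(uint64_t sn) = 0;  // arm RX CQ at a known position
    virtual int request_notification_tx(uint64_t sn) = 0;  // arm TX CQ at a known position
    virtual ~ring_iface() = default;
};

// Each iomux call tracks the two directions independently, so arming the RX CQ
// with a stale TX counter (or vice versa) cannot happen.
class poll_state {
public:
    int poll(ring_iface &r)
    {
        int n = r.poll_rx(&m_poll_sn_rx);
        n += r.poll_tx(&m_poll_sn_tx);
        return n;
    }

    int arm(ring_iface &r)
    {
        int n = r.request_notification_rx(m_poll_sn_rx);
        n += r.request_notification_tx(m_poll_sn_tx);
        return n;
    }

private:
    uint64_t m_poll_sn_rx = 0;
    uint64_t m_poll_sn_tx = 0;
};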
iomux_func_stats_t *m_p_stats; diff --git a/src/core/iomux/poll_call.cpp b/src/core/iomux/poll_call.cpp index 93739b165..f767d558b 100644 --- a/src/core/iomux/poll_call.cpp +++ b/src/core/iomux/poll_call.cpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include @@ -54,7 +54,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i { nfds_t i; int fd; - m_fds = NULL; + m_fds = nullptr; // create stats m_p_stats = &g_poll_stats; @@ -71,7 +71,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i } fd = m_orig_fds[i].fd; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && (temp_sock_fd_api->get_type() == FD_TYPE_SOCKET)) { // POLLERR and POLLHUP are always enabled implicitly and considered as READ by XLIO offloaded_mode_t off_mode = OFF_READ; @@ -97,7 +97,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i __log_func("fd=%d must be skipped from os r poll()", fd); m_fds[i].fd = -1; } else if (m_orig_fds[i].events & POLLIN) { - if (temp_sock_fd_api->is_readable(NULL)) { + if (temp_sock_fd_api->is_readable(nullptr)) { io_mux_call::update_fd_array(&m_fd_ready_array, fd); m_n_ready_rfds++; m_n_all_ready_fds++; @@ -132,7 +132,7 @@ bool poll_call::wait_os(bool zero_timeout) { __log_func("calling os poll: %d", m_nfds); if (m_sigmask) { - struct timespec to, *pto = NULL; + struct timespec to, *pto = nullptr; if (zero_timeout) { to.tv_sec = to.tv_nsec = 0; pto = &to; @@ -141,9 +141,9 @@ bool poll_call::wait_os(bool zero_timeout) to.tv_nsec = (m_timeout % 1000) * 1000000; pto = &to; } - m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds, pto, m_sigmask); + m_n_all_ready_fds = SYSCALL(ppoll, m_fds, m_nfds, pto, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds, zero_timeout ? 0 : m_timeout); + m_n_all_ready_fds = SYSCALL(poll, m_fds, m_nfds, zero_timeout ? 
0 : m_timeout); } if (m_n_all_ready_fds < 0) { xlio_throw_object(io_mux_call::io_error); @@ -160,7 +160,7 @@ bool poll_call::wait(const timeval &elapsed) { // poll fds and cq int timeout; - struct timespec to, *pto = NULL; + struct timespec to, *pto = nullptr; if (m_timeout < 0) { timeout = m_timeout; @@ -175,9 +175,9 @@ bool poll_call::wait(const timeval &elapsed) to.tv_sec = m_timeout / 1000; to.tv_nsec = (m_timeout % 1000) * 1000000; pto = &to; - m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds + 1, pto, m_sigmask); + m_n_all_ready_fds = SYSCALL(ppoll, m_fds, m_nfds + 1, pto, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds + 1, timeout); + m_n_all_ready_fds = SYSCALL(poll, m_fds, m_nfds + 1, timeout); } if (m_n_all_ready_fds > 0 && m_fds[m_nfds].revents) { diff --git a/src/core/iomux/poll_call.h b/src/core/iomux/poll_call.h index b4f592dcb..1005c59e3 100644 --- a/src/core/iomux/poll_call.h +++ b/src/core/iomux/poll_call.h @@ -54,7 +54,7 @@ class poll_call : public io_mux_call { */ poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, int *lookup_buffer, pollfd *working_fds_arr, pollfd *fds, nfds_t nfds, int timeout, - const sigset_t *__sigmask = NULL); + const sigset_t *__sigmask = nullptr); /// @override virtual void set_offloaded_rfd_ready(int fd_index); diff --git a/src/core/iomux/select_call.cpp b/src/core/iomux/select_call.cpp index e81961047..23c9e069a 100644 --- a/src/core/iomux/select_call.cpp +++ b/src/core/iomux/select_call.cpp @@ -61,7 +61,6 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer , m_b_run_prepare_to_poll(false) { int fd; - // socket_fd_api* temp_sock_fd_api = NULL; if (m_nfds > FD_SETSIZE) { errno = ENOMEM; @@ -91,7 +90,7 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer bool check_read = offloaded_read && FD_ISSET(fd, m_readfds); bool check_write = offloaded_write && FD_ISSET(fd, m_writefds); - socket_fd_api *psock = fd_collection_get_sockfd(fd); + sockinfo *psock = fd_collection_get_sockfd(fd); if (psock && psock->get_type() == FD_TYPE_SOCKET) { @@ -112,7 +111,7 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer if (!psock->skip_os_select()) { if (check_read) { FD_SET(fd, &m_os_rfds); - if (psock->is_readable(NULL)) { + if (psock->is_readable(nullptr)) { io_mux_call::update_fd_array(&m_fd_ready_array, fd); m_n_ready_rfds++; m_n_all_ready_fds++; @@ -181,8 +180,8 @@ void select_call::prepare_to_block() bool select_call::wait_os(bool zero_timeout) { - timeval to, *pto = NULL; - timespec to_pselect, *pto_pselect = NULL; + timeval to, *pto = nullptr; + timespec to_pselect, *pto_pselect = nullptr; /* Avner: I put it in comment, because this logic is wrong @@ -219,9 +218,9 @@ bool select_call::wait_os(bool zero_timeout) pto_pselect = &to_pselect; } m_n_all_ready_fds = - orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + SYSCALL(pselect, m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.select(m_nfds, m_readfds, m_writefds, m_exceptfds, pto); + m_n_all_ready_fds = SYSCALL(select, m_nfds, m_readfds, m_writefds, m_exceptfds, pto); } if (m_n_all_ready_fds < 0) { xlio_throw_object(io_mux_call::io_error); @@ -234,8 +233,8 @@ bool select_call::wait_os(bool zero_timeout) bool select_call::wait(const timeval &elapsed) { - timeval timeout, *pto = NULL; - timespec to_pselect, *pto_pselect = NULL; + timeval timeout, *pto = nullptr; + 
timespec to_pselect, *pto_pselect = nullptr; BULLSEYE_EXCLUDE_BLOCK_START if (m_n_all_ready_fds > 0) { @@ -283,10 +282,10 @@ bool select_call::wait(const timeval &elapsed) pto_pselect = &to_pselect; } m_n_all_ready_fds = - orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + SYSCALL(pselect, m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); } else { m_n_all_ready_fds = - orig_os_api.select(m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); + SYSCALL(select, m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); } __log_func("done select CQ+OS nfds=%d cqfd=%d pto=%p ready=%d!!!", m_nfds_with_cq, m_cqepfd, pto, m_n_all_ready_fds); diff --git a/src/core/iomux/select_call.h b/src/core/iomux/select_call.h index c935c746e..f856455c0 100644 --- a/src/core/iomux/select_call.h +++ b/src/core/iomux/select_call.h @@ -52,7 +52,7 @@ class select_call : public io_mux_call { */ select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, timeval *timeout, - const sigset_t *__sigmask = NULL); + const sigset_t *__sigmask = nullptr); /// @override virtual void set_offloaded_rfd_ready(int fd_index); diff --git a/src/core/libxlio.c b/src/core/libxlio.c index 33467d8f9..26e48f232 100644 --- a/src/core/libxlio.c +++ b/src/core/libxlio.c @@ -33,6 +33,7 @@ extern int xlio_init(void); extern int xlio_exit(void); +#ifndef XLIO_STATIC_BUILD int __attribute__((constructor)) sock_redirect_lib_load_constructor(void) { return xlio_init(); @@ -42,3 +43,4 @@ int __attribute__((destructor)) sock_redirect_lib_load_destructor(void) { return xlio_exit(); } +#endif /* XLIO_STATIC_BUILD */ diff --git a/src/core/lwip/cc_cubic.c b/src/core/lwip/cc_cubic.c index 044fa84ac..9071cf228 100644 --- a/src/core/lwip/cc_cubic.c +++ b/src/core/lwip/cc_cubic.c @@ -81,7 +81,8 @@ */ #include "cc_cubic.h" -#include "errno.h" + +#include #include #if TCP_CC_ALGO_MOD diff --git a/src/core/lwip/def.h b/src/core/lwip/def.h index 5cb2bed41..4e4f22a81 100644 --- a/src/core/lwip/def.h +++ b/src/core/lwip/def.h @@ -32,8 +32,6 @@ #ifndef __LWIP_DEF_H__ #define __LWIP_DEF_H__ -/* arch.h might define NULL already */ - #include "core/lwip/opt.h" #ifdef __cplusplus @@ -47,65 +45,7 @@ extern "C" { #define NULL ((void *)0) #endif -/** Get the absolute difference between 2 u32_t values (correcting overflows) - * 'a' is expected to be 'higher' (without overflow) than 'b'. */ -#define LWIP_U32_DIFF(a, b) (((a) >= (b)) ? 
((a) - (b)) : (((a) + ((b) ^ 0xFFFFFFFF) + 1))) - -/* Endianess-optimized shifting of two u8_t to create one u16_t */ -#if BYTE_ORDER == LITTLE_ENDIAN -#define LWIP_MAKE_U16(a, b) ((a << 8) | b) -#else -#define LWIP_MAKE_U16(a, b) ((b << 8) | a) -#endif - -#ifndef LWIP_PLATFORM_BYTESWAP -#define LWIP_PLATFORM_BYTESWAP 0 -#endif - -#ifndef LWIP_PREFIX_BYTEORDER_FUNCS -/* workaround for naming collisions on some platforms */ - -#ifdef htons -#undef htons -#endif /* htons */ -#ifdef htonl -#undef htonl -#endif /* htonl */ -#ifdef ntohs -#undef ntohs -#endif /* ntohs */ -#ifdef ntohl -#undef ntohl -#endif /* ntohl */ - -#define htons(x) lwip_htons(x) -#define ntohs(x) lwip_ntohs(x) -#define htonl(x) lwip_htonl(x) -#define ntohl(x) lwip_ntohl(x) -#endif /* LWIP_PREFIX_BYTEORDER_FUNCS */ - -#if BYTE_ORDER == BIG_ENDIAN -#define lwip_htons(x) (x) -#define lwip_ntohs(x) (x) -#define lwip_htonl(x) (x) -#define lwip_ntohl(x) (x) -#define PP_HTONS(x) (x) -#define PP_NTOHS(x) (x) -#define PP_HTONL(x) (x) -#define PP_NTOHL(x) (x) -#else /* BYTE_ORDER != BIG_ENDIAN */ -#if LWIP_PLATFORM_BYTESWAP -#define lwip_htons(x) LWIP_PLATFORM_HTONS(x) -#define lwip_ntohs(x) LWIP_PLATFORM_HTONS(x) -#define lwip_htonl(x) LWIP_PLATFORM_HTONL(x) -#define lwip_ntohl(x) LWIP_PLATFORM_HTONL(x) -#else /* LWIP_PLATFORM_BYTESWAP */ -u16_t lwip_htons(u16_t x); -u16_t lwip_ntohs(u16_t x); -u32_t lwip_htonl(u32_t x); -u32_t lwip_ntohl(u32_t x); -#endif /* LWIP_PLATFORM_BYTESWAP */ - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* These macros should be calculated by the preprocessor and are used with compile-time constants only (so that there is no little-endian overhead at runtime). */ @@ -115,8 +55,12 @@ u32_t lwip_ntohl(u32_t x); ((((x)&0xff) << 24) | (((x)&0xff00) << 8) | (((x)&0xff0000UL) >> 8) | \ (((x)&0xff000000UL) >> 24)) #define PP_NTOHL(x) PP_HTONL(x) - -#endif /* BYTE_ORDER == BIG_ENDIAN */ +#else /* __BYTE_ORDER__ */ +#define PP_HTONS(x) (x) +#define PP_NTOHS(x) (x) +#define PP_HTONL(x) (x) +#define PP_NTOHL(x) (x) +#endif /* __BYTE_ORDER__ */ static inline u32_t read32_be(const void *addr) { diff --git a/src/core/lwip/init.c b/src/core/lwip/init.c deleted file mode 100644 index 653718deb..000000000 --- a/src/core/lwip/init.c +++ /dev/null @@ -1,53 +0,0 @@ -/** - * @file - * Modules initialization - * - */ - -/* - * Copyright (c) 2001-2004 Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT - * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. - * - * This file is part of the lwIP TCP/IP stack. - * - * Author: Adam Dunkels - * - */ - -#include "lwip/opt.h" - -#include "lwip/init.h" -#include "lwip/pbuf.h" -#include "lwip/tcp_impl.h" - -/** - * Perform Sanity check of user-configurable values, and initialize all modules. - */ -void lwip_init(void) -{ - /* Modules initialization */ - pbuf_init(); - tcp_init(); -} diff --git a/src/core/lwip/init.h b/src/core/lwip/init.h deleted file mode 100644 index 87a947afa..000000000 --- a/src/core/lwip/init.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2001-2004 Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. - * - * This file is part of the lwIP TCP/IP stack. - * - * Author: Adam Dunkels - * - */ -#ifndef __LWIP_INIT_H__ -#define __LWIP_INIT_H__ - -#include "lwip/opt.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* Modules initialization */ -void lwip_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* __LWIP_INIT_H__ */ diff --git a/src/core/lwip/opt.h b/src/core/lwip/opt.h index 466606f83..7f3aefbb1 100644 --- a/src/core/lwip/opt.h +++ b/src/core/lwip/opt.h @@ -91,35 +91,20 @@ #define CONST_TCP_MSS 1460 #define LWIP_TCP_MSS (lwip_tcp_mss) -/** - * TCP_SND_BUF: TCP sender buffer space (bytes). 
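Related note on the lwIP byte-order cleanup in this patch (def.h above and the opt.h hunk that follows): the hand-rolled BYTE_ORDER / LWIP_PLATFORM_BYTESWAP machinery is dropped in favor of the compiler-provided __BYTE_ORDER__ macros. A minimal sketch of that approach, assuming a GCC/Clang-style compiler; EXAMPLE_PP_HTONS is an illustrative name, not the lwIP macro:

#include <cstdint>

// Fail early when the compiler does not predefine the byte-order macros,
// mirroring the check added to opt.h.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__)
#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined"
#endif

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// Constant-folded byte swap, intended for compile-time constants only.
#define EXAMPLE_PP_HTONS(x) ((uint16_t)((((x) & 0xffU) << 8) | (((x) & 0xff00U) >> 8)))
#else
#define EXAMPLE_PP_HTONS(x) (x)
#endif

static_assert(EXAMPLE_PP_HTONS(0x1234) ==
                  (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ? 0x3412 : 0x1234),
              "byte-order macro sanity check");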
- */ -#define TCP_SND_BUF (lwip_tcp_snd_buf) -#define TCP_SND_BUF_NO_NAGLE 256000 - /* Misc */ -// replace lwip byte swapping to optimized one -#include - -#define LWIP_PLATFORM_BYTESWAP 1 -#define LWIP_PLATFORM_HTONS(x) bswap_16(x) -#define LWIP_PLATFORM_HTONL(x) bswap_32(x) - -// enable LWIP DEBUG here -#if 1 -//#define PBUF_DEBUG LWIP_DBG_ON -//#define TCP_DEBUG LWIP_DBG_ON -//#define TCP_INPUT_DEBUG LWIP_DBG_ON -//#define TCP_FR_DEBUG LWIP_DBG_ON -//#define TCP_RTO_DEBUG LWIP_DBG_ON -//#define TCP_CWND_DEBUG LWIP_DBG_ON -//#define TCP_WND_DEBUG LWIP_DBG_ON -//#define TCP_OUTPUT_DEBUG LWIP_DBG_ON -//#define TCP_RST_DEBUG LWIP_DBG_ON -//#define TCP_QLEN_DEBUG LWIP_DBG_ON -//#define TCP_TSO_DEBUG LWIP_DBG_ON -#endif +// Enable LWIP DEBUG here +//#define PBUF_DEBUG LWIP_DBG_ON +//#define TCP_DEBUG LWIP_DBG_ON +//#define TCP_INPUT_DEBUG LWIP_DBG_ON +//#define TCP_FR_DEBUG LWIP_DBG_ON +//#define TCP_RTO_DEBUG LWIP_DBG_ON +//#define TCP_CWND_DEBUG LWIP_DBG_ON +//#define TCP_WND_DEBUG LWIP_DBG_ON +//#define TCP_OUTPUT_DEBUG LWIP_DBG_ON +//#define TCP_RST_DEBUG LWIP_DBG_ON +//#define TCP_QLEN_DEBUG LWIP_DBG_ON +//#define TCP_TSO_DEBUG LWIP_DBG_ON /* --------------------------------- @@ -227,10 +212,10 @@ #define LWIP_TCP_KEEPALIVE 0 #endif -/* Define platform endianness */ -#ifndef BYTE_ORDER -#define BYTE_ORDER LITTLE_ENDIAN -#endif /* BYTE_ORDER */ +/* Platform endianness */ +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) +#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined" +#endif /* __BYTE_ORDER__ */ /* Define generic types used in lwIP */ typedef uint8_t u8_t; diff --git a/src/core/lwip/pbuf.c b/src/core/lwip/pbuf.c index de6152816..cfa96b6dc 100644 --- a/src/core/lwip/pbuf.c +++ b/src/core/lwip/pbuf.c @@ -162,16 +162,26 @@ u8_t pbuf_header(struct pbuf *p, s32_t header_size_increment) return 1; } - /* Check that we aren't going to move off the end of the pbuf */ - if (header_size_increment < 0 && (-header_size_increment) > (s32_t)p->len) { - return 1; - } + if (header_size_increment >= 0) { + u32_t header_increment = (u32_t)header_size_increment; + /* set new payload pointer */ + p->payload = (u8_t *)p->payload - header_increment; + /* modify pbuf length fields */ + p->len += header_increment; + p->tot_len += header_increment; + } else { + u32_t header_decrement = (u32_t)(-header_size_increment); + /* Check that we aren't going to move off the end of the pbuf */ + if (header_decrement > p->len) { + return 1; + } - /* set new payload pointer */ - p->payload = (u8_t *)p->payload - header_size_increment; - /* modify pbuf length fields */ - p->len += header_size_increment; - p->tot_len += header_size_increment; + /* set new payload pointer */ + p->payload = (u8_t *)p->payload + header_decrement; + /* modify pbuf length fields */ + p->len -= header_decrement; + p->tot_len -= header_decrement; + } LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_header: new %p (%" S32_F ")\n", (void *)p->payload, header_size_increment)); @@ -312,42 +322,3 @@ void pbuf_cat(struct pbuf *h, struct pbuf *t) * so netto there is no change to the reference count of t. */ } - -// windows scale needs large pbuf -/** - * This method modifies a 'pbuf chain', so that its total length is - * smaller than 64K. The remainder of the original pbuf chain is stored - * in *rest. - * This function never creates new pbufs, but splits an existing chain - * in two parts. The tot_len of the modified packet queue will likely be - * smaller than 64K. 
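The pbuf_header() rework in the pbuf.c hunk above splits the signed header_size_increment into explicit grow/shrink branches, so the length fields are only ever adjusted by an unsigned magnitude and the out-of-bounds check applies to the shrink case alone. A stripped-down sketch of the same arithmetic on a simplified buffer struct (not the lwIP pbuf itself):

#include <cstdint>

// Simplified stand-in for a pbuf: payload pointer plus length fields.
struct simple_buf {
    uint8_t *payload;
    uint32_t len;
    uint32_t tot_len;
};

// Returns 0 on success, 1 if shrinking would move past the end of the buffer.
static int adjust_header(simple_buf *p, int32_t increment)
{
    if (increment >= 0) {
        uint32_t grow = static_cast<uint32_t>(increment);
        p->payload -= grow;    // expose more header space in front of the payload
        p->len += grow;
        p->tot_len += grow;
    } else {
        uint32_t shrink = static_cast<uint32_t>(-increment);
        if (shrink > p->len) { // refuse to move past the end of the buffer
            return 1;
        }
        p->payload += shrink;  // hide header bytes
        p->len -= shrink;
        p->tot_len -= shrink;
    }
    return 0;
}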
- * 'packet queues' are not supported by this function. - */ -void pbuf_split_64k(struct pbuf *p, struct pbuf **rest) -{ - if (p == NULL || p->tot_len < 0xffff) { - // pbuf is smaller than 64K - *rest = NULL; - } else { - u32_t tot_len_front = 0; - struct pbuf *i = NULL; - - *rest = p; - while (*rest != NULL && tot_len_front + (*rest)->len <= 0xffff) { - tot_len_front += (*rest)->len; - i = *rest; - *rest = (*rest)->next; - } - /* i now points to last packet of the first segment. Set next - * pointer to NULL */ - i->next = NULL; - - /* Update the tot_len field in the first part */ - for (i = p; i && i->next != *rest && *rest; i = i->next) { - i->tot_len -= (*rest)->tot_len; - } - - /* tot_len field in rest does not need modifications */ - /* reference counters do not need modifications */ - } -} diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 4a4cbd6fb..683d3febc 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -55,20 +55,20 @@ enum { PBUF_DESC_NONE = 0, PBUF_DESC_MDESC, PBUF_DESC_FD, - PBUF_DESC_MAP, PBUF_DESC_MKEY, PBUF_DESC_STRIDE, PBUF_DESC_TLS_RX, PBUF_DESC_NVME_TX, + PBUF_DESC_EXPRESS, }; typedef struct { int attr; + u32_t mkey; union { - void *map; - void *mdesc; int fd; - u32_t mkey; + void *mdesc; + void *opaque; }; } pbuf_desc; @@ -80,7 +80,7 @@ struct pbuf { void *payload; /** length of this buffer */ - u16_t len; + u32_t len; u8_t gro; @@ -110,19 +110,6 @@ struct pbuf { pbuf_desc desc; }; -/** Prototype for a function to free a custom pbuf */ -typedef void (*pbuf_free_custom_fn)(struct pbuf *p); - -/** A custom pbuf: like a pbuf, but following a function pointer to free it. */ -struct pbuf_custom { - /** The actual pbuf */ - struct pbuf pbuf; - u64_t padding; /* TODO Remove and optimize mem_buf_desc alignment. */ -}; - -/* Initializes the pbuf module. This call is empty for now, but may not be in future. */ -#define pbuf_init() - void pbuf_realloc(struct pbuf *p, u32_t size); u8_t pbuf_header(struct pbuf *p, s32_t header_size); void pbuf_ref(struct pbuf *p); @@ -130,8 +117,6 @@ u8_t pbuf_free(struct pbuf *p); u8_t pbuf_clen(struct pbuf *p); void pbuf_cat(struct pbuf *head, struct pbuf *tail); -void pbuf_split_64k(struct pbuf *p, struct pbuf **rest); // windows scale needs large pbuf - #ifdef __cplusplus } #endif diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 059cf5cc5..ed8a707ed 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -94,7 +94,6 @@ u16_t lwip_tcp_mss = CONST_TCP_MSS; u8_t enable_push_flag = 1; u8_t enable_ts_option = 0; u32_t lwip_tcp_snd_buf = 0; -u32_t lwip_zc_tx_size = 0; u32_t lwip_tcp_nodelay_treshold = 0; /* slow timer value */ @@ -151,7 +150,7 @@ static err_t tcp_close_shutdown(struct tcp_pcb *pcb, u8_t rst_on_unacked_data) if (rst_on_unacked_data && ((get_tcp_state(pcb) == ESTABLISHED) || (get_tcp_state(pcb) == CLOSE_WAIT))) { - if ((pcb->refused_data != NULL) || (pcb->rcv_wnd != pcb->rcv_wnd_max)) { + if (pcb->rcv_wnd != pcb->rcv_wnd_max) { /* Not all data received by application, send RST to tell the remote side about this. */ LWIP_ASSERT("pcb->flags & TF_RXCLOSED", pcb->flags & TF_RXCLOSED); @@ -285,11 +284,6 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx) /* shutting down the tx AND rx side is the same as closing for the raw API */ return tcp_close_shutdown(pcb, 1); } - /* ... 
and free buffered data */ - if (pcb->refused_data != NULL) { - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } } if (shut_tx) { /* This can't happen twice since if it succeeds, the pcb's state is changed. @@ -709,7 +703,7 @@ void tcp_slowtmr(struct tcp_pcb *pcb) pcb->remote_ip, pcb->is_ipv6); ++pcb_remove; - err = ERR_ABRT; + err = ERR_TIMEOUT; ++pcb_reset; } #if LWIP_TCP_KEEPALIVE @@ -789,39 +783,14 @@ void tcp_slowtmr(struct tcp_pcb *pcb) } /** - * Is called every slow_tmr_interval and process data previously - * "refused" by upper layer (application) and sends delayed ACKs. - * + * Is called every slow_tmr_interval/2 and process data previously * and sends delayed ACKs. * Automatically called from tcp_tmr(). */ void tcp_fasttmr(struct tcp_pcb *pcb) { if (pcb != NULL && PCB_IN_ACTIVE_STATE(pcb)) { - /* If there is data which was previously "refused" by upper layer */ - while (pcb->refused_data != - NULL) { // 'while' instead of 'if' because windows scale uses large pbuf - struct pbuf *rest; - /* Notify again application with data previously received. */ - err_t err; - pbuf_split_64k(pcb->refused_data, &rest); - LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_fasttmr: notify kept packet\n")); - TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); - if (err == ERR_OK) { - pcb->refused_data = rest; - } else { - if (rest) { - pbuf_cat(pcb->refused_data, rest); /* undo splitting */ - } - if (err == ERR_ABRT) { - /* if err == ERR_ABRT, 'pcb' is already deallocated */ - pcb = NULL; - } - break; - } - } - /* send delayed ACKs */ - if (pcb && (pcb->flags & TF_ACK_DELAY)) { + if (pcb->flags & TF_ACK_DELAY) { LWIP_DEBUGF(TCP_DEBUG, ("tcp_fasttmr: delayed ACK\n")); tcp_ack_now(pcb); tcp_output(pcb); @@ -937,9 +906,9 @@ void tcp_pcb_init(struct tcp_pcb *pcb, u8_t prio, void *container) memset(pcb, 0, sizeof(*pcb)); pcb->my_container = container; - pcb->max_snd_buff = TCP_SND_BUF; pcb->is_last_seg_dropped = false; pcb->prio = prio; + pcb->max_snd_buff = lwip_tcp_snd_buf; pcb->snd_buf = pcb->max_snd_buff; pcb->snd_queuelen = 0; pcb->snd_scale = 0; @@ -1014,7 +983,7 @@ void tcp_pcb_recycle(struct tcp_pcb *pcb) u32_t iss; pcb->flags = 0; - pcb->max_snd_buff = TCP_SND_BUF; + pcb->max_snd_buff = lwip_tcp_snd_buf; pcb->snd_buf = pcb->max_snd_buff; pcb->user_timeout_ms = 0; pcb->ticks_since_data_sent = -1; @@ -1057,13 +1026,9 @@ void tcp_pcb_recycle(struct tcp_pcb *pcb) tcp_tx_pbuf_free(pcb, pcb->pbuf_alloc); pcb->pbuf_alloc = NULL; } - if (pcb->refused_data) { - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } } -struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u16_t length, pbuf_type type, pbuf_desc *desc, +struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u32_t length, pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; @@ -1245,11 +1210,6 @@ void tcp_pcb_purge(struct tcp_pcb *pcb) LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge\n")); - if (pcb->refused_data != NULL) { - LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: data left on ->refused_data\n")); - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } if (pcb->unsent != NULL) { LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: not all data sent\n")); } @@ -1271,7 +1231,7 @@ void tcp_pcb_purge(struct tcp_pcb *pcb) tcp_tx_segs_free(pcb, pcb->unsent); tcp_tx_segs_free(pcb, pcb->unacked); pcb->unacked = pcb->unsent = NULL; - pcb->last_unsent = NULL; + pcb->last_unacked = pcb->last_unsent = NULL; #if TCP_OVERSIZE pcb->unsent_oversize = 0; #endif /* TCP_OVERSIZE */ diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index 
bd231378c..5b818ebb8 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -50,7 +50,6 @@ void register_sys_now(sys_now_fn fn); extern u16_t lwip_tcp_mss; extern u32_t lwip_tcp_snd_buf; -extern u32_t lwip_zc_tx_size; extern u32_t lwip_tcp_nodelay_treshold; struct tcp_seg; @@ -333,7 +332,7 @@ struct tcp_pcb { u32_t acked; - u32_t snd_buf; /* Available buffer space for sending (in bytes). */ + s32_t snd_buf; /* Available buffer space for sending (in bytes). */ u32_t max_snd_buff; u32_t snd_sml_snt; /* maintain state for minshall's algorithm */ @@ -359,7 +358,6 @@ struct tcp_pcb { struct tcp_seg *ooseq; /* Received out of sequence segments. */ #endif /* TCP_QUEUE_OOSEQ */ - struct pbuf *refused_data; /* Data previously received but not yet taken by upper layer */ struct tcp_seg *seg_alloc; /* Available tcp_seg element for use */ struct pbuf *pbuf_alloc; /* Available pbuf element for use */ @@ -420,8 +418,6 @@ struct tcp_pcb { /* Maximum number of SGE */ u32_t max_send_sge; } tso; - - u32_t max_send_sge; }; typedef u16_t (*ip_route_mtu_fn)(struct tcp_pcb *pcb); @@ -480,6 +476,8 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); err_t tcp_write(struct tcp_pcb *pcb, const void *dataptr, u32_t len, u16_t apiflags, pbuf_desc *desc); +err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovcnt, + pbuf_desc *desc); #define TCP_PRIO_MIN 1 #define TCP_PRIO_NORMAL 64 diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index fad6e9322..c3e0c02b0 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -40,8 +40,6 @@ extern "C" { #endif -#define tcp_init() /* Compatibility define, no init needed. */ - /* Functions for interfacing with TCP: */ #if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) #pragma GCC visibility push(hidden) @@ -57,7 +55,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb); /* Used within the TCP code only: */ struct tcp_pcb *tcp_alloc(u8_t prio); -struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u16_t length, pbuf_type type, pbuf_desc *desc, +struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u32_t length, pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff); void tcp_tx_preallocted_buffers_free(struct tcp_pcb *pcb); void tcp_tx_pbuf_free(struct tcp_pcb *pcb, struct pbuf *pbuf); @@ -92,7 +90,7 @@ void set_tmr_resolution(u32_t v); ((tpcb)->flags & TF_INFR) || \ (((tpcb)->unsent != NULL) && \ (((tpcb)->unsent->next != NULL) || ((tpcb)->unsent->len >= (tpcb)->mss))) || \ - ((tcp_sndbuf(tpcb) == 0) || (tcp_sndqueuelen(tpcb) >= (tpcb)->max_tcp_snd_queuelen))) \ + ((tcp_sndbuf(tpcb) <= 0) || (tcp_sndqueuelen(tpcb) >= (tpcb)->max_tcp_snd_queuelen))) \ ? 1 \ : 0) #define tcp_output_nagle(tpcb) (tcp_do_output_nagle(tpcb) ? tcp_output(tpcb) : ERR_OK) diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index 19c9f74f7..75c234f3a 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -180,29 +180,6 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) in_data.recv_data = NULL; in_data.recv_flags = 0; - /* If there is data which was previously "refused" by upper layer */ - /* 'while' instead of 'if' because windows scale uses large pbuf */ - while (pcb->refused_data != NULL) { - struct pbuf *rest; - pbuf_split_64k(pcb->refused_data, &rest); - - /* Notify again application with data previously received. 
*/ - LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: notify kept packet\n")); - TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); - if (err == ERR_OK) { - pcb->refused_data = rest; - } else { - if (rest) { - pbuf_cat(pcb->refused_data, rest); /* undo splitting */ - } - /* if err == ERR_ABRT, 'pcb' is already deallocated */ - /* drop incoming packets, because pcb is "full" */ - LWIP_DEBUGF(TCP_INPUT_DEBUG, - ("tcp_input: drop incoming packets, because pcb is \"full\"\n")); - pbuf_free(p); - return; - } - } pcb->is_in_input = 1; err = tcp_process(pcb, &in_data); /* A return value of ERR_ABRT means that tcp_abort() was called @@ -230,9 +207,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) } } - while (in_data.recv_data != - NULL) { // 'while' instead of 'if' because windows scale uses large pbuf - struct pbuf *rest = NULL; + if (in_data.recv_data) { if (pcb->flags & TF_RXCLOSED) { /* received data although already closed -> abort (send RST) to notify the remote host that not all data has been @@ -241,30 +216,18 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) tcp_abort(pcb); goto aborted; } - pbuf_split_64k(in_data.recv_data, &rest); if (in_data.flags & TCP_PSH) { in_data.recv_data->flags |= PBUF_FLAG_PUSH; } /* Notify application that data has been received. */ TCP_EVENT_RECV(pcb, in_data.recv_data, ERR_OK, err); if (err == ERR_ABRT) { - if (rest) { - pbuf_cat(in_data.recv_data, rest); /* undo splitting */ - } goto aborted; } /* If the upper layer can't receive this data, store it */ if (err != ERR_OK) { - if (rest) { - pbuf_cat(in_data.recv_data, rest); /* undo splitting */ - } - pcb->refused_data = in_data.recv_data; - LWIP_DEBUGF( - TCP_INPUT_DEBUG, - ("tcp_input: keep incoming packet, because pcb is \"full\"\n")); - break; - } else { - in_data.recv_data = rest; + pcb->rcv_wnd += in_data.recv_data->tot_len; + pbuf_free(in_data.recv_data); } } @@ -285,11 +248,6 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) pcb->is_in_input = 0; /* Try to send something out. */ tcp_output(pcb); -#if TCP_INPUT_DEBUG -#if TCP_DEBUG - tcp_debug_print_state(get_tcp_state(pcb)); -#endif /* TCP_DEBUG */ -#endif /* TCP_INPUT_DEBUG */ } } /* Jump target if pcb has been aborted in a callback (by calling tcp_abort()). @@ -298,7 +256,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) pcb->is_in_input = 0; in_data.recv_data = NULL; - /* give up our reference to inseg.p */ + /* tcp_receive() sets in_data.inseg.p to NULL in case of recv_data */ if (in_data.inseg.p != NULL) { pbuf_free(in_data.inseg.p); in_data.inseg.p = NULL; @@ -1008,7 +966,6 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) struct tcp_seg *prev, *cseg; #endif /* TCP_QUEUE_OOSEQ */ struct pbuf *p; - s32_t off; s16_t m; u32_t right_wnd_edge; u32_t new_tot_len; @@ -1016,6 +973,9 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) s8_t persist = 0; if (in_data->flags & TCP_ACK) { + if (pcb->unacked) { + __builtin_prefetch(pcb->unacked->p); + } right_wnd_edge = pcb->snd_wnd + pcb->snd_wl2; /* Update window. 
*/ @@ -1221,6 +1181,7 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) /* If there's nothing left to acknowledge, stop the retransmit timer, otherwise reset it to start again */ if (pcb->unacked == NULL) { + pcb->last_unacked = NULL; if (persist) { /* start persist timer */ pcb->persist_cnt = 0; @@ -1375,7 +1336,7 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) adjust the ->data pointer in the seg and the segment length.*/ - off = pcb->rcv_nxt - in_data->seqno; + u32_t off = pcb->rcv_nxt - in_data->seqno; p = in_data->inseg.p; LWIP_ASSERT("inseg.p != NULL", in_data->inseg.p); if (in_data->inseg.p->len < off) { diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index aa8cc00ef..c307f5e81 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -137,16 +137,10 @@ err_t tcp_send_fin(struct tcp_pcb *pcb) { /* first, try to add the fin to the last unsent segment */ if (pcb->unsent != NULL) { - struct tcp_seg *last_unsent; - for (last_unsent = pcb->unsent; last_unsent->next != NULL; - last_unsent = last_unsent->next) { - ; - } - - if ((TCPH_FLAGS(last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { + if ((TCPH_FLAGS(pcb->last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ - TCPH_SET_FLAG(last_unsent->tcphdr, TCP_FIN); - last_unsent->tcp_flags |= TCP_FIN; + TCPH_SET_FLAG(pcb->last_unsent->tcphdr, TCP_FIN); + pcb->last_unsent->tcp_flags |= TCP_FIN; pcb->flags |= TF_FIN; return ERR_OK; } @@ -234,6 +228,29 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u return seg; } +/** + * Allocate a PBUF_RAM pbuf + * + * This function is like pbuf_alloc(layer, length, PBUF_RAM) except + * there may be extra bytes available at the end. + * + * @param length size of the pbuf's payload. + * @param pcb The TCP connection that will enqueue the pbuf. + * @param + */ +static struct pbuf *tcp_pbuf_prealloc_express(u32_t length, struct tcp_pcb *pcb, pbuf_type type, + pbuf_desc *desc, struct pbuf *p_buff) +{ + struct pbuf *p; + + p = tcp_tx_pbuf_alloc(pcb, 0, type, desc, p_buff); + if (p != NULL) { + LWIP_ASSERT("need unchained pbuf", p->next == NULL); + p->len = p->tot_len = length; + } + return p; +} + /** * Allocate a PBUF_RAM pbuf, perhaps with extra space at the end. * @@ -248,12 +265,12 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u * @param first_seg true when this pbuf will be used in the first enqueued segment. 
* @param */ -static struct pbuf *tcp_pbuf_prealloc(u16_t length, u16_t max_length, u16_t *oversize, +static struct pbuf *tcp_pbuf_prealloc(u32_t length, u32_t max_length, u16_t *oversize, struct tcp_pcb *pcb, pbuf_type type, u8_t tcp_write_flag_more, u8_t first_seg, pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; - u16_t alloc = length; + u32_t alloc = length; if (length < max_length) { /* Should we allocate an oversized pbuf, or just the minimum @@ -300,19 +317,8 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) return ERR_OK; } - /* fail on too much data */ - if (len > pcb->snd_buf) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, - ("tcp_write: too much data (len=%" U32_F " > snd_buf=%" U32_F ")\n", len, - pcb->snd_buf)); - pcb->flags |= TF_NAGLEMEMERR; - return ERR_MEM; - } - LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: queuelen: %" U32_F "\n", (u32_t)pcb->snd_queuelen)); - /* If total number of pbufs on the unsent/unacked queues exceeds the * configured maximum, return an error */ - /* check for configured max queuelen and possible overflow */ if ((pcb->snd_queuelen >= pcb->max_unsent_len) || (pcb->snd_queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, @@ -321,17 +327,12 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } - if (pcb->snd_queuelen != 0) { - } else { - LWIP_ASSERT("tcp_write: no pbufs on queue => both queues empty", - pcb->unacked == NULL && pcb->unsent == NULL); - } return ERR_OK; } -static inline u16_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) +static inline u32_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) { - u16_t size = pcb->mss; + u32_t size = pcb->mss; #if LWIP_TCP_TIMESTAMPS if ((pcb->flags & TF_TIMESTAMP)) { @@ -359,14 +360,17 @@ static inline u16_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) * To prompt the system to send data now, call tcp_output() after * calling tcp_write(). * + * The function will copy the data from arg to a new pbuf. + * * @param pcb Protocol control block for the TCP connection to enqueue data for. * @param arg Pointer to the data to be enqueued for sending. * @param len Data length in bytes - * @param apiflags combination of following flags : + * @param apiflags combination of following flags: * - TCP_WRITE_FLAG_COPY (0x01) data will be copied into memory belonging to the stack * - TCP_WRITE_FLAG_MORE (0x02) for TCP connection, PSH flag will be set on last segment sent * - TCP_WRITE_DUMMY (0x10) indicates if the packet is a dummy packet * - TCP_WRITE_FILE (0x40) data should be taken from file + * @param desc Additional metadata that allows later to check the data mkey/lkey. 
* @return ERR_OK if enqueued, another err_t on error */ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, pbuf_desc *desc) @@ -382,9 +386,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, u16_t oversize_used = 0; #endif /* TCP_OVERSIZE */ err_t err; - u16_t mss_local = 0; - u16_t mss_local_minus_opts; - int tot_p = 0; + u32_t mss_local = 0; + u32_t mss_local_minus_opts; const int piov_max_size = 512; const int piov_max_len = 65536; struct iovec piov[piov_max_size]; @@ -393,9 +396,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, off_t offset = 0; off_t offset_next = 0; - bool is_zerocopy = !!(apiflags & TCP_WRITE_ZEROCOPY); - bool is_file = (apiflags & (TCP_WRITE_FILE | TCP_WRITE_ZEROCOPY)) == TCP_WRITE_FILE; - pbuf_type type = (apiflags & TCP_WRITE_ZEROCOPY) ? PBUF_ZEROCOPY : PBUF_RAM; + bool is_file = (apiflags & TCP_WRITE_FILE) == TCP_WRITE_FILE; + pbuf_type type = PBUF_RAM; int byte_queued = pcb->snd_nxt - pcb->lastack; if (len < pcb->mss && !(apiflags & TCP_WRITE_DUMMY)) { @@ -413,14 +415,9 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, } queuelen = pcb->snd_queuelen; - if (is_zerocopy) { - mss_local = lwip_zc_tx_size; - } else { - mss_local = tcp_xmit_size_goal(pcb, 1); - } + mss_local = tcp_xmit_size_goal(pcb, 1); optflags |= (apiflags & TCP_WRITE_DUMMY) ? TF_SEG_OPTS_DUMMY_MSG : 0; - optflags |= (apiflags & TCP_WRITE_ZEROCOPY) ? TF_SEG_OPTS_ZEROCOPY : 0; #if LWIP_TCP_TIMESTAMPS if (pcb->flags & TF_TIMESTAMP) { @@ -430,10 +427,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, optlen = LWIP_TCP_OPT_LENGTH(optflags); mss_local_minus_opts = mss_local - optlen; - if (is_zerocopy) { - /* TCP options will reside in seg->l2_l3_tcphdr_zc */ - optlen = 0; - } if (is_file) { offset = offset_next = *(__off64_t *)arg; } @@ -460,21 +453,10 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, * pos records progress as data is segmented. */ - /* Find the tail of the unsent queue. */ - if (pcb->unsent != NULL) { - u16_t space; - u16_t unsent_optlen; - - if (!pcb->last_unsent || pcb->last_unsent->next) { - /* @todo: this could be sped up by keeping last_unsent in the pcb */ - for (pcb->last_unsent = pcb->unsent; pcb->last_unsent->next != NULL; - pcb->last_unsent = pcb->last_unsent->next) { - ; - } - } - + if (pcb->last_unsent != NULL) { + u32_t space; /* Usable space at the end of the last unsent segment */ - unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); + u16_t unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); if ((pcb->last_unsent->p->type == type) && (mss_local > pcb->last_unsent->len + unsent_optlen) && (TCP_SEQ_GEQ(pcb->last_unsent->seqno, pcb->snd_nxt)) && @@ -487,7 +469,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #endif /* TCP_OVERSIZE */ } seg = pcb->last_unsent; - tot_p = pbuf_clen(seg->p); /* * Phase 1: Copy data directly into an oversized pbuf. @@ -504,7 +485,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #endif /* TCP_OVERSIZE_DBGCHECK */ if (pcb->unsent_oversize > 0) { - if (!(apiflags & (TCP_WRITE_FILE | TCP_WRITE_ZEROCOPY))) { + if (!(apiflags & TCP_WRITE_FILE)) { oversize = pcb->unsent_oversize; LWIP_ASSERT("inconsistent oversize vs. space", oversize_used <= space); oversize_used = oversize < len ? 
oversize : len; @@ -525,9 +506,9 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, * the end. */ if (!is_file && (pos < len) && (space > 0) && (pcb->last_unsent->len > 0) && - (tot_p < (int)pcb->tso.max_send_sge)) { + (pbuf_clen(seg->p) < pcb->tso.max_send_sge)) { - u16_t seglen = space < len - pos ? space : len - pos; + u32_t seglen = space < len - pos ? space : len - pos; if ((concat_p = tcp_pbuf_prealloc(seglen, space, &oversize, pcb, type, TCP_WRITE_FLAG_MORE, 1, desc, NULL)) == NULL) { LWIP_DEBUGF( @@ -539,18 +520,13 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #if TCP_OVERSIZE_DBGCHECK pcb->last_unsent->oversize_left += oversize; #endif /* TCP_OVERSIZE_DBGCHECK */ - if (is_zerocopy) { - concat_p->payload = (u8_t *)arg + pos; - } else { - memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); - } + memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); pos += seglen; - queuelen += pbuf_clen(concat_p); + queuelen++; /* There is only one pbuf in the list */ } } else { #if TCP_OVERSIZE - pcb->last_unsent = NULL; LWIP_ASSERT("unsent_oversize mismatch (pcb->unsent is NULL)", pcb->unsent_oversize == 0); #endif /* TCP_OVERSIZE */ } @@ -564,13 +540,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, while (pos < len) { struct pbuf *p; u32_t left = len - pos; - u16_t max_len = mss_local_minus_opts; - u16_t seglen = left > max_len ? max_len : left; - - /* create pbuf of the exact size needed now, to later avoid the p1 (oversize) flow */ - if (is_zerocopy) { - max_len = seglen; - } + u32_t max_len = mss_local_minus_opts; + u32_t seglen = left > max_len ? max_len : left; /* If copy is set, memory should be allocated and data copied * into pbuf */ @@ -583,9 +554,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, } LWIP_ASSERT("tcp_write: check that first pbuf can hold the complete seglen", (p->len >= seglen)); - if (is_zerocopy) { - p->payload = (u8_t *)arg + pos; - } else if (is_file) { + if (is_file) { piov[piov_cur_index].iov_base = (void *)((char *)p->payload + optlen); piov[piov_cur_index].iov_len = seglen; @@ -612,7 +581,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, memcpy((char *)p->payload + optlen, (u8_t *)arg + pos, seglen); } - queuelen += pbuf_clen(p); + queuelen++; /* There is only one pbuf in the list */ /* Now that there are more segments queued, we check again if the * length of the queue exceeds the configured maximum or @@ -738,6 +707,192 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, return ERR_MEM; } +/** + * Write data for sending (but does not send it immediately). + * + * The function will zero-copy the data into the payload, i.e. the data pointer, instead of the + * data, will be set. + * + * @param pcb Protocol control block for the TCP connection to enqueue data for. + * @param iov Vector of the data buffers to be enqueued for sending. + * @param iovcnt Number of the iov elements. + * @param desc Additional metadata that allows later to check the data mkey/lkey. + * @return ERR_OK if enqueued, another err_t on error + */ +err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovcnt, pbuf_desc *desc) +{ + struct pbuf *p = NULL; + struct tcp_seg *seg = NULL; + struct tcp_seg *queue = NULL; + struct tcp_seg *last; + void *opaque = NULL; + const u32_t seglen_max = tcp_tso(pcb) ? 
pcb->tso.max_payload_sz : pcb->mss; + u32_t pos; + u32_t seglen; + u32_t last_seglen; + u32_t total_len = 0; + u16_t queuelen = 0; + u8_t optflags = TF_SEG_OPTS_ZEROCOPY; + + /* + * We may run out of memory at any point. In that case we must return ERR_MEM and not change + * anything in pcb. Therefore, all changes are recorded in local variables and committed at + * the end of the function. Some pcb fields are maintained in local copies. + */ + + last = pcb->last_unsent; + const bool can_merge = + last && (last->flags & TF_SEG_OPTS_ZEROCOPY) && TCP_SEQ_GEQ(last->seqno, pcb->snd_nxt); + if (!can_merge) { + /* We cannot append data to a segment of different type or a retransmitted segment. */ + last = NULL; + } + last_seglen = last ? last->len : 0; + + if (desc->attr == PBUF_DESC_EXPRESS) { + /* + * Keep opaque value only in the right most pbuf for each send operation. + * + * Express path needs to call the completion callback only after the send operation + * is completed and all the related buffers are not used by XLIO. + * Current implementation keeps the opaque in the last pbuf and calls the callback + * when the opaque is set. + * This implementation can call the callback while a buffer is still in SQ in a specific + * case of spurious retransmission. However, without HW offloads and user memory + * deregistration, the buffer in the SQ won't lead to a functional issue. + * This is a place for improvements. + */ + opaque = desc->opaque; + desc->opaque = NULL; + } + + for (unsigned i = 0; i < iovcnt; ++i) { + u8_t *data = (u8_t *)iov[i].iov_base; + const u32_t len = iov[i].iov_len; + pos = 0; + + /* Chain a new pbuf to the last segment if there is enough space. */ + if (last) { + seg = last; + const u32_t space = seglen_max - seg->len; + + if (space > 0 && pbuf_clen(seg->p) < pcb->tso.max_send_sge) { + seglen = space < len ? space : len; + + p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL); + if (!p) { + goto memerr; + } + p->payload = data; + pbuf_cat(seg->p, p); + seg->len += p->tot_len; + pos += seglen; + queuelen++; + } + } + + while (pos < len) { + u32_t left = len - pos; + seglen = left > seglen_max ? seglen_max : left; + + p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL); + if (!p) { + goto memerr; + } + p->payload = data + pos; + + seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + total_len + pos, optflags); + if (!seg) { + tcp_tx_pbuf_free(pcb, p); + goto memerr; + } + + if (!queue) { + queue = seg; + } + if (last) { + last->next = seg; + } + last = seg; + + pos += seglen; + queuelen++; + } + + total_len += len; + } + + /* Set the PSH flag in the last segment that we enqueued. */ + if (enable_push_flag && seg != NULL && seg->tcphdr != NULL) { + TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); + } + +#if TCP_OVERSIZE + pcb->unsent_oversize = 0; +#endif /* TCP_OVERSIZE */ + + if (!pcb->last_unsent) { + pcb->unsent = queue; + } else { + /* The next field is either NULL or equals to queue, so we can overwrite. */ + pcb->last_unsent->next = queue; + } + if (last) { + pcb->last_unsent = last; + } + + if (desc->attr == PBUF_DESC_EXPRESS) { + /* See description above. */ + if (p) { + /* 'p' is the last allocated pbuf. */ + p->desc.opaque = opaque; + } + desc->opaque = opaque; + } + + /* Update the pcb state. */ + pcb->snd_lbb += total_len; + pcb->snd_buf -= total_len; + pcb->snd_queuelen += queuelen; + + /* TODO Move Minshall's logic to tcp_output(). 
*/ + if (total_len < pcb->mss) { + const u32_t byte_queued = pcb->snd_nxt - pcb->lastack; + pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; + } + + return ERR_OK; + +memerr: + /* Error path - restore unsent queue. */ + pcb->flags |= TF_NAGLEMEMERR; + if (queue != NULL) { + tcp_tx_segs_free(pcb, queue); + } + if (pcb->last_unsent && last_seglen > 0) { + pcb->last_unsent->next = NULL; + p = pcb->last_unsent->p; + while (last_seglen > 0) { + last_seglen -= p->len; + p = p->next; + } + if (p) { + pcb->last_unsent->len -= p->tot_len; + struct pbuf *ptmp = pcb->last_unsent->p; + while (ptmp) { + ptmp->tot_len -= p->tot_len; + if (ptmp->next == p) { + ptmp->next = NULL; + } + ptmp = ptmp->next; + } + assert(pcb->last_unsent->len == last_seglen); + assert(pcb->last_unsent->p->tot_len == last_seglen); + } + } + return ERR_MEM; +} + /** * Enqueue TCP options for transmission. * @@ -796,22 +951,11 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) #endif /* LWIP_TCP_TIMESTAMPS */ optlen = LWIP_TCP_OPT_LENGTH(optflags); - /* tcp_enqueue_flags is always called with either SYN or FIN in flags. - * We need one available snd_buf byte to do that. - * This means we can't send FIN while snd_buf==0. A better fix would be to - * not include SYN and FIN sequence numbers in the snd_buf count. */ - - /*if (pcb->snd_buf == 0) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: no send buffer available\n")); - return ERR_MEM; - }*/ //to consider snd_buf for syn or fin, unmarked sections with SND_BUF_FOR_SYN_FIN - /* Allocate pbuf with room for TCP header + options */ if ((p = tcp_tx_pbuf_alloc(pcb, optlen, PBUF_RAM, NULL, NULL)) == NULL) { pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } - LWIP_ASSERT("tcp_enqueue_flags: check that first pbuf can hold optlen", (p->len >= optlen)); /* Allocate memory for tcp_seg, and fill in fields. 
*/ if ((seg = tcp_create_segment(pcb, p, flags, pcb->snd_lbb, optflags)) == NULL) { @@ -819,7 +963,6 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) tcp_tx_pbuf_free(pcb, p); return ERR_MEM; } - LWIP_ASSERT("tcp_enqueue_flags: invalid segment length", seg->len == 0); LWIP_DEBUGF( TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, @@ -830,11 +973,7 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) if (pcb->unsent == NULL) { pcb->unsent = seg; } else { - struct tcp_seg *useg; - for (useg = pcb->unsent; useg->next != NULL; useg = useg->next) { - ; - } - useg->next = seg; + pcb->last_unsent->next = seg; } pcb->last_unsent = seg; #if TCP_OVERSIZE @@ -843,10 +982,10 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) #endif /* TCP_OVERSIZE */ /* SYN and FIN bump the sequence number */ - if ((flags & TCP_SYN) || (flags & TCP_FIN)) { + if (flags & (TCP_SYN | TCP_FIN)) { pcb->snd_lbb++; /* optlen does not influence snd_buf */ - // pcb->snd_buf--; SND_BUF_FOR_SYN_FIN + pcb->snd_buf--; } if (flags & TCP_FIN) { pcb->flags |= TF_FIN; @@ -856,10 +995,6 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) pcb->snd_queuelen += pbuf_clen(seg->p); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %" S16_F " (after enqueued)\n", pcb->snd_queuelen)); - if (pcb->snd_queuelen != 0) { - LWIP_ASSERT("tcp_enqueue_flags: invalid queue length", - pcb->unacked != NULL || pcb->unsent != NULL); - } return ERR_OK; } @@ -953,7 +1088,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) u32_t max_payload_sz = LWIP_MIN(pcb->tso.max_payload_sz, (wnd - (seg->seqno - pcb->lastack))); u32_t tot_len = 0; u8_t flags = seg->flags; - int tot_p = 0; + u8_t tot_p = 0; /* Ignore retransmitted segments and special segments */ @@ -972,7 +1107,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) } tot_p += pbuf_clen(cur_seg->p); - if (tot_p > (int)pcb->max_send_sge) { + if (tot_p > pcb->tso.max_send_sge) { goto err; } @@ -1034,7 +1169,7 @@ static struct tcp_seg *tcp_split_one_segment(struct tcp_pcb *pcb, struct tcp_seg struct tcp_seg *result = NULL; struct pbuf *cur_p = NULL; int tcp_hlen_delta; - u16_t max_length = 0; + u32_t max_length = 0; u16_t oversize = 0; pbuf_type type = PBUF_RAM; @@ -1160,8 +1295,8 @@ __attribute__((unused)) static struct tcp_seg *tcp_rexmit_segment(struct tcp_pcb struct tcp_seg *new_seg = NULL; struct pbuf *cur_p = NULL; int tcp_hlen_delta; - u16_t mss_local = 0; - u16_t mss_local_minus_opts; + u32_t mss_local = 0; + u32_t mss_local_minus_opts; u8_t optflags = 0; u8_t optlen = 0; u32_t seqno = 0; @@ -1382,8 +1517,8 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) u16_t oversize = 0; u8_t optlen = 0; u8_t optflags = 0; - u16_t mss_local = 0; - u16_t max_length; + u32_t mss_local = 0; + u32_t max_length; pbuf_type type = PBUF_RAM; int is_zerocopy = 0; @@ -1429,6 +1564,11 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) return; } + if (seg->p->desc.attr == PBUF_DESC_EXPRESS) { + /* Keep opaque value only in the right most pbuf for each send operation. 
*/ + seg->p->desc.opaque = NULL; + } + /* Copy the data from the original buffer */ if (is_zerocopy) { p->payload = (char *)seg->p->payload + lentosend; @@ -1486,6 +1626,7 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) struct pbuf *pnewtail = seg->p; struct pbuf *ptmp = seg->p; u32_t headchainlen = seg->p->len; + oversize = 1; // count bufs in the left seg while ((headchainlen + pnewhead->len - (tcp_hlen_delta + optlen)) <= lentosend) { if (pnewtail->ref > 1) { @@ -1495,6 +1636,7 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) headchainlen += pnewhead->len; pnewtail = pnewhead; pnewhead = pnewhead->next; + oversize++; if (NULL == pnewhead) { LWIP_ASSERT("tcp_split_segment: We should not be here", 0); @@ -1625,6 +1767,9 @@ err_t tcp_output(struct tcp_pcb *pcb) pcb->unacked->next = pcb->unsent; pcb->unsent = pcb->unacked; pcb->unacked = NULL; + if (NULL == pcb->last_unsent) { + pcb->last_unsent = pcb->last_unacked; + } pcb->last_unacked = NULL; } seg = pcb->unsent; @@ -1835,7 +1980,6 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) /* zc_buf is only used to pass pointer to TCP header to ip_output(). */ struct pbuf zc_pbuf; struct pbuf *p; - u16_t len; u32_t *opts; /* The TCP header has already been constructed, but the ackno and @@ -1861,14 +2005,14 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) if (seg->flags & TF_SEG_OPTS_MSS) { /* coverity[result_independent_of_operands] */ TCP_BUILD_MSS_OPTION(*opts, pcb->advtsd_mss); - opts += 1; // Move to the next line (meaning next 32 bit) as this option is 4 bytes long + opts++; // Move to the next line (meaning next 32 bit) as this option is 4 bytes long } /* If RCV_SCALE is set then prepare segment for window scaling option */ if (seg->flags & TF_SEG_OPTS_WNDSCALE) { TCP_BUILD_WNDSCALE_OPTION(*opts, rcv_wnd_scale); - opts += 1; // Move to the next line (meaning next 32 bit) as this option is 3 bytes long + - // we added 1 byte NOOP padding => total 4 bytes + opts++; // Move to the next line (meaning next 32 bit) as this option is 3 bytes long + + // we added 1 byte NOOP padding => total 4 bytes } #if LWIP_TCP_TIMESTAMPS @@ -1925,7 +2069,7 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) p->next = seg->p; p->len = p->tot_len = LWIP_TCP_HDRLEN(seg->tcphdr); } else { - len = (u16_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); + u32_t len = (u32_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); seg->p->len -= len; seg->p->tot_len -= len; @@ -2008,29 +2152,25 @@ void tcp_rst(u32_t seqno, u32_t ackno, u16_t local_port, u16_t remote_port, stru */ void tcp_rexmit_rto(struct tcp_pcb *pcb) { - struct tcp_seg *seg; - if (pcb->unacked == NULL) { return; } /* Move all unacked segments to the head of the unsent queue */ - for (seg = pcb->unacked; seg->next != NULL; seg = seg->next) { - ; - } - /* concatenate unsent queue after unacked queue */ - seg->next = pcb->unsent; - if (pcb->unsent == NULL) { + if (pcb->unsent) { + pcb->last_unacked->next = pcb->unsent; + } else { /* If there are no unsent segments, update last_unsent to the last unacked */ - pcb->last_unsent = seg; + pcb->last_unsent = pcb->last_unacked; #if TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK - pcb->unsent_oversize = seg->oversize_left; + pcb->unsent_oversize = pcb->last_unacked->oversize_left; #endif /* TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK*/ } /* unsent queue is the concatenated queue (of unacked, unsent) */ pcb->unsent = pcb->unacked; /* unacked 
queue is now empty */ pcb->unacked = NULL; + pcb->last_unacked = NULL; /* increment number of retransmissions */ ++pcb->nrtx; @@ -2061,7 +2201,11 @@ void tcp_rexmit(struct tcp_pcb *pcb) /* Move the first unacked segment to the unsent queue */ /* Keep the unsent queue sorted. */ seg = pcb->unacked; - pcb->unacked = seg->next; + + pcb->unacked = pcb->unacked->next; + if (NULL == pcb->unacked) { + pcb->last_unacked = NULL; + } cur_seg = &(pcb->unsent); while (*cur_seg && TCP_SEQ_LT((*cur_seg)->seqno, seg->seqno)) { @@ -2197,8 +2341,8 @@ void tcp_zero_window_probe(struct tcp_pcb *pcb) struct tcp_seg *seg; u16_t len; u8_t is_fin; - u32_t snd_nxt; u8_t optlen = 0; + u32_t snd_nxt; u32_t *opts; LWIP_DEBUGF_IP_ADDR(TCP_DEBUG, "tcp_zero_window_probe: sending ZERO WINDOW probe to ", @@ -2262,7 +2406,7 @@ void tcp_zero_window_probe(struct tcp_pcb *pcb) } /* The byte may be acknowledged without the window being opened. */ - snd_nxt = lwip_ntohl(seg->tcphdr->seqno) + 1; + snd_nxt = ntohl(seg->tcphdr->seqno) + 1; if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) { pcb->snd_nxt = snd_nxt; } diff --git a/src/core/main.cpp b/src/core/main.cpp index 4557d4625..7335abc35 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -52,6 +52,7 @@ #include "util/xlio_stats.h" #include "util/utils.h" #include "event/event_handler_manager.h" +#include "event/poll_group.h" #include "event/vlogger_timer_handler.h" #include "dev/buffer_pool.h" #include "dev/ib_ctx_handler_collection.h" @@ -65,18 +66,18 @@ #include "proto/neighbour_table_mgr.h" #include "netlink/netlink_wrapper.h" #include "event/command.h" - +#include "sock/sock_stats.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" #include "sock/fd_collection.h" #include "sock/sockinfo_tcp.h" #include "sock/sockinfo_udp.h" -#include "sock/tcp_seg_pool.h" #include "sock/bind_no_port.h" #include "iomux/io_mux_call.h" #include "util/instrumentation.h" #include "util/agent.h" +#include "xlio.h" void check_netperf_flags(); @@ -105,10 +106,12 @@ bool g_b_exit = false; bool g_init_ibv_fork_done = false; bool g_is_forked_child = false; bool g_init_global_ctors_done = true; -static command_netlink *s_cmd_nl = NULL; +static command_netlink *s_cmd_nl = nullptr; #define MAX_VERSION_STR_LEN 128 global_stats_t g_global_stat_static; +static uint32_t g_ec_pool_size = 0U; +static uint32_t g_ec_pool_no_objs = 0U; static int free_libxlio_resources() { @@ -134,42 +137,44 @@ static int free_libxlio_resources() if (g_tcp_timers_collection) { g_tcp_timers_collection->clean_obj(); } - g_tcp_timers_collection = NULL; + g_tcp_timers_collection = nullptr; // Block all sock-redicrt API calls into our offloading core fd_collection *g_p_fd_collection_temp = g_p_fd_collection; - g_p_fd_collection = NULL; + g_p_fd_collection = nullptr; if (g_p_fd_collection_temp) { delete g_p_fd_collection_temp; } + poll_group::destroy_all_groups(); + if (g_p_lwip) { delete g_p_lwip; } - g_p_lwip = NULL; + g_p_lwip = nullptr; if (g_p_route_table_mgr) { delete g_p_route_table_mgr; } - g_p_route_table_mgr = NULL; + g_p_route_table_mgr = nullptr; if (g_bind_no_port) { delete g_bind_no_port; } - g_bind_no_port = NULL; + g_bind_no_port = nullptr; if (g_p_rule_table_mgr) { delete g_p_rule_table_mgr; } - g_p_rule_table_mgr = NULL; + g_p_rule_table_mgr = nullptr; if (g_p_net_device_table_mgr) { delete g_p_net_device_table_mgr; } - g_p_net_device_table_mgr = NULL; + g_p_net_device_table_mgr = nullptr; ip_frag_manager *g_p_ip_frag_manager_temp = g_p_ip_frag_manager; - g_p_ip_frag_manager = NULL; + g_p_ip_frag_manager = 
nullptr; if (g_p_ip_frag_manager_temp) { delete g_p_ip_frag_manager_temp; } @@ -177,12 +182,17 @@ static int free_libxlio_resources() if (g_p_neigh_table_mgr) { delete g_p_neigh_table_mgr; } - g_p_neigh_table_mgr = NULL; + g_p_neigh_table_mgr = nullptr; if (g_tcp_seg_pool) { delete g_tcp_seg_pool; } - g_tcp_seg_pool = NULL; + g_tcp_seg_pool = nullptr; + + if (g_socketxtreme_ec_pool) { + delete g_socketxtreme_ec_pool; + } + g_socketxtreme_ec_pool = NULL; if (safe_mce_sys().print_report) { buffer_pool::print_report_on_errors(VLOG_INFO); @@ -191,64 +201,64 @@ static int free_libxlio_resources() if (g_buffer_pool_zc) { delete g_buffer_pool_zc; } - g_buffer_pool_zc = NULL; + g_buffer_pool_zc = nullptr; if (g_buffer_pool_tx) { delete g_buffer_pool_tx; } - g_buffer_pool_tx = NULL; + g_buffer_pool_tx = nullptr; if (g_buffer_pool_rx_stride) { delete g_buffer_pool_rx_stride; } - g_buffer_pool_rx_stride = NULL; + g_buffer_pool_rx_stride = nullptr; if (g_buffer_pool_rx_rwqe) { delete g_buffer_pool_rx_rwqe; } - g_buffer_pool_rx_rwqe = NULL; + g_buffer_pool_rx_rwqe = nullptr; if (g_zc_cache) { delete g_zc_cache; } - g_zc_cache = NULL; + g_zc_cache = nullptr; xlio_heap::finalize(); if (s_cmd_nl) { delete s_cmd_nl; } - s_cmd_nl = NULL; + s_cmd_nl = nullptr; if (g_p_netlink_handler) { delete g_p_netlink_handler; } - g_p_netlink_handler = NULL; + g_p_netlink_handler = nullptr; if (g_p_ib_ctx_handler_collection) { delete g_p_ib_ctx_handler_collection; } - g_p_ib_ctx_handler_collection = NULL; + g_p_ib_ctx_handler_collection = nullptr; if (g_p_vlogger_timer_handler) { delete g_p_vlogger_timer_handler; } - g_p_vlogger_timer_handler = NULL; + g_p_vlogger_timer_handler = nullptr; if (g_p_event_handler_manager) { delete g_p_event_handler_manager; } - g_p_event_handler_manager = NULL; + g_p_event_handler_manager = nullptr; if (g_p_agent) { delete g_p_agent; } - g_p_agent = NULL; + g_p_agent = nullptr; if (safe_mce_sys().app_name) { free(safe_mce_sys().app_name); } - safe_mce_sys().app_name = NULL; + safe_mce_sys().app_name = nullptr; vlog_printf(VLOG_DEBUG, "Stopping logger module\n"); @@ -260,14 +270,14 @@ static int free_libxlio_resources() // cosmetics - remove when adding iomux block fprintf(g_stats_file, "======================================================\n"); fclose(g_stats_file); - g_stats_file = NULL; + g_stats_file = nullptr; } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app) { delete g_p_app; } - g_p_app = NULL; + g_p_app = nullptr; #endif return 0; @@ -430,7 +440,7 @@ int get_ofed_version_info(char *ofed_version_str, int len) void print_xlio_global_settings() { struct utsname sys_info; - time_t clock = time(NULL); + time_t clock = time(nullptr); char ofed_version_info[MAX_VERSION_STR_LEN]; vlog_printf(VLOG_INFO, @@ -557,8 +567,6 @@ void print_xlio_global_settings() SYS_VAR_TX_NUM_BUFS); VLOG_PARAM_STRING("Tx Mem Buf size", safe_mce_sys().tx_buf_size, MCE_DEFAULT_TX_BUF_SIZE, SYS_VAR_TX_BUF_SIZE, option_size::to_str(safe_mce_sys().tx_buf_size)); - VLOG_PARAM_STRING("ZC TX size", safe_mce_sys().zc_tx_size, MCE_DEFAULT_ZC_TX_SIZE, - SYS_VAR_ZC_TX_SIZE, option_size::to_str(safe_mce_sys().zc_tx_size)); VLOG_PARAM_NUMBER("Tx QP WRE", safe_mce_sys().tx_num_wr, MCE_DEFAULT_TX_NUM_WRE, SYS_VAR_TX_NUM_WRE); VLOG_PARAM_NUMBER("Tx QP WRE Batching", safe_mce_sys().tx_num_wr_to_signal, @@ -649,9 +657,9 @@ void print_xlio_global_settings() VLOG_PARAM_STRING("Force Flowtag for MC", safe_mce_sys().mc_force_flowtag, MCE_DEFAULT_MC_FORCE_FLOWTAG, SYS_VAR_MC_FORCE_FLOWTAG, 
safe_mce_sys().mc_force_flowtag ? "Enabled " : "Disabled"); - VLOG_STR_PARAM_STRING("Striding RQ", option_strq::to_str(safe_mce_sys().enable_strq_env), - option_strq::to_str(MCE_DEFAULT_STRQ), SYS_VAR_STRQ, - option_strq::to_str(safe_mce_sys().enable_strq_env)); + VLOG_STR_PARAM_STRING("Striding RQ", option_3::to_str(safe_mce_sys().enable_strq_env), + option_3::to_str(MCE_DEFAULT_STRQ), SYS_VAR_STRQ, + option_3::to_str(safe_mce_sys().enable_strq_env)); VLOG_PARAM_NUMBER("STRQ Strides per RWQE", safe_mce_sys().strq_stride_num_per_rwqe, MCE_DEFAULT_STRQ_NUM_STRIDES, SYS_VAR_STRQ_NUM_STRIDES); VLOG_PARAM_NUMBER("STRQ Stride Size (Bytes)", safe_mce_sys().strq_stride_size_bytes, @@ -781,8 +789,8 @@ void print_xlio_global_settings() VLOG_PARAM_STRING("Memory limit (user allocator)", safe_mce_sys().memory_limit_user, MCE_DEFAULT_MEMORY_LIMIT_USER, SYS_VAR_MEMORY_LIMIT_USER, option_size::to_str(safe_mce_sys().memory_limit_user)); - VLOG_PARAM_NUMBER("Hugepage log2", safe_mce_sys().hugepage_log2, MCE_DEFAULT_HUGEPAGE_LOG2, - SYS_VAR_HUGEPAGE_LOG2); + VLOG_PARAM_STRING("Hugepage size", safe_mce_sys().hugepage_size, MCE_DEFAULT_HUGEPAGE_SIZE, + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(safe_mce_sys().hugepage_size)); VLOG_PARAM_NUMBER("Num of UC ARPs", safe_mce_sys().neigh_uc_arp_quata, MCE_DEFAULT_NEIGH_UC_ARP_QUATA, SYS_VAR_NEIGH_UC_ARP_QUATA); @@ -797,6 +805,8 @@ void print_xlio_global_settings() VLOG_STR_PARAM_STRING("TSO support", option_3::to_str(safe_mce_sys().enable_tso), option_3::to_str(MCE_DEFAULT_TSO), SYS_VAR_TSO, option_3::to_str(safe_mce_sys().enable_tso)); + VLOG_PARAM_STRING("TSO max size", safe_mce_sys().max_tso_sz, MCE_DEFAULT_MAX_TSO_SIZE, + SYS_VAR_MAX_TSO_SIZE, option_size::to_str(safe_mce_sys().max_tso_sz)); VLOG_STR_PARAM_STRING("LRO support", option_3::to_str(safe_mce_sys().enable_lro), option_3::to_str(MCE_DEFAULT_LRO), SYS_VAR_LRO, option_3::to_str(safe_mce_sys().enable_lro)); @@ -807,12 +817,14 @@ void print_xlio_global_settings() SYS_VAR_UTLS_RX, safe_mce_sys().enable_utls_rx ? "Enabled " : "Disabled"); VLOG_PARAM_STRING("UTLS TX support", safe_mce_sys().enable_utls_tx, MCE_DEFAULT_UTLS_TX, SYS_VAR_UTLS_TX, safe_mce_sys().enable_utls_tx ? 
"Enabled " : "Disabled"); - VLOG_PARAM_NUMBER( - "UTLS high watermark DEK cache size", safe_mce_sys().utls_high_wmark_dek_cache_size, - MCE_DEFAULT_UTLS_HIGH_WMARK_DEK_CACHE_SIZE, SYS_VAR_UTLS_HIGH_WMARK_DEK_CACHE_SIZE); - VLOG_PARAM_NUMBER( - "UTLS low watermark DEK cache size", safe_mce_sys().utls_low_wmark_dek_cache_size, - MCE_DEFAULT_UTLS_LOW_WMARK_DEK_CACHE_SIZE, SYS_VAR_UTLS_LOW_WMARK_DEK_CACHE_SIZE); + VLOG_PARAM_NUMBER("UTLS high watermark DEK cache size", + static_cast(safe_mce_sys().utls_high_wmark_dek_cache_size), + MCE_DEFAULT_UTLS_HIGH_WMARK_DEK_CACHE_SIZE, + SYS_VAR_UTLS_HIGH_WMARK_DEK_CACHE_SIZE); + VLOG_PARAM_NUMBER("UTLS low watermark DEK cache size", + static_cast(safe_mce_sys().utls_low_wmark_dek_cache_size), + MCE_DEFAULT_UTLS_LOW_WMARK_DEK_CACHE_SIZE, + SYS_VAR_UTLS_LOW_WMARK_DEK_CACHE_SIZE); #endif /* DEFINED_UTLS */ #if defined(DEFINED_NGINX) VLOG_PARAM_NUMBER("Number of Nginx workers", @@ -889,13 +901,6 @@ void print_xlio_global_settings() vlog_printf(VLOG_INFO, "---------------------------------------------------------------------------\n"); - -#if !defined(DEFINED_DPCP) - if (safe_mce_sys().mce_spec == MCE_SPEC_NVME_BF2) { - vlog_printf(VLOG_INFO, "XLIO '%s' spec is used without enabled DPCP!\n", - xlio_spec::to_str(MCE_SPEC_NVME_BF2)); - } -#endif } void prepare_fork() @@ -933,7 +938,7 @@ void register_handler_segv() act.sa_handler = handle_segfault; act.sa_flags = 0; sigemptyset(&act.sa_mask); - sigaction(SIGSEGV, &act, NULL); + sigaction(SIGSEGV, &act, nullptr); vlog_printf(VLOG_INFO, "Registered a SIGSEGV handler\n"); } @@ -1042,6 +1047,8 @@ static void do_global_ctors_helper() *g_p_vlogger_level = g_vlogger_level; *g_p_vlogger_details = g_vlogger_details; + sock_stats::instance().init_sock_stats(safe_mce_sys().stats_fd_num_max); + g_global_stat_static.init(); xlio_stats_instance_create_global_block(&g_global_stat_static); @@ -1097,20 +1104,24 @@ static void do_global_ctors_helper() safe_mce_sys().lwip_mss)), (safe_mce_sys().m_ioctl.user_alloc.flags & IOCTL_USER_ALLOC_TX ? safe_mce_sys().m_ioctl.user_alloc.memalloc - : NULL), + : nullptr), (safe_mce_sys().m_ioctl.user_alloc.flags & IOCTL_USER_ALLOC_TX ? safe_mce_sys().m_ioctl.user_alloc.memfree - : NULL))); + : nullptr))); NEW_CTOR(g_buffer_pool_zc, buffer_pool(BUFFER_POOL_TX, 0)); - NEW_CTOR(g_tcp_seg_pool, tcp_seg_pool()); + NEW_CTOR(g_tcp_seg_pool, + tcp_seg_pool("TCP segments", safe_mce_sys().tx_segs_pool_batch_tcp, + g_global_stat_static.n_tcp_seg_pool_size, + g_global_stat_static.n_tcp_seg_pool_no_segs)); + + NEW_CTOR(g_socketxtreme_ec_pool, + socketxtreme_ec_pool("Socketxtreme ec", 512, g_ec_pool_size, g_ec_pool_no_objs)); // For delegated TCP timers the global collection is not used. 
if (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - NEW_CTOR(g_tcp_timers_collection, - tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec, - safe_mce_sys().timer_resolution_msec)); + NEW_CTOR(g_tcp_timers_collection, tcp_timers_collection()); } NEW_CTOR(g_p_vlogger_timer_handler, vlogger_timer_handler()); @@ -1149,13 +1160,13 @@ static void do_global_ctors_helper() // Register netlink fd to the event_manager s_cmd_nl = new command_netlink(g_p_netlink_handler); - if (s_cmd_nl == NULL) { + if (!s_cmd_nl) { throw_xlio_exception("Failed allocating command_netlink\n"); } BULLSEYE_EXCLUDE_BLOCK_END g_p_event_handler_manager->register_command_event(fd, s_cmd_nl); g_p_event_handler_manager->register_timer_event(safe_mce_sys().timer_netlink_update_msec, - s_cmd_nl, PERIODIC_TIMER, NULL); + s_cmd_nl, PERIODIC_TIMER, nullptr); } #ifdef DEFINED_UTLS @@ -1182,29 +1193,30 @@ int do_global_ctors() void reset_globals() { - g_p_fd_collection = NULL; - g_p_ip_frag_manager = NULL; - g_zc_cache = NULL; - g_buffer_pool_rx_ptr = NULL; - g_buffer_pool_rx_stride = NULL; - g_buffer_pool_rx_rwqe = NULL; - g_buffer_pool_tx = NULL; - g_buffer_pool_zc = NULL; - g_tcp_seg_pool = NULL; - g_tcp_timers_collection = NULL; - g_p_vlogger_timer_handler = NULL; - g_p_event_handler_manager = NULL; - g_p_agent = NULL; - g_p_route_table_mgr = NULL; - g_bind_no_port = NULL; - g_p_rule_table_mgr = NULL; - g_stats_file = NULL; - g_p_net_device_table_mgr = NULL; - g_p_neigh_table_mgr = NULL; - g_p_lwip = NULL; - g_p_netlink_handler = NULL; - g_p_ib_ctx_handler_collection = NULL; - s_cmd_nl = NULL; + g_p_fd_collection = nullptr; + g_p_ip_frag_manager = nullptr; + g_zc_cache = nullptr; + g_buffer_pool_rx_ptr = nullptr; + g_buffer_pool_rx_stride = nullptr; + g_buffer_pool_rx_rwqe = nullptr; + g_buffer_pool_tx = nullptr; + g_buffer_pool_zc = nullptr; + g_tcp_seg_pool = nullptr; + g_socketxtreme_ec_pool = NULL; + g_tcp_timers_collection = nullptr; + g_p_vlogger_timer_handler = nullptr; + g_p_event_handler_manager = nullptr; + g_p_agent = nullptr; + g_p_route_table_mgr = nullptr; + g_bind_no_port = nullptr; + g_p_rule_table_mgr = nullptr; + g_stats_file = nullptr; + g_p_net_device_table_mgr = nullptr; + g_p_neigh_table_mgr = nullptr; + g_p_lwip = nullptr; + g_p_netlink_handler = nullptr; + g_p_ib_ctx_handler_collection = nullptr; + s_cmd_nl = nullptr; g_cpu_manager.reset(); } @@ -1226,9 +1238,9 @@ void check_netperf_flags() if (strcmp(command, "netserver")) { return; } - pch = strtok(NULL, " "); + pch = strtok(nullptr, " "); - while (pch != NULL) { + while (pch) { if (*pch == '-') { if (strchr(pch, 'D')) { b_D_flag = true; @@ -1240,7 +1252,7 @@ void check_netperf_flags() if (b_f_flag && b_D_flag) { break; } - pch = strtok(NULL, " "); + pch = strtok(nullptr, " "); } if (!b_D_flag || !b_f_flag) { vlog_printf(VLOG_WARNING, "Running netserver without flags: -D, -f can cause failure\n"); @@ -1281,7 +1293,9 @@ extern "C" int xlio_init(void) { PROFILE_FUNC +#ifndef XLIO_STATIC_BUILD get_orig_funcs(); +#endif /* XLIO_STATIC_BUILD */ safe_mce_sys(); g_init_global_ctors_done = false; diff --git a/src/core/netlink/neigh_info.cpp b/src/core/netlink/neigh_info.cpp index 816e72b63..d9becb130 100644 --- a/src/core/netlink/neigh_info.cpp +++ b/src/core/netlink/neigh_info.cpp @@ -37,13 +37,13 @@ netlink_neigh_info::netlink_neigh_info(struct rtnl_neigh *neigh) : dst_addr_str("") - , dst_addr(NULL) + , dst_addr(nullptr) , dst_addr_len(0) , addr_family(0) , flags(0) , ifindex(0) , lladdr_str("") - , 
lladdr(NULL) + , lladdr(nullptr) , lladdr_len(0) , state(0) , type(0) diff --git a/src/core/netlink/neigh_info.h b/src/core/netlink/neigh_info.h index b18c8f019..90b0f695e 100644 --- a/src/core/netlink/neigh_info.h +++ b/src/core/netlink/neigh_info.h @@ -41,13 +41,13 @@ class netlink_neigh_info { public: netlink_neigh_info() : dst_addr_str("") - , dst_addr(NULL) + , dst_addr(nullptr) , dst_addr_len(0) , addr_family(0) , flags(0) , ifindex(0) , lladdr_str("") - , lladdr(NULL) + , lladdr(nullptr) , lladdr_len(0) , state(0) , type(0) diff --git a/src/core/netlink/netlink_wrapper.cpp b/src/core/netlink/netlink_wrapper.cpp index 4c864b8ce..7155d27db 100644 --- a/src/core/netlink/netlink_wrapper.cpp +++ b/src/core/netlink/netlink_wrapper.cpp @@ -55,7 +55,7 @@ #define nl_logdbg __log_dbg #define nl_logfine __log_fine -netlink_wrapper *g_p_netlink_handler = NULL; +netlink_wrapper *g_p_netlink_handler = nullptr; // structure to pass arguments on internal netlink callbacks handling typedef struct rcv_msg_arg { @@ -167,7 +167,7 @@ void netlink_wrapper::neigh_cache_callback(nl_object *obj) nl_logdbg("notify on neigh event: %s", new_event.to_str().c_str()); netlink_wrapper::notify_observers(&new_event, nlgrpNEIGH); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- neigh_cache_callback"); } @@ -180,7 +180,7 @@ void netlink_wrapper::link_cache_callback(nl_object *obj) nl_logdbg("notify on link event: %s", new_event.to_str().c_str()); netlink_wrapper::notify_observers(&new_event, nlgrpLINK); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- link_cache_callback"); } @@ -202,21 +202,21 @@ void netlink_wrapper::route_cache_callback(nl_object *obj) } else { nl_logdbg("Received invalid route event"); } - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- route_cache_callback"); } netlink_wrapper::netlink_wrapper() - : m_socket_handle(NULL) - , m_mngr(NULL) - , m_cache_link(NULL) - , m_cache_neigh(NULL) - , m_cache_route(NULL) + : m_socket_handle(nullptr) + , m_mngr(nullptr) + , m_cache_link(nullptr) + , m_cache_neigh(nullptr) + , m_cache_route(nullptr) { nl_logfine("---> netlink_route_listener CTOR"); g_nl_rcv_arg.subjects_map = &m_subjects_map; g_nl_rcv_arg.netlink = this; - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- netlink_route_listener CTOR"); } @@ -307,18 +307,18 @@ int netlink_wrapper::open_channel() nl_logdbg("netlink socket is open"); - if (nl_cache_mngr_add_ext(m_mngr, "route/link", link_callback, NULL, &m_cache_link)) { + if (nl_cache_mngr_add_ext(m_mngr, "route/link", link_callback, nullptr, &m_cache_link)) { return -1; } - if (nl_cache_mngr_add_ext(m_mngr, "route/route", route_callback, NULL, &m_cache_route)) { + if (nl_cache_mngr_add_ext(m_mngr, "route/route", route_callback, nullptr, &m_cache_route)) { return -1; } - if (nl_cache_mngr_add_ext(m_mngr, "route/neigh", neigh_callback, NULL, &m_cache_neigh)) { + if (nl_cache_mngr_add_ext(m_mngr, "route/neigh", neigh_callback, nullptr, &m_cache_neigh)) { return -1; } // set custom callback for every message to update message - nl_socket_modify_cb(m_socket_handle, NL_CB_MSG_IN, NL_CB_CUSTOM, nl_msg_rcv_cb, NULL); + nl_socket_modify_cb(m_socket_handle, NL_CB_MSG_IN, NL_CB_CUSTOM, nl_msg_rcv_cb, nullptr); // set the socket non-blocking BULLSEYE_EXCLUDE_BLOCK_START @@ -386,7 +386,7 @@ bool netlink_wrapper::register_event(e_netlink_event_type type, const observer * bool netlink_wrapper::unregister(e_netlink_event_type type, const 
observer *obs) { std::lock_guard lock(m_subj_map_lock); - if (obs == NULL) { + if (!obs) { return false; } @@ -452,7 +452,7 @@ void netlink_wrapper::neigh_timer_expired() void netlink_wrapper::notify_neigh_cache_entries() { nl_logfine("--->netlink_wrapper::notify_cache_entries"); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_object *obj = nl_cache_get_first(m_cache_neigh); while (obj) { nl_object_get(obj); diff --git a/src/core/netlink/test_main.cpp b/src/core/netlink/test_main.cpp index 9317a5b03..758593214 100644 --- a/src/core/netlink/test_main.cpp +++ b/src/core/netlink/test_main.cpp @@ -30,12 +30,13 @@ * SOFTWARE. */ +#include +#include +#include + #include "core/infra/subject_observer.h" #include "netlink_wrapper.h" #include "neigh_info.h" -#include -#include "errno.h" -#include #include "vlogger/vlogger.h" #include "core/event/netlink_event.h" @@ -114,7 +115,7 @@ void netlink_test() struct epoll_event *e = new struct epoll_event(); e->data.fd = fd; - e->data.ptr = NULL; + e->data.ptr = nullptr; e->events = EPOLLIN | EPOLLET; epoll_ctl(epfd, EPOLL_CTL_ADD, fd, e); diff --git a/src/core/proto/L2_address.cpp b/src/core/proto/L2_address.cpp index 3d2f315a5..74ce51a5d 100644 --- a/src/core/proto/L2_address.cpp +++ b/src/core/proto/L2_address.cpp @@ -56,7 +56,7 @@ void L2_address::set(address_t const address, addrlen_t const len) L2_panic("len = %lu", len); } - if (address == NULL) { + if (!address) { L2_panic("address == NULL"); } BULLSEYE_EXCLUDE_BLOCK_END diff --git a/src/core/proto/dst_entry.cpp b/src/core/proto/dst_entry.cpp index 1ead1adf4..fe81310f4 100644 --- a/src/core/proto/dst_entry.cpp +++ b/src/core/proto/dst_entry.cpp @@ -60,9 +60,9 @@ dst_entry::dst_entry(const sock_addr &dst, uint16_t src_port, socket_data &sock_ , m_so_bindtodevice_ip(in6addr_any) , m_route_src_ip(in6addr_any) , m_pkt_src_ip(in6addr_any) - , m_ring_alloc_logic_tx(sock_data.fd, ring_alloc_logic, this) - , m_p_tx_mem_buf_desc_list(NULL) - , m_p_zc_mem_buf_desc_list(NULL) + , m_ring_alloc_logic_tx(sock_data.fd, ring_alloc_logic) + , m_p_tx_mem_buf_desc_list(nullptr) + , m_p_zc_mem_buf_desc_list(nullptr) , m_b_tx_mem_buf_desc_list_pending(false) , m_ttl_hop_limit(sock_data.ttl_hop_limit) , m_tos(sock_data.tos) @@ -92,46 +92,46 @@ dst_entry::~dst_entry() if (m_p_rt_entry) { g_p_route_table_mgr->unregister_observer( route_rule_table_key(m_dst_ip, m_route_src_ip, m_family, m_tos), this); - m_p_rt_entry = NULL; + m_p_rt_entry = nullptr; } if (m_p_ring) { if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } m_p_net_dev_val->release_ring(m_ring_alloc_logic_tx.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } if (m_p_send_wqe_handler) { delete m_p_send_wqe_handler; - m_p_send_wqe_handler = NULL; + m_p_send_wqe_handler = nullptr; } if (m_p_neigh_val) { delete m_p_neigh_val; - m_p_neigh_val = NULL; + m_p_neigh_val = nullptr; } if (m_header) { delete m_header; - m_header = NULL; + m_header = nullptr; } if (m_header_neigh) { delete m_header_neigh; - m_header_neigh = NULL; + m_header_neigh = nullptr; } dst_logdbg("Done %s", to_str().c_str()); @@ -140,21 +140,20 @@ dst_entry::~dst_entry() void dst_entry::init_members() { set_state(false); - m_p_rt_val = 
NULL; - m_p_net_dev_val = NULL; - m_p_ring = NULL; - m_p_net_dev_entry = NULL; - m_p_neigh_entry = NULL; - m_p_neigh_val = NULL; - m_p_rt_entry = NULL; + m_p_rt_val = nullptr; + m_p_net_dev_val = nullptr; + m_p_ring = nullptr; + m_p_net_dev_entry = nullptr; + m_p_neigh_entry = nullptr; + m_p_neigh_val = nullptr; + m_p_rt_entry = nullptr; memset(&m_inline_send_wqe, 0, sizeof(m_inline_send_wqe)); memset(&m_not_inline_send_wqe, 0, sizeof(m_not_inline_send_wqe)); memset(&m_fragmented_send_wqe, 0, sizeof(m_not_inline_send_wqe)); - m_p_send_wqe_handler = NULL; - m_sge = NULL; + m_p_send_wqe_handler = nullptr; + m_sge = nullptr; m_b_is_offloaded = true; m_b_is_initialized = false; - m_p_send_wqe = NULL; m_max_inline = 0; m_max_ip_payload_size = 0; m_max_udp_payload_size = 0; @@ -228,7 +227,7 @@ bool dst_entry::update_net_dev_val() } g_p_neigh_table_mgr->unregister_observer( neigh_key(ip_addr(dst_addr, m_family), m_p_net_dev_val), this); - m_p_neigh_entry = NULL; + m_p_neigh_entry = nullptr; } // Change the net_device, clean old resources... @@ -259,7 +258,7 @@ bool dst_entry::update_net_dev_val() bool dst_entry::update_rt_val() { bool ret_val = true; - route_val *p_rt_val = NULL; + route_val *p_rt_val = nullptr; if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val)) { if (m_p_rt_val == p_rt_val) { @@ -280,7 +279,7 @@ bool dst_entry::resolve_net_dev(bool is_connect) { bool ret_val = false; - cache_entry_subject *p_ces = NULL; + cache_entry_subject *p_ces = nullptr; if (m_dst_ip.is_anyaddr()) { dst_logdbg(PRODUCT_NAME " does not offload zero net IP address"); @@ -308,7 +307,7 @@ bool dst_entry::resolve_net_dev(bool is_connect) // set src addr by XLIO. We keep this logic for IPv4 only for backward compliancy. if (m_family == AF_INET && is_connect && m_route_src_ip.is_anyaddr()) { dst_logfunc("Checking rt_entry src addr"); - route_val *p_rt_val = NULL; + route_val *p_rt_val = nullptr; if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val) && !p_rt_val->get_src_addr().is_anyaddr()) { g_p_route_table_mgr->unregister_observer(rtk, this); @@ -347,11 +346,11 @@ bool dst_entry::resolve_neigh() if (m_p_rt_val && !m_p_rt_val->get_gw_addr().is_anyaddr() && !dst_addr.is_mc(m_family)) { dst_addr = m_p_rt_val->get_gw_addr(); } - cache_entry_subject *p_ces = NULL; + cache_entry_subject *p_ces = nullptr; if (m_p_neigh_entry || g_p_neigh_table_mgr->register_observer( neigh_key(ip_addr(dst_addr, m_family), m_p_net_dev_val), this, &p_ces)) { - if (m_p_neigh_entry == NULL) { + if (!m_p_neigh_entry) { m_p_neigh_entry = dynamic_cast(p_ces); } if (m_p_neigh_entry) { @@ -379,7 +378,7 @@ bool dst_entry::resolve_ring() if (m_p_ring) { if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } m_sge = new (std::nothrow) struct ibv_sge[m_p_ring->get_max_send_sge()]; if (!m_sge) { @@ -401,15 +400,15 @@ bool dst_entry::release_ring() if (m_p_ring) { if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } dst_logdbg("releasing a ring"); m_p_net_dev_val->release_ring(m_ring_alloc_logic_tx.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } ret_val = true; } @@ -456,7 +455,7 @@ bool dst_entry::conf_l2_hdr_and_snd_wqe_eth() // scratch if (m_p_send_wqe_handler) { delete m_p_send_wqe_handler; - m_p_send_wqe_handler = NULL; + m_p_send_wqe_handler = 
nullptr; } m_p_send_wqe_handler = new wqe_send_handler(); @@ -584,11 +583,11 @@ bool dst_entry::prepare_to_send(struct xlio_rate_limit_t &rate_limit, bool skip_ m_src_port, m_dst_port); if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } resolved = true; } @@ -681,7 +680,7 @@ void dst_entry::do_ring_migration_tx(lock_base &socket_lock, resource_allocation m_p_ring = new_ring; if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } m_sge = new (std::nothrow) struct ibv_sge[m_p_ring->get_max_send_sge()]; if (!m_sge) { @@ -692,9 +691,9 @@ void dst_entry::do_ring_migration_tx(lock_base &socket_lock, resource_allocation get_route_mtu() + (uint32_t)m_header->m_transport_header_len); mem_buf_desc_t *tmp_list = m_p_tx_mem_buf_desc_list; - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; mem_buf_desc_t *tmp_list_zc = m_p_zc_mem_buf_desc_list; - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; m_slow_path_lock.unlock(); socket_lock.unlock(); @@ -771,7 +770,7 @@ bool dst_entry::alloc_neigh_val(transport_type_t tranport) if (m_p_neigh_val) { delete m_p_neigh_val; - m_p_neigh_val = NULL; + m_p_neigh_val = nullptr; } switch (tranport) { @@ -790,25 +789,25 @@ void dst_entry::return_buffers_pool() { int count; - if (m_p_tx_mem_buf_desc_list == NULL && m_p_zc_mem_buf_desc_list == NULL) { + if (!m_p_tx_mem_buf_desc_list && !m_p_zc_mem_buf_desc_list) { return; } if (m_b_tx_mem_buf_desc_list_pending && m_p_ring) { - if (m_p_tx_mem_buf_desc_list != NULL) { + if (m_p_tx_mem_buf_desc_list) { count = m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true, true); if (count) { - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } } - if (m_p_zc_mem_buf_desc_list != NULL) { + if (m_p_zc_mem_buf_desc_list) { count = m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true, true); if (count) { - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } } } - set_tx_buff_list_pending(m_p_tx_mem_buf_desc_list != NULL || m_p_zc_mem_buf_desc_list != NULL); + set_tx_buff_list_pending(m_p_tx_mem_buf_desc_list || m_p_zc_mem_buf_desc_list); } int dst_entry::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) @@ -833,7 +832,7 @@ bool dst_entry::update_ring_alloc_logic(int fd, lock_base &socket_lock, { resource_allocation_key old_key(*m_ring_alloc_logic_tx.get_key()); - m_ring_alloc_logic_tx = ring_allocation_logic_tx(fd, ring_alloc_logic, this); + m_ring_alloc_logic_tx = ring_allocation_logic_tx(fd, ring_alloc_logic); if (*m_ring_alloc_logic_tx.get_key() != old_key) { std::lock_guard locker(m_tx_migration_lock); diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index b559023d1..f9afa28b6 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -40,7 +40,6 @@ #include "vlogger/vlogger.h" #include "utils/lock_wrapper.h" -#include "core/sock/socket_fd_api.h" #include "core/proto/route_entry.h" #include "core/proto/route_val.h" #include "core/proto/neighbour_table_mgr.h" @@ -55,6 +54,17 @@ /* Forward declarations */ class xlio_tis; +class sockinfo; + +typedef enum { + TX_WRITE = 13, + TX_WRITEV, + TX_SEND, + TX_SENDTO, + TX_SENDMSG, + TX_FILE, + TX_UNDEF +} tx_call_t; struct socket_data { int fd; @@ -85,7 +95,7 @@ 
class dst_entry : public cache_observer, public tostr { virtual ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr) = 0; virtual ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags = 0, - socket_fd_api *sock = 0, tx_call_t call_type = TX_UNDEF) = 0; + sockinfo *sock = nullptr, tx_call_t call_type = TX_UNDEF) = 0; bool try_migrate_ring_tx(lock_base &socket_lock); @@ -112,7 +122,7 @@ class dst_entry : public cache_observer, public tostr { } inline void set_src_sel_prefs(uint8_t sel_flags) { m_src_sel_prefs = sel_flags; } inline ring *get_ring() { return m_p_ring; } - inline ib_ctx_handler *get_ctx() { return m_p_ring->get_ctx(m_id); } + inline ib_ctx_handler *get_ctx() { return m_p_ring ? m_p_ring->get_ctx(m_id) : nullptr; } inline sa_family_t get_sa_family() { return m_family; } uint8_t get_tos() const { return m_tos; } uint8_t get_ttl_hop_limit() const { return m_ttl_hop_limit; } @@ -167,7 +177,6 @@ class dst_entry : public cache_observer, public tostr { uint8_t m_pcp; bool m_b_is_initialized; - xlio_ibv_send_wr *m_p_send_wqe; uint32_t m_max_inline; ring_user_id_t m_id; uint16_t m_max_ip_payload_size; diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index 1e82730a0..75dde2dbc 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -71,7 +71,8 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ void *p_pkt; void *p_ip_hdr; void *p_tcp_hdr; - tcp_iovec *p_tcp_iov = NULL; + tcp_iovec *p_tcp_iov = nullptr; + xlio_ibv_send_wr *p_send_wqe; size_t hdr_alignment_diff = 0; bool is_zerocopy = is_set(attr.flags, XLIO_TX_PACKET_ZEROCOPY); @@ -151,7 +152,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ tcp_hdr_len = (static_cast(p_tcp_hdr))->doff * 4; if (!is_zerocopy && (total_packet_len < m_max_inline) && (1 == sz_iov)) { - m_p_send_wqe = &m_inline_send_wqe; + p_send_wqe = &m_inline_send_wqe; p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_pkt + hdr_alignment_diff; p_tcp_iov[0].iovec.iov_len = total_packet_len; } else if (is_set(attr.flags, (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_TSO))) { @@ -165,18 +166,18 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ send_wqe_h.enable_tso(send_wqe, (void *)((uint8_t *)p_pkt + hdr_alignment_diff), m_header->m_total_hdr_len + tcp_hdr_len, 0); } - m_p_send_wqe = &send_wqe; + p_send_wqe = &send_wqe; if (!is_zerocopy) { p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_tcp_hdr + tcp_hdr_len; p_tcp_iov[0].iovec.iov_len -= tcp_hdr_len; } } else { - m_p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe = &m_not_inline_send_wqe; p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_pkt + hdr_alignment_diff; p_tcp_iov[0].iovec.iov_len = total_packet_len; } - if (unlikely(p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref > 1)) { + if (unlikely(p_tcp_iov[0].p_desc->lwip_pbuf.ref > 1)) { /* * First buffer in the vector is used for reference counting. * The reference is released after completion depending on @@ -194,16 +195,16 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ * * We don't change data, only pointer to buffer descriptor. 
*/ - pbuf_type type = (pbuf_type)p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type; + pbuf_type type = (pbuf_type)p_tcp_iov[0].p_desc->lwip_pbuf.type; mem_buf_desc_t *p_mem_buf_desc = - get_buffer(type, &(p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.desc), + get_buffer(type, &(p_tcp_iov[0].p_desc->lwip_pbuf.desc), is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); if (!p_mem_buf_desc) { return -1; } p_tcp_iov[0].p_desc = p_mem_buf_desc; } else { - p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref++; + p_tcp_iov[0].p_desc->lwip_pbuf.ref++; } /* save pointers to ip and tcp headers for software checksum calculation */ @@ -211,7 +212,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ p_tcp_iov[0].p_desc->tx.p_tcp_h = static_cast(p_tcp_hdr); /* set wr_id as a pointer to memory descriptor */ - m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; + p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; /* Update scatter gather element list * ref counter is incremented (above) for the first memory descriptor only because it is @@ -222,14 +223,18 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ m_sge[i].addr = (uintptr_t)p_tcp_iov[i].iovec.iov_base; m_sge[i].length = p_tcp_iov[i].iovec.iov_len; if (is_zerocopy) { - if (PBUF_DESC_MKEY == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { + auto *p_desc = p_tcp_iov[i].p_desc; + auto &pbuf_descriptor = p_desc->lwip_pbuf.desc; + if (PBUF_DESC_EXPRESS == pbuf_descriptor.attr) { + m_sge[i].lkey = pbuf_descriptor.mkey; + } else if (PBUF_DESC_MKEY == pbuf_descriptor.attr) { /* PBUF_DESC_MKEY - value is provided by user */ - m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; - } else if (PBUF_DESC_MDESC == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr || - PBUF_DESC_NVME_TX == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { - mem_desc *mdesc = (mem_desc *)p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mdesc; - m_sge[i].lkey = mdesc->get_lkey(p_tcp_iov[i].p_desc, ib_ctx, - (void *)m_sge[i].addr, m_sge[i].length); + m_sge[i].lkey = pbuf_descriptor.mkey; + } else if (PBUF_DESC_MDESC == pbuf_descriptor.attr || + PBUF_DESC_NVME_TX == pbuf_descriptor.attr) { + mem_desc *mdesc = (mem_desc *)pbuf_descriptor.mdesc; + m_sge[i].lkey = + mdesc->get_lkey(p_desc, ib_ctx, (void *)m_sge[i].addr, m_sge[i].length); if (m_sge[i].lkey == LKEY_TX_DEFAULT) { m_sge[i].lkey = m_p_ring->get_tx_lkey(m_id); } @@ -237,26 +242,24 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ /* Do not check desc.attr for specific type because * PBUF_DESC_FD - is not possible for XLIO_TX_PACKET_ZEROCOPY * PBUF_DESC_NONE - map should be initialized to NULL in - * dst_entry_tcp::get_buffer() PBUF_DESC_MAP - map should point on mapping - * object + * dst_entry_tcp::get_buffer() object */ masked_addr = (void *)((uint64_t)m_sge[i].addr & m_user_huge_page_mask); m_sge[i].lkey = - m_p_ring->get_tx_user_lkey(masked_addr, m_n_sysvar_user_huge_page_size, - p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.map); + m_p_ring->get_tx_user_lkey(masked_addr, m_n_sysvar_user_huge_page_size); } } else { m_sge[i].lkey = (i == 0 ? 
m_p_ring->get_tx_lkey(m_id) : m_sge[0].lkey); } } - ret = send_lwip_buffer(m_id, m_p_send_wqe, attr.flags, attr.tis); + ret = send_lwip_buffer(m_id, p_send_wqe, attr.flags, attr.tis); } else { // We don'nt support inline in this case, since we believe that this a very rare case mem_buf_desc_t *p_mem_buf_desc; size_t total_packet_len = 0; - p_mem_buf_desc = get_buffer(PBUF_RAM, NULL, is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); - if (p_mem_buf_desc == NULL) { + p_mem_buf_desc = get_buffer(PBUF_RAM, nullptr, is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); + if (!p_mem_buf_desc) { ret = -1; goto out; } @@ -291,13 +294,13 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ p_mem_buf_desc->tx.p_ip_h = p_ip_hdr; p_mem_buf_desc->tx.p_tcp_h = static_cast(p_tcp_hdr); - m_p_send_wqe = &m_not_inline_send_wqe; - m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; - send_ring_buffer(m_id, m_p_send_wqe, attr.flags); + send_ring_buffer(m_id, p_send_wqe, attr.flags); } - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get( m_id, is_set(attr.flags, XLIO_TX_PACKET_BLOCK), PBUF_RAM, m_n_sysvar_tx_bufs_batch_tcp); } @@ -312,7 +315,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ ssize_t dst_entry_tcp::slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags /*= 0*/, - socket_fd_api *sock /*= 0*/, tx_call_t call_type /*= 0*/) + sockinfo *sock /*= 0*/, tx_call_t call_type /*= 0*/) { ssize_t ret_val = -1; @@ -382,42 +385,37 @@ mem_buf_desc_t *dst_entry_tcp::get_buffer(pbuf_type type, pbuf_desc *desc, p_desc_list = type == PBUF_ZEROCOPY ? &m_p_zc_mem_buf_desc_list : &m_p_tx_mem_buf_desc_list; // Get a bunch of tx buf descriptor and data buffers - if (unlikely(*p_desc_list == NULL)) { + if (unlikely(!*p_desc_list)) { *p_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, type, m_n_sysvar_tx_bufs_batch_tcp); } mem_buf_desc_t *p_mem_buf_desc = *p_desc_list; - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { dst_tcp_logfunc("silent packet drop, no buffers!"); } else { *p_desc_list = (*p_desc_list)->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // for TX, set lwip payload to the data segment. // lwip will send it with payload pointing to the tcp header. 
if (p_mem_buf_desc->p_buffer) { - p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + + p_mem_buf_desc->lwip_pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + m_header->m_aligned_l2_l3_len + sizeof(struct tcphdr); } else { - p_mem_buf_desc->lwip_pbuf.pbuf.payload = NULL; + p_mem_buf_desc->lwip_pbuf.payload = nullptr; } /* Initialize pbuf description */ - memset(&p_mem_buf_desc->lwip_pbuf.pbuf.desc, 0, - sizeof(p_mem_buf_desc->lwip_pbuf.pbuf.desc)); - p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_NONE; - if (desc) { - memcpy(&p_mem_buf_desc->lwip_pbuf.pbuf.desc, desc, - sizeof(p_mem_buf_desc->lwip_pbuf.pbuf.desc)); - if (p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MDESC || - p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX) { - mem_desc *mdesc = (mem_desc *)p_mem_buf_desc->lwip_pbuf.pbuf.desc.mdesc; + if (likely(desc)) { + memcpy(&p_mem_buf_desc->lwip_pbuf.desc, desc, sizeof(p_mem_buf_desc->lwip_pbuf.desc)); + if (p_mem_buf_desc->lwip_pbuf.desc.attr == PBUF_DESC_MDESC || + p_mem_buf_desc->lwip_pbuf.desc.attr == PBUF_DESC_NVME_TX) { + mem_desc *mdesc = (mem_desc *)p_mem_buf_desc->lwip_pbuf.desc.mdesc; mdesc->get(); - } else if (p_mem_buf_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY && - (p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MAP)) { - mapping_t *mapping = (mapping_t *)p_mem_buf_desc->lwip_pbuf.pbuf.desc.map; - mapping->get(); } + } else { + memset(&p_mem_buf_desc->lwip_pbuf.desc, 0, sizeof(p_mem_buf_desc->lwip_pbuf.desc)); + p_mem_buf_desc->lwip_pbuf.desc.attr = PBUF_DESC_NONE; } } @@ -431,7 +429,7 @@ void dst_entry_tcp::put_buffer(mem_buf_desc_t *p_desc) { // todo accumulate buffers? - if (unlikely(p_desc == NULL)) { + if (unlikely(!p_desc)) { return; } @@ -440,26 +438,26 @@ void dst_entry_tcp::put_buffer(mem_buf_desc_t *p_desc) } else { // potential race, ref is protected here by tcp lock, and in ring by ring_tx lock - if (likely(p_desc->lwip_pbuf.pbuf.ref)) { - p_desc->lwip_pbuf.pbuf.ref--; + if (likely(p_desc->lwip_pbuf.ref)) { + p_desc->lwip_pbuf.ref--; } else { dst_tcp_logerr("ref count of %p is already zero, double free??", p_desc); } - if (p_desc->lwip_pbuf.pbuf.ref == 0) { - p_desc->p_next_desc = NULL; - buffer_pool::free_tx_lwip_pbuf_custom(&p_desc->lwip_pbuf.pbuf); + if (p_desc->lwip_pbuf.ref == 0) { + p_desc->p_next_desc = nullptr; + buffer_pool::free_tx_lwip_pbuf_custom(&p_desc->lwip_pbuf); } } } void dst_entry_tcp::put_zc_buffer(mem_buf_desc_t *p_desc) { - if (likely(p_desc->lwip_pbuf.pbuf.ref <= 1)) { - p_desc->lwip_pbuf.pbuf.ref = 1; + if (likely(p_desc->lwip_pbuf.ref <= 1)) { + p_desc->lwip_pbuf.ref = 1; p_desc->p_next_desc = m_p_zc_mem_buf_desc_list; m_p_zc_mem_buf_desc_list = p_desc; } else { - p_desc->lwip_pbuf.pbuf.ref--; + p_desc->lwip_pbuf.ref--; } } diff --git a/src/core/proto/dst_entry_tcp.h b/src/core/proto/dst_entry_tcp.h index d4d2aaac2..66223470a 100644 --- a/src/core/proto/dst_entry_tcp.h +++ b/src/core/proto/dst_entry_tcp.h @@ -50,7 +50,7 @@ class dst_entry_tcp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, socket_fd_api *sock = 0, + struct xlio_rate_limit_t &rate_limit, int flags = 0, sockinfo *sock = nullptr, tx_call_t call_type = TX_UNDEF); ssize_t slow_send_neigh(const iovec *p_iov, size_t sz_iov, struct xlio_rate_limit_t &rate_limit); diff --git a/src/core/proto/dst_entry_udp.cpp 
b/src/core/proto/dst_entry_udp.cpp index 817815713..09926f81b 100644 --- a/src/core/proto/dst_entry_udp.cpp +++ b/src/core/proto/dst_entry_udp.cpp @@ -33,6 +33,7 @@ #include "utils/bullseye.h" #include "core/util/utils.h" #include "dst_entry_udp.h" +#include "sock/sockinfo.h" #define MODULE_NAME "dst_udp" @@ -176,7 +177,7 @@ bool dst_entry_udp::fast_send_fragmented_ipv6(mem_buf_desc_t *p_mem_buf_desc, co n_ip_frag_offset, ntohl(packet_id)); tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return valuse of post send when we reach the HW we consider that we // completed our job @@ -202,14 +203,15 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const ssize_t sz_data_payload) { mem_buf_desc_t *p_mem_buf_desc; + xlio_ibv_send_wr *p_send_wqe; bool b_blocked = is_set(attr, XLIO_TX_PACKET_BLOCK); // Get a bunch of tx buf descriptor and data buffers - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, m_n_sysvar_tx_bufs_batch_udp); - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { if (b_blocked) { dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); } else { @@ -226,7 +228,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const // Disconnect the first buffer from the list p_mem_buf_desc = m_p_tx_mem_buf_desc_list; m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; set_tx_buff_list_pending(false); @@ -234,7 +236,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const // Skip inlining in case of L4 SW checksum because headers and data are not contiguous in memory if (sz_iov == 1 && ((sz_data_payload + m_header->m_total_hdr_len) < m_max_inline) && !is_set(attr, XLIO_TX_SW_L4_CSUM)) { - m_p_send_wqe = &m_inline_send_wqe; + p_send_wqe = &m_inline_send_wqe; m_header->get_udp_hdr()->len = htons((uint16_t)sz_udp_payload); m_header->set_ip_len(m_header->m_ip_header_len + sz_udp_payload); @@ -248,7 +250,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const m_sge[1].addr = (uintptr_t)p_iov[0].iov_base; m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); } else { - m_p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe = &m_not_inline_send_wqe; void *p_pkt = p_mem_buf_desc->p_buffer; void *p_ip_hdr; @@ -302,11 +304,11 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const BULLSEYE_EXCLUDE_BLOCK_END } - m_p_send_wqe->wr_id = reinterpret_cast(p_mem_buf_desc); - send_ring_buffer(m_id, m_p_send_wqe, attr); + p_send_wqe->wr_id = reinterpret_cast(p_mem_buf_desc); + send_ring_buffer(m_id, p_send_wqe, attr); // request tx buffers for the next packets - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, m_n_sysvar_tx_bufs_batch_udp); } @@ -324,7 +326,7 @@ inline bool dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d void *p_ip_hdr; void *p_udp_hdr; mem_buf_desc_t *tmp; - m_p_send_wqe = &m_fragmented_send_wqe; + xlio_ibv_send_wr *p_send_wqe = &m_fragmented_send_wqe; uint16_t packet_id = gen_packet_id_ip4(); // Int for counting offset inside the ip datagram payload @@ 
-397,18 +399,18 @@ inline bool dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header->m_transport_header_tx_offset); m_sge[1].length = sz_user_data_to_copy + hdr_len; m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); - m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; dst_udp_logfunc("packet_sz=%d, payload_sz=%d, ip_offset=%d id=%d", m_sge[1].length - m_header->m_transport_header_len, sz_user_data_to_copy, n_ip_frag_offset, ntohs(packet_id)); tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return valuse of post send when we reach the HW we consider that we // completed our job - send_ring_buffer(m_id, m_p_send_wqe, attr); + send_ring_buffer(m_id, p_send_wqe, attr); p_mem_buf_desc = tmp; @@ -443,7 +445,7 @@ ssize_t dst_entry_udp::fast_send_fragmented(const iovec *p_iov, const ssize_t sz mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { if (b_blocked) { dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); } else { @@ -496,7 +498,7 @@ ssize_t dst_entry_udp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ ssize_t dst_entry_udp::slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags /*= 0*/, - socket_fd_api *sock /*= 0*/, tx_call_t call_type /*= 0*/) + sockinfo *sock /*= 0*/, tx_call_t call_type /*= 0*/) { ssize_t ret_val = 0; diff --git a/src/core/proto/dst_entry_udp.h b/src/core/proto/dst_entry_udp.h index 9b9c7724f..3b7cda9b2 100644 --- a/src/core/proto/dst_entry_udp.h +++ b/src/core/proto/dst_entry_udp.h @@ -43,7 +43,7 @@ class dst_entry_udp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, socket_fd_api *sock = 0, + struct xlio_rate_limit_t &rate_limit, int flags = 0, sockinfo *sock = nullptr, tx_call_t call_type = TX_UNDEF); static bool fast_send_fragmented_ipv6(mem_buf_desc_t *p_mem_buf_desc, const iovec *p_iov, const ssize_t sz_iov, xlio_wr_tx_packet_attr attr, diff --git a/src/core/proto/dst_entry_udp_mc.cpp b/src/core/proto/dst_entry_udp_mc.cpp index 71aee9fb7..396dce28c 100644 --- a/src/core/proto/dst_entry_udp_mc.cpp +++ b/src/core/proto/dst_entry_udp_mc.cpp @@ -84,10 +84,10 @@ bool dst_entry_udp_mc::resolve_net_dev(bool is_connect) { NOT_IN_USE(is_connect); bool ret_val = false; - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; if (!m_mc_tx_src_ip.is_anyaddr() && !m_mc_tx_src_ip.is_mc(m_family)) { - if (m_p_net_dev_entry == NULL) { + if (!m_p_net_dev_entry) { net_device_val *mc_net_dev = g_p_net_device_table_mgr->get_net_device_val(ip_addr(m_mc_tx_src_ip, m_family)); if (mc_net_dev) { diff --git a/src/core/proto/ip_frag.cpp b/src/core/proto/ip_frag.cpp index b2b3ed96b..5fda4b391 100644 --- a/src/core/proto/ip_frag.cpp +++ b/src/core/proto/ip_frag.cpp @@ -38,23 +38,17 @@ #include "core/event/event_handler_manager.h" #include "mem_buf_desc.h" +#undef MODULE_NAME +#define MODULE_NAME "ip_frag" + //#define IP_FRAG_DEBUG 1 #ifdef IP_FRAG_DEBUG -#define frag_dbg(fmt, args...) 
\ - vlog_printf(VLOG_WARNING, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) +#define frag_dbg __log_info_dbg #else #define frag_dbg(fmt, args...) #endif - -#define frag_err(fmt, args...) \ - vlog_printf(VLOG_ERROR, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) - -#define frag_panic(fmt, args...) \ - do { \ - vlog_printf(VLOG_PANIC, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args); \ - throw; \ - } while (0) +#define frag_panic __log_info_panic #ifdef IP_FRAG_DEBUG static int debug_drop_every_n_pkt = 0; // 0 - Disabled, 1/N is the number of packet dropped @@ -64,9 +58,9 @@ static int g_ip_frag_count_check = 0; #define MEMBUF_DEBUG_REF_INC(__p_desc__) \ { \ g_ip_frag_count_check++; \ - if (__p_desc__->n_ref_count != 0) \ + if (__p_desc__->inc_ref_count() != 0) { \ frag_panic("REF_INC: p=%p\n", __p_desc__); \ - __p_desc__->n_ref_count++; \ + } \ } #define MEMBUF_DEBUG_REF_DEC(__p_desc__) \ { \ @@ -79,9 +73,9 @@ static int g_ip_frag_count_check = 0; #define MEMBUF_DEBUG_REF_DEC_1(__p_desc__) \ { \ g_ip_frag_count_check--; \ - __p_desc__->n_ref_count--; \ - if (__p_desc__->n_ref_count != 0) \ + if (__p_desc__->dec_ref_count() != 1) { \ frag_panic("REF_DEC: p=%p\n", __p_desc__); \ + } \ } #define PRINT_STATISTICS() \ { \ @@ -93,14 +87,14 @@ static int g_ip_frag_count_check = 0; #define PRINT_STATISTICS() #endif -ip_frag_manager *g_p_ip_frag_manager = NULL; +ip_frag_manager *g_p_ip_frag_manager = nullptr; -ip_frag_hole_desc *hole_base = NULL; -ip_frag_hole_desc *hole_free_list_head = NULL; +ip_frag_hole_desc *hole_base = nullptr; +ip_frag_hole_desc *hole_free_list_head = nullptr; int hole_free_list_count = 0; -ip_frag_desc *desc_base = NULL; -ip_frag_desc *desc_free_list_head = NULL; +ip_frag_desc *desc_base = nullptr; +ip_frag_desc *desc_free_list_head = nullptr; int desc_free_list_count = 0; ip_frag_manager::ip_frag_manager() @@ -162,7 +156,7 @@ void ip_frag_manager::free_frag_resources(void) unlock(); - // Must call cq_mgr outside the lock to avoid ABBA deadlock + // Must call cq_mgr_rx outside the lock to avoid ABBA deadlock return_buffers_to_owners(temp_buff_map); delete[] desc_base; @@ -215,7 +209,7 @@ ip_frag_hole_desc *ip_frag_manager::alloc_hole_desc() struct ip_frag_hole_desc *ret; ret = hole_free_list_head; if (!ret) { - return NULL; + return nullptr; } // unlink from hole's free list @@ -223,9 +217,9 @@ ip_frag_hole_desc *ip_frag_manager::alloc_hole_desc() hole_free_list_count--; // clear hole struct - ret->data_first = 0; - ret->data_last = 0; - ret->next = 0; + ret->data_first = nullptr; + ret->data_last = nullptr; + ret->next = nullptr; return ret; } @@ -242,14 +236,14 @@ ip_frag_desc_t *ip_frag_manager::alloc_frag_desc() ip_frag_desc_t *ret; ret = desc_free_list_head; if (!ret) { - return NULL; + return nullptr; } // unlink from hole's free list desc_free_list_head = ret->next; --desc_free_list_count; - ret->next = 0; + ret->next = nullptr; return ret; } @@ -282,13 +276,13 @@ void ip_frag_manager::destroy_frag_desc(ip_frag_desc_t *desc) */ ip_frag_desc_t *ip_frag_manager::new_frag_desc(ip_frag_key_t &key) { - ip_frag_desc_t *desc = NULL; - struct ip_frag_hole_desc *hole = NULL; + ip_frag_desc_t *desc = nullptr; + struct ip_frag_hole_desc *hole = nullptr; hole = alloc_hole_desc(); if (!hole) { frag_dbg("NULL hole"); - return NULL; + return nullptr; } hole->first = IP_FRAG_NINF; hole->last = IP_FRAG_INF; @@ -297,10 +291,10 @@ ip_frag_desc_t *ip_frag_manager::new_frag_desc(ip_frag_key_t &key) if (!desc) { frag_dbg("NULL desc"); free_hole_desc(hole); - return NULL; + 
return nullptr; } desc->ttl = IP_FRAG_TTL; - desc->frag_list = 0; + desc->frag_list = nullptr; desc->hole_list = hole; desc->frag_counter = m_frag_counter; @@ -388,7 +382,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * frag_dbg("> old fragmented packet"); } } - if (desc == NULL) { + if (!desc) { MEMBUF_DEBUG_REF_DEC(frag); PRINT_STATISTICS(); unlock(); @@ -399,7 +393,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * /* 8 step reassembly algorithm as described in RFC 815 */ // step 1 - phole_prev = 0; + phole_prev = nullptr; phole = desc->hole_list; while (phole) { // step 2 and step 3 @@ -507,7 +501,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * } frag_dbg("> need more packets"); - *ret = NULL; + *ret = nullptr; PRINT_STATISTICS(); unlock(); return 0; @@ -565,7 +559,7 @@ void ip_frag_manager::handle_timer_expired(void *user_data) PRINT_STATISTICS(); unlock(); - // Must call cq_mgr outside the lock to avoid ABBA deadlock + // Must call cq_mgr_rx outside the lock to avoid ABBA deadlock return_buffers_to_owners(temp_buff_map); } diff --git a/src/core/proto/mapping.cpp b/src/core/proto/mapping.cpp index 854975226..68d36f417 100644 --- a/src/core/proto/mapping.cpp +++ b/src/core/proto/mapping.cpp @@ -56,7 +56,7 @@ #define map_logdbg_exit __log_exit_dbg #define map_logfunc_exit __log_exit_func -mapping_cache *g_zc_cache = NULL; +mapping_cache *g_zc_cache = nullptr; mapping_t::mapping_t(file_uid_t &uid, mapping_cache *cache, ib_ctx_handler *p_ib_ctx) : m_registrator() @@ -64,7 +64,7 @@ mapping_t::mapping_t(file_uid_t &uid, mapping_cache *cache, ib_ctx_handler *p_ib m_state = MAPPING_STATE_UNMAPPED; m_fd = -1; m_uid = uid; - m_addr = NULL; + m_addr = nullptr; m_size = 0; m_ref = 0; m_owners = 0; @@ -125,8 +125,8 @@ int mapping_t::map(int fd) * performance results. For now, use only MAP_PRIVATE mappings. */ flags = /* rw ? MAP_SHARED :*/ MAP_PRIVATE; - m_addr = - mmap64(NULL, m_size, PROT_WRITE | PROT_READ, flags | MAP_NORESERVE | MAP_POPULATE, m_fd, 0); + m_addr = mmap64(nullptr, m_size, PROT_WRITE | PROT_READ, flags | MAP_NORESERVE | MAP_POPULATE, + m_fd, 0); if (MAP_FAILED == m_addr) { map_logerr("mmap64() errno=%d (%s)", errno, strerror(errno)); goto failed_close_fd; @@ -146,8 +146,8 @@ int mapping_t::map(int fd) failed_unmap: (void)munmap(m_addr, m_size); failed_close_fd: - orig_os_api.close(m_fd); - m_addr = NULL; + SYSCALL(close, m_fd); + m_addr = nullptr; m_size = 0; m_fd = -1; failed: @@ -171,9 +171,9 @@ int mapping_t::unmap(void) map_logerr("munmap() errno=%d (%s)", errno, strerror(errno)); } p_cache->memory_free(m_size); - orig_os_api.close(m_fd); + SYSCALL(close, m_fd); m_fd = -1; - m_addr = NULL; + m_addr = nullptr; m_size = 0; m_state = MAPPING_STATE_UNMAPPED; @@ -230,7 +230,7 @@ int mapping_t::duplicate_fd(int fd, bool &rw) len = readlink(link, filename, sizeof(filename) - 1); if (len > 0) { filename[len] = '\0'; - result = orig_os_api.open(filename, O_RDWR); + result = SYSCALL(open, filename, O_RDWR); if (result < 0) { map_logdbg("open() errno=%d (%s)", errno, strerror(errno)); } else { @@ -248,11 +248,11 @@ int mapping_t::duplicate_fd(int fd, bool &rw) if (result < 0) { /* Fallback to dup(2). 
*/ - result = orig_os_api.dup(fd); + result = SYSCALL(dup, fd); if (result < 0) { map_logerr("dup() errno=%d (%s)", errno, strerror(errno)); } else { - int flags = orig_os_api.fcntl(result, F_GETFL); + int flags = SYSCALL(fcntl, result, F_GETFL); rw = (flags > 0) && ((flags & O_RDWR) == O_RDWR); } } @@ -295,7 +295,7 @@ mapping_cache::~mapping_cache() mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) { - mapping_t *mapping = NULL; + mapping_t *mapping = nullptr; mapping_fd_map_iter_t iter; file_uid_t uid; struct stat st; @@ -311,7 +311,7 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) } } - if (mapping == NULL) { + if (!mapping) { if (fstat(local_fd, &st) != 0) { map_logerr("fstat() errno=%d (%s)", errno, strerror(errno)); goto quit; @@ -324,7 +324,7 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) } quit: - if (mapping != NULL) { + if (mapping) { mapping->get(); /* Mapping object may be unmapped, call mmap() in this case */ @@ -335,9 +335,9 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) unlock(); - if (mapping != NULL && mapping->m_state == MAPPING_STATE_FAILED) { + if (mapping && mapping->m_state == MAPPING_STATE_FAILED) { mapping->put(); - mapping = NULL; + mapping = nullptr; } return mapping; } @@ -403,7 +403,7 @@ void mapping_cache::memory_free(size_t size) mapping_t *mapping_cache::get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx) { - mapping_t *mapping = NULL; + mapping_t *mapping = nullptr; mapping_uid_map_iter_t iter; iter = m_cache_uid.find(uid); @@ -414,9 +414,9 @@ mapping_t *mapping_cache::get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_ha } } - if (mapping == NULL) { + if (!mapping) { mapping = new (std::nothrow) mapping_t(uid, this, p_ib_ctx); - if (mapping != NULL) { + if (mapping) { m_cache_uid[uid] = mapping; } } diff --git a/src/core/proto/mapping.h b/src/core/proto/mapping.h index e99dd18f7..14ede8caa 100644 --- a/src/core/proto/mapping.h +++ b/src/core/proto/mapping.h @@ -127,7 +127,7 @@ class mapping_cache : public lock_spin { mapping_cache(size_t threshold); ~mapping_cache(); - mapping_t *get_mapping(int local_fd, void *p_ctx = NULL); + mapping_t *get_mapping(int local_fd, void *p_ctx = nullptr); void release_mapping(mapping_t *mapping); void handle_close(int local_fd); @@ -137,7 +137,7 @@ class mapping_cache : public lock_spin { struct mapping_cache_stats m_stats; private: - mapping_t *get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx = NULL); + mapping_t *get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx = nullptr); void evict_mapping_unlocked(mapping_t *mapping); bool cache_evict_unlocked(size_t toFree); diff --git a/src/core/proto/mem_buf_desc.h b/src/core/proto/mem_buf_desc.h index a250b0f96..116459198 100644 --- a/src/core/proto/mem_buf_desc.h +++ b/src/core/proto/mem_buf_desc.h @@ -65,26 +65,26 @@ struct timestamps_t { */ class mem_buf_desc_t { public: - enum flags { TYPICAL = 0, CLONED = 0x01, ZCOPY = 0x02 }; + enum flags { TYPICAL = 0, CLONED = 0x01, URGENT = 0x02, CALLBACK = 0x04 }; public: mem_buf_desc_t(uint8_t *buffer, size_t size, pbuf_type type) : p_buffer(buffer) , m_flags(mem_buf_desc_t::TYPICAL) , lkey(0) - , p_next_desc(0) - , p_prev_desc(0) + , p_next_desc(nullptr) + , p_prev_desc(nullptr) , sz_buffer(size) , sz_data(0) - , p_desc_owner(0) + , p_desc_owner(nullptr) + , unused_padding(0) { - memset(&lwip_pbuf, 0, sizeof(lwip_pbuf)); clear_transport_data(); memset(&ee, 0, sizeof(ee)); reset_ref_count(); - 
lwip_pbuf.pbuf.type = type; + lwip_pbuf.type = type; } // Copy constructor for the clone() method. @@ -94,14 +94,42 @@ class mem_buf_desc_t { memcpy((void *)this, &ref, sizeof(mem_buf_desc_t)); } + inline mem_buf_desc_t *clone() + { + mem_buf_desc_t *p_desc = new mem_buf_desc_t(*this); + INIT_LIST_HEAD(&p_desc->buffer_node.head); + p_desc->m_flags |= mem_buf_desc_t::CLONED; + return p_desc; + } + // Destructor specifically for cloned buffers. ~mem_buf_desc_t() {} - /* This field must be first in this class - * It encapsulates pbuf structure from lwip - * and extra fields to proceed customer specific requirements - */ - struct pbuf_custom lwip_pbuf; + inline void clear_transport_data(void) + { + // rx field is the largest in the union, this clears tx as well. + memset((void *)&rx, 0, sizeof(rx)); + } + + inline int get_ref_count() const { return atomic_read(&n_ref_count); } + inline void reset_ref_count() { atomic_set(&n_ref_count, 0); } + inline void set_ref_count(int x) { atomic_set(&n_ref_count, x); } + inline int inc_ref_count() { return atomic_fetch_and_inc(&n_ref_count); } + inline int dec_ref_count() { return atomic_fetch_and_dec(&n_ref_count); } + inline int add_ref_count(int x) { return atomic_fetch_add_relaxed(x, &n_ref_count); } + inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.ref; } + inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.ref; } + inline unsigned int lwip_pbuf_dec_ref_count() + { + if (likely(lwip_pbuf.ref)) { + --lwip_pbuf.ref; + } + return lwip_pbuf.ref; + } + +public: + /* This field must be first in this class. It encapsulates pbuf structure from lwip */ + struct pbuf lwip_pbuf; uint8_t *p_buffer; static inline size_t buffer_node_offset(void) @@ -184,52 +212,15 @@ class mem_buf_desc_t { size_t sz_buffer; // this is the size of the buffer size_t sz_data; // this is the amount of data inside the buffer (sz_data <= sz_buffer) - // Tx: qp_mgr owns the mem_buf_desc and the associated data buffer - // Rx: cq_mgr owns the mem_buf_desc and the associated data buffer + // Tx: cq_mgr_tx owns the mem_buf_desc and the associated data buffer + // Rx: cq_mgr_rx owns the mem_buf_desc and the associated data buffer ring_slave *p_desc_owner; private: atomic_t n_ref_count; // number of interested receivers (sockinfo) [can be modified only in - // cq_mgr context] - + // cq_mgr_rx context] public: - inline void clear_transport_data(void) - { - // rx field is the largest in the union, this clears tx as well. 
- memset((void *)&rx, 0, sizeof(rx)); - } - - inline mem_buf_desc_t *clone() - { - mem_buf_desc_t *p_desc = new mem_buf_desc_t(*this); - INIT_LIST_HEAD(&p_desc->buffer_node.head); - p_desc->m_flags |= mem_buf_desc_t::CLONED; - return p_desc; - } - - inline int get_ref_count() const { return atomic_read(&n_ref_count); } - - inline void reset_ref_count() { atomic_set(&n_ref_count, 0); } - - inline void set_ref_count(int x) { atomic_set(&n_ref_count, x); } - - inline int inc_ref_count() { return atomic_fetch_and_inc(&n_ref_count); } - - inline int dec_ref_count() { return atomic_fetch_and_dec(&n_ref_count); } - - inline int add_ref_count(int x) { return atomic_fetch_add_relaxed(x, &n_ref_count); } - - inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.pbuf.ref; } - - inline unsigned int lwip_pbuf_dec_ref_count() - { - if (likely(lwip_pbuf.pbuf.ref)) { - --lwip_pbuf.pbuf.ref; - } - return lwip_pbuf.pbuf.ref; - } - - inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.pbuf.ref; } + uint64_t unused_padding; // Align the structure to the cache line boundary }; typedef xlio_list_t descq_t; diff --git a/src/core/proto/mem_desc.h b/src/core/proto/mem_desc.h index 682705c4c..bbc16ecc2 100644 --- a/src/core/proto/mem_desc.h +++ b/src/core/proto/mem_desc.h @@ -194,7 +194,7 @@ class zcopy_hugepage_mgr : public lock_spin { page = iter->second; } else { page = new zcopy_hugepage(page_addr, m_hugepage_size); - if (likely(page != NULL)) { + if (likely(page)) { m_hugepage_map[page_addr] = page; } } diff --git a/src/core/proto/neighbour.cpp b/src/core/proto/neighbour.cpp index 20668892b..21bad8bae 100644 --- a/src/core/proto/neighbour.cpp +++ b/src/core/proto/neighbour.cpp @@ -159,18 +159,18 @@ inline int neigh_eth::build_uc_neigh_val() neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_resources) : cache_entry_subject(key) - , m_cma_id(NULL) + , m_cma_id(nullptr) , m_src_addr(in6addr_any) , m_rdma_port_space((enum rdma_port_space)0) - , m_state_machine(NULL) + , m_state_machine(nullptr) , m_type(UNKNOWN) , m_trans_type(_type) , m_state(false) , m_err_counter(0) - , m_timer_handle(NULL) + , m_timer_handle(nullptr) , m_arp_counter(0) , m_p_dev(key.get_net_device_val()) - , m_p_ring(NULL) + , m_p_ring(nullptr) , m_is_loopback(false) , m_to_str(std::string(priv_xlio_transport_type_str(m_trans_type)) + ":" + get_key().to_str()) , m_id(0) @@ -183,7 +183,7 @@ neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_res m_val = NULL; BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_dev == NULL) { + if (!m_p_dev) { neigh_logpanic("get_net_dev return NULL"); } @@ -221,11 +221,11 @@ neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_res // Allocate one ring for g_p_neigh_table_mgr. All eigh_entry objects will share the same ring. 
ring_alloc_logic_attr ring_attr(RING_LOGIC_PER_OBJECT, true); - m_ring_allocation_logic = ring_allocation_logic_tx(g_p_neigh_table_mgr, ring_attr, this); + m_ring_allocation_logic = ring_allocation_logic_tx(g_p_neigh_table_mgr, ring_attr); if (is_init_resources) { m_p_ring = m_p_dev->reserve_ring(m_ring_allocation_logic.get_key()); - if (m_p_ring == NULL) { + if (!m_p_ring) { neigh_logpanic("reserve_ring return NULL"); } m_id = m_p_ring->generate_id(); @@ -244,11 +244,11 @@ neigh_entry::~neigh_entry() if (m_state_machine) { delete m_state_machine; - m_state_machine = NULL; + m_state_machine = nullptr; } if (m_p_dev && m_p_ring) { m_p_dev->release_ring(m_ring_allocation_logic.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } if (m_val) { delete m_val; @@ -261,7 +261,7 @@ neigh_entry::~neigh_entry() bool neigh_entry::is_deletable() { - if (m_state_machine == NULL) { + if (!m_state_machine) { return true; } @@ -282,7 +282,7 @@ void neigh_entry::clean_obj() m_lock.lock(); set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; if (g_p_event_handler_manager->is_running()) { g_p_event_handler_manager->unregister_timers_event_and_delete(this); m_lock.unlock(); @@ -335,7 +335,7 @@ void neigh_entry::handle_timer_expired(void *ctx) neigh_logdbg("Timeout expired!"); // Clear Timer Handler - m_timer_handle = NULL; + m_timer_handle = nullptr; m_sm_lock.lock(); int sm_state = m_state_machine->get_curr_state(); @@ -447,7 +447,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) // Find number of ip fragments (-> packets, buffers, buffer descs...) neigh_logdbg("ENTER post_send_udp_ipv4"); int n_num_frags = 1; - mem_buf_desc_t *p_mem_buf_desc, *tmp = NULL; + mem_buf_desc_t *p_mem_buf_desc, *tmp = nullptr; void *p_pkt; void *p_ip_hdr; void *p_udp_hdr; @@ -475,7 +475,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) // Get all needed tx buf descriptor and data buffers p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } @@ -553,7 +553,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) NOT_IN_USE(id); // Fix unused-but-set error when bebug logs are disabled tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return value of post send when we reach the HW we consider that we // completed our job @@ -582,7 +582,7 @@ bool neigh_entry::post_send_udp_ipv6_fragmented(neigh_send_data *n_send_data, si (sz_udp_payload + max_payload_size_per_packet - 1) / max_payload_size_per_packet; mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } @@ -597,7 +597,7 @@ bool neigh_entry::post_send_udp_ipv6_not_fragmented(neigh_send_data *n_send_data { neigh_logdbg("ENTER post_send_udp_ipv6_not_fragmented"); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. 
not enough tx buffers"); return false; } @@ -672,14 +672,14 @@ bool neigh_entry::post_send_tcp(neigh_send_data *p_data) p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } BULLSEYE_EXCLUDE_BLOCK_END - p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->lwip_pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; + p_mem_buf_desc->p_next_desc = nullptr; // copy L4 neigh buffer to tx buffer memcpy((void *)(p_mem_buf_desc->p_buffer + h->m_aligned_l2_l3_len), p_data->m_iov.iov_base, @@ -712,9 +712,9 @@ bool neigh_entry::post_send_tcp(neigh_send_data *p_data) neigh_logerr("p_buffer - addr=%d, m_total_hdr_len=%u, p_buffer=%p, type=%d, len=%d, " "tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n", (int)(p_mem_buf_desc->p_buffer - (uint8_t *)m_sge.addr), h->m_total_hdr_len, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type, - p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len, - p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff); + p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.type, + p_mem_buf_desc->lwip_pbuf.len, p_mem_buf_desc->lwip_pbuf.tot_len, + p_mem_buf_desc->lwip_pbuf.payload, hdr_alignment_diff); } m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; @@ -753,7 +753,7 @@ bool neigh_entry::get_peer_info(neigh_val *p_val) { neigh_logfunc("calling neigh_entry get_peer_info. state = %d", m_state); BULLSEYE_EXCLUDE_BLOCK_START - if (p_val == NULL) { + if (!p_val) { neigh_logdbg("p_val is NULL, return false"); return false; } @@ -814,7 +814,7 @@ void neigh_entry::handle_neigh_event(neigh_nl_event *nl_ev) case NUD_REACHABLE: case NUD_PERMANENT: { BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logerr("m_state_machine: not a valid case"); break; } @@ -844,7 +844,7 @@ void neigh_entry::handle_neigh_event(neigh_nl_event *nl_ev) case NUD_STALE: { BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logerr("m_state_machine: not a valid case"); break; } @@ -961,7 +961,7 @@ neigh_entry::event_t neigh_entry::rdma_event_mapping(struct rdma_cm_event *p_rdm { // General check of cma_id BULLSEYE_EXCLUDE_BLOCK_START - if (m_cma_id != NULL && m_cma_id != p_rdma_cm_event->id) { + if (m_cma_id && m_cma_id != p_rdma_cm_event->id) { neigh_logerr("cma_id %p != event->cma_id %p", m_cma_id, p_rdma_cm_event->id); return EV_UNHANDLED; } @@ -1165,7 +1165,7 @@ int neigh_entry::priv_enter_init_resolution() sock_addr src_sa(get_family(), &m_src_addr, 0); /* we had issues passing unicast src addr, let it find the correct one itself */ - sockaddr *src_p_sa = get_dst_addr().is_mc() ? src_sa.get_p_sa() : NULL; + sockaddr *src_p_sa = get_dst_addr().is_mc() ? 
src_sa.get_p_sa() : nullptr; int timeout_ms = RESOLVE_TIMEOUT_MS; if (get_family() == AF_INET6 && @@ -1378,14 +1378,14 @@ void neigh_entry::priv_destroy_cma_id() neigh_logdbg("Failed in rdma_destroy_id (errno=%d %m)", errno); } ENDIF_RDMACM_FAILURE; - m_cma_id = NULL; + m_cma_id = nullptr; } } void *neigh_entry::priv_register_timer_event(int timeout_msec, timer_handler *handler, timer_req_type_t req_type, void *user_data) { - void *_timer_handler = NULL; + void *_timer_handler = nullptr; std::lock_guard lock(m_lock); if (!is_cleaned()) { _timer_handler = g_p_event_handler_manager->register_timer_event(timeout_msec, handler, @@ -1402,7 +1402,7 @@ void neigh_entry::priv_unregister_timer() // as ONESHOT timer free itself after it run. // TODO: unregister all timers? is there just one or more? // g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } //============================================================== neigh_eth @@ -1426,22 +1426,22 @@ neigh_eth::neigh_eth(neigh_key key) sm_short_table_line_t short_sm_table[] = { // {curr state, event, next state, action func } - {ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, NULL}, - {ST_NOT_ACTIVE, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_ERROR, EV_KICK_START, ST_INIT, NULL}, - {ST_INIT, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, NULL}, - {ST_INIT_RESOLUTION, EV_RDMA_RESOLVE_FAILED, ST_SOLICIT_SEND, NULL}, - {ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ADDR_RESOLVED, NULL}, - {ST_INIT_RESOLUTION, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_ADDR_RESOLVED, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_SOLICIT_SEND, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_SOLICIT_SEND, EV_TIMEOUT_EXPIRED, ST_ERROR, NULL}, - {ST_SOLICIT_SEND, EV_ERROR, ST_ERROR, NULL}, - {ST_READY, EV_ERROR, ST_ERROR, NULL}, - {ST_INIT, EV_ERROR, ST_ERROR, NULL}, - {ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, NULL}, - {ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, NULL}, + {ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, nullptr}, + {ST_NOT_ACTIVE, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_ERROR, EV_KICK_START, ST_INIT, nullptr}, + {ST_INIT, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, nullptr}, + {ST_INIT_RESOLUTION, EV_RDMA_RESOLVE_FAILED, ST_SOLICIT_SEND, nullptr}, + {ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ADDR_RESOLVED, nullptr}, + {ST_INIT_RESOLUTION, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_ADDR_RESOLVED, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_SOLICIT_SEND, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_SOLICIT_SEND, EV_TIMEOUT_EXPIRED, ST_ERROR, nullptr}, + {ST_SOLICIT_SEND, EV_ERROR, ST_ERROR, nullptr}, + {ST_READY, EV_ERROR, ST_ERROR, nullptr}, + {ST_INIT, EV_ERROR, ST_ERROR, nullptr}, + {ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, nullptr}, + {ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, nullptr}, // Entry functions {ST_INIT, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init}, {ST_INIT_RESOLUTION, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init_resolution}, @@ -1459,13 +1459,13 @@ neigh_eth::neigh_eth(neigh_key key) EV_LAST, // max events short_sm_table, // short table general_st_entry, // default entry function - NULL, // default leave function - NULL, // default func + nullptr, // default leave function + nullptr, // default func print_event_info // debug function ); BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logpanic("Failed allocating state_machine"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1601,7 
+1601,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) net_device_val_eth *netdevice_eth = dynamic_cast(m_p_dev); BULLSEYE_EXCLUDE_BLOCK_START - if (netdevice_eth == NULL) { + if (!netdevice_eth) { neigh_logdbg("Net dev is NULL not sending ARP"); return false; } @@ -1616,7 +1616,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) const unsigned char *peer_mac = dst->get_address(); BULLSEYE_EXCLUDE_BLOCK_START - if (src == NULL || dst == NULL) { + if (!src || !dst) { neigh_logdbg("src or dst is NULL not sending ARP"); return false; } @@ -1628,7 +1628,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) 0, 0); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("No free TX buffer, not sending ARP"); return false; } @@ -1655,7 +1655,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)h.m_transport_header_tx_offset); m_sge.length = sizeof(eth_arp_hdr) + h.m_total_hdr_len; m_sge.lkey = p_mem_buf_desc->lkey; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; m_p_ring->send_ring_buffer(m_id, &m_send_wqe, (xlio_wr_tx_packet_attr)0); @@ -1671,7 +1671,7 @@ bool neigh_eth::send_neighbor_solicitation() net_device_val_eth *net_dev = dynamic_cast(m_p_dev); BULLSEYE_EXCLUDE_BLOCK_START - if (net_dev == nullptr) { + if (!net_dev) { neigh_logdbg("Net device is unavailable - not sending NS"); return false; } @@ -1679,7 +1679,7 @@ bool neigh_eth::send_neighbor_solicitation() const L2_address *src_mac = m_p_dev->get_l2_address(); BULLSEYE_EXCLUDE_BLOCK_START - if (src_mac == nullptr) { + if (!src_mac) { neigh_logdbg("Source MAC address is unavailable - not sending NS"); return false; } @@ -1715,7 +1715,7 @@ bool neigh_eth::send_neighbor_solicitation() htons(ETH_P_IPV6), m_src_addr, dst_snm, 0, 0); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("No free TX buffer - not sending NS"); return false; } @@ -1787,7 +1787,7 @@ bool neigh_eth::send_neighbor_solicitation() m_sge.addr = reinterpret_cast(head); m_sge.length = static_cast(tail - head); m_sge.lkey = p_mem_buf_desc->lkey; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; neigh_logdbg("NS request: base=%p addr=%p length=%" PRIu32, p_mem_buf_desc->p_buffer, (void *)m_sge.addr, m_sge.length); diff --git a/src/core/proto/neighbour.h b/src/core/proto/neighbour.h index b50fb79f0..6e9353de2 100644 --- a/src/core/proto/neighbour.h +++ b/src/core/proto/neighbour.h @@ -108,7 +108,7 @@ class neigh_val { public: neigh_val() : m_trans_type(XLIO_TRANSPORT_UNKNOWN) - , m_l2_address(NULL) {}; + , m_l2_address(nullptr) {}; virtual ~neigh_val() {}; virtual void zero_all_members() @@ -116,7 +116,7 @@ class neigh_val { if (m_l2_address) { delete m_l2_address; } - m_l2_address = NULL; + m_l2_address = nullptr; }; const L2_address *get_l2_address() const { return m_l2_address; }; @@ -275,8 +275,8 @@ class neigh_entry : public cache_entry_subject, bool priv_is_reachable(int state) { return state & (NUD_REACHABLE | NUD_PERMANENT); } bool priv_is_failed(int state) { return state & (NUD_FAILED | NUD_INCOMPLETE); } - void event_handler(event_t event, 
void *p_event_info = NULL); - void priv_event_handler_no_locks(event_t event, void *p_event_info = NULL); + void event_handler(event_t event, void *p_event_info = nullptr); + void priv_event_handler_no_locks(event_t event, void *p_event_info = nullptr); virtual bool priv_handle_neigh_is_l2_changed(address_t) { return false; }; void priv_handle_neigh_reachable_event(); @@ -291,7 +291,7 @@ class neigh_entry : public cache_entry_subject, virtual bool prepare_to_send_packet(neigh_send_data *) { return true; }; void handle_timer_expired(void *user_data) override; - virtual ring_user_id_t generate_ring_user_id(header *h = NULL) + virtual ring_user_id_t generate_ring_user_id(header *h = nullptr) { NOT_IN_USE(h); return m_p_ring->generate_id(); @@ -328,7 +328,7 @@ class neigh_eth : public neigh_entry { bool is_deletable() override; protected: - ring_user_id_t generate_ring_user_id(header *h = NULL) override; + ring_user_id_t generate_ring_user_id(header *h = nullptr) override; private: int build_mc_neigh_val(); diff --git a/src/core/proto/neighbour_table_mgr.cpp b/src/core/proto/neighbour_table_mgr.cpp index e592cf019..73d1697c1 100644 --- a/src/core/proto/neighbour_table_mgr.cpp +++ b/src/core/proto/neighbour_table_mgr.cpp @@ -50,7 +50,7 @@ #define neigh_mgr_logfunc __log_func #define neigh_mgr_logfuncall __log_funcall -neigh_table_mgr *g_p_neigh_table_mgr = NULL; +neigh_table_mgr *g_p_neigh_table_mgr = nullptr; #define DEFAULT_GARBAGE_COLLECTOR_TIME 100000 @@ -110,7 +110,7 @@ neigh_entry *neigh_table_mgr::create_new_entry(neigh_key neigh_key, const observ return (new neigh_eth(neigh_key)); } else { neigh_mgr_logdbg("Cannot create new entry, transport type is UNKNOWN"); - return NULL; + return nullptr; } } @@ -121,7 +121,7 @@ void neigh_table_mgr::notify_cb(event *ev) neigh_nl_event *nl_ev = dynamic_cast(ev); BULLSEYE_EXCLUDE_BLOCK_START - if (nl_ev == NULL) { + if (!nl_ev) { neigh_mgr_logdbg("Non neigh_nl_event type"); return; } diff --git a/src/core/proto/netlink_socket_mgr.cpp b/src/core/proto/netlink_socket_mgr.cpp index 75e0949ab..e5adc5a57 100644 --- a/src/core/proto/netlink_socket_mgr.cpp +++ b/src/core/proto/netlink_socket_mgr.cpp @@ -90,14 +90,14 @@ bool netlink_socket_mgr::query(const struct nlmsghdr *nl_msg, char *buf, int &le uint32_t nl_seq = nl_msg->nlmsg_seq; BULLSEYE_EXCLUDE_BLOCK_START - if ((sockfd = orig_os_api.socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE)) < 0) { + if ((sockfd = SYSCALL(socket, PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE)) < 0) { __log_err("NL socket creation failed, errno = %d", errno); return false; } - if (orig_os_api.fcntl(sockfd, F_SETFD, FD_CLOEXEC) != 0) { + if (SYSCALL(fcntl, sockfd, F_SETFD, FD_CLOEXEC) != 0) { __log_warn("Fail in fcntl, errno = %d", errno); } - if ((len = orig_os_api.send(sockfd, nl_msg, nl_msg->nlmsg_len, 0)) < 0) { + if ((len = SYSCALL(send, sockfd, nl_msg, nl_msg->nlmsg_len, 0)) < 0) { __log_err("Write to NL socket failed, errno = %d", errno); } if (len > 0 && (len = recv_info(sockfd, nl_pid, nl_seq, buf)) < 0) { @@ -126,7 +126,7 @@ int netlink_socket_mgr::recv_info(int sockfd, uint32_t pid, uint32_t seq, char * do { // Receive response from the kernel BULLSEYE_EXCLUDE_BLOCK_START - if ((readLen = orig_os_api.recv(sockfd, buf_ptr, MSG_BUFF_SIZE - msgLen, 0)) < 0) { + if ((readLen = SYSCALL(recv, sockfd, buf_ptr, MSG_BUFF_SIZE - msgLen, 0)) < 0) { __log_err("NL socket read failed, errno = %d", errno); return -1; } @@ -161,7 +161,7 @@ int netlink_socket_mgr::recv_info(int sockfd, uint32_t pid, uint32_t seq, char * // Update data in a 
table void netlink_socket_mgr::update_tbl(nl_data_t data_type) { - struct nlmsghdr *nl_msg = NULL; + struct nlmsghdr *nl_msg = nullptr; char *buf; int len = 0; diff --git a/src/core/proto/nvme_parse_input_args.h b/src/core/proto/nvme_parse_input_args.h index f43aa37d0..7f2e30c28 100644 --- a/src/core/proto/nvme_parse_input_args.h +++ b/src/core/proto/nvme_parse_input_args.h @@ -63,7 +63,7 @@ class nvme_pdu_mdesc : public mem_desc { auto this_addr = reinterpret_cast(aligned_alloc( alignof(nvme_pdu_mdesc), num_segments * (sizeof(iovec) + sizeof(xlio_pd_key)) + sizeof(nvme_pdu_mdesc))); - if (this_addr == nullptr) { + if (!this_addr) { return nullptr; } auto container = std::unique_ptr(this_addr); @@ -135,7 +135,7 @@ class nvme_pdu_mdesc : public mem_desc { : chunk(nullptr, 0U, LKEY_TX_DEFAULT) {}; inline bool is_valid() { - return iov.iov_base != nullptr && iov.iov_len != 0U && mkey != LKEY_TX_DEFAULT; + return iov.iov_base && iov.iov_len != 0U && mkey != LKEY_TX_DEFAULT; } }; diff --git a/src/core/proto/route_entry.cpp b/src/core/proto/route_entry.cpp index 670cdf002..80a4b80b1 100644 --- a/src/core/proto/route_entry.cpp +++ b/src/core/proto/route_entry.cpp @@ -53,8 +53,8 @@ route_entry::route_entry(route_rule_table_key rtk) , cache_observer() , m_b_offloaded_net_dev(false) , m_is_valid(false) - , m_p_net_dev_entry(NULL) - , m_p_net_dev_val(NULL) + , m_p_net_dev_entry(nullptr) + , m_p_net_dev_val(nullptr) { m_val = NULL; cache_entry_subject *> *rr_entry = NULL; @@ -67,7 +67,7 @@ route_entry::~route_entry() unregister_to_net_device(); if (m_p_rr_entry) { g_p_rule_table_mgr->unregister_observer(get_key(), this); - m_p_rr_entry = NULL; + m_p_rr_entry = nullptr; } } @@ -90,7 +90,7 @@ void route_entry::set_val(IN route_val *&val) void route_entry::register_to_net_device() { - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; if (g_p_net_device_table_mgr->register_observer(m_val->get_if_index(), this, &net_dev_entry)) { rt_entry_logdbg("route_entry [%p] is registered to an offloaded device", this); m_p_net_dev_entry = (net_device_entry *)net_dev_entry; @@ -120,8 +120,8 @@ void route_entry::unregister_to_net_device() } } - m_p_net_dev_entry = NULL; - m_p_net_dev_val = NULL; + m_p_net_dev_entry = nullptr; + m_p_net_dev_val = nullptr; } void route_entry::notify_cb() @@ -131,7 +131,7 @@ void route_entry::notify_cb() if (m_p_net_dev_entry->is_valid()) { m_p_net_dev_entry->get_val(m_p_net_dev_val); } else { - m_p_net_dev_val = NULL; + m_p_net_dev_val = nullptr; } notify_observers(); } diff --git a/src/core/proto/route_table_mgr.cpp b/src/core/proto/route_table_mgr.cpp index 94aa606ca..45983f3b3 100644 --- a/src/core/proto/route_table_mgr.cpp +++ b/src/core/proto/route_table_mgr.cpp @@ -30,6 +30,7 @@ * SOFTWARE. 
*/ +#include #include #include #include @@ -48,7 +49,7 @@ #include "vlogger/vlogger.h" #include "core/util/vtypes.h" #include "core/util/utils.h" -#include "core/sock/socket_fd_api.h" +#include "core/sock/sockinfo.h" #include "core/sock/sock-redirect.h" #include "core/dev/net_device_table_mgr.h" #include "core/util/ip_address.h" @@ -70,7 +71,7 @@ static inline route_val *find_route_val(route_table_t &table, const ip_address &dst, uint32_t table_id); -route_table_mgr *g_p_route_table_mgr = NULL; +route_table_mgr *g_p_route_table_mgr = nullptr; route_table_mgr::route_table_mgr() : netlink_socket_mgr() @@ -217,7 +218,7 @@ void route_table_mgr::rt_mgr_update_source_ip(route_table_t &table) if (!val.get_gw_addr().is_anyaddr() && val.get_src_addr().is_anyaddr()) { route_val *p_val_dst; uint32_t table_id = val.get_table_id(); - if ((p_val_dst = ::find_route_val(table, val.get_gw_addr(), table_id)) != nullptr) { + if ((p_val_dst = ::find_route_val(table, val.get_gw_addr(), table_id))) { if (!p_val_dst->get_src_addr().is_anyaddr()) { val.set_src_addr(p_val_dst->get_src_addr()); } else if (&val == p_val_dst) { // gateway of the entry lead to same entry @@ -422,7 +423,7 @@ bool route_table_mgr::route_resolve(IN route_rule_table_key key, OUT route_resul const sa_family_t family = key.get_family(); route_table_t &rt = family == AF_INET ? m_table_in4 : m_table_in6; - route_val *p_val = NULL; + route_val *p_val = nullptr; auto table_id_list = g_p_rule_table_mgr->rule_resolve(key); @@ -476,12 +477,12 @@ void route_table_mgr::update_entry(INOUT route_entry *p_ent, bool b_register_to_ rule_entry *p_rr_entry = p_ent->get_rule_entry(); std::deque *p_rr_val; if (p_rr_entry && p_rr_entry->get_val(p_rr_val)) { - route_val *p_val = NULL; + route_val *p_val = nullptr; const ip_address &peer_ip = p_ent->get_key().get_dst_ip(); for (const auto &p_rule_val : *p_rr_val) { uint32_t table_id = p_rule_val->get_table_id(); - if ((p_val = ::find_route_val(rt, peer_ip, table_id)) != nullptr) { + if ((p_val = ::find_route_val(rt, peer_ip, table_id))) { p_ent->set_val(p_val); if (b_register_to_net_dev) { // Check if broadcast IPv4 which is NOT supported diff --git a/src/core/proto/rule_table_mgr.cpp b/src/core/proto/rule_table_mgr.cpp index 774cd82dc..73f059cc7 100644 --- a/src/core/proto/rule_table_mgr.cpp +++ b/src/core/proto/rule_table_mgr.cpp @@ -66,7 +66,7 @@ #define DEFAULT_RULE_TABLE_SIZE 64 -rule_table_mgr *g_p_rule_table_mgr = NULL; +rule_table_mgr *g_p_rule_table_mgr = nullptr; static inline bool is_matching_rule(const route_rule_table_key &key, const rule_val &val); rule_table_mgr::rule_table_mgr() diff --git a/src/core/proto/xlio_lwip.cpp b/src/core/proto/xlio_lwip.cpp index f8aea1b0c..bc26b48d3 100644 --- a/src/core/proto/xlio_lwip.cpp +++ b/src/core/proto/xlio_lwip.cpp @@ -35,7 +35,6 @@ #include "core/event/event_handler_manager.h" #include "core/sock/sockinfo_tcp.h" -#include "core/lwip/init.h" #include "core/lwip/tcp_impl.h" #include "xlio_lwip.h" @@ -82,7 +81,7 @@ u8_t xlio_lwip::read_tcp_timestamp_option(void) return res; } -xlio_lwip *g_p_lwip = 0; +xlio_lwip *g_p_lwip = nullptr; /** * LWIP "network" driver code @@ -102,7 +101,6 @@ xlio_lwip::xlio_lwip() lwip_tcp_mss = get_lwip_tcp_mss(safe_mce_sys().mtu, safe_mce_sys().lwip_mss); lwip_tcp_snd_buf = safe_mce_sys().tcp_send_buffer_size; - lwip_zc_tx_size = safe_mce_sys().zc_tx_size; lwip_tcp_nodelay_treshold = safe_mce_sys().tcp_nodelay_treshold; BULLSEYE_EXCLUDE_BLOCK_END @@ -119,10 +117,6 @@ xlio_lwip::xlio_lwip() rcv_wnd_scale = 0; } - // Bring up LWIP - 
lwip_init(); - lwip_logdbg("LWIP subsystem initialized"); - // In case of batching is not requested we fetch tcp_seg from the ring directly. // This creates hot segments, CPU cache wise. if (safe_mce_sys().tx_segs_batch_tcp == 1U) { @@ -142,12 +136,13 @@ xlio_lwip::xlio_lwip() set_tmr_resolution(safe_mce_sys().tcp_timer_resolution_msec); // tcp_ticks increases in the rate of tcp slow_timer void *node = g_p_event_handler_manager->register_timer_event( - safe_mce_sys().tcp_timer_resolution_msec * 2, this, PERIODIC_TIMER, 0); + safe_mce_sys().tcp_timer_resolution_msec * 2, this, PERIODIC_TIMER, nullptr); if (!node) { lwip_logdbg("LWIP: failed to register timer event"); free_lwip_resources(); throw_xlio_exception("LWIP: failed to register timer event"); } + lwip_logdbg("LWIP subsystem initialized"); } xlio_lwip::~xlio_lwip() diff --git a/src/core/proto/xlio_lwip.h b/src/core/proto/xlio_lwip.h index 0b50ff610..dff65f502 100644 --- a/src/core/proto/xlio_lwip.h +++ b/src/core/proto/xlio_lwip.h @@ -35,7 +35,6 @@ #include "core/event/timer_handler.h" #include "core/proto/mem_buf_desc.h" -#include "core/sock/pkt_rcvr_sink.h" #include "core/lwip/tcp.h" typedef enum xlio_wr_tx_packet_attr { diff --git a/src/core/sock/bind_no_port.cpp b/src/core/sock/bind_no_port.cpp index 83e17dca7..32a9f5879 100644 --- a/src/core/sock/bind_no_port.cpp +++ b/src/core/sock/bind_no_port.cpp @@ -52,7 +52,7 @@ int bind_no_port::set_src_port_in_db(int fd, in_port_t port, flow_tuple &tuple) if (INPORT_ANY == port) { sock_addr addr; socklen_t addr_len = sizeof(addr); - if ((ret = orig_os_api.getsockname(fd, addr.get_p_sa(), &addr_len))) { + if ((ret = SYSCALL(getsockname, fd, addr.get_p_sa(), &addr_len))) { return ret; } port = addr.get_in_port(); @@ -92,7 +92,7 @@ int bind_no_port::bind_and_set_port_map(const sock_addr &src, const sock_addr &d in_port_t chosen_port = choose_src_port(tuple); addr.set_in_port(chosen_port); - if ((ret = orig_os_api.bind(fd, addr.get_p_sa(), addr_len))) { + if ((ret = SYSCALL(bind, fd, addr.get_p_sa(), addr_len))) { return ret; } diff --git a/src/core/sock/cleanable_obj.h b/src/core/sock/cleanable_obj.h index d8fe73649..415a00f02 100644 --- a/src/core/sock/cleanable_obj.h +++ b/src/core/sock/cleanable_obj.h @@ -41,7 +41,7 @@ class cleanable_obj { public: cleanable_obj() { m_b_cleaned = false; }; - virtual ~cleanable_obj() {}; + virtual ~cleanable_obj() = default; /* This function should be used just for objects that * was allocated via new() (not by new[], nor by placement new, nor a local object on the stack, diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 0a288a29c..56c30ce20 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -36,9 +36,8 @@ #include "util/libxlio.h" #include "fd_collection.h" #include "sock-redirect.h" -#include "socket_fd_api.h" +#include "sockinfo.h" #include "sockinfo_udp.h" -#include "pipeinfo.h" #include "sockinfo_tcp.h" #include "iomux/epfd_info.h" @@ -55,7 +54,7 @@ #define fdcoll_logdbg __log_dbg #define fdcoll_logfunc __log_func -fd_collection *g_p_fd_collection = NULL; +fd_collection *g_p_fd_collection = nullptr; fd_collection::fd_collection() : lock_mutex_recursive("fd_collection") @@ -76,8 +75,8 @@ fd_collection::fd_collection() } fdcoll_logdbg("using open files max limit of %d file descriptors", m_n_fd_map_size); - m_p_sockfd_map = new socket_fd_api *[m_n_fd_map_size]; - memset(m_p_sockfd_map, 0, m_n_fd_map_size * sizeof(socket_fd_api *)); + m_p_sockfd_map = new sockinfo *[m_n_fd_map_size]; + 
memset(m_p_sockfd_map, 0, m_n_fd_map_size * sizeof(sockinfo *)); m_p_epfd_map = new epfd_info *[m_n_fd_map_size]; memset(m_p_epfd_map, 0, m_n_fd_map_size * sizeof(epfd_info *)); @@ -97,16 +96,16 @@ fd_collection::~fd_collection() m_n_fd_map_size = -1; delete[] m_p_sockfd_map; - m_p_sockfd_map = NULL; + m_p_sockfd_map = nullptr; delete[] m_p_epfd_map; - m_p_epfd_map = NULL; + m_p_epfd_map = nullptr; delete[] m_p_cq_channel_map; - m_p_cq_channel_map = NULL; + m_p_cq_channel_map = nullptr; delete[] m_p_tap_map; - m_p_tap_map = NULL; + m_p_tap_map = nullptr; m_epfd_lst.clear_without_cleanup(); m_pending_to_remove_lst.clear_without_cleanup(); @@ -121,7 +120,7 @@ void fd_collection::prepare_to_close() for (int fd = 0; fd < m_n_fd_map_size; ++fd) { if (m_p_sockfd_map[fd]) { if (!g_is_forked_child) { - socket_fd_api *p_sfd_api = get_sockfd(fd); + sockinfo *p_sfd_api = get_sockfd(fd); if (p_sfd_api) { p_sfd_api->prepare_to_close(true); } @@ -131,6 +130,7 @@ void fd_collection::prepare_to_close() unlock(); } +// Called in destructor after Internal-Thread destroyed void fd_collection::clear() { int fd; @@ -147,8 +147,8 @@ void fd_collection::clear() * these sockets can not be deleted through the it. */ while (!m_pending_to_remove_lst.empty()) { - socket_fd_api *p_sfd_api = m_pending_to_remove_lst.get_and_pop_back(); - p_sfd_api->clean_obj(); + sockinfo *p_sfd_api = m_pending_to_remove_lst.get_and_pop_back(); + p_sfd_api->clean_socket_obj(); } g_global_stat_static.n_pending_sockets = 0; @@ -158,14 +158,14 @@ void fd_collection::clear() for (fd = 0; fd < m_n_fd_map_size; ++fd) { if (m_p_sockfd_map[fd]) { if (!g_is_forked_child) { - socket_fd_api *p_sfd_api = get_sockfd(fd); + sockinfo *p_sfd_api = get_sockfd(fd); if (p_sfd_api) { p_sfd_api->statistics_print(); - p_sfd_api->clean_obj(); + p_sfd_api->clean_socket_obj(); } } - m_p_sockfd_map[fd] = NULL; + m_p_sockfd_map[fd] = nullptr; fdcoll_logdbg("destroyed fd=%d", fd); } @@ -174,7 +174,7 @@ void fd_collection::clear() if (p_epfd) { delete p_epfd; } - m_p_epfd_map[fd] = NULL; + m_p_epfd_map[fd] = nullptr; fdcoll_logdbg("destroyed epfd=%d", fd); } @@ -183,12 +183,12 @@ void fd_collection::clear() if (p_cq_ch_info) { delete p_cq_ch_info; } - m_p_cq_channel_map[fd] = NULL; + m_p_cq_channel_map[fd] = nullptr; fdcoll_logdbg("destroyed cq_channel_fd=%d", fd); } if (m_p_tap_map[fd]) { - m_p_tap_map[fd] = NULL; + m_p_tap_map[fd] = nullptr; fdcoll_logdbg("destroyed tapfd=%d", fd); } } @@ -203,7 +203,7 @@ int fd_collection::addsocket(int fd, int domain, int type, bool check_offload /* const int SOCK_TYPE_MASK = 0xf; int sock_type = type & SOCK_TYPE_MASK; int sock_flags = type & ~SOCK_TYPE_MASK; - socket_fd_api *p_sfd_api_obj; + sockinfo *p_sfd_api_obj; fdcoll_logfunc("fd=%d domain=%d type=%d", fd, domain, type); @@ -255,7 +255,7 @@ int fd_collection::addsocket(int fd, int domain, int type, bool check_offload /* lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_sfd_api_obj == NULL) { + if (!p_sfd_api_obj) { fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", fd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -311,7 +311,7 @@ void fd_collection::offloading_rule_change_thread(bool offloaded, pthread_t tid) void fd_collection::statistics_print_helper(int fd, vlog_levels_t log_level) { - socket_fd_api *socket_fd; + sockinfo *socket_fd; epfd_info *epoll_fd; if ((socket_fd = get_sockfd(fd))) { @@ -348,58 +348,6 @@ void fd_collection::statistics_print(int fd, vlog_levels_t log_level) vlog_printf(log_level, "==================================================\n"); } -int 
fd_collection::addpipe(int fdrd, int fdwr) -{ - fdcoll_logfunc("fdrd=%d, fdwr=%d", fdrd, fdwr); - - if (!is_valid_fd(fdrd) || !is_valid_fd(fdwr)) { - return -1; - } - - lock(); - - // Sanity check to remove any old objects using the same fd!! - socket_fd_api *p_fdrd_api_obj = get_sockfd(fdrd); - BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdrd_api_obj) { - fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdrd, p_fdrd_api_obj); - unlock(); - handle_close(fdrd, true); - lock(); - } - BULLSEYE_EXCLUDE_BLOCK_END - socket_fd_api *p_fdwr_api_obj = get_sockfd(fdwr); - BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdwr_api_obj) { - fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdwr, p_fdwr_api_obj); - unlock(); - handle_close(fdwr, true); - lock(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - unlock(); - p_fdrd_api_obj = new pipeinfo(fdrd); - p_fdwr_api_obj = new pipeinfo(fdwr); - lock(); - - BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdrd_api_obj == NULL) { - fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdrd); - } - if (p_fdwr_api_obj == NULL) { - fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdwr); - } - BULLSEYE_EXCLUDE_BLOCK_END - - m_p_sockfd_map[fdrd] = p_fdrd_api_obj; - m_p_sockfd_map[fdwr] = p_fdwr_api_obj; - - unlock(); - - return 0; -} - int fd_collection::addepfd(int epfd, int size) { fdcoll_logfunc("epfd=%d", epfd); @@ -424,7 +372,7 @@ int fd_collection::addepfd(int epfd, int size) lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_fd_info == NULL) { + if (!p_fd_info) { fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", epfd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -480,7 +428,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) BULLSEYE_EXCLUDE_BLOCK_END // Sanity check to remove any old objects using the same fd!! - socket_fd_api *p_cq_ch_fd_api_obj = get_sockfd(cq_ch_fd); + sockinfo *p_cq_ch_fd_api_obj = get_sockfd(cq_ch_fd); BULLSEYE_EXCLUDE_BLOCK_START if (p_cq_ch_fd_api_obj) { fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", cq_ch_fd, p_cq_ch_fd_api_obj); @@ -495,9 +443,9 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) BULLSEYE_EXCLUDE_BLOCK_START if (p_cq_ch_info) { fdcoll_logwarn("cq channel fd already exists in fd_collection"); - m_p_cq_channel_map[cq_ch_fd] = NULL; + m_p_cq_channel_map[cq_ch_fd] = nullptr; delete p_cq_ch_info; - p_cq_ch_info = NULL; + p_cq_ch_info = nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -506,7 +454,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_cq_ch_info == NULL) { + if (!p_cq_ch_info) { fdcoll_logpanic("[fd=%d] Failed creating new cq_channel_info (%m)", cq_ch_fd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -517,10 +465,10 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) return 0; } -int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp_pool /*=false*/) +int fd_collection::del_sockfd(int fd, bool is_for_udp_pool /*=false*/) { int ret_val = -1; - socket_fd_api *p_sfd_api; + sockinfo *p_sfd_api; p_sfd_api = get_sockfd(fd); @@ -532,7 +480,9 @@ int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp // 2. Socket deletion when TCP connection == CLOSED if (p_sfd_api->prepare_to_close()) { // the socket is already closable - ret_val = del(fd, b_cleanup, m_p_sockfd_map); + // This may register the socket to be erased by internal thread, + // However, a timer may tick on this socket before it is deleted. 
+ ret_val = del_socket(fd, m_p_sockfd_map); } else { lock(); // The socket is not ready for close. @@ -545,7 +495,7 @@ int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp if (!is_for_udp_pool) { ++g_global_stat_static.n_pending_sockets; } - m_p_sockfd_map[fd] = NULL; + m_p_sockfd_map[fd] = nullptr; m_pending_to_remove_lst.push_front(p_sfd_api); } @@ -581,7 +531,7 @@ void fd_collection::del_tapfd(int fd) } lock(); - m_p_tap_map[fd] = NULL; + m_p_tap_map[fd] = nullptr; unlock(); } @@ -609,6 +559,28 @@ template int fd_collection::del(int fd, bool b_cleanup, cls **map return -1; } +int fd_collection::del_socket(int fd, sockinfo **map_type) +{ + fdcoll_logfunc("fd=%d", fd); + + if (!is_valid_fd(fd)) { + return -1; + } + + lock(); + sockinfo *p_obj = map_type[fd]; + if (p_obj) { + map_type[fd] = nullptr; + unlock(); + p_obj->clean_socket_obj(); + return 0; + } + + fdcoll_logdbg("[fd=%d] Could not find related object", fd); + unlock(); + return -1; +} + void fd_collection::remove_from_all_epfds(int fd, bool passthrough) { epfd_info_list_t::iterator itr; @@ -623,7 +595,7 @@ void fd_collection::remove_from_all_epfds(int fd, bool passthrough) } #if defined(DEFINED_NGINX) -void fd_collection::push_socket_pool(socket_fd_api *sockfd) +void fd_collection::push_socket_pool(sockinfo *sockfd) { lock(); sockfd->prepare_to_close_socket_pool(true); @@ -646,9 +618,9 @@ bool fd_collection::pop_socket_pool(int &fd, bool &add_to_udp_pool, int type) lock(); if (!m_socket_pool.empty()) { // use fd from pool - will skip creation of new fd by os - socket_fd_api *sockfd = m_socket_pool.top(); + sockinfo *sockfd = m_socket_pool.top(); fd = sockfd->get_fd(); - if (m_p_sockfd_map[fd] == NULL) { + if (!m_p_sockfd_map[fd]) { m_p_sockfd_map[fd] = sockfd; m_pending_to_remove_lst.erase(sockfd); } @@ -677,7 +649,7 @@ void fd_collection::handle_socket_pool(int fd) return; } - socket_fd_api *sockfd = get_sockfd(fd); + sockinfo *sockfd = get_sockfd(fd); if (sockfd) { ++m_socket_pool_counter; sockfd->set_params_for_socket_pool(); diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index 5918398c4..d657b4050 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -41,11 +41,11 @@ #include "event/event_handler_manager.h" #include "event/timer_handler.h" #include "sock/cleanable_obj.h" -#include "sock/socket_fd_api.h" +#include "sock/sockinfo.h" #include "iomux/epfd_info.h" #include "utils/lock_wrapper.h" -typedef xlio_list_t sock_fd_api_list_t; +typedef xlio_list_t sock_fd_api_list_t; typedef xlio_list_t epfd_info_list_t; typedef std::unordered_map offload_thread_rule_t; @@ -67,7 +67,7 @@ class cq_channel_info : public cleanable_obj { public: cq_channel_info(ring *p_ring) : m_p_ring(p_ring) {}; - ~cq_channel_info() {}; + ~cq_channel_info() override = default; ring *get_ring() const noexcept { return m_p_ring; }; protected: @@ -77,7 +77,7 @@ class cq_channel_info : public cleanable_obj { class fd_collection : private lock_mutex_recursive { public: fd_collection(); - ~fd_collection(); + ~fd_collection() override; /** * Create and add a sockinfo. Use get_sock() to get it. @@ -87,14 +87,6 @@ class fd_collection : private lock_mutex_recursive { */ int addsocket(int fd, int domain, int type, bool check_offload = false); - /** - * Create pipeinfo. Use get_sock() to get it. - * @param fdrd Read fd. - * @param fdwr Write fd. - * @return 0 on success, -1 on failure. - */ - int addpipe(int fdrd, int fdwr); - /** * Create epfd_info. Use get_epfd() to get it. 
* @param epfd epoll fd. @@ -120,9 +112,9 @@ class fd_collection : private lock_mutex_recursive { int addtapfd(int tapfd, ring_tap *p_ring); /** - * Remove pipeinfo/sockinfo. + * Remove sockinfo. */ - int del_sockfd(int fd, bool b_cleanup = false, bool is_for_udp_pool = false); + int del_sockfd(int fd, bool is_for_udp_pool = false); /** * Remove epfd_info. @@ -145,12 +137,12 @@ class fd_collection : private lock_mutex_recursive { */ inline bool set_immediate_os_sample(int fd); - inline void reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj); - inline void destroy_sockfd(socket_fd_api *p_sfd_api_obj); + inline void reuse_sockfd(int fd, sockinfo *p_sfd_api_obj); + inline void destroy_sockfd(sockinfo *p_sfd_api_obj); /** - * Get sock_fd_api (sockinfo or pipeinfo) by fd. + * Get sock_fd_api (sockinfo) by fd. */ - inline socket_fd_api *get_sockfd(int fd); + inline sockinfo *get_sockfd(int fd); /** * Get epfd_info by fd. @@ -192,12 +184,13 @@ class fd_collection : private lock_mutex_recursive { #if defined(DEFINED_NGINX) bool pop_socket_pool(int &fd, bool &add_to_udp_pool, int type); - void push_socket_pool(socket_fd_api *sockfd); + void push_socket_pool(sockinfo *sockfd); void handle_socket_pool(int fd); #endif private: template int del(int fd, bool b_cleanup, cls **map_type); template inline cls *get(int fd, cls **map_type); + int del_socket(int fd, sockinfo **map_type); inline bool is_valid_fd(int fd); inline bool create_offloaded_sockets(); @@ -213,7 +206,7 @@ class fd_collection : private lock_mutex_recursive { private: int m_n_fd_map_size; - socket_fd_api **m_p_sockfd_map; + sockinfo **m_p_sockfd_map; epfd_info **m_p_epfd_map; cq_channel_info **m_p_cq_channel_map; ring_tap **m_p_tap_map; @@ -230,7 +223,7 @@ class fd_collection : private lock_mutex_recursive { #if defined(DEFINED_NGINX) bool m_use_socket_pool; - std::stack m_socket_pool; + std::stack m_socket_pool; int m_socket_pool_size; int m_socket_pool_counter; #endif @@ -277,7 +270,7 @@ inline bool fd_collection::set_immediate_os_sample(int fd) return false; } -inline void fd_collection::reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj) +inline void fd_collection::reuse_sockfd(int fd, sockinfo *p_sfd_api_obj) { lock(); m_pending_to_remove_lst.erase(p_sfd_api_obj); @@ -286,16 +279,16 @@ inline void fd_collection::reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj) unlock(); } -inline void fd_collection::destroy_sockfd(socket_fd_api *p_sfd_api_obj) +inline void fd_collection::destroy_sockfd(sockinfo *p_sfd_api_obj) { lock(); --g_global_stat_static.n_pending_sockets; m_pending_to_remove_lst.erase(p_sfd_api_obj); - p_sfd_api_obj->clean_obj(); + p_sfd_api_obj->clean_socket_obj(); unlock(); } -inline socket_fd_api *fd_collection::get_sockfd(int fd) +inline sockinfo *fd_collection::get_sockfd(int fd) { return get(fd, m_p_sockfd_map); } @@ -322,12 +315,12 @@ inline int fd_collection::get_fd_map_size() extern fd_collection *g_p_fd_collection; -inline socket_fd_api *fd_collection_get_sockfd(int fd) +inline sockinfo *fd_collection_get_sockfd(int fd) { if (g_p_fd_collection) { return g_p_fd_collection->get_sockfd(fd); } - return NULL; + return nullptr; } inline epfd_info *fd_collection_get_epfd(int fd) @@ -335,7 +328,7 @@ inline epfd_info *fd_collection_get_epfd(int fd) if (g_p_fd_collection) { return g_p_fd_collection->get_epfd(fd); } - return NULL; + return nullptr; } #endif diff --git a/src/core/sock/pipeinfo.cpp b/src/core/sock/pipeinfo.cpp deleted file mode 100644 index 64e0f7d23..000000000 --- a/src/core/sock/pipeinfo.cpp +++ /dev/null @@ 
-1,435 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "utils/bullseye.h" -#include - -#include "sock-redirect.h" - -#include "pipeinfo.h" - -#define MODULE_NAME "pi" -#undef VLOG_PRINTF -#define VLOG_PRINTF(log_level, log_fmt, log_args...) \ - vlog_printf(log_level, "fd[%#x]:%s() " log_fmt "\n", m_fd, __FUNCTION__, ##log_args) -#define VLOG_PRINTF_DETAILS(log_level, log_fmt, log_args...) \ - vlog_printf(log_level, MODULE_NAME ":%d:fd[%#x]:%s() " log_fmt "\n", __LINE__, m_fd, \ - __FUNCTION__, ##log_args) - -#define pi_logpanic(log_fmt, log_args...) \ - VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); \ - throw; -#define pi_logerr(log_fmt, log_args...) VLOG_PRINTF(VLOG_ERROR, log_fmt, ##log_args) -#define pi_logwarn(log_fmt, log_args...) VLOG_PRINTF(VLOG_WARNING, log_fmt, ##log_args) -#define pi_loginfo(log_fmt, log_args...) VLOG_PRINTF(VLOG_INFO, log_fmt, ##log_args) - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) -#define pi_logdbg_no_funcname(log_fmt, log_args...) ((void)0) -#define pi_logdbg(log_fmt, log_args...) ((void)0) -#define si_logdbg_no_funcname(log_fmt, log_args...) ((void)0) -#else -#define pi_logdbg_no_funcname(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_DEBUG) \ - vlog_printf(VLOG_DEBUG, MODULE_NAME ":%d:fd[%d]: " log_fmt "\n", __LINE__, m_fd, ##log_args) -#define pi_logdbg(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_DEBUG) \ - VLOG_PRINTF_DETAILS(VLOG_DEBUG, log_fmt, ##log_args) -#define si_logdbg_no_funcname(log_fmt, log_args...) \ - do { \ - if (g_vlogger_level >= VLOG_DEBUG) \ - vlog_printf(VLOG_DEBUG, MODULE_NAME "[fd=%d]:%d: " log_fmt "\n", m_fd, __LINE__, \ - ##log_args); \ - } while (0) -#endif - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) -#define pi_logfunc(log_fmt, log_args...) ((void)0) -#else -#define pi_logfunc(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_FUNC) \ - VLOG_PRINTF_DETAILS(VLOG_FUNC, log_fmt, ##log_args) -#endif - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) -#define pi_logfuncall(log_fmt, log_args...) ((void)0) -#else -#define pi_logfuncall(log_fmt, log_args...) 
\ - if (g_vlogger_level >= VLOG_FUNC_ALL) \ - VLOG_PRINTF_DETAILS(VLOG_FUNC_ALL, log_fmt, ##log_args) -#endif /* MAX_DEFINED_LOG_LEVEL */ - -pipeinfo::pipeinfo(int fd) - : socket_fd_api(fd) - , m_lock("pipeinfo::m_lock") - , m_lock_rx("pipeinfo::m_lock_rx") - , m_lock_tx("pipeinfo::m_lock_tx") -{ - pi_logfunc(""); - - m_b_closed = true; - m_timer_handle = NULL; - - m_b_blocking = true; - - m_p_socket_stats = NULL; // mce_stats_instance_create_socket_block(); - if (m_p_socket_stats == NULL) { - // pi_logdbg("Got NULL from mce_stats_instance_create_socket_block, using local member"); - m_p_socket_stats = &m_socket_stats; - } - m_p_socket_stats->reset(); - m_p_socket_stats->fd = m_fd; - m_p_socket_stats->b_blocking = m_b_blocking; - m_p_socket_stats->n_rx_ready_pkt_count = 0; - m_p_socket_stats->counters.n_rx_ready_pkt_max = 0; - m_p_socket_stats->n_rx_ready_byte_count = 0; - m_p_socket_stats->n_tx_ready_byte_count = 0; - m_p_socket_stats->counters.n_rx_ready_byte_max = 0; - m_p_socket_stats->n_rx_zcopy_pkt_count = 0; - - m_b_closed = false; - - m_b_lbm_event_q_pipe_timer_on = false; - m_write_count = m_write_count_on_last_timer = 0; - m_write_count_no_change_count = 0; - - pi_logfunc("done"); -} - -pipeinfo::~pipeinfo() -{ - m_b_closed = true; - pi_logfunc(""); - - // Change to non-blocking socket so calling threads can exit - m_b_blocking = false; - - m_lock_tx.lock(); - m_lock_rx.lock(); - m_lock.lock(); - - if (m_timer_handle) { - g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; - } - - statistics_print(); - - m_lock_tx.unlock(); - m_lock_rx.unlock(); - m_lock.unlock(); - - pi_logfunc("done"); -} - -void pipeinfo::clean_obj() -{ - if (is_cleaned()) { - return; - } - - set_cleaned(); - m_timer_handle = NULL; - if (g_p_event_handler_manager->is_running()) { - g_p_event_handler_manager->unregister_timers_event_and_delete(this); - } else { - cleanable_obj::clean_obj(); - } -} - -int pipeinfo::fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit) -{ - - switch (__cmd) { - case F_SETFL: { - pi_logfunc("cmd=F_SETFL, arg=%#x", __cmd, __arg); - if (__arg & O_NONBLOCK) { - pi_logdbg("set to non-blocking mode"); - m_b_blocking = false; - } else { - pi_logdbg("set to blocked mode"); - m_b_blocking = true; - } - m_p_socket_stats->b_blocking = m_b_blocking; - } break; - - case F_GETFL: /* Get file status flags. */ - pi_logfunc("F_GETFL, arg=%#x", __arg); - break; - - case F_GETFD: /* Get file descriptor flags. */ - pi_logfunc("F_GETFD, arg=%#x", __arg); - break; - - case F_SETFD: /* Set file descriptor flags. 
*/ - pi_logfunc("F_SETFD, arg=%#x", __arg); - break; - - default: - pi_logfunc("cmd=%d, arg=%#x", __cmd, __arg); - break; - } - - bexit = false; - return 0; -} - -int pipeinfo::fcntl(int __cmd, unsigned long int __arg) -{ - - bool bexit = false; - int ret_val = fcntl_helper(__cmd, __arg, bexit); - if (bexit) { - return ret_val; - } - - return orig_os_api.fcntl(m_fd, __cmd, __arg); -} - -int pipeinfo::fcntl64(int __cmd, unsigned long int __arg) -{ - - bool bexit = false; - int ret_val = fcntl_helper(__cmd, __arg, bexit); - if (bexit) { - return ret_val; - } - - return orig_os_api.fcntl64(m_fd, __cmd, __arg); -} - -int pipeinfo::ioctl(unsigned long int __request, unsigned long int __arg) -{ - int *p_arg = (int *)__arg; - - switch (__request) { - case FIONBIO: { - if (*p_arg) { - pi_logdbg("FIONBIO, arg=%d - set to non-blocking mode", *p_arg); - m_b_blocking = false; - } else { - pi_logdbg("FIONBIO, arg=%d - set to blocked mode", *p_arg); - m_b_blocking = true; - } - - m_p_socket_stats->b_blocking = m_b_blocking; - } break; - - default: - pi_logfunc("request=%d, arg=%#x", __request, __arg); - break; - } - - return orig_os_api.ioctl(m_fd, __request, __arg); -} - -ssize_t pipeinfo::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) -{ - pi_logfunc(""); - ssize_t ret = - socket_fd_api::rx_os(call_type, p_iov, sz_iov, *p_flags, __from, __fromlen, __msg); - save_stats_rx_os(ret); - return ret; -} - -void pipeinfo::handle_timer_expired(void *user_data) -{ - NOT_IN_USE(user_data); - pi_logfunc("(m_write_count=%d)", m_write_count); - m_lock_tx.lock(); - write_lbm_pipe_enhance(); - m_lock_tx.unlock(); -} - -ssize_t pipeinfo::tx(xlio_tx_call_attr_t &tx_arg) -{ - const iovec *p_iov = tx_arg.attr.iov; - const ssize_t sz_iov = tx_arg.attr.sz_iov; - const int __flags = tx_arg.attr.flags; - const struct sockaddr *__to = tx_arg.attr.addr; - const socklen_t __tolen = tx_arg.attr.len; - ssize_t ret = -1; - - pi_logfunc(""); - m_lock_tx.lock(); - switch (tx_arg.opcode) { - case TX_WRITE: - ret = orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - break; - case TX_SEND: - case TX_SENDTO: - case TX_SENDMSG: - default: - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __to, __tolen); - break; - } - - save_stats_tx_os(ret); - m_lock_tx.unlock(); - return ret; -} - -void pipeinfo::write_lbm_pipe_enhance() -{ - pi_logfunc("(m_write_count=%d)", m_write_count); - - if (m_write_count == m_write_count_on_last_timer) { - // No pipe write happened during the last timer_expired() - m_write_count_no_change_count++; - - // After 3 of these stop timer - if (m_write_count_no_change_count >= 2 && m_b_lbm_event_q_pipe_timer_on) { - if (m_timer_handle) { - g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; - } - m_b_lbm_event_q_pipe_timer_on = false; - - pi_logfunc("pipe_write DONE timer Un-Reg"); - } - } - - m_write_count = 0; - m_write_count_no_change_count = 0; - m_write_count_on_last_timer = 0; - - // Send the buffered data - char buf[10] = "\0"; - orig_os_api.write(m_fd, buf, 1); -} - -void pipeinfo::statistics_print(vlog_levels_t log_level) -{ - bool b_any_activiy = false; - NOT_IN_USE(log_level); - - if (m_p_socket_stats->counters.n_tx_sent_byte_count || - m_p_socket_stats->counters.n_tx_sent_pkt_count || m_p_socket_stats->counters.n_tx_errors || - m_p_socket_stats->counters.n_tx_eagain) { - pi_logdbg_no_funcname( - "Tx Offload: %" PRIu64 " KB / %d / %d / %d 
[kilobytes/packets/errors/eagains]", - m_p_socket_stats->counters.n_tx_sent_byte_count / 1024, - m_p_socket_stats->counters.n_tx_sent_pkt_count, m_p_socket_stats->counters.n_tx_errors, - m_p_socket_stats->counters.n_tx_eagain); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_tx_os_bytes || m_p_socket_stats->counters.n_tx_os_packets || - m_p_socket_stats->counters.n_tx_os_errors) { - pi_logdbg_no_funcname("Tx OS info: %" PRIu64 " KB / %d / %d [kilobytes/packets/errors]", - m_p_socket_stats->counters.n_tx_os_bytes / 1024, - m_p_socket_stats->counters.n_tx_os_packets, - m_p_socket_stats->counters.n_tx_os_errors); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_bytes || m_p_socket_stats->counters.n_rx_packets || - m_p_socket_stats->counters.n_rx_errors || m_p_socket_stats->counters.n_rx_eagain) { - pi_logdbg_no_funcname( - "Rx Offload: %" PRIu64 " KB / %d / %d / %d [kilobytes/packets/errors/eagains]", - m_p_socket_stats->counters.n_rx_bytes / 1024, m_p_socket_stats->counters.n_rx_packets, - m_p_socket_stats->counters.n_rx_errors, m_p_socket_stats->counters.n_rx_eagain); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_os_bytes || m_p_socket_stats->counters.n_rx_os_packets || - m_p_socket_stats->counters.n_rx_os_errors) { - pi_logdbg_no_funcname("Rx OS info: %" PRIu64 " KB / %d / %d [kilobytes/packets/errors]", - m_p_socket_stats->counters.n_rx_os_bytes / 1024, - m_p_socket_stats->counters.n_rx_os_packets, - m_p_socket_stats->counters.n_rx_os_errors); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_poll_miss || m_p_socket_stats->counters.n_rx_poll_hit) { - pi_logdbg_no_funcname("Rx poll: %d / %d (%2.2f%%) [miss/hit]", - m_p_socket_stats->counters.n_rx_poll_miss, - m_p_socket_stats->counters.n_rx_poll_hit, - (float)(m_p_socket_stats->counters.n_rx_poll_hit * 100) / - (float)(m_p_socket_stats->counters.n_rx_poll_miss + - m_p_socket_stats->counters.n_rx_poll_hit)); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_ready_byte_drop) { - si_logdbg_no_funcname( - "Rx byte: max %d / dropped %d (%2.2f%%) [limit is %d]", - m_p_socket_stats->counters.n_rx_ready_byte_max, - m_p_socket_stats->counters.n_rx_ready_byte_drop, - (m_p_socket_stats->counters.n_rx_packets - ? (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / - (float)m_p_socket_stats->counters.n_rx_packets - : 0), - m_p_socket_stats->n_rx_ready_byte_limit); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_ready_pkt_drop) { - si_logdbg_no_funcname("Rx pkt : max %d / dropped %d (%2.2f%%)", - m_p_socket_stats->counters.n_rx_ready_pkt_max, - m_p_socket_stats->counters.n_rx_ready_pkt_drop, - (m_p_socket_stats->counters.n_rx_packets - ? 
(float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / - (float)m_p_socket_stats->counters.n_rx_packets - : 0)); - b_any_activiy = true; - } - if (m_p_socket_stats->strq_counters.n_strq_total_strides) { - si_logdbg_no_funcname("Rx RQ Strides: %" PRIu64 " / %u [total/max-per-packet]\n", - m_p_socket_stats->strq_counters.n_strq_total_strides, - m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet); - b_any_activiy = true; - } - if (b_any_activiy == false) { - pi_logdbg_no_funcname("Rx and Tx where not active"); - } -} - -void pipeinfo::save_stats_rx_os(int bytes) -{ - if (bytes >= 0) { - m_p_socket_stats->counters.n_rx_os_bytes += bytes; - m_p_socket_stats->counters.n_rx_os_packets++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; - } else { - m_p_socket_stats->counters.n_rx_os_errors++; - } -} - -void pipeinfo::save_stats_tx_os(int bytes) -{ - if (bytes >= 0) { - m_p_socket_stats->counters.n_tx_os_bytes += bytes; - m_p_socket_stats->counters.n_tx_os_packets++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; - } else { - m_p_socket_stats->counters.n_tx_os_errors++; - } -} diff --git a/src/core/sock/pipeinfo.h b/src/core/sock/pipeinfo.h deleted file mode 100644 index 59218ff27..000000000 --- a/src/core/sock/pipeinfo.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef PIPEINFO_H -#define PIPEINFO_H - -#include "socket_fd_api.h" -#include "utils/lock_wrapper.h" -#include -#include - -class pipeinfo : public socket_fd_api, public timer_handler { -public: - pipeinfo(int fd); - ~pipeinfo(); - - virtual void clean_obj(); - -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - virtual void copy_sockopt_fork(const socket_fd_api *copy_from) { NOT_IN_USE(copy_from); } -#endif - - int fcntl(int __cmd, unsigned long int __arg); - int fcntl64(int __cmd, unsigned long int __arg); - int ioctl(unsigned long int __request, unsigned long int __arg); - - // Process a Rx request, we might have a ready packet, or we might block until - // we have one (if sockinfo::m_b_blocking == true) - ssize_t rx(const rx_call_t call_type, struct iovec *p_iov, ssize_t sz_iov, int *p_flags, - struct sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL); - - // Process a Tx request, handle all that is needed to send the packet, we might block - // until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == - // true) - ssize_t tx(xlio_tx_call_attr_t &tx_arg); - - void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - - virtual inline fd_type_t get_type() { return FD_TYPE_PIPE; } - -private: - bool m_b_blocking; - - // Main mutex to protect from multi threaded access to sockinfo from sock-redirect - bool m_b_closed; - lock_mutex m_lock; - lock_mutex m_lock_rx; - lock_mutex m_lock_tx; - - socket_stats_t m_socket_stats; - socket_stats_t *m_p_socket_stats; - - void *m_timer_handle; - - int m_write_count; - int m_write_count_on_last_timer; - int m_write_count_no_change_count; - bool m_b_lbm_event_q_pipe_timer_on; - - void handle_timer_expired(void *user_data); - - void write_lbm_pipe_enhance(); - - void save_stats_rx_os(int bytes); - void save_stats_tx_os(int bytes); - - int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); -}; - -#endif diff --git a/src/core/sock/pkt_rcvr_sink.h b/src/core/sock/pkt_rcvr_sink.h deleted file mode 100644 index af59859cf..000000000 --- a/src/core/sock/pkt_rcvr_sink.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef PKT_RECVR_SINK_H -#define PKT_RECVR_SINK_H - -class mem_buf_desc_t; -class ring; - -/* - * Class pkt_rcvr_sink - * An object must implement pkt_rcvr_sink to register with ib_conn_mgr_base - * The rx_joined_notify_cb() will be called when the IBCM is ready to start - * receiving packets (MC join is complete and CQ is mapped). - * The rx_diconnect_notify_cb() will be called before the IB stops receiving - * packets (CQ is being removed and MC leave is called). - * The rx_pkt_notify_cb() will be called when a ip packet is in the ready q for the socket. - * The implementing object should register the information and release calling context immediately. - * When no packet receivers (or transmitters) are registered the objects will be deleted - */ -class pkt_rcvr_sink { -public: - virtual ~pkt_rcvr_sink() {}; - - // Callback from lower layer notifying new receive packets - // Return: 'true' if object queuing this receive packet - // 'false' if not interested in this receive packet - virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, - void *pv_fd_ready_array) = 0; - - // Callback from lower layer notifying completion of RX registration process - virtual void rx_add_ring_cb(ring *p_ring) = 0; - - // Callback from lower layer notifying before RX resources deallocation - virtual void rx_del_ring_cb(ring *p_ring) = 0; -}; - -#endif diff --git a/src/core/sock/pkt_sndr_source.h b/src/core/sock/pkt_sndr_source.h deleted file mode 100644 index 8198afcfb..000000000 --- a/src/core/sock/pkt_sndr_source.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef PKT_SNDR_SOURCE_H -#define PKT_SNDR_SOURCE_H - -/** - * @class pkt_sndr_source - * An object must implement pkt_sndr_source to register with ib_conn_mgr_base - * When no packet transmitters (or receivers) are registered the objects will be - * deleted. 
- */ -class pkt_sndr_source { -public: - virtual ~pkt_sndr_source() {}; -}; - -#endif diff --git a/src/core/sock/sock-app.cpp b/src/core/sock/sock-app.cpp index 3f2953d8a..477abbae8 100644 --- a/src/core/sock/sock-app.cpp +++ b/src/core/sock/sock-app.cpp @@ -34,7 +34,8 @@ #include "config.h" #endif -#include +#include +#include #include #include #include @@ -61,10 +62,10 @@ map_udp_bounded_port_t g_map_udp_bounded_port; static int init_worker(int worker_id, int listen_fd); -struct app_conf *g_p_app = NULL; +struct app_conf *g_p_app = nullptr; #if defined(DEFINED_NGINX) -int app_conf::proc_nginx(void) +int app_conf::proc_nginx() { int rc = 0; @@ -80,7 +81,7 @@ int app_conf::proc_nginx(void) */ fd_collection *p_fd_collection = (fd_collection *)g_p_app->context; for (int fd = 0; fd < p_fd_collection->get_fd_map_size(); fd++) { - socket_fd_api *sock_fd_api = p_fd_collection->get_sockfd(fd); + sockinfo *sock_fd_api = p_fd_collection->get_sockfd(fd); if (!sock_fd_api || !dynamic_cast(sock_fd_api)) { continue; } @@ -105,7 +106,7 @@ int app_conf::proc_envoy(int __op, int __fd) /* Prcess only sockets from map_listen_fd */ auto iter = g_p_app->map_listen_fd.find(__fd); if (iter != g_p_app->map_listen_fd.end()) { - socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(__fd); if (iter->second == gettid()) { /* process listen sockets from main thread and remove * them from map_listen_fd @@ -193,7 +194,7 @@ static int init_worker(int worker_id, int listen_fd) app_logdbg("worker: %d fd: %d", worker_id, listen_fd); int ret = 0; - socket_fd_api *child_sock_fd_api = nullptr; + sockinfo *child_sock_fd_api = nullptr; int parent_fd = listen_fd; fd_collection *p_fd_collection = (fd_collection *)g_p_app->context; @@ -235,7 +236,7 @@ static int init_worker(int worker_id, int listen_fd) * Nginx: parent_fd is equal to listen_fd */ sockinfo *si; - socket_fd_api *parent_sock_fd_api = p_fd_collection->get_sockfd(parent_fd); + sockinfo *parent_sock_fd_api = p_fd_collection->get_sockfd(parent_fd); if (!parent_sock_fd_api || !(si = dynamic_cast(parent_sock_fd_api))) { app_logerr("parent sockinfo is not found"); return -1; @@ -253,7 +254,7 @@ static int init_worker(int worker_id, int listen_fd) if (child_sock_fd_api) { child_sock_fd_api->copy_sockopt_fork(parent_sock_fd_api); - ret = bind(listen_fd, sa.get_p_sa(), sa_len); + ret = bind_internal(child_sock_fd_api, sa.get_p_sa(), sa_len); if (ret < 0) { app_logerr("bind() error"); } diff --git a/src/core/sock/sock-app.h b/src/core/sock/sock-app.h index 9dc3611e8..9f7e66397 100644 --- a/src/core/sock/sock-app.h +++ b/src/core/sock/sock-app.h @@ -78,12 +78,12 @@ struct app_conf { map_thread_id.clear(); map_dup_fd.clear(); unused_worker_id.clear(); - context = NULL; + context = nullptr; setup(); } - ~app_conf() {} + ~app_conf() = default; void setup() { @@ -112,7 +112,7 @@ struct app_conf { } #if defined(DEFINED_NGINX) - int proc_nginx(void); + int proc_nginx(); #endif /* DEFINED_NGINX */ #if defined(DEFINED_ENVOY) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 83608ae3c..0a658640e 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -38,13 +38,15 @@ #include #include #include -#include -#include +#include +#include +#include #include #include #include #include "sock/sock-extra.h" +#include "xlio.h" #define MODULE_NAME "extra:" @@ -61,7 +63,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __callback, void *__context) { - 
socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && !safe_mce_sys().enable_socketxtreme) { p_socket_object->register_callback(__callback, __context); @@ -74,7 +76,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __call extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int *__flags, struct sockaddr *__from, socklen_t *__fromlen) { - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -83,18 +85,13 @@ extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int * *__flags |= MSG_XLIO_ZCOPY; return p_socket_object->rx(RX_RECVFROM, piov, 1, __flags, __from, __fromlen); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvfrom) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvfrom(__fd, __buf, __nbytes, *__flags, __from, __fromlen); + return SYSCALL(recvfrom, __fd, __buf, __nbytes, *__flags, __from, __fromlen); } extern "C" int xlio_recvfrom_zcopy_free_packets(int __fd, struct xlio_recvfrom_zcopy_packet_t *pkts, size_t count) { - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->recvfrom_zcopy_free_packets(pkts, count); @@ -123,12 +120,12 @@ extern "C" int xlio_socketxtreme_poll(int fd, struct xlio_socketxtreme_completio unsigned int ncompletions, int flags) { int ret_val = -1; - cq_channel_info *cq_ch_info = NULL; + cq_channel_info *cq_ch_info = nullptr; cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } if (likely(cq_ch_info)) { @@ -158,8 +155,8 @@ static int dummy_xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_ extern "C" int xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_desc_t *packets, int num) { - mem_buf_desc_t *desc = NULL; - sockinfo_tcp *p_socket_object = NULL; + mem_buf_desc_t *desc = nullptr; + sockinfo_tcp *p_socket_object = nullptr; if (likely(packets)) { for (int i = 0; i < num; i++) { @@ -203,7 +200,7 @@ static int dummy_xlio_socketxtreme_ref_buff(xlio_buff_t *buff) extern "C" int xlio_socketxtreme_ref_buff(xlio_buff_t *buff) { int ret_val = 0; - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(buff)) { desc = (mem_buf_desc_t *)buff; @@ -229,7 +226,7 @@ static int dummy_xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) { int ret_val = 0; - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(buff)) { desc = (mem_buf_desc_t *)buff; @@ -244,32 +241,19 @@ extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_get_socket_rings_num(int fd) { - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(fd); - if (p_socket_object && p_socket_object->check_rings()) { - return p_socket_object->get_rings_num(); - } - - return 0; + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); + return p_socket_object ? 
p_socket_object->get_rings_num() : 0; } extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) { - if (ring_fds_sz <= 0 || ring_fds == NULL) { + if (ring_fds_sz <= 0 || !ring_fds) { errno = EINVAL; return -1; } - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); - if (p_socket_object && p_socket_object->check_rings()) { - int rings_num = 0; - int *p_rings_fds = p_socket_object->get_rings_fds(rings_num); - int num_rings_to_copy = min(ring_fds_sz, rings_num); - std::copy(&p_rings_fds[0], &p_rings_fds[num_rings_to_copy], ring_fds); - return num_rings_to_copy; - } - - return 0; + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); + return p_socket_object ? p_socket_object->get_rings_fds(ring_fds, ring_fds_sz) : 0; } extern "C" int xlio_add_conf_rule(const char *config_line) @@ -309,13 +293,13 @@ static inline struct cmsghdr *__cmsg_nxthdr(void *__ctl, size_t __size, struct c __ptr = (struct cmsghdr *)(((unsigned char *)__cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char *)(__ptr + 1) - (char *)__ctl) > __size) { - return NULL; + return nullptr; } return __ptr; } -extern "C" int xlio_ioctl(void *cmsg_hdr, size_t cmsg_len) +extern "C" int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len) { struct cmsghdr *cmsg = (struct cmsghdr *)cmsg_hdr; @@ -347,11 +331,11 @@ extern "C" int xlio_ioctl(void *cmsg_hdr, size_t cmsg_len) return 0; } -struct xlio_api_t *extra_api(void) +struct xlio_api_t *extra_api() { - static struct xlio_api_t *xlio_api = NULL; + static struct xlio_api_t *xlio_api = nullptr; - if (NULL == xlio_api) { + if (!xlio_api) { bool enable_socketxtreme = safe_mce_sys().enable_socketxtreme; xlio_api = new struct xlio_api_t(); @@ -386,8 +370,242 @@ struct xlio_api_t *extra_api(void) enable_socketxtreme ? xlio_socketxtreme_free_buff : dummy_xlio_socketxtreme_free_buff, XLIO_EXTRA_API_SOCKETXTREME_FREE_XLIO_BUFF); SET_EXTRA_API(dump_fd_stats, xlio_dump_fd_stats, XLIO_EXTRA_API_DUMP_FD_STATS); - SET_EXTRA_API(ioctl, xlio_ioctl, XLIO_EXTRA_API_IOCTL); + SET_EXTRA_API(ioctl, xlio_extra_ioctl, XLIO_EXTRA_API_IOCTL); + + // XLIO Socket API. 
+ SET_EXTRA_API(xlio_init_ex, xlio_init_ex, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_create, xlio_poll_group_create, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_destroy, xlio_poll_group_destroy, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_poll, xlio_poll_group_poll, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_create, xlio_socket_create, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_destroy, xlio_socket_destroy, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_setsockopt, xlio_socket_setsockopt, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_bind, xlio_socket_bind, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_connect, xlio_socket_connect, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_get_pd, xlio_socket_get_pd, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_send, xlio_socket_send, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_sendv, xlio_socket_sendv, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_flush, xlio_poll_group_flush, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_flush, xlio_socket_flush, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_buf_free, xlio_socket_buf_free, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_buf_free, xlio_poll_group_buf_free, + XLIO_EXTRA_API_XLIO_SOCKET); } return xlio_api; } + +/* + * Storage API + */ + +extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) +{ + // Set XLIO Socket API specific parameters unless user sets them explicitly + if (!getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) { + setenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL, "0", 1); + } + + // Read the updated parameters. A global object could trigger the reading earlier. 
+ safe_mce_sys().get_env_params(); + + xlio_init(); + + extern xlio_memory_cb_t g_user_memory_cb; + g_user_memory_cb = attr->memory_cb; + + if (attr->memory_alloc) { + safe_mce_sys().m_ioctl.user_alloc.flags = IOCTL_USER_ALLOC_TX | IOCTL_USER_ALLOC_RX; + safe_mce_sys().m_ioctl.user_alloc.memalloc = attr->memory_alloc; + safe_mce_sys().m_ioctl.user_alloc.memfree = attr->memory_free; + safe_mce_sys().memory_limit_user = + std::max(safe_mce_sys().memory_limit_user, safe_mce_sys().memory_limit); + } + + DO_GLOBAL_CTORS(); + + return 0; +} + +extern "C" int xlio_poll_group_create(const struct xlio_poll_group_attr *attr, + xlio_poll_group_t *group_out) +{ + // Validate input arguments + if (!group_out || !attr || !attr->socket_event_cb) { + errno = EINVAL; + return -1; + } + + poll_group *grp = new poll_group(attr); + if (!grp) { + errno = ENOMEM; + return -1; + } + + *group_out = reinterpret_cast(grp); + return 0; +} + +extern "C" int xlio_poll_group_destroy(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + + delete grp; + return 0; +} + +extern "C" void xlio_poll_group_poll(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + + grp->poll(); +} + +extern "C" int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out) +{ + // Validate input arguments + if (!sock_out || !attr || !attr->group || + !(attr->domain == AF_INET || attr->domain == AF_INET6)) { + errno = EINVAL; + return -1; + } + + int fd = SYSCALL(socket, attr->domain, SOCK_STREAM, 0); + if (fd < 0) { + return -1; + } + + sockinfo_tcp *si = new sockinfo_tcp(fd, attr->domain); + if (!si) { + errno = ENOMEM; + return -1; + } + si->set_xlio_socket(attr); + + poll_group *grp = reinterpret_cast(attr->group); + grp->add_socket(si); + + *sock_out = reinterpret_cast(si); + return 0; +} + +extern "C" int xlio_socket_destroy(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + poll_group *grp = si->get_poll_group(); + + if (likely(grp)) { + // We always force TCP reset not to handle FIN handshake and TIME-WAIT state. + grp->close_socket(si, true); + } else { + return XLIO_CALL(close, si->get_fd()); + } + return 0; +} + +extern "C" int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname, + const void *optval, socklen_t optlen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; + + int rc = si->setsockopt(level, optname, optval, optlen); + if (rc == 0) { + errno = errno_save; + } + return rc; +} + +extern "C" int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; + + int rc = si->bind(addr, addrlen); + if (rc == 0) { + errno = errno_save; + } + return rc; +} + +extern "C" int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; + + int rc = si->connect(to, tolen); + rc = (rc == -1 && (errno == EINPROGRESS || errno == EAGAIN)) ? 0 : rc; + if (rc == 0) { + si->add_tx_ring_to_group(); + errno = errno_save; + } + return rc; +} + +extern "C" struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + ib_ctx_handler *ctx = si->get_ctx(); + + return ctx ? 
ctx->get_ibv_pd() : nullptr; +} + +static void xlio_buf_free(struct xlio_buf *buf) +{ + // TODO Use mem_buf_desc_t field as xlio_buf + mem_buf_desc_t *desc = reinterpret_cast(buf); + ring_slave *rng = desc->p_desc_owner; + + desc->p_next_desc = nullptr; + bool ret = rng->reclaim_recv_buffers(desc); + if (unlikely(!ret)) { + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(desc); + } +} + +extern "C" void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf) +{ + NOT_IN_USE(sock); + xlio_buf_free(buf); +} + +extern "C" void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf) +{ + NOT_IN_USE(group); + xlio_buf_free(buf); +} + +extern "C" int xlio_socket_send(xlio_socket_t sock, const void *data, size_t len, + const struct xlio_socket_send_attr *attr) +{ + const struct iovec iov = {.iov_base = const_cast(data), .iov_len = len}; + + return xlio_socket_sendv(sock, &iov, 1, attr); +} + +extern "C" int xlio_socket_sendv(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt, + const struct xlio_socket_send_attr *attr) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + + unsigned flags = XLIO_EXPRESS_OP_TYPE_DESC; + flags |= !(attr->flags & XLIO_SOCKET_SEND_FLAG_FLUSH) * XLIO_EXPRESS_MSG_MORE; + + int rc = (attr->flags & XLIO_SOCKET_SEND_FLAG_INLINE) + ? si->tcp_tx_express_inline(iov, iovcnt, flags) + : si->tcp_tx_express(iov, iovcnt, attr->mkey, flags, + reinterpret_cast(attr->userdata_op)); + return rc < 0 ? rc : 0; +} + +extern "C" void xlio_poll_group_flush(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + grp->flush(); +} + +extern "C" void xlio_socket_flush(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + si->flush(true); +} diff --git a/src/core/sock/sock-extra.h b/src/core/sock/sock-extra.h index 71c03bd90..9a5fa49d4 100644 --- a/src/core/sock/sock-extra.h +++ b/src/core/sock/sock-extra.h @@ -35,6 +35,6 @@ #include "xlio_extra.h" -struct xlio_api_t *extra_api(void); +struct xlio_api_t *extra_api(); #endif /* _SOCK_EXTRA_H_ */ diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 0b8d08509..8d2bd3855 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -35,8 +35,12 @@ #include "sock-redirect.h" #include "sock-extra.h" #include "sock-app.h" +#include "xlio.h" +#include #include +#include +#include #include #include #include @@ -45,7 +49,7 @@ #include "utils/lock_wrapper.h" #include #include -#include +#include #include #include #include @@ -82,10 +86,8 @@ using namespace std; #define srdr_logfunc_exit __log_exit_func #define EP_MAX_EVENTS (int)((INT_MAX / sizeof(struct epoll_event))) - -struct os_api orig_os_api; struct sigaction g_act_prev; -sighandler_t g_sighandler = NULL; +sighandler_t g_sighandler = nullptr; class ring_simple; class ring_eth_direct; @@ -96,6 +98,8 @@ template void assign_dlsym(T &ptr, const char *name) #define FD_MAP_SIZE (g_p_fd_collection ? 
g_p_fd_collection->get_fd_map_size() : 1024) +#ifndef XLIO_STATIC_BUILD +struct os_api orig_os_api; #define GET_ORIG_FUNC(__name) \ if (!orig_os_api.__name) { \ dlerror(); \ @@ -115,15 +119,6 @@ template void assign_dlsym(T &ptr, const char *name) } \ } -#define VERIFY_PASSTROUGH_CHANGED(__ret, __func_and_params__) \ - do { \ - bool passthrough = p_socket_object->isPassthrough(); \ - __ret = __func_and_params__; \ - if (!passthrough && p_socket_object->isPassthrough()) { \ - handle_close(__fd, false, true); \ - } \ - } while (0); - void get_orig_funcs() { // Save pointer to original functions @@ -177,7 +172,6 @@ void get_orig_funcs() GET_ORIG_FUNC(creat); GET_ORIG_FUNC(dup); GET_ORIG_FUNC(dup2); - GET_ORIG_FUNC(clone); GET_ORIG_FUNC(fork); GET_ORIG_FUNC(vfork); GET_ORIG_FUNC(daemon); @@ -188,6 +182,16 @@ void get_orig_funcs() GET_ORIG_FUNC(waitpid); #endif // DEFINED_NGINX } +#endif /* XLIO_STATIC_BUILD */ + +#define VERIFY_PASSTROUGH_CHANGED(__ret, __func_and_params__) \ + do { \ + bool passthrough = p_socket_object->isPassthrough(); \ + __ret = __func_and_params__; \ + if (!passthrough && p_socket_object->isPassthrough()) { \ + handle_close(__fd, false, true); \ + } \ + } while (0); const char *socket_get_domain_str(int domain) { @@ -236,7 +240,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) bool to_close_now = true; bool is_for_udp_pool = false; - srdr_logfunc("Cleanup fd=%d", fd); + srdr_logfunc("Cleanup fd=%d cleanup=%d", fd, !!cleanup); if (g_zc_cache) { g_zc_cache->handle_close(fd); @@ -246,7 +250,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) // Remove fd from all existing epoll sets g_p_fd_collection->remove_from_all_epfds(fd, passthrough); - socket_fd_api *sockfd = fd_collection_get_sockfd(fd); + sockinfo *sockfd = fd_collection_get_sockfd(fd); if (sockfd) { // Don't call close(2) for objects without a shadow socket (TCP incoming sockets). to_close_now = !passthrough && sockfd->is_shadow_socket_present(); @@ -254,7 +258,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) // Save this value before pointer is destructed is_for_udp_pool = sockfd->m_is_for_socket_pool; #endif - g_p_fd_collection->del_sockfd(fd, cleanup, is_for_udp_pool); + g_p_fd_collection->del_sockfd(fd, is_for_udp_pool); if (safe_mce_sys().deferred_close) { to_close_now = false; } @@ -279,15 +283,6 @@ bool handle_close(int fd, bool cleanup, bool passthrough) //----------------------------------------------------------------------------- // replacement functions //----------------------------------------------------------------------------- - -/* Create a new socket of type TYPE in domain DOMAIN, using - protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically. - Returns a file descriptor for the new socket, or -1 for errors. */ -extern "C" EXPORT_SYMBOL int socket(int __domain, int __type, int __protocol) -{ - return socket_internal(__domain, __type, __protocol, true, true); -} - /* Internal logic of socket() syscall implementation. It can be called from within XLIO, for example, to create a socket for an incoming TCP connection. 
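The GET_ORIG_FUNC()/get_orig_funcs() machinery kept above exists only in the preload build; under XLIO_STATIC_BUILD the orig_os_api table is compiled out entirely. For readers unfamiliar with the pattern, this is the usual dlsym(RTLD_NEXT) interposition idea in a stripped-down sketch (not the literal macro expansion; link with -ldl):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>
#include <unistd.h>

/* Pointer to the next (libc) implementation of close(), resolved lazily. */
static int (*real_close)(int) = NULL;

int close(int fd)
{
    if (!real_close) {
        dlerror(); /* clear any stale error, as GET_ORIG_FUNC() does */
        real_close = (int (*)(int))dlsym(RTLD_NEXT, "close");
        if (!real_close) {
            fprintf(stderr, "dlsym(close) failed: %s\n", dlerror());
            _exit(1);
        }
    }
    /* ... library bookkeeping (fd tracking, offload decisions) goes here ... */
    return real_close(fd);
}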
*/ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool check_offload) @@ -301,11 +296,6 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool } PROFILE_BLOCK("socket") - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.socket) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END #if defined(DEFINED_NGINX) bool add_to_udp_pool = false; if (g_p_app && g_p_app->type == APP_NGINX && g_p_fd_collection && offload_sockets && @@ -316,7 +306,7 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool fd = SOCKET_FAKE_FD; if (shadow || !offload_sockets || !g_p_fd_collection) { - fd = orig_os_api.socket(__domain, __type, __protocol); + fd = SYSCALL(socket, __domain, __type, __protocol); vlog_printf(VLOG_DEBUG, "ENTER: %s(domain=%s(%d), type=%s(%d), protocol=%d) = %d\n", __func__, socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, __protocol, fd); @@ -344,225 +334,675 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool return fd; } -extern "C" EXPORT_SYMBOL int close(int __fd) +int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen) { - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.close) { - get_orig_funcs(); + auto p_socket_object = reinterpret_cast(sock); + int ret = p_socket_object->bind(addr, addrlen); + if (p_socket_object->isPassthrough()) { + int fd = p_socket_object->get_fd(); + handle_close(fd, false, true); + if (ret) { + ret = SYSCALL(bind, fd, addr, addrlen); + } } - BULLSEYE_EXCLUDE_BLOCK_END - - srdr_logdbg_entry("fd=%d", __fd); - - bool toclose = handle_close(__fd); - int rc = toclose ? orig_os_api.close(__fd) : 0; - - return rc; + return ret; } -extern "C" EXPORT_SYMBOL void __res_iclose(res_state statp, bool free_addr) +ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags) { - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__res_iclose) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - /* Current implementation doesn't handle XLIO sockets without a shadow socket or from a socket - pool. If such a socket is present in the nssocks list, system __res_iclose() will close the - fd. This will break the socket functionality. - Assume that resolver doesn't use the above scenarios. */ + auto p_socket_object = reinterpret_cast(sock); + xlio_tx_call_attr_t tx_arg; - srdr_logdbg_entry(""); - for (int ns = 0; ns < statp->_u._ext.nscount; ns++) { - int sock = statp->_u._ext.nssocks[ns]; - if (sock != -1) { - handle_close(sock); + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.iov = __msg->msg_iov; + tx_arg.attr.sz_iov = (ssize_t)__msg->msg_iovlen; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)(__CONST_SOCKADDR_ARG)__msg->msg_name; + tx_arg.attr.len = (socklen_t)__msg->msg_namelen; + tx_arg.attr.hdr = __msg; + tx_arg.priv.attr = PBUF_DESC_NONE; + + if (0 < __msg->msg_controllen) { + struct cmsghdr *cmsg = CMSG_FIRSTHDR((struct msghdr *)__msg); + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_XLIO_PD || cmsg->cmsg_type == SCM_XLIO_NVME_PD)) { + if ((tx_arg.attr.flags & MSG_ZEROCOPY) && + (__msg->msg_iovlen == + ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(struct xlio_pd_key)))) { + tx_arg.priv.attr = + (cmsg->cmsg_type == SCM_XLIO_PD) ? 
PBUF_DESC_MKEY : PBUF_DESC_NVME_TX; + tx_arg.priv.opaque = (void *)CMSG_DATA(cmsg); + } else { + errno = EINVAL; + return -1; + } } } - orig_os_api.__res_iclose(statp, free_addr); + + return p_socket_object->tx(tx_arg); } -/* Shut down all or part of the connection open on socket FD. - HOW determines what to shut down: - SHUT_RD = No more receptions; - SHUT_WR = No more transmissions; - SHUT_RDWR = No more receptions or transmissions. - Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int shutdown(int __fd, int __how) +static ssize_t sendfile_helper(sockinfo *p_socket_object, int in_fd, __off64_t *offset, + size_t count) { - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); + ssize_t totSent = 0; + struct stat64 stat_buf; + __off64_t orig_offset = 0; + __off64_t cur_offset; + struct iovec piov[1]; + xlio_tx_call_attr_t tx_arg; + sockinfo *s = (sockinfo *)p_socket_object; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - return p_socket_object->shutdown(__how); + if (p_socket_object->get_type() != FD_TYPE_SOCKET) { + errno = EBADF; + return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.shutdown) { - get_orig_funcs(); + if (!offset) { + orig_offset = lseek64(in_fd, 0, SEEK_CUR); + if (orig_offset < 0) { + errno = ESPIPE; + return -1; + } + cur_offset = orig_offset; + } else { + cur_offset = *offset; } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.shutdown(__fd, __how); -} + if (PROTO_TCP == s->get_protocol()) { + mapping_t *mapping; + int rc; -extern "C" EXPORT_SYMBOL int listen(int __fd, int backlog) -{ - PROFILE_FUNC + /* Get mapping from the cache */ + mapping = g_zc_cache->get_mapping(in_fd); + if (!mapping) { + srdr_logdbg("Couldn't allocate mapping object"); + goto fallback; + } - srdr_logdbg_entry("fd=%d, backlog=%d", __fd, backlog); + if ((__off64_t)mapping->m_size < (__off64_t)(cur_offset + count)) { + struct stat st_buf; -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app && g_p_app->type != APP_NONE) { - /* Envoy: - * Socket handling - * Envoy uses the following procedure for creating sockets and assigning them to workers. - * - * When a listener is created, a socket is pre-created for every worker on the main thread. - * This allows most errors to be caught early on in the listener creation process (e.g., bad - * socket option, unable to bind, etc.). - * - If using reuse_port, a unique socket is created for every worker. - * - If not using reuse_port, a unique socket is created for worker 0, and then that socket - * is duplicated for all other workers. - * a listener can close() its sockets when removed without concern for other listeners. - * - * Implementation: - * - reuse_port(false) : - * Envoy uses dup() call for listen socket on workers_N (N > 0) - * dup() call does not create socket object and does not store fd - * in fd_collection in current implementation - * so as a result duplicated fd is not returned by fd_collection_get_sockfd(__fd) and - * listen() call for duplicated fds are ignored. - * Original listen socket is not ignored by listen() function. - * - reuse_port(true) : - * dup() is not used. Unique socket is created for every worker. - * - * Store all duplicated fd in map_dup_fd with reference to original fd - * Store all listen fd in map_listen_fd with tid - * Identify correct listen fd during epoll_ctl(ADD) call by tid. It should be different. 
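sendmsg_internal() above accepts exactly one struct xlio_pd_key per iovec entry through a SOL_SOCKET control message of type SCM_XLIO_PD (or SCM_XLIO_NVME_PD), and only together with MSG_ZEROCOPY; anything else is rejected with EINVAL. A hedged caller-side sketch, assuming SCM_XLIO_PD and struct xlio_pd_key come from xlio_extra.h:

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>
#include "xlio_extra.h" /* SCM_XLIO_PD, struct xlio_pd_key */

/* Send one pre-registered buffer zero-copy; 'key' describes its memory key. */
static ssize_t send_zc_pd(int fd, void *buf, size_t len, struct xlio_pd_key key)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    union {
        struct cmsghdr hdr;
        char raw[CMSG_SPACE(sizeof(struct xlio_pd_key))];
    } control;
    memset(&control, 0, sizeof(control));

    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1; /* must equal the number of xlio_pd_key entries */
    msg.msg_control = control.raw;
    msg.msg_controllen = sizeof(control.raw);

    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_XLIO_PD;
    cmsg->cmsg_len = CMSG_LEN(sizeof(struct xlio_pd_key));
    memcpy(CMSG_DATA(cmsg), &key, sizeof(key));

    return sendmsg(fd, &msg, MSG_ZEROCOPY); /* MSG_ZEROCOPY is mandatory for this path */
}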
- * Set worker id in map_thread_id basing on tid - * - * Nginx: - * Nginx store all listen fd in map_listen_fd to proceed later in children processes - * after fork() call. - * Set worker id in map_thread_id basing on tid(pid). Nginx has single thread per process so - * tid and pid should be equal. - */ - std::lock_guardm_lock)> lock(g_p_app->m_lock); - g_p_app->map_listen_fd[__fd] = gettid(); - } -#endif /* DEFINED_ENVOY */ + /* + * This is slow path, we check fstat(2) to handle the + * scenario when user changes the file while respective + * mapping exists and the file becomes larger. + * As workaround, fallback to preadv() implementation. + */ + mapping->put(); + rc = fstat(in_fd, &st_buf); + if ((rc == 0) && (st_buf.st_size >= (off_t)(cur_offset + count))) { + s->get_sock_stats()->counters.n_tx_sendfile_overflows++; + goto fallback; + } else { + errno = EOVERFLOW; + return -1; + } + } - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); + piov[0].iov_base = (char *)mapping->m_addr + cur_offset; + piov[0].iov_len = count; - if (p_socket_object) { - // for verifying that the socket is really offloaded - int ret = p_socket_object->prepareListen(); - if (ret < 0) { - return ret; // error + tx_arg.opcode = TX_FILE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = MSG_ZEROCOPY; + tx_arg.priv.attr = PBUF_DESC_MDESC; + tx_arg.priv.mdesc = (void *)mapping; + totSent = p_socket_object->tx(tx_arg); + + mapping->put(); + fallback: + /* Fallback to readv() implementation */ + if (totSent == 0) { + s->get_sock_stats()->counters.n_tx_sendfile_fallbacks++; + tx_arg.clear(); + tx_arg.opcode = TX_FILE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.priv.attr = PBUF_DESC_FD; + tx_arg.priv.fd = in_fd; + piov[0].iov_base = (void *)&cur_offset; + piov[0].iov_len = count; + totSent = p_socket_object->tx(tx_arg); } - if (ret > 0) { // Passthrough - handle_close(__fd, false, true); - } else { -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app && g_p_app->type != APP_NONE) { - p_socket_object->m_back_log = backlog; - } else -#endif - { - return p_socket_object->listen(backlog); - } + } else { + __off64_t pa_offset = 0; + size_t pa_count = 0; + struct flock64 lock; + + if ((fstat64(in_fd, &stat_buf) == -1) || + ((__off64_t)stat_buf.st_size < (__off64_t)(cur_offset + count))) { + errno = EOVERFLOW; + return -1; } - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.listen) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END + tx_arg.opcode = TX_WRITE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; - srdr_logdbg("OS listen fd=%d, backlog=%d", __fd, backlog); - return orig_os_api.listen(__fd, backlog); -} + /* The off argument of mmap() is constrained to be aligned and + * sized according to the value returned by sysconf() + */ + pa_offset = cur_offset & ~(sysconf(_SC_PAGE_SIZE) - 1); + pa_count = count + cur_offset - pa_offset; -extern "C" EXPORT_SYMBOL int accept(int __fd, struct sockaddr *__addr, socklen_t *__addrlen) -{ - PROFILE_FUNC + lock.l_type = F_RDLCK; + lock.l_whence = SEEK_SET; + lock.l_start = pa_offset; + lock.l_len = pa_count; + lock.l_pid = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - return p_socket_object->accept(__addr, __addrlen); - } + /* try to use mmap() approach */ + if (-1 != (XLIO_CALL(fcntl, in_fd, F_SETLK, &lock))) { + void *addr = nullptr; + addr = + mmap64(nullptr, pa_count, PROT_READ, MAP_SHARED | 
MAP_NORESERVE, in_fd, pa_offset); + if (MAP_FAILED != addr) { + ssize_t toRead, numSent = 0; - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.accept) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END + while (count > 0) { + toRead = min(sysconf(_SC_PAGE_SIZE), (ssize_t)count); - return orig_os_api.accept(__fd, __addr, __addrlen); -} + piov[0].iov_base = (void *)((uintptr_t)addr + cur_offset - pa_offset + totSent); + piov[0].iov_len = toRead; -extern "C" EXPORT_SYMBOL int accept4(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, - int __flags) -{ - PROFILE_FUNC + numSent = p_socket_object->tx(tx_arg); + if (numSent == -1) { + break; + } - socket_fd_api *p_socket_object = NULL; + count -= numSent; + totSent += numSent; + } + (void)munmap(addr, pa_count); + } + lock.l_type = F_UNLCK; + (void)XLIO_CALL(fcntl, in_fd, F_SETLK, &lock); + } + + /* fallback on read() approach */ + if (totSent == 0) { + char buf[sysconf(_SC_PAGE_SIZE)]; + ssize_t toRead, numRead, numSent = 0; + + s->get_sock_stats()->counters.n_tx_sendfile_fallbacks++; + + while (count > 0) { + toRead = min(sizeof(buf), count); + numRead = pread(in_fd, buf, toRead, cur_offset + totSent); + if (numRead <= 0) { + if (numRead < 0 && totSent == 0) { + totSent = -1; + } + break; + } + + piov[0].iov_base = (void *)buf; + piov[0].iov_len = numRead; + + numSent = p_socket_object->tx(tx_arg); + if (numSent == -1) { + break; + } + + count -= numSent; + totSent += numSent; + } + } + } + + if (totSent > 0) { + if (offset) { + *offset = *offset + totSent; + } else { + (void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET); + } + } + + return totSent; +} + +// Format a fd_set into a string for logging +// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer +const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) +{ + if (buflen < 1) { + return "(null)"; + } + buf[0] = '\0'; + + if ((__nfds <= 0) || (!__fds)) { + return "(null)"; + } + + int fdsize = 1 + ((__nfds - 1) / (8 * sizeof(uint32_t))); + switch (fdsize) { + case 1: + snprintf(buf, buflen, "%08x", ((uint32_t *)__fds)[0]); + break; + case 2: + snprintf(buf, buflen, "%08x %08x", ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + case 3: + snprintf(buf, buflen, "%08x %08x %08x", ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], + ((uint32_t *)__fds)[0]); + break; + case 4: + snprintf(buf, buflen, "%08x %08x %08x %08x", ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], + ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + case 5: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x", ((uint32_t *)__fds)[4], + ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], + ((uint32_t *)__fds)[0]); + break; + case 6: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x %08x", ((uint32_t *)__fds)[5], + ((uint32_t *)__fds)[4], ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], + ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + default: + buf[0] = '\0'; + } + return buf; +} + +/* Poll the file descriptors described by the NFDS structures starting at + FDS. If TIMis nonzero and not -1, allow TIMmilliseconds for + an event to occur; if TIMis -1, block until an event occurs. + Returns the number of file descriptors with events, zero if timed out, + or -1 for errors. 
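The mmap() branch of sendfile_helper() has to round the file offset down to a page boundary, because mmap()'s offset argument must be page aligned; the requested data then starts (cur_offset - pa_offset) bytes into the mapping. The same arithmetic as a standalone helper:

#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

/* Map [offset, offset + count) of fd read-only. Returns a pointer to 'offset'
 * inside the mapping; *map_base and *map_len are what munmap() needs later. */
static void *map_file_window(int fd, off_t offset, size_t count,
                             void **map_base, size_t *map_len)
{
    long page = sysconf(_SC_PAGE_SIZE);
    off_t pa_offset = offset & ~((off_t)page - 1);     /* round down to a page boundary */
    size_t pa_count = count + (size_t)(offset - pa_offset);

    void *addr = mmap(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, pa_offset);
    if (addr == MAP_FAILED)
        return NULL;

    *map_base = addr;
    *map_len = pa_count;
    return (char *)addr + (offset - pa_offset);        /* first byte the caller asked for */
}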
*/ +static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, + const sigset_t *__sigmask = nullptr) +{ + int off_rfd_buffer[__nfds]; + io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + int lookup_buffer[__nfds]; + pollfd working_fds_arr[__nfds + 1]; + + try { + poll_call pcall(off_rfd_buffer, off_modes_buffer, lookup_buffer, working_fds_arr, __fds, + __nfds, __timeout, __sigmask); + + int rc = pcall.call(); + srdr_logfunc_exit("rc = %d", rc); + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +/* Check the first NFDS descriptors each in READFDS (if not NULL) for read + readiness, in WRITEFDS (if not NULL) for write readiness, and in EXCEPTFDS + (if not NULL) for exceptional conditions. If TIMis not NULL, time out + after waiting the interval specified therein. Returns the number of ready + descriptors, or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +static int select_helper(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, + struct timeval *__timeout, const sigset_t *__sigmask = nullptr) +{ + int off_rfds_buffer[__nfds]; + io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc("readfds: %s, writefds: %s", + dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + try { + select_call scall(off_rfds_buffer, off_modes_buffer, __nfds, __readfds, __writefds, + __exceptfds, __timeout, __sigmask); + int rc = scall.call(); + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc_exit("readfds: %s, writefds: %s", + dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +static void xlio_epoll_create(int epfd, int size) +{ + if (g_p_fd_collection) { + // Sanity check to remove any old sockinfo object using the same fd!! + handle_close(epfd, true); + + // insert epfd to fd_collection as epfd_info + g_p_fd_collection->addepfd(epfd, size); + } +} + +/* Wait for events on an epoll instance "epfd". Returns the number of + triggered events returned in "events" buffer. Or -1 in case of + error with the "errno" variable set to the specific error code. The + "events" parameter is a buffer that will contain triggered + events. The "maxevents" is the maximum number of events to be + returned ( usually size of "events" ). The "timeout" parameter + specifies the maximum wait time in milliseconds (-1 == infinite). 
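xlio_epoll_create() above and the epoll_wait_helper() that follows serve the unmodified epoll API: the helper first returns events that are already ready and only then polls the lower-level XLIO rings and the OS queue. Nothing changes on the application side; a plain loop like this is what ends up in these helpers:

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

static void epoll_echo_loop(int conn_fd)
{
    int epfd = epoll_create1(0); /* XLIO tracks the epfd as an epfd_info, see xlio_epoll_create() */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = conn_fd };
    epoll_ctl(epfd, EPOLL_CTL_ADD, conn_fd, &ev);

    struct epoll_event events[64];
    for (;;) {
        int n = epoll_wait(epfd, events, 64, -1); /* lands in epoll_wait_helper() */
        if (n < 0)
            break;
        for (int i = 0; i < n; i++)
            printf("fd %d ready (events=0x%x)\n", events[i].data.fd, (unsigned)events[i].events);
    }
    close(epfd);
}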
*/ +inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, + int __timeout, const sigset_t *__sigmask = nullptr) +{ + if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) { + srdr_logdbg("invalid value for maxevents: %d", __maxevents); + errno = EINVAL; + return -1; + } + + if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + g_event_handler_manager_local.do_tasks(); + } + + epoll_event extra_events_buffer[__maxevents]; + + try { + epoll_wait_call epcall(extra_events_buffer, nullptr, __epfd, __events, __maxevents, + __timeout, __sigmask); + + int rc = epcall.get_current_events(); // returns ready nfds + if (rc <= 0) { + // if no ready nfds available then check all lower level queues (XLIO ring's and OS + // queues) + epcall.init_offloaded_fds(); + rc = epcall.call(); + } + + srdr_logfunc_exit("rc = %d", rc); + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +static void handler_intr(int sig) +{ + switch (sig) { + case SIGINT: + g_b_exit = true; + srdr_logdbg("Catch Signal: SIGINT (%d)", sig); + break; + default: + srdr_logdbg("Catch Signal: %d", sig); + break; + } + + if (g_act_prev.sa_handler) { + g_act_prev.sa_handler(sig); + } +} + +static void handle_signal(int signum) +{ + srdr_logdbg_entry("Caught signal! signum=%d", signum); + + if (signum == SIGINT) { + g_b_exit = true; + } + + if (g_sighandler) { + g_sighandler(signum); + } +} + +int sigaction_internal(int signum, const struct sigaction *act, struct sigaction *oldact) +{ + int ret = 0; + + PROFILE_FUNC + + if (safe_mce_sys().handle_sigintr) { + srdr_logdbg_entry("signum=%d, act=%p, oldact=%p", signum, act, oldact); + + switch (signum) { + case SIGINT: + if (oldact && g_act_prev.sa_handler) { + *oldact = g_act_prev; + } + if (act) { + struct sigaction xlio_action; + xlio_action.sa_handler = handler_intr; + xlio_action.sa_flags = 0; + sigemptyset(&xlio_action.sa_mask); + + ret = SYSCALL(sigaction, SIGINT, &xlio_action, nullptr); + + if (ret < 0) { + srdr_logdbg("Failed to register SIGINT handler, calling to original sigaction " + "handler"); + break; + } + srdr_logdbg("Registered SIGINT handler"); + g_act_prev = *act; + } + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + + return ret; + break; + default: + break; + } + } + ret = SYSCALL(sigaction, signum, act, oldact); + + if (safe_mce_sys().handle_sigintr) { + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + } + return ret; +} + +extern "C" { +/* Create a new socket of type TYPE in domain DOMAIN, using + protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically. + Returns a file descriptor for the new socket, or -1 for errors. */ +EXPORT_SYMBOL int XLIO_SYMBOL(socket)(int __domain, int __type, int __protocol) +{ + return socket_internal(__domain, __type, __protocol, true, true); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(close)(int __fd) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d", __fd); + + bool toclose = handle_close(__fd); + int rc = toclose ? 
SYSCALL(close, __fd) : 0; + + return rc; +} + +#ifdef XLIO_STATIC_BUILD +extern void __res_iclose(res_state statp, bool free_addr); +#endif + +EXPORT_SYMBOL void XLIO_SYMBOL(__res_iclose)(res_state statp, bool free_addr) +{ + PROFILE_FUNC + + /* Current implementation doesn't handle XLIO sockets without a shadow socket or from a socket + pool. If such a socket is present in the nssocks list, system __res_iclose() will close the + fd. This will break the socket functionality. + Assume that resolver doesn't use the above scenarios. */ + + srdr_logdbg_entry(""); + for (int ns = 0; ns < statp->_u._ext.nscount; ns++) { + int sock = statp->_u._ext.nssocks[ns]; + if (sock != -1) { + handle_close(sock); + } + } + SYSCALL(__res_iclose, statp, free_addr); +} + +/* Shut down all or part of the connection open on socket FD. + HOW determines what to shut down: + SHUT_RD = No more receptions; + SHUT_WR = No more transmissions; + SHUT_RDWR = No more receptions or transmissions. + Returns 0 on success, -1 for errors. */ +EXPORT_SYMBOL int XLIO_SYMBOL(shutdown)(int __fd, int __how) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->shutdown(__how); + } + + return SYSCALL(shutdown, __fd, __how); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(listen)(int __fd, int backlog) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d, backlog=%d", __fd, backlog); + +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + if (g_p_app && g_p_app->type != APP_NONE) { + /* Envoy: + * Socket handling + * Envoy uses the following procedure for creating sockets and assigning them to workers. + * + * When a listener is created, a socket is pre-created for every worker on the main thread. + * This allows most errors to be caught early on in the listener creation process (e.g., bad + * socket option, unable to bind, etc.). + * - If using reuse_port, a unique socket is created for every worker. + * - If not using reuse_port, a unique socket is created for worker 0, and then that socket + * is duplicated for all other workers. + * a listener can close() its sockets when removed without concern for other listeners. + * + * Implementation: + * - reuse_port(false) : + * Envoy uses dup() call for listen socket on workers_N (N > 0) + * dup() call does not create socket object and does not store fd + * in fd_collection in current implementation + * so as a result duplicated fd is not returned by fd_collection_get_sockfd(__fd) and + * listen() call for duplicated fds are ignored. + * Original listen socket is not ignored by listen() function. + * - reuse_port(true) : + * dup() is not used. Unique socket is created for every worker. + * + * Store all duplicated fd in map_dup_fd with reference to original fd + * Store all listen fd in map_listen_fd with tid + * Identify correct listen fd during epoll_ctl(ADD) call by tid. It should be different. + * Set worker id in map_thread_id basing on tid + * + * Nginx: + * Nginx store all listen fd in map_listen_fd to proceed later in children processes + * after fork() call. + * Set worker id in map_thread_id basing on tid(pid). Nginx has single thread per process so + * tid and pid should be equal. 
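Every wrapper in this block is now declared as XLIO_SYMBOL(name) and falls back through SYSCALL(name, ...), so one body serves both the LD_PRELOAD build and the XLIO_STATIC_BUILD configuration. The real macro definitions live in sock-redirect.h and are not part of this hunk; the following is only a hypothetical sketch of the idea, written to show the intent rather than the actual code:

/* Hypothetical illustration only; the actual macros are defined in sock-redirect.h. */
#ifdef XLIO_STATIC_BUILD
/* Static build: no symbol interposition, the "original" call is plain libc. */
#define XLIO_SYMBOL(name)   xlio_##name
#define SYSCALL(name, ...)  name(__VA_ARGS__)
#define VALID_SYSCALL(name) (1)
#else
/* Preload build: wrappers keep the libc names and the originals come from
 * the dlsym(RTLD_NEXT) pointers collected by get_orig_funcs(). */
#define XLIO_SYMBOL(name)   name
#define SYSCALL(name, ...)  orig_os_api.name(__VA_ARGS__)
#define VALID_SYSCALL(name) (orig_os_api.name != NULL)
#endif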
+ */ + std::lock_guardm_lock)> lock(g_p_app->m_lock); + g_p_app->map_listen_fd[__fd] = gettid(); + } +#endif /* DEFINED_ENVOY */ + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + + if (p_socket_object) { + // for verifying that the socket is really offloaded + int ret = p_socket_object->prepareListen(); + if (ret < 0) { + return ret; // error + } + if (ret > 0) { // Passthrough + handle_close(__fd, false, true); + } else { +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + if (g_p_app && g_p_app->type != APP_NONE) { + p_socket_object->m_back_log = backlog; + } else +#endif + { + return p_socket_object->listen(backlog); + } + } + } + + srdr_logdbg("OS listen fd=%d, backlog=%d", __fd, backlog); + return SYSCALL(listen, __fd, backlog); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(accept)(int __fd, struct sockaddr *__addr, socklen_t *__addrlen) +{ + PROFILE_FUNC + + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - return p_socket_object->accept4(__addr, __addrlen, __flags); + return p_socket_object->accept(__addr, __addrlen); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.accept4) { - get_orig_funcs(); + return SYSCALL(accept, __fd, __addr, __addrlen); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(accept4)(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, + int __flags) +{ + PROFILE_FUNC + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->accept4(__addr, __addrlen, __flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.accept4(__fd, __addr, __addrlen, __flags); + return SYSCALL(accept4, __fd, __addr, __addrlen, __flags); } /* Give the socket FD the local address ADDR (which is LEN bytes long). */ -extern "C" EXPORT_SYMBOL int bind(int __fd, const struct sockaddr *__addr, socklen_t __addrlen) +EXPORT_SYMBOL int XLIO_SYMBOL(bind)(int __fd, const struct sockaddr *__addr, socklen_t __addrlen) { int errno_tmp = errno; PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.bind) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - char buf[256]; NOT_IN_USE(buf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __addr, __addrlen)); int ret = 0; - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - ret = p_socket_object->bind(__addr, __addrlen); - if (p_socket_object->isPassthrough()) { - handle_close(__fd, false, true); - if (ret) { - ret = orig_os_api.bind(__fd, __addr, __addrlen); - } - } + ret = bind_internal(p_socket_object, __addr, __addrlen); } else { - ret = orig_os_api.bind(__fd, __addr, __addrlen); + ret = SYSCALL(bind, __fd, __addr, __addrlen); } if (ret >= 0) { @@ -583,37 +1023,30 @@ extern "C" EXPORT_SYMBOL int bind(int __fd, const struct sockaddr *__addr, sockl This function is a cancellation point and therefore not marked with __THROW. 
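The comment block retained in listen() describes the two Envoy modes: with reuse_port(false) one listen socket is dup()ed to every worker, while with reuse_port(true) each worker owns its own socket and the kernel load-balances accepted connections between them. For reference, the reuse_port(true) pattern on the application side looks roughly like this:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Each worker thread creates and binds its own listener on the same port. */
static int make_worker_listener(uint16_t port)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
        return -1;

    int one = 1;
    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 || listen(fd, 128) < 0) {
        close(fd);
        return -1;
    }
    return fd; /* incoming connections are distributed across the per-worker sockets */
}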
*/ -extern "C" EXPORT_SYMBOL int connect(int __fd, const struct sockaddr *__to, socklen_t __tolen) +EXPORT_SYMBOL int XLIO_SYMBOL(connect)(int __fd, const struct sockaddr *__to, socklen_t __tolen) { int errno_tmp = errno; PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.connect) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - char buf[256]; NOT_IN_USE(buf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __to, __tolen)); int ret = 0; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object == nullptr) { + sockinfo *p_socket_object = fd_collection_get_sockfd(__fd); + if (!p_socket_object) { srdr_logdbg_exit("Unable to get sock_fd_api"); - ret = orig_os_api.connect(__fd, __to, __tolen); - } else if (__to == nullptr || - (get_sa_family(__to) != AF_INET && (get_sa_family(__to) != AF_INET6))) { + ret = SYSCALL(connect, __fd, __to, __tolen); + } else if (!__to || (get_sa_family(__to) != AF_INET && (get_sa_family(__to) != AF_INET6))) { p_socket_object->setPassthrough(); - ret = orig_os_api.connect(__fd, __to, __tolen); + ret = SYSCALL(connect, __fd, __to, __tolen); } else { ret = p_socket_object->connect(__to, __tolen); if (p_socket_object->isPassthrough()) { handle_close(__fd, false, true); if (ret) { - ret = orig_os_api.connect(__fd, __to, __tolen); + ret = SYSCALL(connect, __fd, __to, __tolen); } } } @@ -631,12 +1064,12 @@ extern "C" EXPORT_SYMBOL int connect(int __fd, const struct sockaddr *__to, sock /* Set socket FD's option OPTNAME at protocol level LEVEL to *OPTVAL (which is OPTLEN bytes long). Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, - __const void *__optval, socklen_t __optlen) +EXPORT_SYMBOL int XLIO_SYMBOL(setsockopt)(int __fd, int __level, int __optname, + __const void *__optval, socklen_t __optlen) { srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname); - if (NULL == __optval) { + if (!__optval) { errno = EFAULT; return -1; } @@ -644,19 +1077,14 @@ extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, PROFILE_FUNC int ret = 0; - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED( ret, p_socket_object->setsockopt(__level, __optname, __optval, __optlen)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.setsockopt) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.setsockopt(__fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, __fd, __level, __optname, __optval, __optlen); } if (ret >= 0) { @@ -670,35 +1098,30 @@ extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, /* Get socket FD's option OPTNAME at protocol level LEVEL to *OPTVAL (which is OPTLEN bytes long). Returns 0 on success, -1 for errors. 
*/ -extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, void *__optval, - socklen_t *__optlen) +EXPORT_SYMBOL int XLIO_SYMBOL(getsockopt)(int __fd, int __level, int __optname, void *__optval, + socklen_t *__optlen) { PROFILE_FUNC srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname); +#ifndef XLIO_STATIC_BUILD if (__fd == -2 && __level == SOL_SOCKET && __optname == SO_XLIO_GET_API && __optlen && *__optlen >= sizeof(struct xlio_api_t *)) { - struct xlio_api_t *xlio_api = extra_api(); - - *((xlio_api_t **)__optval) = xlio_api; + *((xlio_api_t **)__optval) = extra_api(); *__optlen = sizeof(struct xlio_api_t *); return 0; } +#endif /* XLIO_STATIC_BUILD */ int ret = 0; - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED( ret, p_socket_object->getsockopt(__level, __optname, __optval, __optlen)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getsockopt) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getsockopt(__fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(getsockopt, __fd, __level, __optname, __optval, __optlen); } if (ret >= 0) { @@ -719,7 +1142,7 @@ extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, vo user requested explicitly that XLIO will throw an exception in such a case by setting XLIO_EXCEPTION_HANDLING accordingly (see README.txt) */ -extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) +EXPORT_SYMBOL int XLIO_SYMBOL(fcntl)(int __fd, int __cmd, ...) { PROFILE_FUNC @@ -732,17 +1155,12 @@ extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl(__cmd, arg)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fcntl) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - res = orig_os_api.fcntl(__fd, __cmd, arg); + res = SYSCALL(fcntl, __fd, __cmd, arg); } if (__cmd == F_DUPFD) { @@ -768,7 +1186,7 @@ extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) by setting XLIO_EXCEPTION_HANDLING accordingly (see README.txt) */ -extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) +EXPORT_SYMBOL int XLIO_SYMBOL(fcntl64)(int __fd, int __cmd, ...) { PROFILE_FUNC @@ -781,27 +1199,12 @@ extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fcntl64) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - if (p_socket_object && orig_os_api.fcntl64) { + if (p_socket_object && VALID_SYSCALL(fcntl64)) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl64(__cmd, arg)); } else { - if (!orig_os_api.fcntl64) { - srdr_logfunc_exit("failed (errno=%d %m)", errno); - VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, - "fcntl64 was not found during runtime. Set %s to " - "appripriate debug level to see datails. 
Ignoring...", - SYS_VAR_LOG_LEVEL); - errno = EOPNOTSUPP; - return -1; - } else { - res = orig_os_api.fcntl64(__fd, __cmd, arg); - } + res = SYSCALL_ERRNO_UNSUPPORTED(fcntl64, __fd, __cmd, arg); } if (__cmd == F_DUPFD) { @@ -813,453 +1216,137 @@ extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) } else { srdr_logfunc_exit("failed (errno=%d %m)", errno); } - return res; -} - -/* Perform the I/O control operation specified by REQUEST on FD. - One argument may follow; its presence and type depend on REQUEST. - Return value depends on REQUEST. Usually -1 indicates error. */ -extern "C" EXPORT_SYMBOL int ioctl(int __fd, unsigned long int __request, ...) -{ - PROFILE_FUNC - - srdr_logfunc_entry("fd=%d, request=%d", __fd, __request); - - int res = -1; - va_list va; - va_start(va, __request); - unsigned long int arg = va_arg(va, unsigned long int); - va_end(va); - - int ret = 0; - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object && arg) { - VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg)); - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.ioctl) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - res = orig_os_api.ioctl(__fd, __request, arg); - } - - if (ret >= 0) { - srdr_logfunc_exit("returned with %d", ret); - } else { - srdr_logfunc_exit("failed (errno=%d %m)", errno); - } - return res; -} - -extern "C" EXPORT_SYMBOL int getsockname(int __fd, struct sockaddr *__name, socklen_t *__namelen) -{ - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d", __fd); - - int ret = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - ret = p_socket_object->getsockname(__name, __namelen); - - if (safe_mce_sys().trigger_dummy_send_getsockname) { - char buf[264] = {0}; - struct iovec msg_iov = {&buf, sizeof(buf)}; - struct msghdr msg = {NULL, 0, &msg_iov, 1, NULL, 0, 0}; - int ret_send = sendmsg(__fd, &msg, XLIO_SND_FLAGS_DUMMY); - srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send); - NOT_IN_USE(ret_send); - } - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getsockname) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getsockname(__fd, __name, __namelen); - } - - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - return ret; -} - -extern "C" EXPORT_SYMBOL int getpeername(int __fd, struct sockaddr *__name, socklen_t *__namelen) -{ - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d", __fd); - - int ret = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - ret = p_socket_object->getpeername(__name, __namelen); - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getpeername) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getpeername(__fd, __name, __namelen); - } - - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - return ret; -} - -/* Read NBYTES into BUF from FD. Return the - number read, -1 for errors or 0 for EOF. - - This function is a cancellation point and therefore not marked with - __THROW. 
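The __fd == -2 special case kept in getsockopt() is how applications discover the extra-API table at runtime without creating a socket; in the static build it is compiled out because the application links against the API directly. A caller-side sketch, assuming SO_XLIO_GET_API and struct xlio_api_t come from xlio_extra.h:

#include <stddef.h>
#include <sys/socket.h>
#include "xlio_extra.h" /* SO_XLIO_GET_API, struct xlio_api_t */

static struct xlio_api_t *get_xlio_api(void)
{
    struct xlio_api_t *api = NULL;
    socklen_t len = sizeof(api);

    /* fd -2 is the agreed magic value; no real socket is involved. When XLIO is
     * not preloaded this falls through to libc and simply fails with EBADF. */
    if (getsockopt(-2, SOL_SOCKET, SO_XLIO_GET_API, &api, &len) < 0)
        return NULL;
    return api;
}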
*/ -extern "C" EXPORT_SYMBOL ssize_t read(int __fd, void *__buf, size_t __nbytes) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - int dummy_flags = 0; - return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); - } - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.read) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.read(__fd, __buf, __nbytes); -} - -#if defined HAVE___READ_CHK -/* Checks that the buffer is big enough to contain the number of bytes - * the user requests to read. If the buffer is too small, aborts, - * else read NBYTES into BUF from FD. Return the - number read, -1 for errors or 0 for EOF. - - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t __read_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - BULLSEYE_EXCLUDE_BLOCK_START - if (__nbytes > __buflen) { - srdr_logpanic("buffer overflow detected"); - } - BULLSEYE_EXCLUDE_BLOCK_END - - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - int dummy_flags = 0; - return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__read_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.__read_chk(__fd, __buf, __nbytes, __buflen); -} -#endif - -/* Read COUNT blocks into VECTOR from FD. Return the - number of bytes read, -1 for errors or 0 for EOF. - - This function is a cancellation point and therefore not marked with - __THROW. */ - -extern "C" EXPORT_SYMBOL ssize_t readv(int __fd, const struct iovec *iov, int iovcnt) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec *piov = (struct iovec *)iov; - int dummy_flags = 0; - return p_socket_object->rx(RX_READV, piov, iovcnt, &dummy_flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.readv) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.readv(__fd, iov, iovcnt); -} - -/* Read N bytes into BUF from socket FD. - Returns the number read or -1 for errors. - - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t recv(int __fd, void *__buf, size_t __nbytes, int __flags) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECV, piov, 1, &__flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recv) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.recv(__fd, __buf, __nbytes, __flags); -} - -#if defined HAVE___RECV_CHK -/* Checks that the buffer is big enough to contain the number of bytes - the user requests to read. If the buffer is too small, aborts, - else read N bytes into BUF from socket FD. - Returns the number read or -1 for errors. 
+ return res; +} - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t __recv_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen, - int __flags) +/* Perform the I/O control operation specified by REQUEST on FD. + One argument may follow; its presence and type depend on REQUEST. + Return value depends on REQUEST. Usually -1 indicates error. */ +EXPORT_SYMBOL int XLIO_SYMBOL(ioctl)(int __fd, unsigned long int __request, ...) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d", __fd); + srdr_logfunc_entry("fd=%d, request=%d", __fd, __request); - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - BULLSEYE_EXCLUDE_BLOCK_START - if (__nbytes > __buflen) { - srdr_logpanic("buffer overflow detected"); - } - BULLSEYE_EXCLUDE_BLOCK_END + int res = -1; + va_list va; + va_start(va, __request); + unsigned long int arg = va_arg(va, unsigned long int); + va_end(va); - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECV, piov, 1, &__flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__recv_chk) { - get_orig_funcs(); + int ret = 0; + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object && arg) { + VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg)); + } else { + res = SYSCALL(ioctl, __fd, __request, arg); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__recv_chk(__fd, __buf, __nbytes, __buflen, __flags); + if (ret >= 0) { + srdr_logfunc_exit("returned with %d", ret); + } else { + srdr_logfunc_exit("failed (errno=%d %m)", errno); + } + return res; } -#endif - -/* Receive a message as described by MESSAGE from socket FD. - Returns the number of bytes read or -1 for errors. - This function is a cancellation point and therefore not marked with - __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t recvmsg(int __fd, struct msghdr *__msg, int __flags) +EXPORT_SYMBOL int XLIO_SYMBOL(getsockname)(int __fd, struct sockaddr *__name, socklen_t *__namelen) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d", __fd); - - if (__msg == NULL) { - srdr_logdbg("NULL msghdr"); - errno = EINVAL; - return -1; - } + srdr_logdbg_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + int ret = 0; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - __msg->msg_flags = 0; - return p_socket_object->rx(RX_RECVMSG, __msg->msg_iov, __msg->msg_iovlen, &__flags, - (__SOCKADDR_ARG)__msg->msg_name, - (socklen_t *)&__msg->msg_namelen, __msg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvmsg) { - get_orig_funcs(); + ret = p_socket_object->getsockname(__name, __namelen); + + if (safe_mce_sys().trigger_dummy_send_getsockname) { + char buf[264] = {0}; + struct iovec msg_iov = {&buf, sizeof(buf)}; + struct msghdr msg = {nullptr, 0, &msg_iov, 1, nullptr, 0, 0}; + int ret_send = sendmsg(__fd, &msg, XLIO_SND_FLAGS_DUMMY); + srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send); + NOT_IN_USE(ret_send); + } + } else { + ret = SYSCALL(getsockname, __fd, __name, __namelen); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvmsg(__fd, __msg, __flags); + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + return ret; } -/* The following definitions are for kernels previous to 2.6.32 which dont support recvmmsg */ -#ifndef HAVE_STRUCT_MMSGHDR -#ifndef __INTEL_COMPILER -struct mmsghdr { - struct msghdr msg_hdr; // Message header - unsigned int msg_len; // Number of received bytes for header -}; -#endif -#endif - -#ifndef MSG_WAITFORONE -#define MSG_WAITFORONE 0x10000 // recvmmsg(): block until 1+ packets avail -#endif - -/* Receive multiple messages as described by MESSAGE from socket FD. - Returns the number of messages received or -1 for errors. - - This function is a cancellation point and therefore not marked with - __THROW. 
*/ -extern "C" EXPORT_SYMBOL -#ifdef RECVMMSG_WITH_CONST_TIMESPEC - int - recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, - const struct timespec *__timeout) -#else - int - recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, - struct timespec *__timeout) -#endif +EXPORT_SYMBOL int XLIO_SYMBOL(getpeername)(int __fd, struct sockaddr *__name, socklen_t *__namelen) { PROFILE_FUNC - int num_of_msg = 0; - struct timespec start_time = TIMESPEC_INITIALIZER, current_time = TIMESPEC_INITIALIZER, - delta_time = TIMESPEC_INITIALIZER; - - srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - - if (__mmsghdr == NULL) { - srdr_logdbg("NULL mmsghdr"); - errno = EINVAL; - return -1; - } + srdr_logdbg_entry("fd=%d", __fd); - if (__timeout) { - gettime(&start_time); - } - socket_fd_api *p_socket_object = NULL; + int ret = 0; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - int ret = 0; - for (unsigned int i = 0; i < __vlen; i++) { - int flags = __flags; - __mmsghdr[i].msg_hdr.msg_flags = 0; - ret = p_socket_object->rx( - RX_RECVMSG, __mmsghdr[i].msg_hdr.msg_iov, __mmsghdr[i].msg_hdr.msg_iovlen, &flags, - (__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name, - (socklen_t *)&__mmsghdr[i].msg_hdr.msg_namelen, &__mmsghdr[i].msg_hdr); - if (ret < 0) { - break; - } - num_of_msg++; - __mmsghdr[i].msg_len = ret; - if ((i == 0) && (flags & MSG_WAITFORONE)) { - __flags |= MSG_DONTWAIT; - } - if (__timeout) { - gettime(¤t_time); - ts_sub(¤t_time, &start_time, &delta_time); - if (ts_cmp(&delta_time, __timeout, >)) { - break; - } - } - } - if (num_of_msg || ret == 0) { - // todo save ret for so_error if ret != 0(see kernel) - return num_of_msg; - } else { - return ret; - } - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvmmsg) { - get_orig_funcs(); + ret = p_socket_object->getpeername(__name, __namelen); + } else { + ret = SYSCALL(getpeername, __fd, __name, __namelen); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvmmsg(__fd, __mmsghdr, __vlen, __flags, __timeout); + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + return ret; } -/* Read N bytes into BUF through socket FD. - If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of - the sender, and store the actual size of the address in *ADDR_LEN. - Returns the number of bytes read or -1 for errors. +/* Read NBYTES into BUF from FD. Return the + number read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t recvfrom(int __fd, void *__buf, size_t __nbytes, int __flags, - struct sockaddr *__from, socklen_t *__fromlen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(read)(int __fd, void *__buf, size_t __nbytes) { - ssize_t ret_val = 0; - PROFILE_FUNC srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; piov[0].iov_base = __buf; piov[0].iov_len = __nbytes; - ret_val = p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvfrom) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret_val = orig_os_api.recvfrom(__fd, __buf, __nbytes, __flags, __from, __fromlen); + int dummy_flags = 0; + return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); } - return ret_val; + + return SYSCALL(read, __fd, __buf, __nbytes); } -#if defined HAVE___RECVFROM_CHK +#if defined HAVE___READ_CHK && !defined(XLIO_STATIC_BUILD) /* Checks that the buffer is big enough to contain the number of bytes - the user requests to read. If the buffer is too small, aborts, - else read N bytes into BUF through socket FD. - If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of - the sender, and store the actual size of the address in *ADDR_LEN. - Returns the number of bytes read or -1 for errors. + * the user requests to read. If the buffer is too small, aborts, + * else read NBYTES into BUF from FD. Return the + number read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t __recvfrom_chk(int __fd, void *__buf, size_t __nbytes, - size_t __buflen, int __flags, - struct sockaddr *__from, socklen_t *__fromlen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__read_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen) { PROFILE_FUNC srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1271,601 +1358,519 @@ extern "C" EXPORT_SYMBOL ssize_t __recvfrom_chk(int __fd, void *__buf, size_t __ struct iovec piov[1]; piov[0].iov_base = __buf; piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__recvfrom_chk) { - get_orig_funcs(); + int dummy_flags = 0; + return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__recvfrom_chk(__fd, __buf, __nbytes, __buflen, __flags, __from, __fromlen); + return SYSCALL(__read_chk, __fd, __buf, __nbytes, __buflen); } #endif -/* Write N bytes of BUF to FD. Return the number written, or -1. +/* Read COUNT blocks into VECTOR from FD. Return the + number of bytes read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t write(int __fd, __const void *__buf, size_t __nbytes) + +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(readv)(int __fd, const struct iovec *iov, int iovcnt) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_WRITE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - - return p_socket_object->tx(tx_arg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.write) { - get_orig_funcs(); + struct iovec *piov = (struct iovec *)iov; + int dummy_flags = 0; + return p_socket_object->rx(RX_READV, piov, iovcnt, &dummy_flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.write(__fd, __buf, __nbytes); + return SYSCALL(readv, __fd, iov, iovcnt); } -/* Write IOCNT blocks from IOVEC to FD. Return the number written, or -1. +/* Read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t writev(int __fd, const struct iovec *iov, int iovcnt) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recv)(int __fd, void *__buf, size_t __nbytes, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_WRITEV; - tx_arg.attr.iov = (struct iovec *)iov; - tx_arg.attr.sz_iov = iovcnt; - - return p_socket_object->tx(tx_arg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.writev) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.writev(__fd, iov, iovcnt); + return SYSCALL(recv, __fd, __buf, __nbytes, __flags); } -/* Send N bytes of BUF to socket FD. Returns the number sent or -1. +#if defined HAVE___RECV_CHK && !defined(XLIO_STATIC_BUILD) +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. 
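readv() above hands the caller's iovec array straight to the offloaded RX path (RX_READV), so scatter reads keep their usual semantics. A standard scatter read, for reference:

#include <sys/uio.h>

/* Fill a fixed-size header and a payload buffer with a single call. */
static ssize_t read_header_and_payload(int fd, void *hdr, size_t hdr_len,
                                       void *payload, size_t payload_len)
{
    struct iovec iov[2] = {
        { .iov_base = hdr, .iov_len = hdr_len },
        { .iov_base = payload, .iov_len = payload_len },
    };
    return readv(fd, iov, 2); /* returns the total bytes placed across both buffers */
}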
*/ -extern "C" EXPORT_SYMBOL ssize_t send(int __fd, __const void *__buf, size_t __nbytes, int __flags) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recv_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SEND; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = __flags; - - return p_socket_object->tx(tx_arg); - } + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); + } + BULLSEYE_EXCLUDE_BLOCK_END - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.send) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.send(__fd, __buf, __nbytes, __flags); + return SYSCALL(__recv_chk, __fd, __buf, __nbytes, __buflen, __flags); } +#endif -/* Sends a message as described by MESSAGE to socket FD. +/* Receive a message as described by MESSAGE from socket FD. Returns the number of bytes read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t sendmsg(int __fd, __const struct msghdr *__msg, int __flags) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvmsg)(int __fd, struct msghdr *__msg, int __flags) { PROFILE_FUNC srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDMSG; - tx_arg.attr.iov = __msg->msg_iov; - tx_arg.attr.sz_iov = (ssize_t)__msg->msg_iovlen; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)(__CONST_SOCKADDR_ARG)__msg->msg_name; - tx_arg.attr.len = (socklen_t)__msg->msg_namelen; - tx_arg.attr.hdr = __msg; - tx_arg.priv.attr = PBUF_NONE; - - if (0 < __msg->msg_controllen) { - struct cmsghdr *cmsg = CMSG_FIRSTHDR((struct msghdr *)__msg); - if ((cmsg->cmsg_level == SOL_SOCKET) && - (cmsg->cmsg_type == SCM_XLIO_PD || cmsg->cmsg_type == SCM_XLIO_NVME_PD)) { - if ((tx_arg.attr.flags & MSG_ZEROCOPY) && - (__msg->msg_iovlen == - ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(struct xlio_pd_key)))) { - tx_arg.priv.attr = - (cmsg->cmsg_type == SCM_XLIO_PD) ? 
PBUF_DESC_MKEY : PBUF_DESC_NVME_TX; - tx_arg.priv.map = (void *)CMSG_DATA(cmsg); - } else { - errno = EINVAL; - return -1; - } - } - } - - return p_socket_object->tx(tx_arg); - } - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { + if (!__msg) { + srdr_logdbg("NULL msghdr"); errno = EINVAL; return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendmsg) { - get_orig_funcs(); + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + __msg->msg_flags = 0; + return p_socket_object->rx(RX_RECVMSG, __msg->msg_iov, __msg->msg_iovlen, &__flags, + (__SOCKADDR_ARG)__msg->msg_name, + (socklen_t *)&__msg->msg_namelen, __msg); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.sendmsg(__fd, __msg, __flags); + return SYSCALL(recvmsg, __fd, __msg, __flags); } -/* Send multiple messages as described by MESSAGE from socket FD. - Returns the number of messages sent or -1 for errors. +/* The following definitions are for kernels previous to 2.6.32 which dont support recvmmsg */ +#ifndef HAVE_STRUCT_MMSGHDR +#ifndef __INTEL_COMPILER +struct mmsghdr { + struct msghdr msg_hdr; // Message header + unsigned int msg_len; // Number of received bytes for header +}; +#endif +#endif + +#ifndef MSG_WAITFORONE +#define MSG_WAITFORONE 0x10000 // recvmmsg(): block until 1+ packets avail +#endif + +/* Receive multiple messages as described by MESSAGE from socket FD. + Returns the number of messages received or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL int sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, - int __flags) -{ - int num_of_msg = 0; +#if defined(RECVMMSG_WITH_CONST_TIMESPEC) || defined(XLIO_STATIC_BUILD) +EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags, const struct timespec *__timeout) +#else +EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags, struct timespec *__timeout) +#endif +{ PROFILE_FUNC + int num_of_msg = 0; + struct timespec start_time = TIMESPEC_INITIALIZER, current_time = TIMESPEC_INITIALIZER, + delta_time = TIMESPEC_INITIALIZER; + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - if (__mmsghdr == NULL) { + if (!__mmsghdr) { srdr_logdbg("NULL mmsghdr"); errno = EINVAL; return -1; } - socket_fd_api *p_socket_object = NULL; + if (__timeout) { + gettime(&start_time); + } + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { + int ret = 0; for (unsigned int i = 0; i < __vlen; i++) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDMSG; - tx_arg.attr.iov = __mmsghdr[i].msg_hdr.msg_iov; - tx_arg.attr.sz_iov = (ssize_t)__mmsghdr[i].msg_hdr.msg_iovlen; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)(__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name; - tx_arg.attr.len = (socklen_t)__mmsghdr[i].msg_hdr.msg_namelen; - tx_arg.attr.hdr = &__mmsghdr[i].msg_hdr; - - int ret = p_socket_object->tx(tx_arg); + int flags = __flags; + __mmsghdr[i].msg_hdr.msg_flags = 0; + ret = p_socket_object->rx( + RX_RECVMSG, __mmsghdr[i].msg_hdr.msg_iov, __mmsghdr[i].msg_hdr.msg_iovlen, &flags, + (__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name, + (socklen_t *)&__mmsghdr[i].msg_hdr.msg_namelen, &__mmsghdr[i].msg_hdr); if (ret < 0) { - if (num_of_msg) { - return num_of_msg; - } else { - return ret; - } 
+ break; } num_of_msg++; __mmsghdr[i].msg_len = ret; + if ((i == 0) && (flags & MSG_WAITFORONE)) { + __flags |= MSG_DONTWAIT; + } + if (__timeout) { + gettime(&current_time); + ts_sub(&current_time, &start_time, &delta_time); + if (ts_cmp(&delta_time, __timeout, >)) { + break; + } + } + } + if (num_of_msg || ret == 0) { + // todo save ret for so_error if ret != 0(see kernel) + return num_of_msg; + } else { + return ret; } - return num_of_msg; } - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendmmsg) { - get_orig_funcs(); + struct timespec timeout = TIMESPEC_INITIALIZER; + if (__timeout) { + memcpy(&timeout, __timeout, sizeof(timeout)); } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.sendmmsg(__fd, __mmsghdr, __vlen, __flags); + return SYSCALL(recvmmsg, __fd, __mmsghdr, __vlen, __flags, &timeout); } -/* Send N bytes of BUF on socket FD to peer at address ADDR (which is - ADDR_LEN bytes long). Returns the number sent, or -1 for errors. +/* Read N bytes into BUF through socket FD. + If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. + Returns the number of bytes read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t sendto(int __fd, __const void *__buf, size_t __nbytes, int __flags, - const struct sockaddr *__to, socklen_t __tolen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvfrom)(int __fd, void *__buf, size_t __nbytes, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) { + ssize_t ret_val = 0; + PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDTO; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)__to; - tx_arg.attr.len = __tolen; - - return p_socket_object->tx(tx_arg); - } - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendto) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + ret_val = p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); + } else { + ret_val = SYSCALL(recvfrom, __fd, __buf, __nbytes, __flags, __from, __fromlen); } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.sendto(__fd, __buf, __nbytes, __flags, __to, __tolen); + return ret_val; } -static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off64_t *offset, - size_t count) -{ - ssize_t totSent = 0; - struct stat64 stat_buf; - __off64_t orig_offset = 0; - __off64_t cur_offset; - struct iovec piov[1]; - xlio_tx_call_attr_t tx_arg; - sockinfo *s = (sockinfo *)p_socket_object; - - if (p_socket_object->get_type() != FD_TYPE_SOCKET) { - errno = EBADF; - return -1; - } +#if defined HAVE___RECVFROM_CHK && !defined(XLIO_STATIC_BUILD) +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF through socket FD.
+ If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. + Returns the number of bytes read or -1 for errors. - if (offset == NULL) { - orig_offset = lseek64(in_fd, 0, SEEK_CUR); - if (orig_offset < 0) { - errno = ESPIPE; - return -1; - } - cur_offset = orig_offset; - } else { - cur_offset = *offset; - } + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recvfrom_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) +{ + PROFILE_FUNC - if (PROTO_TCP == s->get_protocol()) { - mapping_t *mapping; - int rc; + srdr_logfuncall_entry("fd=%d", __fd); - /* Get mapping from the cache */ - mapping = g_zc_cache->get_mapping(in_fd); - if (mapping == NULL) { - srdr_logdbg("Couldn't allocate mapping object"); - goto fallback; + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); } + BULLSEYE_EXCLUDE_BLOCK_END - if ((__off64_t)mapping->m_size < (__off64_t)(cur_offset + count)) { - struct stat st_buf; + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); + } - /* - * This is slow path, we check fstat(2) to handle the - * scenario when user changes the file while respective - * mapping exists and the file becomes larger. - * As workaround, fallback to preadv() implementation. - */ - mapping->put(); - rc = fstat(in_fd, &st_buf); - if ((rc == 0) && (st_buf.st_size >= (off_t)(cur_offset + count))) { - s->m_p_socket_stats->counters.n_tx_sendfile_overflows++; - goto fallback; - } else { - errno = EOVERFLOW; - return -1; - } - } + return SYSCALL(__recvfrom_chk, __fd, __buf, __nbytes, __buflen, __flags, __from, __fromlen); +} +#endif - piov[0].iov_base = (char *)mapping->m_addr + cur_offset; - piov[0].iov_len = count; +/* Write N bytes of BUF to FD. Return the number written, or -1. - tx_arg.opcode = TX_FILE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = MSG_ZEROCOPY; - tx_arg.priv.attr = PBUF_DESC_MAP; - tx_arg.priv.map = (void *)mapping; - totSent = p_socket_object->tx(tx_arg); + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(write)(int __fd, __const void *__buf, size_t __nbytes) +{ + PROFILE_FUNC - mapping->put(); - fallback: - /* Fallback to readv() implementation */ - if (totSent == 0) { - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; - tx_arg.clear(); - tx_arg.opcode = TX_FILE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.priv.attr = PBUF_DESC_FD; - tx_arg.priv.fd = in_fd; - piov[0].iov_base = (void *)&cur_offset; - piov[0].iov_len = count; - totSent = p_socket_object->tx(tx_arg); - } - } else { - __off64_t pa_offset = 0; - size_t pa_count = 0; - struct flock64 lock; + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - if ((fstat64(in_fd, &stat_buf) == -1) || - ((__off64_t)stat_buf.st_size < (__off64_t)(cur_offset + count))) { - errno = EOVERFLOW; - return -1; - } + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; tx_arg.opcode = TX_WRITE; tx_arg.attr.iov = piov; tx_arg.attr.sz_iov = 1; - /* The off argument of mmap() is constrained to be aligned and - * sized according to the value returned by sysconf() - */ - pa_offset = cur_offset & ~(sysconf(_SC_PAGE_SIZE) - 1); - pa_count = count + cur_offset - pa_offset; + return p_socket_object->tx(tx_arg); + } - lock.l_type = F_RDLCK; - lock.l_whence = SEEK_SET; - lock.l_start = pa_offset; - lock.l_len = pa_count; - lock.l_pid = 0; + return SYSCALL(write, __fd, __buf, __nbytes); +} - /* try to use mmap() approach */ - if (-1 != (fcntl(in_fd, F_SETLK, &lock))) { - void *addr = NULL; - addr = mmap64(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset); - if (MAP_FAILED != addr) { - ssize_t toRead, numSent = 0; +/* Write IOCNT blocks from IOVEC to FD. Return the number written, or -1. - while (count > 0) { - toRead = min(sysconf(_SC_PAGE_SIZE), (ssize_t)count); + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(writev)(int __fd, const struct iovec *iov, int iovcnt) +{ + PROFILE_FUNC - piov[0].iov_base = (void *)((uintptr_t)addr + cur_offset - pa_offset + totSent); - piov[0].iov_len = toRead; + srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); - numSent = p_socket_object->tx(tx_arg); - if (numSent == -1) { - break; - } + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + xlio_tx_call_attr_t tx_arg; - count -= numSent; - totSent += numSent; - } - (void)munmap(addr, pa_count); - } - lock.l_type = F_UNLCK; - (void)fcntl(in_fd, F_SETLK, &lock); - } + tx_arg.opcode = TX_WRITEV; + tx_arg.attr.iov = (struct iovec *)iov; + tx_arg.attr.sz_iov = iovcnt; - /* fallback on read() approach */ - if (totSent == 0) { - char buf[sysconf(_SC_PAGE_SIZE)]; - ssize_t toRead, numRead, numSent = 0; + return p_socket_object->tx(tx_arg); + } - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + return SYSCALL(writev, __fd, iov, iovcnt); +} - while (count > 0) { - toRead = min(sizeof(buf), count); - numRead = pread(in_fd, buf, toRead, cur_offset + totSent); - if (numRead <= 0) { - if (numRead < 0 && totSent == 0) { - totSent = -1; - } - break; - } +/* Send N bytes of BUF to socket FD. Returns the number sent or -1. - piov[0].iov_base = (void *)buf; - piov[0].iov_len = numRead; + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(send)(int __fd, __const void *__buf, size_t __nbytes, int __flags) +{ + PROFILE_FUNC - numSent = p_socket_object->tx(tx_arg); - if (numSent == -1) { - break; - } + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - count -= numSent; - totSent += numSent; - } - } + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SEND; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = __flags; + + return p_socket_object->tx(tx_arg); } - if (totSent > 0) { - if (offset != NULL) { - *offset = *offset + totSent; - } else { - (void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET); - } + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; } - return totSent; + return SYSCALL(send, __fd, __buf, __nbytes, __flags); } -extern "C" EXPORT_SYMBOL ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +/* Sends a message as described by MESSAGE to socket FD. + Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendmsg)(int __fd, __const struct msghdr *__msg, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, - offset, offset ? *offset : 0, count); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); - if (!p_socket_object) { - if (!orig_os_api.sendfile) { - get_orig_funcs(); - } - return orig_os_api.sendfile(out_fd, in_fd, offset, count); + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return sendmsg_internal(p_socket_object, __msg, __flags); } - return sendfile_helper(p_socket_object, in_fd, offset, count); + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + return SYSCALL(sendmsg, __fd, __msg, __flags); } -extern "C" EXPORT_SYMBOL ssize_t sendfile64(int out_fd, int in_fd, __off64_t *offset, size_t count) +/* Send multiple messages as described by MESSAGE from socket FD. + Returns the number of messages sent or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL int XLIO_SYMBOL(sendmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags) { + int num_of_msg = 0; + PROFILE_FUNC - srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, - offset, offset ? 
*offset : 0, count); + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); - if (!p_socket_object) { - if (!orig_os_api.sendfile64) { - get_orig_funcs(); - } - return orig_os_api.sendfile64(out_fd, in_fd, offset, count); + if (!__mmsghdr) { + srdr_logdbg("NULL mmsghdr"); + errno = EINVAL; + return -1; } - return sendfile_helper(p_socket_object, in_fd, offset, count); -} + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + for (unsigned int i = 0; i < __vlen; i++) { + xlio_tx_call_attr_t tx_arg; -// Format a fd_set into a string for logging -// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer -const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) -{ - if (buflen < 1) { - return "(null)"; - } - buf[0] = '\0'; + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.iov = __mmsghdr[i].msg_hdr.msg_iov; + tx_arg.attr.sz_iov = (ssize_t)__mmsghdr[i].msg_hdr.msg_iovlen; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)(__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name; + tx_arg.attr.len = (socklen_t)__mmsghdr[i].msg_hdr.msg_namelen; + tx_arg.attr.hdr = &__mmsghdr[i].msg_hdr; - if ((__nfds <= 0) || (__fds == NULL)) { - return "(null)"; + int ret = p_socket_object->tx(tx_arg); + if (ret < 0) { + if (num_of_msg) { + return num_of_msg; + } else { + return ret; + } + } + num_of_msg++; + __mmsghdr[i].msg_len = ret; + } + return num_of_msg; } - int fdsize = 1 + ((__nfds - 1) / (8 * sizeof(uint32_t))); - switch (fdsize) { - case 1: - snprintf(buf, buflen, "%08x", ((uint32_t *)__fds)[0]); - break; - case 2: - snprintf(buf, buflen, "%08x %08x", ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - case 3: - snprintf(buf, buflen, "%08x %08x %08x", ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], - ((uint32_t *)__fds)[0]); - break; - case 4: - snprintf(buf, buflen, "%08x %08x %08x %08x", ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], - ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - case 5: - snprintf(buf, buflen, "%08x %08x %08x %08x %08x", ((uint32_t *)__fds)[4], - ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], - ((uint32_t *)__fds)[0]); - break; - case 6: - snprintf(buf, buflen, "%08x %08x %08x %08x %08x %08x", ((uint32_t *)__fds)[5], - ((uint32_t *)__fds)[4], ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], - ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - default: - buf[0] = '\0'; + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; } - return buf; + + return SYSCALL(sendmmsg, __fd, __mmsghdr, __vlen, __flags); } -/* Check the first NFDS descriptors each in READFDS (if not NULL) for read - readiness, in WRITEFDS (if not NULL) for write readiness, and in EXCEPTFDS - (if not NULL) for exceptional conditions. If TIMis not NULL, time out - after waiting the interval specified therein. Returns the number of ready - descriptors, or -1 for errors. +/* Send N bytes of BUF on socket FD to peer at address ADDR (which is + ADDR_LEN bytes long). Returns the number sent, or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. 
*/ -static int select_helper(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, - struct timeval *__timeout, const sigset_t *__sigmask = NULL) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendto)(int __fd, __const void *__buf, size_t __nbytes, + int __flags, const struct sockaddr *__to, + socklen_t __tolen) { - int off_rfds_buffer[__nfds]; - io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + PROFILE_FUNC - if (g_vlogger_level >= VLOG_FUNC) { - const int tmpbufsize = 256; - char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; - NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - srdr_logfunc("readfds: %s, writefds: %s", - dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), - dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + + sockinfo *p_socket_object = nullptr; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SENDTO; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)__to; + tx_arg.attr.len = __tolen; + + return p_socket_object->tx(tx_arg); } - try { - select_call scall(off_rfds_buffer, off_modes_buffer, __nfds, __readfds, __writefds, - __exceptfds, __timeout, __sigmask); - int rc = scall.call(); + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + return SYSCALL(sendto, __fd, __buf, __nbytes, __flags, __to, __tolen); +} + +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count) +{ + PROFILE_FUNC + + srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, + offset, offset ? *offset : 0, count); + + sockinfo *p_socket_object = fd_collection_get_sockfd(out_fd); + if (!p_socket_object) { + return SYSCALL(sendfile, out_fd, in_fd, offset, count); + } + + return sendfile_helper(p_socket_object, in_fd, offset, count); +} - if (g_vlogger_level >= VLOG_FUNC) { - const int tmpbufsize = 256; - char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; - NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - srdr_logfunc_exit("readfds: %s, writefds: %s", - dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), - dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); - } +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile64)(int out_fd, int in_fd, __off64_t *offset, + size_t count) +{ + PROFILE_FUNC - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; + srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, + offset, offset ? 
*offset : 0, count); + + sockinfo *p_socket_object = fd_collection_get_sockfd(out_fd); + if (!p_socket_object) { + return SYSCALL(sendfile64, out_fd, in_fd, offset, count); } + + return sendfile_helper(p_socket_object, in_fd, offset, count); } -extern "C" EXPORT_SYMBOL int select(int __nfds, fd_set *__readfds, fd_set *__writefds, - fd_set *__exceptfds, struct timeval *__timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(select)(int __nfds, fd_set *__readfds, fd_set *__writefds, + fd_set *__exceptfds, struct timeval *__timeout) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.select) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.select(__nfds, __readfds, __writefds, __exceptfds, __timeout); + return SYSCALL(select, __nfds, __readfds, __writefds, __exceptfds, __timeout); } if (__timeout) { @@ -1878,19 +1883,14 @@ extern "C" EXPORT_SYMBOL int select(int __nfds, fd_set *__readfds, fd_set *__wri return select_helper(__nfds, __readfds, __writefds, __exceptfds, __timeout); } -extern "C" EXPORT_SYMBOL int pselect(int __nfds, fd_set *__readfds, fd_set *__writefds, - fd_set *__errorfds, const struct timespec *__timeout, - const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(pselect)(int __nfds, fd_set *__readfds, fd_set *__writefds, + fd_set *__errorfds, const struct timespec *__timeout, + const sigset_t *__sigmask) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.pselect) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.pselect(__nfds, __readfds, __writefds, __errorfds, __timeout, __sigmask); + return SYSCALL(pselect, __nfds, __readfds, __writefds, __errorfds, __timeout, __sigmask); } struct timeval select_time; @@ -1903,47 +1903,16 @@ extern "C" EXPORT_SYMBOL int pselect(int __nfds, fd_set *__readfds, fd_set *__wr srdr_logfunc_entry("nfds=%d, timeout=(infinite)", __nfds); } - return select_helper(__nfds, __readfds, __writefds, __errorfds, __timeout ? &select_time : NULL, - __sigmask); -} - -/* Poll the file descriptors described by the NFDS structures starting at - FDS. If TIMis nonzero and not -1, allow TIMmilliseconds for - an event to occur; if TIMis -1, block until an event occurs. - Returns the number of file descriptors with events, zero if timed out, - or -1 for errors. */ -static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, - const sigset_t *__sigmask = NULL) -{ - int off_rfd_buffer[__nfds]; - io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; - int lookup_buffer[__nfds]; - pollfd working_fds_arr[__nfds + 1]; - - try { - poll_call pcall(off_rfd_buffer, off_modes_buffer, lookup_buffer, working_fds_arr, __fds, - __nfds, __timeout, __sigmask); - - int rc = pcall.call(); - srdr_logfunc_exit("rc = %d", rc); - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; - } + return select_helper(__nfds, __readfds, __writefds, __errorfds, + __timeout ? 
&select_time : nullptr, __sigmask); } -extern "C" EXPORT_SYMBOL int poll(struct pollfd *__fds, nfds_t __nfds, int __timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(poll)(struct pollfd *__fds, nfds_t __nfds, int __timeout) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.poll) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.poll(__fds, __nfds, __timeout); + return SYSCALL(poll, __fds, __nfds, __timeout); } srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, __timeout); @@ -1951,19 +1920,14 @@ extern "C" EXPORT_SYMBOL int poll(struct pollfd *__fds, nfds_t __nfds, int __tim return poll_helper(__fds, __nfds, __timeout); } -#if defined HAVE___POLL_CHK -extern "C" EXPORT_SYMBOL int __poll_chk(struct pollfd *__fds, nfds_t __nfds, int __timeout, - size_t __fdslen) +#if defined HAVE___POLL_CHK && !defined(XLIO_STATIC_BUILD) +EXPORT_SYMBOL int XLIO_SYMBOL(__poll_chk)(struct pollfd *__fds, nfds_t __nfds, int __timeout, + size_t __fdslen) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__poll_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__poll_chk(__fds, __nfds, __timeout, __fdslen); + return SYSCALL(__poll_chk, __fds, __nfds, __timeout, __fdslen); } BULLSEYE_EXCLUDE_BLOCK_START @@ -1978,42 +1942,31 @@ extern "C" EXPORT_SYMBOL int __poll_chk(struct pollfd *__fds, nfds_t __nfds, int } #endif -extern "C" EXPORT_SYMBOL int ppoll(struct pollfd *__fds, nfds_t __nfds, - const struct timespec *__timeout, const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(ppoll)(struct pollfd *__fds, nfds_t __nfds, + const struct timespec *__timeout, const sigset_t *__sigmask) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.ppoll) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.ppoll(__fds, __nfds, __timeout, __sigmask); + return SYSCALL(ppoll, __fds, __nfds, __timeout, __sigmask); } - int timeout = - (__timeout == NULL) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); + int timeout = (!__timeout) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, timeout); return poll_helper(__fds, __nfds, timeout, __sigmask); } -#if defined HAVE___PPOLL_CHK -extern "C" EXPORT_SYMBOL int __ppoll_chk(struct pollfd *__fds, nfds_t __nfds, - const struct timespec *__timeout, - const sigset_t *__sigmask, size_t __fdslen) +#if defined HAVE___PPOLL_CHK && !defined(XLIO_STATIC_BUILD) +EXPORT_SYMBOL int XLIO_SYMBOL(__ppoll_chk)(struct pollfd *__fds, nfds_t __nfds, + const struct timespec *__timeout, + const sigset_t *__sigmask, size_t __fdslen) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__ppoll_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__ppoll_chk(__fds, __nfds, __timeout, __sigmask, __fdslen); + return SYSCALL(__ppoll_chk, __fds, __nfds, __timeout, __sigmask, __fdslen); } BULLSEYE_EXCLUDE_BLOCK_START @@ -2023,8 +1976,7 @@ extern "C" EXPORT_SYMBOL int __ppoll_chk(struct pollfd *__fds, nfds_t __nfds, BULLSEYE_EXCLUDE_BLOCK_END - int timeout = - (__timeout == NULL) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); + int timeout = (!__timeout) ? 
-1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, timeout); @@ -2032,22 +1984,11 @@ extern "C" EXPORT_SYMBOL int __ppoll_chk(struct pollfd *__fds, nfds_t __nfds, } #endif -static void xlio_epoll_create(int epfd, int size) -{ - if (g_p_fd_collection) { - // Sanity check to remove any old sockinfo object using the same fd!! - handle_close(epfd, true); - - // insert epfd to fd_collection as epfd_info - g_p_fd_collection->addepfd(epfd, size); - } -} - /* Creates an epoll instance. Returns fd for the new instance. The "size" parameter is a hint specifying the number of file descriptors to be associated with the new instance. The fd returned by epoll_create() should be closed with close(). */ -extern "C" EXPORT_SYMBOL int epoll_create(int __size) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_create)(int __size) { DO_GLOBAL_CTORS(); @@ -2059,13 +2000,7 @@ extern "C" EXPORT_SYMBOL int epoll_create(int __size) return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.epoll_create) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int epfd = orig_os_api.epoll_create(__size + 1); // +1 for the cq epfd + int epfd = SYSCALL(epoll_create, __size + 1); // +1 for the cq epfd srdr_logdbg("ENTER: (size=%d) = %d", __size, epfd); if (epfd <= 0) { @@ -2077,19 +2012,13 @@ extern "C" EXPORT_SYMBOL int epoll_create(int __size) return epfd; } -extern "C" EXPORT_SYMBOL int epoll_create1(int __flags) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_create1)(int __flags) { DO_GLOBAL_CTORS(); PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.epoll_create1) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int epfd = orig_os_api.epoll_create1(__flags); + int epfd = SYSCALL(epoll_create1, __flags); srdr_logdbg("ENTER: (flags=%d) = %d", __flags, epfd); if (epfd <= 0) { @@ -2107,7 +2036,8 @@ extern "C" EXPORT_SYMBOL int epoll_create1(int __flags) constants defined above. The "fd" parameter is the target of the operation. The "event" parameter describes which events the caller is interested in and any associated user data. */ -extern "C" EXPORT_SYMBOL int epoll_ctl(int __epfd, int __op, int __fd, struct epoll_event *__event) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_ctl)(int __epfd, int __op, int __fd, + struct epoll_event *__event) { PROFILE_FUNC @@ -2143,50 +2073,8 @@ extern "C" EXPORT_SYMBOL int epoll_ctl(int __epfd, int __op, int __fd, struct ep return rc; } -/* Wait for events on an epoll instance "epfd". Returns the number of - triggered events returned in "events" buffer. Or -1 in case of - error with the "errno" variable set to the specific error code. The - "events" parameter is a buffer that will contain triggered - events. The "maxevents" is the maximum number of events to be - returned ( usually size of "events" ). The "timeout" parameter - specifies the maximum wait time in milliseconds (-1 == infinite). 
*/ -inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout, const sigset_t *__sigmask = NULL) -{ - if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) { - srdr_logdbg("invalid value for maxevents: %d", __maxevents); - errno = EINVAL; - return -1; - } - - if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - g_thread_local_event_handler.do_tasks(); - } - - epoll_event extra_events_buffer[__maxevents]; - - try { - epoll_wait_call epcall(extra_events_buffer, NULL, __epfd, __events, __maxevents, __timeout, - __sigmask); - - int rc = epcall.get_current_events(); // returns ready nfds - if (rc <= 0) { - // if no ready nfds available then check all lower level queues (XLIO ring's and OS - // queues) - epcall.init_offloaded_fds(); - rc = epcall.call(); - } - - srdr_logfunc_exit("rc = %d", rc); - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; - } -} - -extern "C" EXPORT_SYMBOL int epoll_wait(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_wait)(int __epfd, struct epoll_event *__events, int __maxevents, + int __timeout) { PROFILE_FUNC @@ -2196,8 +2084,9 @@ extern "C" EXPORT_SYMBOL int epoll_wait(int __epfd, struct epoll_event *__events return epoll_wait_helper(__epfd, __events, __maxevents, __timeout); } -extern "C" EXPORT_SYMBOL int epoll_pwait(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout, const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_pwait)(int __epfd, struct epoll_event *__events, + int __maxevents, int __timeout, + const sigset_t *__sigmask) { PROFILE_FUNC @@ -2211,17 +2100,11 @@ extern "C" EXPORT_SYMBOL int epoll_pwait(int __epfd, struct epoll_event *__event protocol PROTOCOL, which are connected to each other, and put file descriptors for them in FDS[0] and FDS[1]. If PROTOCOL is zero, one will be chosen automatically. Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int socketpair(int __domain, int __type, int __protocol, int __sv[2]) +EXPORT_SYMBOL int XLIO_SYMBOL(socketpair)(int __domain, int __type, int __protocol, int __sv[2]) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.socketpair) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.socketpair(__domain, __type, __protocol, __sv); + int ret = SYSCALL(socketpair, __domain, __type, __protocol, __sv); srdr_logdbg("(domain=%s(%d) type=%s(%d) protocol=%d, fd[%d,%d]) = %d", socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, @@ -2240,17 +2123,11 @@ extern "C" EXPORT_SYMBOL int socketpair(int __domain, int __type, int __protocol If successful, two file descriptors are stored in PIPEDES; bytes written on PIPEDES[1] can be read from PIPEDES[0]. Returns 0 if successful, -1 if not. */ -extern "C" EXPORT_SYMBOL int pipe(int __filedes[2]) +EXPORT_SYMBOL int XLIO_SYMBOL(pipe)(int __filedes[2]) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.pipe) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.pipe(__filedes); + int ret = SYSCALL(pipe, __filedes); srdr_logdbg("(fd[%d,%d]) = %d", __filedes[0], __filedes[1], ret); if (ret == 0 && g_p_fd_collection) { @@ -2264,7 +2141,7 @@ extern "C" EXPORT_SYMBOL int pipe(int __filedes[2]) return ret; } -extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) 
+EXPORT_SYMBOL int XLIO_SYMBOL(open)(__const char *__file, int __oflag, ...) { va_list va; va_start(va, __oflag); @@ -2272,13 +2149,7 @@ extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.open) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fd = orig_os_api.open(__file, __oflag, mode); + int fd = SYSCALL(open, __file, __oflag, mode); va_end(va); srdr_logdbg("(file=%s, flags=%#x, mode=%#x) = %d", __file, __oflag, mode, fd); @@ -2289,17 +2160,11 @@ extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) return fd; } -extern "C" EXPORT_SYMBOL int creat(const char *__pathname, mode_t __mode) +EXPORT_SYMBOL int XLIO_SYMBOL(creat)(const char *__pathname, mode_t __mode) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.creat) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fd = orig_os_api.creat(__pathname, __mode); + int fd = SYSCALL(creat, __pathname, __mode); srdr_logdbg("(pathname=%s, mode=%#x) = %d", __pathname, __mode, fd); @@ -2310,17 +2175,11 @@ extern "C" EXPORT_SYMBOL int creat(const char *__pathname, mode_t __mode) } /* Duplicate FD, returning a new file descriptor on the same file. */ -extern "C" EXPORT_SYMBOL int dup(int __fd) +EXPORT_SYMBOL int XLIO_SYMBOL(dup)(int __fd) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.dup) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fid = orig_os_api.dup(__fd); + int fid = SYSCALL(dup, __fd); srdr_logdbg("(fd=%d) = %d", __fd, fid); @@ -2337,7 +2196,7 @@ extern "C" EXPORT_SYMBOL int dup(int __fd) } /* Duplicate FD to FD2, closing FD2 and making it open on the same file. */ -extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) +EXPORT_SYMBOL int XLIO_SYMBOL(dup2)(int __fd, int __fd2) { PROFILE_FUNC @@ -2346,13 +2205,7 @@ extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) handle_close(__fd2); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.dup2) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fid = orig_os_api.dup2(__fd, __fd2); + int fid = SYSCALL(dup2, __fd, __fd2); srdr_logdbg("(fd=%d, fd2=%d) = %d", __fd, __fd2, fid); @@ -2362,28 +2215,11 @@ extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) return fid; } -#ifdef _CHANGE_CLONE_PROTO_IN_SLES_10_ -extern "C" EXPORT_SYMBOL int clone(int (*__fn)(void *), void *__child_stack, int __flags, - void *__arg) -{ - PROFILE_FUNC - - srdr_logfunc_entry("flags=%#x", __flags); - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.clone) - get_orig_funcs(); - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.clone(__fn, __child_stack, __flags, __arg); -} -#endif - /* Clone the calling process, creating an exact copy. Return -1 for errors, 0 to the new process, and the process ID of the new process to the old process. 
*/ -extern "C" EXPORT_SYMBOL pid_t fork(void) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(fork)(void) { PROFILE_FUNC @@ -2399,12 +2235,6 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) "undefined!!"); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fork) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - #if defined(DEFINED_NGINX) static int worker_index = -1; if (g_p_app && g_p_app->type == APP_NGINX && (g_p_app->get_worker_id() == -1)) { @@ -2428,7 +2258,7 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) } #endif - pid_t pid = orig_os_api.fork(); + pid_t pid = SYSCALL(fork); if (pid == 0) { #if defined(DEFINED_NGINX) void *p_fd_collection_temp = g_p_fd_collection; @@ -2459,7 +2289,7 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) if (g_p_app && g_p_app->type == APP_NGINX) { g_p_app->map_thread_id[gettid()] = worker_index; /* Child process needs information about - * listen socket_fd_api objects, so pass this using parent`s g_p_fd_collection. + * listen sockinfo objects, so pass this using parent`s g_p_fd_collection. * It is possible as far as parent`s g_p_fd_collection is not deleted * by reset_globals() */ @@ -2491,17 +2321,17 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) } /* Redirect vfork to fork */ -extern "C" EXPORT_SYMBOL pid_t vfork(void) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(vfork)(void) { PROFILE_FUNC - return fork(); + return XLIO_CALL(fork); } /* Put the program in the background, and dissociate from the controlling terminal. If NOCHDIR is zero, do `chdir ("/")'. If NOCLOSE is zero, redirects stdin, stdout, and stderr to /dev/null. */ -extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) +EXPORT_SYMBOL int XLIO_SYMBOL(daemon)(int __nochdir, int __noclose) { PROFILE_FUNC @@ -2512,13 +2342,7 @@ extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) prepare_fork(); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.daemon) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.daemon(__nochdir, __noclose); + int ret = SYSCALL(daemon, __nochdir, __noclose); if (ret == 0) { g_is_forked_child = true; srdr_logdbg_exit("returned with %d", ret); @@ -2547,105 +2371,16 @@ extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) return ret; } -static void handler_intr(int sig) -{ - switch (sig) { - case SIGINT: - g_b_exit = true; - srdr_logdbg("Catch Signal: SIGINT (%d)", sig); - break; - default: - srdr_logdbg("Catch Signal: %d", sig); - break; - } - - if (g_act_prev.sa_handler) { - g_act_prev.sa_handler(sig); - } -} - -extern "C" EXPORT_SYMBOL int sigaction(int signum, const struct sigaction *act, - struct sigaction *oldact) -{ - int ret = 0; - - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sigaction) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - if (safe_mce_sys().handle_sigintr) { - srdr_logdbg_entry("signum=%d, act=%p, oldact=%p", signum, act, oldact); - - switch (signum) { - case SIGINT: - if (oldact && g_act_prev.sa_handler) { - *oldact = g_act_prev; - } - if (act) { - struct sigaction xlio_action; - xlio_action.sa_handler = handler_intr; - xlio_action.sa_flags = 0; - sigemptyset(&xlio_action.sa_mask); - - ret = orig_os_api.sigaction(SIGINT, &xlio_action, NULL); - - if (ret < 0) { - srdr_logdbg("Failed to register SIGINT handler, calling to original sigaction " - "handler"); - break; - } - srdr_logdbg("Registered SIGINT handler"); - g_act_prev = *act; - } - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - - return ret; - break; 
- default: - break; - } - } - ret = orig_os_api.sigaction(signum, act, oldact); - - if (safe_mce_sys().handle_sigintr) { - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - } - return ret; -} - -static void handle_signal(int signum) +EXPORT_SYMBOL int XLIO_SYMBOL(sigaction)(int signum, const struct sigaction *act, + struct sigaction *oldact) { - srdr_logdbg_entry("Caught signal! signum=%d", signum); - - if (signum == SIGINT) { - g_b_exit = true; - } - - if (g_sighandler) { - g_sighandler(signum); - } + return sigaction_internal(signum, act, oldact); } -extern "C" EXPORT_SYMBOL sighandler_t signal(int signum, sighandler_t handler) +EXPORT_SYMBOL sighandler_t XLIO_SYMBOL(signal)(int signum, sighandler_t handler) { PROFILE_FUNC - if (!orig_os_api.signal) { - get_orig_funcs(); - } - if (safe_mce_sys().handle_sigintr) { srdr_logdbg_entry("signum=%d, handler=%p", signum, handler); @@ -2653,28 +2388,22 @@ extern "C" EXPORT_SYMBOL sighandler_t signal(int signum, sighandler_t handler) // Only SIGINT is supported for now if (signum == SIGINT) { g_sighandler = handler; - return orig_os_api.signal(SIGINT, &handle_signal); + return SYSCALL(signal, SIGINT, &handle_signal); } } } - return orig_os_api.signal(signum, handler); + return SYSCALL(signal, signum, handler); } #if defined(DEFINED_NGINX) -extern "C" EXPORT_SYMBOL int setuid(uid_t uid) +EXPORT_SYMBOL int XLIO_SYMBOL(setuid)(uid_t uid) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.setuid) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - uid_t previous_uid = geteuid(); - int orig_rc = orig_os_api.setuid(uid); + int orig_rc = SYSCALL(setuid, uid); if (orig_rc < 0) { srdr_logdbg_exit("failed (errno=%d %m)", errno); } @@ -2687,11 +2416,11 @@ extern "C" EXPORT_SYMBOL int setuid(uid_t uid) return orig_rc; } -extern "C" EXPORT_SYMBOL pid_t waitpid(pid_t pid, int *wstatus, int options) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(waitpid)(pid_t pid, int *wstatus, int options) { PROFILE_FUNC - pid_t child_pid = orig_os_api.waitpid(pid, wstatus, options); + pid_t child_pid = SYSCALL(waitpid, pid, wstatus, options); /* This segment is used as part of NGINX worker termination recovery mechanism. The mechanism * marks the worker PID slot as vacant with -1 later to reuse it in the fork system call.The * implicit assumptions here are that: @@ -2705,7 +2434,8 @@ extern "C" EXPORT_SYMBOL pid_t waitpid(pid_t pid, int *wstatus, int options) g_p_app->unused_worker_id.insert(g_p_app->get_worker_id()); g_p_app->map_thread_id.erase(getpid()); } + return child_pid; } - #endif // DEFINED_NGINX +} diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h index 055648756..5487f1e5f 100644 --- a/src/core/sock/sock-redirect.h +++ b/src/core/sock/sock-redirect.h @@ -33,6 +33,10 @@ #ifndef SOCK_REDIRECT_H #define SOCK_REDIRECT_H +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + // if you need select with more than 1024 sockets - enable this #ifndef SELECT_BIG_SETSIZE #define SELECT_BIG_SETSIZE 0 @@ -85,6 +89,22 @@ #include #include +#ifdef XLIO_STATIC_BUILD +#define XLIO_SYMBOL(_func) xlio_##_func +#define SYSCALL(_func, ...) ::_func(__VA_ARGS__) +#define XLIO_CALL(_func, ...) xlio_##_func(__VA_ARGS__) +#define SYSCALL_ERRNO_UNSUPPORTED(_func, ...) SYSCALL(_func, __VA_ARGS__) +#define VALID_SYSCALL(_func) (true) +#else +#define XLIO_SYMBOL(_func) _func +#define VALID_SYSCALL(_func) ((orig_os_api._func) != nullptr) +#define SYSCALL(_func, ...) 
\ + ((VALID_SYSCALL(_func) ? (void)0 : get_orig_funcs()), orig_os_api._func(__VA_ARGS__)) +#define SYSCALL_ERRNO_UNSUPPORTED(_func, ...) \ + (VALID_SYSCALL(_func) ? orig_os_api._func(__VA_ARGS__) : ((errno = EOPNOTSUPP), -1)) +#define XLIO_CALL(_func, ...) _func(__VA_ARGS__) +#endif /* XLIO_STATIC_BUILD */ + struct mmsghdr; /** @@ -92,7 +112,7 @@ struct mmsghdr; * variables to hold the function-pointers to original functions *----------------------------------------------------------------------------- */ - +#ifndef XLIO_STATIC_BUILD struct os_api { int (*creat)(const char *__pathname, mode_t __mode); int (*open)(__const char *__file, int __oflag, ...); @@ -174,8 +194,8 @@ struct os_api { const sigset_t *sigmask); int (*clone)(int (*__fn)(void *), void *__child_stack, int __flags, void *__arg); - pid_t (*fork)(void); - pid_t (*vfork)(void); + pid_t (*fork)(); + pid_t (*vfork)(); int (*daemon)(int __nochdir, int __noclose); int (*sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact); @@ -185,6 +205,10 @@ struct os_api { pid_t (*waitpid)(pid_t pid, int *wstatus, int options); #endif // DEFINED_NGINX }; +extern os_api orig_os_api; + +extern void get_orig_funcs(); +#endif /* XLIO_STATIC_BUILD */ /** *----------------------------------------------------------------------------- @@ -204,10 +228,6 @@ struct os_api { } \ } while (0) -extern os_api orig_os_api; - -extern void get_orig_funcs(); - extern iomux_stats_t *g_p_select_stats; extern iomux_stats_t *g_p_poll_stats; extern iomux_stats_t *g_p_epoll_stats; @@ -221,4 +241,10 @@ bool handle_close(int fd, bool cleanup = false, bool passthrough = false); // TODO: look for additional such functions/calls int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool check_offload); +// allow calling our sendmsg(...) implementation safely from within libxlio.so +ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags); + +// allow calling our bind(...) implementation safely from within libxlio.so +int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen); + #endif // SOCK_REDIRECT_H diff --git a/src/core/dev/rfs_rule_ibv.cpp b/src/core/sock/sock_stats.cpp similarity index 56% rename from src/core/dev/rfs_rule_ibv.cpp rename to src/core/sock/sock_stats.cpp index f2f343b8c..58c6bd6a9 100644 --- a/src/core/dev/rfs_rule_ibv.cpp +++ b/src/core/sock/sock_stats.cpp @@ -1,3 +1,4 @@ + /* * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * @@ -30,36 +31,49 @@ * SOFTWARE. 
*/ -#include -#include "dev/rfs_rule_ibv.h" +#include "sock_stats.h" -#define MODULE_NAME "rfs_rule_ibv" +thread_local socket_stats_t sock_stats::t_dummy_stats; -rfs_rule_ibv::~rfs_rule_ibv() +sock_stats &sock_stats::instance() { + static sock_stats the_instance; + return the_instance; } -bool rfs_rule_ibv::create(xlio_ibv_flow_attr &attrs, ibv_qp *qp) +socket_stats_t *sock_stats::get_stats_obj() { - _ibv_flow.reset(xlio_ibv_create_flow(qp, &attrs)); - if (_ibv_flow != nullptr) { - rfs_logdbg("Succeeded xlio_ibv_create_flow, Type: %u, Priority %" PRIu16 - ", rfs_rule_ibv: %p, ibv_flow: %p", - static_cast(attrs.type), attrs.priority, this, _ibv_flow.get()); - return true; + std::lock_guard lock(_stats_lock); + + if (!_socket_stats_list) { + return nullptr; } - rfs_logerr("Failed xlio_ibv_create_flow, Type: %u, Priority %" PRIu16, - static_cast(attrs.type), attrs.priority); - return false; + socket_stats_t *stat = _socket_stats_list; + _socket_stats_list = _socket_stats_list->_next_stat; + return stat; +} + +void sock_stats::return_stats_obj(socket_stats_t *stats) +{ + std::lock_guard lock(_stats_lock); + stats->_next_stat = _socket_stats_list; + _socket_stats_list = stats; } -void rfs_rule_ibv::destory_ibv_flow(xlio_ibv_flow *flow) +void sock_stats::init_sock_stats(size_t max_stats) { - IF_VERBS_FAILURE_EX(xlio_ibv_destroy_flow(flow), EIO) - { - __log_err("Failed xlio_ibv_destroy_flow, ibv_flow: %p", flow); + if (max_stats == 0U) { + return; } - else { __log_dbg("Success xlio_ibv_destroy_flow, ibv_flow: %p", flow); } - ENDIF_VERBS_FAILURE; + + std::lock_guard lock(_stats_lock); + + _socket_stats_vec.resize(max_stats); + for (size_t idx = 1; idx < _socket_stats_vec.size(); ++idx) { + _socket_stats_vec[idx - 1U]._next_stat = &_socket_stats_vec[idx]; + } + + _socket_stats_vec[_socket_stats_vec.size() - 1U]._next_stat = nullptr; + _socket_stats_list = &_socket_stats_vec[0]; } diff --git a/src/core/dev/rfs_rule_dpcp.h b/src/core/sock/sock_stats.h similarity index 73% rename from src/core/dev/rfs_rule_dpcp.h rename to src/core/sock/sock_stats.h index 8443b5bea..3dcde7d2a 100644 --- a/src/core/dev/rfs_rule_dpcp.h +++ b/src/core/sock/sock_stats.h @@ -30,31 +30,31 @@ * SOFTWARE. */ -#ifndef RFS_RULE_DPCP_H -#define RFS_RULE_DPCP_H - -#include - -#if defined(DEFINED_DPCP) +#ifndef SOCK_STATS_H +#define SOCK_STATS_H +#include +#include +#include #include -#include "util/utils.h" -#include "ib/base/verbs_extra.h" -#include "dev/rfs_rule.h" -#include - -using namespace std; +#include "util/ip_address.h" +#include "util/xlio_stats.h" -class rfs_rule_dpcp : public rfs_rule { +class sock_stats { public: - virtual ~rfs_rule_dpcp(); + static sock_stats &instance(); + static thread_local socket_stats_t t_dummy_stats; - bool create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, dpcp::adapter &in_adapter); + void init_sock_stats(size_t max_stats); + socket_stats_t *get_stats_obj(); + void return_stats_obj(socket_stats_t *stats); private: - unique_ptr _dpcp_flow; -}; + sock_stats() {} -#endif // defined(DEFINED_DPCP) + std::mutex _stats_lock; + socket_stats_t *_socket_stats_list = nullptr; + std::vector _socket_stats_vec; +}; #endif diff --git a/src/core/sock/socket_fd_api.cpp b/src/core/sock/socket_fd_api.cpp deleted file mode 100644 index d0dc9c9a8..000000000 --- a/src/core/sock/socket_fd_api.cpp +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include - -#include -#include -#include "utils/bullseye.h" -#include "sock-redirect.h" -#include "sock-app.h" - -#include "socket_fd_api.h" - -#define MODULE_NAME "sapi" -#undef MODULE_HDR_INFO -#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " -#undef __INFO__ -#define __INFO__ m_fd - -socket_fd_api::socket_fd_api(int fd) - : m_epoll_event_flags(0) - , m_fd(fd) - , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) - , m_econtext(NULL) -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - , m_is_for_socket_pool(false) - , m_back_log(0) -#endif -{ -} - -socket_fd_api::~socket_fd_api() -{ - bool toclose = safe_mce_sys().deferred_close && m_fd >= 0; - -#if defined(DEFINED_NGINX) - if (g_p_app->type == APP_NGINX) { - // Sockets from a socket pool are not closed during close(), so do it now. 
- toclose = toclose || (m_is_for_socket_pool && m_fd >= 0); - } -#endif - - if (toclose) { - orig_os_api.close(m_fd); - } -} - -void socket_fd_api::destructor_helper() -{ -} - -int socket_fd_api::shutdown(int __how) -{ - __log_info_func(""); - int ret = orig_os_api.shutdown(m_fd, __how); - if (ret) { - __log_info_dbg("shutdown failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::bind(const sockaddr *__addr, socklen_t __addrlen) -{ - __log_info_func(""); - int ret = orig_os_api.bind(m_fd, __addr, __addrlen); - if (ret) { - __log_info_dbg("bind failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::connect(const sockaddr *__to, socklen_t __tolen) -{ - __log_info_func(""); - int ret = orig_os_api.connect(m_fd, __to, __tolen); - if (ret) { - __log_info_dbg("connect failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::accept(struct sockaddr *__addr, socklen_t *__addrlen) -{ - __log_info_func(""); - int ret = orig_os_api.accept(m_fd, __addr, __addrlen); - if (ret < 0) { - __log_info_dbg("accept failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) -{ - __log_info_func(""); - int ret = orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); - if (ret < 0) { - __log_info_dbg("accept4 failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::listen(int backlog) -{ - __log_info_func(""); - int ret = orig_os_api.listen(m_fd, backlog); - if (ret < 0) { - __log_info_dbg("listen failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getsockname(sockaddr *__name, socklen_t *__namelen) -{ - __log_info_func(""); - int ret = orig_os_api.getsockname(m_fd, __name, __namelen); - if (ret) { - __log_info_dbg("getsockname failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getpeername(sockaddr *__name, socklen_t *__namelen) -{ - __log_info_func(""); - int ret = orig_os_api.getpeername(m_fd, __name, __namelen); - if (ret) { - __log_info_dbg("getpeername failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::setsockopt(int __level, int __optname, __const void *__optval, - socklen_t __optlen) -{ - __log_info_func(""); - int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); - if (ret) { - __log_info_dbg("setsockopt failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) -{ - __log_info_func(""); - int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); - if (ret) { - __log_info_dbg("getsockopt failed (ret=%d %m)", ret); - } - return ret; -} - -bool socket_fd_api::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array) -{ - NOT_IN_USE(p_poll_sn); - NOT_IN_USE(p_fd_array); - __log_info_funcall(""); - return false; -} - -void socket_fd_api::set_immediate_os_sample() -{ - __log_info_funcall(""); - return; -} - -void socket_fd_api::unset_immediate_os_sample() -{ - __log_info_funcall(""); - return; -} - -bool socket_fd_api::is_writeable() -{ - __log_info_funcall(""); - return true; -} - -bool socket_fd_api::is_errorable(int *errors) -{ - NOT_IN_USE(errors); - __log_info_funcall(""); - return false; -} - -void socket_fd_api::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) -{ - int epoll_fd = get_epoll_context_fd(); - - // Socket data - vlog_printf(log_level, "Fd number : %d\n", m_fd); - if (epoll_fd) { - vlog_printf(log_level, "Socket epoll Fd : %d\n", epoll_fd); - 
vlog_printf(log_level, "Socket epoll flags : 0x%x\n", m_fd_rec.events); - } -} - -ssize_t socket_fd_api::rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, - const int flags, sockaddr *__from, socklen_t *__fromlen, - struct msghdr *__msg) -{ - errno = 0; - switch (call_type) { - case RX_READ: - __log_info_func("calling os receive with orig read"); - return orig_os_api.read(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - - case RX_READV: - __log_info_func("calling os receive with orig readv"); - return orig_os_api.readv(m_fd, p_iov, sz_iov); - - case RX_RECV: - __log_info_func("calling os receive with orig recv"); - return orig_os_api.recv(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); - - case RX_RECVFROM: - __log_info_func("calling os receive with orig recvfrom"); - return orig_os_api.recvfrom(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, - __fromlen); - - case RX_RECVMSG: { - __log_info_func("calling os receive with orig recvmsg"); - return orig_os_api.recvmsg(m_fd, __msg, flags); - } - } - return (ssize_t)-1; -} - -ssize_t socket_fd_api::tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, - const int __flags, const sockaddr *__to, const socklen_t __tolen) -{ - errno = 0; - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - - switch (call_type) { - case TX_WRITE: - __log_info_func("calling os transmit with orig write"); - return orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - - case TX_WRITEV: - __log_info_func("calling os transmit with orig writev"); - return orig_os_api.writev(m_fd, p_iov, sz_iov); - - case TX_SEND: - __log_info_func("calling os transmit with orig send"); - return orig_os_api.send(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); - - case TX_SENDTO: - __log_info_func("calling os transmit with orig sendto"); - return orig_os_api.sendto(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, - __tolen); - - case TX_SENDMSG: { - msghdr __message; - memset(&__message, 0, sizeof(__message)); - __message.msg_iov = (iovec *)p_iov; - __message.msg_iovlen = sz_iov; - __message.msg_name = (void *)__to; - __message.msg_namelen = __tolen; - - __log_info_func("calling os transmit with orig sendmsg"); - return orig_os_api.sendmsg(m_fd, &__message, __flags); - } - default: - __log_info_func("calling undefined os call type!"); - break; - } - return (ssize_t)-1; -} - -int socket_fd_api::register_callback(xlio_recv_callback_t callback, void *context) -{ - NOT_IN_USE(callback); - NOT_IN_USE(context); - return -1; -} - -int socket_fd_api::recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count) -{ - NOT_IN_USE(pkts); - NOT_IN_USE(count); - return -1; -} - -int socket_fd_api::add_epoll_context(epfd_info *epfd) -{ - if (!m_econtext) { - // This socket is not registered to any epfd - m_econtext = epfd; - return 0; - } else { - // Currently XLIO does not support more then 1 epfd listed - errno = (m_econtext == epfd) ? 
EEXIST : ENOMEM; - return -1; - } -} - -void socket_fd_api::remove_epoll_context(epfd_info *epfd) -{ - if (m_econtext == epfd) { - m_econtext = NULL; - } -} - -void socket_fd_api::notify_epoll_context(uint32_t events) -{ - if (m_econtext) { - m_econtext->insert_epoll_event_cb(this, events); - } -} - -void socket_fd_api::notify_epoll_context_add_ring(ring *ring) -{ - if (m_econtext) { - m_econtext->increase_ring_ref_count(ring); - } -} - -void socket_fd_api::notify_epoll_context_remove_ring(ring *ring) -{ - if (m_econtext) { - m_econtext->decrease_ring_ref_count(ring); - } -} - -bool socket_fd_api::notify_epoll_context_verify(epfd_info *epfd) -{ - return m_econtext == epfd; -} - -void socket_fd_api::notify_epoll_context_fd_is_offloaded() -{ - if (m_econtext) { - m_econtext->remove_fd_from_epoll_os(m_fd); - } -} - -int socket_fd_api::get_epoll_context_fd() -{ - if (m_econtext) { - return m_econtext->get_epoll_fd(); - } - return 0; -} - -#if _BullseyeCoverage -#pragma BullseyeCoverage on -#endif diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h deleted file mode 100644 index 34ba49ab2..000000000 --- a/src/core/sock/socket_fd_api.h +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef SOCKET_FD_API_H -#define SOCKET_FD_API_H - -#include "config.h" -#include -#include "xlio_extra.h" - -#include -#include -#include - -#ifndef SOCK_NONBLOCK -#define SOCK_NONBLOCK 04000 -#endif -#ifndef SOCK_CLOEXEC -#define SOCK_CLOEXEC 02000000 -#endif -#ifndef SO_MAX_PACING_RATE -#define SO_MAX_PACING_RATE 47 -#endif - -#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) - -class cq_mgr; -class epfd_info; -class mem_buf_desc_t; - -struct epoll_fd_rec { - uint32_t events; - epoll_data epdata; - int offloaded_index; // offloaded fd index + 1 - - epoll_fd_rec() { reset(); } - - void reset() - { - this->events = 0; - memset(&this->epdata, 0, sizeof(this->epdata)); - this->offloaded_index = 0; - } -}; - -typedef enum { - TX_WRITE = 13, - TX_WRITEV, - TX_SEND, - TX_SENDTO, - TX_SENDMSG, - TX_FILE, - TX_UNDEF -} tx_call_t; - -enum { - TX_FLAG_NO_PARTIAL_WRITE = 1 << 0, -}; - -/* This structure describes the send operation attributes - * Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, - * TX_SENDMSG - */ -typedef struct xlio_tx_call_attr { - tx_call_t opcode; - struct _attr { - struct iovec *iov; - ssize_t sz_iov; - int flags; - struct sockaddr *addr; - socklen_t len; - const struct msghdr *hdr; - } attr; - - unsigned xlio_flags; - pbuf_desc priv; - - ~xlio_tx_call_attr() {}; - void clear(void) - { - opcode = TX_UNDEF; - memset(&attr, 0, sizeof(attr)); - memset(&priv, 0, sizeof(priv)); - priv.attr = PBUF_DESC_NONE; - xlio_flags = 0; - } - - xlio_tx_call_attr() { clear(); } -} xlio_tx_call_attr_t; - -typedef enum { RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG } rx_call_t; - -#define FD_ARRAY_MAX 24 -typedef struct { - // coverity[member_decl] - int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, - // the user of this array will need to handle it correctly - int fd_max; - int fd_count; -} fd_array_t; - -enum fd_type_t { - FD_TYPE_SOCKET = 0, - FD_TYPE_PIPE, -}; - -typedef xlio_list_t xlio_desc_list_t; - -/** - * - * class socket_fd_api - * - */ - -class socket_fd_api : public cleanable_obj { -public: - socket_fd_api(int fd); - virtual ~socket_fd_api(); - - virtual void setPassthrough() {} - virtual bool isPassthrough() { return false; } - - virtual int prepareListen() { return 0; } - - virtual void destructor_helper(); - - virtual int shutdown(int __how); - - virtual int listen(int backlog); - - virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); - - virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); - - virtual int bind(const sockaddr *__addr, socklen_t __addrlen); - - virtual int connect(const sockaddr *__to, socklen_t __tolen); - - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - virtual int getpeername(sockaddr *__name, socklen_t *__namelen); - - virtual int setsockopt(int __level, int __optname, __const void *__optval, socklen_t __optlen); - - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); - virtual int fcntl(int __cmd, unsigned long int __arg) = 0; - virtual int fcntl64(int __cmd, unsigned long int __arg) = 0; - - virtual int ioctl(unsigned long int __request, unsigned long int __arg) = 0; - - virtual ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, - int *p_flags = 0, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) = 0; - - virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); - - virtual bool 
is_writeable(); - - virtual bool is_errorable(int *errors); - - // Instructing the socket to immediately sample/un-sample the OS in receive flow - virtual void set_immediate_os_sample(); - virtual void unset_immediate_os_sample(); - - virtual bool is_outgoing() { return false; } - virtual bool is_incoming() { return false; } - virtual bool is_closable() { return true; } - virtual bool is_shadow_socket_present() { return m_fd >= 0; } - -#if defined(DEFINED_NGINX) - virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } - virtual void set_params_for_socket_pool() {} -#endif - - // In some cases we need the socket can't be deleted immidiatly - //(for example STREAME sockets) - // This prepares the socket for termination and return true if the - // Return val: true is the socket is already closable and false otherwise - virtual bool prepare_to_close(bool process_shutdown = false) - { - NOT_IN_USE(process_shutdown); - return is_closable(); - } - - virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; - - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - - virtual int register_callback(xlio_recv_callback_t callback, void *context); - - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); - - virtual int get_fd() const { return m_fd; }; - - // true if fd must be skipped from OS select() - // If m_n_sysvar_select_poll_os_ratio == 0, it means that user configured XLIO not to poll os - // (i.e. TRUE...) - virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; - - virtual fd_type_t get_type() = 0; - -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - // This socket options copy is currently implemented for nginx and for very specific options. - // This copy is called as part of fork() flow of nginx specifically. - // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, - // see is_inherited_option mechanism of sockinfo_tcp for an example. 
- virtual void copy_sockopt_fork(const socket_fd_api *copy_from) = 0; -#endif - - virtual void consider_rings_migration_rx() {} - virtual int add_epoll_context(epfd_info *epfd); - virtual void remove_epoll_context(epfd_info *epfd); - int get_epoll_context_fd(); - - // Calling OS transmit - ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, - const int __flags, const sockaddr *__to, const socklen_t __tolen); - - static inline size_t pendig_to_remove_node_offset(void) - { - return NODE_OFFSET(socket_fd_api, pendig_to_remove_node); - } - - static inline size_t socket_fd_list_node_offset(void) - { - return NODE_OFFSET(socket_fd_api, socket_fd_list_node); - } - - static inline size_t ep_ready_fd_node_offset(void) - { - return NODE_OFFSET(socket_fd_api, ep_ready_fd_node); - } - - static inline size_t ep_info_fd_node_offset(void) - { - return NODE_OFFSET(socket_fd_api, ep_info_fd_node); - } - - virtual int get_rings_num() { return 0; } - virtual bool check_rings() { return false; } - virtual int *get_rings_fds(int &res_length) - { - res_length = 0; - return NULL; - } - -protected: - void notify_epoll_context(uint32_t events); - void notify_epoll_context_add_ring(ring *ring); - void notify_epoll_context_remove_ring(ring *ring); - bool notify_epoll_context_verify(epfd_info *epfd); - void notify_epoll_context_fd_is_offloaded(); - - // Calling OS receive - ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, - sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); - -public: - list_node pendig_to_remove_node; - list_node socket_fd_list_node; - list_node ep_ready_fd_node; - uint32_t m_epoll_event_flags; - list_node ep_info_fd_node; - epoll_fd_rec m_fd_rec; - -protected: - // identification information - int m_fd; - const uint32_t m_n_sysvar_select_poll_os_ratio; - epfd_info *m_econtext; - -public: -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - bool m_is_for_socket_pool; // true when this fd will be used for socket pool on close - int m_back_log; -#endif -}; -#endif diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 855d57db9..ede97961d 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -59,78 +59,79 @@ #define si_logfunc __log_info_func #define si_logfuncall __log_info_funcall +const char *sockinfo::setsockopt_so_opt_to_str(int opt) +{ + switch (opt) { + case SO_REUSEADDR: + return "SO_REUSEADDR"; + case SO_REUSEPORT: + return "SO_REUSEPORT"; + case SO_BROADCAST: + return "SO_BROADCAST"; + case SO_RCVBUF: + return "SO_RCVBUF"; + case SO_SNDBUF: + return "SO_SNDBUF"; + case SO_TIMESTAMP: + return "SO_TIMESTAMP"; + case SO_TIMESTAMPNS: + return "SO_TIMESTAMPNS"; + case SO_BINDTODEVICE: + return "SO_BINDTODEVICE"; + case SO_ZEROCOPY: + return "SO_ZEROCOPY"; + case SO_XLIO_RING_ALLOC_LOGIC: + return "SO_XLIO_RING_ALLOC_LOGIC"; + case SO_MAX_PACING_RATE: + return "SO_MAX_PACING_RATE"; + case SO_XLIO_SHUTDOWN_RX: + return "SO_XLIO_SHUTDOWN_RX"; + case IPV6_V6ONLY: + return "IPV6_V6ONLY"; + case IPV6_ADDR_PREFERENCES: + return "IPV6_ADDR_PREFERENCES"; + default: + break; + } + return "UNKNOWN SO opt"; +} + sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) - : socket_fd_api(fd) - , m_reuseaddr(false) - , m_reuseport(false) - , m_flow_tag_enabled(false) - , m_b_blocking(true) - , m_b_pktinfo(false) - , m_b_rcvtstamp(false) - , m_b_rcvtstampns(false) - , m_b_zc(false) + : m_fd_context((void *)((uintptr_t)fd)) + , m_family(domain) + , m_fd(fd) + , 
m_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) , m_skip_cq_poll_in_rx(safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_ENABLE) - , m_n_tsing_flags(0) - , m_protocol(PROTO_UNDEFINED) - , m_src_sel_flags(0U) + , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) , m_lock_rcv(MULTILOCK_RECURSIVE, MODULE_NAME "::m_lock_rcv") , m_lock_snd(MODULE_NAME "::m_lock_snd") - , m_state(SOCKINFO_OPENED) - , m_family(domain) - , m_p_connected_dst_entry(NULL) , m_so_bindtodevice_ip(ip_address::any_addr(), domain) - , m_p_rx_ring(0) - , m_rx_reuse_buf_pending(false) - , m_rx_reuse_buf_postponed(false) , m_rx_ring_map_lock(MODULE_NAME "::m_rx_ring_map_lock") - , m_n_rx_pkt_ready_list_count(0) - , m_rx_pkt_ready_offset(0) - , m_rx_ready_byte_count(0) - , m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) - , m_n_sysvar_rx_poll_num(safe_mce_sys().rx_poll_num) , m_ring_alloc_log_rx(safe_mce_sys().ring_allocation_logic_rx, use_ring_locks) , m_ring_alloc_log_tx(safe_mce_sys().ring_allocation_logic_tx, use_ring_locks) - , m_pcp(0) - , m_rx_callback(NULL) - , m_rx_callback_context(NULL) - , m_fd_context((void *)((uintptr_t)m_fd)) - , m_flow_tag_id(0) - , m_rx_cq_wait_ctrl(safe_mce_sys().rx_cq_wait_ctrl) , m_n_uc_ttl_hop_lim(m_family == AF_INET ? safe_mce_sys().sysctl_reader.get_net_ipv4_ttl() : safe_mce_sys().sysctl_reader.get_net_ipv6_hop_limit()) - , m_bind_no_port(false) - , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) - , m_p_rings_fds(NULL) { - m_rx_epfd = orig_os_api.epoll_create(128); + m_rx_epfd = SYSCALL(epoll_create, 128); if (unlikely(m_rx_epfd == -1)) { throw_xlio_exception("create internal epoll"); } - wakeup_set_epoll_fd(m_rx_epfd); + m_sock_wakeup_pipe.wakeup_set_epoll_fd(m_rx_epfd); if (m_fd == SOCKET_FAKE_FD) { m_fd = m_rx_epfd; m_fd_context = (void *)((uintptr_t)m_fd); } - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); - m_p_socket_stats = &m_socket_stats; // Save stats as local copy and allow state publisher to - // copy from this location socket_stats_init(); - xlio_stats_instance_create_socket_block(m_p_socket_stats); + m_rx_reuse_buff.n_buff_num = 0; memset(&m_so_ratelimit, 0, sizeof(xlio_rate_limit_t)); set_flow_tag(m_fd + 1); atomic_set(&m_zckey, 0); - m_last_zcdesc = NULL; - - m_socketxtreme.ec_cache.clear(); - struct ring_ec ec; - ec.clear(); - m_socketxtreme.ec_cache.push_back(ec); - m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); m_connected.set_sa_family(m_family); m_bound.set_sa_family(m_family); @@ -147,13 +148,8 @@ sockinfo::~sockinfo() // Change to non-blocking socket so calling threads can exit m_b_blocking = false; - // This will wake up any blocked thread in rx() call to orig_os_api.epoll_wait() - orig_os_api.close(m_rx_epfd); - - if (m_p_rings_fds) { - delete[] m_p_rings_fds; - m_p_rings_fds = NULL; - } + // This will wake up any blocked thread in rx() call to SYSCALL(epoll_wait, ) + SYSCALL(close, m_rx_epfd); while (!m_error_queue.empty()) { mem_buf_desc_t *buff = m_error_queue.get_and_pop_front(); @@ -165,13 +161,39 @@ sockinfo::~sockinfo() } } - xlio_stats_instance_remove_socket_block(m_p_socket_stats); + if (m_has_stats) { + xlio_stats_instance_remove_socket_block(m_p_socket_stats); + sock_stats::instance().return_stats_obj(m_p_socket_stats); + } + + bool toclose = safe_mce_sys().deferred_close && m_fd >= 0; + +#if defined(DEFINED_NGINX) + if (g_p_app->type == APP_NGINX) { + // Sockets from a socket 
pool are not closed during close(), so do it now. + toclose = toclose || (m_is_for_socket_pool && m_fd >= 0); + } +#endif - m_socketxtreme.ec_cache.clear(); + if (toclose) { + SYSCALL(close, m_fd); + } } -void sockinfo::socket_stats_init(void) +void sockinfo::socket_stats_init() { + if (!m_has_stats) { // This check is for listen sockets. + m_p_socket_stats = sock_stats::instance().get_stats_obj(); + if (!m_p_socket_stats) { + m_p_socket_stats = &sock_stats::t_dummy_stats; + return; + } + + m_has_stats = true; + // Save stats as local copy and allow state publisher to copy from this location + xlio_stats_instance_create_socket_block(m_p_socket_stats); + } + m_p_socket_stats->reset(); m_p_socket_stats->fd = m_fd; m_p_socket_stats->inode = fd2inode(m_fd); @@ -180,10 +202,43 @@ void sockinfo::socket_stats_init(void) m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic_rx.calc_res_key_by_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); m_p_socket_stats->sa_family = m_family; } +ring_ec *sockinfo::pop_next_ec() +{ + if (likely(m_socketxtreme_ec_first)) { + ring_ec *temp = m_socketxtreme_ec_first; + m_socketxtreme_ec_first = m_socketxtreme_ec_first->next; + if (likely(!m_socketxtreme_ec_first)) { // We likely to have a single ec most of the time. + m_socketxtreme_ec_last = nullptr; + } + + return temp; + } + + return nullptr; +} + +ring_ec *sockinfo::clear_ecs() +{ + ring_ec *temp = m_socketxtreme_ec_first; + m_socketxtreme_ec_first = m_socketxtreme_ec_last = nullptr; + return temp; +} + +void sockinfo::add_ec(ring_ec *ec) +{ + memset(&ec->completion, 0, sizeof(ec->completion)); + if (likely(!m_socketxtreme_ec_last)) { + m_socketxtreme_ec_last = m_socketxtreme_ec_first = ec; + } else { + m_socketxtreme_ec_last->next = ec; + m_socketxtreme_ec_last = ec; + } +} + void sockinfo::set_blocking(bool is_blocked) { si_logdbg("set socket to %s mode", is_blocked ? "blocked" : "non-blocking"); @@ -251,7 +306,7 @@ int sockinfo::fcntl(int __cmd, unsigned long int __arg) } si_logdbg("going to OS for fcntl cmd=%d, arg=%#lx", __cmd, __arg); - return orig_os_api.fcntl(m_fd, __cmd, __arg); + return SYSCALL(fcntl, m_fd, __cmd, __arg); } int sockinfo::fcntl64(int __cmd, unsigned long int __arg) @@ -263,7 +318,19 @@ int sockinfo::fcntl64(int __cmd, unsigned long int __arg) } si_logdbg("going to OS for fcntl64 cmd=%d, arg=%#lx", __cmd, __arg); - return orig_os_api.fcntl64(m_fd, __cmd, __arg); + return SYSCALL(fcntl64, m_fd, __cmd, __arg); +} + +int sockinfo::get_epoll_context_fd() +{ + return (has_epoll_context() ? 
m_econtext->get_epoll_fd() : 0); +} + +void sockinfo::insert_epoll_event(uint64_t events) +{ + if (has_epoll_context()) { + m_econtext->insert_epoll_event_cb(this, static_cast(events)); + } } int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) @@ -276,7 +343,7 @@ int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) update_header_field(&du); m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); } if ((attr->comp_mask & XLIO_RING_ALLOC_MASK_RING_INGRESS) && attr->ingress) { ring_alloc_logic_attr old_key(*m_ring_alloc_logic_rx.get_key()); @@ -284,7 +351,7 @@ int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) if (set_ring_attr_helper(&m_ring_alloc_log_rx, attr)) { return SOCKOPT_NO_XLIO_SUPPORT; } - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); if (m_rx_nd_map.size()) { std::lock_guard locker(m_rx_migration_lock); @@ -314,7 +381,7 @@ void sockinfo::set_ring_logic_rx(ring_alloc_logic_attr ral) { if (m_rx_ring_map.empty()) { m_ring_alloc_log_rx = ral; - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); m_p_socket_stats->ring_alloc_logic_rx = m_ring_alloc_log_rx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic_rx.calc_res_key_by_logic(); } @@ -326,7 +393,7 @@ void sockinfo::set_ring_logic_tx(ring_alloc_logic_attr ral) m_ring_alloc_log_tx = ral; m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); } } @@ -376,7 +443,7 @@ int sockinfo::ioctl(unsigned long int __request, unsigned long int __arg) } si_logdbg("going to OS for ioctl request=%lu, flags=%#lx", __request, __arg); - return orig_os_api.ioctl(m_fd, __request, __arg); + return SYSCALL(ioctl, m_fd, __request, __arg); } int sockinfo::setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) @@ -395,59 +462,6 @@ int sockinfo::setsockopt(int __level, int __optname, const void *__optval, sockl errno = EINVAL; } break; - case SO_XLIO_RING_USER_MEMORY: - if (__optval) { - if (__optlen == sizeof(iovec)) { - iovec *attr = (iovec *)__optval; - m_ring_alloc_log_rx.set_memory_descriptor(*attr); - m_ring_alloc_logic_rx = - ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); - if (m_p_rx_ring || m_rx_ring_map.size()) { - si_logwarn("user asked to assign memory for " - "RX ring but ring already exists"); - } - ret = SOCKOPT_INTERNAL_XLIO_SUPPORT; - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SOL_SOCKET, SO_XLIO_RING_USER_MEMORY - " - "bad length expected %zu got %d", - sizeof(iovec), __optlen); - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SOL_SOCKET, SO_XLIO_RING_USER_MEMORY - NOT HANDLED, optval == NULL"); - } - break; - case SO_XLIO_FLOW_TAG: - if (__optval) { - if (__optlen == sizeof(uint32_t)) { - if (set_flow_tag(*(uint32_t *)__optval)) { - si_logdbg("SO_XLIO_FLOW_TAG, set " - "socket fd: %d 
to flow id: %d", - m_fd, m_flow_tag_id); - // not supported in OS - ret = SOCKOPT_INTERNAL_XLIO_SUPPORT; - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SO_XLIO_FLOW_TAG, bad length " - "expected %zu got %d", - sizeof(uint32_t), __optlen); - break; - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SO_XLIO_FLOW_TAG - NOT HANDLED, " - "optval == NULL"); - } - break; case SO_REUSEADDR: if (__optval && __optlen == sizeof(int)) { @@ -734,14 +748,6 @@ int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t * errno = EINVAL; } break; - case SO_XLIO_FLOW_TAG: - if (*__optlen >= sizeof(uint32_t)) { - *(uint32_t *)__optval = m_flow_tag_id; - ret = 0; - } else { - errno = EINVAL; - } - break; case SO_MAX_PACING_RATE: if (*__optlen == sizeof(struct xlio_rate_limit_t)) { *(struct xlio_rate_limit_t *)__optval = m_so_ratelimit; @@ -804,7 +810,7 @@ int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t * } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) -void sockinfo::copy_sockopt_fork(const socket_fd_api *copy_from) +void sockinfo::copy_sockopt_fork(const sockinfo *copy_from) { const sockinfo *skinfo = dynamic_cast(copy_from); if (skinfo) { @@ -832,7 +838,7 @@ int sockinfo::get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in assert(g_p_fd_collection); int map_size = g_p_fd_collection->get_fd_map_size(); for (int i = 0; i < map_size; i++) { - socket_fd_api *p_sock_i = g_p_fd_collection->get_sockfd(i); + sockinfo *p_sock_i = g_p_fd_collection->get_sockfd(i); if (!p_sock_i || p_sock_i->get_type() != FD_TYPE_SOCKET) { continue; } @@ -847,7 +853,7 @@ int sockinfo::get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in void sockinfo::save_stats_rx_offload(int nbytes) { - if (nbytes < 0) { + if (unlikely(has_stats()) && nbytes < 0) { if (errno == EAGAIN) { m_p_socket_stats->counters.n_rx_eagain++; } else { @@ -880,15 +886,6 @@ void sockinfo::save_stats_tx_os(int bytes) } } -size_t sockinfo::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, - int *p_out_flags) -{ - NOT_IN_USE(payload_size); - NOT_IN_USE(in_flags); - *p_out_flags &= ~MSG_TRUNC; // don't handle msg_trunc - return total_rx; -} - bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key) { // This function should be called from within mutex protected context of the sockinfo!!! @@ -910,7 +907,7 @@ bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key) // Allocate resources on specific interface (create ring) net_device_resources_t *p_nd_resources = create_nd_resources(ip_addr(flow_key.get_local_if(), flow_key.get_family())); - if (NULL == p_nd_resources) { + if (!p_nd_resources) { // any error which occurred inside create_nd_resources() was already printed. 
No need to // reprint errors here return false; @@ -1021,7 +1018,7 @@ bool sockinfo::detach_receiver(flow_tuple_with_local_if &flow_key) net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) { - net_device_resources_t *p_nd_resources = NULL; + net_device_resources_t *p_nd_resources = nullptr; // Check if we are already registered to net_device with the local ip as observers rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local); @@ -1030,12 +1027,12 @@ net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) // Need to register as observer to net_device net_device_resources_t nd_resources; nd_resources.refcnt = 0; - nd_resources.p_nde = NULL; - nd_resources.p_ndv = NULL; - nd_resources.p_ring = NULL; + nd_resources.p_nde = nullptr; + nd_resources.p_ndv = nullptr; + nd_resources.p_ring = nullptr; BULLSEYE_EXCLUDE_BLOCK_START - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; net_device_val *net_dev = g_p_net_device_table_mgr->get_net_device_val(ip_local); if (!net_dev || !g_p_net_device_table_mgr->register_observer(net_dev->get_if_idx(), &m_rx_nd_observer, @@ -1102,12 +1099,12 @@ net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) return p_nd_resources; err: - return NULL; + return nullptr; } bool sockinfo::destroy_nd_resources(const ip_addr &ip_local) { - net_device_resources_t *p_nd_resources = NULL; + net_device_resources_t *p_nd_resources = nullptr; rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local); BULLSEYE_EXCLUDE_BLOCK_START @@ -1187,7 +1184,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) if (rc < 0) { si_logerr("Failed to release ring for allocation key %s", old_key.to_str().c_str()); new_key->set_user_id_key(old_calc_id); - m_ring_alloc_logic_rx.enable_migration(false); + m_ring_alloc_logic_rx.disable_migration(); si_logwarn("Migration is disabled due to failure"); } lock_rx_q(); @@ -1200,7 +1197,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) si_logerr("Failed to reserve ring for allocation key %s on lip %s", new_key->to_str().c_str(), ip_local.to_str().c_str()); new_key->set_user_id_key(old_calc_id); - m_ring_alloc_logic_rx.enable_migration(false); + m_ring_alloc_logic_rx.disable_migration(); si_logwarn("Migration is disabled due to failure"); lock_rx_q(); rx_nd_iter++; @@ -1232,7 +1229,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) si_logerr("Failed to release ring for allocation key %s", new_key->to_str().c_str()); } - new_ring = NULL; + new_ring = nullptr; break; } lock_rx_q(); @@ -1258,7 +1255,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) si_logerr("Failed to reserve ring for allocation key %s on lip %s", new_key->to_str().c_str(), ip_local.to_str(m_family).c_str()); new_key->set_user_id_key(old_calc_id); - m_ring_alloc_logic_rx.enable_migration(false); + m_ring_alloc_logic_rx.disable_migration(); si_logwarn("Migration is disabled due to failure"); lock_rx_q(); rx_nd_iter++; @@ -1334,7 +1331,15 @@ int sockinfo::add_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - ret = socket_fd_api::add_epoll_context(epfd); + if (!m_econtext && !safe_mce_sys().enable_socketxtreme) { + // This socket is not registered to any epfd + m_econtext = epfd; + } else { + // Currently XLIO does not support more then 1 epfd listed + errno = (m_econtext == epfd) ? 
EEXIST : ENOMEM; + ret = -1; + } + if (ret < 0) { goto unlock_locks; } @@ -1345,7 +1350,9 @@ int sockinfo::add_epoll_context(epfd_info *epfd) sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - notify_epoll_context_add_ring(sock_ring_map_iter->first); + if (has_epoll_context()) { + m_econtext->increase_ring_ref_count(sock_ring_map_iter->first); + } sock_ring_map_iter++; } @@ -1362,7 +1369,7 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - if (!notify_epoll_context_verify(epfd)) { + if (!has_epoll_context() || m_econtext != epfd) { unlock_rx_q(); m_rx_ring_map_lock.unlock(); return; @@ -1370,11 +1377,14 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) rx_ring_map_t::const_iterator sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - notify_epoll_context_remove_ring(sock_ring_map_iter->first); + m_econtext->decrease_ring_ref_count(sock_ring_map_iter->first); sock_ring_map_iter++; } - socket_fd_api::remove_epoll_context(epfd); + if (m_econtext == epfd) { + m_econtext = NULL; + } + if (safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_EPOLL_ONLY) { m_skip_cq_poll_in_rx = false; } @@ -1401,7 +1411,14 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) bool b_any_activity = false; - socket_fd_api::statistics_print(log_level); + int epoll_fd = get_epoll_context_fd(); + + // Socket data + vlog_printf(log_level, "Fd number : %d\n", m_fd); + if (epoll_fd) { + vlog_printf(log_level, "Socket epoll Fd : %d\n", epoll_fd); + vlog_printf(log_level, "Socket epoll flags : 0x%x\n", m_fd_rec.events); + } vlog_printf(log_level, "Bind info : %s\n", m_bound.to_str_ip_port(true).c_str()); vlog_printf(log_level, "Connection info : %s\n", m_connected.to_str_ip_port(true).c_str()); @@ -1418,6 +1435,10 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) m_p_connected_dst_entry->is_offloaded() ? 
"true" : "false"); } + if (!has_stats()) { + return; + } + if (m_p_socket_stats->ring_alloc_logic_rx == RING_LOGIC_PER_USER_ID) { vlog_printf(log_level, "RX Ring User ID : %lu\n", m_p_socket_stats->ring_user_id_rx); } @@ -1476,10 +1497,9 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets; } - vlog_printf(log_level, "Rx byte : max %d / dropped %d (%2.2f%%) / limit %d\n", + vlog_printf(log_level, "Rx byte : max %d / dropped %d (%2.2f%%)\n", m_p_socket_stats->counters.n_rx_ready_byte_max, - m_p_socket_stats->counters.n_rx_ready_byte_drop, rx_drop_percentage, - m_p_socket_stats->n_rx_ready_byte_limit); + m_p_socket_stats->counters.n_rx_ready_byte_drop, rx_drop_percentage); if (m_p_socket_stats->n_rx_ready_pkt_count) { rx_drop_percentage = (float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / @@ -1524,10 +1544,10 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) // Sleep on different CQs and OS listen socket int sockinfo::os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents) { - if (unlikely(m_rx_cq_wait_ctrl)) { + if (unlikely(safe_mce_sys().rx_cq_wait_ctrl)) { add_cqfd_to_sock_rx_epfd(m_p_rx_ring); int ret = - orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); + SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); remove_cqfd_from_sock_rx_epfd(m_p_rx_ring); return ret; } @@ -1537,14 +1557,14 @@ int sockinfo::os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents) int sockinfo::os_epoll_wait(epoll_event *ep_events, int maxevents) { - return orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); + return SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); } // Add this new CQ channel fd to the rx epfd handle (no need to wake up any sleeping thread about // this new fd) void sockinfo::add_cqfd_to_sock_rx_epfd(ring *p_ring) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; size_t num_ring_rx_fds; int *ring_rx_fds_array = p_ring->get_rx_channel_fds(num_ring_rx_fds); @@ -1553,7 +1573,7 @@ void sockinfo::add_cqfd_to_sock_rx_epfd(ring *p_ring) ev.data.fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { + if (unlikely(SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { si_logerr("failed to add cq channel fd to internal epfd errno=%d (%m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1568,7 +1588,7 @@ void sockinfo::remove_cqfd_from_sock_rx_epfd(ring *base_ring) for (size_t i = 0; i < num_ring_rx_fds; i++) { BULLSEYE_EXCLUDE_BLOCK_START if (unlikely( - (orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL)) && + (SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], nullptr)) && (!(errno == ENOENT || errno == EBADF)))) { si_logerr("failed to delete cq channel fd from internal epfd (errno=%d %s)", errno, strerror(errno)); @@ -1611,17 +1631,17 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) // each event on the cq-fd. This causes high latency and increased CPU usage by the Kernel // which leads to decreased performance. For example, for 350K connections and a single // ring. there will be 350K epfds watching a single cq-fd. When this cq-fd has an event, the - // Kernel loops through all the 350K epfds. 
By setting m_rx_cq_wait_ctrl=true, we add the - // cq-fd only to the epfds of the sockets that are going to sleep inside + // Kernel loops through all the 350K epfds. By setting safe_mce_sys().rx_cq_wait_ctrl=true, + // we add the cq-fd only to the epfds of the sockets that are going to sleep inside // sockinfo_tcp::rx_wait_helper/sockinfo_udp::rx_wait. - if (!m_rx_cq_wait_ctrl) { + if (!safe_mce_sys().rx_cq_wait_ctrl) { add_cqfd_to_sock_rx_epfd(p_ring); } - do_wakeup(); // A ready wce can be pending due to the drain logic (cq channel will not wake - // up by itself) + // A ready wce can be pending due to the drain logic (cq channel will not wake up by itself) + m_sock_wakeup_pipe.do_wakeup(); } else { - // Increase ref count on cq_mgr object + // Increase ref count on cq_mgr_rx object rx_ring_iter->second->refcnt++; } @@ -1633,7 +1653,9 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. - notify_epoll_context_add_ring(p_ring); + if (has_epoll_context()) { + m_econtext->increase_ring_ref_count(p_ring); + } } lock_rx_q(); @@ -1645,7 +1667,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) bool notify_epoll = false; - // Remove the rx cq_mgr from our rx cq map + // Remove the rx cq_mgr_rx from our rx cq map unlock_rx_q(); m_rx_ring_map_lock.lock(); lock_rx_q(); @@ -1662,13 +1684,13 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) if (rx_ring_iter != m_rx_ring_map.end()) { BULLSEYE_EXCLUDE_BLOCK_END ring_info_t *p_ring_info = rx_ring_iter->second; - // Decrease ref count on cq_mgr object + // Decrease ref count on cq_mgr_rx object p_ring_info->refcnt--; - // Is this the last reference to this cq_mgr? + // Is this the last reference to this cq_mgr_rx? if (p_ring_info->refcnt == 0) { - // Move all cq_mgr->rx_reuse buffers to temp reuse queue related to p_rx_cq_mgr + // Move all cq_mgr_rx->rx_reuse buffers to temp reuse queue related to p_rx_cq_mgr move_descs(base_ring, &temp_rx_reuse, &p_ring_info->rx_reuse_info.rx_reuse, true); move_descs(base_ring, &temp_rx_reuse_global, &p_ring_info->rx_reuse_info.rx_reuse, false); @@ -1678,7 +1700,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) p_ring_info->rx_reuse_info.rx_reuse.size()); } - if (!m_rx_cq_wait_ctrl) { + if (!safe_mce_sys().rx_cq_wait_ctrl) { remove_cqfd_from_sock_rx_epfd(base_ring); } @@ -1688,18 +1710,13 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) delete p_ring_info; if (m_p_rx_ring == base_ring) { - /* Ring should not have completion events related closed socket - * in wait list - */ - for (auto &ec : m_socketxtreme.ec_cache) { - if (0 != ec.completion.events) { - m_p_rx_ring->del_ec(&ec); - } - } + // Ring should not have completion events related closed socket in wait list + m_p_rx_ring->socketxtreme_ec_clear_sock(this); + if (m_rx_ring_map.size() == 1) { m_p_rx_ring = m_rx_ring_map.begin()->first; } else { - m_p_rx_ring = NULL; + m_p_rx_ring = nullptr; } move_descs(base_ring, &temp_rx_reuse, &m_rx_reuse_buff.rx_reuse, true); @@ -1719,7 +1736,9 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. 
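Note on the pattern used in the rx_add_ring_cb()/rx_del_ring_cb() hunks above: the old notify_epoll_context_*_ring() helpers are replaced by a direct call on m_econtext guarded by has_epoll_context(), which the sockinfo.h changes later in this diff define as "socketxtreme mode is off and an epfd is attached". A minimal sketch of that guard follows; ring, epfd_info and mini_sock are reduced stand-ins for illustration only, not the real XLIO classes.

#include <cstdio>

struct ring {};

struct epfd_info {
    int ring_refs = 0;
    void increase_ring_ref_count(ring *) { ++ring_refs; }
    void decrease_ring_ref_count(ring *) { --ring_refs; }
};

struct mini_sock {
    bool enable_socketxtreme = false; // stands in for safe_mce_sys().enable_socketxtreme
    epfd_info *m_econtext = nullptr;

    // Socketxtreme bypasses epoll entirely, so an attached epfd is ignored in that mode.
    bool has_epoll_context() const { return !enable_socketxtreme && m_econtext; }

    void rx_add_ring_cb(ring *p_ring)
    {
        if (has_epoll_context()) {
            m_econtext->increase_ring_ref_count(p_ring);
        }
    }

    void rx_del_ring_cb(ring *p_ring)
    {
        if (has_epoll_context()) {
            m_econtext->decrease_ring_ref_count(p_ring);
        }
    }
};

int main()
{
    ring r;
    epfd_info e;
    mini_sock s;
    s.m_econtext = &e;
    s.rx_add_ring_cb(&r);
    s.rx_del_ring_cb(&r);
    std::printf("ring refs after add+del: %d\n", e.ring_refs); // prints 0
    return 0;
}

The mode check matters because in socketxtreme mode completions are delivered through the socket's own event chain rather than through epoll, so the epoll ring bookkeeping is skipped in that mode.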
- notify_epoll_context_remove_ring(base_ring); + if (has_epoll_context()) { + m_econtext->decrease_ring_ref_count(base_ring); + } } // no need for m_lock_rcv since temp_rx_reuse is on the stack @@ -1959,10 +1978,10 @@ void sockinfo::destructor_helper() if (m_p_connected_dst_entry) { delete m_p_connected_dst_entry; } - m_p_connected_dst_entry = NULL; + m_p_connected_dst_entry = nullptr; } -int sockinfo::register_callback(xlio_recv_callback_t callback, void *context) +int sockinfo::register_callback_ctx(xlio_recv_callback_t callback, void *context) { m_rx_callback = callback; m_rx_callback_context = context; @@ -1994,53 +2013,53 @@ int sockinfo::modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t int sockinfo::get_rings_num() { - int count = 0; + size_t count = 0; size_t num_rx_channel_fds; - if (is_socketxtreme()) { - /* socketXtreme mode support just single ring */ - return 1; + + ring *tx_ring = m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ring() : nullptr; + if (tx_ring) { + (void)tx_ring->get_rx_channel_fds(count); } - rx_ring_map_t::iterator it = m_rx_ring_map.begin(); - for (; it != m_rx_ring_map.end(); ++it) { - (void)it->first->get_rx_channel_fds(num_rx_channel_fds); - count += (int)num_rx_channel_fds; + for (auto pair : m_rx_ring_map) { + if (tx_ring == pair.first) { + continue; + } + (void)pair.first->get_rx_channel_fds(num_rx_channel_fds); + count += num_rx_channel_fds; } - return count; + return static_cast(count); } -int *sockinfo::get_rings_fds(int &res_length) +int sockinfo::get_rings_fds(int *ring_fds, int ring_fds_sz) { - res_length = 0; - int index = 0; size_t num_rx_channel_fds; + int *channel_fds; + int index = 0; - if (is_socketxtreme()) { - /* socketXtreme mode support just single ring */ - res_length = 1; - return m_p_rx_ring->get_rx_channel_fds(num_rx_channel_fds); - } - - if (m_p_rings_fds) { - return m_p_rings_fds; + /* + * We return RX channels for the TX ring to make it consistent and comparable with the RX + * rings. The channels are used only as indirect pointers to the rings, therefore, this + * doesn't introduce any functionality issues. + */ + ring *tx_ring = m_p_connected_dst_entry ? 
m_p_connected_dst_entry->get_ring() : nullptr; + if (tx_ring) { + channel_fds = tx_ring->get_rx_channel_fds(num_rx_channel_fds); + for (size_t i = 0; i < num_rx_channel_fds && index < ring_fds_sz; ++i) { + ring_fds[index++] = channel_fds[i]; + } } - res_length = get_rings_num(); - m_p_rings_fds = new int[res_length]; - rx_ring_map_t::iterator it = m_rx_ring_map.begin(); - for (; it != m_rx_ring_map.end(); ++it) { - int *p_n_rx_channel_fds = it->first->get_rx_channel_fds(num_rx_channel_fds); - for (size_t j = 0; j < num_rx_channel_fds; ++j) { - int fd = p_n_rx_channel_fds[j]; - if (fd != -1) { - m_p_rings_fds[index] = fd; - ++index; - } else { - si_logdbg("got ring with fd -1"); - } + for (auto pair : m_rx_ring_map) { + if (tx_ring == pair.first) { + continue; + } + channel_fds = pair.first->get_rx_channel_fds(num_rx_channel_fds); + for (size_t i = 0; i < num_rx_channel_fds && index < ring_fds_sz; ++i) { + ring_fds[index++] = channel_fds[i]; } } - return m_p_rings_fds; + return index; } int sockinfo::setsockopt_kernel(int __level, int __optname, const void *__optval, @@ -2075,7 +2094,7 @@ int sockinfo::setsockopt_kernel(int __level, int __optname, const void *__optval } si_logdbg("going to OS for setsockopt level %d optname %d", __level, __optname); - int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { if (EPERM == errno && allow_privileged) { @@ -2174,7 +2193,7 @@ void sockinfo::handle_recv_timestamping(struct cmsg_state *cm_state) void sockinfo::handle_recv_errqueue(struct cmsg_state *cm_state) { - mem_buf_desc_t *buff = NULL; + mem_buf_desc_t *buff = nullptr; if (m_error_queue.empty()) { return; @@ -2224,7 +2243,7 @@ void sockinfo::insert_cmsg(struct cmsg_state *cm_state, int level, int type, voi (struct cmsghdr *)((char *)cm_state->cmhdr + CMSG_ALIGN(cm_state->cmhdr->cmsg_len)); if ((char *)(next + 1) > ((char *)cm_state->mhdr->msg_control + cm_state->mhdr->msg_controllen)) { - cm_state->cmhdr = NULL; + cm_state->cmhdr = nullptr; } else { cm_state->cmhdr = next; } @@ -2250,3 +2269,101 @@ void sockinfo::handle_cmsg(struct msghdr *msg, int flags) cm_state.mhdr->msg_controllen = cm_state.cmsg_bytes_consumed; } + +ssize_t sockinfo::rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, + sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) +{ + errno = 0; + switch (call_type) { + case RX_READ: + __log_info_func("calling os receive with orig read"); + return SYSCALL(read, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case RX_READV: + __log_info_func("calling os receive with orig readv"); + return SYSCALL(readv, m_fd, p_iov, sz_iov); + + case RX_RECV: + __log_info_func("calling os receive with orig recv"); + return SYSCALL(recv, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); + + case RX_RECVFROM: + __log_info_func("calling os receive with orig recvfrom"); + return SYSCALL(recvfrom, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, + __fromlen); + + case RX_RECVMSG: { + __log_info_func("calling os receive with orig recvmsg"); + return SYSCALL(recvmsg, m_fd, __msg, flags); + } + } + return (ssize_t)-1; +} + +ssize_t sockinfo::tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, + const int __flags, const sockaddr *__to, const socklen_t __tolen) +{ + errno = 0; + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + 
switch (call_type) { + case TX_WRITE: + __log_info_func("calling os transmit with orig write"); + return SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case TX_WRITEV: + __log_info_func("calling os transmit with orig writev"); + return SYSCALL(writev, m_fd, p_iov, sz_iov); + + case TX_SEND: + __log_info_func("calling os transmit with orig send"); + return SYSCALL(send, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); + + case TX_SENDTO: + __log_info_func("calling os transmit with orig sendto"); + return SYSCALL(sendto, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, __tolen); + + case TX_SENDMSG: { + msghdr __message; + memset(&__message, 0, sizeof(__message)); + __message.msg_iov = (iovec *)p_iov; + __message.msg_iovlen = sz_iov; + __message.msg_name = (void *)__to; + __message.msg_namelen = __tolen; + + __log_info_func("calling os transmit with orig sendmsg"); + return SYSCALL(sendmsg, m_fd, &__message, __flags); + } + default: + __log_info_func("calling undefined os call type!"); + break; + } + return (ssize_t)-1; +} + +int sockinfo::handle_exception_flow() +{ + if (safe_mce_sys().exception_handling.is_suit_un_offloading()) { + try_un_offloading(); + } + if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_RETURN_ERROR) { + errno = EINVAL; + return -1; + } + if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_ABORT) { + return -2; + } + return 0; +} + +bool sockinfo::skip_os_select() +{ + // If safe_mce_sys().select_poll_os_ratio == 0, it means that user configured XLIO not to poll + // os (i.e. TRUE...) + return (!safe_mce_sys().select_poll_os_ratio); +} diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 1037990ad..86a21c404 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -33,28 +33,29 @@ #include #include #include - +#include #include "config.h" +#include "xlio_extra.h" +#include "dev/cq_mgr_rx.h" +#include "dev/buffer_pool.h" +#include "sock/cleanable_obj.h" #include "vlogger/vlogger.h" #include "utils/lock_wrapper.h" -#include "xlio_extra.h" #include "util/data_updater.h" #include "util/sock_addr.h" #include "util/xlio_stats.h" #include "util/sys_vars.h" #include "util/wakeup_pipe.h" +#include "iomux/epfd_info.h" #include "proto/flow_tuple.h" #include "proto/mem_buf_desc.h" #include "proto/dst_entry.h" #include "dev/net_device_table_mgr.h" #include "dev/ring_simple.h" #include "dev/ring_allocation_logic.h" - -#include "socket_fd_api.h" -#include "pkt_rcvr_sink.h" -#include "pkt_sndr_source.h" #include "sock-redirect.h" #include "sock-app.h" +#include "sock_stats.h" #ifndef BASE_SOCKINFO_H #define BASE_SOCKINFO_H @@ -62,20 +63,18 @@ #define SI_RX_EPFD_EVENT_MAX 16 #define BYTE_TO_KB(byte_value) ((byte_value) / 125) #define KB_TO_BYTE(kbit_value) ((kbit_value)*125) +#define FD_ARRAY_MAX 24 -#if DEFINED_MISSING_NET_TSTAMP -enum { - SOF_TIMESTAMPING_TX_HARDWARE = (1 << 0), - SOF_TIMESTAMPING_TX_SOFTWARE = (1 << 1), - SOF_TIMESTAMPING_RX_HARDWARE = (1 << 2), - SOF_TIMESTAMPING_RX_SOFTWARE = (1 << 3), - SOF_TIMESTAMPING_SOFTWARE = (1 << 4), - SOF_TIMESTAMPING_SYS_HARDWARE = (1 << 5), - SOF_TIMESTAMPING_RAW_HARDWARE = (1 << 6), - SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | SOF_TIMESTAMPING_RAW_HARDWARE -}; -#else -#include +#ifndef SOCK_NONBLOCK +#define SOCK_NONBLOCK 04000 +#endif + +#ifndef SOCK_CLOEXEC +#define SOCK_CLOEXEC 02000000 +#endif + +#ifndef SO_MAX_PACING_RATE +#define SO_MAX_PACING_RATE 47 #endif #ifndef SO_TIMESTAMPNS @@ -106,14 +105,51 @@ enum { 
#define MSG_ZEROCOPY 0x4000000 #endif +#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) +#define NOTIFY_ON_EVENTS(context, events) context->set_events(events) + +// Sockinfo setsockopt() return values +// Internal socket option, should not pass request to OS. +#define SOCKOPT_INTERNAL_XLIO_SUPPORT 0 +// Socket option was found but not supported, error should be returned to user. +#define SOCKOPT_NO_XLIO_SUPPORT -1 +// Should pass to TCP/UDP level or OS. +#define SOCKOPT_PASS_TO_OS 1 +// Pass the option also to the OS. +#define SOCKOPT_HANDLE_BY_OS -2 + +#if DEFINED_MISSING_NET_TSTAMP +enum { + SOF_TIMESTAMPING_TX_HARDWARE = (1 << 0), + SOF_TIMESTAMPING_TX_SOFTWARE = (1 << 1), + SOF_TIMESTAMPING_RX_HARDWARE = (1 << 2), + SOF_TIMESTAMPING_RX_SOFTWARE = (1 << 3), + SOF_TIMESTAMPING_SOFTWARE = (1 << 4), + SOF_TIMESTAMPING_SYS_HARDWARE = (1 << 5), + SOF_TIMESTAMPING_RAW_HARDWARE = (1 << 6), + SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | SOF_TIMESTAMPING_RAW_HARDWARE +}; +#else +#include +#endif + +typedef enum { RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG } rx_call_t; + +enum { + TX_FLAG_NO_PARTIAL_WRITE = 1 << 0, +}; + +enum fd_type_t { + FD_TYPE_SOCKET = 0, + FD_TYPE_PIPE, +}; + struct cmsg_state { struct msghdr *mhdr; struct cmsghdr *cmhdr; size_t cmsg_bytes_consumed; }; -#define NOTIFY_ON_EVENTS(context, events) context->set_events(events) - struct buff_info_t { buff_info_t() { @@ -125,45 +161,85 @@ struct buff_info_t { descq_t rx_reuse; }; -typedef struct { +struct epoll_fd_rec { + uint32_t events; + epoll_data epdata; + int offloaded_index; // offloaded fd index + 1 + + epoll_fd_rec() { reset(); } + + void reset() + { + this->events = 0; + memset(&this->epdata, 0, sizeof(this->epdata)); + this->offloaded_index = 0; + } +}; + +struct net_device_resources_t { net_device_entry *p_nde; net_device_val *p_ndv; ring *p_ring; int refcnt; -} net_device_resources_t; - -typedef std::unordered_map rx_net_device_map_t; - -/* - * Sockinfo setsockopt() return values - */ -#define SOCKOPT_INTERNAL_XLIO_SUPPORT 0 // Internal socket option, should not pass request to OS. -#define SOCKOPT_NO_XLIO_SUPPORT \ - -1 // Socket option was found but not supported, error should be returned to user. -#define SOCKOPT_PASS_TO_OS 1 // Should pass to TCP/UDP level or OS. -#define SOCKOPT_HANDLE_BY_OS -2 // Pass the option also to the OS. 
+}; -typedef std::unordered_map rx_flow_map_t; +struct fd_array_t { + // coverity[member_decl] + int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, + // the user of this array will need to handle it correctly + int fd_max; + int fd_count; +}; -typedef struct { +struct ring_info_t { int refcnt; buff_info_t rx_reuse_info; -} ring_info_t; +}; +// This structure describes the send operation attributes +// Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, +// TX_SENDMSG +struct xlio_tx_call_attr_t { + tx_call_t opcode; + unsigned xlio_flags; + + struct _attr { + struct iovec *iov; + ssize_t sz_iov; + int flags; + socklen_t len; + struct sockaddr *addr; + const struct msghdr *hdr; + } attr; + + pbuf_desc priv; + + ~xlio_tx_call_attr_t() {}; + void clear(void) + { + opcode = TX_UNDEF; + memset(&attr, 0, sizeof(attr)); + memset(&priv, 0, sizeof(priv)); + priv.attr = PBUF_DESC_NONE; + xlio_flags = 0; + } + + xlio_tx_call_attr_t() { clear(); } +}; + +typedef std::unordered_map rx_net_device_map_t; +typedef xlio_list_t xlio_desc_list_t; +typedef std::unordered_map rx_flow_map_t; typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel const uint8_t ip_tos2prio[16] = {0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 4, 4, 4, 4}; -class sockinfo : public socket_fd_api, - public pkt_rcvr_sink, - public pkt_sndr_source, - public wakeup_pipe { -public: - sockinfo(int fd, int domain, bool use_ring_locks); - virtual ~sockinfo(); +class epfd_info; - enum sockinfo_state { +class sockinfo { +public: + enum sockinfo_state : uint16_t { SOCKINFO_UNDEFINED, SOCKINFO_OPENED, SOCKINFO_CLOSING, @@ -171,89 +247,146 @@ class sockinfo : public socket_fd_api, SOCKINFO_DESTROYING }; -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - virtual void copy_sockopt_fork(const socket_fd_api *copy_from); -#endif -#if defined(DEFINED_NGINX) - void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } -#endif - - virtual void consider_rings_migration_rx(); - virtual int add_epoll_context(epfd_info *epfd); - virtual void remove_epoll_context(epfd_info *epfd); - - inline bool set_flow_tag(uint32_t flow_tag_id) + static inline size_t pendig_to_remove_node_offset(void) { - if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { - m_flow_tag_id = flow_tag_id; - m_flow_tag_enabled = true; - return true; - } - m_flow_tag_id = FLOW_TAG_MASK; - return false; + return NODE_OFFSET(sockinfo, pendig_to_remove_node); } - inline bool get_reuseaddr(void) { return m_reuseaddr; } - inline bool get_reuseport(void) { return m_reuseport; } - inline bool flow_tag_enabled(void) { return m_flow_tag_enabled; } - inline int get_rx_epfd(void) { return m_rx_epfd; } - inline bool is_blocking(void) { return m_b_blocking; } - - bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } - virtual int *get_rings_fds(int &res_length); - virtual int get_rings_num(); - virtual bool check_rings() { return m_p_rx_ring ? 
true : false; } - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - uint32_t get_flow_tag_val() { return m_flow_tag_id; } - inline in_protocol_t get_protocol(void) { return m_protocol; } - bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; - void socket_stats_init(void); - - void sock_pop_descs_rx_ready(descq_t *cache) + static inline size_t socket_fd_list_node_offset(void) { - lock_rx_q(); - mem_buf_desc_t *temp; - const size_t size = get_size_m_rx_pkt_ready_list(); - - for (size_t i = 0; i < size; i++) { - temp = get_front_m_rx_pkt_ready_list(); - pop_front_m_rx_pkt_ready_list(); - cache->push_back(temp); - } - m_n_rx_pkt_ready_list_count = 0; - m_rx_ready_byte_count = 0; - m_p_socket_stats->n_rx_ready_pkt_count = 0; - m_p_socket_stats->n_rx_ready_byte_count = 0; - - unlock_rx_q(); + return NODE_OFFSET(sockinfo, socket_fd_list_node); } - sa_family_t get_family() { return m_family; } + static inline size_t ep_ready_fd_node_offset(void) + { + return NODE_OFFSET(sockinfo, ep_ready_fd_node); + } -protected: - inline void set_rx_reuse_pending(bool is_pending = true) + static inline size_t ep_info_fd_node_offset(void) { - m_rx_reuse_buf_pending = is_pending; + return NODE_OFFSET(sockinfo, ep_info_fd_node); } - virtual void set_blocking(bool is_blocked); + sockinfo(int fd, int domain, bool use_ring_locks); + virtual ~sockinfo(); + + // Callback from lower layer notifying new receive packets + // Return: 'true' if object queuing this receive packet + // 'false' if not interested in this receive packet + virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, + void *pv_fd_ready_array) = 0; + + virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; + virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) = 0; + virtual bool is_writeable() = 0; + virtual bool is_errorable(int *errors) = 0; + virtual void clean_socket_obj() = 0; + virtual void setPassthrough() = 0; + virtual bool isPassthrough() = 0; + virtual int prepareListen() = 0; + virtual int shutdown(int __how) = 0; + virtual int listen(int backlog) = 0; + virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen) = 0; + virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) = 0; + virtual int bind(const sockaddr *__addr, socklen_t __addrlen) = 0; + virtual int connect(const sockaddr *__to, socklen_t __tolen) = 0; + virtual int getsockname(sockaddr *__name, socklen_t *__namelen) = 0; + virtual int getpeername(sockaddr *__name, socklen_t *__namelen) = 0; + virtual int setsockopt(int __level, int __optname, __const void *__optval, + socklen_t __optlen) = 0; + virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) = 0; + virtual bool is_outgoing() = 0; + virtual bool is_incoming() = 0; + virtual bool is_closable() = 0; + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) = 0; + virtual int register_callback(xlio_recv_callback_t callback, void *context) = 0; virtual int fcntl(int __cmd, unsigned long int __arg); virtual int fcntl64(int __cmd, unsigned long int __arg); virtual int ioctl(unsigned long int __request, unsigned long int __arg); - virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); - int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, - int supported, bool allow_priv); - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + virtual fd_type_t get_type() = 0; + + virtual 
ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, + int *p_flags = 0, sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) = 0; + + virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) = 0; + + // Instruct the socket to immediately sample/un-sample the OS in receive flow + virtual void set_immediate_os_sample() = 0; + virtual void unset_immediate_os_sample() = 0; + + // In some cases the socket can't be deleted immediately + // (for example STREAM sockets). + // This prepares the socket for termination. + // Return val: true if the socket is already closable and false otherwise + virtual bool prepare_to_close(bool process_shutdown = false) = 0; + virtual bool skip_os_select(); // true if fd must be skipped from OS select() + + inline bool set_flow_tag(uint32_t flow_tag_id); + inline void sock_pop_descs_rx_ready(descq_t *cache); + + // Socketxtreme related. + ring_ec *pop_next_ec(); + ring_ec *clear_ecs(); + void add_ec(ring_ec *ec); + ring_ec *get_last_ec() { return m_socketxtreme_ec_last; } + bool has_next_ec() { return (m_socketxtreme_ec_first != nullptr); } + sockinfo *get_ec_ring_list_next() { return m_socketxtreme_ring_list_next; } + void set_ec_ring_list_next(sockinfo *sock) { m_socketxtreme_ring_list_next = sock; } + + bool has_epoll_context() { return (!safe_mce_sys().enable_socketxtreme && m_econtext); } + bool has_stats() const { return m_has_stats; } + bool get_rx_pkt_ready_list_count() const { return m_n_rx_pkt_ready_list_count; } + int get_fd() const { return m_fd; }; + sa_family_t get_family() { return m_family; } + bool get_reuseaddr(void) { return m_reuseaddr; } + bool get_reuseport(void) { return m_reuseport; } + int get_rx_epfd(void) { return m_rx_epfd; } + bool is_blocking(void) { return m_b_blocking; } + bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } + bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } + uint32_t get_flow_tag_val() { return m_flow_tag_id; } + in_protocol_t get_protocol(void) { return m_protocol; } + socket_stats_t *get_sock_stats() const { return m_p_socket_stats; } + rfs *get_rfs_ptr() const { return m_rfs_ptr; } + void set_rfs_ptr(rfs *r) { m_rfs_ptr = r; } + void destructor_helper(); + int get_rings_fds(int *ring_fds, int ring_fds_sz); + int get_rings_num(); + bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; + int register_callback_ctx(xlio_recv_callback_t callback, void *context); + void consider_rings_migration_rx(); + int add_epoll_context(epfd_info *epfd); + void remove_epoll_context(epfd_info *epfd); + int get_epoll_context_fd(); + + // Calling OS transmit + ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, + const int __flags, const sockaddr *__to, const socklen_t __tolen); +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + // This socket options copy is currently implemented for nginx and for very specific options. + // This copy is called as part of fork() flow of nginx specifically. + // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, + // see the is_inherited_option mechanism of sockinfo_tcp for an example.
+ void copy_sockopt_fork(const sockinfo *copy_from); +#if defined(DEFINED_NGINX) + virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } + virtual void set_params_for_socket_pool() {}; + void set_rx_num_buffs_reuse(int val) { m_rx_num_buffs_reuse = val; } +#endif +#endif +protected: + static const char *setsockopt_so_opt_to_str(int opt); + + virtual void lock_rx_q() = 0; + virtual void unlock_rx_q() = 0; + virtual void set_blocking(bool is_blocked); virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list() = 0; virtual size_t get_size_m_rx_pkt_ready_list() = 0; virtual void pop_front_m_rx_pkt_ready_list() = 0; virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) = 0; - - void save_stats_rx_os(int bytes); - void save_stats_tx_os(int bytes); - void save_stats_rx_offload(int nbytes); - virtual int rx_verify_available_data() = 0; virtual void update_header_field(data_updater *updater) = 0; virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) = 0; @@ -264,11 +397,32 @@ class sockinfo : public socket_fd_api, virtual void post_deqeue(bool release_buff) = 0; virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; - virtual int register_callback(xlio_recv_callback_t callback, void *context); + virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; + virtual bool try_un_offloading(); // un-offload the socket if possible virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, - int *p_out_flags); + int *p_out_flags) = 0; + + // This callback will notify that socket is ready to receive and map the cq. + virtual void rx_add_ring_cb(ring *p_ring); + virtual void rx_del_ring_cb(ring *p_ring); + + inline void set_rx_reuse_pending(bool is_pending = true); + inline void reuse_buffer(mem_buf_desc_t *buff); + inline xlio_socketxtreme_completion_t *set_events_socketxtreme(uint64_t events, + bool full_transaction); + inline void set_events(uint64_t events); + inline void save_strq_stats(uint32_t packet_strides); + + inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, + int in_flags, int *p_out_flags); + int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); + void notify_epoll_context(uint32_t events); + void save_stats_rx_os(int bytes); + void save_stats_tx_os(int bytes); + void save_stats_rx_offload(int nbytes); + void socket_stats_init(); bool attach_receiver(flow_tuple_with_local_if &flow_key); bool detach_receiver(flow_tuple_with_local_if &flow_key); net_device_resources_t *create_nd_resources(const ip_addr &ip_local); @@ -278,35 +432,15 @@ class sockinfo : public socket_fd_api, int set_ring_attr_helper(ring_alloc_logic_attr *sock_attr, xlio_ring_alloc_logic_attr *attr); void set_ring_logic_rx(ring_alloc_logic_attr ral); void set_ring_logic_tx(ring_alloc_logic_attr ral); - - // Attach to all relevant rings for offloading receive flows - always used from slow path - // According to bounded information we need to attach to all UC relevant flows - // If local_ip is ANY then we need to attach to all offloaded interfaces OR to the one our - // connected_ip is routed to - bool attach_as_uc_receiver(role_t role, bool skip_rules = false); - transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, - const struct sockaddr *sock_addr_second = NULL); - - // This callback will notify that socket is ready to receive and map the cq. 
- virtual void rx_add_ring_cb(ring *p_ring); - virtual void rx_del_ring_cb(ring *p_ring); - - virtual void lock_rx_q() { m_lock_rcv.lock(); } - virtual void unlock_rx_q() { m_lock_rcv.unlock(); } - void shutdown_rx(); - void destructor_helper(); int modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t &rate_limit); - void move_descs(ring *p_ring, descq_t *toq, descq_t *fromq, bool own); - void pop_descs_rx_ready(descq_t *cache, ring *p_ring = NULL); + void pop_descs_rx_ready(descq_t *cache, ring *p_ring = nullptr); void push_descs_rx_ready(descq_t *cache); - void reuse_descs(descq_t *reuseq, ring *p_ring = NULL); + void reuse_descs(descq_t *reuseq, ring *p_ring = nullptr); int set_sockopt_prio(__const void *__optval, socklen_t __optlen); bool ipv6_set_addr_sel_pref(int val); int ipv6_get_addr_sel_pref(); - - virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; inline void handle_recv_timestamping(struct cmsg_state *cm_state); inline void handle_recv_errqueue(struct cmsg_state *cm_state); void insert_cmsg(struct cmsg_state *cm_state, int level, int type, void *data, int len); @@ -315,339 +449,343 @@ class sockinfo : public socket_fd_api, void add_cqfd_to_sock_rx_epfd(ring *p_ring); void remove_cqfd_from_sock_rx_epfd(ring *p_ring); int os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents); - virtual bool try_un_offloading(); // un-offload the socket if possible - - bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } - inline bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } - - inline void set_events_socketxtreme(uint64_t events) - { - m_socketxtreme.ec->completion.user_data = (uint64_t)m_fd_context; - if (!m_socketxtreme.ec->completion.events) { - m_socketxtreme.ec->completion.events |= events; - m_p_rx_ring->put_ec(m_socketxtreme.ec); - - m_socketxtreme.ec = NULL; - for (auto &ec : m_socketxtreme.ec_cache) { - if (0 == ec.completion.events) { - m_socketxtreme.ec = &ec; - break; - } - } - if (NULL == m_socketxtreme.ec) { - struct ring_ec ec; - ec.clear(); - m_socketxtreme.ec_cache.push_back(ec); - m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); - } - } else { - m_socketxtreme.ec->completion.events |= events; - } - } - - inline void set_events(uint64_t events) - { - /* Collect all events if rx ring is enabled */ - if (is_socketxtreme() && m_state == SOCKINFO_OPENED) { - set_events_socketxtreme(events); - } - - socket_fd_api::notify_epoll_context((uint32_t)events); - } - - inline void save_strq_stats(uint32_t packet_strides) - { - m_socket_stats.strq_counters.n_strq_total_strides += static_cast(packet_strides); - m_socket_stats.strq_counters.n_strq_max_strides_per_packet = - std::max(m_socket_stats.strq_counters.n_strq_max_strides_per_packet, packet_strides); - } - - inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, - int in_flags, int *p_out_flags) - { - mem_buf_desc_t *pdesc; - int total_rx = 0; - uint32_t nbytes, pos; - bool relase_buff = true; + void insert_epoll_event(uint64_t events); + int handle_exception_flow(); - bool is_peek = in_flags & MSG_PEEK; - int rx_pkt_ready_list_idx = 1; - int rx_pkt_ready_offset = m_rx_pkt_ready_offset; + // Attach to all relevant rings for offloading receive flows - always used from slow path + // According to bounded information we need to attach to all UC relevant flows + // If local_ip is ANY then we need to attach to all offloaded interfaces OR to the one our + // connected_ip is routed to + bool attach_as_uc_receiver(role_t 
role, bool skip_rules = false); - pdesc = get_front_m_rx_pkt_ready_list(); - void *iov_base = (uint8_t *)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset; - size_t bytes_left = pdesc->rx.frag.iov_len - m_rx_pkt_ready_offset; - size_t payload_size = pdesc->rx.sz_payload; + // Calling OS receive + ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, + sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); - if (__from && __fromlen) { - pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); - } - - if (in_flags & MSG_XLIO_ZCOPY) { - relase_buff = false; - total_rx = zero_copy_rx(p_iov, pdesc, p_out_flags); - if (unlikely(total_rx < 0)) { - return -1; - } - m_rx_pkt_ready_offset = 0; - } else { -#ifdef DEFINED_UTLS - uint8_t tls_type = pdesc->rx.tls_type; -#endif /* DEFINED_UTLS */ - for (int i = 0; i < sz_iov && pdesc; i++) { - pos = 0; - while (pos < p_iov[i].iov_len && pdesc) { -#ifdef DEFINED_UTLS - if (unlikely(pdesc->rx.tls_type != tls_type)) { - break; - } -#endif /* DEFINED_UTLS */ - nbytes = p_iov[i].iov_len - pos; - if (nbytes > bytes_left) { - nbytes = bytes_left; - } - memcpy((char *)(p_iov[i].iov_base) + pos, iov_base, nbytes); - pos += nbytes; - total_rx += nbytes; - m_rx_pkt_ready_offset += nbytes; - bytes_left -= nbytes; - iov_base = (uint8_t *)iov_base + nbytes; - if (m_b_rcvtstamp || m_n_tsing_flags) { - update_socket_timestamps(&pdesc->rx.timestamps); - } - if (bytes_left <= 0) { - if (unlikely(is_peek)) { - pdesc = get_next_desc_peek(pdesc, rx_pkt_ready_list_idx); - } else { - pdesc = get_next_desc(pdesc); - } - m_rx_pkt_ready_offset = 0; - if (pdesc) { - iov_base = pdesc->rx.frag.iov_base; - bytes_left = pdesc->rx.frag.iov_len; - } - } - } - } - } + int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, + int supported, bool allow_priv); - if (unlikely(is_peek)) { - m_rx_pkt_ready_offset = - rx_pkt_ready_offset; // if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed - // save_stats_rx_offload(total_rx); //TODO?? 
- } else { - m_rx_ready_byte_count -= total_rx; - m_p_socket_stats->n_rx_ready_byte_count -= total_rx; - post_deqeue(relase_buff); - save_stats_rx_offload(total_rx); - } + transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, + const struct sockaddr *sock_addr_second = nullptr); - total_rx = handle_msg_trunc(total_rx, payload_size, in_flags, p_out_flags); +private: + int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); + bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); - return total_rx; - } +protected: + dst_entry *m_p_connected_dst_entry = nullptr; + sockinfo_state m_state = SOCKINFO_OPENED; // socket current state + uint8_t m_n_tsing_flags = 0U; + bool m_has_stats = false; + bool m_b_rcvtstamp = false; + bool m_b_zc = false; + bool m_b_blocking = true; + bool m_b_rcvtstampns = false; + rfs *m_rfs_ptr = nullptr; + ring *m_p_rx_ring = nullptr; // used in TCP/UDP + ring_ec *m_socketxtreme_ec_first = nullptr; + ring_ec *m_socketxtreme_ec_last = nullptr; + sockinfo *m_socketxtreme_ring_list_next = nullptr; + + // End of first cache line - inline void reuse_buffer(mem_buf_desc_t *buff) - { - set_rx_reuse_pending(false); - ring *p_ring = buff->p_desc_owner->get_parent(); - rx_ring_map_t::iterator iter = m_rx_ring_map.find(p_ring); - if (likely(iter != m_rx_ring_map.end())) { - if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { - if (!p_ring->reclaim_recv_buffers(buff)) { - g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); - } - return; - } + void *m_fd_context; // Context data stored with socket + mem_buf_desc_t *m_last_zcdesc = nullptr; + socket_stats_t *m_p_socket_stats = nullptr; - descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; - int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - rx_reuse->push_back(buff); - n_buff_num += buff->rx.n_frags; - if (n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { - return; - } - if (n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { - if (p_ring->reclaim_recv_buffers(rx_reuse)) { - n_buff_num = 0; - } else { - g_buffer_pool_rx_ptr->put_buffers_after_deref_thread_safe(rx_reuse); - n_buff_num = 0; - } - m_rx_reuse_buf_postponed = false; - } else { - m_rx_reuse_buf_postponed = true; - } - } else { - // Retuned buffer to global pool when owner can't be found - // In case ring was deleted while buffers where still queued - vlog_printf(VLOG_DEBUG, "Buffer owner not found\n"); - // Awareness: these are best efforts: decRef without lock in case no CQ - g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); - } - } + /* Socket error queue that keeps local errors and internal data required + * to provide notification ability. 
+ */ + descq_t m_error_queue; + lock_spin_simple m_error_queue_lock; - static const char *setsockopt_so_opt_to_str(int opt) - { - switch (opt) { - case SO_REUSEADDR: - return "SO_REUSEADDR"; - case SO_REUSEPORT: - return "SO_REUSEPORT"; - case SO_BROADCAST: - return "SO_BROADCAST"; - case SO_RCVBUF: - return "SO_RCVBUF"; - case SO_SNDBUF: - return "SO_SNDBUF"; - case SO_TIMESTAMP: - return "SO_TIMESTAMP"; - case SO_TIMESTAMPNS: - return "SO_TIMESTAMPNS"; - case SO_BINDTODEVICE: - return "SO_BINDTODEVICE"; - case SO_ZEROCOPY: - return "SO_ZEROCOPY"; - case SO_XLIO_RING_ALLOC_LOGIC: - return "SO_XLIO_RING_ALLOC_LOGIC"; - case SO_MAX_PACING_RATE: - return "SO_MAX_PACING_RATE"; - case SO_XLIO_FLOW_TAG: - return "SO_XLIO_FLOW_TAG"; - case SO_XLIO_SHUTDOWN_RX: - return "SO_XLIO_SHUTDOWN_RX"; - case IPV6_V6ONLY: - return "IPV6_V6ONLY"; - case IPV6_ADDR_PREFERENCES: - return "IPV6_ADDR_PREFERENCES"; - default: - break; - } - return "UNKNOWN SO opt"; - } + /* TX zcopy counter + * The notification itself for tx zcopy operation is a simple scalar value. + * Each socket maintains an internal unsigned 32-bit counter. + * Each send call with MSG_ZEROCOPY that successfully sends data increments + * the counter. The counter is not incremented on failure or if called with + * length zero. + * The counter counts system call invocations, not bytes. + * It wraps after UINT_MAX calls. + */ + atomic_t m_zckey; - int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); + // End of second cache line - ////////////////////////////////////////////////////////////////// - int handle_exception_flow() - { - if (safe_mce_sys().exception_handling.is_suit_un_offloading()) { - try_un_offloading(); - } - if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_RETURN_ERROR) { - errno = EINVAL; - return -1; - } - if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_ABORT) { - return -2; - } - return 0; - } - ////////////////////////////////////////////////////////////////// -private: - int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); - bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); + epfd_info *m_econtext = nullptr; + wakeup_pipe m_sock_wakeup_pipe; + int m_rx_epfd; + in_protocol_t m_protocol = PROTO_UNDEFINED; + sa_family_t m_family; public: - socket_stats_t *m_p_socket_stats; - /* Last memory descriptor with zcopy operation method */ - mem_buf_desc_t *m_last_zcdesc; - struct { - /* Use std::deque in current design as far as it allows pushing - * elements on either end without moving around any other element - * but trade this for slightly worse iteration speeds. 
- */ - std::deque ec_cache; - struct ring_ec *ec; - } m_socketxtreme; - - rfs *rfs_ptr = nullptr; + list_node socket_fd_list_node; + list_node ep_ready_fd_node; + list_node ep_info_fd_node; + list_node pendig_to_remove_node; + epoll_fd_rec m_fd_rec; + uint32_t m_epoll_event_flags = 0U; protected: - bool m_reuseaddr; // to track setsockopt with SO_REUSEADDR - bool m_reuseport; // to track setsockopt with SO_REUSEPORT - bool m_flow_tag_enabled; // for this socket - bool m_b_blocking; - bool m_b_pktinfo; - bool m_b_rcvtstamp; - bool m_b_rcvtstampns; - bool m_b_zc; + int m_fd; // identification information + /** + * list of pending ready packet on the Rx, + * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram + */ + size_t m_rx_pkt_ready_offset = 0U; + size_t m_rx_ready_byte_count = 0U; + buff_info_t m_rx_reuse_buff; // used in TCP instead of m_rx_ring_map + int m_n_rx_pkt_ready_list_count = 0; + int m_rx_num_buffs_reuse; + // used to periodically return buffers, even if threshold was not reached + bool m_rx_reuse_buf_pending = false; + // used to mark threshold was reached, but free was not done yet + bool m_rx_reuse_buf_postponed = false; bool m_skip_cq_poll_in_rx; - uint8_t m_n_tsing_flags; - in_protocol_t m_protocol; - uint8_t m_src_sel_flags; + bool m_reuseaddr = false; // to track setsockopt with SO_REUSEADDR + bool m_reuseport = false; // to track setsockopt with SO_REUSEPORT + bool m_b_pktinfo = false; + bool m_bind_no_port = false; + bool m_is_ipv6only; multilock m_lock_rcv; lock_mutex m_lock_snd; lock_mutex m_rx_migration_lock; - - sockinfo_state m_state; // socket current state - sa_family_t m_family; sock_addr m_bound; sock_addr m_connected; - dst_entry *m_p_connected_dst_entry; ip_addr m_so_bindtodevice_ip; - - socket_stats_t m_socket_stats; - - int m_rx_epfd; cache_observer m_rx_nd_observer; rx_net_device_map_t m_rx_nd_map; rx_flow_map_t m_rx_flow_map; - // we either listen on ALL system cqs or bound to the specific cq - ring *m_p_rx_ring; // used in TCP/UDP - buff_info_t m_rx_reuse_buff; // used in TCP instead of m_rx_ring_map - bool m_rx_reuse_buf_pending; // used to periodically return buffers, even if threshold was not - // reached - bool m_rx_reuse_buf_postponed; // used to mark threshold was reached, but free was not done yet rx_ring_map_t m_rx_ring_map; // CQ map lock_mutex_recursive m_rx_ring_map_lock; ring_allocation_logic_rx m_ring_alloc_logic_rx; - loops_timer m_loops_timer; - - /** - * list of pending ready packet on the Rx, - * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram - */ - int m_n_rx_pkt_ready_list_count; - size_t m_rx_pkt_ready_offset; - size_t m_rx_ready_byte_count; - - int m_n_sysvar_rx_num_buffs_reuse; - const int32_t m_n_sysvar_rx_poll_num; ring_alloc_logic_attr m_ring_alloc_log_rx; ring_alloc_logic_attr m_ring_alloc_log_tx; - uint32_t m_pcp; - - /* Socket error queue that keeps local errors and internal data required - * to provide notification ability. - */ - descq_t m_error_queue; - lock_spin m_error_queue_lock; - - /* TX zcopy counter - * The notification itself for tx zcopy operation is a simple scalar value. - * Each socket maintains an internal unsigned 32-bit counter. - * Each send call with MSG_ZEROCOPY that successfully sends data increments - * the counter. The counter is not incremented on failure or if called with - * length zero. - * The counter counts system call invocations, not bytes. - * It wraps after UINT_MAX calls. 
- */ - atomic_t m_zckey; - - // Callback function pointer to support VMA extra API (xlio_extra.h) - xlio_recv_callback_t m_rx_callback; - void *m_rx_callback_context; // user context + // Callback function pointer to support XLIO extra API (xlio_extra.h) + xlio_recv_callback_t m_rx_callback = nullptr; + void *m_rx_callback_context = nullptr; // user context struct xlio_rate_limit_t m_so_ratelimit; - void *m_fd_context; // Context data stored with socket - uint32_t m_flow_tag_id; // Flow Tag for this socket - bool m_rx_cq_wait_ctrl; + uint32_t m_pcp = 0U; + uint32_t m_flow_tag_id = 0U; // Flow Tag for this socket uint8_t m_n_uc_ttl_hop_lim; - bool m_bind_no_port; - bool m_is_ipv6only; - int *m_p_rings_fds; + uint8_t m_src_sel_flags = 0U; + +public: +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + bool m_is_for_socket_pool = false; // true when this fd will be used for socket pool on close + int m_back_log = 0; +#endif }; +void sockinfo::set_rx_reuse_pending(bool is_pending) +{ + m_rx_reuse_buf_pending = is_pending; +} + +bool sockinfo::set_flow_tag(uint32_t flow_tag_id) +{ + if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { + m_flow_tag_id = flow_tag_id; + return true; + } + m_flow_tag_id = FLOW_TAG_MASK; + return false; +} + +void sockinfo::sock_pop_descs_rx_ready(descq_t *cache) +{ + lock_rx_q(); + mem_buf_desc_t *temp; + const size_t size = get_size_m_rx_pkt_ready_list(); + + for (size_t i = 0; i < size; i++) { + temp = get_front_m_rx_pkt_ready_list(); + pop_front_m_rx_pkt_ready_list(); + cache->push_back(temp); + } + m_n_rx_pkt_ready_list_count = 0; + m_rx_ready_byte_count = 0; + m_p_socket_stats->n_rx_ready_pkt_count = 0; + m_p_socket_stats->n_rx_ready_byte_count = 0; + + unlock_rx_q(); +} + +xlio_socketxtreme_completion_t *sockinfo::set_events_socketxtreme(uint64_t events, + bool full_transaction) +{ + bool always_new = + ((events & (XLIO_SOCKETXTREME_PACKET | XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED)) != 0U); + xlio_socketxtreme_completion_t &completion = + m_p_rx_ring->socketxtreme_start_ec_operation(this, always_new); + completion.user_data = (uint64_t)m_fd_context; + completion.events |= events; + + if (full_transaction) { + m_p_rx_ring->socketxtreme_end_ec_operation(); + return nullptr; + } + + return &completion; +} + +void sockinfo::set_events(uint64_t events) +{ + /* Collect all events if rx ring is enabled */ + if (safe_mce_sys().enable_socketxtreme) { + if (m_state == SOCKINFO_OPENED) { + set_events_socketxtreme(events, true); + } + } else { + insert_epoll_event(events); + } +} + +void sockinfo::save_strq_stats(uint32_t packet_strides) +{ + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_packets++; + m_p_socket_stats->strq_counters.n_strq_total_strides += + static_cast(packet_strides); + m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet = + std::max(m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet, packet_strides); + } +} + +int sockinfo::dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, + int in_flags, int *p_out_flags) +{ + mem_buf_desc_t *pdesc; + int total_rx = 0; + uint32_t nbytes, pos; + bool relase_buff = true; + + bool is_peek = in_flags & MSG_PEEK; + int rx_pkt_ready_list_idx = 1; + int rx_pkt_ready_offset = m_rx_pkt_ready_offset; + + pdesc = get_front_m_rx_pkt_ready_list(); + void *iov_base = (uint8_t *)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset; + size_t bytes_left = pdesc->rx.frag.iov_len - m_rx_pkt_ready_offset; + size_t payload_size = pdesc->rx.sz_payload; + + if (__from 
&& __fromlen) { + if (m_protocol == PROTO_UDP || m_connected.is_anyport()) { + // For UDP non-connected or TCP listen socket fetch from packet. + pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); + } else { + // For TCP connected 5T fetch from m_connected. + // For TCP flow-tag we avoid filling packet with src for performance. + m_connected.get_sa_by_family(__from, *__fromlen, m_family); + } + } + + if (in_flags & MSG_XLIO_ZCOPY) { + relase_buff = false; + total_rx = zero_copy_rx(p_iov, pdesc, p_out_flags); + if (unlikely(total_rx < 0)) { + return -1; + } + m_rx_pkt_ready_offset = 0; + } else { +#ifdef DEFINED_UTLS + uint8_t tls_type = pdesc->rx.tls_type; +#endif /* DEFINED_UTLS */ + for (int i = 0; i < sz_iov && pdesc; i++) { + pos = 0; + while (pos < p_iov[i].iov_len && pdesc) { +#ifdef DEFINED_UTLS + if (unlikely(pdesc->rx.tls_type != tls_type)) { + break; + } +#endif /* DEFINED_UTLS */ + nbytes = p_iov[i].iov_len - pos; + if (nbytes > bytes_left) { + nbytes = bytes_left; + } + memcpy((char *)(p_iov[i].iov_base) + pos, iov_base, nbytes); + pos += nbytes; + total_rx += nbytes; + m_rx_pkt_ready_offset += nbytes; + bytes_left -= nbytes; + iov_base = (uint8_t *)iov_base + nbytes; + if (m_b_rcvtstamp || m_n_tsing_flags) { + update_socket_timestamps(&pdesc->rx.timestamps); + } + if (bytes_left <= 0) { + if (unlikely(is_peek)) { + pdesc = get_next_desc_peek(pdesc, rx_pkt_ready_list_idx); + } else { + pdesc = get_next_desc(pdesc); + } + m_rx_pkt_ready_offset = 0; + if (pdesc) { + iov_base = pdesc->rx.frag.iov_base; + bytes_left = pdesc->rx.frag.iov_len; + } + } + } + } + } + + if (unlikely(is_peek)) { + m_rx_pkt_ready_offset = + rx_pkt_ready_offset; // if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed + // save_stats_rx_offload(total_rx); //TODO?? 
+ } else { + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count -= total_rx; + } + m_rx_ready_byte_count -= total_rx; + post_deqeue(relase_buff); + save_stats_rx_offload(total_rx); + } + + total_rx = handle_msg_trunc(total_rx, payload_size, in_flags, p_out_flags); + + return total_rx; +} + +void sockinfo::reuse_buffer(mem_buf_desc_t *buff) +{ + set_rx_reuse_pending(false); + ring *p_ring = buff->p_desc_owner->get_parent(); + rx_ring_map_t::iterator iter = m_rx_ring_map.find(p_ring); + if (likely(iter != m_rx_ring_map.end())) { + if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { + if (!p_ring->reclaim_recv_buffers(buff)) { + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); + } + return; + } + + descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; + int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; + rx_reuse->push_back(buff); + n_buff_num += buff->rx.n_frags; + if (n_buff_num < m_rx_num_buffs_reuse) { + return; + } + if (n_buff_num >= 2 * m_rx_num_buffs_reuse) { + if (p_ring->reclaim_recv_buffers(rx_reuse)) { + n_buff_num = 0; + } else { + g_buffer_pool_rx_ptr->put_buffers_after_deref_thread_safe(rx_reuse); + n_buff_num = 0; + } + m_rx_reuse_buf_postponed = false; + } else { + m_rx_reuse_buf_postponed = true; + } + } else { + // Return the buffer to the global pool when the owner can't be found + // In case the ring was deleted while buffers were still queued + vlog_printf(VLOG_DEBUG, "Buffer owner not found\n"); + // Awareness: these are best efforts: decRef without lock in case no CQ + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); + } +} + #endif /* BASE_SOCKINFO_H */ diff --git a/src/core/sock/sockinfo_nvme.cpp b/src/core/sock/sockinfo_nvme.cpp index 7e529523f..01a0fa785 100644 --- a/src/core/sock/sockinfo_nvme.cpp +++ b/src/core/sock/sockinfo_nvme.cpp @@ -89,11 +89,10 @@ ssize_t sockinfo_tcp_ops_nvme::tx(xlio_tx_call_attr_t &tx_arg) errno = EINVAL; return -1; } - auto aux_data = reinterpret_cast(tx_arg.priv.map); + auto aux_data = reinterpret_cast(tx_arg.priv.opaque); auto msg = tx_arg.attr.hdr; - if (msg->msg_iov == nullptr || aux_data == nullptr || msg->msg_iovlen == 0U || - aux_data[0].message_length == 0U) { + if (!msg->msg_iov || !aux_data || msg->msg_iovlen == 0U || aux_data[0].message_length == 0U) { si_nvme_logerr("Invalid msg_iov, msg_iovlen, or auxiliary data"); errno = EINVAL; return -1; @@ -136,7 +135,7 @@ ssize_t sockinfo_tcp_ops_nvme::tx(xlio_tx_call_attr_t &tx_arg) /* Update tx_arg before sending to TCP */ auto *desc = nvme_pdu_mdesc::create(num_iovecs, msg->msg_iov, aux_data, m_p_sock->get_next_tcp_seqno(), total_tx_length); - if (desc == nullptr) { + if (!desc) { si_nvme_logerr("Unable to allocate nvme_mdesc"); errno = ENOMEM; return -1; @@ -165,27 +164,27 @@ static inline bool request_credits_for_resync(ring *p_ring, size_t datalen, size int sockinfo_tcp_ops_nvme::postrouting(pbuf *p, tcp_seg *seg, xlio_send_attr &attr) { - if (!m_is_ddgs_on || p == nullptr || seg == nullptr || seg->len == 0U) { + if (!m_is_ddgs_on || !p || !seg || seg->len == 0U) { return ERR_OK; } - assert(m_p_tis != nullptr); + assert(m_p_tis); attr.tis = m_p_tis.get(); if (likely(seg->seqno == m_expected_seqno)) { m_expected_seqno += seg->len; return ERR_OK; } - assert(p->next != nullptr); + assert(p->next); assert(p->next->desc.attr == PBUF_DESC_NVME_TX); ring *p_ring = m_p_sock->get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { si_nvme_logerr("No ring"); return ERR_RTE; } auto nvme_mdesc =
dynamic_cast(static_cast(p->next->desc.mdesc)); - if (unlikely(nvme_mdesc == nullptr)) { + if (unlikely(!nvme_mdesc)) { si_nvme_logerr("NVME momory descriptor not found"); return ERR_RTE; } @@ -241,18 +240,18 @@ bool sockinfo_tcp_ops_nvme::handle_send_ret(ssize_t ret, tcp_seg *seg) err_t sockinfo_tcp_ops_nvme::recv(pbuf *p) { - return p != nullptr ? ERR_OK : ERR_ARG; + return p ? ERR_OK : ERR_ARG; } int sockinfo_tcp_ops_nvme::setsockopt_tx(const uint32_t &config) { ring *p_ring = m_p_sock->get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { errno = ENOTSUP; return -1; } m_p_tis = p_ring->create_tis(DPCP_TIS_FLAGS | DPCP_TIS_NVME_FLAG); - if (m_p_tis == nullptr) { + if (!m_p_tis) { errno = ENOTSUP; return -1; } diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index e4968ce61..40703b688 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -36,12 +36,11 @@ #include #include #include "sockinfo_ulp.h" /* sockinfo_tcp_ops */ -#include "dev/qp_mgr_eth_mlx5.h" +#include "dev/hw_queue_tx.h" #include "proto/nvme_parse_input_args.h" #include "xlio_extra.h" #include "lwip/err.h" /* err_t */ -typedef struct xlio_tx_call_attr xlio_tx_call_attr_t; struct xlio_send_attr; class sockinfo_tcp_ops_nvme : public sockinfo_tcp_ops { @@ -56,7 +55,7 @@ class sockinfo_tcp_ops_nvme : public sockinfo_tcp_ops { , m_is_ddgs_on(false) { } - ~sockinfo_tcp_ops_nvme() + ~sockinfo_tcp_ops_nvme() override { if (m_pdu_mdesc) { m_pdu_mdesc->put(); diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index ee3f8e359..47cf629b0 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -46,16 +46,17 @@ #include "util/list.h" #include "util/agent.h" #include "event/event_handler_manager.h" +#include "event/event_handler_manager_local.h" +#include "event/poll_group.h" #include "proto/route_table_mgr.h" #include "proto/xlio_lwip.h" #include "proto/dst_entry_tcp.h" #include "iomux/io_mux_call.h" -#include "event/thread_local_event_handler.h" #include "sock-redirect.h" #include "fd_collection.h" #include "sockinfo_tcp.h" -#include "tcp_seg_pool.h" #include "bind_no_port.h" +#include "xlio.h" #define UNLOCK_RET(_ret) \ unlock_tcp_con(); \ @@ -79,9 +80,10 @@ extern global_stats_t g_global_stat_static; -tcp_timers_collection *g_tcp_timers_collection = NULL; +tcp_timers_collection *g_tcp_timers_collection = nullptr; thread_local thread_local_tcp_timers g_thread_local_tcp_timers; -bind_no_port *g_bind_no_port = NULL; +bind_no_port *g_bind_no_port = nullptr; +static thread_local lock_dummy t_lock_dummy_socket; /* * The following socket options are inherited by a connected TCP socket from the listening socket: @@ -132,18 +134,28 @@ static bool is_inherited_option(int __level, int __optname) return ret; } -static event_handler_manager *get_event_mgr() +event_handler_manager *sockinfo_tcp::get_event_mgr() { - return (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS - ? 
g_p_event_handler_manager - : &g_thread_local_event_handler); + if (is_xlio_socket()) { + return m_p_group->get_event_handler(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_event_handler_manager_local; + } else { + return g_p_event_handler_manager; + } } -static tcp_timers_collection *get_tcp_timer_collection() +tcp_timers_collection *sockinfo_tcp::get_tcp_timer_collection() { - return (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS - ? g_tcp_timers_collection - : &g_thread_local_tcp_timers); + if (is_xlio_socket()) { + return m_p_group->get_tcp_timers(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_thread_local_tcp_timers; + } else { + return g_tcp_timers_collection; + } } static lock_base *get_new_tcp_lock() @@ -151,20 +163,19 @@ static lock_base *get_new_tcp_lock() return ( safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS ? static_cast(multilock::create_new_lock(MULTILOCK_RECURSIVE, "tcp_con")) - : static_cast(new lock_dummy)); + : static_cast(&t_lock_dummy_socket)); } inline void sockinfo_tcp::lwip_pbuf_init_custom(mem_buf_desc_t *p_desc) { - if (!p_desc->lwip_pbuf.pbuf.gro) { - p_desc->lwip_pbuf.pbuf.len = p_desc->lwip_pbuf.pbuf.tot_len = + if (!p_desc->lwip_pbuf.gro) { + p_desc->lwip_pbuf.len = p_desc->lwip_pbuf.tot_len = (p_desc->sz_data - p_desc->rx.n_transport_header_len); - p_desc->lwip_pbuf.pbuf.ref = 1; - p_desc->lwip_pbuf.pbuf.next = NULL; - p_desc->lwip_pbuf.pbuf.payload = - (u8_t *)p_desc->p_buffer + p_desc->rx.n_transport_header_len; + p_desc->lwip_pbuf.ref = 1; + p_desc->lwip_pbuf.next = nullptr; + p_desc->lwip_pbuf.payload = (u8_t *)p_desc->p_buffer + p_desc->rx.n_transport_header_len; } - p_desc->lwip_pbuf.pbuf.gro = 0; + p_desc->lwip_pbuf.gro = 0; } /* change default rx_wait impl to flow based one */ @@ -217,27 +228,27 @@ inline void sockinfo_tcp::return_pending_tx_buffs() inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) { /* Special case when ZC buffers are used in RX path. */ - if (buff->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) { + if (buff->lwip_pbuf.type == PBUF_ZEROCOPY) { dst_entry_tcp *p_dst = (dst_entry_tcp *)(m_p_connected_dst_entry); - mem_buf_desc_t *underlying = - reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + mem_buf_desc_t *underlying = reinterpret_cast(buff->lwip_pbuf.desc.mdesc); - buff->lwip_pbuf.pbuf.desc.mdesc = NULL; + buff->lwip_pbuf.desc.mdesc = nullptr; if (likely(p_dst)) { p_dst->put_zc_buffer(buff); } else { g_buffer_pool_zc->put_buffers_thread_safe(buff); } - if (underlying->lwip_pbuf.pbuf.ref > 1) { - --underlying->lwip_pbuf.pbuf.ref; + if (underlying->lwip_pbuf.ref > 1) { + --underlying->lwip_pbuf.ref; return; } /* Continue and release the underlying buffer. 
*/ buff = underlying; - buff->lwip_pbuf.pbuf.ref = 1; - buff->lwip_pbuf.pbuf.next = NULL; - buff->p_next_desc = NULL; + + buff->lwip_pbuf.ref = 1; + buff->lwip_pbuf.next = nullptr; + buff->p_next_desc = nullptr; } if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { @@ -251,10 +262,10 @@ inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) if (likely(m_p_rx_ring)) { m_rx_reuse_buff.n_buff_num += buff->rx.n_frags; m_rx_reuse_buff.rx_reuse.push_back(buff); - if (m_rx_reuse_buff.n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num < m_rx_num_buffs_reuse) { return; } - if (m_rx_reuse_buff.n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num >= 2 * m_rx_num_buffs_reuse) { if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { m_rx_reuse_buff.n_buff_num = 0; } else { @@ -278,7 +289,6 @@ static inline bool use_socket_ring_locks() sockinfo_tcp::sockinfo_tcp(int fd, int domain) : sockinfo(fd, domain, use_socket_ring_locks()) - , m_timer_handle(NULL) , m_tcp_con_lock(get_new_tcp_lock()) , m_sysvar_buffer_batching_mode(safe_mce_sys().buffer_batching_mode) , m_sysvar_tx_segs_batch_tcp(safe_mce_sys().tx_segs_batch_tcp) @@ -328,7 +338,7 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) si_tcp_logdbg("new pcb %p pcb state %d", &m_pcb, get_tcp_state(&m_pcb)); tcp_arg(&m_pcb, this); tcp_ip_output(&m_pcb, sockinfo_tcp::ip_output); - if (is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb); @@ -336,13 +346,9 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) tcp_err(&m_pcb, sockinfo_tcp::err_lwip_cb); tcp_sent(&m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); - m_n_pbufs_rcvd = m_n_pbufs_freed = 0; - - m_parent = NULL; - m_iomux_ready_fd_array = NULL; + m_parent = nullptr; + m_iomux_ready_fd_array = nullptr; - /* SNDBUF accounting */ - m_sndbuff_max = 0; /* RCVBUF accounting */ m_rcvbuff_max = safe_mce_sys().sysctl_reader.get_tcp_rmem()->default_value; @@ -381,13 +387,135 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) } } - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->register_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); } si_tcp_logdbg("TCP PCB FLAGS: 0x%x", m_pcb.flags); si_tcp_logfunc("done"); } +void sockinfo_tcp::rx_add_ring_cb(ring *p_ring) +{ + if (m_p_group) { + m_p_group->add_ring(p_ring, &m_ring_alloc_log_rx); + } + sockinfo::rx_add_ring_cb(p_ring); +} + +void sockinfo_tcp::set_xlio_socket(const struct xlio_socket_attr *attr) +{ + m_xlio_socket_userdata = attr->userdata_sq; + m_p_group = reinterpret_cast(attr->group); + + bool current_locks = m_ring_alloc_log_rx.get_use_locks(); + + m_ring_alloc_log_rx.set_ring_alloc_logic(RING_LOGIC_PER_USER_ID); + m_ring_alloc_log_rx.set_user_id_key(reinterpret_cast(m_p_group)); + m_ring_alloc_log_rx.set_use_locks(current_locks || + (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); + + m_ring_alloc_log_tx.set_ring_alloc_logic(RING_LOGIC_PER_USER_ID); + m_ring_alloc_log_tx.set_user_id_key(reinterpret_cast(m_p_group)); + m_ring_alloc_log_tx.set_use_locks(current_locks || + (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)); + + if (!current_locks && (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)) { + m_tcp_con_lock = multilock::create_new_lock(MULTILOCK_RECURSIVE, "tcp_con"); + } + + tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_xlio_socket); + tcp_err(&m_pcb, 
sockinfo_tcp::err_lwip_cb_xlio_socket); + set_blocking(false); +} + +void sockinfo_tcp::add_tx_ring_to_group() +{ + ring *rng = get_tx_ring(); + if (m_p_group && rng) { + m_p_group->add_ring(rng, &m_ring_alloc_log_tx); + } +} + +void sockinfo_tcp::xlio_socket_event(int event, int value) +{ + if (is_xlio_socket()) { + /* poll_group::m_socket_event_cb must be always set. */ + m_p_group->m_socket_event_cb(reinterpret_cast(this), m_xlio_socket_userdata, + event, value); + } +} + +/*static*/ +err_t sockinfo_tcp::rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *pcb, struct pbuf *p, + err_t err) +{ + sockinfo_tcp *conn = (sockinfo_tcp *)arg; + + NOT_IN_USE(pcb); + assert((uintptr_t)pcb->my_container == (uintptr_t)arg); + + // if is FIN + if (unlikely(!p)) { + return conn->handle_fin(pcb, err); + } + + if (unlikely(err != ERR_OK)) { + conn->handle_rx_lwip_cb_error(p); + return err; + } + + tcp_recved(pcb, p->tot_len); + + if (conn->m_p_group->m_socket_rx_cb) { + struct pbuf *ptmp = p; + while (ptmp) { + /* TODO Pass mem_buf_desc_t field intead of pbuf itself as xlio_buf */ + conn->m_p_group->m_socket_rx_cb(reinterpret_cast(conn), + conn->m_xlio_socket_userdata, ptmp->payload, ptmp->len, + reinterpret_cast(ptmp)); + ptmp = ptmp->next; + } + } else { + pbuf_free(p); + } + + // TODO Stats + + return ERR_OK; +} + +/*static*/ +void sockinfo_tcp::err_lwip_cb_xlio_socket(void *pcb_container, err_t err) +{ + sockinfo_tcp *conn = reinterpret_cast(pcb_container); + + // TODO Reduce copy-paste + conn->m_conn_state = TCP_CONN_FAILED; + conn->m_error_status = ECONNABORTED; + if (err == ERR_TIMEOUT) { + conn->m_conn_state = TCP_CONN_TIMEOUT; + conn->m_error_status = ETIMEDOUT; + } else if (err == ERR_RST) { + if (conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT) { + conn->m_conn_state = TCP_CONN_ERROR; + conn->m_error_status = ECONNREFUSED; + } else { + conn->m_conn_state = TCP_CONN_RESETED; + conn->m_error_status = ECONNRESET; + } + } + + // Avoid binding twice in case of calling connect again after previous call failed. + if (conn->m_sock_state != TCP_SOCK_BOUND) { // TODO: maybe we need to exclude more states? 
+ conn->m_sock_state = TCP_SOCK_INITED; + } + + if (conn->m_state != SOCKINFO_CLOSING) { + conn->xlio_socket_event(XLIO_SOCKET_EVENT_ERROR, conn->m_error_status); + } +} + sockinfo_tcp::~sockinfo_tcp() { si_tcp_logfunc(""); @@ -402,13 +530,13 @@ sockinfo_tcp::~sockinfo_tcp() prepare_to_close(true); } - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (m_ops_tcp != m_ops) { delete m_ops_tcp; } delete m_ops; - m_ops = NULL; + m_ops = nullptr; // Return buffers released in the TLS layer destructor m_rx_reuse_buf_postponed = m_rx_reuse_buff.n_buff_num > 0; @@ -427,7 +555,7 @@ sockinfo_tcp::~sockinfo_tcp() si_tcp_logwarn("still %d tcp segs in use!", m_tcp_seg_in_use); } if (m_tcp_seg_list) { - g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); + g_tcp_seg_pool->put_objs(m_tcp_seg_list); } while (!m_socket_options_list.empty()) { @@ -454,38 +582,33 @@ sockinfo_tcp::~sockinfo_tcp() m_rx_ctl_reuse_list.size()); } - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->unregister_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); } si_tcp_logdbg("sock closed"); + + xlio_socket_event(XLIO_SOCKET_EVENT_TERMINATED, 0); } -void sockinfo_tcp::clean_obj() +void sockinfo_tcp::clean_socket_obj() { + lock_tcp_con(); + if (is_cleaned()) { return; } + m_is_cleaned = true; - lock_tcp_con(); - set_cleaned(); + unlock_tcp_con(); event_handler_manager *p_event_mgr = get_event_mgr(); - bool delegated_timers_exit = g_b_exit && (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS); - /* Remove group timers from g_tcp_timers_collection */ - if (p_event_mgr->is_running() && m_timer_handle && !delegated_timers_exit) { - p_event_mgr->unregister_timer_event(this, m_timer_handle); - } - - m_timer_handle = NULL; - unlock_tcp_con(); - if (p_event_mgr->is_running() && !delegated_timers_exit) { - p_event_mgr->unregister_timers_event_and_delete(this); + p_event_mgr->unregister_socket_timer_and_delete(this); } else { - cleanable_obj::clean_obj(); + delete this; } } @@ -502,7 +625,7 @@ bool sockinfo_tcp::prepare_listen_to_close() m_syn_received.erase(key); m_ready_conn_cnt--; new_sock->lock_tcp_con(); - new_sock->m_parent = NULL; + new_sock->m_parent = nullptr; new_sock->abort_connection(); new_sock->unlock_tcp_con(); close(new_sock->get_fd()); @@ -529,15 +652,15 @@ bool sockinfo_tcp::prepare_listen_to_close() bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) { - bool do_abort = safe_mce_sys().tcp_abort_on_close; - bool state; + si_tcp_logdbg(""); lock_tcp_con(); - si_tcp_logdbg(""); - + bool do_abort = safe_mce_sys().tcp_abort_on_close || m_n_rx_pkt_ready_list_count; bool is_listen_socket = is_server() || get_tcp_state(&m_pcb) == LISTEN; + m_state = SOCKINFO_CLOSING; + /* * consider process_shutdown: * workaround for LBM which does not close the listen sockets properly on process shutdown. 
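For reference, a minimal sketch of the failure mapping that err_lwip_cb_xlio_socket() above applies before emitting XLIO_SOCKET_EVENT_ERROR. The standalone helper and its return struct are illustrative only and not part of this patch; the err_t codes, TCP_CONN_* states and errno values are the ones used in the function above, assuming the lwIP and XLIO headers already included by this file.

// Illustrative helper, not part of the patch.
struct conn_failure {
    int state; // TCP_CONN_* value stored in m_conn_state
    int error; // errno value stored in m_error_status
};

static conn_failure map_lwip_failure(err_t err, bool is_async_connect)
{
    if (err == ERR_TIMEOUT) {
        return {TCP_CONN_TIMEOUT, ETIMEDOUT};
    }
    if (err == ERR_RST) {
        // RST while an asynchronous connect is in flight means the peer refused us;
        // otherwise an established connection was reset.
        return is_async_connect ? conn_failure {TCP_CONN_ERROR, ECONNREFUSED}
                                : conn_failure {TCP_CONN_RESETED, ECONNRESET};
    }
    // Any other lwIP error is reported as an aborted connection.
    return {TCP_CONN_FAILED, ECONNABORTED};
}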
@@ -550,9 +673,6 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) m_sock_state == TCP_SOCK_CONNECTED_WR || m_sock_state == TCP_SOCK_CONNECTED_RDWR) { m_sock_state = TCP_SOCK_BOUND; } - if (!is_listen_socket && (do_abort || m_n_rx_pkt_ready_list_count)) { - abort_connection(); - } m_rx_ready_byte_count += m_rx_pkt_ready_offset; m_p_socket_stats->n_rx_ready_byte_count += m_rx_pkt_ready_offset; @@ -616,40 +736,37 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) * termination sequence * If process_shutdown is set as True do abort() with setting tcp state as CLOSED */ - if (get_tcp_state(&m_pcb) != LISTEN && + if (!is_listen_socket && (do_abort || process_shutdown || (m_linger.l_onoff && !m_linger.l_linger))) { abort_connection(); } else { tcp_close(&m_pcb); if (is_listen_socket) { - tcp_accept(&m_pcb, 0); - tcp_syn_handled(&m_pcb, 0); - tcp_clone_conn(&m_pcb, 0); - tcp_accepted_pcb(&m_pcb, 0); + tcp_accept(&m_pcb, nullptr); + tcp_syn_handled(&m_pcb, nullptr); + tcp_clone_conn(&m_pcb, nullptr); + tcp_accepted_pcb(&m_pcb, nullptr); prepare_listen_to_close(); // close pending to accept sockets } else { tcp_recv(&m_pcb, sockinfo_tcp::rx_drop_lwip_cb); - tcp_sent(&m_pcb, 0); - } - - // todo should we do this each time we get into prepare_to_close ? - if (get_tcp_state(&m_pcb) != LISTEN) { - handle_socket_linger(); + tcp_sent(&m_pcb, nullptr); + if (m_linger.l_onoff && m_linger.l_linger) { + // TODO Should we do this each time we get into prepare_to_close? + handle_socket_linger(); + } } } - m_state = SOCKINFO_CLOSING; NOTIFY_ON_EVENTS(this, EPOLLHUP); + m_sock_wakeup_pipe.do_wakeup(); - do_wakeup(); - - if (m_econtext) { + if (has_epoll_context()) { m_econtext->fd_closed(m_fd); } - state = is_closable(); - if (state) { + bool is_closable_state = is_closable(); + if (is_closable_state) { m_state = SOCKINFO_CLOSED; reset_ops(); } else if (!is_listen_socket) { @@ -664,7 +781,7 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) unlock_tcp_con(); - return state; + return is_closable_state; } void sockinfo_tcp::handle_socket_linger() @@ -682,7 +799,7 @@ void sockinfo_tcp::handle_socket_linger() /* SOCKETXTREME WA: Don't call rx_wait() in order not to miss events in socketxtreme_poll() * flow. TBD: find proper solution! 
rx_wait(poll_cnt, false); * */ - if (!is_socketxtreme()) { + if (!safe_mce_sys().enable_socketxtreme) { rx_wait(poll_cnt, false); } tcp_output(&m_pcb); @@ -750,38 +867,31 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) bool ret_val = false; if (m_p_connected_dst_entry) { - if (is_accepted_socket) { - ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, true, false); - } else { - ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, false, true); - } - + bool skip_rules = is_accepted_socket; + bool is_connect = !is_accepted_socket; + ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, skip_rules, is_connect); if (ret_val) { /* dst_entry has resolved tx ring, * so it is a time to provide TSO information to PCB */ - m_pcb.tso.max_buf_sz = - std::min(safe_mce_sys().tx_buf_size, - m_p_connected_dst_entry->get_ring()->get_max_payload_sz()); - m_pcb.tso.max_payload_sz = m_p_connected_dst_entry->get_ring()->get_max_payload_sz(); - m_pcb.tso.max_header_sz = m_p_connected_dst_entry->get_ring()->get_max_header_sz(); - m_pcb.tso.max_send_sge = m_p_connected_dst_entry->get_ring()->get_max_send_sge(); - /* reserve one slot for network headers of zerocopy segments */ - m_pcb.max_send_sge = m_pcb.tso.max_send_sge - 1; - safe_mce_sys().zc_tx_size = - std::min(safe_mce_sys().zc_tx_size, m_pcb.tso.max_payload_sz); + auto *ring = m_p_connected_dst_entry->get_ring(); + uint32_t max_tso_sz = std::min(ring->get_max_payload_sz(), safe_mce_sys().max_tso_sz); + m_pcb.tso.max_buf_sz = std::min(safe_mce_sys().tx_buf_size, max_tso_sz); + m_pcb.tso.max_payload_sz = max_tso_sz; + m_pcb.tso.max_header_sz = ring->get_max_header_sz(); + m_pcb.tso.max_send_sge = ring->get_max_send_sge(); } } return ret_val; } -unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) +unsigned sockinfo_tcp::tx_wait(bool blocking) { - unsigned sz = tcp_sndbuf(&m_pcb); + unsigned sz = sndbuf_available(); int poll_count = 0; - si_tcp_logfunc("sz = %d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); - err = 0; - while (is_rts() && (sz = tcp_sndbuf(&m_pcb)) == 0) { + si_tcp_logfunc("sz = %u rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + int err = 0; + while (is_rts() && (sz = sndbuf_available()) == 0) { err = rx_wait(poll_count, blocking); // AlexV:Avoid from going to sleep, for the blocked socket of course, since // progress engine may consume an arrived credit and it will not wakeup the @@ -801,7 +911,7 @@ unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) poll_count = 0; } } - si_tcp_logfunc("end sz=%d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + si_tcp_logfunc("end sz=%u rx_count=%d", sz, m_n_rx_pkt_ready_list_count); return sz; } @@ -857,7 +967,7 @@ void sockinfo_tcp::put_agent_msg(void *arg) if (p_si_tcp->is_server() || get_tcp_state(&p_si_tcp->m_pcb) == LISTEN) { return; } - if (unlikely(g_p_agent == NULL)) { + if (unlikely(!g_p_agent)) { return; } @@ -894,77 +1004,59 @@ ssize_t sockinfo_tcp::tx(xlio_tx_call_attr_t &tx_arg) return m_ops->tx(tx_arg); } -static inline bool cannot_do_requested_partial_write(const tcp_pcb &pcb, +static inline bool cannot_do_requested_partial_write(size_t sndbuf_available, const xlio_tx_call_attr_t &tx_arg, - bool is_blocking, size_t total_iov_len) + size_t total_iov_len) { - return !BLOCK_THIS_RUN(is_blocking, tx_arg.attr.flags) && - (tx_arg.xlio_flags & TX_FLAG_NO_PARTIAL_WRITE) && - unlikely(tcp_sndbuf(&pcb) < total_iov_len); + return (tx_arg.xlio_flags & TX_FLAG_NO_PARTIAL_WRITE) && + unlikely(sndbuf_available < 
total_iov_len); } -static inline bool tcp_wnd_unavalable(const tcp_pcb &pcb, size_t total_iov_len) -{ #ifdef DEFINED_TCP_TX_WND_AVAILABILITY - return !tcp_is_wnd_available(&pcb, total_iov_len); +#define TCP_WND_UNAVALABLE(pcb, total_iov_len) !tcp_is_wnd_available(&pcb, total_iov_len) #else - NOT_IN_USE(pcb); - NOT_IN_USE(total_iov_len); - return false; +#define TCP_WND_UNAVALABLE(pcb, total_iov_len) false #endif + +static inline bool is_invalid_iovec(const iovec *iov, size_t sz_iov) +{ + return !iov || sz_iov == 0; } +/** + * Handles transmission operations on a TCP socket, supporting various user actions such as + * write, send, sendv, sendmsg, and sendfile. This function operates on both blocking and + * non-blocking sockets, providing options for zero-copy send operations. When the socket is + * configured for zero-copy send, it executes a fast-path send for non-blocking operations; + * otherwise, it falls back to the tcp_tx_slow_path function. + * + * @param tx_arg The TCP transmission arguments and parameters. + * @return Returns the number of bytes transmitted, or -1 on error with the errno set. + */ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) { iovec *p_iov = tx_arg.attr.iov; size_t sz_iov = tx_arg.attr.sz_iov; - struct sockaddr *__dst = tx_arg.attr.addr; - socklen_t __dstlen = tx_arg.attr.len; - int __flags = tx_arg.attr.flags; + int flags = tx_arg.attr.flags; int errno_tmp = errno; int ret = 0; int poll_count = 0; - uint16_t apiflags = 0; err_t err; - bool is_send_zerocopy = false; - void *tx_ptr = NULL; - struct xlio_pd_key *pd_key_array = NULL; + void *tx_ptr = nullptr; + struct xlio_pd_key *pd_key_array = nullptr; /* Let allow OS to process all invalid scenarios to avoid any * inconsistencies in setting errno values */ - if (unlikely(m_sock_offload != TCP_SOCK_LWIP) || unlikely(!p_iov) || unlikely(0 == sz_iov)) { - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + if (unlikely(m_sock_offload != TCP_SOCK_LWIP) || unlikely(is_invalid_iovec(p_iov, sz_iov))) { + struct sockaddr *dst = tx_arg.attr.addr; + socklen_t dstlen = tx_arg.attr.len; + ret = tx_os(tx_arg.opcode, p_iov, sz_iov, flags, dst, dstlen); save_stats_tx_os(ret); return ret; } -retry_is_ready: - - if (unlikely(!is_rts())) { - - if (m_conn_state == TCP_CONN_TIMEOUT) { - si_tcp_logdbg("TX timed out"); - errno = ETIMEDOUT; - } else if (m_conn_state == TCP_CONN_CONNECTING) { - si_tcp_logdbg("TX while async-connect on socket go to poll"); - rx_wait_helper(poll_count, false); - if (m_conn_state == TCP_CONN_CONNECTED) { - goto retry_is_ready; - } - si_tcp_logdbg("TX while async-connect on socket return EAGAIN"); - errno = EAGAIN; - } else if (m_conn_state == TCP_CONN_RESETED) { - si_tcp_logdbg("TX on reseted socket"); - errno = ECONNRESET; - } else if (m_conn_state == TCP_CONN_ERROR) { - si_tcp_logdbg("TX on connection failed socket"); - errno = ECONNREFUSED; - } else { - si_tcp_logdbg("TX on disconnected socket"); - errno = EPIPE; - } - + if (unlikely(!is_connected_and_ready_to_send())) { return -1; } si_tcp_logfunc("tx: iov=%p niovs=%d", p_iov, sz_iov); @@ -973,6 +1065,115 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) rx_wait_helper(poll_count, false); } + bool is_dummy = IS_DUMMY_PACKET(flags); + bool is_blocking = BLOCK_THIS_RUN(m_b_blocking, flags); + bool is_packet_zerocopy = (flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE)); + if (unlikely(is_dummy) || unlikely(!is_packet_zerocopy) || unlikely(is_blocking)) { + return 
tcp_tx_slow_path(tx_arg); + } + + bool is_non_file_zerocopy = tx_arg.opcode != TX_FILE; + pd_key_array = + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.opaque : nullptr); + + si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); + + size_t total_iov_len = + std::accumulate(&p_iov[0], &p_iov[sz_iov], 0U, + [](size_t sum, const iovec &curr) { return sum + curr.iov_len; }); + lock_tcp_con(); + + if (cannot_do_requested_partial_write(sndbuf_available(), tx_arg, total_iov_len) || + TCP_WND_UNAVALABLE(m_pcb, total_iov_len)) { + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } + + int total_tx = 0; + for (size_t i = 0; i < sz_iov; i++) { + si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); + if (unlikely(!p_iov[i].iov_base)) { + continue; + } + + tx_ptr = p_iov[i].iov_base; + if ((tx_arg.priv.attr == PBUF_DESC_MKEY) && pd_key_array) { + tx_arg.priv.mkey = pd_key_array[i].mkey; + } + unsigned pos = 0; + while (pos < p_iov[i].iov_len) { + unsigned tx_size = sndbuf_available(); + + if (tx_size == 0) { + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + // force out TCP data before going on wait() + tcp_output(&m_pcb); + + return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_non_file_zerocopy, + errno_tmp); + } + + tx_size = std::min(p_iov[i].iov_len - pos, tx_size); + if (is_non_file_zerocopy) { + /* + * For send zerocopy we don't support pbufs which + * cross huge page boundaries. To avoid forming + * such a pbuf, we have to adjust tx_size, so + * tcp_write receives a buffer which doesn't cross + * the boundary. + */ + unsigned remainder = + ~m_user_huge_page_mask + 1 - ((uint64_t)tx_ptr & ~m_user_huge_page_mask); + tx_size = std::min(remainder, tx_size); + } + + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + if (unlikely(g_b_exit)) { + return tcp_tx_handle_partial_send_and_unlock(total_tx, EINTR, is_dummy, + is_non_file_zerocopy, errno_tmp); + } + + const struct iovec iov = {.iov_base = tx_ptr, .iov_len = tx_size}; + err = tcp_write_express(&m_pcb, &iov, 1, &tx_arg.priv); + if (unlikely(err != ERR_OK)) { + // tcp_write_express() can return only ERR_MEM error. + return tcp_tx_handle_partial_send_and_unlock(total_tx, EAGAIN, is_dummy, + is_non_file_zerocopy, errno_tmp); + } + tx_ptr = (void *)((char *)tx_ptr + tx_size); + pos += tx_size; + total_tx += tx_size; + } + } + + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_non_file_zerocopy); +} + +/** + * Handles transmission operations on a TCP socket similar to tcp_tx. + * This is a fallback function when the operation is either blocking, not zero-copy, or the socket + * wasn't configured for zero-copy operations. + * + * @param tx_arg The TCP transmission arguments and parameters. + * @return Returns the number of bytes transmitted, or -1 on error with the errno set. + */ +ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) +{ + iovec *p_iov = tx_arg.attr.iov; + size_t sz_iov = tx_arg.attr.sz_iov; + int flags = tx_arg.attr.flags; + int errno_tmp = errno; + int poll_count = 0; + uint16_t apiflags = 0; + bool is_send_zerocopy = false; + void *tx_ptr = nullptr; + struct xlio_pd_key *pd_key_array = nullptr; + if (tx_arg.opcode == TX_FILE) { /* * TX_FILE is a special operation which reads a single file. 
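The zerocopy fast path above (and tcp_tx_slow_path() below) clamps each chunk so that the pbuf handed to tcp_write_express()/tcp_write() never crosses a huge-page boundary. A minimal sketch of that arithmetic follows; the helper name and parameters are illustrative only, and a 2 MiB page is assumed for the worked example. m_user_huge_page_mask keeps the high address bits, so ~mask + 1 is the page size and ptr & ~mask is the offset inside the page.

// Illustrative sketch of the huge-page clamp used by the zerocopy send path above.
static size_t clamp_to_huge_page(const void *tx_ptr, size_t tx_size, uint64_t huge_page_mask)
{
    uint64_t page_size = ~huge_page_mask + 1;
    uint64_t offset_in_page = (uint64_t)(uintptr_t)tx_ptr & ~huge_page_mask;
    uint64_t remainder = page_size - offset_in_page; // bytes left until the next boundary
    return (remainder < tx_size) ? (size_t)remainder : tx_size;
}
// Example with a 2 MiB page: a tx_ptr at offset 0x1F0000 inside the page leaves
// remainder = 0x200000 - 0x1F0000 = 0x10000 (64 KiB), so a 128 KiB request is
// trimmed to 64 KiB for this iteration and the rest is sent from the next page.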
@@ -983,7 +1184,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) apiflags |= XLIO_TX_FILE; } - bool is_dummy = IS_DUMMY_PACKET(__flags); + bool is_dummy = IS_DUMMY_PACKET(flags); if (unlikely(is_dummy)) { apiflags |= XLIO_TX_PACKET_DUMMY; } @@ -991,33 +1192,27 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) /* To force zcopy flow there are two possible ways * - send() MSG_ZEROCOPY flag should be passed by user application * and SO_ZEROCOPY activated - * - sendfile() MSG_SEROCOPY flag set internally with opcode TX_FILE + * - sendfile() MSG_ZEROCOPY flag set internally with opcode TX_FILE */ - if ((__flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE))) { + if ((flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE))) { apiflags |= XLIO_TX_PACKET_ZEROCOPY; is_send_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = - (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : NULL); + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.opaque + : nullptr); } si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); - size_t total_iov_len = - std::accumulate(&p_iov[0], &p_iov[sz_iov], 0U, - [](size_t sum, const iovec &curr) { return sum + curr.iov_len; }); lock_tcp_con(); - if (cannot_do_requested_dummy_send(m_pcb, tx_arg) || - cannot_do_requested_partial_write(m_pcb, tx_arg, m_b_blocking, total_iov_len) || - tcp_wnd_unavalable(m_pcb, total_iov_len)) { - unlock_tcp_con(); - errno = EAGAIN; - return -1; + if (cannot_do_requested_dummy_send(m_pcb, tx_arg)) { + return tcp_tx_handle_errno_and_unlock(EAGAIN); } int total_tx = 0; - __off64_t file_offset = 0; - bool block_this_run = BLOCK_THIS_RUN(m_b_blocking, __flags); + off64_t file_offset = 0; + bool block_this_run = BLOCK_THIS_RUN(m_b_blocking, flags); for (size_t i = 0; i < sz_iov; i++) { si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); if (unlikely(!p_iov[i].iov_base)) { @@ -1025,7 +1220,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } if ((tx_arg.opcode == TX_FILE) && !(apiflags & XLIO_TX_PACKET_ZEROCOPY)) { - file_offset = *(__off64_t *)p_iov[i].iov_base; + file_offset = *(off64_t *)p_iov[i].iov_base; tx_ptr = &file_offset; } else { tx_ptr = p_iov[i].iov_base; @@ -1035,7 +1230,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } unsigned pos = 0; while (pos < p_iov[i].iov_len) { - unsigned tx_size = tcp_sndbuf(&m_pcb); + auto tx_size = sndbuf_available(); /* Process a case when space is not available at the sending socket * to hold the message to be transmitted @@ -1048,43 +1243,21 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) if (tx_size == 0) { if (unlikely(!is_rts())) { si_tcp_logdbg("TX on disconnected socket"); - errno = ECONNRESET; - goto err; + return tcp_tx_handle_errno_and_unlock(ECONNRESET); } // force out TCP data before going on wait() tcp_output(&m_pcb); - /* Set return values for nonblocking socket and finish processing */ + // non blocking socket should return in order not to tx_wait() if (!block_this_run) { - // non blocking socket should return in order not to tx_wait() - if (total_tx > 0) { - m_tx_consecutive_eagain_count = 0; - goto done; - } else { - m_tx_consecutive_eagain_count++; - if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { - if (safe_mce_sys().tcp_ctl_thread == - option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - // Slow path. We must attempt TCP timers here for applications that - // do not check for EV_OUT. 
- g_thread_local_event_handler.do_tasks(); - } - // in case of zero sndbuf and non-blocking just try once polling CQ for - // ACK - rx_wait(poll_count, false); - m_tx_consecutive_eagain_count = 0; - } - errno = EAGAIN; - goto err; - } + return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_send_zerocopy, + errno_tmp); } - tx_size = tx_wait(ret, true); + tx_size = tx_wait(block_this_run); } - if (tx_size > p_iov[i].iov_len - pos) { - tx_size = p_iov[i].iov_len - pos; - } + tx_size = std::min(p_iov[i].iov_len - pos, tx_size); if (is_send_zerocopy) { /* * For send zerocopy we don't support pbufs which @@ -1095,63 +1268,59 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) */ unsigned remainder = ~m_user_huge_page_mask + 1 - ((uint64_t)tx_ptr & ~m_user_huge_page_mask); - if (tx_size > remainder) { - tx_size = remainder; - } - } - retry_write: - if (unlikely(!is_rts())) { - si_tcp_logdbg("TX on disconnected socket"); - errno = ECONNRESET; - goto err; + tx_size = std::min(remainder, tx_size); } - if (unlikely(g_b_exit)) { - if (total_tx > 0) { - goto done; - } else { - errno = EINTR; - si_tcp_logdbg("returning with: EINTR"); - goto err; + do { + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); } - } - - err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); - if (unlikely(err != ERR_OK)) { - if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write - si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); - shutdown(SHUT_WR); - if (total_tx > 0) { - goto done; - } - errno = EPIPE; - unlock_tcp_con(); - return -1; + if (unlikely(g_b_exit)) { + return tcp_tx_handle_partial_send_and_unlock(total_tx, EINTR, is_dummy, + is_send_zerocopy, errno_tmp); } - if (unlikely(err != ERR_MEM)) { - // we should not get here... - BULLSEYE_EXCLUDE_BLOCK_START - si_tcp_logpanic("tcp_write return: %d", err); - BULLSEYE_EXCLUDE_BLOCK_END + + err_t err; + if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { + const struct iovec iov = {.iov_base = tx_ptr, .iov_len = tx_size}; + err = tcp_write_express(&m_pcb, &iov, 1, &tx_arg.priv); + } else { + err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); } - /* Set return values for nonblocking socket and finish processing */ - if (!block_this_run) { - if (total_tx > 0) { - goto done; - } else { - errno = EAGAIN; - goto err; + if (unlikely(err != ERR_OK)) { + if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write + si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); + shutdown(SHUT_WR); + return tcp_tx_handle_partial_send_and_unlock(total_tx, EPIPE, is_dummy, + is_send_zerocopy, errno_tmp); + } + if (unlikely(err != ERR_MEM)) { + // we should not get here... + BULLSEYE_EXCLUDE_BLOCK_START + si_tcp_logpanic("tcp_write return: %d", err); + BULLSEYE_EXCLUDE_BLOCK_END + } + /* Set return values for nonblocking socket and finish processing */ + if (!block_this_run) { + if (total_tx > 0) { + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, + is_send_zerocopy); + } else { + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } } - } - rx_wait(poll_count, true); + rx_wait(poll_count, true); - // AlexV:Avoid from going to sleep, for the blocked socket of course, since - // progress engine may consume an arrived credit and it will not wakeup the - // transmit thread. 
- poll_count = 0; + // AlexV:Avoid from going to sleep, for the blocked socket of course, since + // progress engine may consume an arrived credit and it will not wakeup the + // transmit thread. + poll_count = 0; - goto retry_write; - } + continue; + } + break; + } while (true); if (tx_arg.opcode == TX_FILE && !(apiflags & XLIO_TX_PACKET_ZEROCOPY)) { file_offset += tx_size; } else { @@ -1161,45 +1330,8 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) total_tx += tx_size; } } -done: - tcp_output(&m_pcb); // force data out - - if (unlikely(is_dummy)) { - m_p_socket_stats->counters.n_tx_dummy++; - } else if (total_tx) { - m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; - m_p_socket_stats->counters.n_tx_sent_pkt_count++; - m_p_socket_stats->n_tx_ready_byte_count += total_tx; - } - - /* Each send call with MSG_ZEROCOPY that successfully sends - * data increments the counter. - * The counter is not incremented on failure or if called with length zero. - */ - if (is_send_zerocopy && (total_tx > 0)) { - if (m_last_zcdesc->tx.zc.id != (uint32_t)atomic_read(&m_zckey)) { - si_tcp_logerr("Invalid tx zcopy operation"); - } else { - atomic_fetch_and_inc(&m_zckey); - } - } - unlock_tcp_con(); - - /* Restore errno on function entry in case success */ - errno = errno_tmp; - - return total_tx; - -err: - // nothing send nb mode or got some other error - if (errno == EAGAIN) { - m_p_socket_stats->counters.n_tx_eagain++; - } else { - m_p_socket_stats->counters.n_tx_errors++; - } - unlock_tcp_con(); - return -1; + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_send_zerocopy); } /* @@ -1221,7 +1353,7 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; int max_count = p_si_tcp->m_pcb.tso.max_send_sge; tcp_iovec lwip_iovec[max_count]; - xlio_send_attr attr = {(xlio_wr_tx_packet_attr)flags, p_si_tcp->m_pcb.mss, 0, 0}; + xlio_send_attr attr = {(xlio_wr_tx_packet_attr)flags, p_si_tcp->m_pcb.mss, 0, nullptr}; int count = 0; void *cur_end; @@ -1287,20 +1419,19 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con return ERR_OK; } - ssize_t ret = 0; - if (likely((p_dst->is_valid()))) { - ret = p_dst->fast_send((struct iovec *)lwip_iovec, count, attr); - } else { - ret = p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); - } + ssize_t ret = likely((p_dst->is_valid())) + ? 
p_dst->fast_send((struct iovec *)lwip_iovec, count, attr) + : p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); rc = p_si_tcp->m_ops->handle_send_ret(ret, seg); - if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { - p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { // Condition for cache optimization + if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { + p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + } } - if (rc && is_set(attr.flags, XLIO_TX_PACKET_REXMIT)) { + if (unlikely(is_set(attr.flags, XLIO_TX_PACKET_REXMIT) && rc)) { p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; } @@ -1365,9 +1496,12 @@ err_t sockinfo_tcp::ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void */ p_si_tcp->reset_ops(); } + if (new_state == ESTABLISHED) { + p_si_tcp->xlio_socket_event(XLIO_SOCKET_EVENT_ESTABLISHED, 0); + } /* Update daemon about actual state for offloaded connection */ - if (g_p_agent != NULL && likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { + if (g_p_agent && likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { p_si_tcp->put_agent_msg((void *)p_si_tcp); } } @@ -1414,7 +1548,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) return; } - if (conn->m_parent != NULL) { + if (conn->m_parent) { // In case we got RST or abandon() before we accepted the connection conn->unlock_tcp_con(); int delete_fd = conn->m_parent->handle_child_FIN(conn); @@ -1425,7 +1559,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) // terminating stage, in which case we don't expect to handle packets. // Calling close() under lock will prevent internal thread to delete the object before // we finish with the current processing. - close(delete_fd); + XLIO_CALL(close, delete_fd); return; } } @@ -1478,7 +1612,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) conn->m_sock_state = TCP_SOCK_INITED; } - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); } bool sockinfo_tcp::process_peer_ctl_packets(xlio_desc_list_t &peer_packets) @@ -1493,6 +1627,7 @@ bool sockinfo_tcp::process_peer_ctl_packets(xlio_desc_list_t &peer_packets) return false; } + // Listen socket is 3T and so rx.src/dst are set as part of rx_process_buffer_no_flow_id. 
struct tcp_pcb *pcb = get_syn_received_pcb(desc->rx.src, desc->rx.dst); // 2.1.2 get the pcb and sockinfo @@ -1691,9 +1826,8 @@ void sockinfo_tcp::process_rx_ctl_packets() } // Execute TCP timers of this connection -void sockinfo_tcp::handle_timer_expired(void *user_data) +void sockinfo_tcp::handle_timer_expired() { - NOT_IN_USE(user_data); si_tcp_logfunc(""); if (tcp_ctl_thread_on(m_sysvar_tcp_ctl_thread)) { @@ -1735,7 +1869,7 @@ int sockinfo_tcp::handle_child_FIN(sockinfo_tcp *child_conn) m_received_syn_num--; m_p_socket_stats->listen_counters.n_rx_fin++; m_p_socket_stats->listen_counters.n_conn_dropped++; - child_conn->m_parent = NULL; + child_conn->m_parent = nullptr; unlock_tcp_con(); child_conn->lock_tcp_con(); child_conn->abort_connection(); @@ -1757,7 +1891,9 @@ err_t sockinfo_tcp::ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t ack ASSERT_LOCKED(conn->m_tcp_con_lock); - conn->m_p_socket_stats->n_tx_ready_byte_count -= ack; + if (unlikely(conn->has_stats())) { + conn->m_p_socket_stats->n_tx_ready_byte_count -= ack; + } if (conn->sndbuf_available() >= conn->m_required_send_block) { NOTIFY_ON_EVENTS(conn, EPOLLOUT); @@ -1767,7 +1903,7 @@ err_t sockinfo_tcp::ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t ack return ERR_OK; } -void sockinfo_tcp::tcp_shutdown_rx(void) +void sockinfo_tcp::tcp_shutdown_rx() { /* Call this method under connection lock */ @@ -1779,7 +1915,7 @@ void sockinfo_tcp::tcp_shutdown_rx(void) * null in such case and as a result update_fd_array() call means nothing */ io_mux_call::update_fd_array(m_iomux_ready_fd_array, m_fd); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); tcp_shutdown(&m_pcb, 1, 0); @@ -1821,7 +1957,6 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e } conn->rx_lwip_process_chained_pbufs(p); - conn->save_packet_info_in_ready_list(p); // notify io_mux @@ -1829,7 +1964,7 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user @@ -1846,58 +1981,28 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e return ERR_OK; } -static inline void _rx_lwip_cb_socketxtreme_helper(pbuf *p, - xlio_socketxtreme_completion_t *completion, - xlio_buff_t *&buff_list_tail, - bool use_hw_timestamp, - std::function notify) +inline void sockinfo_tcp::rx_lwip_cb_socketxtreme_helper(pbuf *p) { + xlio_socketxtreme_completion_t *completion = + set_events_socketxtreme(XLIO_SOCKETXTREME_PACKET, false); + mem_buf_desc_t *current_desc = reinterpret_cast(p); // Is IPv4 only. 
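A side note on the statistics gating introduced in ack_recvd_lwip_cb above: per-ACK counters are now updated only when has_stats() is true, which keeps the hot path free of stats cache-line traffic. A small self-contained sketch of the pattern; the conn_sketch class, the counter struct and the has_stats() stand-in are invented for illustration (the real code wraps the check in unlikely()):

#include <cstdint>
#include <cstdio>

struct tx_counters {
    uint64_t n_tx_ready_byte_count;
};

class conn_sketch {
public:
    explicit conn_sketch(tx_counters *stats) : m_stats(stats) {}
    bool has_stats() const { return m_stats != nullptr; }

    void on_ack(uint32_t acked)
    {
        // Touch the counters cache line only when statistics are actually collected.
        if (has_stats()) {
            m_stats->n_tx_ready_byte_count -= acked;
        }
    }

private:
    tx_counters *m_stats;
};

int main()
{
    tx_counters c = {1000};
    conn_sketch with_stats(&c);
    conn_sketch without_stats(nullptr);
    with_stats.on_ack(400);
    without_stats.on_ack(400); // no counter access at all
    std::printf("%llu\n", (unsigned long long)c.n_tx_ready_byte_count); // 600
    return 0;
}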
+ assert(p); assert(current_desc->rx.src.get_sa_family() == AF_INET); + assert(current_desc->rx.n_frags > 0); - if (buff_list_tail == nullptr) { - // New completion - completion->packet.buff_lst = reinterpret_cast(p); - completion->packet.total_len = p->tot_len; - completion->packet.num_bufs = current_desc->rx.n_frags; - - assert(reinterpret_cast(p)->rx.n_frags > 0); - current_desc->rx.src.get_sa(reinterpret_cast(&completion->src), - sizeof(completion->src)); - if (use_hw_timestamp) { - completion->packet.hw_timestamp = current_desc->rx.timestamps.hw; - } - notify(); - } else { - // Update existing completion - xlio_buff_t *&buff_list_head = completion->packet.buff_lst; - completion->packet.total_len += p->tot_len; - completion->packet.num_bufs += current_desc->rx.n_frags; + completion->packet.buff_lst = reinterpret_cast(p); + completion->packet.total_len = p->tot_len; + completion->packet.num_bufs = current_desc->rx.n_frags; - auto membuff_list_tail = reinterpret_cast(buff_list_tail); - while (membuff_list_tail->p_next_desc) { - membuff_list_tail = membuff_list_tail->p_next_desc; - } - membuff_list_tail->p_next_desc = current_desc; - reinterpret_cast(buff_list_head)->rx.n_frags = - completion->packet.num_bufs; - pbuf_cat(reinterpret_cast(buff_list_head), p); - current_desc->rx.n_frags = 0; + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + completion->packet.hw_timestamp = current_desc->rx.timestamps.hw; } - buff_list_tail = reinterpret_cast(p); -} - -inline void sockinfo_tcp::rx_lwip_cb_socketxtreme_helper(pbuf *p) -{ - auto notify = [this]() { NOTIFY_ON_EVENTS(this, XLIO_SOCKETXTREME_PACKET); }; - bool use_hw_timestamp = (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE); - assert(p); - _rx_lwip_cb_socketxtreme_helper(p, &m_socketxtreme.ec->completion, - m_socketxtreme.ec->last_buff_lst, use_hw_timestamp, notify); - save_stats_rx_offload(m_socketxtreme.ec->completion.packet.total_len); + m_p_rx_ring->socketxtreme_end_ec_operation(); + save_stats_rx_offload(p->tot_len); } inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) @@ -1912,7 +2017,7 @@ inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) __log_dbg("[fd=%d] null pbuf sock(%p %p) err=%d", m_fd, &(m_pcb), pcb, err); tcp_shutdown_rx(); - if (m_parent != nullptr) { + if (m_parent) { // in case we got FIN before we accepted the connection /* TODO need to add some refcount inside parent in case parent and child are closed * together*/ @@ -1925,7 +2030,7 @@ inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) // terminating stage, in which case we don't expect to handle packets. // Calling close() under lock will prevent internal thread to delete the object before // we finish with the current processing. 
- close(delete_fd); + XLIO_CALL(close, delete_fd); return ERR_ABRT; } } @@ -1938,7 +2043,7 @@ inline void sockinfo_tcp::handle_rx_lwip_cb_error(pbuf *p) // notify io_mux NOTIFY_ON_EVENTS(this, EPOLLERR); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); vlog_printf(VLOG_ERROR, "%s:%d %s\n", __func__, __LINE__, "recv error!!!"); pbuf_free(p); m_sock_state = TCP_SOCK_INITED; @@ -1950,20 +2055,21 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) p_first_desc->rx.sz_payload = p->tot_len; p_first_desc->rx.n_frags = 0; - m_connected.get_sa(reinterpret_cast(&p_first_desc->rx.src), - static_cast(sizeof(p_first_desc->rx.src))); + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - // We go over the p_first_desc again, so decrement what we did in rx_input_cb. - m_socket_stats.strq_counters.n_strq_total_strides -= - static_cast(p_first_desc->rx.strides_num); - m_socket_stats.counters.n_rx_data_pkts++; - // Assume that all chained buffers are GRO packets - m_socket_stats.counters.n_gro += !!p->next; + // We go over the p_first_desc again, so decrement what we did in rx_input_cb. + m_p_socket_stats->strq_counters.n_strq_total_strides -= + static_cast(p_first_desc->rx.strides_num); + m_p_socket_stats->counters.n_rx_data_pkts++; + // Assume that all chained buffers are GRO packets + m_p_socket_stats->counters.n_gro += !!p->next; + } // To avoid reset ref count for first mem_buf_desc, save it and set after the while int head_ref = p_first_desc->get_ref_count(); - for (auto *p_curr_desc = p_first_desc; p_curr_desc != nullptr; + for (auto *p_curr_desc = p_first_desc; p_curr_desc; p = p->next, p_curr_desc = p_curr_desc->p_next_desc) { /* Here we reset ref count for all mem_buf_desc except for the head (p_first_desc). Chain of pbufs can contain some pbufs with ref count >=1 like in ooo or flow tag flows. @@ -1979,9 +2085,22 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) p_curr_desc->rx.frag.iov_base = p->payload; p_curr_desc->rx.frag.iov_len = p->len; p_curr_desc->p_next_desc = reinterpret_cast(p->next); - process_timestamps(p_curr_desc); } + + // To avoid redundant checking for every packet a seperate loop runs + // only in case timestamps are needed. 
+ if (m_b_rcvtstamp || m_n_tsing_flags) { + for (auto *p_curr_desc = p_first_desc; p_curr_desc; + p_curr_desc = p_curr_desc->p_next_desc) { + process_timestamps(p_curr_desc); + } + } + p_first_desc->set_ref_count(head_ref); + + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_frags += p_first_desc->rx.n_frags; + } } inline void sockinfo_tcp::save_packet_info_in_ready_list(pbuf *p) @@ -1989,16 +2108,15 @@ inline void sockinfo_tcp::save_packet_info_in_ready_list(pbuf *p) m_rx_pkt_ready_list.push_back(reinterpret_cast(p)); m_n_rx_pkt_ready_list_count++; m_rx_ready_byte_count += p->tot_len; - m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - m_p_socket_stats->n_rx_ready_byte_count += p->tot_len; - m_p_socket_stats->n_rx_ready_pkt_count++; - m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; - m_p_socket_stats->counters.n_rx_ready_pkt_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_pkt_count, - m_p_socket_stats->counters.n_rx_ready_pkt_max); - m_p_socket_stats->counters.n_rx_ready_byte_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, - m_p_socket_stats->counters.n_rx_ready_byte_max); + + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count += p->tot_len; + m_p_socket_stats->n_rx_ready_pkt_count++; + m_p_socket_stats->counters.n_rx_ready_pkt_max = std::max( + (uint32_t)m_n_rx_pkt_ready_list_count, m_p_socket_stats->counters.n_rx_ready_pkt_max); + m_p_socket_stats->counters.n_rx_ready_byte_max = std::max( + (uint32_t)m_rx_ready_byte_count, m_p_socket_stats->counters.n_rx_ready_byte_max); + } } inline void sockinfo_tcp::rx_lwip_shrink_rcv_wnd(size_t pbuf_tot_len, int bytes_received) @@ -2041,14 +2159,10 @@ err_t sockinfo_tcp::rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *pcb, stru conn->handle_rx_lwip_cb_error(p); return err; } - conn->rx_lwip_process_chained_pbufs(p); - - conn->m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - conn->m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; + conn->rx_lwip_process_chained_pbufs(p); conn->rx_lwip_cb_socketxtreme_helper(p); - io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); - conn->do_wakeup(); + /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user * buffer is not 'filled' @@ -2085,10 +2199,8 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str conn->handle_rx_lwip_cb_error(p); return err; } - conn->rx_lwip_process_chained_pbufs(p); - conn->m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - conn->m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; + conn->rx_lwip_process_chained_pbufs(p); xlio_recv_callback_retval_t callback_retval = XLIO_PACKET_RECV; @@ -2100,10 +2212,10 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str pkt_info.struct_sz = sizeof(pkt_info); pkt_info.packet_id = (void *)p_first_desc; - pkt_info.src = p_first_desc->rx.src.get_p_sa(); - pkt_info.dst = p_first_desc->rx.dst.get_p_sa(); - pkt_info.socket_ready_queue_pkt_count = conn->m_p_socket_stats->n_rx_ready_pkt_count; - pkt_info.socket_ready_queue_byte_count = conn->m_p_socket_stats->n_rx_ready_byte_count; + pkt_info.src = conn->m_connected.get_p_sa(); + pkt_info.dst = conn->m_bound.get_p_sa(); + pkt_info.socket_ready_queue_pkt_count = conn->m_n_rx_pkt_ready_list_count; + pkt_info.socket_ready_queue_byte_count = conn->m_rx_ready_byte_count; if (conn->m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { pkt_info.hw_timestamp = 
p_first_desc->rx.timestamps.hw; @@ -2139,7 +2251,7 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str if (callback_retval != XLIO_PACKET_HOLD) { // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); } else { conn->m_p_socket_stats->n_rx_zcopy_pkt_count++; } @@ -2201,6 +2313,9 @@ int sockinfo_tcp::handle_rx_error(bool blocking) si_tcp_logdbg("RX on reseted socket"); m_conn_state = TCP_CONN_FAILED; errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_TIMEOUT) { + si_tcp_logdbg("RX on timed out socket"); + errno = ETIMEDOUT; } else { si_tcp_logdbg("RX on disconnected socket - EOF"); ret = 0; @@ -2242,7 +2357,7 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov si_tcp_logfuncall(""); if (unlikely(m_sock_offload != TCP_SOCK_LWIP)) { int ret = 0; - ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + ret = rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); save_stats_rx_os(ret); return ret; } @@ -2368,17 +2483,15 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov void sockinfo_tcp::register_timer() { - if (m_timer_handle == NULL) { + // A reused time-wait socket wil try to add a timer although it is already registered. + // We should avoid calling register_socket_timer_event unnecessarily because it introduces + // internal-thread locks contention. + if (!is_timer_registered()) { si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", this, get_tcp_timer_collection(), g_tcp_timers_collection); - /* user_data is the socket itself for a fast cast in the timer_expired(). */ - m_timer_handle = get_event_mgr()->register_timer_event( - safe_mce_sys().tcp_timer_resolution_msec, this, PERIODIC_TIMER, - reinterpret_cast(this), get_tcp_timer_collection()); - } else { - si_tcp_logdbg("register_timer was called more than once. Something might be wrong, or " - "connect was called twice."); + set_timer_registered(true); + get_event_mgr()->register_socket_timer_event(this); } } @@ -2399,7 +2512,7 @@ void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_de } if (m_sysvar_tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_WITH_WAKEUP) { - g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + get_tcp_timer_collection()->register_wakeup_event(); } return; @@ -2407,16 +2520,17 @@ void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_de bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) { - struct tcp_pcb *pcb = NULL; + struct tcp_pcb *pcb = nullptr; int dropped_count = 0; lock_tcp_con(); save_strq_stats(p_rx_pkt_mem_buf_desc_info->rx.strides_num); - m_socket_stats.counters.n_rx_packets++; + m_iomux_ready_fd_array = (fd_array_t *)pv_fd_ready_array; if (unlikely(get_tcp_state(&m_pcb) == LISTEN)) { + // Listen socket is always 3T and so rx.src/dst are set as part of no-flow-id path. 
pcb = get_syn_received_pcb(p_rx_pkt_mem_buf_desc_info->rx.src, p_rx_pkt_mem_buf_desc_info->rx.dst); bool established_backlog_full = false; @@ -2482,7 +2596,7 @@ bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void sock->m_tcp_con_lock.unlock(); } - m_iomux_ready_fd_array = NULL; + m_iomux_ready_fd_array = nullptr; while (dropped_count--) { mem_buf_desc_t *p_rx_pkt_desc = m_rx_cb_dropped_list.get_and_pop_front(); @@ -2594,8 +2708,8 @@ int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) TRANS_XLIO) { passthrough_unlock("non offloaded socket --> connect only via OS"); return -1; - } else { - notify_epoll_context_fd_is_offloaded(); // remove fd from os epoll + } else if (has_epoll_context()) { + m_econtext->remove_fd_from_epoll_os(m_fd); // remove fd from os epoll } if (bound_any_addr) { @@ -2696,7 +2810,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) if (INPORT_ANY == in_port && (m_pcb.so_options & SOF_REUSEADDR)) { int reuse = 0; - ret = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + ret = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { si_tcp_logerr("Failed to disable SO_REUSEADDR option (ret=%d %m), connection will be " @@ -2706,9 +2820,9 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.bind(m_fd, __addr, __addrlen); + ret = SYSCALL(bind, m_fd, __addr, __addrlen); reuse = 1; - int rv = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + int rv = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); BULLSEYE_EXCLUDE_BLOCK_START if (rv) { si_tcp_logerr("Failed to enable SO_REUSEADDR option (ret=%d %m)", rv); @@ -2720,7 +2834,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) } } else { si_tcp_logdbg("OS bind to %s", sockaddr2str(__addr, __addrlen, true).c_str()); - ret = orig_os_api.bind(m_fd, __addr, __addrlen); + ret = SYSCALL(bind, m_fd, __addr, __addrlen); } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) @@ -2738,7 +2852,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) socklen_t addr_len = sizeof(addr); BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.getsockname(m_fd, addr.get_p_sa(), &addr_len)) { + if (SYSCALL(getsockname, m_fd, addr.get_p_sa(), &addr_len)) { si_tcp_logerr("get sockname failed"); UNLOCK_RET(-1); } @@ -2919,13 +3033,13 @@ int sockinfo_tcp::listen(int backlog) if (!success) { /* we will get here if attach_as_uc_receiver failed */ passthrough_unlock("Fallback the connection to os"); - return orig_os_api.listen(m_fd, orig_backlog); + return SYSCALL(listen, m_fd, orig_backlog); } // Calling to orig_listen() by default to monitor connection requests for not offloaded // sockets BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.listen(m_fd, orig_backlog)) { + if (SYSCALL(listen, m_fd, orig_backlog)) { // NOTE: The attach_as_uc_receiver at this stage already created steering rules. // Packets may arrive into the queues and the application may theoreticaly // call accept() with success. 
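The bind path above temporarily disables SO_REUSEADDR before asking the OS for an ephemeral port and restores the option afterwards, so the port the kernel picks is not one already shared with another reuse-addr socket. A minimal sketch of the same idea using the plain sockets API (fd handling and error checks are reduced for brevity; nothing here is specific to this codebase):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <cstdio>
#include <sys/socket.h>
#include <unistd.h>

int main()
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int off = 0, on = 1;

    // Turn the option off so the kernel hands out a port that is not already in reuse.
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &off, sizeof(off));

    sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = 0; // INPORT_ANY: let the kernel choose
    if (bind(fd, (sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
    }

    // Restore the user-visible SO_REUSEADDR setting.
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
    close(fd);
    return 0;
}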
@@ -2936,10 +3050,10 @@ int sockinfo_tcp::listen(int backlog) BULLSEYE_EXCLUDE_BLOCK_END // Add the user's orig fd to the rx epfd handle - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; ev.data.fd = m_fd; - int ret = orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); + int ret = SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); BULLSEYE_EXCLUDE_BLOCK_START if (unlikely(ret)) { if (errno == EEXIST) { @@ -2954,8 +3068,7 @@ int sockinfo_tcp::listen(int backlog) BULLSEYE_EXCLUDE_BLOCK_END if (tcp_ctl_thread_on(m_sysvar_tcp_ctl_thread)) { - m_timer_handle = g_p_event_handler_manager->register_timer_event( - safe_mce_sys().timer_resolution_msec, this, PERIODIC_TIMER, 0, NULL); + g_p_event_handler_manager->register_socket_timer_event(this); } unlock_tcp_con(); @@ -2982,7 +3095,7 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, { sockinfo_tcp *ns; // todo do one CQ poll and go to sleep even if infinite polling was set - int poll_count = m_n_sysvar_rx_poll_num; // do one poll and go to sleep (if blocking) + int poll_count = safe_mce_sys().rx_poll_num; // do one poll and go to sleep (if blocking) int ret; si_tcp_logfuncall(""); @@ -2991,9 +3104,9 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS accept()"); if (__flags) { - return orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + return SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); } else { - return orig_os_api.accept(m_fd, __addr, __addrlen); + return SYSCALL(accept, m_fd, __addr, __addrlen); } } @@ -3026,20 +3139,20 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, pollfd os_fd[1]; os_fd[0].fd = m_fd; os_fd[0].events = POLLIN; - ret = orig_os_api.poll(os_fd, 1, 0); // Zero timeout - just poll and return quickly + ret = SYSCALL(poll, os_fd, 1, 0); // Zero timeout - just poll and return quickly if (unlikely(ret == -1)) { m_p_socket_stats->counters.n_rx_os_errors++; - si_tcp_logdbg("orig_os_api.poll returned with error (errno=%d %m)", errno); + si_tcp_logdbg("SYSCALL(poll) returned with error (errno=%d %m)", errno); unlock_tcp_con(); return -1; } if (ret == 1) { - si_tcp_logdbg("orig_os_api.poll returned with packet"); + si_tcp_logdbg("SYSCALL(poll) returned with packet"); unlock_tcp_con(); if (__flags) { - return orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + return SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); } else { - return orig_os_api.accept(m_fd, __addr, __addrlen); + return SYSCALL(accept, m_fd, __addr, __addrlen); } } @@ -3083,7 +3196,7 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, if (m_sysvar_tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_WITH_WAKEUP && !m_rx_peer_packets.empty()) { - g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + get_tcp_timer_collection()->register_wakeup_event(); } unlock_tcp_con(); @@ -3164,15 +3277,15 @@ sockinfo_tcp *sockinfo_tcp::accept_clone() fd = socket_internal(m_family, SOCK_STREAM, 0, false, false); if (fd < 0) { m_p_socket_stats->listen_counters.n_conn_dropped++; - return 0; + return nullptr; } si = dynamic_cast(fd_collection_get_sockfd(fd)); if (!si) { si_tcp_logwarn("can not get accept socket from FD collection"); - close(fd); - return 0; + XLIO_CALL(close, fd); + return nullptr; } // This method is called from a flow which assumes that the socket is locked @@ -3227,19 
+3340,17 @@ void sockinfo_tcp::accept_connection_socketxtreme(sockinfo_tcp *parent, sockinfo child->m_p_socket_stats->set_bound_if(child->m_bound); child->m_p_socket_stats->bound_port = child->m_bound.get_in_port(); - xlio_socketxtreme_completion_t &parent_compl = parent->m_socketxtreme.ec->completion; - - child->m_connected.get_sa(reinterpret_cast(&parent_compl.src), - static_cast(sizeof(parent_compl.src))); - /* Update xlio_completion with * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED related data */ if (likely(child->m_parent)) { + xlio_socketxtreme_completion_t &completion = + *(child->set_events_socketxtreme(XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED, false)); + completion.listen_fd = child->m_parent->get_fd(); - child->m_socketxtreme.ec->completion.src = parent->m_socketxtreme.ec->completion.src; - child->m_socketxtreme.ec->completion.listen_fd = child->m_parent->get_fd(); - NOTIFY_ON_EVENTS(child, XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED); + child->m_connected.get_sa(reinterpret_cast(&completion.src), + static_cast(sizeof(completion.src))); + child->m_p_rx_ring->socketxtreme_end_ec_operation(); } else { vlog_printf(VLOG_ERROR, "XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED: can't find listen socket for new " @@ -3286,7 +3397,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e tcp_ip_output(&(new_sock->m_pcb), sockinfo_tcp::ip_output); tcp_arg(&(new_sock->m_pcb), new_sock); - if (new_sock->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb); @@ -3309,7 +3420,6 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e tcp_nagle_disabled(&new_sock->m_pcb)) { conn_nagle_disabled ? tcp_nagle_disable(&new_sock->m_pcb) : tcp_nagle_enable(&new_sock->m_pcb); - new_sock->fit_snd_bufs_to_nagle(conn_nagle_disabled); } if (new_sock->m_conn_state == TCP_CONN_INIT) { @@ -3337,10 +3447,12 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e while (!temp_list.empty()) { mem_buf_desc_t *desc = temp_list.get_and_pop_front(); - desc->inc_ref_count(); - L3_level_tcp_input((pbuf *)desc, &new_sock->m_pcb); - if (desc->dec_ref_count() <= 1) { // todo reuse needed? - new_sock->m_rx_ctl_reuse_list.push_back(desc); + if (likely(desc)) { + desc->inc_ref_count(); + L3_level_tcp_input((pbuf *)desc, &new_sock->m_pcb); + if (desc->dec_ref_count() <= 1) { // todo reuse needed? + new_sock->m_rx_ctl_reuse_list.push_back(desc); + } } } } @@ -3354,7 +3466,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e // todo check that listen socket was not closed by now ? (is_server()) conn->m_ready_pcbs.erase(&new_sock->m_pcb); - if (conn->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { accept_connection_socketxtreme(conn, new_sock); } else { conn->m_accepted_conns.push_back(new_sock); @@ -3366,13 +3478,13 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e conn->m_p_socket_stats->listen_counters.n_conn_backlog++; // OLG: Now we should wakeup all threads that are sleeping on this socket. 
- conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); // Now we should register the child socket to TCP timer conn->unlock_tcp_con(); /* Do this after auto_accept_connection() call */ - new_sock->m_parent = NULL; + new_sock->m_parent = nullptr; new_sock->lock_tcp_con(); @@ -3415,7 +3527,7 @@ void sockinfo_tcp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) struct tcp_pcb *sockinfo_tcp::get_syn_received_pcb(const flow_tuple &key) const { - struct tcp_pcb *ret_val = NULL; + struct tcp_pcb *ret_val = nullptr; syn_received_map_t::const_iterator itr; itr = m_syn_received.find(key); @@ -3450,7 +3562,6 @@ err_t sockinfo_tcp::clone_conn_cb(void *arg, struct tcp_pcb **newpcb) new_sock = conn->accept_clone(); if (new_sock) { - /* cppcheck-suppress autoVariables */ *newpcb = (struct tcp_pcb *)(&new_sock->m_pcb); new_sock->m_pcb.my_container = (void *)new_sock; /* XXX We have to search for correct listen socket every time, @@ -3499,12 +3610,12 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) new_sock->m_b_blocking = true; /* Dump statistics of the previous incarnation of the socket. */ - print_full_stats(new_sock->m_p_socket_stats, NULL, safe_mce_sys().stats_file); + print_full_stats(new_sock->m_p_socket_stats, nullptr, safe_mce_sys().stats_file); new_sock->socket_stats_init(); /* Reset zerocopy state */ atomic_set(&new_sock->m_zckey, 0); - new_sock->m_last_zcdesc = NULL; + new_sock->m_last_zcdesc = nullptr; new_sock->m_b_zc = false; new_sock->m_state = SOCKINFO_OPENED; @@ -3512,7 +3623,7 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) new_sock->m_conn_state = TCP_CONN_INIT; new_sock->m_parent = listen_sock; - if (new_sock->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb); @@ -3521,7 +3632,7 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) tcp_err(&new_sock->m_pcb, sockinfo_tcp::err_lwip_cb); tcp_sent(&new_sock->m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); new_sock->m_pcb.syn_tw_handled_cb = nullptr; - new_sock->wakeup_clear(); + new_sock->m_sock_wakeup_pipe.wakeup_clear(); if (tcp_ctl_thread_on(new_sock->m_sysvar_tcp_ctl_thread)) { tcp_ip_output(&new_sock->m_pcb, sockinfo_tcp::ip_output_syn_ack); } @@ -3703,7 +3814,7 @@ err_t sockinfo_tcp::connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err) NOTIFY_ON_EVENTS(conn, EPOLLOUT); // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); conn->m_p_socket_stats->set_connected_ip(conn->m_connected); conn->m_p_socket_stats->connected_port = conn->m_connected.get_in_port(); @@ -3774,10 +3885,10 @@ int sockinfo_tcp::wait_for_conn_ready_blocking() int sockinfo_tcp::os_epoll_wait(epoll_event *ep_events, int maxevents) { - return (likely(m_sysvar_tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) - ? orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, - m_loops_timer.time_left_msec()) - : os_epoll_wait_with_tcp_timers(ep_events, maxevents)); + return ( + likely(m_sysvar_tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) + ? 
SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()) + : os_epoll_wait_with_tcp_timers(ep_events, maxevents)); } int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxevents) @@ -3790,7 +3901,7 @@ int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxe ? sys_timer_resolution_msec : std::min(m_loops_timer.time_left_msec(), sys_timer_resolution_msec)); - rc = orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, next_timeout); + rc = SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, next_timeout); if (rc != 0 || m_loops_timer.time_left_msec() == 0) { break; @@ -3799,7 +3910,7 @@ int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxe // epol_wait timeout // We must run here TCP timers because we are in a mode when TCP timers are // handled by the context threads instead of the internal thread. - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } while (1); return rc; @@ -3906,7 +4017,7 @@ bool sockinfo_tcp::is_writeable() goto noblock; } - if (tcp_sndbuf(&m_pcb) > m_required_send_block) { + if (sndbuf_available() > m_required_send_block) { goto noblock; } @@ -3920,7 +4031,7 @@ bool sockinfo_tcp::is_writeable() p_fd_array->fd_count++; } */ - __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", tcp_sndbuf(&m_pcb)); + __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%ld", sndbuf_available()); return true; } @@ -3952,7 +4063,7 @@ int sockinfo_tcp::shutdown(int __how) // if in os pathrough just redirect to os if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS shutdown()"); - return orig_os_api.shutdown(m_fd, __how); + return SYSCALL(shutdown, m_fd, __how); } lock_tcp_con(); @@ -4007,7 +4118,7 @@ int sockinfo_tcp::shutdown(int __how) if (is_server()) { if (shut_rx) { - tcp_accept(&m_pcb, 0); + tcp_accept(&m_pcb, nullptr); tcp_syn_handled(&m_pcb, sockinfo_tcp::syn_received_drop_lwip_cb); } } else { @@ -4018,7 +4129,7 @@ int sockinfo_tcp::shutdown(int __how) } } - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (err == ERR_OK) { unlock_tcp_con(); @@ -4124,34 +4235,12 @@ void sockinfo_tcp::fit_rcv_wnd(bool force_fit) void sockinfo_tcp::fit_snd_bufs(unsigned int new_max_snd_buff) { - uint32_t sent_buffs_num = 0; + // snd_buf can become negative + m_pcb.snd_buf += ((int)new_max_snd_buff - m_pcb.max_snd_buff); + m_pcb.max_snd_buff = new_max_snd_buff; - sent_buffs_num = m_pcb.max_snd_buff - m_pcb.snd_buf; - if (sent_buffs_num <= new_max_snd_buff) { - m_pcb.max_snd_buff = new_max_snd_buff; - if (m_pcb.mss) { - m_pcb.max_unsent_len = (16 * (m_pcb.max_snd_buff) / m_pcb.mss); - } else { - m_pcb.max_unsent_len = - (16 * (m_pcb.max_snd_buff) / 536); /* should MSS be 0 use a const...very unlikely */ - } - /* make sure max_unsent_len is not 0 */ - m_pcb.max_unsent_len = std::max(m_pcb.max_unsent_len, 1U); - m_pcb.snd_buf = m_pcb.max_snd_buff - sent_buffs_num; - } -} - -void sockinfo_tcp::fit_snd_bufs_to_nagle(bool disable_nagle) -{ - if (m_sndbuff_max) { - return; - } - - if (disable_nagle) { - fit_snd_bufs(TCP_SND_BUF_NO_NAGLE); - } else { - fit_snd_bufs(TCP_SND_BUF); - } + uint16_t mss = m_pcb.mss ?: 536; + m_pcb.max_unsent_len = (mss - 1 + m_pcb.max_snd_buff * 16) / mss; } //////////////////////////////////////////////////////////////////////////////// @@ -4179,7 +4268,7 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt SOCKOPT_PASS_TO_OS) { if (!is_incoming() && (ret_opt == SOCKOPT_INTERNAL_XLIO_SUPPORT || 
ret_opt == SOCKOPT_HANDLE_BY_OS) && - m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval != NULL && + m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval && is_inherited_option(__level, __optname)) { socket_option_t *opt_curr = new socket_option_t(__level, __optname, __optval, __optlen); if (opt_curr) { @@ -4244,7 +4333,6 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt } else { tcp_nagle_enable(&m_pcb); } - fit_snd_bufs_to_nagle(val); unlock_tcp_con(); si_tcp_logdbg("(TCP_NODELAY) nagle: %d", val); break; @@ -4419,10 +4507,10 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt lock_tcp_con(); // OS allocates double the size of memory requested by the application - not sure we // need it. - m_sndbuff_max = std::max(2 * m_pcb.mss, 2 * val); - fit_snd_bufs(m_sndbuff_max); + val = std::max(2 * m_pcb.mss, 2 * val); + fit_snd_bufs(val); unlock_tcp_con(); - si_tcp_logdbg("setsockopt SO_SNDBUF: %d", m_sndbuff_max); + si_tcp_logdbg("setsockopt SO_SNDBUF: requested %d, set %d", *(int *)__optval, val); break; case SO_LINGER: if (__optlen < sizeof(struct linger)) { @@ -4620,7 +4708,7 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt return ret; } - if (!is_incoming() && m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval != NULL && + if (!is_incoming() && m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval && is_inherited_option(__level, __optname)) { m_socket_options_list.push_back( new socket_option_t(__level, __optname, __optval, __optlen)); @@ -4659,10 +4747,12 @@ void sockinfo_tcp::get_tcp_info(struct tcp_info *ti) ti->tcpi_snd_mss = m_pcb.mss; ti->tcpi_retransmits = m_pcb.nrtx; // ti->tcpi_retrans - we don't keep it and calculation would be O(N). - ti->tcpi_total_retrans = m_p_socket_stats->counters.n_tx_retransmits; ti->tcpi_snd_cwnd = m_pcb.cwnd / m_pcb.mss; ti->tcpi_snd_ssthresh = m_pcb.ssthresh / m_pcb.mss; + // This will be incorrect if sockets number is bigger than safe_mce_sys().stats_fd_num_max. + ti->tcpi_total_retrans = m_p_socket_stats->counters.n_tx_retransmits; + // Currently we miss per segment statistics and most of congestion control fields. 
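For reference, the SO_SNDBUF handling above doubles the requested value (mirroring what the OS does) and feeds it into fit_snd_bufs(), whose reworked max_unsent_len formula is a ceiling division. A short sketch of the math with made-up mss and request values; the variable names only echo the pcb fields used above:

#include <algorithm>
#include <cstdio>

int main()
{
    unsigned mss = 1460;   // falls back to 536 when the MSS is not known yet
    int requested = 65536; // value passed to setsockopt(SO_SNDBUF)
    unsigned max_snd_buff = std::max(2u * mss, 2u * (unsigned)requested); // doubled, like the OS

    // (mss - 1 + 16 * max_snd_buff) / mss == ceil(16 * max_snd_buff / mss),
    // so any non-zero send buffer maps to at least one unsent segment.
    unsigned max_unsent_len = (mss - 1 + 16u * max_snd_buff) / mss;

    std::printf("max_snd_buff=%u max_unsent_len=%u\n", max_snd_buff, max_unsent_len);
    return 0;
}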
} @@ -4819,8 +4909,8 @@ int sockinfo_tcp::getsockopt_offload(int __level, int __optname, void *__optval, break; case SO_SNDBUF: if (*__optlen >= sizeof(int)) { - *(int *)__optval = m_sndbuff_max; - si_tcp_logdbg("(SO_SNDBUF) sndbuf=%d", m_sndbuff_max); + *(int *)__optval = m_pcb.max_snd_buff; + si_tcp_logdbg("(SO_SNDBUF) sndbuf=%d", *(int *)__optval); ret = 0; } else { errno = EINVAL; @@ -4940,7 +5030,7 @@ int sockinfo_tcp::getsockopt(int __level, int __optname, void *__optval, socklen return -1; } - ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { @@ -4956,7 +5046,7 @@ int sockinfo_tcp::getsockname(sockaddr *__name, socklen_t *__namelen) if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS getsockname"); - return orig_os_api.getsockname(m_fd, __name, __namelen); + return SYSCALL(getsockname, m_fd, __name, __namelen); } // according to man address should be truncated if given struct is too small @@ -4979,7 +5069,7 @@ int sockinfo_tcp::getpeername(sockaddr *__name, socklen_t *__namelen) if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS getpeername"); - return orig_os_api.getpeername(m_fd, __name, __namelen); + return SYSCALL(getpeername, m_fd, __name, __namelen); } if (m_conn_state < TCP_CONN_CONNECTED) { @@ -5050,7 +5140,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) // There are scenarios when rx_wait_helper is called in an infinite loop but exits before // OS epoll_wait. Delegated TCP timers must be attempted in such case. // This is a slow path. So calling chrono::now(), even with every iteration, is OK here. - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } // if in blocking accept state skip poll phase and go to sleep directly @@ -5059,7 +5149,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) return -1; } - if (poll_count < m_n_sysvar_rx_poll_num || m_n_sysvar_rx_poll_num == -1) { + if (poll_count < safe_mce_sys().rx_poll_num || safe_mce_sys().rx_poll_num == -1) { return 0; } @@ -5104,7 +5194,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) lock_tcp_con(); if (!m_n_rx_pkt_ready_list_count && !m_ready_conn_cnt) { - going_to_sleep(); + m_sock_wakeup_pipe.going_to_sleep(); unlock_tcp_con(); } else { unlock_tcp_con(); @@ -5114,7 +5204,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) ret = os_wait_sock_rx_epfd(rx_epfd_events, SI_RX_EPFD_EVENT_MAX); lock_tcp_con(); - return_from_sleep(); + m_sock_wakeup_pipe.return_from_sleep(); unlock_tcp_con(); if (ret <= 0) { @@ -5128,9 +5218,9 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) for (int event_idx = 0; event_idx < ret; event_idx++) { int fd = rx_epfd_events[event_idx].data.fd; - if (is_wakeup_fd(fd)) { // wakeup event + if (m_sock_wakeup_pipe.is_wakeup_fd(fd)) { // wakeup event lock_tcp_con(); - remove_wakeup_fd(); + m_sock_wakeup_pipe.remove_wakeup_fd(); unlock_tcp_con(); continue; } @@ -5156,26 +5246,27 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) { m_rx_pkt_ready_list.pop_front(); - m_p_socket_stats->n_rx_ready_pkt_count--; + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_pkt_count--; + } m_n_rx_pkt_ready_list_count--; if (p_desc->p_next_desc) { - // vlog_printf(VLOG_ERROR, "detected chained 
pbufs! REF %u\n", - // p_desc->lwip_pbuf.pbuf.ref); mem_buf_desc_t *prev = p_desc; p_desc = p_desc->p_next_desc; - prev->rx.sz_payload = prev->lwip_pbuf.pbuf.len; - p_desc->rx.sz_payload = p_desc->lwip_pbuf.pbuf.tot_len = - prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len; + prev->rx.sz_payload = prev->lwip_pbuf.len; + p_desc->rx.sz_payload = p_desc->lwip_pbuf.tot_len = + prev->lwip_pbuf.tot_len - prev->lwip_pbuf.len; p_desc->rx.n_frags = --prev->rx.n_frags; - p_desc->rx.src = prev->rx.src; p_desc->inc_ref_count(); m_rx_pkt_ready_list.push_front(p_desc); m_n_rx_pkt_ready_list_count++; - m_p_socket_stats->n_rx_ready_pkt_count++; - prev->lwip_pbuf.pbuf.next = NULL; - prev->p_next_desc = NULL; + prev->lwip_pbuf.next = nullptr; + prev->p_next_desc = nullptr; prev->rx.n_frags = 1; + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_pkt_count++; + } reuse_buffer(prev); } else { reuse_buffer(p_desc); @@ -5183,7 +5274,7 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) if (m_n_rx_pkt_ready_list_count) { return m_rx_pkt_ready_list.front(); } else { - return NULL; + return nullptr; } } @@ -5196,7 +5287,7 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc_peek(mem_buf_desc_t *pdesc, int &rx_ pdesc = m_rx_pkt_ready_list[rx_pkt_ready_list_idx]; rx_pkt_ready_list_idx++; } else { - pdesc = NULL; + pdesc = nullptr; } return pdesc; @@ -5260,21 +5351,21 @@ int sockinfo_tcp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags if (len < 0 && p_desc_iter) { // Update length of right side of chain after split - push to pkt_ready_list - p_desc_iter->rx.sz_payload = p_desc_iter->lwip_pbuf.pbuf.tot_len = - prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len; + p_desc_iter->rx.sz_payload = p_desc_iter->lwip_pbuf.tot_len = + prev->lwip_pbuf.tot_len - prev->lwip_pbuf.len; // Update length of left side of chain after split - return to app mem_buf_desc_t *p_desc_head = reinterpret_cast(p_pkts->packet_id); // XXX TODO: subsequent buffers are not updated - p_desc_head->lwip_pbuf.pbuf.tot_len = p_desc_head->rx.sz_payload -= + p_desc_head->lwip_pbuf.tot_len = p_desc_head->rx.sz_payload -= p_desc_iter->rx.sz_payload; p_desc_iter->rx.n_frags = p_desc_head->rx.n_frags - p_pkts->sz_iov; p_desc_head->rx.n_frags = p_pkts->sz_iov; - p_desc_iter->rx.src = prev->rx.src; p_desc_iter->inc_ref_count(); - prev->lwip_pbuf.pbuf.next = NULL; - prev->p_next_desc = NULL; + + prev->lwip_pbuf.next = nullptr; + prev->p_next_desc = nullptr; m_rx_pkt_ready_list.push_front(p_desc_iter); break; @@ -5318,7 +5409,7 @@ void sockinfo_tcp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) tcp_conn_state_e conn_state; u32_t last_unsent_seqno = 0, last_unacked_seqno = 0, first_unsent_seqno = 0, first_unacked_seqno = 0; - u16_t last_unsent_len = 0, last_unacked_len = 0, first_unsent_len = 0, first_unacked_len = 0; + u32_t last_unsent_len = 0, last_unacked_len = 0, first_unsent_len = 0, first_unacked_len = 0; int rcvbuff_max, rcvbuff_current, rcvbuff_non_tcp_recved, rx_pkt_ready_list_size, rx_ctl_packets_list_size, rx_ctl_reuse_list_size; @@ -5419,7 +5510,7 @@ void sockinfo_tcp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) pcb.snd_wl1, pcb.snd_wl2); // Send buffer - vlog_printf(log_level, "Send buffer : snd_buf %u, max_snd_buff %u\n", pcb.snd_buf, + vlog_printf(log_level, "Send buffer : snd_buf %d, max_snd_buff %u\n", pcb.snd_buf, pcb.max_snd_buff); // Retransmission @@ -5532,12 +5623,12 @@ void sockinfo_tcp::socketxtreme_recv_buffs_tcp(mem_buf_desc_t *desc, uint16_t le 
mem_buf_desc_t *sockinfo_tcp::tcp_tx_mem_buf_alloc(pbuf_type type) { dst_entry_tcp *p_dst = (dst_entry_tcp *)(m_p_connected_dst_entry); - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(p_dst)) { /* Currently this method is called from TLS layer without locks */ m_tcp_con_lock.lock(); - desc = p_dst->get_buffer(type, NULL); + desc = p_dst->get_buffer(type, nullptr); m_tcp_con_lock.unlock(); } return desc; @@ -5553,24 +5644,35 @@ struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ { sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb *)p_conn)->my_container); dst_entry_tcp *p_dst = (dst_entry_tcp *)(p_si_tcp->m_p_connected_dst_entry); - mem_buf_desc_t *p_desc = NULL; + mem_buf_desc_t *p_desc = nullptr; if (likely(p_dst)) { p_desc = p_dst->get_buffer(type, desc); - if (p_desc && (p_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) && - ((p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NONE) || - (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MKEY) || - p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX)) { + } + if (likely(p_desc) && p_desc->lwip_pbuf.type == PBUF_ZEROCOPY) { + if (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_EXPRESS) { + p_desc->m_flags |= mem_buf_desc_t::CALLBACK; + p_desc->tx.zc.callback = tcp_express_zc_callback; + if (p_buff) { + mem_buf_desc_t *p_prev_desc = reinterpret_cast(p_buff); + p_desc->tx.zc.ctx = p_prev_desc->tx.zc.ctx; + } else { + p_desc->tx.zc.ctx = reinterpret_cast(p_si_tcp); + } + } else if ((p_desc->lwip_pbuf.desc.attr == PBUF_DESC_NONE) || + (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_MKEY) || + (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_NVME_TX)) { /* Prepare error queue fields for send zerocopy */ if (p_buff) { /* It is a special case that can happen as a result * of split operation of existing zc buffer */ mem_buf_desc_t *p_prev_desc = (mem_buf_desc_t *)p_buff; - p_desc->m_flags |= mem_buf_desc_t::ZCOPY; + p_desc->m_flags |= mem_buf_desc_t::URGENT; + p_desc->m_flags |= mem_buf_desc_t::CALLBACK; p_desc->tx.zc.id = p_prev_desc->tx.zc.id; p_desc->tx.zc.count = p_prev_desc->tx.zc.count; - p_desc->tx.zc.len = p_desc->lwip_pbuf.pbuf.len; + p_desc->tx.zc.len = p_desc->lwip_pbuf.len; p_desc->tx.zc.ctx = p_prev_desc->tx.zc.ctx; p_desc->tx.zc.callback = tcp_tx_zc_callback; p_prev_desc->tx.zc.count = 0; @@ -5589,7 +5691,7 @@ void sockinfo_tcp::tcp_rx_pbuf_free(struct pbuf *p_buff) { mem_buf_desc_t *desc = (mem_buf_desc_t *)p_buff; - if (desc->p_desc_owner != NULL && p_buff->type != PBUF_ZEROCOPY) { + if (desc->p_desc_owner && p_buff->type != PBUF_ZEROCOPY) { desc->p_desc_owner->mem_buf_rx_release(desc); } else { buffer_pool::free_rx_lwip_pbuf_custom(p_buff); @@ -5614,8 +5716,8 @@ void sockinfo_tcp::tcp_tx_pbuf_free(void *p_conn, struct pbuf *p_buff) __log_err("ref count of %p is already zero, double free??", p_desc); } - if (p_desc->lwip_pbuf.pbuf.ref == 0) { - p_desc->p_next_desc = NULL; + if (p_desc->lwip_pbuf.ref == 0) { + p_desc->p_next_desc = nullptr; buffer_pool::free_tx_lwip_pbuf_custom(p_buff); } } @@ -5623,16 +5725,17 @@ void sockinfo_tcp::tcp_tx_pbuf_free(void *p_conn, struct pbuf *p_buff) mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) { - p_desc->m_flags |= mem_buf_desc_t::ZCOPY; + p_desc->m_flags |= mem_buf_desc_t::URGENT; + p_desc->m_flags |= mem_buf_desc_t::CALLBACK; p_desc->tx.zc.id = atomic_read(&m_zckey); p_desc->tx.zc.count = 1; - p_desc->tx.zc.len = p_desc->lwip_pbuf.pbuf.len; + p_desc->tx.zc.len = p_desc->lwip_pbuf.len; p_desc->tx.zc.ctx = (void *)this; 
p_desc->tx.zc.callback = tcp_tx_zc_callback; - if (m_last_zcdesc && (m_last_zcdesc != p_desc) && (m_last_zcdesc->lwip_pbuf.pbuf.ref > 0) && + if (m_last_zcdesc && (m_last_zcdesc != p_desc) && (m_last_zcdesc->lwip_pbuf.ref > 0) && (m_last_zcdesc->tx.zc.id == p_desc->tx.zc.id)) { - m_last_zcdesc->tx.zc.len = m_last_zcdesc->lwip_pbuf.pbuf.len; + m_last_zcdesc->tx.zc.len = m_last_zcdesc->lwip_pbuf.len; m_last_zcdesc->tx.zc.count = 0; } m_last_zcdesc = p_desc; @@ -5640,9 +5743,22 @@ mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) return p_desc; } +/*static*/ +void sockinfo_tcp::tcp_express_zc_callback(mem_buf_desc_t *p_desc) +{ + sockinfo_tcp *si = reinterpret_cast(p_desc->tx.zc.ctx); + const uintptr_t opaque_op = reinterpret_cast(p_desc->lwip_pbuf.desc.opaque); + + if (opaque_op && si->m_p_group && si->m_p_group->m_socket_comp_cb) { + si->m_p_group->m_socket_comp_cb(reinterpret_cast(si), + si->m_xlio_socket_userdata, opaque_op); + } +} + +/*static*/ void sockinfo_tcp::tcp_tx_zc_callback(mem_buf_desc_t *p_desc) { - sockinfo_tcp *sock = NULL; + sockinfo_tcp *sock = nullptr; if (!p_desc) { return; @@ -5662,7 +5778,6 @@ void sockinfo_tcp::tcp_tx_zc_callback(mem_buf_desc_t *p_desc) cleanup: /* Clean up */ - p_desc->m_flags &= ~mem_buf_desc_t::ZCOPY; memset(&p_desc->tx.zc, 0, sizeof(p_desc->tx.zc)); if (sock && p_desc == sock->m_last_zcdesc) { sock->m_last_zcdesc = nullptr; @@ -5674,7 +5789,7 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) uint32_t lo, hi; uint16_t count; uint32_t prev_lo, prev_hi; - mem_buf_desc_t *err_queue = NULL; + mem_buf_desc_t *err_queue = nullptr; sockinfo_tcp *sock = this; count = p_desc->tx.zc.count; @@ -5704,7 +5819,7 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) err_queue->ee.ee_data = hi; } } else if ((sum_count >= (1ULL << 32)) || (lo != prev_hi + 1)) { - err_queue = NULL; + err_queue = nullptr; } else { err_queue->ee.ee_data += count; } @@ -5720,7 +5835,12 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) /* Signal events on socket */ NOTIFY_ON_EVENTS(sock, EPOLLERR); - sock->do_wakeup(); + + // Avoid cache access unnecessarily. + // Non-blocking sockets are waked-up as part of mux handling. + if (unlikely(is_blocking())) { + sock->m_sock_wakeup_pipe.do_wakeup(); + } } struct tcp_seg *sockinfo_tcp::tcp_seg_alloc_direct(void *p_conn) @@ -5749,12 +5869,12 @@ void sockinfo_tcp::tcp_seg_free_cached(void *p_conn, struct tcp_seg *seg) void sockinfo_tcp::return_tcp_segs(struct tcp_seg *seg) { - (likely(m_p_rx_ring)) ? m_p_rx_ring->put_tcp_segs(seg) : g_tcp_seg_pool->put_tcp_segs(seg); + (likely(m_p_rx_ring)) ? m_p_rx_ring->put_tcp_segs(seg) : g_tcp_seg_pool->put_objs(seg); } struct tcp_seg *sockinfo_tcp::get_tcp_seg_direct() { - return likely(m_p_rx_ring) ? m_p_rx_ring->get_tcp_segs(1U) : g_tcp_seg_pool->get_tcp_segs(1U); + return likely(m_p_rx_ring) ? m_p_rx_ring->get_tcp_segs(1U) : g_tcp_seg_pool->get_objs(1U); } struct tcp_seg *sockinfo_tcp::get_tcp_seg_cached() @@ -5762,7 +5882,7 @@ struct tcp_seg *sockinfo_tcp::get_tcp_seg_cached() if (!m_tcp_seg_list) { m_tcp_seg_list = (likely(m_p_rx_ring)) ? 
m_p_rx_ring->get_tcp_segs(m_sysvar_tx_segs_batch_tcp) - : g_tcp_seg_pool->get_tcp_segs(m_sysvar_tx_segs_batch_tcp); + : g_tcp_seg_pool->get_objs(m_sysvar_tx_segs_batch_tcp); if (unlikely(!m_tcp_seg_list)) { return nullptr; @@ -5796,30 +5916,21 @@ void sockinfo_tcp::put_tcp_seg_cached(struct tcp_seg *seg) --m_tcp_seg_in_use; if (m_tcp_seg_count > 2U * m_sysvar_tx_segs_batch_tcp && m_tcp_seg_in_use < m_tcp_seg_count / 2U) { - return_tcp_segs(tcp_seg_pool::split_tcp_segs((m_tcp_seg_count - m_tcp_seg_in_use) / 2U, + return_tcp_segs(tcp_seg_pool::split_obj_list((m_tcp_seg_count - m_tcp_seg_in_use) / 2U, m_tcp_seg_list, m_tcp_seg_count)); } } -tcp_timers_collection::tcp_timers_collection(int period, int resolution) +tcp_timers_collection::tcp_timers_collection() + : tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec / + safe_mce_sys().timer_resolution_msec) { - m_n_period = period; - m_n_resolution = resolution; - m_n_intervals_size = period / resolution; - m_timer_handle = NULL; - m_p_intervals = new timer_node_t *[m_n_intervals_size]; - BULLSEYE_EXCLUDE_BLOCK_START - if (!m_p_intervals) { - __log_dbg("failed to allocate memory"); - free_tta_resources(); - throw_xlio_exception("failed to allocate memory"); - } +} - BULLSEYE_EXCLUDE_BLOCK_END - memset(m_p_intervals, 0, sizeof(timer_node_t *) * m_n_intervals_size); - m_n_location = 0; - m_n_next_insert_bucket = 0; - m_n_count = 0; +tcp_timers_collection::tcp_timers_collection(int intervals) +{ + m_n_intervals_size = intervals; + m_p_intervals.resize(m_n_intervals_size); } tcp_timers_collection::~tcp_timers_collection() @@ -5827,21 +5938,29 @@ tcp_timers_collection::~tcp_timers_collection() free_tta_resources(); } -void tcp_timers_collection::free_tta_resources(void) +event_handler_manager *tcp_timers_collection::get_event_mgr() { - if (m_n_count) { - for (int i = 0; i < m_n_intervals_size; i++) { - if (m_p_intervals[i]) { - remove_timer(m_p_intervals[i]); - } - } + if (m_p_group) { + return m_p_group->get_event_handler(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_event_handler_manager_local; + } else { + return g_p_event_handler_manager; + } +} - if (m_n_count) { - __log_dbg("not all TCP timers have been removed, count=%d", m_n_count); +void tcp_timers_collection::free_tta_resources() +{ + for (auto &bucket : m_p_intervals) { + while (!bucket.empty()) { + remove_timer(bucket.front()); } } - delete[] m_p_intervals; + if (m_n_count) { + __log_dbg("Not all TCP socket timers have been removed, count=%d", m_n_count); + } } void tcp_timers_collection::clean_obj() @@ -5851,7 +5970,7 @@ void tcp_timers_collection::clean_obj() } set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; event_handler_manager *p_event_mgr = get_event_mgr(); if (p_event_mgr->is_running()) { @@ -5864,13 +5983,15 @@ void tcp_timers_collection::clean_obj() void tcp_timers_collection::handle_timer_expired(void *user_data) { NOT_IN_USE(user_data); - timer_node_t *iter = m_p_intervals[m_n_location]; - sockinfo_tcp *p_sock; - + sock_list &bucket = m_p_intervals[m_n_location]; m_n_location = (m_n_location + 1) % m_n_intervals_size; - while (iter) { - p_sock = reinterpret_cast(iter->user_data); + auto iter = bucket.begin(); + while (iter != bucket.end()) { + sockinfo_tcp *p_sock = *iter; + // Must inc iter first bacause handle_timer_expired can erase + // the socket that the iter points to, with delegated timers. 
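// --- Editor's aside: illustrative sketch, not part of this patch ----------
// The comment above describes a classic idiom: advance the iterator before
// invoking the handler, because the handler may erase the element the
// iterator currently refers to (std::list::erase invalidates only iterators
// to the erased element, so the already-advanced iterator stays valid).
// Standalone demonstration with invented names:

#include <cstdio>
#include <list>

int main()
{
    std::list<int> bucket {1, 2, 3, 4};

    auto handler = [&bucket](int v) {
        if (v % 2 == 0) {
            bucket.remove(v); // may erase the element that was just visited
        }
    };

    auto it = bucket.begin();
    while (it != bucket.end()) {
        int v = *it;
        ++it;          // step off the element before the handler can erase it
        handler(v);
    }

    for (int v : bucket) {
        std::printf("%d ", v); // prints: 1 3
    }
    std::printf("\n");
    return 0;
}
// --------------------------------------------------------------------------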
+ iter++; /* It is not guaranteed that the same sockinfo object is met once * in this loop. @@ -5881,7 +6002,7 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) if (!p_sock->trylock_tcp_con()) { bool destroyable = false; if (!p_sock->is_cleaned()) { - p_sock->handle_timer_expired(iter->user_data); + p_sock->handle_timer_expired(); destroyable = p_sock->is_destroyable_no_lock(); } p_sock->unlock_tcp_con(); @@ -5889,78 +6010,73 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) g_p_fd_collection->destroy_sockfd(p_sock); } } - iter = iter->next; } /* Processing all messages for the daemon */ - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->progress(); } } -void tcp_timers_collection::add_new_timer(timer_node_t *node, timer_handler *handler, - void *user_data) +void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) { - node->handler = handler; - node->user_data = user_data; - node->group = this; - node->next = NULL; - node->prev = NULL; - if (m_p_intervals[m_n_next_insert_bucket] != NULL) { - m_p_intervals[m_n_next_insert_bucket]->prev = node; - node->next = m_p_intervals[m_n_next_insert_bucket]; + if (!sock) { + __log_warn("Trying to add timer for null TCP socket %p", sock); + return; } - m_p_intervals[m_n_next_insert_bucket] = node; - m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; - if (m_n_count == 0) { - m_timer_handle = - get_event_mgr()->register_timer_event(m_n_resolution, this, PERIODIC_TIMER, NULL); + sock_list &bucket = m_p_intervals[m_n_next_insert_bucket]; + bucket.emplace_back(sock); + auto rc = + m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); + + // If the socket already exists in m_sock_remove_map, emplace returns false in rc.second + // Mainly for sanity check, we dont expect it. + if (unlikely(!rc.second)) { + __log_warn("Trying to add timer twice for TCP socket %p", sock); + bucket.pop_back(); + return; } - m_n_count++; - __log_dbg("new TCP timer handler [%p] was added", handler); + m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; + if (0 == m_n_count++) { + m_timer_handle = get_event_mgr()->register_timer_event(safe_mce_sys().timer_resolution_msec, + this, PERIODIC_TIMER, nullptr); + } + + __log_dbg("New TCP socket [%p] timer was added", sock); } -void tcp_timers_collection::remove_timer(timer_node_t *node) +void tcp_timers_collection::remove_timer(sockinfo_tcp *sock) { - if (!node) { - return; - } - - node->group = NULL; + auto node = m_sock_remove_map.find(sock); + if (node != m_sock_remove_map.end()) { + m_p_intervals[std::get<0>(node->second)].erase(std::get<1>(node->second)); + m_sock_remove_map.erase(node); + sock->set_timer_registered(false); - if (node->prev) { - node->prev->next = node->next; - } else { - for (int i = 0; i < m_n_intervals_size; i++) { - if (m_p_intervals[i] == node) { - m_p_intervals[i] = node->next; - break; + if (!(--m_n_count)) { + if (m_timer_handle) { + get_event_mgr()->unregister_timer_event(this, m_timer_handle); + m_timer_handle = nullptr; } } - } - - if (node->next) { - node->next->prev = node->prev; - } - m_n_count--; - if (m_n_count == 0) { - if (m_timer_handle) { - get_event_mgr()->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; - } + __log_dbg("TCP socket [%p] timer was removed", sock); + } else { + // Listen sockets are not added to timers. + // As part of socket general unregister and destroy they will get here and will no be found. 
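// --- Editor's aside: illustrative sketch, not part of this patch ----------
// add_new_timer()/remove_timer() above replace the old intrusive prev/next
// links with round-robin buckets (a std::vector of std::list) plus a hash
// map that remembers each socket's (bucket index, list iterator), so both
// insertion and removal are O(1) without scanning. The standalone sketch
// below mirrors that layout with int keys; bucketed_registry and its members
// are invented names.

#include <cassert>
#include <cstddef>
#include <iterator>
#include <list>
#include <tuple>
#include <unordered_map>
#include <vector>

class bucketed_registry {
public:
    explicit bucketed_registry(size_t buckets) : m_buckets(buckets) {}

    bool add(int key)
    {
        auto &bucket = m_buckets[m_next];
        bucket.push_back(key);
        auto rc = m_where.emplace(key, std::make_tuple(m_next, std::prev(bucket.end())));
        if (!rc.second) { // duplicate registration: undo the speculative insert
            bucket.pop_back();
            return false;
        }
        m_next = (m_next + 1) % m_buckets.size();
        return true;
    }

    bool remove(int key)
    {
        auto it = m_where.find(key);
        if (it == m_where.end()) {
            return false; // e.g. a key that was never registered
        }
        m_buckets[std::get<0>(it->second)].erase(std::get<1>(it->second));
        m_where.erase(it);
        return true;
    }

private:
    using bucket_t = std::list<int>;
    std::vector<bucket_t> m_buckets;
    std::unordered_map<int, std::tuple<size_t, bucket_t::iterator>> m_where;
    size_t m_next = 0;
};

int main()
{
    bucketed_registry reg(4);
    assert(reg.add(42));
    assert(!reg.add(42));    // second add of the same key is rejected
    assert(reg.remove(42));
    assert(!reg.remove(42)); // already gone, nothing to erase
    return 0;
}
// --------------------------------------------------------------------------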
+ __log_dbg("TCP socket [%p] timer was not found (listen socket)", sock); } +} - __log_dbg("TCP timer handler [%p] was removed", node->handler); - - free(node); +void tcp_timers_collection::register_wakeup_event() +{ + g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); } thread_local_tcp_timers::thread_local_tcp_timers() - : tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec, - safe_mce_sys().tcp_timer_resolution_msec) + : tcp_timers_collection(1) { } @@ -6006,7 +6122,7 @@ bool sockinfo_tcp::is_utls_supported(int direction) const int sockinfo_tcp::get_supported_nvme_feature_mask() const { ring *p_ring = get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { return false; } return p_ring->get_supported_nvme_feature_mask(); @@ -6031,8 +6147,8 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, // first bind call with port 0, we set SO_REUSEPORT so we will be able to bind to a // specific port later when we reuse port int so_reuseport = 1; - if ((bind_ret = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEPORT, &so_reuseport, - sizeof(so_reuseport)))) { + if ((bind_ret = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEPORT, &so_reuseport, + sizeof(so_reuseport)))) { return RETURN_FROM_BIND; } m_bound.set_sockaddr(__addr, __addrlen); @@ -6042,3 +6158,239 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, return CONTINUE_WITH_BIND; } + +int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, + unsigned flags, void *opaque_op) +{ + if (unlikely(!is_connected_and_ready_to_send())) { + return -1; + } + + pbuf_desc mdesc; + + switch (flags & XLIO_EXPRESS_OP_TYPE_MASK) { + case XLIO_EXPRESS_OP_TYPE_DESC: + mdesc.attr = PBUF_DESC_EXPRESS; + break; + case XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY: + mdesc.attr = PBUF_DESC_MDESC; + break; + default: + return -1; + }; + mdesc.mkey = mkey; + mdesc.opaque = opaque_op; + + int bytes_written = 0; + for (unsigned i = 0; i < iov_len; ++i) { + bytes_written += iov[i].iov_len; + } + + lock_tcp_con(); + + err_t err = tcp_write_express(&m_pcb, iov, iov_len, &mdesc); + if (unlikely(err != ERR_OK)) { + // The only error in tcp_write_express() is a memory error. + m_conn_state = TCP_CONN_ERROR; + m_error_status = ENOMEM; + return tcp_tx_handle_errno_and_unlock(ENOMEM); + } + if (!(flags & XLIO_EXPRESS_MSG_MORE)) { + tcp_output(&m_pcb); + m_b_xlio_socket_dirty = false; + } else if (m_p_group && !m_b_xlio_socket_dirty) { + m_b_xlio_socket_dirty = true; + m_p_group->add_dirty_socket(this); + } + + unlock_tcp_con(); + + return bytes_written; +} + +int sockinfo_tcp::tcp_tx_express_inline(const struct iovec *iov, unsigned iov_len, unsigned flags) +{ + if (unlikely(!is_connected_and_ready_to_send())) { + return -1; + } + + pbuf_desc mdesc; + int bytes_written = 0; + + memset(&mdesc, 0, sizeof(mdesc)); + mdesc.attr = PBUF_DESC_EXPRESS; + + lock_tcp_con(); + + for (unsigned i = 0; i < iov_len; ++i) { + bytes_written += iov[i].iov_len; + err_t err = tcp_write(&m_pcb, iov[i].iov_base, iov[i].iov_len, 0, &mdesc); + if (unlikely(err != ERR_OK)) { + // XXX tcp_write() can return multiple errors. + // XXX tcp_write() can also fail due to queuelen limit, but this is unlikely. + m_conn_state = TCP_CONN_ERROR; + m_error_status = ENOMEM; + return tcp_tx_handle_errno_and_unlock(ENOMEM); + } + } + if (!(flags & XLIO_EXPRESS_MSG_MORE)) { + /* Force doorbell and TX completion for the last TCP segment. 
Group level flush is not + * mandatory if user uses send level flush. + */ + mem_buf_desc_t *p_desc = reinterpret_cast(m_pcb.last_unsent->p); + p_desc->m_flags |= mem_buf_desc_t::URGENT; + + m_b_xlio_socket_dirty = false; + tcp_output(&m_pcb); + } else if (m_p_group && !m_b_xlio_socket_dirty) { + m_b_xlio_socket_dirty = true; + m_p_group->add_dirty_socket(this); + } + + unlock_tcp_con(); + + return bytes_written; +} + +void sockinfo_tcp::flush(bool force_db /*=false*/) +{ + lock_tcp_con(); + m_b_xlio_socket_dirty = false; + tcp_output(&m_pcb); + + if (force_db) { + ring *tx_ring = get_tx_ring(); + if (likely(tx_ring)) { + tx_ring->ring_delayed_doorbell(); + } + } + unlock_tcp_con(); +} + +ssize_t sockinfo_tcp::tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_tmp, bool is_dummy, + bool is_send_zerocopy) +{ + tcp_output(&m_pcb); // force data out + + if (unlikely(has_stats())) { + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; + } else if (total_tx) { + m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + m_p_socket_stats->n_tx_ready_byte_count += total_tx; + } + } + + /* Each send call with MSG_ZEROCOPY that successfully sends + * data increments the counter. + * The counter is not incremented on failure or if called with length zero. + */ + if (is_send_zerocopy && (total_tx > 0)) { + if (m_last_zcdesc->tx.zc.id != (uint32_t)atomic_read(&m_zckey)) { + /* si_tcp_logerr("Invalid tx zcopy operation"); */ + } else { + atomic_fetch_and_inc(&m_zckey); + } + } + + unlock_tcp_con(); + + /* Restore errno on function entry in case success */ + errno = errno_tmp; + + return total_tx; +} + +ssize_t sockinfo_tcp::tcp_tx_handle_errno_and_unlock(int error_number) +{ + errno = error_number; + + // nothing send nb mode or got some other error + if (errno == EAGAIN) { + m_p_socket_stats->counters.n_tx_eagain++; + } else { + m_p_socket_stats->counters.n_tx_errors++; + } + unlock_tcp_con(); + return -1; +} + +ssize_t sockinfo_tcp::tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, int errno_to_report, + bool is_dummy, bool is_send_zerocopy, + int errno_to_restore) +{ + if (total_tx > 0) { + return tcp_tx_handle_done_and_unlock(total_tx, errno_to_restore, is_dummy, + is_send_zerocopy); + } + si_tcp_logdbg("Returning with: %d", errno_to_report); + return tcp_tx_handle_errno_and_unlock(errno_to_report); +} + +bool sockinfo_tcp::is_connected_and_ready_to_send() +{ + /* TODO should we add !g_b_exit here? 
*/ + if (unlikely(!is_rts())) { + if (m_conn_state == TCP_CONN_TIMEOUT) { + si_tcp_logdbg("TX timed out"); + errno = ETIMEDOUT; + } else if (m_conn_state == TCP_CONN_CONNECTING) { + si_tcp_logdbg("TX while async-connect on socket return EAGAIN"); + errno = EAGAIN; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("TX on reseted socket"); + errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_ERROR) { + si_tcp_logdbg("TX on connection failed socket"); + errno = ECONNREFUSED; + } else { + si_tcp_logdbg("TX on unconnected socket"); + errno = EPIPE; + } + return false; + } + return true; +} + +/* Process a case when space is not available at the sending socket + * to hold the message to be transmitted + * Nonblocking socket: + * - no data is buffered: return (-1) and EAGAIN + * - some data is buffered: return number of bytes ready to be sent + */ +ssize_t sockinfo_tcp::tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is_dummy, + bool is_send_zerocopy, int errno_to_restore) +{ + // non blocking socket should return in order not to tx_wait() + if (total_tx > 0) { + m_tx_consecutive_eagain_count = 0; + return tcp_tx_handle_done_and_unlock(total_tx, errno_to_restore, is_dummy, + is_send_zerocopy); + } else { + m_tx_consecutive_eagain_count++; + if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { + if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + // Slow path. We must attempt TCP timers here for applications that + // do not check for EV_OUT. + g_event_handler_manager_local.do_tasks(); + } + // in case of zero sndbuf and non-blocking just try once polling CQ for + // ACK + int poll_count = 0; + rx_wait(poll_count, false); + m_tx_consecutive_eagain_count = 0; + } + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } +} + +size_t sockinfo_tcp::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) +{ + NOT_IN_USE(payload_size); + NOT_IN_USE(in_flags); + *p_out_flags &= ~MSG_TRUNC; // don't handle msg_trunc + return total_rx; +} diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index e72e8d46d..bb193bc2d 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -35,12 +35,11 @@ #include "utils/lock_wrapper.h" #include "proto/mem_buf_desc.h" -#include "sock/socket_fd_api.h" +#include "sock/sockinfo.h" #include "dev/buffer_pool.h" -#include "dev/cq_mgr.h" +#include "dev/cq_mgr_rx.h" #include "xlio_extra.h" -// LWIP includes #include "lwip/opt.h" #include "lwip/tcp_impl.h" @@ -48,6 +47,10 @@ #include "sockinfo_ulp.h" #include "sockinfo_nvme.h" +/* Forward declarations */ +struct xlio_socket_attr; +class poll_group; + #define BLOCK_THIS_RUN(blocking, flags) (blocking && !(flags & MSG_DONTWAIT)) /** @@ -89,6 +92,14 @@ enum tcp_conn_state_e { TCP_CONN_RESETED }; +enum xlio_express_flags : uint32_t { + XLIO_EXPRESS_OP_TYPE_DESC, + XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, + XLIO_EXPRESS_OP_TYPE_MASK = 0x000fu, + XLIO_EXPRESS_MSG_MORE, + XLIO_EXPRESS_MSG_MASK = 0x00f0u, +}; + struct socket_option_t { const int level; const int optname; @@ -113,6 +124,51 @@ struct socket_option_t { } }; +class tcp_timers_collection : public timer_handler, public cleanable_obj { +public: + tcp_timers_collection(); + tcp_timers_collection(int intervals); + ~tcp_timers_collection() override; + + void clean_obj() override; + + void handle_timer_expired(void *user_data) override; + + void register_wakeup_event(); + + void add_new_timer(sockinfo_tcp *sock); + + void 
remove_timer(sockinfo_tcp *sock); + + void set_group(poll_group *group) { m_p_group = group; } + inline event_handler_manager *get_event_mgr(); + +private: + void free_tta_resources(); + +protected: + void *m_timer_handle = nullptr; + +private: + typedef std::list sock_list; + typedef typename sock_list::iterator sock_list_itr; + std::vector m_p_intervals; + std::unordered_map> m_sock_remove_map; + int m_n_intervals_size; + int m_n_location = 0; + int m_n_count = 0; + int m_n_next_insert_bucket = 0; + poll_group *m_p_group = nullptr; +}; + +class thread_local_tcp_timers : public tcp_timers_collection { +public: + thread_local_tcp_timers(); + ~thread_local_tcp_timers() override; +}; + +extern tcp_timers_collection *g_tcp_timers_collection; + typedef std::deque socket_options_list_t; typedef std::map ready_pcb_map_t; typedef std::map syn_received_map_t; @@ -127,83 +183,89 @@ enum inet_ecns { INET_ECN_MASK = 3, }; -class sockinfo_tcp : public sockinfo, public timer_handler { +class sockinfo_tcp : public sockinfo { public: - static inline size_t accepted_conns_node_offset(void) + static inline size_t accepted_conns_node_offset() { return NODE_OFFSET(sockinfo_tcp, accepted_conns_node); } typedef xlio_list_t sock_list_t; sockinfo_tcp(int fd, int domain); - virtual ~sockinfo_tcp(); + ~sockinfo_tcp() override; - virtual void clean_obj(); + void clean_socket_obj() override; void setPassthrough(bool _isPassthrough) { m_sock_offload = _isPassthrough ? TCP_SOCK_PASSTHROUGH : TCP_SOCK_LWIP; m_p_socket_stats->b_is_offloaded = !_isPassthrough; } - void setPassthrough() { setPassthrough(true); } - bool isPassthrough() { return m_sock_offload == TCP_SOCK_PASSTHROUGH; } + void setPassthrough() override { setPassthrough(true); } + bool isPassthrough() override { return m_sock_offload == TCP_SOCK_PASSTHROUGH; } - int prepareListen(); - int shutdown(int __how); + int prepareListen() override; + int shutdown(int __how) override; // Not always we can close immediately TCP socket: we can do that only after the TCP connection // in closed. In this method we just kikstarting the TCP connection termination (empty the // unsent/unacked, senf FIN...) 
Return val: true is the socket is already closable and false // otherwise - virtual bool prepare_to_close(bool process_shutdown = false); + bool prepare_to_close(bool process_shutdown = false) override; void create_dst_entry(); bool prepare_dst_to_send(bool is_accepted_socket = false); - virtual int fcntl(int __cmd, unsigned long int __arg); - virtual int fcntl64(int __cmd, unsigned long int __arg); - virtual int ioctl(unsigned long int __request, unsigned long int __arg); - virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + int fcntl(int __cmd, unsigned long int __arg) override; + int fcntl64(int __cmd, unsigned long int __arg) override; + int ioctl(unsigned long int __request, unsigned long int __arg) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; virtual int tcp_setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; int getsockopt_offload(int __level, int __optname, void *__optval, socklen_t *__optlen); - virtual int connect(const sockaddr *, socklen_t); - virtual int bind(const sockaddr *__addr, socklen_t __addrlen); - virtual int listen(int backlog); - virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); - virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - virtual int getpeername(sockaddr *__name, socklen_t *__namelen); + int connect(const sockaddr *, socklen_t) override; + int bind(const sockaddr *__addr, socklen_t __addrlen) override; + int listen(int backlog) override; + int accept(struct sockaddr *__addr, socklen_t *__addrlen) override; + int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) override; + int getsockname(sockaddr *__name, socklen_t *__namelen) override; + int getpeername(sockaddr *__name, socklen_t *__namelen) override; + void set_immediate_os_sample() override {}; + void unset_immediate_os_sample() override {}; inline bool handle_bind_no_port(int &bind_ret, in_port_t in_port, const sockaddr *__addr, socklen_t __addrlen); inline void non_tcp_recved(int rx_len); - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); + int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) override; void socketxtreme_recv_buffs_tcp(mem_buf_desc_t *desc, uint16_t len); - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; - inline struct tcp_pcb *get_pcb(void) { return &m_pcb; } + inline struct tcp_pcb *get_pcb() { return &m_pcb; } - inline unsigned sndbuf_available(void) { return tcp_sndbuf(&m_pcb); } + inline unsigned sndbuf_available() + { + return static_cast(std::max(tcp_sndbuf(&m_pcb), 0)); + } - inline unsigned get_mss(void) { return m_pcb.mss; } + inline unsigned get_mss() { return m_pcb.mss; } - ssize_t tx(xlio_tx_call_attr_t &tx_arg); + ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; ssize_t tcp_tx(xlio_tx_call_attr_t &tx_arg); ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct 
msghdr *__msg = nullptr) override; static err_t ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); static err_t ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); static void tcp_state_observer(void *pcb_container, enum tcp_state new_state); static uint16_t get_route_mtu(struct tcp_pcb *pcb); - virtual void update_header_field(data_updater *updater); - virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array); + void update_header_field(data_updater *updater) override; + bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) override; void abort_connection(); - void tcp_shutdown_rx(void); + void tcp_shutdown_rx(); mem_buf_desc_t *tcp_tx_mem_buf_alloc(pbuf_type type); void tcp_rx_mem_buf_free(mem_buf_desc_t *p_desc); @@ -215,22 +277,23 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static struct tcp_seg *tcp_seg_alloc_cached(void *p_conn); static void tcp_seg_free_direct(void *p_conn, struct tcp_seg *seg); static void tcp_seg_free_cached(void *p_conn, struct tcp_seg *seg); - uint32_t get_next_tcp_seqno(void) { return m_pcb.snd_lbb; } - uint32_t get_next_tcp_seqno_rx(void) { return m_pcb.rcv_nxt; } + uint32_t get_next_tcp_seqno() { return m_pcb.snd_lbb; } + uint32_t get_next_tcp_seqno_rx() { return m_pcb.rcv_nxt; } mem_buf_desc_t *tcp_tx_zc_alloc(mem_buf_desc_t *p_desc); + static void tcp_express_zc_callback(mem_buf_desc_t *p_desc); static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); void tcp_tx_zc_handle(mem_buf_desc_t *p_desc); - bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); - bool inline is_writeable(); - bool inline is_errorable(int *errors); - bool is_closable() + bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; + bool is_writeable() override; + bool is_errorable(int *errors) override; + bool is_closable() override { return get_tcp_state(&m_pcb) == CLOSED && m_syn_received.empty() && m_accepted_conns.empty(); } - bool inline is_destroyable_lock(void) + bool inline is_destroyable_lock() { bool state; m_tcp_con_lock.lock(); @@ -238,25 +301,27 @@ class sockinfo_tcp : public sockinfo, public timer_handler { m_tcp_con_lock.unlock(); return state; } - bool inline is_destroyable_no_lock(void) + bool inline is_destroyable_no_lock() { return get_tcp_state(&m_pcb) == CLOSED && m_state == SOCKINFO_CLOSING; } - bool skip_os_select() + bool skip_os_select() override { // calling os select on offloaded TCP sockets makes no sense unless it's a listen socket // to make things worse, it returns that os fd is ready... return (m_sock_offload == TCP_SOCK_LWIP && !is_server() && m_conn_state != TCP_CONN_INIT); } - bool is_outgoing() + bool is_outgoing() override { const bool is_listen_socket = is_server() || get_tcp_state(&m_pcb) == LISTEN; // Excluding incoming and listen sockets we can determine outgoing sockets. 
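// --- Editor's aside: illustrative sketch, not part of this patch ----------
// sndbuf_available() above now clamps tcp_sndbuf() with std::max(..., 0)
// before the cast to unsigned; the clamp suggests the send-buffer accounting
// can go transiently negative in this codebase, and casting a negative int
// straight to unsigned would report a huge amount of free space. Minimal
// standalone demonstration:

#include <algorithm>
#include <cassert>
#include <climits>

static unsigned clamp_to_unsigned(int maybe_negative)
{
    return static_cast<unsigned>(std::max(maybe_negative, 0));
}

int main()
{
    assert(clamp_to_unsigned(4096) == 4096u);
    assert(clamp_to_unsigned(-1) == 0u);           // clamped to "no space"
    assert(static_cast<unsigned>(-1) == UINT_MAX); // what the clamp prevents
    return 0;
}
// --------------------------------------------------------------------------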
return !m_b_incoming && !is_listen_socket; } - bool is_incoming() { return m_b_incoming; } + bool is_incoming() override { return m_b_incoming; } + bool is_timer_registered() const { return m_timer_registered; } + void set_timer_registered(bool v) { m_timer_registered = v; } bool is_connected() { return m_sock_state == TCP_SOCK_CONNECTED_RDWR; } @@ -277,32 +342,41 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return m_sock_state == TCP_SOCK_ACCEPT_READY || m_sock_state == TCP_SOCK_ACCEPT_SHUT; } - virtual void update_socket_timestamps(timestamps_t *ts) { m_rx_timestamps = *ts; } + void update_socket_timestamps(timestamps_t *ts) override { m_rx_timestamps = *ts; } - virtual inline fd_type_t get_type() { return FD_TYPE_SOCKET; } + inline fd_type_t get_type() override { return FD_TYPE_SOCKET; } - void handle_timer_expired(void *user_data); + void handle_timer_expired(); - inline ib_ctx_handler *get_ctx(void) + inline ib_ctx_handler *get_ctx() { return m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ctx() : nullptr; } - inline ring *get_tx_ring(void) const noexcept + inline ring *get_tx_ring() const noexcept { return m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ring() : nullptr; } - inline ring *get_rx_ring(void) { return m_p_rx_ring; } - const flow_tuple_with_local_if &get_flow_tuple(void) + void rx_add_ring_cb(ring *p_ring) override; + ring *get_rx_ring() { return m_p_rx_ring; } + const flow_tuple_with_local_if &get_flow_tuple() { - /* XXX Dosn't handle empty map and a map with multiple elements. */ + /* XXX Doesn't handle empty map and a map with multiple elements. */ auto rx_flow_iter = m_rx_flow_map.begin(); return rx_flow_iter->first; } + void rx_poll_on_tx_if_needed() + { + if (m_sysvar_rx_poll_on_tx_tcp) { + int poll_count = 0; + rx_wait_helper(poll_count, false); + } + } + /* Proxy to support ULP. TODO Refactor. 
*/ - inline sockinfo_tcp_ops *get_ops(void) { return m_ops; } + inline sockinfo_tcp_ops *get_ops() { return m_ops; } inline void set_ops(sockinfo_tcp_ops *ops) noexcept { std::swap(ops, m_ops); @@ -310,18 +384,18 @@ class sockinfo_tcp : public sockinfo, public timer_handler { delete ops; } } - inline void reset_ops(void) noexcept { set_ops(m_ops_tcp); } + inline void reset_ops() noexcept { set_ops(m_ops_tcp); } bool is_utls_supported(int direction) const; int get_supported_nvme_feature_mask() const; - inline int trylock_tcp_con(void) { return m_tcp_con_lock.trylock(); } - inline void lock_tcp_con(void) { m_tcp_con_lock.lock(); } - inline void unlock_tcp_con(void) { m_tcp_con_lock.unlock(); } - + inline int trylock_tcp_con() { return m_tcp_con_lock.trylock(); } + inline void lock_tcp_con() { m_tcp_con_lock.lock(); } + inline void unlock_tcp_con() { m_tcp_con_lock.unlock(); } inline void set_reguired_send_block(unsigned sz) { m_required_send_block = sz; } - + tcp_timers_collection *get_tcp_timer_collection(); + bool is_cleaned() const { return m_is_cleaned; } static err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); static err_t rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); @@ -330,17 +404,33 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static err_t rx_drop_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); inline void rx_lwip_cb_socketxtreme_helper(pbuf *p); - virtual int register_callback(xlio_recv_callback_t callback, void *context) + int register_callback(xlio_recv_callback_t callback, void *context) override { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_recv_callback); - return sockinfo::register_callback(callback, context); + return register_callback_ctx(callback, context); } + int tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, unsigned flags, + void *opaque_op); + int tcp_tx_express_inline(const struct iovec *iov, unsigned iov_len, unsigned flags); + void flush(bool force_db = false); + + void set_xlio_socket(const struct xlio_socket_attr *attr); + void add_tx_ring_to_group(); + bool is_xlio_socket() { return m_p_group != nullptr; } + poll_group *get_poll_group() { return m_p_group; } + void xlio_socket_event(int event, int value); + static err_t rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); + static void err_lwip_cb_xlio_socket(void *pcb_container, err_t err); + protected: - virtual void lock_rx_q(); - virtual void unlock_rx_q(); - virtual bool try_un_offloading(); // un-offload the socket if possible - virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); + void lock_rx_q() override; + void unlock_rx_q() override; + bool try_un_offloading() override; // un-offload the socket if possible + int os_epoll_wait(epoll_event *ep_events, int maxevents) override; + + size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) override; private: int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); @@ -382,7 +472,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { int wait_for_conn_ready_blocking(); static err_t connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err); // tx - unsigned tx_wait(int &err, bool blocking); + unsigned tx_wait(bool blocking); int os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxevents); int handle_child_FIN(sockinfo_tcp *child_conn); @@ -390,6 +480,15 @@ class sockinfo_tcp : public sockinfo, public timer_handler { 
// int rx_wait(int &poll_count, bool blocking = true); static err_t ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t space); + ssize_t tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_tmp, bool is_dummy, + bool is_send_zerocopy); + ssize_t tcp_tx_handle_errno_and_unlock(int error_number); + ssize_t tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, int errno_to_report, + bool is_dummy, bool is_send_zerocopy, + int errno_to_restore); + ssize_t tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is_dummy, bool is_send_zerocopy, + int errno_to_restore); + ssize_t tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg); inline err_t handle_fin(struct tcp_pcb *pcb, err_t err); inline void handle_rx_lwip_cb_error(pbuf *p); inline void rx_lwip_cb_error(pbuf *p); @@ -408,7 +507,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { /* * Supported only for UDP */ - virtual void handle_ip_pktinfo(struct cmsg_state *) {}; + void handle_ip_pktinfo(struct cmsg_state *) override {}; int handle_rx_error(bool blocking); @@ -429,9 +528,9 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline void return_pending_rx_buffs(); inline void return_pending_tx_buffs(); inline void reuse_buffer(mem_buf_desc_t *buff); - virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc); - virtual mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx); - virtual timestamps_t *get_socket_timestamps(); + mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) override; + mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx) override; + timestamps_t *get_socket_timestamps() override; inline void return_reuse_buffers_postponed() { @@ -445,7 +544,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { m_rx_reuse_buf_postponed = false; if (m_p_rx_ring) { - if (m_rx_reuse_buff.n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num >= m_rx_num_buffs_reuse) { if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { m_rx_reuse_buff.n_buff_num = 0; } else { @@ -457,7 +556,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { while (iter != m_rx_ring_map.end()) { descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num >= m_rx_num_buffs_reuse) { if (iter->first->reclaim_recv_buffers(rx_reuse)) { n_buff_num = 0; } else { @@ -469,28 +568,27 @@ class sockinfo_tcp : public sockinfo, public timer_handler { } } - virtual void post_deqeue(bool release_buff); - virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); + void post_deqeue(bool release_buff) override; + int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) override; // Returns the connected pcb, with 5 tuple which matches the input arguments, // in state "SYN Received" or NULL if pcb wasn't found struct tcp_pcb *get_syn_received_pcb(const flow_tuple &key) const; struct tcp_pcb *get_syn_received_pcb(const sock_addr &src, const sock_addr &dst); - virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list(); - virtual size_t get_size_m_rx_pkt_ready_list(); - virtual void pop_front_m_rx_pkt_ready_list(); - virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff); + mem_buf_desc_t *get_front_m_rx_pkt_ready_list() override; + size_t get_size_m_rx_pkt_ready_list() override; + void pop_front_m_rx_pkt_ready_list() override; + void 
push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) override; // lock_spin_recursive m_rx_cq_lck; /* pick all cqs that match given address */ - virtual int rx_verify_available_data(); + int rx_verify_available_data() override; inline int rx_wait(int &poll_count, bool blocking); inline int rx_wait_lockless(int &poll_count, bool blocking); int rx_wait_helper(int &poll_count, bool blocking); void fit_rcv_wnd(bool force_fit); void fit_snd_bufs(unsigned int new_max); - void fit_snd_bufs_to_nagle(bool disable_nagle); inline struct tcp_seg *get_tcp_seg_cached(); inline struct tcp_seg *get_tcp_seg_direct(); @@ -505,6 +603,9 @@ class sockinfo_tcp : public sockinfo, public timer_handler { void process_reuse_ctl_packets(); void process_rx_ctl_packets(); static void put_agent_msg(void *arg); + bool is_connected_and_ready_to_send(); + + inline event_handler_manager *get_event_mgr(); public: static const int CONNECT_DEFAULT_TIMEOUT_MS = 10000; @@ -517,6 +618,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { // lwip specific things struct tcp_pcb m_pcb; + fd_array_t *m_iomux_ready_fd_array; socket_options_list_t m_socket_options_list; timestamps_t m_rx_timestamps; tcp_sock_offload_e m_sock_offload; @@ -526,16 +628,14 @@ class sockinfo_tcp : public sockinfo, public timer_handler { bool m_xlio_thr; bool m_b_incoming; bool m_b_attached; + bool m_timer_registered = false; /* connection state machine */ int m_conn_timeout; - /* SNDBUF acconting */ - int m_sndbuff_max; /* RCVBUF acconting */ int m_rcvbuff_max; int m_rcvbuff_current; int m_rcvbuff_non_tcp_recved; tcp_conn_state_e m_conn_state; - fd_array_t *m_iomux_ready_fd_array; struct linger m_linger; /* local & peer addresses */ @@ -556,13 +656,12 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint32_t m_ready_conn_cnt; int m_backlog; - void *m_timer_handle; multilock m_tcp_con_lock; // used for reporting 'connected' on second non-blocking call to connect or // second call to failed connect blocking socket. bool report_connected; - + bool m_is_cleaned = false; // If this socket registered deletion on internal thread. int m_error_status; const buffer_batching_mode_t m_sysvar_buffer_batching_mode; @@ -588,52 +687,13 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint64_t m_user_huge_page_mask; unsigned m_required_send_block; uint16_t m_external_vlan_tag = 0U; - - // stats - uint64_t m_n_pbufs_rcvd; - uint64_t m_n_pbufs_freed; -}; -typedef struct tcp_seg tcp_seg; - -class tcp_timers_collection : public timers_group, public cleanable_obj { -public: - tcp_timers_collection(int period, int resolution); - virtual ~tcp_timers_collection(); - - void clean_obj(); - - virtual void handle_timer_expired(void *user_data); - -protected: - // add a new timer - void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data); - - // remove timer from list and free it. - // called for stopping (unregistering) a timer - void remove_timer(timer_node_t *node); - - void *m_timer_handle; - -private: - timer_node_t **m_p_intervals; - - int m_n_period; - int m_n_resolution; - int m_n_intervals_size; - int m_n_location; - int m_n_count; - int m_n_next_insert_bucket; - - void free_tta_resources(); -}; - -class thread_local_tcp_timers : public tcp_timers_collection { -public: - thread_local_tcp_timers(); - ~thread_local_tcp_timers(); + /* + * Storage API + * TODO Move the fields to proper cold/hot sections in the final version. 
+ */ + bool m_b_xlio_socket_dirty = false; + uintptr_t m_xlio_socket_userdata = 0; + poll_group *m_p_group = nullptr; }; -extern tcp_timers_collection *g_tcp_timers_collection; -extern thread_local thread_local_tcp_timers g_thread_local_tcp_timers; - #endif diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 8f50bd30c..7c4eb8068 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -101,10 +101,10 @@ inline int sockinfo_udp::poll_os() uint64_t pending_data = 0; m_rx_udp_poll_os_ratio_counter = 0; - ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + ret = SYSCALL(ioctl, m_fd, FIONREAD, &pending_data); if (unlikely(ret == -1)) { m_p_socket_stats->counters.n_rx_os_errors++; - si_udp_logdbg("orig_os_api.ioctl returned with error in polling loop (errno=%d %m)", errno); + si_udp_logdbg("SYSCALL(ioctl) returned with error in polling loop (errno=%d %m)", errno); return -1; } if (pending_data > 0) { @@ -150,7 +150,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) } loops++; - if (!blocking || m_n_sysvar_rx_poll_num != -1) { + if (!blocking || safe_mce_sys().rx_poll_num != -1) { loops_to_go--; } if (m_loops_timer.is_timeout()) { @@ -193,7 +193,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) //(can happen if another thread was polling & processing the wce) // and update is_sleeping flag under the same lock to synchronize between // this code and wakeup mechanism. - if (is_readable(NULL)) { + if (is_readable(nullptr)) { return 0; } } @@ -204,7 +204,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); if (!m_n_rx_pkt_ready_list_count) { - going_to_sleep(); + m_sock_wakeup_pipe.going_to_sleep(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); } else { @@ -216,7 +216,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); - return_from_sleep(); + m_sock_wakeup_pipe.return_from_sleep(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); @@ -246,17 +246,17 @@ inline int sockinfo_udp::rx_wait(bool blocking) * This is the classical case of wakeup, but we don't want to * waist time on removing wakeup fd, it will be done next time */ - if (is_readable(NULL)) { + if (is_readable(nullptr)) { return 0; } // Run through all ready fd's for (int event_idx = 0; event_idx < ret; ++event_idx) { int fd = rx_epfd_events[event_idx].data.fd; - if (is_wakeup_fd(fd)) { + if (m_sock_wakeup_pipe.is_wakeup_fd(fd)) { /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); - remove_wakeup_fd(); + m_sock_wakeup_pipe.remove_wakeup_fd(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); continue; @@ -383,7 +383,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) , m_mc_num_grp_with_src_filter(0) , m_port_map_lock("sockinfo_udp::m_ports_map_lock") , m_port_map_index(0) - , m_p_last_dst_entry(NULL) + , m_p_last_dst_entry(nullptr) , m_tos(0) , m_n_sysvar_rx_poll_yield_loops(safe_mce_sys().rx_poll_yield_loops) , m_n_sysvar_rx_udp_poll_os_ratio(safe_mce_sys().rx_udp_poll_os_ratio) @@ -409,14 +409,14 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) socklen_t option_len = sizeof(n_so_rcvbuf_bytes); BULLSEYE_EXCLUDE_BLOCK_START if (unlikely( - orig_os_api.getsockopt(m_fd, SOL_SOCKET, SO_RCVBUF, &n_so_rcvbuf_bytes, &option_len))) { + SYSCALL(getsockopt, m_fd, SOL_SOCKET, SO_RCVBUF, &n_so_rcvbuf_bytes, &option_len))) { si_udp_logdbg("Failure in getsockopt (errno=%d %m)", errno); } 
BULLSEYE_EXCLUDE_BLOCK_END si_udp_logdbg("Sockets RCVBUF = %d bytes", n_so_rcvbuf_bytes); rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; @@ -424,7 +424,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) ev.data.fd = m_fd; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { + if (unlikely(SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { si_udp_logpanic("failed to add user's fd to internal epfd errno=%d (%m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -439,7 +439,7 @@ sockinfo_udp::~sockinfo_udp() // Remove all RX ready queue buffers (Push into reuse queue per ring) si_udp_logdbg("Releasing %d ready rx packets (total of %lu bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); rx_ready_byte_count_limit_update(0); // Clear the dst_entry map @@ -454,14 +454,14 @@ sockinfo_udp::~sockinfo_udp() /* AlexR: We don't have to be nice and delete the fd. close() will do that any way. This save us the problem when closing in the clean-up case - if we get closed be the - nameserver socket 53. if (unlikely( orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, m_fd, + nameserver socket 53. if (unlikely( SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_DEL, m_fd, NULL))) { if (errno == ENOENT) si_logfunc("failed to del users fd from internal epfd - probably clean up case (errno=%d %m)", errno); else si_logerr("failed to del users fd from internal epfd (errno=%d %m)", errno); } */ m_lock_rcv.lock(); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); destructor_helper(); @@ -517,7 +517,7 @@ int sockinfo_udp::bind(const struct sockaddr *__addr, socklen_t __addrlen) // We always call the orig_bind which will check sanity of the user socket api // and the OS will also allocate a specific port that we can also use - int ret = orig_os_api.bind(m_fd, __addr, __addrlen); + int ret = SYSCALL(bind, m_fd, __addr, __addrlen); if (ret) { si_udp_logdbg("orig bind failed (ret=%d %m)", ret); // TODO: Should we set errno again (maybe log write modified the orig.bind() errno)? 
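// --- Editor's aside: illustrative sketch, not part of this patch ----------
// Throughout these hunks the direct orig_os_api.<fn>(...) calls are replaced
// with SYSCALL(<fn>, ...). The macro's real definition is not shown in this
// patch; one plausible shape, sketched purely for illustration with invented
// names (demo_os_api, DEMO_SYSCALL), is a thin variadic forwarder to a table
// of saved OS entry points:

#include <cstdio>

// Hypothetical stand-in for a table of saved libc/OS function pointers.
struct os_api_table {
    int (*ioctl)(int fd, unsigned long request, void *arg);
};

static int fake_ioctl(int fd, unsigned long request, void *arg)
{
    (void)fd;
    (void)request;
    (void)arg;
    return 0;
}

static os_api_table demo_os_api = {fake_ioctl};

// A forwarder of this form keeps every call site a mechanical one-line change:
//   orig_os_api.ioctl(fd, req, arg)  ->  SYSCALL(ioctl, fd, req, arg)
#define DEMO_SYSCALL(_fn, ...) demo_os_api._fn(__VA_ARGS__)

int main()
{
    unsigned long pending = 0;
    int rc = DEMO_SYSCALL(ioctl, 3, 0UL /* e.g. FIONREAD */, &pending);
    std::printf("rc=%d pending=%lu\n", rc, pending);
    return 0;
}
// --------------------------------------------------------------------------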
@@ -549,7 +549,7 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) // We always call the orig_connect which will check sanity of the user socket api // and the OS will also allocate a specific bound port that we can also use - int ret = orig_os_api.connect(m_fd, __to, __tolen); + int ret = SYSCALL(connect, m_fd, __to, __tolen); if (ret) { si_udp_logdbg("orig connect failed (ret=%d, errno=%d %m)", ret, errno); return ret; @@ -609,7 +609,7 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) // Create the new dst_entry, delete if one already exists if (m_p_connected_dst_entry) { delete m_p_connected_dst_entry; - m_p_connected_dst_entry = NULL; + m_p_connected_dst_entry = nullptr; } if (dst_ipaddr.is_mc(m_family)) { @@ -648,6 +648,46 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) return 0; } +int sockinfo_udp::shutdown(int __how) +{ + si_udp_logfunc(""); + int ret = SYSCALL(shutdown, m_fd, __how); + if (ret) { + si_udp_logdbg("shutdown failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::accept(struct sockaddr *__addr, socklen_t *__addrlen) +{ + si_udp_logfunc(""); + int ret = SYSCALL(accept, m_fd, __addr, __addrlen); + if (ret < 0) { + si_udp_logdbg("accept failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) +{ + si_udp_logfunc(""); + int ret = SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); + if (ret < 0) { + si_udp_logdbg("accept4 failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::listen(int backlog) +{ + si_udp_logfunc(""); + int ret = SYSCALL(listen, m_fd, backlog); + if (ret < 0) { + si_udp_logdbg("listen failed (ret=%d %m)", ret); + } + return ret; +} + int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) { si_udp_logdbg(""); @@ -657,13 +697,23 @@ int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) return -1; } - return orig_os_api.getsockname(m_fd, __name, __namelen); + return SYSCALL(getsockname, m_fd, __name, __namelen); +} + +int sockinfo_udp::getpeername(sockaddr *__name, socklen_t *__namelen) +{ + si_udp_logfunc(""); + int ret = SYSCALL(getpeername, m_fd, __name, __namelen); + if (ret) { + si_udp_logdbg("getpeername failed (ret=%d %m)", ret); + } + return ret; } int sockinfo_udp::on_sockname_change(struct sockaddr *__name, socklen_t __namelen) { BULLSEYE_EXCLUDE_BLOCK_START - if (__name == NULL) { + if (!__name) { si_udp_logerr("invalid NULL __name"); errno = EFAULT; return -1; @@ -732,7 +782,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, si_udp_logfunc("level=%d, optname=%d", __level, __optname); if (unlikely(m_state == SOCKINFO_DESTROYING) || unlikely(g_b_exit)) { - return orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + return SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); } std::lock_guard lock_tx(m_lock_snd); @@ -994,7 +1044,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, break; } - if (NULL == __optval) { + if (!__optval) { si_udp_logdbg("IPPROTO_IP, %s; Bad optval! 
calling OS setsockopt()", setsockopt_ip_opt_to_str(__optname)); break; @@ -1136,7 +1186,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, // offloaded, check if need to pend else if (m_bound.is_anyport()) { // Delay attaching to this MC group until we have bound UDP port - ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { return ret; } @@ -1149,7 +1199,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, } if (goto_os) { - ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { return ret; } @@ -1412,7 +1462,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, return 0; } break; case IPV6_RECVPKTINFO: - m_b_pktinfo = __optval != nullptr && *(int *)__optval != 0; + m_b_pktinfo = __optval && *(int *)__optval != 0; break; } break; // case IPPROTO_IPV6 @@ -1463,7 +1513,7 @@ int sockinfo_udp::multicast_membership_setsockopt_ip6(int optname, const void *o // offloaded, check if need to pend else if (m_bound.is_anyport()) { // Delay attaching to this MC group until we have bound UDP port - ret = orig_os_api.setsockopt(m_fd, IPPROTO_IPV6, optname, optval, optlen); + ret = SYSCALL(setsockopt, m_fd, IPPROTO_IPV6, optname, optval, optlen); if (ret) { return ret; } @@ -1477,7 +1527,7 @@ int sockinfo_udp::multicast_membership_setsockopt_ip6(int optname, const void *o } if (goto_os) { - ret = orig_os_api.setsockopt(m_fd, IPPROTO_IPV6, optname, optval, optlen); + ret = SYSCALL(setsockopt, m_fd, IPPROTO_IPV6, optname, optval, optlen); if (ret) { return ret; } @@ -1593,7 +1643,7 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen { si_udp_logfunc("level=%d, optname=%d", __level, __optname); - int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); if (unlikely(m_state == SOCKINFO_DESTROYING) || unlikely(g_b_exit)) { return ret; @@ -1615,9 +1665,9 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen uint32_t n_so_rcvbuf_bytes = *(int *)__optval; si_udp_logdbg("SOL_SOCKET, SO_RCVBUF=%d", n_so_rcvbuf_bytes); - if (m_p_socket_stats->n_rx_ready_byte_count > n_so_rcvbuf_bytes) { + if (m_rx_ready_byte_count > n_so_rcvbuf_bytes) { si_udp_logdbg("Releasing at least %lu bytes from ready rx packets queue", - m_p_socket_stats->n_rx_ready_byte_count - n_so_rcvbuf_bytes); + m_rx_ready_byte_count - n_so_rcvbuf_bytes); } rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); @@ -1669,14 +1719,14 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen void sockinfo_udp::rx_ready_byte_count_limit_update(size_t n_rx_ready_bytes_limit_new) { si_udp_logfunc("new limit: %d Bytes (old: %d Bytes, min value %d Bytes)", - n_rx_ready_bytes_limit_new, m_p_socket_stats->n_rx_ready_byte_limit, + n_rx_ready_bytes_limit_new, m_rx_ready_byte_limit, m_n_sysvar_rx_ready_byte_min_limit); if (n_rx_ready_bytes_limit_new > 0 && n_rx_ready_bytes_limit_new < m_n_sysvar_rx_ready_byte_min_limit) { n_rx_ready_bytes_limit_new = m_n_sysvar_rx_ready_byte_min_limit; } - m_p_socket_stats->n_rx_ready_byte_limit = n_rx_ready_bytes_limit_new; - drop_rx_ready_byte_count(m_p_socket_stats->n_rx_ready_byte_limit); + m_rx_ready_byte_limit = 
n_rx_ready_bytes_limit_new; + drop_rx_ready_byte_count(n_rx_ready_bytes_limit_new); return; } @@ -1687,8 +1737,7 @@ void sockinfo_udp::drop_rx_ready_byte_count(size_t n_rx_bytes_limit) m_lock_rcv.lock(); while (m_n_rx_pkt_ready_list_count) { mem_buf_desc_t *p_rx_pkt_desc = m_rx_pkt_ready_list.front(); - if (m_p_socket_stats->n_rx_ready_byte_count > n_rx_bytes_limit || - p_rx_pkt_desc->rx.sz_payload == 0U) { + if (m_rx_ready_byte_count > n_rx_bytes_limit || p_rx_pkt_desc->rx.sz_payload == 0U) { m_rx_pkt_ready_list.pop_front(); m_n_rx_pkt_ready_list_count--; m_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; @@ -1813,7 +1862,7 @@ ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov } in_flags &= ~MSG_XLIO_ZCOPY; - ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + ret = rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); *p_flags = in_flags; save_stats_rx_os(ret); if (ret > 0) { @@ -1904,15 +1953,14 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array if (m_n_sysvar_rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); return true; } else { tscval_t tsc_now = TSCVAL_INITIALIZER; gettimeoftsc(&tsc_now); if (tsc_now - g_si_tscv_last_poll < m_n_sysvar_rx_delta_tsc_between_cq_polls) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); return true; } @@ -1947,8 +1995,7 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array if (m_n_rx_pkt_ready_list_count) { // Get out of the CQ polling loop si_udp_logfunc("=> polled true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); m_rx_ring_map_lock.unlock(); return true; } @@ -1963,13 +2010,13 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array // m_n_rx_pkt_ready_list_count if (m_n_rx_pkt_ready_list_count) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); return true; } // Not ready packets in ready queue, return false si_udp_logfuncall("=> false (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); return false; } @@ -2041,7 +2088,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) si_udp_logdbg("MSG_OOB not supported in UDP (tx-ing to os)"); goto tx_packet_to_os; } - if (__dst != NULL) { + if (__dst) { sock_addr dst(__dst, __dstlen); if (!validate_and_convert_mapped_ipv4(dst)) { si_udp_logdbg("Mapped IPv4 on IPv6-Only socket"); @@ -2130,7 +2177,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) } { - xlio_send_attr attr = {(xlio_wr_tx_packet_attr)0, 0, 0, 0}; + xlio_send_attr attr = {(xlio_wr_tx_packet_attr)0, 0, 0, nullptr}; bool b_blocking = m_b_blocking; if (unlikely(__flags & MSG_DONTWAIT)) { b_blocking = false; @@ -2148,8 +2195,11 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) tx_arg.opcode); } - if (unlikely(p_dst_entry->try_migrate_ring_tx(m_lock_snd))) { - 
m_p_socket_stats->counters.n_tx_migrations++; + // Condition for cache optimization + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { + if (unlikely(p_dst_entry->try_migrate_ring_tx(m_lock_snd))) { + m_p_socket_stats->counters.n_tx_migrations++; + } } // TODO ALEXR - still need to handle "is_dropped" in send path @@ -2178,7 +2228,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) tx_packet_to_os: // Calling OS transmit - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + ret = tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); tx_packet_to_os_stats: save_stats_tx_os(ret); @@ -2231,7 +2281,7 @@ int sockinfo_udp::rx_verify_available_data() } else if (ret == 1) { // Got 1, means we have a ready packet in OS uint64_t pending_data = 0; - ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + ret = SYSCALL(ioctl, m_fd, FIONREAD, &pending_data); if (ret >= 0) { // This will cause the next non-blocked read to check the OS again. // We do this only after a successful read. @@ -2258,8 +2308,8 @@ inline xlio_recv_callback_retval_t sockinfo_udp::inspect_by_user_cb(mem_buf_desc pkt_info.packet_id = (void *)p_desc; pkt_info.src = p_desc->rx.src.get_p_sa(); pkt_info.dst = p_desc->rx.dst.get_p_sa(); - pkt_info.socket_ready_queue_pkt_count = m_p_socket_stats->n_rx_ready_pkt_count; - pkt_info.socket_ready_queue_byte_count = m_p_socket_stats->n_rx_ready_byte_count; + pkt_info.socket_ready_queue_pkt_count = m_n_rx_pkt_ready_list_count; + pkt_info.socket_ready_queue_byte_count = m_rx_ready_byte_count; if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { pkt_info.hw_timestamp = p_desc->rx.timestamps.hw; @@ -2285,13 +2335,11 @@ inline xlio_recv_callback_retval_t sockinfo_udp::inspect_by_user_cb(mem_buf_desc */ inline void sockinfo_udp::rx_udp_cb_socketxtreme_helper(mem_buf_desc_t *p_desc) { - struct xlio_socketxtreme_completion_t *completion; - // xlio_socketxtreme_completion_t is IPv4 only. 
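// --- Editor's aside: illustrative sketch, not part of this patch ----------
// A recurring change in these hunks is to gate optional bookkeeping (the
// ring-migration check above, the has_stats()-guarded counters below, the
// blocking-only wakeup earlier in this file) behind an unlikely() branch
// hint, so the common data path avoids touching cold state. Standalone
// sketch with invented names (demo_unlikely, demo_socket):

#include <cstddef>
#include <cstdint>

#if defined(__GNUC__)
#define demo_unlikely(x) __builtin_expect(!!(x), 0)
#else
#define demo_unlikely(x) (x)
#endif

struct demo_counters {
    uint64_t tx_bytes = 0;
    uint64_t tx_pkts = 0;
};

struct demo_socket {
    demo_counters *stats = nullptr; // allocated only when monitoring is enabled
    bool has_stats() const { return stats != nullptr; }

    void on_tx(size_t bytes)
    {
        // ... fast-path TX work ...
        if (demo_unlikely(has_stats())) { // cold branch: touch counters only if enabled
            stats->tx_bytes += bytes;
            stats->tx_pkts++;
        }
    }
};

int main()
{
    demo_socket s;
    s.on_tx(1500); // no stats object: the guarded block is skipped
    demo_counters c;
    s.stats = &c;
    s.on_tx(1500); // monitoring enabled: counters updated
    return (c.tx_pkts == 1 && c.tx_bytes == 1500) ? 0 : 1;
}
// --------------------------------------------------------------------------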
assert(p_desc->rx.src.get_sa_family() == AF_INET); - completion = &m_socketxtreme.ec->completion; - + xlio_socketxtreme_completion_t *completion = + set_events_socketxtreme(XLIO_SOCKETXTREME_PACKET, false); completion->packet.num_bufs = p_desc->rx.n_frags; completion->packet.total_len = 0; p_desc->rx.src.get_sa(reinterpret_cast(&completion->src), @@ -2309,9 +2357,9 @@ inline void sockinfo_udp::rx_udp_cb_socketxtreme_helper(mem_buf_desc_t *p_desc) completion->packet.buff_lst->len = p_desc->rx.frag.iov_len; } - NOTIFY_ON_EVENTS(this, XLIO_SOCKETXTREME_PACKET); - save_stats_rx_offload(completion->packet.total_len); + + m_p_rx_ring->socketxtreme_end_ec_operation(); } /** @@ -2328,15 +2376,16 @@ inline void sockinfo_udp::update_ready(mem_buf_desc_t *p_desc, void *pv_fd_ready m_rx_pkt_ready_list.push_back(p_desc); m_n_rx_pkt_ready_list_count++; m_rx_ready_byte_count += p_desc->rx.sz_payload; - m_p_socket_stats->n_rx_ready_pkt_count++; - m_p_socket_stats->n_rx_ready_byte_count += p_desc->rx.sz_payload; - m_p_socket_stats->counters.n_rx_ready_pkt_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_pkt_count, - m_p_socket_stats->counters.n_rx_ready_pkt_max); - m_p_socket_stats->counters.n_rx_ready_byte_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, - m_p_socket_stats->counters.n_rx_ready_byte_max); - do_wakeup(); + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count += p_desc->rx.sz_payload; + m_p_socket_stats->n_rx_ready_pkt_count++; + m_p_socket_stats->counters.n_rx_ready_pkt_max = + std::max((uint32_t)m_n_rx_pkt_ready_list_count, + m_p_socket_stats->counters.n_rx_ready_pkt_max); + m_p_socket_stats->counters.n_rx_ready_byte_max = std::max( + (uint32_t)m_rx_ready_byte_count, m_p_socket_stats->counters.n_rx_ready_byte_max); + } + m_sock_wakeup_pipe.do_wakeup(); m_lock_rcv.unlock(); } else { m_p_socket_stats->n_rx_zcopy_pkt_count++; @@ -2352,7 +2401,7 @@ inline void sockinfo_udp::update_ready(mem_buf_desc_t *p_desc, void *pv_fd_ready io_mux_call::update_fd_array((fd_array_t *)pv_fd_ready_array, m_fd); si_udp_logfunc("rx ready count = %d packets / %d bytes", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); } bool sockinfo_udp::packet_is_loopback(mem_buf_desc_t *p_desc) @@ -2365,18 +2414,15 @@ bool sockinfo_udp::packet_is_loopback(mem_buf_desc_t *p_desc) bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) { - m_p_socket_stats->counters.n_rx_packets++; - if (unlikely((m_state == SOCKINFO_DESTROYING) || g_b_exit)) { si_udp_logfunc("rx packet discarded - fd closed"); return false; } /* Check if sockinfo rx byte SO_RCVBUF reached - then disregard this packet */ - if (unlikely(m_p_socket_stats->n_rx_ready_byte_count >= - m_p_socket_stats->n_rx_ready_byte_limit)) { + if (unlikely(m_rx_ready_byte_count >= m_rx_ready_byte_limit)) { si_udp_logfunc("rx packet discarded - socket limit reached (%d bytes)", - m_p_socket_stats->n_rx_ready_byte_limit); + m_rx_ready_byte_limit); m_p_socket_stats->counters.n_rx_ready_byte_drop += p_desc->rx.sz_payload; m_p_socket_stats->counters.n_rx_ready_pkt_drop++; return false; @@ -2479,8 +2525,7 @@ bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) m_port_map_index = ((m_port_map_index + 1) >= m_port_map.size() ? 
0 : (m_port_map_index + 1)); int new_port = m_port_map[m_port_map_index].port; - socket_fd_api *sock_api = - g_p_fd_collection->get_sockfd(m_port_map[m_port_map_index].fd); + sockinfo *sock_api = g_p_fd_collection->get_sockfd(m_port_map[m_port_map_index].fd); if (!sock_api || sock_api->get_type() != FD_TYPE_SOCKET) { m_port_map.erase(std::remove(m_port_map.begin(), m_port_map.end(), m_port_map[m_port_map_index].port)); @@ -2509,7 +2554,7 @@ bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) p_desc->inc_ref_count(); save_strq_stats(p_desc->rx.strides_num); - if (is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { rx_udp_cb_socketxtreme_helper(p_desc); } else { update_ready(p_desc, pv_fd_ready_array, cb_ret); @@ -2527,7 +2572,7 @@ void sockinfo_udp::rx_add_ring_cb(ring *p_ring) // Now that we got at least 1 CQ attached start polling the CQs if (m_b_blocking) { - m_loops_to_go = m_n_sysvar_rx_poll_num; + m_loops_to_go = safe_mce_sys().rx_poll_num; } else { m_loops_to_go = 1; // Force single CQ poll in case of non-blocking socket } @@ -2557,7 +2602,7 @@ void sockinfo_udp::set_blocking(bool is_blocked) // Set the high CQ polling RX_POLL value // depending on where we have mapped offloaded MC gorups if (m_rx_ring_map.size() > 0) { - m_loops_to_go = m_n_sysvar_rx_poll_num; + m_loops_to_go = safe_mce_sys().rx_poll_num; } else { m_loops_to_go = safe_mce_sys().rx_poll_num_init; } @@ -2781,7 +2826,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? m_p_socket_stats : nullptr); original_os_setsockopt_helper(&mreq_src, pram_size, p_mc_pram->optname, IPPROTO_IP); m_multicast = true; break; @@ -2793,7 +2838,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? m_p_socket_stats : nullptr); pram_size = sizeof(ip_mreq_source); original_os_setsockopt_helper(&mreq_src, pram_size, p_mc_pram->optname, IPPROTO_IP); m_multicast = true; @@ -2806,7 +2851,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? m_p_socket_stats : nullptr); m_multicast = false; break; } @@ -2819,7 +2864,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? m_p_socket_stats : nullptr); m_multicast = false; // get out from MC group } break; @@ -3021,7 +3066,7 @@ int sockinfo_udp::mc_change_membership_ip6(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? m_p_socket_stats : nullptr); original_os_setsockopt_helper(&p_mc_pram->req, p_mc_pram->pram_size, p_mc_pram->optname, IPPROTO_IPV6); } break; @@ -3038,7 +3083,7 @@ int sockinfo_udp::mc_change_membership_ip6(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? 
m_p_socket_stats : nullptr); } break; } @@ -3058,7 +3103,7 @@ void sockinfo_udp::original_os_setsockopt_helper(const void *pram, int pram_size { si_udp_logdbg("calling orig_setsockopt(%s) for igmp support by OS", setsockopt_ip_opt_to_str(optname)); - if (orig_os_api.setsockopt(m_fd, level, optname, pram, pram_size)) { + if (SYSCALL(setsockopt, m_fd, level, optname, pram, pram_size)) { si_udp_logdbg("orig setsockopt(%s) failed (errno=%d %m)", setsockopt_ip_opt_to_str(optname), errno); } @@ -3094,16 +3139,18 @@ void sockinfo_udp::save_stats_threadid_tx() void sockinfo_udp::save_stats_tx_offload(int bytes, bool is_dummy) { - if (unlikely(is_dummy)) { - m_p_socket_stats->counters.n_tx_dummy++; - } else { - if (bytes >= 0) { - m_p_socket_stats->counters.n_tx_sent_byte_count += bytes; - m_p_socket_stats->counters.n_tx_sent_pkt_count++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; + if (unlikely(has_stats())) { + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; } else { - m_p_socket_stats->counters.n_tx_errors++; + if (bytes >= 0) { + m_p_socket_stats->counters.n_tx_sent_byte_count += bytes; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + } else if (errno == EAGAIN) { + m_p_socket_stats->counters.n_rx_os_eagain++; + } else { + m_p_socket_stats->counters.n_tx_errors++; + } } } } @@ -3145,7 +3192,7 @@ timestamps_t *sockinfo_udp::get_socket_timestamps() { if (unlikely(m_rx_pkt_ready_list.empty())) { si_udp_logdbg("m_rx_pkt_ready_list empty"); - return NULL; + return nullptr; } return &m_rx_pkt_ready_list.front()->rx.timestamps; } @@ -3233,9 +3280,9 @@ void sockinfo_udp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) bool sockinfo_udp::prepare_to_close(bool process_shutdown) { m_lock_rcv.lock(); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); - if (m_econtext) { + if (has_epoll_context()) { m_econtext->fd_closed(m_fd); } diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index 39fa29628..3db469120 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -48,8 +48,6 @@ #include "proto/mem_buf_desc.h" #include "proto/dst_entry_udp.h" -#include "pkt_rcvr_sink.h" -#include "pkt_sndr_source.h" #include "sock-redirect.h" #include "sockinfo.h" @@ -87,19 +85,34 @@ typedef std::unordered_map> mc_m class sockinfo_udp : public sockinfo { public: sockinfo_udp(int fd, int domain); - virtual ~sockinfo_udp(); + ~sockinfo_udp() override; - void setPassthrough() { m_p_socket_stats->b_is_offloaded = m_sock_offload = false; } - bool isPassthrough() { return !m_sock_offload; } + void setPassthrough() override { m_p_socket_stats->b_is_offloaded = m_sock_offload = false; } + bool isPassthrough() override { return !m_sock_offload; } int prepare_to_connect(const sockaddr *__to, socklen_t __tolen); int bind_no_os(); - int bind(const struct sockaddr *__addr, socklen_t __addrlen); - int connect(const struct sockaddr *__to, socklen_t __tolen); - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); - int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int bind(const struct sockaddr *__addr, socklen_t __addrlen) override; + int connect(const struct sockaddr *__to, socklen_t __tolen) override; + void clean_socket_obj() override { delete this; } + bool is_writeable() override { return true; }; + bool is_errorable(int *errors) override + { + NOT_IN_USE(errors); + return false; + } + bool 
is_outgoing() override { return false; } + bool is_incoming() override { return false; } + int shutdown(int __how) override; + int prepareListen() override { return 0; } + int listen(int backlog) override; + int accept(struct sockaddr *__addr, socklen_t *__addrlen) override; + int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) override; + int getsockname(sockaddr *__name, socklen_t *__namelen) override; + int getpeername(sockaddr *__name, socklen_t *__namelen) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; int resolve_if_ip(const int if_index, const ip_address &ip, ip_address &resolved_ip); int fill_mc_structs_ip6(int optname, const void *optval, mc_pending_pram *mcpram); @@ -110,17 +123,18 @@ class sockinfo_udp : public sockinfo { * Sampling the OS immediately by matching the rx_skip_os counter * (m_rx_udp_poll_os_ratio_counter) to the limit (safe_mce_sys().rx_udp_poll_os_ratio) */ - void set_immediate_os_sample(); + void set_immediate_os_sample() override; /** * Reseting rx_skip_os counter to prevent sampling OS immediately */ - void unset_immediate_os_sample(); + void unset_immediate_os_sample() override; /** * Process a Rx request, we might have a ready packet, or we might block until * we have one (if sockinfo::m_b_blocking == true) */ ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) override; /** * Check that a call to this sockinfo rx() will not block * -> meaning, we got an offloaded ready rx datagram @@ -128,11 +142,11 @@ class sockinfo_udp : public sockinfo { * * While polling CQ, the fd_array is filled with a list of newly queued packets FD's */ - bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); + bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) override; /** * Arm the event channel(s) assosiated with this sockinfo * Fill the fd_set (p_rxfds) with the correct fd channel values and the p_nfds with the (max_fd - * + 1) Fill the p_cq_mgr_fd_map with the pointer to the cq_mgr asosiated with the fd Return + * + 1) Fill the p_cq_mgr_fd_map with the pointer to the cq_mgr_rx asosiated with the fd Return * count of channels (fds) that where mapped */ int rx_request_notification(uint64_t poll_sn); @@ -141,14 +155,14 @@ class sockinfo_udp : public sockinfo { * until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == * true) */ - ssize_t tx(xlio_tx_call_attr_t &tx_arg); + ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; /** * Check that a call to this sockinof rx() will not block * -> meaning, we got a ready rx packet */ - void rx_add_ring_cb(ring *p_ring); - void rx_del_ring_cb(ring *p_ring); - virtual int rx_verify_available_data(); + void rx_add_ring_cb(ring *p_ring) override; + void rx_del_ring_cb(ring *p_ring) override; + int rx_verify_available_data() override; /** * This callback will handle ready rx packet notification, @@ -158,27 +172,38 @@ class sockinfo_udp : public sockinfo { * incremented and method returns false. * Normally it is single point from sockinfo to be called from ring level. 
*/ - bool rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array); + bool rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) override; // This call will handle all rdma related events (bind->listen->connect_req->accept) - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); - virtual inline fd_type_t get_type() { return FD_TYPE_SOCKET; } + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; + int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) override; + inline fd_type_t get_type() override { return FD_TYPE_SOCKET; } - virtual bool prepare_to_close(bool process_shutdown = false); - virtual void update_header_field(data_updater *updater); + bool prepare_to_close(bool process_shutdown = false) override; + void update_header_field(data_updater *updater) override; #if defined(DEFINED_NGINX) - virtual void prepare_to_close_socket_pool(bool _push_pop); - virtual void set_params_for_socket_pool() + void prepare_to_close_socket_pool(bool _push_pop) override; + void set_params_for_socket_pool() override { m_is_for_socket_pool = true; - set_m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); + set_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); } - bool is_closable() { return !m_is_for_socket_pool; } + bool is_closable() override { return !m_is_for_socket_pool; } +#else + bool is_closable() override { return true; } #endif + int register_callback(xlio_recv_callback_t callback, void *context) override + { + return register_callback_ctx(callback, context); + } + +protected: + void lock_rx_q() override { m_lock_rcv.lock(); } + void unlock_rx_q() override { m_lock_rcv.unlock(); } + private: bool packet_is_loopback(mem_buf_desc_t *p_desc); ssize_t check_payload_size(const iovec *p_iov, ssize_t sz_iov); @@ -195,7 +220,7 @@ class sockinfo_udp : public sockinfo { void handle_pending_mreq(); void original_os_setsockopt_helper(const void *pram, int pram_size, int optname, int level); /* helper functions */ - void set_blocking(bool is_blocked); + void set_blocking(bool is_blocked) override; void rx_ready_byte_count_limit_update( size_t n_rx_ready_bytes_limit); // Drop rx ready packets from head of queue @@ -212,10 +237,10 @@ class sockinfo_udp : public sockinfo { inline int poll_os(); virtual inline void reuse_buffer(mem_buf_desc_t *buff); - virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc); - virtual mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx); - virtual timestamps_t *get_socket_timestamps(); - virtual void update_socket_timestamps(timestamps_t *) {}; + mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) override; + mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx) override; + timestamps_t *get_socket_timestamps() override; + void update_socket_timestamps(timestamps_t *) override {}; inline void return_reuse_buffers_postponed() { @@ -232,7 +257,7 @@ class sockinfo_udp : public sockinfo { while (iter != m_rx_ring_map.end()) { descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num >= m_rx_num_buffs_reuse) { if (iter->first->reclaim_recv_buffers(rx_reuse)) { n_buff_num = 0; } else { @@ -248,16 +273,16 @@ class sockinfo_udp : public 
sockinfo { inline void update_ready(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array, xlio_recv_callback_retval_t cb_ret); - virtual void post_deqeue(bool release_buff); - virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); - virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, - int *p_out_flags); - virtual void handle_ip_pktinfo(struct cmsg_state *cm_state); + void post_deqeue(bool release_buff) override; + int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) override; + size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) override; + void handle_ip_pktinfo(struct cmsg_state *cm_state) override; - virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list(); - virtual size_t get_size_m_rx_pkt_ready_list(); - virtual void pop_front_m_rx_pkt_ready_list(); - virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff); + mem_buf_desc_t *get_front_m_rx_pkt_ready_list() override; + size_t get_size_m_rx_pkt_ready_list() override; + void pop_front_m_rx_pkt_ready_list() override; + void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) override; private: struct port_socket_t { @@ -268,6 +293,7 @@ class sockinfo_udp : public sockinfo { bool operator==(const int &r_port) { return port == r_port; } }; + uint32_t m_rx_ready_byte_limit; ip_addr m_mc_tx_src_ip; bool m_b_mc_tx_loop; uint8_t m_n_mc_ttl_hop_lim; diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 0448ca32d..f0c6125e7 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -46,7 +46,7 @@ #define si_ulp_logerr __log_info_err /*inline*/ -ring *sockinfo_tcp_ops::get_tx_ring(void) +ring *sockinfo_tcp_ops::get_tx_ring() { return m_p_sock->get_tx_ring(); } @@ -86,11 +86,11 @@ bool sockinfo_tcp_ops::handle_send_ret(ssize_t ret, struct tcp_seg *seg) #include struct xlio_tls_api { - EVP_CIPHER_CTX *(*EVP_CIPHER_CTX_new)(void); + EVP_CIPHER_CTX *(*EVP_CIPHER_CTX_new)(); void (*EVP_CIPHER_CTX_free)(EVP_CIPHER_CTX *); int (*EVP_CIPHER_CTX_reset)(EVP_CIPHER_CTX *); - const EVP_CIPHER *(*EVP_aes_128_gcm)(void); - const EVP_CIPHER *(*EVP_aes_256_gcm)(void); + const EVP_CIPHER *(*EVP_aes_128_gcm)(); + const EVP_CIPHER *(*EVP_aes_256_gcm)(); int (*EVP_DecryptInit_ex)(EVP_CIPHER_CTX *, const EVP_CIPHER *, ENGINE *, const unsigned char *, const unsigned char *); int (*EVP_DecryptUpdate)(EVP_CIPHER_CTX *, unsigned char *, int *, const unsigned char *, int); @@ -116,7 +116,7 @@ template static void dlsym_default(T &ptr, const char *name) #define XLIO_TLS_API_FIND(__name) dlsym_default(s_tls_api.__name, #__name); -void xlio_tls_api_setup(void) +void xlio_tls_api_setup() { XLIO_TLS_API_FIND(EVP_CIPHER_CTX_new); XLIO_TLS_API_FIND(EVP_CIPHER_CTX_free); @@ -171,9 +171,10 @@ static inline uint8_t get_alert_level(uint8_t alert_type) * tls_record */ -enum { +enum : size_t { TLS_RECORD_HDR_LEN = 5U, TLS_RECORD_IV_LEN = TLS_AES_GCM_IV_LEN, + TLS_13_RECORD_IV_LEN = 0U, TLS_RECORD_TAG_LEN = 16U, TLS_RECORD_NONCE_LEN = 12U, /* SALT + IV */ /* TLS 1.2 record overhead. 
*/ @@ -199,8 +200,8 @@ class tls_record : public mem_desc { m_record_number = record_number; m_size = TLS_RECORD_HDR_LEN + TLS_RECORD_TAG_LEN; m_p_data = nullptr; - tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner != nullptr); - if (likely(m_p_buf)) { + tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner); + if (likely(m_p_buf && m_p_data)) { if (iv) { m_size += TLS_RECORD_IV_LEN; memcpy(&m_p_data[5], iv, TLS_RECORD_IV_LEN); @@ -221,7 +222,7 @@ class tls_record : public mem_desc { m_p_zc_data = nullptr; } - ~tls_record() + ~tls_record() override { /* * Because of batching, buffers can be freed after their socket @@ -235,9 +236,9 @@ class tls_record : public mem_desc { } } - void get(void) { (void)atomic_fetch_and_inc(&m_ref); } + void get() override { (void)atomic_fetch_and_inc(&m_ref); } - void put(void) + void put() override { int ref = atomic_fetch_and_dec(&m_ref); @@ -246,7 +247,8 @@ class tls_record : public mem_desc { } } - uint32_t get_lkey(mem_buf_desc_t *desc, ib_ctx_handler *ib_ctx, const void *addr, size_t len) + uint32_t get_lkey(mem_buf_desc_t *desc, ib_ctx_handler *ib_ctx, const void *addr, + size_t len) override { const uintptr_t uaddr = (uintptr_t)addr; const uintptr_t ubuf = (uintptr_t)m_p_buf->p_buffer; @@ -273,7 +275,7 @@ class tls_record : public mem_desc { return len; } - inline size_t avail_space(void) + inline size_t avail_space() { /* Don't produce records larger than 16KB according to the protocol. */ size_t max_len = m_p_zc_owner ? (size_t)TLS_RECORD_MAX @@ -305,7 +307,8 @@ class tls_record : public mem_desc { assert(iov_max >= 3); (void)iov_max; iov[0].iov_base = m_p_data; - iov[0].iov_len = TLS_RECORD_HDR_LEN + (is_tls13 ? 0 : TLS_RECORD_IV_LEN); + iov[0].iov_len = + TLS_RECORD_HDR_LEN + (is_tls13 ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN); iov[1].iov_base = m_p_zc_data; iov[1].iov_len = m_size - (is_tls13 ? TLS_13_RECORD_OVERHEAD : TLS_12_RECORD_OVERHEAD); iov[2].iov_base = m_p_data + iov[0].iov_len; @@ -317,7 +320,7 @@ class tls_record : public mem_desc { } private: - inline void set_length(void) + inline void set_length() { uint16_t len = m_size - TLS_RECORD_HDR_LEN; @@ -416,9 +419,9 @@ sockinfo_tcp_ops_tls::~sockinfo_tcp_ops_tls() * users. Note, we are under TCP connection lock here. */ mem_buf_desc_t *pdesc = m_rx_bufs.front(); - if (pdesc->lwip_pbuf.pbuf.ref > 1) { + if (pdesc->lwip_pbuf.ref > 1) { m_rx_bufs.pop_front(); - pbuf_free(&pdesc->lwip_pbuf.pbuf); + pbuf_free(&pdesc->lwip_pbuf); } while (!m_rx_bufs.empty()) { pdesc = m_rx_bufs.get_and_pop_front(); @@ -449,7 +452,7 @@ void sockinfo_tcp_ops_tls::get_record_buf(mem_buf_desc_t *&buf, uint8_t *&data, m_zc_stor = m_p_sock->tcp_tx_mem_buf_alloc(PBUF_RAM); m_zc_stor_offset = 0; if (likely(m_zc_stor)) { - m_zc_stor->lwip_pbuf.pbuf.ref += m_zc_stor->sz_buffer / TLS_ZC_BLOCK; + m_zc_stor->lwip_pbuf.ref += m_zc_stor->sz_buffer / TLS_ZC_BLOCK; } } buf = m_zc_stor; @@ -570,11 +573,6 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return -1; } - if (unlikely(keylen > TLS_AES_GCM_KEY_MAX)) { - errno = EINVAL; - return -1; - } - xlio_tls_info *tls_info = (__optname == TLS_TX) ? 
&m_tls_info_tx : &m_tls_info_rx; tls_info->tls_version = base_info->version; tls_info->tls_cipher = base_info->cipher_type; @@ -605,7 +603,7 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return -1; } m_is_tls_tx = true; - m_p_sock->m_p_socket_stats->tls_tx_offload = true; + m_p_sock->get_sock_stats()->tls_tx_offload = true; } else { m_p_cipher_ctx = (void *)g_tls_api->EVP_CIPHER_CTX_new(); if (unlikely(!m_p_cipher_ctx)) { @@ -658,12 +656,12 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o } tcp_recv(m_p_sock->get_pcb(), sockinfo_tcp_ops_tls::rx_lwip_cb); - m_p_sock->m_p_socket_stats->tls_rx_offload = true; + m_p_sock->get_sock_stats()->tls_rx_offload = true; m_p_sock->unlock_tcp_con(); } - m_p_sock->m_p_socket_stats->tls_version = base_info->version; - m_p_sock->m_p_socket_stats->tls_cipher = base_info->cipher_type; + m_p_sock->get_sock_stats()->tls_version = base_info->version; + m_p_sock->get_sock_stats()->tls_cipher = base_info->cipher_type; si_ulp_logdbg("TLS%s %s offload is configured, keylen=%u", base_info->version == TLS_1_2_VERSION ? "1.2" : "1.3", @@ -671,7 +669,7 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return 0; } -err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets(void) +err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets() { err_t ret = ERR_OK; @@ -682,7 +680,7 @@ err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets(void) * receive encrypted TLS records with header and TAG after successful * setsockopt() call. */ - if (m_p_sock->m_p_socket_stats->n_rx_ready_pkt_count != 0) { + if (m_p_sock->get_rx_pkt_ready_list_count() > 0) { descq_t descs_rx_ready; m_p_sock->sock_pop_descs_rx_ready(&descs_rx_ready); @@ -690,7 +688,7 @@ err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets(void) mem_buf_desc_t *temp; temp = descs_rx_ready.front(); descs_rx_ready.pop_front(); - ret = recv(&temp->lwip_pbuf.pbuf); + ret = recv(&temp->lwip_pbuf); if (unlikely(ERR_OK != ret)) { break; } @@ -752,21 +750,16 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) } } + uint8_t *iv = is_tx_tls13() ? nullptr : m_tls_info_tx.iv; + mem_desc *zc_owner = is_zerocopy ? reinterpret_cast(tx_arg.priv.mdesc) : nullptr; for (ssize_t i = 0; i < tx_arg.attr.sz_iov; ++i) { pos = 0; while (pos < p_iov[i].iov_len) { tls_record *rec; ssize_t ret2; - size_t sndbuf = m_p_sock->sndbuf_available(); - size_t tosend = p_iov[i].iov_len - pos; + size_t tosend = std::min(p_iov[i].iov_len - pos, TLS_RECORD_MAX); - /* - * XXX This approach can lead to issue with epoll() - * since such a socket will always be ready for write - */ - if (!block_this_run && sndbuf < TLS_RECORD_SMALLEST && - (sndbuf < m_tls_rec_overhead || (sndbuf - m_tls_rec_overhead) < tosend)) { - /* We don't want to create too small TLS records when we do partial write. */ + if (m_p_sock->sndbuf_available() == 0U && !block_this_run) { if (ret == 0) { errno = EAGAIN; ret = -1; @@ -774,10 +767,8 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) goto done; } - rec = new tls_record( - this, m_p_sock->get_next_tcp_seqno(), m_next_recno_tx, - is_tx_tls13() ? nullptr : m_tls_info_tx.iv, - is_zerocopy ? 
reinterpret_cast(tx_arg.priv.mdesc) : nullptr); + rec = + new tls_record(this, m_p_sock->get_next_tcp_seqno(), m_next_recno_tx, iv, zc_owner); if (unlikely(!rec || !rec->m_p_buf)) { if (ret == 0) { errno = ENOMEM; @@ -792,6 +783,14 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) } goto done; } + + tosend = rec->append_data((uint8_t *)p_iov[i].iov_base + pos, tosend, is_tx_tls13()); + /* Set type after all data, because for TLS1.3 it is in the tail. */ + rec->set_type(tls_type, is_tx_tls13()); + rec->fill_iov(tls_arg.attr.iov, ARRAY_SIZE(tls_iov), is_tx_tls13()); + tls_arg.priv.mdesc = reinterpret_cast(rec); + pos += tosend; + ++m_next_recno_tx; /* * Prepare unique explicit_nonce for the next TLS1.2 record. @@ -800,20 +799,16 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) if (!is_tx_tls13()) { ++m_tls_info_tx.iv64; } - + retry: if (!block_this_run) { - /* sndbuf overflow is not possible since we have a check above. */ - tosend = std::min(tosend, sndbuf - m_tls_rec_overhead); - } - tosend = rec->append_data((uint8_t *)p_iov[i].iov_base + pos, tosend, is_tx_tls13()); - /* Set type after all data, because for TLS1.3 it is in the tail. */ - rec->set_type(tls_type, is_tx_tls13()); - rec->fill_iov(tls_arg.attr.iov, ARRAY_SIZE(tls_iov), is_tx_tls13()); - tls_arg.priv.mdesc = reinterpret_cast(rec); - pos += tosend; + m_p_sock->rx_poll_on_tx_if_needed(); + ret2 = m_p_sock->tcp_tx_express(tls_arg.attr.iov, tls_arg.attr.sz_iov, 0, + XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, + reinterpret_cast(rec)); - retry: - ret2 = m_p_sock->tcp_tx(tls_arg); + } else { + ret2 = m_p_sock->tcp_tx(tls_arg); + } if (block_this_run && (ret2 != (ssize_t)tls_arg.attr.iov[0].iov_len)) { if ((ret2 >= 0) || (errno == EINTR && !g_b_exit)) { ret2 = ret2 < 0 ? 0 : ret2; @@ -863,8 +858,11 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) /* Statistics */ if (ret > 0) { errno = errno_save; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_records += m_next_recno_tx - last_recno; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_bytes += ret; + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_records += + m_next_recno_tx - last_recno; + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_bytes += ret; + } } return ret; } @@ -890,7 +888,7 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ uint64_t recno_be64 = htobe64(rec->m_record_number); bool skip_static = !memcmp(m_tls_info_tx.rec_seq, &recno_be64, TLS_AES_GCM_REC_SEQ_LEN); - bool is_zerocopy = rec->m_p_zc_owner != nullptr; + bool is_zerocopy = rec->m_p_zc_owner; unsigned mss = m_p_sock->get_mss(); uint32_t totlen = seg->seqno - rec->m_seqno; uint32_t lkey = LKEY_TX_DEFAULT; @@ -899,7 +897,9 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ if (is_zerocopy) { hdrlen = std::min( - TLS_RECORD_HDR_LEN + (is_tx_tls13() ? 0 : TLS_RECORD_IV_LEN), totlen); + TLS_RECORD_HDR_LEN + + (is_tx_tls13() ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN), + totlen); taillen = TLS_RECORD_TAG_LEN + !!is_tx_tls13(); /* Determine the trailer portion to resend. 
*/ taillen = std::max(totlen + taillen, rec->m_size) - rec->m_size; @@ -977,8 +977,8 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ m_expected_seqno = seg->seqno; /* Statistics */ - ++m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_resync; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_resync_replay += + ++m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_resync; + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_resync_replay += (seg->seqno != rec->m_seqno); } m_expected_seqno += seg->len; @@ -1055,18 +1055,18 @@ void sockinfo_tcp_ops_tls::copy_by_offset(uint8_t *dst, uint32_t offset, uint32_ mem_buf_desc_t *pdesc = *iter; /* Skip leading buffers */ - if (unlikely(pdesc->lwip_pbuf.pbuf.len <= offset)) { - while (pdesc && pdesc->lwip_pbuf.pbuf.len <= offset) { - offset -= pdesc->lwip_pbuf.pbuf.len; + if (unlikely(pdesc->lwip_pbuf.len <= offset)) { + while (pdesc && pdesc->lwip_pbuf.len <= offset) { + offset -= pdesc->lwip_pbuf.len; pdesc = *(++iter); } } /* Copy */ while (likely(pdesc) && len > 0) { - uint32_t buflen = std::min(pdesc->lwip_pbuf.pbuf.len - offset, len); + uint32_t buflen = std::min(pdesc->lwip_pbuf.len - offset, len); - memcpy(dst, (uint8_t *)pdesc->lwip_pbuf.pbuf.payload + offset, buflen); + memcpy(dst, (uint8_t *)pdesc->lwip_pbuf.payload + offset, buflen); len -= buflen; dst += buflen; offset = 0; @@ -1083,24 +1083,24 @@ uint16_t sockinfo_tcp_ops_tls::offset_to_host16(uint32_t offset) uint16_t res = 0; /* Skip leading buffers */ - if (unlikely(pdesc->lwip_pbuf.pbuf.len <= offset)) { - while (pdesc && pdesc->lwip_pbuf.pbuf.len <= offset) { - offset -= pdesc->lwip_pbuf.pbuf.len; + if (unlikely(pdesc->lwip_pbuf.len <= offset)) { + while (pdesc && pdesc->lwip_pbuf.len <= offset) { + offset -= pdesc->lwip_pbuf.len; pdesc = *(++iter); } } if (likely(pdesc)) { - res = (uint16_t)((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[offset] << 8U; + res = (uint16_t)((uint8_t *)pdesc->lwip_pbuf.payload)[offset] << 8U; ++offset; - if (unlikely(offset >= pdesc->lwip_pbuf.pbuf.len)) { + if (unlikely(offset >= pdesc->lwip_pbuf.len)) { offset = 0; pdesc = *(++iter); if (unlikely(!pdesc)) { return 0; } } - res |= (uint16_t)((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[offset]; + res |= (uint16_t)((uint8_t *)pdesc->lwip_pbuf.payload)[offset]; } return res; } @@ -1131,7 +1131,7 @@ int sockinfo_tcp_ops_tls::tls_rx_decrypt(struct pbuf *plist) copy_by_offset(&buf[TLS_AES_GCM_SALT_LEN], m_rx_offset + TLS_RECORD_HDR_LEN, TLS_RECORD_IV_LEN); } - ret = g_tls_api->EVP_DecryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, NULL, + ret = g_tls_api->EVP_DecryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, nullptr, m_tls_info_rx.key, buf); if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; @@ -1150,20 +1150,20 @@ int sockinfo_tcp_ops_tls::tls_rx_decrypt(struct pbuf *plist) copy_by_offset(buf, m_rx_offset, 3); buf[3] = rec_len >> 8U; buf[4] = rec_len & 0xFFU; - ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, NULL, &len, buf, 5); + ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, nullptr, &len, buf, 5); } else { uint16_t rec_len = m_rx_rec_len - m_tls_rec_overhead; *((uint64_t *)buf) = htobe64(m_next_recno_rx); copy_by_offset(buf + 8, m_rx_offset, 3); buf[11] = rec_len >> 8U; buf[12] = rec_len & 0xFFU; - ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, NULL, &len, buf, 13); + ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, nullptr, &len, buf, 13); } if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; } - for (p = plist; p != NULL; p = p->next) { + for (p = plist; p; p = p->next) { if 
(((mem_buf_desc_t *)p)->rx.tls_decrypted == TLS_RX_DECRYPTED) { /* * This is partially decrypted record, stop here @@ -1214,7 +1214,7 @@ int sockinfo_tcp_ops_tls::tls_rx_encrypt(struct pbuf *plist) copy_by_offset(&buf[TLS_AES_GCM_SALT_LEN], m_rx_offset + TLS_RECORD_HDR_LEN, TLS_RECORD_IV_LEN); } - ret = g_tls_api->EVP_EncryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, NULL, + ret = g_tls_api->EVP_EncryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, nullptr, m_tls_info_rx.key, buf); if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; @@ -1232,13 +1232,13 @@ int sockinfo_tcp_ops_tls::tls_rx_encrypt(struct pbuf *plist) copy_by_offset(buf, m_rx_offset, 3); buf[3] = rec_len >> 8U; buf[4] = rec_len & 0xFFU; - ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, NULL, &len, buf, 5); + ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, nullptr, &len, buf, 5); } else { *((uint64_t *)buf) = htobe64(m_next_recno_rx); copy_by_offset(buf + 8, m_rx_offset, 3); buf[11] = rec_len >> 8U; buf[12] = rec_len & 0xFFU; - ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, NULL, &len, buf, 13); + ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, nullptr, &len, buf, 13); } if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; @@ -1293,14 +1293,14 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) m_p_tx_ring->credits_get(SQ_CREDITS_TLS_RX_GET_PSV)) { /* If we fail to request credits we will retry resync flow with the next incoming packet. */ m_rx_psv_buf = m_p_sock->tcp_tx_mem_buf_alloc(PBUF_RAM); - m_rx_psv_buf->lwip_pbuf.pbuf.payload = + m_rx_psv_buf->lwip_pbuf.payload = (void *)(((uintptr_t)m_rx_psv_buf->p_buffer + 63U) >> 6U << 6U); - uint8_t *payload = (uint8_t *)m_rx_psv_buf->lwip_pbuf.pbuf.payload; + uint8_t *payload = (uint8_t *)m_rx_psv_buf->lwip_pbuf.payload; if (likely(m_rx_psv_buf->sz_buffer >= (size_t)(payload - m_rx_psv_buf->p_buffer + 64))) { - memset(m_rx_psv_buf->lwip_pbuf.pbuf.payload, 0, 64); + memset(m_rx_psv_buf->lwip_pbuf.payload, 0, 64); m_rx_resync_recno = m_next_recno_rx; m_p_tx_ring->tls_get_progress_params_rx(m_p_tir, payload, LKEY_TX_DEFAULT); - ++m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_resync; + ++m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_resync; } } @@ -1343,7 +1343,8 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) struct pbuf *pi; struct pbuf *pres = nullptr; struct pbuf *ptmp = nullptr; - uint32_t offset = m_rx_offset + TLS_RECORD_HDR_LEN + (is_rx_tls13() ? 0 : TLS_RECORD_IV_LEN); + uint32_t offset = m_rx_offset + TLS_RECORD_HDR_LEN + + (is_rx_tls13() ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN); uint32_t remain = m_rx_rec_len - m_tls_rec_overhead; unsigned bufs_nr = 0; unsigned decrypted_nr = 0; @@ -1351,7 +1352,7 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) uint8_t tls_decrypted = 0; mem_buf_desc_t *pdesc = *iter; - tls_type = ((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[m_rx_offset]; + tls_type = ((uint8_t *)pdesc->lwip_pbuf.payload)[m_rx_offset]; if (is_rx_tls13()) { /* TLS 1.3 sends record type as the last byte of the payload. 
*/ ++remain; @@ -1366,7 +1367,7 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) break; } - pi = &pdesc->lwip_pbuf.pbuf; + pi = &pdesc->lwip_pbuf; if (pi->len <= offset) { offset -= pi->len; goto next_buffer; @@ -1444,8 +1445,11 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) } /* Statistics */ - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_partial += !!(decrypted_nr != 0); + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records_partial += + !!(decrypted_nr != 0); + } } /* Handle decryption failures. */ @@ -1477,10 +1481,12 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) } /* Statistics */ - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records += 1U; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; - /* Adjust TCP counters with received TLS header/trailer. */ - m_p_sock->m_p_socket_stats->counters.n_rx_bytes += m_tls_rec_overhead; + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records += 1U; + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; + /* Adjust TCP counters with received TLS header/trailer. */ + m_p_sock->get_sock_stats()->counters.n_rx_bytes += m_tls_rec_overhead; + } ++m_next_recno_rx; @@ -1503,18 +1509,18 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) break; } pdesc = m_rx_bufs.front(); - if (pdesc->lwip_pbuf.pbuf.len > (m_rx_rec_len + m_rx_offset)) { + if (pdesc->lwip_pbuf.len > (m_rx_rec_len + m_rx_offset)) { break; } m_rx_bufs.pop_front(); - m_rx_rec_len -= pdesc->lwip_pbuf.pbuf.len - m_rx_offset; - m_rx_rec_rcvd -= pdesc->lwip_pbuf.pbuf.len - m_rx_offset; + m_rx_rec_len -= pdesc->lwip_pbuf.len - m_rx_offset; + m_rx_rec_rcvd -= pdesc->lwip_pbuf.len - m_rx_offset; m_rx_offset = 0; /* * pbuf_free() is slow when it actually frees a buffer, however, * we expect to only reduce ref counter with this call. */ - pbuf_free(&pdesc->lwip_pbuf.pbuf); + pbuf_free(&pdesc->lwip_pbuf); } m_rx_offset += m_rx_rec_len; m_rx_rec_rcvd -= m_rx_rec_len; @@ -1559,7 +1565,7 @@ void sockinfo_tcp_ops_tls::rx_comp_callback(void *arg) if (utls->m_rx_psv_buf) { /* Resync flow, GET_PSV is completed. 
*/ struct xlio_tls_progress_params *params = - (struct xlio_tls_progress_params *)utls->m_rx_psv_buf->lwip_pbuf.pbuf.payload; + (struct xlio_tls_progress_params *)utls->m_rx_psv_buf->lwip_pbuf.payload; uint32_t resync_seqno = be32toh(params->hw_resync_tcp_sn); int tracker = params->state >> 6U; int auth = (params->state >> 4U) & 0x3U; diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index 7a2ce6c70..b9970d1a2 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -33,7 +33,7 @@ #ifndef _SOCKINFO_ULP_H #define _SOCKINFO_ULP_H -#include "socket_fd_api.h" /* xlio_tx_call_attr_t */ +#include "sockinfo.h" /* xlio_tx_call_attr_t */ #include "proto/dst_entry.h" /* xlio_send_attr */ #include "proto/tls.h" /* xlio_tls_info */ #include "lwip/err.h" /* err_t */ @@ -53,9 +53,9 @@ class sockinfo_tcp_ops { public: sockinfo_tcp_ops(sockinfo_tcp *sock) : m_p_sock(sock) {}; - virtual ~sockinfo_tcp_ops() {} + virtual ~sockinfo_tcp_ops() = default; - inline ring *get_tx_ring(void); + inline ring *get_tx_ring(); virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg); @@ -79,12 +79,12 @@ enum xlio_utls_mode { UTLS_MODE_RX = 1 << 1, }; -void xlio_tls_api_setup(void); +void xlio_tls_api_setup(); class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { public: sockinfo_tcp_ops_tls(sockinfo_tcp *sock); - ~sockinfo_tcp_ops_tls(); + ~sockinfo_tcp_ops_tls() override; int setsockopt(int, int, const void *, socklen_t) override; ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; @@ -94,13 +94,13 @@ class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { void get_record_buf(mem_buf_desc_t *&buf, uint8_t *&data, bool is_zerocopy); private: - inline bool is_tx_tls13(void) { return m_tls_info_tx.tls_version == TLS_1_3_VERSION; } - inline bool is_rx_tls13(void) { return m_tls_info_rx.tls_version == TLS_1_3_VERSION; } + inline bool is_tx_tls13() { return m_tls_info_tx.tls_version == TLS_1_3_VERSION; } + inline bool is_rx_tls13() { return m_tls_info_rx.tls_version == TLS_1_3_VERSION; } int send_alert(uint8_t alert_type); void terminate_session_fatal(uint8_t alert_type); - err_t tls_rx_consume_ready_packets(void); + err_t tls_rx_consume_ready_packets(); err_t recv(struct pbuf *p) override; void copy_by_offset(uint8_t *dst, uint32_t offset, uint32_t len); uint16_t offset_to_host16(uint32_t offset); diff --git a/src/core/sock/tcp_seg_pool.cpp b/src/core/sock/tcp_seg_pool.cpp deleted file mode 100644 index 396bd6172..000000000 --- a/src/core/sock/tcp_seg_pool.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "tcp_seg_pool.h" -#include "core/util/utils.h" -#include "vlogger/vlogger.h" - -#define MODULE_NAME "tcp_seg_pool" - -extern global_stats_t g_global_stat_static; - -tcp_seg_pool *g_tcp_seg_pool = NULL; - -tcp_seg_pool::tcp_seg_pool() - : m_p_head(nullptr) - , m_allocator(false) -{ - memset(&m_stats, 0, sizeof(m_stats)); - expand(); -} - -tcp_seg_pool::~tcp_seg_pool() -{ - print_report(); -} - -tcp_seg *tcp_seg_pool::get_tcp_segs(uint32_t amount) -{ - return get_tcp_seg_list(amount).first; -} - -std::pair tcp_seg_pool::get_tcp_seg_list(uint32_t amount) -{ - uint32_t count; - tcp_seg *head, *next, *prev; - if (unlikely(amount <= 0)) { - return std::make_pair(nullptr, nullptr); - } - lock(); -repeat: - count = amount; - head = next = m_p_head; - prev = NULL; - while (count > 0 && next) { - prev = next; - next = next->next; - count--; - } - if (count) { - // run out of segments - if (expand()) { - goto repeat; - } - g_global_stat_static.n_tcp_seg_pool_no_segs++; - unlock(); - return std::make_pair(nullptr, nullptr); - } - prev->next = NULL; - m_p_head = next; - m_stats.allocations++; - g_global_stat_static.n_tcp_seg_pool_size -= amount; - unlock(); - - return std::make_pair(head, prev); -} - -void tcp_seg_pool::put_tcp_segs(tcp_seg *seg_list) -{ - tcp_seg *next = seg_list; - if (unlikely(!seg_list)) { - return; - } - - int i; - for (i = 1; next->next; i++) { - next = next->next; - } - - lock(); - next->next = m_p_head; - m_p_head = seg_list; - g_global_stat_static.n_tcp_seg_pool_size += i; - unlock(); -} - -// Splitting seg list such that first 'count' segs are returned and 'tcp_seg_list' -// is updated to point to the remaining segs. -// The length of tcp_seg_list is assumed to be at least 'count' long. -tcp_seg *tcp_seg_pool::split_tcp_segs(uint32_t count, tcp_seg *&tcp_seg_list, uint32_t &total_count) -{ - struct tcp_seg *head = tcp_seg_list; - struct tcp_seg *last = head; - total_count -= count; - while (count-- > 1U) { - last = last->next; - } - - tcp_seg_list = last->next; - last->next = nullptr; - return head; -} - -bool tcp_seg_pool::expand() -{ - size_t size = sizeof(tcp_seg) * safe_mce_sys().tx_segs_pool_batch_tcp; - tcp_seg *tcp_segs_array = (tcp_seg *)m_allocator.alloc(size); - - if (!tcp_segs_array) { - __log_dbg("TCP segments allocation failed"); - return false; - } - - // Allocator can allocate more memory than requested - utilize it. 
- size_t segs_nr = size / sizeof(tcp_seg); - - if (segs_nr > 0) { - memset(tcp_segs_array, 0, size); - for (size_t i = 0; i < segs_nr - 1; i++) { - tcp_segs_array[i].next = &tcp_segs_array[i + 1]; - } - tcp_segs_array[segs_nr - 1].next = m_p_head; - m_p_head = &tcp_segs_array[0]; - m_stats.total_segs += segs_nr; - m_stats.expands++; - g_global_stat_static.n_tcp_seg_pool_size += segs_nr; - } - return true; -} - -void tcp_seg_pool::print_report(vlog_levels_t log_level /*=VLOG_DEBUG*/) -{ - vlog_printf(log_level, "TCP segments pool statistics:\n"); - vlog_printf(log_level, " allocations=%u expands=%u total_segs=%u\n", m_stats.allocations, - m_stats.expands, m_stats.total_segs); -} diff --git a/src/core/util/agent.cpp b/src/core/util/agent.cpp index 90c362938..0cc47f493 100644 --- a/src/core/util/agent.cpp +++ b/src/core/util/agent.cpp @@ -60,6 +60,12 @@ #define AGENT_DEFAULT_ALIVE (1) /* periodic time for alive check (in sec) */ /* Force system call */ +#ifdef XLIO_STATIC_BUILD +#define sys_call(_result, _func, ...) \ + do { \ + _result = ::_func(__VA_ARGS__); \ + } while (0) +#else /* XLIO_STATIC_BUILD */ #define sys_call(_result, _func, ...) \ do { \ if (orig_os_api._func) \ @@ -67,6 +73,7 @@ else \ _result = ::_func(__VA_ARGS__); \ } while (0) +#endif /* XLIO_STATIC_BUILD */ /* Print user notification */ #define output_fatal() \ @@ -84,7 +91,7 @@ vlog_printf(_level, "*************************************************************\n"); \ } while (0) -agent *g_p_agent = NULL; +agent *g_p_agent = nullptr; agent::agent() : m_state(AGENT_CLOSED) @@ -93,7 +100,7 @@ agent::agent() , m_msg_num(AGENT_DEFAULT_MSG_NUM) { int rc = 0; - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; int i = 0; INIT_LIST_HEAD(&m_cb_queue); @@ -107,7 +114,7 @@ agent::agent() while (i--) { /* coverity[overwrite_var] */ msg = (agent_msg_t *)calloc(1, sizeof(*msg)); - if (NULL == msg) { + if (!msg) { rc = -ENOMEM; __log_dbg("failed queue creation (rc = %d)", rc); goto err; @@ -207,8 +214,8 @@ agent::agent() agent::~agent() { - agent_msg_t *msg = NULL; - agent_callback_t *cb = NULL; + agent_msg_t *msg = nullptr; + agent_callback_t *cb = nullptr; if (AGENT_CLOSED == m_state) { return; @@ -253,14 +260,14 @@ agent::~agent() void agent::register_cb(agent_cb_t fn, void *arg) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; if (AGENT_CLOSED == m_state) { return; } - if (NULL == fn) { + if (!fn) { return; } @@ -287,8 +294,8 @@ void agent::register_cb(agent_cb_t fn, void *arg) void agent::unregister_cb(agent_cb_t fn, void *arg) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; if (AGENT_CLOSED == m_state) { return; @@ -311,7 +318,7 @@ void agent::unregister_cb(agent_cb_t fn, void *arg) int agent::put(const void *data, size_t length, intptr_t tag) { - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; int i = 0; if (AGENT_CLOSED == m_state) { @@ -338,7 +345,7 @@ int agent::put(const void *data, size_t length, intptr_t tag) for (i = 0; i < AGENT_DEFAULT_MSG_GROW; i++) { /* coverity[overwrite_var] */ msg = (agent_msg_t *)malloc(sizeof(*msg)); - if (NULL == msg) { + if (!msg) { break; } msg->length = 0; @@ -370,7 +377,7 @@ int agent::put(const void *data, size_t length, intptr_t tag) void agent::progress(void) { - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; struct timeval tv_now = TIMEVAL_INITIALIZER; static struct timeval tv_inactive_elapsed 
= TIMEVAL_INITIALIZER; static struct timeval tv_alive_elapsed = TIMEVAL_INITIALIZER; @@ -424,8 +431,8 @@ void agent::progress(void) void agent::progress_cb(void) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; m_cb_lock.lock(); list_for_each(entry, &m_cb_queue) @@ -448,7 +455,7 @@ int agent::send(agent_msg_t *msg) return -EBADF; } - if (NULL == msg) { + if (!msg) { return -EINVAL; } diff --git a/src/core/util/cached_obj_pool.h b/src/core/util/cached_obj_pool.h new file mode 100644 index 000000000..f3268adb4 --- /dev/null +++ b/src/core/util/cached_obj_pool.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef CACHED_OBJ_POOL_H
+#define CACHED_OBJ_POOL_H
+
+#include
+#include "dev/allocator.h"
+#include "utils/lock_wrapper.h"
+
+template <typename T> class cached_obj_pool : lock_spin {
+public:
+    cached_obj_pool(const char *pool_name, size_t alloc_batch, uint32_t &global_obj_pool_size_ref,
+                    uint32_t &global_obj_pool_no_objs_ref);
+    ~cached_obj_pool() override;
+
+    std::pair<T *, T *> get_obj_list(uint32_t amount);
+    T *get_objs(uint32_t amount);
+    void put_objs(T *obj_list);
+
+    static T *split_obj_list(uint32_t count, T *&obj_list, uint32_t &total_count);
+
+protected:
+    bool expand();
+
+    T *m_p_head = nullptr;
+    xlio_allocator_heap m_allocator;
+
+    struct {
+        unsigned total_objs;
+        unsigned allocations;
+        unsigned expands;
+        uint32_t &global_obj_pool_size;
+        uint32_t &global_obj_pool_no_objs;
+    } m_stats;
+
+    const size_t m_alloc_batch;
+    const char *m_pool_name;
+};
+
+template <typename T>
+cached_obj_pool<T>::cached_obj_pool(const char *pool_name, size_t alloc_batch,
+                                    uint32_t &global_obj_pool_size_ref,
+                                    uint32_t &global_obj_pool_no_objs_ref)
+    : m_allocator(false)
+    , m_stats {0U, 0U, 0U, global_obj_pool_size_ref, global_obj_pool_no_objs_ref}
+    , m_alloc_batch(alloc_batch)
+    , m_pool_name(pool_name)
+{
+    expand();
+}
+
+template <typename T> cached_obj_pool<T>::~cached_obj_pool()
+{
+    vlog_printf(VLOG_DEBUG, "%s pool statistics:\n", m_pool_name);
+    vlog_printf(VLOG_DEBUG, "  allocations=%u expands=%u total_segs=%u\n", m_stats.allocations,
+                m_stats.expands, m_stats.total_objs);
+}
+
+template <typename T> T *cached_obj_pool<T>::get_objs(uint32_t amount)
+{
+    return get_obj_list(amount).first;
+}
+
+template <typename T> std::pair<T *, T *> cached_obj_pool<T>::get_obj_list(uint32_t amount)
+{
+    uint32_t count;
+    T *head, *next, *prev;
+    if (unlikely(amount <= 0)) {
+        return std::make_pair(nullptr, nullptr);
+    }
+    lock();
+repeat:
+    count = amount;
+    head = next = m_p_head;
+    prev = nullptr;
+    while (count > 0 && next) {
+        prev = next;
+        next = next->next;
+        count--;
+    }
+    if (count) {
+        // Ran out of objects
+        if (expand()) {
+            goto repeat;
+        }
+        m_stats.global_obj_pool_no_objs++;
+        unlock();
+        return std::make_pair(nullptr, nullptr);
+    }
+    prev->next = nullptr;
+    m_p_head = next;
+    m_stats.allocations++;
+    m_stats.global_obj_pool_size -= amount;
+    unlock();
+
+    return std::make_pair(head, prev);
+}
+
+template <typename T> void cached_obj_pool<T>::put_objs(T *obj_list)
+{
+    if (unlikely(!obj_list)) {
+        return;
+    }
+
+    T *next = obj_list;
+    int i;
+    for (i = 1; next->next; i++) {
+        next = next->next;
+    }
+
+    lock();
+    next->next = m_p_head;
+    m_p_head = obj_list;
+    m_stats.global_obj_pool_size += i;
+    unlock();
+}
+
+// Splitting obj list such that first 'count' objs are returned and 'obj_list'
+// is updated to point to the remaining objs.
+// The length of obj_list is assumed to be at least 'count' long.
+template <typename T>
+T *cached_obj_pool<T>::split_obj_list(uint32_t count, T *&obj_list, uint32_t &total_count)
+{
+    T *head = obj_list;
+    T *last = head;
+    total_count -= count;
+    while (count-- > 1U) {
+        last = last->next;
+    }
+
+    obj_list = last->next;
+    last->next = nullptr;
+    return head;
+}
+
+template <typename T> bool cached_obj_pool<T>::expand()
+{
+    size_t size = sizeof(T) * m_alloc_batch;
+    T *objs_array = (T *)m_allocator.alloc(size);
+    if (!objs_array) {
+        vlog_printf(VLOG_DEBUG, "Cached pool failed to allocate objects (%s)\n", m_pool_name);
+        return false;
+    }
+
+    // Allocator can allocate more memory than requested - utilize it.
+ size_t objs_nr = size / sizeof(T); + + if (objs_nr > 0) { + memset(objs_array, 0, size); + for (size_t i = 0; i < objs_nr - 1; i++) { + objs_array[i].next = &objs_array[i + 1]; + } + objs_array[objs_nr - 1].next = m_p_head; + m_p_head = &objs_array[0]; + m_stats.total_objs += objs_nr; + m_stats.expands++; + m_stats.global_obj_pool_size += objs_nr; + } + return true; +} + +#endif diff --git a/src/core/util/hugepage_mgr.cpp b/src/core/util/hugepage_mgr.cpp index 73d84f817..f502786bf 100644 --- a/src/core/util/hugepage_mgr.cpp +++ b/src/core/util/hugepage_mgr.cpp @@ -55,6 +55,18 @@ hugepage_mgr::hugepage_mgr() memset(&m_stats, 0, sizeof(m_stats)); m_default_hugepage = read_meminfo("Hugepagesize:"); update(); + + /* Check hugepage size if requested by user explicitly. */ + if (safe_mce_sys().hugepage_size != 0 && !is_hugepage_supported(safe_mce_sys().hugepage_size)) { + vlog_printf(VLOG_WARNING, + "Requested hugepage %s is not supported by the system. " + "XLIO will autodetect optimal hugepage.\n", + option_size::to_str(safe_mce_sys().hugepage_size)); + /* Value 0 means default autodetection behavior. Don't set MCE_DEFAULT_HUGEPAGE_SIZE + * here, because it can be defined to an unsupported specific value. + */ + safe_mce_sys().hugepage_size = 0; + } } void hugepage_mgr::update() @@ -95,7 +107,7 @@ void *hugepage_mgr::alloc_hugepages_helper(size_t &size, size_t hugepage) map_flags = (int)log2(hugepage) << MAP_HUGE_SHIFT; } - ptr = mmap(NULL, actual_size, PROT_READ | PROT_WRITE, + ptr = mmap(nullptr, actual_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB | map_flags, -1, 0); if (ptr == MAP_FAILED) { ptr = nullptr; @@ -106,7 +118,7 @@ void *hugepage_mgr::alloc_hugepages_helper(size_t &size, size_t hugepage) return ptr; } -void *hugepage_mgr::alloc_hugepages(size_t &size) +void *hugepage_mgr::alloc_hugepages(size_t &size, size_t &hugepage_size) { std::lock_guard lock(m_lock); @@ -115,12 +127,12 @@ void *hugepage_mgr::alloc_hugepages(size_t &size) void *ptr = nullptr; std::vector hugepages; - if (safe_mce_sys().hugepage_log2 == 0) { + if (safe_mce_sys().hugepage_size == 0) { get_supported_hugepages(hugepages); std::sort(hugepages.begin(), hugepages.end(), std::greater()); } else { // User requested specific hugepage size - don't check other types. 
- hugepages.push_back(1LU << safe_mce_sys().hugepage_log2); + hugepages.push_back(safe_mce_sys().hugepage_size); } for (auto iter = hugepages.begin(); !ptr && iter != hugepages.end(); ++iter) { @@ -137,6 +149,7 @@ void *hugepage_mgr::alloc_hugepages(size_t &size) } if (ptr) { size = actual_size; + hugepage_size = hugepage; } // Statistics @@ -173,10 +186,9 @@ void hugepage_mgr::print_report(bool short_report /*=false*/) get_supported_hugepages(hugepages); vlog_printf(VLOG_INFO, "Hugepages info:\n"); - if (safe_mce_sys().hugepage_log2) { - vlog_printf(VLOG_INFO, " User forced to use %lu kB hugepages (%s=%u).\n", - (1LU << safe_mce_sys().hugepage_log2) / 1024U, SYS_VAR_HUGEPAGE_LOG2, - safe_mce_sys().hugepage_log2); + if (safe_mce_sys().hugepage_size) { + vlog_printf(VLOG_INFO, " User forced to use %lu kB hugepages.\n", + (safe_mce_sys().hugepage_size) / 1024U); } for (size_t hugepage : hugepages) { vlog_printf(VLOG_INFO, " %zu kB : total=%u free=%u\n", hugepage / 1024U, diff --git a/src/core/util/hugepage_mgr.h b/src/core/util/hugepage_mgr.h index 33f5a8485..b0a8c76ca 100644 --- a/src/core/util/hugepage_mgr.h +++ b/src/core/util/hugepage_mgr.h @@ -66,7 +66,7 @@ class hugepage_mgr { size_t get_default_hugepage() { return m_default_hugepage; } bool is_hugepage_supported(size_t hugepage); - void *alloc_hugepages(size_t &size); + void *alloc_hugepages(size_t &size, size_t &hugepage_size); void dealloc_hugepages(void *ptr, size_t size); void print_report(bool short_report = false); diff --git a/src/core/util/match.cpp b/src/core/util/match.cpp index 3f8f47c80..c17804428 100644 --- a/src/core/util/match.cpp +++ b/src/core/util/match.cpp @@ -85,8 +85,8 @@ static void free_dbl_lst(struct dbl_lst *dbl_lst) free(node); node = tmp; } - dbl_lst->head = NULL; - dbl_lst->tail = NULL; + dbl_lst->head = nullptr; + dbl_lst->tail = nullptr; } static void free_instance_content(struct instance *instance) @@ -124,8 +124,8 @@ void __xlio_free_resources(void) free(node); node = tmp; } - __instance_list.head = NULL; - __instance_list.tail = NULL; + __instance_list.head = nullptr; + __instance_list.tail = nullptr; } void get_address_port_rule_str(char *addr_buf, char *ports_buf, struct address_port_rule *rule) @@ -276,7 +276,7 @@ static inline int match_ipv4_addr(struct address_port_rule *rule, const struct s static int match_ip_addr_and_port(transport_t my_transport, struct use_family_rule *rule, const struct sockaddr *addr_in_first, const socklen_t addrlen_first, - const struct sockaddr *addr_in_second = NULL, + const struct sockaddr *addr_in_second = nullptr, const socklen_t addrlen_second = 0) { const struct sockaddr_in *sin_first = (const struct sockaddr_in *)addr_in_first; @@ -308,7 +308,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru MAX_ADDR_STR_LEN); port_first = ntohs(sin_first->sin_port); } - if (addr_str_first == NULL) { + if (!addr_str_first) { addr_str_first = "INVALID_ADDR"; } @@ -322,7 +322,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru addr_buf_second, MAX_ADDR_STR_LEN); port_second = ntohs(sin_second->sin_port); } - if (addr_str_second == NULL) { + if (!addr_str_second) { addr_str_second = "INVALID_ADDR"; } @@ -350,7 +350,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru } if (match && rule->first.match_by_addr) { - if (__xlio_sockaddr_to_xlio(addr_in_first, addrlen_first, &tmp_sin_first, NULL) || + if (__xlio_sockaddr_to_xlio(addr_in_first, addrlen_first, &tmp_sin_first, nullptr) || 
match_ipv4_addr(&(rule->first), &tmp_sin_first)) { match_logdbg("NEGATIVE MATCH by address"); match = 0; @@ -372,7 +372,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru } if (match && rule->second.match_by_addr) { - if (__xlio_sockaddr_to_xlio(addr_in_second, addrlen_second, &tmp_sin_second, NULL) || + if (__xlio_sockaddr_to_xlio(addr_in_second, addrlen_second, &tmp_sin_second, nullptr) || match_ipv4_addr(&(rule->second), &tmp_sin_second)) { match_logdbg("NEGATIVE MATCH by address"); match = 0; @@ -425,12 +425,12 @@ static transport_t get_family_by_first_matching_rule(transport_t my_transport, struct dbl_lst rules_lst, const struct sockaddr *sin_first, const socklen_t addrlen_first, - const struct sockaddr *sin_second = NULL, + const struct sockaddr *sin_second = nullptr, const socklen_t addrlen_second = 0) { struct dbl_lst_node *node; - for (node = rules_lst.head; node != NULL; node = node->next) { + for (node = rules_lst.head; node; node = node->next) { /* first rule wins */ struct use_family_rule *rule = (struct use_family_rule *)node->data; if (rule) { @@ -447,7 +447,7 @@ static transport_t get_family_by_first_matching_rule(transport_t my_transport, static transport_t get_family_by_instance_first_matching_rule( transport_t my_transport, role_t role, const char *app_id, const struct sockaddr *sin_first, - const socklen_t addrlen_first, const struct sockaddr *sin_second = NULL, + const socklen_t addrlen_first, const struct sockaddr *sin_second = nullptr, const socklen_t addrlen_second = 0) { transport_t target_family = TRANS_DEFAULT; @@ -593,8 +593,7 @@ static transport_t match_by_all_rules_program(in_protocol_t my_protocol, struct struct dbl_lst_node *node; struct use_family_rule *rule; - for (node = rules_lst.head; (node != NULL) && (target_family == TRANS_DEFAULT); - node = node->next) { + for (node = rules_lst.head; (node) && (target_family == TRANS_DEFAULT); node = node->next) { /* * to declare a dont care we either have a dont care address and port * or the previous non global rules use the same target family as the @@ -771,7 +770,7 @@ int __xlio_sockaddr_to_xlio(const struct sockaddr *addr_in, socklen_t addrlen, addr_out->sin_port = sin6->sin6_port; if (inet_ntop(addr_out->sin_family, (void *)&(addr_out->sin_addr), buf, MAX_ADDR_STR_LEN) == - NULL) { + nullptr) { match_logdbg("__xlio_sockaddr_to_xlio: Converted IPv4 address is illegal"); } else { match_logdbg("__xlio_sockaddr_to_xlio: Converted IPv4 is:%s", buf); diff --git a/src/core/util/sg_array.h b/src/core/util/sg_array.h index 8f500c89c..b4210c9ee 100644 --- a/src/core/util/sg_array.h +++ b/src/core/util/sg_array.h @@ -84,14 +84,14 @@ class sg_array { uint8_t *old_p = (uint8_t *)m_sg[m_index].addr + m_pos; m_pos += *get_len; if (unlikely(m_pos < 0)) { - return NULL; + return nullptr; } return old_p; } else { *get_len = m_current->length - m_pos; if (unlikely(m_pos < 0)) { - return NULL; + return nullptr; } uint8_t *old_p = (uint8_t *)m_sg[m_index++].addr + m_pos; // moving to next sge @@ -99,13 +99,13 @@ class sg_array { return old_p; } } - return NULL; + return nullptr; } inline int get_num_sge(void) { return m_sg ? 
m_num_sge : -1; } inline int length(void) { - if (unlikely(m_sg == NULL || m_num_sge == 0)) { + if (unlikely(!m_sg || m_num_sge == 0)) { return 0; } for (int i = 0; i < m_num_sge; i++) { diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index ab0d1acba..c62709b49 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -105,13 +105,13 @@ typedef struct { const char **input_names; } xlio_spec_names; -static const char *names_none[] = {"none", NULL}; -static const char *spec_names_ulatency[] = {"ultra-latency", NULL}; -static const char *spec_names_latency[] = {"latency", NULL}; -static const char *spec_names_multi_ring[] = {"multi_ring_latency", NULL}; -static const char *spec_names_nginx[] = {"nginx", NULL}; -static const char *spec_names_nginx_dpu[] = {"nginx_dpu", NULL}; -static const char *spec_names_nvme_bf2[] = {"nvme_bf2", NULL}; +static const char *names_none[] = {"none", nullptr}; +static const char *spec_names_ulatency[] = {"ultra-latency", nullptr}; +static const char *spec_names_latency[] = {"latency", nullptr}; +static const char *spec_names_multi_ring[] = {"multi_ring_latency", nullptr}; +static const char *spec_names_nginx[] = {"nginx", nullptr}; +static const char *spec_names_nginx_dpu[] = {"nginx_dpu", nullptr}; +static const char *spec_names_nvme_bf2[] = {"nvme_bf2", nullptr}; // must be by order because "to_str" relies on that! static const xlio_spec_names specs[] = { @@ -277,7 +277,7 @@ const char *to_str(MODE option, const OPT (&options)[N]) } } - return NULL; + return nullptr; } } // namespace option_x @@ -309,12 +309,6 @@ static option_t options[] = {AUTO_ON_OFF_IMPL}; OPTION_FROM_TO_STR_IMPL } // namespace option_3 -namespace option_strq { -static option_t options[] = {AUTO_ON_OFF_IMPL, - {REGULAR_RQ, "Regular RQ", {"regular_rq", NULL, NULL}}}; -OPTION_FROM_TO_STR_IMPL -} // namespace option_strq - namespace option_tcp_ctl_thread { static option_t options[] = { {CTL_THREAD_DISABLE, "Disabled", {"disable", "disabled", NULL}}, @@ -374,14 +368,14 @@ int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set) * Here we assume that if we get a second subtoken * then we must be processing a range. */ - subtoken = strtok_r(NULL, dash, &dash_saveptr); + subtoken = strtok_r(nullptr, dash, &dash_saveptr); if (subtoken) { errno = 0; range_end = strtol(subtoken, &endptr, 10); if ((!range_end && *endptr) || errno) { return -1; } - subtoken = NULL; + subtoken = nullptr; } else { range_end = range_start; } @@ -395,7 +389,7 @@ int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set) } } - token = strtok_r(NULL, comma, &comma_saveptr); + token = strtok_r(nullptr, comma, &comma_saveptr); } return 0; @@ -425,7 +419,7 @@ int mce_sys_var::hex_to_cpuset(char *start, cpu_set_t *cpu_set) return -1; } - digit = strtol(hexc, NULL, 16); + digit = strtol(hexc, nullptr, 16); /* * Each hex digit is 4 bits. 
For each bit set per @@ -493,9 +487,9 @@ void mce_sys_var::read_env_variable_with_pid(char *mce_sys_name, size_t mce_sys_ char *env_ptr) { int n = -1; - char *d_pos = NULL; + char *d_pos = nullptr; - if (NULL == env_ptr || NULL == mce_sys_name || mce_sys_max_size < 2) { + if (!env_ptr || !mce_sys_name || mce_sys_max_size < 2) { return; } @@ -584,7 +578,7 @@ const char *mce_sys_var::cpuid_hv_vendor() static __thread char vendor[13] = {0}; if (!cpuid_hv()) { - return NULL; + return nullptr; } #if defined(__x86_64__) uint32_t _ebx = 0, _ecx = 0, _edx = 0; @@ -599,7 +593,7 @@ const char *mce_sys_var::cpuid_hv_vendor() void mce_sys_var::read_hv() { - const char *hyper_vendor_id = NULL; + const char *hyper_vendor_id = nullptr; hypervisor = mce_sys_var::HYPER_NONE; hyper_vendor_id = cpuid_hv_vendor(); @@ -697,8 +691,8 @@ void mce_sys_var::update_multi_process_params() tx_segs_pool_batch_tcp = 256; rx_num_wr = 1; strq_strides_compensation_level = 32; - strq_stride_size_bytes = 512; - strq_stride_num_per_rwqe = 32; + strq_stride_size_bytes = STRQ_MIN_STRIDE_SIZE_BYTES; + strq_stride_num_per_rwqe = STRQ_MIN_STRIDES_NUM; tx_buf_size = 0; rx_buf_size = 0; } @@ -710,7 +704,7 @@ void mce_sys_var::get_env_params() { int c = 0, len = 0; char *env_ptr; - FILE *fp = NULL; + FILE *fp = nullptr; int app_name_size = MAX_CMD_LINE; // Large buffer size to avoid need for realloc @@ -768,6 +762,7 @@ void mce_sys_var::get_env_params() handle_sigintr = MCE_DEFAULT_HANDLE_SIGINTR; handle_segfault = MCE_DEFAULT_HANDLE_SIGFAULT; stats_fd_num_max = MCE_DEFAULT_STATS_FD_NUM; + stats_fd_num_monitor = MCE_DEFAULT_STATS_FD_NUM; ring_allocation_logic_tx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX; ring_allocation_logic_rx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX; @@ -781,7 +776,6 @@ void mce_sys_var::get_env_params() zc_cache_threshold = MCE_DEFAULT_ZC_CACHE_THRESHOLD; tx_num_bufs = MCE_DEFAULT_TX_NUM_BUFS; tx_buf_size = MCE_DEFAULT_TX_BUF_SIZE; - zc_tx_size = MCE_DEFAULT_ZC_TX_SIZE; tcp_nodelay_treshold = MCE_DEFAULT_TCP_NODELAY_TRESHOLD; tx_num_wr = MCE_DEFAULT_TX_NUM_WRE; tx_num_wr_to_signal = MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL; @@ -843,6 +837,7 @@ void mce_sys_var::get_env_params() progress_engine_wce_max = MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX; cq_keep_qp_full = MCE_DEFAULT_CQ_KEEP_QP_FULL; qp_compensation_level = MCE_DEFAULT_QP_COMPENSATION_LEVEL; + max_tso_sz = MCE_DEFAULT_MAX_TSO_SIZE; user_huge_page_size = MCE_DEFAULT_USER_HUGE_PAGE_SIZE; internal_thread_arm_cq_enabled = MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED; @@ -864,7 +859,7 @@ void mce_sys_var::get_env_params() memory_limit = MCE_DEFAULT_MEMORY_LIMIT; memory_limit_user = MCE_DEFAULT_MEMORY_LIMIT_USER; heap_metadata_block = MCE_DEFAULT_HEAP_METADATA_BLOCK; - hugepage_log2 = MCE_DEFAULT_HUGEPAGE_LOG2; + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; enable_socketxtreme = MCE_DEFAULT_SOCKETXTREME; enable_tso = MCE_DEFAULT_TSO; #ifdef DEFINED_UTLS @@ -911,7 +906,7 @@ void mce_sys_var::get_env_params() /* Configure enable_socketxtreme as first because * this mode has some special predefined parameter limitations */ - if ((env_ptr = getenv(SYS_VAR_SOCKETXTREME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SOCKETXTREME))) { enable_socketxtreme = atoi(env_ptr) ? 
true : false; } if (enable_socketxtreme) { @@ -920,15 +915,11 @@ void mce_sys_var::get_env_params() progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED; } -#if defined(DEFINED_DPCP) - if ((env_ptr = getenv(SYS_VAR_STRQ)) != NULL) { - enable_strq_env = option_strq::from_str(env_ptr, MCE_DEFAULT_STRQ); + if ((env_ptr = getenv(SYS_VAR_STRQ))) { + enable_strq_env = option_3::from_str(env_ptr, MCE_DEFAULT_STRQ); } -#endif - enable_striding_rq = - (enable_strq_env == option_strq::ON || enable_strq_env == option_strq::AUTO); - enable_dpcp_rq = (enable_striding_rq || (enable_strq_env == option_strq::REGULAR_RQ)); + enable_striding_rq = (enable_strq_env == option_3::ON || enable_strq_env == option_3::AUTO); if (enable_striding_rq) { rx_num_bufs = MCE_DEFAULT_STRQ_NUM_BUFS; @@ -937,7 +928,7 @@ void mce_sys_var::get_env_params() qp_compensation_level = MCE_DEFAULT_STRQ_COMPENSATION_LEVEL; } - if ((env_ptr = getenv(SYS_VAR_SPEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SPEC))) { mce_spec = (uint32_t)xlio_spec::from_str(env_ptr, MCE_SPEC_NONE); } @@ -946,7 +937,7 @@ void mce_sys_var::get_env_params() * based on number of workers or application type further. */ #if defined(DEFINED_NGINX) - if ((env_ptr = getenv(SYS_VAR_NGINX_WORKERS_NUM)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_WORKERS_NUM))) { app.workers_num = (uint32_t)atoi(env_ptr); if (app.workers_num > 0) { app.type = APP_NGINX; @@ -1195,31 +1186,31 @@ void mce_sys_var::get_env_params() break; } - if ((env_ptr = getenv(SYS_VAR_PRINT_REPORT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PRINT_REPORT))) { print_report = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME))) { read_env_variable_with_pid(log_filename, sizeof(log_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME))) { read_env_variable_with_pid(stats_filename, sizeof(stats_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME))) { read_env_variable_with_pid(stats_shmem_dirname, sizeof(stats_shmem_dirname), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME))) { read_env_variable_with_pid(conf_filename, sizeof(conf_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SERVICE_DIR)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SERVICE_DIR))) { read_env_variable_with_pid(service_notify_dir, sizeof(service_notify_dir), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SERVICE_ENABLE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SERVICE_ENABLE))) { service_enable = atoi(env_ptr) ? true : false; } if (HYPER_MSHV == hypervisor && !service_enable) { @@ -1228,7 +1219,7 @@ void mce_sys_var::get_env_params() SYS_VAR_SERVICE_ENABLE); } - if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL))) { log_level = log_level::from_str(env_ptr, VLOG_DEFAULT); } @@ -1236,79 +1227,69 @@ void mce_sys_var::get_env_params() log_details = 2; } - if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS))) { log_details = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_LOG_COLORS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_COLORS))) { log_colors = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID))) { read_env_variable_with_pid(app_id, sizeof(app_id), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR))) { handle_sigintr = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV))) { handle_segfault = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM))) { stats_fd_num_max = (uint32_t)atoi(env_ptr); + stats_fd_num_monitor = std::min(stats_fd_num_max, MAX_STATS_FD_NUM); if (stats_fd_num_max > MAX_STATS_FD_NUM) { vlog_printf(VLOG_WARNING, " Can only monitor maximum %d sockets in statistics \n", MAX_STATS_FD_NUM); - stats_fd_num_max = MAX_STATS_FD_NUM; } } read_strq_strides_num(); read_strq_stride_size_bytes(); - if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_NUM_BUFS))) { strq_strides_num_bufs = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_COMPENSATION_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_COMPENSATION_LEVEL))) { strq_strides_compensation_level = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_ZC_CACHE_THRESHOLD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ZC_CACHE_THRESHOLD))) { zc_cache_threshold = option_size::from_str(env_ptr); } bool tx_num_bufs_set = false; - if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS))) { tx_num_bufs = (uint32_t)atoi(env_ptr); tx_num_bufs_set = true; } - if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE))) { tx_buf_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_ZC_TX_SIZE)) != NULL) { - zc_tx_size = (uint32_t)option_size::from_str(env_ptr); - if (zc_tx_size > MCE_MAX_ZC_TX_SIZE) { - vlog_printf(VLOG_WARNING, - "ZC TX size [%u] exceeds the maximum (max=%u), setting to default.\n", - zc_tx_size, MCE_MAX_ZC_TX_SIZE); - zc_tx_size = MCE_DEFAULT_ZC_TX_SIZE; - } - } - - if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY_TRESHOLD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY_TRESHOLD))) { tcp_nodelay_treshold = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE))) { tx_num_wr = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL))) { tx_num_wr_to_signal = std::min(NUM_TX_WRE_TO_SIGNAL_MAX, std::max(1, atoi(env_ptr))); } @@ -1316,7 +1297,7 @@ void mce_sys_var::get_env_params() tx_num_wr = tx_num_wr_to_signal * 2; } - if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE))) { tx_max_inline = (uint32_t)atoi(env_ptr); } if (tx_max_inline > MAX_SUPPORTED_IB_INLINE_SIZE) { @@ -1325,35 +1306,35 @@ void mce_sys_var::get_env_params() tx_max_inline = MAX_SUPPORTED_IB_INLINE_SIZE; } - if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK))) { tx_mc_loopback_default = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS))) { tx_nonblocked_eagains = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES))) { tx_prefetch_bytes = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_BUFS_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_BUFS_BATCH_TCP))) { tx_bufs_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_BATCH_TCP))) { tx_segs_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_RING_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_RING_BATCH_TCP))) { tx_segs_ring_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_POOL_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_POOL_BATCH_TCP))) { tx_segs_pool_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX))) { ring_allocation_logic_tx = (ring_logic_t)atoi(env_ptr); if (!is_ring_logic_valid(ring_allocation_logic_tx)) { vlog_printf(VLOG_WARNING, "%s = %d is not valid, setting logic to default = %d\n", @@ -1363,7 +1344,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX))) { ring_allocation_logic_rx = (ring_logic_t)atoi(env_ptr); if (!is_ring_logic_valid(ring_allocation_logic_rx)) { vlog_printf(VLOG_WARNING, "%s = %d is not valid, setting logic to default = %d\n", @@ -1373,41 +1354,41 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX))) { ring_migration_ratio_tx = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX))) { ring_migration_ratio_rx = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE))) { ring_limit_per_interface = std::max(0, atoi(env_ptr)); } - if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX))) { ring_dev_mem_tx = std::max(0, atoi(env_ptr)); } - if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE))) { tcp_max_syn_rate = std::min(TCP_MAX_SYN_RATE_TOP_LIMIT, std::max(0, atoi(env_ptr))); } bool rx_num_bufs_set = false; - if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS))) { rx_num_bufs = (uint32_t)atoi(env_ptr); rx_num_bufs_set = true; } - if ((env_ptr = getenv(SYS_VAR_RX_BUF_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_BUF_SIZE))) { rx_buf_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV))) { rx_num_wr_to_post_recv = std::min(NUM_RX_WRE_TO_POST_RECV_MAX, std::max(1, atoi(env_ptr))); } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE))) { rx_num_wr = (uint32_t)atoi(env_ptr); } @@ -1425,7 +1406,7 @@ void mce_sys_var::get_env_params() rx_num_wr = rx_num_wr_to_post_recv * 2; } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS))) { rx_poll_num = 
atoi(env_ptr); } if (rx_poll_num < MCE_MIN_RX_NUM_POLLS || rx_poll_num > MCE_MAX_RX_NUM_POLLS) { @@ -1433,7 +1414,7 @@ void mce_sys_var::get_env_params() MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num); rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS; } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT))) { rx_poll_num_init = atoi(env_ptr); } if (rx_poll_num_init < MCE_MIN_RX_NUM_POLLS || rx_poll_num_init > MCE_MAX_RX_NUM_POLLS) { @@ -1445,11 +1426,11 @@ void mce_sys_var::get_env_params() rx_poll_num = 1; // Force at least one good polling loop } - if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE))) { hw_ts_conversion_mode = (ts_conversion_mode_t)atoi(env_ptr); if ((uint32_t)hw_ts_conversion_mode >= TS_CONVERSION_MODE_LAST) { vlog_printf( @@ -1462,32 +1443,32 @@ void mce_sys_var::get_env_params() } // The following 2 params were replaced by SYS_VAR_RX_UDP_POLL_OS_RATIO - if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); vlog_printf(VLOG_WARNING, "The parameter %s is no longer in use. Parameter %s was set to %d instead\n", SYS_VAR_RX_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO, rx_udp_poll_os_ratio); } - if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); vlog_printf(VLOG_WARNING, "The parameter %s is no longer in use. Parameter %s was set to %d instead\n", SYS_VAR_RX_SKIP_OS, SYS_VAR_RX_UDP_POLL_OS_RATIO, rx_udp_poll_os_ratio); } - if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD))) { rx_poll_yield_loops = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS))) { select_handle_cpu_usage_stats = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT))) { rx_ready_byte_min_limit = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES))) { rx_prefetch_bytes = (uint32_t)atoi(env_ptr); } if (rx_prefetch_bytes < MCE_MIN_RX_PREFETCH_BYTES || @@ -1497,7 +1478,7 @@ void mce_sys_var::get_env_params() rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES; } - if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL))) { rx_prefetch_bytes_before_poll = (uint32_t)atoi(env_ptr); } if (rx_prefetch_bytes_before_poll != 0 && @@ -1510,34 +1491,34 @@ void mce_sys_var::get_env_params() rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL; } - if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC))) { rx_cq_drain_rate_nsec = atoi(env_ptr); } // Update the rx cq polling rate for draining logic tscval_t tsc_per_second = get_tsc_rate_per_second(); rx_delta_tsc_between_cq_polls = tsc_per_second * rx_cq_drain_rate_nsec / NSEC_PER_SEC; - if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX))) { gro_streams_max = std::max(atoi(env_ptr), 0); } - if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES))) { tcp_3t_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_UDP_3T_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_UDP_3T_RULES))) { udp_3t_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES))) { eth_mc_l2_only_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_DISABLE_FLOW_TAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DISABLE_FLOW_TAG))) { disable_flow_tag = std::max(atoi(env_ptr), 0) ? true : false; } // mc_force_flowtag must be after disable_flow_tag - if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG))) { mc_force_flowtag = atoi(env_ptr) ? true : false; if (disable_flow_tag) { vlog_printf(VLOG_WARNING, "%s and %s can't be set together. 
Disabling %s\n", @@ -1547,7 +1528,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS))) { select_poll_num = atoi(env_ptr); } @@ -1557,7 +1538,7 @@ void mce_sys_var::get_env_params() select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS; } - if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE))) { select_poll_os_force = (uint32_t)atoi(env_ptr); } @@ -1566,11 +1547,11 @@ void mce_sys_var::get_env_params() select_skip_os_fd_check = 1; } - if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO))) { select_poll_os_ratio = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS))) { select_skip_os_fd_check = (uint32_t)atoi(env_ptr); } @@ -1578,10 +1559,10 @@ void mce_sys_var::get_env_params() if ((mce_spec != MCE_SPEC_NVME_BF2) && (rx_poll_num < 0 || select_poll_num < 0)) { cq_moderation_enable = false; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE))) { cq_moderation_enable = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT))) { cq_moderation_count = (uint32_t)atoi(env_ptr); } @@ -1591,11 +1572,11 @@ void mce_sys_var::get_env_params() cq_moderation_count = max_cq_moderation_count; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC))) { cq_moderation_period_usec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT))) { cq_aim_max_count = (uint32_t)atoi(env_ptr); } @@ -1605,11 +1586,11 @@ void mce_sys_var::get_env_params() cq_aim_max_count = max_cq_aim_max_count; } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC))) { cq_aim_max_period_usec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC))) { cq_aim_interval_msec = (uint32_t)atoi(env_ptr); } @@ -1617,7 +1598,7 @@ void mce_sys_var::get_env_params() cq_aim_interval_msec = MCE_CQ_ADAPTIVE_MODERATION_DISABLED; } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC))) { cq_aim_interrupts_rate_per_sec = (uint32_t)atoi(env_ptr); } #else @@ -1651,7 +1632,7 @@ void mce_sys_var::get_env_params() } #endif /* DEFINED_IBV_CQ_ATTR_MODERATE */ - if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX))) { cq_poll_batch_max = (uint32_t)atoi(env_ptr); } if (cq_poll_batch_max < MCE_MIN_CQ_POLL_BATCH || cq_poll_batch_max > MCE_MAX_CQ_POLL_BATCH) { @@ -1660,7 +1641,7 @@ void mce_sys_var::get_env_params() cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH; } - if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL))) { progress_engine_interval_msec = (uint32_t)atoi(env_ptr); } if (enable_socketxtreme && (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED)) { @@ -1670,41 +1651,41 @@ void mce_sys_var::get_env_params() SYS_VAR_SOCKETXTREME); } - if 
((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX))) { progress_engine_wce_max = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL))) { cq_keep_qp_full = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL))) { qp_compensation_level = (uint32_t)atoi(env_ptr); } if (qp_compensation_level < rx_num_wr_to_post_recv) { qp_compensation_level = rx_num_wr_to_post_recv; } - if ((env_ptr = getenv(SYS_VAR_USER_HUGE_PAGE_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_USER_HUGE_PAGE_SIZE))) { user_huge_page_size = option_size::from_str(env_ptr); if (user_huge_page_size == 0) { user_huge_page_size = g_hugepage_mgr.get_default_hugepage(); } } - if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS))) { offloaded_sockets = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC))) { timer_resolution_msec = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC))) { tcp_timer_resolution_msec = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD))) { tcp_ctl_thread = option_tcp_ctl_thread::from_str(env_ptr, MCE_DEFAULT_TCP_CTL_THREAD); if (tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { if (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED) { @@ -1727,7 +1708,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION))) { tcp_ts_opt = (tcp_ts_opt_t)atoi(env_ptr); if ((uint32_t)tcp_ts_opt >= TCP_TS_OPTION_LAST) { vlog_printf(VLOG_WARNING, @@ -1739,30 +1720,30 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY))) { tcp_nodelay = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK))) { tcp_quickack = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_PUSH_FLAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_PUSH_FLAG))) { tcp_push_flag = atoi(env_ptr) ? true : false; } // TODO: this should be replaced by calling "exception_handling.init()" that will be called from // init() - if ((env_ptr = getenv(xlio_exception_handling::getSysVar())) != NULL) { - exception_handling = xlio_exception_handling( - strtol(env_ptr, NULL, 10)); // xlio_exception_handling is responsible for its invariant + if ((env_ptr = getenv(xlio_exception_handling::getSysVar()))) { + exception_handling = xlio_exception_handling(strtol( + env_ptr, nullptr, 10)); // xlio_exception_handling is responsible for its invariant } - if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD))) { avoid_sys_calls_on_tcp_fd = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT))) { allow_privileged_sock_opt = atoi(env_ptr) ? 
true : false; } @@ -1775,16 +1756,16 @@ void mce_sys_var::get_env_params() tcp_timer_resolution_msec = timer_resolution_msec; } - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ))) { internal_thread_arm_cq_enabled = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET))) { snprintf(internal_thread_cpuset, FILENAME_MAX, "%s", env_ptr); } // handle internal thread affinity - default is CPU-0 - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY))) { int n = snprintf(internal_thread_affinity_str, sizeof(internal_thread_affinity_str), "%s", env_ptr); if (unlikely(((int)sizeof(internal_thread_affinity_str) < n) || (n < 0))) { @@ -1797,18 +1778,18 @@ void mce_sys_var::get_env_params() internal_thread_affinity_str); } - if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC))) { wait_after_join_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_THREAD_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_THREAD_MODE))) { thread_mode = (thread_mode_t)atoi(env_ptr); if (thread_mode < 0 || thread_mode >= THREAD_MODE_LAST) { thread_mode = MCE_DEFAULT_THREAD_MODE; } } - if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE))) { buffer_batching_mode = (buffer_batching_mode_t)atoi(env_ptr); if (buffer_batching_mode < 0 || buffer_batching_mode >= BUFFER_BATCHING_LAST) { buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE; @@ -1821,24 +1802,24 @@ void mce_sys_var::get_env_params() rx_bufs_batch = 1; } - if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC))) { timer_netlink_update_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES))) { neigh_num_err_retries = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC))) { neigh_wait_till_send_arp_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA))) { neigh_uc_arp_quata = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE))) { mem_alloc_type = option_alloc_type::from_str(env_ptr, MCE_DEFAULT_MEM_ALLOC_TYPE); } - if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT))) { memory_limit = option_size::from_str(env_ptr) ?: MCE_DEFAULT_MEMORY_LIMIT; } else { /* @@ -1880,47 +1861,42 @@ void mce_sys_var::get_env_params() memory_limit = std::max(memory_limit, memory_limit_est); } } - if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT_USER)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT_USER))) { memory_limit_user = option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HEAP_METADATA_BLOCK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HEAP_METADATA_BLOCK))) { heap_metadata_block = option_size::from_str(env_ptr) ?: MCE_DEFAULT_HEAP_METADATA_BLOCK; } - if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_LOG2)) != NULL) { - unsigned val = (unsigned)atoi(env_ptr); - - // mmap() uses 6 bits for the hugepage size 
log2 - if (val < 64U) { - hugepage_log2 = val; - } else { - hugepage_log2 = MCE_DEFAULT_HUGEPAGE_LOG2; - vlog_printf(VLOG_WARNING, "%s parameter can be in range [0, 63], but set to %u\n", - SYS_VAR_HUGEPAGE_LOG2, val); + if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_SIZE))) { + hugepage_size = option_size::from_str(env_ptr); + if (hugepage_size & (hugepage_size - 1)) { + vlog_printf(VLOG_WARNING, "%s must be a power of 2. Fallback to default value (%s)\n", + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(MCE_DEFAULT_HUGEPAGE_SIZE)); + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; } - if (hugepage_log2 != 0 && !g_hugepage_mgr.is_hugepage_supported(1LU << hugepage_log2)) { - vlog_printf(VLOG_WARNING, - "Requested hugepage %zu kB is not supported. " - "XLIO will autodetect optimal hugepage.", - (1LU << hugepage_log2) / 1024LU); - /* Value 0 means default autodetection behavior. Don't set MCE_DEFAULT_HUGEPAGE_LOG2 - * here, because it can be defined to an unsupported specific value. - */ - hugepage_log2 = 0; + if (hugepage_size > MCE_MAX_HUGEPAGE_SIZE) { + vlog_printf(VLOG_WARNING, "%s exceeds maximum possible hugepage size (%s)\n", + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(MCE_MAX_HUGEPAGE_SIZE)); + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; } } - if ((env_ptr = getenv(SYS_VAR_BF)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_BF))) { handle_bf = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_FORK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_FORK))) { handle_fork = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TSO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TSO))) { enable_tso = option_3::from_str(env_ptr, MCE_DEFAULT_TSO); } + if ((env_ptr = getenv(SYS_VAR_MAX_TSO_SIZE))) { + max_tso_sz = option_size::from_str(env_ptr); + } + if ((enable_tso != option_3::OFF) && (ring_migration_ratio_tx != -1)) { ring_migration_ratio_tx = -1; vlog_printf(VLOG_DEBUG, "%s parameter is forced to %d in case %s is enabled\n", @@ -1950,67 +1926,67 @@ void mce_sys_var::get_env_params() } #endif /* DEFINED_UTLS */ - if ((env_ptr = getenv(SYS_VAR_LRO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LRO))) { enable_lro = option_3::from_str(env_ptr, MCE_DEFAULT_LRO); } - if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2))) { close_on_dup2 = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_MTU)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MTU))) { mtu = (uint32_t)atoi(env_ptr); } #if defined(DEFINED_NGINX) - if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_SIZE))) { nginx_udp_socket_pool_size = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_RX_NUM_BUFFS_REUSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_RX_NUM_BUFFS_REUSE))) { nginx_udp_socket_pool_rx_num_buffs_reuse = (uint32_t)atoi(env_ptr); } #endif // DEFINED_NGINX #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if ((env_ptr = getenv(SYS_VAR_SRC_PORT_STRIDE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SRC_PORT_STRIDE))) { app.src_port_stride = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_DISTRIBUTE_CQ)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DISTRIBUTE_CQ))) { app.distribute_cq_interrupts = atoi(env_ptr) ? 
true : false; } #endif - if ((env_ptr = getenv(SYS_VAR_MSS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MSS))) { lwip_mss = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO))) { lwip_cc_algo_mod = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_DEFERRED_CLOSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DEFERRED_CLOSE))) { deferred_close = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_ABORT_ON_CLOSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_ABORT_ON_CLOSE))) { tcp_abort_on_close = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_POLL_ON_TX_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_ON_TX_TCP))) { rx_poll_on_tx_tcp = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_CQ_WAIT_CTRL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_CQ_WAIT_CTRL))) { rx_cq_wait_ctrl = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TRIGGER_DUMMY_SEND_GETSOCKNAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TRIGGER_DUMMY_SEND_GETSOCKNAME))) { trigger_dummy_send_getsockname = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_SEND_BUFFER_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_SEND_BUFFER_SIZE))) { tcp_send_buffer_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SKIP_POLL_IN_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SKIP_POLL_IN_RX))) { int temp = atoi(env_ptr); if (temp < 0 || temp > SKIP_POLL_IN_RX_EPOLL_ONLY) { temp = 0; @@ -2018,7 +1994,7 @@ void mce_sys_var::get_env_params() skip_poll_in_rx = (skip_poll_in_rx_t)temp; } - if ((env_ptr = getenv(SYS_VAR_MULTILOCK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MULTILOCK))) { int temp = atoi(env_ptr); if (temp < 0 || temp > MULTILOCK_MUTEX) { temp = 0; @@ -2069,10 +2045,10 @@ void set_env_params() } // Don't override user defined values. 
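// Illustrative aside (not part of the patch): the pattern used just below, reduced to a
// minimal helper. A default is exported only when the variable is absent, so any value the
// user already set always wins. setenv() with overwrite=0 already guarantees this; the
// getenv() check simply makes the intent explicit. The names in the usage line are
// placeholders, not the library's variables.
#include <cstdlib>

static void set_default_env(const char *name, const char *value)
{
    if (!getenv(name)) {        // keep whatever the user already exported
        setenv(name, value, 0); // overwrite=0: never clobber an existing value
    }
}
// e.g. set_default_env("EXAMPLE_ALLOC_TYPE", "ANON");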
- if (getenv("MLX_QP_ALLOC_TYPE") == nullptr) { + if (!getenv("MLX_QP_ALLOC_TYPE")) { setenv("MLX_QP_ALLOC_TYPE", ibv_alloc_type, 0); } - if (getenv("MLX_CQ_ALLOC_TYPE") == nullptr) { + if (!getenv("MLX_CQ_ALLOC_TYPE")) { setenv("MLX_CQ_ALLOC_TYPE", ibv_alloc_type, 0); } } diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 9dd1aaa22..c97c1f9da 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -208,11 +208,6 @@ typedef enum { AUTO_ON_OFF_DEF } mode_t; OPTIONS_FROM_TO_STR_DEF; } // namespace option_3 -namespace option_strq { -typedef enum { AUTO_ON_OFF_DEF, REGULAR_RQ = 2 } mode_t; -OPTIONS_FROM_TO_STR_DEF; -} // namespace option_strq - namespace option_tcp_ctl_thread { typedef enum { CTL_THREAD_DISABLE = 0, @@ -364,6 +359,7 @@ struct mce_sys_var { bool handle_sigintr; bool handle_segfault; uint32_t stats_fd_num_max; + uint32_t stats_fd_num_monitor; ring_logic_t ring_allocation_logic_tx; ring_logic_t ring_allocation_logic_rx; @@ -376,7 +372,6 @@ struct mce_sys_var { size_t zc_cache_threshold; uint32_t tx_num_bufs; uint32_t tx_buf_size; - uint32_t zc_tx_size; uint32_t tcp_nodelay_treshold; uint32_t tx_num_wr; uint32_t tx_num_wr_to_signal; @@ -415,7 +410,6 @@ struct mce_sys_var { bool disable_flow_tag; bool enable_striding_rq; - bool enable_dpcp_rq; bool tcp_3t_rules; bool udp_3t_rules; bool eth_mc_l2_only_rules; @@ -440,6 +434,7 @@ struct mce_sys_var { uint32_t progress_engine_wce_max; bool cq_keep_qp_full; uint32_t qp_compensation_level; + uint32_t max_tso_sz; size_t user_huge_page_size; bool offloaded_sockets; @@ -460,7 +455,7 @@ struct mce_sys_var { size_t memory_limit; size_t memory_limit_user; size_t heap_metadata_block; - uint8_t hugepage_log2; + size_t hugepage_size; bool handle_fork; bool close_on_dup2; uint32_t mtu; /* effective MTU. 
If mtu==0 then auto calculate the MTU */ @@ -477,7 +472,7 @@ struct mce_sys_var { bool enable_socketxtreme; option_3::mode_t enable_tso; option_3::mode_t enable_lro; - option_strq::mode_t enable_strq_env; + option_3::mode_t enable_strq_env; #ifdef DEFINED_UTLS bool enable_utls_rx; bool enable_utls_tx; @@ -583,7 +578,6 @@ extern mce_sys_var &safe_mce_sys(); #define SYS_VAR_ZC_CACHE_THRESHOLD "XLIO_ZC_CACHE_THRESHOLD" #define SYS_VAR_TX_NUM_BUFS "XLIO_TX_BUFS" #define SYS_VAR_TX_BUF_SIZE "XLIO_TX_BUF_SIZE" -#define SYS_VAR_ZC_TX_SIZE "XLIO_ZC_TX_SIZE" #define SYS_VAR_TCP_NODELAY_TRESHOLD "XLIO_TCP_NODELAY_TRESHOLD" #define SYS_VAR_TX_NUM_WRE "XLIO_TX_WRE" #define SYS_VAR_TX_NUM_WRE_TO_SIGNAL "XLIO_TX_WRE_BATCHING" @@ -644,6 +638,7 @@ extern mce_sys_var &safe_mce_sys(); #define SYS_VAR_PROGRESS_ENGINE_WCE_MAX "XLIO_PROGRESS_ENGINE_WCE_MAX" #define SYS_VAR_CQ_KEEP_QP_FULL "XLIO_CQ_KEEP_QP_FULL" #define SYS_VAR_QP_COMPENSATION_LEVEL "XLIO_QP_COMPENSATION_LEVEL" +#define SYS_VAR_MAX_TSO_SIZE "XLIO_MAX_TSO_SIZE" #define SYS_VAR_USER_HUGE_PAGE_SIZE "XLIO_USER_HUGE_PAGE_SIZE" #define SYS_VAR_OFFLOADED_SOCKETS "XLIO_OFFLOADED_SOCKETS" #define SYS_VAR_TIMER_RESOLUTION_MSEC "XLIO_TIMER_RESOLUTION_MSEC" @@ -662,7 +657,7 @@ extern mce_sys_var &safe_mce_sys(); #define SYS_VAR_MEMORY_LIMIT "XLIO_MEMORY_LIMIT" #define SYS_VAR_MEMORY_LIMIT_USER "XLIO_MEMORY_LIMIT_USER" #define SYS_VAR_HEAP_METADATA_BLOCK "XLIO_HEAP_METADATA_BLOCK" -#define SYS_VAR_HUGEPAGE_LOG2 "XLIO_HUGEPAGE_LOG2" +#define SYS_VAR_HUGEPAGE_SIZE "XLIO_HUGEPAGE_SIZE" #define SYS_VAR_FORK "XLIO_FORK" #define SYS_VAR_BF "XLIO_BF" #define SYS_VAR_CLOSE_ON_DUP2 "XLIO_CLOSE_ON_DUP2" @@ -731,7 +726,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_APP_ID ("XLIO_DEFAULT_APPLICATION_ID") #define MCE_DEFAULT_HANDLE_SIGINTR (true) #define MCE_DEFAULT_HANDLE_SIGFAULT (false) -#define MCE_DEFAULT_STATS_FD_NUM 100 +#define MCE_DEFAULT_STATS_FD_NUM 0 #define MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX (RING_LOGIC_PER_INTERFACE) #define MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX (RING_LOGIC_PER_INTERFACE) #define MCE_DEFAULT_RING_MIGRATION_RATIO_TX (-1) @@ -739,7 +734,6 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_RING_LIMIT_PER_INTERFACE (0) #define MCE_DEFAULT_RING_DEV_MEM_TX (0) #define MCE_DEFAULT_TCP_MAX_SYN_RATE (0) -#define MCE_DEFAULT_ZC_TX_SIZE (32768) #define MCE_DEFAULT_TCP_NODELAY_TRESHOLD (0) #define MCE_DEFAULT_ZC_CACHE_THRESHOLD (10LU * 1024 * 1024 * 1024) // 10GB #define MCE_DEFAULT_TX_NUM_BUFS (200000) @@ -758,12 +752,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_TX_SEGS_POOL_BATCH_TCP (16384) #define MCE_DEFAULT_TX_NUM_SGE (4) -#if defined(DEFINED_DPCP) -#define MCE_DEFAULT_STRQ (option_strq::ON) -#else -#define MCE_DEFAULT_STRQ (option_strq::OFF) -#endif - +#define MCE_DEFAULT_STRQ (option_3::ON) #define MCE_DEFAULT_STRQ_NUM_STRIDES (16384) #define MCE_DEFAULT_STRQ_STRIDE_SIZE_BYTES (512) #define MCE_DEFAULT_STRQ_NUM_BUFS (64) @@ -835,7 +824,8 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_MEMORY_LIMIT (2LU * 1024 * 1024 * 1024) #define MCE_DEFAULT_MEMORY_LIMIT_USER (0) #define MCE_DEFAULT_HEAP_METADATA_BLOCK (32LU * 1024 * 1024) -#define MCE_DEFAULT_HUGEPAGE_LOG2 (0) +#define MCE_DEFAULT_HUGEPAGE_SIZE (0) +#define MCE_MAX_HUGEPAGE_SIZE (1ULL << 63ULL) #define MCE_DEFAULT_FORK_SUPPORT (true) #define MCE_DEFAULT_BF_FLAG (true) #define MCE_DEFAULT_CLOSE_ON_DUP2 (true) @@ -866,7 +856,6 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_MAX_RX_NUM_POLLS (100000000) #define MCE_MIN_RX_PREFETCH_BYTES (32) 
/* Just enough for headers (IPoIB+IP+UDP)*/ #define MCE_MAX_RX_PREFETCH_BYTES (2044) -#define MCE_MAX_ZC_TX_SIZE (65535) #define MCE_RX_CQ_DRAIN_RATE_DISABLED (0) #define MCE_CQ_DRAIN_INTERVAL_DISABLED (0) #define MCE_CQ_ADAPTIVE_MODERATION_DISABLED (0) @@ -874,6 +863,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_MAX_CQ_POLL_BATCH (128) #define MCE_DEFAULT_SOCKETXTREME (false) #define MCE_DEFAULT_TSO (option_3::AUTO) +#define MCE_DEFAULT_MAX_TSO_SIZE (256 * 1024) #ifdef DEFINED_UTLS #define MCE_DEFAULT_UTLS_RX (false) #define MCE_DEFAULT_UTLS_TX (true) @@ -928,7 +918,7 @@ extern mce_sys_var &safe_mce_sys(); #define NETVSC_DEVICE_UPPER_FILE "/sys/class/net/%s/upper_%s/ifindex" #define NETVSC_ID "{f8615163-df3e-46c5-913f-f2d2f965ed0e}\n" -#define MAX_STATS_FD_NUM 1024 +#define MAX_STATS_FD_NUM 1024U #define MAX_WINDOW_SCALING 14 #define STRQ_MIN_STRIDES_NUM 512 diff --git a/src/core/util/sysctl_reader.h b/src/core/util/sysctl_reader.h index 86c814036..475e1917b 100644 --- a/src/core/util/sysctl_reader.h +++ b/src/core/util/sysctl_reader.h @@ -57,7 +57,7 @@ class sysctl_reader_t { FILE *pfile = fopen(path, "r"); int ans; - if (pfile == NULL) { + if (!pfile) { return -1; } diff --git a/src/core/util/utils.cpp b/src/core/util/utils.cpp index a4016d104..601113572 100644 --- a/src/core/util/utils.cpp +++ b/src/core/util/utils.cpp @@ -133,7 +133,7 @@ int get_base_interface_name(const char *if_name, char *base_ifname, size_t sz_ba } BULLSEYE_EXCLUDE_BLOCK_END - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { if (!strcmp(ifa->ifa_name, if_name)) { continue; } @@ -495,7 +495,7 @@ void set_fd_block_mode(int fd, bool b_block) { __log_dbg("fd[%d]: setting to %sblocking mode (%d)", fd, b_block ? "" : "non-", b_block); - int flags = orig_os_api.fcntl(fd, F_GETFL); + int flags = SYSCALL(fcntl, fd, F_GETFL); BULLSEYE_EXCLUDE_BLOCK_START if (flags < 0) { __log_err("failed reading fd[%d] flag (rc=%d errno=%d %m)", fd, flags, errno); @@ -509,7 +509,7 @@ void set_fd_block_mode(int fd, bool b_block) flags |= O_NONBLOCK; } - int ret = orig_os_api.fcntl(fd, F_SETFL, flags); + int ret = SYSCALL(fcntl, fd, F_SETFL, flags); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("failed changing fd[%d] to %sblocking mode (rc=%d errno=%d %s)", fd, @@ -544,20 +544,20 @@ int priv_read_file(const char *path, char *buf, size_t size, vlog_levels_t log_level /*= VLOG_ERROR*/) { int len = -1; - int fd = open(path, O_RDONLY); + int fd = SYSCALL(open, path, O_RDONLY); BULLSEYE_EXCLUDE_BLOCK_START if (fd < 0) { VLOG_PRINTF(log_level, "ERROR while opening file %s (errno %d %m)", path, errno); return -1; } BULLSEYE_EXCLUDE_BLOCK_END - len = read(fd, buf, size); + len = SYSCALL(read, fd, buf, size); BULLSEYE_EXCLUDE_BLOCK_START if (len < 0) { VLOG_PRINTF(log_level, "ERROR while reading from file %s (errno %d %m)", path, errno); } BULLSEYE_EXCLUDE_BLOCK_END - close(fd); + SYSCALL(close, fd); return len; } @@ -583,13 +583,13 @@ int get_port_from_ifname(const char *ifname) snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_PORT_PARAM_FILE, ifname); if (priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { dev_port = - strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + strtol(num_buf, nullptr, 0); // base=0 means strtol() can parse hexadecimal and decimal __log_dbg("dev_port file=%s dev_port str=%s dev_port val=%d", dev_path, num_buf, dev_port); } snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_ID_PARAM_FILE, ifname); if 
(priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { dev_id = - strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + strtol(num_buf, nullptr, 0); // base=0 means strtol() can parse hexadecimal and decimal __log_dbg("dev_id file= %s dev_id str=%s dev_id val=%d", dev_path, num_buf, dev_id); } @@ -674,20 +674,20 @@ class socket_context_manager { .tv_usec = 10, }; - m_fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + m_fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (m_fd < 0) { throw std::runtime_error("Open netlink socket failed"); } - if (orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_RCVTIMEO, (const char *)&tv, sizeof tv)) { - close(m_fd); + if (SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_RCVTIMEO, (const char *)&tv, sizeof tv)) { + SYSCALL(close, m_fd); throw std::runtime_error("Setsockopt non-blocking failed"); } } socket_context_manager(int fd) noexcept : m_fd(fd) {}; - ~socket_context_manager() { close(m_fd); }; + ~socket_context_manager() { SYSCALL(close, m_fd); }; void send_getaddr_request(uint8_t family) { @@ -709,7 +709,7 @@ class socket_context_manager { iovec iov = {&msg_buf, msg_buf.nl.nlmsg_len}; msghdr msg = {&sa, sizeof(sa), &iov, 1, nullptr, 0, 0}; - if (orig_os_api.sendmsg(m_fd, &msg, 0) < 0) { + if (SYSCALL(sendmsg, m_fd, &msg, 0) < 0) { throw std::runtime_error("Send RTM_GETADDR request failed"); } } @@ -720,7 +720,7 @@ class socket_context_manager { iovec iov = {&m_buf, m_buf.size()}; msghdr msg = {&sa, sizeof(sa), &iov, 1, nullptr, 0, 0}; - return orig_os_api.recvmsg(m_fd, &msg, 0); + return SYSCALL(recvmsg, m_fd, &msg, 0); } nlmsghdr *get_nlmsghdr() { return reinterpret_cast(&m_buf); } @@ -837,7 +837,7 @@ uint16_t get_vlan_id_from_ifname(const char *ifname) { // find vlan id from interface name struct vlan_ioctl_args ifr; - int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + int fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); @@ -847,15 +847,15 @@ uint16_t get_vlan_id_from_ifname(const char *ifname) ifr.cmd = GET_VLAN_VID_CMD; strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); - if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 0) { + if (SYSCALL(ioctl, fd, SIOCGIFVLAN, &ifr) < 0) { __log_dbg( "Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_VID_CMD) for interface '%s' (errno=%d %m)", ifname, errno); - orig_os_api.close(fd); + SYSCALL(close, fd); return 0; } - orig_os_api.close(fd); + SYSCALL(close, fd); __log_dbg("found vlan id '%d' for interface '%s'", ifr.u.VID, ifname); @@ -866,7 +866,7 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz { // find vlan base name from interface name struct vlan_ioctl_args ifr; - int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + int fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -875,15 +875,15 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz ifr.cmd = GET_VLAN_REALDEV_NAME_CMD; strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); - if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 0) { + if (SYSCALL(ioctl, fd, SIOCGIFVLAN, &ifr) < 0) { __log_dbg("Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_REALDEV_NAME_CMD) for interface '%s' " "(errno=%d %m)", ifname, errno); - orig_os_api.close(fd); + SYSCALL(close, fd); return 0; } - orig_os_api.close(fd); + SYSCALL(close, fd); size_t name_len = strlen(ifr.u.device2); if (base_ifname && name_len > 0) { @@ -924,7 
+924,7 @@ int run_and_retreive_system_command(const char *cmd_line, char *return_str, int if (file) { int fd = fileno(file); if (fd > 0) { - int actual_len = read(fd, return_str, return_str_len - 1); + int actual_len = SYSCALL(read, fd, return_str, return_str_len - 1); if (actual_len > 0) { return_str[actual_len] = '\0'; } else { @@ -991,9 +991,9 @@ size_t get_local_ll_addr(IN const char *ifname, OUT unsigned char *addr, IN int bool check_bond_device_exist(const char *ifname) { int ret = 0; - struct nl_cache *cache = NULL; - struct rtnl_link *link = NULL; - char *link_type = NULL; + struct nl_cache *cache = nullptr; + struct rtnl_link *link = nullptr; + char *link_type = nullptr; struct nl_sock *nl_socket = nl_socket_alloc(); if (!nl_socket) { @@ -1014,7 +1014,7 @@ bool check_bond_device_exist(const char *ifname) } link_type = rtnl_link_get_type(link); if (link_type && (strcmp(link_type, "bond") != 0)) { - link_type = NULL; + link_type = nullptr; } out: if (link) { @@ -1043,12 +1043,12 @@ bool get_bond_name(IN const char *ifname, OUT char *bond_name, IN int sz) return ret; } - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { snprintf(upper_path, sizeof(upper_path), NETVSC_DEVICE_UPPER_FILE, base_ifname, ifa->ifa_name); - int fd = open(upper_path, O_RDONLY); + int fd = SYSCALL(open, upper_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); if (IFNAMSIZ <= sz) { memcpy(bond_name, ifa->ifa_name, IFNAMSIZ); } @@ -1106,12 +1106,12 @@ bool get_netvsc_slave(IN const char *ifname, OUT char *slave_name, OUT unsigned return ret; } - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { snprintf(netvsc_path, sizeof(netvsc_path), NETVSC_DEVICE_LOWER_FILE, base_ifname, ifa->ifa_name); - int fd = open(netvsc_path, O_RDONLY); + int fd = SYSCALL(open, netvsc_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); memcpy(slave_name, ifa->ifa_name, IFNAMSIZ); slave_flags = ifa->ifa_flags; __log_dbg("Found slave_name = %s, slave_flags = %u", slave_name, slave_flags); @@ -1186,9 +1186,9 @@ bool check_device_exist(const char *ifname, const char *path) n = snprintf(device_path, sizeof(device_path), path, ifname); if (likely((0 < n) && (n < (int)sizeof(device_path)))) { - fd = orig_os_api.open(device_path, O_RDONLY); + fd = SYSCALL(open, device_path, O_RDONLY); if (fd >= 0) { - orig_os_api.close(fd); + SYSCALL(close, fd); } if (fd < 0 && errno == EMFILE) { __log_warn("There are no free fds in the system. 
This may cause unexpected behavior"); @@ -1212,9 +1212,9 @@ bool check_device_name_ib_name(const char *ifname, const char *ibname) n = snprintf(ib_path, sizeof(ib_path), "/sys/class/infiniband/%s/device/net/%s/ifindex", ibname, str_ifname); if (likely((0 < n) && (n < (int)sizeof(ib_path)))) { - fd = open(ib_path, O_RDONLY); + fd = SYSCALL(open, ib_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); return true; } } @@ -1303,7 +1303,7 @@ int validate_tso(int if_index) struct ifreq req; struct ethtool_value eval; - fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -1313,13 +1313,13 @@ int validate_tso(int if_index) req.ifr_ifindex = if_index; if_indextoname(if_index, req.ifr_name); req.ifr_data = (char *)&eval; - ret = orig_os_api.ioctl(fd, SIOCETHTOOL, &req); + ret = SYSCALL(ioctl, fd, SIOCETHTOOL, &req); if (ret < 0) { __log_dbg("ioctl(SIOCETHTOOL) cmd=ETHTOOL_GTSO (errno=%d %m)", errno); } else { ret = eval.data; } - orig_os_api.close(fd); + SYSCALL(close, fd); return ret; #else NOT_IN_USE(if_index); @@ -1335,7 +1335,7 @@ int validate_lro(int if_index) struct ifreq req; struct ethtool_value eval; - fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -1345,13 +1345,13 @@ int validate_lro(int if_index) req.ifr_ifindex = if_index; if_indextoname(if_index, req.ifr_name); req.ifr_data = (char *)&eval; - ret = orig_os_api.ioctl(fd, SIOCETHTOOL, &req); + ret = SYSCALL(ioctl, fd, SIOCETHTOOL, &req); if (ret < 0) { __log_dbg("ioctl(SIOCETHTOOL) cmd=ETHTOOL_GFLAGS (errno=%d %m)", errno); } else { ret = (eval.data & ETH_FLAG_LRO ? 
1 : 0); } - orig_os_api.close(fd); + SYSCALL(close, fd); return ret; #else NOT_IN_USE(if_index); diff --git a/src/core/util/utils.h b/src/core/util/utils.h index 0293436cb..af11bbbe0 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -279,8 +279,8 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz size_t get_local_ll_addr(const char *ifname, unsigned char *addr, int addr_len, bool is_broadcast); /* Print warning while RoCE Lag is enabled */ -void print_roce_lag_warnings(const char *interface, char *disable_path = NULL, - const char *port1 = NULL, const char *port2 = NULL); +void print_roce_lag_warnings(const char *interface, char *disable_path = nullptr, + const char *port1 = nullptr, const char *port2 = nullptr); /*Print a warning to the user when there was an error registering memory*/ void print_warning_rlimit_memlock(size_t length, int error); @@ -323,10 +323,10 @@ static inline int get_procname(int pid, char *proc, size_t size) { char app_full_name[PATH_MAX] = {0}; char proccess_proc_dir[FILE_NAME_MAX_SIZE] = {0}; - char *app_base_name = NULL; + char *app_base_name = nullptr; int n = -1; - if (NULL == proc) { + if (!proc) { return -1; } @@ -351,7 +351,7 @@ static inline int get_procname(int pid, char *proc, size_t size) inline void create_multicast_mac_from_ip(unsigned char *mc_mac, const ip_address &addr, sa_family_t family) { - if (mc_mac == NULL) { + if (!mc_mac) { return; } diff --git a/src/core/util/vtypes.h b/src/core/util/vtypes.h index 181790a7c..15138ca30 100644 --- a/src/core/util/vtypes.h +++ b/src/core/util/vtypes.h @@ -41,6 +41,7 @@ #include "utils/types.h" #include "utils/bullseye.h" + #ifndef IN #define IN #endif @@ -53,7 +54,11 @@ #define INOUT #endif -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) +#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined" +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ static inline uint64_t htonll(uint64_t x) { return bswap_64(x); @@ -62,7 +67,7 @@ static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ static inline uint64_t htonll(uint64_t x) { return x; @@ -72,7 +77,7 @@ static inline uint64_t ntohll(uint64_t x) return x; } #else -#error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN +#error __BYTE_ORDER__ is neither __ORDER_LITTLE_ENDIAN__ nor __ORDER_BIG_ENDIAN__ #endif #define likely(x) __builtin_expect(!!(x), 1) @@ -96,7 +101,7 @@ static inline uint64_t ntohll(uint64_t x) (uint8_t)(((ip) >> 24) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), (uint8_t)(((ip) >> 8) & 0xff), \ (uint8_t)((ip)&0xff) -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* The host byte order is the same as network byte order, so these functions are all just identity. 
*/ @@ -104,13 +109,11 @@ static inline uint64_t ntohll(uint64_t x) #define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) #define HIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#else -#if __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) #define HIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#endif #endif #define ETH_HW_ADDR_PRINT_FMT "%02x:%02x:%02x:%02x:%02x:%02x" diff --git a/src/core/util/wakeup.cpp b/src/core/util/wakeup.cpp index a21e0284e..f15d60dce 100644 --- a/src/core/util/wakeup.cpp +++ b/src/core/util/wakeup.cpp @@ -49,21 +49,21 @@ #undef MODULE_HDR_INFO #define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " #undef __INFO__ -#define __INFO__ m_epfd +#define __INFO__ m_wakeup_epfd wakeup::wakeup() { - m_epfd = 0; + m_wakeup_epfd = 0; m_is_sleeping = 0; memset(&m_ev, 0, sizeof(m_ev)); } void wakeup::going_to_sleep() { BULLSEYE_EXCLUDE_BLOCK_START - if (likely(m_epfd)) { + if (likely(m_wakeup_epfd)) { m_is_sleeping++; } else { - wkup_logerr(" m_epfd is not initialized - cannot use wakeup mechanism\n"); + wkup_logerr(" m_wakeup_epfd is not initialized - cannot use wakeup mechanism\n"); m_is_sleeping = 0; } BULLSEYE_EXCLUDE_BLOCK_END @@ -71,5 +71,5 @@ void wakeup::going_to_sleep() void wakeup::wakeup_set_epoll_fd(int epfd) { - m_epfd = epfd; + m_wakeup_epfd = epfd; } diff --git a/src/core/util/wakeup.h b/src/core/util/wakeup.h index 193223c72..3b3a635b4 100644 --- a/src/core/util/wakeup.h +++ b/src/core/util/wakeup.h @@ -48,15 +48,11 @@ class wakeup { void going_to_sleep(); void return_from_sleep() { --m_is_sleeping; }; void wakeup_clear() { m_is_sleeping = 0; } + void wakeup_set_epoll_fd(int epfd); protected: - virtual void wakeup_set_epoll_fd(int epfd); int m_is_sleeping; - - // lock_spin_recursive m_wakeup_lock; This lock is not needed for now. Maybe we will need it for - // epoll. 
- - int m_epfd; + int m_wakeup_epfd; struct epoll_event m_ev; }; diff --git a/src/core/util/wakeup_pipe.cpp b/src/core/util/wakeup_pipe.cpp index 32afa180c..8bac8886c 100644 --- a/src/core/util/wakeup_pipe.cpp +++ b/src/core/util/wakeup_pipe.cpp @@ -49,7 +49,7 @@ #undef MODULE_HDR_INFO #define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " #undef __INFO__ -#define __INFO__ m_epfd +#define __INFO__ m_wakeup_epfd #define UNINIT_PIPE_FD (-1) int wakeup_pipe::g_wakeup_pipes[2] = {UNINIT_PIPE_FD, UNINIT_PIPE_FD}; @@ -60,18 +60,18 @@ wakeup_pipe::wakeup_pipe() int ref = atomic_fetch_and_inc(&ref_count); if (ref == 0) { BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.pipe(g_wakeup_pipes)) { + if (SYSCALL(pipe, g_wakeup_pipes)) { wkup_logpanic("wakeup pipe create failed (errno=%d %m)", errno); } - if (orig_os_api.write(g_wakeup_pipes[1], "^", 1) != 1) { + if (SYSCALL(write, g_wakeup_pipes[1], "^", 1) != 1) { wkup_logpanic("wakeup pipe write failed(errno=%d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END wkup_logdbg("created wakeup pipe [RD=%d, WR=%d]", g_wakeup_pipes[0], g_wakeup_pipes[1]); // ToDo - these pipe should be closed at some point - // orig_os_api.close(g_si_wakeup_pipes[1]); - // orig_os_api.close(g_si_wakeup_pipes[0]); + // SYSCALL(close, g_si_wakeup_pipes[1]); + // SYSCALL(close, g_si_wakeup_pipes[0]); } m_ev.events = EPOLLIN; @@ -96,7 +96,7 @@ void wakeup_pipe::do_wakeup() int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && + if ((SYSCALL(epoll_ctl, m_wakeup_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && (errno != EEXIST)) { wkup_logerr("Failed to add wakeup fd to internal epfd (errno=%d %m)", errno); } @@ -114,7 +114,7 @@ void wakeup_pipe::remove_wakeup_fd() } wkup_entry_dbg(""); int tmp_errno = errno; - if (orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], NULL)) { + if (SYSCALL(epoll_ctl, m_wakeup_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], nullptr)) { BULLSEYE_EXCLUDE_BLOCK_START if (errno == ENOENT) { wkup_logdbg("Failed to delete global pipe from internal epfd it was already deleted"); diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index 725f7fb3f..5b77774f7 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -225,44 +225,47 @@ typedef struct socket_listen_counters { } } socket_listen_counters_t; -typedef struct socket_stats_t { - int fd; - uint32_t inode; - uint32_t tcp_state; // enum tcp_state - uint8_t socket_type; // SOCK_STREAM, SOCK_DGRAM, ... 
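The hunks in this area replace direct libc calls and every orig_os_api.*() call with a single SYSCALL(name, ...) spelling. The macro's real definition is not part of these hunks; the sketch below only illustrates the kind of variadic forwarding such a macro implies, and the table type shown here is illustrative, assuming orig_os_api remains the existing table of captured libc entry points.

/* Sketch only: the real SYSCALL macro and orig_os_api table live elsewhere in the tree. */
#include <fcntl.h>
#include <unistd.h>

struct os_api_table {
    int (*open)(const char *pathname, int flags, ...);
    int (*close)(int fd);
};

/* Assumed: in libxlio this table holds the original libc symbols (typically captured
 * with dlsym(RTLD_NEXT, ...)). Here it is seeded with libc directly so the sketch
 * stands alone. */
static struct os_api_table orig_os_api = {open, close};

#define SYSCALL(_name, ...) orig_os_api._name(__VA_ARGS__)

static bool path_exists(const char *path)
{
    int fd = SYSCALL(open, path, O_RDONLY); /* expands to orig_os_api.open(path, O_RDONLY) */
    if (fd >= 0) {
        SYSCALL(close, fd);
        return true;
    }
    return false;
}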
- bool padding1; - sa_family_t sa_family; - bool b_is_offloaded; - bool b_blocking; - bool b_mc_loop; - bool padding2; - in_port_t bound_port; - in_port_t connected_port; - ip_address bound_if; - ip_address connected_ip; - ip_address mc_tx_if; - pid_t threadid_last_rx; - pid_t threadid_last_tx; - uint32_t n_rx_ready_pkt_count; - uint32_t n_rx_ready_byte_limit; - uint64_t n_rx_ready_byte_count; +struct socket_stats_t { + // Data Path uint64_t n_tx_ready_byte_count; - uint32_t n_rx_zcopy_pkt_count; + uint64_t n_rx_ready_byte_count; + uint32_t n_rx_ready_pkt_count; socket_counters_t counters; + socket_strq_counters_t strq_counters; #ifdef DEFINED_UTLS - bool tls_tx_offload; - bool tls_rx_offload; - uint16_t tls_version; - uint16_t tls_cipher; socket_tls_counters_t tls_counters; #endif /* DEFINED_UTLS */ - socket_strq_counters_t strq_counters; socket_listen_counters_t listen_counters; + + // Control Path std::bitset mc_grp_map; ring_logic_t ring_alloc_logic_rx; ring_logic_t ring_alloc_logic_tx; + ip_address bound_if; + ip_address connected_ip; + ip_address mc_tx_if; + int fd; + uint32_t inode; + uint32_t tcp_state; // enum tcp_state + uint32_t n_rx_zcopy_pkt_count; + pid_t threadid_last_rx; + pid_t threadid_last_tx; uint64_t ring_user_id_rx; uint64_t ring_user_id_tx; + sa_family_t sa_family; + in_port_t bound_port; + in_port_t connected_port; + uint8_t socket_type; // SOCK_STREAM, SOCK_DGRAM, ... + bool b_is_offloaded; + bool b_blocking; + bool b_mc_loop; +#ifdef DEFINED_UTLS + uint16_t tls_version; + uint16_t tls_cipher; + bool tls_tx_offload; + bool tls_rx_offload; +#endif /* DEFINED_UTLS */ + socket_stats_t *_next_stat; void reset() { @@ -274,8 +277,8 @@ typedef struct socket_stats_t { bound_if = connected_ip = mc_tx_if = ip_address(in6addr_any); bound_port = connected_port = (in_port_t)0; threadid_last_rx = threadid_last_tx = pid_t(0); - n_rx_ready_pkt_count = n_rx_ready_byte_count = n_rx_ready_byte_limit = - n_rx_zcopy_pkt_count = n_tx_ready_byte_count = 0; + n_rx_ready_pkt_count = n_rx_ready_byte_count = n_rx_zcopy_pkt_count = + n_tx_ready_byte_count = 0; memset(&counters, 0, sizeof(counters)); #ifdef DEFINED_UTLS tls_tx_offload = tls_rx_offload = false; @@ -287,7 +290,6 @@ typedef struct socket_stats_t { mc_grp_map.reset(); ring_user_id_rx = ring_user_id_tx = 0; ring_alloc_logic_rx = ring_alloc_logic_tx = RING_LOGIC_PER_INTERFACE; - padding1 = padding2 = 0; }; void set_bound_if(sock_addr &sock) @@ -312,10 +314,11 @@ typedef struct socket_stats_t { : bound_if(in6addr_any) , connected_ip(in6addr_any) , mc_tx_if(in6addr_any) + , _next_stat(nullptr) { reset(); }; -} socket_stats_t; +}; typedef struct { bool b_enabled; diff --git a/src/core/xlio.h b/src/core/xlio.h new file mode 100644 index 000000000..f3daa80bb --- /dev/null +++ b/src/core/xlio.h @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef XLIO_H +#define XLIO_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xlio_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int xlio_socket(int __domain, int __type, int __protocol); + +int xlio_close(int __fd); + +int xlio_shutdown(int __fd, int __how); + +int xlio_listen(int __fd, int backlog); + +int xlio_accept(int __fd, struct sockaddr *__addr, socklen_t *__addrlen); + +int xlio_accept4(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, int __flags); + +int xlio_bind(int __fd, const struct sockaddr *__addr, socklen_t __addrlen); + +int xlio_connect(int __fd, const struct sockaddr *__to, socklen_t __tolen); + +int xlio_setsockopt(int __fd, int __level, int __optname, __const void *__optval, + socklen_t __optlen); + +int xlio_getsockopt(int __fd, int __level, int __optname, void *__optval, socklen_t *__optlen); + +int xlio_fcntl(int __fd, int __cmd, ...); + +int xlio_fcntl64(int __fd, int __cmd, ...); + +int xlio_ioctl(int __fd, unsigned long int __request, ...); + +int xlio_getsockname(int __fd, struct sockaddr *__name, socklen_t *__namelen); + +int xlio_getpeername(int __fd, struct sockaddr *__name, socklen_t *__namelen); + +ssize_t xlio_read(int __fd, void *__buf, size_t __nbytes); + +ssize_t xlio_readv(int __fd, const struct iovec *iov, int iovcnt); + +ssize_t xlio_recv(int __fd, void *__buf, size_t __nbytes, int __flags); + +ssize_t xlio_recvmsg(int __fd, struct msghdr *__msg, int __flags); + +struct mmsghdr; + +int xlio_recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, + const struct timespec *__timeout); + +ssize_t xlio_recvfrom(int __fd, void *__buf, size_t __nbytes, int __flags, struct sockaddr *__from, + socklen_t *__fromlen); + +ssize_t xlio_write(int __fd, __const void *__buf, size_t __nbytes); + +ssize_t xlio_writev(int __fd, const struct iovec *iov, int iovcnt); + +ssize_t xlio_send(int __fd, __const void *__buf, size_t __nbytes, int __flags); + +ssize_t xlio_sendmsg(int __fd, __const struct msghdr *__msg, int __flags); + +int xlio_sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags); + +ssize_t xlio_sendto(int __fd, __const void *__buf, size_t __nbytes, int __flags, + const struct sockaddr *__to, socklen_t __tolen); + +ssize_t xlio_sendfile(int out_fd, int in_fd, off_t *offset, size_t count); + +ssize_t xlio_sendfile64(int out_fd, int in_fd, __off64_t *offset, size_t count); + +int xlio_select(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, + struct timeval *__timeout); + +int xlio_pselect(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__errorfds, + const struct timespec *__timeout, const sigset_t *__sigmask); +int xlio_poll(struct pollfd *__fds, nfds_t __nfds, int __timeout); + +int 
xlio_ppoll(struct pollfd *__fds, nfds_t __nfds, const struct timespec *__timeout, + const sigset_t *__sigmask); + +int xlio_epoll_create(int __size); + +int xlio_epoll_create1(int __flags); + +int xlio_epoll_ctl(int __epfd, int __op, int __fd, struct epoll_event *__event); + +int xlio_epoll_wait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout); + +int xlio_epoll_pwait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout, + const sigset_t *__sigmask); +int xlio_socketpair(int __domain, int __type, int __protocol, int __sv[2]); + +int xlio_pipe(int __filedes[2]); + +int xlio_open(__const char *__file, int __oflag, ...); + +int xlio_creat(const char *__pathname, mode_t __mode); + +int xlio_dup(int __fd); + +int xlio_dup2(int __fd, int __fd2); + +/* Before using XLIO static interface call xlio_init; */ +int xlio_init(void); + +/* After finishing workling with XLIO interface call xlio_exit */ +int xlio_exit(void); + +/** + * Zero-copy revcfrom implementation. + * + * @param s Socket file descriptor. + * @param buf Buffer to fill with received data or pointers to data (see below). + * @param flags Pointer to flags (see below). + * @param from If not NULL, will be filled with source address (same as recvfrom). + * @param fromlen If not NULL, will be filled with source address size (same as recvfrom). + * + * This function attempts to receive a packet without doing data copy. + * The flags argument can contain the usual flags of recvmsg(), and also the + * MSG_XLIO_ZCOPY_FORCE flag. If the latter is set, the function will not + * fall back to data copy. Otherwise, the function falls back to data copy + * if zero-copy cannot be performed. If zero-copy is done then MSG_XLIO_ZCOPY + * flag is set upon exit. + * + * If zero copy is performed (MSG_XLIO_ZCOPY flag is returned), the buffer + * is filled with a xlio_recvfrom_zcopy_packets_t structure, holding as much fragments + * as `len' allows. The total size of all fragments is returned. + * Otherwise the MSG_XLIO_ZCOPY flag is not set and the buffer is filled + * with actual data and it's size is returned (same as recvfrom()) + * If no data was received the return value is zero. + * + * NOTE: The returned packet must be freed with free_packet() after + * the application finished using it. + */ +int xlio_recvfrom_zcopy(int s, void *buf, size_t len, int *flags, struct sockaddr *from, + socklen_t *fromlen); + +/** + * Frees a packet received by recvfrom_zcopy() or held by receive callback. + * + * @param s Socket from which the packet was received. + * @param pkts Array of packet. + * @param count Number of packets in the array. + * @return 0 on success, -1 on failure + * + * errno is set to: EINVAL - not a offloaded socket + * ENOENT - the packet was not received from `s'. + */ +int xlio_recvfrom_zcopy_free_packets(int s, struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count); + +/* + * Add a libxlio.conf rule to the top of the list. + * This rule will not apply to existing sockets which already considered the conf rules. + * (around connect/listen/send/recv ..) + * @param config_line A char buffer with the exact format as defined in libxlio.conf, and should + * end with '\0'. + * @return 0 on success, or error code on failure. + */ +int xlio_add_conf_rule(const char *config_line); + +/* + * Create sockets on pthread tid as offloaded/not-offloaded. + * This does not affect existing sockets. + * Offloaded sockets are still subject to libxlio.conf rules. 
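As a usage sketch for the zero-copy receive path declared above: the call is made with a scratch buffer, and the MSG_XLIO_ZCOPY flag on return tells whether the buffer holds payload or a packet descriptor that must later be handed back through xlio_recvfrom_zcopy_free_packets(). The include name and the pointer walk over the packed descriptors are assumptions; error handling is trimmed.

#include <sys/socket.h>
#include <sys/uio.h>
#include <stddef.h>
#include "xlio.h" /* new header added by this patch; the installed name/path may differ */

static void recv_once_zcopy(int fd)
{
    char buf[4096];
    int flags = 0; /* add MSG_XLIO_ZCOPY_FORCE to forbid the bcopy fallback */
    int ret = xlio_recvfrom_zcopy(fd, buf, sizeof(buf), &flags, NULL, NULL);
    if (ret <= 0) {
        return; /* zero means no data was received */
    }
    if (!(flags & MSG_XLIO_ZCOPY)) {
        /* Fallback path: ret bytes of payload were copied into buf. */
        return;
    }
    /* Zero-copy path: buf holds a xlio_recvfrom_zcopy_packets_t descriptor. */
    struct xlio_recvfrom_zcopy_packets_t *pkts = (struct xlio_recvfrom_zcopy_packets_t *)buf;
    struct xlio_recvfrom_zcopy_packet_t *pkt = pkts->pkts;
    for (size_t i = 0; i < pkts->n_packet_num; ++i) {
        for (size_t j = 0; j < pkt->sz_iov; ++j) {
            /* pkt->iov[j].iov_base / iov_len point into library-owned buffers. */
        }
        /* Advance over the packed, variable-sized descriptors (assumed layout walk). */
        pkt = (struct xlio_recvfrom_zcopy_packet_t *)((char *)pkt + sizeof(*pkt) +
                                                      pkt->sz_iov * sizeof(struct iovec));
    }
    /* Return the packets to the library once the application is done with them. */
    xlio_recvfrom_zcopy_free_packets(fd, pkts->pkts, pkts->n_packet_num);
}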
+ * @param offload 1 for offloaded, 0 for not-offloaded. + * @return 0 on success, or error code on failure. + */ +int xlio_thread_offload(int offload, pthread_t tid); + +/** + * Returns the amount of rings that are associated with socket. + * + * @param fd File Descriptor number of the socket. + * @return On success, return the amount of rings. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a offloaded fd + */ +int xlio_get_socket_rings_num(int fd); + +/** + * Returns FDs of the RX rings that are associated with the socket. + * + * This function gets socket FD + int array + array size and populates + * the array with FD numbers of the rings that are associated + * with the socket. + * + * @param fd File Descriptor number. + * @param ring_fds Array of ring fds + * @param ring_fds_sz Size of the array + * @return On success, return the number populated array entries. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a offloaded fd + TBD + */ +int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz); + +/* + * Dump fd statistics using the library logger. + * @param fd to dump, 0 for all open fds. + * @param log_level dumping level corresponding vlog_levels_t enum (vlogger.h). + * @return 0 on success, or error code on failure. + * + * errno is set to: EOPNOTSUPP - Function is not supported when socketXtreme is enabled. + */ +int xlio_dump_fd_stats(int fd, int log_level); + +/** + * This function allows to communicate with library using extendable protocol + * based on struct cmshdr. + * + * Ancillary data is a sequence of cmsghdr structures with appended data. + * The sequence of cmsghdr structures should never be accessed directly. + * Instead, use only the following macros: CMSG_ALIGN, CMSG_SPACE, CMSG_DATA, + * CMSG_LEN. + * + * @param cmsg_hdr - point to control message + * @param cmsg_len - the byte count of the ancillary data, + * which contains the size of the structure header. + * + * @return -1 on failure and 0 on success + */ +int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len); + +/** + * Register a received packet notification callback. + * + * @param s Socket file descriptor. + * @param callback Callback function. + * @param context user contex for callback function. + * @return 0 - success, -1 - error + * + * errno is set to: EINVAL - not offloaded socket + */ +int xlio_register_recv_callback(int s, xlio_recv_callback_t callback, void *context); + +/** + * socketxtreme_poll() polls for completions + * + * @param fd File descriptor. + * @param completions Array of completions. + * @param ncompletions Maximum number of completion to return. + * @param flags Flags. + * SOCKETXTREME_POLL_TX - poll tx completions + * @return On success, return the number of ready completions. + * On error, -1 is returned, and TBD:errno is set?. + * + * This function polls the `fd` for completions and returns maximum `ncompletions` ready + * completions via `completions` array. + * The `fd` can represent a ring, socket or epoll file descriptor. + * + * Completions are indicated for incoming packets and/or for other events. + * If XLIO_SOCKETXTREME_PACKET flag is enabled in xlio_socketxtreme_completion_t.events field + * the completion points to incoming packet descriptor that can be accesses + * via xlio_socketxtreme_completion_t.packet field. + * Packet descriptor points to library specific buffers that contain data scattered + * by HW, so the data is deliver to application with zero copy. 
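The xlio_register_recv_callback() entry point above pairs with the xlio_recv_callback_t type defined later in xlio_types.h. A minimal sketch, assuming the new xlio.h header is on the include path:

#include <sys/uio.h>
#include <stddef.h>
#include "xlio.h" /* new header added by this patch; the installed name/path may differ */

/* Invoked by the library after IP/UDP processing, before the packet is queued. */
static xlio_recv_callback_retval_t my_recv_cb(int fd, size_t sz_iov, struct iovec iov[],
                                              struct xlio_info_t *xlio_info, void *context)
{
    (void)fd;
    (void)iov;
    (void)context;
    /* iov and xlio_info are valid only inside this callback; copy them if held for later. */
    if (sz_iov == 0 || xlio_info->payload_sz == 0) {
        return XLIO_PACKET_DROP; /* the library recycles the buffer */
    }
    return XLIO_PACKET_RECV; /* queue it; the application reads it with the usual recv APIs */
}

static int attach_recv_cb(int fd)
{
    /* Fails with EINVAL on a non-offloaded socket. */
    return xlio_register_recv_callback(fd, my_recv_cb, NULL);
}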
+ * Notice: after application finished using the returned packets + * and their buffers it must free them using socketxtreme_free_packets(), + * socketxtreme_free_buff() functions. + * + * If XLIO_SOCKETXTREME_PACKET flag is disabled xlio_socketxtreme_completion_t.packet field is + * reserved. + * + * In addition to packet arrival event (indicated by XLIO_SOCKETXTREME_PACKET flag) + * The library also reports XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event and standard + * epoll events via xlio_socketxtreme_completion_t.events field. + * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported when new connection is + * accepted by the server. + * When working with socketxtreme_poll() new connections are accepted + * automatically and accept(listen_socket) must not be called. + * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported for the new + * connected/child socket (xlio_socketxtreme_completion_t.user_data refers to child socket) + * and EPOLLIN event is not generated for the listen socket. + * For events other than packet arrival and new connection acceptance + * xlio_socketxtreme_completion_t.events bitmask composed using standard epoll API + * events types. + * Notice: the same completion can report multiple events, for example + * XLIO_SOCKETXTREME_PACKET flag can be enabled together with EPOLLOUT event, + * etc... + * + * * errno is set to: EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_poll(int fd, struct xlio_socketxtreme_completion_t *completions, + unsigned int ncompletions, int flags); + +/** + * Frees packets received by socketxtreme_poll(). + * + * @param packets Packets to free. + * @param num Number of packets in `packets` array + * @return 0 on success, -1 on failure + * + * For each packet in `packet` array this function: + * - Updates receive queue size and the advertised TCP + * window size, if needed, for the socket that received + * the packet. + * - Frees the library specific buffer list that is associated with the packet. + * Notice: for each buffer in buffer list the library decreases buffer's + * reference count and only buffers with reference count zero are deallocated. + * Notice: + * - Application can increase buffer reference count, + * in order to hold the buffer even after socketxtreme_free_packets() + * was called for the buffer, using socketxtreme_ref_buff(). + * - Application is responsible to free buffers, that + * couldn't be deallocated during socketxtreme_free_packets() due to + * non zero reference count, using socketxtreme_free_buff() function. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_desc_t *packets, int num); + +/* This function increments the reference count of the buffer. + * This function should be used in order to hold the buffer + * even after socketxtreme_free_packets() call. + * When buffer is not needed any more it should be freed via + * socketxtreme_free_buff(). + * + * @param buff Buffer to update. + * @return On success, return buffer's reference count after the change + * On errors -1 is returned + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_ref_buff(struct xlio_buff_t *buff); + +/* This function decrements the buff reference count. 
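A polling-loop sketch for the socketxtreme functions documented above, assuming ring_fd was obtained beforehand (for example via xlio_get_socket_rings_fds()) and that the new xlio.h header is on the include path:

#include <stddef.h>
#include "xlio.h" /* new header added by this patch; the installed name/path may differ */

static void drain_completions(int ring_fd)
{
    struct xlio_socketxtreme_completion_t comps[16];
    int n = xlio_socketxtreme_poll(ring_fd, comps, 16, 0);
    for (int i = 0; i < n; ++i) {
        if (comps[i].events & XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) {
            /* comps[i].user_data refers to the auto-accepted child socket;
             * accept() must not be called on the listen socket. */
        }
        if (comps[i].events & XLIO_SOCKETXTREME_PACKET) {
            /* Walk the zero-copy buffer chain of the received packet. */
            for (struct xlio_buff_t *b = comps[i].packet.buff_lst; b; b = b->next) {
                /* b->payload / b->len point into library-owned memory. */
            }
            /* Hand the packet (and its buffers) back once the data is consumed. */
            xlio_socketxtreme_free_packets(&comps[i].packet, 1);
        }
    }
}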
+ * When buff's reference count reaches zero, the buff is + * deallocated. + * + * @param buff Buffer to free. + * @return On success, return buffer's reference count after the change + * On error -1 is returned + * + * Notice: return value zero means that buffer was deallocated. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff); + +/* + * XLIO Socket API + * + * This is performance-oriented event based API. + */ + +/* + * XLIO initialization. + * + * xlio_init_ex() must be called before using any XLIO Socket API. This is heavy operation. + * xlio_init_ex() is not thread-safe operation, however, subsequent serialized calls exit + * successfully without any action. + * + * If set, xlio_init_attr::memory_cb() notifies about memory blocks which are allocated to + * buffers. Each zerocopy RX buffer resides within one such memory block. + * If set, XLIO uses external allocator xlio_init_attr::memory_alloc() instead of the internal. + * Current implementation allocates a single memory block and does it in xlio_init_ex() context. + */ +int xlio_init_ex(const struct xlio_init_attr *attr); + +/* + * XLIO polling groups. + * + * Event callbacks are registered per group. This allows to move control flow connections to + * a separate group and implement RX / completion logic differently. + * + * xlio_poll_group_poll() polls HW for events and executes TCP timers. Most of the callbacks are + * expected from the context of this call. + * + * Recommendations: + * - Groups are expected to be long lived objects. Frequent creation/destruction has a penalty. + * - Reduce the number of different network interfaces within a group to minimum. This will + * optimize the HW objects utilization. However, maintaining extra groups can have an overhead. + */ + +int xlio_poll_group_create(const struct xlio_poll_group_attr *attr, xlio_poll_group_t *group_out); +int xlio_poll_group_destroy(xlio_poll_group_t group); +void xlio_poll_group_poll(xlio_poll_group_t group); + +/* + * XLIO socket. + * + * XLIO socket is represented by xlio_socket_t instead of file descriptor. This is a TCP + * non-blocking socket abstraction. + * + * xlio_socket_destroy() triggers socket closing procedure. The process can be asynchronous + * and socket events may be expected until XLIO_SOCKET_EVENT_TERMINATED event arrives. + * Example of the possible events is zerocopy completions which can arrive from the + * xlio_socket_destroy() context or xlio_poll_group_poll() context. + * + * Limitations: + * - Only outgoing connections are supported + * - Bonding is not supported + */ + +/* Forward declaration. */ +struct ibv_pd; + +int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out); +int xlio_socket_destroy(xlio_socket_t sock); +int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname, const void *optval, + socklen_t optlen); +int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen); +int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen); +struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock); + +/* + * TX flow. + * + * Properties of the TX flow: + * - Non-blocking + * - No partial write support - accepts all data unless memory allocation error happens + * - Each send call expects a complete or part of a single PDU or message. 
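The group/socket lifecycle described above can be strung together as follows. This is a minimal sketch only: it assumes the usual 0-on-success return convention for the create/connect calls, returns RX buffers immediately from the callback, and uses the new xlio.h header name, all of which are assumptions rather than guarantees of this patch.

#include <string.h>
#include <stdint.h>
#include <netinet/in.h>
#include "xlio.h" /* new header added by this patch; the installed name/path may differ */

/* Minimal callbacks; a real application reacts to events, completions and RX payload. */
static void on_event(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value)
{
    (void)sock; (void)userdata_sq; (void)event; (void)value;
}
static void on_comp(xlio_socket_t sock, uintptr_t userdata_sq, uintptr_t userdata_op)
{
    (void)sock; (void)userdata_sq; (void)userdata_op;
}
static void on_rx(xlio_socket_t sock, uintptr_t userdata_sq, void *data, size_t len,
                  struct xlio_buf *buf)
{
    (void)userdata_sq; (void)data; (void)len;
    xlio_socket_buf_free(sock, buf); /* return the zero-copy buffer right away (simplification) */
}

static int connect_example(const struct sockaddr_in *dst)
{
    struct xlio_init_attr init_attr;
    memset(&init_attr, 0, sizeof(init_attr));
    if (xlio_init_ex(&init_attr) != 0) { /* heavy, one-time initialization */
        return -1;
    }

    struct xlio_poll_group_attr grp_attr;
    memset(&grp_attr, 0, sizeof(grp_attr));
    grp_attr.socket_event_cb = on_event;
    grp_attr.socket_comp_cb = on_comp;
    grp_attr.socket_rx_cb = on_rx;
    xlio_poll_group_t group;
    if (xlio_poll_group_create(&grp_attr, &group) != 0) {
        return -1;
    }

    struct xlio_socket_attr sock_attr;
    memset(&sock_attr, 0, sizeof(sock_attr));
    sock_attr.domain = AF_INET;
    sock_attr.group = group;
    sock_attr.userdata_sq = 1; /* echoed back to every callback for this socket */
    xlio_socket_t sock;
    if (xlio_socket_create(&sock_attr, &sock) != 0) {
        return -1;
    }

    /* Only outgoing connections are supported by this API. */
    if (xlio_socket_connect(sock, (const struct sockaddr *)dst, sizeof(*dst)) != 0) {
        return -1;
    }
    /* Progress is driven by polling; ESTABLISHED is reported through on_event(). */
    for (int i = 0; i < 1000; ++i) {
        xlio_poll_group_poll(group);
    }

    xlio_socket_destroy(sock); /* events may still arrive until XLIO_SOCKET_EVENT_TERMINATED */
    xlio_poll_group_destroy(group);
    return 0;
}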
This is a requirement + * in case of either crypto or CRC offload is enabled. + * - User requests zerocopy completion callback with non-zero userdata_op value and controls + * the logic of completions. For example, each completion can complete entire PDU object. + * - Inline send operations don't trigger the completion callback. + * - XLIO aggregates data on socket and pushes it to wire with the flush-like API or + * XLIO_SOCKET_SEND_FLAG_FLUSH flag. + * + * **Current limitations**: + * - Currently, data can be pushes to wire in the RX flow regardless of the flush logic. + * - Avoid using xlio_socket_flush() for a XLIO_GROUP_FLAG_DIRTY group. + * - For a XLIO_GROUP_FLAG_DIRTY group, usage of XLIO_SOCKET_SEND_FLAG_FLUSH is limited, + * it's better to avoid using them both. + */ + +/* Returns either 0 or -1. The errors, except of ENOMEM, are not recoverable. */ +int xlio_socket_send(xlio_socket_t sock, const void *data, size_t len, + const struct xlio_socket_send_attr *attr); +int xlio_socket_sendv(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt, + const struct xlio_socket_send_attr *attr); +void xlio_poll_group_flush(xlio_poll_group_t group); +void xlio_socket_flush(xlio_socket_t sock); + +/* + * RX flow. + */ + +void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf); +void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf); + +#ifdef __cplusplus +} +#endif +#endif /* XLIO_H */ diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index c027349b9..3bd15b022 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -35,275 +35,19 @@ #include #include -#include #include -/* - * Flags for recvfrom_zcopy() - */ -#define MSG_XLIO_ZCOPY_FORCE 0x01000000 // don't fallback to bcopy -#define MSG_XLIO_ZCOPY 0x00040000 // return: zero copy was done - -/* - * Options for setsockopt()/getsockopt() - */ -#define SO_XLIO_GET_API 2800 -#define SO_XLIO_USER_DATA 2801 -#define SO_XLIO_RING_ALLOC_LOGIC 2810 -#define SO_XLIO_RING_USER_MEMORY 2811 -#define SO_XLIO_FLOW_TAG 2820 -#define SO_XLIO_SHUTDOWN_RX 2821 -#define SO_XLIO_PD 2822 -#define SCM_XLIO_PD SO_XLIO_PD -#define SCM_XLIO_NVME_PD 2823 -#define SO_XLIO_EXT_VLAN_TAG 2824 - -/** - * @def SO_XLIO_ISOLATE - * Socket isolation option groups sockets under specified policy. - * - * Supported policies: - * - SO_XLIO_ISOLATE_DEFAULT - default behavior according to XLIO configuration. - * - * - SO_XLIO_ISOLATE_SAFE - isolate sockets from the default sockets and guarantee thread - * safety regardless of XLIO configuration (note: this option doesn't change socket API - * thread safety model). This policy is mostly effective in XLIO_TCP_CTL_THREAD=delegate - * configuration. 
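A sketch of the TX flow just described, for a socket created and connected as in the previous sketch. The inline flag keeps the example free of memory-registration details; for true zerocopy sends one would drop XLIO_SOCKET_SEND_FLAG_INLINE and set a non-zero userdata_op (and mkey for pre-registered memory) so the completion arrives via socket_comp_cb.

#include <string.h>
#include "xlio.h" /* new header added by this patch; the installed name/path may differ */

/* Queue one complete PDU on an established XLIO socket and push it to the wire. */
static int send_pdu(xlio_socket_t sock, const void *pdu, size_t len)
{
    struct xlio_socket_send_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.flags = XLIO_SOCKET_SEND_FLAG_INLINE; /* copy into internal buffers; no completion callback */

    if (xlio_socket_send(sock, pdu, len, &attr) != 0) {
        /* Per the notes above, failures other than out-of-memory are not recoverable. */
        return -1;
    }
    /* Data is aggregated per socket; flush pushes it out explicitly. Alternatively,
     * set XLIO_SOCKET_SEND_FLAG_FLUSH on the last send of the PDU. */
    xlio_socket_flush(sock);
    return 0;
}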
- * - * Current limitations: - * - SO_XLIO_ISOLATE option is supported only by TCP sockets - * - SO_XLIO_ISOLATE must be called according to thread safety model and XLIO configuration - * - SO_XLIO_ISOLATE may be called after socket() syscall and before either listen() or connect() - */ -#define SO_XLIO_ISOLATE 2825 -#define SO_XLIO_ISOLATE_DEFAULT 0 -#define SO_XLIO_ISOLATE_SAFE 1 - -enum { CMSG_XLIO_IOCTL_USER_ALLOC = 2900 }; +#include "xlio_types.h" -/* - * Flags for Dummy send API - */ -#define XLIO_SND_FLAGS_DUMMY MSG_SYN // equals to 0x400 - -/* - * Magic value for xlio_get_api (NVDAXLIO) - */ +/** Magic value for xlio_get_api (NVDAXLIO) */ #define XLIO_MAGIC_NUMBER (0x4f494c584144564eULL) -/* - * Return values for the receive packet notify callback function - */ -typedef enum { - XLIO_PACKET_DROP, /* The library will drop the received packet and recycle - the buffer if no other socket needs it */ - - XLIO_PACKET_RECV, /* The library will queue the received packet on this socket ready queue. - The application will read it with the usual recv socket APIs */ - - XLIO_PACKET_HOLD /* Application will handle the queuing of the received packet. The application - must return the descriptor to the library using the free packet function - But not in the context of XLIO's callback itself. */ -} xlio_recv_callback_retval_t; - -/** - * @brief Pass this structure as an argument into getsockopt() with @ref SO_XLIO_PD - * to get protection domain information from ring used for current socket. - * This information can be available after setting connection for TX ring - * and bounding to device for RX ring. - * By default getting PD for TX ring. - * This case can be used with sendmsg(SCM_XLIO_PD) when the data portion contains - * an array of the elements with datatype as struct xlio_pd_key. Number of elements in this - * array should be equal to msg_iovlen value. Every data pointer in msg_iov has - * correspondent memory key. - * - * @param flags - to specify needed information. - * @param pd - protection domain (PD) for the RDMA device context - */ -struct xlio_pd_attr { - uint32_t flags; - void *ib_pd; -}; - -/** - * @brief elements with this datatype can be passed into sendmsg(SCM_XLIO_PD) - * as control message with correspondent pointer to data. - * - * @param flags - to specify needed information. By default mkey value is used. - * @param mkey - memory key - */ -struct xlio_pd_key { - union { - uint32_t flags; - uint32_t message_length; - }; - uint32_t mkey; -}; - -#define NVDA_NVME 666 -#define NVME_TX 1 -#define NVME_RX 2 - -enum { - XLIO_NVME_DDGST_ENABLE = 1U << 31, - XLIO_NVME_DDGST_OFFLOAD = 1U << 30, - XLIO_NVME_HDGST_ENABLE = 1U << 29, - XLIO_NVME_HDGST_OFFLOAD = 1U << 28, - XLIO_NVME_PDA_MASK = ((1U << 4) - 1U), - XLIO_NVME_DDGST_MASK = (XLIO_NVME_DDGST_ENABLE | XLIO_NVME_DDGST_OFFLOAD), -}; - -/************ SocketXtreme API types definition start***************/ - -enum { - XLIO_SOCKETXTREME_PACKET = (1ULL << 32), /* New packet is available */ - XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED = - (1ULL << 33) /* New connection is auto accepted by server */ -}; - -/* - * Represents specific buffer - * Used in SocketXtreme extended API. - */ -struct xlio_buff_t { - struct xlio_buff_t *next; /* next buffer (for last buffer next == NULL) */ - void *payload; /* pointer to data */ - uint16_t len; /* data length */ -}; +/* Forward declaration. */ +struct ibv_pd; /** - * Represents one specific packet - * Used in SocketXtreme extended API. 
- */ -struct xlio_socketxtreme_packet_desc_t { - size_t num_bufs; /* number of packet's buffers */ - uint16_t total_len; /* total data length */ - struct xlio_buff_t *buff_lst; /* list of packet's buffers */ - struct timespec hw_timestamp; /* packet hw_timestamp */ -}; - -/* - * Represents specific completion form. - * Used in SocketXtreme extended API. - */ -struct xlio_socketxtreme_completion_t { - /* Packet is valid in case XLIO_SOCKETXTREME_PACKET event is set - */ - struct xlio_socketxtreme_packet_desc_t packet; - /* Set of events - */ - uint64_t events; - /* User provided data. - * By default this field has FD of the socket - * User is able to change the content using setsockopt() - * with level argument SOL_SOCKET and opname as SO_XLIO_USER_DATA - */ - uint64_t user_data; - /* Source address (in network byte order) set for: - * XLIO_SOCKETXTREME_PACKET and XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED events - */ - struct sockaddr_in src; - /* Connected socket's parent/listen socket fd number. - * Valid in case XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is set. - */ - int listen_fd; -}; - -/************ SocketXtreme API types definition end ***************/ - -/** - * Represents one packet - * Used in receive zero-copy extended API. - */ -struct __attribute__((packed)) xlio_recvfrom_zcopy_packet_t { - void *packet_id; // packet identifier - size_t sz_iov; // number of fragments - struct iovec iov[]; // fragments size+data -}; - -/** - * Represents received packets - * Used in receive zero-copy extended API. - */ -struct __attribute__((packed)) xlio_recvfrom_zcopy_packets_t { - size_t n_packet_num; // number of received packets - struct xlio_recvfrom_zcopy_packet_t pkts[]; // array of received packets -}; - -/* - * Structure holding additional information on the packet and socket - * Note: Check structure size value for future library changes - */ -struct __attribute__((packed)) xlio_info_t { - size_t - struct_sz; /* Compare this value with sizeof(xlio_info_t) to check version compatability */ - void *packet_id; /* Handle to received packet buffer to be return if zero copy logic is used */ - - /* Packet addressing information (in network byte order) */ - const struct sockaddr *src; - const struct sockaddr *dst; - - /* Packet information */ - size_t payload_sz; - - /* Socket's information */ - uint32_t socket_ready_queue_pkt_count; /* Current count of packets waiting to be read from the - socket */ - uint32_t socket_ready_queue_byte_count; /* Current count of bytes waiting to be read from the - socket */ - - /* Packet timestamping information */ - struct timespec hw_timestamp; - struct timespec sw_timestamp; -}; - -struct xlio_rate_limit_t { - uint32_t rate; /* rate limit in Kbps */ - uint32_t max_burst_sz; /* maximum burst size in bytes */ - uint16_t typical_pkt_sz; /* typical packet size in bytes */ -}; - -typedef enum { - RING_LOGIC_PER_INTERFACE = 0, //!< RING_LOGIC_PER_INTERFACE - RING_LOGIC_PER_IP = 1, //!< RING_LOGIC_PER_IP - RING_LOGIC_PER_SOCKET = 10, //!< RING_LOGIC_PER_SOCKET - RING_LOGIC_PER_USER_ID = 11, //!< RING_LOGIC_PER_USER_ID - RING_LOGIC_PER_THREAD = 20, //!< RING_LOGIC_PER_THREAD - RING_LOGIC_PER_CORE = 30, //!< RING_LOGIC_PER_CORE - RING_LOGIC_PER_CORE_ATTACH_THREADS = 31, //!< RING_LOGIC_PER_CORE_ATTACH_THREADS - RING_LOGIC_PER_OBJECT = 32, //!< RING_LOGIC_PER_OBJECT - RING_LOGIC_ISOLATE = 33, //!< RING_LOGIC_ISOLATE - RING_LOGIC_LAST //!< RING_LOGIC_LAST -} ring_logic_t; - -typedef enum { - XLIO_RING_ALLOC_MASK_RING_USER_ID = (1 << 0), - 
XLIO_RING_ALLOC_MASK_RING_INGRESS = (1 << 1), - XLIO_RING_ALLOC_MASK_RING_ENGRESS = (1 << 2), -} xlio_ring_alloc_logic_attr_comp_mask; - -/** - * @brief pass this struct to process by the library using setsockopt with - * @ref SO_XLIO_RING_ALLOC_LOGIC - * to set the allocation logic of this FD when he requests a ring. - * @note ring_alloc_logic is a mandatory - * @param comp_mask - what fields are read when processing this struct - * see @ref xlio_ring_alloc_logic_attr_comp_mask - * @param ring_alloc_logic- allocation ratio to use - * @param user_idx - when used RING_LOGIC_PER_USER_ID int @ref ring_alloc_logic - * this is the user id to define. This lets you define the same ring for - * few FD's regardless the interface\thread\core. - * @param ingress - RX ring - * @param engress - TX ring + * XLIO Extended Socket API */ -struct xlio_ring_alloc_logic_attr { - uint32_t comp_mask; - ring_logic_t ring_alloc_logic; - uint32_t user_id; - uint32_t ingress : 1; - uint32_t engress : 1; - uint32_t reserved : 30; -}; enum { XLIO_EXTRA_API_REGISTER_RECV_CALLBACK = (1 << 0), @@ -319,48 +63,7 @@ enum { XLIO_EXTRA_API_SOCKETXTREME_FREE_XLIO_BUFF = (1 << 10), XLIO_EXTRA_API_DUMP_FD_STATS = (1 << 11), XLIO_EXTRA_API_IOCTL = (1 << 12), -}; - -/** - * - * Notification callback for incoming packet on socket - * @param fd Socket's file descriptor which this packet refers to - * @param iov iovector structure array point holding the packet - * received data buffer pointers and size of each buffer - * @param iov_sz Size of iov array - * @param xlio_info Additional information on the packet and socket - * @param context User-defined value provided during callback - * registration for each socket - * - * This callback function should be registered by the library calling - * register_recv_callback() in the extended API. It can be unregistered by - * setting a NULL function pointer. The library will call the callback to notify - * of new incoming packets after the IP & UDP header processing and before - * they are queued in the socket's receive queue. - * Context of the callback will always be from one of the user's application - * threads when calling the following socket APIs: select, poll, epoll, recv, - * recvfrom, recvmsg, read, readv. - * - * Notes: - * - The application can call all of the Socket APIs control and send from - * within the callback context. - * - Packet loss might occur depending on the applications behavior in the - * callback context. - * - Parameters `iov' and `xlio_info' are only valid until callback context - * is returned to the library. User should copy these structures for later use - * if working with zero copy logic. - */ -typedef xlio_recv_callback_retval_t (*xlio_recv_callback_t)(int fd, size_t sz_iov, - struct iovec iov[], - struct xlio_info_t *xlio_info, - void *context); - -/** - * XLIO Extended Socket API - */ - -enum { - SOCKETXTREME_POLL_TX = (1 << 15), + XLIO_EXTRA_API_XLIO_SOCKET = (1 << 13), }; struct __attribute__((packed)) xlio_api_t { @@ -613,6 +316,30 @@ struct __attribute__((packed)) xlio_api_t { * EOPNOTSUPP - socketXtreme was not enabled during configuration time. */ int (*socketxtreme_free_buff)(struct xlio_buff_t *buff); + + /** + * XLIO Socket API. 
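The new function pointers extend the xlio_api_t table that applications retrieve at run time; the retrieval helper and the table's leading fields are outside this hunk. A hypothetical sketch of calling through the table once it has been obtained, with availability signalled by the new XLIO_EXTRA_API_XLIO_SOCKET capability bit:

#include "xlio_extra.h"

/* `api` is assumed to have been obtained through the SO_XLIO_GET_API mechanism
 * (e.g. an xlio_get_api() helper); that code is not shown in this hunk. Callers
 * should first confirm the XLIO_EXTRA_API_XLIO_SOCKET bit in the table's
 * capability mask before touching the new entries. */
static void poll_group_via_table(struct xlio_api_t *api, xlio_poll_group_t group)
{
    if (api && api->xlio_poll_group_poll) {
        api->xlio_poll_group_poll(group);
    }
}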
+ */ + int (*xlio_init_ex)(const struct xlio_init_attr *attr); + int (*xlio_poll_group_create)(const struct xlio_poll_group_attr *attr, + xlio_poll_group_t *group_out); + int (*xlio_poll_group_destroy)(xlio_poll_group_t group); + void (*xlio_poll_group_poll)(xlio_poll_group_t group); + int (*xlio_socket_create)(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out); + int (*xlio_socket_destroy)(xlio_socket_t sock); + int (*xlio_socket_setsockopt)(xlio_socket_t sock, int level, int optname, const void *optval, + socklen_t optlen); + int (*xlio_socket_bind)(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen); + int (*xlio_socket_connect)(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen); + struct ibv_pd *(*xlio_socket_get_pd)(xlio_socket_t sock); + int (*xlio_socket_send)(xlio_socket_t sock, const void *data, size_t len, + const struct xlio_socket_send_attr *attr); + int (*xlio_socket_sendv)(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt, + const struct xlio_socket_send_attr *attr); + void (*xlio_poll_group_flush)(xlio_poll_group_t group); + void (*xlio_socket_flush)(xlio_socket_t sock); + void (*xlio_socket_buf_free)(xlio_socket_t sock, struct xlio_buf *buf); + void (*xlio_poll_group_buf_free)(xlio_poll_group_t group, struct xlio_buf *buf); }; /** diff --git a/src/core/xlio_types.h b/src/core/xlio_types.h new file mode 100644 index 000000000..3fa7d472c --- /dev/null +++ b/src/core/xlio_types.h @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef XLIO_TYPES_H +#define XLIO_TYPES_H + +#include +#include +#include +#include +#include + +/* + * Flags for recvfrom_zcopy() + */ +#define MSG_XLIO_ZCOPY_FORCE 0x01000000 // don't fallback to bcopy +#define MSG_XLIO_ZCOPY 0x00040000 // return: zero copy was done + +/* + * Options for setsockopt()/getsockopt() + */ +#define SO_XLIO_GET_API 2800 +#define SO_XLIO_USER_DATA 2801 +#define SO_XLIO_RING_ALLOC_LOGIC 2810 +#define SO_XLIO_SHUTDOWN_RX 2821 +#define SO_XLIO_PD 2822 +#define SCM_XLIO_PD SO_XLIO_PD +#define SCM_XLIO_NVME_PD 2823 +#define SO_XLIO_EXT_VLAN_TAG 2824 + +/** + * @def SO_XLIO_ISOLATE + * Socket isolation option groups sockets under specified policy. + * + * Supported policies: + * - SO_XLIO_ISOLATE_DEFAULT - default behavior according to XLIO configuration. + * + * - SO_XLIO_ISOLATE_SAFE - isolate sockets from the default sockets and guarantee thread + * safety regardless of XLIO configuration (note: this option doesn't change socket API + * thread safety model). This policy is mostly effective in XLIO_TCP_CTL_THREAD=delegate + * configuration. + * + * Current limitations: + * - SO_XLIO_ISOLATE option is supported only by TCP sockets + * - SO_XLIO_ISOLATE must be called according to thread safety model and XLIO configuration + * - SO_XLIO_ISOLATE may be called after socket() syscall and before either listen() or connect() + */ +#define SO_XLIO_ISOLATE 2825 +#define SO_XLIO_ISOLATE_DEFAULT 0 +#define SO_XLIO_ISOLATE_SAFE 1 + +enum { CMSG_XLIO_IOCTL_USER_ALLOC = 2900 }; + +/* + * Flags for Dummy send API + */ +#define XLIO_SND_FLAGS_DUMMY MSG_SYN // equals to 0x400 + +/* + * Return values for the receive packet notify callback function + */ +typedef enum { + XLIO_PACKET_DROP, /* The library will drop the received packet and recycle + the buffer if no other socket needs it */ + + XLIO_PACKET_RECV, /* The library will queue the received packet on this socket ready queue. + The application will read it with the usual recv socket APIs */ + + XLIO_PACKET_HOLD /* Application will handle the queuing of the received packet. The application + must return the descriptor to the library using the free packet function + But not in the context of XLIO's callback itself. */ +} xlio_recv_callback_retval_t; + +/** + * @brief Pass this structure as an argument into getsockopt() with @ref SO_XLIO_PD + * to get protection domain information from ring used for current socket. + * This information can be available after setting connection for TX ring + * and bounding to device for RX ring. + * By default getting PD for TX ring. + * This case can be used with sendmsg(SCM_XLIO_PD) when the data portion contains + * an array of the elements with datatype as struct xlio_pd_key. Number of elements in this + * array should be equal to msg_iovlen value. Every data pointer in msg_iov has + * correspondent memory key. + * + * @param flags - to specify needed information. + * @param pd - protection domain (PD) for the RDMA device context + */ +struct xlio_pd_attr { + uint32_t flags; + void *ib_pd; +}; + +/** + * @brief elements with this datatype can be passed into sendmsg(SCM_XLIO_PD) + * as control message with correspondent pointer to data. + * + * @param flags - to specify needed information. By default mkey value is used. 
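A small sketch of the SO_XLIO_ISOLATE usage window described above (after socket(), before connect()/listen()). The SOL_SOCKET level is an assumption; the header defines only the option name and its values.

#include <sys/socket.h>
#include <netinet/in.h>
#include "xlio_types.h" /* new header added by this patch; the installed name/path may differ */

static int make_isolated_tcp_socket(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) {
        return -1;
    }
    int policy = SO_XLIO_ISOLATE_SAFE; /* thread-safe grouping regardless of XLIO configuration */
    if (setsockopt(fd, SOL_SOCKET, SO_XLIO_ISOLATE, &policy, sizeof(policy)) != 0) {
        /* Expected to fail on non-offloaded sockets or non-XLIO builds. */
    }
    return fd;
}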
+ * @param mkey - memory key + */ +struct xlio_pd_key { + union { + uint32_t flags; + uint32_t message_length; + }; + uint32_t mkey; +}; + +#define NVDA_NVME 666 +#define NVME_TX 1 +#define NVME_RX 2 + +enum { + XLIO_NVME_DDGST_ENABLE = 1U << 31, + XLIO_NVME_DDGST_OFFLOAD = 1U << 30, + XLIO_NVME_HDGST_ENABLE = 1U << 29, + XLIO_NVME_HDGST_OFFLOAD = 1U << 28, + XLIO_NVME_PDA_MASK = ((1U << 4) - 1U), + XLIO_NVME_DDGST_MASK = (XLIO_NVME_DDGST_ENABLE | XLIO_NVME_DDGST_OFFLOAD), +}; + +/************ SocketXtreme API types definition start***************/ + +enum { + SOCKETXTREME_POLL_TX = (1 << 15), +}; + +enum { + XLIO_SOCKETXTREME_PACKET = (1ULL << 32), /* New packet is available */ + XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED = + (1ULL << 33) /* New connection is auto accepted by server */ +}; + +/* + * Represents specific buffer + * Used in SocketXtreme extended API. + */ +struct xlio_buff_t { + struct xlio_buff_t *next; /* next buffer (for last buffer next == NULL) */ + void *payload; /* pointer to data */ + uint16_t len; /* data length */ +}; + +/** + * Represents one specific packet + * Used in SocketXtreme extended API. + */ +struct xlio_socketxtreme_packet_desc_t { + size_t num_bufs; /* number of packet's buffers */ + uint16_t total_len; /* total data length */ + struct xlio_buff_t *buff_lst; /* list of packet's buffers */ + struct timespec hw_timestamp; /* packet hw_timestamp */ +}; + +/* + * Represents specific completion form. + * Used in SocketXtreme extended API. + */ +struct xlio_socketxtreme_completion_t { + /* Packet is valid in case XLIO_SOCKETXTREME_PACKET event is set + */ + struct xlio_socketxtreme_packet_desc_t packet; + /* Set of events + */ + uint64_t events; + /* User provided data. + * By default this field has FD of the socket + * User is able to change the content using setsockopt() + * with level argument SOL_SOCKET and opname as SO_XLIO_USER_DATA + */ + uint64_t user_data; + /* Source address (in network byte order) set for: + * XLIO_SOCKETXTREME_PACKET and XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED events + */ + struct sockaddr_in src; + /* Connected socket's parent/listen socket fd number. + * Valid in case XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is set. + */ + int listen_fd; +}; + +/************ SocketXtreme API types definition end ***************/ + +/** + * Represents one packet + * Used in receive zero-copy extended API. + */ +struct __attribute__((packed)) xlio_recvfrom_zcopy_packet_t { + void *packet_id; // packet identifier + size_t sz_iov; // number of fragments + struct iovec iov[]; // fragments size+data +}; + +/** + * Represents received packets + * Used in receive zero-copy extended API. 
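A sendmsg() sketch for the SCM_XLIO_PD control message described above: one xlio_pd_key per msg_iov element, each carrying the memory key of its buffer. The SOL_SOCKET cmsg level is an assumption, and mkey is expected to come from the application's own registration against the PD returned by getsockopt(SO_XLIO_PD).

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdint.h>
#include "xlio_types.h" /* new header added by this patch; the installed name/path may differ */

static ssize_t send_with_mkey(int fd, void *buf, size_t len, uint32_t mkey)
{
    struct iovec iov;
    iov.iov_base = buf;
    iov.iov_len = len;

    struct xlio_pd_key keys[1];
    memset(keys, 0, sizeof(keys));
    keys[0].mkey = mkey; /* one key per msg_iov element */

    union {
        char raw[CMSG_SPACE(sizeof(keys))];
        struct cmsghdr align;
    } cbuf;
    memset(&cbuf, 0, sizeof(cbuf));

    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cbuf.raw;
    msg.msg_controllen = sizeof(cbuf.raw);

    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET; /* assumed level, see note above */
    cmsg->cmsg_type = SCM_XLIO_PD;
    cmsg->cmsg_len = CMSG_LEN(sizeof(keys));
    memcpy(CMSG_DATA(cmsg), keys, sizeof(keys));

    return sendmsg(fd, &msg, 0);
}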
+ */ +struct __attribute__((packed)) xlio_recvfrom_zcopy_packets_t { + size_t n_packet_num; // number of received packets + struct xlio_recvfrom_zcopy_packet_t pkts[]; // array of received packets +}; + +/* + * Structure holding additional information on the packet and socket + * Note: Check structure size value for future library changes + */ +struct __attribute__((packed)) xlio_info_t { + size_t + struct_sz; /* Compare this value with sizeof(xlio_info_t) to check version compatability */ + void *packet_id; /* Handle to received packet buffer to be return if zero copy logic is used */ + + /* Packet addressing information (in network byte order) */ + const struct sockaddr *src; + const struct sockaddr *dst; + + /* Packet information */ + size_t payload_sz; + + /* Socket's information */ + uint32_t socket_ready_queue_pkt_count; /* Current count of packets waiting to be read from the + socket */ + uint32_t socket_ready_queue_byte_count; /* Current count of bytes waiting to be read from the + socket */ + + /* Packet timestamping information */ + struct timespec hw_timestamp; + struct timespec sw_timestamp; +}; + +struct xlio_rate_limit_t { + uint32_t rate; /* rate limit in Kbps */ + uint32_t max_burst_sz; /* maximum burst size in bytes */ + uint16_t typical_pkt_sz; /* typical packet size in bytes */ +}; + +typedef enum { + RING_LOGIC_PER_INTERFACE = 0, //!< RING_LOGIC_PER_INTERFACE + RING_LOGIC_PER_IP = 1, //!< RING_LOGIC_PER_IP + RING_LOGIC_PER_SOCKET = 10, //!< RING_LOGIC_PER_SOCKET + RING_LOGIC_PER_USER_ID = 11, //!< RING_LOGIC_PER_USER_ID + RING_LOGIC_PER_THREAD = 20, //!< RING_LOGIC_PER_THREAD + RING_LOGIC_PER_CORE = 30, //!< RING_LOGIC_PER_CORE + RING_LOGIC_PER_CORE_ATTACH_THREADS = 31, //!< RING_LOGIC_PER_CORE_ATTACH_THREADS + RING_LOGIC_PER_OBJECT = 32, //!< RING_LOGIC_PER_OBJECT + RING_LOGIC_ISOLATE = 33, //!< RING_LOGIC_ISOLATE + RING_LOGIC_LAST //!< RING_LOGIC_LAST +} ring_logic_t; + +typedef enum { + XLIO_RING_ALLOC_MASK_RING_USER_ID = (1 << 0), + XLIO_RING_ALLOC_MASK_RING_INGRESS = (1 << 1), + XLIO_RING_ALLOC_MASK_RING_ENGRESS = (1 << 2), +} xlio_ring_alloc_logic_attr_comp_mask; + +/** + * @brief pass this struct to process by the library using setsockopt with + * @ref SO_XLIO_RING_ALLOC_LOGIC + * to set the allocation logic of this FD when he requests a ring. + * @note ring_alloc_logic is a mandatory + * @param comp_mask - what fields are read when processing this struct + * see @ref xlio_ring_alloc_logic_attr_comp_mask + * @param ring_alloc_logic- allocation ratio to use + * @param user_idx - when used RING_LOGIC_PER_USER_ID int @ref ring_alloc_logic + * this is the user id to define. This lets you define the same ring for + * few FD's regardless the interface\thread\core. 
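The structure documented here (and defined just below) is applied with setsockopt(). A sketch that shares the RX ring between all sockets carrying the same user id; the SOL_SOCKET level is an assumption, as the header documents only the option name and the structure.

#include <sys/socket.h>
#include <string.h>
#include <stdint.h>
#include "xlio_types.h" /* new header added by this patch; the installed name/path may differ */

static int share_rx_ring_by_user_id(int fd, uint32_t user_id)
{
    struct xlio_ring_alloc_logic_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.comp_mask = XLIO_RING_ALLOC_MASK_RING_USER_ID | XLIO_RING_ALLOC_MASK_RING_INGRESS;
    attr.ring_alloc_logic = RING_LOGIC_PER_USER_ID;
    attr.user_id = user_id; /* sockets with the same id land on the same ring */
    attr.ingress = 1;       /* apply the logic to the RX ring */
    return setsockopt(fd, SOL_SOCKET, SO_XLIO_RING_ALLOC_LOGIC, &attr, sizeof(attr));
}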
+ * @param ingress - RX ring + * @param engress - TX ring + */ +struct xlio_ring_alloc_logic_attr { + uint32_t comp_mask; + ring_logic_t ring_alloc_logic; + uint32_t user_id; + uint32_t ingress : 1; + uint32_t engress : 1; + uint32_t reserved : 30; +}; + +/** + * + * Notification callback for incoming packet on socket + * @param fd Socket's file descriptor which this packet refers to + * @param iov iovector structure array point holding the packet + * received data buffer pointers and size of each buffer + * @param iov_sz Size of iov array + * @param xlio_info Additional information on the packet and socket + * @param context User-defined value provided during callback + * registration for each socket + * + * This callback function should be registered by the library calling + * register_recv_callback() in the extended API. It can be unregistered by + * setting a NULL function pointer. The library will call the callback to notify + * of new incoming packets after the IP & UDP header processing and before + * they are queued in the socket's receive queue. + * Context of the callback will always be from one of the user's application + * threads when calling the following socket APIs: select, poll, epoll, recv, + * recvfrom, recvmsg, read, readv. + * + * Notes: + * - The application can call all of the Socket APIs control and send from + * within the callback context. + * - Packet loss might occur depending on the applications behavior in the + * callback context. + * - Parameters `iov' and `xlio_info' are only valid until callback context + * is returned to the library. User should copy these structures for later use + * if working with zero copy logic. + */ +typedef xlio_recv_callback_retval_t (*xlio_recv_callback_t)(int fd, size_t sz_iov, + struct iovec iov[], + struct xlio_info_t *xlio_info, + void *context); + +/* + * XLIO Socket API main objects + */ + +typedef uintptr_t xlio_poll_group_t; +typedef uintptr_t xlio_socket_t; + +struct xlio_buf { + uint64_t userdata; +}; + +/* + * XLIO Socket API callbacks + */ + +/* + * Memory callback. + * + * XLIO calls the callback each time XLIO allocates a memory region which can be used for RX + * buffers. User can use this information to prepare the memory for some logic in the future. + * Zerocopy RX interface provides pointers to such memory. + * + * Argument hugepage_size provides the page size if XLIO uses hugepages for the allocation. + * If hugepage_size is not zero, the both addr and len are aligned to the page size boundary. + * There is no alignment guarantee for regular pages and hugepage_size is zero in this case. + * In case of external user allocator, XLIO reports hugepage_size zero regardless of the underlying + * pages properties. + */ +typedef void (*xlio_memory_cb_t)(void *addr, size_t len, size_t hugepage_size); + +/* + * Socket event callback. + * + * May be called from xlio_poll_group_poll() context. + * In the callback context, send operation is allowed only for the ESTABLISHED event. + * Argument value holds the error code for the ERROR event and 0 for other events. + * + * List of possible error code values: + * ECONNABORTED - connection aborted by local side + * ECONNRESET - connection reset by remote side + * ECONNREFUSED - connection refused by remote side during TCP handshake + * ETIMEDOUT - connection timed out due to keepalive, user timeout option or TCP handshake timeout + */ +enum { + /* TCP connection established. */ + XLIO_SOCKET_EVENT_ESTABLISHED = 1, + /* Socket terminated and no further events are possible. 
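A sketch of a socket event callback matching the xlio_socket_event_cb_t signature and the event codes listed around here; it would be installed through xlio_poll_group_attr::socket_event_cb.

#include <stdio.h>
#include <stdint.h>
#include "xlio_types.h" /* new header added by this patch; the installed name/path may differ */

static void my_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value)
{
    (void)sock;
    switch (event) {
    case XLIO_SOCKET_EVENT_ESTABLISHED:
        /* The only event from whose callback context sending is allowed. */
        printf("socket %#lx connected\n", (unsigned long)userdata_sq);
        break;
    case XLIO_SOCKET_EVENT_CLOSED:
        /* Passive close by the remote side. */
        break;
    case XLIO_SOCKET_EVENT_ERROR:
        /* value is ECONNABORTED, ECONNRESET, ECONNREFUSED or ETIMEDOUT. */
        printf("socket %#lx error %d\n", (unsigned long)userdata_sq, value);
        break;
    case XLIO_SOCKET_EVENT_TERMINATED:
        /* Final event; no further callbacks for this socket. */
        break;
    default:
        break;
    }
}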
*/ + XLIO_SOCKET_EVENT_TERMINATED, + /* Passive close. */ + XLIO_SOCKET_EVENT_CLOSED, + /* An error occurred, see the error code value. */ + XLIO_SOCKET_EVENT_ERROR, +}; +typedef void (*xlio_socket_event_cb_t)(xlio_socket_t, uintptr_t userdata_sq, int event, int value); + +/* + * Zerocopy completion event. + * + * May be called from the following contexts: + * - xlio_poll_group_poll() - likely + * - xlio_socket_send() - can happen only if data is flushed + * - xlio_socket_flush() / xlio_poll_group_flush() + * - xlio_socket_destroy() + * + * In the callback context, send operation is allowed unless the socket is under destruction. + */ +typedef void (*xlio_socket_comp_cb_t)(xlio_socket_t, uintptr_t userdata_sq, uintptr_t userdata_op); + +/* + * RX callback. + * + * Returns TCP payload upon arrival. Each call returns a single contiguous buffer. The buffer points + * to memory within a block which is provided by the memory_cb() notification. + * + * xlio_buf is a descriptor of the buffer which must be returned to XLIO. During user ownership, + * they may use the uninitialized field in the structure. + */ +typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void *data, size_t len, + struct xlio_buf *buf); + +/* + * XLIO Socket API attribute structures + */ + +struct xlio_init_attr { + unsigned flags; + xlio_memory_cb_t memory_cb; + + /* Optional external user allocator for XLIO buffers. */ + void *(*memory_alloc)(size_t); + void (*memory_free)(void *); +}; + +/* Sockets and rings will be protected with locks regardless of XLIO configuration. */ +#define XLIO_GROUP_FLAG_SAFE 0x1 +/* Group will keep dirty sockets to be flushed with xlio_poll_group_flush(). */ +#define XLIO_GROUP_FLAG_DIRTY 0x2 + +struct xlio_poll_group_attr { + unsigned flags; + + xlio_socket_event_cb_t socket_event_cb; + xlio_socket_comp_cb_t socket_comp_cb; + xlio_socket_rx_cb_t socket_rx_cb; +}; + +struct xlio_socket_attr { + unsigned flags; + int domain; /* AF_INET or AF_INET6 */ + xlio_poll_group_t group; + uintptr_t userdata_sq; +}; + +/* Flush socket after queueing the data. */ +#define XLIO_SOCKET_SEND_FLAG_FLUSH 0x1 +/* Copy user data to the internal buffers instead of taking ownership. */ +#define XLIO_SOCKET_SEND_FLAG_INLINE 0x2 + +struct xlio_socket_send_attr { + unsigned flags; + uint32_t mkey; + uintptr_t userdata_op; +}; + +#endif /* XLIO_TYPES_H */ diff --git a/src/stats/stats_data_reader.h b/src/stats/stats_data_reader.h index 08a7cd17c..46407070c 100644 --- a/src/stats/stats_data_reader.h +++ b/src/stats/stats_data_reader.h @@ -79,7 +79,7 @@ struct tls_context_counters_show { tls_context_counters_show &update(const sh_mem_t *mem) { - return (mem != nullptr) ? update(mem->ring_inst_arr) : *this; + return (mem) ? update(mem->ring_inst_arr) : *this; } #ifdef DEFINED_UTLS @@ -120,7 +120,7 @@ struct global_counters_show { global_counters_show &update(const sh_mem_t *mem) { - return (mem != nullptr) ? update(mem->global_inst_arr) : *this; + return (mem) ? update(mem->global_inst_arr) : *this; } global_counters_show &update(const global_instance_block_t (&globals)[NUM_OF_SUPPORTED_GLOBALS]) @@ -234,7 +234,7 @@ struct ring_packet_aggregate { ring_packet_aggregate &update(const sh_mem_t *mem) { - return (mem != nullptr) ? update(mem->ring_inst_arr) : *this; + return (mem) ? 
update(mem->ring_inst_arr) : *this; } ring_packet_aggregate &update(const ring_instance_block_t (&rings)[NUM_OF_SUPPORTED_RINGS]) @@ -294,7 +294,7 @@ struct socket_listen_counter_aggregate { socket_listen_counter_aggregate &update(const sh_mem_t *mem) { - if (mem != nullptr) { + if (mem) { std::swap(curr, prev); curr = summarize_listen_counters(*mem); } diff --git a/src/stats/stats_printer.cpp b/src/stats/stats_printer.cpp index 3abb7c286..d7eade9d3 100644 --- a/src/stats/stats_printer.cpp +++ b/src/stats/stats_printer.cpp @@ -106,7 +106,6 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, if (p_si_stats->socket_type == SOCK_DGRAM) { fprintf(filename, ", MC Loop %s", p_si_stats->b_mc_loop ? "Enabled " : "Disabled"); if (!p_si_stats->mc_tx_if.is_anyaddr()) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, ", MC IF = [%s]", p_si_stats->mc_tx_if.to_str(p_si_stats->sa_family).c_str()); } @@ -117,13 +116,11 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, // Bounded + Connected information // if (!p_si_stats->bound_if.is_anyaddr() || p_si_stats->bound_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Local Address = [%s:%d]\n", p_si_stats->bound_if.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->bound_port)); } if (!p_si_stats->connected_ip.is_anyaddr() || p_si_stats->connected_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Foreign Address = [%s:%d]\n", p_si_stats->connected_ip.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->connected_port)); @@ -131,7 +128,6 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, if (p_mc_grp_info) { for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { if (p_si_stats->mc_grp_map.test(grp_idx)) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Member of = [%s]\n", p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp.to_str().c_str()); } @@ -206,10 +202,9 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, b_any_activiy = true; } if (p_si_stats->counters.n_rx_data_pkts || p_si_stats->n_rx_ready_pkt_count) { - fprintf(filename, "Rx byte: cur %lu / max %u / dropped%s %u / limit %u\n", + fprintf(filename, "Rx byte: cur %lu / max %u / dropped%s %u\n", p_si_stats->n_rx_ready_byte_count, p_si_stats->counters.n_rx_ready_byte_max, - post_fix, p_si_stats->counters.n_rx_ready_byte_drop, - p_si_stats->n_rx_ready_byte_limit); + post_fix, p_si_stats->counters.n_rx_ready_byte_drop); fprintf(filename, "Rx pkt : cur %u / max %u / dropped%s %u\n", p_si_stats->n_rx_ready_pkt_count, p_si_stats->counters.n_rx_ready_pkt_max, post_fix, p_si_stats->counters.n_rx_ready_pkt_drop); @@ -345,7 +340,6 @@ void print_netstat_like(socket_stats_t *p_si_stats, mc_grp_info_t *, FILE *file, // int len = 0; if (!p_si_stats->bound_if.is_anyaddr() || p_si_stats->bound_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ len = fprintf(file, "%s:%-5d", p_si_stats->bound_if.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->bound_port)); @@ -360,7 +354,6 @@ void print_netstat_like(socket_stats_t *p_si_stats, mc_grp_info_t *, FILE *file, fprintf(file, " "); if (!p_si_stats->connected_ip.is_anyaddr() || p_si_stats->connected_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ len = fprintf(file, "%s:%-5d", p_si_stats->connected_ip.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->connected_port)); diff --git 
a/src/stats/stats_publisher.cpp b/src/stats/stats_publisher.cpp index 02f671e56..9649694ff 100644 --- a/src/stats/stats_publisher.cpp +++ b/src/stats/stats_publisher.cpp @@ -181,7 +181,7 @@ void xlio_shmem_stats_open(vlog_levels_t **p_p_xlio_log_level, uint8_t **p_p_xli } BULLSEYE_EXCLUDE_BLOCK_END - shmem_size = SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max); + shmem_size = SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_monitor); buf = malloc(shmem_size); if (buf == NULL) { goto shmem_error; @@ -266,11 +266,11 @@ void xlio_shmem_stats_open(vlog_levels_t **p_p_xlio_log_level, uint8_t **p_p_xli write_version_details_to_shmem(&g_sh_mem->ver_info); memcpy(g_sh_mem->stats_protocol_ver, STATS_PROTOCOL_VER, std::min(sizeof(g_sh_mem->stats_protocol_ver), sizeof(STATS_PROTOCOL_VER))); - g_sh_mem->max_skt_inst_num = safe_mce_sys().stats_fd_num_max; + g_sh_mem->max_skt_inst_num = safe_mce_sys().stats_fd_num_monitor; g_sh_mem->reader_counter = 0; __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks", g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, g_sh_mem_info.p_sh_stats, - safe_mce_sys().stats_fd_num_max); + safe_mce_sys().stats_fd_num_monitor); // Update the shmem initial log values g_sh_mem->log_level = **p_p_xlio_log_level; @@ -306,11 +306,11 @@ void xlio_shmem_stats_close() if (g_sh_mem_info.p_sh_stats && g_sh_mem_info.p_sh_stats != MAP_FAILED) { __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks", g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, - g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_max); + g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_monitor); BULLSEYE_EXCLUDE_BLOCK_START - if (munmap(g_sh_mem_info.p_sh_stats, SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max)) != - 0) { + if (munmap(g_sh_mem_info.p_sh_stats, + SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_monitor)) != 0) { vlog_printf(VLOG_ERROR, "%s: file [%s] fd [%d] error while unmap shared memory at [%p]\n", __func__, g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, @@ -351,7 +351,7 @@ void xlio_stats_instance_create_socket_block(socket_stats_t *local_stats_addr) goto out; } } - if (g_sh_mem->max_skt_inst_num + 1 < safe_mce_sys().stats_fd_num_max) { + if (g_sh_mem->max_skt_inst_num + 1 < safe_mce_sys().stats_fd_num_monitor) { // allocate next sh_mem block p_skt_stats = &g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].skt_stats; g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].b_enabled = true; @@ -360,8 +360,10 @@ void xlio_stats_instance_create_socket_block(socket_stats_t *local_stats_addr) } else { if (!printed_sock_limit_info) { printed_sock_limit_info = true; - vlog_printf(VLOG_INFO, "Statistics can monitor up to %d sockets - increase %s\n", - safe_mce_sys().stats_fd_num_max, SYS_VAR_STATS_FD_NUM); + if (safe_mce_sys().stats_fd_num_monitor < MAX_STATS_FD_NUM) { + vlog_printf(VLOG_INFO, "Statistics can monitor up to %d sockets - increase %s\n", + safe_mce_sys().stats_fd_num_monitor, SYS_VAR_STATS_FD_NUM); + } } goto out; } @@ -418,6 +420,10 @@ void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_ int empty_entry = -1; int index_to_insert = -1; + if (!p_socket_stats) { + return; + } + g_lock_mc_info.lock(); for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num && index_to_insert == -1; grp_idx++) { @@ -451,6 +457,10 @@ void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_ void xlio_stats_mc_group_remove(const ip_address &mc_grp, socket_stats_t *p_socket_stats) { + if 
(!p_socket_stats) { + return; + } + g_lock_mc_info.lock(); for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num; grp_idx++) { if (g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num && diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 16d5e5e0c..18a118b16 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -133,25 +133,6 @@ typedef enum { e_K = 1024, e_M = 1048576 } units_t; #define SEC_TO_MICRO(n) ((n)*1000000) #define TIME_DIFF_in_MICRO(start, end) \ (SEC_TO_MICRO((end).tv_sec - (start).tv_sec) + (NANO_TO_MICRO((end).tv_nsec - (start).tv_nsec))) -// printf formating when IP is in network byte ordering (for LITTLE_ENDIAN) -#define NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) \ - (uint8_t)((ip)&0xff), (uint8_t)(((ip) >> 8) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), \ - (uint8_t)(((ip) >> 24) & 0xff) - -// printf formating when IP is in host byte ordering (for LITTLE_ENDIAN) -#define HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) \ - (uint8_t)(((ip) >> 24) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), (uint8_t)(((ip) >> 8) & 0xff), \ - (uint8_t)((ip)&0xff) - -#if __BYTE_ORDER == __LITTLE_ENDIAN -/* The host byte order is the same as network byte order, so these functions are all just identity. - */ -#define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#else -#if __BYTE_ORDER == __BIG_ENDIAN -#define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#endif -#endif bool g_b_exit = false; struct sigaction g_sigact; @@ -252,7 +233,6 @@ void update_delta_stat(socket_stats_t *p_curr_stat, socket_stats_t *p_prev_stat) (p_curr_stat->counters.n_rx_poll_hit - p_prev_stat->counters.n_rx_poll_hit) / delay; p_prev_stat->n_rx_ready_byte_count = p_curr_stat->n_rx_ready_byte_count; p_prev_stat->n_tx_ready_byte_count = p_curr_stat->n_tx_ready_byte_count; - p_prev_stat->n_rx_ready_byte_limit = p_curr_stat->n_rx_ready_byte_limit; p_prev_stat->counters.n_rx_ready_byte_max = p_curr_stat->counters.n_rx_ready_byte_max; p_prev_stat->counters.n_rx_ready_byte_drop = (p_curr_stat->counters.n_rx_ready_byte_drop - p_prev_stat->counters.n_rx_ready_byte_drop) / @@ -1214,7 +1194,6 @@ void print_mc_group_fds(mc_group_fds_t *mc_group_fds, int array_size) printf("------------------------------\n"); for (int i = 0; i < array_size; i++) { char mcg_str[256]; - /* cppcheck-suppress wrongPrintfScanfArgNum */ sprintf(mcg_str, "[%s]", mc_group_fds[i].mc_grp.to_str().c_str()); printf("%-22s", mcg_str); for (const auto &fd : mc_group_fds[i].fd_list) { @@ -1243,8 +1222,6 @@ void show_mc_group_stats(mc_grp_info_t *p_mc_grp_info, socket_instance_block_t * socket_stats_t *p_si_stats = &p_instance[i].skt_stats; for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { if (p_si_stats->mc_grp_map.test(grp_idx)) { - // printf("fd %d Member of = [%d.%d.%d.%d]\n",p_si_stats->fd, - // NIPQUAD(p_si_stats->mc_grp[grp_idx])); add_fd_to_array(p_si_stats->fd, p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp, mc_group_fds, &array_size); } diff --git a/src/utils/lock_wrapper.h b/src/utils/lock_wrapper.h index 9f4925971..d35f93528 100644 --- a/src/utils/lock_wrapper.h +++ b/src/utils/lock_wrapper.h @@ -42,7 +42,7 @@ #include "types.h" #include "utils/bullseye.h" #include "utils/rdtsc.h" -#include +#include #include #include @@ -83,6 +83,7 @@ class lock_base { lock_base(const char *_lock_name = NULL) : m_lock_name(_lock_name) {}; virtual ~lock_base() {}; + virtual void delete_obj() { delete this; } virtual int lock() = 0; virtual int trylock() = 0; virtual int unlock() = 0; @@ -224,6 +225,39 @@ class lock_spin 
: public lock_base { pthread_spinlock_t m_lock; }; +/** + * pthread spinlock + */ +/* coverity[missing_move_assignment] */ +class lock_spin_simple { +public: + lock_spin_simple() { pthread_spin_init(&m_lock, 0); }; + ~lock_spin_simple() { pthread_spin_destroy(&m_lock); }; + inline int lock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_lock(&m_lock); + }; + inline int trylock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_trylock(&m_lock); + }; + inline int unlock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_unlock(&m_lock); + }; + inline int is_locked_by_me() + { + assert(!"lock_spin_simple::is_locked_by_me is unsupported"); + return 0; // Unsupported + } + +protected: + pthread_spinlock_t m_lock; +}; + /** * pthread spinlock */ @@ -458,21 +492,27 @@ class lock_dummy : public lock_base { { } - inline int lock() { return 0; } - inline int trylock() { return 0; } - inline int unlock() { return 0; } - inline int is_locked_by_me() { return 1; } + void delete_obj() override {} + int lock() override { return 0; } + int trylock() override { return 0; } + int unlock() override { return 0; } + int is_locked_by_me() override { return 1; } }; +static inline void lock_deleter_func(lock_base *lock) +{ + lock->delete_obj(); +} + class multilock { public: multilock(lock_base *_lock) - : m_lock(_lock) + : m_lock(_lock, lock_deleter_func) { } multilock(multilock_recursive_t _recursive, const char *_str) - : m_lock(create_new_lock(_recursive, _str)) + : m_lock(create_new_lock(_recursive, _str), lock_deleter_func) { } @@ -504,7 +544,8 @@ class multilock { inline const char *to_str() { return m_lock->to_str(); } private: - std::unique_ptr m_lock; + typedef std::function lock_deleter; + std::unique_ptr m_lock; }; #endif // LOCK_WRAPPER_H diff --git a/src/vlogger/vlogger.h b/src/vlogger/vlogger.h index 0c23f9a88..e1c6a23fd 100644 --- a/src/vlogger/vlogger.h +++ b/src/vlogger/vlogger.h @@ -107,7 +107,7 @@ #define __log_panic(log_fmt, log_args...) \ do { \ VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); \ - throw; \ + std::terminate(); \ } while (0) #define __log_err(log_fmt, log_args...) \ do { \ @@ -165,7 +165,7 @@ #define __log_info_panic(log_fmt, log_args...) \ do { \ VLOG_PRINTF_INFO(VLOG_PANIC, log_fmt, ##log_args); \ - throw; \ + std::terminate(); \ } while (0) #define __log_info_err(log_fmt, log_args...) \ do { \ diff --git a/tests/extra_api/xlio_socket_api.c b/tests/extra_api/xlio_socket_api.c new file mode 100644 index 000000000..9bb7b11a6 --- /dev/null +++ b/tests/extra_api/xlio_socket_api.c @@ -0,0 +1,348 @@ +/* + * Copyright © 2019-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* g++ -I./install/include -L./install/lib -L../dpcp/install/lib -o test xlio_socket_api.c -lxlio -lm -lnl-3 -ldpcp -libverbs -lmlx5 -lrdmacm -lnl-route-3 -g3 */ +/* LD_LIBRARY_PATH=./install/lib:../dpcp/install/lib ./test */ +/* Use `nc -l 8080` on the remote side */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TEST_USERDATA_MAGIC 0xfeedbeef +#define FAKE_PORT 65535 + +static bool quit = false; +static bool terminated = false; +static int g_test_events; +static int g_comp_events = 0; +static char sndbuf[256]; +static struct ibv_mr *mr_buf; + +static void memory_cb(void *data, size_t size, size_t page_size) +{ + printf("Memory area allocated data=%p size=%zu page_size=%zu\n", data, size, page_size); +} + +static void send_single_msg(xlio_socket_t sock, const void *data, size_t len, uintptr_t userdata_op, + unsigned flags) +{ + struct xlio_socket_send_attr attr = { + .flags = flags, + .mkey = mr_buf->lkey, + .userdata_op = userdata_op, + }; + memcpy(sndbuf, data, len); + int ret = xlio_socket_send(sock, sndbuf, len, &attr); + assert(ret == 0); + xlio_socket_flush(sock); +} + +static void send_inline_msg(xlio_socket_t sock, const void *data, size_t len, uintptr_t userdata_op, + unsigned flags) +{ + struct xlio_socket_send_attr attr = { + .flags = flags | XLIO_SOCKET_SEND_FLAG_INLINE, + .mkey = 0, + .userdata_op = userdata_op, + }; + int ret = xlio_socket_send(sock, data, len, &attr); + assert(ret == 0); + xlio_socket_flush(sock); +} + +static void socket_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value) +{ + if (event == XLIO_SOCKET_EVENT_ESTABLISHED) { + printf("Connection established (sock=%lx).\n", userdata_sq); + } else if (event == XLIO_SOCKET_EVENT_CLOSED) { + printf("Connection closed passively (sock=%lx).\n", userdata_sq); + } else if (event == XLIO_SOCKET_EVENT_TERMINATED) { + printf("Connection terminated (sock=%lx).\n", userdata_sq); + terminated = true; + } else { + printf("Event callback: event=%d value=%d (sock=%lx).\n", event, value, userdata_sq); + if (event == XLIO_SOCKET_EVENT_ERROR) { + quit = true; + } + } +} + +static void socket_comp_cb(xlio_socket_t sock, uintptr_t userdata_sq, uintptr_t userdata_op) +{ + const char *reply_msg = "completed\n"; + const char *inline_msg = "inline\n"; + + printf("Completed zcopy buffer userdata_sq=%lx userdata_op=%lx.\n", userdata_sq, userdata_op); + assert(userdata_sq != 0); + assert(userdata_op != 0); + + ++g_comp_events; + if (!quit) { + /* + * Don't send data after socket destroy, completions are still possible until + * XLIO_SOCKET_EVENT_TERMINATED event arrives. 
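+         * This is why the sends below are gated on the quit flag: completions that
+         * arrive after quit has been requested (when the socket may already be under
+         * destruction) are still counted in g_comp_events but are not answered.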
+         */
+        send_single_msg(sock, reply_msg, strlen(reply_msg), 0, 0);
+        send_inline_msg(sock, inline_msg, strlen(inline_msg), 0, 0);
+    }
+}
+
+static void socket_rx_cb(xlio_socket_t sock, uintptr_t userdata_sq, void *data, size_t len,
+                         struct xlio_buf *buf)
+{
+    char *msg = (char *)malloc(len + 1);
+    memcpy(msg, data, len);
+    msg[len] = '\0';
+    if (len > 0 && msg[len - 1] == '\n') {
+        msg[len - 1] = '\0';
+    }
+    printf("RECV: %s\n", msg);
+    if (strncmp(msg, "quit", 4) == 0 || strncmp(msg, "exit", 4) == 0) {
+        quit = true;
+    }
+    free(msg);
+
+    send_single_msg(sock, data, len, 0xdeadbeef, 0);
+    xlio_socket_buf_free(sock, buf);
+}
+
+static void test_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value)
+{
+    (void)sock;
+    (void)value;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+
+    printf("Test event callback: event=%d value=%d.\n", event, value);
+
+    if (event == XLIO_SOCKET_EVENT_ERROR || event == XLIO_SOCKET_EVENT_TERMINATED) {
+        ++g_test_events;
+    }
+}
+
+static void test_comp_cb(xlio_socket_t sock, uintptr_t userdata_sq, uintptr_t userdata_op)
+{
+    (void)sock;
+    (void)userdata_op;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+}
+
+static void test_rx_cb(xlio_socket_t sock, uintptr_t userdata_sq, void *data, size_t len,
+                       struct xlio_buf *buf)
+{
+    (void)data;
+    (void)len;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+    xlio_socket_buf_free(sock, buf);
+}
+
+static void test_multi_groups(const char *ip)
+{
+    xlio_poll_group_t group1;
+    xlio_poll_group_t group2;
+    xlio_poll_group_t group3;
+    xlio_socket_t sock1_1;
+    xlio_socket_t sock1_2;
+    xlio_socket_t sock2;
+    xlio_socket_t sock3;
+    int rc;
+
+    struct xlio_poll_group_attr gattr = {
+        .socket_event_cb = &test_event_cb,
+        .socket_comp_cb = &test_comp_cb,
+        .socket_rx_cb = &test_rx_cb,
+    };
+
+    rc = xlio_poll_group_create(&gattr, &group1);
+    assert(rc == 0);
+    rc = xlio_poll_group_create(&gattr, &group2);
+    assert(rc == 0);
+
+    gattr.flags = XLIO_GROUP_FLAG_SAFE;
+    rc = xlio_poll_group_create(&gattr, &group3);
+    assert(rc == 0);
+
+    struct xlio_socket_attr sattr = {
+        .domain = AF_INET,
+        .userdata_sq = TEST_USERDATA_MAGIC,
+    };
+
+    sattr.group = group1;
+    rc = xlio_socket_create(&sattr, &sock1_1);
+    assert(rc == 0);
+    rc = xlio_socket_create(&sattr, &sock1_2);
+    assert(rc == 0);
+    sattr.group = group2;
+    rc = xlio_socket_create(&sattr, &sock2);
+    assert(rc == 0);
+    sattr.group = group3;
+    rc = xlio_socket_create(&sattr, &sock3);
+    assert(rc == 0);
+
+    struct sockaddr_in addr = {};
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(FAKE_PORT);
+    rc = inet_aton(ip, &addr.sin_addr);
+    assert(rc != 0);
+
+    g_test_events = 0;
+    /* Connect will fail; we only need it to allocate rings for the checks below. */
+    rc = xlio_socket_connect(sock1_1, (struct sockaddr *)&addr, sizeof(addr));
+    assert(rc == 0);
+    rc = xlio_socket_connect(sock1_2, (struct sockaddr *)&addr, sizeof(addr));
+    assert(rc == 0);
+    rc = xlio_socket_connect(sock2, (struct sockaddr *)&addr, sizeof(addr));
+    assert(rc == 0);
+    rc = xlio_socket_connect(sock3, (struct sockaddr *)&addr, sizeof(addr));
+    assert(rc == 0);
+
+    /* TODO There is no API to check expected internal ring distribution. */
+
+    /* Wait for ERROR events (ECONNREFUSED). */
+    while (g_test_events < 4) {
+        xlio_poll_group_poll(group1);
+        xlio_poll_group_poll(group2);
+        xlio_poll_group_poll(group3);
+    }
+
+    g_test_events = 0;
+    xlio_socket_destroy(sock1_1);
+    xlio_socket_destroy(sock1_2);
+    xlio_socket_destroy(sock2);
+    xlio_socket_destroy(sock3);
+
+    /* Wait for TERMINATED events.
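+     * Destroyed sockets are released only after they report XLIO_SOCKET_EVENT_TERMINATED,
+     * so keep polling every group until all four events arrive.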
*/ + while (g_test_events < 4) { + xlio_poll_group_poll(group1); + xlio_poll_group_poll(group2); + xlio_poll_group_poll(group3); + } + + xlio_poll_group_destroy(group1); + xlio_poll_group_destroy(group2); + xlio_poll_group_destroy(group3); + + printf("Multi group test done.\n"); +} + +int main(int argc, char **argv) +{ + xlio_poll_group_t group; + xlio_socket_t sock; + int rc; + + struct xlio_init_attr iattr = { + .flags = 0, + .memory_cb = &memory_cb, + }; + struct xlio_poll_group_attr gattr = { + .socket_event_cb = &socket_event_cb, + .socket_comp_cb = &socket_comp_cb, + .socket_rx_cb = &socket_rx_cb, + }; + + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + printf("Run 'nc -l 8080' on the server with the address.\n"); + printf("Type messages on the nc side.\n"); + printf("Message 'quit' or 'exit' will terminate the client.\n"); + return 1; + } + + rc = xlio_init_ex(&iattr); + assert(rc == 0); + + test_multi_groups(argv[1]); + + rc = xlio_poll_group_create(&gattr, &group); + assert(rc == 0); + + printf("Group created.\n"); + + struct xlio_socket_attr sattr = { + .domain = AF_INET, + .group = group, + .userdata_sq = 0xdeadc0de, + }; + + rc = xlio_socket_create(&sattr, &sock); + assert(rc == 0); + + printf("Socket created, connecting to %s:8080.\n", argv[1]); + + struct sockaddr_in addr = {}; + addr.sin_family = AF_INET; + addr.sin_port = htons(8080); + rc = inet_aton(argv[1], &addr.sin_addr); + assert(rc != 0); + + rc = xlio_socket_connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + + struct ibv_pd *pd = xlio_socket_get_pd(sock); + assert(pd != NULL); + mr_buf = ibv_reg_mr(pd, sndbuf, sizeof(sndbuf), IBV_ACCESS_LOCAL_WRITE); + assert(mr_buf != NULL); + + printf("Starting polling loop.\n"); + + while (!quit) { + xlio_poll_group_poll(group); + } + + printf("Quiting...\n"); + + rc = xlio_socket_destroy(sock); + assert(rc == 0); + + while (!terminated) { + xlio_poll_group_poll(group); + } + + rc = xlio_poll_group_destroy(group); + assert(rc == 0); + + printf("Zerocopy completion events: %d\n", g_comp_events); + + ibv_dereg_mr(mr_buf); + xlio_exit(); + + return 0; +} diff --git a/tests/gtest/core/xlio_send_zc.cc b/tests/gtest/core/xlio_send_zc.cc index cf30a4bc5..a4ba5e5a7 100644 --- a/tests/gtest/core/xlio_send_zc.cc +++ b/tests/gtest/core/xlio_send_zc.cc @@ -421,7 +421,7 @@ TEST_F(xlio_send_zc, ti_2) * Send data using few sendmsg(MSG_ZEROCOPY) * @details */ -TEST_F(xlio_send_zc, ti_3_few_send) +TEST_F(xlio_send_zc, DISABLED_ti_3_few_send) { int rc = EOK; int test_iter = 3; @@ -576,7 +576,7 @@ TEST_F(xlio_send_zc, ti_3_few_send) * single call * @details */ -TEST_F(xlio_send_zc, ti_4_large_send) +TEST_F(xlio_send_zc, DISABLED_ti_4_large_send) { int rc = EOK; diff --git a/tests/gtest/extra_api/extra_poll.cc b/tests/gtest/extra_api/extra_poll.cc index 64481f861..29259933f 100644 --- a/tests/gtest/extra_api/extra_poll.cc +++ b/tests/gtest/extra_api/extra_poll.cc @@ -86,8 +86,6 @@ TEST_F(socketxtreme_poll, ti_1) int pid = fork(); if (0 == pid) { /* I am the child */ - struct epoll_event event; - barrier_fork(pid); fd = m_tcp_base.sock_create_fa_nb(m_family); @@ -100,11 +98,31 @@ TEST_F(socketxtreme_poll, ti_1) ASSERT_EQ(EINPROGRESS, errno); ASSERT_EQ((-1), rc); - event.events = EPOLLOUT | EPOLLIN; - event.data.fd = fd; - rc = test_base::event_wait(&event); - EXPECT_LT(0, rc); - EXPECT_EQ((uint32_t)(EPOLLOUT), event.events); + // Wait for connect to complete. 
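+        // The non-blocking connect() is considered complete once socketxtreme_poll()
+        // reports a completion with EPOLLOUT on one of the socket's rings.
+        // get_socket_rings_fds() may return up to two ring fds here, so both
+        // candidates are polled below.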
+ struct xlio_socketxtreme_completion_t xlio_comps; + int xlio_ring_fd[2] = {-1, -1}; + rc = xlio_api->get_socket_rings_fds(fd, xlio_ring_fd, 2); + ASSERT_LE(1, rc); + + rc = 0; + while (rc == 0) { + if (xlio_ring_fd[0] > 0) { + rc = xlio_api->socketxtreme_poll(xlio_ring_fd[0], &xlio_comps, 1, 0); + ASSERT_LE(0, rc); + if (rc > 0) { + ASSERT_LT(0U, (xlio_comps.events & EPOLLOUT)); + break; + } + } + + if (xlio_ring_fd[1] > 0) { + rc = xlio_api->socketxtreme_poll(xlio_ring_fd[1], &xlio_comps, 1, 0); + ASSERT_LE(0, rc); + if (rc > 0) { + ASSERT_LT(0U, (xlio_comps.events & EPOLLOUT)); + } + } + } log_trace("Established connection: fd=%d to %s\n", fd, sys_addr2str((struct sockaddr *)&server_addr)); @@ -379,48 +397,55 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) bool received_data = false; char msg[] = "Hello"; - int _xlio_ring_fd = -1; - int _xlio_peer_ring_fd = -1; + int ring_fd[3] = {-1, -1, -1}; + int peer_ring_fd[3] = {-1, -1, -1}; + int ring_fd_nr; + int peer_ring_fd_nr = 0; struct xlio_socketxtreme_completion_t xlio_comps; int fd_peer = -1; struct sockaddr peer_addr; - auto poll_single_ring = [&](int ring_fd) { - rc = xlio_api->socketxtreme_poll(ring_fd, &xlio_comps, 1, SOCKETXTREME_POLL_TX); - if (rc == 0) { - return; - } - if ((xlio_comps.events & EPOLLERR) || (xlio_comps.events & EPOLLHUP) || - (xlio_comps.events & EPOLLRDHUP)) { - log_trace("Close connection: event: 0x%lx\n", xlio_comps.events); - rc = -1; - return; - } - if (xlio_comps.events & XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) { - EXPECT_EQ(fd, (int)xlio_comps.listen_fd); - fd_peer = (int)xlio_comps.user_data; - memcpy(&peer_addr, &xlio_comps.src, sizeof(peer_addr)); - log_trace("Accepted connection: fd: %d from %s\n", fd_peer, - sys_addr2str((struct sockaddr *)&peer_addr)); - - rc = xlio_api->get_socket_rings_fds(fd_peer, &_xlio_peer_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_peer_ring_fd); - - rc = send(fd_peer, (void *)msg, sizeof(msg), 0); - EXPECT_EQ(static_cast(sizeof(msg)), rc); - } - if (xlio_comps.events & XLIO_SOCKETXTREME_PACKET) { - EXPECT_EQ(1U, xlio_comps.packet.num_bufs); - EXPECT_EQ(sizeof(msg), xlio_comps.packet.total_len); - EXPECT_TRUE(xlio_comps.packet.buff_lst->payload); - EXPECT_EQ(0, - strncmp(msg, (const char *)xlio_comps.packet.buff_lst->payload, - xlio_comps.packet.total_len)); - log_trace("Received data: user_data: %p data: %s\n", - (void *)((uintptr_t)xlio_comps.user_data), - (char *)xlio_comps.packet.buff_lst->payload); - received_data = true; + auto poll_rings = [&](int *rings, int rings_nr) { + for (int i = 0; i < rings_nr; ++i) { + rc = xlio_api->socketxtreme_poll(rings[i], &xlio_comps, 1, SOCKETXTREME_POLL_TX); + if (rc == 0) { + continue; + } + if ((xlio_comps.events & EPOLLERR) || (xlio_comps.events & EPOLLHUP) || + (xlio_comps.events & EPOLLRDHUP)) { + log_trace("Close connection: event: 0x%lx\n", xlio_comps.events); + rc = -1; + return; + } + if (xlio_comps.events & XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) { + EXPECT_EQ(fd, (int)xlio_comps.listen_fd); + fd_peer = (int)xlio_comps.user_data; + memcpy(&peer_addr, &xlio_comps.src, sizeof(peer_addr)); + log_trace("Accepted connection: fd: %d from %s\n", fd_peer, + sys_addr2str((struct sockaddr *)&peer_addr)); + + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(peer_ring_fd), rc); + + peer_ring_fd_nr = + xlio_api->get_socket_rings_fds(fd_peer, peer_ring_fd, ARRAY_SIZE(peer_ring_fd)); + ASSERT_LT(0, peer_ring_fd_nr); + + rc = send(fd_peer, (void *)msg, sizeof(msg), 0); + 
EXPECT_EQ(static_cast(sizeof(msg)), rc); + } + if (xlio_comps.events & XLIO_SOCKETXTREME_PACKET) { + EXPECT_EQ(1U, xlio_comps.packet.num_bufs); + EXPECT_EQ(sizeof(msg), xlio_comps.packet.total_len); + EXPECT_TRUE(xlio_comps.packet.buff_lst->payload); + EXPECT_EQ(0, + strncmp(msg, (const char *)xlio_comps.packet.buff_lst->payload, + xlio_comps.packet.total_len)); + log_trace("Received data: user_data: %p data: %s\n", + (void *)((uintptr_t)xlio_comps.user_data), + (char *)xlio_comps.packet.buff_lst->payload); + received_data = true; + } } rc = 0; }; @@ -453,15 +478,17 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) rc = sock_noblock(fd); ASSERT_EQ(0, rc); - rc = xlio_api->get_socket_rings_fds(fd, &_xlio_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_ring_fd); + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(ring_fd), rc); + + ring_fd_nr = xlio_api->get_socket_rings_fds(fd, ring_fd, ARRAY_SIZE(ring_fd)); + ASSERT_LT(0, ring_fd_nr); uint64_t ts = timestamp_ms(); ASSERT_NE(0LU, ts); rc = 0; while (rc == 0 && !received_data && !testing::Test::HasFailure()) { - poll_single_ring(_xlio_ring_fd); + poll_rings(ring_fd, ring_fd_nr); if (timestamp_ms_elapsed(ts, 500UL)) { log_trace("No data received by client within time limit\n"); break; @@ -490,17 +517,19 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) rc = listen(fd, 5); CHECK_ERR_OK(rc); - rc = xlio_api->get_socket_rings_fds(fd, &_xlio_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_ring_fd); + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(ring_fd), rc); + + ring_fd_nr = xlio_api->get_socket_rings_fds(fd, ring_fd, ARRAY_SIZE(ring_fd)); + ASSERT_LT(0, ring_fd_nr); barrier_fork(pid); rc = 0; while (rc == 0 && !child_fork_exit() && !testing::Test::HasFailure()) { - poll_single_ring(_xlio_ring_fd); - if (_xlio_peer_ring_fd >= 0 && _xlio_peer_ring_fd != _xlio_ring_fd && rc == 0) { - poll_single_ring(_xlio_peer_ring_fd); + poll_rings(ring_fd, ring_fd_nr); + if (peer_ring_fd_nr > 0 && rc == 0 && !testing::Test::HasFailure()) { + poll_rings(peer_ring_fd, peer_ring_fd_nr); } } diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index 49e594de0..1a15691e7 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -32,11 +32,11 @@ #include #include +#include #include #include #include "common/def.h" #include "common/base.h" -#include "dev/qp_mgr_eth_mlx5.h" #include "proto/nvme_parse_input_args.h" #include "tcp/tcp_base.h" #include "xlio_extra.h" @@ -44,7 +44,6 @@ using namespace std; -#ifdef DEFINED_DPCP using test_iovec = vector; static ssize_t total_test_iovec_size(test_iovec &pdus) @@ -353,8 +352,8 @@ class nvme_tx : public tcp_send_zc { vector mrs; int client_fd; bool nvme_supported = true; - msghdr *msg; - uint8_t *cmsg_buffer; + msghdr *msg = nullptr; + uint8_t *cmsg_buffer = nullptr; vector msghdr_iov {}; void TearDown() override @@ -370,6 +369,36 @@ class nvme_tx : public tcp_send_zc { msghdr_iov.clear(); } + bool is_nvme_supported() + { + bool nvme_support = false; + int pid = fork(); + if (0 != pid) { + int cfd = tcp_base::sock_create(); + int rc = bind(cfd, (sockaddr *)&client_addr, sizeof(client_addr)); + barrier_fork(pid, true); + rc |= connect(cfd, (sockaddr *)&server_addr, sizeof(server_addr)); + rc |= setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "nvme", 4); + nvme_support = (rc == 0); + close(cfd); + wait_fork(pid); + } else { // I am the child + int listen_fd = tcp_base::sock_create(); + int reuse_on = 1; + int rc = setsockopt(listen_fd, SOL_SOCKET, 
SO_REUSEPORT, &reuse_on, sizeof(reuse_on)); + rc |= bind(listen_fd, (sockaddr *)&server_addr, sizeof(server_addr)); + rc |= listen(listen_fd, 5); + barrier_fork(pid, true); + int server_fd = accept(listen_fd, nullptr, nullptr); + peer_wait(server_fd); + close(server_fd); + close(listen_fd); + exit(testing::Test::HasFailure()); + } + + return nvme_support; + } + void client_socket_create() { client_fd = tcp_base::sock_create(); @@ -521,6 +550,9 @@ class nvme_tx : public tcp_send_zc { TEST_F(nvme_tx, send_single_pdu) { + SKIP_TRUE(is_nvme_supported(), "NVME offload not supported"); + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int pid = fork(); uint32_t empty_ddgst; @@ -542,6 +574,9 @@ TEST_F(nvme_tx, send_single_pdu) TEST_F(nvme_tx, send_multiple_pdus) { + SKIP_TRUE(is_nvme_supported(), "NVME offload not supported"); + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int pid = fork(); uint32_t empty_ddgst; @@ -642,4 +677,3 @@ TEST_F(nvme_tx, send_multiple_pdus) server_process(pid, rx_iovs); } } -#endif /* DEFINED_DPCP */ diff --git a/tests/gtest/tcp/tcp_send_zc.cc b/tests/gtest/tcp/tcp_send_zc.cc index e9a8e502d..663b2db2b 100644 --- a/tests/gtest/tcp/tcp_send_zc.cc +++ b/tests/gtest/tcp/tcp_send_zc.cc @@ -50,7 +50,7 @@ * Send data using single send(MSG_ZEROCOPY) * @details */ -TEST_F(tcp_send_zc, ti_1_send_once) +TEST_F(tcp_send_zc, DISABLED_ti_1_send_once) { int rc = EOK; char test_msg[] = "Hello test"; @@ -150,7 +150,7 @@ TEST_F(tcp_send_zc, ti_1_send_once) * Send data using few sendmsg(MSG_ZEROCOPY) * @details */ -TEST_F(tcp_send_zc, ti_2_few_send) +TEST_F(tcp_send_zc, DISABLED_ti_2_few_send) { int rc = EOK; int test_iter = 3; @@ -276,7 +276,7 @@ TEST_F(tcp_send_zc, ti_2_few_send) * single call * @details */ -TEST_F(tcp_send_zc, ti_3_large_send) +TEST_F(tcp_send_zc, DISABLED_ti_3_large_send) { int rc = EOK; @@ -403,7 +403,7 @@ TEST_F(tcp_send_zc, ti_3_large_send) * notification after every call * @details */ -TEST_F(tcp_send_zc, ti_4_mass_send_check_every_call) +TEST_F(tcp_send_zc, DISABLED_ti_4_mass_send_check_every_call) { int rc = EOK; struct { @@ -549,7 +549,7 @@ TEST_F(tcp_send_zc, ti_4_mass_send_check_every_call) * notification after last call * @details */ -TEST_F(tcp_send_zc, ti_5_mass_send_check_last_call) +TEST_F(tcp_send_zc, DISABLED_ti_5_mass_send_check_last_call) { int rc = EOK; struct { @@ -684,7 +684,7 @@ TEST_F(tcp_send_zc, ti_5_mass_send_check_last_call) * Verify epoll notification * @details */ -TEST_F(tcp_send_zc, ti_6_epoll_notification) +TEST_F(tcp_send_zc, DISABLED_ti_6_epoll_notification) { int rc = EOK; char test_msg[] = "Hello test"; diff --git a/tests/gtest/tcp/tcp_sockopt.cc b/tests/gtest/tcp/tcp_sockopt.cc index 0ce724ce4..5dbc0cc76 100644 --- a/tests/gtest/tcp/tcp_sockopt.cc +++ b/tests/gtest/tcp/tcp_sockopt.cc @@ -258,6 +258,25 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) SKIP_TRUE(server_addr.addr.sa_family == AF_INET && client_addr.addr.sa_family == AF_INET, "This test supports only IPv4"); + auto compare_rings_ne = [&](int *arr1, int arr1_nr, int *arr2, int arr2_nr) { + // Whether arr1 and arr2 don't overlap (contain different rings) + for (int i = 0; i < arr1_nr; ++i) { + for (int j = 0; j < arr2_nr; ++j) { + ASSERT_NE(arr1[i], arr2[j]); + } + } + }; + auto compare_rings_contains = [&](int *arr1, int arr1_nr, int *arr2, int arr2_nr) { + // Whether arr1 contains all arr2 + for (int i = 0; i < arr2_nr; ++i) { + bool contains = false; + for (int j = 0; j < arr1_nr; ++j) { + 
contains = contains || (arr2[i] == arr1[j]); + } + ASSERT_TRUE(contains); + } + }; + auto test_client = [&]() { char buf[64]; sockaddr_store_t addr; @@ -300,13 +319,20 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) ASSERT_EQ(-1, rc); ASSERT_EQ(EINVAL, errno); - int xlio_ring_fds[3]; - int xlio_ring_fds2[3]; - rc = xlio_api->get_socket_rings_fds(sock, xlio_ring_fds, ARRAY_SIZE(xlio_ring_fds)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock2, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - ASSERT_NE(xlio_ring_fds[0], xlio_ring_fds2[0]); + int ring_fds[3]; + int ring_fds2[3]; + int ring_fds_nr; + int ring_fds2_nr; + rc = xlio_api->get_socket_rings_num(sock); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds), rc); + rc = xlio_api->get_socket_rings_num(sock2); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds2), rc); + ring_fds_nr = xlio_api->get_socket_rings_fds(sock, ring_fds, ARRAY_SIZE(ring_fds)); + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock2, ring_fds2, ARRAY_SIZE(ring_fds2)); + compare_rings_ne(ring_fds, ring_fds_nr, ring_fds2, ring_fds2_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); len = write(sock, HELLO_STR, sizeof(HELLO_STR)); ASSERT_LT(0, len); @@ -393,20 +419,23 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) ASSERT_EQ(EINVAL, errno); /* - * Check rings + * Check rings for listen sockets */ - int xlio_ring_fds[3]; - int xlio_ring_fds2[3]; - int xlio_ring_fds3[3]; - rc = xlio_api->get_socket_rings_fds(sock, xlio_ring_fds, ARRAY_SIZE(xlio_ring_fds)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock2, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock3, xlio_ring_fds3, ARRAY_SIZE(xlio_ring_fds3)); - ASSERT_EQ(1, rc); - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds2[0]); - ASSERT_NE(xlio_ring_fds[0], xlio_ring_fds3[0]); + int ring_fds[3]; + int ring_fds2[3]; + int ring_fds3[3]; + int ring_fds_nr; + int ring_fds2_nr; + int ring_fds3_nr; + ring_fds_nr = xlio_api->get_socket_rings_fds(sock, ring_fds, ARRAY_SIZE(ring_fds)); + ASSERT_EQ(1, ring_fds_nr); + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock2, ring_fds2, ARRAY_SIZE(ring_fds2)); + ASSERT_EQ(1, ring_fds2_nr); + ring_fds3_nr = xlio_api->get_socket_rings_fds(sock3, ring_fds3, ARRAY_SIZE(ring_fds3)); + ASSERT_EQ(1, ring_fds3_nr); + ASSERT_EQ(ring_fds[0], ring_fds2[0]); + ASSERT_NE(ring_fds[0], ring_fds3[0]); // Notify client to proceed with connect() barrier_fork(pid); @@ -431,13 +460,20 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) log_trace("Accepted connection: fd=%d from %s\n", sock_in2, sys_addr2str((struct sockaddr *)&peer_addr)); - rc = xlio_api->get_socket_rings_fds(sock_in, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock_in2, xlio_ring_fds3, ARRAY_SIZE(xlio_ring_fds3)); - ASSERT_EQ(1, rc); + rc = xlio_api->get_socket_rings_num(sock_in); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds2), rc); + rc = xlio_api->get_socket_rings_num(sock_in2); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds3), rc); + + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock_in, ring_fds2, ARRAY_SIZE(ring_fds2)); + ring_fds3_nr = xlio_api->get_socket_rings_fds(sock_in2, ring_fds3, ARRAY_SIZE(ring_fds3)); // Incoming TCP sockets inherit ring allocation logic from their parents - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds2[0]); - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds3[0]); + compare_rings_contains(ring_fds2, ring_fds2_nr, ring_fds, 
ring_fds_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); + compare_rings_contains(ring_fds3, ring_fds3_nr, ring_fds, ring_fds_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); /* * Socket read / write @@ -855,6 +891,8 @@ class tcp_with_fifo : public testing::TestWithParam { */ TEST_P(tcp_with_fifo, accepted_socket_inherits_the_setsockopt_param) { + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int level, optname, value; std::tie(level, optname, value) = GetParam(); pid_t pid = fork();