From 87f3394404ff9f9ec92c906cd4c39b5562aea42e Mon Sep 17 00:00:00 2001
From: Kevin Chen
Date: Thu, 18 Aug 2022 16:36:47 -0700
Subject: [PATCH] TensorRT 8.4.3.1 updates

Signed-off-by: Kevin Chen
---
 README.md                              | 10 +++++-----
 docker/centos-7.Dockerfile             |  2 +-
 docker/ubuntu-18.04.Dockerfile         |  2 +-
 docker/ubuntu-20.04-aarch64.Dockerfile |  2 +-
 docker/ubuntu-20.04.Dockerfile         |  2 +-
 docker/ubuntu-cross-aarch64.Dockerfile |  2 +-
 include/NvInferVersion.h               |  6 +++---
 samples/common/sampleInference.cpp     | 22 ++++++++++++++++------
 8 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index d9aeebb5..a662a59a 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Need enterprise support? NVIDIA global support is available for TensorRT with th
 To build the TensorRT-OSS components, you will first need the following software packages.
 
 **TensorRT GA build**
-* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.4.2.4
+* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.4.3.1
 
 **System Packages**
 * [CUDA](https://developer.nvidia.com/cuda-toolkit)
@@ -71,16 +71,16 @@ To build the TensorRT-OSS components, you will first need the following software
 
   ```bash
   cd ~/Downloads
-  tar -xvzf TensorRT-8.4.2.4.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz
-  export TRT_LIBPATH=`pwd`/TensorRT-8.4.2.4
+  tar -xvzf TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz
+  export TRT_LIBPATH=`pwd`/TensorRT-8.4.3.1
   ```
 
   **Example: Windows on x86-64 with cuda-11.4**
 
   ```powershell
   cd ~\Downloads
-  Expand-Archive .\TensorRT-8.4.2.4.Windows10.x86_64.cuda-11.6.cudnn8.4.zip
-  $Env:TRT_LIBPATH = '$(Get-Location)\TensorRT-8.4.2.4'
+  Expand-Archive .\TensorRT-8.4.3.1.Windows10.x86_64.cuda-11.6.cudnn8.4.zip
+  $Env:TRT_LIBPATH = "$(Get-Location)\TensorRT-8.4.3.1"
   $Env:PATH += 'C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\'
   ```
 
diff --git a/docker/centos-7.Dockerfile b/docker/centos-7.Dockerfile
index 4de1d4e5..3e6d3d66 100644
--- a/docker/centos-7.Dockerfile
+++ b/docker/centos-7.Dockerfile
@@ -21,7 +21,7 @@ ARG OS_VERSION=7
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-centos${OS_VERSION}
 LABEL maintainer="NVIDIA CORPORATION"
 
-ENV TRT_VERSION 8.4.2.4
+ENV TRT_VERSION 8.4.3.1
 SHELL ["/bin/bash", "-c"]
 
 # Setup user account
diff --git a/docker/ubuntu-18.04.Dockerfile b/docker/ubuntu-18.04.Dockerfile
index fcc6c3cd..7a2c6ad3 100644
--- a/docker/ubuntu-18.04.Dockerfile
+++ b/docker/ubuntu-18.04.Dockerfile
@@ -21,7 +21,7 @@ ARG OS_VERSION=18.04
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu${OS_VERSION}
 LABEL maintainer="NVIDIA CORPORATION"
 
-ENV TRT_VERSION 8.4.2.4
+ENV TRT_VERSION 8.4.3.1
 SHELL ["/bin/bash", "-c"]
 
 # Setup user account
diff --git a/docker/ubuntu-20.04-aarch64.Dockerfile b/docker/ubuntu-20.04-aarch64.Dockerfile
index 4aa6e0c8..e301ded8 100644
--- a/docker/ubuntu-20.04-aarch64.Dockerfile
+++ b/docker/ubuntu-20.04-aarch64.Dockerfile
@@ -18,7 +18,7 @@
 # Multi-arch container support available in non-cudnn containers.
 FROM nvidia/cuda:11.4.2-devel-ubuntu20.04
 
-ENV TRT_VERSION 8.4.2.4
+ENV TRT_VERSION 8.4.3.1
 SHELL ["/bin/bash", "-c"]
 
 # Setup user account
diff --git a/docker/ubuntu-20.04.Dockerfile b/docker/ubuntu-20.04.Dockerfile
index 8bc2fff4..05a75abf 100644
--- a/docker/ubuntu-20.04.Dockerfile
+++ b/docker/ubuntu-20.04.Dockerfile
@@ -21,7 +21,7 @@ ARG OS_VERSION=20.04
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu${OS_VERSION}
 LABEL maintainer="NVIDIA CORPORATION"
 
-ENV TRT_VERSION 8.4.2.4
+ENV TRT_VERSION 8.4.3.1
 SHELL ["/bin/bash", "-c"]
 
 # Setup user account
diff --git a/docker/ubuntu-cross-aarch64.Dockerfile b/docker/ubuntu-cross-aarch64.Dockerfile
index 7a7bceb6..913f3cec 100644
--- a/docker/ubuntu-cross-aarch64.Dockerfile
+++ b/docker/ubuntu-cross-aarch64.Dockerfile
@@ -21,7 +21,7 @@ ARG OS_VERSION=20.04
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION}
 LABEL maintainer="NVIDIA CORPORATION"
 
-ENV TRT_VERSION 8.4.2.4
+ENV TRT_VERSION 8.4.3.1
 ENV DEBIAN_FRONTEND=noninteractive
 
 ARG uid=1000
diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h
index d6026af4..35f1f8f9 100644
--- a/include/NvInferVersion.h
+++ b/include/NvInferVersion.h
@@ -21,8 +21,8 @@
 
 #define NV_TENSORRT_MAJOR 8 //!< TensorRT major version.
 #define NV_TENSORRT_MINOR 4 //!< TensorRT minor version.
-#define NV_TENSORRT_PATCH 2 //!< TensorRT patch version.
-#define NV_TENSORRT_BUILD 4 //!< TensorRT build number.
+#define NV_TENSORRT_PATCH 3 //!< TensorRT patch version.
+#define NV_TENSORRT_BUILD 1 //!< TensorRT build number.
 
 #define NV_TENSORRT_LWS_MAJOR 0 //!< TensorRT LWS major version.
 #define NV_TENSORRT_LWS_MINOR 0 //!< TensorRT LWS minor version.
@@ -30,6 +30,6 @@
 
 #define NV_TENSORRT_SONAME_MAJOR 8 //!< Shared object library major version number.
 #define NV_TENSORRT_SONAME_MINOR 4 //!< Shared object library minor version number.
-#define NV_TENSORRT_SONAME_PATCH 2 //!< Shared object library patch version number.
+#define NV_TENSORRT_SONAME_PATCH 3 //!< Shared object library patch version number.
 
 #endif // NV_INFER_VERSION_H
diff --git a/samples/common/sampleInference.cpp b/samples/common/sampleInference.cpp
index b4425acb..a14c70d8 100644
--- a/samples/common/sampleInference.cpp
+++ b/samples/common/sampleInference.cpp
@@ -579,7 +579,7 @@ class Iteration
         if (!skipTransfers)
         {
             record(EventType::kINPUT_S, StreamType::kINPUT);
-            mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
+            setInputData(false);
             record(EventType::kINPUT_E, StreamType::kINPUT);
             wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute
         }
@@ -597,7 +597,7 @@ class Iteration
         {
             wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA
             record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
-            mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
+            fetchOutputData(false);
             record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
         }
 
@@ -641,14 +641,24 @@ class Iteration
         getStream(StreamType::kINPUT).wait(gpuStart);
     }
 
-    void setInputData()
+    void setInputData(bool sync)
     {
         mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
+        // additional sync to avoid overlapping with inference execution.
+        if (sync)
+        {
+            getStream(StreamType::kINPUT).synchronize();
+        }
     }
 
-    void fetchOutputData()
+    void fetchOutputData(bool sync)
     {
         mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
+        // additional sync to avoid overlapping with inference execution.
+        if (sync)
+        {
+            getStream(StreamType::kOUTPUT).synchronize();
+        }
     }
 
 private:
@@ -841,7 +851,7 @@ void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment&
             streamId, inference, *iEnv.template getContext<ContextType>(streamId), *iEnv.bindings[streamId]);
         if (inference.skipTransfers)
         {
-            iteration->setInputData();
+            iteration->setInputData(true);
         }
         iStreams.emplace_back(iteration);
     }
@@ -862,7 +872,7 @@ void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment&
     if (inference.skipTransfers)
     {
         for (auto& s : iStreams)
         {
-            s->fetchOutputData();
+            s->fetchOutputData(true);
         }
     }
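
The version bump above touches three places that must stay in sync: the README download instructions, the TRT_VERSION environment variable baked into each build container, and the NV_TENSORRT_* macros in NvInferVersion.h. A quick way to confirm that an application actually loaded the matching libnvinfer at runtime is to compare the header version against the library's self-reported one. The following is a minimal sketch, not part of the patch; it assumes only the public getInferLibVersion() entry point and the composed NV_TENSORRT_VERSION macro (MAJOR * 1000 + MINOR * 100 + PATCH, i.e. 8403 for this release; the build number is not part of the composed value):

```cpp
// version_check.cpp -- compile with: g++ version_check.cpp -lnvinfer
#include <NvInferRuntime.h> // pulls in NvInferVersion.h and getInferLibVersion()
#include <cstdio>

int main()
{
    int32_t const compiled = NV_TENSORRT_VERSION; // version of the headers, e.g. 8403
    int32_t const linked = getInferLibVersion();  // version of the loaded libnvinfer
    std::printf("headers: %d, library: %d\n", compiled, linked);
    return compiled == linked ? 0 : 1;            // nonzero exit on mismatch
}
```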
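The sampleInference.cpp change is more than a version bump: setInputData() and fetchOutputData() gain a sync flag. Inside the timed enqueue path they are called with sync == false, because the CUDA events recorded around each transfer already order it against the compute stream; the blocking path (sync == true) is reserved for the one-time transfers issued outside the measurement window when per-iteration transfers are skipped (trtexec's --noDataTransfers mode), so the copy cannot overlap with, and skew, the measured inference. Below is a standalone sketch of the same pattern, independent of the sample's Iteration class; copyInputAsync and its arguments are hypothetical names, and only the CUDA runtime API is assumed:

```cpp
#include <cuda_runtime_api.h>

// Enqueue a host-to-device copy on `stream`; when `sync` is true, block until
// it completes so nothing enqueued later on another stream can overlap it.
bool copyInputAsync(void* devDst, void const* hostSrc, size_t bytes, cudaStream_t stream, bool sync)
{
    if (cudaMemcpyAsync(devDst, hostSrc, bytes, cudaMemcpyHostToDevice, stream) != cudaSuccess)
    {
        return false;
    }
    // Additional sync to avoid overlapping the copy with inference execution,
    // mirroring setInputData(true) in the patched sample.
    if (sync)
    {
        return cudaStreamSynchronize(stream) == cudaSuccess;
    }
    return true; // caller relies on event/stream ordering instead
}
```

The non-blocking path is the right default for steady-state benchmarking, where event-based ordering keeps the copy, compute, and fetch streams pipelined; the explicit synchronize matters only for the up-front transfer, which must be fully on the device before the first timed enqueue.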