Merge pull request LeelaChessZero#8 from LeelaChessZero/master

ankan-ban · web-flow · commit 80ac4a1f9464 · 2018-12-21T10:25:52.000+05:30
get latest
diff --git a/README.md b/README.md
@@ -17,11 +17,11 @@ Versioning follows the Semantic Versioning guidelines, with major, minor and pat
 Download using git:
 
 ```
-git clone -b release --recurse-submodules https://github.com/LeelaChessZero/lc0.git
+git clone -b release/0.19 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
 ```
 
 If downloading an archive, you need to also download and place the submodule:
- * Download https://github.com/LeelaChessZero/lc0/archive/release.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release.tar.gz) archive is also available)
+ * Download https://github.com/LeelaChessZero/lc0/archive/release/0.19.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release/0.19.tar.gz) archive is also available)
  * Extract
  * Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
  * Move the second archive into the first archive's `libs/lczero-common/` folder and extract
diff --git a/appveyor.yml b/appveyor.yml
@@ -39,6 +39,7 @@ install:
 - cmd: IF NOT EXIST c:\cache\protobuf\ cmake -G "Visual Studio 15 2017 Win64" -Dprotobuf_BUILD_SHARED_LIBS=NO -Dprotobuf_MSVC_STATIC_RUNTIME=NO -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=c:/cache/protobuf ../cmake
 - cmd: IF NOT EXIST c:\cache\protobuf\ msbuild INSTALL.vcxproj /p:Configuration=Release /p:Platform=x64 /m
 - cmd: set PATH=c:\cache\protobuf\bin;%PATH%
+- cmd: IF NOT EXIST c:\cache\testnet appveyor DownloadFile http://lczero.org/get_network?sha=7170f639ba1cdc407283b8e52377283e36845b954788c6ada8897937637ef032 -Filename c:\cache\testnet
 - cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy 
 - cmd: IF %GTEST%==true cd C:\cache\syzygy
 - cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z}
@@ -53,7 +54,16 @@ before_build:
 - cmd: git submodule update --init --recursive
 - cmd: meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BLAS% -Dcudnn=%CUDA% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%PKG_FOLDER%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%PKG_FOLDER%\cuda\lib\x64" -Dprotobuf_include="%PKG_FOLDER%\protobuf\include" -Dprotobuf_libdir="%PKG_FOLDER%\protobuf\lib" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\lib\x64" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\lib\x64" -Ddefault_library=static
 build_script:
-- cmd: msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmd: IF %APPVEYOR_REPO_TAG%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmd: cd build
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true copy C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %OPENCL%==true copy C:\cache\opencl-nug.0.777.12\build\native\bin\OpenCL.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy "%CUDA_PATH%"\bin\*.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy %PKG_FOLDER%\cuda\bin\cudnn64_7.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true lc0 benchmark --weights=c:\cache\testnet --backend=random --movetime=10000
+- cmd: cd ..
+- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
 after_build:
 - cmd: IF %APPVEYOR_REPO_TAG%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe
 - cmd: IF %APPVEYOR_REPO_TAG%==true appveyor DownloadFile "https://ci.appveyor.com/api/projects/LeelaChessZero/lczero-client/artifacts/client.exe?branch=release&pr=false&job=Environment%%3A%%20NAME%%3D.exe%%2C%%20GOOS%%3Dwindows"
diff --git a/build.sh b/build.sh
@@ -16,18 +16,20 @@ BUILDDIR=build/${BUILDTYPE}
 
 if [ -f ${BUILDDIR}/build.ninja ]
 then
-  meson configure ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
+  meson configure ${BUILDDIR} -Dbuildtype=${BUILDTYPE} -Dprefix=${INSTALL_PREFIX:-/usr/local} "$@"
 else
   meson ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
 fi
 
 pushd ${BUILDDIR}
 
+NINJA=$(awk '/ninja/ {ninja=$4} END {print ninja}' meson-logs/meson-log.txt)
+
 if [ -n "${INSTALL_PREFIX}" ]
 then
-  ninja install
+  ${NINJA} install
 else
-  ninja
+  ${NINJA}
 fi
 
 popd
diff --git a/meson.build b/meson.build
@@ -26,6 +26,7 @@ endif
 if cc.get_id() == 'clang' or cc.get_id() == 'gcc'
   add_project_arguments('-Wextra', language : 'cpp')
   add_project_arguments('-pedantic', language : 'cpp')
+  add_project_arguments('-ffast-math', language : 'cpp')
 
   if get_option('buildtype') == 'release'
     add_project_arguments('-march=native', language : 'cpp')
diff --git a/src/mcts/search.cc b/src/mcts/search.cc
@@ -39,6 +39,7 @@
 #include "mcts/node.h"
 #include "neural/cache.h"
 #include "neural/encoder.h"
+#include "utils/fastmath.h"
 #include "utils/random.h"
 
 namespace lczero {
@@ -198,7 +199,7 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N) {
   const float init = params.GetCpuct();
   const float k = params.GetCpuctFactor();
   const float base = params.GetCpuctBase();
-  return init + (k ? k * std::log((N + base) / base) : 0.0f);
+  return init + (k ? k * FastLog((N + base) / base) : 0.0f);
 }
 }  // namespace
 
@@ -837,7 +838,9 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
     // n_in_flight_ is incremented. If the method returns false, then there is
     // a search collision, and this node is already being expanded.
     if (!node->TryStartScoreUpdate()) {
-      IncrementNInFlight(node, search_->root_node_, collision_limit - 1);
+      if (!is_root_node) {
+        IncrementNInFlight(node->GetParent(), search_->root_node_, collision_limit - 1);
+      }
       return NodeToProcess::Collision(node, depth, collision_limit);
     }
     // Either terminal or unexamined leaf node -- the end of this playout.
@@ -1137,7 +1140,10 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
     float p =
         computation_->GetPVal(idx_in_computation, edge.GetMove().as_nn_index());
     if (params_.GetPolicySoftmaxTemp() != 1.0f) {
-      p = pow(p, 1 / params_.GetPolicySoftmaxTemp());
+      // Flush denormals to zero.
+      p = p < 1.17549435E-38
+              ? 0.0
+              : FastPow2(FastLog2(p) / params_.GetPolicySoftmaxTemp());
     }
     edge.edge()->SetP(p);
     // Edge::SetP does some rounding, so only add to the total after rounding.
diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
@@ -297,7 +297,6 @@ void SELayer<float>::LoadWeights(float* w1, float* b1, float* w2, float* b2,
   size_t num_weights1 = C * numFc1Out_;
   size_t weight_size1 = sizeof(float) * num_weights1;
 
-  size_t num_weights2 = 2 * num_weights1;
   size_t weight_size2 = 2 * weight_size1;
 
   // Weight for the first FC layer.
@@ -385,7 +384,6 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
                           const float* /*input2*/, void* scratch,
                           size_t scratch_size, cudnnHandle_t /*cudnn*/,
                           cublasHandle_t cublas) {
-  assert(output == input2);
   // Ping-pong between 'op1' and 'op2' (parts of scratch memory).
   float* op1 = (float*)scratch;
   float* op2 = (float*)scratch + scratch_size / sizeof(float) / 2;
diff --git a/src/neural/loader.cc b/src/neural/loader.cc
@@ -60,6 +60,10 @@ std::string DecompressGzip(const std::string& filename) {
   if (!file) throw Exception("Cannot read weights from " + filename);
   while (true) {
     int sz = gzread(file, &buffer[bytes_read], buffer.size() - bytes_read);
+    if (sz < 0) {
+      int errnum;
+      throw Exception(gzerror(file, &errnum));
+    }
     if (sz == static_cast<int>(buffer.size()) - bytes_read) {
       bytes_read = buffer.size();
       buffer.resize(buffer.size() * 2);
diff --git a/src/neural/opencl/OpenCLTuner.cc b/src/neural/opencl/OpenCLTuner.cc
@@ -44,8 +44,6 @@ static void sgemmBatched_ref(const std::vector<float>& a,
     auto offset_v = batch * n * k;
     auto offset_m = batch * m * n;
 
-    // cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0f,
-    //             &a[offset_u], m, &b[offset_v], n, 0.0f, &c[offset_m], n);
     // Calculates C = transpose(transpose(A) * B) in row major, or
     // C = A * transpose(B) in column major.
     for (auto i = 0; i < m; i++) {
@@ -169,16 +167,16 @@ static float compare_ref(std::vector<float>& x, std::vector<float>& ref,
                          const int m_ceil, const int n_ceil) {
   auto sum = 0.0f;
   for (auto batch = 0; batch < batch_size; batch++) {
-    for (auto i = 0; i < n; i++) {
-      for (auto j = 0; j < m; j++) {
-        auto r = ref[batch * n * m + i * m + j];
+    for (auto j = 0; j < m; j++) {
+      for (auto i = 0; i < n; i++) {
+        auto r = ref[batch * n * m + j * n + i];
         auto y = x[batch * n_ceil * m_ceil + j * n_ceil + i];
 
         sum += (r - y) * (r - y);
       }
     }
   }
-  return sum / (m * n);
+  return sum / (m * n * batch_size);
 }
 
 std::string Tuner::tune_sgemm(const int m, const int n, const int k,
diff --git a/src/utils/fastmath.h b/src/utils/fastmath.h
@@ -0,0 +1,68 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+namespace lczero {
+// These stunts are performed by trained professionals, do not try this at home.
+
+// Fast approximation of log2(x); the argument is not range checked.
+// Writing x = 2^N*(1+f) with integer N and fractional part f >= 0, this
+// evaluates log2(x) ~ N+f*(1.342671-0.342671*f).
+// The bit manipulation assumes `float` is an IEEE-754 single precision value.
+inline float FastLog2(const float a) {
+  // Copy the float's object representation into an integer so that its bit
+  // fields can be picked apart.
+  int32_t bits;
+  std::memcpy(&bits, &a, sizeof(float));
+  // Everything above the 23 mantissa bits; for positive inputs this is the
+  // biased exponent N+127.
+  const int biased_exponent = bits >> 23;
+  // Keep the mantissa and overwrite the exponent field with the bias, so the
+  // bits now encode 1+f.
+  const int32_t mantissa_mask = 0x7fffff;
+  const int32_t exponent_of_one = 0x7f << 23;
+  bits = (bits & mantissa_mask) | exponent_of_one;
+  float out;
+  std::memcpy(&out, &bits, sizeof(float));
+  // Quadratic in (1+f); its constants also cancel the exponent bias.
+  return out * (2.028011f - 0.342671f * out) - 128.68534f + biased_exponent;
+}
+
+// Fast approximate 2^x.
+// The approximation used here is 2^(N+f) ~ 2^N*(1+f*(0.656366+0.343634*f))
+// where N = floor(x) is the integer and f = x - N the fractional part, f>=0.
+// Range checking is limited: inputs below -126 return 0, and there is no
+// check on the upper end, so the result is only meaningful while the
+// exponent field does not overflow.
+// The bit manipulation assumes `float` is an IEEE-754 single precision value.
+inline float FastPow2(const float a) {
+  if (a < -126) return 0.0f;
+  const int32_t exp = static_cast<int32_t>(std::floor(a));
+  float out = a - exp;
+  out = 1.0f + out * (0.656366f + 0.343634f * out);
+  // `out` now holds the 1+f*(...) factor; adding N to its exponent field
+  // scales it by 2^N.
+  uint32_t bits;
+  std::memcpy(&bits, &out, sizeof(float));
+  // Do the shift and add on unsigned values: left-shifting a negative signed
+  // integer (any input below 0) is undefined behaviour before C++20.
+  bits += static_cast<uint32_t>(exp) << 23;
+  std::memcpy(&out, &bits, sizeof(float));
+  return out;
+}
+
+// Fast approximate natural logarithm ln(x), obtained by scaling FastLog2(x)
+// by ln(2). Does no range checking.
+inline float FastLog(const float a) {
+  const float ln2 = 0.6931471805599453f;
+  return ln2 * FastLog2(a);
+}
+
+}  // namespace lczero