Skip to content

Commit 80ac4a1

Browse files
authored
Merge pull request LeelaChessZero#8 from LeelaChessZero/master
get latest
2 parents beed96e + 1a5f95f commit 80ac4a1

File tree

9 files changed

+104
-17
lines changed

9 files changed

+104
-17
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ Versioning follows the Semantic Versioning guidelines, with major, minor and pat
1717
Download using git:
1818

1919
```
20-
git clone -b release --recurse-submodules https://github.com/LeelaChessZero/lc0.git
20+
git clone -b release/0.19 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
2121
```
2222

2323
If downloading an archive, you need to also download and place the submodule:
24-
* Download https://github.com/LeelaChessZero/lc0/archive/release.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release.tar.gz) archive is also available)
24+
* Download https://github.com/LeelaChessZero/lc0/archive/release/0.19.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release/0.19.tar.gz) archive is also available)
2525
* Extract
2626
* Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
2727
* Move the second archive into the first archive's `libs/lczero-common/` folder and extract

appveyor.yml

+11-1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ install:
3939
- cmd: IF NOT EXIST c:\cache\protobuf\ cmake -G "Visual Studio 15 2017 Win64" -Dprotobuf_BUILD_SHARED_LIBS=NO -Dprotobuf_MSVC_STATIC_RUNTIME=NO -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=c:/cache/protobuf ../cmake
4040
- cmd: IF NOT EXIST c:\cache\protobuf\ msbuild INSTALL.vcxproj /p:Configuration=Release /p:Platform=x64 /m
4141
- cmd: set PATH=c:\cache\protobuf\bin;%PATH%
42+
- cmd: IF NOT EXIST c:\cache\testnet appveyor DownloadFile http://lczero.org/get_network?sha=7170f639ba1cdc407283b8e52377283e36845b954788c6ada8897937637ef032 -Filename c:\cache\testnet
4243
- cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy
4344
- cmd: IF %GTEST%==true cd C:\cache\syzygy
4445
- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z}
@@ -53,7 +54,16 @@ before_build:
5354
- cmd: git submodule update --init --recursive
5455
- cmd: meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BLAS% -Dcudnn=%CUDA% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%PKG_FOLDER%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%PKG_FOLDER%\cuda\lib\x64" -Dprotobuf_include="%PKG_FOLDER%\protobuf\include" -Dprotobuf_libdir="%PKG_FOLDER%\protobuf\lib" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\lib\x64" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\lib\x64" -Ddefault_library=static
5556
build_script:
56-
- cmd: msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
57+
- cmd: IF %APPVEYOR_REPO_TAG%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
58+
- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
59+
- cmd: cd build
60+
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true copy C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
61+
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %OPENCL%==true copy C:\cache\opencl-nug.0.777.12\build\native\bin\OpenCL.dll
62+
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy "%CUDA_PATH%"\bin\*.dll
63+
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy %PKG_FOLDER%\cuda\bin\cudnn64_7.dll
64+
- cmd: IF %APPVEYOR_REPO_TAG%==true lc0 benchmark --weights=c:\cache\testnet --backend=random --movetime=10000
65+
- cmd: cd ..
66+
- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
5767
after_build:
5868
- cmd: IF %APPVEYOR_REPO_TAG%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe
5969
- cmd: IF %APPVEYOR_REPO_TAG%==true appveyor DownloadFile "https://ci.appveyor.com/api/projects/LeelaChessZero/lczero-client/artifacts/client.exe?branch=release&pr=false&job=Environment%%3A%%20NAME%%3D.exe%%2C%%20GOOS%%3Dwindows"

build.sh

+5-3
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,20 @@ BUILDDIR=build/${BUILDTYPE}
1616

1717
if [ -f ${BUILDDIR}/build.ninja ]
1818
then
19-
meson configure ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
19+
meson configure ${BUILDDIR} -Dbuildtype=${BUILDTYPE} -Dprefix=${INSTALL_PREFIX:-/usr/local} "$@"
2020
else
2121
meson ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
2222
fi
2323

2424
pushd ${BUILDDIR}
2525

26+
NINJA=$(awk '/ninja/ {ninja=$4} END {print ninja}' meson-logs/meson-log.txt)
27+
2628
if [ -n "${INSTALL_PREFIX}" ]
2729
then
28-
ninja install
30+
${NINJA} install
2931
else
30-
ninja
32+
${NINJA}
3133
fi
3234

3335
popd

meson.build

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ endif
2626
if cc.get_id() == 'clang' or cc.get_id() == 'gcc'
2727
add_project_arguments('-Wextra', language : 'cpp')
2828
add_project_arguments('-pedantic', language : 'cpp')
29+
add_project_arguments('-ffast-math', language : 'cpp')
2930

3031
if get_option('buildtype') == 'release'
3132
add_project_arguments('-march=native', language : 'cpp')

src/mcts/search.cc

+9-3
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "mcts/node.h"
4040
#include "neural/cache.h"
4141
#include "neural/encoder.h"
42+
#include "utils/fastmath.h"
4243
#include "utils/random.h"
4344

4445
namespace lczero {
@@ -198,7 +199,7 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N) {
198199
const float init = params.GetCpuct();
199200
const float k = params.GetCpuctFactor();
200201
const float base = params.GetCpuctBase();
201-
return init + (k ? k * std::log((N + base) / base) : 0.0f);
202+
return init + (k ? k * FastLog((N + base) / base) : 0.0f);
202203
}
203204
} // namespace
204205

@@ -837,7 +838,9 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
837838
// n_in_flight_ is incremented. If the method returns false, then there is
838839
// a search collision, and this node is already being expanded.
839840
if (!node->TryStartScoreUpdate()) {
840-
IncrementNInFlight(node, search_->root_node_, collision_limit - 1);
841+
if (!is_root_node) {
842+
IncrementNInFlight(node->GetParent(), search_->root_node_, collision_limit - 1);
843+
}
841844
return NodeToProcess::Collision(node, depth, collision_limit);
842845
}
843846
// Either terminal or unexamined leaf node -- the end of this playout.
@@ -1137,7 +1140,10 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
11371140
float p =
11381141
computation_->GetPVal(idx_in_computation, edge.GetMove().as_nn_index());
11391142
if (params_.GetPolicySoftmaxTemp() != 1.0f) {
1140-
p = pow(p, 1 / params_.GetPolicySoftmaxTemp());
1143+
// Flush denormals to zero.
1144+
p = p < 1.17549435E-38
1145+
? 0.0
1146+
: FastPow2(FastLog2(p) / params_.GetPolicySoftmaxTemp());
11411147
}
11421148
edge.edge()->SetP(p);
11431149
// Edge::SetP does some rounding, so only add to the total after rounding.

src/neural/cuda/layers.cc

-2
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,6 @@ void SELayer<float>::LoadWeights(float* w1, float* b1, float* w2, float* b2,
297297
size_t num_weights1 = C * numFc1Out_;
298298
size_t weight_size1 = sizeof(float) * num_weights1;
299299

300-
size_t num_weights2 = 2 * num_weights1;
301300
size_t weight_size2 = 2 * weight_size1;
302301

303302
// Weight for the first FC layer.
@@ -385,7 +384,6 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
385384
const float* /*input2*/, void* scratch,
386385
size_t scratch_size, cudnnHandle_t /*cudnn*/,
387386
cublasHandle_t cublas) {
388-
assert(output == input2);
389387
// Ping-pong between 'op1' and 'op2' (parts of scratch memory).
390388
float* op1 = (float*)scratch;
391389
float* op2 = (float*)scratch + scratch_size / sizeof(float) / 2;

src/neural/loader.cc

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ std::string DecompressGzip(const std::string& filename) {
6060
if (!file) throw Exception("Cannot read weights from " + filename);
6161
while (true) {
6262
int sz = gzread(file, &buffer[bytes_read], buffer.size() - bytes_read);
63+
if (sz < 0) {
64+
int errnum;
65+
throw Exception(gzerror(file, &errnum));
66+
}
6367
if (sz == static_cast<int>(buffer.size()) - bytes_read) {
6468
bytes_read = buffer.size();
6569
buffer.resize(buffer.size() * 2);

src/neural/opencl/OpenCLTuner.cc

+4-6
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ static void sgemmBatched_ref(const std::vector<float>& a,
4444
auto offset_v = batch * n * k;
4545
auto offset_m = batch * m * n;
4646

47-
// cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0f,
48-
// &a[offset_u], m, &b[offset_v], n, 0.0f, &c[offset_m], n);
4947
// Calculates C = transpose(transpose(A) * B) in row major, or
5048
// C = A * transpose(B) in column major.
5149
for (auto i = 0; i < m; i++) {
@@ -169,16 +167,16 @@ static float compare_ref(std::vector<float>& x, std::vector<float>& ref,
169167
const int m_ceil, const int n_ceil) {
170168
auto sum = 0.0f;
171169
for (auto batch = 0; batch < batch_size; batch++) {
172-
for (auto i = 0; i < n; i++) {
173-
for (auto j = 0; j < m; j++) {
174-
auto r = ref[batch * n * m + i * m + j];
170+
for (auto j = 0; j < m; j++) {
171+
for (auto i = 0; i < n; i++) {
172+
auto r = ref[batch * n * m + j * n + i];
175173
auto y = x[batch * n_ceil * m_ceil + j * n_ceil + i];
176174

177175
sum += (r - y) * (r - y);
178176
}
179177
}
180178
}
181-
return sum / (m * n);
179+
return sum / (m * n * batch_size);
182180
}
183181

184182
std::string Tuner::tune_sgemm(const int m, const int n, const int k,

src/utils/fastmath.h

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
This file is part of Leela Chess Zero.
3+
Copyright (C) 2018 The LCZero Authors
4+
5+
Leela Chess is free software: you can redistribute it and/or modify
6+
it under the terms of the GNU General Public License as published by
7+
the Free Software Foundation, either version 3 of the License, or
8+
(at your option) any later version.
9+
10+
Leela Chess is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
GNU General Public License for more details.
14+
15+
You should have received a copy of the GNU General Public License
16+
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
17+
18+
Additional permission under GNU GPL version 3 section 7
19+
20+
If you modify this Program, or any covered work, by linking or
21+
combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
22+
Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
23+
modified version of those libraries), containing parts covered by the
24+
terms of the respective license agreement, the licensors of this
25+
Program grant you additional permission to convey the resulting work.
26+
*/
27+
28+
#pragma once
29+
30+
#include <cmath>
#include <cstdint>
#include <cstring>
31+
32+
namespace lczero {
// These stunts are performed by trained professionals, do not try this at home.

// The bit manipulation below reinterprets a float as a 32-bit integer with an
// 8-bit exponent field starting at bit 23 and a 23-bit mantissa.
static_assert(sizeof(float) == sizeof(int32_t),
              "fastmath requires a 32-bit float");

// Fast approximate log2(x). Does no range checking.
// The approximation used here is log2(2^N*(1+f)) ~ N+f*(1.342671-0.342671*f)
// where N is the integer and f the fractional part, f>=0.
//
// The input is expected to be a positive, normalized float: the sign bit is
// not masked out of the extracted exponent, and the mantissa is always
// treated as having an implicit leading one.
inline float FastLog2(const float a) {
  int32_t tmp;
  std::memcpy(&tmp, &a, sizeof(float));
  // Biased exponent (N + 127) for positive inputs.
  const int expb = tmp >> 23;
  // Keep the mantissa bits and force the exponent to 0x7f, so that `out`
  // becomes 1+f, a value in [1, 2).
  tmp = (tmp & 0x7fffff) | (0x7f << 23);
  float out;
  std::memcpy(&out, &tmp, sizeof(float));
  // The polynomial is written in terms of 1+f; the constant 128.68534 folds
  // together the exponent bias (127) and the polynomial's constant term.
  return out * (2.028011f - 0.342671f * out) - 128.68534f + expb;
}

// Fast approximate 2^x. Does only limited range checking.
// The approximation used here is 2^(N+f) ~ 2^N*(1+f*(0.656366+0.343634*f))
// where N is the integer and f the fractional part, f>=0.
//
// Inputs below -126 return 0. There is no check on the upper end: the caller
// must keep the argument small enough that the result is a finite float.
inline float FastPow2(const float a) {
  if (a < -126) return 0.0f;
  const int exp = static_cast<int>(std::floor(a));
  float out = a - exp;
  out = 1.0f + out * (0.656366f + 0.343634f * out);
  // Add N to the exponent field. The addition is done on an unsigned bit
  // pattern because left-shifting a negative signed integer is undefined
  // behaviour before C++20; unsigned arithmetic wraps and yields the same
  // two's complement bit pattern.
  uint32_t bits;
  std::memcpy(&bits, &out, sizeof(float));
  bits += static_cast<uint32_t>(exp) << 23;
  std::memcpy(&out, &bits, sizeof(float));
  return out;
}

// Fast approximate ln(x). Does no range checking.
// Computed as ln(2) * FastLog2(x), so the same input restrictions apply.
inline float FastLog(const float a) {
  return 0.6931471805599453f * FastLog2(a);
}

}  // namespace lczero

0 commit comments

Comments
 (0)