diff --git a/.ci/sonar-project.properties b/.ci/sonar-project.properties
index d2ea04418..19d3c886c 100644
--- a/.ci/sonar-project.properties
+++ b/.ci/sonar-project.properties
@@ -18,7 +18,7 @@ sonar.lang.patterns.cpp=**/*.cpp,**/*.cc,**/*.cxx,**/*.c++,**/*.h,**/*.hpp,**/*.
 sonar.working.directory=build/sonar-workdir
 sonar.cfamily.build-wrapper-output=build/bw-output
 
-# it expects cppunit xml format. googletest format is uncompatible.
+# it expects cppunit xml format. googletest format is incompatible.
 # sonar.cfamily.cppunit.reportsPath=build/unittest-reports
 
 sonar.cfamily.gcov.reportsPath=build
diff --git a/.clang-tidy b/.clang-tidy
index 9682bbabf..ce40e8080 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -22,7 +22,7 @@ CheckOptions:
   - key:             readability-function-size.ParameterThreshold
     value:           '6'
   - key:             readability-function-size.NestingThreshold
-    value:           '6'
+    value:           '7'
   - key:             readability-function-size.VariableThreshold
     value:           '30'
   - key:             readability-simplify-boolean-expr.ChainedConditionalAssignment
@@ -48,5 +48,5 @@ CheckOptions:
   - key:             modernize-use-default-member-init.UseAssignment
     value:           1
   - key:             cppcoreguidelines-macro-usage.AllowedRegexp
-    value:           'DEBUG|_GLIBCXX_SANITIZE_VECTOR|RAWSPEED_SOURCE_DIR|STR|XSTR|BUFFER_PADDING|BSWAP16|BSWAP32|BSWAP64|ThrowExceptionHelper|ThrowIOE|ThrowRSE|ThrowCME|ThrowRDE|ThrowRPE|ThrowTPE|ThrowFIE|ThrowCPE|ThrowFPE|DECODER|fuji_quant_gradient|JPEG_MEMSRC|RLVTABLE|PRECISION_MIN|PRECISION_MAX|MARKER_BAND_END|SQR|RS_CAMERAS_XML_PATH|FULLDECODE|IMPL|IMPL0|IMPL1|PUMP|DECODE|PARSER|GEN_E|GEN_PFS|GEN_PSS|BENCHMARK_CAPTURE_NAME|OMPFIRSTPRIVATECLAUSE|OMPSHAREDCLAUSE'
+    value:           'DEBUG|_GLIBCXX_SANITIZE_VECTOR|RAWSPEED_SOURCE_DIR|STR|XSTR|BSWAP16|BSWAP32|BSWAP64|ThrowExceptionHelper|ThrowIOE|ThrowRSE|ThrowCME|ThrowRDE|ThrowRPE|ThrowTPE|ThrowFIE|ThrowCPE|ThrowFPE|DECODER|fuji_quant_gradient|JPEG_MEMSRC|RLVTABLE|PRECISION_MIN|PRECISION_MAX|MARKER_BAND_END|SQR|RS_CAMERAS_XML_PATH|FULLDECODE|IMPL|IMPL0|IMPL1|PUMP|DECODE|PARSER|GEN_E|GEN_PFS|GEN_PSS|BENCHMARK_CAPTURE_NAME|OMPFIRSTPRIVATECLAUSE|OMPSHAREDCLAUSE|RAWSPEED_UNLIKELY_FUNCTION|RAWSPEED_NOINLINE'
 ...
diff --git a/.travis.yml b/.travis.yml
index f327193f2..15dbfa4b5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -27,7 +27,14 @@ install:
       echo "oracle-java9-installer hold" | sudo dpkg --set-selections;
       travis_retry sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y;
       travis_retry sudo apt-get update -q;
-      travis_retry sudo apt-get install -y -q -f --fix-missing clang-3.5 clang-3.9 cmake g++-5 git libiomp-dev libjpeg-dev libpugixml-dev libxml2-utils make ninja-build zlib1g-dev;
+      travis_retry sudo apt-get install -y -q -f --fix-missing clang-3.9 cmake g++-5 git libiomp-dev libjpeg-dev libpugixml-dev libxml2-utils make ninja-build zlib1g-dev;
+    fi;
+  - if [[ "$TRAVIS_OS_NAME" == "linux" && "$EXTRA" == "NODOCKER"  && "$TRAVIS_DIST" == "xenial" ]]; then
+      echo "oracle-java8-installer hold" | sudo dpkg --set-selections;
+      echo "oracle-java9-installer hold" | sudo dpkg --set-selections;
+      travis_retry sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y;
+      travis_retry sudo apt-get update -q;
+      travis_retry sudo apt-get install -y -q -f --fix-missing clang-3.5;
     fi;
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
       travis_retry brew update > /dev/null && brew tap Homebrew/bundle && cd .ci && brew bundle --verbose;
@@ -62,26 +69,26 @@ jobs:
   include:
   - stage: test
     os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
     env: CC=gcc-9 CXX=g++-9
   - stage: test
     os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
     env: CC=gcc-9 CXX=g++-9 ECO="-DWITH_OPENMP=OFF"
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
     env: CC=clang-9 CXX=clang++-9
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
@@ -95,40 +102,40 @@ jobs:
   #- os: osx
   #  env: CC=cc CXX=c++ FLAVOR=Coverage
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
     # Don't forget to ensure that FindLLVMClangTidy.cmake is also bumped
     env: CC=clang-9 CXX=clang++-9 TARGET=STATICANALYSIS ECO="-DUSE_CLANG_TIDY=ON"
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
     env: CC=clang-9 CXX=clang++-9 ECO="-DWITH_OPENMP=OFF"
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     env: CC=gcc-5 CXX=g++-5 EXTRA=NODOCKER
   - os: linux
-    dist: trusty
+    dist: xenial
     sudo: required
-    env: CC=clang-3.5 CXX=clang++-3.5 EXTRA=NODOCKER ECO="-DWITH_OPENMP=OFF"
+    env: CC=clang-3.5 CXX=clang++-3.5 EXTRA=NODOCKER ECO="-DWITH_OPENMP=OFF -DWITH_ZLIB=OFF"
   - os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     env: CC=clang-3.9 CXX=clang++-3.9 EXTRA=NODOCKER
   #- stage: deploy
   #  os: linux
   #  sudo: required
-  #  dist: xenial
+  #  dist: bionic
   #  edge: true
   #  services:
   #    - docker
   - stage: deploy
     os: linux
-    dist: trusty
+    dist: bionic
     sudo: required
     services:
       - docker
diff --git a/README.rst b/README.rst
index c15f58100..302ac72f3 100644
--- a/README.rst
+++ b/README.rst
@@ -17,6 +17,7 @@ rawspeed |travis-ci| |appveyor-ci| OBS_ |codecov| |oss-fuzz|
 ================================================================================
 RawSpeed Developer Information
 ================================================================================
+
 What is RawSpeed?
 --------------------------------------------------------------------------------
 
diff --git a/bench/librawspeed/bench/Common.h b/bench/librawspeed/bench/Common.h
index 7b4f99282..e4819b89b 100644
--- a/bench/librawspeed/bench/Common.h
+++ b/bench/librawspeed/bench/Common.h
@@ -18,6 +18,8 @@
     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
+#pragma once
+
 #include "common/Point.h" // for iPoint2D
 #include <cstddef>        // for size_t
 
diff --git a/bench/librawspeed/io/BitStreamBenchmark.cpp b/bench/librawspeed/io/BitStreamBenchmark.cpp
index 4919cd5b5..03f5e6dd6 100644
--- a/bench/librawspeed/io/BitStreamBenchmark.cpp
+++ b/bench/librawspeed/io/BitStreamBenchmark.cpp
@@ -61,7 +61,8 @@ static inline void BM_BitStream(benchmark::State& state, Endianness endianness,
   assert((Step == 1) || rawspeed::isAligned(Step, 2));
   assert((fillSize == 1) || rawspeed::isAligned(fillSize, 2));
 
-  const rawspeed::Buffer b(state.range(0));
+  auto storage = rawspeed::Buffer::Create(state.range(0));
+  const rawspeed::Buffer b(storage.get(), state.range(0));
   assert(b.getSize() > 0);
   assert(b.getSize() == (size_t)state.range(0));
 
diff --git a/cmake/src-dependencies.cmake b/cmake/src-dependencies.cmake
index 8e1f97ee3..52e716cee 100644
--- a/cmake/src-dependencies.cmake
+++ b/cmake/src-dependencies.cmake
@@ -43,21 +43,11 @@ if(WITH_OPENMP)
     message(STATUS "Looking for OpenMP - found (system)")
   endif()
 
-  # FIXME: OpenMP::OpenMP_CXX target, and ${OpenMP_CXX_LIBRARIES} were both
-  # added in cmake-3.9. Until then, this is correct:
-  if(NOT TARGET OpenMP::OpenMP_CXX)
-    add_library(OpenMP::OpenMP_CXX INTERFACE IMPORTED)
-    if(OpenMP_CXX_FLAGS)
-      set_property(TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS})
-      set_property(TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS})
-      # Yes, both of them to the same value.
-    endif()
-  endif()
-
   # The wrapper library that *actually* should be linked to.
   add_library(RawSpeed::OpenMP_CXX INTERFACE IMPORTED)
   set_property(TARGET RawSpeed::OpenMP_CXX        PROPERTY INTERFACE_COMPILE_OPTIONS $<TARGET_PROPERTY:OpenMP::OpenMP_CXX,INTERFACE_COMPILE_OPTIONS>)
   set_property(TARGET RawSpeed::OpenMP_CXX APPEND PROPERTY INTERFACE_COMPILE_OPTIONS ${OPENMP_VERSION_SPECIFIER})
+  set_property(TARGET RawSpeed::OpenMP_CXX        PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:OpenMP::OpenMP_CXX,INTERFACE_INCLUDE_DIRECTORIES>)
   if(NOT USE_BUNDLED_LLVMOPENMP)
     set_property(TARGET RawSpeed::OpenMP_CXX      PROPERTY INTERFACE_LINK_LIBRARIES  $<TARGET_PROPERTY:OpenMP::OpenMP_CXX,INTERFACE_LINK_LIBRARIES>)
   else()
@@ -145,7 +135,7 @@ add_feature_info("XML reading" HAVE_PUGIXML "used for loading of data/cameras.xm
 unset(HAVE_JPEG)
 if(WITH_JPEG)
   message(STATUS "Looking for JPEG")
-  find_package(JPEG 1.5.0) # libjpeg-turbo
+  find_package(JPEG)
   if(NOT JPEG_FOUND)
     message(SEND_ERROR "Did not find JPEG! Either make it find JPEG, or pass -DWITH_JPEG=OFF to disable JPEG.")
   else()
diff --git a/data/cameras.xml b/data/cameras.xml
index bd040d63a..2b8ad46ec 100644
--- a/data/cameras.xml
+++ b/data/cameras.xml
@@ -4275,6 +4275,17 @@
 		<Sensor black="220" white="4000" iso_min="12800" iso_max="25599"/>
 		<Sensor black="187" white="4000" iso_min="25600" iso_max="25600"/>
 	</Camera>
+	<Camera make="OLYMPUS CORPORATION" model="E-M5MarkIII">
+		<ID make="Olympus" model="E-M5 Mark III">Olympus E-M5 Mark III</ID>
+		<CFA width="2" height="2">
+			<Color x="0" y="0">RED</Color>
+			<Color x="1" y="0">GREEN</Color>
+			<Color x="0" y="1">GREEN</Color>
+			<Color x="1" y="1">BLUE</Color>
+		</CFA>
+		<Crop x="0" y="0" width="0" height="0"/>
+		<Sensor black="257" white="4000"/>
+	</Camera>
 	<Camera make="OLYMPUS IMAGING CORP." model="E-P1">
 		<ID make="Olympus" model="E-P1">Olympus E-P1</ID>
 		<Crop x="0" y="0" width="4094" height="3082"/>
@@ -8371,6 +8382,17 @@
 		<Crop x="0" y="0" width="-12" height="0"/>
 		<Sensor black="800" white="16300"/>
 	</Camera>
+	<Camera make="SONY" model="DSC-RX100M7">
+		<ID make="Sony" model="DSC-RX100M7">Sony DSC-RX100M7</ID>
+		<CFA width="2" height="2">
+			<Color x="0" y="0">RED</Color>
+			<Color x="1" y="0">GREEN</Color>
+			<Color x="0" y="1">GREEN</Color>
+			<Color x="1" y="1">BLUE</Color>
+		</CFA>
+		<Crop x="0" y="0" width="-12" height="0"/>
+		<Sensor black="800" white="16300"/>
+	</Camera>
 	<Camera make="SONY" model="DSC-RX1R">
 		<ID make="Sony" model="DSC-RX1R">Sony DSC-RX1R</ID>
 		<CFA width="2" height="2">
@@ -8783,6 +8805,17 @@
 		<Crop x="0" y="0" width="-28" height="0"/>
 		<Sensor black="512" white="16300"/>
 	</Camera>
+	<Camera make="SONY" model="ILCE-6600">
+		<ID make="Sony" model="ILCE-6600">Sony ILCE-6600</ID>
+		<CFA width="2" height="2">
+			<Color x="0" y="0">RED</Color>
+			<Color x="1" y="0">GREEN</Color>
+			<Color x="0" y="1">GREEN</Color>
+			<Color x="1" y="1">BLUE</Color>
+		</CFA>
+		<Crop x="0" y="0" width="-28" height="0"/>
+		<Sensor black="512" white="16383"/>
+	</Camera>
 	<Camera make="SONY" model="ILCE-7">
 		<ID make="Sony" model="ILCE-7">Sony ILCE-7</ID>
 		<CFA width="2" height="2">
@@ -9701,6 +9734,20 @@
 			<Hint name="jpeg32_bitorder" value=""/>
 		</Hints>
 	</Camera>
+	<Camera make="FUJIFILM" model="X-A10">
+		<ID make="Fujifilm" model="X-A10">Fujifilm X-A10</ID>
+		<CFA width="2" height="2">
+			<Color x="0" y="0">RED</Color>
+			<Color x="1" y="0">GREEN</Color>
+			<Color x="0" y="1">GREEN</Color>
+			<Color x="1" y="1">BLUE</Color>
+		</CFA>
+		<Crop x="0" y="0" width="0" height="0"/>
+		<Sensor black="256" white="4094"/>
+		<Hints>
+			<Hint name="jpeg32_bitorder" value=""/>
+		</Hints>
+	</Camera>
 	<Camera make="FUJIFILM" model="XQ1">
 		<ID make="Fujifilm" model="XQ1">Fujifilm XQ1</ID>
 		<CFA2 width="6" height="6">
diff --git a/docs/IntegrationTesting.rst b/docs/IntegrationTesting.rst
new file mode 100644
index 000000000..9df5ee639
--- /dev/null
+++ b/docs/IntegrationTesting.rst
@@ -0,0 +1,163 @@
+.. _integration_testing:
+
+================================================================================
+Integration Testing
+================================================================================
+
+.. seealso::
+
+   :ref:`RSA`
+
+As a first step, you *need* to acquire the sample archive you will want to use,
+see e.g. :ref:`rpu_rsync`.
+
+Due to the specifics of the domain, just having the samples you want to use
+for integration testing is not sufficient. Given *just* the samples, it is not
+possible to verify anything in an automatic manner.
+
+You can, of course, load the samples into some software that uses the
+`RawSpeed <rawspeed_>`_ library, for example into darktable_, and see that they
+decoded into some meaningful image, but that is indirect and tests much more
+than just the library.
+
+.. _rawspeed: https://github.com/darktable-org/rawspeed
+.. _darktable: https://github.com/darktable-org/darktable
+
+So instead, we want to document (record; hereafter called `a hash`) how the
+samples decode 'currently' (in a trusted, known-good hardware/software/compiler
+stack/compilation options etc), store this per-image info, and then just check
+against it afterwards (after modifying the library, or anything else really).
+
+.. _producing_trusted_reference_hashes:
+
+Producing Trusted Reference Hashes
+----------------------------------
+
+Optionally, it may or may not be a good idea to first manually inspect the
+samples (via e.g. darktable_), make a note which are seemingly currently decoded
+correctly, and which are not.
+
+For best results the Trusted Hashes should be produced in the most mundane
+environment - stable mainstream hardware (little-endian, x86; no overclocking),
+stable software stack, and most importantly a trusted compiler. You also
+shouldn't use ``-Oomg-optimize -fmoar-performance`` compilation flags for this.
+
+.. WARNING::
+   Trusted baseline hashes are the very foundation for any further integration
+   testing. It is always important to have a good, stable foundation. It will not
+   be productive if those hashes are produced incorrectly, be it either because
+   the hardware is faulty (RAM/disk bit flips), or the library was miscompiled.
+
+Other than that, generating said hashes is pretty trivial.
+
+Specifying location of Reference Sample Archive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to make use of build system integration of integration testing,
+we must first tell it where the :ref:`sample set<sampleset>` is located,
+for example:
+
+::
+
+  $ cmake -DRAWSPEED_ENABLE_SAMPLE_BASED_TESTING:BOOL=ON \
+          -DRAWSPEED_REFERENCE_SAMPLE_ARCHIVE:PATH="~/raw-camera-samples/raw.pixls.us-unique/" \
+          <path to rawspeed repo checkout>
+
+.. NOTE::
+
+  The location of the samples must be writable if you intend to produce hashes.
+
+Other required CMake flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We also need to build the code that is actually responsible for these
+integration tests:
+
+::
+
+  $ cmake -DBUILD_TOOLS:BOOL=ON \
+          <path to rawspeed repo checkout>
+
+Producing hashes
+~~~~~~~~~~~~~~~~
+
+After that is done, we can finally create the hashes, and for that there is
+a ``rstest-create`` build target:
+
+::
+
+  $ cmake --build . -- rstest-clean # get rid of any pre-existing hashes, just in case.
+  [1/1] Running utility command for rstest-clean
+  $ cmake --build . -- rstest-create
+  <maybe actually building library's sources and rstest if you didn't build it yet>
+  [0/1] Running utility command for rstest-create
+  <full path to a sample>: starting decoding ...
+  <full path to a sample>:  <> MB / <> ms
+
+  Total decoding time: <>s
+
+  All good, all hashes created!
+
+And that's it, we've got the hashes! They were placed next to the samples in
+the archive, with ``.hash`` suffix appended. Maybe you want to use some kind of
+layered file system (overlayfs_ e.g.) to separate those from the actual samples,
+up to you.
+
+.. _overlayfs: https://www.kernel.org/doc/Documentation/filesystems/overlayfs.txt
+
+Performing Integration Testing
+------------------------------
+
+.. IMPORTANT::
+
+  Do ensure that the library is actually re-compiled with the changes you want
+  to test. To err on the safe side, sometimes it is useful to remove the entire
+  build directory and make a fresh build!
+
+After you have performed the changes you wanted to - modified the library,
+or changed hardware/software/compiler/compiler flags - and you want to validate
+that those changes did not cause any regressions in the sample set, it is time
+to actually make use of the Trusted Reference Hashes that we have created
+previously.
+
+For that, there is a ``rstest-test`` build target.
+If everything is good you may see:
+
+::
+
+  $ cmake --build . -- rstest-test
+  [0/1] Running utility command for rstest-test
+  <full path to a sample>: starting decoding ...
+  <full path to a sample>:  <> MB / <> ms
+  Total decoding time: <>s
+
+  All good, no tests failed!
+
+Or, if there are issues, you may see:
+
+::
+
+  $ cmake --build . -- rstest-test
+  [0/1] Running utility command for rstest-test
+  <full path to a sample>: starting decoding ...
+  <full path to a sample>:  <> MB / <> ms
+  <full path to a sample> failed: hash/metadata mismatch
+  Total decoding time: <>s
+
+  WARNING: the following <> tests have failed:
+  <full path to a sample> failed: hash/metadata mismatch
+  See rstest.log for details.
+  <...>
+  ninja: build stopped: subcommand failed.
+
+Unless the process crashed, it should have created
+``<full path to a sample>.hash.failed``, and outputted the diff_ between
+the existing ``<full path to a sample>.hash`` Trusted Hash and the actual result
+``<full path to a sample>.hash.failed`` into ``rstest.log`` file in root of the
+build dir.
+
+.. _diff: https://manpages.debian.org/unstable/diffutils/diff.1.en.html
+
+.. seealso::
+
+   :ref:`lnt`
diff --git a/docs/ReferenceSampleArchive.rst b/docs/ReferenceSampleArchive.rst
new file mode 100644
index 000000000..cbf471628
--- /dev/null
+++ b/docs/ReferenceSampleArchive.rst
@@ -0,0 +1,97 @@
+.. _RSA:
+
+================================================================================
+Reference Sample Archive
+================================================================================
+
+While there is some test coverage via unit tests, the major bulk of testing
+is achieved via integration tests over some sample set.
+
+.. _sampleset:
+
+What is considered a sample set
+-------------------------------
+
+Here and onwards, a sample set is just a directory with samples, and two special
+files. There should be a ``timestamp.txt`` containing a
+`Unix time <Unix_time_>`_ (presumably, of when the set was last updated).
+Most importantly, it **must** also contain a ``filelist.sha1`` file in the
+top-level directory, which is used as a digest to the contents of said sample
+set. Said file **must** be a valid sha1sum_ output, with format:
+
+::
+
+  <40-char SHA1><space><asterisk><filename>
+
+.. _Unix_time: https://en.wikipedia.org/wiki/Unix_time
+
+.. _sha1sum: https://manpages.debian.org/unstable/coreutils/sha1sum.1.en.html
+
+Canonical Sample Set
+--------------------
+
+The canonical raw sample data set is `raw.pixls.us <RPU_>`_.
+It is freely licensed - all new samples are in Public Domain under
+`CC0 1.0 <CC0_>`_ license (85+% of samples and counting),
+however some older samples are still under more restrictive
+`CC BY-NC-SA 4.0 <BYNCSA40_>`_ license.
+
+**Please read** `this <rpu-post_>`_ **for more info on how to contribute samples!**
+
+.. _RPU: https://raw.pixls.us/
+
+.. _CC0: https://creativecommons.org/publicdomain/zero/1.0/
+
+.. _BYNCSA40: http://creativecommons.org/licenses/by-nc-sa/4.0/
+
+.. _rpu-post: https://discuss.pixls.us/t/raw-samples-wanted/5420?u=lebedevri
+
+Full sample set
+~~~~~~~~~~~~~~~
+
+The complete set includes every sample available, and thus has as good
+coverage as we can get, but as a downside it is *quite* bulky - |rpu-button-size|
+total, spanning |rpu-button-samples|.
+
+.. |rpu-button-cameras| image:: https://raw.pixls.us/button-cameras.svg
+    :target: https://raw.pixls.us/
+
+.. |rpu-button-samples| image:: https://raw.pixls.us/button-samples.svg
+    :target: https://raw.pixls.us/
+
+It is accessible at: https://raw.pixls.us/data/
+
+Masterset
+~~~~~~~~~
+
+But there is also a masterset, with just a handful of hand-picked samples that
+provide reasonable-ish coverage while spanning only ~ :math:`1/22`'th of the
+disk footprint and ~ :math:`1/44`'th of the sample count of the full set.
+
+.. CAUTION::
+   Unless you want to perform rigorous regression testing,
+   the masterset is strongly recommended!
+
+.. TIP::
+   Masterset **only** contains samples that are in `public domain <CC0_>`_.
+
+It is accessible at: https://raw.pixls.us/data-unique/
+
+.. |rpu-button-size| image:: https://raw.pixls.us/button-size.svg
+
+.. _rpu_rsync:
+
+Acquiring Canonical Sample Set
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pick which sample set you will want to acquire. Be wary of disk footprint!
+Probably the easiest way to fetch it is via rsync_, for example:
+
+::
+
+   $ rsync -vvrLtW --preallocate --delete --compress --compress-level=1 --progress \
+           rsync://raw.pixls.us/data-unique/ ~/raw-camera-samples/raw.pixls.us-unique/
+   $ # it might be a good idea to verify consistency afterwards:
+   $ sha1sum -c --strict ~/raw-camera-samples/raw.pixls.us-unique/filelist.sha1
+
+.. _rsync: https://manpages.debian.org/unstable/rsync/rsync.1.en.html
diff --git a/docs/index.rst b/docs/index.rst
index a28c8181d..3fd5c84cd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,6 +12,8 @@ Welcome to RawSpeed's documentation!
 
    self
    CameraSupport
+   ReferenceSampleArchive
+   IntegrationTesting
    Doxygen
    lnt/index.rst
 
diff --git a/fuzz/librawspeed/decompressors/HuffmanTable/Common.h b/fuzz/librawspeed/decompressors/HuffmanTable/Common.h
index ecbe44597..f3fef06cb 100644
--- a/fuzz/librawspeed/decompressors/HuffmanTable/Common.h
+++ b/fuzz/librawspeed/decompressors/HuffmanTable/Common.h
@@ -18,6 +18,8 @@
     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
+#pragma once
+
 #include "io/Buffer.h"     // for Buffer
 #include "io/ByteStream.h" // for ByteStream
 
diff --git a/fuzz/librawspeed/fuzz/Common.h b/fuzz/librawspeed/fuzz/Common.h
index e9d3ccdd5..c6d081faf 100644
--- a/fuzz/librawspeed/fuzz/Common.h
+++ b/fuzz/librawspeed/fuzz/Common.h
@@ -18,6 +18,8 @@
     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
+#pragma once
+
 #include "common/RawImage.h"           // for RawImage
 #include "metadata/ColorFilterArray.h" // for ColorFilterArray
 
diff --git a/lnt/README.rst b/lnt/README.rst
index eae01bfb7..12349587f 100644
--- a/lnt/README.rst
+++ b/lnt/README.rst
@@ -1,5 +1,7 @@
 .. _my-label: lnt
 
+.. _lnt:
+
 =================================
 LLVM LNT / Test-Suite Integration
 =================================
@@ -15,15 +17,16 @@ Prerequisites
   * all of the normal prerequisites for building **development** version of RawSpeed.
   * python's `virtualenv <https://packages.debian.org/unstable/virtualenv>`_
   * `llvm-size`, `llvm-lit <https://llvm.org/docs/CommandGuide/lit.html>`_ (from
-    `llvm <https://packages.debian.org/unstable/llvm-8>`_,
-    `llvm-tools <https://packages.debian.org/unstable/llvm-8-tools>`_ packages)
-  * A checkout of raw sample archive of you choice.
-
-    It is suggested to use `https://raw.pixls.us <https://raw.pixls.us>`_
-    masterset (see CI scripts for how to get it)
+    `llvm <https://packages.debian.org/unstable/llvm-9>`_,
+    `llvm-tools <https://packages.debian.org/unstable/llvm-9-tools>`_ packages)
+  * A checkout of raw sample archive of your choice. It is suggested to use
+    `https://raw.pixls.us <https://raw.pixls.us>`_ masterset.
+    Please see :ref:`RSA` page for details.
   * Reference hashes for the raws in the sampleset.
 
     Generate them via ``$ ninja rstest-create`` from your **trusted** (!) dev build.
+    Please see :ref:`integration_testing` and
+    :ref:`producing_trusted_reference_hashes` pages for details.
 
 Getting it done
 ---------------
@@ -55,8 +58,9 @@ Getting it done
   # View results.
   $SANDBOX/bin/lnt runserver $PERFDB
 
-See also
---------
+
+.. seealso::
+
   * https://llvm.org/docs/TestSuiteGuide.html#common-configuration-options
   * ``$ $SANDBOX/bin/lnt runtest test_suite --help``
   * ``$ $SANDBOX/bin/lnt --help``
diff --git a/src/config.h.in b/src/config.h.in
index cfa6f8c66..870704b71 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -86,3 +86,6 @@
 #ifndef __has_extension
 #define __has_extension __has_feature // Compatibility with pre-3.0 compilers.
 #endif
+
+#define RAWSPEED_UNLIKELY_FUNCTION __attribute__((cold))
+#define RAWSPEED_NOINLINE __attribute__((noinline))
diff --git a/src/librawspeed/common/Array2DRef.h b/src/librawspeed/common/Array2DRef.h
index ec3a6e16f..a67f68e1a 100644
--- a/src/librawspeed/common/Array2DRef.h
+++ b/src/librawspeed/common/Array2DRef.h
@@ -33,6 +33,8 @@ template <class T> class Array2DRef {
 
   friend Array2DRef<const T>; // We need to be able to convert to const version.
 
+  inline T& operator[](int row) const;
+
 public:
   using value_type = T;
   using cvless_value_type = typename std::remove_cv<value_type>::type;
@@ -62,7 +64,7 @@ template <class T> class Array2DRef {
     return {storage->data(), width, height};
   }
 
-  inline T& operator()(int x, int y) const;
+  inline T& operator()(int row, int col) const;
 };
 
 template <class T>
@@ -74,14 +76,18 @@ Array2DRef<T>::Array2DRef(T* data, const int dataWidth, const int dataHeight,
   _pitch = (dataPitch == 0 ? dataWidth : dataPitch);
 }
 
-template <class T>
-T& Array2DRef<T>::operator()(const int x, const int y) const {
+template <class T> T& Array2DRef<T>::operator[](const int row) const {
   assert(_data);
-  assert(x >= 0);
-  assert(y >= 0);
-  assert(x < width);
-  assert(y < height);
-  return _data[y * _pitch + x];
+  assert(row >= 0);
+  assert(row < height);
+  return _data[row * _pitch];
+}
+
+template <class T>
+T& Array2DRef<T>::operator()(const int row, const int col) const {
+  assert(col >= 0);
+  assert(col < width);
+  return (&(operator[](row)))[col];
 }
 
 } // namespace rawspeed
diff --git a/src/librawspeed/common/CMakeLists.txt b/src/librawspeed/common/CMakeLists.txt
index cab66c1ce..19e5b44a3 100644
--- a/src/librawspeed/common/CMakeLists.txt
+++ b/src/librawspeed/common/CMakeLists.txt
@@ -50,3 +50,7 @@ target_sources(rawspeed_get_number_of_processor_cores PRIVATE
 )
 
 target_link_libraries(rawspeed_get_number_of_processor_cores PRIVATE rawspeed)
+
+if(TARGET RawSpeed::OpenMP_CXX)
+  target_link_libraries(rawspeed_get_number_of_processor_cores PRIVATE RawSpeed::OpenMP_CXX)
+endif()
diff --git a/src/librawspeed/common/Common.h b/src/librawspeed/common/Common.h
index 413dbc94a..430ddb171 100644
--- a/src/librawspeed/common/Common.h
+++ b/src/librawspeed/common/Common.h
@@ -148,6 +148,29 @@ clampBits(T value, unsigned int nBits,
   return clampBits<UnsignedT>(value, nBits);
 }
 
+template <typename T>
+inline constexpr bool __attribute__((const))
+isIntN(T value, unsigned int nBits,
+       typename std::enable_if<std::is_arithmetic<T>::value>::type* /*unused*/ =
+           nullptr) {
+  assert(nBits < CHAR_BIT * sizeof(T) && "Check must not be tautological.");
+  using UnsignedT = typename std::make_unsigned<T>::type;
+  const auto highBits = static_cast<UnsignedT>(value) >> nBits;
+  return highBits == 0;
+}
+
+template <typename T>
+inline constexpr typename std::make_signed<T>::type __attribute__((const))
+signExtend(
+    T value, unsigned int nBits,
+    typename std::enable_if<std::is_unsigned<T>::value>::type* /*unused*/ =
+        nullptr) {
+  assert(nBits != 0 && "Only valid for non-zero bit count.");
+  const T SpareSignBits = CHAR_BIT * sizeof(T) - nBits;
+  using SignedT = typename std::make_signed<T>::type;
+  return static_cast<SignedT>(value << SpareSignBits) >> SpareSignBits;
+}
+
 // Trim both leading and trailing spaces from the string
 inline std::string trimSpaces(const std::string& str)
 {
diff --git a/src/librawspeed/common/Point.h b/src/librawspeed/common/Point.h
index 8af32ef9d..679547ad5 100644
--- a/src/librawspeed/common/Point.h
+++ b/src/librawspeed/common/Point.h
@@ -77,15 +77,10 @@ class iPoint2D {
   area_type __attribute__((pure)) area() const {
     using signed_area = std::make_signed<area_type>::type;
 
-    if (x >= 0 && y >= 0)
-      return static_cast<area_type>(x) * static_cast<area_type>(y);
-    if (x >= 0 && y < 0)
-      return static_cast<area_type>(x) * (-1 * static_cast<signed_area>(y));
-    if (y >= 0 && x < 0)
-      return static_cast<area_type>(y) * (-1 * static_cast<signed_area>(x));
+    area_type x_abs = std::abs(static_cast<signed_area>(x));
+    area_type y_abs = std::abs(static_cast<signed_area>(y));
 
-    assert(x < 0 && y < 0);
-    return static_cast<signed_area>(x) * static_cast<signed_area>(y);
+    return x_abs * y_abs;
   }
 
   constexpr bool isThisInside(const iPoint2D& rhs) const {
@@ -93,7 +88,10 @@ class iPoint2D {
   }
 
   constexpr iPoint2D getSmallest(const iPoint2D& rhs) const {
-    return {x < rhs.x ? x : rhs.x, y < rhs.y ? y : rhs.y};
+    return {
+        std::min(x, rhs.x),
+        std::min(y, rhs.y),
+    };
   }
 
   value_type x = 0;
@@ -160,7 +158,7 @@ class iRectangle2D {
   void setSize(const iPoint2D& size) { dim = size; }
   void setSize(iPoint2D&& size) { dim = size; }
 
-  /* Crop, so area is postitive, and return true, if there is any area left */
+  /* Crop, so area is positive, and return true, if there is any area left */
   /* This will ensure that bottomright is never on the left/top of the offset */
   bool cropArea() {
     dim.x = std::max(0, dim.x);
diff --git a/src/librawspeed/common/RawImage.cpp b/src/librawspeed/common/RawImage.cpp
index 5e87a9215..af80194a2 100644
--- a/src/librawspeed/common/RawImage.cpp
+++ b/src/librawspeed/common/RawImage.cpp
@@ -46,11 +46,11 @@ RawImageData::RawImageData() : cfa(iPoint2D(0, 0)) {
   blackLevelSeparate.fill(-1);
 }
 
-RawImageData::RawImageData(const iPoint2D& _dim, uint32_t _bpc, uint32_t _cpp)
+RawImageData::RawImageData(const iPoint2D& _dim, int _bpc, int _cpp)
     : dim(_dim), isCFA(_cpp == 1), cfa(iPoint2D(0, 0)), cpp(_cpp) {
   assert(_bpc > 0);
 
-  if (cpp > std::numeric_limits<decltype(bpp)>::max() / _bpc)
+  if (cpp > std::numeric_limits<decltype(cpp)>::max() / _bpc)
     ThrowRDE("Components-per-pixel is too large.");
 
   bpp = _bpc * _cpp;
@@ -221,14 +221,14 @@ uint8_t* RawImageData::getData() const {
 }
 
 uint8_t* RawImageData::getData(uint32_t x, uint32_t y) {
+  x += mOffset.x;
+  y += mOffset.y;
+
   if (x >= static_cast<unsigned>(uncropped_dim.x))
     ThrowRDE("X Position outside image requested.");
   if (y >= static_cast<unsigned>(uncropped_dim.y))
     ThrowRDE("Y Position outside image requested.");
 
-  x += mOffset.x;
-  y += mOffset.y;
-
   if (!data)
     ThrowRDE("Data not yet allocated.");
 
@@ -463,7 +463,7 @@ void RawImageData::expandBorder(iRectangle2D validData)
       uint8_t* src_pos = getData(validData.pos.x, y);
       uint8_t* dst_pos = getData(validData.pos.x - 1, y);
       for (int x = validData.pos.x; x >= 0; x--) {
-        for (uint32_t i = 0; i < bpp; i++) {
+        for (int i = 0; i < bpp; i++) {
           dst_pos[i] = src_pos[i];
         }
         dst_pos -= bpp;
@@ -477,7 +477,7 @@ void RawImageData::expandBorder(iRectangle2D validData)
       uint8_t* src_pos = getData(pos - 1, y);
       uint8_t* dst_pos = getData(pos, y);
       for (int x = pos; x < dim.x; x++) {
-        for (uint32_t i = 0; i < bpp; i++) {
+        for (int i = 0; i < bpp; i++) {
           dst_pos[i] = src_pos[i];
         }
         dst_pos += bpp;
diff --git a/src/librawspeed/common/RawImage.h b/src/librawspeed/common/RawImage.h
index 4bebd1b79..98ed2623c 100644
--- a/src/librawspeed/common/RawImage.h
+++ b/src/librawspeed/common/RawImage.h
@@ -21,18 +21,21 @@
 #pragma once
 
 #include "rawspeedconfig.h"
-#include "ThreadSafetyAnalysis.h"      // for GUARDED_BY, REQUIRES
-#include "common/Common.h" // for uint32_t, uint8_t, uint16_t, wri...
-#include "common/ErrorLog.h"           // for ErrorLog
-#include "common/Mutex.h"              // for Mutex
-#include "common/Point.h"              // for iPoint2D, iRectangle2D (ptr o...
-#include "common/TableLookUp.h"        // for TableLookUp
-#include "metadata/BlackArea.h"        // for BlackArea
-#include "metadata/ColorFilterArray.h" // for ColorFilterArray
-#include <array>                       // for array
-#include <memory>                      // for unique_ptr, operator==
-#include <string>                      // for string
-#include <vector>                      // for vector
+#include "ThreadSafetyAnalysis.h" // for GUARDED_BY, REQUIRES
+#include "common/Array2DRef.h"    // for Array2DRef
+#include "common/Common.h"        // for uint32_t, uint8_t, uint16_t, wri...
+#include "common/ErrorLog.h"      // for ErrorLog
+#include "common/Mutex.h"         // for Mutex
+#include "common/Point.h"         // for iPoint2D, iRectangle2D (ptr o...
+#include "common/TableLookUp.h"   // for TableLookUp
+#include "decoders/RawDecoderException.h" // for ThrowRDE
+#include "metadata/BlackArea.h"           // for BlackArea
+#include "metadata/ColorFilterArray.h"    // for ColorFilterArray
+#include <array>                          // for array
+#include <cassert>                        // for assert
+#include <memory>                         // for unique_ptr, operator==
+#include <string>                         // for string
+#include <vector>                         // for vector
 
 namespace rawspeed {
 
@@ -107,11 +110,13 @@ class RawImageData : public ErrorLog {
   void blitFrom(const RawImage& src, const iPoint2D& srcPos,
                 const iPoint2D& size, const iPoint2D& destPos);
   rawspeed::RawImageType getDataType() const { return dataType; }
+  inline Array2DRef<uint16_t> getU16DataAsUncroppedArray2DRef() const noexcept;
   uint8_t* getData() const;
   uint8_t*
   getData(uint32_t x,
           uint32_t y); // Not super fast, but safe. Don't use per pixel.
   uint8_t* getDataUncropped(uint32_t x, uint32_t y);
+
   void subFrame(iRectangle2D cropped);
   void clearArea(iRectangle2D area, uint8_t value = 0);
   iPoint2D __attribute__((pure)) getUncroppedDim() const;
@@ -130,7 +135,7 @@ class RawImageData : public ErrorLog {
   bool isAllocated() {return !!data;}
   void createBadPixelMap();
   iPoint2D dim;
-  uint32_t pitch = 0;
+  int pitch = 0;
 
   // padding is the size of the area after last pixel of line n
   // and before the first pixel of line n+1
@@ -162,15 +167,15 @@ class RawImageData : public ErrorLog {
 protected:
   RawImageType dataType;
   RawImageData();
-  RawImageData(const iPoint2D& dim, uint32_t bpp, uint32_t cpp = 1);
+  RawImageData(const iPoint2D& dim, int bpp, int cpp = 1);
   virtual void scaleValues(int start_y, int end_y) = 0;
   virtual void doLookup(int start_y, int end_y) = 0;
   virtual void fixBadPixel(uint32_t x, uint32_t y, int component = 0) = 0;
   void fixBadPixelsThread(int start_y, int end_y);
   void startWorker(RawImageWorker::RawImageWorkerTask task, bool cropped );
   uint8_t* data = nullptr;
-  uint32_t cpp = 1; // Components per pixel
-  uint32_t bpp = 0; // Bytes per pixel.
+  int cpp = 1; // Components per pixel
+  int bpp = 0; // Bytes per pixel.
   friend class RawImage;
   iPoint2D mOffset;
   iPoint2D uncropped_dim;
@@ -259,6 +264,15 @@ inline RawImage RawImage::create(const iPoint2D& dim, RawImageType type,
   }
 }
 
+inline Array2DRef<uint16_t>
+RawImageData::getU16DataAsUncroppedArray2DRef() const noexcept {
+  assert(dataType == TYPE_USHORT16 &&
+         "Attempting to access floating-point buffer as uint16_t.");
+  assert(data && "Data not yet allocated.");
+  return {reinterpret_cast<uint16_t*>(data), cpp * uncropped_dim.x,
+          uncropped_dim.y, static_cast<int>(pitch / sizeof(uint16_t))};
+}
+
 // setWithLookUp will set a single pixel by using the lookup table if supplied,
 // You must supply the destination where the value should be written, and a pointer to
 // a value that will be used to store a random counter that can be reused between calls.
diff --git a/src/librawspeed/common/RawspeedException.h b/src/librawspeed/common/RawspeedException.h
index 68ea2b96c..bb7a4a5ef 100644
--- a/src/librawspeed/common/RawspeedException.h
+++ b/src/librawspeed/common/RawspeedException.h
@@ -33,8 +33,9 @@
 namespace rawspeed {
 
 template <typename T>
-[[noreturn]] void __attribute__((noreturn, noinline, format(printf, 1, 2)))
-ThrowException(const char* fmt, ...) {
+[[noreturn]] void RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+    __attribute__((noreturn, format(printf, 1, 2)))
+    ThrowException(const char* fmt, ...) {
   static constexpr size_t bufSize = 8192;
 #if defined(HAVE_CXX_THREAD_LOCAL)
   static thread_local std::array<char, bufSize> buf;
@@ -56,15 +57,15 @@ ThrowException(const char* fmt, ...) {
 
 class RawspeedException : public std::runtime_error {
 private:
-  static void log(const char* msg) {
+  static void RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  log(const char* msg) {
     writeLog(DEBUG_PRIO_EXTRA, "EXCEPTION: %s", msg);
   }
 
 public:
-  explicit RawspeedException(const std::string& msg) : std::runtime_error(msg) {
-    log(msg.c_str());
-  }
-  explicit RawspeedException(const char* msg) : std::runtime_error(msg) {
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  RawspeedException(const char* msg)
+      : std::runtime_error(msg) {
     log(msg);
   }
 };
diff --git a/src/librawspeed/common/Spline.h b/src/librawspeed/common/Spline.h
index d4bb49992..12338b3f3 100644
--- a/src/librawspeed/common/Spline.h
+++ b/src/librawspeed/common/Spline.h
@@ -97,7 +97,7 @@ class Spline final {
       s.d = (sn.c - s.c) / (3. * h[i]);
     }
 
-    // The last segment is nonsensical, and was only used to temporairly store
+    // The last segment is nonsensical, and was only used to temporarily store
     // the a and c to simplify calculations, so drop that 'segment' now
     segments.pop_back();
 
diff --git a/src/librawspeed/decoders/DngDecoder.cpp b/src/librawspeed/decoders/DngDecoder.cpp
index 88ead5838..880aa2793 100644
--- a/src/librawspeed/decoders/DngDecoder.cpp
+++ b/src/librawspeed/decoders/DngDecoder.cpp
@@ -475,7 +475,8 @@ void DngDecoder::handleMetadata(const TiffIFD* raw) {
       ThrowRDE("Error decoding default crop size");
 
     iPoint2D size(sz[0], sz[1]);
-    if ((size + cropped.pos).isThisInside(mRaw->dim))
+    if (size.isThisInside(mRaw->dim) &&
+        (size + cropped.pos).isThisInside(mRaw->dim))
       cropped.dim = size;
 
     if (!cropped.hasPositiveArea())
diff --git a/src/librawspeed/decoders/IiqDecoder.cpp b/src/librawspeed/decoders/IiqDecoder.cpp
index 63cc69d6a..55828ceb5 100644
--- a/src/librawspeed/decoders/IiqDecoder.cpp
+++ b/src/librawspeed/decoders/IiqDecoder.cpp
@@ -66,7 +66,7 @@ bool IiqDecoder::isAppropriateDecoder(const TiffRootIFD* rootIFD,
   const std::string& make = id.make;
 
   return IiqDecoder::isAppropriateDecoder(file) &&
-         (make == "Phase One A/S" || make == "Leaf");
+         (make == "Phase One A/S" || make == "Phase One" || make == "Leaf");
 }
 
 // FIXME: this is very close to SamsungV0Decompressor::computeStripes()
@@ -187,7 +187,7 @@ RawImage IiqDecoder::decodeRawInternal() {
   }
 
   // FIXME: could be wrong. max "active pixels" in "Sensor+" mode - "101 MP"
-  if (width == 0 || height == 0 || width > 11976 || height > 8852)
+  if (width == 0 || height == 0 || width > 11976 || height > 8854)
     ThrowRDE("Unexpected image dimensions found: (%u; %u)", width, height);
 
   if (split_col > width || split_row > height)
@@ -315,18 +315,19 @@ void IiqDecoder::CorrectQuadrantMultipliersCombined(ByteStream data,
 
   for (int quadRow = 0; quadRow < 2; quadRow++) {
     for (int quadCol = 0; quadCol < 2; quadCol++) {
+      const Array2DRef<uint16_t> img(mRaw->getU16DataAsUncroppedArray2DRef());
+
       const Spline<> s(control_points[quadRow][quadCol]);
       const std::vector<uint16_t> curve = s.calculateCurve();
 
       int row_start = quadRow == 0 ? 0 : split_row;
-      int row_end = quadRow == 0 ? split_row : mRaw->dim.y;
+      int row_end = quadRow == 0 ? split_row : img.height;
       int col_start = quadCol == 0 ? 0 : split_col;
-      int col_end = quadCol == 0 ? split_col : mRaw->dim.x;
+      int col_end = quadCol == 0 ? split_col : img.width;
 
       for (int row = row_start; row < row_end; row++) {
-        auto* pixel =
-            reinterpret_cast<uint16_t*>(mRaw->getData(col_start, row));
-        for (int col = col_start; col < col_end; col++, pixel++) {
+        for (int col = col_start; col < col_end; col++) {
+          uint16_t& pixel = img(row, col);
           // This adjustment is expected to be made with the
           // black-level already subtracted from the pixel values.
           // Because this is kept as metadata and not subtracted at
@@ -334,8 +335,8 @@ void IiqDecoder::CorrectQuadrantMultipliersCombined(ByteStream data,
           // appropriate amount before indexing into the curve and
           // then add it back so that subtracting the black level
           // later will work as expected
-          const uint16_t diff = *pixel < black_level ? *pixel : black_level;
-          *pixel = curve[*pixel - diff] + diff;
+          const uint16_t diff = pixel < black_level ? pixel : black_level;
+          pixel = curve[pixel - diff] + diff;
         }
       }
     }
@@ -390,9 +391,7 @@ void IiqDecoder::handleBadPixel(const uint16_t col, const uint16_t row) {
 }
 
 void IiqDecoder::correctBadColumn(const uint16_t col) {
-  const Array2DRef<uint16_t> img(reinterpret_cast<uint16_t*>(mRaw->getData()),
-                                 mRaw->dim.x, mRaw->dim.y,
-                                 mRaw->pitch / sizeof(uint16_t));
+  const Array2DRef<uint16_t> img(mRaw->getU16DataAsUncroppedArray2DRef());
 
   for (int row = 2; row < mRaw->dim.y - 2; row++) {
     if (mRaw->cfa.getColorAt(col, row) == CFA_GREEN) {
@@ -408,10 +407,10 @@ void IiqDecoder::correctBadColumn(const uint16_t col) {
       std::array<uint16_t, 4> val;
       std::array<int32_t, 4> dev;
       int32_t sum = 0;
-      sum += val[0] = img(col - 1, row - 1);
-      sum += val[1] = img(col - 1, row + 1);
-      sum += val[2] = img(col + 1, row - 1);
-      sum += val[3] = img(col + 1, row + 1);
+      sum += val[0] = img(row - 1, col - 1);
+      sum += val[1] = img(row + 1, col - 1);
+      sum += val[2] = img(row - 1, col + 1);
+      sum += val[3] = img(row + 1, col + 1);
       for (int i = 0; i < 4; i++) {
         dev[i] = std::abs((val[i] * 4) - sum);
         if (dev[max] < dev[i])
@@ -419,7 +418,7 @@ void IiqDecoder::correctBadColumn(const uint16_t col) {
       }
       const int three_pixels = sum - val[max];
       // This is `std::lround(three_pixels / 3.0)`, but without FP.
-      img(col, row) = (three_pixels + 1) / 3;
+      img(row, col) = (three_pixels + 1) / 3;
     } else {
       /*
        * Do non-green pixels. Let's pretend we are in "R" pixel, in the middle:
@@ -431,11 +430,11 @@ void IiqDecoder::correctBadColumn(const uint16_t col) {
        * We have 6 other "R" pixels - 2 by horizontal, 4 by diagonals.
        * We need to combine them, to get the value of the pixel we are in.
        */
-      uint32_t diags = img(col - 2, row + 2) + img(col - 2, row - 2) +
-                       img(col + 2, row + 2) + img(col + 2, row - 2);
-      uint32_t horiz = img(col - 2, row) + img(col + 2, row);
+      uint32_t diags = img(row + 2, col - 2) + img(row - 2, col - 2) +
+                       img(row + 2, col + 2) + img(row - 2, col + 2);
+      uint32_t horiz = img(row, col - 2) + img(row, col + 2);
       // But this is not just averaging, we bias towards the horizontal pixels.
-      img(col, row) = std::lround(diags * 0.0732233 + horiz * 0.3535534);
+      img(row, col) = std::lround(diags * 0.0732233 + horiz * 0.3535534);
     }
   }
 }
diff --git a/src/librawspeed/decoders/NefDecoder.cpp b/src/librawspeed/decoders/NefDecoder.cpp
index 4ca537467..6b874f906 100644
--- a/src/librawspeed/decoders/NefDecoder.cpp
+++ b/src/librawspeed/decoders/NefDecoder.cpp
@@ -304,47 +304,41 @@ void NefDecoder::DecodeUncompressed() {
   }
 }
 
-void NefDecoder::readCoolpixSplitRaw(const ByteStream& input,
-                                     const iPoint2D& size,
+void NefDecoder::readCoolpixSplitRaw(ByteStream input, const iPoint2D& size,
                                      const iPoint2D& offset, int inputPitch) {
-  uint8_t* data = mRaw->getData();
-  uint32_t outPitch = mRaw->pitch;
-  uint32_t w = size.x;
-  uint32_t h = size.y;
-  uint32_t cpp = mRaw->getCpp();
-  if (input.getRemainSize() < (inputPitch*h)) {
-    if (static_cast<int>(input.getRemainSize()) > inputPitch)
-      h = input.getRemainSize() / inputPitch - 1;
-    else
-      ThrowIOE(
-          "Not enough data to decode a single line. Image file truncated.");
-  }
-
-  if (offset.y > mRaw->dim.y)
-    ThrowRDE("Invalid y offset");
-  if (offset.x + size.x > mRaw->dim.x)
-    ThrowRDE("Invalid x offset");
-
-  uint32_t y = offset.y;
-  h = min(h + static_cast<uint32_t>(offset.y),
-          static_cast<uint32_t>(mRaw->dim.y));
-  w *= cpp;
-  h /= 2;
-  BitPumpMSB in(input);
-  for (; y < h; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(
-        &data[offset.x * sizeof(uint16_t) * cpp + y * 2 * outPitch]);
-    for (uint32_t x = 0; x < w; x++) {
-      dest[x] =  in.getBits(12);
-    }
-  }
-  for (y = offset.y; y < h; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(
-        &data[offset.x * sizeof(uint16_t) * cpp + (y * 2 + 1) * outPitch]);
-    for (uint32_t x = 0; x < w; x++) {
-      dest[x] =  in.getBits(12);
-    }
+  const Array2DRef<uint16_t> img(mRaw->getU16DataAsUncroppedArray2DRef());
+
+  if (size.y % 2 != 0)
+    ThrowRDE("Odd number of rows");
+  if (size.x % 8 != 0)
+    ThrowRDE("Column count isn't multiple of 8");
+  if (inputPitch != ((3 * size.x) / 2))
+    ThrowRDE("Unexpected input pitch");
+
+  // BitPumpMSB loads exactly 4 bytes at once, and we squeeze 12 bits each time.
+  // We produce 2 pixels per 3 bytes (24 bits). If we want to be smart and to
+  // know where the first input bit for first odd row is, the input slice width
+  // must be a multiple of 8 pixels.
+
+  if (offset.x > mRaw->dim.x || offset.y > mRaw->dim.y)
+    ThrowRDE("All pixels outside of image");
+  if (offset.x + size.x > mRaw->dim.x || offset.y + size.y > mRaw->dim.y)
+    ThrowRDE("Output is partially out of image");
+
+  // The input bytes are laid out in the memory in the following way:
+  // First, all even (0-2-4-) rows, and then all odd (1-3-5-) rows.
+  BitPumpMSB even(input.getStream(size.y / 2, inputPitch));
+  BitPumpMSB odd(input.getStream(size.y / 2, inputPitch));
+  for (int row = offset.y; row < offset.y + size.y;) {
+    for (int col = offset.x; col < offset.x + size.x; ++col)
+      img(row, col) = even.getBits(12);
+    ++row;
+    for (int col = offset.x; col < offset.x + size.x; ++col)
+      img(row, col) = odd.getBits(12);
+    ++row;
   }
+  assert(even.getRemainSize() == 0 && odd.getRemainSize() == 0 &&
+         "Should have run out of input");
 }
 
 void NefDecoder::DecodeD100Uncompressed() {
@@ -358,6 +352,10 @@ void NefDecoder::DecodeD100Uncompressed() {
   mRaw->dim = iPoint2D(width, height);
   mRaw->createData();
 
+  ByteStream bs(DataBuffer(mFile->getSubView(offset), Endianness::little));
+  if (bs.getRemainSize() == 0)
+    ThrowRDE("No input to decode!");
+
   UncompressedDecompressor u(
       ByteStream(DataBuffer(mFile->getSubView(offset), Endianness::little)),
       mRaw);
@@ -381,8 +379,7 @@ void NefDecoder::DecodeSNefUncompressed() {
   mRaw->createData();
 
   ByteStream in(DataBuffer(mFile->getSubView(offset), Endianness::little));
-
-  DecodeNikonSNef(&in, width, height);
+  DecodeNikonSNef(in);
 }
 
 void NefDecoder::checkSupportInternal(const CameraMetaData* meta) {
@@ -615,12 +612,9 @@ void NefDecoder::decodeMetaDataInternal(const CameraMetaData* meta) {
 // We un-apply the whitebalance, so output matches lossless.
 // Note that values are scaled. See comment below on details.
 // OPTME: It would be trivial to run this multithreaded.
-void NefDecoder::DecodeNikonSNef(ByteStream* input, uint32_t w, uint32_t h) {
-  if (w < 6)
-    ThrowIOE("got a %u wide sNEF, aborting", w);
-
-  if (input->getRemainSize() < (w * h * 3))
-    ThrowIOE("Not enough data to decode. Image file truncated.");
+void NefDecoder::DecodeNikonSNef(const ByteStream& input) {
+  if (mRaw->dim.x < 6)
+    ThrowIOE("got a %u wide sNEF, aborting", mRaw->dim.x);
 
   // We need to read the applied whitebalance, since we should return
   // data before whitebalance, so we "unapply" it.
@@ -661,14 +655,12 @@ void NefDecoder::DecodeNikonSNef(ByteStream* input, uint32_t w, uint32_t h) {
   uint16_t tmp;
   auto* tmpch = reinterpret_cast<uint8_t*>(&tmp);
 
-  uint8_t* data = mRaw->getData();
-  uint32_t pitch = mRaw->pitch;
-  const uint8_t* in = input->getData(w * h * 3);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  const uint8_t* in = input.peekData(out.width * out.height);
 
-  for (uint32_t y = 0; y < h; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(&data[y * pitch]);
+  for (int row = 0; row < out.height; row++) {
     uint32_t random = in[0] + (in[1] << 8) + (in[2] << 16);
-    for (uint32_t x = 0; x < w * 3; x += 6) {
+    for (int col = 0; col < out.width; col += 6) {
       uint32_t g1 = in[0];
       uint32_t g2 = in[1];
       uint32_t g3 = in[2];
@@ -685,7 +677,7 @@ void NefDecoder::DecodeNikonSNef(ByteStream* input, uint32_t w, uint32_t h) {
       float cb2 = cb;
       float cr2 = cr;
       // Interpolate right pixel. We assume the sample is aligned with left pixel.
-      if ((x+6) < w*3) {
+      if ((col + 6) < out.width) {
         g4 = in[3];
         g5 = in[4];
         g6 = in[5];
@@ -700,27 +692,27 @@ void NefDecoder::DecodeNikonSNef(ByteStream* input, uint32_t w, uint32_t h) {
 
       mRaw->setWithLookUp(clampBits(static_cast<int>(y1 + 1.370705 * cr), 12),
                           tmpch, &random);
-      dest[x] = clampBits((inv_wb_r * tmp + (1<<9)) >> 10, 15);
+      out(row, col) = clampBits((inv_wb_r * tmp + (1 << 9)) >> 10, 15);
 
       mRaw->setWithLookUp(
           clampBits(static_cast<int>(y1 - 0.337633 * cb - 0.698001 * cr), 12),
-          reinterpret_cast<uint8_t*>(&dest[x + 1]), &random);
+          reinterpret_cast<uint8_t*>(&out(row, col + 1)), &random);
 
       mRaw->setWithLookUp(clampBits(static_cast<int>(y1 + 1.732446 * cb), 12),
                           tmpch, &random);
-      dest[x+2]   = clampBits((inv_wb_b * tmp + (1<<9)) >> 10, 15);
+      out(row, col + 2) = clampBits((inv_wb_b * tmp + (1 << 9)) >> 10, 15);
 
       mRaw->setWithLookUp(clampBits(static_cast<int>(y2 + 1.370705 * cr2), 12),
                           tmpch, &random);
-      dest[x+3] = clampBits((inv_wb_r * tmp + (1<<9)) >> 10, 15);
+      out(row, col + 3) = clampBits((inv_wb_r * tmp + (1 << 9)) >> 10, 15);
 
       mRaw->setWithLookUp(
           clampBits(static_cast<int>(y2 - 0.337633 * cb2 - 0.698001 * cr2), 12),
-          reinterpret_cast<uint8_t*>(&dest[x + 4]), &random);
+          reinterpret_cast<uint8_t*>(&out(row, col + 4)), &random);
 
       mRaw->setWithLookUp(clampBits(static_cast<int>(y2 + 1.732446 * cb2), 12),
                           tmpch, &random);
-      dest[x+5] = clampBits((inv_wb_b * tmp + (1<<9)) >> 10, 15);
+      out(row, col + 5) = clampBits((inv_wb_b * tmp + (1 << 9)) >> 10, 15);
     }
   }
 }
diff --git a/src/librawspeed/decoders/NefDecoder.h b/src/librawspeed/decoders/NefDecoder.h
index afbb5d782..65b73f168 100644
--- a/src/librawspeed/decoders/NefDecoder.h
+++ b/src/librawspeed/decoders/NefDecoder.h
@@ -60,9 +60,9 @@ class NefDecoder final : public AbstractTiffDecoder
   void DecodeUncompressed();
   void DecodeD100Uncompressed();
   void DecodeSNefUncompressed();
-  void readCoolpixSplitRaw(const ByteStream& input, const iPoint2D& size,
+  void readCoolpixSplitRaw(ByteStream input, const iPoint2D& size,
                            const iPoint2D& offset, int inputPitch);
-  void DecodeNikonSNef(ByteStream* input, uint32_t w, uint32_t h);
+  void DecodeNikonSNef(const ByteStream& input);
   std::string getMode();
   std::string getExtendedMode(const std::string &mode);
   static std::vector<uint16_t> gammaCurve(double pwr, double ts, int mode,
diff --git a/src/librawspeed/decoders/RawDecoderException.h b/src/librawspeed/decoders/RawDecoderException.h
index c29d1d09e..aee9706b6 100644
--- a/src/librawspeed/decoders/RawDecoderException.h
+++ b/src/librawspeed/decoders/RawDecoderException.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h" // for ThrowException, RawspeedException
 #include <string>                     // for string
 
@@ -28,9 +29,9 @@ namespace rawspeed {
 
 class RawDecoderException : public RawspeedException {
 public:
-  explicit RawDecoderException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  RawDecoderException(const char* msg)
       : RawspeedException(msg) {}
-  explicit RawDecoderException(const char* msg) : RawspeedException(msg) {}
 };
 
 #define ThrowRDE(...)                                                          \
diff --git a/src/librawspeed/decompressors/AbstractHuffmanTable.h b/src/librawspeed/decompressors/AbstractHuffmanTable.h
index 4333eabec..96fe4c3aa 100644
--- a/src/librawspeed/decompressors/AbstractHuffmanTable.h
+++ b/src/librawspeed/decompressors/AbstractHuffmanTable.h
@@ -66,6 +66,9 @@ class AbstractHuffmanTable {
   };
 
 protected:
+  bool fullDecode = true;
+  bool fixDNGBug16 = false;
+
   inline size_t __attribute__((pure)) maxCodePlusDiffLength() const {
     return nCodesPerLength.size() - 1 +
            *(std::max_element(codeValues.cbegin(), codeValues.cend()));
@@ -219,10 +222,34 @@ class AbstractHuffmanTable {
     }
   }
 
+  template <typename BIT_STREAM, bool FULL_DECODE>
+  inline int processSymbol(BIT_STREAM& bs, CodeSymbol symbol,
+                           int codeValue) const {
+    assert(symbol.code_len >= 0 && symbol.code_len <= 16);
+
+    // If we were only looking for symbol's code value, then just return it.
+    if (!FULL_DECODE)
+      return codeValue;
+
+    // Else, treat it as the length of following difference
+    // that we need to read and extend.
+    int diff_l = codeValue;
+    assert(diff_l >= 0 && diff_l <= 16);
+
+    if (diff_l == 16) {
+      if (fixDNGBug16)
+        bs.skipBitsNoFill(16);
+      return -32768;
+    }
+
+    assert(symbol.code_len + diff_l <= 32);
+    return diff_l ? extend(bs.getBitsNoFill(diff_l), diff_l) : 0;
+  }
+
   // Figure F.12 – Extending the sign bit of a decoded value in V
   // WARNING: this is *not* your normal 2's complement sign extension!
-  // WARNING: the caller should check that len != 0 before calling the function
   inline static int __attribute__((const)) extend(uint32_t diff, uint32_t len) {
+    assert(len > 0);
     int32_t ret = diff;
     if ((diff & (1 << (len - 1))) == 0)
       ret -= (1 << len) - 1;
diff --git a/src/librawspeed/decompressors/Cr2Decompressor.cpp b/src/librawspeed/decompressors/Cr2Decompressor.cpp
index 41b4465f1..317437adc 100644
--- a/src/librawspeed/decompressors/Cr2Decompressor.cpp
+++ b/src/librawspeed/decompressors/Cr2Decompressor.cpp
@@ -144,16 +144,15 @@ void Cr2Decompressor::decodeN_X_Y()
   //  * for <3,2,1>: 6  = 3*2*1
   //  * for <3,2,2>: 12 = 3*2*2
   // and advances x by N_COMP*X_S_F and y by Y_S_F
-  constexpr int xStepSize = N_COMP * X_S_F;
-  constexpr int yStepSize = Y_S_F;
+  constexpr int sliceColStep = N_COMP * X_S_F;
+  constexpr int frameRowStep = Y_S_F;
 
   auto ht = getHuffmanTables<N_COMP>();
   auto pred = getInitialPredictors<N_COMP>();
   auto predNext = reinterpret_cast<uint16_t*>(mRaw->getDataUncropped(0, 0));
 
-  BitPumpJPEG bitStream(input);
+  BitPumpJPEG bs(input);
 
-  uint32_t pixelPitch = mRaw->pitch / 2; // Pitch in pixel
   if (frame.cps != 3 && frame.w * frame.cps > 2 * frame.h) {
     // Fix Canon double height issue where Canon doubled the width and halfed
     // the height (e.g. with 5Ds), ask Canon. frame.w needs to stay as is here
@@ -173,9 +172,13 @@ void Cr2Decompressor::decodeN_X_Y()
   for (const auto& width : {slicing.sliceWidth, slicing.lastSliceWidth}) {
     if (width > mRaw->dim.x)
       ThrowRDE("Slice is longer than image's height, which is unsupported.");
-    if (width % xStepSize != 0) {
+    if (width % sliceColStep != 0) {
       ThrowRDE("Slice width (%u) should be multiple of pixel group size (%u)",
-               width, xStepSize);
+               width, sliceColStep);
+    }
+    if (width % mRaw->getCpp() != 0) {
+      ThrowRDE("Slice width (%u) should be multiple of image cpp (%u)", width,
+               mRaw->getCpp());
     }
   }
 
@@ -183,67 +186,69 @@ void Cr2Decompressor::decodeN_X_Y()
       mRaw->getCpp() * mRaw->dim.area())
     ThrowRDE("Incorrrect slice height / slice widths! Less than image size.");
 
-  unsigned processedPixels = 0;
-  unsigned processedLineSlices = 0;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  unsigned globalFrameCol = 0;
+  unsigned globalFrameRow = 0;
   for (auto sliceId = 0; sliceId < slicing.numSlices; sliceId++) {
     const unsigned sliceWidth = slicing.widthOfSlice(sliceId);
 
-    assert(frame.h % yStepSize == 0);
-    for (unsigned y = 0; y < frame.h; y += yStepSize) {
-      // Fix for Canon 80D mraw format.
-      // In that format, `frame` is 4032x3402, while `mRaw` is 4536x3024.
-      // Consequently, the slices in `frame` wrap around plus there are few
-      // 'extra' sliced lines because sum(slicesW) * sliceH > mRaw->dim.area()
-      // Those would overflow, hence the break.
-      // see FIX_CANON_FRAME_VS_IMAGE_SIZE_MISMATCH
-      unsigned destY = processedLineSlices % mRaw->dim.y;
-      unsigned destX = processedLineSlices / mRaw->dim.y *
-                       slicing.widthOfSlice(0) / mRaw->getCpp();
-      if (destX >= static_cast<unsigned>(mRaw->dim.x))
+    assert(frame.h % frameRowStep == 0);
+    for (unsigned sliceFrameRow = 0; sliceFrameRow < frame.h;
+         sliceFrameRow += frameRowStep, globalFrameRow += frameRowStep) {
+      unsigned row = globalFrameRow % mRaw->dim.y;
+      unsigned col = globalFrameRow / mRaw->dim.y * slicing.widthOfSlice(0) /
+                     mRaw->getCpp();
+      if (col >= static_cast<unsigned>(mRaw->dim.x))
         break;
-      auto dest =
-          reinterpret_cast<uint16_t*>(mRaw->getDataUncropped(destX, destY));
-
-      assert(sliceWidth % xStepSize == 0);
-      if (X_S_F == 1) {
-        if (destX + sliceWidth > static_cast<unsigned>(mRaw->dim.x))
-          ThrowRDE("Bad slice width / frame size / image size combination.");
-        if (((sliceId + 1) == slicing.numSlices) &&
-            ((destX + sliceWidth) < static_cast<unsigned>(mRaw->dim.x)))
-          ThrowRDE("Insufficient slices - do not fill the entire image");
-      } else {
-        // FIXME.
-      }
-      for (unsigned x = 0; x < sliceWidth; x += xStepSize) {
+
+      assert(sliceWidth % mRaw->getCpp() == 0);
+      unsigned pixelsPerSliceRow = sliceWidth / mRaw->getCpp();
+      if (col + pixelsPerSliceRow > static_cast<unsigned>(mRaw->dim.x))
+        ThrowRDE("Bad slice width / frame size / image size combination.");
+      if (((sliceId + 1) == slicing.numSlices) &&
+          (col + pixelsPerSliceRow != static_cast<unsigned>(mRaw->dim.x)))
+        ThrowRDE("Insufficient slices - do not fill the entire image");
+
+      col *= mRaw->getCpp();
+      assert(sliceWidth % sliceColStep == 0);
+      for (unsigned sliceCol = 0; sliceCol < sliceWidth;) {
         // check if we processed one full raw row worth of pixels
-        if (processedPixels == frame.w) {
+        if (globalFrameCol == frame.w) {
           // if yes -> update predictor by going back exactly one row,
           // no matter where we are right now.
           // makes no sense from an image compression point of view, ask Canon.
           copy_n(predNext, N_COMP, pred.data());
-          predNext = dest;
-          processedPixels = 0;
+          predNext = &out(row, col);
+          globalFrameCol = 0;
         }
 
-        if (X_S_F == 1) { // will be optimized out
-          unroll_loop<N_COMP>([&](int i) {
-            dest[i] = pred[i] += ht[i]->decodeNext(bitStream);
-          });
-        } else {
-          unroll_loop<Y_S_F>([&](int i) {
-            dest[0 + i*pixelPitch] = pred[0] += ht[0]->decodeNext(bitStream);
-            dest[3 + i*pixelPitch] = pred[0] += ht[0]->decodeNext(bitStream);
-          });
-
-          dest[1] = pred[1] += ht[1]->decodeNext(bitStream);
-          dest[2] = pred[2] += ht[2]->decodeNext(bitStream);
+        // How many pixels can we decode until we finish the row of either
+        // the frame (i.e. predictor change time), or of the current slice?
+        assert(frame.w % X_S_F == 0);
+        unsigned sliceColsRemainingInThisFrameRow =
+            sliceColStep * ((frame.w - globalFrameCol) / X_S_F);
+        unsigned sliceColsRemainingInThisSliceRow = sliceWidth - sliceCol;
+        unsigned sliceColsRemaining = std::min(
+            sliceColsRemainingInThisSliceRow, sliceColsRemainingInThisFrameRow);
+        assert(sliceColsRemaining >= sliceColStep &&
+               (sliceColsRemaining % sliceColStep) == 0);
+        for (unsigned sliceColEnd = sliceCol + sliceColsRemaining;
+             sliceCol < sliceColEnd; sliceCol += sliceColStep,
+                      globalFrameCol += X_S_F, col += sliceColStep) {
+          if (X_S_F == 1) { // will be optimized out
+            for (int c = 0; c < sliceColStep; ++c)
+              out(row, col + c) = pred[c] += ht[c]->decodeNext(bs);
+          } else {
+            for (int dstRow = 0; dstRow < Y_S_F; ++dstRow) {
+              for (int c : {0, 3})
+                out(row + dstRow, col + c) = pred[0] += ht[0]->decodeNext(bs);
+            }
+
+            for (int c : {1, 2})
+              out(row, col + c) = pred[c] += ht[c]->decodeNext(bs);
+          }
         }
-
-        dest += xStepSize;
-        processedPixels += X_S_F;
       }
-
-      processedLineSlices += yStepSize;
     }
   }
 }
diff --git a/src/librawspeed/decompressors/CrwDecompressor.cpp b/src/librawspeed/decompressors/CrwDecompressor.cpp
index 838e4baee..ff1ef8a19 100644
--- a/src/librawspeed/decompressors/CrwDecompressor.cpp
+++ b/src/librawspeed/decompressors/CrwDecompressor.cpp
@@ -243,29 +243,26 @@ inline void CrwDecompressor::decodeBlock(std::array<int, 64>* diffBuf,
 
 // FIXME: this function is horrible.
 void CrwDecompressor::decompress() {
-  const uint32_t height = mRaw->dim.y;
-  const uint32_t width = mRaw->dim.x;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  assert(out.width > 0);
+  assert(out.width % 4 == 0);
+  assert(out.height > 0);
 
   {
-    assert(width > 0);
-    assert(width % 4 == 0);
-    assert(height > 0);
-
     // Each block encodes 64 pixels
 
-    assert((height * width) % 64 == 0);
-    const unsigned hBlocks = height * width / 64;
+    assert((out.height * out.width) % 64 == 0);
+    const unsigned hBlocks = out.height * out.width / 64;
     assert(hBlocks > 0);
 
     BitPumpJPEG lPump(rawInput);
     BitPumpJPEG iPump(rawInput);
 
     int carry = 0;
-    std::array<int, 2> base;
+    std::array<int, 2> base = {512, 512}; // starting predictors
 
-    uint32_t j = 0;
-    uint16_t* dest = nullptr;
-    uint32_t i = 0;
+    int row = 0;
+    int col = 0;
 
     for (unsigned block = 0; block < hBlocks; block++) {
       array<int, 64> diffBuf = {{}};
@@ -276,58 +273,44 @@ void CrwDecompressor::decompress() {
       diffBuf[0] += carry;
       carry = diffBuf[0];
 
-      for (uint32_t k = 0; k < 64; k++) {
-        if (i % width == 0) {
+      for (uint32_t k = 0; k < 64; ++k, ++col) {
+        if (col == out.width) {
           // new line. sadly, does not always happen when k == 0.
-          i = 0;
-
-          dest = reinterpret_cast<uint16_t*>(mRaw->getData(0, j));
-
-          j++;
-          base[0] = base[1] = 512;
+          col = 0;
+          row++;
+          base = {512, 512}; // reset both predictors for the new row.
         }
 
-        assert(dest != nullptr);
         base[k & 1] += diffBuf[k];
 
-        if (base[k & 1] >> 10)
+        if (!isIntN(base[k & 1], 10))
           ThrowRDE("Error decompressing");
 
-        *dest = base[k & 1];
-
-        i++;
-        dest++;
+        out(row, col) = base[k & 1];
       }
     }
-    assert(j == height);
-    assert(i == width);
+    assert(row == (out.height - 1));
+    assert(col == out.width);
   }
 
   // Add the uncompressed 2 low bits to the decoded 8 high bits
   if (lowbits) {
-    assert(width > 0);
-    assert(width % 4 == 0);
-    assert(height > 0);
-
-    for (uint32_t j = 0; j < height; j++) {
-      auto* dest = reinterpret_cast<uint16_t*>(mRaw->getData(0, j));
-
-      assert(width % 4 == 0);
-      for (uint32_t i = 0; i < width; /* NOTE: i += 4 */) {
+    for (int row = 0; row < out.height; row++) {
+      for (int col = 0; col < out.width; /* NOTE: col += 4 */) {
         const uint8_t c = lowbitInput.getByte();
         // LSB-packed: p3 << 6 | p2 << 4 | p1 << 2 | p0 << 0
 
         // We have read 8 bits, which is 4 pairs of 2 bits. So process 4 pixels.
-        for (uint32_t p = 0; p < 4; p++) {
+        for (uint32_t p = 0; p < 4; ++p, ++col) {
+          uint16_t& pixel = out(row, col);
+
           uint16_t low = (c >> (2 * p)) & 0b11;
-          uint16_t val = (*dest << 2) | low;
+          uint16_t val = (pixel << 2) | low;
 
-          if (width == 2672 && val < 512)
+          if (out.width == 2672 && val < 512)
             val += 2; // No idea why this is needed
 
-          *dest = val;
-          i++;
-          dest++;
+          pixel = val;
         }
       }
     }
diff --git a/src/librawspeed/decompressors/FujiDecompressor.cpp b/src/librawspeed/decompressors/FujiDecompressor.cpp
index 88c0ba271..bedc32e8b 100644
--- a/src/librawspeed/decompressors/FujiDecompressor.cpp
+++ b/src/librawspeed/decompressors/FujiDecompressor.cpp
@@ -173,6 +173,8 @@ template <typename T>
 void FujiDecompressor::copy_line(fuji_compressed_block* info,
                                  const FujiStrip& strip, int cur_line,
                                  T&& idx) const {
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
   std::array<uint16_t*, 3> lineBufB;
   std::array<uint16_t*, 6> lineBufG;
   std::array<uint16_t*, 3> lineBufR;
@@ -187,9 +189,6 @@ void FujiDecompressor::copy_line(fuji_compressed_block* info,
   }
 
   for (int row_count = 0; row_count < FujiStrip::lineHeight(); row_count++) {
-    auto* const raw_block_data = reinterpret_cast<uint16_t*>(
-        mRaw->getData(strip.offsetX(), strip.offsetY(cur_line) + row_count));
-
     for (int pixel_count = 0; pixel_count < strip.width(); pixel_count++) {
       uint16_t* line_buf = nullptr;
 
@@ -210,7 +209,8 @@ void FujiDecompressor::copy_line(fuji_compressed_block* info,
         __builtin_unreachable();
       }
 
-      raw_block_data[pixel_count] = line_buf[idx(pixel_count)];
+      out(strip.offsetY(cur_line) + row_count, strip.offsetX() + pixel_count) =
+          line_buf[idx(pixel_count)];
     }
   }
 }
diff --git a/src/librawspeed/decompressors/HasselbladDecompressor.cpp b/src/librawspeed/decompressors/HasselbladDecompressor.cpp
index bf423d33f..cc4f3ab48 100644
--- a/src/librawspeed/decompressors/HasselbladDecompressor.cpp
+++ b/src/librawspeed/decompressors/HasselbladDecompressor.cpp
@@ -66,28 +66,29 @@ void HasselbladDecompressor::decodeScan() {
              frame.w, frame.h, mRaw->dim.x, mRaw->dim.y);
   }
 
-  assert(frame.h > 0);
-  assert(frame.w > 0);
-  assert(frame.w % 2 == 0);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
+  assert(out.height > 0);
+  assert(out.width > 0);
+  assert(out.width % 2 == 0);
 
   const auto ht = getHuffmanTables<1>();
 
   BitPumpMSB32 bitStream(input);
   // Pixels are packed two at a time, not like LJPEG:
   // [p1_length_as_huffman][p2_length_as_huffman][p0_diff_with_length][p1_diff_with_length]|NEXT PIXELS
-  for (uint32_t y = 0; y < frame.h; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(mRaw->getData(0, y));
+  for (int row = 0; row < out.height; row++) {
     int p1 = 0x8000 + pixelBaseOffset;
     int p2 = 0x8000 + pixelBaseOffset;
-    for (uint32_t x = 0; x < frame.w; x += 2) {
+    for (int col = 0; col < out.width; col += 2) {
       int len1 = ht[0]->decodeLength(bitStream);
       int len2 = ht[0]->decodeLength(bitStream);
       p1 += getBits(&bitStream, len1);
       p2 += getBits(&bitStream, len2);
       // NOTE: this is rather unusual and weird, but appears to be correct.
       // clampBits(p, 16) results in completely garbled images.
-      dest[x] = uint16_t(p1);
-      dest[x + 1] = uint16_t(p2);
+      out(row, col) = uint16_t(p1);
+      out(row, col + 1) = uint16_t(p2);
     }
   }
   input.skipBytes(bitStream.getBufferPosition());
diff --git a/src/librawspeed/decompressors/HuffmanTableLUT.h b/src/librawspeed/decompressors/HuffmanTableLUT.h
index d961f8fdb..358151f1e 100644
--- a/src/librawspeed/decompressors/HuffmanTableLUT.h
+++ b/src/librawspeed/decompressors/HuffmanTableLUT.h
@@ -21,9 +21,10 @@
 
 #pragma once
 
-#include "common/Common.h" // for uint32_t, uint16_t, int32_t
+#include "common/Common.h"                      // for uint32_t, uint16_t, ...
 #include "decoders/RawDecoderException.h"       // for ThrowRDE
 #include "decompressors/AbstractHuffmanTable.h" // for AbstractHuffmanTable
+#include "decompressors/HuffmanTableLookup.h"   // for HuffmanTableLookup
 #include "io/BitStream.h"                       // for BitStreamTraits
 #include <cassert>                              // for assert
 #include <cstddef>                              // for size_t
@@ -64,13 +65,7 @@
 
 namespace rawspeed {
 
-class HuffmanTableLUT final : public AbstractHuffmanTable {
-  // private fields calculated from codesPerBits and codeValues
-  // they are index '1' based, so we can directly lookup the value
-  // for code length l without decrementing
-  std::vector<uint32_t> maxCodeOL;    // index is length of code
-  std::vector<uint16_t> codeOffsetOL; // index is length of code
-
+class HuffmanTableLUT final : public HuffmanTableLookup {
   // The code can be compiled with two different decode lookup table layouts.
   // The idea is that different CPU architectures may perform better with
   // one or the other, depending on the relative performance of their arithmetic
@@ -96,38 +91,10 @@ class HuffmanTableLUT final : public AbstractHuffmanTable {
   std::vector<uint8_t> decodeLookup;
 #endif
 
-  bool fullDecode = true;
-  bool fixDNGBug16 = false;
-
 public:
   void setup(bool fullDecode_, bool fixDNGBug16_) {
-    this->fullDecode = fullDecode_;
-    this->fixDNGBug16 = fixDNGBug16_;
-
-    assert(!nCodesPerLength.empty());
-    assert(maxCodesCount() > 0);
-
-    unsigned int maxCodeLength = nCodesPerLength.size() - 1U;
-    assert(codeValues.size() == maxCodesCount());
-
-    assert(maxCodePlusDiffLength() <= 32U);
-
-    // Figure C.1: make table of Huffman code length for each symbol
-    // Figure C.2: generate the codes themselves
-    const auto symbols = generateCodeSymbols();
-    assert(symbols.size() == maxCodesCount());
-
-    // Figure F.15: generate decoding tables
-    codeOffsetOL.resize(maxCodeLength + 1UL, 0xFFFF);
-    maxCodeOL.resize(maxCodeLength + 1UL, 0xFFFFFFFF);
-    int code_index = 0;
-    for (unsigned int l = 1U; l <= maxCodeLength; l++) {
-      if (nCodesPerLength[l]) {
-        codeOffsetOL[l] = symbols[code_index].code - code_index;
-        code_index += nCodesPerLength[l];
-        maxCodeOL[l] = symbols[code_index - 1].code;
-      }
-    }
+    const std::vector<CodeSymbol> symbols =
+        HuffmanTableLookup::setup(fullDecode_, fixDNGBug16_);
 
     // Generate lookup table for fast decoding lookup.
     // See definition of decodeLookup above
@@ -148,10 +115,12 @@ class HuffmanTableLUT final : public AbstractHuffmanTable {
           // lookup bit depth is too small to fit both the encoded length
           // and the final difference value.
           // -> store only the length and do a normal sign extension later
+          assert(!fullDecode || diff_l > 0);
           decodeLookup[c] = diff_l << PayloadShift | code_l;
         } else {
           // diff_l + code_l <= lookupDepth
           // The table bit depth is large enough to store both.
+          assert(diff_l != 16);
           decodeLookup[c] = (code_l + diff_l) | FlagMask;
 
           if (diff_l) {
@@ -165,14 +134,16 @@ class HuffmanTableLUT final : public AbstractHuffmanTable {
     }
   }
 
-  template<typename BIT_STREAM> inline int decodeLength(BIT_STREAM& bs) const {
+  template <typename BIT_STREAM>
+  inline __attribute__((always_inline)) int decodeLength(BIT_STREAM& bs) const {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
     assert(!fullDecode);
     return decode<BIT_STREAM, false>(bs);
   }
 
-  template<typename BIT_STREAM> inline int decodeNext(BIT_STREAM& bs) const {
+  template <typename BIT_STREAM>
+  inline __attribute__((always_inline)) int decodeNext(BIT_STREAM& bs) const {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
     assert(fullDecode);
@@ -183,74 +154,46 @@ class HuffmanTableLUT final : public AbstractHuffmanTable {
   // one returning only the length of the of diff bits (see Hasselblad),
   // one to return the fully decoded diff.
   // All ifs depending on this bool will be optimized out by the compiler
-  template<typename BIT_STREAM, bool FULL_DECODE> inline int decode(BIT_STREAM& bs) const {
+  template <typename BIT_STREAM, bool FULL_DECODE>
+  inline __attribute__((always_inline)) int decode(BIT_STREAM& bs) const {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
     assert(FULL_DECODE == fullDecode);
-
-    // 32 is the absolute maximum combined length of code + diff
-    // assertion  maxCodePlusDiffLength() <= 32U  is already checked in setup()
     bs.fill(32);
 
-    // for processors supporting bmi2 instructions, using maxCodePlusDiffLength()
-    // might be beneficial
+    CodeSymbol partial;
+    partial.code_len = LookupDepth;
+    partial.code = bs.peekBitsNoFill(partial.code_len);
 
-    uint32_t code = bs.peekBitsNoFill(LookupDepth);
-    assert(code < decodeLookup.size());
-    auto val = static_cast<unsigned>(decodeLookup[code]);
-    int len = val & LenMask;
-    assert(len >= 0);
-    assert(len <= 16);
+    assert(partial.code < decodeLookup.size());
+    auto lutEntry = static_cast<unsigned>(decodeLookup[partial.code]);
+    int payload = static_cast<int>(lutEntry) >> PayloadShift;
+    int len = lutEntry & LenMask;
 
-    // if the code is invalid (bitstream corrupted) len will be 0
+    // How far did reading of those LookupDepth bits *actually* move us forward?
     bs.skipBitsNoFill(len);
-    if (FULL_DECODE && val & FlagMask) {
-      // if the flag bit is set, the payload is the already sign extended difference
-      return static_cast<int>(val) >> PayloadShift;
-    }
-
-    if (len) {
-      // if the flag bit is not set but len != 0, the payload is the number of bits to sign extend and return
-      const int l_diff = static_cast<int>(val) >> PayloadShift;
-      assert((FULL_DECODE && (len + l_diff <= 32)) || !FULL_DECODE);
-      if (FULL_DECODE && l_diff == 16) {
-        if (fixDNGBug16)
-          bs.skipBitsNoFill(16);
-        return -32768;
-      }
-      return FULL_DECODE ? extend(bs.getBitsNoFill(l_diff), l_diff) : l_diff;
-    }
-
-    uint32_t code_l = LookupDepth;
-    bs.skipBitsNoFill(code_l);
-    while (code_l < maxCodeOL.size() &&
-           (0xFFFFFFFF == maxCodeOL[code_l] || code > maxCodeOL[code_l])) {
-      uint32_t temp = bs.getBitsNoFill(1);
-      code = (code << 1) | temp;
-      code_l++;
-    }
-
-    if (code_l >= maxCodeOL.size() ||
-        (0xFFFFFFFF == maxCodeOL[code_l] || code > maxCodeOL[code_l]))
-      ThrowRDE("bad Huffman code: %u (len: %u)", code, code_l);
-
-    if (code < codeOffsetOL[code_l])
-      ThrowRDE("likely corrupt Huffman code: %u (len: %u)", code, code_l);
-
-    int diff_l = codeValues[code - codeOffsetOL[code_l]];
-
-    if (!FULL_DECODE)
-      return diff_l;
 
-    if (diff_l == 16) {
-      if (fixDNGBug16)
-        bs.skipBitsNoFill(16);
-      return -32768;
+    // If the flag bit is set, then 'len' was code_l + diff_l (see setup()),
+    // and payload is the already sign-extended difference.
+    if (FULL_DECODE && lutEntry & FlagMask)
+      return payload;
+
+    int codeValue;
+    if (lutEntry) {
+      // If the flag is not set, but the entry is not empty,
+      // the payload is the code value for this symbol.
+      partial.code_len = len;
+      codeValue = payload;
+      assert(!FULL_DECODE || codeValue /*aka diff_l*/ > 0);
+    } else {
+      // No match in the lookup table, because either the code is longer
+      // than LookupDepth or the input is corrupt. Need to read more bits...
+      assert(len == 0);
+      bs.skipBitsNoFill(partial.code_len);
+      std::tie(partial, codeValue) = finishReadingPartialSymbol(bs, partial);
     }
 
-    assert(FULL_DECODE);
-    assert((diff_l && (len + code_l + diff_l <= 32)) || !diff_l);
-    return diff_l ? extend(bs.getBitsNoFill(diff_l), diff_l) : 0;
+    return processSymbol<BIT_STREAM, FULL_DECODE>(bs, partial, codeValue);
   }
 };
 
diff --git a/src/librawspeed/decompressors/HuffmanTableLookup.h b/src/librawspeed/decompressors/HuffmanTableLookup.h
index 90d5cb6bc..e0e749bc7 100644
--- a/src/librawspeed/decompressors/HuffmanTableLookup.h
+++ b/src/librawspeed/decompressors/HuffmanTableLookup.h
@@ -63,18 +63,16 @@
 
 namespace rawspeed {
 
-class HuffmanTableLookup final : public AbstractHuffmanTable {
+class HuffmanTableLookup : public AbstractHuffmanTable {
+protected:
   // private fields calculated from codesPerBits and codeValues
   // they are index '1' based, so we can directly lookup the value
   // for code length l without decrementing
   std::vector<uint32_t> maxCodeOL;    // index is length of code
   std::vector<uint16_t> codeOffsetOL; // index is length of code
 
-  bool fullDecode = true;
-  bool fixDNGBug16 = false;
-
 public:
-  void setup(bool fullDecode_, bool fixDNGBug16_) {
+  std::vector<CodeSymbol> setup(bool fullDecode_, bool fixDNGBug16_) {
     this->fullDecode = fullDecode_;
     this->fixDNGBug16 = fixDNGBug16_;
 
@@ -88,20 +86,22 @@ class HuffmanTableLookup final : public AbstractHuffmanTable {
 
     // Figure C.1: make table of Huffman code length for each symbol
     // Figure C.2: generate the codes themselves
-    const auto symbols = generateCodeSymbols();
+    std::vector<CodeSymbol> symbols = generateCodeSymbols();
     assert(symbols.size() == maxCodesCount());
 
     // Figure F.15: generate decoding tables
     codeOffsetOL.resize(maxCodeLength + 1UL, 0xFFFF);
     maxCodeOL.resize(maxCodeLength + 1UL, 0xFFFFFFFF);
-    int code_index = 0;
-    for (unsigned int l = 1U; l <= maxCodeLength; l++) {
-      if (nCodesPerLength[l]) {
-        codeOffsetOL[l] = symbols[code_index].code - code_index;
-        code_index += nCodesPerLength[l];
-        maxCodeOL[l] = symbols[code_index - 1].code;
-      }
+    for (unsigned int numCodesSoFar = 0, codeLen = 1; codeLen <= maxCodeLength;
+         codeLen++) {
+      if (!nCodesPerLength[codeLen])
+        continue;
+      codeOffsetOL[codeLen] = symbols[numCodesSoFar].code - numCodesSoFar;
+      numCodesSoFar += nCodesPerLength[codeLen];
+      maxCodeOL[codeLen] = symbols[numCodesSoFar - 1].code;
     }
+
+    return symbols;
   }
 
   template <typename BIT_STREAM> inline int decodeLength(BIT_STREAM& bs) const {
@@ -118,6 +118,42 @@ class HuffmanTableLookup final : public AbstractHuffmanTable {
     return decode<BIT_STREAM, true>(bs);
   }
 
+protected:
+  template <typename BIT_STREAM>
+  inline std::pair<CodeSymbol, int /*codeValue*/>
+  finishReadingPartialSymbol(BIT_STREAM& bs, CodeSymbol partial) const {
+    while (partial.code_len < maxCodeOL.size() &&
+           (0xFFFFFFFF == maxCodeOL[partial.code_len] ||
+            partial.code > maxCodeOL[partial.code_len])) {
+      uint32_t temp = bs.getBitsNoFill(1);
+      partial.code = (partial.code << 1) | temp;
+      partial.code_len++;
+    }
+
+    if (partial.code_len >= maxCodeOL.size() ||
+        (0xFFFFFFFF == maxCodeOL[partial.code_len] ||
+         partial.code > maxCodeOL[partial.code_len]) ||
+        partial.code < codeOffsetOL[partial.code_len])
+      ThrowRDE("bad Huffman code: %u (len: %u)", partial.code,
+               partial.code_len);
+
+    int codeValue = codeValues[partial.code - codeOffsetOL[partial.code_len]];
+
+    return {partial, codeValue};
+  }
+
+  template <typename BIT_STREAM>
+  inline std::pair<CodeSymbol, int /*codeValue*/>
+  readSymbol(BIT_STREAM& bs) const {
+    // Start from completely unknown symbol.
+    CodeSymbol partial;
+    partial.code_len = 0;
+    partial.code = 0;
+
+    return finishReadingPartialSymbol(bs, partial);
+  }
+
+public:
   // The bool template paraeter is to enable two versions:
   // one returning only the length of the of diff bits (see Hasselblad),
   // one to return the fully decoded diff.
@@ -127,44 +163,13 @@ class HuffmanTableLookup final : public AbstractHuffmanTable {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
     assert(FULL_DECODE == fullDecode);
-
-    // 32 is the absolute maximum combined length of code + diff
-    // assertion  maxCodePlusDiffLength() <= 32U  is already checked in setup()
     bs.fill(32);
 
-    // for processors supporting bmi2 instructions, using
-    // maxCodePlusDiffLength() might be beneficial
-
-    uint32_t code = 0;
-    uint32_t code_l = 0;
-    while (code_l < maxCodeOL.size() &&
-           (0xFFFFFFFF == maxCodeOL[code_l] || code > maxCodeOL[code_l])) {
-      uint32_t temp = bs.getBitsNoFill(1);
-      code = (code << 1) | temp;
-      code_l++;
-    }
-
-    if (code_l >= maxCodeOL.size() ||
-        (0xFFFFFFFF == maxCodeOL[code_l] || code > maxCodeOL[code_l]))
-      ThrowRDE("bad Huffman code: %u (len: %u)", code, code_l);
-
-    if (code < codeOffsetOL[code_l])
-      ThrowRDE("likely corrupt Huffman code: %u (len: %u)", code, code_l);
-
-    int diff_l = codeValues[code - codeOffsetOL[code_l]];
-
-    if (!FULL_DECODE)
-      return diff_l;
-
-    if (diff_l == 16) {
-      if (fixDNGBug16)
-        bs.skipBitsNoFill(16);
-      return -32768;
-    }
+    CodeSymbol symbol;
+    int codeValue;
+    std::tie(symbol, codeValue) = readSymbol(bs);
 
-    assert(FULL_DECODE);
-    assert((diff_l && (code_l + diff_l <= 32)) || !diff_l);
-    return diff_l ? extend(bs.getBitsNoFill(diff_l), diff_l) : 0;
+    return processSymbol<BIT_STREAM, FULL_DECODE>(bs, symbol, codeValue);
   }
 };
 
diff --git a/src/librawspeed/decompressors/HuffmanTableTree.h b/src/librawspeed/decompressors/HuffmanTableTree.h
index 468ebee48..76f89d638 100644
--- a/src/librawspeed/decompressors/HuffmanTableTree.h
+++ b/src/librawspeed/decompressors/HuffmanTableTree.h
@@ -39,12 +39,10 @@ class HuffmanTableTree final : public AbstractHuffmanTable {
 
   BinaryHuffmanTree<ValueType> tree;
 
-  bool fullDecode = true;
-  bool fixDNGBug16 = false;
-
 protected:
   template <typename BIT_STREAM>
-  inline ValueType getValue(BIT_STREAM& bs) const {
+  inline std::pair<CodeSymbol, ValueType /*codeValue*/>
+  readSymbol(BIT_STREAM& bs) const {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
     CodeSymbol partial;
@@ -75,7 +73,7 @@ class HuffmanTableTree final : public AbstractHuffmanTable {
       if (static_cast<decltype(tree)::Node::Type>(*newNode) ==
           decltype(tree)::Node::Type::Leaf) {
         // Ok, great, hit a Leaf. This is it.
-        return newNode->getAsLeaf().value;
+        return {partial, newNode->getAsLeaf().value};
       }
 
       // Else, this is a branch, continue looking.
@@ -147,20 +145,11 @@ class HuffmanTableTree final : public AbstractHuffmanTable {
 
     bs.fill(32);
 
-    const auto codeValue = getValue(bs);
-
-    const int diff_l = codeValue;
-
-    if (!FULL_DECODE)
-      return diff_l;
-
-    if (diff_l == 16) {
-      if (fixDNGBug16)
-        bs.skipBitsNoFill(16);
-      return -32768;
-    }
+    CodeSymbol symbol;
+    int codeValue;
+    std::tie(symbol, codeValue) = readSymbol(bs);
 
-    return diff_l ? extend(bs.getBitsNoFill(diff_l), diff_l) : 0;
+    return processSymbol<BIT_STREAM, FULL_DECODE>(bs, symbol, codeValue);
   }
 };
 
diff --git a/src/librawspeed/decompressors/HuffmanTableVector.h b/src/librawspeed/decompressors/HuffmanTableVector.h
index 40bbc6bda..eafc8d1e4 100644
--- a/src/librawspeed/decompressors/HuffmanTableVector.h
+++ b/src/librawspeed/decompressors/HuffmanTableVector.h
@@ -33,15 +33,13 @@ namespace rawspeed {
 class HuffmanTableVector final : public AbstractHuffmanTable {
   std::vector<CodeSymbol> symbols;
 
-  bool fullDecode = true;
-  bool fixDNGBug16 = false;
-
   // Given this code len, which code id is the minimal?
   std::vector<unsigned int> extrCodeIdForLen; // index is length of code
 
 protected:
   template <typename BIT_STREAM>
-  inline std::pair<CodeSymbol, unsigned> getSymbol(BIT_STREAM& bs) const {
+  inline std::pair<CodeSymbol, int /*codeValue*/>
+  readSymbol(BIT_STREAM& bs) const {
     static_assert(BitStreamTraits<BIT_STREAM>::canUseWithHuffmanTable,
                   "This BitStream specialization is not marked as usable here");
 
@@ -63,7 +61,7 @@ class HuffmanTableVector final : public AbstractHuffmanTable {
            codeId < extrCodeIdForLen[1U + partial.code_len]; codeId++) {
         const CodeSymbol& symbol = symbols[codeId];
         if (symbol == partial) // yay, found?
-          return std::make_pair(symbol, codeId);
+          return {symbol, codeValues[codeId]};
       }
 
       // Ok, but does any symbol have this same prefix?
@@ -136,21 +134,11 @@ class HuffmanTableVector final : public AbstractHuffmanTable {
 
     bs.fill(32);
 
-    const auto got = getSymbol(bs);
-    const unsigned codeId = got.second;
-
-    const int diff_l = codeValues[codeId];
-
-    if (!FULL_DECODE)
-      return diff_l;
-
-    if (diff_l == 16) {
-      if (fixDNGBug16)
-        bs.skipBitsNoFill(16);
-      return -32768;
-    }
+    CodeSymbol symbol;
+    int codeValue;
+    std::tie(symbol, codeValue) = readSymbol(bs);
 
-    return diff_l ? extend(bs.getBitsNoFill(diff_l), diff_l) : 0;
+    return processSymbol<BIT_STREAM, FULL_DECODE>(bs, symbol, codeValue);
   }
 };
 
diff --git a/src/librawspeed/decompressors/JpegDecompressor.cpp b/src/librawspeed/decompressors/JpegDecompressor.cpp
index b2166a07f..a9623c129 100644
--- a/src/librawspeed/decompressors/JpegDecompressor.cpp
+++ b/src/librawspeed/decompressors/JpegDecompressor.cpp
@@ -40,7 +40,6 @@
 #include "io/IOException.h" // for ThrowIOE
 #endif
 
-using std::vector;
 using std::unique_ptr;
 using std::min;
 
@@ -117,8 +116,6 @@ void JpegDecompressor::decode(uint32_t offX,
                               uint32_t offY) { /* Each slice is a JPEG image */
   struct JpegDecompressStruct dinfo;
 
-  vector<JSAMPROW> buffer(1);
-
   const auto size = input.getRemainSize();
 
   JPEG_MEMSRC(&dinfo, input.getData(size), size);
@@ -136,11 +133,14 @@ void JpegDecompressor::decode(uint32_t offX,
       complete_buffer(
           alignedMallocArray<uint8_t, 16>(dinfo.output_height, row_stride),
           &alignedFree);
+
+  const Array2DRef<uint8_t> tmp(&complete_buffer[0],
+                                dinfo.output_components * dinfo.output_width,
+                                dinfo.output_height, row_stride);
+
   while (dinfo.output_scanline < dinfo.output_height) {
-    buffer[0] = static_cast<JSAMPROW>(
-        &complete_buffer[static_cast<size_t>(dinfo.output_scanline) *
-                         row_stride]);
-    if (0 == jpeg_read_scanlines(&dinfo, &buffer[0], 1))
+    auto rowOut = static_cast<JSAMPROW>(&tmp(dinfo.output_scanline, 0));
+    if (0 == jpeg_read_scanlines(&dinfo, &rowOut, 1))
       ThrowRDE("JPEG Error while decompressing image.");
   }
   jpeg_finish_decompress(&dinfo);
@@ -148,16 +148,11 @@ void JpegDecompressor::decode(uint32_t offX,
   // Now the image is decoded, and we copy the image data
   int copy_w = min(mRaw->dim.x - offX, dinfo.output_width);
   int copy_h = min(mRaw->dim.y - offY, dinfo.output_height);
-  for (int y = 0; y < copy_h; y++) {
-    uint8_t* src = &complete_buffer[static_cast<size_t>(row_stride) * y];
-    auto* dst = reinterpret_cast<uint16_t*>(mRaw->getData(offX, y + offY));
-    for (int x = 0; x < copy_w; x++) {
-      for (int c = 0; c < dinfo.output_components; c++) {
-        *dst = *src;
-        src++;
-        dst++;
-      }
-    }
+
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  for (int row = 0; row < copy_h; row++) {
+    for (int col = 0; col < dinfo.output_components * copy_w; col++)
+      out(row + offY, dinfo.output_components * offX + col) = tmp(row, col);
   }
 }
 
diff --git a/src/librawspeed/decompressors/KodakDecompressor.cpp b/src/librawspeed/decompressors/KodakDecompressor.cpp
index 702509763..a8983b65f 100644
--- a/src/librawspeed/decompressors/KodakDecompressor.cpp
+++ b/src/librawspeed/decompressors/KodakDecompressor.cpp
@@ -108,32 +108,29 @@ KodakDecompressor::decodeSegment(const uint32_t bsize) {
 }
 
 void KodakDecompressor::decompress() {
-  uint8_t* data = mRaw->getData();
-  uint32_t pitch = mRaw->pitch;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
 
   uint32_t random = 0;
-  for (auto y = 0; y < mRaw->dim.y; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(&data[y * pitch]);
-
-    for (auto x = 0; x < mRaw->dim.x; x += segment_size) {
-      const uint32_t len = std::min(segment_size, mRaw->dim.x - x);
+  for (int row = 0; row < out.height; row++) {
+    for (int col = 0; col < out.width;) {
+      const int len = std::min(segment_size, mRaw->dim.x - col);
 
       const segment buf = decodeSegment(len);
 
       std::array<int, 2> pred;
       pred.fill(0);
 
-      for (uint32_t i = 0; i < len; i++) {
+      for (int i = 0; i < len; ++i, ++col) {
         pred[i & 1] += buf[i];
 
         int value = pred[i & 1];
-        if (unsigned(value) >= (1U << bps))
+        if (!isIntN(value, bps))
           ThrowRDE("Value out of bounds %d (bps = %i)", value, bps);
 
         if (uncorrectedRawValues)
-          dest[x + i] = value;
+          out(row, col) = value;
         else
-          mRaw->setWithLookUp(value, reinterpret_cast<uint8_t*>(&dest[x + i]),
+          mRaw->setWithLookUp(value, reinterpret_cast<uint8_t*>(&out(row, col)),
                               &random);
       }
     }
diff --git a/src/librawspeed/decompressors/NikonDecompressor.cpp b/src/librawspeed/decompressors/NikonDecompressor.cpp
index 25e9009d8..ae6a73eaa 100644
--- a/src/librawspeed/decompressors/NikonDecompressor.cpp
+++ b/src/librawspeed/decompressors/NikonDecompressor.cpp
@@ -466,10 +466,10 @@ NikonDecompressor::NikonDecompressor(const RawImage& raw, ByteStream metadata,
   if (bitsPS == 14)
     huffSelect += 3;
 
-  pUp1[0] = metadata.getU16();
-  pUp1[1] = metadata.getU16();
-  pUp2[0] = metadata.getU16();
-  pUp2[1] = metadata.getU16();
+  pUp[0][0] = metadata.getU16();
+  pUp[1][0] = metadata.getU16();
+  pUp[0][1] = metadata.getU16();
+  pUp[1][1] = metadata.getU16();
 
   curve = createCurve(&metadata, bitsPS, v0, v1, &split);
 
@@ -482,43 +482,22 @@ template <typename Huffman>
 void NikonDecompressor::decompress(BitPumpMSB* bits, int start_y, int end_y) {
   Huffman ht = createHuffmanTable<Huffman>(huffSelect);
 
-  uint8_t* draw = mRaw->getData();
-  uint32_t pitch = mRaw->pitch;
-
-  int pLeft1 = 0;
-  int pLeft2 = 0;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
 
   // allow gcc to devirtualize the calls below
   auto* rawdata = reinterpret_cast<RawImageDataU16*>(mRaw.get());
 
-  const iPoint2D& size = mRaw->dim;
-  assert(size.x % 2 == 0);
-  assert(size.x >= 2);
-  for (uint32_t y = start_y; y < static_cast<uint32_t>(end_y); y++) {
-    auto* dest =
-        reinterpret_cast<uint16_t*>(&draw[y * pitch]); // Adjust destination
-    pUp1[y & 1] += ht.decodeNext(*bits);
-    pUp2[y & 1] += ht.decodeNext(*bits);
-    pLeft1 = pUp1[y & 1];
-    pLeft2 = pUp2[y & 1];
-
-    rawdata->setWithLookUp(clampBits(pLeft1, 15),
-                           reinterpret_cast<uint8_t*>(dest + 0), &random);
-    rawdata->setWithLookUp(clampBits(pLeft2, 15),
-                           reinterpret_cast<uint8_t*>(dest + 1), &random);
-
-    dest += 2;
-
-    for (uint32_t x = 2; x < static_cast<uint32_t>(size.x); x += 2) {
-      pLeft1 += ht.decodeNext(*bits);
-      pLeft2 += ht.decodeNext(*bits);
-
-      rawdata->setWithLookUp(clampBits(pLeft1, 15),
-                             reinterpret_cast<uint8_t*>(dest + 0), &random);
-      rawdata->setWithLookUp(clampBits(pLeft2, 15),
-                             reinterpret_cast<uint8_t*>(dest + 1), &random);
-
-      dest += 2;
+  assert(out.width % 2 == 0);
+  assert(out.width >= 2);
+  for (int row = start_y; row < end_y; row++) {
+    std::array<int, 2> pred = pUp[row & 1];
+    for (int col = 0; col < out.width; col++) {
+      pred[col & 1] += ht.decodeNext(*bits);
+      if (col < 2)
+        pUp[row & 1][col & 1] = pred[col & 1];
+      rawdata->setWithLookUp(clampBits(pred[col & 1], 15),
+                             reinterpret_cast<uint8_t*>(&out(row, col)),
+                             &random);
     }
   }
 }
diff --git a/src/librawspeed/decompressors/NikonDecompressor.h b/src/librawspeed/decompressors/NikonDecompressor.h
index 534886dd6..3b3b26d35 100644
--- a/src/librawspeed/decompressors/NikonDecompressor.h
+++ b/src/librawspeed/decompressors/NikonDecompressor.h
@@ -37,8 +37,7 @@ class NikonDecompressor final : public AbstractDecompressor {
   uint32_t huffSelect = 0;
   uint32_t split = 0;
 
-  std::array<int, 2> pUp1;
-  std::array<int, 2> pUp2;
+  std::array<std::array<int, 2>, 2> pUp;
 
   std::vector<uint16_t> curve;
 
diff --git a/src/librawspeed/decompressors/OlympusDecompressor.cpp b/src/librawspeed/decompressors/OlympusDecompressor.cpp
index 5c795fb34..b17924d6a 100644
--- a/src/librawspeed/decompressors/OlympusDecompressor.cpp
+++ b/src/librawspeed/decompressors/OlympusDecompressor.cpp
@@ -99,18 +99,18 @@ OlympusDecompressor::parseCarry(BitPumpMSB* bits,
   return (diff * 4) | low;
 }
 
-inline int OlympusDecompressor::getPred(int row, int x, uint16_t* dest,
-                                        const uint16_t* up_ptr) {
-  auto getLeft = [dest]() { return dest[-2]; };
-  auto getUp = [up_ptr]() { return up_ptr[0]; };
-  auto getLeftUp = [up_ptr]() { return up_ptr[-2]; };
+inline int OlympusDecompressor::getPred(const Array2DRef<uint16_t> out, int row,
+                                        int col) {
+  auto getLeft = [&]() { return out(row, col - 2); };
+  auto getUp = [&]() { return out(row - 2, col); };
+  auto getLeftUp = [&]() { return out(row - 2, col - 2); };
 
   int pred;
-  if (row < 2 && x < 2)
+  if (row < 2 && col < 2)
     pred = 0;
   else if (row < 2)
     pred = getLeft();
-  else if (x < 2)
+  else if (col < 2)
     pred = getUp();
   else {
     int left = getLeft();
@@ -139,23 +139,19 @@ void OlympusDecompressor::decompressRow(BitPumpMSB* bits, int row) const {
   assert(mRaw->dim.x > 0);
   assert(mRaw->dim.x % 2 == 0);
 
-  int pitch = mRaw->pitch;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
 
   std::array<std::array<int, 3>, 2> acarry{{}};
 
-  auto* dest = reinterpret_cast<uint16_t*>(mRaw->getData(0, row));
-  const auto* up_ptr = row > 0 ? &dest[-pitch] : &dest[0];
-  for (int x = 0; x < mRaw->dim.x; x++) {
-    int c = x & 1;
+  for (int col = 0; col < out.width; col++) {
+    int c = col & 1;
 
     std::array<int, 3>& carry = acarry[c];
 
     int diff = parseCarry(bits, &carry);
-    int pred = getPred(row, x, dest, up_ptr);
+    int pred = getPred(out, row, col);
 
-    *dest = pred + diff;
-    dest++;
-    up_ptr++;
+    out(row, col) = pred + diff;
   }
 }
 
diff --git a/src/librawspeed/decompressors/OlympusDecompressor.h b/src/librawspeed/decompressors/OlympusDecompressor.h
index a9d420b21..54d7f4c49 100644
--- a/src/librawspeed/decompressors/OlympusDecompressor.h
+++ b/src/librawspeed/decompressors/OlympusDecompressor.h
@@ -46,8 +46,7 @@ class OlympusDecompressor final : public AbstractDecompressor {
   inline __attribute__((always_inline)) int
   parseCarry(BitPumpMSB* bits, std::array<int, 3>* carry) const;
 
-  static inline int getPred(int row, int x, uint16_t* dest,
-                            const uint16_t* up_ptr);
+  static inline int getPred(Array2DRef<uint16_t> out, int row, int col);
 
   void decompressRow(BitPumpMSB* bits, int row) const;
 
diff --git a/src/librawspeed/decompressors/PanasonicDecompressor.cpp b/src/librawspeed/decompressors/PanasonicDecompressor.cpp
index 3ae17bdcb..ca65b8543 100644
--- a/src/librawspeed/decompressors/PanasonicDecompressor.cpp
+++ b/src/librawspeed/decompressors/PanasonicDecompressor.cpp
@@ -163,9 +163,11 @@ class PanasonicDecompressor::ProxyStream {
   }
 };
 
-void PanasonicDecompressor::processPixelPacket(
-    ProxyStream* bits, int y, uint16_t* dest, int xbegin,
+inline void PanasonicDecompressor::processPixelPacket(
+    ProxyStream* bits, int row, int col,
     std::vector<uint32_t>* zero_pos) const noexcept {
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
   int sh = 0;
 
   std::array<int, 2> pred;
@@ -176,7 +178,7 @@ void PanasonicDecompressor::processPixelPacket(
 
   int u = 0;
 
-  for (int p = 0; p < PixelsPerPacket; p++) {
+  for (int p = 0; p < PixelsPerPacket; ++p, ++col) {
     const int c = p & 1;
 
     if (u == 2) {
@@ -198,13 +200,12 @@ void PanasonicDecompressor::processPixelPacket(
         pred[c] = nonz[c] << 4 | bits->getBits(4);
     }
 
-    *dest = pred[c];
+    out(row, col) = pred[c];
 
     if (zero_is_bad && 0 == pred[c])
-      zero_pos->push_back((y << 16) | (xbegin + p));
+      zero_pos->push_back((row << 16) | col);
 
     u++;
-    dest++;
   }
 }
 
@@ -213,28 +214,22 @@ void PanasonicDecompressor::processBlock(const Block& block,
     noexcept {
   ProxyStream bits(block.bs, section_split_offset);
 
-  for (int y = block.beginCoord.y; y <= block.endCoord.y; y++) {
-    int x = 0;
+  for (int row = block.beginCoord.y; row <= block.endCoord.y; row++) {
+    int col = 0;
     // First row may not begin at the first column.
-    if (block.beginCoord.y == y)
-      x = block.beginCoord.x;
+    if (block.beginCoord.y == row)
+      col = block.beginCoord.x;
 
-    int endx = mRaw->dim.x;
+    int endCol = mRaw->dim.x;
     // Last row may end before the last column.
-    if (block.endCoord.y == y)
-      endx = block.endCoord.x;
-
-    auto* dest = reinterpret_cast<uint16_t*>(mRaw->getData(x, y));
+    if (block.endCoord.y == row)
+      endCol = block.endCoord.x;
 
-    assert(x % PixelsPerPacket == 0);
-    assert(endx % PixelsPerPacket == 0);
+    assert(col % PixelsPerPacket == 0);
+    assert(endCol % PixelsPerPacket == 0);
 
-    for (; x < endx;) {
-      processPixelPacket(&bits, y, dest, x, zero_pos);
-
-      x += PixelsPerPacket;
-      dest += PixelsPerPacket;
-    }
+    for (; col < endCol; col += PixelsPerPacket)
+      processPixelPacket(&bits, row, col, zero_pos);
   }
 }
 
diff --git a/src/librawspeed/decompressors/PanasonicDecompressor.h b/src/librawspeed/decompressors/PanasonicDecompressor.h
index 6591c01fd..e9e845ec0 100644
--- a/src/librawspeed/decompressors/PanasonicDecompressor.h
+++ b/src/librawspeed/decompressors/PanasonicDecompressor.h
@@ -73,8 +73,9 @@ class PanasonicDecompressor final : public AbstractDecompressor {
 
   void chopInputIntoBlocks();
 
-  void processPixelPacket(ProxyStream* bits, int y, uint16_t* dest, int xbegin,
-                          std::vector<uint32_t>* zero_pos) const noexcept;
+  inline void
+  processPixelPacket(ProxyStream* bits, int row, int col,
+                     std::vector<uint32_t>* zero_pos) const noexcept;
 
   void processBlock(const Block& block, std::vector<uint32_t>* zero_pos) const
       noexcept;
diff --git a/src/librawspeed/decompressors/PanasonicDecompressorV5.cpp b/src/librawspeed/decompressors/PanasonicDecompressorV5.cpp
index 3f32b8253..09a0874f0 100644
--- a/src/librawspeed/decompressors/PanasonicDecompressorV5.cpp
+++ b/src/librawspeed/decompressors/PanasonicDecompressorV5.cpp
@@ -174,21 +174,19 @@ class PanasonicDecompressorV5::ProxyStream {
 };
 
 template <const PanasonicDecompressorV5::PacketDsc& dsc>
-void PanasonicDecompressorV5::processPixelPacket(BitPumpLSB* bs,
-                                                 uint16_t* dest) const {
+inline void PanasonicDecompressorV5::processPixelPacket(BitPumpLSB* bs, int row,
+                                                        int col) const {
   static_assert(dsc.pixelsPerPacket > 0, "dsc should be compile-time const");
   static_assert(dsc.bps > 0 && dsc.bps <= 16, "");
 
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
   assert(bs->getFillLevel() == 0);
 
-  const uint16_t* const endDest = dest + dsc.pixelsPerPacket;
-  for (; dest != endDest;) {
+  for (int p = 0; p < dsc.pixelsPerPacket;) {
     bs->fill();
-    for (; bs->getFillLevel() >= dsc.bps; dest++) {
-      assert(dest != endDest);
-
-      *dest = bs->getBitsNoFill(dsc.bps);
-    }
+    for (; bs->getFillLevel() >= dsc.bps; ++p, ++col)
+      out(row, col) = bs->getBitsNoFill(dsc.bps);
   }
   bs->skipBitsNoFill(bs->getFillLevel()); // get rid of padding.
 }
@@ -201,28 +199,22 @@ void PanasonicDecompressorV5::processBlock(const Block& block) const {
   ProxyStream proxy(block.bs);
   BitPumpLSB bs(proxy.getStream());
 
-  for (int y = block.beginCoord.y; y <= block.endCoord.y; y++) {
-    int x = 0;
+  for (int row = block.beginCoord.y; row <= block.endCoord.y; row++) {
+    int col = 0;
     // First row may not begin at the first column.
-    if (block.beginCoord.y == y)
-      x = block.beginCoord.x;
+    if (block.beginCoord.y == row)
+      col = block.beginCoord.x;
 
     int endx = mRaw->dim.x;
     // Last row may end before the last column.
-    if (block.endCoord.y == y)
+    if (block.endCoord.y == row)
       endx = block.endCoord.x;
 
-    auto* dest = reinterpret_cast<uint16_t*>(mRaw->getData(x, y));
-
-    assert(x % dsc.pixelsPerPacket == 0);
+    assert(col % dsc.pixelsPerPacket == 0);
     assert(endx % dsc.pixelsPerPacket == 0);
 
-    for (; x < endx;) {
-      processPixelPacket<dsc>(&bs, dest);
-
-      x += dsc.pixelsPerPacket;
-      dest += dsc.pixelsPerPacket;
-    }
+    for (; col < endx; col += dsc.pixelsPerPacket)
+      processPixelPacket<dsc>(&bs, row, col);
   }
 }
 
diff --git a/src/librawspeed/decompressors/PanasonicDecompressorV5.h b/src/librawspeed/decompressors/PanasonicDecompressorV5.h
index 9396c07cc..1ad2b2507 100644
--- a/src/librawspeed/decompressors/PanasonicDecompressorV5.h
+++ b/src/librawspeed/decompressors/PanasonicDecompressorV5.h
@@ -91,7 +91,7 @@ class PanasonicDecompressorV5 final : public AbstractDecompressor {
   void chopInputIntoBlocks(const PacketDsc& dsc);
 
   template <const PacketDsc& dsc>
-  void processPixelPacket(BitPumpLSB* bs, uint16_t* dest) const;
+  inline void processPixelPacket(BitPumpLSB* bs, int row, int col) const;
 
   template <const PacketDsc& dsc> void processBlock(const Block& block) const;
 
diff --git a/src/librawspeed/decompressors/PentaxDecompressor.cpp b/src/librawspeed/decompressors/PentaxDecompressor.cpp
index 9c8776607..e58a1585d 100644
--- a/src/librawspeed/decompressors/PentaxDecompressor.cpp
+++ b/src/librawspeed/decompressors/PentaxDecompressor.cpp
@@ -137,36 +137,24 @@ HuffmanTable PentaxDecompressor::SetupHuffmanTable(ByteStream* metaData) {
 }
 
 void PentaxDecompressor::decompress(const ByteStream& data) const {
-  BitPumpMSB bs(data);
-  uint8_t* draw = mRaw->getData();
-
-  assert(mRaw->dim.y > 0);
-  assert(mRaw->dim.x > 0);
-  assert(mRaw->dim.x % 2 == 0);
-
-  std::array<int, 2> pUp1 = {{}};
-  std::array<int, 2> pUp2 = {{}};
-
-  for (int y = 0; y < mRaw->dim.y && mRaw->dim.x >= 2; y++) {
-    auto* dest = reinterpret_cast<uint16_t*>(&draw[y * mRaw->pitch]);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
 
-    pUp1[y & 1] += ht.decodeNext(bs);
-    pUp2[y & 1] += ht.decodeNext(bs);
+  assert(out.height > 0);
+  assert(out.width > 0);
+  assert(out.width % 2 == 0);
 
-    int pLeft1 = dest[0] = pUp1[y & 1];
-    int pLeft2 = dest[1] = pUp2[y & 1];
-
-    for (int x = 2; x < mRaw->dim.x; x += 2) {
-      pLeft1 += ht.decodeNext(bs);
-      pLeft2 += ht.decodeNext(bs);
-
-      dest[x] = pLeft1;
-      dest[x + 1] = pLeft2;
-
-      if (pLeft1 < 0 || pLeft1 > 65535)
-        ThrowRDE("decoded value out of bounds at %d:%d", x, y);
-      if (pLeft2 < 0 || pLeft2 > 65535)
-        ThrowRDE("decoded value out of bounds at %d:%d", x, y);
+  BitPumpMSB bs(data);
+  for (int row = 0; row < out.height; row++) {
+    std::array<int, 2> pred = {{}};
+    if (row >= 2)
+      pred = {out(row - 2, 0), out(row - 2, 1)};
+
+    for (int col = 0; col < out.width; col++) {
+      pred[col & 1] += ht.decodeNext(bs);
+      int value = pred[col & 1];
+      if (!isIntN(value, 16))
+        ThrowRDE("decoded value out of bounds at %d:%d", col, row);
+      out(row, col) = value;
     }
   }
 }
diff --git a/src/librawspeed/decompressors/PhaseOneDecompressor.cpp b/src/librawspeed/decompressors/PhaseOneDecompressor.cpp
index bb4bc7e5a..fda407522 100644
--- a/src/librawspeed/decompressors/PhaseOneDecompressor.cpp
+++ b/src/librawspeed/decompressors/PhaseOneDecompressor.cpp
@@ -46,15 +46,15 @@ PhaseOneDecompressor::PhaseOneDecompressor(const RawImage& img,
     ThrowRDE("Unexpected cpp: %u", mRaw->getCpp());
 
   if (!mRaw->dim.hasPositiveArea() || mRaw->dim.x % 2 != 0 ||
-      mRaw->dim.x > 11976 || mRaw->dim.y > 8852) {
+      mRaw->dim.x > 11976 || mRaw->dim.y > 8854) {
     ThrowRDE("Unexpected image dimensions found: (%u; %u)", mRaw->dim.x,
              mRaw->dim.y);
   }
 
-  validateStrips();
+  prepareStrips();
 }
 
-void PhaseOneDecompressor::validateStrips() const {
+void PhaseOneDecompressor::prepareStrips() {
   // The 'strips' vector should contain exactly one element per row of image.
 
   // If the length is different, then the 'strips' vector is clearly incorrect.
@@ -63,42 +63,26 @@ void PhaseOneDecompressor::validateStrips() const {
              strips.size());
   }
 
-  struct RowBin {
-    using value_type = unsigned char;
-    bool isEmpty() const { return data == 0; }
-    void fill() { data = 1; }
-    value_type data = 0;
-  };
-
   // Now, the strips in 'strips' vector aren't in order.
   // The 'decltype(strips)::value_type::n' is the row number of a strip.
   // We need to make sure that we have every row (0..mRaw->dim.y-1), once.
-
-  // There are many ways to do that. Here, we take the histogram of all the
-  // row numbers, and if any bin ends up not being '1' (one strip per row),
-  // then the input is bad.
-  std::vector<RowBin> histogram;
-  histogram.resize(strips.size());
-  int numBinsFilled = 0;
-  std::for_each(strips.begin(), strips.end(),
-                [y = mRaw->dim.y, &histogram,
-                 &numBinsFilled](const PhaseOneStrip& strip) {
-                  if (strip.n < 0 || strip.n >= y)
-                    ThrowRDE("Strip specifies out-of-bounds row %u", strip.n);
-                  RowBin& rowBin = histogram[strip.n];
-                  if (!rowBin.isEmpty())
-                    ThrowRDE("Duplicate row %u", strip.n);
-                  rowBin.fill();
-                  numBinsFilled++;
-                });
-  assert(histogram.size() == strips.size());
-  assert(numBinsFilled == mRaw->dim.y &&
-         "We should only get here if all the rows/bins got filled.");
+  // For that, first let's sort them to have monotonically increasing `n`.
+  // This will also serialize the per-line outputting.
+  std::sort(
+      strips.begin(), strips.end(),
+      [](const PhaseOneStrip& a, const PhaseOneStrip& b) { return a.n < b.n; });
+  // And now ensure that slice number matches the slice's row.
+  for (decltype(strips)::size_type i = 0; i < strips.size(); ++i)
+    if (static_cast<decltype(strips)::size_type>(strips[i].n) != i)
+      ThrowRDE("Strips validation issue.");
+  // All good.
 }
 
 void PhaseOneDecompressor::decompressStrip(const PhaseOneStrip& strip) const {
-  uint32_t width = mRaw->dim.x;
-  assert(width % 2 == 0);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
+  assert(out.width > 0);
+  assert(out.width % 2 == 0);
 
   static constexpr std::array<const int, 10> length = {8,  7, 6,  9,  11,
                                                        10, 5, 12, 14, 13};
@@ -108,10 +92,11 @@ void PhaseOneDecompressor::decompressStrip(const PhaseOneStrip& strip) const {
   std::array<int32_t, 2> pred;
   pred.fill(0);
   std::array<int, 2> len;
-  auto* img = reinterpret_cast<uint16_t*>(mRaw->getData(0, strip.n));
-  for (uint32_t col = 0; col < width; col++) {
+  const int row = strip.n;
+  for (int col = 0; col < out.width; col++) {
     pump.fill(32);
-    if (col >= (width & ~7U)) // last 'width % 8' pixels.
+    if (static_cast<unsigned>(col) >=
+        (out.width & ~7U)) // last 'width % 8' pixels.
       len[0] = len[1] = 14;
     else if ((col & 7) == 0) {
       for (int& i : len) {
@@ -135,12 +120,12 @@ void PhaseOneDecompressor::decompressStrip(const PhaseOneStrip& strip) const {
 
     int i = len[col & 1];
     if (i == 14)
-      img[col] = pred[col & 1] = pump.getBitsNoFill(16);
+      out(row, col) = pred[col & 1] = pump.getBitsNoFill(16);
     else {
       pred[col & 1] +=
           static_cast<signed>(pump.getBitsNoFill(i)) + 1 - (1 << (i - 1));
       // FIXME: is the truncation the right solution here?
-      img[col] = uint16_t(pred[col & 1]);
+      out(row, col) = uint16_t(pred[col & 1]);
     }
   }
 }
diff --git a/src/librawspeed/decompressors/PhaseOneDecompressor.h b/src/librawspeed/decompressors/PhaseOneDecompressor.h
index b1f1b7ef0..37d5d5e3a 100644
--- a/src/librawspeed/decompressors/PhaseOneDecompressor.h
+++ b/src/librawspeed/decompressors/PhaseOneDecompressor.h
@@ -33,9 +33,10 @@ namespace rawspeed {
 class RawImage;
 
 struct PhaseOneStrip {
-  const int n;
-  const ByteStream bs;
+  int n;
+  ByteStream bs;
 
+  PhaseOneStrip() = default;
   PhaseOneStrip(int block, ByteStream bs_) : n(block), bs(std::move(bs_)) {}
 };
 
@@ -48,7 +49,7 @@ class PhaseOneDecompressor final : public AbstractDecompressor {
 
   void decompressThread() const noexcept;
 
-  void validateStrips() const;
+  void prepareStrips();
 
 public:
   PhaseOneDecompressor(const RawImage& img,
diff --git a/src/librawspeed/decompressors/SamsungV0Decompressor.cpp b/src/librawspeed/decompressors/SamsungV0Decompressor.cpp
index a4dc223f4..a9412df40 100644
--- a/src/librawspeed/decompressors/SamsungV0Decompressor.cpp
+++ b/src/librawspeed/decompressors/SamsungV0Decompressor.cpp
@@ -84,22 +84,14 @@ void SamsungV0Decompressor::computeStripes(ByteStream bso, ByteStream bsr) {
 }
 
 void SamsungV0Decompressor::decompress() const {
-  for (int y = 0; y < mRaw->dim.y; y++)
-    decompressStrip(y, stripes[y]);
+  for (int row = 0; row < mRaw->dim.y; row++)
+    decompressStrip(row, stripes[row]);
 
   // Swap red and blue pixels to get the final CFA pattern
-  for (int y = 0; y < mRaw->dim.y - 1; y += 2) {
-    auto* topline = reinterpret_cast<uint16_t*>(mRaw->getData(0, y));
-    auto* bottomline = reinterpret_cast<uint16_t*>(mRaw->getData(0, y + 1));
-
-    for (int x = 0; x < mRaw->dim.x - 1; x += 2) {
-      uint16_t temp = topline[1];
-      topline[1] = bottomline[0];
-      bottomline[0] = temp;
-
-      topline += 2;
-      bottomline += 2;
-    }
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  for (int row = 0; row < out.height - 1; row += 2) {
+    for (int col = 0; col < out.width - 1; col += 2)
+      std::swap(out(row, col + 1), out(row + 1, col));
   }
 }
 
@@ -110,27 +102,19 @@ int32_t SamsungV0Decompressor::calcAdj(BitPumpMSB32* bits, int b) {
   return adj;
 }
 
-void SamsungV0Decompressor::decompressStrip(uint32_t y,
+void SamsungV0Decompressor::decompressStrip(int row,
                                             const ByteStream& bs) const {
-  const uint32_t width = mRaw->dim.x;
-  assert(width > 0);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  assert(out.width > 0);
 
   BitPumpMSB32 bits(bs);
 
   std::array<int, 4> len;
   for (int& i : len)
-    i = y < 2 ? 7 : 4;
-
-  auto* img = reinterpret_cast<uint16_t*>(mRaw->getData(0, y));
-  const auto* const past_last =
-      reinterpret_cast<uint16_t*>(mRaw->getData(width - 1, y) + mRaw->getBpp());
-  uint16_t* img_up = reinterpret_cast<uint16_t*>(
-      mRaw->getData(0, std::max(0, static_cast<int>(y) - 1)));
-  uint16_t* img_up2 = reinterpret_cast<uint16_t*>(
-      mRaw->getData(0, std::max(0, static_cast<int>(y) - 2)));
+    i = row < 2 ? 7 : 4;
 
   // Image is arranged in groups of 16 pixels horizontally
-  for (uint32_t x = 0; x < width; x += 16) {
+  for (int col = 0; col < out.width; col += 16) {
     bits.fill();
     bool dir = !!bits.getBitsNoFill(1);
 
@@ -165,10 +149,10 @@ void SamsungV0Decompressor::decompressStrip(uint32_t y,
     if (dir) {
       // Upward prediction
 
-      if (y < 2)
+      if (row < 2)
         ThrowRDE("Upward prediction for the first two rows. Raw corrupt");
 
-      if (x + 16 >= width)
+      if (col + 16 >= out.width)
         ThrowRDE("Upward prediction for the last block of pixels. Raw corrupt");
 
       // First we decode even pixels
@@ -176,7 +160,7 @@ void SamsungV0Decompressor::decompressStrip(uint32_t y,
         int b = len[c >> 3];
         int32_t adj = calcAdj(&bits, b);
 
-        img[c] = adj + img_up[c];
+        out(row, col + c) = adj + out(row - 1, col + c);
       }
 
       // Now we decode odd pixels
@@ -186,34 +170,30 @@ void SamsungV0Decompressor::decompressStrip(uint32_t y,
         int b = len[2 | (c >> 3)];
         int32_t adj = calcAdj(&bits, b);
 
-        img[c] = adj + img_up2[c];
+        out(row, col + c) = adj + out(row - 2, col + c);
       }
     } else {
       // Left to right prediction
       // First we decode even pixels
-      int pred_left = x != 0 ? img[-2] : 128;
+      int pred_left = col != 0 ? out(row, col - 2) : 128;
       for (int c = 0; c < 16; c += 2) {
         int b = len[c >> 3];
         int32_t adj = calcAdj(&bits, b);
 
-        if (img + c < past_last)
-          img[c] = adj + pred_left;
+        if (col + c < out.width)
+          out(row, col + c) = adj + pred_left;
       }
 
       // Now we decode odd pixels
-      pred_left = x != 0 ? img[-1] : 128;
+      pred_left = col != 0 ? out(row, col - 1) : 128;
       for (int c = 1; c < 16; c += 2) {
         int b = len[2 | (c >> 3)];
         int32_t adj = calcAdj(&bits, b);
 
-        if (img + c < past_last)
-          img[c] = adj + pred_left;
+        if (col + c < out.width)
+          out(row, col + c) = adj + pred_left;
       }
     }
-
-    img += 16;
-    img_up += 16;
-    img_up2 += 16;
   }
 }
 
diff --git a/src/librawspeed/decompressors/SamsungV0Decompressor.h b/src/librawspeed/decompressors/SamsungV0Decompressor.h
index b3d1a7181..4a29449a3 100644
--- a/src/librawspeed/decompressors/SamsungV0Decompressor.h
+++ b/src/librawspeed/decompressors/SamsungV0Decompressor.h
@@ -36,7 +36,7 @@ class SamsungV0Decompressor final : public AbstractSamsungDecompressor {
 
   void computeStripes(ByteStream bso, ByteStream bsr);
 
-  void decompressStrip(uint32_t y, const ByteStream& bs) const;
+  void decompressStrip(int row, const ByteStream& bs) const;
 
   static int32_t calcAdj(BitPumpMSB32* bits, int b);
 
diff --git a/src/librawspeed/decompressors/SamsungV1Decompressor.cpp b/src/librawspeed/decompressors/SamsungV1Decompressor.cpp
index bc0241b6c..123a6488f 100644
--- a/src/librawspeed/decompressors/SamsungV1Decompressor.cpp
+++ b/src/librawspeed/decompressors/SamsungV1Decompressor.cpp
@@ -39,7 +39,7 @@ struct SamsungV1Decompressor::encTableItem {
 
 SamsungV1Decompressor::SamsungV1Decompressor(const RawImage& image,
                                              const ByteStream* bs_, int bit)
-    : AbstractSamsungDecompressor(image), bs(bs_), bits(bit) {
+    : AbstractSamsungDecompressor(image), bs(bs_) {
   if (mRaw->getCpp() != 1 || mRaw->getDataType() != TYPE_USHORT16 ||
       mRaw->getBpp() != 2)
     ThrowRDE("Unexpected component count / data type");
@@ -54,7 +54,8 @@ SamsungV1Decompressor::SamsungV1Decompressor(const RawImage& image,
   const uint32_t width = mRaw->dim.x;
   const uint32_t height = mRaw->dim.y;
 
-  if (width == 0 || height == 0 || width > 5664 || height > 3714)
+  if (width == 0 || height == 0 || width % 32 != 0 || height % 2 != 0 ||
+      width > 5664 || height > 3714)
     ThrowRDE("Unexpected image dimensions found: (%u; %u)", width, height);
 }
 
@@ -77,9 +78,6 @@ SamsungV1Decompressor::samsungDiff(BitPumpMSB* pump,
 }
 
 void SamsungV1Decompressor::decompress() {
-  const uint32_t width = mRaw->dim.x;
-  const uint32_t height = mRaw->dim.y;
-
   // This format has a variable length encoding of how many bits are needed
   // to encode the difference between pixels, we use a table to process it
   // that has two values, the first the number of bits that were used to
@@ -101,8 +99,6 @@ void SamsungV1Decompressor::decompress() {
                                                               {4, 8},
                                                               {4, 2}}};
   std::vector<encTableItem> tbl(1024);
-  std::array<std::array<uint16_t, 2>, 2> vpred = {{}};
-  std::array<uint16_t, 2> hpred;
 
   // We generate a 1024 entry table (to be addressed by reading 10 bits) by
   // consecutively filling in 2^(10-N) positions where N is the variable number
@@ -120,18 +116,23 @@ void SamsungV1Decompressor::decompress() {
     }
   }
 
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  assert(out.width % 32 == 0 && "Should have even count of pixels per row.");
+  assert(out.height % 2 == 0 && "Should have even row count.");
   BitPumpMSB pump(*bs);
-  for (uint32_t y = 0; y < height; y++) {
-    auto* img = reinterpret_cast<uint16_t*>(mRaw->getData(0, y));
-    for (uint32_t x = 0; x < width; x++) {
+  for (int row = 0; row < out.height; row++) {
+    std::array<int, 2> pred = {{}};
+    if (row >= 2)
+      pred = {out(row - 2, 0), out(row - 2, 1)};
+
+    for (int col = 0; col < out.width; col++) {
       int32_t diff = samsungDiff(&pump, tbl);
-      if (x < 2)
-        hpred[x] = vpred[y & 1][x] += diff;
-      else
-        hpred[x & 1] += diff;
-      img[x] = hpred[x & 1];
-      if (img[x] >> bits)
-        ThrowRDE("decoded value out of bounds at %d:%d", x, y);
+      pred[col & 1] += diff;
+
+      int value = pred[col & 1];
+      if (!isIntN(value, bits))
+        ThrowRDE("decoded value out of bounds");
+      out(row, col) = value;
     }
   }
 }
diff --git a/src/librawspeed/decompressors/SamsungV1Decompressor.h b/src/librawspeed/decompressors/SamsungV1Decompressor.h
index b79b46419..347e03a5d 100644
--- a/src/librawspeed/decompressors/SamsungV1Decompressor.h
+++ b/src/librawspeed/decompressors/SamsungV1Decompressor.h
@@ -38,7 +38,7 @@ class SamsungV1Decompressor final : public AbstractSamsungDecompressor {
                                     const std::vector<encTableItem>& tbl);
 
   const ByteStream* bs;
-  int bits;
+  static constexpr int bits = 12;
 
 public:
   SamsungV1Decompressor(const RawImage& image, const ByteStream* bs_, int bit);
diff --git a/src/librawspeed/decompressors/SamsungV2Decompressor.cpp b/src/librawspeed/decompressors/SamsungV2Decompressor.cpp
index e1bd358aa..28447f887 100644
--- a/src/librawspeed/decompressors/SamsungV2Decompressor.cpp
+++ b/src/librawspeed/decompressors/SamsungV2Decompressor.cpp
@@ -71,30 +71,28 @@ constexpr bool operator&(SamsungV2Decompressor::OptFlags lhs,
                  rhs));
 }
 
-inline int32_t SamsungV2Decompressor::getDiff(BitPumpMSB32* pump,
-                                              uint32_t len) {
+inline __attribute__((always_inline)) int16_t
+SamsungV2Decompressor::getDiff(BitPumpMSB32* pump, uint32_t len) {
   if (len == 0)
     return 0;
-  int32_t diff = pump->getBits(len);
-  // If the first bit is 1 we need to turn this into a negative number
-  if (diff >> (len - 1))
-    diff -= (1 << len);
-  return diff;
+  assert(len <= 15 && "Difference occupies at most 15 bits.");
+  return signExtend(pump->getBits(len), len);
 }
 
 SamsungV2Decompressor::SamsungV2Decompressor(const RawImage& image,
-                                             const ByteStream& bs, int bit)
-    : AbstractSamsungDecompressor(image), bits(bit) {
+                                             const ByteStream& bs,
+                                             unsigned bits)
+    : AbstractSamsungDecompressor(image) {
   if (mRaw->getCpp() != 1 || mRaw->getDataType() != TYPE_USHORT16 ||
       mRaw->getBpp() != 2)
     ThrowRDE("Unexpected component count / data type");
 
-  switch (bit) {
+  switch (bits) {
   case 12:
   case 14:
     break;
   default:
-    ThrowRDE("Unexpected bit per pixel (%u)", bit);
+    ThrowRDE("Unexpected bit per pixel (%u)", bits);
   }
 
   static constexpr const auto headerSize = 16;
@@ -107,6 +105,8 @@ SamsungV2Decompressor::SamsungV2Decompressor(const RawImage& image,
   startpump.getBits(16); // NLCVersion
   startpump.getBits(4);  // ImgFormat
   bitDepth = startpump.getBits(4) + 1;
+  if (bitDepth != bits)
+    ThrowRDE("Bit depth mismatch with container, %u vs %u", bitDepth, bits);
   startpump.getBits(4); // NumBlkInRCUnit
   startpump.getBits(4); // CompressionRatio
   width = startpump.getBits(16);
@@ -131,10 +131,9 @@ SamsungV2Decompressor::SamsungV2Decompressor(const RawImage& image,
 
   if (width == 0 || height == 0 || width % 16 != 0 || width > 6496 ||
       height > 4336)
-    ThrowRDE("Unexpected image dimensions found: (%u; %u)", width, height);
+    ThrowRDE("Unexpected image dimensions found: (%i; %i)", width, height);
 
-  if (width != static_cast<uint32_t>(mRaw->dim.x) ||
-      height != static_cast<uint32_t>(mRaw->dim.y))
+  if (width != mRaw->dim.x || height != mRaw->dim.y)
     ThrowRDE("EXIF image dimensions do not match dimensions from raw header");
 
   data = startpump.getStream(startpump.getRemainSize());
@@ -143,40 +142,40 @@ SamsungV2Decompressor::SamsungV2Decompressor(const RawImage& image,
 void SamsungV2Decompressor::decompress() {
   switch (_flags) {
   case OptFlags::NONE:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::NONE>(row);
     break;
   case OptFlags::ALL:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::ALL>(row);
     break;
 
   case OptFlags::SKIP:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::SKIP>(row);
     break;
   case OptFlags::MV:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::MV>(row);
     break;
   case OptFlags::QP:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::QP>(row);
     break;
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wswitch"
   case OptFlags::SKIP | OptFlags::MV:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::SKIP | OptFlags::MV>(row);
     break;
   case OptFlags::SKIP | OptFlags::QP:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::SKIP | OptFlags::QP>(row);
     break;
 
   case OptFlags::MV | OptFlags::QP:
-    for (uint32_t row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
       decompressRow<OptFlags::MV | OptFlags::QP>(row);
     break;
 #pragma GCC diagnostic pop
@@ -194,7 +193,189 @@ void SamsungV2Decompressor::decompress() {
 // the actual difference bits
 
 template <SamsungV2Decompressor::OptFlags optflags>
-void SamsungV2Decompressor::decompressRow(uint32_t row) {
+inline __attribute__((always_inline)) std::array<uint16_t, 16>
+SamsungV2Decompressor::prepareBaselineValues(BitPumpMSB32* pump, int row,
+                                             int col) {
+  const Array2DRef<uint16_t> img(mRaw->getU16DataAsUncroppedArray2DRef());
+
+  std::array<uint16_t, 16> baseline;
+
+  if (!(optflags & OptFlags::QP) && !(col & 63)) {
+    static constexpr std::array<int32_t, 3> scalevals = {{0, -2, 2}};
+    uint32_t i = pump->getBits(2);
+    scale = i < 3 ? scale + scalevals[i] : pump->getBits(12);
+  }
+
+  // First we figure out which reference pixels mode we're in
+  if (optflags & OptFlags::MV)
+    motion = pump->getBits(1) ? 3 : 7;
+  else if (!pump->getBits(1))
+    motion = pump->getBits(3);
+
+  if ((row == 0 || row == 1) && (motion != 7))
+    ThrowRDE("At start of image and motion isn't 7. File corrupted?");
+
+  if (motion == 7) {
+    // The base case.
+    // If we're at the left edge we just start at the initial value.
+    if (col == 0) {
+      baseline.fill(initVal);
+      return baseline;
+    }
+    // Else just set all pixels to the previous ones on the same line.
+    std::array<uint16_t, 2> prev;
+    for (int i = 0; i < 2; i++)
+      prev[i] = img(row, col + i - 2);
+    for (int i = 0; i < 16; i++)
+      baseline[i] = prev[i & 1];
+    return baseline;
+  }
+
+  // The complex case, we now need to actually look up one or two lines above
+  if (row < 2)
+    ThrowRDE("Got a previous line lookup on first two lines. File corrupted?");
+
+  static constexpr std::array<int32_t, 7> motionOffset = {-4, -2, -2, 0,
+                                                          0,  2,  4};
+  static constexpr std::array<int32_t, 7> motionDoAverage = {0, 0, 1, 0,
+                                                             1, 0, 0};
+
+  int32_t slideOffset = motionOffset[motion];
+  int32_t doAverage = motionDoAverage[motion];
+
+  for (int i = 0; i < 16; i++) {
+    int refRow = row;
+    int refCol = col + i + slideOffset;
+
+    if ((row + i) & 1) { // Red or blue pixels use same color two lines up
+      refRow -= 2;
+    } else { // Green pixel N uses Green pixel N from row above
+      refRow -= 1;
+      refCol += (i & 1) ? -1 : 1; // (top left or top right)
+    }
+
+    if (refCol < 0)
+      ThrowRDE("Bad motion %u at the beginning of the row", motion);
+    if ((refCol >= width) || (doAverage && (refCol + 2 >= width)))
+      ThrowRDE("Bad motion %u at the end of the row", motion);
+
+    // In some cases we use as reference the rounded average of this pixel
+    // and the one two columns after it
+    if (doAverage) {
+      baseline[i] = (img(refRow, refCol) + img(refRow, refCol + 2) + 1) >> 1;
+    } else
+      baseline[i] = img(refRow, refCol);
+  }
+
+  return baseline;
+}
+
+template <SamsungV2Decompressor::OptFlags optflags>
+inline __attribute__((always_inline)) std::array<uint32_t, 4>
+SamsungV2Decompressor::decodeDiffLengths(BitPumpMSB32* pump, int row) {
+  if (!(optflags & OptFlags::SKIP || !pump->getBits(1)))
+    return {};
+
+  std::array<uint32_t, 4> diffBits;
+
+  // Figure out how many difference bits we have to read for each pixel
+  std::array<uint32_t, 4> flags;
+  for (unsigned int& flag : flags)
+    flag = pump->getBits(2);
+
+  for (int i = 0; i < 4; i++) {
+    // The color is 0-Green 1-Blue 2-Red
+    uint32_t colornum = (row % 2 != 0) ? i >> 1 : ((i >> 1) + 2) % 3;
+
+    assert(flags[i] <= 3);
+    switch (flags[i]) {
+    case 0:
+      diffBits[i] = diffBitsMode[colornum][0];
+      break;
+    case 1:
+      diffBits[i] = diffBitsMode[colornum][0] + 1;
+      break;
+    case 2:
+      if (diffBitsMode[colornum][0] == 0)
+        ThrowRDE("Difference bits underflow. File corrupted?");
+      diffBits[i] = diffBitsMode[colornum][0] - 1;
+      break;
+    case 3:
+      diffBits[i] = pump->getBits(4);
+      break;
+    default:
+      __builtin_unreachable();
+    }
+
+    diffBitsMode[colornum][0] = diffBitsMode[colornum][1];
+    diffBitsMode[colornum][1] = diffBits[i];
+
+    if (diffBits[i] > bitDepth + 1)
+      ThrowRDE("Too many difference bits (%u). File corrupted?", diffBits[i]);
+    assert(diffBits[i] <= 15 && "So any difference fits within uint16_t");
+  }
+
+  return diffBits;
+}
+
+template <SamsungV2Decompressor::OptFlags optflags>
+inline __attribute__((always_inline)) std::array<int, 16>
+SamsungV2Decompressor::decodeDifferences(BitPumpMSB32* pump, int row) {
+  // Figure out how many difference bits we have to read for each pixel
+  const std::array<uint32_t, 4> diffBits =
+      decodeDiffLengths<optflags>(pump, row);
+
+  // Actually read the differences. We know these fit into 15-bit ints.
+  std::array<int16_t, 16> diffs;
+  for (int i = 0; i < 16; i++) {
+    uint32_t len = diffBits[i >> 2];
+    int16_t diff = getDiff(pump, len);
+    diffs[i] = diff;
+  }
+
+  // Reshuffle the differences, while they still are only 16-bit.
+  std::array<int16_t, 16> shuffled;
+  for (int i = 0; i < 16; i++) {
+    int p;
+    // The differences are stored interlaced:
+    // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+    if (row % 2)
+      p = ((i % 8) << 1) - (i >> 3) + 1;
+    else
+      p = ((i % 8) << 1) + (i >> 3);
+
+    shuffled[p] = diffs[i];
+  }
+
+  // And finally widen and scale the differences.
+  std::array<int, 16> scaled;
+  for (int i = 0; i < 16; i++) {
+    int scaledDiff = int(shuffled[i]) * (scale * 2 + 1) + scale;
+    scaled[i] = scaledDiff;
+  }
+
+  return scaled;
+}
+
+template <SamsungV2Decompressor::OptFlags optflags>
+inline __attribute__((always_inline)) void
+SamsungV2Decompressor::processBlock(BitPumpMSB32* pump, int row, int col) {
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+
+  const std::array<uint16_t, 16> baseline =
+      prepareBaselineValues<optflags>(pump, row, col);
+
+  // Read the differences for each pixel, already reshuffled and scaled
+  const std::array<int, 16> diffs = decodeDifferences<optflags>(pump, row);
+
+  // Actually apply the differences and write them to the pixels
+  for (int i = 0; i < 16; ++i, ++col)
+    out(row, col) = clampBits(baseline[i] + diffs[i], bitDepth);
+}
+
+template <SamsungV2Decompressor::OptFlags optflags>
+void SamsungV2Decompressor::decompressRow(int row) {
+
   // Align pump to 16byte boundary
   const auto line_offset = data.getPosition();
   if ((line_offset & 0xf) != 0)
@@ -202,148 +383,18 @@ void SamsungV2Decompressor::decompressRow(uint32_t row) {
 
   BitPumpMSB32 pump(data);
 
-  auto* img = reinterpret_cast<uint16_t*>(mRaw->getData(0, row));
-  uint16_t* img_up = reinterpret_cast<uint16_t*>(
-      mRaw->getData(0, std::max(0, static_cast<int>(row) - 1)));
-  uint16_t* img_up2 = reinterpret_cast<uint16_t*>(
-      mRaw->getData(0, std::max(0, static_cast<int>(row) - 2)));
-
   // Initialize the motion and diff modes at the start of the line
-  uint32_t motion = 7;
+  motion = 7;
   // By default we are not scaling values at all
-  int32_t scale = 0;
+  scale = 0;
 
-  std::array<std::array<int, 2>, 3> diffBitsMode = {{}};
   for (auto& i : diffBitsMode)
     i[0] = i[1] = (row == 0 || row == 1) ? 7 : 4;
 
   assert(width >= 16);
   assert(width % 16 == 0);
-  for (uint32_t col = 0; col < width; col += 16) {
-    if (!(optflags & OptFlags::QP) && !(col & 63)) {
-      static constexpr std::array<int32_t, 3> scalevals = {{0, -2, 2}};
-      uint32_t i = pump.getBits(2);
-      scale = i < 3 ? scale + scalevals[i] : pump.getBits(12);
-    }
-
-    // First we figure out which reference pixels mode we're in
-    if (optflags & OptFlags::MV)
-      motion = pump.getBits(1) ? 3 : 7;
-    else if (!pump.getBits(1))
-      motion = pump.getBits(3);
-
-    if ((row == 0 || row == 1) && (motion != 7))
-      ThrowRDE("At start of image and motion isn't 7. File corrupted?");
-
-    if (motion == 7) {
-      // The base case, just set all pixels to the previous ones on the same
-      // line If we're at the left edge we just start at the initial value
-      for (uint32_t i = 0; i < 16; i++)
-        img[i] = (col == 0) ? initVal : *(img + i - 2);
-    } else {
-      // The complex case, we now need to actually lookup one or two lines
-      // above
-      if (row < 2)
-        ThrowRDE(
-            "Got a previous line lookup on first two lines. File corrupted?");
-
-      static constexpr std::array<int32_t, 7> motionOffset = {-4, -2, -2, 0,
-                                                              0,  2,  4};
-      static constexpr std::array<int32_t, 7> motionDoAverage = {0, 0, 1, 0,
-                                                                 1, 0, 0};
-
-      int32_t slideOffset = motionOffset[motion];
-      int32_t doAverage = motionDoAverage[motion];
-
-      for (uint32_t i = 0; i < 16; i++) {
-        uint16_t* line;
-        uint16_t* refpixel;
-
-        if ((row + i) & 0x1) {
-          // Red or blue pixels use same color two lines up
-          line = img_up2;
-          refpixel = line + i + slideOffset;
-        } else {
-          // Green pixel N uses Green pixel N from row above
-          // (top left or top right)
-          line = img_up;
-          refpixel = line + i + slideOffset + (((i % 2) != 0) ? -1 : 1);
-        }
-
-        if (col == 0 && line > refpixel)
-          ThrowRDE("Bad motion %u at the beginning of the row", motion);
-        if (col + 16 == width && ((refpixel >= line + 16) ||
-                                  (doAverage && (refpixel + 2 >= line + 16))))
-          ThrowRDE("Bad motion %u at the end of the row", motion);
-
-        // In some cases we use as reference interpolation of this pixel and
-        // the next
-        if (doAverage)
-          img[i] = (*refpixel + *(refpixel + 2) + 1) >> 1;
-        else
-          img[i] = *refpixel;
-      }
-    }
-
-    // Figure out how many difference bits we have to read for each pixel
-    std::array<uint32_t, 4> diffBits = {};
-    if (optflags & OptFlags::SKIP || !pump.getBits(1)) {
-      std::array<uint32_t, 4> flags;
-      for (unsigned int& flag : flags)
-        flag = pump.getBits(2);
-
-      for (uint32_t i = 0; i < 4; i++) {
-        // The color is 0-Green 1-Blue 2-Red
-        uint32_t colornum = (row % 2 != 0) ? i >> 1 : ((i >> 1) + 2) % 3;
-
-        assert(flags[i] <= 3);
-        switch (flags[i]) {
-        case 0:
-          diffBits[i] = diffBitsMode[colornum][0];
-          break;
-        case 1:
-          diffBits[i] = diffBitsMode[colornum][0] + 1;
-          break;
-        case 2:
-          if (diffBitsMode[colornum][0] == 0)
-            ThrowRDE("Difference bits underflow. File corrupted?");
-          diffBits[i] = diffBitsMode[colornum][0] - 1;
-          break;
-        case 3:
-          diffBits[i] = pump.getBits(4);
-          break;
-        default:
-          __builtin_unreachable();
-        }
-
-        diffBitsMode[colornum][0] = diffBitsMode[colornum][1];
-        diffBitsMode[colornum][1] = diffBits[i];
-
-        if (diffBits[i] > bitDepth + 1)
-          ThrowRDE("Too many difference bits. File corrupted?");
-      }
-    }
-
-    // Actually read the differences and write them to the pixels
-    for (uint32_t i = 0; i < 16; i++) {
-      uint32_t len = diffBits[i >> 2];
-      int32_t diff = getDiff(&pump, len);
-
-      uint16_t* value = nullptr;
-      // Apply the diff to pixels 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
-      if (row % 2)
-        value = &img[((i & 0x7) << 1) + 1 - (i >> 3)];
-      else
-        value = &img[((i & 0x7) << 1) + (i >> 3)];
-
-      diff = diff * (scale * 2 + 1) + scale;
-      *value = clampBits(static_cast<int>(*value) + diff, bits);
-    }
-
-    img += 16;
-    img_up += 16;
-    img_up2 += 16;
-  }
+  for (int col = 0; col < width; col += 16)
+    processBlock<optflags>(&pump, row, col);
 
   data.skipBytes(pump.getBufferPosition());
 }
diff --git a/src/librawspeed/decompressors/SamsungV2Decompressor.h b/src/librawspeed/decompressors/SamsungV2Decompressor.h
index af3afb338..ff6fb9387 100644
--- a/src/librawspeed/decompressors/SamsungV2Decompressor.h
+++ b/src/librawspeed/decompressors/SamsungV2Decompressor.h
@@ -35,22 +35,42 @@ class SamsungV2Decompressor final : public AbstractSamsungDecompressor {
   enum struct OptFlags : uint32_t;
 
 protected:
-  int bits;
-
   uint32_t bitDepth;
-  uint32_t width;
-  uint32_t height;
+  int width;
+  int height;
   OptFlags _flags;
-  uint32_t initVal;
+  uint16_t initVal;
 
   ByteStream data;
 
-  static inline int32_t getDiff(BitPumpMSB32* pump, uint32_t len);
+  int motion;
+  int scale;
+  std::array<std::array<int, 2>, 3> diffBitsMode;
+
+  static inline __attribute__((always_inline)) int16_t
+  getDiff(BitPumpMSB32* pump, uint32_t len);
+
+  template <OptFlags optflags>
+  inline __attribute__((always_inline)) std::array<uint16_t, 16>
+  prepareBaselineValues(BitPumpMSB32* pump, int row, int col);
+
+  template <OptFlags optflags>
+  inline __attribute__((always_inline)) std::array<uint32_t, 4>
+  decodeDiffLengths(BitPumpMSB32* pump, int row);
+
+  template <OptFlags optflags>
+  inline __attribute__((always_inline)) std::array<int, 16>
+  decodeDifferences(BitPumpMSB32* pump, int row);
+
+  template <OptFlags optflags>
+  inline __attribute__((always_inline)) void processBlock(BitPumpMSB32* pump,
+                                                          int row, int col);
 
-  template <OptFlags optflags> void decompressRow(uint32_t row);
+  template <OptFlags optflags> void decompressRow(int row);
 
 public:
-  SamsungV2Decompressor(const RawImage& image, const ByteStream& bs, int bit);
+  SamsungV2Decompressor(const RawImage& image, const ByteStream& bs,
+                        unsigned bit);
 
   void decompress();
 };
diff --git a/src/librawspeed/decompressors/SonyArw1Decompressor.cpp b/src/librawspeed/decompressors/SonyArw1Decompressor.cpp
index ce680b2dc..124c6b8e0 100644
--- a/src/librawspeed/decompressors/SonyArw1Decompressor.cpp
+++ b/src/librawspeed/decompressors/SonyArw1Decompressor.cpp
@@ -51,24 +51,19 @@ inline int SonyArw1Decompressor::getDiff(BitPumpMSB* bs, uint32_t len) {
 }
 
 void SonyArw1Decompressor::decompress(const ByteStream& input) const {
-  const uint32_t w = mRaw->dim.x;
-  const uint32_t h = mRaw->dim.y;
-
-  assert(w > 0);
-  assert(h > 0);
-  assert(h % 2 == 0);
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  assert(out.width > 0);
+  assert(out.height > 0);
+  assert(out.height % 2 == 0);
 
   BitPumpMSB bits(input);
-  uint8_t* data = mRaw->getData();
-  auto* dest = reinterpret_cast<uint16_t*>(&data[0]);
-  uint32_t pitch = mRaw->pitch / sizeof(uint16_t);
-  int sum = 0;
-  for (int64_t x = w - 1; x >= 0; x--) {
-    for (uint32_t y = 0; y < h + 1; y += 2) {
+  int pred = 0;
+  for (int col = out.width - 1; col >= 0; col--) {
+    for (int row = 0; row < out.height + 1; row += 2) {
       bits.fill(32);
 
-      if (y == h)
-        y = 1;
+      if (row == out.height)
+        row = 1;
 
       uint32_t len = 4 - bits.getBitsNoFill(2);
 
@@ -80,13 +75,12 @@ void SonyArw1Decompressor::decompress(const ByteStream& input) const {
           len++;
 
       int diff = getDiff(&bits, len);
-      sum += diff;
+      pred += diff;
 
-      if (sum < 0 || (sum >> 12) > 0)
+      if (!isIntN(pred, 12))
         ThrowRDE("Error decompressing");
 
-      if (y < h)
-        dest[x + y * pitch] = sum;
+      out(row, col) = pred;
     }
   }
 }
diff --git a/src/librawspeed/decompressors/SonyArw2Decompressor.cpp b/src/librawspeed/decompressors/SonyArw2Decompressor.cpp
index 8be59f8db..eeca8fd9b 100644
--- a/src/librawspeed/decompressors/SonyArw2Decompressor.cpp
+++ b/src/librawspeed/decompressors/SonyArw2Decompressor.cpp
@@ -49,25 +49,23 @@ SonyArw2Decompressor::SonyArw2Decompressor(const RawImage& img,
 }
 
 void SonyArw2Decompressor::decompressRow(int row) const {
-  uint8_t* data = mRaw->getData();
-  uint32_t pitch = mRaw->pitch;
-  int32_t w = mRaw->dim.x;
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
+  assert(out.width > 0);
+  assert(out.width % 32 == 0);
 
-  assert(mRaw->dim.x > 0);
-  assert(mRaw->dim.x % 32 == 0);
-
-  auto* dest = reinterpret_cast<uint16_t*>(&data[row * pitch]);
+  // Allow compiler to devirtualize the calls below.
+  auto& rawdata = reinterpret_cast<RawImageDataU16&>(*mRaw);
 
   ByteStream rowBs = input;
-  rowBs.skipBytes(row * mRaw->dim.x);
-  rowBs = rowBs.peekStream(mRaw->dim.x);
+  rowBs.skipBytes(row * out.width);
+  rowBs = rowBs.peekStream(out.width);
 
   BitPumpLSB bits(rowBs);
 
   uint32_t random = bits.peekBits(24);
 
   // Each loop iteration processes 16 pixels, consuming 128 bits of input.
-  for (int32_t x = 0; x < w;) {
+  for (int col = 0; col < out.width; col += ((col & 1) != 0) ? 31 : 1) {
     // 30 bits.
     int _max = bits.getBits(11);
     int _min = bits.getBits(11);
@@ -99,10 +97,9 @@ void SonyArw2Decompressor::decompressRow(int row) const {
             p = 0x7ff;
         }
       }
-      mRaw->setWithLookUp(p << 1, reinterpret_cast<uint8_t*>(&dest[x + i * 2]),
-                          &random);
+      rawdata.setWithLookUp(
+          p << 1, reinterpret_cast<uint8_t*>(&out(row, col + i * 2)), &random);
     }
-    x += ((x & 1) != 0) ? 31 : 1; // Skip to next 32 pixels
   }
 }
 
diff --git a/src/librawspeed/decompressors/VC5Decompressor.cpp b/src/librawspeed/decompressors/VC5Decompressor.cpp
index f5a8a6eef..af00d4801 100644
--- a/src/librawspeed/decompressors/VC5Decompressor.cpp
+++ b/src/librawspeed/decompressors/VC5Decompressor.cpp
@@ -119,10 +119,10 @@ VC5Decompressor::Wavelet::bandAsArray2DRef(const unsigned int iBand) const {
 }
 
 namespace {
-auto convolute = [](int x, int y, std::array<int, 4> muls,
+auto convolute = [](int row, int col, std::array<int, 4> muls,
                     const Array2DRef<const int16_t> high, auto lowGetter,
                     int DescaleShift = 0) {
-  auto highCombined = muls[0] * high(x, y);
+  auto highCombined = muls[0] * high(row, col);
   auto lowsCombined = [muls, lowGetter]() {
     int lows = 0;
     for (int i = 0; i < 3; i++)
@@ -178,38 +178,38 @@ constexpr std::array<int, 4> ConvolutionParams::Last::mul_odd;
 void VC5Decompressor::Wavelet::reconstructPass(
     const Array2DRef<int16_t> dst, const Array2DRef<const int16_t> high,
     const Array2DRef<const int16_t> low) const noexcept {
-  auto process = [low, high, dst](auto segment, int x, int y) {
-    auto lowGetter = [&x, &y, low](int delta) {
-      return low(x, y + decltype(segment)::coord_shift + delta);
+  auto process = [low, high, dst](auto segment, int row, int col) {
+    auto lowGetter = [&row, &col, low](int delta) {
+      return low(row + decltype(segment)::coord_shift + delta, col);
     };
-    auto convolution = [&x, &y, high, lowGetter](std::array<int, 4> muls) {
-      return convolute(x, y, muls, high, lowGetter, /*DescaleShift*/ 0);
+    auto convolution = [&row, &col, high, lowGetter](std::array<int, 4> muls) {
+      return convolute(row, col, muls, high, lowGetter, /*DescaleShift*/ 0);
     };
 
     int even = convolution(decltype(segment)::mul_even);
     int odd = convolution(decltype(segment)::mul_odd);
 
-    dst(x, 2 * y) = static_cast<int16_t>(even);
-    dst(x, 2 * y + 1) = static_cast<int16_t>(odd);
+    dst(2 * row, col) = static_cast<int16_t>(even);
+    dst(2 * row + 1, col) = static_cast<int16_t>(odd);
   };
 
   // Vertical reconstruction
 #ifdef HAVE_OPENMP
 #pragma omp for schedule(static)
 #endif
-  for (int y = 0; y < height; ++y) {
-    if (y == 0) {
+  for (int row = 0; row < height; ++row) {
+    if (row == 0) {
       // 1st row
-      for (int x = 0; x < width; ++x)
-        process(ConvolutionParams::First, x, y);
-    } else if (y + 1 < height) {
+      for (int col = 0; col < width; ++col)
+        process(ConvolutionParams::First, row, col);
+    } else if (row + 1 < height) {
       // middle rows
-      for (int x = 0; x < width; ++x)
-        process(ConvolutionParams::Middle, x, y);
+      for (int col = 0; col < width; ++col)
+        process(ConvolutionParams::Middle, row, col);
     } else {
       // last row
-      for (int x = 0; x < width; ++x)
-        process(ConvolutionParams::Last, x, y);
+      for (int col = 0; col < width; ++col)
+        process(ConvolutionParams::Last, row, col);
     }
   }
 }
@@ -218,14 +218,14 @@ void VC5Decompressor::Wavelet::combineLowHighPass(
     const Array2DRef<int16_t> dst, const Array2DRef<const int16_t> low,
     const Array2DRef<const int16_t> high, int descaleShift,
     bool clampUint = false) const noexcept {
-  auto process = [low, high, descaleShift, clampUint, dst](auto segment, int x,
-                                                           int y) {
-    auto lowGetter = [&x, &y, low](int delta) {
-      return low(x + decltype(segment)::coord_shift + delta, y);
+  auto process = [low, high, descaleShift, clampUint, dst](auto segment,
+                                                           int row, int col) {
+    auto lowGetter = [&row, &col, low](int delta) {
+      return low(row, col + decltype(segment)::coord_shift + delta);
     };
-    auto convolution = [&x, &y, high, lowGetter,
+    auto convolution = [&row, &col, high, lowGetter,
                         descaleShift](std::array<int, 4> muls) {
-      return convolute(x, y, muls, high, lowGetter, descaleShift);
+      return convolute(row, col, muls, high, lowGetter, descaleShift);
     };
 
     int even = convolution(decltype(segment)::mul_even);
@@ -235,24 +235,24 @@ void VC5Decompressor::Wavelet::combineLowHighPass(
       even = clampBits(even, 14);
       odd = clampBits(odd, 14);
     }
-    dst(2 * x, y) = static_cast<int16_t>(even);
-    dst(2 * x + 1, y) = static_cast<int16_t>(odd);
+    dst(row, 2 * col) = static_cast<int16_t>(even);
+    dst(row, 2 * col + 1) = static_cast<int16_t>(odd);
   };
 
   // Horizontal reconstruction
 #ifdef HAVE_OPENMP
 #pragma omp for schedule(static)
 #endif
-  for (int y = 0; y < dst.height; ++y) {
+  for (int row = 0; row < dst.height; ++row) {
     // First col
-    int x = 0;
-    process(ConvolutionParams::First, x, y);
+    int col = 0;
+    process(ConvolutionParams::First, row, col);
     // middle cols
-    for (x = 1; x + 1 < width; ++x) {
-      process(ConvolutionParams::Middle, x, y);
+    for (col = 1; col + 1 < width; ++col) {
+      process(ConvolutionParams::Middle, row, col);
     }
     // last col
-    process(ConvolutionParams::Last, x, y);
+    process(ConvolutionParams::Last, row, col);
   }
 }
 
@@ -532,7 +532,7 @@ void VC5Decompressor::Wavelet::LowPassBand::decode(const Wavelet& wavelet) {
   BitPumpMSB bits(bs);
   for (auto row = 0; row < dst.height; ++row) {
     for (auto col = 0; col < dst.width; ++col)
-      dst(col, row) = static_cast<int16_t>(bits.getBits(lowpassPrecision));
+      dst(row, col) = static_cast<int16_t>(bits.getBits(lowpassPrecision));
   }
 }
 
@@ -766,9 +766,7 @@ void VC5Decompressor::reconstructLowpassBands() const noexcept {
 }
 
 void VC5Decompressor::combineFinalLowpassBands() const noexcept {
-  const Array2DRef<uint16_t> out(reinterpret_cast<uint16_t*>(mRaw->getData()),
-                                 mRaw->dim.x, mRaw->dim.y,
-                                 mRaw->pitch / sizeof(uint16_t));
+  const Array2DRef<uint16_t> out(mRaw->getU16DataAsUncroppedArray2DRef());
 
   const int width = out.width / 2;
   const int height = out.height / 2;
@@ -790,20 +788,20 @@ void VC5Decompressor::combineFinalLowpassBands() const noexcept {
     for (int col = 0; col < width; ++col) {
       const int mid = 2048;
 
-      int gs = lowbands0(col, row);
-      int rg = lowbands1(col, row) - mid;
-      int bg = lowbands2(col, row) - mid;
-      int gd = lowbands3(col, row) - mid;
+      int gs = lowbands0(row, col);
+      int rg = lowbands1(row, col) - mid;
+      int bg = lowbands2(row, col) - mid;
+      int gd = lowbands3(row, col) - mid;
 
       int r = gs + 2 * rg;
       int b = gs + 2 * bg;
       int g1 = gs + gd;
       int g2 = gs - gd;
 
-      out(2 * col + 0, 2 * row + 0) = static_cast<uint16_t>(mVC5LogTable[r]);
-      out(2 * col + 1, 2 * row + 0) = static_cast<uint16_t>(mVC5LogTable[g1]);
-      out(2 * col + 0, 2 * row + 1) = static_cast<uint16_t>(mVC5LogTable[g2]);
-      out(2 * col + 1, 2 * row + 1) = static_cast<uint16_t>(mVC5LogTable[b]);
+      out(2 * row + 0, 2 * col + 0) = static_cast<uint16_t>(mVC5LogTable[r]);
+      out(2 * row + 0, 2 * col + 1) = static_cast<uint16_t>(mVC5LogTable[g1]);
+      out(2 * row + 1, 2 * col + 0) = static_cast<uint16_t>(mVC5LogTable[g2]);
+      out(2 * row + 1, 2 * col + 1) = static_cast<uint16_t>(mVC5LogTable[b]);
     }
   }
 }
diff --git a/src/librawspeed/interpolators/Cr2sRawInterpolator.cpp b/src/librawspeed/interpolators/Cr2sRawInterpolator.cpp
index 5f5a75a2c..6285c4c37 100644
--- a/src/librawspeed/interpolators/Cr2sRawInterpolator.cpp
+++ b/src/librawspeed/interpolators/Cr2sRawInterpolator.cpp
@@ -28,7 +28,6 @@
 #include <cassert>                         // for assert
 #include <type_traits>                     // for is_pod
 
-using std::is_pod;
 using std::array;
 
 namespace rawspeed {
@@ -38,12 +37,6 @@ struct Cr2sRawInterpolator::YCbCr final {
   int Cb;
   int Cr;
 
-  inline static void LoadY(YCbCr* dst, const YCbCr& src) {
-    assert(dst);
-
-    dst->Y = src.Y;
-  }
-
   inline static void LoadY(YCbCr* p, const uint16_t* data) {
     assert(p);
     assert(data);
@@ -59,7 +52,7 @@ struct Cr2sRawInterpolator::YCbCr final {
     p->Cr = data[2];
   }
 
-  inline static void Load(YCbCr* p, const uint16_t* data) {
+  inline static void LoadYCbCr(YCbCr* p, const uint16_t* data) {
     assert(p);
     assert(data);
 
@@ -69,14 +62,6 @@ struct Cr2sRawInterpolator::YCbCr final {
 
   YCbCr() = default;
 
-  explicit YCbCr(uint16_t* data) {
-    static_assert(is_pod<YCbCr>::value, "not a POD");
-
-    assert(data);
-
-    Load(this, data);
-  }
-
   inline void signExtend() {
     Cb -= 16384;
     Cr -= 16384;
@@ -130,7 +115,8 @@ inline void Cr2sRawInterpolator::interpolate_422_row(uint16_t* data, int w) {
     assert(x % 2 == 0);
 
     // load, process and output first pixel, which is full
-    YCbCr p0(data);
+    YCbCr p0;
+    YCbCr::LoadYCbCr(&p0, data);
     p0.process(hue);
     YUV_TO_RGB<version>(p0, data);
     data += 3;
@@ -140,7 +126,8 @@ inline void Cr2sRawInterpolator::interpolate_422_row(uint16_t* data, int w) {
     YCbCr::LoadY(&p, data);
 
     // load third pixel, which is full, process
-    YCbCr p1(data + 3);
+    YCbCr p1;
+    YCbCr::LoadYCbCr(&p1, data + 3);
     p1.process(hue);
 
     // and finally, interpolate and output the middle pixel
@@ -157,7 +144,8 @@ inline void Cr2sRawInterpolator::interpolate_422_row(uint16_t* data, int w) {
   //  .. [ Y1 Cb  Cr  ] [ Y2 ... ... ]
 
   // load, process and output first pixel, which is full
-  YCbCr p(data);
+  YCbCr p;
+  YCbCr::LoadYCbCr(&p, data);
   p.process(hue);
   YUV_TO_RGB<version>(p, data);
   data += 3;
@@ -225,7 +213,8 @@ Cr2sRawInterpolator::interpolate_420_row(std::array<uint16_t*, 3> line, int w) {
     assert(x % 2 == 0);
 
     // load, process and output first pixel of first row, which is full
-    YCbCr p0(line[0]);
+    YCbCr p0;
+    YCbCr::LoadYCbCr(&p0, line[0]);
     p0.process(hue);
     YUV_TO_RGB<version>(p0, line[0]);
     line[0] += 3;
@@ -288,7 +277,8 @@ Cr2sRawInterpolator::interpolate_420_row(std::array<uint16_t*, 3> line, int w) {
   //               .. .   .       .. .   .
 
   // load, process and output first pixel of first row, which is full
-  YCbCr p0(line[0]);
+  YCbCr p0;
+  YCbCr::LoadYCbCr(&p0, line[0]);
   p0.process(hue);
   YUV_TO_RGB<version>(p0, line[0]);
   line[0] += 3;
@@ -366,7 +356,8 @@ inline void Cr2sRawInterpolator::interpolate_420(int w, int h) {
     assert(x % 2 == 0);
 
     // load, process and output first pixel of first row, which is full
-    YCbCr p0(line[0]);
+    YCbCr p0;
+    YCbCr::LoadYCbCr(&p0, line[0]);
     p0.process(hue);
     YUV_TO_RGB<version>(p0, line[0]);
     line[0] += 3;
@@ -415,7 +406,8 @@ inline void Cr2sRawInterpolator::interpolate_420(int w, int h) {
   //  row 1:  ... [ Y3 ... ... ] [ Y4 ... ... ]
 
   // load, process and output first pixel of first row, which is full
-  YCbCr p(line[0]);
+  YCbCr p;
+  YCbCr::LoadYCbCr(&p, line[0]);
   p.process(hue);
   YUV_TO_RGB<version>(p, line[0]);
   line[0] += 3;
diff --git a/src/librawspeed/io/BitStream.cpp b/src/librawspeed/io/BitStream.cpp
new file mode 100644
index 000000000..c20e4324a
--- /dev/null
+++ b/src/librawspeed/io/BitStream.cpp
@@ -0,0 +1,27 @@
+/*
+    RawSpeed - RAW file decoder.
+
+    Copyright (C) 2019 Roman Lebedev
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "io/BitStream.h" // for BitStreamCacheBase
+
+namespace rawspeed {
+
+constexpr unsigned BitStreamCacheBase::MaxProcessBytes;
+
+} // namespace rawspeed
diff --git a/src/librawspeed/io/BitStream.h b/src/librawspeed/io/BitStream.h
index 0922da39a..f576ee125 100644
--- a/src/librawspeed/io/BitStream.h
+++ b/src/librawspeed/io/BitStream.h
@@ -23,7 +23,7 @@
 #pragma once
 
 #include "common/Common.h" // for uint32_t, uint8_t, uint64_t
-#include "io/Buffer.h"     // for Buffer::size_type, BUFFER_PADDING
+#include "io/Buffer.h"     // for Buffer::size_type
 #include "io/ByteStream.h"  // for ByteStream
 #include "io/IOException.h" // for IOException (ptr only), ThrowIOE
 #include <cassert>          // for assert
@@ -93,6 +93,12 @@ template <typename Tag, typename Cache>
 class BitStream final : public ByteStream {
   Cache cache;
 
+  // A temporary intermediate buffer that may be used by the fill() method:
+  // in debug builds to enforce the lack of out-of-bounds reads, or when we are
+  // nearing the end of the input buffer and cannot read MaxProcessBytes from
+  // it, but have to read as much as we can and fill the rest with zeros.
+  std::array<uint8_t, BitStreamCacheBase::MaxProcessBytes> tmp = {};
+
   // this method hase to be implemented in the concrete BitStream template
   // specializations. It will return the number of bytes processed. It needs
   // to process up to BitStreamCacheBase::MaxProcessBytes bytes of input.
@@ -108,57 +114,46 @@ class BitStream final : public ByteStream {
   }
 
 private:
-  inline void fillSafe() {
+  inline const uint8_t* getInput() {
     assert(data);
-    if (pos + BitStreamCacheBase::MaxProcessBytes <= size) {
-      std::array<uint8_t, BitStreamCacheBase::MaxProcessBytes> tmp;
-      tmp.fill(0);
-      assert(!(size - pos < BitStreamCacheBase::MaxProcessBytes));
-      memcpy(tmp.data(), data + pos, BitStreamCacheBase::MaxProcessBytes);
-      pos += fillCache(tmp.data(), size, &pos);
-    } else if (pos < size) {
-      std::array<uint8_t, BitStreamCacheBase::MaxProcessBytes> tmp;
-      tmp.fill(0);
-      assert(size - pos < BitStreamCacheBase::MaxProcessBytes);
-      memcpy(tmp.data(), data + pos, size - pos);
-      pos += fillCache(tmp.data(), size, &pos);
-    } else if (pos <= size + BitStreamCacheBase::MaxProcessBytes) {
-      std::array<uint8_t, BitStreamCacheBase::MaxProcessBytes> tmp;
-      tmp.fill(0);
-      pos += fillCache(tmp.data(), size, &pos);
-    } else {
-      // assert(size < pos);
+
+#if !defined(DEBUG)
+    // Do we have MaxProcessBytes or more bytes left in the input buffer?
+    // If so, then we can just read from said buffer.
+    if (pos + BitStreamCacheBase::MaxProcessBytes <= size)
+      return data + pos;
+#endif
+
+    // We have to use the intermediate buffer, either because the input is
+    // running out of bytes, or because we want to enforce bounds checking.
+
+    // Note that in order to keep all fill-level invariants we must tolerate
+    // pos being up to MaxProcessBytes past-the-end; beyond that, we throw.
+    if (pos > size + BitStreamCacheBase::MaxProcessBytes)
       ThrowIOE("Buffer overflow read in BitStream");
-    }
-  }
 
-  // In non-DEBUG builds, fillSafe() will be called at most once
-  // per the life-time of the BitStream  therefore it should *NOT* be inlined
-  // into the normal codepath.
-  inline void __attribute__((noinline, cold)) fillSafeNoinline() { fillSafe(); }
+    tmp.fill(0);
+
+    // How many bytes are left in input buffer?
+    // Since pos can be past-the-end we need to carefully handle overflow.
+    Buffer::size_type bytesRemaining = (pos < size) ? size - pos : 0;
+    // And if we are not at the end of the input, we may have more than we need.
+    bytesRemaining =
+        std::min(BitStreamCacheBase::MaxProcessBytes, bytesRemaining);
+
+    memcpy(tmp.data(), data + pos, bytesRemaining);
+    return tmp.data();
+  }
 
 public:
   inline void fill(uint32_t nbits = Cache::MaxGetBits) {
     assert(data);
     assert(nbits <= Cache::MaxGetBits);
-    if (cache.fillLevel < nbits) {
-#if defined(DEBUG)
-      // really slow, but best way to check all the assumptions.
-      fillSafe();
-#elif BUFFER_PADDING >= 8
-      static_assert(BitStreamCacheBase::MaxProcessBytes == 8,
-                    "update these too");
-      // FIXME: this looks very wrong. We don't check pos at all here.
-      // I suspect this should be:  if (pos <= size)
-      pos += fillCache(data + pos, size, &pos);
-#else
-      // disabling this run-time bounds check saves about 1% on intel x86-64
-      if (pos + BitStreamCacheBase::MaxProcessBytes <= size)
-        pos += fillCache(data + pos, size, &pos);
-      else
-        fillSafeNoinline();
-#endif
-    }
+
+    if (cache.fillLevel >= nbits)
+      return;
+
+    pos += fillCache(getInput(), size, &pos);
   }
 
   // these methods might be specialized by implementations that support it
diff --git a/src/librawspeed/io/Buffer.cpp b/src/librawspeed/io/Buffer.cpp
deleted file mode 100644
index 4133fa33d..000000000
--- a/src/librawspeed/io/Buffer.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-    RawSpeed - RAW file decoder.
-
-    Copyright (C) 2009-2014 Klaus Post
-    Copyright (C) 2017 Axel Waggershauser
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-
-#include "io/Buffer.h"
-#include "AddressSanitizer.h" // for ASan
-#include "common/Common.h"    // for uint8_t, roundUp
-#include "common/Memory.h"    // for alignedFree, alignedFreeConstPtr, alig...
-#include "io/IOException.h"   // for ThrowIOE
-#include <cassert>            // for assert
-#include <memory>             // for unique_ptr
-
-using std::unique_ptr;
-
-namespace rawspeed {
-
-unique_ptr<uint8_t, decltype(&alignedFree)> Buffer::Create(size_type size) {
-  if (!size)
-    ThrowIOE("Trying to allocate 0 bytes sized buffer.");
-
-  unique_ptr<uint8_t, decltype(&alignedFree)> data(
-      alignedMalloc<uint8_t, 16>(roundUp(size + BUFFER_PADDING, 16)),
-      &alignedFree);
-  if (!data)
-    ThrowIOE("Failed to allocate %uz bytes memory buffer.", size);
-
-  assert(!ASan::RegionIsPoisoned(data.get(), size));
-
-  return data;
-}
-
-Buffer::Buffer(unique_ptr<uint8_t, decltype(&alignedFree)> data_,
-               size_type size_)
-    : size(size_) {
-  if (!size)
-    ThrowIOE("Buffer has zero size?");
-
-  if (data_.get_deleter() != &alignedFree)
-    ThrowIOE("Wrong deleter. Expected rawspeed::alignedFree()");
-
-  data = data_.release();
-  if (!data)
-    ThrowIOE("Memory buffer is nonexistent");
-
-  assert(!ASan::RegionIsPoisoned(data, size));
-
-  isOwner = true;
-}
-
-Buffer::~Buffer() {
-  if (isOwner) {
-    alignedFreeConstPtr(data);
-  }
-}
-
-Buffer& Buffer::operator=(Buffer&& rhs) noexcept {
-  if (this == &rhs) {
-    assert(!ASan::RegionIsPoisoned(data, size));
-    return *this;
-  }
-
-  if (isOwner)
-    alignedFreeConstPtr(data);
-
-  data = rhs.data;
-  size = rhs.size;
-  isOwner = rhs.isOwner;
-
-  assert(!ASan::RegionIsPoisoned(data, size));
-
-  rhs.isOwner = false;
-
-  return *this;
-}
-
-Buffer& Buffer::operator=(const Buffer& rhs) {
-  if (this == &rhs) {
-    assert(!ASan::RegionIsPoisoned(data, size));
-    return *this;
-  }
-
-  Buffer unOwningTmp(rhs.data, rhs.size);
-  *this = std::move(unOwningTmp);
-  assert(!isOwner);
-  assert(!ASan::RegionIsPoisoned(data, size));
-
-  return *this;
-}
-
-#if 0
-Buffer* Buffer::clone() {
-  Buffer *new_map = new Buffer(size);
-  memcpy(new_map->data, data, size);
-  return new_map;
-}
-
-Buffer* Buffer::cloneRandomSize() {
-  uint32_t new_size = (rand() | (rand() << 15)) % size;
-  Buffer *new_map = new Buffer(new_size);
-  memcpy(new_map->data, data, new_size);
-  return new_map;
-}
-
-void Buffer::corrupt(int errors) {
-  for (int i = 0; i < errors; i++) {
-    uint32_t pos = (rand() | (rand() << 15)) % size;
-    data[pos] = rand() & 0xff;
-  }
-}
-#endif
-
-} // namespace rawspeed
diff --git a/src/librawspeed/io/Buffer.h b/src/librawspeed/io/Buffer.h
index ca0f35b4d..265ff113a 100644
--- a/src/librawspeed/io/Buffer.h
+++ b/src/librawspeed/io/Buffer.h
@@ -32,25 +32,12 @@
 
 namespace rawspeed {
 
-// This allows to specify the number of bytes that each Buffer needs to
-// allocate additionally to be able to remove one runtime bounds check
-// in BitStream::fill. There are two sane choices:
-// 0 : allocate exactly as much data as required, or
-// set it to the value of  BitStreamCacheBase::MaxProcessBytes
-#define BUFFER_PADDING 0UL
-
-// if the padding is >= 4, bounds checking in BitStream::fill are not compiled,
-// which supposedly saves about 1% on modern CPUs
-// WARNING: if the padding is >= 4, do *NOT* create Buffer from
-// passed unowning pointer and size. Or, subtract BUFFER_PADDING from size.
-// else bound checks will malfunction => bad things can happen !!!
-
 /*************************************************************************
  * This is the buffer abstraction.
  *
  * It allows access to some piece of memory, typically a whole or part
  * of a raw file. The underlying memory may be owned by the buffer or not.
- * It supports move operations to properly deal with owneship transfer.
+ * It supports move operations to properly deal with ownership transfer.
  * It intentionally supports only read/const access to the underlying memory.
  *
  *************************************************************************/
@@ -67,26 +54,46 @@ class Buffer
 public:
   // allocates the databuffer, and returns owning non-const pointer.
   static std::unique_ptr<uint8_t, decltype(&alignedFree)>
-  Create(size_type size);
+  Create(size_type size) {
+    if (!size)
+      ThrowIOE("Trying to allocate 0 bytes sized buffer.");
 
-  // constructs an empty buffer
-  Buffer() = default;
+    std::unique_ptr<uint8_t, decltype(&alignedFree)> data(
+        alignedMalloc<uint8_t, 16>(roundUp(size, 16)),
+        &alignedFree);
+    if (!data)
+      ThrowIOE("Failed to allocate %u bytes memory buffer.", size);
 
-  // Allocates the memory
-  explicit Buffer(size_type size_) : Buffer(Create(size_), size_) {
-    assert(!ASan::RegionIsPoisoned(data, size));
+    assert(!ASan::RegionIsPoisoned(data.get(), size));
+
+    return data;
   }
 
+  // constructs an empty buffer
+  Buffer() = default;
+
   // creates buffer from owning unique_ptr
   Buffer(std::unique_ptr<uint8_t, decltype(&alignedFree)> data_,
-         size_type size_);
+         size_type size_)
+      : size(size_) {
+    if (!size)
+      ThrowIOE("Buffer has zero size?");
+
+    if (data_.get_deleter() != &alignedFree)
+      ThrowIOE("Wrong deleter. Expected rawspeed::alignedFree()");
+
+    data = data_.release();
+    if (!data)
+      ThrowIOE("Memory buffer is nonexistent");
+
+    assert(!ASan::RegionIsPoisoned(data, size));
+
+    isOwner = true;
+  }
 
   // Data already allocated
   explicit Buffer(const uint8_t* data_, size_type size_)
       : data(data_), size(size_) {
-    static_assert(BUFFER_PADDING == 0, "please do make sure that you do NOT "
-                                       "call this function from YOUR code, and "
-                                       "then comment-out this assert.");
     assert(!ASan::RegionIsPoisoned(data, size));
   }
 
@@ -103,10 +110,45 @@ class Buffer
   }
 
   // Frees memory if owned
-  ~Buffer();
+  ~Buffer() {
+    if (isOwner) {
+      alignedFreeConstPtr(data);
+    }
+  }
+
+  Buffer& operator=(Buffer&& rhs) noexcept {
+    if (this == &rhs) {
+      assert(!ASan::RegionIsPoisoned(data, size));
+      return *this;
+    }
+
+    if (isOwner)
+      alignedFreeConstPtr(data);
+
+    data = rhs.data;
+    size = rhs.size;
+    isOwner = rhs.isOwner;
 
-  Buffer& operator=(Buffer&& rhs) noexcept;
-  Buffer& operator=(const Buffer& rhs);
+    assert(!ASan::RegionIsPoisoned(data, size));
+
+    rhs.isOwner = false;
+
+    return *this;
+  }
+
+  Buffer& operator=(const Buffer& rhs) {
+    if (this == &rhs) {
+      assert(!ASan::RegionIsPoisoned(data, size));
+      return *this;
+    }
+
+    Buffer unOwningTmp(rhs.data, rhs.size);
+    *this = std::move(unOwningTmp);
+    assert(!isOwner);
+    assert(!ASan::RegionIsPoisoned(data, size));
+
+    return *this;
+  }
 
   Buffer getSubView(size_type offset, size_type size_) const {
     if (!isValid(0, offset))
@@ -163,14 +205,8 @@ class Buffer
   }
 
   inline bool isValid(size_type offset, size_type count = 1) const {
-    return static_cast<uint64_t>(offset) + count <=
-           static_cast<uint64_t>(size) + BUFFER_PADDING;
+    return static_cast<uint64_t>(offset) + count <= static_cast<uint64_t>(size);
   }
-
-//  Buffer* clone();
-//  /* For testing purposes */
-//  void corrupt(int errors);
-//  Buffer* cloneRandomSize();
 };
 
 /*
diff --git a/src/librawspeed/io/ByteStream.h b/src/librawspeed/io/ByteStream.h
index f59f6ffbf..fb6be6a47 100644
--- a/src/librawspeed/io/ByteStream.h
+++ b/src/librawspeed/io/ByteStream.h
@@ -152,7 +152,7 @@ class ByteStream : public DataBuffer
     return DataBuffer::get<T>(pos, i);
   }
 
-  inline uint16_t peekU16() { return peek<uint16_t>(); }
+  inline uint16_t peekU16() const { return peek<uint16_t>(); }
 
   template<typename T> inline T get() {
     auto ret = peek<T>();
diff --git a/src/librawspeed/io/CMakeLists.txt b/src/librawspeed/io/CMakeLists.txt
index bb1e44094..3decfffec 100644
--- a/src/librawspeed/io/CMakeLists.txt
+++ b/src/librawspeed/io/CMakeLists.txt
@@ -4,8 +4,8 @@ FILE(GLOB SOURCES
   "BitPumpMSB.h"
   "BitPumpMSB16.h"
   "BitPumpMSB32.h"
+  "BitStream.cpp"
   "BitStream.h"
-  "Buffer.cpp"
   "Buffer.h"
   "ByteStream.h"
   "Endianness.h"
diff --git a/src/librawspeed/io/FileIOException.h b/src/librawspeed/io/FileIOException.h
index 315a3bd2f..43ae48bc1 100644
--- a/src/librawspeed/io/FileIOException.h
+++ b/src/librawspeed/io/FileIOException.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"     // for ThrowExceptionHelper
 #include "decoders/RawDecoderException.h" // for RawDecoderException
 #include <string>                         // for string
@@ -29,8 +30,9 @@ namespace rawspeed {
 
 class FileIOException final : public RawDecoderException {
 public:
-  explicit FileIOException(const std::string& msg) : RawDecoderException(msg) {}
-  explicit FileIOException(const char* msg) : RawDecoderException(msg) {}
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  FileIOException(const char* msg)
+      : RawDecoderException(msg) {}
 };
 
 #define ThrowFIE(...)                                                          \
diff --git a/src/librawspeed/io/IOException.h b/src/librawspeed/io/IOException.h
index 74f5c038c..96f69d3eb 100644
--- a/src/librawspeed/io/IOException.h
+++ b/src/librawspeed/io/IOException.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h" // for RawspeedException
 #include <string>                     // for string
 
@@ -28,8 +29,9 @@ namespace rawspeed {
 
 class IOException final : public RawspeedException {
 public:
-  explicit IOException(const std::string& msg) : RawspeedException(msg) {}
-  explicit IOException(const char* msg) : RawspeedException(msg) {}
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  IOException(const char* msg)
+      : RawspeedException(msg) {}
 };
 
 #define ThrowIOE(...) ThrowExceptionHelper(rawspeed::IOException, __VA_ARGS__)
diff --git a/src/librawspeed/metadata/CameraMetadataException.h b/src/librawspeed/metadata/CameraMetadataException.h
index b9b9fa8de..97a185be4 100644
--- a/src/librawspeed/metadata/CameraMetadataException.h
+++ b/src/librawspeed/metadata/CameraMetadataException.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"
 #include <string> // for string
 
@@ -28,9 +29,9 @@ namespace rawspeed {
 
 class CameraMetadataException final : public RawspeedException {
 public:
-  explicit CameraMetadataException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  CameraMetadataException(const char* msg)
       : RawspeedException(msg) {}
-  explicit CameraMetadataException(const char* msg) : RawspeedException(msg) {}
 };
 
 #define ThrowCME(...)                                                          \
diff --git a/src/librawspeed/parsers/CiffParserException.h b/src/librawspeed/parsers/CiffParserException.h
index 8803fb140..ed5c60558 100644
--- a/src/librawspeed/parsers/CiffParserException.h
+++ b/src/librawspeed/parsers/CiffParserException.h
@@ -22,6 +22,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"   // for ThrowExceptionHelper
 #include "parsers/RawParserException.h" // for ThrowRPE, RawParserException
 #include <string>
@@ -30,9 +31,9 @@ namespace rawspeed {
 
 class CiffParserException final : public RawParserException {
 public:
-  explicit CiffParserException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  CiffParserException(const char* msg)
       : RawParserException(msg) {}
-  explicit CiffParserException(const char* msg) : RawParserException(msg) {}
 };
 
 #define ThrowCPE(...)                                                          \
diff --git a/src/librawspeed/parsers/FiffParserException.h b/src/librawspeed/parsers/FiffParserException.h
index d6cb98279..5a1ea3844 100644
--- a/src/librawspeed/parsers/FiffParserException.h
+++ b/src/librawspeed/parsers/FiffParserException.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"   // for ThrowExceptionHelper
 #include "parsers/RawParserException.h" // for ThrowRPE, RawParserException
 #include <string>
@@ -28,9 +29,9 @@ namespace rawspeed {
 
 class FiffParserException final : public RawParserException {
 public:
-  explicit FiffParserException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  FiffParserException(const char* msg)
       : RawParserException(msg) {}
-  explicit FiffParserException(const char* msg) : RawParserException(msg) {}
 };
 
 #define ThrowFPE(...)                                                          \
diff --git a/src/librawspeed/parsers/RawParserException.h b/src/librawspeed/parsers/RawParserException.h
index c93a3f69d..4cd91f2e3 100644
--- a/src/librawspeed/parsers/RawParserException.h
+++ b/src/librawspeed/parsers/RawParserException.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"
 #include <string>
 
@@ -27,9 +28,9 @@ namespace rawspeed {
 
 class RawParserException : public RawspeedException {
 public:
-  explicit RawParserException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  RawParserException(const char* msg)
       : RawspeedException(msg) {}
-  explicit RawParserException(const char* msg) : RawspeedException(msg) {}
 };
 
 #define ThrowRPE(...)                                                          \
diff --git a/src/librawspeed/parsers/TiffParser.cpp b/src/librawspeed/parsers/TiffParser.cpp
index 4bd60c098..38654c65d 100644
--- a/src/librawspeed/parsers/TiffParser.cpp
+++ b/src/librawspeed/parsers/TiffParser.cpp
@@ -72,7 +72,7 @@ TiffRootIFDOwner TiffParser::parse(TiffIFD* parent, const Buffer& data) {
 
   TiffRootIFDOwner root = std::make_unique<TiffRootIFD>(
       parent, nullptr, bs,
-      UINT32_MAX); // tell TiffIFD constructur not to parse bs as IFD
+      UINT32_MAX); // tell TiffIFD constructor not to parse bs as IFD
 
   NORangesSet<Buffer> ifds;
 
diff --git a/src/librawspeed/parsers/TiffParserException.h b/src/librawspeed/parsers/TiffParserException.h
index 6cb30fba0..78eb32e1b 100644
--- a/src/librawspeed/parsers/TiffParserException.h
+++ b/src/librawspeed/parsers/TiffParserException.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "rawspeedconfig.h"
 #include "common/RawspeedException.h"   // for ThrowExceptionHelper
 #include "parsers/RawParserException.h" // for ThrowRPE, RawParserException
 #include <string>
@@ -29,9 +30,9 @@ namespace rawspeed {
 
 class TiffParserException final : public RawParserException {
 public:
-  explicit TiffParserException(const std::string& msg)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  TiffParserException(const char* msg)
       : RawParserException(msg) {}
-  explicit TiffParserException(const char* msg) : RawParserException(msg) {}
 };
 
 #define ThrowTPE(...)                                                          \
diff --git a/src/librawspeed/tiff/TiffIFD.cpp b/src/librawspeed/tiff/TiffIFD.cpp
index 9fc7b59d5..46bb232ac 100644
--- a/src/librawspeed/tiff/TiffIFD.cpp
+++ b/src/librawspeed/tiff/TiffIFD.cpp
@@ -60,7 +60,7 @@ void TiffIFD::parseIFDEntry(NORangesSet<Buffer>* ifds, ByteStream* bs) {
   try {
     switch (t->tag) {
     case DNGPRIVATEDATA:
-      // These are arbitrairly 'rebased', to preserve the offsets, but as it is
+      // These are arbitrarily 'rebased', to preserve the offsets, but as it is
       // implemented right now, that could trigger UB (pointer arithmetics,
       // creating pointer to unowned memory, etc). And since this is not even
       // used anywhere right now, let's not
diff --git a/src/utilities/identify/rawspeed-identify.cpp b/src/utilities/identify/rawspeed-identify.cpp
index 43b9c6f80..8d17a161b 100644
--- a/src/utilities/identify/rawspeed-identify.cpp
+++ b/src/utilities/identify/rawspeed-identify.cpp
@@ -158,17 +158,17 @@ int main(int argc, char* argv[]) { // NOLINT
 #else
     // turn the locale ANSI encoded string into UTF-8 so that FileReader can
     // turn it into UTF-16 later
-    int size = MultiByteToWideChar(CP_ACP, 0, argv[1], -1, NULL, 0);
+    int size = MultiByteToWideChar(CP_ACP, 0, argv[1], -1, nullptr, 0);
     std::wstring wImageFileName;
     wImageFileName.resize(size);
     MultiByteToWideChar(CP_ACP, 0, argv[1], -1, &wImageFileName[0], size);
-    size = WideCharToMultiByte(CP_UTF8, 0, &wImageFileName[0], -1, NULL, 0,
-                               NULL, NULL);
+    size = WideCharToMultiByte(CP_UTF8, 0, &wImageFileName[0], -1, nullptr, 0,
+                               nullptr, nullptr);
     std::string _imageFileName;
     _imageFileName.resize(size);
     char* imageFileName = &_imageFileName[0];
     WideCharToMultiByte(CP_UTF8, 0, &wImageFileName[0], -1, imageFileName, size,
-                        NULL, NULL);
+                        nullptr, nullptr);
 #endif
 
     fprintf(stderr, "Loading file: \"%s\"\n", imageFileName);
diff --git a/src/utilities/rsbench/CMakeLists.txt b/src/utilities/rsbench/CMakeLists.txt
index 2357e94a8..c69b7a638 100644
--- a/src/utilities/rsbench/CMakeLists.txt
+++ b/src/utilities/rsbench/CMakeLists.txt
@@ -3,6 +3,10 @@ rawspeed_add_executable(rsbench main.cpp)
 target_link_libraries(rsbench rawspeed)
 target_link_libraries(rsbench rawspeed_bench)
 
+if(TARGET RawSpeed::OpenMP_CXX)
+  target_link_libraries(rsbench RawSpeed::OpenMP_CXX)
+endif()
+
 rawspeed_add_test(NAME utilities/rsbench COMMAND rsbench --help)
 
 add_dependencies(benchmarks rsbench)
diff --git a/src/utilities/rstest/md5.h b/src/utilities/rstest/md5.h
index 0b76895d2..f5922b23b 100644
--- a/src/utilities/rstest/md5.h
+++ b/src/utilities/rstest/md5.h
@@ -27,6 +27,8 @@
  *   Software.
  */
 
+#pragma once
+
 #include <array>   // for array
 #include <cstdint> // for uint8_t, uint32_t
 #include <cstdio>  // for size_t
diff --git a/src/utilities/rstest/rstest.cpp b/src/utilities/rstest/rstest.cpp
index dc255fc4c..60b9ac306 100644
--- a/src/utilities/rstest/rstest.cpp
+++ b/src/utilities/rstest/rstest.cpp
@@ -101,9 +101,8 @@ class RstestHashMismatch final : public rawspeed::RawspeedException {
 public:
   size_t time;
 
-  explicit RstestHashMismatch(const std::string& msg, size_t time_)
-      : RawspeedException(msg), time(time_) {}
-  explicit RstestHashMismatch(const char* msg, size_t time_)
+  explicit RAWSPEED_UNLIKELY_FUNCTION RAWSPEED_NOINLINE
+  RstestHashMismatch(const char* msg, size_t time_)
       : RawspeedException(msg), time(time_) {}
 };
 
diff --git a/test/librawspeed/test/ExceptionsTest.cpp b/test/librawspeed/test/ExceptionsTest.cpp
index a8bdb9b59..bfb4002b1 100644
--- a/test/librawspeed/test/ExceptionsTest.cpp
+++ b/test/librawspeed/test/ExceptionsTest.cpp
@@ -47,7 +47,7 @@ using std::unique_ptr;
 
 namespace rawspeed_test {
 
-static const std::string msg("my very Smart error Message #1 !");
+static const char* msg = "my very Smart error Message #1 !";
 
 #define FMT "%s"
 
@@ -191,15 +191,15 @@ TYPED_TEST(ExceptionsTest, ThrowMessage) {
 }
 
 TYPED_TEST(ExceptionsTest, ThrowHelperTest) {
-  ASSERT_ANY_THROW(MetaHelper<TypeParam>(msg.c_str()));
-  EXPECT_THROW(MetaHelper<TypeParam>(msg.c_str()), std::runtime_error);
-  EXPECT_THROW(MetaHelper<TypeParam>(msg.c_str()), RawspeedException);
-  EXPECT_THROW(MetaHelper<TypeParam>(msg.c_str()), TypeParam);
+  ASSERT_ANY_THROW(MetaHelper<TypeParam>(msg));
+  EXPECT_THROW(MetaHelper<TypeParam>(msg), std::runtime_error);
+  EXPECT_THROW(MetaHelper<TypeParam>(msg), RawspeedException);
+  EXPECT_THROW(MetaHelper<TypeParam>(msg), TypeParam);
 }
 
 TYPED_TEST(ExceptionsTest, ThrowHelperTestMessage) {
   try {
-    MetaHelper<TypeParam>(msg.c_str());
+    MetaHelper<TypeParam>(msg);
   } catch (std::exception& ex) {
     ASSERT_THAT(ex.what(), testing::HasSubstr(msg));
   }