From 24716a47b1cc881955c4d8fa9a80944e10850dc7 Mon Sep 17 00:00:00 2001
From: Vesa Karvonen <vesa.a.j.k@gmail.com>
Date: Thu, 27 Jul 2023 17:37:24 +0300
Subject: [PATCH] Pad to cache aligned size based on architecture and OCaml
 version

---
 CHANGES.md                   |  7 ++++++
 src/Multicore_magic.ml       | 31 +++++++++++++++++------
 src/Multicore_magic.mli      | 20 +++++++++------
 src/cache.1.ml               |  1 +
 src/cache.16.ml              |  1 +
 src/cache.32.ml              |  1 +
 src/cache.8.ml               |  1 +
 src/cache.mli                |  1 +
 src/dune                     | 48 ++++++++++++++++++++++++++++++++++++
 test/Multicore_magic_test.ml | 12 +++++++++
 10 files changed, 108 insertions(+), 15 deletions(-)
 create mode 100644 src/cache.1.ml
 create mode 100644 src/cache.16.ml
 create mode 100644 src/cache.32.ml
 create mode 100644 src/cache.8.ml
 create mode 100644 src/cache.mli

diff --git a/CHANGES.md b/CHANGES.md
index b5da380..d711074 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,13 @@
 
 All notable changes to this project will be documented in this file.
 
+## 2.0.0
+
+- Changed the semantics of `copy_as_padded` to not always copy and to not
+  guarantee that `length_of_padded_array*` works with it. These semantic changes
+  allow better use of the OCaml allocator to guarantee cache friendly alignment.
+  (@polytypic)
+
 ## 1.0.1
 
 - Ported the library to OCaml 4 (@polytypic)
diff --git a/src/Multicore_magic.ml b/src/Multicore_magic.ml
index 9d852fc..735310f 100644
--- a/src/Multicore_magic.ml
+++ b/src/Multicore_magic.ml
@@ -1,17 +1,32 @@
-let num_padding_words = 15
+let num_padding_words = Cache.words_per_cache_line - 1
 
 let copy_as_padded (o : 'a) : 'a =
   let o = Obj.repr o in
-  let n = Obj.new_block (Obj.tag o) (Obj.size o + num_padding_words) in
-  for i = 0 to Obj.size o - 1 do
-    Obj.set_field n i (Obj.field o i)
-  done;
-  Obj.magic n
+  if Obj.is_block o then begin
+    let original_size = Obj.size o in
+    let padded_size =
+      if original_size <= num_padding_words then num_padding_words
+      else original_size + num_padding_words
+    in
+    if original_size <> padded_size then begin
+      let t = Obj.tag o in
+      if Sys.word_size = 64 && t != Obj.double_array_tag then begin
+        let n = Obj.new_block t padded_size in
+        Array.blit (Obj.magic o) 0 (Obj.magic n) 0 original_size;
+        Obj.magic n
+      end
+      else Obj.magic o
+    end
+    else Obj.magic o
+  end
+  else Obj.magic o
 
 let make_padded_array n x =
   let a = Array.make (n + num_padding_words) x in
-  if Obj.is_block (Obj.repr x) && Obj.tag (Obj.repr x) != Obj.double_tag then
-    Array.fill a n num_padding_words (Obj.magic ());
+  if Obj.is_block (Obj.repr x) then
+    Array.fill a n num_padding_words
+      (if Obj.tag (Obj.repr x) == Obj.double_tag then Obj.magic 0.0
+       else Obj.magic ());
   a
 
 let[@inline] length_of_padded_array x = Array.length x - num_padding_words
diff --git a/src/Multicore_magic.mli b/src/Multicore_magic.mli
index 4ee6935..85d134b 100644
--- a/src/Multicore_magic.mli
+++ b/src/Multicore_magic.mli
@@ -7,8 +7,9 @@
 (** {1 Helpers for using padding to avoid false sharing} *)
 
 val copy_as_padded : 'a -> 'a
-(** Creates a shallow clone of the given object.  The clone will have extra
-    padding words added after the last used word.
+(** Depending on the object, either creates a shallow clone of it or returns it
+    as is.  When cloned, the clone will have extra padding words added after the
+    last used word.
 
     This is designed to help avoid
     {{:https://en.wikipedia.org/wiki/False_sharing} false sharing}.  False
@@ -32,21 +33,26 @@ val copy_as_padded : 'a -> 'a
     }
 
     let padded_variant = Multicore_magic.copy_as_padded (Some 1)
-
-    let padded_array = Multicore_magic.copy_as_padded [|3; 1; 4|]
 ]}
 
-    Padding changes the length of an array, see {!length_of_padded_array}. *)
+    Padding changes the length of an array.  If you need to pad an array, use
+    {!make_padded_array}. *)
 
 val make_padded_array : int -> 'a -> 'a array
 (** Creates a padded array.  The length of the returned array includes padding.
     Use {!length_of_padded_array} to get the unpadded length. *)
 
 val length_of_padded_array : 'a array -> int
-(** Returns the length of a padded array without the padding. *)
+(** Returns the length of an array created by {!make_padded_array} without the
+    padding.
+
+    {b WARNING}: This is not guaranteed to work with {!copy_as_padded}. *)
 
 val length_of_padded_array_minus_1 : 'a array -> int
-(** Returns the length of a padded array without the padding minus 1. *)
+(** Returns the length of an array created by {!make_padded_array} without the
+    padding minus 1.
+
+    {b WARNING}: This is not guaranteed to work with {!copy_as_padded}. *)
 
 (** {1 Missing [Atomic] operations} *)
 
diff --git a/src/cache.1.ml b/src/cache.1.ml
new file mode 100644
index 0000000..5de3a57
--- /dev/null
+++ b/src/cache.1.ml
@@ -0,0 +1 @@
+let words_per_cache_line = 1
diff --git a/src/cache.16.ml b/src/cache.16.ml
new file mode 100644
index 0000000..58c08a7
--- /dev/null
+++ b/src/cache.16.ml
@@ -0,0 +1 @@
+let words_per_cache_line = 16
diff --git a/src/cache.32.ml b/src/cache.32.ml
new file mode 100644
index 0000000..fc15674
--- /dev/null
+++ b/src/cache.32.ml
@@ -0,0 +1 @@
+let words_per_cache_line = 32
diff --git a/src/cache.8.ml b/src/cache.8.ml
new file mode 100644
index 0000000..6cb39e4
--- /dev/null
+++ b/src/cache.8.ml
@@ -0,0 +1 @@
+let words_per_cache_line = 8
diff --git a/src/cache.mli b/src/cache.mli
new file mode 100644
index 0000000..685c409
--- /dev/null
+++ b/src/cache.mli
@@ -0,0 +1 @@
+val words_per_cache_line : int
diff --git a/src/dune b/src/dune
index 62a2837..2f25888 100644
--- a/src/dune
+++ b/src/dune
@@ -1,3 +1,51 @@
 (library
  (name Multicore_magic)
  (public_name multicore-magic))
+
+(rule
+ (targets cache.ml)
+ (deps cache.1.ml)
+ (enabled_if
+  (< %{ocaml_version} 5.0.0))
+ (action
+  (progn
+   (copy cache.1.ml cache.ml))))
+
+(rule
+ (targets cache.ml)
+ (deps cache.8.ml)
+ (enabled_if
+  (and
+   (>= %{ocaml_version} 5.0.0)
+   (not
+    (or
+     (= %{architecture} arm64)
+     (= %{architecture} power)
+     (= %{architecture} s390x)))))
+ (action
+  (progn
+   (copy cache.8.ml cache.ml))))
+
+(rule
+ (targets cache.ml)
+ (deps cache.16.ml)
+ (enabled_if
+  (and
+   (>= %{ocaml_version} 5.0.0)
+   (or
+    (= %{architecture} arm64)
+    (= %{architecture} power))))
+ (action
+  (progn
+   (copy cache.16.ml cache.ml))))
+
+(rule
+ (targets cache.ml)
+ (deps cache.32.ml)
+ (enabled_if
+  (and
+   (>= %{ocaml_version} 5.0.0)
+   (= %{architecture} s390x)))
+ (action
+  (progn
+   (copy cache.32.ml cache.ml))))
diff --git a/test/Multicore_magic_test.ml b/test/Multicore_magic_test.ml
index e3f6d9b..8614664 100644
--- a/test/Multicore_magic_test.ml
+++ b/test/Multicore_magic_test.ml
@@ -1,3 +1,4 @@
+let can_pad_int () = assert (Multicore_magic.copy_as_padded 101 = 101)
 let can_pad_ref () = assert (!(Multicore_magic.copy_as_padded (ref 101)) = 101)
 
 let can_pad_atomic () =
@@ -11,6 +12,14 @@ let can_pad_records () =
   let x = Multicore_magic.copy_as_padded { foo; bar; baz } in
   assert (x.foo = foo && x.bar == bar && x.baz == baz)
 
+let can_pad_float_record () =
+  let open struct
+    type record = { foo : float; bar : float; baz : float }
+  end in
+  let foo = 4.2 and bar = 10.1 and baz = 9.6 in
+  let x = Multicore_magic.copy_as_padded { foo; bar; baz } in
+  assert (x.foo = foo && x.bar = bar && x.baz = baz)
+
 let can_pad_variants () =
   let open struct
     type variant = Foo of int * int Atomic.t
@@ -65,9 +74,12 @@ let fence () =
 let () =
   Alcotest.run "multicore-magic"
     [
+      ("can pad int", [ Alcotest.test_case "" `Quick can_pad_int ]);
       ("can pad ref", [ Alcotest.test_case "" `Quick can_pad_ref ]);
       ("can pad atomic", [ Alcotest.test_case "" `Quick can_pad_atomic ]);
       ("can pad records", [ Alcotest.test_case "" `Quick can_pad_records ]);
+      ( "can pad float record",
+        [ Alcotest.test_case "" `Quick can_pad_float_record ] );
       ("can pad variants", [ Alcotest.test_case "" `Quick can_pad_variants ]);
       ("can pad arrays", [ Alcotest.test_case "" `Quick can_pad_arrays ]);
       ( "padded array length",