From a77fe704e14c00137a7ccadaac5b8a2df66f2048 Mon Sep 17 00:00:00 2001
From: Warren Weckesser <warren.weckesser@gmail.com>
Date: Sun, 30 Jun 2024 23:39:54 -0400
Subject: [PATCH] MAINT: Convert the 'cross' module to C++.

And drop support for complex types in cross3 and cross2--the
definition of the cross product of complex vectors needs a
closer look.
---
 pyproject.toml                        |   2 +-
 src/cross/cross_gufunc.c.src          | 509 --------------------------
 src/cross/cross_gufunc.h              | 162 ++++++++
 src/cross/define_cxx_gufunc_extmod.py | 110 ++++++
 ufunclab/meson.build                  |   2 +-
 ufunclab/tests/test_cross.py          |  20 +-
 6 files changed, 279 insertions(+), 526 deletions(-)
 delete mode 100644 src/cross/cross_gufunc.c.src
 create mode 100644 src/cross/cross_gufunc.h
 create mode 100644 src/cross/define_cxx_gufunc_extmod.py

diff --git a/pyproject.toml b/pyproject.toml
index 3dc0b7b..9730cff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
 
 [project]
 name = 'ufunclab'
-version = '0.0.8.dev13'
+version = '0.0.8.dev14'
 description = 'NumPy ufuncs and utilities.'
 readme = 'README.md'
 requires-python = '>=3.9'
diff --git a/src/cross/cross_gufunc.c.src b/src/cross/cross_gufunc.c.src
deleted file mode 100644
index 315e784..0000000
--- a/src/cross/cross_gufunc.c.src
+++ /dev/null
@@ -1,509 +0,0 @@
-//
-// cross_gufunc.c.src
-//
-// gufunc implementation of the cross product for 3-d vectors.
-//
-
-#define PY_SSIZE_T_CLEAN
-#include "Python.h"
-
-#include <stddef.h>
-#include <stdint.h>
-#include <complex.h>
-
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#include "numpy/ndarraytypes.h"
-#include "numpy/arrayscalars.h"
-#include "numpy/ufuncobject.h"
-
-#include "../src/util/ufunc_tools.h"
-
-
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-// ufunc inner loops
-//
-// Loops could be added for smaller integer bit widths, if needed.
-// If loops are added for unsigned integers, it might be reasonable to
-// make the output type signed, since, eg., cross3([1, 2, 3], [2, 2, 1])
-// is [-4, 5, -2].
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-/**begin repeat
- * #typename    = int32,   int64,   float, double, longdouble   #
- * #ctype       = int32_t, int64_t, float, double, long double  #
- */
-
-static void cross3_@typename@_loop(char **args, const npy_intp *dimensions,
-                                   const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // dimensions[1]: Length of each array (must be 3)
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    // steps[5]:  inner out array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    assert(dimensions[1] == 3);
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-        @ctype@ u0, u1, u2, v0, v1, v2;
-        u0 = *(@ctype@ *) pu;
-        u1 = *(@ctype@ *) (pu + steps[3]);
-        u2 = *(@ctype@ *) (pu + 2*steps[3]);
-        v0 = *(@ctype@ *) pv;
-        v1 = *(@ctype@ *) (pv + steps[4]);
-        v2 = *(@ctype@ *) (pv + 2*steps[4]);
-        *(@ctype@ *) pout = u1*v2 - u2*v1;
-        *(@ctype@ *) (pout + steps[5]) = u2*v0 - u0*v2;            
-        *(@ctype@ *) (pout + 2*steps[5]) = u0*v1 - u1*v0;
-    }
-}
-
-static void cross2_@typename@_loop(char **args, const npy_intp *dimensions,
-                                   const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // dimensions[1]: Length of each array
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-        @ctype@ u0, u1, v0, v1;
-        u0 = *(@ctype@ *) pu;
-        u1 = *(@ctype@ *) (pu + steps[3]);
-        v0 = *(@ctype@ *) pv;
-        v1 = *(@ctype@ *) (pv + steps[4]);
-        *(@ctype@ *) pout = u0*v1 - u1*v0;
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * #typename    = cfloat,        cdouble,        clongdouble         #
- * #ctype       = complex float, complex double, complex long double #
- * #msvc_ctype  = _Fcomplex,     _Dcomplex,      _Lcomplex           #
- * #msvc_cmul   = _FCmulcc,      _Cmulcc,        _LCmulcc            #
- * #msvc_cbuild = _FCbuild,      _Cbuild,        _LCbuild            #
- * #msvc_imag   = cimagf,        cimag,          cimagl              #
- * #msvc_real   = crealf,        creal,          creall              #
- */
-
-static void cross3_@typename@_loop(char **args, const npy_intp *dimensions,
-                                   const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // dimensions[1]: Length of each array (must be 3)
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    // steps[5]:  inner out array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    assert(dimensions[1] == 3);
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-#ifdef _MSC_VER
-        @msvc_ctype@ u0, u1, u2, v0, v1, v2;
-        @msvc_ctype@ p1, p2, d;
-
-        u0 = *(@msvc_ctype@ *) pu;
-        u1 = *(@msvc_ctype@ *) (pu + steps[3]);
-        u2 = *(@msvc_ctype@ *) (pu + 2*steps[3]);
-        v0 = *(@msvc_ctype@ *) pv;
-        v1 = *(@msvc_ctype@ *) (pv + steps[4]);
-        v2 = *(@msvc_ctype@ *) (pv + 2*steps[4]);
-
-        p1 = @msvc_cmul@(u1, v2);
-        p2 = @msvc_cmul@(u2, v1);
-        d = @msvc_cbuild@(@msvc_real@(p1) - @msvc_real@(p2),
-                          @msvc_imag@(p1) - @msvc_imag@(p2));
-        *(@msvc_ctype@ *) pout = d;
-
-        p1 = @msvc_cmul@(u2, v0);
-        p2 = @msvc_cmul@(u0, v2);
-        d = @msvc_cbuild@(@msvc_real@(p1) - @msvc_real@(p2),
-                          @msvc_imag@(p1) - @msvc_imag@(p2));
-        *(@msvc_ctype@ *) (pout + steps[5]) = d;
-
-        p1 = @msvc_cmul@(u0, v1);
-        p2 = @msvc_cmul@(u1, v0);
-        d = @msvc_cbuild@(@msvc_real@(p1) - @msvc_real@(p2),
-                          @msvc_imag@(p1) - @msvc_imag@(p2));
-        *(@msvc_ctype@ *) (pout + 2*steps[5]) = d;
-
-#else
-        @ctype@ u0, u1, u2, v0, v1, v2;
-        u0 = *(@ctype@ *) pu;
-        u1 = *(@ctype@ *) (pu + steps[3]);
-        u2 = *(@ctype@ *) (pu + 2*steps[3]);
-        v0 = *(@ctype@ *) pv;
-        v1 = *(@ctype@ *) (pv + steps[4]);
-        v2 = *(@ctype@ *) (pv + 2*steps[4]);
-        *(@ctype@ *) pout = u1*v2 - u2*v1;
-        *(@ctype@ *) (pout + steps[5]) = u2*v0 - u0*v2;            
-        *(@ctype@ *) (pout + 2*steps[5]) = u0*v1 - u1*v0;
-#endif
-    }
-}
-
-static void cross2_@typename@_loop(char **args, const npy_intp *dimensions,
-                                   const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // dimensions[1]: Length of each array
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-#ifdef _MSC_VER
-        @msvc_ctype@ u0, u1, v0, v1;
-        @msvc_ctype@ p1, p2, d;
-
-        u0 = *(@msvc_ctype@ *) pu;
-        u1 = *(@msvc_ctype@ *) (pu + steps[3]);
-        v0 = *(@msvc_ctype@ *) pv;
-        v1 = *(@msvc_ctype@ *) (pv + steps[4]);
-
-        p1 = @msvc_cmul@(u0, v1);
-        p2 = @msvc_cmul@(u1, v0);
-        d = @msvc_cbuild@(@msvc_real@(p1) - @msvc_real@(p2),
-                          @msvc_imag@(p1) - @msvc_imag@(p2));
-        *(@msvc_ctype@ *) pout = d;
-
-#else
-        @ctype@ u0, u1, v0, v1;
-        u0 = *(@ctype@ *) pu;
-        u1 = *(@ctype@ *) (pu + steps[3]);
-        v0 = *(@ctype@ *) pv;
-        v1 = *(@ctype@ *) (pv + steps[4]);
-        *(@ctype@ *) pout = u0*v1 - u1*v0;
-#endif
-    }
-}
-
-/**end repeat**/
-
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-// ufunc inner loop for object arrays.
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-//
-// Computes a*d - b*c
-// Returns a new reference, or NULL on error.
-//
-static PyObject *det2(PyObject *a, PyObject *b, PyObject *c, PyObject *d)
-{
-    PyObject *p0, *p1, *diff;
-
-    p0 = PyNumber_Multiply(a, d);
-    if (p0 == NULL) {
-        return NULL;
-    }
-    p1 = PyNumber_Multiply(b, c);
-    if (p1 == NULL) {
-        Py_DECREF(p0);
-        return NULL;
-    }
-    diff = PyNumber_Subtract(p0, p1);
-    Py_DECREF(p0);
-    Py_DECREF(p1);
-    return diff;
-}
-
-
-//
-// XXX Verify that when det2() returns NULL (i.e. an error
-//     occurred while computing a term), we don't end up with
-//     a memory leak.
-//
-static void cross3_object_loop(char **args, const npy_intp *dimensions,
-                               const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // dimensions[1]: Length of each array (must be 3)
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    // steps[5]:  inner out array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    assert(dimensions[1] == 3);
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-        PyObject *u0, *u1, *u2, *v0, *v1, *v2, *out;
-        u0 = *(PyObject **) pu;
-        u1 = *(PyObject **) (pu + steps[3]);
-        u2 = *(PyObject **) (pu + 2*steps[3]);
-        v0 = *(PyObject **) pv;
-        v1 = *(PyObject **) (pv + steps[4]);
-        v2 = *(PyObject **) (pv + 2*steps[4]);
-        out = det2(u1, u2, v1, v2);
-        if (out == NULL) {
-            return;
-        }
-        *(PyObject **) pout = out;
-        out = det2(u2, u0, v2, v0);
-        if (out == NULL) {
-            return;
-        }
-        *(PyObject **) (pout + steps[5]) = out;
-        out = det2(u0, u1, v0, v1);
-        if (out == NULL) {
-            return;
-        }
-        *(PyObject **) (pout + 2*steps[5]) = out;
-    }
-}
-
-
-static void cross2_object_loop(char **args, const npy_intp *dimensions,
-                               const npy_intp* steps, void* data)
-{
-    // Notation: out = u x v
-    //
-    // dimensions[0]: Number of input arrays
-    // steps[0]:  u array outer step
-    // steps[1]:  v array outer step
-    // steps[2]:  out array outer step
-    // steps[3]:  inner (core) u array step
-    // steps[4]:  inner v array step
-    char *pu = args[0];
-    char *pv = args[1];
-    char *pout = args[2];
-    npy_intp nloops = dimensions[0];
-
-    for (int j = 0; j < nloops; ++j, pu += steps[0], pv += steps[1],
-                                     pout += steps[2]) {
-        PyObject *u0, *u1, *v0, *v1, *out;
-        u0 = *(PyObject **) pu;
-        u1 = *(PyObject **) (pu + steps[3]);
-        v0 = *(PyObject **) pv;
-        v1 = *(PyObject **) (pv + steps[4]);
-        out = det2(u0, u1, v0, v1);
-        if (out == NULL) {
-            return;
-        }
-        *(PyObject **) pout = out;
-    }
-}
-
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-// ufunc configuration data.
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-// These are the input and return dtypes of cross3.
-static char cross3_typecodes[] = {
-    NPY_INT32, NPY_INT32, NPY_INT32,
-    NPY_INT64, NPY_INT64, NPY_INT64,
-    NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
-    NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
-    NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE,
-    NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT,
-    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
-    NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE,
-    NPY_OBJECT, NPY_OBJECT, NPY_OBJECT
-};
-
-static PyUFuncGenericFunction cross3_funcs[] = {
-    (PyUFuncGenericFunction) &cross3_int32_loop,
-    (PyUFuncGenericFunction) &cross3_int64_loop,
-    (PyUFuncGenericFunction) &cross3_float_loop,
-    (PyUFuncGenericFunction) &cross3_double_loop,
-    (PyUFuncGenericFunction) &cross3_longdouble_loop,
-    (PyUFuncGenericFunction) &cross3_cfloat_loop,
-    (PyUFuncGenericFunction) &cross3_cdouble_loop,
-    (PyUFuncGenericFunction) &cross3_clongdouble_loop,
-    (PyUFuncGenericFunction) &cross3_object_loop
-};
-
-#define CROSS3_NTYPES  (sizeof(cross3_funcs)/sizeof(cross3_funcs[0]))
-static void *cross3_data[CROSS3_NTYPES];
-
-
-// These are the input and return dtypes of cross2.
-// (Same as for cross3, so could probably share the types array.)
-static char cross2_typecodes[] = {
-    NPY_INT32, NPY_INT32, NPY_INT32,
-    NPY_INT64, NPY_INT64, NPY_INT64,
-    NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
-    NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
-    NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE,
-    NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT,
-    NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
-    NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE,
-    NPY_OBJECT, NPY_OBJECT, NPY_OBJECT
-};
-
-static PyUFuncGenericFunction cross2_funcs[] = {
-    (PyUFuncGenericFunction) &cross2_int32_loop,
-    (PyUFuncGenericFunction) &cross2_int64_loop,
-    (PyUFuncGenericFunction) &cross2_float_loop,
-    (PyUFuncGenericFunction) &cross2_double_loop,
-    (PyUFuncGenericFunction) &cross2_longdouble_loop,
-    (PyUFuncGenericFunction) &cross2_cfloat_loop,
-    (PyUFuncGenericFunction) &cross2_cdouble_loop,
-    (PyUFuncGenericFunction) &cross2_clongdouble_loop,
-    (PyUFuncGenericFunction) &cross2_object_loop
-};
-
-#define CROSS2_NTYPES  (sizeof(cross2_funcs)/sizeof(cross2_funcs[0]))
-static void *cross2_data[CROSS2_NTYPES];
-
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-// Python extension module definitions.
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-static PyMethodDef CrossMethods[] = {
-        {NULL, NULL, 0, NULL}
-};
-
-static struct PyModuleDef moduledef = {
-    PyModuleDef_HEAD_INIT,
-    .m_name = "_cross",
-    .m_doc = "Module that defines the cross2 and cross3 functions.",
-    .m_size = -1,
-    .m_methods = CrossMethods
-};
-
-
-#define CROSS3_DOCSTRING \
-"cross3(u, v, /, ...)\n"                                       \
-"\n"                                                           \
-"Compute the 3-d vector cross product of u and v.\n"           \
-"\n"                                                           \
-"Parameters\n"                                                 \
-"----------\n"                                                 \
-"u : array_like, shape (..., 3)\n"                             \
-"    Input array\n"                                            \
-"v : array_like, shape (..., 3)\n"                             \
-"    Input array\n"                                            \
-"\n"                                                           \
-"Returns\n"                                                    \
-"-------\n"                                                    \
-"out : ndarray, shape (..., 3)\n"                              \
-"    Cross product of u and v.\n"                              \
-"\n"                                                           \
-"See Also\n"                                                   \
-"--------\n"                                                   \
-"ufunclab.cross2\n"                                            \
-"numpy.cross\n"                                                \
-"\n"                                                           \
-"Examples\n"                                                   \
-"--------\n"                                                   \
-">>> from ufunclab import cross3\n"                            \
-">>> cross3([1, 2, -2], [5, 3, 1])\n"                          \
-"array([  8, -11,  -7])\n"                                     \
-">>> cross3([[1, 2, -2], [6, 0, 2]], [[5, 3, 1], [2, 2, 3]])\n"\
-"array([[  8, -11,  -7],\n"                                    \
-"       [ -4, -14,  12]])\n"                                   \
-"\n"
-
-#define CROSS2_DOCSTRING \
-"cross2(u, v, /, ...)\n"                                       \
-"\n"                                                           \
-"Compute the cross product of 2-d vectors u and v.\n"          \
-"The result is a scalar.\n"                                    \
-"\n"                                                           \
-"Parameters\n"                                                 \
-"----------\n"                                                 \
-"u : array_like, shape (..., 2)\n"                             \
-"    Input array\n"                                            \
-"v : array_like, shape (..., 2)\n"                             \
-"    Input array\n"                                            \
-"\n"                                                           \
-"See Also\n"                                                   \
-"--------\n"                                                   \
-"ufunclab.cross3\n"                                            \
-"numpy.cross\n"                                                \
-"\n"                                                           \
-"Returns\n"                                                    \
-"-------\n"                                                    \
-"out : scalar or ndarray, shape (...)\n"                       \
-"    Cross product of u and v.\n"                              \
-"\n"                                                           \
-"Examples\n"                                                   \
-"--------\n"                                                   \
-">>> from ufunclab import cross2\n"                            \
-">>> cross2([1, 2], [5, 3])\n"                                 \
-"-7\n"                                                         \
-">>> cross2([[1, 2], [6, 0]], [[5, 3], [2, 3]])\n"             \
-"array([-7, 18])\n"                                           \
-"\n"
-
-PyMODINIT_FUNC PyInit__cross(void)
-{
-    PyObject *module;
-
-    module = PyModule_Create(&moduledef);
-    if (!module) {
-        return NULL;
-    }
-
-    import_array();
-    import_umath();
-
-    // Create the cross3 ufunc.
-    if (ul_define_gufunc(module, "cross3", CROSS3_DOCSTRING, "(3),(3)->(3)",
-                         CROSS3_NTYPES,
-                         cross3_funcs, cross3_data, cross3_typecodes) == NULL) {
-        Py_DECREF(module);
-        return NULL;
-    }
-
-    // Create the cross2 ufunc.
-    if (ul_define_gufunc(module, "cross2", CROSS2_DOCSTRING, "(2),(2)->()",
-                         CROSS2_NTYPES,
-                         cross2_funcs, cross2_data, cross2_typecodes) == NULL) {
-        Py_DECREF(module);
-        return NULL;
-    }
-
-    return module;
-}
diff --git a/src/cross/cross_gufunc.h b/src/cross/cross_gufunc.h
new file mode 100644
index 0000000..a19bfe9
--- /dev/null
+++ b/src/cross/cross_gufunc.h
@@ -0,0 +1,162 @@
+
+#ifndef CROSS_GUFUNC_H
+#define CROSS_GUFUNC_H
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <complex.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/ufuncobject.h"
+
+#include "../src/util/strided.hpp"
+
+//
+// `cross3_core_calc`, the C++ core function
+// for the gufunc `cross3` with signature '(3),(3)->(3)'
+// for the real numeric types (integers and floating point).
+//
+template<typename T>
+static void
+cross3_core_calc(
+    T *p_u,              // pointer to first element of u, a strided 1-d array with n elements
+    npy_intp u_stride,   // stride (in bytes) for elements of u
+    T *p_v,              // pointer to first element of v, a strided 1-d array with n elements
+    npy_intp v_stride,   // stride (in bytes) for elements of v
+    T *p_out,            // pointer to out
+    npy_intp out_stride  // stride (in bytes) for elements of out
+)
+{
+    T u0 = get(p_u, u_stride, 0);
+    T u1 = get(p_u, u_stride, 1);
+    T u2 = get(p_u, u_stride, 2);
+    T v0 = get(p_v, v_stride, 0);
+    T v1 = get(p_v, v_stride, 1);
+    T v2 = get(p_v, v_stride, 2);
+    set(p_out, out_stride, 0, u1*v2 - u2*v1);
+    set(p_out, out_stride, 1, u2*v0 - u0*v2);
+    set(p_out, out_stride, 2, u0*v1 - u1*v0);
+}
+
+//
+// Computes a*d - b*c
+// Returns a new reference, or NULL on error.
+//
+static PyObject *det2(PyObject *a, PyObject *b, PyObject *c, PyObject *d)
+{
+    PyObject *p0, *p1, *diff;
+
+    p0 = PyNumber_Multiply(a, d);
+    if (p0 == NULL) {
+        return NULL;
+    }
+    p1 = PyNumber_Multiply(b, c);
+    if (p1 == NULL) {
+        Py_DECREF(p0);
+        return NULL;
+    }
+    diff = PyNumber_Subtract(p0, p1);
+    Py_DECREF(p0);
+    Py_DECREF(p1);
+    return diff;
+}
+
+//
+// `cross3_core_calc_object`, the C++ core function
+// for the gufunc `cross3` with signature '(3),(3)->(3)'
+// for the object type.
+//
+static void
+cross3_core_calc_object(
+    PyObject* *p_u,      // pointer to first element of u, a strided 1-d array with n elements
+    npy_intp u_stride,   // stride (in bytes) for elements of u
+    PyObject* *p_v,      // pointer to first element of v, a strided 1-d array with n elements
+    npy_intp v_stride,   // stride (in bytes) for elements of v
+    PyObject* *p_out,    // pointer to out
+    npy_intp out_stride  // stride (in bytes) for elements of out
+)
+{
+    PyObject* u0 = get(p_u, u_stride, 0);
+    PyObject* u1 = get(p_u, u_stride, 1);
+    PyObject* u2 = get(p_u, u_stride, 2);
+    PyObject* v0 = get(p_v, v_stride, 0);
+    PyObject* v1 = get(p_v, v_stride, 1);
+    PyObject* v2 = get(p_v, v_stride, 2);
+
+    PyObject* out0 = det2(u1, u2, v1, v2);
+    if (out0 == NULL) {
+        return;
+    }
+
+    PyObject* out1 = det2(u2, u0, v2, v0);
+    if (out1 == NULL) {
+        Py_DECREF(out0);
+        return;
+    }
+
+    PyObject* out2 = det2(u0, u1, v0, v1);
+    if (out2 == NULL) {
+        Py_DECREF(out0);
+        Py_DECREF(out1);
+        return;
+    }
+
+    set(p_out, out_stride, 0, out0);
+    set(p_out, out_stride, 1, out1);
+    set(p_out, out_stride, 2, out2);
+}
+
+//
+// `cross2_core_calc`, the C++ core function
+// for the gufunc `cross2` with signature '(2),(2)->()'
+// for the real numeric types (integers and floating point).
+//
+template<typename T>
+static void
+cross2_core_calc(
+        T *p_u,              // pointer to first element of u, a strided 1-d array with n elements
+        npy_intp u_stride,   // stride (in bytes) for elements of u
+        T *p_v,              // pointer to first element of v, a strided 1-d array with n elements
+        npy_intp v_stride,   // stride (in bytes) for elements of v
+        T *p_out             // pointer to out
+)
+{
+    T u0 = get(p_u, u_stride, 0);
+    T u1 = get(p_u, u_stride, 1);
+    T v0 = get(p_v, v_stride, 0);
+    T v1 = get(p_v, v_stride, 1);
+    p_out[0] = u0*v1 - u1*v0;
+}
+
+//
+// `cross2_core_calc_object`, the C++ core function
+// for the gufunc `cross2` with signature '(2),(2)->()'
+// for the object type.
+//
+static void
+cross2_core_calc_object(
+    PyObject* *p_u,      // pointer to first element of x, a strided 1-d array with n elements
+    npy_intp u_stride,   // stride (in bytes) for elements of x
+    PyObject* *p_v,      // pointer to first element of x, a strided 1-d array with n elements
+    npy_intp v_stride,   // stride (in bytes) for elements of x
+    PyObject* *p_out     // pointer to out
+)
+{
+    PyObject* u0 = get(p_u, u_stride, 0);
+    PyObject* u1 = get(p_u, u_stride, 1);
+    PyObject* v0 = get(p_v, v_stride, 0);
+    PyObject* v1 = get(p_v, v_stride, 1);
+
+    PyObject* out = det2(u0, u1, v0, v1);
+    if (out == NULL) {
+        return;
+    }
+    p_out[0] = out;
+}
+
+#endif
diff --git a/src/cross/define_cxx_gufunc_extmod.py b/src/cross/define_cxx_gufunc_extmod.py
new file mode 100644
index 0000000..6d19c29
--- /dev/null
+++ b/src/cross/define_cxx_gufunc_extmod.py
@@ -0,0 +1,110 @@
+
+from ufunc_config_types import UFuncExtMod, UFunc, UFuncSource
+
+
+CROSS3_DOCSTRING = """\
+cross3(u, v, /, ...)
+
+Compute the 3-d vector cross product of u and v.
+
+Parameters
+----------
+u : array_like, shape (..., 3)
+    Input array
+v : array_like, shape (..., 3)
+    Input array
+
+Returns
+-------
+out : ndarray, shape (..., 3)
+    Cross product of u and v.
+
+See Also
+--------
+ufunclab.cross2
+numpy.cross
+
+Examples
+--------
+>>> from ufunclab import cross3
+>>> cross3([1, 2, -2], [5, 3, 1])
+array([  8, -11,  -7])
+>>> cross3([[1, 2, -2], [6, 0, 2]], [[5, 3, 1], [2, 2, 3]])
+array([[  8, -11,  -7],
+       [ -4, -14,  12]])
+
+"""
+
+CROSS2_DOCSTRING = """\
+cross2(u, v, /, ...)
+
+Compute the cross product of 2-d vectors u and v.
+The result is a scalar.
+
+Parameters
+----------
+u : array_like, shape (..., 2)
+    Input array
+v : array_like, shape (..., 2)
+    Input array
+
+Returns
+-------
+out : scalar or ndarray, shape (...)
+    Cross product of u and v.
+
+See Also
+--------
+ufunclab.cross3
+numpy.cross
+
+Examples
+--------
+>>> from ufunclab import cross2
+>>> cross2([1, 2], [5, 3])
+-7
+>>> cross2([[1, 2], [6, 0]], [[5, 3], [2, 3]])
+array([-7, 18])
+"""
+
+cross3_src = UFuncSource(
+    funcname='cross3_core_calc',
+    typesignatures=['ii->i', 'll->l', 'ff->f', 'dd->d', 'gg->g'],
+)
+
+cross3_src_object = UFuncSource(
+    funcname='cross3_core_calc_object',
+    typesignatures=['OO->O'],
+)
+
+cross3 = UFunc(
+    name='cross3',
+    header='cross_gufunc.h',
+    docstring=CROSS3_DOCSTRING,
+    signature='(3),(3)->(3)',
+    sources=[cross3_src, cross3_src_object],
+)
+
+cross2_src = UFuncSource(
+    funcname='cross2_core_calc',
+    typesignatures=['ii->i', 'll->l', 'ff->f', 'dd->d', 'gg->g'],
+)
+
+cross2_src_object = UFuncSource(
+    funcname='cross2_core_calc_object',
+    typesignatures=['OO->O'],
+)
+
+cross2 = UFunc(
+    name='cross2',
+    header='cross_gufunc.h',
+    docstring=CROSS2_DOCSTRING,
+    signature='(2),(2)->()',
+    sources=[cross2_src, cross2_src_object],
+)
+
+extmod = UFuncExtMod(
+    module='_cross',
+    docstring="This extension module defines the gufuncs 'cross3' and 'cross2'.",
+    ufuncs=[cross3, cross2],
+)
diff --git a/ufunclab/meson.build b/ufunclab/meson.build
index 5183a57..9a1f178 100644
--- a/ufunclab/meson.build
+++ b/ufunclab/meson.build
@@ -49,6 +49,7 @@ gufunc_cxx_src_dirs = [
     'all_same',
     'backlash',
     'corr',
+    'cross',
     'first',
     'hysteresis_relay',
     'mad',
@@ -191,7 +192,6 @@ py3.extension_module(
 #----------------------------------------------------------------------
 
 numpy_templated_c_gufunc_dirs = [
-    'cross',
     'fillnan1d',
     'linear_interp1d',
     'means',
diff --git a/ufunclab/tests/test_cross.py b/ufunclab/tests/test_cross.py
index 8dafa6f..e52b030 100644
--- a/ufunclab/tests/test_cross.py
+++ b/ufunclab/tests/test_cross.py
@@ -24,27 +24,18 @@ def numpy_cross2(a, b, axisa=-1, axisb=-1):
 
 
 @pytest.mark.parametrize('u, v', [([1, 2, 3], [5, 3, 1]),
-                                  ([1.5, 0.5, -1.5], [2.0, 9.0, -3.0]),
-                                  ([1+2j, 3, -4j], [3-1j, 2j, 6])])
+                                  ([1.5, 0.5, -1.5], [2.0, 9.0, -3.0])])
 def test_cross3_basic(u, v):
     w = cross3(u, v)
     assert_equal(w, np.cross(u, v))
 
 
-def test_cross3_object1():
-    u = np.array([1, 2-1j, Fraction(1, 2)], dtype=object)
+def test_cross3_object():
+    u = np.array([1, 2, Fraction(1, 2)], dtype=object)
     v = np.array([-2.5, 8, Fraction(3, 2)], dtype=object)
     w = cross3(u, v)
     assert w.dtype == np.dtype(object)
-    assert_equal(w, np.array([-1-1.5j, -2.75, 13-2.5j], dtype=object))
-
-
-def test_cross3_object2():
-    u = np.array([1, 2+1j, -2], dtype=object)
-    v = np.array([5, -3j, Fraction(1, 2)], dtype=object)
-    w = cross3(u, v)
-    assert w.dtype == np.dtype(object)
-    assert_equal(w, np.array([1-5.5j, Fraction(-21, 2), -10-8j], dtype=object))
+    assert_equal(w, np.array([-1, -2.75, 13.0], dtype=object))
 
 
 def test_cross3_basic_broadcasting():
@@ -62,8 +53,7 @@ def test_cross3_nontrivial_axes():
 
 
 @pytest.mark.parametrize('u, v', [([1, 2], [5, 3]),
-                                  ([1.5, 0.5], [2.0, 9.0]),
-                                  ([1+2j, 3], [3-1j, 2j])])
+                                  ([1.5, 0.5], [2.0, 9.0])])
 def test_cross2_basic(u, v):
     w = cross2(u, v)
     assert_equal(w, numpy_cross2(u, v))