From 1de4083e12f5ad5da0810c2ceea7e548c66db037 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 3 Feb 2024 23:35:37 -0800
Subject: [PATCH 01/44] Remove the implicit dependency to mpi4py in __init__.py
 (issue #1888) (#2059)

* Remove the implicit dependency to mpi4py in __init__.py (issue #1888)

* import error

* Import error in momgfccsd tests
---
 pyscf/cc/__init__.py            |  3 ---
 pyscf/cc/test/test_momgfccsd.py | 23 ++++++++++++-----------
 pyscf/pbc/__all__.py            |  3 +++
 pyscf/post_scf.py               |  3 +++
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/pyscf/cc/__init__.py b/pyscf/cc/__init__.py
index 6a037a27fe..6b4dc5d34e 100644
--- a/pyscf/cc/__init__.py
+++ b/pyscf/cc/__init__.py
@@ -75,7 +75,6 @@
 from pyscf.cc import eom_uccsd
 from pyscf.cc import eom_gccsd
 from pyscf.cc import qcisd
-from pyscf.cc import momgfccsd
 from pyscf import scf
 
 def CCSD(mf, frozen=None, mo_coeff=None, mo_occ=None):
@@ -223,5 +222,3 @@ def _finalize(self):
         return self
     mycc._finalize = _finalize.__get__(mycc, mycc.__class__)
     return mycc
-
-MomGFCCSD = momgfccsd.MomGFCCSD
diff --git a/pyscf/cc/test/test_momgfccsd.py b/pyscf/cc/test/test_momgfccsd.py
index ef21f1e312..51d73a47a5 100644
--- a/pyscf/cc/test/test_momgfccsd.py
+++ b/pyscf/cc/test/test_momgfccsd.py
@@ -2,6 +2,7 @@
 import unittest
 import numpy as np
 from pyscf import gto, scf, cc, lib
+from pyscf.cc import momgfccsd
 
 
 class KnownValues(unittest.TestCase):
@@ -22,7 +23,7 @@ def setUpClass(cls):
         cls.mycc.kernel()
         cls.mycc.solve_lambda()
 
-        gfcc = cc.momgfccsd.MomGFCCSD(cls.mycc, niter=(5, 5))
+        gfcc = momgfccsd.MomGFCCSD(cls.mycc, niter=(5, 5))
         imds = gfcc.make_imds()
         cls.hole_moments = gfcc.build_hole_moments(imds=imds)
         cls.part_moments = gfcc.build_part_moments(imds=imds)
@@ -54,7 +55,7 @@ def tearDownClass(cls):
 
     def test_lambda_assertion(self):
         with lib.temporary_env(self.mycc, l1=None, l2=None):
-            gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(0, 0))
+            gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(0, 0))
             self.assertRaises(ValueError, gfcc.kernel)
 
     def _test_moments(self, e, v, nmax, ref):
@@ -64,7 +65,7 @@ def _test_moments(self, e, v, nmax, ref):
         self.assertAlmostEqual(np.max(np.abs(m1-m2)), 0.0, 7)
 
     def _test_niter(self, niter):
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         eh, vh, ep, vp = gfcc.kernel()
         self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
         self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
@@ -96,7 +97,7 @@ def test_amp_input(self):
         imds.make_ea()
         t1, t2, l1, l2 = self.mycc.t1, self.mycc.t2, self.mycc.l1, self.mycc.l2
         with lib.temporary_env(self.mycc, t1=None, t2=None, l1=None, l2=None):
-            gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+            gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
             eh, vh, ep, vp = gfcc.kernel(t1=t1, t2=t2, l1=l1, l2=l2, imds=imds)
             self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
             self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
@@ -105,7 +106,7 @@ def test_amp_input(self):
 
     def test_mom_input(self):
         niter = 2
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         hole_moments = self.hole_moments[:2*niter+2]
         part_moments = self.part_moments[:2*niter+2]
         eh, vh, ep, vp = gfcc.kernel(hole_moments=hole_moments, part_moments=part_moments)
@@ -116,7 +117,7 @@ def test_mom_input(self):
 
     def test_hermi_moments(self):
         niter = 2
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         gfcc.hermi_moments = True
         hole_moments = self.hole_moments[:2*niter+2]
         part_moments = self.part_moments[:2*niter+2]
@@ -128,7 +129,7 @@ def test_hermi_moments(self):
 
     def test_hermi_moments(self):
         niter = 2
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         gfcc.hermi_moments = True
         gfcc.hermi_solver = True
         hole_moments = self.hole_moments[:2*niter+2]
@@ -141,7 +142,7 @@ def test_hermi_moments(self):
 
     def test_misc(self):
         niter = 2
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         gfcc.reset()
         eh, vh, ep, vp = gfcc.kernel()
         self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
@@ -157,14 +158,14 @@ def test_misc(self):
 
     def test_chkfile(self):
         niter = 1
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         eh, vh, ep, vp = gfcc.kernel()
         self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
         self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
         self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
         self._test_moments(ep, vp, 2*niter+1, self.part_moments)
         gfcc.dump_chk(chkfile="tmp.chk")
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         gfcc.update("tmp.chk")
         self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
         self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
@@ -184,7 +185,7 @@ def test_density_fitting(self):
         mycc.solve_lambda()
 
         niter = 3
-        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc = momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
         eh, vh, ep, vp = gfcc.kernel()
         self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
         self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
diff --git a/pyscf/pbc/__all__.py b/pyscf/pbc/__all__.py
index f542b5fb09..51ad5899e9 100644
--- a/pyscf/pbc/__all__.py
+++ b/pyscf/pbc/__all__.py
@@ -13,3 +13,6 @@
     from . import dft
 except (ImportError, IOError):
     pass
+
+# Note the mpicc module implicitly import mpi4py. This module should not be
+# automatically imported until the dependency to mpi4py is completely removed.
diff --git a/pyscf/post_scf.py b/pyscf/post_scf.py
index aba028fe92..821362ffc8 100644
--- a/pyscf/post_scf.py
+++ b/pyscf/post_scf.py
@@ -3,3 +3,6 @@
     from . import doci
 except ImportError:
     pass
+
+# Note the agf2 module implicitly import mpi4py. This module should not be
+# automatically imported until the dependency to mpi4py is completely removed.

From 7185c5c3b7c3e64a29a2134d29442cb5d707c23d Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 3 Feb 2024 15:17:04 -0800
Subject: [PATCH 02/44] Fix df-grad for UHF when symmetry is enabled (issue
 #2054)

---
 pyscf/df/grad/rhf.py          | 6 +++---
 pyscf/df/test/test_df_grad.py | 1 +
 pyscf/scf/uhf_symm.py         | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyscf/df/grad/rhf.py b/pyscf/df/grad/rhf.py
index 7c2db50f0d..0c945d8d21 100644
--- a/pyscf/df/grad/rhf.py
+++ b/pyscf/df/grad/rhf.py
@@ -342,6 +342,9 @@ def _decompose_rdm1 (mf_grad, mol, dm):
     if hasattr (dm, 'mo_coeff') and hasattr (dm, 'mo_occ'):
         mo_coeff = dm.mo_coeff
         mo_occ = dm.mo_occ
+        if getattr(mo_occ, 'ndim', None) == 1: # RHF orbitals
+            mo_coeff = [mo_coeff]
+            mo_occ = [mo_occ]
     else:
         s0 = mol.intor ('int1e_ovlp')
         mo_occ = []
@@ -352,10 +355,7 @@ def _decompose_rdm1 (mf_grad, mol, dm):
             mo_occ.append (n)
             mo_coeff.append (c)
         mo_occ = numpy.stack (mo_occ, axis=0)
-    nmo = mo_occ.shape[-1]
 
-    mo_coeff = numpy.asarray(mo_coeff).reshape(-1,nao,nmo)
-    mo_occ   = numpy.asarray(mo_occ).reshape(-1,nmo)
     orbor = []
     orbol = []
     for i in range(nset):
diff --git a/pyscf/df/test/test_df_grad.py b/pyscf/df/test/test_df_grad.py
index 710af63fd6..b63cf284a2 100644
--- a/pyscf/df/test/test_df_grad.py
+++ b/pyscf/df/test/test_df_grad.py
@@ -90,6 +90,7 @@ def test_uhf_grad(self):
             ['O' , (0. , 0.     , 0.)],
             [1   , (0. , -0.757 , 0.587)],
             [1   , (0. , 0.757  , 0.587)] ]
+        mol.symmetry = True
         mol.verbose = 0
         mol.basis = '631g'
         mol.spin = 2
diff --git a/pyscf/scf/uhf_symm.py b/pyscf/scf/uhf_symm.py
index d289049405..1ea38b0956 100644
--- a/pyscf/scf/uhf_symm.py
+++ b/pyscf/scf/uhf_symm.py
@@ -530,7 +530,7 @@ def _finalize(self):
             mo_b = lib.tag_array(self.mo_coeff[1][:,idxb], orbsym=orbsymb,
                                  degen_mapping=degen_b)
         self.mo_coeff = (mo_a, mo_b)
-        self.mo_occ = (self.mo_occ[0][idxa], self.mo_occ[1][idxb])
+        self.mo_occ = numpy.asarray([self.mo_occ[0][idxa], self.mo_occ[1][idxb]])
         if self.chkfile:
             chkfile.dump_scf(self.mol, self.chkfile, self.e_tot, self.mo_energy,
                              self.mo_coeff, self.mo_occ, overwrite_mol=False)

From 68952e57a48fc236169873c535b2580c7e9a6875 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Thu, 1 Feb 2024 14:50:23 -0800
Subject: [PATCH 03/44] Check cylindrical symmetry of the orbitals for FCI
 solver (issue #2022)

---
 pyscf/fci/direct_spin1_cyl_sym.py    | 10 +++++++---
 pyscf/fci/direct_spin1_symm.py       | 12 ++++++++++++
 pyscf/fci/test/test_spin1_cyl_sym.py |  9 +++++++++
 pyscf/fci/test/test_spin1_symm.py    | 13 ++++++++++++-
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/pyscf/fci/direct_spin1_cyl_sym.py b/pyscf/fci/direct_spin1_cyl_sym.py
index 0056bbd992..80aa27192f 100644
--- a/pyscf/fci/direct_spin1_cyl_sym.py
+++ b/pyscf/fci/direct_spin1_cyl_sym.py
@@ -42,9 +42,9 @@
 from pyscf.fci import cistring
 from pyscf.fci import direct_spin1
 from pyscf.fci import direct_spin1_symm
-from pyscf.fci.direct_spin1_symm import (_sv_associated_det,
-                                         _strs_angular_momentum,
-                                         _cyl_sym_orbital_rotation)
+from pyscf.fci.direct_spin1_symm import (
+    _sv_associated_det, _strs_angular_momentum, _cyl_sym_orbital_rotation,
+    _validate_degen_mapping)
 from pyscf.fci import direct_nosym
 from pyscf.fci import addons
 from pyscf import __config__
@@ -558,6 +558,10 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
         if not hasattr(orbsym, 'degen_mapping'):
             degen_mapping = map_degeneracy(h1e.diagonal(), orbsym)
             orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
+        if not _validate_degen_mapping(orbsym.degen_mapping, norb):
+            raise lib.exceptions.PointGroupSymmetryError(
+                'Incomplete 2D-irrep orbitals for cylindrical symmetry.\n'
+                f'orbsym = {orbsym}.')
 
         u = _cyl_sym_orbital_rotation(orbsym, orbsym.degen_mapping)
         h1e = u.dot(h1e).dot(u.conj().T)
diff --git a/pyscf/fci/direct_spin1_symm.py b/pyscf/fci/direct_spin1_symm.py
index 272a432ec6..017505836a 100644
--- a/pyscf/fci/direct_spin1_symm.py
+++ b/pyscf/fci/direct_spin1_symm.py
@@ -257,6 +257,13 @@ def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
         raise RuntimeError(f'Initial guess for symmetry {wfnsym} not found')
     return ci0
 
+def _validate_degen_mapping(mapping, norb):
+    '''Check if 2D irreps are properly paired'''
+    mapping = np.asarray(mapping)
+    return (mapping.max() < norb and
+            # Must be self-conjugated
+            numpy.array_equal(mapping[mapping], numpy.arange(norb)))
+
 def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
     neleca, nelecb = _unpack_nelec(nelec)
     strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
@@ -751,6 +758,11 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
                 orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
             if davidson_only is None:
                 davidson_only = True
+            if not _validate_degen_mapping(orbsym.degen_mapping, norb):
+                raise lib.exceptions.PointGroupSymmetryError(
+                    'Incomplete 2D-irrep orbitals for cylindrical symmetry.\n'
+                    f'orbsym = {orbsym}. '
+                    f'Retry {self.__class__} with D2h subgroup symmetry.')
 
         wfnsym_ir = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
         self.sym_allowed_idx = sym_allowed_indices(nelec, orbsym, wfnsym_ir)
diff --git a/pyscf/fci/test/test_spin1_cyl_sym.py b/pyscf/fci/test/test_spin1_cyl_sym.py
index 84e014d690..0648528e31 100644
--- a/pyscf/fci/test/test_spin1_cyl_sym.py
+++ b/pyscf/fci/test/test_spin1_cyl_sym.py
@@ -212,6 +212,15 @@ def test_linearmole_a2(self):
         mc.run()
         self.assertAlmostEqual(mc.e_tot, 2.8999951068356475, 8)
 
+    def test_incomplete_orbsym(self):
+        sol = direct_spin1_cyl_sym.FCI(gto.Mole())
+        no, ne = 2, 2
+        h1 = np.ones((no,no))
+        h2 = np.ones((no,no,no,no))
+        orbsym = lib.tag_array(np.array([0,3]), degen_mapping=[0,2])
+        with self.assertRaises(lib.exceptions.PointGroupSymmetryError):
+            sol.kernel(h1, h2, no, ne, orbsym=orbsym)
+
 if __name__ == "__main__":
     print("Full Tests for spin1-symm")
     unittest.main()
diff --git a/pyscf/fci/test/test_spin1_symm.py b/pyscf/fci/test/test_spin1_symm.py
index 462009e3dd..4b63d48d27 100644
--- a/pyscf/fci/test/test_spin1_symm.py
+++ b/pyscf/fci/test/test_spin1_symm.py
@@ -15,7 +15,7 @@
 
 import unittest
 import numpy
-from pyscf import gto
+from pyscf import gto, lib
 from pyscf import scf
 from pyscf import ao2mo
 from pyscf import fci
@@ -197,6 +197,17 @@ def test_linearmole(self):
         ci1 = fci.addons.transform_ci(ci_y, (3,3), u.T)
         self.assertAlmostEqual(abs(ci1.ravel().dot(ci_y.ravel())), 1, 9)
 
+    def test_incomplete_orbsym(self):
+        mol = gto.Mole()
+        mol.groupname = 'Dooh'
+        sol = direct_spin1_symm.FCI(mol)
+        no, ne = 2, 2
+        h1 = numpy.ones((no,no))
+        h2 = numpy.ones((no,no,no,no))
+        orbsym = lib.tag_array(numpy.array([0,3]), degen_mapping=[0,2])
+        with self.assertRaises(lib.exceptions.PointGroupSymmetryError):
+            sol.kernel(h1, h2, no, ne, orbsym=orbsym)
+
 if __name__ == "__main__":
     print("Full Tests for spin1-symm")
     unittest.main()

From 751103d903857e4792be2e995bb49bbe00d8a00e Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 3 Feb 2024 14:39:29 -0800
Subject: [PATCH 04/44] Fix bug for init_guess=atom (issue #2056) Fix dft
 get_veff tests

---
 pyscf/dft/test/test_h2o.py | 11 ++++++-----
 pyscf/scf/hf.py            |  2 +-
 pyscf/scf/test/test_h2o.py |  6 ++++++
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/pyscf/dft/test/test_h2o.py b/pyscf/dft/test/test_h2o.py
index 7a147c9bae..7eee60fe8c 100644
--- a/pyscf/dft/test/test_h2o.py
+++ b/pyscf/dft/test/test_h2o.py
@@ -91,6 +91,7 @@ def tearDownModule():
 class KnownValues(unittest.TestCase):
     def test_nr_lda(self):
         method = dft.RKS(h2o)
+        method.init_guess = 'atom' # initial guess problem, issue #2056
         method.grids.prune = dft.gen_grid.treutler_prune
         method.grids.atom_grid = {"H": (50, 194), "O": (50, 194),}
         method.xc = 'lda, vwn_rpa'
@@ -372,7 +373,7 @@ def test_nr_rks_nlc(self):
         self.assertAlmostEqual(lib.fp(vxc), 22.767792068559917, 8)
 
         method.xc = 'B97M_V'
-        vxc = method.get_veff(h2o, dm)
+        vxc = method.get_veff(h2o, dm, dm, vxc)
         self.assertAlmostEqual(lib.fp(vxc), 23.067046560473408, 8)
 
     def test_nr_rks_nlc_small_memory_high_cost(self):
@@ -381,7 +382,7 @@ def test_nr_rks_nlc_small_memory_high_cost(self):
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
-        vxc = method.get_veff(h2o, dm, dm, vxc)
+        vxc = method.get_veff(h2o, dm)
         self.assertAlmostEqual(lib.fp(vxc), 22.767792068559917, 8)
 
         method._eri = None
@@ -419,7 +420,7 @@ def test_nr_uks_nlc_high_cost(self):
         self.assertAlmostEqual(lib.fp(vxc[1]), 22.767792068559917, 8)
 
         method.xc = 'B97M_V'
-        vxc = method.get_veff(h2o, dm)
+        vxc = method.get_veff(h2o, dm, dm, vxc)
         self.assertAlmostEqual(lib.fp(vxc[0]), 23.067046560473408, 8)
         self.assertAlmostEqual(lib.fp(vxc[1]), 23.067046560473408, 8)
 
@@ -430,7 +431,7 @@ def test_nr_uks_nlc_small_memory_high_cost(self):
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
-        vxc = method.get_veff(h2o, dm, dm, vxc)
+        vxc = method.get_veff(h2o, dm)
         self.assertAlmostEqual(lib.fp(vxc[0]), 22.767792068559917, 8)
         self.assertAlmostEqual(lib.fp(vxc[1]), 22.767792068559917, 8)
 
@@ -471,7 +472,7 @@ def test_nr_gks_nlc_small_memory_high_cost(self):
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
-        vxc = method.get_veff(h2o, dm, dm, vxc)
+        vxc = method.get_veff(h2o, dm)
         self.assertAlmostEqual(lib.fp(vxc), 3.172920887028461+0j, 8)
 
         method._eri = None
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index b71de95b03..9f559265b5 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -506,7 +506,7 @@ def init_guess_by_atom(mol):
 
     dm = scipy.linalg.block_diag(*atm_dms)
     mo_coeff = scipy.linalg.block_diag(*mo_coeff)
-    mo_occ = numpy.hstack(occ)
+    mo_occ = numpy.hstack(mo_occ)
 
     if mol.cart:
         cart2sph = mol.cart2sph_coeff(normalized='sp')
diff --git a/pyscf/scf/test/test_h2o.py b/pyscf/scf/test/test_h2o.py
index 042d5bf3ad..706cb5df6a 100644
--- a/pyscf/scf/test/test_h2o.py
+++ b/pyscf/scf/test/test_h2o.py
@@ -191,6 +191,8 @@ def test_nr_uhf_symm(self):
 
     def test_init_guess_minao(self):
         dm = scf.hf.init_guess_by_minao(mol)
+        self.assertEqual(dm.mo_coeff.shape[0], mol.nao)
+        self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
         ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
@@ -213,6 +215,8 @@ def test_init_guess_minao(self):
 
     def test_init_guess_atom(self):
         dm = scf.hf.init_guess_by_atom(mol)
+        self.assertEqual(dm.mo_coeff.shape[0], mol.nao)
+        self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
         ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
@@ -240,6 +244,8 @@ def test_init_guess_atom(self):
 
     def test_init_guess_1e(self):
         dm = scf.hf.init_guess_by_1e(mol)
+        self.assertEqual(dm.mo_coeff.shape[0], mol.nao)
+        self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
         ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)

From f32c4c0dbd0932df48e3b70bb7e19235066bfbfa Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 3 Feb 2024 14:07:39 -0800
Subject: [PATCH 05/44] Improve magmom initialization (issue #2055)

---
 pyscf/gto/mole.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index 76f5e8b95b..ab1e32c61c 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -1209,7 +1209,8 @@ def copy(mol, deep=True):
     newmol._ecp    = copy.deepcopy(mol._ecp)
     newmol.pseudo  = copy.deepcopy(mol.pseudo)
     newmol._pseudo = copy.deepcopy(mol._pseudo)
-    newmol.magmom  = list(mol.magmom)
+    if mol.magmom:
+        newmol.magmom  = list(mol.magmom)
     return newmol
 
 def pack(mol):
@@ -2577,16 +2578,16 @@ def build(self, dump_input=True, parse_arg=ARGPARSE,
             # number of electrons are consistent.
             self.nelec
 
-        if self.magmom is None:
+        if not self.magmom:
             self.magmom = [0,] * self.natm
         elif len(self.magmom) != self.natm:
             logger.warn(self, 'len(magmom) != natm. Set magmom to zero')
             self.magmom = [0,] * self.natm
+        elif isinstance(self.magmom, np.ndarray):
+            self.magmom = self.magmom.tolist()
         if self.spin == 0 and abs(numpy.sum(self.magmom) - self.spin) > 1e-6:
             #don't check for unrestricted calcs.
             raise ValueError("mol.magmom is set incorrectly.")
-        if isinstance(self.magmom, np.ndarray):
-            self.magmom = self.magmom.tolist()
 
         if self.symmetry:
             self._build_symmetry()

From 5592023258158e6f98c6f5148b5137fb998dab9a Mon Sep 17 00:00:00 2001
From: Hongzhou Ye <hzyechem@gmail.com>
Date: Sun, 4 Feb 2024 02:51:07 -0500
Subject: [PATCH 06/44] DIIS with damping (#2053)

* update simple damping and add DIIS damping

* update get_fock in solvent and dynamic level shift

---------

Co-authored-by: hongzhouye <>
---
 pyscf/pbc/scf/khf.py             | 10 +++++-----
 pyscf/pbc/scf/krohf.py           |  7 ++++---
 pyscf/pbc/scf/kuhf.py            | 13 +++++++------
 pyscf/pbc/scf/test/test_khf.py   | 23 +++++++----------------
 pyscf/scf/addons.py              |  5 +++--
 pyscf/scf/diis.py                | 11 ++++++++---
 pyscf/scf/hf.py                  | 32 ++++++++++++++++++--------------
 pyscf/scf/rohf.py                |  7 ++++---
 pyscf/scf/test/test_rhf.py       | 13 ++++++++-----
 pyscf/scf/test/test_uhf.py       | 16 ++++++++--------
 pyscf/scf/uhf.py                 | 11 ++++++-----
 pyscf/solvent/_attach_solvent.py |  5 +++--
 12 files changed, 81 insertions(+), 72 deletions(-)

diff --git a/pyscf/pbc/scf/khf.py b/pyscf/pbc/scf/khf.py
index 3c03e36661..1ef2d88908 100644
--- a/pyscf/pbc/scf/khf.py
+++ b/pyscf/pbc/scf/khf.py
@@ -125,7 +125,8 @@ def get_jk(mf, cell, dm_kpts, kpts, kpts_band=None, with_j=True, with_k=True,
                                  omega, exxdiv=mf.exxdiv)
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     h1e_kpts, s_kpts, vhf_kpts, dm_kpts = h1e, s1e, vhf, dm
     if h1e_kpts is None: h1e_kpts = mf.get_hcore()
     if vhf_kpts is None: vhf_kpts = mf.get_veff(mf.cell, dm_kpts)
@@ -142,11 +143,10 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if s_kpts is None: s_kpts = mf.get_ovlp()
     if dm_kpts is None: dm_kpts = mf.make_rdm1()
 
-    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4:
-        f_kpts = [mol_hf.damping(s1e, dm_kpts[k] * 0.5, f_kpts[k], damp_factor)
-                  for k, s1e in enumerate(s_kpts)]
+    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None:
+        f_kpts = [mol_hf.damping(f, f_prev, damp_factor) for f,f_prev in zip(f_kpts,fock_last)]
     if diis and cycle >= diis_start_cycle:
-        f_kpts = diis.update(s_kpts, dm_kpts, f_kpts, mf, h1e_kpts, vhf_kpts)
+        f_kpts = diis.update(s_kpts, dm_kpts, f_kpts, mf, h1e_kpts, vhf_kpts, f_prev=fock_last)
     if abs(level_shift_factor) > 1e-4:
         f_kpts = [mol_hf.level_shift(s, dm_kpts[k], f_kpts[k], level_shift_factor)
                   for k, s in enumerate(s_kpts)]
diff --git a/pyscf/pbc/scf/krohf.py b/pyscf/pbc/scf/krohf.py
index fb9b8d7161..6a23588fb7 100644
--- a/pyscf/pbc/scf/krohf.py
+++ b/pyscf/pbc/scf/krohf.py
@@ -51,7 +51,8 @@ def make_rdm1(mo_coeff_kpts, mo_occ_kpts, **kwargs):
     return lib.tag_array((dma, dmb), mo_coeff=mo_coeff_kpts, mo_occ=mo_occ_kpts)
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     h1e_kpts, s_kpts, vhf_kpts, dm_kpts = h1e, s1e, vhf, dm
     if h1e_kpts is None: h1e_kpts = mf.get_hcore()
     if vhf_kpts is None: vhf_kpts = mf.get_veff(mf.cell, dm_kpts)
@@ -71,10 +72,10 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if dm_kpts is None: dm_kpts = mf.make_rdm1()
 
     dm_sf = dm_kpts[0] + dm_kpts[1]
-    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4:
+    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None:
         raise NotImplementedError('ROHF Fock-damping')
     if diis and cycle >= diis_start_cycle:
-        f_kpts = diis.update(s_kpts, dm_sf, f_kpts, mf, h1e_kpts, vhf_kpts)
+        f_kpts = diis.update(s_kpts, dm_sf, f_kpts, mf, h1e_kpts, vhf_kpts, f_prev=fock_last)
     if abs(level_shift_factor) > 1e-4:
         f_kpts = [mol_hf.level_shift(s, dm_sf[k]*.5, f_kpts[k], level_shift_factor)
                   for k, s in enumerate(s_kpts)]
diff --git a/pyscf/pbc/scf/kuhf.py b/pyscf/pbc/scf/kuhf.py
index f0b077ad36..af56a2ced3 100644
--- a/pyscf/pbc/scf/kuhf.py
+++ b/pyscf/pbc/scf/kuhf.py
@@ -59,7 +59,8 @@ def make_dm(mos, occs):
     return lib.tag_array(dm, mo_coeff=mo_coeff_kpts, mo_occ=mo_occ_kpts)
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     h1e_kpts, s_kpts, vhf_kpts, dm_kpts = h1e, s1e, vhf, dm
     if h1e_kpts is None: h1e_kpts = mf.get_hcore()
     if vhf_kpts is None: vhf_kpts = mf.get_veff(mf.cell, dm_kpts)
@@ -85,15 +86,15 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     else:
         dampa = dampb = damp_factor
 
-    if 0 <= cycle < diis_start_cycle-1 and abs(dampa)+abs(dampb) > 1e-4:
+    if 0 <= cycle < diis_start_cycle-1 and abs(dampa)+abs(dampb) > 1e-4 and fock_last is not None:
         f_a = []
         f_b = []
-        for k, s1e in enumerate(s_kpts):
-            f_a.append(mol_hf.damping(s1e, dm_kpts[0][k], f_kpts[0][k], dampa))
-            f_b.append(mol_hf.damping(s1e, dm_kpts[1][k], f_kpts[1][k], dampb))
+        for k in range(len(s_kpts)):
+            f_a.append(mol_hf.damping(f_kpts[0][k], fock_last[0][k], dampa))
+            f_b.append(mol_hf.damping(f_kpts[1][k], fock_last[1][k], dampa))
         f_kpts = [f_a, f_b]
     if diis and cycle >= diis_start_cycle:
-        f_kpts = diis.update(s_kpts, dm_kpts, f_kpts, mf, h1e_kpts, vhf_kpts)
+        f_kpts = diis.update(s_kpts, dm_kpts, f_kpts, mf, h1e_kpts, vhf_kpts, f_prev=fock_last)
     if abs(level_shift_factor) > 1e-4:
         f_kpts =([mol_hf.level_shift(s, dm_kpts[0,k], f_kpts[0,k], shifta)
                   for k, s in enumerate(s_kpts)],
diff --git a/pyscf/pbc/scf/test/test_khf.py b/pyscf/pbc/scf/test/test_khf.py
index 643776713d..7d39168fee 100644
--- a/pyscf/pbc/scf/test/test_khf.py
+++ b/pyscf/pbc/scf/test/test_khf.py
@@ -292,23 +292,14 @@ def test_small_system(self):
     def test_damping(self):
         nao = cell.nao
         np.random.seed(1)
-        s = kmf.get_ovlp()
-        d = np.random.random((len(kpts),nao,nao))
-        d = (d + d.transpose(0,2,1)) * 2
-        vhf = 0
-        f = khf.get_fock(kmf, kmf.get_hcore(), s, vhf, d, cycle=0,
-                         diis_start_cycle=2, damp_factor=0.5)
-        self.assertAlmostEqual(np.linalg.norm(f[0]), 95.32749551722966, 6)
-        self.assertAlmostEqual(np.linalg.norm(f[1]), 73.9231303798864, 6)
-        self.assertAlmostEqual(np.linalg.norm(f[2]), 58.973290554565196, 6)
-
-        vhf = np.zeros((2,len(kpts),nao,nao))
-        d1 = np.asarray([d/2, d/2])
-        f1 = kuhf.get_fock(kumf, kumf.get_hcore(), s, vhf, d1, cycle=0,
-                             diis_start_cycle=2, damp_factor=0.5)
+        f = kmf.get_hcore()
+        df  = np.random.rand(len(kpts),nao,nao)
+        f_prev = f + df
+        damp = 0.3
+        f_damp = khf.get_fock(kmf, h1e=0, s1e=0, vhf=f, dm=0, cycle=0,
+                              diis_start_cycle=2, damp_factor=damp, fock_last=f_prev)
         for k in range(len(kpts)):
-            self.assertAlmostEqual(abs(f[k] - f1[0,k]).max(), 0, 9)
-            self.assertAlmostEqual(abs(f[k] - f1[1,k]).max(), 0, 9)
+            self.assertAlmostEqual(abs(f_damp[k] - (f[k]*(1-damp) + f_prev[k]*damp)).max(), 0, 9)
 
 if __name__ == '__main__':
     print("Full Tests for pbc.scf.khf")
diff --git a/pyscf/scf/addons.py b/pyscf/scf/addons.py
index ae25c6874b..213d11721a 100644
--- a/pyscf/scf/addons.py
+++ b/pyscf/scf/addons.py
@@ -406,7 +406,8 @@ def dynamic_level_shift_(mf, factor=1.):
     old_get_fock = mf.get_fock
     mf._last_e = None
     def get_fock(h1e, s1e, vhf, dm, cycle=-1, diis=None,
-                 diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+                 diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+                 fock_last=None):
         if cycle > 0 or diis is not None:
             if 'exc' in mf.scf_summary:  # DFT
                 e_tot = mf.scf_summary['e1'] + mf.scf_summary['coul'] + mf.scf_summary['exc']
@@ -417,7 +418,7 @@ def get_fock(h1e, s1e, vhf, dm, cycle=-1, diis=None,
                 logger.info(mf, 'Set level shift to %g', level_shift_factor)
             mf._last_e = e_tot
         return old_get_fock(h1e, s1e, vhf, dm, cycle, diis, diis_start_cycle,
-                            level_shift_factor, damp_factor)
+                            level_shift_factor, damp_factor, fock_last=fock_last)
     mf.get_fock = get_fock
     return mf
 dynamic_level_shift = dynamic_level_shift_
diff --git a/pyscf/scf/diis.py b/pyscf/scf/diis.py
index 9f273ed5a1..321f81cdfe 100644
--- a/pyscf/scf/diis.py
+++ b/pyscf/scf/diis.py
@@ -43,6 +43,7 @@ def __init__(self, mf=None, filename=None, Corth=None):
         self.rollback = 0
         self.space = 8
         self.Corth = Corth
+        self.damp = 0
         #?self._scf = mf
         #?if hasattr(self._scf, 'get_orbsym'): # Symmetry adapted SCF objects
         #?    self.orbsym = mf.get_orbsym(Corth)
@@ -51,7 +52,11 @@ def __init__(self, mf=None, filename=None, Corth=None):
     def update(self, s, d, f, *args, **kwargs):
         errvec = get_err_vec(s, d, f, self.Corth)
         logger.debug1(self, 'diis-norm(errvec)=%g', numpy.linalg.norm(errvec))
-        xnew = lib.diis.DIIS.update(self, f, xerr=errvec)
+        f_prev = kwargs.get('f_prev', None)
+        if abs(self.damp) < 1e-6 or f_prev is None:
+            xnew = lib.diis.DIIS.update(self, f, xerr=errvec)
+        else:
+            xnew = lib.diis.DIIS.update(self, f*(1-self.damp) + f_prev*self.damp, xerr=errvec)
         if self.rollback > 0 and len(self._bookkeep) == self.space:
             self._bookkeep = self._bookkeep[-self.rollback:]
         return xnew
@@ -125,7 +130,7 @@ class EDIIS(lib.diis.DIIS):
     '''SCF-EDIIS
     Ref: JCP 116, 8255 (2002); DOI:10.1063/1.1470195
     '''
-    def update(self, s, d, f, mf, h1e, vhf):
+    def update(self, s, d, f, mf, h1e, vhf, *args, **kwargs):
         if self._head >= self.space:
             self._head = 0
         if not self._buffer:
@@ -185,7 +190,7 @@ class ADIIS(lib.diis.DIIS):
     '''
     Ref: JCP 132, 054109 (2010); DOI:10.1063/1.3304922
     '''
-    def update(self, s, d, f, mf, h1e, vhf):
+    def update(self, s, d, f, mf, h1e, vhf, *args, **kwargs):
         if self._head >= self.space:
             self._head = 0
         if not self._buffer:
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index 9f559265b5..b6ecb5ace0 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -149,6 +149,7 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         mf_diis = mf.DIIS(mf, mf.diis_file)
         mf_diis.space = mf.diis_space
         mf_diis.rollback = mf.diis_space_rollback
+        mf_diis.damp = mf.diis_damp
 
         # We get the used orthonormalized AO basis from any old eigendecomposition.
         # Since the ingredients for the Fock matrix has already been built, we can
@@ -166,12 +167,13 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
     # A preprocessing hook before the SCF iteration
     mf.pre_kernel(locals())
 
+    fock_last = None
     cput1 = logger.timer(mf, 'initialize scf', *cput0)
     for cycle in range(mf.max_cycle):
         dm_last = dm
         last_hf_e = e_tot
 
-        fock = mf.get_fock(h1e, s1e, vhf, dm, cycle, mf_diis)
+        fock = mf.get_fock(h1e, s1e, vhf, dm, cycle, mf_diis, fock_last=fock_last)
         mo_energy, mo_coeff = mf.eig(fock, s1e)
         mo_occ = mf.get_occ(mo_energy, mo_coeff)
         dm = mf.make_rdm1(mo_coeff, mo_occ)
@@ -181,6 +183,7 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         # Here Fock matrix is h1e + vhf, without DIIS.  Calling get_fock
         # instead of the statement "fock = h1e + vhf" because Fock matrix may
         # be modified in some methods.
+        fock_last = fock
         fock = mf.get_fock(h1e, s1e, vhf, dm)  # = h1e + vhf, no DIIS
         norm_gorb = numpy.linalg.norm(mf.get_grad(mo_coeff, mo_occ, fock))
         if not TIGHT_GRAD_CONV_TOL:
@@ -753,14 +756,8 @@ def level_shift(s, d, f, factor):
     return f + dm_vir * factor
 
 
-def damping(s, d, f, factor):
-    #dm_vir = s - reduce(numpy.dot, (s,d,s))
-    #sinv = numpy.linalg.inv(s)
-    #f0 = reduce(numpy.dot, (dm_vir, sinv, f, d, s))
-    dm_vir = numpy.eye(s.shape[0]) - numpy.dot(s, d)
-    f0 = reduce(numpy.dot, (dm_vir, f, d, s))
-    f0 = (f0+f0.conj().T) * (factor/(factor+1.))
-    return f - f0
+def damping(f, f_prev, factor):
+    return f*(1-factor) + f_prev*factor
 
 
 # full density matrix for RHF
@@ -990,7 +987,8 @@ def get_veff(mol, dm, dm_last=None, vhf_last=None, hermi=1, vhfopt=None):
         return vj - vk * .5 + numpy.asarray(vhf_last)
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     '''F = h^{core} + V^{HF}
 
     Special treatment (damping, DIIS, or level shift) will be applied to the
@@ -1030,10 +1028,10 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if s1e is None: s1e = mf.get_ovlp()
     if dm is None: dm = mf.make_rdm1()
 
-    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4:
-        f = damping(s1e, dm*.5, f, damp_factor)
+    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None:
+        f = damping(f, fock_last, damp_factor)
     if diis is not None and cycle >= diis_start_cycle:
-        f = diis.update(s1e, dm, f, mf, h1e, vhf)
+        f = diis.update(s1e, dm, f, mf, h1e, vhf, f_prev=fock_last)
     if abs(level_shift_factor) > 1e-4:
         f = level_shift(s1e, dm*.5, f, level_shift_factor)
     return f
@@ -1463,6 +1461,8 @@ class SCF(lib.StreamObject):
             vector) will be reused.
         diis_space : int
             DIIS space size.  By default, 8 Fock matrices and errors vector are stored.
+        diis_damp : float
+            DIIS damping factor.  Default is 0.
         diis_start_cycle : int
             The step to start DIIS.  Default is 1.
         diis_file: 'str'
@@ -1515,6 +1515,7 @@ class SCF(lib.StreamObject):
     DIIS = diis.SCF_DIIS
     diis = getattr(__config__, 'scf_hf_SCF_diis', True)
     diis_space = getattr(__config__, 'scf_hf_SCF_diis_space', 8)
+    diis_damp = getattr(__config__, 'scf_hf_SCF_diis_damp', 0)
     # need > 0 if initial DM is numpy.zeros array
     diis_start_cycle = getattr(__config__, 'scf_hf_SCF_diis_start_cycle', 1)
     diis_file = None
@@ -1530,7 +1531,7 @@ class SCF(lib.StreamObject):
 
     _keys = {
         'conv_tol', 'conv_tol_grad', 'max_cycle', 'init_guess',
-        'DIIS', 'diis', 'diis_space', 'diis_start_cycle',
+        'DIIS', 'diis', 'diis_space', 'diis_damp', 'diis_start_cycle',
         'diis_file', 'diis_space_rollback', 'damp', 'level_shift',
         'direct_scf', 'direct_scf_tol', 'conv_check', 'callback',
         'mol', 'chkfile', 'mo_energy', 'mo_coeff', 'mo_occ',
@@ -1597,10 +1598,13 @@ def dump_flags(self, verbose=None):
             log.info('DIIS = %s', self.diis)
             log.info('diis_start_cycle = %d', self.diis_start_cycle)
             log.info('diis_space = %d', self.diis.space)
+            if getattr(self.diis, 'damp', None):
+                log.info('diis_damp = %g', self.diis.damp)
         elif self.diis:
             log.info('DIIS = %s', self.DIIS)
             log.info('diis_start_cycle = %d', self.diis_start_cycle)
             log.info('diis_space = %d', self.diis_space)
+            log.info('diis_damp = %g', self.diis_damp)
         else:
             log.info('DIIS disabled')
         log.info('SCF conv_tol = %g', self.conv_tol)
diff --git a/pyscf/scf/rohf.py b/pyscf/scf/rohf.py
index 6be51ebcd7..951e08a526 100644
--- a/pyscf/scf/rohf.py
+++ b/pyscf/scf/rohf.py
@@ -73,7 +73,8 @@ def init_guess_by_chkfile(mol, chkfile_name, project=None):
     return lib.tag_array(dm, mo_coeff=mo_coeff, mo_occ=mo_occ)
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     '''Build fock matrix based on Roothaan's effective fock.
     See also :func:`get_roothaan_fock`
     '''
@@ -100,10 +101,10 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
         damp_factor = mf.damp
 
     dm_tot = dm[0] + dm[1]
-    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4:
+    if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None:
         raise NotImplementedError('ROHF Fock-damping')
     if diis and cycle >= diis_start_cycle:
-        f = diis.update(s1e, dm_tot, f, mf, h1e, vhf)
+        f = diis.update(s1e, dm_tot, f, mf, h1e, vhf, f_prev=fock_last)
     if abs(level_shift_factor) > 1e-4:
         f = hf.level_shift(s1e, dm_tot*.5, f, level_shift_factor)
     f = lib.tag_array(f, focka=focka, fockb=fockb)
diff --git a/pyscf/scf/test/test_rhf.py b/pyscf/scf/test/test_rhf.py
index 200a495e6e..5690e466f6 100644
--- a/pyscf/scf/test/test_rhf.py
+++ b/pyscf/scf/test/test_rhf.py
@@ -416,11 +416,14 @@ def test_nr_rohf(self):
     def test_damping(self):
         nao = mol.nao_nr()
         numpy.random.seed(1)
-        s = scf.hf.get_ovlp(mol)
-        d = numpy.random.random((nao,nao))
-        d = d + d.T
-        f = scf.hf.damping(s, d, scf.hf.get_hcore(mol), .5)
-        self.assertAlmostEqual(numpy.linalg.norm(f), 23361.854064083178, 9)
+        f = scf.hf.get_hcore(mol)
+        df  = numpy.random.rand(nao,nao)
+        df += df.T
+        f_prev = f + df
+        damp = 0.3
+        f_damp = scf.hf.get_fock(mf, h1e=0, s1e=0, vhf=f, dm=0, cycle=0,
+                                 diis_start_cycle=2, damp_factor=damp, fock_last=f_prev)
+        self.assertAlmostEqual(abs(f_damp - (f*(1-damp) + f_prev*damp)).max(), 0, 9)
 
     def test_level_shift(self):
         nao = mol.nao_nr()
diff --git a/pyscf/scf/test/test_uhf.py b/pyscf/scf/test/test_uhf.py
index fc7607d3d5..36c84e28cf 100644
--- a/pyscf/scf/test/test_uhf.py
+++ b/pyscf/scf/test/test_uhf.py
@@ -399,14 +399,14 @@ def test_get_occ_extreme_case(self):
     def test_damping(self):
         nao = mol.nao_nr()
         numpy.random.seed(1)
-        s = scf.hf.get_ovlp(mol)
-        d = numpy.random.random((nao,nao))
-        d = (d + d.T) * 2
-        vhf = 0
-        f = scf.uhf.get_fock(mf, scf.hf.get_hcore(mol), s, vhf, d, cycle=0,
-                             diis_start_cycle=2, damp_factor=0.5)
-        self.assertAlmostEqual(numpy.linalg.norm(f[0]), 23361.854064083178, 9)
-        self.assertAlmostEqual(numpy.linalg.norm(f[1]), 23361.854064083178, 9)
+        f = numpy.asarray([scf.hf.get_hcore(mol)]*2)
+        df  = numpy.random.rand(2,nao,nao)
+        f_prev = f + df
+        damp = 0.3
+        f_damp = scf.uhf.get_fock(mf, h1e=0, s1e=0, vhf=f, dm=0, cycle=0,
+                                 diis_start_cycle=2, damp_factor=damp, fock_last=f_prev)
+        self.assertAlmostEqual(abs(f_damp[0] - (f[0]*(1-damp) + f_prev[0]*damp)).max(), 0, 9)
+        self.assertAlmostEqual(abs(f_damp[1] - (f[1]*(1-damp) + f_prev[1]*damp)).max(), 0, 9)
 
     def test_get_irrep_nelec(self):
         fock = n2mf.get_fock()
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index 2200c5e0e0..4f07335bd6 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -236,7 +236,8 @@ def get_veff(mol, dm, dm_last=0, vhf_last=0, hermi=1, vhfopt=None):
     return vhf
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
-             diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
+             diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+             fock_last=None):
     if h1e is None: h1e = mf.get_hcore()
     if vhf is None: vhf = mf.get_veff(mf.mol, dm)
     f = numpy.asarray(h1e) + vhf
@@ -265,11 +266,11 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
 
     if isinstance(dm, numpy.ndarray) and dm.ndim == 2:
         dm = [dm*.5] * 2
-    if 0 <= cycle < diis_start_cycle-1 and abs(dampa)+abs(dampb) > 1e-4:
-        f = (hf.damping(s1e, dm[0], f[0], dampa),
-             hf.damping(s1e, dm[1], f[1], dampb))
+    if 0 <= cycle < diis_start_cycle-1 and abs(dampa)+abs(dampb) > 1e-4 and fock_last is not None:
+        f = (hf.damping(f[0], fock_last[0], dampa),
+             hf.damping(f[1], fock_last[1], dampa))
     if diis and cycle >= diis_start_cycle:
-        f = diis.update(s1e, dm, f, mf, h1e, vhf)
+        f = diis.update(s1e, dm, f, mf, h1e, vhf, f_prev=fock_last)
     if abs(shifta)+abs(shiftb) > 1e-4:
         f = (hf.level_shift(s1e, dm[0], f[0], shifta),
              hf.level_shift(s1e, dm[1], f[1], shiftb))
diff --git a/pyscf/solvent/_attach_solvent.py b/pyscf/solvent/_attach_solvent.py
index ab5d58dc7c..1f98b9677d 100644
--- a/pyscf/solvent/_attach_solvent.py
+++ b/pyscf/solvent/_attach_solvent.py
@@ -92,14 +92,15 @@ def get_veff(self, mol=None, dm=None, *args, **kwargs):
 
     def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1,
                  diis=None, diis_start_cycle=None,
-                 level_shift_factor=None, damp_factor=None):
+                 level_shift_factor=None, damp_factor=None, fock_last=None):
         # DIIS was called inside super().get_fock. v_solvent, as a function of
         # dm, should be extrapolated as well. To enable it, v_solvent has to be
         # added to the fock matrix before DIIS was called.
         if getattr(vhf, 'v_solvent', None) is None:
             vhf = self.get_veff(self.mol, dm)
         return super().get_fock(h1e, s1e, vhf+vhf.v_solvent, dm, cycle, diis,
-                                diis_start_cycle, level_shift_factor, damp_factor)
+                                diis_start_cycle, level_shift_factor, damp_factor,
+                                fock_last)
 
     def energy_elec(self, dm=None, h1e=None, vhf=None):
         if dm is None:

From c2d539e190c26a2e3ef8b905c6aac27ec8216a82 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Tue, 14 Nov 2023 13:50:35 -0800
Subject: [PATCH 07/44] Add level shift to CPHF solver

---
 pyscf/hessian/rhf.py                 | 19 +++++--
 pyscf/hessian/test/test_rhf.py       | 12 +++++
 pyscf/hessian/test/test_uhf.py       |  9 ++++
 pyscf/hessian/uhf.py                 |  9 ++--
 pyscf/lib/linalg_helper.py           |  4 +-
 pyscf/lib/test/test_linalg_helper.py | 18 +++++++
 pyscf/scf/cphf.py                    | 56 +++++++++++++-------
 pyscf/scf/ucphf.py                   | 76 +++++++++++++++++-----------
 8 files changed, 146 insertions(+), 57 deletions(-)

diff --git a/pyscf/hessian/rhf.py b/pyscf/hessian/rhf.py
index 90e7db492d..a11fc9d7ff 100644
--- a/pyscf/hessian/rhf.py
+++ b/pyscf/hessian/rhf.py
@@ -296,7 +296,8 @@ def _get_jk(mol, intor, comp, aosym, script_dms,
     return vs
 
 def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
-              fx=None, atmlst=None, max_memory=4000, verbose=None, max_cycle=50):
+              fx=None, atmlst=None, max_memory=4000, verbose=None,
+              max_cycle=50, level_shift=0):
     '''Solve the first order equation
 
     Kwargs:
@@ -343,7 +344,8 @@ def _ao2mo(mat):
 
         h1vo = numpy.vstack(h1vo)
         s1vo = numpy.vstack(s1vo)
-        mo1, e1 = cphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo, max_cycle=max_cycle)
+        mo1, e1 = cphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo,
+                             max_cycle=max_cycle, level_shift=level_shift)
         mo1 = numpy.einsum('pq,xqi->xpi', mo_coeff, mo1).reshape(-1,3,nao,nocc)
         e1 = e1.reshape(-1,3,nocc,nocc)
 
@@ -470,8 +472,15 @@ def h_op(x):
 class HessianBase(lib.StreamObject):
     '''Non-relativistic restricted Hartree-Fock hessian'''
 
+    # Max. number of iterations for Krylov solver
+    max_cycle = 50
+    # Shift virtual orbitals to slightly improve the convergence speed of Krylov solver
+    # A small level_shift ~ 0.1 is often helpful to decrease 2 - 3 iterations
+    # while the error of cphf solver may be increased by one magnitude.
+    level_shift = 0
+
     _keys = {
-        'mol', 'base', 'chkfile', 'atmlst', 'de', 'max_cycle'
+        'mol', 'base', 'chkfile', 'atmlst', 'de', 'max_cycle', 'level_shift'
     }
 
     def __init__(self, scf_method):
@@ -481,7 +490,6 @@ def __init__(self, scf_method):
         self.base = scf_method
         self.chkfile = scf_method.chkfile
         self.max_memory = self.mol.max_memory
-        self.max_cycle = 50
         self.atmlst = range(self.mol.natm)
         self.de = numpy.zeros((0,0,3,3))  # (A,B,dR_A,dR_B)
 
@@ -566,7 +574,8 @@ def get_hcore(iatm, jatm):
     def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
                   fx=None, atmlst=None, max_memory=4000, verbose=None):
         return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
-                         fx, atmlst, max_memory, verbose, max_cycle=self.max_cycle)
+                         fx, atmlst, max_memory, verbose,
+                         max_cycle=self.max_cycle, level_shift=self.level_shift)
 
     def hess_nuc(self, mol=None, atmlst=None):
         if mol is None: mol = self.mol
diff --git a/pyscf/hessian/test/test_rhf.py b/pyscf/hessian/test/test_rhf.py
index e1ae1f7087..7f3bfdb2f9 100644
--- a/pyscf/hessian/test/test_rhf.py
+++ b/pyscf/hessian/test/test_rhf.py
@@ -35,6 +35,18 @@ def tearDownModule():
     del mol
 
 class KnownValues(unittest.TestCase):
+    def test_rhf_hess(self):
+        mf = scf.RHF(mol)
+        e0 = mf.kernel()
+        hess = hessian.RHF(mf).kernel()
+        self.assertAlmostEqual(lib.fp(hess), -0.7816352153153946, 6)
+
+        hobj = hessian.RHF(mf)
+        hobj.max_cycle = 10
+        hobj.level_shift = .1
+        hess = hobj.kernel()
+        self.assertAlmostEqual(lib.fp(hess), -0.7816352153153946, 6)
+
     def test_finite_diff_x2c_rhf_hess(self):
         mf = scf.RHF(mol).x2c()
         mf.conv_tol = 1e-14
diff --git a/pyscf/hessian/test/test_uhf.py b/pyscf/hessian/test/test_uhf.py
index eb2622d10b..06d32b38ad 100644
--- a/pyscf/hessian/test/test_uhf.py
+++ b/pyscf/hessian/test/test_uhf.py
@@ -36,6 +36,15 @@ def tearDownModule():
     del mol
 
 class KnownValues(unittest.TestCase):
+    def test_uhf_hess(self):
+        mf = scf.UHF(mol)
+        mf.conv_tol = 1e-14
+        e0 = mf.kernel()
+        hobj = mf.Hessian()
+        hobj.level_shift = .05
+        hess = hobj.kernel()
+        self.assertAlmostEqual(lib.fp(hess), -0.20243405976628576, 5)
+
     def test_finite_diff_rhf_hess(self):
         mf = scf.UHF(mol)
         mf.conv_tol = 1e-14
diff --git a/pyscf/hessian/uhf.py b/pyscf/hessian/uhf.py
index 1be5ccc587..1b30e264ee 100644
--- a/pyscf/hessian/uhf.py
+++ b/pyscf/hessian/uhf.py
@@ -257,7 +257,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         return chkfile
 
 def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
-              fx=None, atmlst=None, max_memory=4000, verbose=None):
+              fx=None, atmlst=None, max_memory=4000, verbose=None,
+              max_cycle=50, level_shift=0):
     mol = mf.mol
     if atmlst is None: atmlst = range(mol.natm)
 
@@ -306,7 +307,8 @@ def _ao2mo(mat, mo_coeff, mocc):
 
         h1vo = (numpy.vstack(h1voa), numpy.vstack(h1vob))
         s1vo = (numpy.vstack(s1voa), numpy.vstack(s1vob))
-        mo1, e1 = ucphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo)
+        mo1, e1 = ucphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo,
+                              max_cycle=max_cycle, level_shift=level_shift)
         mo1a = numpy.einsum('pq,xqi->xpi', mo_coeff[0], mo1[0]).reshape(-1,3,nao,nocca)
         mo1b = numpy.einsum('pq,xqi->xpi', mo_coeff[1], mo1[1]).reshape(-1,3,nao,noccb)
         e1a = e1[0].reshape(-1,3,nocca,nocca)
@@ -449,7 +451,8 @@ class Hessian(rhf_hess.HessianBase):
     def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
                   fx=None, atmlst=None, max_memory=4000, verbose=None):
         return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
-                         fx, atmlst, max_memory, verbose)
+                         fx, atmlst, max_memory, verbose,
+                         max_cycle=self.max_cycle, level_shift=self.level_shift)
 
     def to_gpu(self):
         raise NotImplementedError
diff --git a/pyscf/lib/linalg_helper.py b/pyscf/lib/linalg_helper.py
index fe2892364c..1fcc1a9265 100644
--- a/pyscf/lib/linalg_helper.py
+++ b/pyscf/lib/linalg_helper.py
@@ -1290,9 +1290,9 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=numpy.dot,
     >>> from pyscf import lib
     >>> a = numpy.random.random((10,10)) * 1e-2
     >>> b = numpy.random.random(10)
-    >>> aop = lambda x: numpy.dot(a,x)
+    >>> aop = lambda x: a.dot(x.T).T
     >>> x = lib.krylov(aop, b)
-    >>> numpy.allclose(numpy.dot(a,x)+x, b)
+    >>> numpy.allclose(aop(x)+x, b)
     True
     '''
     if isinstance(aop, numpy.ndarray) and aop.ndim == 2:
diff --git a/pyscf/lib/test/test_linalg_helper.py b/pyscf/lib/test/test_linalg_helper.py
index 6cc7e463c4..6c8c57f56f 100644
--- a/pyscf/lib/test/test_linalg_helper.py
+++ b/pyscf/lib/test/test_linalg_helper.py
@@ -116,6 +116,24 @@ def precond(x, *args):
         x1 = linalg_helper.krylov(aop, b/a_diag, x1, max_cycle=30)
         self.assertAlmostEqual(abs(xref - x1).max(), 0, 6)
 
+    def test_krylov_with_level_shift(self):
+        numpy.random.seed(10)
+        n = 100
+        a = numpy.random.rand(n,n) * .1
+        a = a.dot(a.T)
+        a_diag = numpy.random.rand(n)
+        b = numpy.random.rand(n)
+        ref = numpy.linalg.solve(numpy.diag(a_diag) + a, b)
+
+        #((diag+shift) + (a-shift)) x = b
+        shift = .1
+        a_diag += shift
+        a -= numpy.eye(n)*shift
+
+        aop = lambda x: (a.dot(x.T).T/a_diag)
+        c = linalg_helper.krylov(aop, b/a_diag, max_cycle=18)
+        self.assertAlmostEqual(abs(ref - c).max(), 0, 9)
+
     def test_dgeev(self):
         numpy.random.seed(12)
         n = 100
diff --git a/pyscf/scf/cphf.py b/pyscf/scf/cphf.py
index 19e2d2d10b..73cbdc2010 100644
--- a/pyscf/scf/cphf.py
+++ b/pyscf/scf/cphf.py
@@ -27,7 +27,8 @@
 
 
 def solve(fvind, mo_energy, mo_occ, h1, s1=None,
-          max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
+          max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN,
+          level_shift=0):
     '''
     Args:
         fvind : function
@@ -36,29 +37,43 @@ def solve(fvind, mo_energy, mo_occ, h1, s1=None,
     Kwargs:
         hermi : boolean
             Whether the matrix defined by fvind is Hermitian or not.
+        level_shift : float
+            Add to diagonal terms to slightly improve the convergence speed of
+            Krylov solver
     '''
     if s1 is None:
         return solve_nos1(fvind, mo_energy, mo_occ, h1,
-                          max_cycle, tol, hermi, verbose)
+                          max_cycle, tol, hermi, verbose, level_shift)
     else:
         return solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
-                            max_cycle, tol, hermi, verbose)
+                            max_cycle, tol, hermi, verbose, level_shift)
 kernel = solve
 
 # h1 shape is (:,nvir,nocc)
 def solve_nos1(fvind, mo_energy, mo_occ, h1,
-               max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
-    '''For field independent basis. First order overlap matrix is zero'''
+               max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN,
+               level_shift=0):
+    '''For field independent basis. First order overlap matrix is zero
+
+    Kwargs:
+        level_shift : float
+            Add to diagonal terms to slightly improve the convergence speed of
+            Krylov solver
+    '''
+    assert not hermi
     log = logger.new_logger(verbose=verbose)
     t0 = (logger.process_clock(), logger.perf_counter())
 
     e_a = mo_energy[mo_occ==0]
     e_i = mo_energy[mo_occ>0]
-    e_ai = 1 / lib.direct_sum('a-i->ai', e_a, e_i)
+    e_ai = 1 / (e_a[:,None] + level_shift - e_i)
     mo1base = h1 * -e_ai
 
     def vind_vo(mo1):
-        v = fvind(mo1.reshape(h1.shape)).reshape(h1.shape)
+        mo1 = mo1.reshape(h1.shape)
+        v = fvind(mo1).reshape(h1.shape)
+        if level_shift != 0:
+            v -= mo1 * level_shift
         v *= e_ai
         return v.ravel()
     mo1 = lib.krylov(vind_vo, mo1base.ravel(),
@@ -68,20 +83,23 @@ def vind_vo(mo1):
 
 # h1 shape is (:,nocc+nvir,nocc)
 def solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
-                 max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
+                 max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN,
+                 level_shift=0):
     '''For field dependent basis. First order overlap matrix is non-zero.
     The first order orbitals are set to
     C^1_{ij} = -1/2 S1
     e1 = h1 - s1*e0 + (e0_j-e0_i)*c1 + vhf[c1]
 
     Kwargs:
-        hermi : boolean
-            Whether the matrix defined by fvind is Hermitian or not.
+        level_shift : float
+            Add to diagonal terms to slightly improve the convergence speed of
+            Krylov solver
 
     Returns:
         First order orbital coefficients (in MO basis) and first order orbital
         energy matrix
     '''
+    assert not hermi
     log = logger.new_logger(verbose=verbose)
     t0 = (logger.process_clock(), logger.perf_counter())
 
@@ -89,34 +107,38 @@ def solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
     viridx = mo_occ == 0
     e_a = mo_energy[viridx]
     e_i = mo_energy[occidx]
-    e_ai = 1 / lib.direct_sum('a-i->ai', e_a, e_i)
+    e_ai = 1 / (e_a[:,None] + level_shift - e_i)
     nvir, nocc = e_ai.shape
     nmo = nocc + nvir
 
     s1 = s1.reshape(-1,nmo,nocc)
     hs = mo1base = h1.reshape(-1,nmo,nocc) - s1*e_i
-    mo_e1 = hs[:,occidx,:].copy()
 
+    mo1base = hs.copy()
     mo1base[:,viridx] *= -e_ai
     mo1base[:,occidx] = -s1[:,occidx] * .5
 
     def vind_vo(mo1):
-        v = fvind(mo1.reshape(h1.shape)).reshape(-1,nmo,nocc)
+        mo1 = mo1.reshape(mo1base.shape)
+        v = fvind(mo1).reshape(mo1base.shape)
+        if level_shift != 0:
+            v -= mo1 * level_shift
         v[:,viridx,:] *= e_ai
         v[:,occidx,:] = 0
         return v.ravel()
     mo1 = lib.krylov(vind_vo, mo1base.ravel(),
                      tol=tol, max_cycle=max_cycle, hermi=hermi, verbose=log)
     mo1 = mo1.reshape(mo1base.shape)
+    mo1[:,occidx] = mo1base[:,occidx]
     log.timer('krylov solver in CPHF', *t0)
 
-    v1mo = fvind(mo1.reshape(h1.shape)).reshape(-1,nmo,nocc)
-    mo1[:,viridx] = mo1base[:,viridx] - v1mo[:,viridx]*e_ai
+    hs += fvind(mo1).reshape(mo1base.shape)
+    mo1[:,viridx] = hs[:,viridx] / (e_i - e_a[:,None])
 
     # mo_e1 has the same symmetry as the first order Fock matrix (hermitian or
     # anti-hermitian). mo_e1 = v1mo - s1*lib.direct_sum('i+j->ij',e_i,e_i)
-    mo_e1 += mo1[:,occidx] * lib.direct_sum('i-j->ij', e_i, e_i)
-    mo_e1 += v1mo[:,occidx,:]
+    mo_e1 = hs[:,occidx,:]
+    mo_e1 += mo1[:,occidx] * (e_i[:,None] - e_i)
 
     if h1.ndim == 3:
         return mo1, mo_e1
diff --git a/pyscf/scf/ucphf.py b/pyscf/scf/ucphf.py
index cf4a7e641a..f23442452a 100644
--- a/pyscf/scf/ucphf.py
+++ b/pyscf/scf/ucphf.py
@@ -27,7 +27,8 @@
 
 
 def solve(fvind, mo_energy, mo_occ, h1, s1=None,
-          max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
+          max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN,
+          level_shift=0):
     '''
     Args:
         fvind : function
@@ -35,16 +36,18 @@ def solve(fvind, mo_energy, mo_occ, h1, s1=None,
     '''
     if s1 is None:
         return solve_nos1(fvind, mo_energy, mo_occ, h1,
-                          max_cycle, tol, hermi, verbose)
+                          max_cycle, tol, hermi, verbose, level_shift)
     else:
         return solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
-                            max_cycle, tol, hermi, verbose)
+                            max_cycle, tol, hermi, verbose, level_shift)
 kernel = solve
 
 # h1 shape is (:,nvir,nocc)
 def solve_nos1(fvind, mo_energy, mo_occ, h1,
-               max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
+               max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN,
+               level_shift=0):
     '''For field independent basis. First order overlap matrix is zero'''
+    assert not hermi
     log = logger.new_logger(verbose=verbose)
     t0 = (logger.process_clock(), logger.perf_counter())
 
@@ -56,15 +59,20 @@ def solve_nos1(fvind, mo_energy, mo_occ, h1,
     noccb = numpy.count_nonzero(occidxb)
     nvira = mo_occ[0].size - nocca
     nvirb = mo_occ[1].size - noccb
-    e_ai = numpy.hstack(((mo_energy[0][viridxa,None]-mo_energy[0][occidxa]).ravel(),
-                         (mo_energy[1][viridxb,None]-mo_energy[1][occidxb]).ravel()))
+    mo_ea, mo_eb = mo_energy
+    e_ai = numpy.hstack(
+        ((mo_ea[viridxa,None]+level_shift - mo_ea[occidxa]).ravel(),
+         (mo_eb[viridxb,None]+level_shift - mo_eb[occidxb]).ravel()))
     e_ai = 1 / e_ai
     mo1base = numpy.hstack((h1[0].reshape(-1,nvira*nocca),
                             h1[1].reshape(-1,nvirb*noccb)))
     mo1base *= -e_ai
 
     def vind_vo(mo1):
-        v = fvind(mo1.reshape(mo1base.shape)).reshape(mo1base.shape)
+        mo1 = mo1.reshape(mo1base.shape)
+        v = fvind(mo1).reshape(mo1base.shape)
+        if level_shift != 0:
+            v -= mo1 * level_shift
         v *= e_ai
         return v.ravel()
     mo1 = lib.krylov(vind_vo, mo1base.ravel(),
@@ -83,12 +91,14 @@ def vind_vo(mo1):
 
 # h1 shape is (:,nvir+nocc,nocc)
 def solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
-                 max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN):
+                 max_cycle=20, tol=1e-9, hermi=False, verbose=logger.WARN,
+                 level_shift=0):
     '''For field dependent basis. First order overlap matrix is non-zero.
     The first order orbitals are set to
     C^1_{ij} = -1/2 S1
     e1 = h1 - s1*e0 + (e0_j-e0_i)*c1 + vhf[c1]
     '''
+    assert not hermi
     log = logger.new_logger(verbose=verbose)
     t0 = (logger.process_clock(), logger.perf_counter())
 
@@ -99,27 +109,31 @@ def solve_withs1(fvind, mo_energy, mo_occ, h1, s1,
     nocca = numpy.count_nonzero(occidxa)
     noccb = numpy.count_nonzero(occidxb)
     nmoa, nmob = mo_occ[0].size, mo_occ[1].size
-    eai_a = mo_energy[0][viridxa,None] - mo_energy[0][occidxa]
-    eai_b = mo_energy[1][viridxb,None] - mo_energy[1][occidxb]
+    ei_a = mo_energy[0][occidxa]
+    ei_b = mo_energy[1][occidxb]
+    ea_a = mo_energy[0][viridxa]
+    ea_b = mo_energy[1][viridxb]
+    eai_a = 1. / (ea_a[:,None] + level_shift - ei_a)
+    eai_b = 1. / (ea_b[:,None] + level_shift - ei_b)
     s1_a = s1[0].reshape(-1,nmoa,nocca)
     nset = s1_a.shape[0]
     s1_b = s1[1].reshape(nset,nmob,noccb)
-    hs_a = mo1base_a = h1[0].reshape(nset,nmoa,nocca) - s1_a * mo_energy[0][occidxa]
-    hs_b = mo1base_b = h1[1].reshape(nset,nmob,noccb) - s1_b * mo_energy[1][occidxb]
-    mo_e1_a = hs_a[:,occidxa].copy()
-    mo_e1_b = hs_b[:,occidxb].copy()
+    hs_a = h1[0].reshape(nset,nmoa,nocca) - s1_a * ei_a
+    hs_b = h1[1].reshape(nset,nmob,noccb) - s1_b * ei_b
 
-    mo1base_a[:,viridxa]/= -eai_a
-    mo1base_b[:,viridxb]/= -eai_b
+    mo1base_a = hs_a.copy()
+    mo1base_b = hs_b.copy()
+    mo1base_a[:,viridxa] *= -eai_a
+    mo1base_b[:,viridxb] *= -eai_b
     mo1base_a[:,occidxa] = -s1_a[:,occidxa] * .5
     mo1base_b[:,occidxb] = -s1_b[:,occidxb] * .5
-
-    eai_a = 1. / eai_a
-    eai_b = 1. / eai_b
     mo1base = numpy.hstack((mo1base_a.reshape(nset,-1), mo1base_b.reshape(nset,-1)))
 
     def vind_vo(mo1):
+        mo1 = mo1.reshape(mo1base.shape)
         v = fvind(mo1).reshape(mo1base.shape)
+        if level_shift != 0:
+            v -= mo1 * level_shift
         v1a = v[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
         v1b = v[:,nmoa*nocca:].reshape(nset,nmob,noccb)
         v1a[:,viridxa] *= eai_a
@@ -129,21 +143,23 @@ def vind_vo(mo1):
         return v.ravel()
     mo1 = lib.krylov(vind_vo, mo1base.ravel(),
                      tol=tol, max_cycle=max_cycle, hermi=hermi, verbose=log)
-    log.timer('krylov solver in CPHF', *t0)
-
-    v1mo = fvind(mo1).reshape(mo1base.shape)
-    v1a = v1mo[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
-    v1b = v1mo[:,nmoa*nocca:].reshape(nset,nmob,noccb)
     mo1 = mo1.reshape(mo1base.shape)
     mo1_a = mo1[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
     mo1_b = mo1[:,nmoa*nocca:].reshape(nset,nmob,noccb)
-    mo1_a[:,viridxa] = mo1base_a[:,viridxa] - v1a[:,viridxa] * eai_a
-    mo1_b[:,viridxb] = mo1base_b[:,viridxb] - v1b[:,viridxb] * eai_b
+    mo1_a[:,occidxa] = mo1base_a[:,occidxa]
+    mo1_b[:,occidxb] = mo1base_b[:,occidxb]
+    log.timer('krylov solver in CPHF', *t0)
 
-    mo_e1_a += mo1_a[:,occidxa] * (mo_energy[0][occidxa,None] - mo_energy[0][occidxa])
-    mo_e1_b += mo1_b[:,occidxb] * (mo_energy[1][occidxb,None] - mo_energy[1][occidxb])
-    mo_e1_a += v1mo[:,:nmoa*nocca].reshape(nset,nmoa,nocca)[:,occidxa]
-    mo_e1_b += v1mo[:,nmoa*nocca:].reshape(nset,nmob,noccb)[:,occidxb]
+    v1mo = fvind(mo1).reshape(mo1base.shape)
+    hs_a += v1mo[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
+    hs_b += v1mo[:,nmoa*nocca:].reshape(nset,nmob,noccb)
+    mo1_a[:,viridxa] = hs_a[:,viridxa] / (ei_a - ea_a[:,None])
+    mo1_b[:,viridxb] = hs_b[:,viridxb] / (ei_b - ea_b[:,None])
+
+    mo_e1_a = hs_a[:,occidxa]
+    mo_e1_b = hs_b[:,occidxb]
+    mo_e1_a += mo1_a[:,occidxa] * (ei_a[:,None] - ei_a)
+    mo_e1_b += mo1_b[:,occidxb] * (ei_b[:,None] - ei_b)
 
     if isinstance(h1[0], numpy.ndarray) and h1[0].ndim == 2:
         mo1_a, mo1_b = mo1_a[0], mo1_b[0]

From 63ce7fd786dc329df32896255cf415dbd7841723 Mon Sep 17 00:00:00 2001
From: Xing Zhang <fishjojo@users.noreply.github.com>
Date: Sun, 4 Feb 2024 19:31:46 -0800
Subject: [PATCH 08/44] fix cp2k basis parse (issue #2043) (#2045)

* fix issue #2043

* add test for parse_cp2k

* allow symbol choosing for GTH PP parser
---
 pyscf/gto/basis/parse_cp2k.py       | 21 ++++++----
 pyscf/gto/basis/parse_cp2k_pp.py    | 23 +++++++---
 pyscf/gto/test/test_basis_parser.py | 65 +++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/pyscf/gto/basis/parse_cp2k.py b/pyscf/gto/basis/parse_cp2k.py
index bc2b534fe3..bf624140ca 100644
--- a/pyscf/gto/basis/parse_cp2k.py
+++ b/pyscf/gto/basis/parse_cp2k.py
@@ -23,13 +23,14 @@
 import re
 from pyscf.lib.exceptions import BasisNotFoundError
 from pyscf.gto.basis import parse_nwchem
+from pyscf.gto.basis.parse_nwchem import _search_basis_block
 from pyscf import __config__
 
 DISABLE_EVAL = getattr(__config__, 'DISABLE_EVAL', False)
 
 MAXL = 8
 
-def parse(string, optimize=False):
+def parse(string, symb=None, optimize=False):
     '''Parse the basis text which is in CP2K format, return an internal
     basis format which can be assigned to :attr:`Mole.basis`
     Lines started with # are ignored.
@@ -50,6 +51,12 @@ def parse(string, optimize=False):
     ... #
     ... """)}
     '''
+    if symb is not None:
+        raw_data = list(filter(None, re.split(BASIS_SET_DELIMITER, string)))
+        string = _search_basis_block(raw_data, symb)
+        if not string:
+            raise BasisNotFoundError(f'Basis not found for {symb}')
+
     bastxt = []
     for dat in string.splitlines():
         x = dat.split('#')[0].strip()
@@ -115,10 +122,8 @@ def _parse(blines, optimize=False):
 def search_seg(basisfile, symb):
     with open(basisfile, 'r') as fin:
         fdata = re.split(BASIS_SET_DELIMITER, fin.read())
-    for dat in fdata[1:]:
-        dat0 = dat.split(None, 1)
-        if dat0 and dat0[0] == symb:
-            # remove blank lines
-            return [x.strip() for x in dat.splitlines()
-                    if x.strip() and 'END' not in x]
-    raise BasisNotFoundError(f'Basis for {symb} not found in {basisfile}')
+    raw_basis = _search_basis_block(fdata[1:], symb)
+    if not raw_basis:
+        raise BasisNotFoundError(f'Basis for {symb} not found in {basisfile}')
+    return [x.strip() for x in raw_basis.splitlines()
+            if x.strip() and 'END' not in x]
diff --git a/pyscf/gto/basis/parse_cp2k_pp.py b/pyscf/gto/basis/parse_cp2k_pp.py
index c385ca3b9c..ff64c1a0bc 100644
--- a/pyscf/gto/basis/parse_cp2k_pp.py
+++ b/pyscf/gto/basis/parse_cp2k_pp.py
@@ -21,10 +21,11 @@
 '''
 
 import sys
+import re
 from pyscf.lib.exceptions import BasisNotFoundError
 import numpy as np
 
-def parse(string):
+def parse(string, symb=None):
     '''Parse the pseudo text *string* which is in CP2K format, return an internal
     basis format which can be assigned to :attr:`Cell.pseudo`
     Lines started with # are ignored.
@@ -45,8 +46,14 @@ def parse(string):
     ...      0.28637912    0
     ... """)}
     '''
-    pseudotxt = [x.strip() for x in string.splitlines()
-                 if x.strip() and 'END' not in x and '#PSEUDOPOTENTIAL' not in x]
+    if symb is not None:
+        raw_data = list(filter(None, re.split('#PSEUDOPOTENTIAL', string)))
+        pseudotxt = _search_gthpp_block(raw_data, symb)
+        if not pseudotxt:
+            raise BasisNotFoundError(f'Pseudopotential not found for {symb}.')
+    else:
+        pseudotxt = [x.strip() for x in string.splitlines()
+                     if x.strip() and 'END' not in x and '#PSEUDOPOTENTIAL' not in x]
     return _parse(pseudotxt)
 
 def load(pseudofile, symb, suffix=None):
@@ -95,7 +102,13 @@ def search_seg(pseudofile, symb, suffix=None):
     fin = open(pseudofile, 'r')
     fdata = fin.read().split('#PSEUDOPOTENTIAL')
     fin.close()
-    for dat in fdata[1:]:
+    dat = _search_gthpp_block(fdata[1:], symb, suffix)
+    if not dat:
+        raise BasisNotFoundError(f'Pseudopotential for {symb} in {pseudofile}')
+    return dat
+
+def _search_gthpp_block(raw_data, symb, suffix=None):
+    for dat in raw_data:
         dat0 = dat.split(None, 1)
         if dat0 and dat0[0] == symb:
             dat = [x.strip() for x in dat.splitlines()
@@ -107,7 +120,7 @@ def search_seg(pseudofile, symb, suffix=None):
             else:
                 if any(suffix == x.split('-')[-1] for x in dat[0].split()):
                     return dat
-    raise BasisNotFoundError(f'Pseudopotential for {symb} in {pseudofile}')
+    return None
 
 if __name__ == '__main__':
     args = sys.argv[1:]
diff --git a/pyscf/gto/test/test_basis_parser.py b/pyscf/gto/test/test_basis_parser.py
index d886e252e5..db8d7873a4 100644
--- a/pyscf/gto/test/test_basis_parser.py
+++ b/pyscf/gto/test/test_basis_parser.py
@@ -21,6 +21,7 @@
 from pyscf import lib
 from pyscf.gto.basis import parse_molpro
 from pyscf.gto.basis import parse_gaussian
+from pyscf.gto.basis import parse_cp2k, parse_cp2k_pp
 from pyscf.lib.exceptions import BasisNotFoundError
 
 class KnownValues(unittest.TestCase):
@@ -462,6 +463,70 @@ def test_parse_molpro_ecp_soc(self):
                 [3, [[], [], [[2.928812, -11.777154, 7.851436], [2.904069, -15.525522, -7.762761], [0.287352, -0.14855, 0.099033], [0.48938, -0.273682, -0.136841]], [], [], [], []]]]]
         self.assertEqual(ecp_data, ref)
 
+    def test_parse_gth_basis(self):
+        basis_str = '''
+                        #BASIS SET
+                        C DZV-GTH
+                          1
+                          2  0  1  4  2  2
+                                4.3362376436   0.1490797872   0.0000000000  -0.0878123619   0.0000000000
+                                1.2881838513  -0.0292640031   0.0000000000  -0.2775560300   0.0000000000
+                                0.4037767149  -0.6882040510   0.0000000000  -0.4712295093   0.0000000000
+                                0.1187877657  -0.3964426906   1.0000000000  -0.4058039291   1.0000000000
+                        #
+                        #BASIS SET
+                        N DZV-GTH
+                          1
+                          2  0  1  4  2  2
+                                6.1526903413   0.1506300537   0.0000000000  -0.0950603476   0.0000000000
+                                1.8236332280  -0.0360100734   0.0000000000  -0.2918864295   0.0000000000
+                                0.5676628870  -0.6942023212   0.0000000000  -0.4739050050   0.0000000000
+                                0.1628222852  -0.3878929987   1.0000000000  -0.3893418670   1.0000000000
+                        #
+                    '''
+        basis1 = parse_cp2k.parse(basis_str, 'C')
+        ref = gto.basis.load('gth-dzv', 'C')
+        self.assertEqual(ref, basis1)
+        basis1 = parse_cp2k.parse(basis_str, 'N')
+        ref = gto.basis.load('gth-dzv', 'N')
+        self.assertEqual(ref, basis1)
+
+        basis_str = '''
+                        C DZV-GTH
+                          1
+                          2  0  1  4  2  2
+                                4.3362376436   0.1490797872   0.0000000000  -0.0878123619   0.0000000000
+                                1.2881838513  -0.0292640031   0.0000000000  -0.2775560300   0.0000000000
+                                0.4037767149  -0.6882040510   0.0000000000  -0.4712295093   0.0000000000
+                                0.1187877657  -0.3964426906   1.0000000000  -0.4058039291   1.0000000000
+                        #
+                    '''
+        basis1 = parse_cp2k.parse(basis_str)
+        ref = gto.basis.load('gth-dzv', 'C')
+        self.assertEqual(ref, basis1)
+
+    def test_parse_gth_pp(self):
+        pp_str = '''
+            #PSEUDOPOTENTIAL
+            B GTH-PADE-q3 GTH-LDA-q3 GTH-PADE GTH-LDA
+                2    1
+                 0.43392956    2    -5.57864173     0.80425145
+                2
+                 0.37384326    1     6.23392822
+                 0.36039317    0
+            #PSEUDOPOTENTIAL
+            C GTH-PADE-q4 GTH-LDA-q4 GTH-PADE GTH-LDA
+                2    2
+                 0.34883045    2    -8.51377110     1.22843203
+                2
+                 0.30455321    1     9.52284179
+                 0.23267730    0'''
+        pp1 = parse_cp2k_pp.parse(pp_str, 'B')
+        ref = gto.basis.load_pseudo('gth-pade', 'B')
+        self.assertEqual(ref, pp1)
+        pp1 = parse_cp2k_pp.parse(pp_str, 'C')
+        ref = gto.basis.load_pseudo('gth-pade', 'C')
+        self.assertEqual(ref, pp1)
 
 if __name__ == "__main__":
     print("test basis module")

From 56adbedfb98b923dbecfad7be0c62a43d01f90ae Mon Sep 17 00:00:00 2001
From: matthew-hennefarth <matthew.hennefarth@gmail.com>
Date: Mon, 5 Feb 2024 13:00:16 -0600
Subject: [PATCH 09/44] SA-CASSCF Nonadiabatic Coupling Vectors (#2046)

* add nacs stuff but need to rebuild

* fix io warning test

* add sacasscf nac files from mrh

* update and add nac_method

* fix for flake8

* add example file

* update to assert len and uniqueness of states

* udpate

* udpate

* udpate

* fix assert statements and handling of state[0]==state[1]
---
 examples/nac/01-sacasscf_nac.py |  90 ++++++++++
 pyscf/mcscf/addons.py           |   8 +
 pyscf/mcscf/mc1step.py          |   4 +
 pyscf/nac/__init__.py           |  32 ++++
 pyscf/nac/sacasscf.py           | 293 ++++++++++++++++++++++++++++++++
 pyscf/nac/test/test_sacasscf.py | 175 +++++++++++++++++++
 6 files changed, 602 insertions(+)
 create mode 100644 examples/nac/01-sacasscf_nac.py
 create mode 100644 pyscf/nac/__init__.py
 create mode 100644 pyscf/nac/sacasscf.py
 create mode 100644 pyscf/nac/test/test_sacasscf.py

diff --git a/examples/nac/01-sacasscf_nac.py b/examples/nac/01-sacasscf_nac.py
new file mode 100644
index 0000000000..5384498ad1
--- /dev/null
+++ b/examples/nac/01-sacasscf_nac.py
@@ -0,0 +1,90 @@
+from pyscf import gto, scf, mcscf, lib
+
+# NAC signs are really, really hard to nail down.
+# There are arbitrary signs associated with
+# 1. The MO coefficients
+# 2. The CI vectors
+# 3. Almost any kind of post-processing (natural-orbital analysis, etc.)
+# 4. Developer convention on whether the bra index or ket index is 1st
+# It MIGHT help comparison to OpenMolcas if you load a rasscf.h5 file
+# I TRIED to choose the same convention for #4 as OpenMolcas.
+mol = gto.M (atom='Li 0 0 0;H 1.5 0 0', basis='sto-3g',
+             output='LiH_sa2casscf22_sto3g.log', verbose=lib.logger.INFO)
+
+mf = scf.RHF (mol).run ()
+mc = mcscf.CASSCF (mf, 2, 2)
+mc.fix_spin_(ss=0, shift=1)
+mc = mc.state_average ([0.5,0.5]).run (conv_tol=1e-10)
+
+mc_nacs = mc.nac_method()
+
+# 1. <1|d0/dR>
+#    Equivalent OpenMolcas input:
+#    ```
+#    &ALASKA
+#    NAC=1 2
+#    ```
+nac = mc_nacs.kernel (state=(0,1))
+print ("\nNAC <1|d0/dR>:\n", nac)
+print ("Notice that according to the NACs printed above, rigidly moving the")
+print ("molecule along the bond axis changes the electronic wave function, which")
+print ("is obviously unphysical. This broken translational symmetry is due to the")
+print ("'CSF contribution'. Omitting the CSF contribution corresponds to using the")
+print ("'electron-translation factors' of Fatehi and Subotnik and is requested by")
+print ("passing 'use_etfs=True'.")
+
+# 2. <1|d0/dR> w/ ETFs (i.e., w/out CSF contribution)
+#    Equivalent OpenMolcas input:
+#    ```
+#    &ALASKA
+#    NAC=1 2
+#    NOCSF
+#    ```
+nac = mc_nacs.kernel (state=(0,1), use_etfs=True)
+print ("\nNAC <1|d0/dR> w/ ETFs:\n", nac)
+print ("These NACs are much more well-behaved: moving the molecule rigidly around")
+print ("in space doesn't induce any change to the electronic wave function.")
+
+# 3. <0|d1/dR>
+#    Equivalent OpenMolcas input:
+#    ```
+#    &ALASKA
+#    NAC=2 1
+#    ```
+nac = mc_nacs.kernel (state=(1,0))
+print ("\nThe NACs are antisymmetric with respect to state transposition.")
+print ("NAC <0|d1/dR>:\n", nac)
+
+# 4. <0|d1/dR> w/ ETFs
+#    Equivalent OpenMolcas input:
+#    ```
+#    &ALASKA
+#    NAC=2 1
+#    NOCSF
+#    ```
+nac = mc_nacs.kernel (state=(1,0), use_etfs=True)
+print ("NAC <0|d1/dR> w/ ETFs:\n", nac)
+
+# 5. <1|d0/dR>*(E1-E0) = <0|d1/dR>*(E0-E1)
+#    I'm not aware of any OpenMolcas equivalent for this, but all the information
+#    should obviously be in the output file, as long as you aren't right at a CI.
+nac_01 = mc_nacs.kernel (state=(0,1), mult_ediff=True)
+nac_10 = mc_nacs.kernel (state=(1,0), mult_ediff=True)
+print ("\nNACs diverge at conical intersections (CI). The important question")
+print ("is how quickly it diverges. You can get at this by calculating NACs")
+print ("multiplied by the energy difference using the keyword 'mult_ediff=True'.")
+print ("This yields a quantity which is symmetric wrt state interchange and is")
+print ("finite at a CI.")
+print ("NAC <1|d0/dR>*(E1-E0):\n", nac_01)
+print ("NAC <0|d1/dR>*(E0-E1):\n", nac_10)
+
+# 6. <1|d0/dR>*(E1-E0) w/ETFs = <0|d1/dR>*(E0-E1) w/ETFs = <0|dH/dR|1>
+#    This is the quantity one uses to optimize MECIs
+v01 = mc_nacs.kernel (state=(0,1), use_etfs=True, mult_ediff=True)
+v10 = mc_nacs.kernel (state=(1,0), use_etfs=True, mult_ediff=True)
+print ("\nUsing both 'use_etfs=True' and 'mult_ediff=True' corresponds to the")
+print ("derivative of the off-diagonal element of the potential matrix. This")
+print ("tells you one of the two components of the branching plane at the CI.")
+print ("<1|d0/dR>*(E1-E0) w/ ETFs = <1|dH/dR|0>:\n", v01)
+print ("<0|d1/dR>*(E0-E1) w/ ETFs = <0|dH/dR|1>:\n", v10)
+
diff --git a/pyscf/mcscf/addons.py b/pyscf/mcscf/addons.py
index 0e7cca66fe..57b1db9e5f 100644
--- a/pyscf/mcscf/addons.py
+++ b/pyscf/mcscf/addons.py
@@ -1104,6 +1104,14 @@ def nuc_grad_method (self, state=None):
 
     Gradients = nuc_grad_method
 
+    def nac_method(self):
+        if callable(getattr(self, '_state_average_nac_method', None)):
+            return self._state_average_nac_method()
+        else:
+            raise NotImplementedError("NAC method")
+
+    NACs = nac_method
+
 def state_average_(casscf, weights=(0.5,0.5), wfnsym=None):
     ''' Inplace version of state_average '''
     sacasscf = state_average (casscf, weights, wfnsym)
diff --git a/pyscf/mcscf/mc1step.py b/pyscf/mcscf/mc1step.py
index 35249ce9a5..020d6e6d88 100644
--- a/pyscf/mcscf/mc1step.py
+++ b/pyscf/mcscf/mc1step.py
@@ -1275,6 +1275,10 @@ def _state_average_nuc_grad_method (self, state=None):
         from pyscf.grad import sacasscf as sacasscf_grad
         return sacasscf_grad.Gradients (self, state=state)
 
+    def _state_average_nac_method(self):
+        from pyscf.nac import sacasscf as sacasscf_nac
+        return sacasscf_nac.NonAdiabaticCouplings(self)
+
     def newton(self):
         from pyscf.mcscf import newton_casscf
         from pyscf.mcscf.addons import StateAverageMCSCFSolver
diff --git a/pyscf/nac/__init__.py b/pyscf/nac/__init__.py
new file mode 100644
index 0000000000..897245c989
--- /dev/null
+++ b/pyscf/nac/__init__.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Analytical Nonadiabatic Coupling Vectors
+============================
+
+Simple usage::
+
+    >>> from pyscf import gto, scf, mcscf, nac
+    >>> mol = gto.M(atom='N 0 0 0; N 0 0 1', basis='ccpvdz')
+    >>> mf = scf.RHF(mol).run()
+    >>> mc = mcscf.CASSCF(mf, 2, 2).state_average([0.5, 0.5]).run()
+    >>> mc_nac = nac.sacasscf.NonAdiabaticCouplings(mc)
+    >>> mc_nac = mc.nac_method() # Also valid
+    >>> mc_nac.kernel(state=(0,1), use_etfs=False)
+"""
+
+from . import sacasscf
diff --git a/pyscf/nac/sacasscf.py b/pyscf/nac/sacasscf.py
new file mode 100644
index 0000000000..fe84e034ff
--- /dev/null
+++ b/pyscf/nac/sacasscf.py
@@ -0,0 +1,293 @@
+import numpy as np
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.fci import direct_spin1
+from pyscf.mcscf import newton_casscf
+from pyscf.grad import casscf as casscf_grad
+from pyscf.grad import sacasscf as sacasscf_grad
+from functools import reduce
+
+# The extension from gradients -> NACs has three basic steps:
+# 0. ("state" index integer -> tuple)
+# 1. fcisolver.make_rdm12 -> fcisolver.trans_rdm12
+# 2. remove core-orbital and nuclear contributions to everything
+# 3. option to include the "csf contribution"
+# Additional good ideas:
+# a. Option to multiply NACs by the energy difference to control
+#    singularities
+
+def _unpack_state(state):
+    assert len(state) == 2, "derivative couplings are defined between 2 states"
+    return state[0], state[1]
+
+
+def grad_elec_core(mc_grad, mo_coeff=None, atmlst=None, eris=None, mf_grad=None):
+    """Compute the core-electron part of the CASSCF (Hellmann-Feynman)
+    gradient using a modified RHF grad_elec call."""
+    mc = mc_grad.base
+    if mo_coeff is None: mo_coeff = mc.mo_coeff
+    if eris is None: eris = mc.ao2mo (mo_coeff)
+    if mf_grad is None: mf_grad = mc._scf.nuc_grad_method ()
+    ncore = mc.ncore
+    moH = mo_coeff.conj ().T
+    f0 = (moH @ mc.get_hcore () @ mo_coeff) + eris.vhf_c
+    mo_energy = f0.diagonal ().copy ()
+    mo_occ = np.zeros_like (mo_energy)
+    mo_occ[:ncore] = 2.0
+    f0 *= mo_occ[None,:]
+    dme0 = lambda * args: mo_coeff @ ((f0+f0.T)*.5) @ moH
+    with lib.temporary_env (mf_grad, make_rdm1e=dme0, verbose=0):
+        with lib.temporary_env (mf_grad.base, mo_coeff=mo_coeff, mo_occ=mo_occ):
+            # Second level there should become unnecessary in future, if anyone
+            # ever gets around to cleaning up pyscf.df.grad.rhf & pyscf.grad.rhf
+            de = mf_grad.grad_elec (mo_coeff=mo_coeff, mo_energy=mo_energy,
+                                    mo_occ=mo_occ, atmlst=atmlst)
+    return de
+
+def grad_elec_active (mc_grad, mo_coeff=None, ci=None, atmlst=None,
+                      eris=None, mf_grad=None, verbose=None):
+    '''Compute the active-electron part of the CASSCF (Hellmann-Feynman)
+    gradient by subtracting the core-electron part.'''
+    t0 = (logger.process_clock (), logger.perf_counter ())
+    mc = mc_grad.base
+    log = logger.new_logger (mc_grad, verbose)
+    if mf_grad is None: mf_grad=mc._scf.nuc_grad_method ()
+    de = mc_grad.grad_elec (mo_coeff=mo_coeff, ci=ci, atmlst=atmlst,
+                            verbose=0)
+    de -= grad_elec_core (mc_grad, mo_coeff=mo_coeff, atmlst=atmlst,
+                          eris=eris, mf_grad=mf_grad)
+    log.debug ('CASSCF active-orbital gradient:\n{}'.format (de))
+    log.timer ('CASSCF active-orbital gradient', *t0)
+    return de
+
+def gen_g_hop_active (mc, mo, ci0, eris, verbose=None):
+    '''Compute the active-electron part of the orbital rotation gradient
+    by patching out the appropriate block of eris.vhf_c'''
+    moH = mo.conj ().T
+    ncore = mc.ncore
+    vnocore = eris.vhf_c.copy ()
+    vnocore[:,:ncore] = -moH @ mc.get_hcore () @ mo[:,:ncore]
+    with lib.temporary_env (eris, vhf_c=vnocore):
+        return newton_casscf.gen_g_hop (mc, mo, ci0, eris, verbose=verbose)
+
+def _nac_csf (mol, mf_grad, tm1, atmlst):
+    if atmlst is None: atmlst = list (range (mol.natm))
+    aoslices = mol.aoslice_by_atom ()
+    s1 = mf_grad.get_ovlp (mol)
+    # if libcint documentation is to be trusted, mf_grad.get_ovlp
+    # corresponds to differentiating on the SECOND index: <p|dq/dR>
+    nac = np.zeros ((len(atmlst), 3))
+    for k, ia in enumerate (atmlst):
+        shl0, shl1, p0, p1 = aoslices[ia]
+        nac[k] += 0.5*np.einsum ('xij,ij->x', s1[:,p0:p1], tm1[p0:p1])
+    return nac
+
+def nac_csf (mc_grad, mo_coeff=None, ci=None, state=None, mf_grad=None,
+             atmlst=None):
+    '''Compute the "CSF contribution" to the SA-CASSCF NAC'''
+    mc = mc_grad.base
+    if mo_coeff is None: mo_coeff = mc.mo_coeff
+    if ci is None: ci = mc.ci
+    if state is None: state = mc_grad.state
+    if mf_grad is None: mf_grad = mc._scf.nuc_grad_method ()
+    if atmlst is None: atmlst = mc_grad.atmlst
+    mol = mc.mol
+    ket, bra = _unpack_state (state)
+    ncore, ncas, nelecas = mc.ncore, mc.ncas, mc.nelecas
+    castm1 = direct_spin1.trans_rdm1 (ci[bra], ci[ket], ncas, nelecas)
+    # if PySCF commentary is to be trusted, trans_rdm1[p,q] is
+    # <bra|q'p|ket>. I want <bra|p'q - q'p|ket>.
+    castm1 = castm1.conj ().T - castm1
+    mo_cas = mo_coeff[:,ncore:][:,:ncas]
+    tm1 = reduce (np.dot, (mo_cas, castm1, mo_cas.conj ().T))
+    return _nac_csf (mol, mf_grad, tm1, atmlst)
+
+class NonAdiabaticCouplings (sacasscf_grad.Gradients):
+    '''SA-CASSCF non-adiabatic couplings (NACs) between states
+
+    kwargs/attributes:
+
+    state : tuple of length 2
+        The NACs returned are <state[1]|d(state[0])/dR>.
+        In other words, state = (ket, bra).
+    mult_ediff : logical
+        If True, returns NACs multiplied by the energy difference.
+        Useful near conical intersections to avoid numerical problems.
+    use_etfs : logical
+        If True, use the ``electron translation factors'' of Fatehi and
+        Subotnik [JPCL 3, 2039 (2012)], which guarantee conservation of
+        total electron + nuclear momentum when the nuclei are moving
+        (i.e., in non-adiabatic molecular dynamics). This corresponds
+        to omitting the so-called ``CSF contribution'' [cf. JCTC 12,
+        3636 (2016)].
+    '''
+
+    def __init__(self, mc, state=None, mult_ediff=False, use_etfs=False):
+        self.mult_ediff = mult_ediff
+        self.use_etfs = use_etfs
+        if state is not None:
+            assert len(state) == 2, "derivative couplings are defined between 2 states"
+        sacasscf_grad.Gradients.__init__(self, mc, state=state)
+
+    def make_fcasscf_nacs (self, state=None, casscf_attr=None,
+                           fcisolver_attr=None):
+        if state is None: state = self.state
+        if casscf_attr is None: casscf_attr = {}
+        if fcisolver_attr is None: fcisolver_attr = {}
+        ket, bra = _unpack_state (state)
+        ci, ncas, nelecas = self.base.ci, self.base.ncas, self.base.nelecas
+        # TODO: use fcisolver.fcisolvers in state-average mix case for this
+        castm1, castm2 = direct_spin1.trans_rdm12 (ci[bra], ci[ket], ncas,
+                                                   nelecas)
+        castm1 = 0.5 * (castm1 + castm1.T)
+        castm2 = 0.5 * (castm2 + castm2.transpose (1,0,3,2))
+        fcisolver_attr['make_rdm12'] = lambda *args, **kwargs : (castm1, castm2)
+        fcisolver_attr['make_rdm1'] = lambda *args, **kwargs : castm1
+        fcisolver_attr['make_rdm2'] = lambda *args, **kwargs : castm2
+        return sacasscf_grad.Gradients.make_fcasscf (self,
+            state=ket, casscf_attr=casscf_attr, fcisolver_attr=fcisolver_attr)
+
+
+    def get_wfn_response (self, atmlst=None, state=None, verbose=None, mo=None, ci=None, **kwargs):
+        if state is None: state = self.state
+        if atmlst is None: atmlst = self.atmlst
+        if verbose is None: verbose = self.verbose
+        if mo is None: mo = self.base.mo_coeff
+        if ci is None: ci = self.base.ci
+        log = logger.new_logger (self, verbose)
+        ket, bra = _unpack_state (state)
+        fcasscf = self.make_fcasscf_nacs (state)
+        fcasscf.mo_coeff = mo
+        fcasscf.ci = ci[ket]
+        eris = fcasscf.ao2mo (mo)
+        g_all_ket = gen_g_hop_active (fcasscf, mo, ci[ket], eris, verbose)[0]
+        g_all = np.zeros (self.nlag)
+        g_all[:self.ngorb] = g_all_ket[:self.ngorb]
+        # The fun thing about the ci sector is that you swap them (&/2):
+        # <I|[H,|A><I|-|I><A|]|J> = <A|H|J> = <J|[H,|A><J|-|J><A|]|J>/2
+        # (It should be zero for converged SA-CASSCF anyway, though)
+        g_ci_bra = 0.5 * g_all_ket[self.ngorb:]
+        g_all_bra = gen_g_hop_active (fcasscf, mo, ci[bra], eris, verbose)[0]
+        g_ci_ket = 0.5 * g_all_bra[self.ngorb:]
+        # I have to make sure they don't talk to each other because the
+        # preconditioner doesn't explore that space at all. Should I
+        # instead solve at the init_guess step, like in MC-PDFT?
+        # In practice it should all be zeros but how tightly does
+        # everything have to be converged?
+        ndet_ket = (self.na_states[ket], self.nb_states[ket])
+        ndet_bra = (self.na_states[bra], self.nb_states[bra])
+        if ndet_ket==ndet_bra:
+            ket2bra = np.dot (ci[bra].conj ().ravel (), g_ci_ket)
+            bra2ket = np.dot (ci[ket].conj ().ravel (), g_ci_bra)
+            log.debug ('SA-CASSCF <bra|H|ket>,<ket|H|bra> check: %5.3g , %5.3g',
+                       ket2bra, bra2ket)
+            g_ci_ket -= ket2bra * ci[bra].ravel ()
+            g_ci_bra -= bra2ket * ci[ket].ravel ()
+        ndet_ket = ndet_ket[0]*ndet_ket[1]
+        ndet_bra = ndet_bra[0]*ndet_bra[1]
+        # No need to reshape or anything, just use the magic of repeated slicing
+        offs_ket = (sum ([na * nb for na, nb in zip(
+                         self.na_states[:ket], self.nb_states[:ket])])
+                    if ket > 0 else 0)
+        offs_bra = (sum ([na * nb for na, nb in zip(
+                         self.na_states[:bra], self.nb_states[:bra])])
+                    if ket > 0 else 0)
+        g_all[self.ngorb:][offs_ket:][:ndet_ket] = g_ci_ket
+        g_all[self.ngorb:][offs_bra:][:ndet_bra] = g_ci_bra
+        return g_all
+
+
+    def get_ham_response (self, state=None, atmlst=None, verbose=None, mo=None,
+                          ci=None, eris=None, mf_grad=None, **kwargs):
+        if state is None: state = self.state
+        if atmlst is None: atmlst = self.atmlst
+        if verbose is None: verbose = self.verbose
+        if mo is None: mo = self.base.mo_coeff
+        if ci is None: ci = self.base.ci
+        if mf_grad is None: mf_grad = self.base._scf.nuc_grad_method ()
+        if eris is None and self.eris is None:
+            eris = self.eris = self.base.ao2mo (mo)
+        elif eris is None:
+            eris = self.eris
+        use_etfs = kwargs.get ('use_etfs', self.use_etfs)
+        ket, bra = _unpack_state (state)
+        fcasscf_grad = casscf_grad.Gradients (self.make_fcasscf_nacs (state))
+        nac = grad_elec_active (fcasscf_grad, mo_coeff=mo, ci=ci[ket],
+                                eris=eris, atmlst=atmlst, verbose=verbose)
+        if not use_etfs: nac += self.nac_csf (
+            mo_coeff=mo, ci=ci, state=state, mf_grad=mf_grad, atmlst=atmlst)
+        return nac
+
+    def nac_csf (self, mo_coeff=None, ci=None, state=None, mf_grad=None, atmlst=None):
+        if state is None: state = self.state
+        if atmlst is None: atmlst = self.atmlst
+        if mo_coeff is None: mo_coeff = self.base.mo_coeff
+        if ci is None: ci = self.base.ci
+        if mf_grad is None: mf_grad = self.base._scf.nuc_grad_method ()
+        nac = nac_csf (self, mo_coeff=mo_coeff, ci=ci, state=state,
+                       mf_grad=mf_grad, atmlst=atmlst)
+        ket, bra = _unpack_state (state)
+        e_bra = self.base.e_states[bra]
+        e_ket = self.base.e_states[ket]
+        nac *= e_bra - e_ket
+        return nac
+
+    def kernel (self, *args, **kwargs):
+        mult_ediff = kwargs.get ('mult_ediff', self.mult_ediff)
+        state = kwargs.get ('state', self.state)
+        assert len(state) == 2, "derivative couplings are defined between 2 states"
+        if state[0] == state[1]:
+            mol = kwargs.get('mol', self.mol)
+            atmlst = kwargs.get('atmlst', range(mol.natm))
+            return np.zeros((len(atmlst), 3))
+
+        nac = sacasscf_grad.Gradients.kernel (self, *args, **kwargs)
+        if not mult_ediff:
+            ket, bra = _unpack_state (state)
+            e_bra = self.base.e_states[bra]
+            e_ket = self.base.e_states[ket]
+            nac /= e_bra - e_ket
+        return nac
+
+if __name__=='__main__':
+    from pyscf import gto, scf, mcscf
+    from scipy import linalg
+    mol = gto.M (atom = 'Li 0 0 0; H 0 0 1.5', basis='sto-3g',
+                 output='sacasscf_nacs.log', verbose=lib.logger.INFO)
+    mf = scf.RHF (mol).run ()
+    mc = mcscf.CASSCF (mf, 2, 2).fix_spin_(ss=0).state_average ([0.5,0.5]).run (conv_tol=1e-10)
+    openmolcas_energies = np.array ([-7.85629118, -7.72175252])
+    print ("energies:",mc.e_states)
+    print ("disagreement w openmolcas:", np.around (mc.e_states-openmolcas_energies, 8))
+    mc_nacs = NonAdiabaticCouplings (mc)
+    print ("no csf contr")
+    nac_01 = mc_nacs.kernel (state=(0,1), use_etfs=True)
+    nac_10 = mc_nacs.kernel (state=(1,0), use_etfs=True)
+    nac_01_mult = mc_nacs.kernel (state=(0,1), use_etfs=True, mult_ediff=True)
+    nac_10_mult = mc_nacs.kernel (state=(1,0), use_etfs=True, mult_ediff=True)
+    print ("antisym")
+    print (nac_01)
+    print ("checking antisym:",linalg.norm(nac_01+nac_10))
+    print ("sym")
+    print (nac_01_mult)
+    print ("checking sym:",linalg.norm(nac_01_mult-nac_10_mult))
+
+
+    print ("incl csf contr")
+    nac_01 = mc_nacs.kernel (state=(0,1), use_etfs=False)
+    nac_10 = mc_nacs.kernel (state=(1,0), use_etfs=False)
+    nac_01_mult = mc_nacs.kernel (state=(0,1), use_etfs=False, mult_ediff=True)
+    nac_10_mult = mc_nacs.kernel (state=(1,0), use_etfs=False, mult_ediff=True)
+    print ("antisym")
+    print (nac_01)
+    print ("checking antisym:",linalg.norm(nac_01+nac_10))
+    print ("sym")
+    print (nac_01_mult)
+    print ("checking sym:",linalg.norm(nac_01_mult-nac_10_mult))
+
+    print ("Check gradients")
+    mc_grad = mc.nuc_grad_method ()
+    de_0 = mc_grad.kernel (state=0)
+    print (de_0)
+    de_1 = mc_grad.kernel (state=1)
+    print (de_1)
diff --git a/pyscf/nac/test/test_sacasscf.py b/pyscf/nac/test/test_sacasscf.py
new file mode 100644
index 0000000000..5a33c966d0
--- /dev/null
+++ b/pyscf/nac/test/test_sacasscf.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# Copyright 2014-2022 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import numpy as np
+from pyscf import gto, scf, mcscf
+from pyscf.nac.sacasscf import NonAdiabaticCouplings
+import unittest
+
+
+def diatomic(atom1, atom2, r, basis, ncas, nelecas, nstates,
+             charge=None, spin=None, symmetry=False, cas_irrep=None):
+    global mols
+    xyz = '{:s} 0.0 0.0 0.0; {:s} {:.3f} 0.0 0.0'.format(atom1, atom2, r)
+    mol = gto.M(atom=xyz, basis=basis, charge=charge, spin=spin,
+                symmetry=symmetry, verbose=0, output='/dev/null')
+
+    mols.append(mol)
+
+    mf = scf.RHF(mol)
+
+    mc = mcscf.CASSCF(mf.run(), ncas, nelecas).set(natorb=True)
+
+    if spin is not None:
+        s = spin*0.5
+
+    else:
+        s = (mol.nelectron % 2)*0.5
+
+    mc.fix_spin_(ss=s*(s+1), shift=1)
+    mc = mc.state_average([1.0/float(nstates), ]*nstates)
+    mc.conv_tol = mc.conv_tol_diabatize = 1e-12
+    mo = None
+
+    if symmetry and (cas_irrep is not None):
+        mo = mc.sort_mo_by_irrep(cas_irrep)
+
+    mc.kernel(mo)
+
+    return mc.nac_method()
+
+def setUpModule():
+    global mols 
+    mols = []
+
+def tearDownModule():
+    global mols, diatomic
+    [m.stdout.close() for m in mols]
+    del mols, diatomic
+
+
+class KnownValues(unittest.TestCase):
+
+    def test_nac_h2_sa2casscf22_sto3g(self):
+        # z_orb:    no
+        # z_ci:     yes
+        # z_is:     no
+        mc_grad = diatomic('H', 'H', 1.3, 'STO-3G', 2, 2, 2)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[2.24611972496341E-01, 2.24611972496341E-01],
+                           [3.91518173397213E-18, -3.91518173397213E-18]])
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+
+    def test_nac_h2_sa3casscf22_sto3g(self):
+        # z_orb:    no
+        # z_ci:     no
+        # z_is:     no
+        mc_grad = diatomic('H', 'H', 1.3, 'STO-3G', 2, 2, 3)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[2.24611972496341E-01,2.24611972496341E-01 ],
+                           [3.91518173397213E-18, -3.91518173397213E-18]])
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+    def test_nac_h2_sa2caasf22_631g(self):
+        # z_orb:    yes
+        # z_ci:     yes
+        # z_is:     no
+        mc_grad = diatomic('H', 'H', 1.3, '6-31G', 2, 2, 2)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[2.63335709207419E-01,2.63335709207420E-01],
+                           [-4.13635186565710E-16,4.47060252146777E-16 ]])
+
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+
+    def test_nac_h2_sa3casscf22_631g(self):
+        # z_orb:    yes
+        # z_ci:     no
+        # z_is:     no
+        mc_grad = diatomic('H', 'H', 1.3, '6-31G', 2, 2, 3)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[-2.61263051047980E-01,-2.61263051047980E-01],
+                           [-5.77124316768522E-17,2.47338992900795E-17 ]])
+
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+    def test_nac_lih_sa2casscf22_sto3g(self):
+        # z_orb:    yes
+        # z_ci:     yes
+        # z_is:     yes
+        mc_grad = diatomic('Li', 'H', 1.5, 'STO-3G', 2, 2, 2)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[1.83701729060390E-01, -6.91462064586138E-02],
+                           [9.14842536971979E-02, -9.14842536971979E-02]])
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+    def test_nac_lih_cms3ftlda22_sto3g(self):
+        # z_orb:    yes
+        # z_ci:     no
+        # z_is:     yes
+        mc_grad = diatomic('Li', 'H', 2.5, 'STO-3G', 2, 2, 3)
+
+        # OpenMolcas v23.02 - PC
+        de_ref = np.array([[2.68015835251472E-01, -6.48474666167559E-02],
+                           [1.24870721811750E-01, -1.24870721811750E-01]])
+
+        for i in range(2):
+            with self.subTest(use_etfs=bool(i)):
+                de = mc_grad.kernel(state=(0, 1), use_etfs=bool(i))[:, 0]
+                de *= np.sign(de[0]) * np.sign(de_ref[i, 0])
+                # TODO: somehow confirm sign convention
+                self.assertAlmostEqual(de[0], de_ref[i, 0], 5)
+                self.assertAlmostEqual(de[1], de_ref[i, 1], 5)
+
+
+if __name__ == "__main__":
+    print("Full Tests for SA-CASSCF non-adiabatic couplings of diatomic molecules")
+    unittest.main()

From 83c62be47c622c1ee7bf2b2d63eab126c0090145 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Thu, 1 Feb 2024 11:26:46 -0800
Subject: [PATCH 10/44] Backward compatibility issue in fci.cistring

---
 pyscf/fci/cistring.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyscf/fci/cistring.py b/pyscf/fci/cistring.py
index 0ef463d693..4de0ed169a 100644
--- a/pyscf/fci/cistring.py
+++ b/pyscf/fci/cistring.py
@@ -107,6 +107,8 @@ def gen_occs_iter(orb_list, nelec):
         return res
     occslst = gen_occs_iter(orb_list, nelec)
     return numpy.asarray(occslst, dtype=numpy.int32).view(OIndexList)
+# Add this symbol for backward compatibility. Should remove in the future.
+_gen_occslst = gen_occslst
 
 def _strs2occslst(strs, norb):
     na = len(strs)

From 6f8ce439fbe0c26ec4d226e2225b28c7f668f922 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 3 Feb 2024 14:46:12 -0800
Subject: [PATCH 11/44] Release 2.5

---
 CHANGELOG         | 29 +++++++++++++++++++++++++++++
 NOTICE            |  5 ++++-
 README.md         |  4 ++--
 pyscf/__init__.py |  2 +-
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 0131615df5..b199af5208 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,31 @@
+PySCF 2.5.0 (2024-02-03)
+------------------------
+* Added
+  - SA-CASSCF Nonadiabatic Coupling Vectors 
+  - The to_gpu function to convert pyscf objects to gpu4pyscf objects.
+  - 4th, and 5th order XC derivatives.
+* Improved
+  - DIIS with damping for gapless systems.
+  - CPHF solver with level shift for gapless systems.
+  - The memory footprint for rsdf_builder and rsjk_builder.
+  - Use UHF natural orbital for RHF chkfile initial guess.
+  - Pipek Mezey + Stability check using Jacobi sweep.
+  - The conversion between FCI strings and addresses for more than 64 orbitals.
+  - The interface to the dftd3 and dftd4 dispersion correction modules.
+  - Switch off the sparsity treatment in DFT numerical integration for small system.
+  - Lattice-sum cutoff for non-orthogonal cell.
+  - Allow turning off AO symmetry for PBC DFT.
+* Fixes
+  - cp2k basis parsers
+  - k2gamma for dft classes.
+  - Mole.magmom attribute serialization error.
+  - post-hf Gradients with Cartesian GTOs.
+  - Basis order problem in molden.load .
+  - PBC DFT Becke grids rounding error.
+  - PBC rsdf for un-sorted basis.
+  - The get_bands function with k-point symmetry.
+
+
 PySCF 2.4.0 (2023-10-16)
 ------------------------
 * Added
@@ -37,6 +65,7 @@ PySCF 2.4.0 (2023-10-16)
   - Assume 46 and 78 core configurations to be f-in-valence.
   - Coding styles and deprecated warnings from numpy.
 
+
 PySCF 2.3.0 (2023-07-04)
 ------------------------
 * Added
diff --git a/NOTICE b/NOTICE
index a21469aea7..dc52a6294c 100644
--- a/NOTICE
+++ b/NOTICE
@@ -102,7 +102,10 @@ Xiaojie Wu
 Pavel Pokhilko
 Frédéric Chapoton
 Daniel King
-
+Jiachen Li
+Felipe S. S. Schneider
+Aniruddha Seal
+Peter Reinholdt
 
 
 ---
diff --git a/README.md b/README.md
index a0b2a39bf4..9a90da41cc 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,9 @@ Python-based Simulations of Chemistry Framework
 [![Build Status](https://github.com/pyscf/pyscf/workflows/CI/badge.svg)](https://github.com/pyscf/pyscf/actions?query=workflow%3ACI)
 [![codecov](https://codecov.io/gh/pyscf/pyscf/branch/master/graph/badge.svg)](https://codecov.io/gh/pyscf/pyscf)
 
-2023-10-15
+2024-02-03
 
-* [Stable release 2.4.0](https://github.com/pyscf/pyscf/releases/tag/v2.4.0)
+* [Stable release 2.5.0](https://github.com/pyscf/pyscf/releases/tag/v2.5.0)
 * [Changelog](../master/CHANGELOG)
 * [Documentation](http://www.pyscf.org)
 * [Installation](#installation)
diff --git a/pyscf/__init__.py b/pyscf/__init__.py
index 3e32baeb00..c775395d8b 100644
--- a/pyscf/__init__.py
+++ b/pyscf/__init__.py
@@ -35,7 +35,7 @@
 
 '''
 
-__version__ = '2.4.0'
+__version__ = '2.5.0'
 
 import os
 import sys

From e1dedd25c7787433b8988e4b7ab11a4a23f38bc3 Mon Sep 17 00:00:00 2001
From: Zhihao Cui <zhcui0408@gmail.com>
Date: Sat, 10 Feb 2024 15:46:06 -0500
Subject: [PATCH 12/44] fix magmom check in mol.copy()

If magmom is an np.ndarray,
`if mol.magmom` will raise error.
Need to explicitly use `is not None`
---
 pyscf/gto/mole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index ab1e32c61c..b79074aa56 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -1209,7 +1209,7 @@ def copy(mol, deep=True):
     newmol._ecp    = copy.deepcopy(mol._ecp)
     newmol.pseudo  = copy.deepcopy(mol.pseudo)
     newmol._pseudo = copy.deepcopy(mol._pseudo)
-    if mol.magmom:
+    if mol.magmom is not None:
         newmol.magmom  = list(mol.magmom)
     return newmol
 

From c1898ba0cb5ef12f5519ad68377b3bfdb69d3f9d Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Thu, 15 Feb 2024 20:53:06 -0800
Subject: [PATCH 13/44] Move LebedevGrid C code to Python code; fix issue #2073
 (#2076)

* Move LebedevGrid C code to Python code; fix issue #2073

* Restore CxLebedevGrid.c and fix bug

* lint error
---
 pyscf/dft/LebedevGrid.py | 5047 ++++++++++++++++++++++++++++++++++++++
 pyscf/dft/gen_grid.py    |   43 +-
 pyscf/solvent/ddcosmo.py |    4 +-
 pyscf/solvent/pcm.py     |    4 +-
 4 files changed, 5051 insertions(+), 47 deletions(-)
 create mode 100644 pyscf/dft/LebedevGrid.py

diff --git a/pyscf/dft/LebedevGrid.py b/pyscf/dft/LebedevGrid.py
new file mode 100644
index 0000000000..0fdfd69f45
--- /dev/null
+++ b/pyscf/dft/LebedevGrid.py
@@ -0,0 +1,5047 @@
+# This code was modified from CxLebedevGrid.cpp (from Gerald Knizia).
+# The following comments are copied from the header file CxLebedevGrid.h
+#
+#
+#ccgk: This code generates Lebedev grids. It is based on C files from
+#ccgk: Dmitri Laikov, which were converted to Fortran by Christoph van Wuellen.
+#ccgk: I (Gerald Knizia) subsequently converted them back to C++.
+#ccgk:
+#ccgk: The original distribution contained the following readme file:
+#ccgk:
+#
+#      Lebedev grids of orders n=6m+5 where m=0,1,...,21 in 16 digit precision
+#      =======================================================================
+#
+#      The file Lebedev-Laikov.F implements a set of subroutines providing
+#      Lebedev-Laikov grids of order n=2m+1, where m=1,2,...,15, and additionally
+#      grids of order n=6m+5, where m=5,6,...,21. The parameters ensure
+#      that angular integration of polynomials x**k * y**l * z**m, where k+l+m <= 131
+#      can be performed with a relative accuracy of 2e-14 [1]. Note that the weights
+#      are normalised to add up to 1.0.
+#
+#      For each order n a separate subroutine is provided named
+#      LD. The parameters X, Y, Z are arrays for the
+#      cartesian components of each point, and the parameter W is an array for the
+#      weights. The subroutines increase the integer parameter N by number of grid
+#      points generated. All these routines use the subroutine gen_oh which takes care
+#      of the octahedral symmetry of the grids.
+#
+#      Christoph van Wuellen (Ruhr-Universitaet, Bochum, Germany) generated the
+#      routines in Lebedev-Laikov.F by translating the original C-routines kindly
+#      provided by Dmitri Laikov (Moscow State University, Moscow, Russia). We
+#      are in debt to Dmitri Laikov for giving us permission to make these routines
+#      publically available.
+#
+#      Huub van Dam
+#      Daresbury Laboratory, Daresbury, United Kingdom
+#      April, 2000
+#
+#      References
+#      ==========
+#
+#      [1] V.I. Lebedev, and D.N. Laikov
+#         "A quadrature formula for the sphere of the 131st
+#         algebraic order of accuracy"
+#         Doklady Mathematics, Vol. 59, No. 3, 1999, pp. 477-481.
+#
+#ccgk: and the following comments and references for the original of the subroutine SphGenOh:
+#
+#      chvd
+#      chvd   This subroutine is part of a set of subroutines that generate
+#      chvd   Lebedev grids [1-6] for integration on a sphere. The original
+#      chvd   C-code [1] was kindly provided by Dr. Dmitri N. Laikov and
+#      chvd   translated into fortran by Dr. Christoph van Wuellen.
+#      chvd   This subroutine was translated from C to fortran77 by hand.
+#      chvd
+#      chvd   Users of this code are asked to include reference [1] in their
+#      chvd   publications, and in the user- and programmers-manuals
+#      chvd   describing their codes.
+#      chvd
+#      chvd   This code was distributed through CCL (http://www.ccl.net/).
+#      chvd
+#      chvd   [1] V.I. Lebedev, and D.N. Laikov
+#      chvd       "A quadrature formula for the sphere of the 131st
+#      chvd        algebraic order of accuracy"
+#      chvd       Doklady Mathematics, Vol. 59, No. 3, 1999, pp. 477-481.
+#      chvd
+#      chvd   [2] V.I. Lebedev
+#      chvd       "A quadrature formula for the sphere of 59th algebraic
+#      chvd        order of accuracy"
+#      chvd       Russian Acad. Sci. Dokl. Math., Vol. 50, 1995, pp. 283-286.
+#      chvd
+#      chvd   [3] V.I. Lebedev, and A.L. Skorokhodov
+#      chvd       "Quadrature formulas of orders 41, 47, and 53 for the sphere"
+#      chvd       Russian Acad. Sci. Dokl. Math., Vol. 45, 1992, pp. 587-592.
+#      chvd
+#      chvd   [4] V.I. Lebedev
+#      chvd       "Spherical quadrature formulas exact to orders 25-29"
+#      chvd       Siberian Mathematical Journal, Vol. 18, 1977, pp. 99-107.
+#      chvd
+#      chvd   [5] V.I. Lebedev
+#      chvd       "Quadratures on a sphere"
+#      chvd       Computational Mathematics and Mathematical Physics, Vol. 16,
+#      chvd       1976, pp. 10-24.
+#      chvd
+#      chvd   [6] V.I. Lebedev
+#      chvd       "Values of the nodes and weights of ninth to seventeenth
+#      chvd        order Gauss-Markov quadrature formulae invariant under the
+#      chvd        octahedron group with inversion"
+#      chvd       Computational Mathematics and Mathematical Physics, Vol. 15,
+#      chvd       1975, pp. 44-51.
+#      chvd
+#      cvw
+#      cvw    Given a point on a sphere (specified by a and b), generate all
+#      cvw    the equivalent points under Oh symmetry, making grid points with
+#      cvw    weight v.
+#      cvw    The variable num is increased by the number of different points
+#      cvw    generated.
+#      cvw
+#      cvw    Depending on code, there are 6...48 different but equivalent
+#      cvw    points.
+#      cvw
+#      cvw    code=1:   (0,0,1) etc                                (  6 points)
+#      cvw    code=2:   (0,a,a) etc, a=1/sqrt(2)                   ( 12 points)
+#      cvw    code=3:   (a,a,a) etc, a=1/sqrt(3)                   (  8 points)
+#      cvw    code=4:   (a,a,b) etc, b=sqrt(1-2 a^2)               ( 24 points)
+#      cvw    code=5:   (a,b,0) etc, b=sqrt(1-a^2), a input        ( 24 points)
+#      cvw    code=6:   (a,b,c) etc, c=sqrt(1-a^2-b^2), a/b input  ( 48 points)
+#      cvw
+
+import numpy as np
+from functools import lru_cache
+
+@lru_cache(maxsize=500)
+def SphGenOh(code, a, b, v):
+    if code == 0:
+        a = 1.0
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            a,             0.,            0.,            v,
+            -a,            0.,            0.,            v,
+            0.,            a,             0.,            v,
+            0.,            -a,            0.,            v,
+            0.,            0.,            a,             v,
+            0.,            0.,            -a,            v,
+        )).reshape(6, 4)
+    elif code == 1:
+        a = np.sqrt(0.5)
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            0.,            a,             a,             v,
+            0.,            -a,            a,             v,
+            0.,            a,             -a,            v,
+            0.,            -a,            -a,            v,
+            a,             0.,            a,             v,
+            -a,            0.,            a,             v,
+            a,             0.,            -a,            v,
+            -a,            0.,            -a,            v,
+            a,             a,             0.,            v,
+            -a,            a,             0.,            v,
+            a,             -a,            0.,            v,
+            -a,            -a,            0.,            v,
+        )).reshape(12, 4)
+    elif code == 2:
+        a = np.sqrt(1./3.)
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            a,             a,             a,             v,
+            -a,            a,             a,             v,
+            a,             -a,            a,             v,
+            -a,            -a,            a,             v,
+            a,             a,             -a,            v,
+            -a,            a,             -a,            v,
+            a,             -a,            -a,            v,
+            -a,            -a,            -a,            v,
+        )).reshape(8, 4)
+    elif code == 3:
+        b = np.sqrt(1. - 2.*a*a)
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            a,             a,             b,             v,
+            -a,            a,             b,             v,
+            a,             -a,            b,             v,
+            -a,            -a,            b,             v,
+            a,             a,             -b,            v,
+            -a,            a,             -b,            v,
+            a,             -a,            -b,            v,
+            -a,            -a,            -b,            v,
+            a,             b,             a,             v,
+            -a,            b,             a,             v,
+            a,             -b,            a,             v,
+            -a,            -b,            a,             v,
+            a,             b,             -a,            v,
+            -a,            b,             -a,            v,
+            a,             -b,            -a,            v,
+            -a,            -b,            -a,            v,
+            b,             a,             a,             v,
+            -b,            a,             a,             v,
+            b,             -a,            a,             v,
+            -b,            -a,            a,             v,
+            b,             a,             -a,            v,
+            -b,            a,             -a,            v,
+            b,             -a,            -a,            v,
+            -b,            -a,            -a,            v,
+        )).reshape(24, 4)
+    elif code == 4:
+        b = np.sqrt(1. - a*a)
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            a,             b,             0.,            v,
+            -a,            b,             0.,            v,
+            a,             -b,            0.,            v,
+            -a,            -b,            0.,            v,
+            b,             a,             0.,            v,
+            -b,            a,             0.,            v,
+            b,             -a,            0.,            v,
+            -b,            -a,            0.,            v,
+            a,             0.,            b,             v,
+            -a,            0.,            b,             v,
+            a,             0.,            -b,            v,
+            -a,            0.,            -b,            v,
+            b,             0.,            a,             v,
+            -b,            0.,            a,             v,
+            b,             0.,            -a,            v,
+            -b,            0.,            -a,            v,
+            0.,            a,             b,             v,
+            0.,            -a,            b,             v,
+            0.,            a,             -b,            v,
+            0.,            -a,            -b,            v,
+            0.,            b,             a,             v,
+            0.,            -b,            a,             v,
+            0.,            b,             -a,            v,
+            0.,            -b,            -a,            v,
+        )).reshape(24, 4)
+    elif code == 5:
+        c = np.sqrt(1. - a*a - b*b)
+        g = np.array((
+            #  pos/x          pos/y          pos/z         weight
+            a,             b,             c,             v,
+            -a,            b,             c,             v,
+            a,             -b,            c,             v,
+            -a,            -b,            c,             v,
+            a,             b,             -c,            v,
+            -a,            b,             -c,            v,
+            a,             -b,            -c,            v,
+            -a,            -b,            -c,            v,
+            a,             c,             b,             v,
+            -a,            c,             b,             v,
+            a,             -c,            b,             v,
+            -a,            -c,            b,             v,
+            a,             c,             -b,            v,
+            -a,            c,             -b,            v,
+            a,             -c,            -b,            v,
+            -a,            -c,            -b,            v,
+            b,             a,             c,             v,
+            -b,            a,             c,             v,
+            b,             -a,            c,             v,
+            -b,            -a,            c,             v,
+            b,             a,             -c,            v,
+            -b,            a,             -c,            v,
+            b,             -a,            -c,            v,
+            -b,            -a,            -c,            v,
+            b,             c,             a,             v,
+            -b,            c,             a,             v,
+            b,             -c,            a,             v,
+            -b,            -c,            a,             v,
+            b,             c,             -a,            v,
+            -b,            c,             -a,            v,
+            b,             -c,            -a,            v,
+            -b,            -c,            -a,            v,
+            c,             a,             b,             v,
+            -c,            a,             b,             v,
+            c,             -a,            b,             v,
+            -c,            -a,            b,             v,
+            c,             a,             -b,            v,
+            -c,            a,             -b,            v,
+            c,             -a,            -b,            v,
+            -c,            -a,            -b,            v,
+            c,             b,             a,             v,
+            -c,            b,             a,             v,
+            c,             -b,            a,             v,
+            -c,            -b,            a,             v,
+            c,             b,             -a,            v,
+            -c,            b,             -a,            v,
+            c,             -b,            -a,            v,
+            -c,            -b,            -a,            v,
+        )).reshape(48, 4)
+    return g
+
+
+def MakeAngularGrid_6():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1666666666666667e+0
+    grids.append(SphGenOh(0, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_14():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.6666666666666667e-1
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.7500000000000000e-1
+    grids.append(SphGenOh(2, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_26():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.4761904761904762e-1
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3809523809523810e-1
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.3214285714285714e-1
+    grids.append(SphGenOh(2, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_38():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.9523809523809524e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3214285714285714e-1
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.4597008433809831e+0
+    v = 0.2857142857142857e-1
+    grids.append(SphGenOh(4, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_50():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1269841269841270e-1
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2257495590828924e-1
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.2109375000000000e-1
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.3015113445777636e+0
+    v = 0.2017333553791887e-1
+    grids.append(SphGenOh(3, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_74():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.5130671797338464e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1660406956574204e-1
+    grids.append(SphGenOh(1, a, b, v))
+    v = -0.2958603896103896e-1
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.4803844614152614e+0
+    v = 0.2657620708215946e-1
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3207726489807764e+0
+    v = 0.1652217099371571e-1
+    grids.append(SphGenOh(4, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_86():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1154401154401154e-1
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1194390908585628e-1
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.3696028464541502e+0
+    v = 0.1111055571060340e-1
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6943540066026664e+0
+    v = 0.1187650129453714e-1
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3742430390903412e+0
+    v = 0.1181230374690448e-1
+    grids.append(SphGenOh(4, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_110():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.3828270494937162e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.9793737512487512e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1851156353447362e+0
+    v = 0.8211737283191111e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6904210483822922e+0
+    v = 0.9942814891178103e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3956894730559419e+0
+    v = 0.9595471336070963e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4783690288121502e+0
+    v = 0.9694996361663028e-2
+    grids.append(SphGenOh(4, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_146():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.5996313688621381e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.7372999718620756e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.7210515360144488e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.6764410400114264e+0
+    v = 0.7116355493117555e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4174961227965453e+0
+    v = 0.6753829486314477e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1574676672039082e+0
+    v = 0.7574394159054034e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1403553811713183e+0
+    b = 0.4493328323269557e+0
+    v = 0.6991087353303262e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_170():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.5544842902037365e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.6071332770670752e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.6383674773515093e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2551252621114134e+0
+    v = 0.5183387587747790e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6743601460362766e+0
+    v = 0.6317929009813725e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4318910696719410e+0
+    v = 0.6201670006589077e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2613931360335988e+0
+    v = 0.5477143385137348e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4990453161796037e+0
+    b = 0.1446630744325115e+0
+    v = 0.5968383987681156e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_194():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1782340447244611e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.5716905949977102e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.5573383178848738e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.6712973442695226e+0
+    v = 0.5608704082587997e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2892465627575439e+0
+    v = 0.5158237711805383e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4446933178717437e+0
+    v = 0.5518771467273614e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1299335447650067e+0
+    v = 0.4106777028169394e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3457702197611283e+0
+    v = 0.5051846064614808e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1590417105383530e+0
+    b = 0.8360360154824589e+0
+    v = 0.5530248916233094e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_230():
+    grids = []
+    a = 0
+    b = 0
+    v = -0.5522639919727325e-1
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.4450274607445226e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.4492044687397611e+0
+    v = 0.4496841067921404e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2520419490210201e+0
+    v = 0.5049153450478750e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6981906658447242e+0
+    v = 0.3976408018051883e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6587405243460960e+0
+    v = 0.4401400650381014e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4038544050097660e-1
+    v = 0.1724544350544401e-1
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5823842309715585e+0
+    v = 0.4231083095357343e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3545877390518688e+0
+    v = 0.5198069864064399e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2272181808998187e+0
+    b = 0.4864661535886647e+0
+    v = 0.4695720972568883e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_266():
+    grids = []
+    a = 0
+    b = 0
+    v = -0.1313769127326952e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = -0.2522728704859336e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.4186853881700583e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.7039373391585475e+0
+    v = 0.5315167977810885e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1012526248572414e+0
+    v = 0.4047142377086219e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4647448726420539e+0
+    v = 0.4112482394406990e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3277420654971629e+0
+    v = 0.3595584899758782e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6620338663699974e+0
+    v = 0.4256131351428158e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8506508083520399e+0
+    v = 0.4229582700647240e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3233484542692899e+0
+    b = 0.1153112011009701e+0
+    v = 0.4080914225780505e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2314790158712601e+0
+    b = 0.5244939240922365e+0
+    v = 0.4071467593830964e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_302():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.8545911725128148e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3599119285025571e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.3515640345570105e+0
+    v = 0.3449788424305883e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6566329410219612e+0
+    v = 0.3604822601419882e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4729054132581005e+0
+    v = 0.3576729661743367e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9618308522614784e-1
+    v = 0.2352101413689164e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2219645236294178e+0
+    v = 0.3108953122413675e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7011766416089545e+0
+    v = 0.3650045807677255e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2644152887060663e+0
+    v = 0.2982344963171804e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5718955891878961e+0
+    v = 0.3600820932216460e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2510034751770465e+0
+    b = 0.8000727494073952e+0
+    v = 0.3571540554273387e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1233548532583327e+0
+    b = 0.4127724083168531e+0
+    v = 0.3392312205006170e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_350():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.3006796749453936e-2
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3050627745650771e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.7068965463912316e+0
+    v = 0.1621104600288991e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4794682625712025e+0
+    v = 0.3005701484901752e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1927533154878019e+0
+    v = 0.2990992529653774e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6930357961327123e+0
+    v = 0.2982170644107595e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3608302115520091e+0
+    v = 0.2721564237310992e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6498486161496169e+0
+    v = 0.3033513795811141e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1932945013230339e+0
+    v = 0.3007949555218533e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3800494919899303e+0
+    v = 0.2881964603055307e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2899558825499574e+0
+    b = 0.7934537856582316e+0
+    v = 0.2958357626535696e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9684121455103957e-1
+    b = 0.8280801506686862e+0
+    v = 0.3036020026407088e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1833434647041659e+0
+    b = 0.9074658265305127e+0
+    v = 0.2832187403926303e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_434():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.5265897968224436e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2548219972002607e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.2512317418927307e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.6909346307509111e+0
+    v = 0.2530403801186355e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1774836054609158e+0
+    v = 0.2014279020918528e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4914342637784746e+0
+    v = 0.2501725168402936e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6456664707424256e+0
+    v = 0.2513267174597564e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2861289010307638e+0
+    v = 0.2302694782227416e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7568084367178018e-1
+    v = 0.1462495621594614e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3927259763368002e+0
+    v = 0.2445373437312980e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8818132877794288e+0
+    v = 0.2417442375638981e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9776428111182649e+0
+    v = 0.1910951282179532e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2054823696403044e+0
+    b = 0.8689460322872412e+0
+    v = 0.2416930044324775e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5905157048925271e+0
+    b = 0.7999278543857286e+0
+    v = 0.2512236854563495e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5550152361076807e+0
+    b = 0.7717462626915901e+0
+    v = 0.2496644054553086e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9371809858553722e+0
+    b = 0.3344363145343455e+0
+    v = 0.2236607760437849e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_590():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.3095121295306187e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1852379698597489e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.7040954938227469e+0
+    v = 0.1871790639277744e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6807744066455243e+0
+    v = 0.1858812585438317e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6372546939258752e+0
+    v = 0.1852028828296213e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5044419707800358e+0
+    v = 0.1846715956151242e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4215761784010967e+0
+    v = 0.1818471778162769e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3317920736472123e+0
+    v = 0.1749564657281154e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2384736701421887e+0
+    v = 0.1617210647254411e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1459036449157763e+0
+    v = 0.1384737234851692e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6095034115507196e-1
+    v = 0.9764331165051050e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6116843442009876e+0
+    v = 0.1857161196774078e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3964755348199858e+0
+    v = 0.1705153996395864e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1724782009907724e+0
+    v = 0.1300321685886048e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5610263808622060e+0
+    b = 0.3518280927733519e+0
+    v = 0.1842866472905286e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4742392842551980e+0
+    b = 0.2634716655937950e+0
+    v = 0.1802658934377451e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5984126497885380e+0
+    b = 0.1816640840360209e+0
+    v = 0.1849830560443660e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3791035407695563e+0
+    b = 0.1720795225656878e+0
+    v = 0.1713904507106709e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2778673190586244e+0
+    b = 0.8213021581932511e-1
+    v = 0.1555213603396808e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5033564271075117e+0
+    b = 0.8999205842074875e-1
+    v = 0.1802239128008525e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_770():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.2192942088181184e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1436433617319080e-2
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.1421940344335877e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.5087204410502360e-1
+    v = 0.6798123511050502e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1228198790178831e+0
+    v = 0.9913184235294912e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2026890814408786e+0
+    v = 0.1180207833238949e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2847745156464294e+0
+    v = 0.1296599602080921e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3656719078978026e+0
+    v = 0.1365871427428316e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4428264886713469e+0
+    v = 0.1402988604775325e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5140619627249735e+0
+    v = 0.1418645563595609e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6306401219166803e+0
+    v = 0.1421376741851662e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6716883332022612e+0
+    v = 0.1423996475490962e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6979792685336881e+0
+    v = 0.1431554042178567e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1446865674195309e+0
+    v = 0.9254401499865368e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3390263475411216e+0
+    v = 0.1250239995053509e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5335804651263506e+0
+    v = 0.1394365843329230e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6944024393349413e-1
+    b = 0.2355187894242326e+0
+    v = 0.1127089094671749e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2269004109529460e+0
+    b = 0.4102182474045730e+0
+    v = 0.1345753760910670e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8025574607775339e-1
+    b = 0.6214302417481605e+0
+    v = 0.1424957283316783e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1467999527896572e+0
+    b = 0.3245284345717394e+0
+    v = 0.1261523341237750e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1571507769824727e+0
+    b = 0.5224482189696630e+0
+    v = 0.1392547106052696e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2365702993157246e+0
+    b = 0.6017546634089558e+0
+    v = 0.1418761677877656e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7714815866765732e-1
+    b = 0.4346575516141163e+0
+    v = 0.1338366684479554e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3062936666210730e+0
+    b = 0.4908826589037616e+0
+    v = 0.1393700862676131e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3822477379524787e+0
+    b = 0.5648768149099500e+0
+    v = 0.1415914757466932e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_974():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1438294190527431e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1125772288287004e-2
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.4292963545341347e-1
+    v = 0.4948029341949241e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1051426854086404e+0
+    v = 0.7357990109125470e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1750024867623087e+0
+    v = 0.8889132771304384e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2477653379650257e+0
+    v = 0.9888347838921435e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3206567123955957e+0
+    v = 0.1053299681709471e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3916520749849983e+0
+    v = 0.1092778807014578e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4590825874187624e+0
+    v = 0.1114389394063227e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5214563888415861e+0
+    v = 0.1123724788051555e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6253170244654199e+0
+    v = 0.1125239325243814e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6637926744523170e+0
+    v = 0.1126153271815905e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6910410398498301e+0
+    v = 0.1130286931123841e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7052907007457760e+0
+    v = 0.1134986534363955e-2
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1236686762657990e+0
+    v = 0.6823367927109931e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2940777114468387e+0
+    v = 0.9454158160447096e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4697753849207649e+0
+    v = 0.1074429975385679e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6334563241139567e+0
+    v = 0.1129300086569132e-2
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5974048614181342e-1
+    b = 0.2029128752777523e+0
+    v = 0.8436884500901954e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1375760408473636e+0
+    b = 0.4602621942484054e+0
+    v = 0.1075255720448885e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3391016526336286e+0
+    b = 0.5030673999662036e+0
+    v = 0.1108577236864462e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1271675191439820e+0
+    b = 0.2817606422442134e+0
+    v = 0.9566475323783357e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2693120740413512e+0
+    b = 0.4331561291720157e+0
+    v = 0.1080663250717391e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1419786452601918e+0
+    b = 0.6256167358580814e+0
+    v = 0.1126797131196295e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6709284600738255e-1
+    b = 0.3798395216859157e+0
+    v = 0.1022568715358061e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7057738183256172e-1
+    b = 0.5517505421423520e+0
+    v = 0.1108960267713108e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2783888477882155e+0
+    b = 0.6029619156159187e+0
+    v = 0.1122790653435766e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1979578938917407e+0
+    b = 0.3589606329589096e+0
+    v = 0.1032401847117460e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2087307061103274e+0
+    b = 0.5348666438135476e+0
+    v = 0.1107249382283854e-2
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4055122137872836e+0
+    b = 0.5674997546074373e+0
+    v = 0.1121780048519972e-2
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_1202():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1105189233267572e-3
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.9205232738090741e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.9133159786443561e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.3712636449657089e-1
+    v = 0.3690421898017899e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9140060412262223e-1
+    v = 0.5603990928680660e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1531077852469906e+0
+    v = 0.6865297629282609e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2180928891660612e+0
+    v = 0.7720338551145630e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2839874532200175e+0
+    v = 0.8301545958894795e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3491177600963764e+0
+    v = 0.8686692550179628e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4121431461444309e+0
+    v = 0.8927076285846890e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4718993627149127e+0
+    v = 0.9060820238568219e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5273145452842337e+0
+    v = 0.9119777254940867e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6209475332444019e+0
+    v = 0.9128720138604181e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6569722711857291e+0
+    v = 0.9130714935691735e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6841788309070143e+0
+    v = 0.9152873784554116e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7012604330123631e+0
+    v = 0.9187436274321654e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1072382215478166e+0
+    v = 0.5176977312965694e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2582068959496968e+0
+    v = 0.7331143682101417e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4172752955306717e+0
+    v = 0.8463232836379928e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5700366911792503e+0
+    v = 0.9031122694253992e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9827986018263947e+0
+    b = 0.1771774022615325e+0
+    v = 0.6485778453163257e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9624249230326228e+0
+    b = 0.2475716463426288e+0
+    v = 0.7435030910982369e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9402007994128811e+0
+    b = 0.3354616289066489e+0
+    v = 0.7998527891839054e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9320822040143202e+0
+    b = 0.3173615246611977e+0
+    v = 0.8101731497468018e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9043674199393299e+0
+    b = 0.4090268427085357e+0
+    v = 0.8483389574594331e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8912407560074747e+0
+    b = 0.3854291150669224e+0
+    v = 0.8556299257311812e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8676435628462708e+0
+    b = 0.4932221184851285e+0
+    v = 0.8803208679738260e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8581979986041619e+0
+    b = 0.4785320675922435e+0
+    v = 0.8811048182425720e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8396753624049856e+0
+    b = 0.4507422593157064e+0
+    v = 0.8850282341265444e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8165288564022188e+0
+    b = 0.5632123020762100e+0
+    v = 0.9021342299040653e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8015469370783529e+0
+    b = 0.5434303569693900e+0
+    v = 0.9010091677105086e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7773563069070351e+0
+    b = 0.5123518486419871e+0
+    v = 0.9022692938426915e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7661621213900394e+0
+    b = 0.6394279634749102e+0
+    v = 0.9158016174693465e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7553584143533510e+0
+    b = 0.6269805509024392e+0
+    v = 0.9131578003189435e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7344305757559503e+0
+    b = 0.6031161693096310e+0
+    v = 0.9107813579482705e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.7043837184021765e+0
+    b = 0.5693702498468441e+0
+    v = 0.9105760258970126e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_1454():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.7777160743261247e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.7557646413004701e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.3229290663413854e-1
+    v = 0.2841633806090617e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8036733271462222e-1
+    v = 0.4374419127053555e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1354289960531653e+0
+    v = 0.5417174740872172e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1938963861114426e+0
+    v = 0.6148000891358593e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2537343715011275e+0
+    v = 0.6664394485800705e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3135251434752570e+0
+    v = 0.7025039356923220e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3721558339375338e+0
+    v = 0.7268511789249627e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4286809575195696e+0
+    v = 0.7422637534208629e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4822510128282994e+0
+    v = 0.7509545035841214e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5320679333566263e+0
+    v = 0.7548535057718401e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6172998195394274e+0
+    v = 0.7554088969774001e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6510679849127481e+0
+    v = 0.7553147174442808e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6777315251687360e+0
+    v = 0.7564767653292297e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6963109410648741e+0
+    v = 0.7587991808518730e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7058935009831749e+0
+    v = 0.7608261832033027e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9955546194091857e+0
+    v = 0.4021680447874916e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9734115901794209e+0
+    v = 0.5804871793945964e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9275693732388626e+0
+    v = 0.6792151955945159e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.8568022422795103e+0
+    v = 0.7336741211286294e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.7623495553719372e+0
+    v = 0.7581866300989608e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5707522908892223e+0
+    b = 0.4387028039889501e+0
+    v = 0.7538257859800743e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5196463388403083e+0
+    b = 0.3858908414762617e+0
+    v = 0.7483517247053123e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4646337531215351e+0
+    b = 0.3301937372343854e+0
+    v = 0.7371763661112059e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4063901697557691e+0
+    b = 0.2725423573563777e+0
+    v = 0.7183448895756934e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3456329466643087e+0
+    b = 0.2139510237495250e+0
+    v = 0.6895815529822191e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2831395121050332e+0
+    b = 0.1555922309786647e+0
+    v = 0.6480105801792886e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2197682022925330e+0
+    b = 0.9892878979686097e-1
+    v = 0.5897558896594636e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1564696098650355e+0
+    b = 0.4598642910675510e-1
+    v = 0.5095708849247346e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6027356673721295e+0
+    b = 0.3376625140173426e+0
+    v = 0.7536906428909755e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5496032320255096e+0
+    b = 0.2822301309727988e+0
+    v = 0.7472505965575118e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4921707755234567e+0
+    b = 0.2248632342592540e+0
+    v = 0.7343017132279698e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4309422998598483e+0
+    b = 0.1666224723456479e+0
+    v = 0.7130871582177445e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3664108182313672e+0
+    b = 0.1086964901822169e+0
+    v = 0.6817022032112776e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2990189057758436e+0
+    b = 0.5251989784120085e-1
+    v = 0.6380941145604121e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6268724013144998e+0
+    b = 0.2297523657550023e+0
+    v = 0.7550381377920310e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5707324144834607e+0
+    b = 0.1723080607093800e+0
+    v = 0.7478646640144802e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5096360901960365e+0
+    b = 0.1140238465390513e+0
+    v = 0.7335918720601220e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4438729938312456e+0
+    b = 0.5611522095882537e-1
+    v = 0.7110120527658118e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6419978471082389e+0
+    b = 0.1164174423140873e+0
+    v = 0.7571363978689501e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5817218061802611e+0
+    b = 0.5797589531445219e-1
+    v = 0.7489908329079234e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_1730():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.6309049437420976e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.6398287705571748e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.6357185073530720e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2860923126194662e-1
+    v = 0.2221207162188168e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7142556767711522e-1
+    v = 0.3475784022286848e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1209199540995559e+0
+    v = 0.4350742443589804e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1738673106594379e+0
+    v = 0.4978569136522127e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2284645438467734e+0
+    v = 0.5435036221998053e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2834807671701512e+0
+    v = 0.5765913388219542e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3379680145467339e+0
+    v = 0.6001200359226003e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3911355454819537e+0
+    v = 0.6162178172717512e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4422860353001403e+0
+    v = 0.6265218152438485e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4907781568726057e+0
+    v = 0.6323987160974212e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5360006153211468e+0
+    v = 0.6350767851540569e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6142105973596603e+0
+    v = 0.6354362775297107e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6459300387977504e+0
+    v = 0.6352302462706235e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6718056125089225e+0
+    v = 0.6358117881417972e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6910888533186254e+0
+    v = 0.6373101590310117e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7030467416823252e+0
+    v = 0.6390428961368665e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8354951166354646e-1
+    v = 0.3186913449946576e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2050143009099486e+0
+    v = 0.4678028558591711e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3370208290706637e+0
+    v = 0.5538829697598626e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4689051484233963e+0
+    v = 0.6044475907190476e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5939400424557334e+0
+    v = 0.6313575103509012e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1394983311832261e+0
+    b = 0.4097581162050343e-1
+    v = 0.4078626431855630e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1967999180485014e+0
+    b = 0.8851987391293348e-1
+    v = 0.4759933057812725e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2546183732548967e+0
+    b = 0.1397680182969819e+0
+    v = 0.5268151186413440e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3121281074713875e+0
+    b = 0.1929452542226526e+0
+    v = 0.5643048560507316e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3685981078502492e+0
+    b = 0.2467898337061562e+0
+    v = 0.5914501076613073e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4233760321547856e+0
+    b = 0.3003104124785409e+0
+    v = 0.6104561257874195e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4758671236059246e+0
+    b = 0.3526684328175033e+0
+    v = 0.6230252860707806e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5255178579796463e+0
+    b = 0.4031134861145713e+0
+    v = 0.6305618761760796e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5718025633734589e+0
+    b = 0.4509426448342351e+0
+    v = 0.6343092767597889e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2686927772723415e+0
+    b = 0.4711322502423248e-1
+    v = 0.5176268945737826e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3306006819904809e+0
+    b = 0.9784487303942695e-1
+    v = 0.5564840313313692e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3904906850594983e+0
+    b = 0.1505395810025273e+0
+    v = 0.5856426671038980e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4479957951904390e+0
+    b = 0.2039728156296050e+0
+    v = 0.6066386925777091e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5027076848919780e+0
+    b = 0.2571529941121107e+0
+    v = 0.6208824962234458e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5542087392260217e+0
+    b = 0.3092191375815670e+0
+    v = 0.6296314297822907e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6020850887375187e+0
+    b = 0.3593807506130276e+0
+    v = 0.6340423756791859e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4019851409179594e+0
+    b = 0.5063389934378671e-1
+    v = 0.5829627677107342e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4635614567449800e+0
+    b = 0.1032422269160612e+0
+    v = 0.6048693376081110e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5215860931591575e+0
+    b = 0.1566322094006254e+0
+    v = 0.6202362317732461e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5758202499099271e+0
+    b = 0.2098082827491099e+0
+    v = 0.6299005328403779e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6259893683876795e+0
+    b = 0.2618824114553391e+0
+    v = 0.6347722390609353e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5313795124811891e+0
+    b = 0.5263245019338556e-1
+    v = 0.6203778981238834e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5893317955931995e+0
+    b = 0.1061059730982005e+0
+    v = 0.6308414671239979e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6426246321215801e+0
+    b = 0.1594171564034221e+0
+    v = 0.6362706466959498e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6511904367376113e+0
+    b = 0.5354789536565540e-1
+    v = 0.6375414170333233e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_2030():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.4656031899197431e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.5421549195295507e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2540835336814348e-1
+    v = 0.1778522133346553e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6399322800504915e-1
+    v = 0.2811325405682796e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1088269469804125e+0
+    v = 0.3548896312631459e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1570670798818287e+0
+    v = 0.4090310897173364e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2071163932282514e+0
+    v = 0.4493286134169965e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2578914044450844e+0
+    v = 0.4793728447962723e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3085687558169623e+0
+    v = 0.5015415319164265e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3584719706267024e+0
+    v = 0.5175127372677937e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4070135594428709e+0
+    v = 0.5285522262081019e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4536618626222638e+0
+    v = 0.5356832703713962e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4979195686463577e+0
+    v = 0.5397914736175170e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5393075111126999e+0
+    v = 0.5416899441599930e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6115617676843916e+0
+    v = 0.5419308476889938e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6414308435160159e+0
+    v = 0.5416936902030596e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6664099412721607e+0
+    v = 0.5419544338703164e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6859161771214913e+0
+    v = 0.5428983656630975e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6993625593503890e+0
+    v = 0.5442286500098193e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7062393387719380e+0
+    v = 0.5452250345057301e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7479028168349763e-1
+    v = 0.2568002497728530e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1848951153969366e+0
+    v = 0.3827211700292145e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3059529066581305e+0
+    v = 0.4579491561917824e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4285556101021362e+0
+    v = 0.5042003969083574e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5468758653496526e+0
+    v = 0.5312708889976025e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6565821978343439e+0
+    v = 0.5438401790747117e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1253901572367117e+0
+    b = 0.3681917226439641e-1
+    v = 0.3316041873197344e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1775721510383941e+0
+    b = 0.7982487607213301e-1
+    v = 0.3899113567153771e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2305693358216114e+0
+    b = 0.1264640966592335e+0
+    v = 0.4343343327201309e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2836502845992063e+0
+    b = 0.1751585683418957e+0
+    v = 0.4679415262318919e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3361794746232590e+0
+    b = 0.2247995907632670e+0
+    v = 0.4930847981631031e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3875979172264824e+0
+    b = 0.2745299257422246e+0
+    v = 0.5115031867540091e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4374019316999074e+0
+    b = 0.3236373482441118e+0
+    v = 0.5245217148457367e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4851275843340022e+0
+    b = 0.3714967859436741e+0
+    v = 0.5332041499895321e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5303391803806868e+0
+    b = 0.4175353646321745e+0
+    v = 0.5384583126021542e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5726197380596287e+0
+    b = 0.4612084406355461e+0
+    v = 0.5411067210798852e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2431520732564863e+0
+    b = 0.4258040133043952e-1
+    v = 0.4259797391468714e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3002096800895869e+0
+    b = 0.8869424306722721e-1
+    v = 0.4604931368460021e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3558554457457432e+0
+    b = 0.1368811706510655e+0
+    v = 0.4871814878255202e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4097782537048887e+0
+    b = 0.1860739985015033e+0
+    v = 0.5072242910074885e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4616337666067458e+0
+    b = 0.2354235077395853e+0
+    v = 0.5217069845235350e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5110707008417874e+0
+    b = 0.2842074921347011e+0
+    v = 0.5315785966280310e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5577415286163795e+0
+    b = 0.3317784414984102e+0
+    v = 0.5376833708758905e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6013060431366950e+0
+    b = 0.3775299002040700e+0
+    v = 0.5408032092069521e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3661596767261781e+0
+    b = 0.4599367887164592e-1
+    v = 0.4842744917904866e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4237633153506581e+0
+    b = 0.9404893773654421e-1
+    v = 0.5048926076188130e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4786328454658452e+0
+    b = 0.1431377109091971e+0
+    v = 0.5202607980478373e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5305702076789774e+0
+    b = 0.1924186388843570e+0
+    v = 0.5309932388325743e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5793436224231788e+0
+    b = 0.2411590944775190e+0
+    v = 0.5377419770895208e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6247069017094747e+0
+    b = 0.2886871491583605e+0
+    v = 0.5411696331677717e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4874315552535204e+0
+    b = 0.4804978774953206e-1
+    v = 0.5197996293282420e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5427337322059053e+0
+    b = 0.9716857199366665e-1
+    v = 0.5311120836622945e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5943493747246700e+0
+    b = 0.1465205839795055e+0
+    v = 0.5384309319956951e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6421314033564943e+0
+    b = 0.1953579449803574e+0
+    v = 0.5421859504051886e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6020628374713980e+0
+    b = 0.4916375015738108e-1
+    v = 0.5390948355046314e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6529222529856881e+0
+    b = 0.9861621540127005e-1
+    v = 0.5433312705027845e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_2354():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.3922616270665292e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.4703831750854424e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.4678202801282136e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2290024646530589e-1
+    v = 0.1437832228979900e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5779086652271284e-1
+    v = 0.2303572493577644e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9863103576375984e-1
+    v = 0.2933110752447454e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1428155792982185e+0
+    v = 0.3402905998359838e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1888978116601463e+0
+    v = 0.3759138466870372e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2359091682970210e+0
+    v = 0.4030638447899798e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2831228833706171e+0
+    v = 0.4236591432242211e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3299495857966693e+0
+    v = 0.4390522656946746e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3758840802660796e+0
+    v = 0.4502523466626247e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4204751831009480e+0
+    v = 0.4580577727783541e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4633068518751051e+0
+    v = 0.4631391616615899e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5039849474507313e+0
+    v = 0.4660928953698676e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5421265793440747e+0
+    v = 0.4674751807936953e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6092660230557310e+0
+    v = 0.4676414903932920e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6374654204984869e+0
+    v = 0.4674086492347870e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6615136472609892e+0
+    v = 0.4674928539483207e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6809487285958127e+0
+    v = 0.4680748979686447e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6952980021665196e+0
+    v = 0.4690449806389040e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7041245497695400e+0
+    v = 0.4699877075860818e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6744033088306065e-1
+    v = 0.2099942281069176e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1678684485334166e+0
+    v = 0.3172269150712804e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2793559049539613e+0
+    v = 0.3832051358546523e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3935264218057639e+0
+    v = 0.4252193818146985e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5052629268232558e+0
+    v = 0.4513807963755000e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6107905315437531e+0
+    v = 0.4657797469114178e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1135081039843524e+0
+    b = 0.3331954884662588e-1
+    v = 0.2733362800522836e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1612866626099378e+0
+    b = 0.7247167465436538e-1
+    v = 0.3235485368463559e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2100786550168205e+0
+    b = 0.1151539110849745e+0
+    v = 0.3624908726013453e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2592282009459942e+0
+    b = 0.1599491097143677e+0
+    v = 0.3925540070712828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3081740561320203e+0
+    b = 0.2058699956028027e+0
+    v = 0.4156129781116235e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3564289781578164e+0
+    b = 0.2521624953502911e+0
+    v = 0.4330644984623263e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4035587288240703e+0
+    b = 0.2982090785797674e+0
+    v = 0.4459677725921312e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4491671196373903e+0
+    b = 0.3434762087235733e+0
+    v = 0.4551593004456795e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4928854782917489e+0
+    b = 0.3874831357203437e+0
+    v = 0.4613341462749918e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5343646791958988e+0
+    b = 0.4297814821746926e+0
+    v = 0.4651019618269806e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5732683216530990e+0
+    b = 0.4699402260943537e+0
+    v = 0.4670249536100625e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2214131583218986e+0
+    b = 0.3873602040643895e-1
+    v = 0.3549555576441708e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2741796504750071e+0
+    b = 0.8089496256902013e-1
+    v = 0.3856108245249010e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3259797439149485e+0
+    b = 0.1251732177620872e+0
+    v = 0.4098622845756882e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3765441148826891e+0
+    b = 0.1706260286403185e+0
+    v = 0.4286328604268950e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4255773574530558e+0
+    b = 0.2165115147300408e+0
+    v = 0.4427802198993945e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4727795117058430e+0
+    b = 0.2622089812225259e+0
+    v = 0.4530473511488561e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5178546895819012e+0
+    b = 0.3071721431296201e+0
+    v = 0.4600805475703138e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5605141192097460e+0
+    b = 0.3508998998801138e+0
+    v = 0.4644599059958017e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6004763319352512e+0
+    b = 0.3929160876166931e+0
+    v = 0.4667274455712508e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3352842634946949e+0
+    b = 0.4202563457288019e-1
+    v = 0.4069360518020356e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3891971629814670e+0
+    b = 0.8614309758870850e-1
+    v = 0.4260442819919195e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4409875565542281e+0
+    b = 0.1314500879380001e+0
+    v = 0.4408678508029063e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4904893058592484e+0
+    b = 0.1772189657383859e+0
+    v = 0.4518748115548597e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5375056138769549e+0
+    b = 0.2228277110050294e+0
+    v = 0.4595564875375116e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5818255708669969e+0
+    b = 0.2677179935014386e+0
+    v = 0.4643988774315846e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6232334858144959e+0
+    b = 0.3113675035544165e+0
+    v = 0.4668827491646946e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4489485354492058e+0
+    b = 0.4409162378368174e-1
+    v = 0.4400541823741973e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5015136875933150e+0
+    b = 0.8939009917748489e-1
+    v = 0.4514512890193797e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5511300550512623e+0
+    b = 0.1351806029383365e+0
+    v = 0.4596198627347549e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5976720409858000e+0
+    b = 0.1808370355053196e+0
+    v = 0.4648659016801781e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6409956378989354e+0
+    b = 0.2257852192301602e+0
+    v = 0.4675502017157673e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5581222330827514e+0
+    b = 0.4532173421637160e-1
+    v = 0.4598494476455523e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6074705984161695e+0
+    b = 0.9117488031840314e-1
+    v = 0.4654916955152048e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6532272537379033e+0
+    b = 0.1369294213140155e+0
+    v = 0.4684709779505137e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6594761494500487e+0
+    b = 0.4589901487275583e-1
+    v = 0.4691445539106986e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_2702():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.2998675149888161e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.4077860529495355e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2065562538818703e-1
+    v = 0.1185349192520667e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5250918173022379e-1
+    v = 0.1913408643425751e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8993480082038376e-1
+    v = 0.2452886577209897e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1306023924436019e+0
+    v = 0.2862408183288702e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1732060388531418e+0
+    v = 0.3178032258257357e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2168727084820249e+0
+    v = 0.3422945667633690e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2609528309173586e+0
+    v = 0.3612790520235922e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3049252927938952e+0
+    v = 0.3758638229818521e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3483484138084404e+0
+    v = 0.3868711798859953e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3908321549106406e+0
+    v = 0.3949429933189938e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4320210071894814e+0
+    v = 0.4006068107541156e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4715824795890053e+0
+    v = 0.4043192149672723e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5091984794078453e+0
+    v = 0.4064947495808078e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5445580145650803e+0
+    v = 0.4075245619813152e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6072575796841768e+0
+    v = 0.4076423540893566e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6339484505755803e+0
+    v = 0.4074280862251555e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6570718257486958e+0
+    v = 0.4074163756012244e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6762557330090709e+0
+    v = 0.4077647795071246e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6911161696923790e+0
+    v = 0.4084517552782530e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7012841911659961e+0
+    v = 0.4092468459224052e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7064559272410020e+0
+    v = 0.4097872687240906e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6123554989894765e-1
+    v = 0.1738986811745028e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1533070348312393e+0
+    v = 0.2659616045280191e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2563902605244206e+0
+    v = 0.3240596008171533e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3629346991663361e+0
+    v = 0.3621195964432943e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4683949968987538e+0
+    v = 0.3868838330760539e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5694479240657952e+0
+    v = 0.4018911532693111e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6634465430993955e+0
+    v = 0.4089929432983252e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1033958573552305e+0
+    b = 0.3034544009063584e-1
+    v = 0.2279907527706409e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1473521412414395e+0
+    b = 0.6618803044247135e-1
+    v = 0.2715205490578897e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1924552158705967e+0
+    b = 0.1054431128987715e+0
+    v = 0.3057917896703976e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2381094362890328e+0
+    b = 0.1468263551238858e+0
+    v = 0.3326913052452555e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2838121707936760e+0
+    b = 0.1894486108187886e+0
+    v = 0.3537334711890037e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3291323133373415e+0
+    b = 0.2326374238761579e+0
+    v = 0.3700567500783129e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3736896978741460e+0
+    b = 0.2758485808485768e+0
+    v = 0.3825245372589122e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4171406040760013e+0
+    b = 0.3186179331996921e+0
+    v = 0.3918125171518296e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4591677985256915e+0
+    b = 0.3605329796303794e+0
+    v = 0.3984720419937579e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4994733831718418e+0
+    b = 0.4012147253586509e+0
+    v = 0.4029746003338211e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5377731830445096e+0
+    b = 0.4403050025570692e+0
+    v = 0.4057428632156627e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5737917830001331e+0
+    b = 0.4774565904277483e+0
+    v = 0.4071719274114857e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2027323586271389e+0
+    b = 0.3544122504976147e-1
+    v = 0.2990236950664119e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2516942375187273e+0
+    b = 0.7418304388646328e-1
+    v = 0.3262951734212878e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3000227995257181e+0
+    b = 0.1150502745727186e+0
+    v = 0.3482634608242413e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3474806691046342e+0
+    b = 0.1571963371209364e+0
+    v = 0.3656596681700892e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3938103180359209e+0
+    b = 0.1999631877247100e+0
+    v = 0.3791740467794218e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4387519590455703e+0
+    b = 0.2428073457846535e+0
+    v = 0.3894034450156905e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4820503960077787e+0
+    b = 0.2852575132906155e+0
+    v = 0.3968600245508371e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5234573778475101e+0
+    b = 0.3268884208674639e+0
+    v = 0.4019931351420050e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5627318647235282e+0
+    b = 0.3673033321675939e+0
+    v = 0.4052108801278599e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5996390607156954e+0
+    b = 0.4061211551830290e+0
+    v = 0.4068978613940934e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3084780753791947e+0
+    b = 0.3860125523100059e-1
+    v = 0.3454275351319704e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3589988275920223e+0
+    b = 0.7928938987104867e-1
+    v = 0.3629963537007920e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4078628415881973e+0
+    b = 0.1212614643030087e+0
+    v = 0.3770187233889873e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4549287258889735e+0
+    b = 0.1638770827382693e+0
+    v = 0.3878608613694378e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5000278512957279e+0
+    b = 0.2065965798260176e+0
+    v = 0.3959065270221274e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5429785044928199e+0
+    b = 0.2489436378852235e+0
+    v = 0.4015286975463570e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5835939850491711e+0
+    b = 0.2904811368946891e+0
+    v = 0.4050866785614717e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6216870353444856e+0
+    b = 0.3307941957666609e+0
+    v = 0.4069320185051913e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4151104662709091e+0
+    b = 0.4064829146052554e-1
+    v = 0.3760120964062763e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4649804275009218e+0
+    b = 0.8258424547294755e-1
+    v = 0.3870969564418064e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5124695757009662e+0
+    b = 0.1251841962027289e+0
+    v = 0.3955287790534055e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5574711100606224e+0
+    b = 0.1679107505976331e+0
+    v = 0.4015361911302668e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5998597333287227e+0
+    b = 0.2102805057358715e+0
+    v = 0.4053836986719548e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6395007148516600e+0
+    b = 0.2518418087774107e+0
+    v = 0.4073578673299117e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5188456224746252e+0
+    b = 0.4194321676077518e-1
+    v = 0.3954628379231406e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5664190707942778e+0
+    b = 0.8457661551921499e-1
+    v = 0.4017645508847530e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6110464353283153e+0
+    b = 0.1273652932519396e+0
+    v = 0.4059030348651293e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6526430302051563e+0
+    b = 0.1698173239076354e+0
+    v = 0.4080565809484880e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6167551880377548e+0
+    b = 0.4266398851548864e-1
+    v = 0.4063018753664651e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6607195418355383e+0
+    b = 0.8551925814238349e-1
+    v = 0.4087191292799671e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_3074():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.2599095953754734e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3603134089687541e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.3586067974412447e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1886108518723392e-1
+    v = 0.9831528474385880e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4800217244625303e-1
+    v = 0.1605023107954450e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8244922058397242e-1
+    v = 0.2072200131464099e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1200408362484023e+0
+    v = 0.2431297618814187e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1595773530809965e+0
+    v = 0.2711819064496707e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2002635973434064e+0
+    v = 0.2932762038321116e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2415127590139982e+0
+    v = 0.3107032514197368e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2828584158458477e+0
+    v = 0.3243808058921213e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3239091015338138e+0
+    v = 0.3349899091374030e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3643225097962194e+0
+    v = 0.3430580688505218e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4037897083691802e+0
+    v = 0.3490124109290343e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4420247515194127e+0
+    v = 0.3532148948561955e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4787572538464938e+0
+    v = 0.3559862669062833e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5137265251275234e+0
+    v = 0.3576224317551411e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5466764056654611e+0
+    v = 0.3584050533086076e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6054859420813535e+0
+    v = 0.3584903581373224e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6308106701764562e+0
+    v = 0.3582991879040586e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6530369230179584e+0
+    v = 0.3582371187963125e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6718609524611158e+0
+    v = 0.3584353631122350e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6869676499894013e+0
+    v = 0.3589120166517785e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6980467077240748e+0
+    v = 0.3595445704531601e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7048241721250522e+0
+    v = 0.3600943557111074e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5591105222058232e-1
+    v = 0.1456447096742039e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1407384078513916e+0
+    v = 0.2252370188283782e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2364035438976309e+0
+    v = 0.2766135443474897e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3360602737818170e+0
+    v = 0.3110729491500851e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4356292630054665e+0
+    v = 0.3342506712303391e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5321569415256174e+0
+    v = 0.3491981834026860e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6232956305040554e+0
+    v = 0.3576003604348932e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9469870086838469e-1
+    b = 0.2778748387309470e-1
+    v = 0.1921921305788564e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1353170300568141e+0
+    b = 0.6076569878628364e-1
+    v = 0.2301458216495632e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1771679481726077e+0
+    b = 0.9703072762711040e-1
+    v = 0.2604248549522893e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2197066664231751e+0
+    b = 0.1354112458524762e+0
+    v = 0.2845275425870697e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2624783557374927e+0
+    b = 0.1750996479744100e+0
+    v = 0.3036870897974840e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3050969521214442e+0
+    b = 0.2154896907449802e+0
+    v = 0.3188414832298066e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3472252637196021e+0
+    b = 0.2560954625740152e+0
+    v = 0.3307046414722089e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3885610219026360e+0
+    b = 0.2965070050624096e+0
+    v = 0.3398330969031360e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4288273776062765e+0
+    b = 0.3363641488734497e+0
+    v = 0.3466757899705373e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4677662471302948e+0
+    b = 0.3753400029836788e+0
+    v = 0.3516095923230054e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5051333589553359e+0
+    b = 0.4131297522144286e+0
+    v = 0.3549645184048486e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5406942145810492e+0
+    b = 0.4494423776081795e+0
+    v = 0.3570415969441392e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5742204122576457e+0
+    b = 0.4839938958841502e+0
+    v = 0.3581251798496118e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1865407027225188e+0
+    b = 0.3259144851070796e-1
+    v = 0.2543491329913348e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2321186453689432e+0
+    b = 0.6835679505297343e-1
+    v = 0.2786711051330776e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2773159142523882e+0
+    b = 0.1062284864451989e+0
+    v = 0.2985552361083679e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3219200192237254e+0
+    b = 0.1454404409323047e+0
+    v = 0.3145867929154039e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3657032593944029e+0
+    b = 0.1854018282582510e+0
+    v = 0.3273290662067609e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4084376778363622e+0
+    b = 0.2256297412014750e+0
+    v = 0.3372705511943501e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4499004945751427e+0
+    b = 0.2657104425000896e+0
+    v = 0.3448274437851510e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4898758141326335e+0
+    b = 0.3052755487631557e+0
+    v = 0.3503592783048583e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5281547442266309e+0
+    b = 0.3439863920645423e+0
+    v = 0.3541854792663162e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5645346989813992e+0
+    b = 0.3815229456121914e+0
+    v = 0.3565995517909428e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5988181252159848e+0
+    b = 0.4175752420966734e+0
+    v = 0.3578802078302898e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2850425424471603e+0
+    b = 0.3562149509862536e-1
+    v = 0.2958644592860982e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3324619433027876e+0
+    b = 0.7330318886871096e-1
+    v = 0.3119548129116835e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3785848333076282e+0
+    b = 0.1123226296008472e+0
+    v = 0.3250745225005984e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4232891028562115e+0
+    b = 0.1521084193337708e+0
+    v = 0.3355153415935208e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4664287050829722e+0
+    b = 0.1921844459223610e+0
+    v = 0.3435847568549328e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5078458493735726e+0
+    b = 0.2321360989678303e+0
+    v = 0.3495786831622488e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5473779816204180e+0
+    b = 0.2715886486360520e+0
+    v = 0.3537767805534621e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5848617133811376e+0
+    b = 0.3101924707571355e+0
+    v = 0.3564459815421428e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6201348281584888e+0
+    b = 0.3476121052890973e+0
+    v = 0.3578464061225468e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3852191185387871e+0
+    b = 0.3763224880035108e-1
+    v = 0.3239748762836212e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4325025061073423e+0
+    b = 0.7659581935637135e-1
+    v = 0.3345491784174287e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4778486229734490e+0
+    b = 0.1163381306083900e+0
+    v = 0.3429126177301782e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5211663693009000e+0
+    b = 0.1563890598752899e+0
+    v = 0.3492420343097421e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5623469504853703e+0
+    b = 0.1963320810149200e+0
+    v = 0.3537399050235257e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6012718188659246e+0
+    b = 0.2357847407258738e+0
+    v = 0.3566209152659172e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6378179206390117e+0
+    b = 0.2743846121244060e+0
+    v = 0.3581084321919782e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4836936460214534e+0
+    b = 0.3895902610739024e-1
+    v = 0.3426522117591512e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5293792562683797e+0
+    b = 0.7871246819312640e-1
+    v = 0.3491848770121379e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5726281253100033e+0
+    b = 0.1187963808202981e+0
+    v = 0.3539318235231476e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6133658776169068e+0
+    b = 0.1587914708061787e+0
+    v = 0.3570231438458694e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6515085491865307e+0
+    b = 0.1983058575227646e+0
+    v = 0.3586207335051714e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5778692716064976e+0
+    b = 0.3977209689791542e-1
+    v = 0.3541196205164025e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6207904288086192e+0
+    b = 0.7990157592981152e-1
+    v = 0.3574296911573953e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6608688171046802e+0
+    b = 0.1199671308754309e+0
+    v = 0.3591993279818963e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6656263089489130e+0
+    b = 0.4015955957805969e-1
+    v = 0.3595855034661997e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_3470():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.2040382730826330e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.3178149703889544e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1721420832906233e-1
+    v = 0.8288115128076110e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4408875374981770e-1
+    v = 0.1360883192522954e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7594680813878681e-1
+    v = 0.1766854454542662e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1108335359204799e+0
+    v = 0.2083153161230153e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1476517054388567e+0
+    v = 0.2333279544657158e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1856731870860615e+0
+    v = 0.2532809539930247e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2243634099428821e+0
+    v = 0.2692472184211158e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2633006881662727e+0
+    v = 0.2819949946811885e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3021340904916283e+0
+    v = 0.2920953593973030e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3405594048030089e+0
+    v = 0.2999889782948352e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3783044434007372e+0
+    v = 0.3060292120496902e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4151194767407910e+0
+    v = 0.3105109167522192e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4507705766443257e+0
+    v = 0.3136902387550312e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4850346056573187e+0
+    v = 0.3157984652454632e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5176950817792470e+0
+    v = 0.3170516518425422e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5485384240820989e+0
+    v = 0.3176568425633755e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6039117238943308e+0
+    v = 0.3177198411207062e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6279956655573113e+0
+    v = 0.3175519492394733e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6493636169568952e+0
+    v = 0.3174654952634756e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6677644117704504e+0
+    v = 0.3175676415467654e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6829368572115624e+0
+    v = 0.3178923417835410e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6946195818184121e+0
+    v = 0.3183788287531909e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7025711542057026e+0
+    v = 0.3188755151918807e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7066004767140119e+0
+    v = 0.3191916889313849e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5132537689946062e-1
+    v = 0.1231779611744508e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1297994661331225e+0
+    v = 0.1924661373839880e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2188852049401307e+0
+    v = 0.2380881867403424e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3123174824903457e+0
+    v = 0.2693100663037885e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4064037620738195e+0
+    v = 0.2908673382834366e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4984958396944782e+0
+    v = 0.3053914619381535e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5864975046021365e+0
+    v = 0.3143916684147777e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6686711634580175e+0
+    v = 0.3187042244055363e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.8715738780835950e-1
+    b = 0.2557175233367578e-1
+    v = 0.1635219535869790e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1248383123134007e+0
+    b = 0.5604823383376681e-1
+    v = 0.1968109917696070e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1638062693383378e+0
+    b = 0.8968568601900765e-1
+    v = 0.2236754342249974e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2035586203373176e+0
+    b = 0.1254086651976279e+0
+    v = 0.2453186687017181e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2436798975293774e+0
+    b = 0.1624780150162012e+0
+    v = 0.2627551791580541e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2838207507773806e+0
+    b = 0.2003422342683208e+0
+    v = 0.2767654860152220e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3236787502217692e+0
+    b = 0.2385628026255263e+0
+    v = 0.2879467027765895e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3629849554840691e+0
+    b = 0.2767731148783578e+0
+    v = 0.2967639918918702e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4014948081992087e+0
+    b = 0.3146542308245309e+0
+    v = 0.3035900684660351e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4389818379260225e+0
+    b = 0.3519196415895088e+0
+    v = 0.3087338237298308e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4752331143674377e+0
+    b = 0.3883050984023654e+0
+    v = 0.3124608838860167e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5100457318374018e+0
+    b = 0.4235613423908649e+0
+    v = 0.3150084294226743e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5432238388954868e+0
+    b = 0.4574484717196220e+0
+    v = 0.3165958398598402e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5745758685072442e+0
+    b = 0.4897311639255524e+0
+    v = 0.3174320440957372e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1723981437592809e+0
+    b = 0.3010630597881105e-1
+    v = 0.2182188909812599e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2149553257844597e+0
+    b = 0.6326031554204694e-1
+    v = 0.2399727933921445e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2573256081247422e+0
+    b = 0.9848566980258631e-1
+    v = 0.2579796133514652e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2993163751238106e+0
+    b = 0.1350835952384266e+0
+    v = 0.2727114052623535e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3407238005148000e+0
+    b = 0.1725184055442181e+0
+    v = 0.2846327656281355e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3813454978483264e+0
+    b = 0.2103559279730725e+0
+    v = 0.2941491102051334e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4209848104423343e+0
+    b = 0.2482278774554860e+0
+    v = 0.3016049492136107e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4594519699996300e+0
+    b = 0.2858099509982883e+0
+    v = 0.3072949726175648e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4965640166185930e+0
+    b = 0.3228075659915428e+0
+    v = 0.3114768142886460e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5321441655571562e+0
+    b = 0.3589459907204151e+0
+    v = 0.3143823673666223e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5660208438582166e+0
+    b = 0.3939630088864310e+0
+    v = 0.3162269764661535e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5980264315964364e+0
+    b = 0.4276029922949089e+0
+    v = 0.3172164663759821e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2644215852350733e+0
+    b = 0.3300939429072552e-1
+    v = 0.2554575398967435e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3090113743443063e+0
+    b = 0.6803887650078501e-1
+    v = 0.2701704069135677e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3525871079197808e+0
+    b = 0.1044326136206709e+0
+    v = 0.2823693413468940e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3950418005354029e+0
+    b = 0.1416751597517679e+0
+    v = 0.2922898463214289e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4362475663430163e+0
+    b = 0.1793408610504821e+0
+    v = 0.3001829062162428e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4760661812145854e+0
+    b = 0.2170630750175722e+0
+    v = 0.3062890864542953e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5143551042512103e+0
+    b = 0.2545145157815807e+0
+    v = 0.3108328279264746e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5509709026935597e+0
+    b = 0.2913940101706601e+0
+    v = 0.3140243146201245e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5857711030329428e+0
+    b = 0.3274169910910705e+0
+    v = 0.3160638030977130e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6186149917404392e+0
+    b = 0.3623081329317265e+0
+    v = 0.3171462882206275e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3586894569557064e+0
+    b = 0.3497354386450040e-1
+    v = 0.2812388416031796e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4035266610019441e+0
+    b = 0.7129736739757095e-1
+    v = 0.2912137500288045e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4467775312332510e+0
+    b = 0.1084758620193165e+0
+    v = 0.2993241256502206e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4883638346608543e+0
+    b = 0.1460915689241772e+0
+    v = 0.3057101738983822e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5281908348434601e+0
+    b = 0.1837790832369980e+0
+    v = 0.3105319326251432e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5661542687149311e+0
+    b = 0.2212075390874021e+0
+    v = 0.3139565514428167e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6021450102031452e+0
+    b = 0.2580682841160985e+0
+    v = 0.3161543006806366e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6360520783610050e+0
+    b = 0.2940656362094121e+0
+    v = 0.3172985960613294e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4521611065087196e+0
+    b = 0.3631055365867002e-1
+    v = 0.2989400336901431e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4959365651560963e+0
+    b = 0.7348318468484350e-1
+    v = 0.3054555883947677e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5376815804038283e+0
+    b = 0.1111087643812648e+0
+    v = 0.3104764960807702e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5773314480243768e+0
+    b = 0.1488226085145408e+0
+    v = 0.3141015825977616e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6148113245575056e+0
+    b = 0.1862892274135151e+0
+    v = 0.3164520621159896e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6500407462842380e+0
+    b = 0.2231909701714456e+0
+    v = 0.3176652305912204e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5425151448707213e+0
+    b = 0.3718201306118944e-1
+    v = 0.3105097161023939e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5841860556907931e+0
+    b = 0.7483616335067346e-1
+    v = 0.3143014117890550e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6234632186851500e+0
+    b = 0.1125990834266120e+0
+    v = 0.3168172866287200e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6602934551848843e+0
+    b = 0.1501303813157619e+0
+    v = 0.3181401865570968e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6278573968375105e+0
+    b = 0.3767559930245720e-1
+    v = 0.3170663659156037e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6665611711264577e+0
+    b = 0.7548443301360158e-1
+    v = 0.3185447944625510e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_3890():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1807395252196920e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2848008782238827e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.2836065837530581e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1587876419858352e-1
+    v = 0.7013149266673816e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4069193593751206e-1
+    v = 0.1162798021956766e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7025888115257997e-1
+    v = 0.1518728583972105e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1027495450028704e+0
+    v = 0.1798796108216934e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1371457730893426e+0
+    v = 0.2022593385972785e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1727758532671953e+0
+    v = 0.2203093105575464e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2091492038929037e+0
+    v = 0.2349294234299855e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2458813281751915e+0
+    v = 0.2467682058747003e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2826545859450066e+0
+    v = 0.2563092683572224e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3191957291799622e+0
+    v = 0.2639253896763318e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3552621469299578e+0
+    v = 0.2699137479265108e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3906329503406230e+0
+    v = 0.2745196420166739e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4251028614093031e+0
+    v = 0.2779529197397593e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4584777520111870e+0
+    v = 0.2803996086684265e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4905711358710193e+0
+    v = 0.2820302356715842e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5212011669847385e+0
+    v = 0.2830056747491068e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5501878488737995e+0
+    v = 0.2834808950776839e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6025037877479342e+0
+    v = 0.2835282339078929e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6254572689549016e+0
+    v = 0.2833819267065800e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6460107179528248e+0
+    v = 0.2832858336906784e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6639541138154251e+0
+    v = 0.2833268235451244e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6790688515667495e+0
+    v = 0.2835432677029253e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6911338580371512e+0
+    v = 0.2839091722743049e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6999385956126490e+0
+    v = 0.2843308178875841e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7053037748656896e+0
+    v = 0.2846703550533846e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4732224387180115e-1
+    v = 0.1051193406971900e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1202100529326803e+0
+    v = 0.1657871838796974e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2034304820664855e+0
+    v = 0.2064648113714232e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2912285643573002e+0
+    v = 0.2347942745819741e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3802361792726768e+0
+    v = 0.2547775326597726e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4680598511056146e+0
+    v = 0.2686876684847025e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5528151052155599e+0
+    v = 0.2778665755515867e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6329386307803041e+0
+    v = 0.2830996616782929e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.8056516651369069e-1
+    b = 0.2363454684003124e-1
+    v = 0.1403063340168372e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1156476077139389e+0
+    b = 0.5191291632545936e-1
+    v = 0.1696504125939477e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1520473382760421e+0
+    b = 0.8322715736994519e-1
+    v = 0.1935787242745390e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1892986699745931e+0
+    b = 0.1165855667993712e+0
+    v = 0.2130614510521968e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2270194446777792e+0
+    b = 0.1513077167409504e+0
+    v = 0.2289381265931048e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2648908185093273e+0
+    b = 0.1868882025807859e+0
+    v = 0.2418630292816186e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3026389259574136e+0
+    b = 0.2229277629776224e+0
+    v = 0.2523400495631193e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3400220296151384e+0
+    b = 0.2590951840746235e+0
+    v = 0.2607623973449605e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3768217953335510e+0
+    b = 0.2951047291750847e+0
+    v = 0.2674441032689209e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4128372900921884e+0
+    b = 0.3307019714169930e+0
+    v = 0.2726432360343356e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4478807131815630e+0
+    b = 0.3656544101087634e+0
+    v = 0.2765787685924545e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4817742034089257e+0
+    b = 0.3997448951939695e+0
+    v = 0.2794428690642224e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5143472814653344e+0
+    b = 0.4327667110812024e+0
+    v = 0.2814099002062895e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5454346213905650e+0
+    b = 0.4645196123532293e+0
+    v = 0.2826429531578994e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5748739313170252e+0
+    b = 0.4948063555703345e+0
+    v = 0.2832983542550884e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1599598738286342e+0
+    b = 0.2792357590048985e-1
+    v = 0.1886695565284976e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1998097412500951e+0
+    b = 0.5877141038139065e-1
+    v = 0.2081867882748234e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2396228952566202e+0
+    b = 0.9164573914691377e-1
+    v = 0.2245148680600796e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2792228341097746e+0
+    b = 0.1259049641962687e+0
+    v = 0.2380370491511872e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3184251107546741e+0
+    b = 0.1610594823400863e+0
+    v = 0.2491398041852455e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3570481164426244e+0
+    b = 0.1967151653460898e+0
+    v = 0.2581632405881230e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3949164710492144e+0
+    b = 0.2325404606175168e+0
+    v = 0.2653965506227417e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4318617293970503e+0
+    b = 0.2682461141151439e+0
+    v = 0.2710857216747087e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4677221009931678e+0
+    b = 0.3035720116011973e+0
+    v = 0.2754434093903659e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5023417939270955e+0
+    b = 0.3382781859197439e+0
+    v = 0.2786579932519380e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5355701836636128e+0
+    b = 0.3721383065625942e+0
+    v = 0.2809011080679474e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5672608451328771e+0
+    b = 0.4049346360466055e+0
+    v = 0.2823336184560987e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5972704202540162e+0
+    b = 0.4364538098633802e+0
+    v = 0.2831101175806309e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2461687022333596e+0
+    b = 0.3070423166833368e-1
+    v = 0.2221679970354546e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2881774566286831e+0
+    b = 0.6338034669281885e-1
+    v = 0.2356185734270703e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3293963604116978e+0
+    b = 0.9742862487067941e-1
+    v = 0.2469228344805590e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3697303822241377e+0
+    b = 0.1323799532282290e+0
+    v = 0.2562726348642046e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4090663023135127e+0
+    b = 0.1678497018129336e+0
+    v = 0.2638756726753028e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4472819355411712e+0
+    b = 0.2035095105326114e+0
+    v = 0.2699311157390862e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4842513377231437e+0
+    b = 0.2390692566672091e+0
+    v = 0.2746233268403837e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5198477629962928e+0
+    b = 0.2742649818076149e+0
+    v = 0.2781225674454771e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5539453011883145e+0
+    b = 0.3088503806580094e+0
+    v = 0.2805881254045684e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5864196762401251e+0
+    b = 0.3425904245906614e+0
+    v = 0.2821719877004913e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6171484466668390e+0
+    b = 0.3752562294789468e+0
+    v = 0.2830222502333124e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3350337830565727e+0
+    b = 0.3261589934634747e-1
+    v = 0.2457995956744870e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3775773224758284e+0
+    b = 0.6658438928081572e-1
+    v = 0.2551474407503706e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4188155229848973e+0
+    b = 0.1014565797157954e+0
+    v = 0.2629065335195311e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4586805892009344e+0
+    b = 0.1368573320843822e+0
+    v = 0.2691900449925075e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4970895714224235e+0
+    b = 0.1724614851951608e+0
+    v = 0.2741275485754276e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5339505133960747e+0
+    b = 0.2079779381416412e+0
+    v = 0.2778530970122595e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5691665792531440e+0
+    b = 0.2431385788322288e+0
+    v = 0.2805010567646741e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6026387682680377e+0
+    b = 0.2776901883049853e+0
+    v = 0.2822055834031040e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6342676150163307e+0
+    b = 0.3113881356386632e+0
+    v = 0.2831016901243473e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4237951119537067e+0
+    b = 0.3394877848664351e-1
+    v = 0.2624474901131803e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4656918683234929e+0
+    b = 0.6880219556291447e-1
+    v = 0.2688034163039377e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5058857069185980e+0
+    b = 0.1041946859721635e+0
+    v = 0.2738932751287636e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5443204666713996e+0
+    b = 0.1398039738736393e+0
+    v = 0.2777944791242523e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5809298813759742e+0
+    b = 0.1753373381196155e+0
+    v = 0.2806011661660987e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6156416039447128e+0
+    b = 0.2105215793514010e+0
+    v = 0.2824181456597460e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6483801351066604e+0
+    b = 0.2450953312157051e+0
+    v = 0.2833585216577828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5103616577251688e+0
+    b = 0.3485560643800719e-1
+    v = 0.2738165236962878e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5506738792580681e+0
+    b = 0.7026308631512033e-1
+    v = 0.2778365208203180e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5889573040995292e+0
+    b = 0.1059035061296403e+0
+    v = 0.2807852940418966e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6251641589516930e+0
+    b = 0.1414823925236026e+0
+    v = 0.2827245949674705e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6592414921570178e+0
+    b = 0.1767207908214530e+0
+    v = 0.2837342344829828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5930314017533384e+0
+    b = 0.3542189339561672e-1
+    v = 0.2809233907610981e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6309812253390175e+0
+    b = 0.7109574040369549e-1
+    v = 0.2829930809742694e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6666296011353230e+0
+    b = 0.1067259792282730e+0
+    v = 0.2841097874111479e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6703715271049922e+0
+    b = 0.3569455268820809e-1
+    v = 0.2843455206008783e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_4334():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.1449063022537883e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2546377329828424e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1462896151831013e-1
+    v = 0.6018432961087496e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3769840812493139e-1
+    v = 0.1002286583263673e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6524701904096891e-1
+    v = 0.1315222931028093e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9560543416134648e-1
+    v = 0.1564213746876724e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1278335898929198e+0
+    v = 0.1765118841507736e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1613096104466031e+0
+    v = 0.1928737099311080e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1955806225745371e+0
+    v = 0.2062658534263270e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2302935218498028e+0
+    v = 0.2172395445953787e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2651584344113027e+0
+    v = 0.2262076188876047e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2999276825183209e+0
+    v = 0.2334885699462397e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3343828669718798e+0
+    v = 0.2393355273179203e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3683265013750518e+0
+    v = 0.2439559200468863e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4015763206518108e+0
+    v = 0.2475251866060002e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4339612026399770e+0
+    v = 0.2501965558158773e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4653180651114582e+0
+    v = 0.2521081407925925e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4954893331080803e+0
+    v = 0.2533881002388081e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5243207068924930e+0
+    v = 0.2541582900848261e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5516590479041704e+0
+    v = 0.2545365737525860e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6012371927804176e+0
+    v = 0.2545726993066799e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6231574466449819e+0
+    v = 0.2544456197465555e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6429416514181271e+0
+    v = 0.2543481596881064e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6604124272943595e+0
+    v = 0.2543506451429194e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6753851470408250e+0
+    v = 0.2544905675493763e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6876717970626160e+0
+    v = 0.2547611407344429e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6970895061319234e+0
+    v = 0.2551060375448869e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7034746912553310e+0
+    v = 0.2554291933816039e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7067017217542295e+0
+    v = 0.2556255710686343e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4382223501131123e-1
+    v = 0.9041339695118195e-4
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1117474077400006e+0
+    v = 0.1438426330079022e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1897153252911440e+0
+    v = 0.1802523089820518e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2724023009910331e+0
+    v = 0.2060052290565496e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3567163308709902e+0
+    v = 0.2245002248967466e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4404784483028087e+0
+    v = 0.2377059847731150e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5219833154161411e+0
+    v = 0.2468118955882525e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5998179868977553e+0
+    v = 0.2525410872966528e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6727803154548222e+0
+    v = 0.2553101409933397e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.7476563943166086e-1
+    b = 0.2193168509461185e-1
+    v = 0.1212879733668632e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1075341482001416e+0
+    b = 0.4826419281533887e-1
+    v = 0.1472872881270931e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1416344885203259e+0
+    b = 0.7751191883575742e-1
+    v = 0.1686846601010828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1766325315388586e+0
+    b = 0.1087558139247680e+0
+    v = 0.1862698414660208e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2121744174481514e+0
+    b = 0.1413661374253096e+0
+    v = 0.2007430956991861e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2479669443408145e+0
+    b = 0.1748768214258880e+0
+    v = 0.2126568125394796e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2837600452294113e+0
+    b = 0.2089216406612073e+0
+    v = 0.2224394603372113e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3193344933193984e+0
+    b = 0.2431987685545972e+0
+    v = 0.2304264522673135e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3544935442438745e+0
+    b = 0.2774497054377770e+0
+    v = 0.2368854288424087e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3890571932288154e+0
+    b = 0.3114460356156915e+0
+    v = 0.2420352089461772e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4228581214259090e+0
+    b = 0.3449806851913012e+0
+    v = 0.2460597113081295e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4557387211304052e+0
+    b = 0.3778618641248256e+0
+    v = 0.2491181912257687e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4875487950541643e+0
+    b = 0.4099086391698978e+0
+    v = 0.2513528194205857e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5181436529962997e+0
+    b = 0.4409474925853973e+0
+    v = 0.2528943096693220e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5473824095600661e+0
+    b = 0.4708094517711291e+0
+    v = 0.2538660368488136e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5751263398976174e+0
+    b = 0.4993275140354637e+0
+    v = 0.2543868648299022e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1489515746840028e+0
+    b = 0.2599381993267017e-1
+    v = 0.1642595537825183e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1863656444351767e+0
+    b = 0.5479286532462190e-1
+    v = 0.1818246659849308e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2238602880356348e+0
+    b = 0.8556763251425254e-1
+    v = 0.1966565649492420e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2612723375728160e+0
+    b = 0.1177257802267011e+0
+    v = 0.2090677905657991e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2984332990206190e+0
+    b = 0.1508168456192700e+0
+    v = 0.2193820409510504e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3351786584663333e+0
+    b = 0.1844801892177727e+0
+    v = 0.2278870827661928e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3713505522209120e+0
+    b = 0.2184145236087598e+0
+    v = 0.2348283192282090e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4067981098954663e+0
+    b = 0.2523590641486229e+0
+    v = 0.2404139755581477e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4413769993687534e+0
+    b = 0.2860812976901373e+0
+    v = 0.2448227407760734e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4749487182516394e+0
+    b = 0.3193686757808996e+0
+    v = 0.2482110455592573e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5073798105075426e+0
+    b = 0.3520226949547602e+0
+    v = 0.2507192397774103e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5385410448878654e+0
+    b = 0.3838544395667890e+0
+    v = 0.2524765968534880e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5683065353670530e+0
+    b = 0.4146810037640963e+0
+    v = 0.2536052388539425e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5965527620663510e+0
+    b = 0.4443224094681121e+0
+    v = 0.2542230588033068e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2299227700856157e+0
+    b = 0.2865757664057584e-1
+    v = 0.1944817013047896e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2695752998553267e+0
+    b = 0.5923421684485993e-1
+    v = 0.2067862362746635e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3086178716611389e+0
+    b = 0.9117817776057715e-1
+    v = 0.2172440734649114e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3469649871659077e+0
+    b = 0.1240593814082605e+0
+    v = 0.2260125991723423e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3845153566319655e+0
+    b = 0.1575272058259175e+0
+    v = 0.2332655008689523e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4211600033403215e+0
+    b = 0.1912845163525413e+0
+    v = 0.2391699681532458e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4567867834329882e+0
+    b = 0.2250710177858171e+0
+    v = 0.2438801528273928e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4912829319232061e+0
+    b = 0.2586521303440910e+0
+    v = 0.2475370504260665e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5245364793303812e+0
+    b = 0.2918112242865407e+0
+    v = 0.2502707235640574e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5564369788915756e+0
+    b = 0.3243439239067890e+0
+    v = 0.2522031701054241e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5868757697775287e+0
+    b = 0.3560536787835351e+0
+    v = 0.2534511269978784e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6157458853519617e+0
+    b = 0.3867480821242581e+0
+    v = 0.2541284914955151e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3138461110672113e+0
+    b = 0.3051374637507278e-1
+    v = 0.2161509250688394e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3542495872050569e+0
+    b = 0.6237111233730755e-1
+    v = 0.2248778513437852e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3935751553120181e+0
+    b = 0.9516223952401907e-1
+    v = 0.2322388803404617e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4317634668111147e+0
+    b = 0.1285467341508517e+0
+    v = 0.2383265471001355e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4687413842250821e+0
+    b = 0.1622318931656033e+0
+    v = 0.2432476675019525e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5044274237060283e+0
+    b = 0.1959581153836453e+0
+    v = 0.2471122223750674e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5387354077925727e+0
+    b = 0.2294888081183837e+0
+    v = 0.2500291752486870e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5715768898356105e+0
+    b = 0.2626031152713945e+0
+    v = 0.2521055942764682e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6028627200136111e+0
+    b = 0.2950904075286713e+0
+    v = 0.2534472785575503e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6325039812653463e+0
+    b = 0.3267458451113286e+0
+    v = 0.2541599713080121e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3981986708423407e+0
+    b = 0.3183291458749821e-1
+    v = 0.2317380975862936e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4382791182133300e+0
+    b = 0.6459548193880908e-1
+    v = 0.2378550733719775e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4769233057218166e+0
+    b = 0.9795757037087952e-1
+    v = 0.2428884456739118e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5140823911194238e+0
+    b = 0.1316307235126655e+0
+    v = 0.2469002655757292e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5496977833862983e+0
+    b = 0.1653556486358704e+0
+    v = 0.2499657574265851e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5837047306512727e+0
+    b = 0.1988931724126510e+0
+    v = 0.2521676168486082e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6160349566926879e+0
+    b = 0.2320174581438950e+0
+    v = 0.2535935662645334e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6466185353209440e+0
+    b = 0.2645106562168662e+0
+    v = 0.2543356743363214e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4810835158795404e+0
+    b = 0.3275917807743992e-1
+    v = 0.2427353285201535e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5199925041324341e+0
+    b = 0.6612546183967181e-1
+    v = 0.2468258039744386e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5571717692207494e+0
+    b = 0.9981498331474143e-1
+    v = 0.2500060956440310e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5925789250836378e+0
+    b = 0.1335687001410374e+0
+    v = 0.2523238365420979e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6261658523859670e+0
+    b = 0.1671444402896463e+0
+    v = 0.2538399260252846e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6578811126669331e+0
+    b = 0.2003106382156076e+0
+    v = 0.2546255927268069e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5609624612998100e+0
+    b = 0.3337500940231335e-1
+    v = 0.2500583360048449e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5979959659984670e+0
+    b = 0.6708750335901803e-1
+    v = 0.2524777638260203e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6330523711054002e+0
+    b = 0.1008792126424850e+0
+    v = 0.2540951193860656e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6660960998103972e+0
+    b = 0.1345050343171794e+0
+    v = 0.2549524085027472e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6365384364585819e+0
+    b = 0.3372799460737052e-1
+    v = 0.2542569507009158e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6710994302899275e+0
+    b = 0.6755249309678028e-1
+    v = 0.2552114127580376e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_4802():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.9687521879420705e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2307897895367918e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.2297310852498558e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2335728608887064e-1
+    v = 0.7386265944001919e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4352987836550653e-1
+    v = 0.8257977698542210e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6439200521088801e-1
+    v = 0.9706044762057630e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.9003943631993181e-1
+    v = 0.1302393847117003e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1196706615548473e+0
+    v = 0.1541957004600968e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1511715412838134e+0
+    v = 0.1704459770092199e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1835982828503801e+0
+    v = 0.1827374890942906e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2165081259155405e+0
+    v = 0.1926360817436107e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2496208720417563e+0
+    v = 0.2008010239494833e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2827200673567900e+0
+    v = 0.2075635983209175e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3156190823994346e+0
+    v = 0.2131306638690909e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3481476793749115e+0
+    v = 0.2176562329937335e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3801466086947226e+0
+    v = 0.2212682262991018e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4114652119634011e+0
+    v = 0.2240799515668565e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4419598786519751e+0
+    v = 0.2261959816187525e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4714925949329543e+0
+    v = 0.2277156368808855e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4999293972879466e+0
+    v = 0.2287351772128336e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5271387221431248e+0
+    v = 0.2293490814084085e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5529896780837761e+0
+    v = 0.2296505312376273e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6000856099481712e+0
+    v = 0.2296793832318756e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6210562192785175e+0
+    v = 0.2295785443842974e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6401165879934240e+0
+    v = 0.2295017931529102e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6571144029244334e+0
+    v = 0.2295059638184868e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6718910821718863e+0
+    v = 0.2296232343237362e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6842845591099010e+0
+    v = 0.2298530178740771e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6941353476269816e+0
+    v = 0.2301579790280501e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7012965242212991e+0
+    v = 0.2304690404996513e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7056471428242644e+0
+    v = 0.2307027995907102e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4595557643585895e-1
+    v = 0.9312274696671092e-4
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1049316742435023e+0
+    v = 0.1199919385876926e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1773548879549274e+0
+    v = 0.1598039138877690e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2559071411236127e+0
+    v = 0.1822253763574900e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3358156837985898e+0
+    v = 0.1988579593655040e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4155835743763893e+0
+    v = 0.2112620102533307e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4937894296167472e+0
+    v = 0.2201594887699007e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5691569694793316e+0
+    v = 0.2261622590895036e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6405840854894251e+0
+    v = 0.2296458453435705e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.7345133894143348e-1
+    b = 0.2177844081486067e-1
+    v = 0.1006006990267000e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1009859834044931e+0
+    b = 0.4590362185775188e-1
+    v = 0.1227676689635876e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1324289619748758e+0
+    b = 0.7255063095690877e-1
+    v = 0.1467864280270117e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1654272109607127e+0
+    b = 0.1017825451960684e+0
+    v = 0.1644178912101232e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1990767186776461e+0
+    b = 0.1325652320980364e+0
+    v = 0.1777664890718961e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2330125945523278e+0
+    b = 0.1642765374496765e+0
+    v = 0.1884825664516690e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2670080611108287e+0
+    b = 0.1965360374337889e+0
+    v = 0.1973269246453848e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3008753376294316e+0
+    b = 0.2290726770542238e+0
+    v = 0.2046767775855328e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3344475596167860e+0
+    b = 0.2616645495370823e+0
+    v = 0.2107600125918040e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3675709724070786e+0
+    b = 0.2941150728843141e+0
+    v = 0.2157416362266829e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4001000887587812e+0
+    b = 0.3262440400919066e+0
+    v = 0.2197557816920721e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4318956350436028e+0
+    b = 0.3578835350611916e+0
+    v = 0.2229192611835437e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4628239056795531e+0
+    b = 0.3888751854043678e+0
+    v = 0.2253385110212775e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4927563229773636e+0
+    b = 0.4190678003222840e+0
+    v = 0.2271137107548774e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5215687136707969e+0
+    b = 0.4483151836883852e+0
+    v = 0.2283414092917525e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5491402346984905e+0
+    b = 0.4764740676087880e+0
+    v = 0.2291161673130077e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5753520160126075e+0
+    b = 0.5034021310998277e+0
+    v = 0.2295313908576598e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1388326356417754e+0
+    b = 0.2435436510372806e-1
+    v = 0.1438204721359031e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1743686900537244e+0
+    b = 0.5118897057342652e-1
+    v = 0.1607738025495257e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2099737037950268e+0
+    b = 0.8014695048539634e-1
+    v = 0.1741483853528379e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2454492590908548e+0
+    b = 0.1105117874155699e+0
+    v = 0.1851918467519151e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2807219257864278e+0
+    b = 0.1417950531570966e+0
+    v = 0.1944628638070613e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3156842271975842e+0
+    b = 0.1736604945719597e+0
+    v = 0.2022495446275152e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3502090945177752e+0
+    b = 0.2058466324693981e+0
+    v = 0.2087462382438514e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3841684849519686e+0
+    b = 0.2381284261195919e+0
+    v = 0.2141074754818308e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4174372367906016e+0
+    b = 0.2703031270422569e+0
+    v = 0.2184640913748162e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4498926465011892e+0
+    b = 0.3021845683091309e+0
+    v = 0.2219309165220329e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4814146229807701e+0
+    b = 0.3335993355165720e+0
+    v = 0.2246123118340624e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5118863625734701e+0
+    b = 0.3643833735518232e+0
+    v = 0.2266062766915125e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5411947455119144e+0
+    b = 0.3943789541958179e+0
+    v = 0.2280072952230796e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5692301500357246e+0
+    b = 0.4234320144403542e+0
+    v = 0.2289082025202583e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5958857204139576e+0
+    b = 0.4513897947419260e+0
+    v = 0.2294012695120025e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2156270284785766e+0
+    b = 0.2681225755444491e-1
+    v = 0.1722434488736947e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2532385054909710e+0
+    b = 0.5557495747805614e-1
+    v = 0.1830237421455091e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2902564617771537e+0
+    b = 0.8569368062950249e-1
+    v = 0.1923855349997633e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3266979823143256e+0
+    b = 0.1167367450324135e+0
+    v = 0.2004067861936271e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3625039627493614e+0
+    b = 0.1483861994003304e+0
+    v = 0.2071817297354263e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3975838937548699e+0
+    b = 0.1803821503011405e+0
+    v = 0.2128250834102103e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4318396099009774e+0
+    b = 0.2124962965666424e+0
+    v = 0.2174513719440102e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4651706555732742e+0
+    b = 0.2445221837805913e+0
+    v = 0.2211661839150214e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4974752649620969e+0
+    b = 0.2762701224322987e+0
+    v = 0.2240665257813102e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5286517579627517e+0
+    b = 0.3075627775211328e+0
+    v = 0.2262439516632620e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5586001195731895e+0
+    b = 0.3382311089826877e+0
+    v = 0.2277874557231869e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5872229902021319e+0
+    b = 0.3681108834741399e+0
+    v = 0.2287854314454994e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6144258616235123e+0
+    b = 0.3970397446872839e+0
+    v = 0.2293268499615575e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2951676508064861e+0
+    b = 0.2867499538750441e-1
+    v = 0.1912628201529828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3335085485472725e+0
+    b = 0.5867879341903510e-1
+    v = 0.1992499672238701e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3709561760636381e+0
+    b = 0.8961099205022284e-1
+    v = 0.2061275533454027e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4074722861667498e+0
+    b = 0.1211627927626297e+0
+    v = 0.2119318215968572e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4429923648839117e+0
+    b = 0.1530748903554898e+0
+    v = 0.2167416581882652e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4774428052721736e+0
+    b = 0.1851176436721877e+0
+    v = 0.2206430730516600e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5107446539535904e+0
+    b = 0.2170829107658179e+0
+    v = 0.2237186938699523e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5428151370542935e+0
+    b = 0.2487786689026271e+0
+    v = 0.2260480075032884e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5735699292556964e+0
+    b = 0.2800239952795016e+0
+    v = 0.2277098884558542e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6029253794562866e+0
+    b = 0.3106445702878119e+0
+    v = 0.2287845715109671e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6307998987073145e+0
+    b = 0.3404689500841194e+0
+    v = 0.2293547268236294e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3752652273692719e+0
+    b = 0.2997145098184479e-1
+    v = 0.2056073839852528e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4135383879344028e+0
+    b = 0.6086725898678011e-1
+    v = 0.2114235865831876e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4506113885153907e+0
+    b = 0.9238849548435643e-1
+    v = 0.2163175629770551e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4864401554606072e+0
+    b = 0.1242786603851851e+0
+    v = 0.2203392158111650e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5209708076611709e+0
+    b = 0.1563086731483386e+0
+    v = 0.2235473176847839e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5541422135830122e+0
+    b = 0.1882696509388506e+0
+    v = 0.2260024141501235e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5858880915113817e+0
+    b = 0.2199672979126059e+0
+    v = 0.2277675929329182e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6161399390603444e+0
+    b = 0.2512165482924867e+0
+    v = 0.2289102112284834e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6448296482255090e+0
+    b = 0.2818368701871888e+0
+    v = 0.2295027954625118e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4544796274917948e+0
+    b = 0.3088970405060312e-1
+    v = 0.2161281589879992e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4919389072146628e+0
+    b = 0.6240947677636835e-1
+    v = 0.2201980477395102e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5279313026985183e+0
+    b = 0.9430706144280313e-1
+    v = 0.2234952066593166e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5624169925571135e+0
+    b = 0.1263547818770374e+0
+    v = 0.2260540098520838e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5953484627093287e+0
+    b = 0.1583430788822594e+0
+    v = 0.2279157981899988e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6266730715339185e+0
+    b = 0.1900748462555988e+0
+    v = 0.2291296918565571e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6563363204278871e+0
+    b = 0.2213599519592567e+0
+    v = 0.2297533752536649e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5314574716585696e+0
+    b = 0.3152508811515374e-1
+    v = 0.2234927356465995e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5674614932298185e+0
+    b = 0.6343865291465561e-1
+    v = 0.2261288012985219e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6017706004970264e+0
+    b = 0.9551503504223951e-1
+    v = 0.2280818160923688e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6343471270264178e+0
+    b = 0.1275440099801196e+0
+    v = 0.2293773295180159e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6651494599127802e+0
+    b = 0.1593252037671960e+0
+    v = 0.2300528767338634e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6050184986005704e+0
+    b = 0.3192538338496105e-1
+    v = 0.2281893855065666e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6390163550880400e+0
+    b = 0.6402824353962306e-1
+    v = 0.2295720444840727e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6711199107088448e+0
+    b = 0.9609805077002909e-1
+    v = 0.2303227649026753e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6741354429572275e+0
+    b = 0.3211853196273233e-1
+    v = 0.2304831913227114e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_5294():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.9080510764308163e-4
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.2084824361987793e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.2303261686261450e-1
+    v = 0.5011105657239616e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3757208620162394e-1
+    v = 0.5942520409683854e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5821912033821852e-1
+    v = 0.9564394826109721e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.8403127529194872e-1
+    v = 0.1185530657126338e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1122927798060578e+0
+    v = 0.1364510114230331e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1420125319192987e+0
+    v = 0.1505828825605415e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1726396437341978e+0
+    v = 0.1619298749867023e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2038170058115696e+0
+    v = 0.1712450504267789e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2352849892876508e+0
+    v = 0.1789891098164999e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2668363354312461e+0
+    v = 0.1854474955629795e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2982941279900452e+0
+    v = 0.1908148636673661e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3295002922087076e+0
+    v = 0.1952377405281833e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3603094918363593e+0
+    v = 0.1988349254282232e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3905857895173920e+0
+    v = 0.2017079807160050e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4202005758160837e+0
+    v = 0.2039473082709094e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4490310061597227e+0
+    v = 0.2056360279288953e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4769586160311491e+0
+    v = 0.2068525823066865e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5038679887049750e+0
+    v = 0.2076724877534488e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5296454286519961e+0
+    v = 0.2081694278237885e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5541776207164850e+0
+    v = 0.2084157631219326e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5990467321921213e+0
+    v = 0.2084381531128593e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6191467096294587e+0
+    v = 0.2083476277129307e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6375251212901849e+0
+    v = 0.2082686194459732e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6540514381131168e+0
+    v = 0.2082475686112415e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6685899064391510e+0
+    v = 0.2083139860289915e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6810013009681648e+0
+    v = 0.2084745561831237e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6911469578730340e+0
+    v = 0.2087091313375890e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6988956915141736e+0
+    v = 0.2089718413297697e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7041335794868720e+0
+    v = 0.2092003303479793e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7067754398018567e+0
+    v = 0.2093336148263241e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3840368707853623e-1
+    v = 0.7591708117365267e-4
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9835485954117399e-1
+    v = 0.1083383968169186e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1665774947612998e+0
+    v = 0.1403019395292510e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2405702335362910e+0
+    v = 0.1615970179286436e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3165270770189046e+0
+    v = 0.1771144187504911e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3927386145645443e+0
+    v = 0.1887760022988168e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4678825918374656e+0
+    v = 0.1973474670768214e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5408022024266935e+0
+    v = 0.2033787661234659e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6104967445752438e+0
+    v = 0.2072343626517331e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6760910702685738e+0
+    v = 0.2091177834226918e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6655644120217392e-1
+    b = 0.1936508874588424e-1
+    v = 0.9316684484675566e-4
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.9446246161270182e-1
+    b = 0.4252442002115869e-1
+    v = 0.1116193688682976e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1242651925452509e+0
+    b = 0.6806529315354374e-1
+    v = 0.1298623551559414e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1553438064846751e+0
+    b = 0.9560957491205369e-1
+    v = 0.1450236832456426e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1871137110542670e+0
+    b = 0.1245931657452888e+0
+    v = 0.1572719958149914e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2192612628836257e+0
+    b = 0.1545385828778978e+0
+    v = 0.1673234785867195e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2515682807206955e+0
+    b = 0.1851004249723368e+0
+    v = 0.1756860118725188e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2838535866287290e+0
+    b = 0.2160182608272384e+0
+    v = 0.1826776290439367e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3159578817528521e+0
+    b = 0.2470799012277111e+0
+    v = 0.1885116347992865e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3477370882791392e+0
+    b = 0.2781014208986402e+0
+    v = 0.1933457860170574e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3790576960890540e+0
+    b = 0.3089172523515731e+0
+    v = 0.1973060671902064e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4097938317810200e+0
+    b = 0.3393750055472244e+0
+    v = 0.2004987099616311e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4398256572859637e+0
+    b = 0.3693322470987730e+0
+    v = 0.2030170909281499e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4690384114718480e+0
+    b = 0.3986541005609877e+0
+    v = 0.2049461460119080e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4973216048301053e+0
+    b = 0.4272112491408562e+0
+    v = 0.2063653565200186e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5245681526132446e+0
+    b = 0.4548781735309936e+0
+    v = 0.2073507927381027e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5506733911803888e+0
+    b = 0.4815315355023251e+0
+    v = 0.2079764593256122e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5755339829522475e+0
+    b = 0.5070486445801855e+0
+    v = 0.2083150534968778e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1305472386056362e+0
+    b = 0.2284970375722366e-1
+    v = 0.1262715121590664e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1637327908216477e+0
+    b = 0.4812254338288384e-1
+    v = 0.1414386128545972e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1972734634149637e+0
+    b = 0.7531734457511935e-1
+    v = 0.1538740401313898e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2308694653110130e+0
+    b = 0.1039043639882017e+0
+    v = 0.1642434942331432e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2643899218338160e+0
+    b = 0.1334526587117626e+0
+    v = 0.1729790609237496e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2977171599622171e+0
+    b = 0.1636414868936382e+0
+    v = 0.1803505190260828e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3307293903032310e+0
+    b = 0.1942195406166568e+0
+    v = 0.1865475350079657e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3633069198219073e+0
+    b = 0.2249752879943753e+0
+    v = 0.1917182669679069e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3953346955922727e+0
+    b = 0.2557218821820032e+0
+    v = 0.1959851709034382e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4267018394184914e+0
+    b = 0.2862897925213193e+0
+    v = 0.1994529548117882e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4573009622571704e+0
+    b = 0.3165224536636518e+0
+    v = 0.2022138911146548e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4870279559856109e+0
+    b = 0.3462730221636496e+0
+    v = 0.2043518024208592e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5157819581450322e+0
+    b = 0.3754016870282835e+0
+    v = 0.2059450313018110e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5434651666465393e+0
+    b = 0.4037733784993613e+0
+    v = 0.2070685715318472e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5699823887764627e+0
+    b = 0.4312557784139123e+0
+    v = 0.2077955310694373e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5952403350947741e+0
+    b = 0.4577175367122110e+0
+    v = 0.2081980387824712e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2025152599210369e+0
+    b = 0.2520253617719557e-1
+    v = 0.1521318610377956e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2381066653274425e+0
+    b = 0.5223254506119000e-1
+    v = 0.1622772720185755e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2732823383651612e+0
+    b = 0.8060669688588620e-1
+    v = 0.1710498139420709e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3080137692611118e+0
+    b = 0.1099335754081255e+0
+    v = 0.1785911149448736e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3422405614587601e+0
+    b = 0.1399120955959857e+0
+    v = 0.1850125313687736e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3758808773890420e+0
+    b = 0.1702977801651705e+0
+    v = 0.1904229703933298e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4088458383438932e+0
+    b = 0.2008799256601680e+0
+    v = 0.1949259956121987e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4410450550841152e+0
+    b = 0.2314703052180836e+0
+    v = 0.1986161545363960e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4723879420561312e+0
+    b = 0.2618972111375892e+0
+    v = 0.2015790585641370e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5027843561874343e+0
+    b = 0.2920013195600270e+0
+    v = 0.2038934198707418e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5321453674452458e+0
+    b = 0.3216322555190551e+0
+    v = 0.2056334060538251e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5603839113834030e+0
+    b = 0.3506456615934198e+0
+    v = 0.2068705959462289e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5874150706875146e+0
+    b = 0.3789007181306267e+0
+    v = 0.2076753906106002e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6131559381660038e+0
+    b = 0.4062580170572782e+0
+    v = 0.2081179391734803e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2778497016394506e+0
+    b = 0.2696271276876226e-1
+    v = 0.1700345216228943e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3143733562261912e+0
+    b = 0.5523469316960465e-1
+    v = 0.1774906779990410e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3501485810261827e+0
+    b = 0.8445193201626464e-1
+    v = 0.1839659377002642e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3851430322303653e+0
+    b = 0.1143263119336083e+0
+    v = 0.1894987462975169e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4193013979470415e+0
+    b = 0.1446177898344475e+0
+    v = 0.1941548809452595e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4525585960458567e+0
+    b = 0.1751165438438091e+0
+    v = 0.1980078427252384e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4848447779622947e+0
+    b = 0.2056338306745660e+0
+    v = 0.2011296284744488e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5160871208276894e+0
+    b = 0.2359965487229226e+0
+    v = 0.2035888456966776e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5462112185696926e+0
+    b = 0.2660430223139146e+0
+    v = 0.2054516325352142e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5751425068101757e+0
+    b = 0.2956193664498032e+0
+    v = 0.2067831033092635e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6028073872853596e+0
+    b = 0.3245763905312779e+0
+    v = 0.2076485320284876e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6291338275278409e+0
+    b = 0.3527670026206972e+0
+    v = 0.2081141439525255e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3541797528439391e+0
+    b = 0.2823853479435550e-1
+    v = 0.1834383015469222e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3908234972074657e+0
+    b = 0.5741296374713106e-1
+    v = 0.1889540591777677e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4264408450107590e+0
+    b = 0.8724646633650199e-1
+    v = 0.1936677023597375e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4609949666553286e+0
+    b = 0.1175034422915616e+0
+    v = 0.1976176495066504e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4944389496536006e+0
+    b = 0.1479755652628428e+0
+    v = 0.2008536004560983e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5267194884346086e+0
+    b = 0.1784740659484352e+0
+    v = 0.2034280351712291e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5577787810220990e+0
+    b = 0.2088245700431244e+0
+    v = 0.2053944466027758e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5875563763536670e+0
+    b = 0.2388628136570763e+0
+    v = 0.2068077642882360e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6159910016391269e+0
+    b = 0.2684308928769185e+0
+    v = 0.2077250949661599e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6430219602956268e+0
+    b = 0.2973740761960252e+0
+    v = 0.2082062440705320e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4300647036213646e+0
+    b = 0.2916399920493977e-1
+    v = 0.1934374486546626e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4661486308935531e+0
+    b = 0.5898803024755659e-1
+    v = 0.1974107010484300e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5009658555287261e+0
+    b = 0.8924162698525409e-1
+    v = 0.2007129290388658e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5344824270447704e+0
+    b = 0.1197185199637321e+0
+    v = 0.2033736947471293e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5666575997416371e+0
+    b = 0.1502300756161382e+0
+    v = 0.2054287125902493e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5974457471404752e+0
+    b = 0.1806004191913564e+0
+    v = 0.2069184936818894e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6267984444116886e+0
+    b = 0.2106621764786252e+0
+    v = 0.2078883689808782e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6546664713575417e+0
+    b = 0.2402526932671914e+0
+    v = 0.2083886366116359e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5042711004437253e+0
+    b = 0.2982529203607657e-1
+    v = 0.2006593275470817e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5392127456774380e+0
+    b = 0.6008728062339922e-1
+    v = 0.2033728426135397e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5726819437668618e+0
+    b = 0.9058227674571398e-1
+    v = 0.2055008781377608e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6046469254207278e+0
+    b = 0.1211219235803400e+0
+    v = 0.2070651783518502e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6350716157434952e+0
+    b = 0.1515286404791580e+0
+    v = 0.2080953335094320e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6639177679185454e+0
+    b = 0.1816314681255552e+0
+    v = 0.2086284998988521e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5757276040972253e+0
+    b = 0.3026991752575440e-1
+    v = 0.2055549387644668e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6090265823139755e+0
+    b = 0.6078402297870770e-1
+    v = 0.2071871850267654e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6406735344387661e+0
+    b = 0.9135459984176636e-1
+    v = 0.2082856600431965e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6706397927793709e+0
+    b = 0.1218024155966590e+0
+    v = 0.2088705858819358e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6435019674426665e+0
+    b = 0.3052608357660639e-1
+    v = 0.2083995867536322e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6747218676375681e+0
+    b = 0.6112185773983089e-1
+    v = 0.2090509712889637e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+
+def MakeAngularGrid_5810():
+    grids = []
+    a = 0
+    b = 0
+    v = 0.9735347946175486e-5
+    grids.append(SphGenOh(0, a, b, v))
+    v = 0.1907581241803167e-3
+    grids.append(SphGenOh(1, a, b, v))
+    v = 0.1901059546737578e-3
+    grids.append(SphGenOh(2, a, b, v))
+    a = 0.1182361662400277e-1
+    v = 0.3926424538919212e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3062145009138958e-1
+    v = 0.6667905467294382e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5329794036834243e-1
+    v = 0.8868891315019135e-4
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7848165532862220e-1
+    v = 0.1066306000958872e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1054038157636201e+0
+    v = 0.1214506743336128e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1335577797766211e+0
+    v = 0.1338054681640871e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1625769955502252e+0
+    v = 0.1441677023628504e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.1921787193412792e+0
+    v = 0.1528880200826557e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2221340534690548e+0
+    v = 0.1602330623773609e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2522504912791132e+0
+    v = 0.1664102653445244e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.2823610860679697e+0
+    v = 0.1715845854011323e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3123173966267560e+0
+    v = 0.1758901000133069e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3419847036953789e+0
+    v = 0.1794382485256736e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3712386456999758e+0
+    v = 0.1823238106757407e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3999627649876828e+0
+    v = 0.1846293252959976e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4280466458648093e+0
+    v = 0.1864284079323098e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4553844360185711e+0
+    v = 0.1877882694626914e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.4818736094437834e+0
+    v = 0.1887716321852025e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5074138709260629e+0
+    v = 0.1894381638175673e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5319061304570707e+0
+    v = 0.1898454899533629e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5552514978677286e+0
+    v = 0.1900497929577815e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.5981009025246183e+0
+    v = 0.1900671501924092e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6173990192228116e+0
+    v = 0.1899837555533510e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6351365239411131e+0
+    v = 0.1899014113156229e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6512010228227200e+0
+    v = 0.1898581257705106e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6654758363948120e+0
+    v = 0.1898804756095753e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6778410414853370e+0
+    v = 0.1899793610426402e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6881760887484110e+0
+    v = 0.1901464554844117e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.6963645267094598e+0
+    v = 0.1903533246259542e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7023010617153579e+0
+    v = 0.1905556158463228e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.7059004636628753e+0
+    v = 0.1907037155663528e-3
+    grids.append(SphGenOh(3, a, b, v))
+    a = 0.3552470312472575e-1
+    v = 0.5992997844249967e-4
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.9151176620841283e-1
+    v = 0.9749059382456978e-4
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.1566197930068980e+0
+    v = 0.1241680804599158e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2265467599271907e+0
+    v = 0.1437626154299360e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.2988242318581361e+0
+    v = 0.1584200054793902e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.3717482419703886e+0
+    v = 0.1694436550982744e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.4440094491758889e+0
+    v = 0.1776617014018108e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5145337096756642e+0
+    v = 0.1836132434440077e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.5824053672860230e+0
+    v = 0.1876494727075983e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6468283961043370e+0
+    v = 0.1899906535336482e-3
+    grids.append(SphGenOh(4, a, b, v))
+    a = 0.6095964259104373e-1
+    b = 0.1787828275342931e-1
+    v = 0.8143252820767350e-4
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.8811962270959388e-1
+    b = 0.3953888740792096e-1
+    v = 0.9998859890887728e-4
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1165936722428831e+0
+    b = 0.6378121797722990e-1
+    v = 0.1156199403068359e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1460232857031785e+0
+    b = 0.8985890813745037e-1
+    v = 0.1287632092635513e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1761197110181755e+0
+    b = 0.1172606510576162e+0
+    v = 0.1398378643365139e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2066471190463718e+0
+    b = 0.1456102876970995e+0
+    v = 0.1491876468417391e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2374076026328152e+0
+    b = 0.1746153823011775e+0
+    v = 0.1570855679175456e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2682305474337051e+0
+    b = 0.2040383070295584e+0
+    v = 0.1637483948103775e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2989653312142369e+0
+    b = 0.2336788634003698e+0
+    v = 0.1693500566632843e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3294762752772209e+0
+    b = 0.2633632752654219e+0
+    v = 0.1740322769393633e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3596390887276086e+0
+    b = 0.2929369098051601e+0
+    v = 0.1779126637278296e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3893383046398812e+0
+    b = 0.3222592785275512e+0
+    v = 0.1810908108835412e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4184653789358347e+0
+    b = 0.3512004791195743e+0
+    v = 0.1836529132600190e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4469172319076166e+0
+    b = 0.3796385677684537e+0
+    v = 0.1856752841777379e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4745950813276976e+0
+    b = 0.4074575378263879e+0
+    v = 0.1872270566606832e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5014034601410262e+0
+    b = 0.4345456906027828e+0
+    v = 0.1883722645591307e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5272493404551239e+0
+    b = 0.4607942515205134e+0
+    v = 0.1891714324525297e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5520413051846366e+0
+    b = 0.4860961284181720e+0
+    v = 0.1896827480450146e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5756887237503077e+0
+    b = 0.5103447395342790e+0
+    v = 0.1899628417059528e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1225039430588352e+0
+    b = 0.2136455922655793e-1
+    v = 0.1123301829001669e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1539113217321372e+0
+    b = 0.4520926166137188e-1
+    v = 0.1253698826711277e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1856213098637712e+0
+    b = 0.7086468177864818e-1
+    v = 0.1366266117678531e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2174998728035131e+0
+    b = 0.9785239488772918e-1
+    v = 0.1462736856106918e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2494128336938330e+0
+    b = 0.1258106396267210e+0
+    v = 0.1545076466685412e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2812321562143480e+0
+    b = 0.1544529125047001e+0
+    v = 0.1615096280814007e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3128372276456111e+0
+    b = 0.1835433512202753e+0
+    v = 0.1674366639741759e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3441145160177973e+0
+    b = 0.2128813258619585e+0
+    v = 0.1724225002437900e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3749567714853510e+0
+    b = 0.2422913734880829e+0
+    v = 0.1765810822987288e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4052621732015610e+0
+    b = 0.2716163748391453e+0
+    v = 0.1800104126010751e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4349335453522385e+0
+    b = 0.3007127671240280e+0
+    v = 0.1827960437331284e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4638776641524965e+0
+    b = 0.3294470677216479e+0
+    v = 0.1850140300716308e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4920046410462687e+0
+    b = 0.3576932543699155e+0
+    v = 0.1867333507394938e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5192273554861704e+0
+    b = 0.3853307059757764e+0
+    v = 0.1880178688638289e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5454609081136522e+0
+    b = 0.4122425044452694e+0
+    v = 0.1889278925654758e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5706220661424140e+0
+    b = 0.4383139587781027e+0
+    v = 0.1895213832507346e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5946286755181518e+0
+    b = 0.4634312536300553e+0
+    v = 0.1898548277397420e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.1905370790924295e+0
+    b = 0.2371311537781979e-1
+    v = 0.1349105935937341e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2242518717748009e+0
+    b = 0.4917878059254806e-1
+    v = 0.1444060068369326e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2577190808025936e+0
+    b = 0.7595498960495142e-1
+    v = 0.1526797390930008e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2908724534927187e+0
+    b = 0.1036991083191100e+0
+    v = 0.1598208771406474e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3236354020056219e+0
+    b = 0.1321348584450234e+0
+    v = 0.1659354368615331e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3559267359304543e+0
+    b = 0.1610316571314789e+0
+    v = 0.1711279910946440e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3876637123676956e+0
+    b = 0.1901912080395707e+0
+    v = 0.1754952725601440e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4187636705218842e+0
+    b = 0.2194384950137950e+0
+    v = 0.1791247850802529e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4491449019883107e+0
+    b = 0.2486155334763858e+0
+    v = 0.1820954300877716e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4787270932425445e+0
+    b = 0.2775768931812335e+0
+    v = 0.1844788524548449e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5074315153055574e+0
+    b = 0.3061863786591120e+0
+    v = 0.1863409481706220e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5351810507738336e+0
+    b = 0.3343144718152556e+0
+    v = 0.1877433008795068e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5619001025975381e+0
+    b = 0.3618362729028427e+0
+    v = 0.1887444543705232e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5875144035268046e+0
+    b = 0.3886297583620408e+0
+    v = 0.1894009829375006e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6119507308734495e+0
+    b = 0.4145742277792031e+0
+    v = 0.1897683345035198e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2619733870119463e+0
+    b = 0.2540047186389353e-1
+    v = 0.1517327037467653e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.2968149743237949e+0
+    b = 0.5208107018543989e-1
+    v = 0.1587740557483543e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3310451504860488e+0
+    b = 0.7971828470885599e-1
+    v = 0.1649093382274097e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3646215567376676e+0
+    b = 0.1080465999177927e+0
+    v = 0.1701915216193265e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3974916785279360e+0
+    b = 0.1368413849366629e+0
+    v = 0.1746847753144065e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4295967403772029e+0
+    b = 0.1659073184763559e+0
+    v = 0.1784555512007570e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4608742854473447e+0
+    b = 0.1950703730454614e+0
+    v = 0.1815687562112174e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4912598858949903e+0
+    b = 0.2241721144376724e+0
+    v = 0.1840864370663302e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5206882758945558e+0
+    b = 0.2530655255406489e+0
+    v = 0.1860676785390006e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5490940914019819e+0
+    b = 0.2816118409731066e+0
+    v = 0.1875690583743703e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5764123302025542e+0
+    b = 0.3096780504593238e+0
+    v = 0.1886453236347225e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6025786004213506e+0
+    b = 0.3371348366394987e+0
+    v = 0.1893501123329645e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6275291964794956e+0
+    b = 0.3638547827694396e+0
+    v = 0.1897366184519868e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3348189479861771e+0
+    b = 0.2664841935537443e-1
+    v = 0.1643908815152736e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.3699515545855295e+0
+    b = 0.5424000066843495e-1
+    v = 0.1696300350907768e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4042003071474669e+0
+    b = 0.8251992715430854e-1
+    v = 0.1741553103844483e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4375320100182624e+0
+    b = 0.1112695182483710e+0
+    v = 0.1780015282386092e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4699054490335947e+0
+    b = 0.1402964116467816e+0
+    v = 0.1812116787077125e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5012739879431952e+0
+    b = 0.1694275117584291e+0
+    v = 0.1838323158085421e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5315874883754966e+0
+    b = 0.1985038235312689e+0
+    v = 0.1859113119837737e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5607937109622117e+0
+    b = 0.2273765660020893e+0
+    v = 0.1874969220221698e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5888393223495521e+0
+    b = 0.2559041492849764e+0
+    v = 0.1886375612681076e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6156705979160163e+0
+    b = 0.2839497251976899e+0
+    v = 0.1893819575809276e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6412338809078123e+0
+    b = 0.3113791060500690e+0
+    v = 0.1897794748256767e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4076051259257167e+0
+    b = 0.2757792290858463e-1
+    v = 0.1738963926584846e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4423788125791520e+0
+    b = 0.5584136834984293e-1
+    v = 0.1777442359873466e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4760480917328258e+0
+    b = 0.8457772087727143e-1
+    v = 0.1810010815068719e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5085838725946297e+0
+    b = 0.1135975846359248e+0
+    v = 0.1836920318248129e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5399513637391218e+0
+    b = 0.1427286904765053e+0
+    v = 0.1858489473214328e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5701118433636380e+0
+    b = 0.1718112740057635e+0
+    v = 0.1875079342496592e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5990240530606021e+0
+    b = 0.2006944855985351e+0
+    v = 0.1887080239102310e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6266452685139695e+0
+    b = 0.2292335090598907e+0
+    v = 0.1894905752176822e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6529320971415942e+0
+    b = 0.2572871512353714e+0
+    v = 0.1898991061200695e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.4791583834610126e+0
+    b = 0.2826094197735932e-1
+    v = 0.1809065016458791e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5130373952796940e+0
+    b = 0.5699871359683649e-1
+    v = 0.1836297121596799e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5456252429628476e+0
+    b = 0.8602712528554394e-1
+    v = 0.1858426916241869e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5768956329682385e+0
+    b = 0.1151748137221281e+0
+    v = 0.1875654101134641e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6068186944699046e+0
+    b = 0.1442811654136362e+0
+    v = 0.1888240751833503e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6353622248024907e+0
+    b = 0.1731930321657680e+0
+    v = 0.1896497383866979e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6624927035731797e+0
+    b = 0.2017619958756061e+0
+    v = 0.1900775530219121e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5484933508028488e+0
+    b = 0.2874219755907391e-1
+    v = 0.1858525041478814e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.5810207682142106e+0
+    b = 0.5778312123713695e-1
+    v = 0.1876248690077947e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6120955197181352e+0
+    b = 0.8695262371439526e-1
+    v = 0.1889404439064607e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6416944284294319e+0
+    b = 0.1160893767057166e+0
+    v = 0.1898168539265290e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6697926391731260e+0
+    b = 0.1450378826743251e+0
+    v = 0.1902779940661772e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6147594390585488e+0
+    b = 0.2904957622341456e-1
+    v = 0.1890125641731815e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6455390026356783e+0
+    b = 0.5823809152617197e-1
+    v = 0.1899434637795751e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6747258588365477e+0
+    b = 0.8740384899884715e-1
+    v = 0.1904520856831751e-3
+    grids.append(SphGenOh(5, a, b, v))
+    a = 0.6772135750395347e+0
+    b = 0.2919946135808105e-1
+    v = 0.1905534498734563e-3
+    grids.append(SphGenOh(5, a, b, v))
+    return np.vstack(grids)
+
+# ~= (L+1)**2/3
+LEBEDEV_ORDER = {
+    0  : 1   ,
+    3  : 6   ,
+    5  : 14  ,
+    7  : 26  ,
+    9  : 38  ,
+    11 : 50  ,
+    13 : 74  ,
+    15 : 86  ,
+    17 : 110 ,
+    19 : 146 ,
+    21 : 170 ,
+    23 : 194 ,
+    25 : 230 ,
+    27 : 266 ,
+    29 : 302 ,
+    31 : 350 ,
+    35 : 434 ,
+    41 : 590 ,
+    47 : 770 ,
+    53 : 974 ,
+    59 : 1202,
+    65 : 1454,
+    71 : 1730,
+    77 : 2030,
+    83 : 2354,
+    89 : 2702,
+    95 : 3074,
+    101: 3470,
+    107: 3890,
+    113: 4334,
+    119: 4802,
+    125: 5294,
+    131: 5810
+}
+LEBEDEV_NGRID = np.array(list(LEBEDEV_ORDER.values()))
+
+@lru_cache(maxsize=50)
+def MakeAngularGrid(points):
+    '''Angular grids for specified Lebedev points'''
+    if points in (0, 1):
+        return np.array((0., 0., 0., 1.))
+
+    if points not in LEBEDEV_NGRID:
+        raise ValueError('Unsupported angular grids %d' % points)
+
+    fn = globals()['MakeAngularGrid_' + str(points)]
+    grids = fn()
+    return grids
diff --git a/pyscf/dft/gen_grid.py b/pyscf/dft/gen_grid.py
index eea7a69775..77c1c781fd 100644
--- a/pyscf/dft/gen_grid.py
+++ b/pyscf/dft/gen_grid.py
@@ -31,6 +31,7 @@
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf.dft import radi
+from pyscf.dft.LebedevGrid import LEBEDEV_ORDER, LEBEDEV_NGRID, MakeAngularGrid
 from pyscf import gto
 from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index
 from pyscf import __config__
@@ -43,44 +44,6 @@
 ALIGNMENT_UNIT = 8
 NELEC_ERROR_TOL = getattr(__config__, 'dft_rks_prune_error_tol', 0.02)
 
-# ~= (L+1)**2/3
-LEBEDEV_ORDER = {
-    0  : 1   ,
-    3  : 6   ,
-    5  : 14  ,
-    7  : 26  ,
-    9  : 38  ,
-    11 : 50  ,
-    13 : 74  ,
-    15 : 86  ,
-    17 : 110 ,
-    19 : 146 ,
-    21 : 170 ,
-    23 : 194 ,
-    25 : 230 ,
-    27 : 266 ,
-    29 : 302 ,
-    31 : 350 ,
-    35 : 434 ,
-    41 : 590 ,
-    47 : 770 ,
-    53 : 974 ,
-    59 : 1202,
-    65 : 1454,
-    71 : 1730,
-    77 : 2030,
-    83 : 2354,
-    89 : 2702,
-    95 : 3074,
-    101: 3470,
-    107: 3890,
-    113: 4334,
-    119: 4802,
-    125: 5294,
-    131: 5810
-}
-LEBEDEV_NGRID = numpy.array(list(LEBEDEV_ORDER.values()))
-
 # SG0
 # S. Chien and P. Gill,  J. Comput. Chem. 27 (2006) 730-739.
 
@@ -265,9 +228,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev,
             coords = []
             vol = []
             for n in sorted(set(angs)):
-                grid = numpy.empty((n,4))
-                libdft.MakeAngularGrid(grid.ctypes.data_as(ctypes.c_void_p),
-                                       ctypes.c_int(n))
+                grid = MakeAngularGrid(n)
                 idx = numpy.where(angs==n)[0]
                 #coords.append(numpy.einsum('i,jk->jik', rad[idx], grid[:,:3]).reshape(-1,3))
                 #vol.append(numpy.einsum('i,j->ji', rad_weight[idx], grid[:,3]).ravel())
diff --git a/pyscf/solvent/ddcosmo.py b/pyscf/solvent/ddcosmo.py
index 80a4a6d8de..e0293aae8f 100644
--- a/pyscf/solvent/ddcosmo.py
+++ b/pyscf/solvent/ddcosmo.py
@@ -340,9 +340,7 @@ def regularize_xt(t, eta):
 
 def make_grids_one_sphere(lebedev_order):
     ngrid_1sph = gen_grid.LEBEDEV_ORDER[lebedev_order]
-    leb_grid = numpy.empty((ngrid_1sph,4))
-    gen_grid.libdft.MakeAngularGrid(leb_grid.ctypes.data_as(ctypes.c_void_p),
-                                    ctypes.c_int(ngrid_1sph))
+    leb_grid = gen_grid.MakeAngularGrid(ngrid_1sph)
     coords_1sph = leb_grid[:,:3]
     # Note the Lebedev angular grids are normalized to 1 in pyscf
     weights_1sph = 4*numpy.pi * leb_grid[:,3]
diff --git a/pyscf/solvent/pcm.py b/pyscf/solvent/pcm.py
index fc6412d292..75b777ad4b 100644
--- a/pyscf/solvent/pcm.py
+++ b/pyscf/solvent/pcm.py
@@ -139,9 +139,7 @@ def switch_h(x):
 
 def gen_surface(mol, ng=302, vdw_scale=1.2):
     '''J. Phys. Chem. A 1999, 103, 11060-11079'''
-    unit_sphere = numpy.empty((ng,4))
-    libdft.MakeAngularGrid(unit_sphere.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ng))
-
+    unit_sphere = gen_grid.MakeAngularGrid(ng)
     atom_coords = mol.atom_coords(unit='B')
     charges = mol.atom_charges()
     N_J = ng * numpy.ones(mol.natm)

From 881dbbebcbc084ba2ba8fe968ff03fd34d7dfdf8 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sun, 11 Feb 2024 11:44:57 -0800
Subject: [PATCH 14/44] Add an API to dump system info

---
 pyscf/gto/mole.py | 19 +++----------------
 pyscf/lib/misc.py | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index b79074aa56..4e06980ffb 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -25,9 +25,7 @@
 import sys
 import types
 import re
-import platform
 import gc
-import time
 
 import json
 import ctypes
@@ -2695,7 +2693,6 @@ def gto_norm(self, l, expnt):
 
     def dump_input(self):
         import __main__
-        import pyscf
         if hasattr(__main__, '__file__'):
             try:
                 filename = os.path.abspath(__main__.__file__)
@@ -2709,19 +2706,9 @@ def dump_input(self):
             except IOError:
                 logger.warn(self, 'input file does not exist')
 
-        self.stdout.write('System: %s  Threads %s\n' %
-                          (str(platform.uname()), lib.num_threads()))
-        self.stdout.write('Python %s\n' % sys.version)
-        self.stdout.write('numpy %s  scipy %s\n' %
-                          (numpy.__version__, scipy.__version__))
-        self.stdout.write('Date: %s\n' % time.ctime())
-        self.stdout.write('PySCF version %s\n' % pyscf.__version__)
-        info = lib.repo_info(os.path.join(__file__, '..', '..'))
-        self.stdout.write('PySCF path  %s\n' % info['path'])
-        if 'git' in info:
-            self.stdout.write(info['git'] + '\n')
-
-        self.stdout.write('\n')
+        self.stdout.write('\n'.join(lib.misc.format_sys_info()))
+
+        self.stdout.write('\n\n')
         for key in os.environ:
             if 'PYSCF' in key:
                 self.stdout.write('[ENV] %s %s\n' % (key, os.environ[key]))
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 1b109208dc..46ebf5edf0 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -22,6 +22,8 @@
 
 import os
 import sys
+import time
+import platform
 import warnings
 import tempfile
 import functools
@@ -30,6 +32,7 @@
 import collections
 import ctypes
 import numpy
+import scipy
 import h5py
 from threading import Thread
 from multiprocessing import Queue, Process
@@ -1303,6 +1306,22 @@ def git_info(repo_path):
         pass
     return orig_head, head, branch
 
+def format_sys_info():
+    '''Format a list of system information for printing.'''
+    import pyscf
+    info = repo_info(os.path.join(__file__, '..', '..'))
+    result = [
+        f'System: {platform.uname()}  Threads {num_threads()}',
+        f'Python {sys.version}',
+        f'numpy {numpy.__version__}  scipy {scipy.__version__}',
+        f'Date: {time.ctime()}',
+        f'PySCF version {pyscf.__version__}',
+        f'PySCF path  {info["path"]}',
+    ]
+    if 'git' in info:
+        result.append(info['git'])
+    return result
+
 
 def isinteger(obj):
     '''

From 174e23c6b612c90fc2a54d1166042be7b7f72058 Mon Sep 17 00:00:00 2001
From: Xing Zhang <fishjojo@users.noreply.github.com>
Date: Wed, 21 Feb 2024 22:30:31 -0800
Subject: [PATCH 15/44] multigrid DFT version 2 (#2078)

---
 examples/pbc/27-multigrid.py               |    4 +-
 examples/pbc/27-multigrid2.py              |  238 ++++
 pyscf/gto/mole.py                          |   22 +-
 pyscf/gto/moleintor.py                     |    1 +
 pyscf/lib/CMakeLists.txt                   |   40 +
 pyscf/lib/dft/CMakeLists.txt               |   14 +-
 pyscf/lib/dft/grid_collocate.c             |  655 +++++++++
 pyscf/lib/dft/grid_common.c                |  660 +++++++++
 pyscf/lib/dft/grid_common.h                |  109 ++
 pyscf/lib/dft/grid_integrate.c             | 1358 +++++++++++++++++++
 pyscf/lib/dft/libxc_itrf.c                 |  290 +++-
 pyscf/lib/dft/multigrid.c                  |  744 +++++++++++
 pyscf/lib/dft/multigrid.h                  |   72 +
 pyscf/lib/dft/utils.c                      |   62 +
 pyscf/lib/dft/utils.h                      |   27 +
 pyscf/lib/np_helper/np_helper.h            |    7 +
 pyscf/lib/numpy_helper.py                  |   10 +
 pyscf/lib/pbc/CMakeLists.txt               |   12 +-
 pyscf/lib/pbc/cell.c                       |  280 ++++
 pyscf/lib/pbc/cell.h                       |   29 +
 pyscf/lib/pbc/fft.c                        |  147 ++
 pyscf/lib/pbc/fft.h                        |   26 +
 pyscf/lib/pbc/fill_ints.c                  |    6 +-
 pyscf/lib/pbc/fill_ints.h                  |   29 +
 pyscf/lib/pbc/fill_ints_screened.c         | 1012 ++++++++++++++
 pyscf/lib/pbc/hf_grad.c                    |   95 ++
 pyscf/lib/pbc/neighbor_list.c              |  206 +++
 pyscf/lib/pbc/neighbor_list.h              |   41 +
 pyscf/lib/pbc/optimizer.c                  |   38 +-
 pyscf/lib/pbc/optimizer.h                  |    6 +-
 pyscf/lib/pbc/pp.c                         |  448 +++++++
 pyscf/lib/test/test_numint_uniform_grid.py |    6 +-
 pyscf/pbc/df/incore.py                     |  244 ++++
 pyscf/pbc/dft/gks.py                       |    2 +-
 pyscf/pbc/dft/kgks.py                      |    2 +-
 pyscf/pbc/dft/krks.py                      |    6 +-
 pyscf/pbc/dft/krks_ksymm.py                |    6 +-
 pyscf/pbc/dft/kuks.py                      |    4 +-
 pyscf/pbc/dft/kuks_ksymm.py                |    6 +-
 pyscf/pbc/dft/multigrid/__init__.py        |   57 +
 pyscf/pbc/dft/{ => multigrid}/multigrid.py |  179 ++-
 pyscf/pbc/dft/multigrid/multigrid_pair.py  | 1405 ++++++++++++++++++++
 pyscf/pbc/dft/multigrid/pp.py              |  290 ++++
 pyscf/pbc/dft/multigrid/utils.py           |   70 +
 pyscf/pbc/dft/rks.py                       |    6 +-
 pyscf/pbc/dft/test/test_krks_ksym.py       |    8 +-
 pyscf/pbc/dft/test/test_multigrid.py       |   52 +-
 pyscf/pbc/dft/test/test_multigrid2.py      |   95 ++
 pyscf/pbc/dft/uks.py                       |    4 +-
 pyscf/pbc/grad/__init__.py                 |    7 +-
 pyscf/pbc/grad/krhf.py                     |    8 +-
 pyscf/pbc/grad/rhf.py                      |  167 +++
 pyscf/pbc/grad/rks.py                      |   24 +
 pyscf/pbc/grad/uhf.py                      |   92 ++
 pyscf/pbc/grad/uks.py                      |   24 +
 pyscf/pbc/gto/__init__.py                  |    1 +
 pyscf/pbc/gto/_pbcintor.py                 |   21 +-
 pyscf/pbc/gto/cell.py                      |  223 +++-
 pyscf/pbc/gto/ewald_methods.py             |  293 ++++
 pyscf/pbc/gto/neighborlist.py              |  199 +++
 pyscf/pbc/gto/pseudo/pp_int.py             |  367 ++++-
 pyscf/pbc/gto/pseudo/test/test_pp.py       |   36 +
 pyscf/pbc/gto/test/test_cell.py            |   25 +
 pyscf/pbc/scf/hf.py                        |   46 +-
 pyscf/pbc/scf/khf.py                       |   12 +-
 pyscf/pbc/scf/khf_ksymm.py                 |    6 +-
 pyscf/pbc/scf/kuhf.py                      |    6 +-
 pyscf/pbc/scf/kuhf_ksymm.py                |    6 +-
 pyscf/pbc/scf/test/test_hf.py              |   26 +-
 pyscf/pbc/scf/uhf.py                       |    9 +-
 pyscf/pbc/symm/geom.py                     |    2 +-
 pyscf/pbc/symm/pyscf_spglib.py             |    2 +-
 pyscf/pbc/symm/symmetry.py                 |    2 +-
 pyscf/pbc/tools/pbc.py                     |   58 +-
 pyscf/scf/atom_hf.py                       |   15 +-
 pyscf/scf/atom_hf_pp.py                    |  154 +++
 pyscf/scf/dhf.py                           |    4 +-
 pyscf/scf/diis.py                          |    8 +-
 pyscf/scf/hf.py                            |   34 +-
 pyscf/scf/uhf.py                           |    8 +-
 80 files changed, 10713 insertions(+), 302 deletions(-)
 create mode 100644 examples/pbc/27-multigrid2.py
 create mode 100644 pyscf/lib/dft/grid_collocate.c
 create mode 100644 pyscf/lib/dft/grid_common.c
 create mode 100644 pyscf/lib/dft/grid_common.h
 create mode 100644 pyscf/lib/dft/grid_integrate.c
 create mode 100644 pyscf/lib/dft/multigrid.c
 create mode 100644 pyscf/lib/dft/multigrid.h
 create mode 100644 pyscf/lib/dft/utils.c
 create mode 100644 pyscf/lib/dft/utils.h
 create mode 100644 pyscf/lib/pbc/cell.c
 create mode 100644 pyscf/lib/pbc/cell.h
 create mode 100644 pyscf/lib/pbc/fft.c
 create mode 100644 pyscf/lib/pbc/fft.h
 create mode 100644 pyscf/lib/pbc/fill_ints.h
 create mode 100644 pyscf/lib/pbc/fill_ints_screened.c
 create mode 100644 pyscf/lib/pbc/hf_grad.c
 create mode 100644 pyscf/lib/pbc/neighbor_list.c
 create mode 100644 pyscf/lib/pbc/neighbor_list.h
 create mode 100644 pyscf/lib/pbc/pp.c
 create mode 100644 pyscf/pbc/dft/multigrid/__init__.py
 rename pyscf/pbc/dft/{ => multigrid}/multigrid.py (95%)
 create mode 100644 pyscf/pbc/dft/multigrid/multigrid_pair.py
 create mode 100644 pyscf/pbc/dft/multigrid/pp.py
 create mode 100644 pyscf/pbc/dft/multigrid/utils.py
 create mode 100644 pyscf/pbc/dft/test/test_multigrid2.py
 create mode 100644 pyscf/pbc/grad/rhf.py
 create mode 100644 pyscf/pbc/grad/rks.py
 create mode 100644 pyscf/pbc/grad/uhf.py
 create mode 100644 pyscf/pbc/grad/uks.py
 create mode 100644 pyscf/pbc/gto/ewald_methods.py
 create mode 100644 pyscf/pbc/gto/neighborlist.py
 create mode 100644 pyscf/scf/atom_hf_pp.py

diff --git a/examples/pbc/27-multigrid.py b/examples/pbc/27-multigrid.py
index f1b1f85a95..6809f33e3d 100644
--- a/examples/pbc/27-multigrid.py
+++ b/examples/pbc/27-multigrid.py
@@ -31,9 +31,9 @@
 #
 # There are two ways to enable multigrid numerical integration
 #
-# Method 1: use multigrid.multigrid function to update SCF object
+# Method 1: use multigrid.multigrid_fftdf function to update SCF object
 #
-mf = multigrid.multigrid(mf)
+mf = multigrid.multigrid_fftdf(mf)
 mf.kernel()
 
 #
diff --git a/examples/pbc/27-multigrid2.py b/examples/pbc/27-multigrid2.py
new file mode 100644
index 0000000000..d73cd8fe50
--- /dev/null
+++ b/examples/pbc/27-multigrid2.py
@@ -0,0 +1,238 @@
+#from os.path import expanduser
+#home_dir = expanduser("~")
+#f = open(home_dir+'/.pyscf_conf.py', 'a')
+# use FFTW for fft, this requires to compile the FFTW library
+# cmake -DENABLE_FFTW=ON -DBUILD_FFTW=ON
+#f.write('pbc_tools_pbc_fft_engine=\'FFTW\'')
+#f.close()
+
+import numpy
+import pyscf
+from pyscf import lib
+from pyscf import pbc
+from pyscf.pbc import gto as pbcgto
+from pyscf.pbc import dft as pbcdft
+from pyscf.pbc.dft import multigrid
+
+cell=pbcgto.Cell()
+
+#Molecule
+boxlen=12.4138
+cell.a=numpy.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]])
+cell.atom="""
+O      12.235322       1.376642      10.869880
+O       6.445390       3.706940       8.650794
+O       0.085977       2.181322       8.276663
+O      12.052554       2.671366       2.147199
+O      12.250036       4.190930      12.092014
+O       7.187422       0.959062       4.733469
+O       8.346457       7.210040       4.667644
+O      12.361546      11.527875       8.106887
+O       3.299984       4.440816       9.193275
+O       2.855829       3.759909       6.552815
+O       1.392494       6.362753       0.586172
+O       1.858645       8.694013       2.068738
+O       3.770231      12.094519       8.652183
+O       6.432508       3.669828       2.772418
+O       1.998724       1.820217       4.876440
+O       8.248581       2.404730       6.931303
+O       5.753814       3.360029      12.461534
+O      11.322212       5.649239       2.236798
+O       4.277318       2.113956      10.590808
+O       5.405015       3.349247       5.484702
+O       6.493278      11.869958       0.684912
+O       3.275250       2.346576       2.425241
+O       7.981003       6.352512       7.507970
+O       5.985990       6.512854      12.194648
+O      10.636714      11.856872      12.209540
+O       9.312283       3.670384       3.508594
+O       1.106885       5.830301       6.638695
+O       8.008007       3.326363      10.869818
+O      12.403000       9.687405      11.761901
+O       4.219782       7.085315       8.153470
+O       3.781557       8.203821      11.563272
+O      11.088898       4.532081       7.809475
+O      10.387548       8.408890       1.017882
+O       1.979016       6.418091      10.374159
+O       4.660547       0.549666       5.617403
+O       8.745880      12.256257       8.089383
+O       2.662041      10.489890       0.092980
+O       7.241661      10.471815       4.226946
+O       2.276827       0.276647      10.810417
+O       8.887733       0.946877       1.333885
+O       1.943554       8.088552       7.567650
+O       9.667942       8.056759       9.868847
+O      10.905491       8.339638       6.484782
+O       3.507733       4.862402       1.557439
+O       8.010457       8.642846      12.055969
+O       8.374446      10.035932       6.690309
+O       5.635247       6.076875       5.563993
+O      11.728434       1.601906       5.079475
+O       9.771134       9.814114       3.548703
+O       3.944355      10.563450       4.687536
+O       0.890357       6.382287       4.065806
+O       6.862447       6.425182       2.488202
+O       3.813963       6.595122       3.762649
+O       6.562448       8.295463       8.807182
+O       9.809455       0.143325       3.886553
+O       4.117074      11.661225       2.221679
+O       5.295317       8.735561       2.763183
+O       9.971999       5.379339       5.340378
+O      12.254708       8.643874       3.957116
+O       2.344274      10.761274       6.829162
+O       7.013416       0.643488      10.518797
+O       5.152349      10.233624      10.359388
+O      11.184278       5.884064      10.298279
+O      12.252335       8.974142       9.070831
+H      12.415139       2.233125      11.257611
+H      11.922476       1.573799       9.986994
+H       5.608192       3.371543       8.971482
+H       6.731226       3.060851       8.004962
+H      -0.169205       1.565594       7.589645
+H      -0.455440       2.954771       8.118939
+H      12.125168       2.826463       1.205443
+H      12.888828       2.969761       2.504745
+H      11.553255       4.386613      11.465566
+H      12.818281       4.960808      12.067151
+H       7.049495       1.772344       4.247898
+H       6.353019       0.798145       5.174047
+H       7.781850       7.384852       5.420566
+H       9.103203       6.754017       5.035898
+H      12.771232      11.788645       8.931744
+H      12.018035      10.650652       8.276334
+H       3.557245       3.792529       9.848846
+H       2.543844       4.884102       9.577958
+H       2.320235       4.521250       6.329813
+H       2.872128       3.749963       7.509824
+H       1.209685       7.121391       1.140501
+H       2.238885       6.038801       0.894245
+H       2.763109       8.856353       2.336735
+H       1.329379       9.047369       2.783755
+H       4.315639      11.533388       9.203449
+H       3.098742      12.433043       9.244412
+H       5.987369       3.448974       3.590530
+H       5.813096       3.419344       2.086985
+H       1.057126       1.675344       4.969379
+H       2.248496       2.292119       5.670892
+H       8.508264       1.653337       7.464411
+H       8.066015       2.034597       6.067646
+H       5.197835       2.915542      11.821572
+H       6.630900       3.329981      12.079371
+H      10.788986       6.436672       2.127933
+H      11.657923       5.463602       1.359832
+H       3.544476       1.634958      10.977765
+H       4.755770       1.455054      10.087655
+H       4.465371       3.375459       5.665294
+H       5.682663       4.264430       5.524498
+H       6.174815      11.778676       1.582954
+H       5.713640      12.089924       0.174999
+H       3.476076       1.498708       2.028983
+H       2.730229       2.134295       3.182949
+H       7.119624       5.936450       7.474030
+H       8.536492       5.799405       6.958665
+H       5.909499       5.717477      11.667621
+H       6.125402       6.196758      13.087330
+H      11.203499      12.513536      11.804844
+H      10.260930      12.300153      12.970145
+H       9.985036       3.927685       2.878172
+H       8.545584       3.468329       2.972331
+H       1.399882       6.620092       7.093246
+H       0.963561       6.112523       5.735345
+H       8.067363       3.674002       9.979955
+H       8.000737       2.375959      10.756190
+H      11.821629      10.402510      12.020482
+H      12.206854       8.983242      12.379892
+H       3.461473       7.606485       7.889688
+H       3.844478       6.304711       8.560946
+H       3.179884       7.585614      11.148494
+H       4.401957       7.652030      12.039573
+H      11.573777       5.053211       7.169515
+H      10.342076       4.186083       7.320831
+H      10.065640       8.919194       1.760981
+H       9.629585       8.322499       0.439729
+H       1.396302       6.546079       9.625630
+H       1.405516       6.479759      11.138049
+H       4.024008       1.232518       5.405828
+H       4.736858       0.579881       6.571077
+H       9.452293      12.313381       8.732772
+H       8.976559      11.502788       7.545965
+H       1.834701      10.012311       0.153462
+H       3.295197       9.836403      -0.204175
+H       7.056724      11.401702       4.095264
+H       6.499038      10.020287       3.825865
+H       1.365541       0.487338      11.013887
+H       2.501591      -0.428131      11.417871
+H       8.644279       1.812362       1.005409
+H       8.142674       0.388030       1.112955
+H       1.272659       8.365063       8.191888
+H       2.142485       8.877768       7.063867
+H       8.961493       7.826192       9.265523
+H       9.227102       8.487654      10.601118
+H      10.150144       7.758934       6.392768
+H      10.596082       9.187988       6.167290
+H       3.463106       4.096188       2.129414
+H       3.919461       4.539801       0.755791
+H       7.418998       9.394959      12.028876
+H       7.430413       7.883095      12.106546
+H       7.972905      10.220334       5.841196
+H       7.675111       9.631498       7.203725
+H       5.332446       6.381336       6.419473
+H       5.000025       6.434186       4.943466
+H      11.575078       2.271167       4.412540
+H      11.219802       0.847030       4.783357
+H       8.865342       9.721516       3.843998
+H      10.000732      10.719285       3.758898
+H       3.186196      10.476397       5.265333
+H       4.407331      11.335128       5.013723
+H       0.558187       7.255936       3.859331
+H       0.341672       5.789383       3.552346
+H       7.459933       6.526049       3.229193
+H       6.696228       5.483739       2.440372
+H       3.864872       6.313007       2.849385
+H       2.876419       6.621201       3.953862
+H       5.631529       8.079145       8.753997
+H       7.003296       7.568245       8.367822
+H       9.615413       0.527902       3.031755
+H       8.962985       0.109366       4.332162
+H       3.825854      11.139182       1.474087
+H       4.063988      11.063232       2.967211
+H       5.784391       7.914558       2.708486
+H       4.780461       8.655167       3.566110
+H      10.880659       5.444664       5.046607
+H       9.593331       4.687991       4.797350
+H      11.562317       8.960134       3.376765
+H      11.926084       8.816948       4.839320
+H       2.856874      11.297981       7.433660
+H       1.492332      11.195517       6.786033
+H       7.145820       0.090200       9.749009
+H       7.227275       0.077690      11.260665
+H       4.662021       9.538430      10.798155
+H       5.994537       9.833472      10.142985
+H      10.544299       6.595857      10.301445
+H      11.281750       5.653082       9.374494
+H      12.103020       8.841164      10.006916
+H      11.491592       8.576221       8.647557
+"""
+cell.basis = 'gth-tzv2p'
+cell.ke_cutoff = 200  # kinetic energy cutoff in a.u.
+cell.max_memory = 8000 # in MB
+cell.precision = 1e-6 # integral precision
+cell.pseudo = 'gth-pade'
+cell.verbose = 4
+cell.use_loose_rcut = True # integral screening based on shell radii
+cell.use_particle_mesh_ewald = True # use particle mesh ewald for nuclear repulsion
+cell.build()
+#cell = pbc.tools.super_cell(cell, [1,2,2]) #build super cell by replicating unit cell
+
+mf=pbcdft.RKS(cell)
+#mf.xc = "LDA, VWN"
+mf.xc = "PBE,PBE"
+mf.init_guess = 'atom' # atom guess is fast
+mf.with_df = multigrid.MultiGridFFTDF2(cell)
+mf.with_df.ngrids = 4 # number of sets of grid points
+mf.kernel()
+
+# Nuclear Gradients
+from pyscf.pbc.grad import rks as rks_grad
+grad = rks_grad.Gradients(mf)
+g = grad.kernel()
diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index 4e06980ffb..28d8fd444d 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -61,6 +61,7 @@
 NUC_MOD_OF = 2
 PTR_ZETA   = 3
 PTR_FRAC_CHARGE = 4
+PTR_RADIUS = 5
 ATM_SLOTS  = 6
 ATOM_OF    = 0
 ANG_OF     = 1
@@ -2412,6 +2413,15 @@ def ms(self, x):
         else:
             self.spin = int(round(2*x, 4))
 
+    @property
+    def enuc(self):
+        '''nuclear repulsion energy'''
+        if self._enuc is None:
+            self._enuc = self.energy_nuc()
+        return self._enuc
+    @enuc.setter
+    def enuc(self, enuc):
+        self._enuc = enuc
 
     copy = copy
 
@@ -2576,6 +2586,9 @@ def build(self, dump_input=True, parse_arg=ARGPARSE,
             # number of electrons are consistent.
             self.nelec
 
+        # reset nuclear energy
+        self.enuc = None
+
         if not self.magmom:
             self.magmom = [0,] * self.natm
         elif len(self.magmom) != self.natm:
@@ -2784,7 +2797,7 @@ def dump_input(self):
 
         if self.verbose >= logger.INFO:
             self.stdout.write('\n')
-            logger.info(self, 'nuclear repulsion = %.15g', self.energy_nuc())
+            logger.info(self, 'nuclear repulsion = %.15g', self.enuc)
             if self.symmetry:
                 if self.topgroup == self.groupname:
                     logger.info(self, 'point group symmetry = %s', self.topgroup)
@@ -3050,6 +3063,9 @@ def set_geom_(self, atoms_or_coords, unit=None, symmetry=None,
             mol.symmetry = symmetry
             mol.build(False, False)
 
+        # reset nuclear energy
+        mol.enuc = None
+
         if mol.verbose >= logger.INFO:
             logger.info(mol, 'New geometry')
             for ia, atom in enumerate(mol._atom):
@@ -3542,7 +3558,9 @@ def intor_by_shell(self, intor, shells, comp=None, grids=None):
 
     eval_ao = eval_gto = eval_gto
 
-    energy_nuc = get_enuc = energy_nuc
+    energy_nuc = energy_nuc
+    def get_enuc(self):
+        return self.enuc
 
     def get_ao_indices(self, bas_list, ao_loc=None):
         '''
diff --git a/pyscf/gto/moleintor.py b/pyscf/gto/moleintor.py
index e3d661f1e0..4c6a4ce8cf 100644
--- a/pyscf/gto/moleintor.py
+++ b/pyscf/gto/moleintor.py
@@ -429,6 +429,7 @@ def _get_intor_and_comp(intor_name, comp=None):
     'int2c2e_ip1ip2'            : (9, 9),
     'int2c2e_ipip1'             : (9, 9),
     'int3c1e'                   : (1, 1),
+    'int3c1e_ip1'               : (3, 3),
     'int3c1e_p2'                : (1, 1),
     'int3c1e_iprinv'            : (3, 3),
     'int2c2e'                   : (1, 1),
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 1dc076da21..4b7236535e 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -136,6 +136,9 @@ else ()
   set(CMAKE_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/deps/lib:\$ORIGIN/deps/lib64")
 endif ()
 
+option(ENABLE_FFTW "Using fftw3" OFF)
+option(BUILD_FFTW "Building fftw3" OFF)
+
 add_subdirectory(np_helper)
 add_subdirectory(gto)
 add_subdirectory(vhf)
@@ -198,6 +201,12 @@ option(ENABLE_XCFUN "Using xcfun for XC functional library" ON)
 option(BUILD_LIBXC "Download and build libxc library" ON)
 option(BUILD_XCFUN "Download and build xcfun library" ON)
 
+option(ENABLE_LIBXSMM "Using libxsmm" OFF)
+option(BUILD_LIBXSMM "Building libxsmm" OFF)
+if(APPLE)
+    set(ENABLE_LIBXSMM OFF)
+endif()
+
 if(NOT DISABLE_DFT)
 add_subdirectory(dft)
 
@@ -237,8 +246,39 @@ if(ENABLE_XCFUN AND BUILD_XCFUN)
   add_dependencies(xcfun_itrf libxcfun)
   add_dependencies(dft libxcfun)
 endif() # ENABLE_XCFUN
+
+if(ENABLE_LIBXSMM AND BUILD_LIBXSMM)
+  if(NOT EXISTS "${PROJECT_SOURCE_DIR}/deps/include/libxsmm.h")
+    ExternalProject_Add(libxsmm
+      GIT_REPOSITORY https://github.com/hfp/libxsmm.git
+      GIT_TAG 1.17
+      PREFIX ${PROJECT_BINARY_DIR}/deps
+      INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
+      CONFIGURE_COMMAND ""
+      BUILD_IN_SOURCE True
+      BUILD_COMMAND make -j4 PREFIX=<INSTALL_DIR> CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} STATIC=0 MALLOC=0 INTRINSICS=2 install
+      INSTALL_COMMAND ""
+    )
+    add_dependencies(dft libxsmm)
+  endif()
+endif()
 endif() # DISABLE_DFT
 
+if(ENABLE_FFTW AND BUILD_FFTW)
+#  if(NOT EXISTS "${PROJECT_SOURCE_DIR}/deps/include/fftw3.h")
+    ExternalProject_Add(libfftw3
+      URL https://www.fftw.org/fftw-3.3.10.tar.gz
+      PREFIX ${PROJECT_BINARY_DIR}/deps
+      INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
+      BUILD_IN_SOURCE True
+      CONFIGURE_COMMAND ./configure --enable-static=no --enable-shared=yes --enable-threads CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} prefix=<INSTALL_DIR>
+      BUILD_COMMAND make -j4 install
+    )
+    add_dependencies(fft libfftw3)
+    add_dependencies(pbc libfftw3)
+#  endif()
+endif()
+
 if(EXISTS "${PROJECT_SOURCE_DIR}/cmake.user.inc")
   include("${PROJECT_SOURCE_DIR}/cmake.user.inc")
 endif()
diff --git a/pyscf/lib/dft/CMakeLists.txt b/pyscf/lib/dft/CMakeLists.txt
index 6b01b7eca0..c7263183c8 100644
--- a/pyscf/lib/dft/CMakeLists.txt
+++ b/pyscf/lib/dft/CMakeLists.txt
@@ -15,14 +15,19 @@
 add_library(dft SHARED
   CxLebedevGrid.c grid_basis.c nr_numint.c r_numint.c
   numint_uniform_grid.c xc_deriv.c nr_numint_sparse.c
-  )
-add_dependencies(dft cgto cvhf np_helper)
+  multigrid.c grid_common.c grid_collocate.c grid_integrate.c utils.c
+)
+add_dependencies(dft cgto cvhf np_helper pbc)
 
 set_target_properties(dft PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 
-target_link_libraries(dft cvhf cgto cint np_helper ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
-
+if(ENABLE_LIBXSMM)
+  add_definitions(-DHAVE_LIBXSMM)
+  target_link_libraries(dft cvhf cgto cint np_helper pbc xsmm ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+else()
+  target_link_libraries(dft cvhf cgto cint np_helper pbc ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+endif()
 
 if(ENABLE_LIBXC)
 add_library(xc_itrf SHARED libxc_itrf.c)
@@ -37,4 +42,3 @@ set_target_properties(xcfun_itrf PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 target_link_libraries(xcfun_itrf xcfun ${OPENMP_C_PROPERTIES})
 endif()
-
diff --git a/pyscf/lib/dft/grid_collocate.c b/pyscf/lib/dft/grid_collocate.c
new file mode 100644
index 0000000000..33842191d3
--- /dev/null
+++ b/pyscf/lib/dft/grid_collocate.c
@@ -0,0 +1,655 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+#include "dft/utils.h"
+
+#define MAX_THREADS     256
+#define PTR_RADIUS        5
+
+static void transform_dm(double* dm_cart, double* dm,
+                         double* ish_contr_coeff, double* jsh_contr_coeff,
+                         int* ish_ao_loc, int* jsh_ao_loc,
+                         int* ish_bas, int* jsh_bas, int ish, int jsh,
+                         int ish0, int jsh0, int naoj, double* cache)
+{
+    int i0 = ish_ao_loc[ish] - ish_ao_loc[ish0];
+    int i1 = ish_ao_loc[ish+1] - ish_ao_loc[ish0];
+    int j0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    int j1 = jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0];
+
+    int nrow = i1 - i0;
+    int ncol = j1 - j0;
+    double* pdm = dm + ((size_t)naoj) * i0 + j0;
+
+    int l_i = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    int ncart_i = _LEN_CART[l_i];
+    int nprim_i = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+    int nao_i = nprim_i*ncart_i;
+    int l_j = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    int ncart_j = _LEN_CART[l_j];
+    int nprim_j = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+    int nao_j = nprim_j*ncart_j;
+
+    const char TRANS_T = 'T';
+    const char TRANS_N = 'N';
+    const double D1 = 1;
+    const double D0 = 0;
+    //einsum("pi,ij,qj->pq", coeff_i, dm, coeff_j)
+    dgemm_wrapper(TRANS_T, TRANS_N, nao_j, nrow, ncol,
+           D1, jsh_contr_coeff, ncol, pdm, naoj, D0, cache, nao_j);
+    dgemm_wrapper(TRANS_N, TRANS_N, nao_j, nao_i, nrow,
+           D1, cache, nao_j, ish_contr_coeff, nrow, D0, dm_cart, nao_j);
+}
+
+
+static void add_rho_submesh(double* rho, double* pqr,
+                            int* mesh_lb, int* mesh_ub, int* submesh_lb,
+                            const int* mesh, const int* submesh)
+{
+    const int x0 = mesh_lb[0];
+    const int y0 = mesh_lb[1];
+    const int z0 = mesh_lb[2];
+
+    const int nx = mesh_ub[0] - x0;
+    const int ny = mesh_ub[1] - y0;
+    const int nz = mesh_ub[2] - z0;
+
+    const int x0_sub = submesh_lb[0];
+    const int y0_sub = submesh_lb[1];
+    const int z0_sub = submesh_lb[2];
+
+    const size_t mesh_yz = ((size_t) mesh[1]) * mesh[2];
+    const size_t submesh_yz = ((size_t) submesh[1]) * submesh[2];
+
+    int ix, iy, iz;
+    for (ix = 0; ix < nx; ix++) {
+        double* __restrict ptr_rho = rho + (ix + x0) * mesh_yz + y0 * mesh[2] + z0;
+        double* __restrict ptr_pqr = pqr + (ix + x0_sub) * submesh_yz + y0_sub * submesh[2] + z0_sub;
+        for (iy = 0; iy < ny; iy++) {
+            #pragma omp simd
+            for (iz = 0; iz < nz; iz++) {
+                ptr_rho[iz] += ptr_pqr[iz];
+            }
+            ptr_rho += mesh[2];
+            ptr_pqr += submesh[2];
+        }
+    }
+}
+
+
+static void _orth_rho(double *rho, double *dm_xyz,
+                      double fac, int topl,
+                      int *mesh, int *grid_slice,
+                      double *xs_exp, double *ys_exp, double *zs_exp,
+                      double *cache)
+{
+    const int l1 = topl + 1;
+    const int l1l1 = l1 * l1;
+    const int nx0 = grid_slice[0];
+    const int nx1 = grid_slice[1];
+    const int ny0 = grid_slice[2];
+    const int ny1 = grid_slice[3];
+    const int nz0 = grid_slice[4];
+    const int nz1 = grid_slice[5];
+    const int ngridx = nx1 - nx0;
+    const int ngridy = ny1 - ny0;
+    const int ngridz = nz1 - nz0;
+    if (ngridx == 0 || ngridy == 0 || ngridz == 0) {
+        return;
+    }
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D0 = 0;
+    const double D1 = 1;
+    const int xcols = ngridy * ngridz;
+    double *xyr = cache;
+    double *xqr = xyr + l1l1 * ngridz;
+    double *pqr = xqr + l1 * xcols;
+    int ix, iy, iz, l;
+
+    dgemm_wrapper(TRANS_N, TRANS_N, ngridz, l1l1, l1,
+                  fac, zs_exp, ngridz, dm_xyz, l1,
+                  D0, xyr, ngridz);
+    for (l = 0; l <= topl; l++) {
+        dgemm_wrapper(TRANS_N, TRANS_T, ngridz, ngridy, l1,
+                      D1, xyr+l*l1*ngridz, ngridz, ys_exp, ngridy,
+                      D0, xqr+l*xcols, ngridz);
+    }
+    dgemm_wrapper(TRANS_N, TRANS_T, xcols, ngridx, l1,
+                  D1, xqr, xcols, xs_exp, ngridx,
+                  D0, pqr, xcols);
+
+    const int submesh[3] = {ngridx, ngridy, ngridz};
+    int lb[3], ub[3];
+    for (ix = 0; ix < ngridx;) {
+        lb[0] = modulo(ix + nx0, mesh[0]);
+        ub[0] = get_upper_bound(lb[0], mesh[0], ix, ngridx);
+        for (iy = 0; iy < ngridy;) {
+            lb[1] = modulo(iy + ny0, mesh[1]);
+            ub[1] = get_upper_bound(lb[1], mesh[1], iy, ngridy);
+            for (iz = 0; iz < ngridz;) {
+                lb[2] = modulo(iz + nz0, mesh[2]);
+                ub[2] = get_upper_bound(lb[2], mesh[2], iz, ngridz);
+                int lb_sub[3] = {ix, iy, iz};
+                add_rho_submesh(rho, pqr, lb, ub, lb_sub, mesh, submesh);
+                iz += ub[2] - lb[2];
+            }
+            iy += ub[1] - lb[1];
+        }
+        ix += ub[0] - lb[0];
+    }
+}
+
+
+void make_rho_lda_orth(double *rho, double *dm, int comp,
+                       int li, int lj, double ai, double aj,
+                       double *ri, double *rj, double fac, double cutoff,
+                       int dimension, double* dh, double *a, double *b,
+                       int *mesh, double *cache)
+{
+        int topl = li + lj;
+        int l1 = topl + 1;
+        int l1l1l1 = l1 * l1 * l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+
+        if (data_size == 0) {
+                return;
+        }
+        cache += data_size;
+
+        double *dm_xyz = cache;
+        cache += l1l1l1;
+        memset(dm_xyz, 0, l1l1l1*sizeof(double));
+
+        _dm_to_dm_xyz(dm_xyz, dm, li, lj, ri, rj, cache);
+
+        _orth_rho(rho, dm_xyz, fac, topl, mesh, grid_slice,
+                  xs_exp, ys_exp, zs_exp, cache);
+}
+
+
+static void _apply_rho(void (*eval_rho)(), double *rho, double *dm,
+                       PGFPair* pgfpair, int comp, int dimension,
+                       double* dh, double *a, double *b, int *mesh,
+                       double* ish_gto_norm, double* jsh_gto_norm,
+                       int *ish_atm, int *ish_bas, double *ish_env,
+                       int *jsh_atm, int *jsh_bas, double *jsh_env,
+                       double* Ls, double *cache)
+{
+        int ish = pgfpair->ish;
+        int jsh = pgfpair->jsh;
+        int ipgf = pgfpair->ipgf;
+        int jpgf = pgfpair->jpgf;
+        int iL = pgfpair->iL;
+        double cutoff = pgfpair->radius;
+
+        double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        double *rL = Ls + iL*3;
+        double rjL[3];
+        rjL[0] = rj[0] + rL[0];
+        rjL[1] = rj[1] + rL[1];
+        rjL[2] = rj[2] + rL[2];
+
+        const int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        const int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        double ai = ish_env[ish_bas[PTR_EXP+ish*BAS_SLOTS]+ipgf];
+        double aj = jsh_env[jsh_bas[PTR_EXP+jsh*BAS_SLOTS]+jpgf];
+        double ci = ish_gto_norm[ipgf];
+        double cj = jsh_gto_norm[jpgf];
+        double aij = ai + aj;
+        double rrij = CINTsquare_dist(ri, rjL);
+        double eij = (ai * aj / aij) * rrij;
+        if (eij > EIJCUTOFF) {
+                return;
+        }
+        double fac = exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+        if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {
+                return;
+        }
+
+        (*eval_rho)(rho, dm, comp, li, lj, ai, aj, ri, rjL,
+                    fac, cutoff, dimension, dh, a, b, mesh, cache);
+}
+
+
+static size_t _rho_cache_size(int l, int nprim, int nctr, int* mesh, double radius, double* dh)
+{
+    size_t size = 0;
+    size_t mesh_size = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+    size_t nmx = get_max_num_grid_orth(dh, radius);
+    int l1 = 2 * l + 1;
+    int l1l1 = l1 * l1;
+    int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    size += (nprim * _LEN_CART[l]) * (nprim * _LEN_CART[l]); // dm_cart
+    size += _LEN_CART[l]*_LEN_CART[l]; // dm_pgf
+    size += nctr * _LEN_CART[l] * nprim * _LEN_CART[l]; // transform_dm
+    size += l1 * (mesh[0] + mesh[1] + mesh[2]); // xs_exp, ys_exp, zs_exp
+    size += l1l1 * l1; // dm_xyz
+    size += 3 * (_LEN_CART[l] + l1); // _dm_to_dm_xyz
+
+    size_t size_orth_components = l1 * nmx + nmx; // orth_components
+    size_t size_orth_rho = 0; // _orth_rho
+    if (nmx < max_mesh) {
+        size_orth_rho = l1l1*nmx + l1*nmx*nmx + nmx*nmx*nmx;
+    } else {
+        size_orth_rho = l1l1*mesh[2] + l1*mesh[1]*mesh[2] + mesh_size;
+    }
+    size += MAX(size_orth_rho, size_orth_components);
+    size += 1000000;
+    //printf("Memory allocated per thread for make_rho: %ld MB.\n", (size+mesh_size)*sizeof(double) / 1000000);
+    return size;
+}
+
+
+static size_t _rho_core_cache_size(int* mesh, double radius, double* dh)
+{
+    size_t size = 0;
+    size_t mesh_size = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+    size_t nmx = get_max_num_grid_orth(dh, radius);
+    int l = 0;
+    int l1 = 1;
+    int l1l1 = l1 * l1;
+    int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    size += l1 * (mesh[0] + mesh[1] + mesh[2]);
+    size += l1l1 * l1;
+    size += 3 * (_LEN_CART[l] + l1);
+
+    size_t size_orth_components = l1 * nmx + nmx;
+    size_t size_orth_rho = 0;
+    if (nmx < max_mesh) {
+        size_orth_rho = l1l1*nmx + l1*nmx*nmx + nmx*nmx*nmx;
+    } else {
+        size_orth_rho = l1l1*mesh[2] + l1*mesh[1]*mesh[2] + mesh_size;
+    }
+    size += MAX(size_orth_rho, size_orth_components);
+    //size += 1000000;
+    return size;
+}
+
+
+void grid_collocate_drv(void (*eval_rho)(), RS_Grid** rs_rho, double* dm, TaskList** task_list,
+                        int comp, int hermi, int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                        int dimension, double* Ls, double* a, double* b,
+                        int* ish_atm, int* ish_bas, double* ish_env,
+                        int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    int nlevels = gridlevel_info->nlevels;
+
+    assert (comp == (*rs_rho)->comp);
+
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    //const int nijsh = nish * njsh;
+    //const int naoi = ish_ao_loc[ish1] - ish_ao_loc[ish0];
+    const int naoj = jsh_ao_loc[jsh1] - jsh_ao_loc[jsh0];
+
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * nish);
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * nish);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * njsh);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * njsh);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    } 
+
+    int ilevel;
+    int *mesh;
+    double max_radius;
+    double *rho, *rhobufs[MAX_THREADS];
+    Task* task;
+    size_t ntasks;
+    PGFPair** pgfpairs;
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        task = (tl->tasks)[ilevel];
+        ntasks = task->ntasks;
+        if (ntasks <= 0) {
+            continue;
+        }
+        pgfpairs = task->pgfpairs;
+        max_radius = task->radius;
+
+        rho = (*rs_rho)->data[ilevel];
+        mesh = gridlevel_info->mesh + ilevel*3;
+
+        double dh[9];
+        get_grid_spacing(dh, a, mesh);
+
+        int *task_loc;
+        int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+        size_t cache_size = _rho_cache_size(MAX(ish_lmax,jsh_lmax), 
+                                            MAX(ish_nprim_max, jsh_nprim_max),
+                                            MAX(ish_nctr_max, jsh_nctr_max), mesh, max_radius, dh);
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+
+#pragma omp parallel
+{
+    PGFPair *pgfpair = NULL;
+    int iblock, itask, ish, jsh;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache0 = malloc(sizeof(double) * cache_size);
+    double *dm_cart = cache0;
+    double *dm_pgf = cache0 + ish_nprim_max*_LEN_CART[ish_lmax]*jsh_nprim_max*_LEN_CART[jsh_lmax];
+    double *cache = dm_pgf + _LEN_CART[ish_lmax]*_LEN_CART[jsh_lmax]; 
+
+    int thread_id = omp_get_thread_num();
+    double *rho_priv;
+    if (thread_id == 0) {
+        rho_priv = rho;
+    } else {
+        rho_priv = calloc(comp*ngrids, sizeof(double));
+    }
+    rhobufs[thread_id] = rho_priv;
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];
+        ptr_gto_norm_j = gto_norm_j[jsh];
+        transform_dm(dm_cart, dm, cart2sph_coeff_i[ish],
+                     cart2sph_coeff_j[jsh], ish_ao_loc, jsh_ao_loc,
+                     ish_bas, jsh_bas, ish, jsh, ish0, jsh0, naoj, cache);
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            get_dm_pgfpair(dm_pgf, dm_cart, pgfpair, ish_bas, jsh_bas, hermi);
+            _apply_rho(eval_rho, rho_priv, dm_pgf, pgfpair, comp, dimension, dh, a, b, mesh,
+                       ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                       jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+    }
+
+    free(cache0);
+    NPomp_dsum_reduce_inplace(rhobufs, comp*ngrids);
+    if (thread_id != 0) {
+        free(rho_priv);
+    }
+}
+    if (task_loc) {
+        free(task_loc);
+    }
+    } // loop ilevel
+
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
+
+
+void build_core_density(void (*eval_rho)(), double* rho,
+                        int* atm, int* bas, int nbas, double* env,
+                        int* mesh, int dimension, double* a, double* b, double max_radius)
+{
+    size_t ngrids;
+    ngrids = ((size_t) mesh[0]) * mesh[1] * mesh[2];
+
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+
+    double *rhobufs[MAX_THREADS];
+    size_t cache_size =  _rho_core_cache_size(mesh, max_radius, dh);
+
+#pragma omp parallel
+{
+    int ia, ib;
+    double alpha, coeff, charge, rad, fac;
+    double dm[] = {1.0};
+    double *r0;
+    double *cache = (double*) malloc(sizeof(double) * cache_size);
+
+    int thread_id = omp_get_thread_num();
+    double *rho_priv;
+    if (thread_id == 0) {
+        rho_priv = rho;
+    } else {
+        rho_priv = calloc(ngrids, sizeof(double));
+    }
+    rhobufs[thread_id] = rho_priv;
+
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib++) {
+        ia = bas[ib*BAS_SLOTS+ATOM_OF];
+        alpha = env[bas[ib*BAS_SLOTS+PTR_EXP]];
+        coeff = env[bas[ib*BAS_SLOTS+PTR_COEFF]];
+        charge = (double)atm[ia*ATM_SLOTS+CHARGE_OF];
+        r0 = env + atm[ia*ATM_SLOTS+PTR_COORD];
+        fac = -charge * coeff;
+        rad = env[atm[ia*ATM_SLOTS+PTR_RADIUS]];
+        eval_rho(rho_priv, dm, 1, 0, 0, alpha, 0., r0, r0,
+                 fac, rad, dimension, dh, a, b, mesh, cache);
+    }
+    free(cache);
+
+    NPomp_dsum_reduce_inplace(rhobufs, ngrids);
+    if (thread_id != 0) {
+        free(rho_priv);
+    }
+}
+}
+
+
+
+
+static void make_pgfparis_orth(
+            PGFPair* pgfpair, int comp, int dimension,
+            double* dh, double *a, double *b, int *mesh,
+            double* ish_gto_norm, double* jsh_gto_norm,
+            int *ish_atm, int *ish_bas, double *ish_env,
+            int *jsh_atm, int *jsh_bas, double *jsh_env,
+            double* Ls, double *cache)
+{
+        int ish = pgfpair->ish;
+        int jsh = pgfpair->jsh;
+        int ipgf = pgfpair->ipgf;
+        int jpgf = pgfpair->jpgf;
+        int iL = pgfpair->iL;
+        double cutoff = pgfpair->radius;
+
+        double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        double *rL = Ls + iL*3;
+        double rjL[3];
+        rjL[0] = rj[0] + rL[0];
+        rjL[1] = rj[1] + rL[1];
+        rjL[2] = rj[2] + rL[2];
+
+        const int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        const int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        double ai = ish_env[ish_bas[PTR_EXP+ish*BAS_SLOTS]+ipgf];
+        double aj = jsh_env[jsh_bas[PTR_EXP+jsh*BAS_SLOTS]+jpgf];
+        double ci = ish_gto_norm[ipgf];
+        double cj = jsh_gto_norm[jpgf];
+        double aij = ai + aj;
+        double rrij = CINTsquare_dist(ri, rjL);
+        double eij = (ai * aj / aij) * rrij;
+        if (eij > EIJCUTOFF) {
+                return;
+        }
+        double fac = exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+        if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {
+                return;
+        }
+
+        int topl = li + lj;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+}
+
+
+void eval_pgfpairs(TaskList** task_list,
+                    int comp, int hermi, int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                    int dimension, double* Ls, double* a, double* b,
+                    int* ish_atm, int* ish_bas, double* ish_env,
+                    int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    int nlevels = gridlevel_info->nlevels;
+
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    //const int nijsh = nish * njsh;
+    //const int naoi = ish_ao_loc[ish1] - ish_ao_loc[ish0];
+    //const int naoj = jsh_ao_loc[jsh1] - jsh_ao_loc[jsh0];
+
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * nish);
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * nish);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * njsh);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * njsh);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    } 
+
+    int ilevel;
+    int *mesh;
+    double max_radius;
+    Task* task;
+    size_t ntasks;
+    PGFPair** pgfpairs;
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        task = (tl->tasks)[ilevel];
+        ntasks = task->ntasks;
+        if (ntasks <= 0) {
+            continue;
+        }
+        pgfpairs = task->pgfpairs;
+        max_radius = task->radius;
+
+        mesh = gridlevel_info->mesh + ilevel*3;
+
+        double dh[9];
+        get_grid_spacing(dh, a, mesh);
+
+        int *task_loc;
+        int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+        size_t cache_size = _rho_cache_size(MAX(ish_lmax,jsh_lmax), 
+                                            MAX(ish_nprim_max, jsh_nprim_max),
+                                            MAX(ish_nctr_max, jsh_nctr_max), mesh, max_radius, dh);
+        //size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+
+#pragma omp parallel
+{
+    PGFPair *pgfpair = NULL;
+    int iblock, itask, ish, jsh;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache = malloc(sizeof(double) * cache_size);
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];
+        ptr_gto_norm_j = gto_norm_j[jsh];
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            make_pgfparis_orth(pgfpair, comp, dimension, dh, a, b, mesh,
+                               ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                               jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+    }
+
+    free(cache);
+}
+    if (task_loc) {
+        free(task_loc);
+    }
+    } // loop ilevel
+
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
diff --git a/pyscf/lib/dft/grid_common.c b/pyscf/lib/dft/grid_common.c
new file mode 100644
index 0000000000..f7e198ab17
--- /dev/null
+++ b/pyscf/lib/dft/grid_common.c
@@ -0,0 +1,660 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+
+#define EXPMIN         -700
+
+
+int get_lmax(int ish0, int ish1, int* bas)
+{
+    int lmax = 0;
+    int ish;
+    for (ish = ish0; ish < ish1; ish++) {
+        lmax = MAX(lmax, bas[ANG_OF+ish*BAS_SLOTS]);
+    }
+    return lmax;
+}
+
+
+int get_nprim_max(int ish0, int ish1, int* bas)
+{
+    int nprim_max = 1;
+    int ish;
+    for (ish = ish0; ish < ish1; ish++) {
+        nprim_max = MAX(nprim_max, bas[NPRIM_OF+ish*BAS_SLOTS]);
+    }
+    return nprim_max;
+}
+
+
+int get_nctr_max(int ish0, int ish1, int* bas)
+{
+    int nctr_max = 1;
+    int ish;
+    for (ish = ish0; ish < ish1; ish++) {
+        nctr_max = MAX(nctr_max, bas[NCTR_OF+ish*BAS_SLOTS]);
+    }
+    return nctr_max;
+}
+
+
+void get_cart2sph_coeff(double** contr_coeff, double** gto_norm, 
+                        int ish0, int ish1, int* bas, double* env, int cart)
+{
+    int l;
+    int lmax = get_lmax(ish0, ish1, bas);
+    int nprim, ncart, nsph, nctr;
+    int ptr_exp, ptr_coeff;
+    int ish, ipgf, ic, i, j;
+
+    double **c2s = (double**) malloc(sizeof(double*) * (lmax+1));
+    for (l = 0; l <= lmax; l++) {
+        ncart = _LEN_CART[l];
+        if (l <= 1 || cart == 1) {
+            c2s[l] = (double*) calloc(ncart*ncart, sizeof(double));
+            for (i = 0; i < ncart; i++) {
+                c2s[l][i*ncart + i] = 1;
+            }
+        }
+        else {
+            nsph = 2*l + 1;
+            c2s[l] = (double*) calloc(nsph*ncart, sizeof(double));
+            double* gcart = (double*) calloc(ncart*ncart, sizeof(double));
+            for (i = 0; i < ncart; i++) {
+                gcart[i*ncart + i] = 1;
+            }
+            CINTc2s_ket_sph(c2s[l], ncart, gcart, l);
+            free(gcart);
+        }
+    }
+
+#pragma omp parallel private (ish, ipgf, ic, i, j, l,\
+                              ncart, nsph, nprim, nctr,\
+                              ptr_exp, ptr_coeff)
+{
+    #pragma omp for schedule(dynamic) 
+    for (ish = ish0; ish < ish1; ish++) {
+        l = bas[ANG_OF+ish*BAS_SLOTS];
+        ncart = _LEN_CART[l];
+        nsph = cart == 1 ? ncart : 2*l+1;
+        nprim = bas[NPRIM_OF+ish*BAS_SLOTS];
+        nctr = bas[NCTR_OF+ish*BAS_SLOTS];
+
+        ptr_exp = bas[PTR_EXP+ish*BAS_SLOTS];
+        gto_norm[ish] = (double*) malloc(sizeof(double) * nprim);
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            gto_norm[ish][ipgf] = CINTgto_norm(l, env[ptr_exp+ipgf]);
+        }
+
+        ptr_coeff = bas[PTR_COEFF+ish*BAS_SLOTS];
+        double *buf = (double*) calloc(nctr*nprim, sizeof(double));
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            double inv_norm = 1./gto_norm[ish][ipgf];
+            daxpy_(&nctr, &inv_norm, env+ptr_coeff+ipgf, &nprim, buf+ipgf, &nprim);
+        }
+
+        contr_coeff[ish] = (double*) malloc(sizeof(double) * nprim*ncart*nctr*nsph);
+        double* ptr_contr_coeff = contr_coeff[ish];
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            for (i = 0; i < ncart; i++) {
+                for (ic = 0; ic < nctr; ic++) {
+                    for (j = 0; j < nsph; j++) {
+                        *ptr_contr_coeff = buf[ic*nprim+ipgf] * c2s[l][j*ncart+i];
+                        ptr_contr_coeff += 1;
+                    }
+                }
+            }
+        }
+        free(buf);
+    }
+}
+
+    for (l = 0; l <= lmax; l++) {
+        free(c2s[l]);
+    }
+    free(c2s);
+}
+
+
+void del_cart2sph_coeff(double** contr_coeff, double** gto_norm, int ish0, int ish1)
+{
+    int ish;
+    for (ish = ish0; ish < ish1; ish++) {
+        if (contr_coeff[ish]) {
+            free(contr_coeff[ish]);
+        }
+        if (gto_norm[ish]) {
+            free(gto_norm[ish]);
+        }
+    }
+    free(contr_coeff);
+    free(gto_norm);
+}
+
+
+int get_max_num_grid_orth(double* dh, double radius)
+{
+    double dx = MIN(MIN(dh[0], dh[4]), dh[8]);
+    int ngrid = 2 * (int) ceil(radius / dx) + 1;
+    return ngrid;
+}
+
+
+void get_grid_spacing(double* dh, double* a, int* mesh)
+{
+    int i, j;
+    for (i = 0; i < 3; i++) {
+        for (j = 0; j < 3; j++) {
+            dh[i*3+j] = a[i*3+j] / mesh[i];
+        }
+    }
+}
+
+
+int orth_components(double *xs_exp, int* bounds, double dx, double radius,
+                    double xi, double xj, double ai, double aj,
+                    int nx_per_cell, int topl, double *cache)
+{
+    double aij = ai + aj;
+    double xij = (ai * xi + aj * xj) / aij;
+    int x0_latt = (int) floor((xij - radius) / dx);
+    int x1_latt = (int) ceil((xij + radius) / dx);
+    int xij_latt = rint(xij / dx);
+    xij_latt = MAX(xij_latt, x0_latt);
+    xij_latt = MIN(xij_latt, x1_latt);
+    bounds[0] = x0_latt;
+    bounds[1] = x1_latt;
+    int ngridx = x1_latt - x0_latt;
+
+    double base_x = dx * xij_latt;
+    double x0xij = base_x - xij;
+    double _x0x0 = -aij * x0xij * x0xij;
+    if (_x0x0 < EXPMIN) {
+        return 0;
+    }
+
+    double *gridx = cache;
+    double *xs_all = xs_exp;
+    if (ngridx >= nx_per_cell) {
+        xs_all = gridx + ngridx;
+    }
+
+    double _dxdx = -aij * dx * dx;
+    double _x0dx = -2 * aij * x0xij * dx;
+    double exp_dxdx = exp(_dxdx);
+    double exp_2dxdx = exp_dxdx * exp_dxdx;
+    double exp_x0dx = exp(_x0dx + _dxdx);
+    double exp_x0x0 = exp(_x0x0);
+
+    int i;
+    int istart = xij_latt - x0_latt;
+    for (i = istart; i < ngridx; i++) {
+        xs_all[i] = exp_x0x0;
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+    }
+
+    exp_x0dx = exp(_dxdx - _x0dx);
+    exp_x0x0 = exp(_x0x0);
+    for (i = istart-1; i >= 0; i--) {
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+        xs_all[i] = exp_x0x0;
+    }
+
+    if (topl > 0) {
+        double x0xi = x0_latt * dx - xi;
+        for (i = 0; i < ngridx; i++) {
+            gridx[i] = x0xi + i * dx;
+        }
+        int l;
+        double *px0;
+        for (l = 1; l <= topl; l++) {
+            px0 = xs_all + (l-1) * ngridx;
+            for (i = 0; i < ngridx; i++) {
+                px0[ngridx+i] = px0[i] * gridx[i];
+            }
+        }
+    }
+
+    // add up contributions from all images to the referece image
+    if (ngridx >= nx_per_cell) {
+        memset(xs_exp, 0, (topl+1)*nx_per_cell*sizeof(double));
+        int ix, l, lb, ub, size_x;
+        for (ix = 0; ix < ngridx; ix++) {
+            lb = modulo(ix + x0_latt, nx_per_cell);
+            ub = get_upper_bound(lb, nx_per_cell, ix, ngridx);
+            size_x = ub - lb;
+            double* __restrict ptr_xs_exp = xs_exp + lb;
+            double* __restrict ptr_xs_all = xs_all + ix;
+            for (l = 0; l <= topl; l++) {
+                #pragma omp simd
+                for (i = 0; i < size_x; i++) {
+                    ptr_xs_exp[i] += ptr_xs_all[i];
+                }
+                ptr_xs_exp += nx_per_cell;
+                ptr_xs_all += ngridx;
+            }
+            ix += size_x - 1;
+        }
+
+        bounds[0] = 0;
+        bounds[1] = nx_per_cell;
+        ngridx = nx_per_cell;
+    }
+    return ngridx;
+}
+
+
+int _orth_components(double *xs_exp, int *img_slice, int *grid_slice,
+                     double a, double b, double cutoff,
+                     double xi, double xj, double ai, double aj,
+                     int periodic, int nx_per_cell, int topl, double *cache)
+{
+    double aij = ai + aj;
+    double xij = (ai * xi + aj * xj) / aij;
+    double heights_inv = b;
+    double xij_frac = xij * heights_inv;
+    double edge0 = xij_frac - cutoff * heights_inv;
+    double edge1 = xij_frac + cutoff * heights_inv;
+
+    if (edge0 == edge1) {
+        return 0;
+    }
+
+    int nimg0 = 0;
+    int nimg1 = 1;
+    if (periodic) {
+        nimg0 = (int) floor(edge0);
+        nimg1 = (int) ceil(edge1);
+    }
+
+    int nimg = nimg1 - nimg0;
+
+    int nmx0 = nimg0 * nx_per_cell;
+    int nmx1 = nimg1 * nx_per_cell;
+    int nmx = nmx1 - nmx0;
+
+    int nx0 = (int) floor(edge0 * nx_per_cell);
+    int nx1 = (int) ceil(edge1 * nx_per_cell);
+   
+    int nx0_edge = nx0 - nmx0;
+    int nx1_edge = nx1 - nmx0;
+
+    if (periodic) {
+        nx0 = nx0_edge % nx_per_cell;
+        nx1 = nx1_edge % nx_per_cell;
+        if (nx1 == 0) {
+            nx1 = nx_per_cell;
+        }
+    }
+    assert(nx0 == nx0_edge);
+
+    img_slice[0] = nimg0;
+    img_slice[1] = nimg1;
+    grid_slice[0] = nx0;
+    grid_slice[1] = nx1;
+
+    int ngridx = _num_grids_on_x(nimg, nx0, nx1, nx_per_cell);
+    if (ngridx == 0) {
+        return 0;
+    }
+
+    int i, m, l;
+    double *px0;
+
+    double *gridx = cache;
+    double *xs_all = cache + nmx;
+    if (nimg == 1) {
+        xs_all = xs_exp;
+    }
+
+    int grid_close_to_xij = rint(xij_frac * nx_per_cell) - nmx0;
+    grid_close_to_xij = MIN(grid_close_to_xij, nx1_edge);
+    grid_close_to_xij = MAX(grid_close_to_xij, nx0_edge);
+
+    double img0_x = a * nimg0;
+    double dx = a / nx_per_cell;
+    double base_x = img0_x + dx * grid_close_to_xij;
+    double x0xij = base_x - xij;
+    double _x0x0 = -aij * x0xij * x0xij;
+    if (_x0x0 < EXPMIN) {
+        return 0;
+    }
+
+    double _dxdx = -aij * dx * dx;
+    double _x0dx = -2 * aij * x0xij * dx;
+    double exp_dxdx = exp(_dxdx);
+    double exp_2dxdx = exp_dxdx * exp_dxdx;
+    double exp_x0dx = exp(_x0dx + _dxdx);
+    double exp_x0x0 = exp(_x0x0);
+
+    for (i = grid_close_to_xij; i < nx1_edge; i++) {
+        xs_all[i] = exp_x0x0;
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+    }
+
+    exp_x0dx = exp(_dxdx - _x0dx);
+    exp_x0x0 = exp(_x0x0);
+    for (i = grid_close_to_xij-1; i >= nx0_edge; i--) {
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+        xs_all[i] = exp_x0x0;
+    }
+
+    if (topl > 0) {
+        double x0xi = img0_x - xi;
+        for (i = nx0_edge; i < nx1_edge; i++) {
+            gridx[i] = x0xi + i * dx;
+        }
+        for (l = 1; l <= topl; l++) {
+            px0 = xs_all + (l-1) * nmx;
+            for (i = nx0_edge; i < nx1_edge; i++) {
+                px0[nmx+i] = px0[i] * gridx[i];
+            }
+        }
+    }
+
+    int idx1;
+    if (nimg > 1) {
+        for (l = 0; l <= topl; l++) {
+            px0 = xs_all + l * nmx;
+            for (i = nx0; i < nx_per_cell; i++) {
+                xs_exp[l*nx_per_cell+i] = px0[i];
+            }
+            memset(xs_exp+l*nx_per_cell, 0, nx0*sizeof(double));
+            for (m = 1; m < nimg; m++) {
+                px0 = xs_all + l * nmx + m*nx_per_cell;
+                idx1 = (m == nimg - 1) ? nx1 : nx_per_cell;
+                for (i = 0; i < idx1; i++) {
+                    xs_exp[l*nx_per_cell+i] += px0[i];
+                }
+            }
+        }
+    }
+    return ngridx;
+}
+
+
+int init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                   int *grid_slice, double* dh, int* mesh, int topl, double radius,
+                   double ai, double aj, double *ri, double *rj, double *cache)
+{
+    int l1 = topl + 1;
+    *xs_exp = cache;
+    *ys_exp = *xs_exp + l1 * mesh[0];
+    *zs_exp = *ys_exp + l1 * mesh[1];
+    int data_size = l1 * (mesh[0] + mesh[1] + mesh[2]);
+    cache += data_size;
+
+    int ngridx = orth_components(*xs_exp, grid_slice, dh[0], radius,
+                                 ri[0], rj[0], ai, aj, mesh[0], topl, cache);
+    if (ngridx == 0) {
+            return 0;
+    }
+
+    int ngridy = orth_components(*ys_exp, grid_slice+2, dh[4], radius,
+                                 ri[1], rj[1], ai, aj, mesh[1], topl, cache);
+    if (ngridy == 0) {
+            return 0;
+    }
+
+    int ngridz = orth_components(*zs_exp, grid_slice+4, dh[8], radius,
+                                 ri[2], rj[2], ai, aj, mesh[2], topl, cache);
+    if (ngridz == 0) {
+            return 0;
+    }
+
+    return data_size;
+}
+
+
+int _init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                    int *img_slice, int *grid_slice, int *mesh,
+                    int topl, int dimension, double cutoff,
+                    double ai, double aj, double *ri, double *rj,
+                    double *a, double *b, double *cache)
+{
+        int l1 = topl + 1;
+        *xs_exp = cache;
+        *ys_exp = *xs_exp + l1 * mesh[0];
+        *zs_exp = *ys_exp + l1 * mesh[1];
+        int data_size = l1 * (mesh[0] + mesh[1] + mesh[2]);
+        cache += data_size;
+
+        int ngridx = _orth_components(*xs_exp, img_slice, grid_slice,
+                                      a[0], b[0], cutoff, ri[0], rj[0], ai, aj,
+                                      (dimension>=1), mesh[0], topl, cache);
+        if (ngridx == 0) {
+                return 0;
+        }
+
+        int ngridy = _orth_components(*ys_exp, img_slice+2, grid_slice+2,
+                                      a[4], b[4], cutoff, ri[1], rj[1], ai, aj,
+                                      (dimension>=2), mesh[1], topl, cache);
+        if (ngridy == 0) {
+                return 0;
+        }
+
+        int ngridz = _orth_components(*zs_exp, img_slice+4, grid_slice+4,
+                                      a[8], b[8], cutoff, ri[2], rj[2], ai, aj,
+                                      (dimension>=3), mesh[2], topl, cache);
+        if (ngridz == 0) {
+                return 0;
+        }
+
+        return data_size;
+}
+
+
+void _get_dm_to_dm_xyz_coeff(double* coeff, double* rij, int lmax, double* cache)
+{
+    int l1 = lmax + 1;
+    int l, lx;
+
+    double *rx_pow = cache;
+    double *ry_pow = rx_pow + l1;
+    double *rz_pow = ry_pow + l1;
+
+    rx_pow[0] = 1.0;
+    ry_pow[0] = 1.0;
+    rz_pow[0] = 1.0;
+    for (lx = 1; lx <= lmax; lx++) {
+        rx_pow[lx] = rx_pow[lx-1] * rij[0];
+        ry_pow[lx] = ry_pow[lx-1] * rij[1];
+        rz_pow[lx] = rz_pow[lx-1] * rij[2];
+    }
+
+    int dj = _LEN_CART[lmax];
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    for (l = 0; l <= lmax; l++){
+        for (lx = 0; lx <= l; lx++) {
+            pcx[lx] = BINOMIAL(l, lx) * rx_pow[l-lx];
+            pcy[lx] = BINOMIAL(l, lx) * ry_pow[l-lx];
+            pcz[lx] = BINOMIAL(l, lx) * rz_pow[l-lx];
+        }
+        pcx += l+1;
+        pcy += l+1;
+        pcz += l+1;
+    }
+}
+
+
+void _dm_to_dm_xyz(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache)
+{
+    int lx, ly, lz;
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    int jx, jy, jz;
+    double rij[3];
+
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+
+    int l1 = li + lj + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache);
+
+    double cx, cxy, cxyz;
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pdm = dm;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    for (jx = 0; jx <= lx_j; jx++) {
+                        cx = pcx[jx+_LEN_CART0[lx_j]];
+                        lx = lx_i + jx;
+                        for (jy = 0; jy <= ly_j; jy++) {
+                            cxy = cx * pcy[jy+_LEN_CART0[ly_j]];
+                            ly = ly_i + jy;
+                            for (jz = 0; jz <= lz_j; jz++) {
+                                cxyz = cxy * pcz[jz+_LEN_CART0[lz_j]];
+                                lz = lz_i + jz;
+                                dm_xyz[lx*l1l1+ly*l1+lz] += cxyz * pdm[0];
+                            }
+                        }
+                    }
+                    pdm += 1;
+                }
+            }
+        }
+    }
+}
+
+
+void _dm_xyz_to_dm(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache)
+{
+    int lx, ly, lz;
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    int jx, jy, jz;
+    double rij[3];
+
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+
+    int l1 = li + lj + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache);
+
+    double cx, cy, cz;
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pdm = dm;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    for (jx = 0; jx <= lx_j; jx++) {
+                        cx = pcx[jx+_LEN_CART0[lx_j]];
+                        lx = lx_i + jx;
+                        for (jy = 0; jy <= ly_j; jy++) {
+                            cy = pcy[jy+_LEN_CART0[ly_j]];
+                            ly = ly_i + jy;
+                            for (jz = 0; jz <= lz_j; jz++) {
+                                cz = pcz[jz+_LEN_CART0[lz_j]];
+                                lz = lz_i + jz;
+                                pdm[0] += cx*cy*cz * dm_xyz[lx*l1l1+ly*l1+lz];
+                            }
+                        }
+                    }
+                    pdm += 1;
+                }
+            }
+        }
+    }
+}
+
+
+void get_dm_pgfpair(double* dm_pgf, double* dm_cart, 
+                    PGFPair* pgfpair, int* ish_bas, int* jsh_bas, int hermi)
+{
+    int ish = pgfpair->ish;
+    int jsh = pgfpair->jsh;
+    int ipgf = pgfpair->ipgf;
+    int jpgf = pgfpair->jpgf;
+
+    int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    int di = _LEN_CART[li];
+    int dj = _LEN_CART[lj];
+
+    int nprim_j = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+    int ncol = nprim_j * dj;
+    double *pdm = dm_cart + (ipgf*di*ncol + jpgf*dj);
+    double *pdm_pgf = dm_pgf;
+    int i, j;
+    for (i = 0; i < di; i++) {
+        for (j = 0; j < dj; j++) {
+            pdm_pgf[j] = pdm[j];
+        }
+        pdm_pgf += dj;
+        pdm += ncol;
+    }
+
+    /*
+    if (hermi == 1 && ish == jsh) {
+        assert(di == dj);
+        for (i = 0; i < di; i++) {
+            for (j = i+1; j < dj; j++) {
+                dm_pgf[i*dj+j] *= 2;
+                dm_pgf[j*dj+i] = 0;
+            }
+        }
+    }*/
+    if (hermi == 1 && ish != jsh) {
+        pdm_pgf = dm_pgf;
+        for (i = 0; i < di; i++) {
+            for (j = 0; j < dj; j++) {
+                pdm_pgf[j] *= 2;
+            }
+            pdm_pgf += dj;
+        }
+    }
+}
diff --git a/pyscf/lib/dft/grid_common.h b/pyscf/lib/dft/grid_common.h
new file mode 100644
index 0000000000..36dc7e3655
--- /dev/null
+++ b/pyscf/lib/dft/grid_common.h
@@ -0,0 +1,109 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ */
+
+#ifndef HAVE_DEFINED_GRID_COMMON_H
+#define HAVE_DEFINED_GRID_COMMON_H
+
+#include "cint.h"
+
+#define EIJCUTOFF        60
+#define PTR_EXPDROP      16
+
+extern double CINTsquare_dist(const double *r1, const double *r2);
+extern double CINTcommon_fac_sp(int l);
+
+int get_lmax(int ish0, int ish1, int* bas);
+int get_nprim_max(int ish0, int ish1, int* bas);
+int get_nctr_max(int ish0, int ish1, int* bas);
+void get_cart2sph_coeff(double** contr_coeff, double** gto_norm,
+                        int ish0, int ish1, int* bas, double* env, int cart);
+void del_cart2sph_coeff(double** contr_coeff, double** gto_norm, int ish0, int ish1);
+
+static inline int _has_overlap(int nx0, int nx1, int nx_per_cell)
+{
+    return nx0 <= nx1;
+}
+
+static inline int _num_grids_on_x(int nimgx, int nx0, int nx1, int nx_per_cell)
+{
+    int ngridx;
+    if (nimgx == 1) {
+        ngridx = nx1 - nx0;
+    } else if (nimgx == 2 && !_has_overlap(nx0, nx1, nx_per_cell)) {
+        ngridx = nx1 - nx0 + nx_per_cell;
+    } else {
+        ngridx = nx_per_cell;
+    }
+    return ngridx;
+}
+
+
+static inline void _get_grid_mapping(int* xmap, int nx0, int nx1, int ngridx, int nimgx, bool is_x_split)
+{
+    int ix, nx;
+    if (nimgx == 1) {
+        for (ix = 0; ix < ngridx; ix++) {
+            xmap[ix] = ix + nx0;
+        }
+    } else if (is_x_split) {
+        for (ix = 0; ix < nx1; ix++) {
+            xmap[ix] = ix;
+        }
+        nx = nx0 - nx1;
+        for (ix = nx1; ix < ngridx; ix++) {
+            xmap[ix] = ix + nx;
+        }
+    } else {
+        for (ix = 0; ix < ngridx; ix++) {
+            xmap[ix] = ix;
+        }
+    }
+}
+
+
+static inline int modulo(int i, int n)
+{
+    return (i % n + n) % n;
+}
+
+
+static inline int get_upper_bound(int x0, int nx_per_cell, int ix, int ngridx)
+{
+    return x0 + MIN(nx_per_cell - x0, ngridx - ix);
+}
+
+int _orth_components(double *xs_exp, int *img_slice, int *grid_slice,
+                     double a, double b, double cutoff,
+                     double xi, double xj, double ai, double aj,
+                     int periodic, int nx_per_cell, int topl, double *cache);
+int _init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                    int *img_slice, int *grid_slice, int *mesh,
+                    int topl, int dimension, double cutoff,
+                    double ai, double aj, double *ri, double *rj,
+                    double *a, double *b, double *cache);
+
+int init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                   int *grid_slice, double* dh, int* mesh, int topl, double radius,
+                   double ai, double aj, double *ri, double *rj, double *cache);
+void get_grid_spacing(double* dh, double* a, int* mesh);
+
+void _get_dm_to_dm_xyz_coeff(double* coeff, double* rij, int lmax, double* cache);
+void _dm_to_dm_xyz(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache);
+void _dm_xyz_to_dm(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache);
+void get_dm_pgfpair(double* dm_pgf, double* dm_cart,
+                    PGFPair* pgfpair, int* ish_bas, int* jsh_bas, int hermi);
+int get_max_num_grid_orth(double* dh, double radius);
+#endif
diff --git a/pyscf/lib/dft/grid_integrate.c b/pyscf/lib/dft/grid_integrate.c
new file mode 100644
index 0000000000..9cabe864cb
--- /dev/null
+++ b/pyscf/lib/dft/grid_integrate.c
@@ -0,0 +1,1358 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+#include "dft/utils.h"
+
+#define PTR_RADIUS      5
+
+
+void transform_dm_inverse(double* dm_cart, double* dm, int comp,
+                          double* ish_contr_coeff, double* jsh_contr_coeff,
+                          int* ish_ao_loc, int* jsh_ao_loc,
+                          int* ish_bas, int* jsh_bas, int ish, int jsh,
+                          int ish0, int jsh0, int naoi, int naoj, double* cache)
+{
+    int i0 = ish_ao_loc[ish] - ish_ao_loc[ish0];
+    int i1 = ish_ao_loc[ish+1] - ish_ao_loc[ish0];
+    int j0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    int j1 = jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0];
+
+    int nrow = i1 - i0;
+    int ncol = j1 - j0;
+    double* pdm = dm + ((size_t)naoj) * i0 + j0;
+
+    int l_i = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    int ncart_i = _LEN_CART[l_i];
+    int nprim_i = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+    int nao_i = nprim_i*ncart_i;
+    int l_j = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    int ncart_j = _LEN_CART[l_j];
+    int nprim_j = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+    int nao_j = nprim_j*ncart_j;
+
+    const char TRANS_T = 'T';
+    const char TRANS_N = 'N';
+    const double D1 = 1;
+    const double D0 = 0;
+    double *buf = cache;
+
+    int ic;
+    for (ic = 0; ic < comp; ic++) {
+        //einsum("pi,pq,qj->ij", coeff_i, dm_cart, coeff_j)
+        dgemm_(&TRANS_N, &TRANS_N, &ncol, &nao_i, &nao_j,
+               &D1, jsh_contr_coeff, &ncol, dm_cart, &nao_j, &D0, buf, &ncol);
+        dgemm_(&TRANS_N, &TRANS_T, &ncol, &nrow, &nao_i,
+               &D1, buf, &ncol, ish_contr_coeff, &nrow, &D0, pdm, &naoj);
+        pdm += ((size_t)naoi) * naoj;
+        dm_cart += nao_i * nao_j;
+    }
+}
+
+
+static void fill_tril(double* mat, int comp, int* ish_ao_loc, int* jsh_ao_loc,
+                      int ish, int jsh, int ish0, int jsh0, int naoi, int naoj)
+{
+    int i0 = ish_ao_loc[ish] - ish_ao_loc[ish0];
+    int i1 = ish_ao_loc[ish+1] - ish_ao_loc[ish0];
+    int j0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    int j1 = jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0];
+    int ni = i1 - i0;
+    int nj = j1 - j0;
+    size_t nao2 = ((size_t)naoi) * naoj;
+
+    double *pmat_up = mat + i0*((size_t)naoj) + j0;
+    double *pmat_low = mat + j0*((size_t)naoj) + i0;
+    int ic, i, j;
+    for (ic = 0; ic < comp; ic++) {
+        for (i = 0; i < ni; i++) {
+            for (j = 0; j < nj; j++) {
+                pmat_low[j*naoj+i] = pmat_up[i*naoj+j];
+            }
+        }
+        pmat_up += nao2;
+        pmat_low += nao2;
+    }
+}
+
+
+static void integrate_submesh(double* out, double* weights,
+                              double* xs_exp, double* ys_exp, double* zs_exp,
+                              double fac, int topl,
+                              int* mesh_lb, int* mesh_ub, int* submesh_lb,
+                              const int* mesh, const int* submesh, double* cache)
+{
+    const int l1 = topl + 1;
+    const int l1l1 = l1 * l1;
+    const int x0 = mesh_lb[0];
+    const int y0 = mesh_lb[1];
+    const int z0 = mesh_lb[2];
+
+    const int nx = mesh_ub[0] - x0;
+    const int ny = mesh_ub[1] - y0;
+    const int nz = mesh_ub[2] - z0;
+
+    const int x0_sub = submesh_lb[0];
+    const int y0_sub = submesh_lb[1];
+    const int z0_sub = submesh_lb[2];
+
+    const size_t mesh_yz = ((size_t) mesh[1]) * mesh[2];
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D0 = 0;
+    const double D1 = 1;
+
+    double *lzlyx = cache;
+    double *zly = lzlyx + l1l1 * nx;
+    double *ptr_weights = weights + x0 * mesh_yz + y0 * mesh[2] + z0;
+
+    int ix;
+    for (ix = 0; ix < nx; ix++) {
+        dgemm_wrapper(TRANS_N, TRANS_N, nz, l1, ny,
+                      D1, ptr_weights, mesh[2], ys_exp+y0_sub, submesh[1],
+                      D0, zly, nz);
+        dgemm_wrapper(TRANS_T, TRANS_N, l1, l1, nz,
+                      D1, zs_exp+z0_sub, submesh[2], zly, nz,
+                      D0, lzlyx+l1l1*ix, l1);
+        ptr_weights += mesh_yz;
+    }
+    dgemm_wrapper(TRANS_N, TRANS_N, l1l1, l1, nx,
+                  fac, lzlyx, l1l1, xs_exp+x0_sub, submesh[0],
+                  D1, out, l1l1);
+}
+
+
+static void _orth_ints(double *out, double *weights, int topl, double fac,
+                       double *xs_exp, double *ys_exp, double *zs_exp,
+                       int *grid_slice, int *mesh, double *cache)
+{// NOTE: out is accumulated
+    const int nx0 = grid_slice[0];
+    const int nx1 = grid_slice[1];
+    const int ny0 = grid_slice[2];
+    const int ny1 = grid_slice[3];
+    const int nz0 = grid_slice[4];
+    const int nz1 = grid_slice[5];
+    const int ngridx = nx1 - nx0;
+    const int ngridy = ny1 - ny0;
+    const int ngridz = nz1 - nz0;
+    if (ngridx == 0 || ngridy == 0 || ngridz == 0) {
+        return;
+    }
+
+    const int submesh[3] = {ngridx, ngridy, ngridz};
+    int lb[3], ub[3];
+    int ix, iy, iz;
+    for (ix = 0; ix < ngridx;) {
+        lb[0] = modulo(ix + nx0, mesh[0]);
+        ub[0] = get_upper_bound(lb[0], mesh[0], ix, ngridx);
+        for (iy = 0; iy < ngridy;) {
+            lb[1] = modulo(iy + ny0, mesh[1]);
+            ub[1] = get_upper_bound(lb[1], mesh[1], iy, ngridy);
+            for (iz = 0; iz < ngridz;) {
+                lb[2] = modulo(iz + nz0, mesh[2]);
+                ub[2] = get_upper_bound(lb[2], mesh[2], iz, ngridz);
+                int lb_sub[3] = {ix, iy, iz};
+                integrate_submesh(out, weights, xs_exp, ys_exp, zs_exp, fac, topl,
+                                  lb, ub, lb_sub, mesh, submesh, cache);
+                iz += ub[2] - lb[2];
+            }
+            iy += ub[1] - lb[1];
+        }
+        ix += ub[0] - lb[0];
+    }
+}
+
+
+#define VRHO_LOOP_IP1(X, Y, Z) \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int l##X##_i_m1 = l##X##_i - 1; \
+    int l##X##_i_p1 = l##X##_i + 1; \
+    double cx, cy, cz, cfac; \
+    double fac_i = -2.0 * ai; \
+    for (j##Y = 0; j##Y <= l##Y##_j; j##Y++) { \
+        c##Y = pc##Y[j##Y+_LEN_CART0[l##Y##_j]]; \
+        l##Y = l##Y##_i + j##Y; \
+        for (j##Z = 0; j##Z <= l##Z##_j; j##Z++) { \
+            c##Z = pc##Z[j##Z+_LEN_CART0[l##Z##_j]]; \
+            l##Z = l##Z##_i + j##Z; \
+            cfac = c##Y * c##Z; \
+            for (j##X = 0; j##X <= l##X##_j; j##X++) { \
+                if (l##X##_i > 0) { \
+                    c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * l##X##_i; \
+                    l##X = l##X##_i_m1 + j##X; \
+                    pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+                } \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * fac_i; \
+                l##X = l##X##_i_p1 + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+
+static void _vrho_loop_ip1_x(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(x,y,z);
+}
+
+
+static void _vrho_loop_ip1_y(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(y,x,z);
+}
+
+
+static void _vrho_loop_ip1_z(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(z,x,y);
+}
+
+
+#define VSIGMA_LOOP(X, Y, Z) \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int l##X##_i_m1 = l##X##_i - 1; \
+    int l##X##_i_p1 = l##X##_i + 1; \
+    int l##X##_j_m1 = l##X##_j - 1; \
+    int l##X##_j_p1 = l##X##_j + 1; \
+    double cx, cy, cz, cfac; \
+    double fac_i = -2.0 * ai; \
+    double fac_j = -2.0 * aj; \
+    for (j##Y = 0; j##Y <= l##Y##_j; j##Y++) { \
+        c##Y = pc##Y[j##Y+_LEN_CART0[l##Y##_j]]; \
+        l##Y = l##Y##_i + j##Y; \
+        for (j##Z = 0; j##Z <= l##Z##_j; j##Z++) { \
+            c##Z = pc##Z[j##Z+_LEN_CART0[l##Z##_j]]; \
+            l##Z = l##Z##_i + j##Z; \
+            cfac = c##Y * c##Z; \
+            for (j##X = 0; j##X <= l##X##_j_m1; j##X++) { \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j_m1]] * l##X##_j; \
+                l##X = l##X##_i + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##X = 0; j##X <= l##X##_j_p1; j##X++) { \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j_p1]] * fac_j; \
+                l##X = l##X##_i + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##X = 0; j##X <= l##X##_j; j##X++) { \
+                if (l##X##_i > 0) { \
+                    c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * l##X##_i; \
+                    l##X = l##X##_i_m1 + j##X; \
+                    pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+                } \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * fac_i; \
+                l##X = l##X##_i_p1 + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+
+static void _vsigma_loop_x(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(x,y,z);
+}
+
+
+static void _vsigma_loop_y(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(y,x,z);
+}
+
+
+static void _vsigma_loop_z(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(z,x,y);
+}
+
+
+static void _v1_xyz_to_v1(void (*_v1_loop)(), double* v1_xyz, double* v1,
+                          int li, int lj, double ai, double aj,
+                          double* ri, double* rj, double* cache)
+{
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    double rij[3];
+
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+
+    int l1 = li + lj + 2;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj+1];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj+1, cache);
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1_xyz, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1;
+                }
+            }
+        }
+    }
+}
+
+/*
+#define SUM_NABLA_I \
+        if (lx_i > 0) { \
+            pv1[0] += lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz]; \
+        if (ly_i > 0) { \
+            pv1[0] += ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz]; \
+        if (lz_i > 0) { \
+            pv1[0] += lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz-1]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz+1];
+*/
+/*
+static void _vsigma_loop_ip1ip2_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int lx_j_m1 = lx_j - 1;
+    int lx_j_p1 = lx_j + 1;
+    double cxj, cyj, czj, cyzj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jy = 0; jy <= ly_j; jy++) {
+        cyj = pcy[jy+_LEN_CART0[ly_j]];
+        ly = ly_i + jy;
+        for (jz = 0; jz <= lz_j; jz++) {
+            czj = pcz[jz+_LEN_CART0[lz_j]];
+            lz = lz_i + jz;
+            cyzj = cyj * czj;
+            for (jx = 0; jx <= lx_j_m1; jx++) {
+                cxj = pcx[jx+_LEN_CART0[lx_j_m1]] * lx_j;
+                cxyzj = cxj * cyzj;
+                lx = lx_i + jx;
+                SUM_NABLA_I;
+            }
+            for (jx = 0; jx <= lx_j_p1; jx++) {
+                cxj = pcx[jx+_LEN_CART0[lx_j_p1]] * fac_j;
+                cxyzj = cxj * cyzj;
+                lx = lx_i + jx;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+
+#define COMMON_INIT(x) \
+    int l##x##_i; \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int lx_j_m1 = lx_j - 1; \
+    int lx_j_p1 = lx_j + 1; \
+    int ly_j_m1 = ly_j - 1; \
+    int ly_j_p1 = ly_j + 1; \
+    int lz_j_m1 = lz_j - 1; \
+    int lz_j_p1 = lz_j + 1; \
+    double ci; \
+    double cxj, cyj, czj; \
+    double cyzj, cxzj, cxyj, cxyzj; \
+    double fac_i = -2.0 * ai; \
+    double fac_j = -2.0 * aj; \
+
+
+#define SUM_NABLA_J(x, y, z) \
+    for (j##y = 0; j##y <= l##y##_j; j##y++) { \
+        c##y##j = pc##y[j##y+_LEN_CART0[l##y##_j]]; \
+        l##y = l##y##_i + j##y; \
+        for (j##z = 0; j##z <= l##z##_j; j##z++) { \
+            c##z##j = pc##z[j##z+_LEN_CART0[l##z##_j]]; \
+            l##z = l##z##_i + j##z; \
+            c##y##z##j = c##y##j * c##z##j; \
+            for (j##x = 0; j##x <= l##x##_j_m1; j##x++) { \
+                c##x##j = pc##x[j##x+_LEN_CART0[l##x##_j_m1]] * l##x##_j; \
+                cxyzj = c##x##j * c##y##z##j; \
+                l##x = l##x##_i + j##x; \
+                pv1[0] += ci * cxyzj * v1##x[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##x = 0; j##x <= l##x##_j_p1; j##x++) { \
+                c##x##j = pc##x[j##x+_LEN_CART0[l##x##_j_p1]] * fac_j; \
+                cxyzj = c##x##j * c##y##z##j; \
+                l##x = l##x##_i + j##x; \
+                pv1[0] += ci * cxyzj * v1##x[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+
+static void _vsigma_loop_ip1ip2_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i0, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(x);
+
+    lx_i = lx_i0 + 1;
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (lx_i0 > 0) {
+        lx_i = lx_i0 - 1;
+        ci = lx_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+/*
+static void _vsigma_loop_ip1ip2_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int ly_j_m1 = ly_j - 1;
+    int ly_j_p1 = ly_j + 1;
+    double cxj, cyj, czj, cxzj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jz = 0; jz <= lz_j; jz++) {
+            czj = pcz[jz+_LEN_CART0[lz_j]];
+            lz = lz_i + jz;
+            cxzj = cxj * czj;
+            for (jy = 0; jy <= ly_j_m1; jy++) {
+                cyj = pcy[jy+_LEN_CART0[ly_j_m1]] * ly_j;
+                cxyzj = cyj * cxzj;
+                ly = ly_i + jy;
+                SUM_NABLA_I;
+            }
+            for (jy = 0; jy <= ly_j_p1; jy++) {
+                cyj = pcy[jy+_LEN_CART0[ly_j_p1]] * fac_j;
+                cxyzj = cyj * cxzj;
+                ly = ly_i + jy;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+
+static void _vsigma_loop_ip1ip2_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i0, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(y);
+
+    ly_i = ly_i0 + 1;
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (ly_i0 > 0) {
+        ly_i = ly_i0 - 1;
+        ci = ly_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+
+/*
+static void _vsigma_loop_ip1ip2_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int lz_j_m1 = lz_j - 1;
+    int lz_j_p1 = lz_j + 1;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j_m1; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j_m1]] * lz_j;
+                cxyzj = czj * cxyj;
+                lz = lz_i + jz;
+                SUM_NABLA_I;
+            }
+            for (jz = 0; jz <= lz_j_p1; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j_p1]] * fac_j;
+                cxyzj = czj * cxyj;
+                lz = lz_i + jz;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+
+static void _vsigma_loop_ip1ip2_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i0,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(z);
+
+    lz_i = lz_i0 + 1;
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (lz_i0 > 0) {
+        lz_i = lz_i0 - 1;
+        ci = lz_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+
+static void _vsigma_ip1ip2(void (*_v1_loop)(), double* v1x,
+                           double* v1y, double* v1z, double* v1,
+                           int li, int lj, double ai, double aj,
+                           double* ri, double* rj, double* cache)
+{
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    double rij[3];
+
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+
+    int topl = li + lj + 2;
+    int l1 = topl + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj+1];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj+1, cache);
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1x, v1y, v1z, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1;
+                }
+            }
+        }
+    }
+}
+
+
+static void _vsigma_loop_lap1_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_x;
+    double fac_i = -2.0 * ai;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_x = lx_i + 1;
+                pv1[0] += fac_x * fac_i * cxyzj * v1x[lx*l1l1+ly*l1+lz];
+                if (lx_i - 1 > 0) {
+                    fac_x = lx_i - 1;
+                    pv1[0] += fac_x * lx_i * cxyzj * v1x[(lx-2)*l1l1+ly*l1+lz];
+                }
+
+                if (lx_i > 0) {
+                    fac_x = lx_i;
+                    if (ly_i > 0) {
+                        pv1[0] += fac_x * ly_i * cxyzj * v1y[(lx-1)*l1l1+(ly-1)*l1+lz];
+                    }
+                    pv1[0] += fac_x * fac_i * cxyzj * v1y[(lx-1)*l1l1+(ly+1)*l1+lz];
+
+                    if (lz_i > 0) {
+                        pv1[0] += fac_x * lz_i * cxyzj * v1z[(lx-1)*l1l1+ly*l1+lz-1];
+                    }
+                    pv1[0] += fac_x * fac_i * cxyzj * v1z[(lx-1)*l1l1+ly*l1+lz+1];
+                }
+
+                fac_x = fac_i;
+                if (lx_i > 0) {
+                    pv1[0] += fac_x * lx_i * cxyzj * v1x[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1x[(lx+2)*l1l1+ly*l1+lz];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_x * ly_i * cxyzj * v1y[(lx+1)*l1l1+(ly-1)*l1+lz];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1y[(lx+1)*l1l1+(ly+1)*l1+lz];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_x * lz_i * cxyzj * v1z[(lx+1)*l1l1+ly*l1+lz-1];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1z[(lx+1)*l1l1+ly*l1+lz+1];
+            }
+        }
+    }
+}
+
+
+static void _vsigma_loop_lap1_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_y;
+    double fac_i = -2.0 * ai;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_y = ly_i + 1;
+                pv1[0] += fac_y * fac_i * cxyzj * v1y[lx*l1l1+ly*l1+lz];
+                if (ly_i - 1 > 0) {
+                    fac_y = ly_i - 1;
+                    pv1[0] += fac_y * ly_i * cxyzj * v1y[lx*l1l1+(ly-2)*l1+lz];
+                }
+
+                if (ly_i > 0) {
+                    fac_y = ly_i;
+                    if (lx_i > 0) {
+                        pv1[0] += fac_y * lx_i * cxyzj * v1x[(lx-1)*l1l1+(ly-1)*l1+lz];
+                    }
+                    pv1[0] += fac_y * fac_i * cxyzj * v1x[(lx+1)*l1l1+(ly-1)*l1+lz];
+
+                    if (lz_i > 0) {
+                        pv1[0] += fac_y * lz_i * cxyzj * v1z[lx*l1l1+(ly-1)*l1+lz-1];
+                    }
+                    pv1[0] += fac_y * fac_i * cxyzj * v1z[lx*l1l1+(ly-1)*l1+lz+1];
+                }
+
+                fac_y = fac_i;
+                if (lx_i > 0) {
+                    pv1[0] += fac_y * lx_i * cxyzj * v1x[(lx-1)*l1l1+(ly+1)*l1+lz];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1x[(lx+1)*l1l1+(ly+1)*l1+lz];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_y * ly_i * cxyzj * v1y[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1y[lx*l1l1+(ly+2)*l1+lz];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_y * lz_i * cxyzj * v1z[lx*l1l1+(ly+1)*l1+lz-1];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1z[lx*l1l1+(ly+1)*l1+lz+1];
+            }
+        }
+    }
+}
+
+
+static void _vsigma_loop_lap1_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_z;
+    double fac_i = -2.0 * ai;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_z = lz_i + 1;
+                pv1[0] += fac_z * fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz];
+                if (lz_i - 1 > 0) {
+                    fac_z = lz_i - 1;
+                    pv1[0] += fac_z * lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz-2];
+                }
+
+                if (lz_i > 0) {
+                    fac_z = lz_i;
+                    if (lx_i > 0) {
+                        pv1[0] += fac_z * lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz-1];
+                    }
+                    pv1[0] += fac_z * fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz-1];
+
+                    if (ly_i > 0) {
+                        pv1[0] += fac_z * ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz-1];
+                    }
+                    pv1[0] += fac_z * fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz-1];
+                }
+
+                fac_z = fac_i;
+                if (lx_i > 0) {
+                    pv1[0] += fac_z * lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz+1];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz+1];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_z * ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz+1];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz+1];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_z * lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz+2];
+            }
+        }
+    }
+}
+
+
+static void _vsigma_lap1(void (*_v1_loop)(), double* v1x,
+                         double* v1y, double* v1z, double* v1,
+                         int li, int lj, double ai, double aj,
+                         double* ri, double* rj, double* cache)
+{
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    double rij[3];
+
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+
+    int topl = li + lj + 2;
+    int l1 = topl + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache);
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1x, v1y, v1z, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1;
+                }
+            }
+        }
+    }
+}
+
+
+int eval_mat_gga_orth(double *weights, double *out, int comp,
+                      int li, int lj, double ai, double aj,
+                      double *ri, double *rj, double fac, double cutoff,
+                      int dimension, double* dh, double *a, double *b,
+                      int *mesh, double *cache)
+{
+        int topl = li + lj + 1;
+        int l1 = topl+1;
+        int l1l1l1 = l1 * l1 * l1;
+        double *mat_xyz = cache;
+        cache += l1l1l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;
+
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+        double *vx = weights + ngrids;
+        double *vy = vx + ngrids;
+        double *vz = vy + ngrids;
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, weights, li+lj, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _dm_xyz_to_dm(mat_xyz, out, li, lj, ri, rj, cache);
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vx, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_x, mat_xyz, out, li, lj, ai, aj, ri, rj, cache);
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vy, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_y, mat_xyz, out, li, lj, ai, aj, ri, rj, cache);
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vz, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_z, mat_xyz, out, li, lj, ai, aj, ri, rj, cache);
+
+        return 1;
+}
+
+
+int eval_mat_lda_orth(double *weights, double *out, int comp,
+                      int li, int lj, double ai, double aj,
+                      double *ri, double *rj, double fac, double cutoff,
+                      int dimension, double* dh, double *a, double *b,
+                      int *mesh, double *cache)
+{
+        int topl = li + lj;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;
+
+        double *dm_xyz = cache;
+        cache += l1l1l1;
+
+        memset(dm_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(dm_xyz, weights, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        _dm_xyz_to_dm(dm_xyz, out, li, lj, ri, rj, cache);
+        return 1;
+}
+
+
+int eval_mat_lda_orth_ip1(double *weights, double *out, int comp,
+                          int li, int lj, double ai, double aj,
+                          double *ri, double *rj, double fac, double cutoff,
+                          int dimension, double* dh, double *a, double *b,
+                          int *mesh, double *cache)
+{
+        int dij = _LEN_CART[li] * _LEN_CART[lj];
+        int topl = li + lj + 1;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;
+
+        double *mat_xyz = cache;
+        cache += l1l1l1;
+        double *pout_x = out;
+        double *pout_y = pout_x + dij;
+        double *pout_z = pout_y + dij;
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, weights, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_x, mat_xyz, pout_x, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_y, mat_xyz, pout_y, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_z, mat_xyz, pout_z, li, lj, ai, aj, ri, rj, cache);
+        return 1;
+}
+
+
+int eval_mat_gga_orth_ip1(double *weights, double *out, int comp,
+                          int li, int lj, double ai, double aj,
+                          double *ri, double *rj, double fac, double cutoff,
+                          int dimension, double* dh, double *a, double *b,
+                          int *mesh, double *cache)
+{
+        int dij = _LEN_CART[li] * _LEN_CART[lj];
+        int topl = li + lj + 2;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;
+
+        double *mat_xyz = cache;
+        double *mat_x = mat_xyz;
+        double *mat_y = mat_x + l1l1l1;
+        double *mat_z = mat_y + l1l1l1;
+        cache += l1l1l1*3;
+        double *pout_x = out;
+        double *pout_y = pout_x + dij;
+        double *pout_z = pout_y + dij;
+
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+        double *vx = weights + ngrids;
+        double *vy = vx + ngrids;
+        double *vz = vy + ngrids;
+
+        //vrho part
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, weights, topl-1, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_x, mat_xyz, pout_x, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_y, mat_xyz, pout_y, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_z, mat_xyz, pout_z, li, lj, ai, aj, ri, rj, cache);
+
+        //vsigma part
+        memset(mat_x, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_x, vx, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        memset(mat_y, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_y, vy, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        memset(mat_z, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_z, vz, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_x, mat_x, mat_y, mat_z,
+                       pout_x, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_y, mat_x, mat_y, mat_z,
+                       pout_y, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_z, mat_x, mat_y, mat_z,
+                       pout_z, li, lj, ai, aj, ri, rj, cache);
+
+        _vsigma_lap1(_vsigma_loop_lap1_x, mat_x, mat_y, mat_z,
+                     pout_x, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_lap1(_vsigma_loop_lap1_y, mat_x, mat_y, mat_z,
+                     pout_y, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_lap1(_vsigma_loop_lap1_z, mat_x, mat_y, mat_z,
+                     pout_z, li, lj, ai, aj, ri, rj, cache);
+        return 1;
+}
+
+
+void _apply_ints(int (*eval_ints)(), double *weights, double *mat,
+                        PGFPair* pgfpair, int comp, double fac, int dimension,
+                        double* dh, double *a, double *b, int *mesh,
+                        double* ish_gto_norm, double* jsh_gto_norm,
+                        int *ish_atm, int *ish_bas, double *ish_env,
+                        int *jsh_atm, int *jsh_bas, double *jsh_env,
+                        double* Ls, double *cache)
+{
+    int i_sh = pgfpair->ish;
+    int j_sh = pgfpair->jsh;
+    int ipgf = pgfpair->ipgf;
+    int jpgf = pgfpair->jpgf;
+    int iL = pgfpair->iL;
+    double cutoff = pgfpair->radius;
+
+    int li = ish_bas[ANG_OF+i_sh*BAS_SLOTS];
+    int lj = jsh_bas[ANG_OF+j_sh*BAS_SLOTS];
+    int di = _LEN_CART[li];
+    int dj = _LEN_CART[lj];
+
+    int ish_nprim = ish_bas[NPRIM_OF+i_sh*BAS_SLOTS];
+    int jsh_nprim = jsh_bas[NPRIM_OF+j_sh*BAS_SLOTS];
+    int naoi = ish_nprim * di;
+    int naoj = jsh_nprim * dj;
+
+    double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+i_sh*BAS_SLOTS]*ATM_SLOTS];
+    double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+j_sh*BAS_SLOTS]*ATM_SLOTS];
+    double *rL = Ls + iL*3;
+    double rjL[3];
+    rjL[0] = rj[0] + rL[0];
+    rjL[1] = rj[1] + rL[1];
+    rjL[2] = rj[2] + rL[2];
+
+    double ai = ish_env[ish_bas[PTR_EXP+i_sh*BAS_SLOTS]+ipgf];
+    double aj = jsh_env[jsh_bas[PTR_EXP+j_sh*BAS_SLOTS]+jpgf];
+    double ci = ish_gto_norm[ipgf];
+    double cj = jsh_gto_norm[jpgf];
+    double aij = ai + aj;
+    double rrij = CINTsquare_dist(ri, rjL);
+    double eij = (ai * aj / aij) * rrij;
+    if (eij > EIJCUTOFF) {
+        return;
+    }
+    fac *= exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+    if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {
+        return;
+    }
+
+    double *out = cache;
+    memset(out, 0, comp*di*dj*sizeof(double));
+    cache += comp * di * dj;
+
+    int value = (*eval_ints)(weights, out, comp, li, lj, ai, aj, ri, rjL,
+                             fac, cutoff, dimension, dh, a, b, mesh, cache);
+
+    double *pmat = mat + ipgf*di*naoj + jpgf*dj;
+    if (value != 0) {
+        int i, j, ic;
+        for (ic = 0; ic < comp; ic++) {
+            for (i = 0; i < di; i++) {
+                #pragma omp simd
+                for (j = 0; j < dj; j++) {
+                    pmat[i*naoj+j] += out[i*dj+j];
+                } 
+            }
+            pmat += naoi * naoj;
+            out += di * dj;
+        }
+    }
+}
+
+
+static size_t _ints_cache_size(int l, int nprim, int nctr, int* mesh, double radius, double* dh, int comp)
+{
+    size_t size = 0;
+    size_t nmx = get_max_num_grid_orth(dh, radius);
+    int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    int l1 = 2 * l + 1;
+    if (comp == 3) {
+        l1 += 1;
+    }
+    int l1l1 = l1 * l1;
+    int ncart = _LEN_CART[l1]; // use l1 to be safe
+
+    size += comp * nprim * nprim * ncart * ncart; // dm_cart
+    size += comp * ncart * ncart; // out
+    size += l1 * (mesh[0] + mesh[1] + mesh[2]); // xs_exp, ys_exp, zs_exp
+
+    size_t size_orth_components = l1 * nmx + nmx; // orth_components
+    size += l1l1 * l1; // dm_xyz
+    size += 3 * (ncart + l1); // _dm_xyz_to_dm
+
+    size_t size_orth_ints = 0;
+    if (nmx < max_mesh) {
+        size_orth_ints = (l1 + l1l1) * nmx;
+    } else {
+        size_orth_ints = l1*mesh[2] + l1l1*mesh[0];
+    }
+    size += MAX(size_orth_components, size_orth_ints);
+    size += nctr * ncart * nprim * ncart;
+    //size += 1000000;
+    //printf("Memory allocated per thread for make_mat: %ld MB.\n", size*sizeof(double) / 1000000);
+    return size;
+}
+
+
+static size_t _ints_core_cache_size(int* mesh, double radius, double* dh, int comp)
+{
+    size_t size = 0;
+    size_t nmx = get_max_num_grid_orth(dh, radius);
+    int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    const int l = 0;
+    int l1 = l + 1;
+    if (comp == 3) {
+        l1 += 1;
+    }
+    int l1l1 = l1 * l1;
+    int ncart = _LEN_CART[l1];
+
+    size_t size_orth_components = l1 * nmx + nmx;
+    size_t size_orth_ints = 0;
+    if (nmx < max_mesh) {
+        size_orth_ints = (l1 + l1l1) * nmx;
+    } else {
+        size_orth_ints = l1*mesh[2] + l1l1*mesh[0];
+    }
+    size += MAX(size_orth_components, size_orth_ints);
+    size += l1 * (mesh[0] + mesh[1] + mesh[2]);
+    size += l1l1 * l1;
+    size += 3 * (ncart + l1);
+    //size += 1000000;
+    return size;
+}
+
+
+void grid_integrate_drv(int (*eval_ints)(), double* mat, double* weights, TaskList** task_list,
+                        int comp, int hermi, int grid_level, 
+                        int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                        int dimension, double* Ls, double* a, double* b,
+                        int* ish_atm, int* ish_bas, double* ish_env,
+                        int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    Task *task = (tl->tasks)[grid_level];
+    int ntasks = task->ntasks;
+    if (ntasks <= 0) {
+        return;
+    }
+    double max_radius = task->radius;
+    PGFPair **pgfpairs = task->pgfpairs;
+    int* mesh = gridlevel_info->mesh + grid_level*3;
+
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    //const int nijsh = nish * njsh;
+    const int naoi = ish_ao_loc[ish1] - ish_ao_loc[ish0];
+    const int naoj = jsh_ao_loc[jsh1] - jsh_ao_loc[jsh0];
+
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    }
+
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * nish);
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * nish);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * njsh);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * njsh);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+
+    int *task_loc;
+    int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+    size_t cache_size = _ints_cache_size(MAX(ish_lmax,jsh_lmax),
+                                         MAX(ish_nprim_max, jsh_nprim_max),
+                                         MAX(ish_nctr_max, jsh_nctr_max), 
+                                         mesh, max_radius, dh, comp);
+
+#pragma omp parallel
+{
+    int ish, jsh, itask, iblock;
+    int li, lj, ish_nprim, jsh_nprim;
+    PGFPair *pgfpair = NULL;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache0 = malloc(sizeof(double) * cache_size);
+    double *dm_cart = cache0;
+    int len_dm_cart = comp*ish_nprim_max*_LEN_CART[ish_lmax]*jsh_nprim_max*_LEN_CART[jsh_lmax];
+    double *cache = dm_cart + len_dm_cart;
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];
+        ptr_gto_norm_j = gto_norm_j[jsh];
+        li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        ish_nprim = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+        jsh_nprim = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+        len_dm_cart = comp*ish_nprim*_LEN_CART[li]*jsh_nprim*_LEN_CART[lj];
+        memset(dm_cart, 0, len_dm_cart * sizeof(double));
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            _apply_ints(eval_ints, weights, dm_cart, pgfpair, comp, 1.0, dimension, dh, a, b, mesh,
+                        ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                        jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+        transform_dm_inverse(dm_cart, mat, comp,
+                             cart2sph_coeff_i[ish], cart2sph_coeff_j[jsh],
+                             ish_ao_loc, jsh_ao_loc, ish_bas, jsh_bas,
+                             ish, jsh, ish0, jsh0, naoi, naoj, cache);
+        if (hermi == 1 && ish != jsh) {
+            fill_tril(mat, comp, ish_ao_loc, jsh_ao_loc,
+                      ish, jsh, ish0, jsh0, naoi, naoj);
+        }
+    }
+    free(cache0);
+}
+
+    if (task_loc) {
+        free(task_loc);
+    }
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
+
+
+void int_gauss_charge_v_rs(int (*eval_ints)(), double* out, double* v_rs, int comp,
+                           int* atm, int* bas, int nbas, double* env,
+                           int* mesh, int dimension, double* a, double* b, double max_radius)
+{
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+
+    size_t cache_size = _ints_core_cache_size(mesh, max_radius, dh, comp);
+
+#pragma omp parallel
+{
+    int ia, ib;
+    double alpha, coeff, charge, rad, fac;
+    double *r0;
+    double *cache = (double*) malloc(sizeof(double) * cache_size);
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib++) {
+        ia = bas[ib*BAS_SLOTS+ATOM_OF];
+        alpha = env[bas[ib*BAS_SLOTS+PTR_EXP]];
+        coeff = env[bas[ib*BAS_SLOTS+PTR_COEFF]];
+        charge = (double)atm[ia*ATM_SLOTS+CHARGE_OF];
+        r0 = env + atm[ia*ATM_SLOTS+PTR_COORD];
+        fac = -charge * coeff;
+        rad = env[atm[ia*ATM_SLOTS+PTR_RADIUS]];
+        (*eval_ints)(v_rs, out+ia*comp, comp, 0, 0, alpha, 0.0, r0, r0, 
+                     fac, rad, dimension, dh, a, b, mesh, cache);
+    }
+    free(cache);
+}
+}
diff --git a/pyscf/lib/dft/libxc_itrf.c b/pyscf/lib/dft/libxc_itrf.c
index 76d7497980..badeab597a 100644
--- a/pyscf/lib/dft/libxc_itrf.c
+++ b/pyscf/lib/dft/libxc_itrf.c
@@ -15,6 +15,7 @@
  *
  * Authors: Qiming Sun <osirpt.sun@gmail.com>
  *          Susi Lehtola <susi.lehtola@gmail.com>
+ *          Xing Zhang <zhangxing.nju@gmail.com>
  *
  * libxc from
  * http://www.tddft.org/programs/octopus/wiki/index.php/Libxc:manual
@@ -24,7 +25,10 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <xc.h>
+#include "config.h"
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX_THREADS     256
 
 // TODO: register python signal
 #define raise_error     return
@@ -83,13 +87,13 @@
  * In spin restricted case (spin == 1), rho_u is assumed to be the
  * spin-free quantities, rho_d is not used.
  */
-static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
+static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np, int ld_rho_u)
 {
         int i;
         double *sigma, *tau;
         double *gxu, *gyu, *gzu, *gxd, *gyd, *gzd;
         double *tau_u, *tau_d;
-        double *rho_d = rho_u + np * nvar;
+        double *rho_d = rho_u + ld_rho_u * nvar;
 
         switch (nvar) {
         case LDA_NVAR:
@@ -107,12 +111,12 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
         case GGA_NVAR:
                 if (spin == 1) {
                         sigma = rho + np * 2;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        gxd = rho_d + ld_rho_u;
+                        gyd = rho_d + ld_rho_u * 2;
+                        gzd = rho_d + ld_rho_u * 3;
                         for (i = 0; i < np; i++) {
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = rho_d[i];
@@ -122,9 +126,9 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                         }
                 } else {
                         sigma = rho + np;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
                         for (i = 0; i < np; i++) {
                                 rho[i] = rho_u[i];
                                 sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
@@ -135,14 +139,14 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                 if (spin == 1) {
                         sigma = rho + np * 2;
                         tau = sigma + np * 3;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
-                        tau_u  = rho_u + np * 4;
-                        tau_d  = rho_d + np * 4;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        gxd = rho_d + ld_rho_u;
+                        gyd = rho_d + ld_rho_u * 2;
+                        gzd = rho_d + ld_rho_u * 3;
+                        tau_u  = rho_u + ld_rho_u * 4;
+                        tau_d  = rho_d + ld_rho_u * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = rho_d[i];
@@ -157,10 +161,10 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                 } else {
                         sigma = rho + np;
                         tau  = sigma + np;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        tau_u = rho_u + np * 4;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        tau_u = rho_u + ld_rho_u * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i] = rho_u[i];
                                 sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
@@ -171,7 +175,7 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
         }
 }
 static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
-                     double *rho, double *exc)
+                     double *rho, double *exc, int offset, int blksize)
 {
         double *sigma, *tau;
         double *lapl = rho;
@@ -266,6 +270,21 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                         if (deriv > 3) {
                                 v4rho4 = v3rho3 + np * 4;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset * 2;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset * 3;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset * 4;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset * 5;
+                        }
                 } else {
                         if (deriv > 0) {
                                 vrho = exc + np;
@@ -279,15 +298,30 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                         if (deriv > 3) {
                                 v4rho4 = v3rho3 + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset;
+                        }
                 }
-                xc_lda(func_x, np, rho, exc, vrho, v2rho2, v3rho3, v4rho4);
+                xc_lda(func_x, blksize, rho, exc, vrho, v2rho2, v3rho3, v4rho4);
                 break;
         case XC_FAMILY_GGA:
 #ifdef XC_FAMILY_HYB_GGA
         case XC_FAMILY_HYB_GGA:
 #endif
                 if (spin == 1) {
-                        sigma = rho + np * 2;
+                        sigma = rho + blksize * 2;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np * 2;
@@ -310,8 +344,33 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4rhosigma3  = v4rho2sigma2 + np * 3*6 ;
                                 v4sigma4     = v4rhosigma3  + np * 2*10;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset * 2;
+                                vsigma += offset * 3;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset * 3;
+                                v2rhosigma += offset * 6;
+                                v2sigma2 += offset * 6;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset * 4;
+                                v3rho2sigma += offset * 9;
+                                v3rhosigma2 += offset * 12;
+                                v3sigma3 += offset * 10;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset * 5;
+                                v4rho3sigma += offset * 4*3;
+                                v4rho2sigma2 += offset * 3*6;
+                                v4rhosigma3 += offset * 2*10;
+                                v4sigma4 += offset * 15;
+                        }
                 } else {
-                        sigma = rho + np;
+                        sigma = rho + blksize;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np;
@@ -334,8 +393,33 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4rhosigma3  = v4rho2sigma2 + np;
                                 v4sigma4     = v4rhosigma3  + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset;
+                                vsigma += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset;
+                                v2rhosigma += offset;
+                                v2sigma2 += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset;
+                                v3rho2sigma += offset;
+                                v3rhosigma2 += offset;
+                                v3sigma3 += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset;
+                                v4rho3sigma += offset;
+                                v4rho2sigma2 += offset;
+                                v4rhosigma3 += offset;
+                                v4sigma4 += offset;
+                        }
                 }
-                xc_gga(func_x, np, rho, sigma,
+                xc_gga(func_x, blksize, rho, sigma,
                        exc, vrho, vsigma,
                        v2rho2, v2rhosigma, v2sigma2,
                        v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
@@ -346,8 +430,8 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
         case XC_FAMILY_HYB_MGGA:
 #endif
                 if (spin == 1) {
-                        sigma = rho + np * 2;
-                        tau = sigma + np * 3;
+                        sigma = rho + blksize * 2;
+                        tau = sigma + blksize * 3;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np * 2;
@@ -390,9 +474,54 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4sigmatau3    = v4sigma2tau2   + np * 6*3  ;
                                 v4tau4         = v4sigmatau3    + np * 3*4  ;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho   += offset * 2;
+                                vsigma += offset * 3;
+                                vtau   += offset * 2;
+                        }
+                        if (deriv > 1) {
+                                v2rho2      += offset * 3;
+                                v2rhosigma  += offset * 6;
+                                v2sigma2    += offset * 6;
+                                v2rhotau    += offset * 4;
+                                v2sigmatau  += offset * 6;
+                                v2tau2      += offset * 3;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         += offset * 4 ;
+                                v3rho2sigma    += offset * 9 ;
+                                v3rhosigma2    += offset * 12;
+                                v3sigma3       += offset * 10;
+                                v3rho2tau      += offset * 6 ;
+                                v3rhosigmatau  += offset * 12;
+                                v3rhotau2      += offset * 6 ;
+                                v3sigma2tau    += offset * 12;
+                                v3sigmatau2    += offset * 9 ;
+                                v3tau3         += offset * 4 ;
+                        }
+                        if (deriv > 3) {
+                                v4rho4         += offset * 5    ;
+                                v4rho3sigma    += offset * 4*3  ;
+                                v4rho2sigma2   += offset * 3*6  ;
+                                v4rhosigma3    += offset * 2*10 ;
+                                v4sigma4       += offset * 15   ;
+                                v4rho3tau      += offset * 4*2  ;
+                                v4rho2sigmatau += offset * 3*3*2;
+                                v4rho2tau2     += offset * 3*3  ;
+                                v4rhosigma2tau += offset * 2*6*2;
+                                v4rhosigmatau2 += offset * 2*3*3;
+                                v4rhotau3      += offset * 2*4  ;
+                                v4sigma3tau    += offset * 10*2 ;
+                                v4sigma2tau2   += offset * 6*3  ;
+                                v4sigmatau3    += offset * 3*4  ;
+                                v4tau4         += offset * 5    ;
+                        }
                 } else {
-                        sigma = rho + np;
-                        tau = sigma + np;
+                        sigma = rho + blksize;
+                        tau = sigma + blksize;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np;
@@ -435,8 +564,53 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4sigmatau3    = v4sigma2tau2   + np;
                                 v4tau4         = v4sigmatau3    + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho   += offset;
+                                vsigma += offset;
+                                vtau   += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2      += offset;
+                                v2rhosigma  += offset;
+                                v2sigma2    += offset;
+                                v2rhotau    += offset;
+                                v2sigmatau  += offset;
+                                v2tau2      += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         += offset;
+                                v3rho2sigma    += offset;
+                                v3rhosigma2    += offset;
+                                v3sigma3       += offset;
+                                v3rho2tau      += offset;
+                                v3rhosigmatau  += offset;
+                                v3rhotau2      += offset;
+                                v3sigma2tau    += offset;
+                                v3sigmatau2    += offset;
+                                v3tau3         += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4         += offset;
+                                v4rho3sigma    += offset;
+                                v4rho2sigma2   += offset;
+                                v4rhosigma3    += offset;
+                                v4sigma4       += offset;
+                                v4rho3tau      += offset;
+                                v4rho2sigmatau += offset;
+                                v4rho2tau2     += offset;
+                                v4rhosigma2tau += offset;
+                                v4rhosigmatau2 += offset;
+                                v4rhotau3      += offset;
+                                v4sigma3tau    += offset;
+                                v4sigma2tau2   += offset;
+                                v4sigmatau3    += offset;
+                                v4tau4         += offset;
+                        }
                 }
-                xc_mgga(func_x, np, rho, sigma, lapl, tau,
+                xc_mgga(func_x, blksize, rho, sigma, lapl, tau,
                      exc, vrho, vsigma, vlapl, vtau,
                      v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2,
                      v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2,
@@ -705,6 +879,7 @@ static void axpy(double *dst, double *src, double fac,
 {
         int i, j;
         for (j = 0; j < nsrc; j++) {
+                #pragma omp parallel for schedule(static)
                 for (i = 0; i < np; i++) {
                         dst[j*np+i] += fac * src[i*nsrc+j];
                 }
@@ -760,6 +935,7 @@ static void merge_xc(double *dst, double *ebuf, double fac,
                         pout = dst + offsets1[order] * np;
                         pin = ebuf + offsets0[order] * np;
                         nsrc = offsets0[order+1] - offsets0[order];
+                        #pragma omp parallel for schedule(static)
                         for (i = 0; i < np * nsrc; i++) {
                                 pout[i] += fac * pin[i];
                         }
@@ -802,10 +978,36 @@ void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
 {
         assert(deriv <= 4);
         double *ebuf = malloc(sizeof(double) * np * outlen);
-        double *rho = malloc(sizeof(double) * np * 7);
-        _eval_rho(rho, rho_u, spin, nvar, np);
-        int nspin = spin + 1;
 
+        double *rhobufs[MAX_THREADS];
+        int offsets[MAX_THREADS+1];
+#pragma omp parallel
+{
+        int iblk = omp_get_thread_num();
+        int nblk = omp_get_num_threads();
+        assert(nblk <= MAX_THREADS);
+
+        int blksize = np / nblk;
+        int ioff = iblk * blksize;
+        int np_mod = np % nblk;
+        if (iblk < np_mod) {
+            blksize += 1;
+        }
+        if (np_mod > 0) {
+            ioff += MIN(iblk, np_mod);
+        }
+        offsets[iblk] = ioff;
+        if (iblk == nblk-1) {
+            offsets[nblk] = np;
+            assert(ioff + blksize == np);
+        }
+
+        double *rho_priv = malloc(sizeof(double) * blksize * 7);
+        rhobufs[iblk] = rho_priv;
+        _eval_rho(rho_priv, rho_u+ioff, spin, nvar, blksize, np);
+}
+
+        int nspin = spin + 1;
         int i, j;
         xc_func_type func;
         for (i = 0; i < nfn; i++) {
@@ -857,13 +1059,25 @@ void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
 #if defined XC_SET_RELATIVITY
                 xc_lda_x_set_params(&func, relativity);
 #endif
-                _eval_xc(&func, spin, deriv, np, rho, ebuf);
+
+#pragma omp parallel
+{
+                int iblk = omp_get_thread_num();
+                int offset = offsets[iblk];
+                int blksize = offsets[iblk+1] - offset;
+                _eval_xc(&func, spin, deriv, np, rhobufs[iblk], ebuf, offset, blksize);
+}
+
                 merge_xc(output, ebuf, fac[i],
                          spin, deriv, nvar, np, outlen, func.info->family);
                 xc_func_end(&func);
         }
         free(ebuf);
-        free(rho);
+#pragma omp parallel
+{
+        int iblk = omp_get_thread_num();
+        free(rhobufs[iblk]);
+}
 }
 
 int LIBXC_max_deriv_order(int xc_id)
diff --git a/pyscf/lib/dft/multigrid.c b/pyscf/lib/dft/multigrid.c
new file mode 100644
index 0000000000..593aedf1b8
--- /dev/null
+++ b/pyscf/lib/dft/multigrid.c
@@ -0,0 +1,744 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/neighbor_list.h"
+#include "pbc/cell.h"
+#include "dft/multigrid.h"
+
+#define SQUARE(r)       (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define BUF_SIZE 2000
+#define ADD_SIZE 1000
+#define RZERO 1e-6
+
+const int _LEN_CART[] = {
+    1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136
+};
+
+const int _LEN_CART0[] = {
+    0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120
+};
+
+const int _BINOMIAL_COEF[] = {
+    1,
+    1,   1,
+    1,   2,   1,
+    1,   3,   3,   1,
+    1,   4,   6,   4,   1,
+    1,   5,  10,  10,   5,   1,
+    1,   6,  15,  20,  15,   6,   1,
+    1,   7,  21,  35,  35,  21,   7,   1,
+    1,   8,  28,  56,  70,  56,  28,   8,   1,
+    1,   9,  36,  84, 126, 126,  84,  36,   9,   1,
+    1,  10,  45, 120, 210, 252, 210, 120,  45,  10,   1,
+    1,  11,  55, 165, 330, 462, 462, 330, 165,  55,  11,   1,
+    1,  12,  66, 220, 495, 792, 924, 792, 495, 220,  66,  12,   1,
+    1,  13,  78, 286, 715,1287,1716,1716,1287, 715, 286,  78,  13,   1,
+    1,  14,  91, 364,1001,2002,3003,3432,3003,2002,1001, 364,  91,  14,   1,
+    1,  15, 105, 455,1365,3003,5005,6435,6435,5005,3003,1365, 455, 105,  15,   1,
+};
+
+double CINTsquare_dist(const double *r1, const double *r2);
+
+void init_gridlevel_info(GridLevel_Info** gridlevel_info,
+                         double* cutoff, int* mesh, int nlevels, double rel_cutoff)
+{
+    GridLevel_Info* gl_info = (GridLevel_Info*) malloc(sizeof(GridLevel_Info));
+    gl_info->nlevels = nlevels;
+    gl_info->rel_cutoff = rel_cutoff;
+    gl_info->cutoff = (double*) malloc(sizeof(double) * nlevels);
+    gl_info->mesh = (int*) malloc(sizeof(int) * nlevels * 3);
+    int i;
+    for (i = 0; i < nlevels; i++) {
+        (gl_info->cutoff)[i] = cutoff[i];
+        (gl_info->mesh)[i*3] = mesh[i*3];
+        (gl_info->mesh)[i*3+1] = mesh[i*3+1];
+        (gl_info->mesh)[i*3+2] = mesh[i*3+2];
+    }
+    *gridlevel_info = gl_info;
+}
+
+
+void init_rs_grid(RS_Grid** rs_grid, GridLevel_Info** gridlevel_info, int comp)
+{
+    RS_Grid* rg = (RS_Grid*) malloc(sizeof(RS_Grid));
+    GridLevel_Info* gl_info = *gridlevel_info;
+    int nlevels = gl_info->nlevels;
+    rg->nlevels = nlevels;
+    rg->gridlevel_info = gl_info;
+    rg->comp = comp;
+
+    int i;
+    size_t ngrid;
+    int *mesh = gl_info->mesh;
+    rg->data = (double**)malloc(sizeof(double*) * nlevels);
+    for (i = 0; i < nlevels; i++) {
+        ngrid = mesh[i*3] * mesh[i*3+1] * mesh[i*3+2];
+        (rg->data)[i] = calloc(comp*ngrid, sizeof(double));
+    }
+    *rs_grid = rg;
+}
+
+
+void del_rs_grid(RS_Grid** rs_grid)
+{
+    RS_Grid* rg = *rs_grid;
+    if (!rg) {
+        return;
+    }
+    if (rg->data) {
+        int i;
+        for (i = 0; i < rg->nlevels; i++) {
+            if (rg->data[i]) {
+                free(rg->data[i]);
+            }
+        }
+        free(rg->data);
+    }
+    rg->gridlevel_info = NULL;
+    free(rg);
+    *rs_grid = NULL;
+}
+
+
+void del_gridlevel_info(GridLevel_Info** gridlevel_info)
+{
+    GridLevel_Info* gl_info = *gridlevel_info;
+    if (!gl_info) {
+        return;
+    }
+    if (gl_info->cutoff) {
+        free(gl_info->cutoff);
+    }
+    if (gl_info->mesh) {
+        free(gl_info->mesh);
+    }
+    free(gl_info);
+    *gridlevel_info = NULL;
+}
+
+
+void init_pgfpair(PGFPair** pair_info,
+                  int ish, int ipgf, int jsh, int jpgf, int iL, double radius)
+{
+    PGFPair *pair0 = (PGFPair*) malloc(sizeof(PGFPair));
+    pair0->ish = ish;
+    pair0->ipgf = ipgf;
+    pair0->jsh = jsh;
+    pair0->jpgf = jpgf;
+    pair0->iL = iL;
+    pair0->radius = radius;
+    *pair_info = pair0;
+}
+
+
+bool pgfpairs_with_same_shells(PGFPair *pair1, PGFPair *pair2)
+{
+    if (!pair1 || !pair2) {
+        return false;
+    }
+    if (pair1->ish == pair2->ish && pair1->jsh == pair2->jsh) {
+        return true;
+    }
+    return false;
+}
+
+
+double pgfpair_radius(int la, int lb, double zeta, double zetb, double* ra, double* rab, double precision)
+{
+    double radius = 0;
+    double zetp = zeta + zetb;
+    double eps = precision * precision;
+
+    if (rab[0] < RZERO && rab[1] < RZERO && rab[2] < RZERO) {
+        radius = pgf_rcut(la+lb, zetp, 1., eps, radius);
+        return radius;
+    }
+
+    double prefactor = exp(-zeta*zetb/zetp*SQUARE(rab));
+    double rb[3], rp[3];
+    rb[0] = ra[0] + rab[0];
+    rb[1] = ra[1] + rab[1];
+    rb[2] = ra[2] + rab[2];
+    rp[0] = ra[0] + zetb/zetp*rab[0];
+    rp[1] = ra[1] + zetb/zetp*rab[1];
+    rp[2] = ra[2] + zetb/zetp*rab[2];
+
+    double rad_a = sqrt(CINTsquare_dist(ra, rp));
+    double rad_b = sqrt(CINTsquare_dist(rb, rp));
+
+    int lmax = la + lb;
+    double coef[lmax+1];
+    double rap[la+1];
+    double rbp[lb+1];
+
+    int lxa, lxb, i;
+    for (i = 0; i <= lmax; i++) {
+        coef[i] = 0;
+    }
+    rap[0] = 1.;
+    for (i = 1; i <= la; i++) {
+        rap[i] = rap[i-1] * rad_a;
+    }
+    rbp[0] = 1.;
+    for (i = 1; i <= lb; i++) {
+        rbp[i] = rbp[i-1] * rad_b;
+    }
+
+    for (lxa = 0; lxa <= la; lxa++) {
+        for (lxb = 0; lxb <= lb; lxb++) {
+            coef[lxa+lxb] += BINOMIAL(la, lxa) * BINOMIAL(lb, lxb) * rap[la-lxa] * rbp[lb-lxb];
+        }
+    }
+
+    for (i = 0; i <= lmax; i++){
+        coef[i] *= prefactor;
+        radius = MAX(radius, pgf_rcut(i, zetp, coef[i], eps, radius));
+    }
+    return radius;
+}
+
+
+void del_pgfpair(PGFPair** pair_info)
+{
+    PGFPair *pair0 = *pair_info;
+    if (!pair0) {
+        return;
+    } else {
+        free(pair0);
+    }
+    *pair_info = NULL;
+}
+
+
+//unlink the pgfpair data instead of deleting
+void nullify_pgfpair(PGFPair** pair_info)
+{
+    *pair_info = NULL;
+}
+
+
+void init_task(Task** task)
+{
+    Task *t0 = *task = (Task*) malloc(sizeof(Task));
+    t0->ntasks = 0;
+    t0->buf_size = BUF_SIZE; 
+    t0->pgfpairs = (PGFPair**) malloc(sizeof(PGFPair*) * t0->buf_size);
+    int i;
+    for (i = 0; i < t0->buf_size; i++) {
+        (t0->pgfpairs)[i] = NULL;
+    }
+}
+
+
+void del_task(Task** task)
+{
+    Task *t0 = *task;
+    if (!t0) {
+        return;
+    }
+    if (t0->pgfpairs) {
+        size_t i, ntasks = t0->ntasks;
+        for (i = 0; i < ntasks; i++) {
+            del_pgfpair(t0->pgfpairs + i);
+        }
+        free(t0->pgfpairs);
+    }
+    free(t0);
+    *task = NULL;
+}
+
+
+void nullify_task(Task** task)
+{
+    Task *t0 = *task;
+    if (!t0) {
+        return;
+    }
+    if (t0->pgfpairs) {
+        size_t i, ntasks = t0->ntasks;
+        for (i = 0; i < ntasks; i++) {
+            nullify_pgfpair(t0->pgfpairs + i);
+        }
+        free(t0->pgfpairs);
+    }
+    free(t0);
+    *task = NULL;
+}
+
+
+void init_task_list(TaskList** task_list, GridLevel_Info* gridlevel_info, int nlevels, int hermi)
+{
+    TaskList* tl = *task_list = (TaskList*) malloc(sizeof(TaskList));
+    tl->nlevels = nlevels;
+    tl->hermi = hermi;
+    tl->gridlevel_info = gridlevel_info;
+    tl->tasks = (Task**) malloc(sizeof(Task*)*nlevels);
+    int i;
+    for (i = 0; i < nlevels; i++) {
+        init_task(tl->tasks + i);
+    }
+}
+
+
+void del_task_list(TaskList** task_list)
+{
+    TaskList *tl = *task_list;
+    if (!tl) {
+        return;
+    }
+    if (tl->gridlevel_info) {
+        del_gridlevel_info(&(tl->gridlevel_info));
+        tl->gridlevel_info = NULL;
+    }
+    if (tl->tasks) {
+        int i;
+        for (i = 0; i < tl->nlevels; i++) {
+            if ((tl->tasks)[i]) {
+                del_task(tl->tasks + i);
+            }
+        }
+        free(tl->tasks);
+    }
+    free(tl);
+    *task_list = NULL;
+}
+
+
+void nullify_task_list(TaskList** task_list)
+{
+    TaskList *tl = *task_list;
+    if (!tl) {
+        return;
+    }
+    if (tl->gridlevel_info) {
+        tl->gridlevel_info = NULL;
+    }
+    if (tl->tasks) {
+        int i;
+        for (i = 0; i < tl->nlevels; i++) {
+            if ((tl->tasks)[i]) {
+                nullify_task(tl->tasks + i);
+            }
+        }
+        free(tl->tasks);
+    }
+    free(tl);
+    *task_list = NULL;
+}
+
+
+void update_task_list(TaskList** task_list, int grid_level, 
+                      int ish, int ipgf, int jsh, int jpgf, int iL, double radius)
+{
+    TaskList* tl = *task_list;
+    Task *t0 = (tl->tasks)[grid_level];
+    t0->ntasks += 1;
+    if (t0->ntasks > t0->buf_size) {
+        t0->buf_size += ADD_SIZE;
+        t0->pgfpairs = (PGFPair**) realloc(t0->pgfpairs, sizeof(PGFPair*) * t0->buf_size);
+    }
+    init_pgfpair(t0->pgfpairs + t0->ntasks - 1,
+                 ish, ipgf, jsh, jpgf, iL, radius);
+}
+
+
+void merge_task_list(TaskList** task_list, TaskList** task_list_loc)
+{
+    TaskList* tl = *task_list;
+    TaskList* tl_loc = *task_list_loc;
+    int ilevel, itask;
+    for (ilevel = 0; ilevel < tl->nlevels; ilevel++) {
+        Task *t0 = (tl->tasks)[ilevel];
+        Task *t1 = (tl_loc->tasks)[ilevel];
+        int itask_off = t0->ntasks;
+        int ntasks_loc = t1->ntasks;
+        t0->ntasks += ntasks_loc;
+        t0->buf_size = t0->ntasks;
+        t0->pgfpairs = (PGFPair**) realloc(t0->pgfpairs, sizeof(PGFPair*) * t0->buf_size);
+        PGFPair** ptr_pgfpairs = t0->pgfpairs + itask_off;
+        PGFPair** ptr_pgfpairs_loc = t1->pgfpairs;
+        for (itask = 0; itask < ntasks_loc; itask++) {
+            ptr_pgfpairs[itask] = ptr_pgfpairs_loc[itask];
+        }
+    }
+}
+
+
+int get_grid_level(GridLevel_Info* gridlevel_info, double alpha)
+{
+    int i;
+    int nlevels = gridlevel_info->nlevels;
+    int grid_level = nlevels - 1; //default use the most dense grid
+    double needed_cutoff = alpha * gridlevel_info->rel_cutoff;
+    for (i = 0; i < nlevels; i++) {
+        if ((gridlevel_info->cutoff)[i] >= needed_cutoff) {
+            grid_level = i;
+            break;
+        }
+    }
+    return grid_level;
+}
+
+
+void build_task_list(TaskList** task_list, NeighborList** neighbor_list,
+                     GridLevel_Info** gridlevel_info,
+                     int* ish_atm, int* ish_bas, double* ish_env, 
+                     double* ish_rcut, double** ipgf_rcut,
+                     int* jsh_atm, int* jsh_bas, double* jsh_env, 
+                     double* jsh_rcut, double** jpgf_rcut,
+                     int nish, int njsh, double* Ls, double precision, int hermi)
+{
+    GridLevel_Info *gl_info = *gridlevel_info;
+    int ilevel;
+    int nlevels = gl_info->nlevels;
+    init_task_list(task_list, gl_info, nlevels, hermi);
+    double max_radius[nlevels];
+    NeighborList *nl0 = *neighbor_list;
+
+#pragma omp parallel private(ilevel)
+{
+    double max_radius_loc[nlevels];
+    TaskList** task_list_loc = (TaskList**) malloc(sizeof(TaskList*));
+    init_task_list(task_list_loc, gl_info, nlevels, hermi);
+    NeighborPair *np0_ij;
+    int ish, jsh;
+    int li, lj;
+    int ipgf, jpgf;
+    int nipgf, njpgf;
+    int iL, iL_idx;
+    int ish_atm_id, jsh_atm_id;
+    int ish_alpha_of, jsh_alpha_of;
+    double ipgf_alpha, jpgf_alpha;
+    double *ish_ratm, *jsh_ratm, *rL;
+    double rij[3];
+    double dij, radius;
+
+    #pragma omp for schedule(dynamic)
+    for (ish = 0; ish < nish; ish++) {
+        li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        nipgf = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+        ish_atm_id = ish_bas[ish*BAS_SLOTS+ATOM_OF];
+        ish_ratm = ish_env + ish_atm[ish_atm_id*ATM_SLOTS+PTR_COORD];
+        ish_alpha_of = ish_bas[PTR_EXP+ish*BAS_SLOTS];
+        for (jsh = 0; jsh < njsh; jsh++) {
+            if (hermi == 1 && jsh < ish) {
+                continue;
+            }
+            np0_ij = (nl0->pairs)[ish*njsh + jsh];
+            if (np0_ij->nimgs > 0) {
+                lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+                njpgf = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+                jsh_atm_id = jsh_bas[jsh*BAS_SLOTS+ATOM_OF];
+                jsh_ratm = jsh_env + jsh_atm[jsh_atm_id*ATM_SLOTS+PTR_COORD];
+                jsh_alpha_of = jsh_bas[PTR_EXP+jsh*BAS_SLOTS];
+
+                for (iL_idx = 0; iL_idx < np0_ij->nimgs; iL_idx++){
+                    iL = (np0_ij->Ls_list)[iL_idx];
+                    rL = Ls + iL*3;
+                    rij[0] = jsh_ratm[0] + rL[0] - ish_ratm[0];
+                    rij[1] = jsh_ratm[1] + rL[1] - ish_ratm[1];
+                    rij[2] = jsh_ratm[2] + rL[2] - ish_ratm[2];
+                    dij = sqrt(SQUARE(rij));
+
+                    for (ipgf = 0; ipgf < nipgf; ipgf++) {
+                        if (ipgf_rcut[ish][ipgf] + jsh_rcut[jsh] < dij) {
+                            continue;
+                        }
+                        ipgf_alpha = ish_env[ish_alpha_of+ipgf];
+                        for (jpgf = 0; jpgf < njpgf; jpgf++) {
+                            //if (hermi == 1 && ish == jsh && jpgf < ipgf) {
+                            //    continue;
+                            //}
+                            if (ipgf_rcut[ish][ipgf] + jpgf_rcut[jsh][jpgf] < dij) {
+                                continue;
+                            }
+                            jpgf_alpha = jsh_env[jsh_alpha_of+jpgf]; 
+                            ilevel = get_grid_level(gl_info, ipgf_alpha+jpgf_alpha);
+                            radius = pgfpair_radius(li, lj, ipgf_alpha, jpgf_alpha, ish_ratm, rij, precision);
+                            if (radius < RZERO) {
+                                continue;
+                            }
+                            max_radius_loc[ilevel] = MAX(radius, max_radius_loc[ilevel]);
+                            update_task_list(task_list_loc, ilevel, ish, ipgf, jsh, jpgf, iL, radius);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    #pragma omp critical
+    merge_task_list(task_list, task_list_loc);
+
+    nullify_task_list(task_list_loc);
+    free(task_list_loc);
+
+    #pragma omp critical
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        max_radius[ilevel] = MAX(max_radius[ilevel], max_radius_loc[ilevel]);
+    }
+}
+
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        Task *t0 = ((*task_list)->tasks)[ilevel];
+        t0->radius = max_radius[ilevel];
+    }
+}
+
+
+int get_task_loc(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                 int ish0, int ish1, int jsh0, int jsh1, int hermi)
+{
+    int n = -2;
+    int ish_prev = -1;
+    int jsh_prev = -1;
+    int itask, ish, jsh;
+    int *buf = (int*)malloc(sizeof(int) * ntasks*2);
+    PGFPair *pgfpair;
+    for(itask = 0; itask < ntasks; itask++){
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        if (ish < ish0 || ish >= ish1) {
+            continue;
+        }
+        if (jsh < jsh0 || jsh >= jsh1) {
+            continue;
+        }
+        if (hermi == 1 && jsh < ish) {
+            continue;
+        }
+
+        if (ish != ish_prev || jsh != jsh_prev) {
+            n += 2;
+            buf[n] = itask;
+            buf[n+1] = itask+1;
+            ish_prev = ish;
+            jsh_prev = jsh;
+        } else {
+            buf[n+1] = itask+1;
+        }
+    }
+    n += 2;
+    *task_loc = (int*)realloc(buf, sizeof(int) * n);
+    return n;
+}
+
+
+void gradient_gs(double complex* out, double complex* f_gs, double* Gv,
+                 int n, size_t ng)
+{
+    int i;
+    double complex *outx, *outy, *outz;
+    for (i = 0; i < n; i++) {
+        outx = out;
+        outy = outx + ng;
+        outz = outy + ng;
+        #pragma omp parallel
+        {
+            size_t igrid;
+            double *pGv;
+            #pragma omp for schedule(static)
+            for (igrid = 0; igrid < ng; igrid++) {
+                pGv = Gv + igrid * 3;
+                outx[igrid] = pGv[0] * creal(f_gs[igrid]) * _Complex_I - pGv[0] * cimag(f_gs[igrid]);
+                outy[igrid] = pGv[1] * creal(f_gs[igrid]) * _Complex_I - pGv[1] * cimag(f_gs[igrid]);
+                outz[igrid] = pGv[2] * creal(f_gs[igrid]) * _Complex_I - pGv[2] * cimag(f_gs[igrid]);
+            }
+        }
+        f_gs += ng;
+        out += 3 * ng;
+    }
+}
+
+/*
+int get_task_loc_diff_ish(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                          int ish0, int ish1)
+{
+    int n = -2;
+    int ish_prev = -1;
+    int itask, ish;
+    int *buf = (int*)malloc(sizeof(int) * ntasks*2);
+    PGFPair *pgfpair;
+    for(itask = 0; itask < ntasks; itask++){
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        if (ish < ish0 || ish >= ish1) {
+            continue;
+        }
+
+        if (ish != ish_prev) {
+            n += 2;
+            buf[n] = itask;
+            ish_prev = ish;
+        }
+        if (ish == ish_prev) {
+            buf[n+1] = itask+1;
+        }
+    }
+    n += 2;
+    *task_loc = (int*)realloc(buf, sizeof(int) * n);
+    return n;
+}
+*/
+
+/*
+typedef struct Task_Index_struct {
+    int ntasks;
+    int bufsize;
+    int* task_index;
+} Task_Index;
+
+
+void init_task_index(Task_Index* task_idx)
+{
+    task_idx->ntasks = 0;
+    task_idx->bufsize = 10;
+    task_idx->task_index = (int*)malloc(sizeof(int) * task_idx->bufsize);
+}
+
+
+void update_task_index(Task_Index* task_idx, int itask)
+{
+    task_idx->ntasks += 1;
+    if (task_idx->bufsize < task_idx->ntasks) {
+        task_idx->bufsize += 10;
+        task_idx->task_index = (int*)realloc(task_idx->task_index, sizeof(int) * task_idx->bufsize);
+    }
+    task_idx->task_index[task_idx->ntasks-1] = itask;
+}
+
+
+void del_task_index(Task_Index* task_idx)
+{
+    if (!task_idx) {
+        return;
+    }
+    if (task_idx->task_index) {
+        free(task_idx->task_index);
+    }
+    task_idx->ntasks = 0;
+    task_idx->bufsize = 0;
+}
+
+
+typedef struct Shlpair_Task_Index_struct {
+    int nish;
+    int njsh;
+    int ish0;
+    int jsh0;
+    Task_Index *task_index;
+} Shlpair_Task_Index;
+
+
+void init_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx,
+                             int ish0, int jsh0, int nish, int njsh)
+{
+    shlpair_task_idx->ish0 = ish0;
+    shlpair_task_idx->jsh0 = jsh0;
+    shlpair_task_idx->nish = nish;
+    shlpair_task_idx->njsh = njsh;
+    shlpair_task_idx->task_index = (Task_Index*)malloc(sizeof(Task_Index)*nish*njsh);
+
+    int ijsh;
+    for (ijsh = 0; ijsh < nish*njsh; ijsh++) {
+        init_task_index(shlpair_task_idx->task_index + ijsh);
+    }
+}
+
+
+void update_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx,
+                               int ish, int jsh, int itask)
+{
+    int ish0 = shlpair_task_idx->ish0;
+    int jsh0 = shlpair_task_idx->jsh0;
+    int njsh = shlpair_task_idx->njsh;
+    int ioff = ish - ish0;
+    int joff = jsh - jsh0;
+
+    update_task_index(shlpair_task_idx->task_index + ioff*njsh+joff, itask);
+}
+
+
+int get_task_index(Shlpair_Task_Index* shlpair_task_idx, int** idx, int ish, int jsh)
+{
+    int ish0 = shlpair_task_idx->ish0;
+    int jsh0 = shlpair_task_idx->jsh0;
+    int njsh = shlpair_task_idx->njsh;
+    int ioff = ish - ish0;
+    int joff = jsh - jsh0;
+    Task_Index *task_idx = shlpair_task_idx->task_index + ioff*njsh+joff;
+    int ntasks = task_idx->ntasks;
+    *idx = task_idx->task_index;
+    return ntasks;
+}
+
+
+void del_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx)
+{
+    if (!shlpair_task_idx) {
+        return;
+    }
+
+    int nish = shlpair_task_idx->nish;
+    int njsh = shlpair_task_idx->njsh;
+    int ijsh;
+    for (ijsh = 0; ijsh < nish*njsh; ijsh++) {
+        del_task_index(shlpair_task_idx->task_index + ijsh);
+    }
+    free(shlpair_task_idx->task_index);
+}
+
+
+Shlpair_Task_Index* get_shlpair_task_index(PGFPair** pgfpairs, int ntasks,
+            int ish0, int ish1, int jsh0, int jsh1, int hermi)
+{
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+
+    Shlpair_Task_Index* shlpair_task_idx = (Shlpair_Task_Index*) malloc(sizeof(Shlpair_Task_Index));
+    init_shlpair_task_index(shlpair_task_idx, ish0, jsh0, nish, njsh);
+
+    int itask;
+    int ish, jsh;
+    PGFPair *pgfpair = NULL;
+    for(itask = 0; itask < ntasks; itask++){
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        if (ish < ish0 || ish >= ish1) {
+            continue;
+        }
+        jsh = pgfpair->jsh;
+        if (jsh < jsh0 || jsh >= jsh1) {
+            continue;
+        }
+        if (hermi == 1 && jsh < ish) {
+            continue;
+        }
+        update_shlpair_task_index(shlpair_task_idx, ish, jsh, itask);
+    }
+    return shlpair_task_idx;
+}
+*/
diff --git a/pyscf/lib/dft/multigrid.h b/pyscf/lib/dft/multigrid.h
new file mode 100644
index 0000000000..e691a3ce12
--- /dev/null
+++ b/pyscf/lib/dft/multigrid.h
@@ -0,0 +1,72 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_MULTIGRID_H
+#define HAVE_DEFINED_MULTIGRID_H
+
+#include <stdbool.h>
+
+#define BINOMIAL(n, i)  (_BINOMIAL_COEF[_LEN_CART0[n]+i])
+
+extern const int _LEN_CART[];
+extern const int _LEN_CART0[];
+extern const int _BINOMIAL_COEF[];
+
+typedef struct GridLevel_Info_struct {
+    int nlevels;
+    double rel_cutoff;
+    double *cutoff;
+    int *mesh;
+} GridLevel_Info;
+
+typedef struct RS_Grid_struct {
+    int nlevels;
+    GridLevel_Info* gridlevel_info;
+    int comp;
+    double** data;
+} RS_Grid;
+
+typedef struct PGFPair_struct {
+    int ish;
+    int ipgf;
+    int jsh;
+    int jpgf;
+    int iL;
+    double radius;
+} PGFPair;
+
+bool pgfpairs_with_same_shells(PGFPair*, PGFPair*);
+
+typedef struct Task_struct {
+    size_t buf_size;
+    size_t ntasks;
+    PGFPair** pgfpairs;
+    double radius;
+} Task;
+
+typedef struct TaskList_struct {
+    int nlevels;
+    int hermi;
+    GridLevel_Info* gridlevel_info;
+    Task** tasks;
+} TaskList;
+
+
+int get_task_loc(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                 int ish0, int ish1, int jsh0, int jsh1, int hermi);
+#endif
diff --git a/pyscf/lib/dft/utils.c b/pyscf/lib/dft/utils.c
new file mode 100644
index 0000000000..04ef8e5b2f
--- /dev/null
+++ b/pyscf/lib/dft/utils.c
@@ -0,0 +1,62 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <complex.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#if defined(HAVE_LIBXSMM)
+#include "libxsmm.h"
+#endif
+
+
+void dgemm_wrapper(const char transa, const char transb,
+                   const int m, const int n, const int k,
+                   const double alpha, const double* a, const int lda,
+                   const double* b, const int ldb,
+                   const double beta, double* c, const int ldc)
+{
+#if defined(HAVE_LIBXSMM)
+    if (transa == 'N') {
+        //libxsmm_dgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+        int prefetch = LIBXSMM_PREFETCH_AUTO;
+        int flags = transb != 'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B;
+        libxsmm_dmmfunction kernel = libxsmm_dmmdispatch(m, n, k, &lda, &ldb, &ldc,
+                                                         &alpha, &beta, &flags, &prefetch);
+        if (kernel) {
+            kernel(a,b,c,a,b,c);
+            return;
+        }
+    }
+#endif
+    dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void get_gga_vrho_gs(double complex *out, double complex *vrho_gs, double complex *vsigma1_gs,
+                     double *Gv, double weight, int ngrid)
+{
+    int i;
+    int ngrid2 = 2 * ngrid;
+    double complex fac = -2. * _Complex_I;
+    #pragma omp parallel for simd schedule(static)
+    for (i = 0; i < ngrid; i++) {
+        out[i] = ( Gv[i*3]   * vsigma1_gs[i]
+                  +Gv[i*3+1] * vsigma1_gs[i+ngrid]
+                  +Gv[i*3+2] * vsigma1_gs[i+ngrid2]) * fac + vrho_gs[i];
+        out[i] *= weight;
+    }
+}
diff --git a/pyscf/lib/dft/utils.h b/pyscf/lib/dft/utils.h
new file mode 100644
index 0000000000..1c85ff1fdc
--- /dev/null
+++ b/pyscf/lib/dft/utils.h
@@ -0,0 +1,27 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_GRID_UTILS_H
+#define HAVE_DEFINED_GRID_UTILS_H
+
+extern void dgemm_wrapper(const char transa, const char transb,
+                   const int m, const int n, const int k,
+                   const double alpha, const double* a, const int lda,
+                   const double* b, const int ldb,
+                   const double beta, double* c, const int ldc);
+#endif
diff --git a/pyscf/lib/np_helper/np_helper.h b/pyscf/lib/np_helper/np_helper.h
index 2c8227c03d..3ed8d05574 100644
--- a/pyscf/lib/np_helper/np_helper.h
+++ b/pyscf/lib/np_helper/np_helper.h
@@ -61,3 +61,10 @@ void NPdset0(double *p, const size_t n);
 void NPzset0(double complex *p, const size_t n);
 void NPdcopy(double *out, const double *in, const size_t n);
 void NPzcopy(double complex *out, const double complex *in, const size_t n);
+
+void NPdgemm(const char trans_a, const char trans_b,
+             const int m, const int n, const int k,
+             const int lda, const int ldb, const int ldc,
+             const int offseta, const int offsetb, const int offsetc,
+             double *a, double *b, double *c,
+             const double alpha, const double beta);
diff --git a/pyscf/lib/numpy_helper.py b/pyscf/lib/numpy_helper.py
index 406fa54e20..58508d9f8b 100644
--- a/pyscf/lib/numpy_helper.py
+++ b/pyscf/lib/numpy_helper.py
@@ -1116,6 +1116,16 @@ def expm(a):
         y, buf = buf, y
     return y
 
+def ndarray_pointer_2d(array):
+    '''Get the C pointer of a 2D array
+    '''
+    assert array.ndim == 2
+    assert array.flags.c_contiguous
+
+    ptr = (array.ctypes.data +
+           numpy.arange(array.shape[0])*array.strides[0]).astype(numpy.uintp)
+    ptr = ptr.ctypes.data_as(ctypes.c_void_p)
+    return ptr
 
 class NPArrayWithTag(numpy.ndarray):
     # Initialize kwargs in function tag_array
diff --git a/pyscf/lib/pbc/CMakeLists.txt b/pyscf/lib/pbc/CMakeLists.txt
index 6d185fdf85..636cb75451 100644
--- a/pyscf/lib/pbc/CMakeLists.txt
+++ b/pyscf/lib/pbc/CMakeLists.txt
@@ -13,10 +13,20 @@
 # limitations under the License.
 
 add_library(pbc SHARED ft_ao.c fill_ints.c fill_ints_sr.c optimizer.c grid_ao.c
-  nr_direct.c symmetry.c inner_dot.c cint2e.c cint3c2e.c nr_ecp.c transform_mo.c)
+  nr_direct.c symmetry.c inner_dot.c cint2e.c cint3c2e.c nr_ecp.c transform_mo.c
+  neighbor_list.c cell.c pp.c hf_grad.c fill_ints_screened.c)
 add_dependencies(pbc cgto cvhf np_helper)
 
 set_target_properties(pbc PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 
 target_link_libraries(pbc cgto cint cvhf np_helper ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+
+if(ENABLE_FFTW)
+add_library(fft SHARED fft.c)
+set_target_properties(fft PROPERTIES
+  LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
+  COMPILE_FLAGS ${OpenMP_C_FLAGS}
+  LINK_FLAGS ${OpenMP_C_FLAGS})
+target_link_libraries(fft fftw3_threads fftw3 ${BLAS_LIBRARIES})
+endif()
diff --git a/pyscf/lib/pbc/cell.c b/pyscf/lib/pbc/cell.c
new file mode 100644
index 0000000000..20bb96e72c
--- /dev/null
+++ b/pyscf/lib/pbc/cell.c
@@ -0,0 +1,280 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/cell.h"
+#include "np_helper/np_helper.h"
+
+#define SQUARE(r) (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
+
+double pgf_rcut(int l, double alpha, double coeff, double precision, double r0)
+{
+    l += 2;
+
+    double rcut;
+    double rmin = sqrt(.5 * l / alpha) * 2.;
+    double gmax = coeff * pow(rmin, l) * exp(-alpha * rmin * rmin);
+    if (gmax < precision) {
+        return rmin;
+    }
+
+    double eps = MIN(rmin/10, RCUT_EPS);
+    double c = log(coeff / precision);
+    double rcut_last;
+    rcut = MAX(r0, rmin+eps);
+
+    int i;
+    for (i = 0; i < RCUT_MAX_CYCLE; i++) {
+        rcut_last = rcut;
+        rcut = sqrt((l*log(rcut) + c) / alpha);
+        if (fabs(rcut - rcut_last) < eps) {
+            break;
+        }
+    }
+    if (i == RCUT_MAX_CYCLE) {
+        //printf("r0 = %.6e, l = %d, alpha = %.6e, coeff = %.6e, precision=%.6e\n", r0, l, alpha, coeff, precision);
+        fprintf(stderr, "pgf_rcut did not converge in %d cycles: %.6f > %.6f.\n",
+                RCUT_MAX_CYCLE, fabs(rcut - rcut_last), eps);
+    }
+    return rcut; 
+}
+
+void rcut_by_shells(double* shell_radius, double** ptr_pgf_rcut, 
+                    int* bas, double* env, int nbas, 
+                    double r0, double precision)
+{
+#pragma omp parallel
+{
+    int ib, ic, p;
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib ++) {
+        int l = bas[ANG_OF+ib*BAS_SLOTS];
+        int nprim = bas[NPRIM_OF+ib*BAS_SLOTS];
+        int ptr_exp = bas[PTR_EXP+ib*BAS_SLOTS];
+        int nctr = bas[NCTR_OF+ib*BAS_SLOTS];
+        int ptr_c = bas[PTR_COEFF+ib*BAS_SLOTS];
+        double rcut_max = 0, rcut;
+        for (p = 0; p < nprim; p++) {
+            double alpha = env[ptr_exp+p];
+            double cmax = 0;
+            for (ic = 0; ic < nctr; ic++) {
+                cmax = MAX(fabs(env[ptr_c+ic*nprim+p]), cmax);
+            }
+            rcut = pgf_rcut(l, alpha, cmax, precision, r0);
+            if (ptr_pgf_rcut) {
+                ptr_pgf_rcut[ib][p] = rcut;
+            }
+            rcut_max = MAX(rcut, rcut_max);
+        }
+        shell_radius[ib] = rcut_max;
+    }
+}
+}
+
+
+static void get_SI_real_imag(double* out_real, double* out_imag,
+                             double* coords, double* Gv,
+                             int natm, size_t ngrid)
+{
+#pragma omp parallel
+{
+    int ia;
+    size_t i;
+    double RG;
+    double *pcoords, *pGv;
+    double *pout_real, *pout_imag;
+    #pragma omp for schedule(static)
+    for (ia = 0; ia < natm; ia++) {
+        pcoords = coords + ia * 3;
+        pout_real = out_real + ia * ngrid;
+        pout_imag = out_imag + ia * ngrid;
+        for (i = 0; i < ngrid; i++) {
+            pGv = Gv + i * 3;
+            RG = pcoords[0] * pGv[0] + pcoords[1] * pGv[1] + pcoords[2] * pGv[2];
+            pout_real[i] = cos(RG);
+            pout_imag[i] = -sin(RG);
+        }
+    }
+}
+}
+
+
+void get_Gv(double* Gv, double* rx, double* ry, double* rz, int* mesh, double* b)
+{
+#pragma omp parallel
+{
+    int x, y, z;
+    double *pGv;
+    #pragma omp for schedule(dynamic)
+    for (x = 0; x < mesh[0]; x++) {
+        pGv = Gv + x * (size_t)mesh[1] * mesh[2] * 3;
+        for (y = 0; y < mesh[1]; y++) {
+        for (z = 0; z < mesh[2]; z++) {
+            pGv[0]  = rx[x] * b[0];
+            pGv[0] += ry[y] * b[3];
+            pGv[0] += rz[z] * b[6];
+            pGv[1]  = rx[x] * b[1];
+            pGv[1] += ry[y] * b[4];
+            pGv[1] += rz[z] * b[7];
+            pGv[2]  = rx[x] * b[2];
+            pGv[2] += ry[y] * b[5];
+            pGv[2] += rz[z] * b[8];
+            pGv += 3;
+        }}
+    }
+}
+}
+
+
+void ewald_gs_nuc_grad(double* out, double* Gv, double* charges, double* coords,
+                       double ew_eta, double weights, int natm, size_t ngrid)
+{
+    double *SI_real = (double*) malloc(natm*ngrid*sizeof(double));
+    double *SI_imag = (double*) malloc(natm*ngrid*sizeof(double)); 
+    get_SI_real_imag(SI_real, SI_imag, coords, Gv, natm, ngrid);
+
+    double *ZSI_real = calloc(ngrid, sizeof(double));
+    double *ZSI_imag = calloc(ngrid, sizeof(double));
+
+    NPdgemm('N', 'N', ngrid, 1, natm,
+            ngrid, natm, ngrid, 0, 0, 0,
+            SI_real, charges, ZSI_real, 1., 0.);
+    NPdgemm('N', 'N', ngrid, 1, natm,
+            ngrid, natm, ngrid, 0, 0, 0,
+            SI_imag, charges, ZSI_imag, 1., 0.);
+
+#pragma omp parallel
+{
+    int ia;
+    size_t i;
+    double charge_i;
+    double G2, coulG, tmp;
+    double *pout, *pGv;
+    double *pSI_real, *pSI_imag;
+    double fac = 4. * M_PI * weights;
+    double fac1 = 4. * ew_eta * ew_eta;
+
+    #pragma omp for schedule(static)
+    for (ia = 0; ia < natm; ia++) {
+        charge_i = charges[ia];
+        pout = out + ia * 3;
+        pSI_real = SI_real + ia * ngrid;
+        pSI_imag = SI_imag + ia * ngrid;
+        #pragma omp simd
+        for (i = 0; i < ngrid; i++) {
+            pGv = Gv + i*3;
+            G2 = SQUARE(pGv);
+            if (G2 < 1e-12) {continue;}
+            coulG = fac / G2 * exp(-G2 / fac1);
+            tmp  = coulG * charge_i;
+            tmp *= (pSI_imag[i] * ZSI_real[i] - pSI_real[i] * ZSI_imag[i]);
+            pout[0] += tmp * pGv[0];
+            pout[1] += tmp * pGv[1];
+            pout[2] += tmp * pGv[2];
+        }
+    }
+}
+    free(SI_real);
+    free(SI_imag);
+    free(ZSI_real);
+    free(ZSI_imag);
+}
+
+
+void get_ewald_direct(double* ewovrl, double* chargs, double* coords, double* Ls,
+                      double beta, double rcut, int natm, int nL)
+{
+    *ewovrl = 0.0;
+
+    #pragma omp parallel
+    {
+        int i, j, l;
+        double *ri, *rj, *rL;
+        double rij[3];
+        double r, qi, qj;
+        double e_loc = 0.0;
+        #pragma omp for schedule(static)
+        for (i = 0; i < natm; i++) {
+            ri = coords + i*3;
+            qi = chargs[i];
+            for (j = 0; j < natm; j++) {
+                rj = coords + j*3;
+                qj = chargs[j];
+                for (l = 0; l < nL; l++) {
+                    rL = Ls + l*3;
+                    rij[0] = rj[0] + rL[0] - ri[0];
+                    rij[1] = rj[1] + rL[1] - ri[1];
+                    rij[2] = rj[2] + rL[2] - ri[2];
+                    r = sqrt(SQUARE(rij));
+                    if (r > 1e-10 && r < rcut) {
+                        e_loc += qi * qj * erfc(beta * r) / r;
+                    }
+                }
+            }
+        }
+        e_loc *= 0.5;
+
+        #pragma omp critical
+        *ewovrl += e_loc;
+    }
+}
+
+
+void get_ewald_direct_nuc_grad(double* out, double* chargs, double* coords, double* Ls,
+                               double beta, double rcut, int natm, int nL)
+{
+    double fac = 2. * beta / sqrt(M_PI);
+    double beta2 = beta * beta;
+
+    #pragma omp parallel
+    {
+        int i, j, l;
+        double *ri, *rj, *rL, *pout;
+        double rij[3];
+        double r, r2, qi, qj, tmp;
+        #pragma omp for schedule(static)
+        for (i = 0; i < natm; i++) {
+            pout = out + i*3;
+            ri = coords + i*3;
+            qi = chargs[i];
+            for (j = 0; j < natm; j++) {
+                rj = coords + j*3;
+                qj = chargs[j];
+                for (l = 0; l < nL; l++) {
+                    rL = Ls + l*3;
+                    rij[0] = ri[0] - rj[0] + rL[0];
+                    rij[1] = ri[1] - rj[1] + rL[1];
+                    rij[2] = ri[2] - rj[2] + rL[2];
+                    r2 = SQUARE(rij);
+                    r = sqrt(r2);
+                    if (r > 1e-10 && r < rcut) {
+                        tmp  = qi * qj * (erfc(beta * r) / (r2 * r) + fac * exp(-beta2 * r2) / r2);
+                        pout[0] -= tmp * rij[0];
+                        pout[1] -= tmp * rij[1];
+                        pout[2] -= tmp * rij[2];
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pyscf/lib/pbc/cell.h b/pyscf/lib/pbc/cell.h
new file mode 100644
index 0000000000..bec26bb2ea
--- /dev/null
+++ b/pyscf/lib/pbc/cell.h
@@ -0,0 +1,29 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_CELL_H
+#define HAVE_DEFINED_CELL_H
+
+#define RCUT_MAX_CYCLE 10
+#define RCUT_EPS 1e-3
+
+double pgf_rcut(int l, double alpha, double coeff, double precision, double r0);
+void rcut_by_shells(double* shell_radius, double** ptr_pgf_rcut,
+                    int* bas, double* env, int nbas,
+                    double r0, double precision);
+#endif
diff --git a/pyscf/lib/pbc/fft.c b/pyscf/lib/pbc/fft.c
new file mode 100644
index 0000000000..3affbb9a02
--- /dev/null
+++ b/pyscf/lib/pbc/fft.c
@@ -0,0 +1,147 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <fft.h>
+#include "config.h"
+
+#define BLKSIZE 128
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+fftw_plan fft_create_r2c_plan(double* in, complex double* out, int rank, int* mesh)
+{
+    fftw_plan p;
+    p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE);
+    return p;
+}
+
+fftw_plan fft_create_c2r_plan(complex double* in, double* out, int rank, int* mesh)
+{
+    fftw_plan p;
+    p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE);
+    return p;
+}
+
+void fft_execute(fftw_plan p)
+{
+    fftw_execute(p);
+}
+
+void fft_destroy_plan(fftw_plan p)
+{
+    fftw_destroy_plan(p);
+}
+
+void _complex_fft(complex double* in, complex double* out, int* mesh, int rank, int sign)
+{
+    int i;
+    int nx = mesh[0];
+    int nyz = 1;
+    for (i = 1; i < rank; i++) {
+        nyz *= mesh[i];
+    }
+    int nmax = nyz / BLKSIZE * BLKSIZE;
+    fftw_plan p_2d = fftw_plan_dft(rank-1, mesh+1, in, out, sign, FFTW_ESTIMATE);
+    int nn[BLKSIZE] = {nx};
+    fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, BLKSIZE,
+                                          out, NULL, nyz, 1,
+                                          out, NULL, nyz, 1,
+                                          sign, FFTW_ESTIMATE);
+
+    #pragma omp parallel private(i)
+    {
+        int off;
+        #pragma omp for schedule(dynamic)
+        for (i = 0; i < nx; i++) {
+            off = i * nyz;
+            fftw_execute_dft(p_2d, in+off, out+off);
+        }
+
+        #pragma omp for schedule(dynamic)
+        for (i = 0; i < nmax; i+=BLKSIZE) {
+            fftw_execute_dft(p_3d_x, out+i, out+i);
+        }
+    }
+    fftw_destroy_plan(p_2d);
+    fftw_destroy_plan(p_3d_x);
+    
+    int nres = nyz - nmax;
+    if (nres > 0) {
+        fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, nres,
+                                          out+nmax, NULL, nyz, 1,
+                                          out+nmax, NULL, nyz, 1,
+                                          sign, FFTW_ESTIMATE);
+        fftw_execute(p_3d_x);
+        fftw_destroy_plan(p_3d_x);
+    }
+}
+
+void fft(complex double* in, complex double* out, int* mesh, int rank)
+{
+    _complex_fft(in, out, mesh, rank, FFTW_FORWARD);
+}
+
+void ifft(complex double* in, complex double* out, int* mesh, int rank)
+{
+    _complex_fft(in, out, mesh, rank, FFTW_BACKWARD);
+    size_t i, n = 1;
+    for (i = 0; i < rank; i++) {
+        n *= mesh[i];
+    }
+    double fac = 1. / (double)n;
+    #pragma omp parallel for schedule(static)
+    for (i = 0; i < n; i++) {
+        out[i] *= fac;
+    }
+}
+
+void rfft(double* in, complex double* out, int* mesh, int rank)
+{
+    fftw_plan p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); 
+    fftw_execute(p);
+    fftw_destroy_plan(p);
+}
+
+void irfft(complex double* in, double* out, int* mesh, int rank)
+{
+    fftw_plan p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE);
+    fftw_execute(p);
+    fftw_destroy_plan(p);
+    size_t i, n = 1;
+    for (i = 0; i < rank; i++) {
+        n *= mesh[i];
+    }
+    double fac = 1. / (double)n;
+    #pragma omp parallel for schedule(static)
+    for (i = 0; i < n; i++) {
+        out[i] *= fac;
+    }
+}
+
+void _copy_d2z(double complex *out, const double *in, const size_t n)
+{
+#pragma omp parallel
+{
+    size_t i;
+    #pragma omp for schedule(static)
+    for (i = 0; i < n; i++) {
+        out[i] = in[i] + 0*_Complex_I;
+    }
+}
+}
diff --git a/pyscf/lib/pbc/fft.h b/pyscf/lib/pbc/fft.h
new file mode 100644
index 0000000000..edc5382f7e
--- /dev/null
+++ b/pyscf/lib/pbc/fft.h
@@ -0,0 +1,26 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <fftw3.h>
+
+#define FFT_PLAN fftw_plan
+
+FFT_PLAN fft_create_r2c_plan(double* in, complex double* out, int rank, int* mesh);
+FFT_PLAN fft_create_c2r_plan(complex double* in, double* out, int rank, int* mesh);
+void fft_execute(FFT_PLAN p);
+void fft_destroy_plan(FFT_PLAN p);
diff --git a/pyscf/lib/pbc/fill_ints.c b/pyscf/lib/pbc/fill_ints.c
index 36c853724c..95857b19ee 100644
--- a/pyscf/lib/pbc/fill_ints.c
+++ b/pyscf/lib/pbc/fill_ints.c
@@ -1260,9 +1260,9 @@ static void shift_bas(double *env_loc, double *env, double *Ls, int ptr, int iL)
         env_loc[ptr+2] = env[ptr+2] + Ls[iL*3+2];
 }
 
-static void sort2c_ks1(double complex *out, double *bufr, double *bufi,
-                       int *shls_slice, int *ao_loc, int nkpts, int comp,
-                       int jsh, int msh0, int msh1)
+void sort2c_ks1(double complex *out, double *bufr, double *bufi,
+                int *shls_slice, int *ao_loc, int nkpts, int comp,
+                int jsh, int msh0, int msh1)
 {
         const int ish0 = shls_slice[0];
         const int ish1 = shls_slice[1];
diff --git a/pyscf/lib/pbc/fill_ints.h b/pyscf/lib/pbc/fill_ints.h
new file mode 100644
index 0000000000..ec2000755e
--- /dev/null
+++ b/pyscf/lib/pbc/fill_ints.h
@@ -0,0 +1,29 @@
+/* Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ */
+
+#ifndef HAVE_DEFINED_PBC_FILL_INTS_H
+#define HAVE_DEFINED_PBC_FILL_INTS_H
+
+void sort2c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                int comp, int ish, int jsh);
+void sort2c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh);
+void sort2c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh);
+void sort2c_ks1(double complex *out, double *bufr, double *bufi,
+                int *shls_slice, int *ao_loc, int nkpts, int comp,
+                int jsh, int msh0, int msh1);
+#endif
diff --git a/pyscf/lib/pbc/fill_ints_screened.c b/pyscf/lib/pbc/fill_ints_screened.c
new file mode 100644
index 0000000000..5d100c7ae3
--- /dev/null
+++ b/pyscf/lib/pbc/fill_ints_screened.c
@@ -0,0 +1,1012 @@
+/* Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <complex.h>
+#include <assert.h>
+#include <string.h>
+#include "config.h"
+#include "cint.h"
+#include "vhf/fblas.h"
+#include "pbc/optimizer.h"
+#include "pbc/fill_ints.h"
+#include "pbc/neighbor_list.h"
+#include "np_helper/np_helper.h"
+
+#define INTBUFMAX       1000
+#define INTBUFMAX10     8000
+#define IMGBLK          80
+#define OF_CMPLX        2
+#define MAX_THREADS     256
+
+int GTOmax_shell_dim(int *ao_loc, int *shls_slice, int ncenter);
+int GTOmax_cache_size(int (*intor)(), int *shls_slice, int ncenter,
+                      int *atm, int natm, int *bas, int nbas, double *env);
+
+static int shloc_partition(int *kshloc, int *ao_loc, int ksh0, int ksh1, int dkmax)
+{
+        int ksh;
+        int nloc = 0;
+        int loclast = ao_loc[ksh0];
+        kshloc[0] = ksh0;
+        for (ksh = ksh0+1; ksh < ksh1; ksh++) {
+                assert(ao_loc[ksh+1] - ao_loc[ksh] < dkmax);
+                if (ao_loc[ksh+1] - loclast > dkmax) {
+                        nloc += 1;
+                        kshloc[nloc] = ksh;
+                        loclast = ao_loc[ksh];
+                }
+        }
+        nloc += 1;
+        kshloc[nloc] = ksh1;
+        return nloc;
+}
+
+static void shift_bas(double *env_loc, double *env, double *Ls, int ptr, int iL)
+{
+        env_loc[ptr+0] = env[ptr+0] + Ls[iL*3+0];
+        env_loc[ptr+1] = env[ptr+1] + Ls[iL*3+1];
+        env_loc[ptr+2] = env[ptr+2] + Ls[iL*3+2];
+}
+
+static void sort3c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                       int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+        const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
+        const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        const size_t naok = ao_loc[ksh1] - ao_loc[ksh0];
+        const size_t njk = naoj * naok;
+        const size_t nijk = njk * naoi;
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int ip = ao_loc[ish] - ao_loc[ish0];
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        const int dij = di * dj;
+        out += (ip * naoj + jp) * naok;
+
+        int i, j, k, ksh, ic, dk, dijk;
+        double *pin, *pout;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                dk = ao_loc[ksh+1] - ao_loc[ksh];
+                dijk = dij * dk;
+                for (ic = 0; ic < comp; ic++) {
+                        pout = out + nijk * ic + ao_loc[ksh]-ao_loc[ksh0];
+                        pin = in + dijk * ic;
+                        for (j = 0; j < dj; j++) {
+                                for (i = 0; i < di; i++) {
+                                for (k = 0; k < dk; k++) {
+                                        pout[i*njk+k] = pin[k*dij+i];
+                                } }
+                                pout += naok;
+                                pin += di;
+                        }
+                }
+                in += dijk * comp;
+        }
+}
+
+static void _nr3c_screened_fill_g(int (*intor)(), void (*fsort)(), double *out, int nkpts_ij,
+                         int nkpts, int comp, int nimgs, int ish, int jsh,
+                         double *buf, double *env_loc, double *Ls,
+                         double *expkL_r, double *expkL_i, int *kptij_idx,
+                         int *shls_slice, int *ao_loc,
+                         CINTOpt *cintopt, PBCOpt *pbcopt,
+                         int *atm, int natm, int *bas, int nbas, double *env,
+                         NeighborList** neighbor_list)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+
+        jsh += jsh0;
+        ish += ish0;
+        int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);
+        int kshloc[ksh1-ksh0+1];
+        int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+        int i, m, msh0, msh1, dijm;
+        int ksh, dk, iL, jL, dijkc, ksh_off, jsh_off;
+        int shls[3];
+
+        int nshi = ish1 - ish0;
+        int nshj = jsh1 - jsh0;
+        int nshij = nshi + nshj;
+        int idx_i, idx_j;
+
+        int dijmc = dij * dkmax * comp;
+        double *bufL = buf + dijmc;
+        double *cache = bufL + dijmc;
+        double *pbuf;
+        int (*fprescreen)();
+        if (pbcopt != NULL) {
+                fprescreen = pbcopt->fprescreen;
+        } else {
+                fprescreen = PBCnoscreen;
+        }
+
+        shls[0] = ish;
+        shls[1] = jsh;
+        jsh_off = jsh - nshi;
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0_ki, *np0_kj;
+        for (m = 0; m < nkshloc; m++) {
+                msh0 = kshloc[m];
+                msh1 = kshloc[m+1];
+                dkmax = ao_loc[msh1] - ao_loc[msh0];
+                dijm = dij * dkmax;
+                dijmc = dijm * comp;
+                for (i = 0; i < dijmc; i++) {
+                    bufL[i] = 0;
+                }
+
+                pbuf = bufL;
+                for (ksh = msh0; ksh < msh1; ksh++){
+                    shls[2] = ksh;
+                    ksh_off = ksh - nshij;
+                    dk = ao_loc[ksh+1] - ao_loc[ksh];
+                    dijkc = dij*dk * comp;
+                    np0_ki = (nl0->pairs)[ksh_off*nshi + ish];
+                    np0_kj = (nl0->pairs)[ksh_off*nshj + jsh_off];
+                    if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) { 
+                        for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                            iL = (np0_ki->Ls_list)[idx_i];
+                            shift_bas(env_loc, env, Ls, iptrxyz, iL);
+                            for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                                jL = (np0_kj->Ls_list)[idx_j];
+                                shift_bas(env_loc, env, Ls, jptrxyz, jL);
+
+                                if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                                    if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                        env_loc, cintopt, cache)) {
+                                        for (i = 0; i < dijkc; i++) {
+                                            pbuf[i] += buf[i];
+                                        }
+                                    }
+                                }
+                            } 
+
+                        }
+                    }
+                    pbuf += dijkc;
+                }
+
+                (*fsort)(out, bufL, shls_slice, ao_loc, comp, ish, jsh, msh0, msh1);
+        }
+}
+
+static void _nr3c_screened_sum_auxbas_fill_g(int (*intor)(), void (*fsort)(), double *out, int nkpts_ij,
+                         int nkpts, int comp, int nimgs, int ish, int jsh,
+                         double *buf, double *env_loc, double *Ls,
+                         double *expkL_r, double *expkL_i, int *kptij_idx,
+                         int *shls_slice, int *ao_loc,
+                         CINTOpt *cintopt, PBCOpt *pbcopt,
+                         int *atm, int natm, int *bas, int nbas, double *env,
+                         NeighborList** neighbor_list)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+
+        jsh += jsh0;
+        ish += ish0;
+        int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);
+        //int kshloc[ksh1-ksh0+1];
+        //int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+        int i, k, ic;
+        int ksh, dk, dijk, iL, jL, ksh_off, jsh_off;
+        int shls[3];
+
+        int nshi = ish1 - ish0;
+        int nshj = jsh1 - jsh0;
+        int nshij = nshi + nshj;
+        int idx_i, idx_j;
+
+        int dijmc = dij * dkmax * comp;
+        double *bufL = buf + dijmc;
+        double *cache = bufL + dijmc;
+        double *pbuf, *pbufL;
+        int (*fprescreen)();
+        if (pbcopt != NULL) {
+                fprescreen = pbcopt->fprescreen;
+        } else {
+                fprescreen = PBCnoscreen;
+        }
+
+        shls[0] = ish;
+        shls[1] = jsh;
+        jsh_off = jsh - nshi;
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0_ki, *np0_kj;
+
+        int dijc = dij * comp;
+        for (i = 0; i < dijc; i++) {
+            bufL[i] = 0;
+        }
+
+        for (ksh = ksh0; ksh < ksh1; ksh++){
+            dk = ao_loc[ksh+1] - ao_loc[ksh];
+            assert(dk < dkmax);
+            dijk = dij * dk;
+            shls[2] = ksh;
+            ksh_off = ksh - nshij;
+            np0_ki = (nl0->pairs)[ksh_off*nshi + ish];
+            np0_kj = (nl0->pairs)[ksh_off*nshj + jsh_off];
+            if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) { 
+                for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                    iL = (np0_ki->Ls_list)[idx_i];
+                    shift_bas(env_loc, env, Ls, iptrxyz, iL);
+                    for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                        jL = (np0_kj->Ls_list)[idx_j];
+                        shift_bas(env_loc, env, Ls, jptrxyz, jL);
+
+                        if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                            if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                env_loc, cintopt, cache)) {
+                                for (ic = 0; ic < comp; ic++) {
+                                    pbufL = bufL + ic * dij;
+                                    pbuf = buf + ic * dijk;
+                                    for (k = 0; k < dk; k++) {
+                                        for (i = 0; i < dij; i++) {
+                                            pbufL[i] += pbuf[i];
+                                        }
+                                        pbuf += dij;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        (*fsort)(out, bufL, shls_slice, ao_loc, comp, ish, jsh);
+}
+
+void PBCnr3c_screened_fill_gs1(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+     _nr3c_screened_fill_g(intor, &sort3c_gs1, out, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                  buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                  shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+}
+
+static void sort3c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                            int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+        const size_t naok = ao_loc[ksh1] - ao_loc[ksh0];
+        const size_t off0 = ((size_t)ao_loc[ish0]) * (ao_loc[ish0] + 1) / 2;
+        const size_t nij = ((size_t)ao_loc[ish1]) * (ao_loc[ish1] + 1) / 2 - off0;
+        const size_t nijk = nij * naok;
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        out += (((size_t)ao_loc[ish])*(ao_loc[ish]+1)/2-off0 + jp) * naok;
+
+        int i, j, k, ij, ksh, ic, dk, dijk;
+        double *pin, *pout;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                dk = ao_loc[ksh+1] - ao_loc[ksh];
+                dijk = dij * dk;
+                for (ic = 0; ic < comp; ic++) {
+                        pout = out + nijk * ic + ao_loc[ksh]-ao_loc[ksh0];
+                        pin = in + dijk * ic;
+                        for (i = 0; i < di; i++) {
+                                for (j = 0; j < dj; j++) {
+                                        ij = j * di + i;
+                                        for (k = 0; k < dk; k++) {
+                                                pout[j*naok+k] = pin[k*dij+ij];
+                                        }
+                                }
+                                pout += (i+ao_loc[ish]+1) * naok;
+                        }
+                }
+                in += dijk * comp;
+        }
+}
+
+void sort2c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const size_t off0 = ((size_t)ao_loc[ish0]) * (ao_loc[ish0] + 1) / 2;
+        const size_t nij = ((size_t)ao_loc[ish1]) * (ao_loc[ish1] + 1) / 2 - off0;
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        out += ((size_t)ao_loc[ish])*(ao_loc[ish]+1)/2-off0 + jp;
+
+        int i, j, ic;
+        double *pin, *pout;
+
+        for (ic = 0; ic < comp; ic++) {
+                pout = out + nij * ic;
+                pin = in + dij * ic;
+                for (i = 0; i < di; i++) {
+                        for (j = 0; j < dj; j++) {
+                                pout[j] = pin[j*di+i];
+                        }
+                        pout += (i+ao_loc[ish]+1);
+                }
+        }
+}
+
+static void sort3c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                            int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+        const size_t naok = ao_loc[ksh1] - ao_loc[ksh0];
+        const size_t off0 = ((size_t)ao_loc[ish0]) * (ao_loc[ish0] + 1) / 2;
+        const size_t nij = ((size_t)ao_loc[ish1]) * (ao_loc[ish1] + 1) / 2 - off0;
+        const size_t nijk = nij * naok;
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dij = di * di;
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        out += (((size_t)ao_loc[ish])*(ao_loc[ish]+1)/2-off0 + jp) * naok;
+
+        int i, j, k, ij, ksh, ic, dk, dijk;
+        double *pin, *pout;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                dk = ao_loc[ksh+1] - ao_loc[ksh];
+                dijk = dij * dk;
+                for (ic = 0; ic < comp; ic++) {
+                        pout = out + nijk * ic + ao_loc[ksh]-ao_loc[ksh0];
+                        pin = in + dijk * ic;
+                        for (i = 0; i < di; i++) {
+                                for (j = 0; j <= i; j++) {
+                                        ij = j * di + i;
+                                        for (k = 0; k < dk; k++) {
+                                                pout[j*naok+k] = pin[k*dij+ij];
+                                        }
+                                }
+                                pout += (i+ao_loc[ish]+1) * naok;
+                        }
+                }
+                in += dijk * comp;
+        }
+}
+
+void sort2c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const size_t off0 = ((size_t)ao_loc[ish0]) * (ao_loc[ish0] + 1) / 2;
+        const size_t nij = ((size_t)ao_loc[ish1]) * (ao_loc[ish1] + 1) / 2 - off0;
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dij = di * di;
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        out += ((size_t)ao_loc[ish])*(ao_loc[ish]+1)/2-off0 + jp;
+
+        int i, j, ic;
+        double *pin, *pout;
+
+        for (ic = 0; ic < comp; ic++) {
+                pout = out + nij * ic;
+                pin = in + dij * ic;
+                for (i = 0; i < di; i++) {
+                        for (j = 0; j <= i; j++) {
+                                pout[j] = pin[j*di+i];
+                        }
+                        pout += (i+ao_loc[ish]+1);
+                }
+        }
+}
+
+void sort2c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                int comp, int ish, int jsh)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        const int ip = ao_loc[ish] - ao_loc[ish0];
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];
+        const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
+        const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        const size_t nij = naoi * naoj;
+        out += ip * naoj + jp;
+
+        int i, j, ic;
+        double *pin, *pout;
+
+        for (ic = 0; ic < comp; ic++) {
+                pout = out + nij * ic;
+                pin = in + dij * ic;
+                for (i = 0; i < di; i++) {
+                        for (j = 0; j < dj; j++) {
+                                pout[j] = pin[j*di+i];
+                        }
+                        pout += naoj;
+                }
+        }
+}
+
+void PBCnr3c_screened_fill_gs2(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        int ip = ish + shls_slice[0];
+        int jp = jsh + shls_slice[2] - nbas;
+        if (ip > jp) {
+             _nr3c_screened_fill_g(intor, &sort3c_gs2_igtj, out,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        } else if (ip == jp) {
+             _nr3c_screened_fill_g(intor, &sort3c_gs2_ieqj, out,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        }
+}
+
+void PBCnr3c_screened_sum_auxbas_fill_gs1(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        _nr3c_screened_sum_auxbas_fill_g(intor, &sort2c_gs1, out,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+}
+
+void PBCnr3c_screened_sum_auxbas_fill_gs2(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        int ip = ish + shls_slice[0];
+        int jp = jsh + shls_slice[2] - nbas;
+        if (ip > jp) {
+             _nr3c_screened_sum_auxbas_fill_g(intor, &sort2c_gs2_igtj, out,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        } else if (ip == jp) {
+             _nr3c_screened_sum_auxbas_fill_g(intor, &sort2c_gs2_ieqj, out,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        }
+}
+
+static void contract_3c1e_ipik_dm_gs1(double *grad, double* dm, double *eri,
+                                      int *shls, int *ao_loc, int *atm, int natm,
+                                      int *bas, int nbas, int comp, int nao)
+{
+    const int ish = shls[0];
+    const int jsh = shls[1];
+    const int ksh = shls[2];
+
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di * dj;
+    const size_t i0 = ao_loc[ish];
+    const size_t j0 = ao_loc[jsh] - nao;
+
+    const int ia = bas[ATOM_OF+ish*BAS_SLOTS];
+    const int ka = bas[ATOM_OF+ksh*BAS_SLOTS] - 2*natm;
+
+    int i, j, ic;
+    double *ptr_eri, *ptr_dm;
+    double *dm0 = dm + (i0 * nao + j0);
+    double ipi_dm[comp];
+    for (ic = 0; ic < comp; ic++) {
+        ipi_dm[ic] = 0;
+        ptr_dm = dm0;
+        ptr_eri = eri + dij * ic;
+        for (i = 0; i < di; i++) {
+            for (j = 0; j < dj; j++) {
+                ipi_dm[ic] += ptr_eri[j*di+i] * ptr_dm[j];
+            }
+            ptr_dm += nao;
+        }
+    }
+
+    for (ic = 0; ic < comp; ic++) {
+        grad[ia*comp+ic] += ipi_dm[ic];
+        grad[ka*comp+ic] -= ipi_dm[ic];
+    }
+}
+
+static void _nr3c1e_screened_nuc_grad_fill_g(int (*intor)(), void (*fcontract)(),
+            double *grad, double *dm, int nkpts_ij, int nkpts,
+            int comp, int nimgs, int ish, int jsh,
+            double *buf, double *env_loc, double *Ls,
+            double *expkL_r, double *expkL_i, int *kptij_idx,
+            int *shls_slice, int *ao_loc,
+            CINTOpt *cintopt, PBCOpt *pbcopt,
+            int *atm, int natm, int *bas, int nbas, double *env, int nao,
+            NeighborList** neighbor_list)
+{
+    const int ish0 = shls_slice[0];
+    //const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    //const int jsh1 = shls_slice[3];
+    const int ksh0 = shls_slice[4];
+    const int ksh1 = shls_slice[5];
+
+    ish += ish0;
+    jsh += jsh0;
+    int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+    int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di * dj;
+    int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);
+    //int kshloc[ksh1-ksh0+1];
+    //int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+    int i, k, ic;
+    int ksh, dk, dijk, iL, jL, ksh_off, jsh_off;
+    int shls[3];
+
+    int idx_i, idx_j;
+
+    int dijc = dij * comp;
+    int dijmc = dijc * dkmax;
+    double *bufL = buf + dijmc;
+    double *cache = bufL + dijc;
+    double *pbuf, *pbufL;
+    int (*fprescreen)();
+    if (pbcopt != NULL) {
+            fprescreen = pbcopt->fprescreen;
+    } else {
+            fprescreen = PBCnoscreen;
+    }
+
+    shls[0] = ish;
+    shls[1] = jsh;
+    jsh_off = jsh - nbas;
+    NeighborList *nl0 = *neighbor_list;
+    NeighborPair *np0_ki, *np0_kj;
+
+    for (ksh = ksh0; ksh < ksh1; ksh++){
+        dk = ao_loc[ksh+1] - ao_loc[ksh];
+        assert(dk < dkmax);
+        dijk = dij * dk;
+        shls[2] = ksh;
+        ksh_off = ksh - nbas*2;
+        np0_ki = (nl0->pairs)[ksh_off*nbas + ish];
+        np0_kj = (nl0->pairs)[ksh_off*nbas + jsh_off];
+        if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) {
+            for (i = 0; i < dijc; i++) {
+                bufL[i] = 0;
+            }
+            for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                iL = (np0_ki->Ls_list)[idx_i];
+                shift_bas(env_loc, env, Ls, iptrxyz, iL);
+                for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                    jL = (np0_kj->Ls_list)[idx_j];
+                    shift_bas(env_loc, env, Ls, jptrxyz, jL);
+
+                    if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                        if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                     env_loc, cintopt, cache))
+                        {
+                            for (ic = 0; ic < comp; ic++) {
+                                pbufL = bufL + ic * dij;
+                                pbuf = buf + ic * dijk;
+                                for (k = 0; k < dk; k++) {
+                                    for (i = 0; i < dij; i++) {
+                                        pbufL[i] += pbuf[i];
+                                    }
+                                    pbuf += dij;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            (*fcontract)(grad, dm, bufL, shls, ao_loc, atm, natm, bas, nbas, comp, nao);
+        }
+    }
+}
+
+void PBCnr3c1e_screened_nuc_grad_fill_gs1(int (*intor)(), double *out, double* dm,
+                      int nkpts_ij, int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env, int nao,
+                      NeighborList** neighbor_list)
+{
+        _nr3c1e_screened_nuc_grad_fill_g(intor, &contract_3c1e_ipik_dm_gs1, out, dm,
+                          nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                          buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                          shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, nao, neighbor_list);
+}
+
+void PBCnr3c_screened_drv(int (*intor)(), void (*fill)(), double complex *eri,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nish = ish1 - ish0;
+        const int njsh = jsh1 - jsh0;
+        double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        double *expkL_i = expkL_r + nimgs*nkpts;
+        int i;
+        for (i = 0; i < nimgs*nkpts; i++) {
+                expkL_r[i] = creal(expkL[i]);
+                expkL_i[i] = cimag(expkL[i]);
+        }
+
+        size_t count;
+        count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count+= nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int ish, jsh, ij;
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+#pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nish*njsh; ij++) {
+                ish = ij / njsh;
+                jsh = ij % njsh;
+                (*fill)(intor, eri, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        }
+        free(buf);
+        free(env_loc);
+}
+        free(expkL_r);
+}
+
+void PBCnr3c_screened_sum_auxbas_drv(int (*intor)(), void (*fill)(), double complex *eri,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nish = ish1 - ish0;
+        const int njsh = jsh1 - jsh0;
+        double *expkL_r=NULL, *expkL_i=NULL;
+        //expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        //expkL_i = expkL_r + nimgs*nkpts;
+        //int i;
+        //for (i = 0; i < nimgs*nkpts; i++) {
+        //        expkL_r[i] = creal(expkL[i]);
+        //        expkL_i[i] = cimag(expkL[i]);
+        //}
+
+        size_t count;
+        count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count+= nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int ish, jsh, ij;
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+#pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nish*njsh; ij++) {
+                ish = ij / njsh;
+                jsh = ij % njsh;
+                (*fill)(intor, eri, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, neighbor_list);
+        }
+        free(buf);
+        free(env_loc);
+}
+        //free(expkL_r);
+}
+
+void PBCnr3c1e_screened_nuc_grad_drv(int (*intor)(), void (*fill)(), 
+                 double* grad, double* dm,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv, int nao,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nish = ish1 - ish0;
+        const int njsh = jsh1 - jsh0;
+        double *expkL_r=NULL, *expkL_i=NULL;
+        //double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        //double *expkL_i = expkL_r + nimgs*nkpts;
+        //int i;
+        //for (i = 0; i < nimgs*nkpts; i++) {
+        //        expkL_r[i] = creal(expkL[i]);
+        //        expkL_i[i] = cimag(expkL[i]);
+        //}
+
+        size_t count;
+        count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count+= nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+        double *gradbufs[MAX_THREADS];
+#pragma omp parallel
+{
+        int ish, jsh, ij;
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+        double *grad_loc;
+        int thread_id = omp_get_thread_num();
+        if (thread_id == 0) {
+                grad_loc = grad;
+        } else {
+                grad_loc = calloc(natm*comp, sizeof(double));
+        }
+        gradbufs[thread_id] = grad_loc;
+
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+        #pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nish*njsh; ij++) {
+                ish = ij / njsh;
+                jsh = ij % njsh;
+                (*fill)(intor, grad_loc, dm, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, nao, neighbor_list);
+        }
+        free(buf);
+        free(env_loc);
+
+        NPomp_dsum_reduce_inplace(gradbufs, natm*comp);
+        if (thread_id != 0) {
+                free(grad_loc);
+        }
+}
+        //free(expkL_r);
+}
+
+
+static int _nr2c_screened_fill(
+                int (*intor)(), double complex *out,
+                int nkpts, int comp, int nimgs, int jsh, int ish0,
+                double *buf, double *env_loc, double *Ls,
+                double *expkL_r, double *expkL_i,
+                int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                int *atm, int natm, int *bas, int nbas, double *env,
+                NeighborList** neighbor_list)
+{
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nshi = ish1 - shls_slice[0];
+        const int nshj = jsh1 - jsh0;
+
+        const double D1 = 1;
+        const int I1 = 1;
+
+        ish0 += shls_slice[0];
+        jsh += jsh0;
+        int jsh_off = jsh - nshi;
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        int dimax = INTBUFMAX10 / dj;
+        int ishloc[ish1-ish0+1];
+        int nishloc = shloc_partition(ishloc, ao_loc, ish0, ish1, dimax);
+
+        int m, msh0, msh1, dijc, dmjc, ish, di, empty;
+        int jL, idx_j;
+        int shls[2];
+        double *bufk_r = buf;
+        double *bufk_i, *bufL, *pbufk_r, *pbufk_i, *cache;
+
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0;
+
+        shls[1] = jsh;
+        for (m = 0; m < nishloc; m++) {
+                msh0 = ishloc[m];
+                msh1 = ishloc[m+1];
+                dimax = ao_loc[msh1] - ao_loc[msh0];
+                dmjc = dj * dimax * comp;
+                bufk_i = bufk_r + dmjc * nkpts;
+                bufL   = bufk_i + dmjc * nkpts;
+                cache  = bufL   + dmjc;
+
+                memset(bufk_r, 0, 2*dmjc*nkpts*sizeof(double));
+                pbufk_r = bufk_r;
+                pbufk_i = bufk_i;
+                for (ish = msh0; ish < msh1; ish++) {
+                        shls[0] = ish;
+                        di = ao_loc[ish+1] - ao_loc[ish];
+                        dijc = di * dj * comp;
+                        np0 = (nl0->pairs)[ish*nshj + jsh_off];
+                        if (np0->nimgs > 0) {
+                                for (idx_j = 0; idx_j < np0->nimgs; idx_j++){
+                                        jL = (np0->Ls_list)[idx_j];
+                                        shift_bas(env_loc, env, Ls, jptrxyz, jL);
+                                        if ((*intor)(bufL, NULL, shls, atm, natm, bas, nbas,
+                                                     env_loc, cintopt, cache)) {
+                                                empty = 0;
+                                                dger_(&dijc, &nkpts, &D1, bufL, &I1,
+                                                      expkL_r+jL, &nimgs, pbufk_r, &dmjc);
+                                                dger_(&dijc, &nkpts, &D1, bufL, &I1,
+                                                      expkL_i+jL, &nimgs, pbufk_i, &dmjc);
+                                        }
+                                }
+                        }
+                        pbufk_r += dijc;
+                        pbufk_i += dijc;
+                }
+                sort2c_ks1(out, bufk_r, bufk_i, shls_slice, ao_loc,
+                           nkpts, comp, jsh, msh0, msh1);
+        }
+        return !empty;
+}
+
+void PBCnr2c_screened_fill_ks1(int (*intor)(), double complex *out,
+                      int nkpts, int comp, int nimgs, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i,
+                      int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        _nr2c_screened_fill(intor, out, nkpts, comp, nimgs, jsh, 0,
+                   buf, env_loc, Ls, expkL_r, expkL_i, shls_slice, ao_loc,
+                   cintopt, atm, natm, bas, nbas, env, neighbor_list);
+}
+
+void PBCnr2c_screened_fill_ks2(int (*intor)(), double complex *out,
+                      int nkpts, int comp, int nimgs, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i,
+                      int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        _nr2c_screened_fill(intor, out, nkpts, comp, nimgs, jsh, jsh,
+                   buf, env_loc, Ls, expkL_r, expkL_i, shls_slice, ao_loc,
+                   cintopt, atm, natm, bas, nbas, env, neighbor_list);
+}
+
+void PBCnr2c_screened_drv(int (*intor)(), void (*fill)(), double complex *out,
+                 int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL,
+                 int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int njsh = jsh1 - jsh0;
+        double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        double *expkL_i = expkL_r + nimgs*nkpts;
+        int i;
+        for (i = 0; i < nimgs*nkpts; i++) {
+                expkL_r[i] = creal(expkL[i]);
+                expkL_i[i] = cimag(expkL[i]);
+        }
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 2,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int jsh;
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+        size_t count = (nkpts+1) * OF_CMPLX;
+        double *buf = malloc(sizeof(double)*(count*INTBUFMAX10*comp+cache_size));
+#pragma omp for schedule(dynamic)
+        for (jsh = 0; jsh < njsh; jsh++) {
+                (*fill)(intor, out, nkpts, comp, nimgs, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i,
+                        shls_slice, ao_loc, cintopt, atm, natm, bas, nbas, env,
+                        neighbor_list);
+        }
+        free(buf);
+        free(env_loc);
+}
+        free(expkL_r);
+}
diff --git a/pyscf/lib/pbc/hf_grad.c b/pyscf/lib/pbc/hf_grad.c
new file mode 100644
index 0000000000..7c781fba19
--- /dev/null
+++ b/pyscf/lib/pbc/hf_grad.c
@@ -0,0 +1,95 @@
+/* Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "pbc/neighbor_list.h"
+
+#define MAX_THREADS 256
+
+void contract_vhf_dm(double* out, double* vhf, double* dm,
+                     NeighborList** neighbor_list,
+                     int* shls_slice, int* ao_loc, int* shls_atm,
+                     int comp, int natm, int nbas)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    const size_t nijsh = (size_t)nish * njsh;
+    const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
+    const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+
+    const int I1 = 1;
+    double *out_bufs[MAX_THREADS];
+
+#pragma omp parallel
+{
+    size_t ij, ish, jsh, p0, q0;
+    int ni, nj, i, ic, iatm, nimgs=1;
+    NeighborList *nl0=NULL;
+    if (neighbor_list != NULL) {
+        nl0 = *neighbor_list;
+    }
+    double *pvhf, *pdm;
+
+    int thread_id = omp_get_thread_num();
+    double *buf;
+    if (thread_id == 0) {
+        buf = out;
+    } else {
+        buf = calloc(comp*natm, sizeof(double));
+    }
+    out_bufs[thread_id] = buf;
+
+    #pragma omp for schedule(dynamic) 
+    for (ij = 0; ij < nijsh; ij++) {
+        ish = ij / njsh + ish0;
+        jsh = ij % njsh + jsh0;
+
+        if (nl0 != NULL) {
+            nimgs = ((nl0->pairs)[ish*nbas + jsh])->nimgs;
+        }
+        if (nimgs > 0) { // this shell pair has contribution
+            p0 = ao_loc[ish] - ao_loc[ish0];
+            q0 = ao_loc[jsh] - ao_loc[jsh0];
+            ni = ao_loc[ish+1] - ao_loc[ish];
+            nj = ao_loc[jsh+1] - ao_loc[jsh];
+
+            iatm = shls_atm[ish];
+            pvhf = vhf + (p0 * naoj + q0);
+            pdm = dm + (p0 * naoj + q0);
+            for (ic = 0; ic < comp; ic++) {
+                for (i = 0; i < ni; i++) {
+                    buf[iatm*3+ic] += ddot_(&nj, pvhf+i*naoj, &I1, pdm+i*naoj, &I1);
+                }
+                pvhf += naoi * naoj;
+            }
+        }
+    }
+
+    NPomp_dsum_reduce_inplace(out_bufs, comp*natm);
+    if (thread_id != 0) {
+        free(buf);
+    }
+}
+}
diff --git a/pyscf/lib/pbc/neighbor_list.c b/pyscf/lib/pbc/neighbor_list.c
new file mode 100644
index 0000000000..26fb52fd37
--- /dev/null
+++ b/pyscf/lib/pbc/neighbor_list.c
@@ -0,0 +1,206 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/neighbor_list.h"
+
+#define SQUARE(r)       (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
+
+void init_neighbor_pair(NeighborPair** np, int nimgs, int* Ls_list)
+{
+    NeighborPair *np0 = (NeighborPair*) malloc(sizeof(NeighborPair));
+    np0->nimgs = nimgs;
+    np0->q_cond = NULL;
+    np0->center = NULL;
+    if (nimgs > 0){
+        np0->Ls_list = (int*) malloc(sizeof(int)*nimgs);
+        int i;
+        for (i=0; i<nimgs; i++) {
+            np0->Ls_list[i] = Ls_list[i];
+        }
+    }
+    else {
+        np0->Ls_list = NULL;
+    }
+    *np = np0;
+}
+
+void del_neighbor_pair(NeighborPair** np)
+{
+    NeighborPair *np0 = *np;
+    if (!np0) {
+        return;
+    }
+    if (np0->Ls_list) {
+        free(np0->Ls_list);
+    }
+    if (np0->q_cond) {
+        free(np0->q_cond);
+    }
+    if (np0->center) {
+        free(np0->center);
+    }
+    free(np0);
+    *np = NULL;
+}
+
+void init_neighbor_list(NeighborList** nl, int nish, int njsh, int nimgs)
+{
+    NeighborList *nl0 = (NeighborList*) malloc(sizeof(NeighborList)); 
+    nl0->nish = nish;
+    nl0->njsh = njsh;
+    nl0->nimgs = nimgs;
+    nl0->pairs = (NeighborPair**) malloc(sizeof(NeighborPair*)*nish*njsh);
+    int ish, jsh;
+    for (ish=0; ish<nish; ish++)
+        for (jsh=0; jsh<njsh; jsh++) {
+            (nl0->pairs)[ish*njsh+jsh] = NULL;
+        }
+    *nl = nl0;
+}
+
+void build_neighbor_list(NeighborList** nl,
+                         int* ish_atm, int* ish_bas, double* ish_env, double* ish_rcut, 
+                         int* jsh_atm, int* jsh_bas, double* jsh_env, double* jsh_rcut,
+                         int nish, int njsh, double* Ls, int nimgs, int hermi)
+{
+    init_neighbor_list(nl, nish, njsh, nimgs);
+    NeighborList* nl0 = *nl;
+
+#pragma omp parallel
+{
+    int *buf = (int*) malloc(sizeof(int)*nimgs);
+    int ish, jsh, iL, nL;
+    int ish_atm_id, jsh_atm_id;
+    double ish_radius, jsh_radius, rmax, dij;
+    double *ish_ratm, *jsh_ratm, *rL;
+    double rij[3];
+    NeighborPair **np = NULL;
+#pragma omp for schedule(dynamic)
+    for (ish=0; ish<nish; ish++) {
+        ish_radius = ish_rcut[ish];
+        ish_atm_id = ish_bas[ish*BAS_SLOTS+ATOM_OF];
+        ish_ratm = ish_env + ish_atm[ish_atm_id*ATM_SLOTS+PTR_COORD];
+        for (jsh=0; jsh<njsh; jsh++) {
+            if (hermi == 1 && jsh < ish) {
+                continue;
+            }
+            jsh_radius = jsh_rcut[jsh];
+            jsh_atm_id = jsh_bas[jsh*BAS_SLOTS+ATOM_OF];
+            jsh_ratm = jsh_env + jsh_atm[jsh_atm_id*ATM_SLOTS+PTR_COORD];
+            rmax = ish_radius + jsh_radius;
+            nL = 0;
+            for (iL=0; iL<nimgs; iL++) {
+                rL = Ls + iL*3;
+                rij[0] = jsh_ratm[0] + rL[0] - ish_ratm[0];
+                rij[1] = jsh_ratm[1] + rL[1] - ish_ratm[1];
+                rij[2] = jsh_ratm[2] + rL[2] - ish_ratm[2];
+                dij = sqrt(SQUARE(rij));
+                if (dij < rmax) {
+                    buf[nL] = iL;
+                    nL += 1;
+                }
+            }
+            np = nl0->pairs + ish*njsh+jsh;
+            init_neighbor_pair(np, nL, buf);
+        }
+    }
+    free(buf);
+}
+}
+
+void del_neighbor_list(NeighborList** nl)
+{
+    NeighborList *nl0 = *nl;
+    if (!nl0) {
+        return;
+    }
+    int ish, jsh;
+    int nish = nl0->nish;
+    int njsh = nl0->njsh;
+    if (nl0->pairs) {
+        for (ish=0; ish<nish; ish++) {
+            for (jsh=0; jsh<njsh; jsh++) {
+                del_neighbor_pair(nl0->pairs + ish*njsh+jsh);
+            }
+        }
+        free(nl0->pairs);
+    }
+    free(nl0);
+    *nl = NULL;
+}
+
+
+int NLOpt_noscreen(int* shls, NeighborListOpt* opt)
+{
+    return 1;
+}
+
+int NLOpt_screen(int* shls, NeighborListOpt* opt)
+{
+    int ish = shls[0];
+    int jsh = shls[1];
+    NeighborList *nl = opt->nl;
+    int njsh = nl->njsh;
+    NeighborPair *np;
+    np = (nl->pairs)[ish*njsh + jsh];
+    return np->nimgs > 0;
+}
+
+void NLOpt_init(NeighborListOpt **opt)
+{
+    NeighborListOpt *opt0 = malloc(sizeof(NeighborListOpt));
+    opt0->nl = NULL;
+    opt0->fprescreen = &NLOpt_noscreen;
+    *opt = opt0;
+}
+
+void NLOpt_del(NeighborListOpt **opt)
+{
+    NeighborListOpt *opt0 = *opt;
+    if (!opt0) {
+        return;
+    }
+    free(opt0);
+    *opt = NULL;
+}
+
+void NLOpt_set_nl(NeighborListOpt *opt, NeighborList *nl)
+{
+    opt->nl = nl;
+}
+
+void NLOpt_reset(NeighborListOpt *opt)
+{
+    opt->nl = NULL;
+    opt->fprescreen = &NLOpt_screen;
+}
+
+void NLOpt_set_optimizer(NeighborListOpt *opt)
+{
+    opt->fprescreen = &NLOpt_screen;
+}
+
+void NLOpt_del_optimizer(NeighborListOpt *opt)
+{
+    opt->fprescreen = &NLOpt_noscreen;
+}
+
diff --git a/pyscf/lib/pbc/neighbor_list.h b/pyscf/lib/pbc/neighbor_list.h
new file mode 100644
index 0000000000..3364be1f3d
--- /dev/null
+++ b/pyscf/lib/pbc/neighbor_list.h
@@ -0,0 +1,41 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_NEIGHBOR_LIST_H
+#define HAVE_DEFINED_NEIGHBOR_LIST_H
+typedef struct NeighborPair_struct {
+    int nimgs;
+    int *Ls_list;
+    double *q_cond;
+    double *center;
+} NeighborPair;
+
+typedef struct NeighborList_struct {
+    int nish;
+    int njsh;
+    int nimgs;
+    NeighborPair **pairs;
+} NeighborList;
+
+typedef struct NeighborListOpt_struct {
+    NeighborList *nl;
+    int (*fprescreen)(int *shls, struct NeighborListOpt_struct *opt);
+} NeighborListOpt;
+
+int NLOpt_noscreen(int* shls, NeighborListOpt* opt);
+#endif
diff --git a/pyscf/lib/pbc/optimizer.c b/pyscf/lib/pbc/optimizer.c
index d30c81c3e8..a37494ca0a 100644
--- a/pyscf/lib/pbc/optimizer.c
+++ b/pyscf/lib/pbc/optimizer.c
@@ -17,6 +17,7 @@
  */
 
 #include <stdlib.h>
+#include <math.h>
 #include "cint.h"
 #include "pbc/optimizer.h"
 
@@ -27,6 +28,7 @@ void PBCinit_optimizer(PBCOpt **opt, int *atm, int natm,
 {
         PBCOpt *opt0 = malloc(sizeof(PBCOpt));
         opt0->rrcut = NULL;
+        opt0->rcut = NULL;
         opt0->fprescreen = &PBCnoscreen;
         *opt = opt0;
 }
@@ -41,11 +43,13 @@ void PBCdel_optimizer(PBCOpt **opt)
         if (opt0->rrcut != NULL) {
                 free(opt0->rrcut);
         }
+        if (!opt0->rcut) {
+                free(opt0->rcut);
+        }
         free(opt0);
         *opt = NULL;
 }
 
-
 int PBCnoscreen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
 {
         return 1;
@@ -68,6 +72,23 @@ int PBCrcut_screen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
         return (rr < opt->rrcut[ish] || rr < opt->rrcut[jsh]);
 }
 
+int PBCrcut_screen_loose(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
+{
+        if (opt == NULL) {
+                return 1; // no screen
+        }
+        const int ish = shls[0];
+        const int jsh = shls[1];
+        const double *ri = env + atm[bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS+PTR_COORD];
+        const double *rj = env + atm[bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS+PTR_COORD];
+        double rirj[3];
+        rirj[0] = ri[0] - rj[0];
+        rirj[1] = ri[1] - rj[1];
+        rirj[2] = ri[2] - rj[2];
+        double r = sqrt(SQUARE(rirj));
+        return r < opt->rcut[ish] + opt->rcut[jsh];
+}
+
 void PBCset_rcut_cond(PBCOpt *opt, double *rcut,
                       int *atm, int natm, int *bas, int nbas, double *env)
 {
@@ -82,3 +103,18 @@ void PBCset_rcut_cond(PBCOpt *opt, double *rcut,
                 opt->rrcut[i] = rcut[i] * rcut[i];
         }
 }
+
+void PBCset_rcut_cond_loose(PBCOpt *opt, double *rcut,
+                            int *atm, int natm, int *bas, int nbas, double *env)
+{
+        if (opt->rcut != NULL) {
+                free(opt->rcut);
+        }
+        opt->rcut = (double *)malloc(sizeof(double) * nbas);
+        opt->fprescreen = &PBCrcut_screen_loose;
+
+        int i;
+        for (i = 0; i < nbas; i++) {
+                opt->rcut[i] = rcut[i];
+        }
+}
diff --git a/pyscf/lib/pbc/optimizer.h b/pyscf/lib/pbc/optimizer.h
index ff3299715b..62c8be5d32 100644
--- a/pyscf/lib/pbc/optimizer.h
+++ b/pyscf/lib/pbc/optimizer.h
@@ -16,10 +16,11 @@
  * Author: Qiming Sun <osirpt.sun@gmail.com>
  */
 
-#if !defined(HAVE_DEFINED_CVHFOPT_H)
-#define HAVE_DEFINED_CVHFOPT_H
+#if !defined(HAVE_DEFINED_PBCOPT_H)
+#define HAVE_DEFINED_PBCOPT_H
 typedef struct PBCOpt_struct {
     double *rrcut;
+    double *rcut;
     int (*fprescreen)(int *shls, struct PBCOpt_struct *opt,
                       int *atm, int *bas, double *env);
 } PBCOpt;
@@ -27,4 +28,3 @@ typedef struct PBCOpt_struct {
 
 int PBCnoscreen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env);
 int PBCrcut_screen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env);
-
diff --git a/pyscf/lib/pbc/pp.c b/pyscf/lib/pbc/pp.c
new file mode 100644
index 0000000000..4885080544
--- /dev/null
+++ b/pyscf/lib/pbc/pp.c
@@ -0,0 +1,448 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <complex.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "gto/gto.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "pbc/fill_ints.h"
+#include "pbc/neighbor_list.h"
+
+#define HL_TABLE_SLOTS  7
+//#define ATOM_OF         0
+//#define ANG_OF          1
+#define HL_DIM_OF       2
+#define HL_DATA_OF      3
+#define HL_OFFSET0      4
+#define HF_OFFSET1      5
+#define HF_OFFSET2      6
+#define MAX_THREADS     256
+
+
+static void _ppnl_fill_g(void (*fsort)(), double* out, double** ints,
+                         int comp, int ish, int jsh, double* buf,
+                         int *shls_slice, int *ao_loc,
+                         int* hl_table, double* hl_data, int nhl,
+                         NeighborListOpt* nlopt)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+
+    ish += ish0;
+    jsh += jsh0;
+
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di *dj;
+    const int ioff = ao_loc[ish] - ao_loc[ish0];
+    const int joff = ao_loc[jsh] - ao_loc[jsh0];
+    const int naoi = ao_loc[ish1] - ao_loc[ish0];
+    const int naoj = ao_loc[jsh1] - ao_loc[jsh0];
+
+    int i, j, ij, pi, pj, ksh;
+    int hl_dim, nd;
+    int shls_ki[2], shls_kj[2];
+    int *table, *offset;
+    double *hl;
+    for (ij = 0; ij < dij; ij++) {
+        buf[ij] = 0;
+    }
+
+    int (*fprescreen)();
+    if (nlopt != NULL) {
+        fprescreen = nlopt->fprescreen;
+    } else {
+        fprescreen = NLOpt_noscreen;
+    }
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D1 = 1.;
+    for (ksh = 0; ksh < nhl; ksh++) {
+        shls_ki[0] = ksh;
+        shls_ki[1] = ish;
+        shls_kj[0] = ksh;
+        shls_kj[1] = jsh;
+        if ((*fprescreen)(shls_ki, nlopt) && (*fprescreen)(shls_kj, nlopt)) {
+            table = hl_table + ksh * HL_TABLE_SLOTS;
+            hl_dim = table[HL_DIM_OF];
+            nd = table[ANG_OF] * 2 + 1;
+            offset = table + HL_OFFSET0;
+            hl = hl_data + table[HL_DATA_OF];
+            for (i=0; i<hl_dim; i++) {
+                pi = offset[i];
+                for (j=0; j<hl_dim; j++) {
+                    pj = offset[j];
+                    dgemm_(&TRANS_N, &TRANS_T, &di, &dj, &nd,
+                           hl+j+i*hl_dim, ints[i]+pi*naoi+ioff, &naoi,
+                           ints[j]+pj*naoj+joff, &naoj, &D1, buf, &di);
+                }
+            }
+        }
+    }
+    (*fsort)(out, buf, shls_slice, ao_loc, comp, ish, jsh);
+}
+
+
+void ppnl_fill_gs1(double* out, double** ints,
+                   int comp, int ish, int jsh, double* buf,
+                   int *shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{
+    _ppnl_fill_g(&sort2c_gs1, out, ints, comp, ish, jsh, buf,
+                 shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+}
+
+
+void ppnl_fill_gs2(double* out, double** ints,
+                   int comp, int ish, int jsh, double* buf,
+                   int *shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{
+    int ip = ish + shls_slice[0];
+    int jp = jsh + shls_slice[2];
+    if (ip > jp) {
+        _ppnl_fill_g(&sort2c_gs2_igtj, out, ints, comp, ish, jsh, buf,
+                     shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+    } else if (ip == jp) {
+        _ppnl_fill_g(&sort2c_gs2_ieqj, out, ints, comp, ish, jsh, buf,
+                     shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+    }
+}
+
+
+void contract_ppnl(void (*fill)(), double* out,
+                   double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                   int comp, int* shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    const size_t nijsh = (size_t) nish * njsh;
+
+    double *ints[3] = {ppnl_half0, ppnl_half1, ppnl_half2};
+
+    int di = GTOmax_shell_dim(ao_loc, shls_slice+0, 1);
+    int dj = GTOmax_shell_dim(ao_loc, shls_slice+2, 1);
+    size_t buf_size = di*dj*comp;
+
+    #pragma omp parallel
+    {
+        int ish, jsh;
+        size_t ij;
+        double *buf = (double*) malloc(sizeof(double) * buf_size);
+        #pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nijsh; ij++) {
+            ish = ij / njsh;
+            jsh = ij % njsh;
+            (*fill)(out, ints, comp, ish, jsh, buf,
+                    shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+        }
+        free(buf);
+    }
+}
+
+
+void contract_ppnl_ip1(double* out, int comp,
+                       double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                       double* ppnl_half_ip2_0, double* ppnl_half_ip2_1, double* ppnl_half_ip2_2,
+                       int* hl_table, double* hl_data, int nhl, int nao, int* naux,
+                       int* aux_id)
+{
+    const int One = 1;
+    const char TRANS_N = 'N';
+    //const char TRANS_T = 'T';
+    const double D1 = 1.;
+    const double D0 = 0.;
+
+    size_t nao_pair = (size_t) nao * nao;
+    memset(out, 0, nao_pair*comp*sizeof(double));
+
+    size_t n2[3];
+    n2[0] = (size_t) nao * naux[0];
+    n2[1] = (size_t) nao * naux[1];
+    n2[2] = (size_t) nao * naux[2];
+    size_t buf_size = 54 * (size_t) nao + 27;
+
+#pragma omp parallel
+{
+    size_t ib, id, i, p, ic;
+    double *pout;
+    double *buf = (double*) malloc(sizeof(double)*buf_size);
+
+    #pragma omp for schedule(dynamic)
+    for (p = 0; p < nao; p++){
+        pout = out + (size_t)p*nao;
+        for (id = 0; id < nhl; id++) {
+            ib = aux_id[id];
+            int *table = hl_table + ib * HL_TABLE_SLOTS;
+            int hl_dim = table[HL_DIM_OF];
+            int ptr = table[HL_DATA_OF];
+            int nd = table[ANG_OF] * 2 + 1;
+            int *offset = table + HL_OFFSET0;
+            double *hl = hl_data + ptr;
+            int lp_dim = nd * nao;
+            int ilp_dim = hl_dim * lp_dim;
+            int il_dim = hl_dim * nd;
+
+            double *ilp = buf;
+            double *ilp_ip2 = ilp + ilp_dim;
+            double *hilp = ilp_ip2 + nd*3;
+            for (ic = 0; ic < comp; ic++) {
+                for (i=0; i<hl_dim; i++) {
+                    int p0 = offset[i];
+                    if (i == 0) {
+                        dcopy_(&lp_dim, ppnl_half0+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_0+p+p0*nao+ic*n2[0], &nao, ilp_ip2+i*nd, &One);
+                    }
+                    else if (i == 1) {
+                        dcopy_(&lp_dim, ppnl_half1+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_1+p+p0*nao+ic*n2[1], &nao, ilp_ip2+i*nd, &One);
+                    }
+                    else if (i == 2) {
+                        dcopy_(&lp_dim, ppnl_half2+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_2+p+p0*nao+ic*n2[2], &nao, ilp_ip2+i*nd, &One);
+                    }
+                }
+                dgemm_(&TRANS_N, &TRANS_N, &lp_dim, &hl_dim, &hl_dim, 
+                       &D1, ilp, &lp_dim, hl, &hl_dim, &D0, hilp, &lp_dim);
+                dgemm_(&TRANS_N, &TRANS_N, &nao, &One, &il_dim,
+                       &D1, hilp, &nao, ilp_ip2, &il_dim, &D1, pout+ic*nao_pair, &nao);
+            }
+        }
+    }
+    free(buf);
+}
+}
+
+
+static void _contract_vnuc_ip1_dm(double* out, double* in, double* dm, int comp,
+                                  int* shls_slice, int* ao_loc, int* bas,
+                                  int ish, int jsh, int naoi, int katm)
+{
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int iatm = bas[ATOM_OF+ish*BAS_SLOTS];
+
+    const int One = 1;
+    int ic, j;
+    double buf[comp];
+    double *pdm;
+    for (ic = 0; ic < comp; ic++) {
+        buf[ic] = 0;
+        pdm = dm;
+        for (j = 0; j < dj; j++) {
+            buf[ic] += ddot_(&di, in, &One, pdm, &One);
+            in += di;
+            pdm += naoi;
+        }
+    }
+
+    for (ic = 0; ic < comp; ic++) {
+        out[iatm*comp+ic] += buf[ic];
+        out[katm*comp+ic] -= buf[ic];
+    }
+}
+
+
+void ppnl_nuc_grad_fill_gs1(double* out, double* dm, int comp,
+                            double** ints, double** ints_ip2,
+                            int* hl_table, double* hl_data, int nhl, int* naux,
+                            int* shls_slice, int* ao_loc, int* bas, double* buf, int ish, int jsh,
+                            NeighborListOpt* nlopt)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+
+    ish += ish0;
+    jsh += jsh0;
+
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di * dj;
+    const size_t dijm = (size_t)dij * comp;
+    const int i0 = ao_loc[ish] - ao_loc[ish0];
+    const int j0 = ao_loc[jsh] - ao_loc[jsh0];
+    const int naoi = ao_loc[ish1] - ao_loc[ish0];
+    const int naoj = ao_loc[jsh1] - ao_loc[jsh0];
+
+    size_t n2[3];
+    n2[0] = (size_t) naoi * naux[0];
+    n2[1] = (size_t) naoi * naux[1];
+    n2[2] = (size_t) naoi * naux[2];
+
+    int (*fprescreen)();
+    if (nlopt != NULL) {
+        fprescreen = nlopt->fprescreen;
+    } else {
+        fprescreen = NLOpt_noscreen;
+    }
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D1 = 1.;
+
+    int i, j, pi, pj, ksh, ic;
+    int katm, l, hl_dim, nd;
+    int shls_ki[2], shls_kj[2];
+    int *table, *offset;
+    double *hl;
+    for (ksh = 0; ksh < nhl; ksh++) {
+        shls_ki[0] = ksh;
+        shls_ki[1] = ish;
+        shls_kj[0] = ksh;
+        shls_kj[1] = jsh;
+        if ((*fprescreen)(shls_ki, nlopt) && (*fprescreen)(shls_kj, nlopt)) {
+            table = hl_table + ksh * HL_TABLE_SLOTS;
+            katm = table[ATOM_OF];
+            l = table[ANG_OF];
+            hl_dim = table[HL_DIM_OF];
+            nd = 2 * l + 1;
+            offset = table + HL_OFFSET0;
+            hl = hl_data + table[HL_DATA_OF];
+
+            memset(buf, 0, dijm*sizeof(double));
+            for (ic = 0; ic < comp; ic++) {
+                for (i=0; i<hl_dim; i++) {
+                    pi = offset[i];
+                    for (j=0; j<hl_dim; j++) {
+                        pj = offset[j];
+                        dgemm_(&TRANS_N, &TRANS_T, &di, &dj, &nd,
+                               hl+j+i*hl_dim, ints_ip2[i]+ic*n2[i]+pi*naoi+i0, &naoi,
+                               ints[j]+pj*naoj+j0, &naoj, &D1, buf+ic*dij, &di);
+                    }
+                }
+            }
+            _contract_vnuc_ip1_dm(out, buf, dm+j0*naoi+i0, comp,
+                                  shls_slice, ao_loc, bas,
+                                  ish, jsh, naoi, katm);
+        }
+    }
+}
+
+
+void contract_ppnl_nuc_grad(void (*fill)(), double* grad, double* dm, int comp,
+                            double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                            double* ppnl_half_ip2_0, double* ppnl_half_ip2_1, double* ppnl_half_ip2_2,
+                            int* hl_table, double* hl_data, int nhl, int* naux,
+                            int* shls_slice, int* ao_loc, int* bas, int natm,
+                            NeighborListOpt* nlopt)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    const size_t nijsh = (size_t)nish * njsh;
+
+    int di = GTOmax_shell_dim(ao_loc, shls_slice+0, 1);
+    int dj = GTOmax_shell_dim(ao_loc, shls_slice+2, 1);
+    size_t buf_size = di*dj*comp;
+
+    double *ints[3] = {ppnl_half0, ppnl_half1, ppnl_half2};
+    double *ints_ip2[3] = {ppnl_half_ip2_0, ppnl_half_ip2_1, ppnl_half_ip2_2};
+
+    double *gradbufs[MAX_THREADS];
+    #pragma omp parallel
+    {
+        int ish, jsh;
+        size_t ij;
+        double *grad_loc;
+        int thread_id = omp_get_thread_num();
+        if (thread_id == 0) {
+            grad_loc = grad;
+        } else {
+            grad_loc = calloc(natm*comp, sizeof(double));
+        }
+        gradbufs[thread_id] = grad_loc;
+        double *buf = (double*) malloc(sizeof(double)*buf_size);
+
+        #pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nijsh; ij++) {
+            ish = ij / njsh;
+            jsh = ij % njsh;
+
+            (*fill)(grad_loc, dm, comp, ints, ints_ip2,
+                    hl_table, hl_data, nhl, naux,
+                    shls_slice, ao_loc, bas, buf, ish, jsh, nlopt);
+        }
+        free(buf);
+
+        NPomp_dsum_reduce_inplace(gradbufs, natm*comp);
+        if (thread_id != 0) {
+            free(grad_loc);
+        }
+    }
+}
+
+
+void pp_loc_part1_gs(double complex* out, double* coulG,
+                     double* Gv, double* G2, int G0idx, int ngrid,
+                     double* Z, double* coords, double* rloc,
+                     int natm)
+{
+#pragma omp parallel
+{
+    int ig, ia;
+    double vlocG, r0, RG;
+    double *Gv_loc, *coords_local;
+    #pragma omp for schedule(static)
+    for (ig = 0; ig < ngrid; ig++){
+        out[ig] = 0;
+        Gv_loc = Gv + ig*3;
+        for (ia = 0; ia < natm; ia++)
+        {
+            coords_local = coords + ia*3;
+            RG = (coords_local[0] * Gv_loc[0]
+                  + coords_local[1] * Gv_loc[1]
+                  + coords_local[2] * Gv_loc[2]);
+
+            r0 = rloc[ia];
+            if (r0 > 0) {
+                if (ig == G0idx) {
+                    vlocG = -2. * M_PI * Z[ia] * r0*r0;
+                }
+                else {
+                    vlocG = Z[ia] * coulG[ig] * exp(-0.5*r0*r0 * G2[ig]);
+                }
+            }
+            else { // Z/r
+                vlocG = Z[ia] * coulG[ig];
+            }
+            out[ig] -= (vlocG * cos(RG)) - (vlocG * sin(RG)) * _Complex_I;
+        }
+    }
+}
+}
diff --git a/pyscf/lib/test/test_numint_uniform_grid.py b/pyscf/lib/test/test_numint_uniform_grid.py
index 296dcbd61a..05e5664ab0 100644
--- a/pyscf/lib/test/test_numint_uniform_grid.py
+++ b/pyscf/lib/test/test_numint_uniform_grid.py
@@ -7,7 +7,7 @@
 from pyscf.pbc.dft import gen_grid
 from pyscf.pbc.dft import multigrid
 
-from pyscf.pbc.dft.multigrid import eval_mat, eval_rho
+from pyscf.pbc.dft.multigrid.multigrid import eval_mat, eval_rho
 
 def uncontract(cell):
     pcell, contr_coeff = cell.to_uncontracted_cartesian_basis()
@@ -18,8 +18,8 @@ def setUpModule():
     global bak_EXPDROP, bak_EXTRA_PREC
     global vxc, kpts, nkpts, nao, dm, dm_kpts, grids_orth, grids_north
     global ao_kpts_orth, ao_kpts_north, ao_orth, ao_north, ao_gamma_orth, ao_gamma_north
-    multigrid.EXPDROP, bak_EXPDROP = 1e-14, multigrid.EXPDROP
-    multigrid.EXTRA_PREC, bak_EXTRA_PREC = 1e-3, multigrid.EXTRA_PREC
+    multigrid.multigrid.EXPDROP, bak_EXPDROP = 1e-14, multigrid.multigrid.EXPDROP
+    multigrid.multigrid.EXTRA_PREC, bak_EXTRA_PREC = 1e-3, multigrid.multigrid.EXTRA_PREC
 
     numpy.random.seed(2)
     cell_orth = gto.M(atom='H1 1 1 0; H2 0 0 1',
diff --git a/pyscf/pbc/df/incore.py b/pyscf/pbc/df/incore.py
index 253250a405..76c23f8e3e 100644
--- a/pyscf/pbc/df/incore.py
+++ b/pyscf/pbc/df/incore.py
@@ -30,6 +30,7 @@
 from pyscf.pbc.tools import k2gamma
 from pyscf.pbc.tools import pbc as pbctools
 from pyscf import __config__
+from pyscf.pbc.gto import _pbcintor
 
 RCUT_THRESHOLD = getattr(__config__, 'pbc_scf_rsjk_rcut_threshold', 2.5)
 KECUT_THRESHOLD = getattr(__config__, 'pbc_scf_rsjk_kecut_threshold', 10.0)
@@ -471,3 +472,246 @@ def _conc_locs(ao_loc1, ao_loc2):
     basis accordingly.'''
     comp_loc = np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2)
     return np.asarray(comp_loc, dtype=np.int32)
+
+# The following functions use pre-constructed shell pair list
+def aux_e2_sum_auxbas(cell, auxcell_or_auxbasis, intor='int3c2e', aosym='s1', comp=None,
+                      kptij_lst=np.zeros((1,2,3)), shls_slice=None, **kwargs):
+    r'''Compute :math:`\sum_{L} (ij|L)` on the fly.
+
+    Returns:
+        out : (nao_pair,) array
+    '''
+    if isinstance(auxcell_or_auxbasis, gto.MoleBase):
+        auxcell = auxcell_or_auxbasis
+    else:
+        assert isinstance(auxcell_or_auxbasis, str)
+        auxcell = make_auxcell(cell, auxcell_or_auxbasis)
+
+    int3c = wrap_int3c_sum_auxbas(cell, auxcell, intor, aosym, comp, kptij_lst, **kwargs)
+    out = int3c(shls_slice)
+    return out
+
+def wrap_int3c_sum_auxbas(cell, auxcell, intor='int3c2e', aosym='s1', comp=None,
+                          kptij_lst=np.zeros((1,2,3)), cintopt=None, pbcopt=None,
+                          neighbor_list=None):
+    if neighbor_list is None:
+        raise KeyError('Neighbor list is not initialized.')
+
+    log = logger.new_logger(cell)
+
+    nkptij = len(kptij_lst)
+    kpti = kptij_lst[:,0]
+    kptj = kptij_lst[:,1]
+    j_only = is_zero(kpti - kptj)
+    if j_only:
+        kpts = kpti
+        nkpts = len(kpts)
+        kptij_idx = np.arange(nkpts, dtype=np.int32)
+    else:
+        raise NotImplementedError
+
+    intor = cell._add_suffix(intor)
+    intor, comp = gto.moleintor._get_intor_and_comp(intor, comp)
+
+    pcell = cell.copy()
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                         cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr()
+    ao_loc = np.asarray(np.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                        dtype=np.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+
+    Ls = cell.get_lattice_Ls()
+    nimgs = len(Ls)
+    nbas = cell.nbas
+
+    gamma_point_only = is_zero(kpts)
+    if gamma_point_only:
+        assert nkpts == 1
+        kk_type = 'g'
+        expkL = np.ones(1, dtype=np.complex128)
+        out_dtype = np.double
+    else:
+        raise NotImplementedError
+
+    fill = 'PBCnr3c_screened_sum_auxbas_fill_%s%s' % (kk_type, aosym[:2])
+    drv = libpbc.PBCnr3c_screened_sum_auxbas_drv
+
+    if cintopt is None:
+        if nbas > 0:
+            env[gto.PTR_EXPCUTOFF] = abs(np.log(cell.precision))
+            cintopt = _vhf.make_cintopt(atm, bas, env, intor)
+        else:
+            cintopt = lib.c_null_ptr()
+        if intor[:3] != 'ECP':
+            libpbc.CINTdel_pairdata_optimizer(cintopt)
+    if pbcopt is None:
+        pbcopt = _pbcintor.PBCOpt(pcell).init_rcut_cond(pcell)
+    if isinstance(pbcopt, _pbcintor.PBCOpt):
+        cpbcopt = pbcopt._this
+    else:
+        cpbcopt = lib.c_null_ptr()
+
+    def int3c(shls_slice=None, out=None):
+        t0 = (logger.process_clock(), logger.perf_counter())
+        if shls_slice is None:
+            shls_slice = (0, nbas, 0, nbas, 0, auxcell.nbas)
+        shls_slice = (shls_slice[0], shls_slice[1],
+                      nbas+shls_slice[2], nbas+shls_slice[3],
+                      nbas*2+shls_slice[4], nbas*2+shls_slice[5])
+        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
+        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
+
+        if aosym[:2] == 's2':
+            assert ni == nj
+            nao_pair = (ao_loc[shls_slice[1]]*(ao_loc[shls_slice[1]]+1)//2 -
+                        ao_loc[shls_slice[0]]*(ao_loc[shls_slice[0]]+1)//2)
+        else:
+            nao_pair = ni * nj
+
+        if out is None:
+            out = np.empty((nkptij,comp,nao_pair), dtype=out_dtype)
+
+        drv(getattr(libpbc, intor), getattr(libpbc, fill),
+            out.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nkptij), ctypes.c_int(nkpts),
+            ctypes.c_int(comp), ctypes.c_int(nimgs),
+            Ls.ctypes.data_as(ctypes.c_void_p),
+            expkL.ctypes.data_as(ctypes.c_void_p),
+            kptij_idx.ctypes.data_as(ctypes.c_void_p),
+            (ctypes.c_int*6)(*shls_slice),
+            ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt, cpbcopt,
+            atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(cell.natm),
+            bas.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbas),  # need to pass cell.nbas to libpbc.PBCnr3c_drv
+            env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+            ctypes.byref(neighbor_list))
+
+        log.timer_debug1(f'pbc integral {intor}', *t0)
+
+        if comp == 1:
+            out = out[:,0]
+        if nkptij == 1:
+            out = out[0]
+        return out
+
+    return int3c
+
+def int3c1e_nuc_grad(cell, auxcell, dm, intor='int3c1e', aosym='s1', comp=3,
+                     kptij_lst=np.zeros((1,2,3)), shls_slice=None, **kwargs):
+    '''Compute the nuclear gradient contribution
+    to the 2nd local part of PP on the fly.
+    See `pbc.gto.pseudo.pp_int.vpploc_part2_nuc_grad`.
+
+    Returns:
+        out : (natm,comp) array
+    '''
+    if comp != 3:
+        raise NotImplementedError
+    if aosym != 's1':
+        raise NotImplementedError
+
+    int3c = wrap_int3c1e_nuc_grad(cell, auxcell, dm, intor, aosym, comp, kptij_lst, **kwargs)
+    out = int3c(shls_slice)
+    return out
+
+def wrap_int3c1e_nuc_grad(cell, auxcell, dm, intor='int3c1e', aosym='s1', comp=3,
+                          kptij_lst=np.zeros((1,2,3)), cintopt=None, pbcopt=None,
+                          neighbor_list=None):
+    if neighbor_list is None:
+        raise KeyError('Neighbor list is not initialized.')
+
+    log = logger.new_logger(cell)
+
+    nkptij = len(kptij_lst)
+    kpti = kptij_lst[:,0]
+    kptj = kptij_lst[:,1]
+    j_only = is_zero(kpti - kptj)
+    if j_only:
+        kpts = kpti
+        nkpts = len(kpts)
+        kptij_idx = np.arange(nkpts, dtype=np.int32)
+    else:
+        raise NotImplementedError
+
+    intor = cell._add_suffix(intor)
+    intor, comp = gto.moleintor._get_intor_and_comp(intor, comp)
+
+    pcell = cell.copy()
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                         cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr()
+    ao_loc = np.asarray(np.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                        dtype=np.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+
+    Ls = cell.get_lattice_Ls()
+    nimgs = len(Ls)
+    nbas = cell.nbas
+
+    gamma_point_only = is_zero(kpts)
+    if gamma_point_only:
+        assert nkpts == 1
+        kk_type = 'g'
+        expkL = np.ones(1, dtype=np.complex128)
+        dm = np.asarray(dm, order="C", dtype=np.double)
+    else:
+        raise NotImplementedError
+
+    fill = 'PBCnr3c1e_screened_nuc_grad_fill_%s%s' % (kk_type, aosym[:2])
+    drv = libpbc.PBCnr3c1e_screened_nuc_grad_drv
+
+    if cintopt is None:
+        if nbas > 0:
+            env[gto.PTR_EXPCUTOFF] = abs(np.log(cell.precision))
+            cintopt = _vhf.make_cintopt(atm, bas, env, intor)
+        else:
+            cintopt = lib.c_null_ptr()
+        if intor[:3] != 'ECP':
+            libpbc.CINTdel_pairdata_optimizer(cintopt)
+    if pbcopt is None:
+        pbcopt = _pbcintor.PBCOpt(pcell).init_rcut_cond(pcell)
+    if isinstance(pbcopt, _pbcintor.PBCOpt):
+        cpbcopt = pbcopt._this
+    else:
+        cpbcopt = lib.c_null_ptr()
+
+    def int3c(shls_slice=None, out=None):
+        t0 = (logger.process_clock(), logger.perf_counter())
+        if shls_slice is None:
+            shls_slice = (0, nbas, 0, nbas, 0, auxcell.nbas)
+        shls_slice = (shls_slice[0], shls_slice[1],
+                      nbas+shls_slice[2], nbas+shls_slice[3],
+                      nbas*2+shls_slice[4], nbas*2+shls_slice[5])
+
+        if out is None:
+            out = np.zeros((nkptij,cell.natm,comp), dtype=np.double)
+
+        drv(getattr(libpbc, intor), getattr(libpbc, fill),
+            out.ctypes.data_as(ctypes.c_void_p),
+            dm.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nkptij), ctypes.c_int(nkpts),
+            ctypes.c_int(comp), ctypes.c_int(nimgs),
+            Ls.ctypes.data_as(ctypes.c_void_p),
+            expkL.ctypes.data_as(ctypes.c_void_p),
+            kptij_idx.ctypes.data_as(ctypes.c_void_p),
+            (ctypes.c_int*6)(*shls_slice),
+            ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt, cpbcopt,
+            atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(cell.natm),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbas),
+            env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+            ctypes.c_int(cell.nao), ctypes.byref(neighbor_list))
+
+        log.timer_debug1(f'pbc integral {intor}', *t0)
+
+        if nkptij == 1:
+            out = out[0]
+        return out
+
+    return int3c
diff --git a/pyscf/pbc/dft/gks.py b/pyscf/pbc/dft/gks.py
index 8d496bbfb1..5536b53daa 100644
--- a/pyscf/pbc/dft/gks.py
+++ b/pyscf/pbc/dft/gks.py
@@ -77,7 +77,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     ni = ks._numint
     n, exc, vxc = ni.get_vxc(cell, ks.grids, ks.xc, dm, hermi=hermi, kpt=kpt,
                              kpts_band=kpts_band, max_memory=max_memory)
-    logger.debug(ks, 'nelec by numeric integration = %s', n)
+    logger.info(ks, 'nelec by numeric integration = %s', n)
     t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/dft/kgks.py b/pyscf/pbc/dft/kgks.py
index f43a8ee04c..fd97e43cd1 100644
--- a/pyscf/pbc/dft/kgks.py
+++ b/pyscf/pbc/dft/kgks.py
@@ -84,7 +84,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     ni = ks._numint
     n, exc, vxc = ni.get_vxc(cell, ks.grids, ks.xc, dm, hermi=hermi, kpts=kpts,
                              kpts_band=kpts_band, max_memory=max_memory)
-    logger.debug(ks, 'nelec by numeric integration = %s', n)
+    logger.info(ks, 'nelec by numeric integration = %s', n)
     t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/krks.py b/pyscf/pbc/dft/krks.py
index 572a7614af..3cd23636b1 100644
--- a/pyscf/pbc/dft/krks.py
+++ b/pyscf/pbc/dft/krks.py
@@ -69,7 +69,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
                                        kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -84,7 +84,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
                                 kpts, kpts_band, max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -95,7 +95,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/krks_ksymm.py b/pyscf/pbc/dft/krks_ksymm.py
index fb15bf6f40..0d9e1401e2 100644
--- a/pyscf/pbc/dft/krks_ksymm.py
+++ b/pyscf/pbc/dft/krks_ksymm.py
@@ -59,7 +59,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm_bz, hermi,
                                        kpts.kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -72,7 +72,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm_bz,
                                 kpts=kpts.kpts, kpts_band=kpts_band,
                                 max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -83,7 +83,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts.kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     weight = kpts.weights_ibz
diff --git a/pyscf/pbc/dft/kuks.py b/pyscf/pbc/dft/kuks.py
index a07949ccca..634c99f8ff 100644
--- a/pyscf/pbc/dft/kuks.py
+++ b/pyscf/pbc/dft/kuks.py
@@ -55,7 +55,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
                                        kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -79,7 +79,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/kuks_ksymm.py b/pyscf/pbc/dft/kuks_ksymm.py
index eb02e674e9..15c2a623b5 100644
--- a/pyscf/pbc/dft/kuks_ksymm.py
+++ b/pyscf/pbc/dft/kuks_ksymm.py
@@ -58,7 +58,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm_bz, hermi,
                                        kpts.kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -71,7 +71,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = ni.nr_uks(cell, ks.grids, ks.xc, dm_bz,
                                 kpts=kpts.kpts, kpts_band=kpts_band,
                                 max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -82,7 +82,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts.kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     weight = kpts.weights_ibz
diff --git a/pyscf/pbc/dft/multigrid/__init__.py b/pyscf/pbc/dft/multigrid/__init__.py
new file mode 100644
index 0000000000..707853bf51
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/__init__.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .multigrid import MultiGridFFTDF
+from .multigrid import (
+    multigrid_fftdf as multigrid_fftdf,
+    _gen_rhf_response as _gen_rhf_response,
+    _gen_uhf_response as _gen_uhf_response,
+    nr_rks as nr_rks_v1,
+    nr_rks_fxc as nr_rks_fxc,
+    nr_rks_fxc_st as nr_rks_fxc_st,
+    nr_uks as nr_uks_v1,
+    nr_uks_fxc as nr_uks_fxc
+)
+
+from .multigrid_pair import MultiGridFFTDF2
+from .multigrid_pair import nr_rks as nr_rks_v2
+from .multigrid_pair import nr_uks as nr_uks_v2
+
+def nr_rks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    if isinstance(mydf, MultiGridFFTDF2):
+        return nr_rks_v2(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts,
+                         kpts_band=kpts_band, with_j=with_j,
+                         return_j=return_j, verbose=verbose) 
+    elif isinstance(mydf, MultiGridFFTDF):
+        return nr_rks_v1(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts,
+                         kpts_band=kpts_band, with_j=with_j,
+                         return_j=return_j, verbose=verbose)
+    else:
+        raise TypeError("Wrong density fitting type for multigrid DFT.")
+
+def nr_uks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    if isinstance(mydf, MultiGridFFTDF2):
+        return nr_uks_v2(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts,
+                         kpts_band=kpts_band, with_j=with_j,
+                         return_j=return_j, verbose=verbose)
+    elif isinstance(mydf, MultiGridFFTDF):
+        return nr_uks_v1(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts,
+                         kpts_band=kpts_band, with_j=with_j,
+                         return_j=return_j, verbose=verbose)
+    else:
+        raise TypeError("Wrong density fitting type for multigrid DFT.")
diff --git a/pyscf/pbc/dft/multigrid.py b/pyscf/pbc/dft/multigrid/multigrid.py
similarity index 95%
rename from pyscf/pbc/dft/multigrid.py
rename to pyscf/pbc/dft/multigrid/multigrid.py
index 80e72e551b..56fb3059cf 100644
--- a/pyscf/pbc/dft/multigrid.py
+++ b/pyscf/pbc/dft/multigrid/multigrid.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 import numpy
 import scipy.linalg
 
+from pyscf import __config__
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf.gto import ATOM_OF, ANG_OF, NPRIM_OF, PTR_EXP, PTR_COEFF
@@ -29,12 +30,21 @@
 from pyscf.pbc import tools
 from pyscf.pbc import gto
 from pyscf.pbc.gto import pseudo
+from pyscf.pbc.gto.pseudo import pp_int
 from pyscf.pbc.dft import numint, gen_grid
-from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks
+from pyscf.pbc.df.df_jk import (
+    _format_dms,
+    _format_kpts_band,
+    _format_jks,
+)
 from pyscf.pbc.lib.kpts_helper import gamma_point
-from pyscf.pbc.df import fft
-from pyscf.pbc.df import ft_ao
-from pyscf import __config__
+from pyscf.pbc.df import fft, ft_ao
+from pyscf.pbc.dft.multigrid.utils import (
+    _take_4d,
+    _take_5d,
+    _takebak_4d,
+    _takebak_5d,
+)
 
 #sys.stderr.write('WARN: multigrid is an experimental feature. It is still in '
 #                 'testing\nFeatures and APIs may be changed in the future.\n')
@@ -367,23 +377,31 @@ def get_nuc(mydf, kpts=None):
         vne = vne[0]
     return numpy.asarray(vne)
 
-def get_pp(mydf, kpts=None):
+def get_pp(mydf, kpts=None, max_memory=4000):
     '''Get the periodic pseudotential nuc-el AO matrix, with G=0 removed.
     '''
     from pyscf import gto
     kpts, is_single_kpt = fft._check_kpts(mydf, kpts)
     cell = mydf.cell
     mesh = mydf.mesh
-    SI = cell.get_SI()
     Gv = cell.get_Gv(mesh)
-    vpplocG = pseudo.get_vlocG(cell, Gv)
-    vpplocG = -numpy.einsum('ij,ij->j', SI, vpplocG)
-    # from get_jvloc_G0 function
-    vpplocG[0] = numpy.sum(pseudo.get_alphas(cell))
-    ngrids = len(vpplocG)
+
+    ngrids = len(Gv)
+    vpplocG = numpy.empty((ngrids,), dtype=numpy.complex128)
+
+    mem_avail = max(max_memory, mydf.max_memory-lib.current_memory()[0])
+    blksize = int(mem_avail*1e6/((cell.natm*2)*16))
+    blksize = min(ngrids, max(21**3, blksize))
+    for ig0, ig1 in lib.prange(0, ngrids, blksize):
+        vpplocG_batch = pp_int.get_gth_vlocG_part1(cell, Gv[ig0:ig1])
+        SI = cell.get_SI(Gv[ig0:ig1])
+        vpplocG[ig0:ig1] = -numpy.einsum('ij,ij->j', SI, vpplocG_batch)
 
     hermi = 1
     vpp = _get_j_pass2(mydf, vpplocG, hermi, kpts)[0]
+    vpp2 = pp_int.get_pp_loc_part2(cell, kpts)
+    for k, kpt in enumerate(kpts):
+        vpp[k] += vpp2[k]
 
     # vppnonloc evaluated in reciprocal space
     fakemol = gto.Mole()
@@ -396,51 +414,76 @@ def get_pp(mydf, kpts=None):
     fakemol._bas[0,gto.PTR_EXP  ] = ptr+3
     fakemol._bas[0,gto.PTR_COEFF] = ptr+4
 
-    # buf for SPG_lmi upto l=0..3 and nl=3
-    buf = numpy.empty((48,ngrids), dtype=numpy.complex128)
-
     def vppnl_by_k(kpt):
-        Gk = Gv + kpt
-        G_rad = lib.norm(Gk, axis=1)
-        aokG = ft_ao.ft_ao(cell, Gv, kpt=kpt) * (ngrids/cell.vol)
-        vppnl = 0
+        SPG_lm_aoGs = []
         for ia in range(cell.natm):
             symb = cell.atom_symbol(ia)
             if symb not in cell._pseudo:
+                SPG_lm_aoGs.append(None)
                 continue
             pp = cell._pseudo[symb]
             p1 = 0
             for l, proj in enumerate(pp[5:]):
                 rl, nl, hl = proj
                 if nl > 0:
-                    fakemol._bas[0,gto.ANG_OF] = l
-                    fakemol._env[ptr+3] = .5*rl**2
-                    fakemol._env[ptr+4] = rl**(l+1.5)*numpy.pi**1.25
-                    pYlm_part = fakemol.eval_gto('GTOval', Gk)
+                    p1 = p1+nl*(l*2+1)
+            SPG_lm_aoGs.append(numpy.zeros((p1, cell.nao), dtype=numpy.complex128))
 
-                    p0, p1 = p1, p1+nl*(l*2+1)
-                    # pYlm is real, SI[ia] is complex
-                    pYlm = numpy.ndarray((nl,l*2+1,ngrids), dtype=numpy.complex128, buffer=buf[p0:p1])
-                    for k in range(nl):
-                        qkl = pseudo.pp._qli(G_rad*rl, l, k)
-                        pYlm[k] = pYlm_part.T * qkl
-                    #:SPG_lmi = numpy.einsum('g,nmg->nmg', SI[ia].conj(), pYlm)
-                    #:SPG_lm_aoG = numpy.einsum('nmg,gp->nmp', SPG_lmi, aokG)
-                    #:tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
-                    #:vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
-            if p1 > 0:
-                SPG_lmi = buf[:p1]
-                SPG_lmi *= SI[ia].conj()
-                SPG_lm_aoGs = lib.zdot(SPG_lmi, aokG)
+        mem_avail = max(max_memory, mydf.max_memory-lib.current_memory()[0])
+        blksize = int(mem_avail*1e6/((48+cell.nao+13+3)*16))
+        blksize = min(ngrids, max(21**3, blksize))
+        vppnl = 0
+        for ig0, ig1 in lib.prange(0, ngrids, blksize):
+            ng = ig1 - ig0
+            # buf for SPG_lmi upto l=0..3 and nl=3
+            buf = numpy.empty((48,ng), dtype=numpy.complex128)
+            Gk = Gv[ig0:ig1] + kpt
+            G_rad = numpy.linalg.norm(Gk, axis=1)
+            aokG = ft_ao.ft_ao(cell, Gv[ig0:ig1], kpt=kpt) * (ngrids/cell.vol)
+            for ia in range(cell.natm):
+                symb = cell.atom_symbol(ia)
+                if symb not in cell._pseudo:
+                    continue
+                pp = cell._pseudo[symb]
                 p1 = 0
                 for l, proj in enumerate(pp[5:]):
                     rl, nl, hl = proj
                     if nl > 0:
+                        fakemol._bas[0,gto.ANG_OF] = l
+                        fakemol._env[ptr+3] = .5*rl**2
+                        fakemol._env[ptr+4] = rl**(l+1.5)*numpy.pi**1.25
+                        pYlm_part = fakemol.eval_gto('GTOval', Gk)
+
                         p0, p1 = p1, p1+nl*(l*2+1)
-                        hl = numpy.asarray(hl)
-                        SPG_lm_aoG = SPG_lm_aoGs[p0:p1].reshape(nl,l*2+1,-1)
-                        tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
-                        vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+                        # pYlm is real, SI[ia] is complex
+                        pYlm = numpy.ndarray((nl,l*2+1,ng), dtype=numpy.complex128, buffer=buf[p0:p1])
+                        for k in range(nl):
+                            qkl = pseudo.pp._qli(G_rad*rl, l, k)
+                            pYlm[k] = pYlm_part.T * qkl
+                        #:SPG_lmi = numpy.einsum('g,nmg->nmg', SI[ia].conj(), pYlm)
+                        #:SPG_lm_aoG = numpy.einsum('nmg,gp->nmp', SPG_lmi, aokG)
+                        #:tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+                        #:vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+                if p1 > 0:
+                    SPG_lmi = buf[:p1]
+                    SPG_lmi *= cell.get_SI(Gv[ig0:ig1], atmlst=[ia,]).conj()
+                    SPG_lm_aoGs[ia] += lib.zdot(SPG_lmi, aokG)
+            buf = None
+        for ia in range(cell.natm):
+            symb = cell.atom_symbol(ia)
+            if symb not in cell._pseudo:
+                continue
+            pp = cell._pseudo[symb]
+            p1 = 0
+            for l, proj in enumerate(pp[5:]):
+                rl, nl, hl = proj
+                if nl > 0:
+                    p0, p1 = p1, p1+nl*(l*2+1)
+                    hl = numpy.asarray(hl)
+                    SPG_lm_aoG = SPG_lm_aoGs[ia][p0:p1].reshape(nl,l*2+1,-1)
+                    tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+                    vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+        SPG_lm_aoGs=None
         return vppnl * (1./ngrids**2)
 
     for k, kpt in enumerate(kpts):
@@ -454,7 +497,6 @@ def vppnl_by_k(kpt):
         vpp = vpp[0]
     return numpy.asarray(vpp)
 
-
 def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None):
     '''Get the Coulomb (J) AO matrix at sampled k-points.
 
@@ -1859,7 +1901,7 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
     get_rho = get_rho
 
 
-def multigrid(mf):
+def multigrid_fftdf(mf):
     '''Use MultiGridFFTDF to replace the default FFTDF integration method in
     the DFT object.
     '''
@@ -1867,56 +1909,7 @@ def multigrid(mf):
     mf.with_df.__dict__.update(old_df.__dict__)
     return mf
 
+multigrid = multigrid_fftdf # for backward compatibility
 
 def _pgto_shells(cell):
     return cell._bas[:,NPRIM_OF].sum()
-
-def _take_4d(a, indices):
-    a_shape = a.shape
-    ranges = []
-    for i, s in enumerate(indices):
-        if s is None:
-            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
-        else:
-            idx = numpy.asarray(s, dtype=numpy.int32)
-            idx[idx < 0] += a_shape[i]
-        ranges.append(idx)
-    idx = ranges[0][:,None] * a_shape[1] + ranges[1]
-    idy = ranges[2][:,None] * a_shape[3] + ranges[3]
-    a = a.reshape(a_shape[0]*a_shape[1], a_shape[2]*a_shape[3])
-    out = lib.take_2d(a, idx.ravel(), idy.ravel())
-    return out.reshape([len(s) for s in ranges])
-
-def _takebak_4d(out, a, indices):
-    out_shape = out.shape
-    a_shape = a.shape
-    ranges = []
-    for i, s in enumerate(indices):
-        if s is None:
-            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
-        else:
-            idx = numpy.asarray(s, dtype=numpy.int32)
-            idx[idx < 0] += out_shape[i]
-        assert (len(idx) == a_shape[i])
-        ranges.append(idx)
-    idx = ranges[0][:,None] * out_shape[1] + ranges[1]
-    idy = ranges[2][:,None] * out_shape[3] + ranges[3]
-    nx = idx.size
-    ny = idy.size
-    out = out.reshape(out_shape[0]*out_shape[1], out_shape[2]*out_shape[3])
-    lib.takebak_2d(out, a.reshape(nx,ny), idx.ravel(), idy.ravel())
-    return out
-
-def _take_5d(a, indices):
-    a_shape = a.shape
-    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
-    indices = (None,) + indices[2:]
-    return _take_4d(a, indices)
-
-def _takebak_5d(out, a, indices):
-    a_shape = a.shape
-    out_shape = out.shape
-    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
-    out = out.reshape((out_shape[0]*out_shape[1],) + out_shape[2:])
-    indices = (None,) + indices[2:]
-    return _takebak_4d(out, a, indices)
diff --git a/pyscf/pbc/dft/multigrid/multigrid_pair.py b/pyscf/pbc/dft/multigrid/multigrid_pair.py
new file mode 100644
index 0000000000..3ef43b688d
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/multigrid_pair.py
@@ -0,0 +1,1405 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.gto import moleintor
+from pyscf.pbc import tools
+from pyscf.pbc.lib.kpts_helper import gamma_point
+from pyscf.pbc.df import fft
+from pyscf.pbc.df.df_jk import (
+    _format_dms,
+    _format_kpts_band,
+    _format_jks,
+)
+from pyscf.pbc.dft.multigrid.pp import (
+    _get_vpplocG_part1,
+    _get_pp_without_erf,
+    vpploc_part1_nuc_grad,
+)
+from pyscf.pbc.dft.multigrid.utils import (
+    _take_4d,
+    _take_5d,
+    _takebak_4d,
+    _takebak_5d,
+)
+from pyscf.pbc.dft.multigrid.multigrid import MultiGridFFTDF
+
+NGRIDS = getattr(__config__, 'pbc_dft_multigrid_ngrids', 4)
+KE_RATIO = getattr(__config__, 'pbc_dft_multigrid_ke_ratio', 3.0)
+REL_CUTOFF = getattr(__config__, 'pbc_dft_multigrid_rel_cutoff', 20.0)
+GGA_METHOD = getattr(__config__, 'pbc_dft_multigrid_gga_method', 'FFT')
+
+EXTRA_PREC = getattr(__config__, 'pbc_gto_eval_gto_extra_precision', 1e-2)
+RHOG_HIGH_ORDER = getattr(__config__, 'pbc_dft_multigrid_rhog_high_order', False)
+PTR_EXPDROP = 16
+EXPDROP = getattr(__config__, 'pbc_dft_multigrid_expdrop', 1e-12)
+IMAG_TOL = 1e-9
+
+libdft = lib.load_library('libdft')
+
+def gradient_gs(f_gs, Gv):
+    r'''Compute the G-space components of :math:`\nabla f(r)`
+    given :math:`f(G)` and :math:`G`,
+    which is equivalent to einsum('np,px->nxp', f_gs, 1j*Gv)
+    '''
+    ng, dim = Gv.shape
+    assert dim == 3
+    Gv = np.asarray(Gv, order='C', dtype=np.double)
+    f_gs = np.asarray(f_gs.reshape(-1,ng), order='C', dtype=np.complex128)
+    n = f_gs.shape[0]
+    out = np.empty((n,dim,ng), dtype=np.complex128)
+
+    fn = getattr(libdft, 'gradient_gs', None)
+    try:
+        fn(out.ctypes.data_as(ctypes.c_void_p),
+           f_gs.ctypes.data_as(ctypes.c_void_p),
+           Gv.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(n), ctypes.c_size_t(ng))
+    except Exception as e:
+        raise RuntimeError(f'Error in gradient_gs: {e}')
+    return out
+
+
+class GridLevel_Info(ctypes.Structure):
+    '''
+    Info about the grid levels.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int), # number of grid levels
+                ("rel_cutoff", ctypes.c_double),
+                ("cutoff", ctypes.POINTER(ctypes.c_double)),
+                ("mesh", ctypes.POINTER(ctypes.c_int))]
+
+class RS_Grid(ctypes.Structure):
+    '''
+    Values on real space multigrid.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int),
+                ("gridlevel_info", ctypes.POINTER(GridLevel_Info)),
+                ("comp", ctypes.c_int),
+                # data is list of 1d arrays
+                ("data", ctypes.POINTER(ctypes.POINTER(ctypes.c_double)))]
+
+class PGFPair(ctypes.Structure):
+    '''
+    A primitive Gaussian function pair.
+    '''
+    _fields_ = [("ish", ctypes.c_int),
+                ("ipgf", ctypes.c_int),
+                ("jsh", ctypes.c_int),
+                ("jpgf", ctypes.c_int),
+                ("iL", ctypes.c_int),
+                ("radius", ctypes.c_double)]
+
+
+class Task(ctypes.Structure):
+    '''
+    A single task.
+    '''
+    _fields_ = [("buf_size", ctypes.c_size_t),
+                ("ntasks", ctypes.c_size_t),
+                ("pgfpairs", ctypes.POINTER(ctypes.POINTER(PGFPair))),
+                ("radius", ctypes.c_double)]
+
+
+class TaskList(ctypes.Structure):
+    '''
+    A task list.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int),
+                ("hermi", ctypes.c_int),
+                ("gridlevel_info", ctypes.POINTER(GridLevel_Info)),
+                ("tasks", ctypes.POINTER(ctypes.POINTER(Task)))]
+
+
+def multi_grids_tasks(cell, ke_cutoff=None, hermi=0,
+                      ngrids=NGRIDS, ke_ratio=KE_RATIO, rel_cutoff=REL_CUTOFF):
+    if ke_cutoff is None:
+        ke_cutoff = cell.ke_cutoff
+    if ke_cutoff is None:
+        raise ValueError("cell.ke_cutoff is not set.")
+    ke1 = ke_cutoff
+    cutoff = [ke1,]
+    for i in range(ngrids-1):
+        ke1 /= ke_ratio
+        cutoff.append(ke1)
+    cutoff.reverse()
+    a = cell.lattice_vectors()
+    mesh = []
+    for ke in cutoff:
+        mesh.append(tools.cutoff_to_mesh(a, ke))
+    logger.info(cell, 'ke_cutoff for multigrid tasks:\n%s', cutoff)
+    logger.info(cell, 'meshes for multigrid tasks:\n%s', mesh)
+    gridlevel_info = init_gridlevel_info(cutoff, rel_cutoff, mesh)
+    task_list = build_task_list(cell, gridlevel_info, hermi=hermi)
+    return task_list
+
+
+def _update_task_list(mydf, hermi=0, ngrids=None, ke_ratio=None, rel_cutoff=None):
+    '''
+    Update :attr:`task_list` if necessary.
+    '''
+    cell = mydf.cell
+    if ngrids is None:
+        ngrids = mydf.ngrids
+    if ke_ratio is None:
+        ke_ratio = mydf.ke_ratio
+    if rel_cutoff is None:
+        rel_cutoff = mydf.rel_cutoff
+
+    need_update = False
+    task_list = getattr(mydf, 'task_list', None)
+    if task_list is None:
+        need_update = True
+    else:
+        hermi_orig = task_list.contents.hermi
+        nlevels = task_list.contents.nlevels
+        rel_cutoff_orig = task_list.contents.gridlevel_info.contents.rel_cutoff
+        #TODO also need to check kenetic energy cutoff change
+        if (hermi_orig > hermi or
+                nlevels != ngrids or
+                abs(rel_cutoff_orig-rel_cutoff) > 1e-12):
+            need_update = True
+
+    if need_update:
+        if task_list is not None:
+            free_task_list(task_list)
+        task_list = multi_grids_tasks(cell, hermi=hermi, ngrids=ngrids,
+                                      ke_ratio=ke_ratio, rel_cutoff=rel_cutoff)
+        mydf.task_list = task_list
+    return task_list
+
+
+def init_gridlevel_info(cutoff, rel_cutoff, mesh):
+    if cutoff[0] < 1e-15:
+        cutoff = cutoff[1:]
+    cutoff = np.asarray(cutoff, order='C', dtype=np.double)
+    mesh = np.asarray(np.asarray(mesh).reshape(-1,3), order='C', dtype=np.int32)
+    nlevels = len(cutoff)
+    gridlevel_info = ctypes.POINTER(GridLevel_Info)()
+    fn = getattr(libdft, "init_gridlevel_info", None)
+    try:
+        fn(ctypes.byref(gridlevel_info),
+           cutoff.ctypes.data_as(ctypes.c_void_p),
+           mesh.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(nlevels), ctypes.c_double(rel_cutoff))
+    except Exception as e:
+        raise RuntimeError("Failed to init grid level info. %s" % e)
+    return gridlevel_info
+
+
+def free_gridlevel_info(gridlevel_info):
+    fn = getattr(libdft, "del_gridlevel_info", None)
+    try:
+        fn(ctypes.byref(gridlevel_info))
+    except Exception as e:
+        raise RuntimeError("Failed to free grid level info. %s" % e)
+
+
+def init_rs_grid(gridlevel_info, comp):
+    '''
+    Initialize values on real space multigrid
+    '''
+    rs_grid = ctypes.POINTER(RS_Grid)()
+    fn = getattr(libdft, "init_rs_grid", None)
+    try:
+        fn(ctypes.byref(rs_grid),
+           ctypes.byref(gridlevel_info),
+           ctypes.c_int(comp))
+    except Exception as e:
+        raise RuntimeError("Failed to initialize real space multigrid data. %s" % e)
+    return rs_grid
+
+
+def free_rs_grid(rs_grid):
+    fn = getattr(libdft, "del_rs_grid", None)
+    try:
+        fn(ctypes.byref(rs_grid))
+    except Exception as e:
+        raise RuntimeError("Failed to free real space multigrid data. %s" % e)
+
+
+def build_task_list(cell, gridlevel_info, cell1=None, Ls=None, hermi=0, precision=None):
+    '''
+    Build the task list for multigrid DFT calculations.
+
+    Arguments:
+        cell : :class:`pbc.gto.cell.Cell`
+            The :class:`Cell` instance for the bra basis functions.
+        gridlevel_info : :class:`ctypes.POINTER`
+            The C pointer of the :class:`GridLevel_Info` structure.
+        cell1 : :class:`pbc.gto.cell.Cell`, optional
+            The :class:`Cell` instance for the ket basis functions.
+            If not given, both bra and ket basis functions come from cell.
+        Ls : (*,3) array, optional
+            The cartesian coordinates of the periodic images.
+            Default is calculated by :func:`cell.get_lattice_Ls`.
+        hermi : int, optional
+            If :math:`hermi=1`, the task list is built only for
+            the upper triangle of the matrix. Default is 0.
+        precision : float, optional
+            The integral precision. Default is :attr:`cell.precision`.
+
+    Returns: :class:`ctypes.POINTER`
+        The C pointer of the :class:`TaskList` structure.
+    '''
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+    if cell1 is None:
+        cell1 = cell
+    if Ls is None:
+        Ls = cell.get_lattice_Ls()
+    if precision is None:
+        precision = cell.precision
+
+    if hermi == 1 and cell1 is not cell:
+        logger.warn(cell,
+                    "Set hermi=0 because cell and cell1 are not the same.")
+        hermi = 0
+
+    ish_atm = np.asarray(cell._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell._env, order='C', dtype=float)
+    nish = len(ish_bas)
+    ish_rcut, ipgf_rcut = cell.rcut_by_shells(precision=precision,
+                                              return_pgf_radius=True)
+    assert nish == len(ish_rcut)
+    ptr_ipgf_rcut = lib.ndarray_pointer_2d(ipgf_rcut)
+
+    if cell1 is cell:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+        jsh_rcut = ish_rcut
+        jpgf_rcut = ipgf_rcut
+        ptr_jpgf_rcut = ptr_ipgf_rcut
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=float)
+        jsh_rcut, jpgf_rcut = cell1.rcut_by_shells(precision=precision,
+                                                   return_pgf_radius=True)
+        ptr_jpgf_rcut = lib.ndarray_pointer_2d(jpgf_rcut)
+    njsh = len(jsh_bas)
+    assert njsh == len(jsh_rcut)
+
+    nl = build_neighbor_list_for_shlpairs(cell, cell1, Ls=Ls,
+                                          ish_rcut=ish_rcut, jsh_rcut=jsh_rcut,
+                                          hermi=hermi)
+
+    task_list = ctypes.POINTER(TaskList)()
+    func = getattr(libdft, "build_task_list", None)
+    try:
+        func(ctypes.byref(task_list),
+             ctypes.byref(nl), ctypes.byref(gridlevel_info),
+             ish_atm.ctypes.data_as(ctypes.c_void_p),
+             ish_bas.ctypes.data_as(ctypes.c_void_p),
+             ish_env.ctypes.data_as(ctypes.c_void_p),
+             ish_rcut.ctypes.data_as(ctypes.c_void_p),
+             ptr_ipgf_rcut,
+             jsh_atm.ctypes.data_as(ctypes.c_void_p),
+             jsh_bas.ctypes.data_as(ctypes.c_void_p),
+             jsh_env.ctypes.data_as(ctypes.c_void_p),
+             jsh_rcut.ctypes.data_as(ctypes.c_void_p),
+             ptr_jpgf_rcut,
+             ctypes.c_int(nish), ctypes.c_int(njsh),
+             Ls.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_double(precision), ctypes.c_int(hermi))
+    except Exception as e:
+        raise RuntimeError("Failed to build task list. %s" % e)
+    free_neighbor_list(nl)
+    return task_list
+
+
+def free_task_list(task_list):
+    '''
+    Note:
+        This will also free task_list.contents.gridlevel_info.
+    '''
+    if task_list is None:
+        return
+    func = getattr(libdft, "del_task_list", None)
+    try:
+        func(ctypes.byref(task_list))
+    except Exception as e:
+        raise RuntimeError("Failed to free task list. %s" % e)
+
+
+def eval_rho(cell, dm, task_list, shls_slice=None, hermi=0, xctype='LDA', kpts=None,
+             dimension=None, cell1=None, shls_slice1=None, Ls=None,
+             a=None, ignore_imag=False):
+    '''
+    Collocate density (opt. gradients) on the real-space grid.
+    The two sets of Gaussian functions can be different.
+
+    Returns:
+        rho: RS_Grid object
+            Densities on real space multigrids.
+    '''
+    cell0 = cell
+    shls_slice0 = shls_slice
+    if cell1 is None:
+        cell1 = cell0
+
+    #TODO mixture of cartesian and spherical bases
+    assert cell0.cart == cell1.cart
+
+    ish_atm = np.asarray(cell0._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell0._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell0._env, order='C', dtype=np.double)
+    ish_env[PTR_EXPDROP] = min(cell0.precision*EXTRA_PREC, EXPDROP)
+
+    if cell1 is cell0:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=np.double)
+        jsh_env[PTR_EXPDROP] = min(cell1.precision*EXTRA_PREC, EXPDROP)
+
+    if shls_slice0 is None:
+        shls_slice0 = (0, cell0.nbas)
+    i0, i1 = shls_slice0
+    if shls_slice1 is None:
+        shls_slice1 = shls_slice0
+    j0, j1 = shls_slice1
+
+    if hermi == 1:
+        assert cell1 is cell0
+        assert i0 == j0 and i1 == j1
+
+    key0 = 'cart' if cell0.cart else 'sph'
+    ao_loc0 = moleintor.make_loc(ish_bas, key0)
+    naoi = ao_loc0[i1] - ao_loc0[i0]
+    if hermi == 1:
+        ao_loc1 = ao_loc0
+    else:
+        key1 = 'cart' if cell1.cart else 'sph'
+        ao_loc1 = moleintor.make_loc(jsh_bas, key1)
+    naoj = ao_loc1[j1] - ao_loc1[j0]
+
+    dm = np.asarray(dm, order='C')
+    assert dm.shape[-2:] == (naoi, naoj)
+
+    if dimension is None:
+        dimension = cell0.dimension
+    assert dimension == getattr(cell1, "dimension", None)
+
+    if Ls is None and dimension > 0:
+        Ls = np.asarray(cell0.get_lattice_Ls(), order='C')
+    elif Ls is None and dimension == 0:
+        Ls = np.zeros((1,3))
+
+    if dimension == 0 or kpts is None or gamma_point(kpts):
+        nkpts, nimgs = 1, Ls.shape[0]
+        dm = dm.reshape(-1,1,naoi,naoj)
+    else:
+        expkL = np.exp(1j*kpts.reshape(-1,3).dot(Ls.T))
+        nkpts, nimgs = expkL.shape
+        dm = dm.reshape(-1,nkpts,naoi,naoj)
+    n_dm = dm.shape[0]
+
+    #TODO check if cell1 has the same lattice vectors
+    if a is None:
+        a = cell0.lattice_vectors()
+    b = np.linalg.inv(a.T)
+
+    if abs(a-np.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+    xctype = xctype.upper()
+    if xctype == 'LDA':
+        comp = 1
+    elif xctype == 'GGA':
+        if hermi == 1:
+            raise RuntimeError('hermi=1 is not supported for GGA functional')
+        comp = 4
+    else:
+        raise NotImplementedError('meta-GGA')
+
+    eval_fn = 'make_rho_' + xctype.lower() + lattice_type
+    drv = getattr(libdft, "grid_collocate_drv", None)
+
+    def make_rho_(rs_rho, dm):
+        try:
+            drv(getattr(libdft, eval_fn, None),
+                ctypes.byref(rs_rho),
+                dm.ctypes.data_as(ctypes.c_void_p),
+                ctypes.byref(task_list),
+                ctypes.c_int(comp), ctypes.c_int(hermi),
+                (ctypes.c_int*4)(i0, i1, j0, j1),
+                ao_loc0.ctypes.data_as(ctypes.c_void_p),
+                ao_loc1.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(dimension),
+                Ls.ctypes.data_as(ctypes.c_void_p),
+                a.ctypes.data_as(ctypes.c_void_p),
+                b.ctypes.data_as(ctypes.c_void_p),
+                ish_atm.ctypes.data_as(ctypes.c_void_p),
+                ish_bas.ctypes.data_as(ctypes.c_void_p),
+                ish_env.ctypes.data_as(ctypes.c_void_p),
+                jsh_atm.ctypes.data_as(ctypes.c_void_p),
+                jsh_bas.ctypes.data_as(ctypes.c_void_p),
+                jsh_env.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell0.cart))
+        except Exception as e:
+            raise RuntimeError("Failed to compute rho. %s" % e)
+        return rs_rho
+
+    gridlevel_info = task_list.contents.gridlevel_info
+    rho = []
+    for i, dm_i in enumerate(dm):
+        rs_rho = init_rs_grid(gridlevel_info, comp)
+        if dimension == 0 or kpts is None or gamma_point(kpts):
+            make_rho_(rs_rho, dm_i)
+        else:
+            raise NotImplementedError
+        rho.append(rs_rho)
+
+    if n_dm == 1:
+        rho = rho[0]
+    return rho
+
+
+def _eval_rhoG(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), deriv=0,
+               rhog_high_order=RHOG_HIGH_ORDER):
+    assert(deriv < 2)
+    cell = mydf.cell
+
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    gga_high_order = False
+    if deriv == 0:
+        xctype = 'LDA'
+        rhodim = 1
+    elif deriv == 1:
+        if rhog_high_order:
+            xctype = 'GGA'
+            rhodim = 4
+        else:  # approximate high order derivatives in reciprocal space
+            gga_high_order = True
+            xctype = 'LDA'
+            rhodim = 1
+            deriv = 0
+        assert(hermi == 1 or gamma_point(kpts))
+    elif deriv == 2:  # meta-GGA
+        raise NotImplementedError
+        assert(hermi == 1 or gamma_point(kpts))
+
+    ignore_imag = (hermi == 1)
+
+    rs_rho = eval_rho(cell, dms, task_list, hermi=hermi, xctype=xctype, kpts=kpts,
+                      ignore_imag=ignore_imag)
+
+    nx, ny, nz = mydf.mesh
+    rhoG = np.zeros((nset*rhodim,nx,ny,nz), dtype=np.complex128)
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+        if nset > 1:
+            rho = []
+            for i in range(nset):
+                rho.append(np.ctypeslib.as_array(rs_rho[i].contents.data[ilevel], shape=(ngrids,)))
+            rho = np.asarray(rho)
+        else:
+            rho = np.ctypeslib.as_array(rs_rho.contents.data[ilevel], shape=(ngrids,))
+
+        weight = 1./nkpts * cell.vol/ngrids
+        rho_freq = tools.fft(rho.reshape(nset*rhodim, -1), mesh)
+        rho = None
+        rho_freq *= weight
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        _takebak_4d(rhoG, rho_freq.reshape((-1,) + tuple(mesh)), (None, gx, gy, gz))
+        rho_freq = None
+
+    if nset > 1:
+        for i in range(nset):
+            free_rs_grid(rs_rho[i])
+    else:
+        free_rs_grid(rs_rho)
+    rs_rho = None
+
+    rhoG = rhoG.reshape(nset,rhodim,-1)
+    if gga_high_order:
+        Gv = cell.get_Gv(mydf.mesh)
+        #:rhoG1 = np.einsum('np,px->nxp', 1j*rhoG[:,0], Gv)
+        rhoG1 = gradient_gs(rhoG[:,0], Gv)
+        rhoG = np.concatenate([rhoG, rhoG1], axis=1)
+        Gv = rhoG1 = None
+    return rhoG
+
+
+def eval_mat(cell, weights, task_list, shls_slice=None, comp=1, hermi=0, deriv=0,
+             xctype='LDA', kpts=None, grid_level=None, dimension=None, mesh=None,
+             cell1=None, shls_slice1=None, Ls=None, a=None):
+
+    cell0 = cell
+    shls_slice0 = shls_slice
+    if cell1 is None:
+        cell1 = cell0
+
+    if mesh is None:
+        mesh = cell0.mesh
+
+    #TODO mixture of cartesian and spherical bases
+    assert cell0.cart == cell1.cart
+
+    ish_atm = np.asarray(cell0._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell0._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell0._env, order='C', dtype=np.double)
+    ish_env[PTR_EXPDROP] = min(cell0.precision*EXTRA_PREC, EXPDROP)
+
+    if cell1 is cell0:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=np.double)
+        jsh_env[PTR_EXPDROP] = min(cell1.precision*EXTRA_PREC, EXPDROP)
+
+    if shls_slice0 is None:
+        shls_slice0 = (0, cell0.nbas)
+    i0, i1 = shls_slice0
+    if shls_slice1 is None:
+        shls_slice1 = (0, cell1.nbas)
+    j0, j1 = shls_slice1
+
+    if hermi == 1:
+        assert cell1 is cell0
+        assert i0 == j0 and i1 == j1
+
+    key0 = 'cart' if cell0.cart else 'sph'
+    ao_loc0 = moleintor.make_loc(ish_bas, key0)
+    naoi = ao_loc0[i1] - ao_loc0[i0]
+    if hermi == 1:
+        ao_loc1 = ao_loc0
+    else:
+        key1 = 'cart' if cell1.cart else 'sph'
+        ao_loc1 = moleintor.make_loc(jsh_bas, key1)
+    naoj = ao_loc1[j1] - ao_loc1[j0]
+
+    if dimension is None:
+        dimension = cell0.dimension
+    assert dimension == getattr(cell1, "dimension", None)
+
+    if Ls is None and dimension > 0:
+        Ls = np.asarray(cell0.get_lattice_Ls(), order='C')
+    elif Ls is None and dimension == 0:
+        Ls = np.zeros((1,3))
+
+    if dimension == 0 or kpts is None or gamma_point(kpts):
+        nkpts, nimgs = 1, Ls.shape[0]
+    else:
+        expkL = np.exp(1j*kpts.reshape(-1,3).dot(Ls.T))
+        nkpts, nimgs = expkL.shape
+
+    #TODO check if cell1 has the same lattice vectors
+    if a is None:
+        a = cell0.lattice_vectors()
+    b = np.linalg.inv(a.T)
+
+    if abs(a-np.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+
+    weights = np.asarray(weights, order='C')
+    assert(weights.dtype == np.double)
+    xctype = xctype.upper()
+    n_mat = None
+    if xctype == 'LDA':
+        if weights.ndim == 1:
+            weights = weights.reshape(-1, np.prod(mesh))
+        else:
+            n_mat = weights.shape[0]
+    elif xctype == 'GGA':
+        if weights.ndim == 2:
+            weights = weights.reshape(-1, 4, np.prod(mesh))
+        else:
+            n_mat = weights.shape[0]
+    else:
+        raise NotImplementedError
+
+    eval_fn = 'eval_mat_' + xctype.lower() + lattice_type
+    if deriv > 0:
+        if deriv == 1:
+            assert comp == 3
+            assert hermi == 0
+            eval_fn += '_ip1'
+        else:
+            raise NotImplementedError
+    drv = getattr(libdft, "grid_integrate_drv", None)
+
+    def make_mat(wv):
+        if comp == 1:
+            mat = np.zeros((naoi, naoj))
+        else:
+            mat = np.zeros((comp, naoi, naoj))
+
+        try:
+            drv(getattr(libdft, eval_fn, None),
+                mat.ctypes.data_as(ctypes.c_void_p),
+                wv.ctypes.data_as(ctypes.c_void_p),
+                ctypes.byref(task_list),
+                ctypes.c_int(comp), ctypes.c_int(hermi),
+                ctypes.c_int(grid_level),
+                (ctypes.c_int*4)(i0, i1, j0, j1),
+                ao_loc0.ctypes.data_as(ctypes.c_void_p),
+                ao_loc1.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(dimension),
+                Ls.ctypes.data_as(ctypes.c_void_p),
+                a.ctypes.data_as(ctypes.c_void_p),
+                b.ctypes.data_as(ctypes.c_void_p),
+                ish_atm.ctypes.data_as(ctypes.c_void_p),
+                ish_bas.ctypes.data_as(ctypes.c_void_p),
+                ish_env.ctypes.data_as(ctypes.c_void_p),
+                jsh_atm.ctypes.data_as(ctypes.c_void_p),
+                jsh_bas.ctypes.data_as(ctypes.c_void_p),
+                jsh_env.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell0.cart))
+        except Exception as e:
+            raise RuntimeError("Failed to compute rho. %s" % e)
+        return mat
+
+    out = []
+    for wv in weights:
+        if dimension == 0 or kpts is None or gamma_point(kpts):
+            mat = make_mat(wv)
+        else:
+            raise NotImplementedError
+        out.append(mat)
+
+    if n_mat is None:
+        out = out[0]
+    return out
+
+
+def _get_j_pass2(mydf, vG, kpts=np.zeros((1,3)), hermi=1, verbose=None):
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_4d(vG, (None, gx, gy, gz)).reshape(nset,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,ngrids)
+        vR = np.asarray(v_rs.real, order='C')
+        vI = np.asarray(v_rs.imag, order='C')
+        if at_gamma_point:
+            v_rs = vR
+
+        mat = eval_mat(cell, vR, task_list, comp=1, hermi=hermi,
+                       xctype='LDA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        vj_kpts += np.asarray(mat).reshape(nset,-1,nao,nao)
+        if not at_gamma_point and abs(vI).max() > IMAG_TOL:
+            raise NotImplementedError
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _get_j_pass2_ip1(mydf, vG, kpts=np.zeros((1,3)), hermi=0, deriv=1, verbose=None):
+    if deriv == 1:
+        comp = 3
+        assert hermi == 0
+    else:
+        raise NotImplementedError
+
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_4d(vG, (None, gx, gy, gz)).reshape(nset,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,ngrids)
+        if at_gamma_point:
+            vR = np.asarray(v_rs.real, order='C', dtype=float)
+            #vI = None
+        else:
+            raise NotImplementedError
+
+        mat = eval_mat(cell, vR, task_list, comp=comp, hermi=hermi, deriv=deriv,
+                       xctype='LDA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        mat = np.asarray(mat).reshape(nset,-1,comp,nao,nao)
+        vj_kpts = np.add(vj_kpts, mat, out=vj_kpts)
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _get_gga_pass2(mydf, vG, kpts=np.zeros((1,3)), hermi=1, verbose=None):
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,4,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    if gamma_point(kpts):
+        veff = np.zeros((nset,nkpts,nao,nao))
+    else:
+        veff = np.zeros((nset,nkpts,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_5d(vG, (None, None, gx, gy, gz)).reshape(-1,ngrids)
+        wv = tools.ifft(sub_vG, mesh).real.reshape(nset,4,ngrids)
+        wv = np.asarray(wv, order='C')
+
+        mat = eval_mat(cell, wv, task_list, comp=1, hermi=hermi,
+                       xctype='GGA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        mat = np.asarray(mat).reshape(nset,-1,nao,nao)
+        veff = np.add(veff, mat, out=veff)
+        if not gamma_point(kpts):
+            raise NotImplementedError
+
+    if nset == 1:
+        veff = veff[0]
+    return veff
+
+
+def _get_gga_pass2_ip1(mydf, vG, kpts=np.zeros((1,3)), hermi=0, deriv=1, verbose=None):
+    if deriv == 1:
+        comp = 3
+        assert hermi == 0
+    else:
+        raise NotImplementedError
+
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,4,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_5d(vG, (None, None, gx, gy, gz)).reshape(-1,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,4,ngrids)
+        vR = np.asarray(v_rs.real, order='C')
+        vI = np.asarray(v_rs.imag, order='C')
+        if at_gamma_point:
+            v_rs = vR
+
+        mat = eval_mat(cell, vR, task_list, comp=comp, hermi=hermi, deriv=deriv,
+                       xctype='GGA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        vj_kpts += np.asarray(mat).reshape(nset,-1,comp,nao,nao)
+        if not at_gamma_point and abs(vI).max() > IMAG_TOL:
+            raise NotImplementedError
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _rks_gga_wv0(rho, vxc, weight):
+    vrho, vgamma = vxc[:2]
+    ngrid = vrho.size
+    wv = np.empty((4,ngrid))
+    wv[0]  = np.multiply(weight, vrho, out=wv[0])
+    for i in range(1, 4):
+        wv[i] = np.multiply(weight * 2, np.multiply(vgamma, rho[i], out=wv[i]), out=wv[i])
+    return wv
+
+
+def _uks_gga_wv0(rho, vxc, weight):
+    rhoa, rhob = rho
+    vrho, vsigma = vxc[:2]
+    ngrids = vrho.shape[0]
+    wv = np.empty((2, 4, ngrids))
+    wv[0,0]  = np.multiply(weight, vrho[:,0], out=wv[0,0])
+    for i in range(1,4):
+        wv[0,i] = np.multiply(2., np.multiply(rhoa[i], vsigma[:,0], out=wv[0,i]), out=wv[0,i])
+        wv[0,i] = np.add(wv[0,i], np.multiply(rhob[i], vsigma[:,1]), out=wv[0,i])
+        wv[0,i] = np.multiply(weight, wv[0,i], out=wv[0,i])
+    wv[1,0]  = np.multiply(weight, vrho[:,1], out=wv[1,0])
+    for i in range(1,4):
+        wv[1,i] = np.multiply(2., np.multiply(rhob[i], vsigma[:,2], out=wv[1,i]), out=wv[1,i])
+        wv[1,i] = np.add(wv[1,i], np.multiply(rhoa[i], vsigma[:,1]), out=wv[1,i])
+        wv[1,i] = np.multiply(weight, wv[1,i], out=wv[1,i])
+    return wv
+
+
+def _rks_gga_wv0_pw(cell, rho, vxc, weight, mesh):
+    vrho, vgamma = vxc[:2]
+    ngrid = vrho.size
+    buf = np.empty((3,ngrid))
+    for i in range(1, 4):
+        buf[i-1] = np.multiply(vgamma, rho[i], out=buf[i-1])
+
+    vrho_freq = tools.fft(vrho, mesh).reshape((1,ngrid))
+    buf_freq = tools.fft(buf, mesh).reshape((3,ngrid))
+    Gv = cell.get_Gv(mesh)
+    #out  = vrho_freq - 2j * np.einsum('px,xp->p', Gv, buf_freq)
+    #out *= weight
+
+    out = np.empty((ngrid,), order="C", dtype=np.complex128)
+    func = getattr(libdft, 'get_gga_vrho_gs', None)
+    func(out.ctypes.data_as(ctypes.c_void_p),
+         vrho_freq.ctypes.data_as(ctypes.c_void_p),
+         buf_freq.ctypes.data_as(ctypes.c_void_p),
+         Gv.ctypes.data_as(ctypes.c_void_p),
+         ctypes.c_double(weight), ctypes.c_int(ngrid))
+    return out
+
+
+def _uks_gga_wv0_pw(cell, rho, vxc, weight, mesh):
+    rhoa, rhob = rho
+    vrho, vgamma = vxc[:2]
+    ngrid = vrho.shape[0]
+    buf = np.empty((2,3,ngrid))
+    for i in range(1, 4):
+        buf[0,i-1] = np.multiply(vgamma[:,0], rhoa[i], out=buf[0,i-1])
+        tmp = np.multiply(vgamma[:,1], rhob[i])
+        tmp = np.multiply(.5, tmp, out=tmp)
+        buf[0,i-1] = np.add(buf[0,i-1], tmp, out=buf[0,i-1])
+
+        buf[1,i-1] = np.multiply(vgamma[:,2], rhob[i], out=buf[1,i-1])
+        tmp = np.multiply(vgamma[:,1], rhoa[i])
+        tmp = np.multiply(.5, tmp, out=tmp)
+        buf[1,i-1] = np.add(buf[1,i-1], tmp, out=buf[1,i-1])
+
+
+    vrho_freq = tools.fft(vrho.T, mesh).reshape((2,ngrid))
+    buf_freq = tools.fft(buf.reshape(-1,ngrid), mesh).reshape((2,3,ngrid))
+    Gv = cell.get_Gv(mesh)
+    #out  = vrho_freq - 2j * np.einsum('px,xp->p', Gv, buf_freq)
+    #out *= weight
+
+    out = np.empty((2,ngrid), order="C", dtype=np.complex128)
+    func = getattr(libdft, 'get_gga_vrho_gs')
+    for s in range(2):
+        func(out[s].ctypes.data_as(ctypes.c_void_p),
+             vrho_freq[s].ctypes.data_as(ctypes.c_void_p),
+             buf_freq[s].ctypes.data_as(ctypes.c_void_p),
+             Gv.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_double(weight), ctypes.c_int(ngrid))
+    return out
+
+
+def nr_rks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    '''
+    Same as multigrid.nr_rks, but considers Hermitian symmetry also for GGA
+    '''
+    if kpts is None: kpts = mydf.kpts
+    log = logger.new_logger(mydf, verbose)
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv)
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    #vG = np.einsum('ng,g->ng', rhoG[:,0], coulG)
+    vG = np.empty_like(rhoG[:,0], dtype=np.result_type(rhoG[:,0], coulG))
+    for i, rhoG_i in enumerate(rhoG[:,0]):
+        vG[i] = np.multiply(rhoG_i, coulG, out=vG[i])
+    coulG = None
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] += mydf.vpplocG_part1 * 2
+            vG[i] = np.add(vG[i], np.multiply(2., mydf.vpplocG_part1), out=vG[i])
+
+    #ecoul = .5 * np.einsum('ng,ng->n', rhoG[:,0].real, vG.real)
+    #ecoul+= .5 * np.einsum('ng,ng->n', rhoG[:,0].imag, vG.imag)
+    ecoul = np.zeros((rhoG.shape[0],))
+    for i in range(rhoG.shape[0]):
+        ecoul[i] = .5 * np.vdot(rhoG[i,0], vG[i]).real
+
+    ecoul /= cell.vol
+    log.debug('Multigrid Coulomb energy %s', ecoul)
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] -= mydf.vpplocG_part1
+            vG[i] = np.subtract(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    rhoR = rhoR.reshape(nset,-1,ngrids)
+    wv_freq = []
+    nelec = np.zeros(nset)
+    excsum = np.zeros(nset)
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=0, deriv=1)[:2]
+        if xctype == 'LDA':
+            wv = np.multiply(weight, vxc[0])
+            wv_freq.append(tools.fft(wv, mesh))
+            wv = None
+        elif xctype == 'GGA':
+            if GGA_METHOD.upper() == 'FFT':
+                wv_freq.append(_rks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh).reshape(1,ngrids))
+            else:
+                wv = _rks_gga_wv0(rhoR[i], vxc, weight)
+                wv_freq.append(tools.fft(wv, mesh))
+                wv = None
+        else:
+            raise NotImplementedError
+
+        nelec[i]  += np.sum(rhoR[i,0]) * weight
+        excsum[i] += np.sum(np.multiply(rhoR[i,0], exc)) * weight
+        exc = vxc = None
+
+    rhoR = rhoG = None
+
+    if len(wv_freq) == 1:
+        wv_freq = wv_freq[0].reshape(nset,-1,*mesh)
+    else:
+        wv_freq = np.asarray(wv_freq).reshape(nset,-1,*mesh)
+
+    if nset == 1:
+        ecoul = ecoul[0]
+        nelec = nelec[0]
+        excsum = excsum[0]
+    log.debug('Multigrid exc %s  nelec %s', excsum, nelec)
+
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+    if xctype == 'LDA':
+        if with_j:
+            wv_freq[:,0] += vG.reshape(nset,*mesh)
+        veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+    elif xctype == 'GGA':
+        if with_j:
+            #wv_freq[:,0] += vG.reshape(nset,*mesh)
+            wv_freq[:,0] = np.add(wv_freq[:,0], vG.reshape(nset,*mesh), out=wv_freq[:,0])
+        if GGA_METHOD.upper() == 'FFT':
+            veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+        else:
+            veff = _get_gga_pass2(mydf, wv_freq, kpts_band, hermi=hermi, verbose=log)
+    wv_freq = None
+    veff = _format_jks(veff, dm_kpts, input_band, kpts)
+
+    if return_j:
+        vj = _get_j_pass2(mydf, vG, kpts_band, verbose=log)
+        vj = _format_jks(veff, dm_kpts, input_band, kpts)
+    else:
+        vj = None
+    vG = None
+
+    veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
+    return nelec, excsum, veff
+
+def nr_uks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    if kpts is None: kpts = mydf.kpts
+    log = logger.new_logger(mydf, verbose)
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    nset //= 2
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv)
+    rhoG = rhoG.reshape(nset,2,-1,ngrids)
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    #vG = np.einsum('nsg,g->ng', rhoG[:,:,0], coulG)
+    vG = np.empty((nset,ngrids), dtype=np.result_type(rhoG[:,:,0], coulG))
+    for i, rhoG_i in enumerate(rhoG[:,:,0]):
+        vG[i] = np.multiply(np.add(rhoG_i[0], rhoG_i[1]), coulG, out=vG[i])
+    coulG = None
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] += mydf.vpplocG_part1 * 2
+            vG[i] = np.add(vG[i], np.multiply(2., mydf.vpplocG_part1), out=vG[i])
+
+    ecoul = np.zeros(nset)
+    for i in range(nset):
+        ecoul[i] = .5 * np.vdot(np.add(rhoG[i,0,0], rhoG[i,1,0]), vG[i]).real
+
+    ecoul /= cell.vol
+    log.debug('Multigrid Coulomb energy %s', ecoul)
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] -= mydf.vpplocG_part1
+            vG[i] = np.subtract(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    rhoR = rhoR.reshape(nset,2,-1,ngrids)
+    wv_freq = []
+    nelec = np.zeros(nset)
+    excsum = np.zeros(nset)
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=1, deriv=1)[:2]
+        if xctype == 'LDA':
+            wv = np.multiply(weight, vxc[0].T)
+            wv_freq.append(tools.fft(wv, mesh))
+            wv = None
+        elif xctype == 'GGA':
+            if GGA_METHOD.upper() == 'FFT':
+                wv_freq.append(_uks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh))
+            else:
+                wv = _uks_gga_wv0(rhoR[i], vxc, weight)
+                wv_freq.append(tools.fft(wv.reshape(-1,*mesh), mesh))
+                wv = None
+        else:
+            raise NotImplementedError
+
+        nelec[i]  += np.sum(rhoR[i,:,0]).sum() * weight
+        excsum[i] += np.sum(np.multiply(np.add(rhoR[i,0,0],rhoR[i,1,0]), exc)) * weight
+        exc = vxc = None
+
+    rhoR = rhoG = None
+
+    if len(wv_freq) == 1:
+        wv_freq = wv_freq[0].reshape(nset,2,-1,*mesh)
+    else:
+        wv_freq = np.asarray(wv_freq).reshape(nset,2,-1,*mesh)
+
+    if nset == 1:
+        ecoul = ecoul[0]
+        nelec = nelec[0]
+        excsum = excsum[0]
+    log.debug('Multigrid exc %s  nelec %s', excsum, nelec)
+
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+    if xctype == 'LDA':
+        if with_j:
+            for s in range(2):
+                wv_freq[:,s,0] += vG.reshape(nset,*mesh)
+        veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+    elif xctype == 'GGA':
+        if with_j:
+            #wv_freq[:,:,0] += vG.reshape(nset,*mesh)
+            for s in range(2):
+                wv_freq[:,s,0] = np.add(wv_freq[:,s,0], vG.reshape(nset,*mesh), out=wv_freq[:,s,0])
+        if GGA_METHOD.upper() == 'FFT':
+            veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+        else:
+            veff = _get_gga_pass2(mydf, wv_freq, kpts_band, hermi=hermi, verbose=log)
+    wv_freq = None
+    veff = _format_jks(veff, dm_kpts, input_band, kpts)
+
+    if return_j:
+        vj = _get_j_pass2(mydf, vG, kpts_band, verbose=log)
+        vj = _format_jks(veff, dm_kpts, input_band, kpts)
+    else:
+        vj = None
+    vG = None
+
+    veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
+    return nelec, excsum, veff
+
+def get_veff_ip1(mydf, dm_kpts, xc_code=None, kpts=np.zeros((1,3)), kpts_band=None, spin=0):
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    kpts_band = _format_kpts_band(kpts_band, kpts)
+    if spin == 1:
+        nset //= 2
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi=1, kpts=kpts_band, deriv=deriv)
+    if spin == 1:
+        rhoG = rhoG.reshape(nset,2,-1,ngrids)
+    # cache rhoG for core density gradients
+    mydf.rhoG = rhoG
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    vG = np.empty((nset,ngrids), dtype=np.result_type(rhoG, coulG))
+    for i in range(nset):
+        if spin == 0:
+            vG[i] = np.multiply(rhoG[i,0], coulG, out=vG[i])
+        elif spin == 1:
+            tmp = np.add(rhoG[i,0,0], rhoG[i,1,0])
+            vG[i] = np.multiply(tmp, coulG, out=vG[i])
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            vG[i] = np.add(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    if spin == 0:
+        rhoR = rhoR.reshape(nset,-1,ngrids)
+    elif spin == 1:
+        rhoR = rhoR.reshape(nset,2,-1,ngrids)
+
+    wv_freq = []
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=spin, deriv=1)[:2]
+        if spin == 0:
+            if xctype == 'LDA':
+                wv = np.multiply(weight, vxc[0])
+                wv_freq.append(tools.fft(wv, mesh))
+                wv = None
+            elif xctype == 'GGA':
+                if GGA_METHOD.upper() == 'FFT':
+                    wv_freq.append(_rks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh).reshape(1,ngrids))
+                else:
+                    wv = _rks_gga_wv0(rhoR[i], vxc, weight)
+                    wv_freq.append(tools.fft(wv, mesh))
+            else:
+                raise NotImplementedError
+        elif spin == 1:
+            if xctype == 'LDA':
+                wv = np.multiply(weight, vxc[0].T)
+                wv_freq.append(tools.fft(wv, mesh))
+                wv = None
+            elif xctype == 'GGA':
+                if GGA_METHOD.upper() == 'FFT':
+                    wv_freq.append(_uks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh))
+                else:
+                    wv = _uks_gga_wv0(rhoR[i], vxc, weight)
+                    wv_freq.append(tools.fft(wv.reshape(-1,*mesh), mesh))
+                wv = None
+            else:
+                raise NotImplementedError
+
+    rhoR = rhoG = None
+    if spin == 0:
+        if len(wv_freq) == 1:
+            wv_freq = wv_freq[0].reshape(nset,-1,*mesh)
+        else:
+            wv_freq = np.asarray(wv_freq).reshape(nset,-1,*mesh)
+    elif spin == 1:
+        if len(wv_freq) == 1:
+            wv_freq = wv_freq[0].reshape(nset,2,-1,*mesh)
+        else:
+            wv_freq = np.asarray(wv_freq).reshape(nset,2,-1,*mesh)
+
+    for i in range(nset):
+        if spin == 0:
+            wv_freq[i,0] = np.add(wv_freq[i,0], vG[i].reshape(*mesh), out=wv_freq[i,0])
+        elif spin == 1:
+            for s in range(2):
+                wv_freq[i,s,0] = np.add(wv_freq[i,s,0], vG[i].reshape(*mesh), out=wv_freq[i,s,0])
+
+    if xctype == 'LDA':
+        vj_kpts = _get_j_pass2_ip1(mydf, wv_freq, kpts_band, hermi=0, deriv=1)
+    elif xctype == 'GGA':
+        if GGA_METHOD.upper() == 'FFT':
+            vj_kpts = _get_j_pass2_ip1(mydf, wv_freq, kpts_band, hermi=0, deriv=1)
+        else:
+            vj_kpts = _get_gga_pass2_ip1(mydf, wv_freq, kpts_band, hermi=0, deriv=1)
+    else:
+        raise NotImplementedError
+
+    comp = 3
+    nao = cell.nao
+    if spin == 0:
+        vj_kpts = vj_kpts.reshape(nset,nkpts,comp,nao,nao)
+    elif spin == 1:
+        vj_kpts = vj_kpts.reshape(nset,2,nkpts,comp,nao,nao)
+    vj_kpts = np.moveaxis(vj_kpts, -3, -4)
+
+    if nkpts == 1:
+        vj_kpts = vj_kpts[...,0,:,:]
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+class MultiGridFFTDF2(MultiGridFFTDF):
+    '''
+    Base class for multigrid DFT (version 2).
+
+    Attributes:
+        task_list : TaskList instance
+            Task list recording which primitive basis function pairs
+            need to be considered.
+        vpplocG_part1 : arrary
+            Short-range part of the local pseudopotential represented
+            in the reciprocal space. It is cached to reduce cost.
+        rhoG : array
+            Electronic density represented in the reciprocal space.
+            It is cached in nuclear gradient calculations to reduce cost.
+    '''
+    ngrids = getattr(__config__, 'pbc_dft_multigrid_ngrids', 4)
+    ke_ratio = getattr(__config__, 'pbc_dft_multigrid_ke_ratio', 3.0)
+    rel_cutoff = getattr(__config__, 'pbc_dft_multigrid_rel_cutoff', 20.0)
+    _keys = {'ngrids', 'ke_ratio', 'rel_cutoff',
+             'task_list', 'vpplocG_part1', 'rhoG'}
+
+    def __init__(self, cell, kpts=np.zeros((1,3))):
+        fft.FFTDF.__init__(self, cell, kpts)
+        self.task_list = None
+        self.vpplocG_part1 = None
+        self.rhoG = None
+        if not gamma_point(kpts):
+            raise NotImplementedError('MultiGridFFTDF2 only supports Gamma-point calculations.')
+        a = cell.lattice_vectors()
+        if abs(a-np.diag(a.diagonal())).max() > 1e-12:
+            raise NotImplementedError('MultiGridFFTDF2 only supports orthorhombic lattices.')
+
+    def reset(self, cell=None):
+        self.vpplocG_part1 = None
+        self.rhoG = None
+        if self.task_list is not None:
+            free_task_list(self.task_list)
+            self.task_list = None
+        fft.FFTDF.reset(self, cell=cell)
+
+    def __del__(self):
+        self.reset()
+
+    def get_veff_ip1(self, dm, xc_code=None, kpts=None, kpts_band=None, spin=0):
+        if kpts is None:
+            if self.kpts is None:
+                kpts = np.zeros(1,3)
+            else:
+                kpts = self.kpts
+        kpts = kpts.reshape(-1,3)
+        vj = get_veff_ip1(self, dm, xc_code=xc_code,
+                          kpts=kpts, kpts_band=kpts_band, spin=spin)
+        return vj
+
+    def get_pp(self, kpts=None):
+        '''Compute the GTH pseudopotential matrix, which includes
+        the second part of the local potential and the non-local potential.
+        The first part of the local potential is cached as `vpplocG_part1`,
+        which is the reciprocal space representation, to be added to the electron
+        density for computing the Coulomb matrix.
+        In order to get the full PP matrix, the potential due to `vpplocG_part1`
+        needs to be added.
+        '''
+        self.vpplocG_part1 = _get_vpplocG_part1(self, with_rho_core=True)
+        return _get_pp_without_erf(self, kpts)
+
+    vpploc_part1_nuc_grad = vpploc_part1_nuc_grad
diff --git a/pyscf/pbc/dft/multigrid/pp.py b/pyscf/pbc/dft/multigrid/pp.py
new file mode 100644
index 0000000000..13c0813dac
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/pp.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy
+from pyscf import __config__
+from pyscf import lib, gto
+from pyscf.lib import logger
+from pyscf.pbc import tools
+from pyscf.pbc.gto import pseudo
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+PP_WITH_RHO_CORE = getattr(__config__, 'pbc_dft_multigrid_pp_with_rho_core', True)
+
+libpbc = lib.load_library('libpbc')
+libdft = lib.load_library('libdft')
+
+def make_rho_core(cell, mesh=None, precision=None, atm_id=None):
+    if mesh is None:
+        mesh = cell.mesh
+    fakecell, max_radius = fake_cell_vloc_part1(cell, atm_id=atm_id, precision=precision)
+    atm = fakecell._atm
+    bas = fakecell._bas
+    env = fakecell._env
+
+    a = numpy.asarray(cell.lattice_vectors(), order='C', dtype=float)
+    if abs(a - numpy.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+        raise NotImplementedError
+    eval_fn = 'make_rho_lda' + lattice_type
+
+    b = numpy.asarray(numpy.linalg.inv(a.T), order='C', dtype=float)
+    mesh = numpy.asarray(mesh, order='C', dtype=numpy.int32)
+    rho_core = numpy.zeros((numpy.prod(mesh),), order='C', dtype=float)
+    drv = getattr(libdft, 'build_core_density', None)
+    try:
+        drv(getattr(libdft, eval_fn),
+            rho_core.ctypes.data_as(ctypes.c_void_p),
+            atm.ctypes.data_as(ctypes.c_void_p),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(bas)),
+            env.ctypes.data_as(ctypes.c_void_p),
+            mesh.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(cell.dimension),
+            a.ctypes.data_as(ctypes.c_void_p),
+            b.ctypes.data_as(ctypes.c_void_p), ctypes.c_double(max_radius))
+    except Exception as e:
+        raise RuntimeError("Failed to compute rho_core. %s" % e)
+    return rho_core
+
+
+def _get_pp_without_erf(mydf, kpts=None):
+    '''Get the periodic pseudotential nuc-el AO matrix, with G=0 removed.
+    '''
+    cell = mydf.cell
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    vpp = pp_int.get_pp_loc_part2(cell, kpts_lst)
+    vppnl = pp_int.get_pp_nl(cell, kpts_lst)
+
+    for k, kpt in enumerate(kpts_lst):
+        if gamma_point(kpt):
+            vpp[k] = vpp[k].real + vppnl[k].real
+        else:
+            vpp[k] += vppnl[k]
+    vppnl = None
+
+    if kpts is None or numpy.shape(kpts) == (3,):
+        vpp = vpp[0]
+    return numpy.asarray(vpp)
+
+
+def get_pp_loc_part1_gs(cell, Gv):
+    coulG = tools.get_coulG(cell, Gv=Gv)
+    G2 = numpy.einsum('ix,ix->i', Gv, Gv)
+    G0idx = numpy.where(G2==0)[0]
+    ngrid = len(G2)
+    Gv = numpy.asarray(Gv, order='C', dtype=numpy.double)
+    coulG = numpy.asarray(coulG, order='C', dtype=numpy.double)
+    G2 = numpy.asarray(G2, order='C', dtype=numpy.double)
+
+    coords = cell.atom_coords()
+    coords = numpy.asarray(coords, order='C', dtype=numpy.double)
+    Z = numpy.empty([cell.natm,], order='C', dtype=numpy.double)
+    rloc = numpy.empty([cell.natm,], order='C', dtype=numpy.double)
+    for ia in range(cell.natm):
+        Z[ia] = cell.atom_charge(ia)
+        symb = cell.atom_symbol(ia)
+        if symb in cell._pseudo:
+            rloc[ia] = cell._pseudo[symb][1]
+        else:
+            rloc[ia] = -999
+
+    out = numpy.empty((ngrid,), order='C', dtype=numpy.complex128)
+    fn = getattr(libpbc, "pp_loc_part1_gs", None)
+    try:
+        fn(out.ctypes.data_as(ctypes.c_void_p),
+           coulG.ctypes.data_as(ctypes.c_void_p),
+           Gv.ctypes.data_as(ctypes.c_void_p),
+           G2.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(G0idx), ctypes.c_int(ngrid),
+           Z.ctypes.data_as(ctypes.c_void_p),
+           coords.ctypes.data_as(ctypes.c_void_p),
+           rloc.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(cell.natm))
+    except Exception as e:
+        raise RuntimeError("Failed to get vlocG part1. %s" % e)
+    return out
+
+
+def _get_vpplocG_part1(mydf, with_rho_core=PP_WITH_RHO_CORE):
+    cell = mydf.cell
+    mesh = mydf.mesh
+
+    if not with_rho_core:
+        # compute rho_core directly in G-space
+        # this is much slower that the following
+        Gv = cell.get_Gv(mesh)
+        vpplocG_part1 = get_pp_loc_part1_gs(cell, Gv)
+    else:
+        # compute rho_core in real space then transform to G-space
+        weight = cell.vol / numpy.prod(mesh)
+        rho_core = make_rho_core(cell)
+        rhoG_core = weight * tools.fft(rho_core, mesh)
+        rho_core = None
+        coulG = tools.get_coulG(cell, mesh=mesh)
+        vpplocG_part1 = rhoG_core * coulG
+        rhoG_core = coulG = None
+        # G = 0 contribution
+        chargs = cell.atom_charges()
+        rloc = []
+        for ia in range(cell.natm):
+            symb = cell.atom_symbol(ia)
+            rloc.append(cell._pseudo[symb][1])
+        rloc = numpy.asarray(rloc)
+        vpplocG_part1[0] += 2. * numpy.pi * numpy.sum(rloc * rloc * chargs)
+    return vpplocG_part1
+
+
+def get_vpploc_part1_ip1(mydf, kpts=numpy.zeros((1,3))):
+    from .multigrid_pair import _get_j_pass2_ip1
+    if mydf.pp_with_erf:
+        return 0
+
+    mesh = mydf.mesh
+    vG = mydf.vpplocG_part1
+    vG.reshape(-1,*mesh)
+
+    vpp_kpts = _get_j_pass2_ip1(mydf, vG, kpts, hermi=0, deriv=1)
+    if gamma_point(kpts):
+        vpp_kpts = vpp_kpts.real
+    if len(kpts) == 1:
+        vpp_kpts = vpp_kpts[0]
+    return vpp_kpts
+
+
+def vpploc_part1_nuc_grad(mydf, dm, kpts=numpy.zeros((1,3)), atm_id=None, precision=None):
+    from .multigrid_pair import _eval_rhoG
+    t0 = (logger.process_clock(), logger.perf_counter())
+    cell = mydf.cell
+    fakecell, max_radius = fake_cell_vloc_part1(cell, atm_id=atm_id, precision=precision)
+    atm = fakecell._atm
+    bas = fakecell._bas
+    env = fakecell._env
+
+    a = numpy.asarray(cell.lattice_vectors(), order='C', dtype=float)
+    if abs(a - numpy.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+        raise NotImplementedError
+    eval_fn = 'eval_mat_lda' + lattice_type + '_ip1'
+
+    b = numpy.asarray(numpy.linalg.inv(a.T), order='C', dtype=float)
+    mesh = numpy.asarray(mydf.mesh, order='C', dtype=numpy.int32)
+    ngrids = numpy.prod(mesh)
+    comp = 3
+    grad = numpy.zeros((len(atm),comp), order="C", dtype=float)
+    drv = getattr(libdft, 'int_gauss_charge_v_rs', None)
+
+    if mydf.rhoG is None:
+        rhoG = _eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=0)
+    else:
+        rhoG = mydf.rhoG
+    rhoG = rhoG[...,0,:]
+    rhoG = rhoG.reshape(-1,ngrids)
+    if rhoG.shape[0] == 2: #unrestricted
+        rhoG = rhoG[0] + rhoG[1]
+    else:
+        assert rhoG.shape[0] == 1
+        rhoG = rhoG[0]
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    vG = numpy.multiply(rhoG, coulG)
+
+    v_rs = numpy.asarray(tools.ifft(vG, mesh).real, order="C")
+    try:
+        drv(getattr(libdft, eval_fn),
+            grad.ctypes.data_as(ctypes.c_void_p),
+            v_rs.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(comp),
+            atm.ctypes.data_as(ctypes.c_void_p),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(bas)),
+            env.ctypes.data_as(ctypes.c_void_p),
+            mesh.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(cell.dimension),
+            a.ctypes.data_as(ctypes.c_void_p),
+            b.ctypes.data_as(ctypes.c_void_p), ctypes.c_double(max_radius))
+    except Exception as e:
+        raise RuntimeError("Failed to computed nuclear gradients of vpploc part1. %s" % e)
+    grad *= -1
+    t0 = logger.timer(mydf, 'vpploc_part1_nuc_grad', *t0)
+    return grad
+
+
+def fake_cell_vloc_part1(cell, atm_id=None, precision=None):
+    '''
+    Generate fakecell for the non-local term of the local part of
+    the GTH pseudo-potential. Also stores the atomic radii.
+    Differs from pp_int.fake_cell_vloc(cell, cn=0) in the normalization factors.
+    '''
+    from pyscf.pbc.gto.cell import pgf_rcut
+    if atm_id is None:
+        atm_id = numpy.arange(cell.natm)
+    else:
+        atm_id = numpy.asarray(atm_id)
+    natm = len(atm_id)
+
+    if precision is None:
+        precision = cell.precision
+
+    max_radius = 0
+    kind = {}
+    # FIXME prec may be too tight
+    prec = precision ** 2
+    for symb in cell._pseudo:
+        charge = numpy.sum(cell._pseudo[symb][0])
+        rloc = cell._pseudo[symb][1]
+        zeta = .5 / rloc**2
+        norm = (zeta / numpy.pi) ** 1.5
+        radius = pgf_rcut(0, zeta, charge*norm, precision=prec)
+        max_radius = max(radius, max_radius)
+        kind[symb] = [zeta, norm, radius]
+
+    fake_env = [cell.atom_coords()[atm_id].ravel()]
+    fake_atm = cell._atm[atm_id].copy().reshape(natm,-1)
+    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, natm*3, 3)
+    ptr = natm * 3
+    fake_bas = []
+    for ia, atm in enumerate(atm_id):
+        if cell.atom_charge(atm) == 0:  # pass ghost atoms
+            continue
+
+        symb = cell.atom_symbol(atm)
+        if symb in kind:
+            fake_env.append(kind[symb])
+        else:
+            alpha = 1e16
+            norm = (alpha / numpy.pi) ** 1.5
+            radius = 0.0
+            fake_env.append([alpha, norm, radius])
+        fake_bas.append([ia, 0, 1, 1, 0, ptr, ptr+1, 0])
+        fake_atm[ia,gto.PTR_RADIUS] = ptr+2
+        ptr += 3
+
+    fakecell = cell.copy(deep=False)
+    fakecell._atm = numpy.asarray(fake_atm, order="C", dtype=numpy.int32)
+    fakecell._bas = numpy.asarray(fake_bas, order="C", dtype=numpy.int32).reshape(-1, gto.BAS_SLOTS)
+    fakecell._env = numpy.asarray(numpy.hstack(fake_env), order="C", dtype=float)
+    return fakecell, max_radius
diff --git a/pyscf/pbc/dft/multigrid/utils.py b/pyscf/pbc/dft/multigrid/utils.py
new file mode 100644
index 0000000000..3ca9f0addb
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/utils.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Qiming Sun <osirpt.sun@gmail.com>
+#
+
+import numpy
+from pyscf import lib
+
+def _take_4d(a, indices):
+    a_shape = a.shape
+    ranges = []
+    for i, s in enumerate(indices):
+        if s is None:
+            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
+        else:
+            idx = numpy.asarray(s, dtype=numpy.int32)
+            idx[idx < 0] += a_shape[i]
+        ranges.append(idx)
+    idx = ranges[0][:,None] * a_shape[1] + ranges[1]
+    idy = ranges[2][:,None] * a_shape[3] + ranges[3]
+    a = a.reshape(a_shape[0]*a_shape[1], a_shape[2]*a_shape[3])
+    out = lib.take_2d(a, idx.ravel(), idy.ravel())
+    return out.reshape([len(s) for s in ranges])
+
+def _takebak_4d(out, a, indices):
+    out_shape = out.shape
+    a_shape = a.shape
+    ranges = []
+    for i, s in enumerate(indices):
+        if s is None:
+            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
+        else:
+            idx = numpy.asarray(s, dtype=numpy.int32)
+            idx[idx < 0] += out_shape[i]
+        assert (len(idx) == a_shape[i])
+        ranges.append(idx)
+    idx = ranges[0][:,None] * out_shape[1] + ranges[1]
+    idy = ranges[2][:,None] * out_shape[3] + ranges[3]
+    nx = idx.size
+    ny = idy.size
+    out = out.reshape(out_shape[0]*out_shape[1], out_shape[2]*out_shape[3])
+    lib.takebak_2d(out, a.reshape(nx,ny), idx.ravel(), idy.ravel())
+    return out
+
+def _take_5d(a, indices):
+    a_shape = a.shape
+    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
+    indices = (None,) + indices[2:]
+    return _take_4d(a, indices)
+
+def _takebak_5d(out, a, indices):
+    a_shape = a.shape
+    out_shape = out.shape
+    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
+    out = out.reshape((out_shape[0]*out_shape[1],) + out_shape[2:])
+    indices = (None,) + indices[2:]
+    return _takebak_4d(out, a, indices)
diff --git a/pyscf/pbc/dft/rks.py b/pyscf/pbc/dft/rks.py
index 228bc6e91a..d3dc8d1047 100644
--- a/pyscf/pbc/dft/rks.py
+++ b/pyscf/pbc/dft/rks.py
@@ -73,7 +73,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
                                        kpt.reshape(1,3), kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -87,7 +87,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
                                 kpt, kpts_band, max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -98,7 +98,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/dft/test/test_krks_ksym.py b/pyscf/pbc/dft/test/test_krks_ksym.py
index 6c7bd46c4a..615f1d456f 100644
--- a/pyscf/pbc/dft/test/test_krks_ksym.py
+++ b/pyscf/pbc/dft/test/test_krks_ksym.py
@@ -207,14 +207,14 @@ def test_rsh_mdf(self):
     def test_multigrid(self):
         kmf0 = krks.KRKS(cell, kpts=cell.make_kpts(nk))
         kmf0.xc = 'lda'
-        kmf0 = multigrid.multigrid(kmf0)
+        kmf0 = multigrid.multigrid_fftdf(kmf0)
         kmf0.kernel()
         rho0 = kmf0.get_rho()
 
         kpts = cell.make_kpts(nk,space_group_symmetry=True,time_reversal_symmetry=True)
         kmf = pscf.KRKS(cell, kpts=kpts)
         kmf.xc = 'lda'
-        kmf = multigrid.multigrid(kmf)
+        kmf = multigrid.multigrid_fftdf(kmf)
         kmf.kernel()
         self.assertAlmostEqual(kmf.e_tot, kmf0.e_tot, 7)
         rho = kmf.get_rho()
@@ -231,14 +231,14 @@ def test_multigrid(self):
     def test_multigrid_kuks(self):
         kmf0 = pscf.KUKS(cell, kpts=cell.make_kpts(nk))
         kmf0.xc = 'lda'
-        kmf0 = multigrid.multigrid(kmf0)
+        kmf0 = multigrid.multigrid_fftdf(kmf0)
         kmf0.kernel()
         rho0 = kmf0.get_rho()
 
         kpts = cell.make_kpts(nk,space_group_symmetry=True,time_reversal_symmetry=True)
         kmf = pscf.KUKS(cell, kpts=kpts)
         kmf.xc = 'lda'
-        kmf = multigrid.multigrid(kmf)
+        kmf = multigrid.multigrid_fftdf(kmf)
         kmf.kernel()
         self.assertAlmostEqual(kmf.e_tot, kmf0.e_tot, 7)
         rho = kmf.get_rho()
diff --git a/pyscf/pbc/dft/test/test_multigrid.py b/pyscf/pbc/dft/test/test_multigrid.py
index 2cd11e7732..9db362ded3 100644
--- a/pyscf/pbc/dft/test/test_multigrid.py
+++ b/pyscf/pbc/dft/test/test_multigrid.py
@@ -85,12 +85,24 @@ def test_orth_get_pp(self):
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
+        # test small memory
+        mydf = multigrid.MultiGridFFTDF(cell_orth)
+        mydf.max_memory = 10
+        out = mydf.get_pp(max_memory=2)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
+
     def test_nonorth_get_pp(self):
         ref = df.FFTDF(cell_nonorth).get_pp()
         out = multigrid.MultiGridFFTDF(cell_nonorth).get_pp()
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
+        # test small memory
+        mydf = multigrid.MultiGridFFTDF(cell_nonorth)
+        mydf.max_memory = 10
+        out = mydf.get_pp(max_memory=2)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
+
     def test_orth_get_nuc_kpts(self):
         ref = df.FFTDF(cell_orth).get_nuc(kpts)
         out = multigrid.MultiGridFFTDF(cell_orth).get_nuc(kpts)
@@ -133,7 +145,7 @@ def test_multigrid_kuks(self):
         mf = dft.KUKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he,dm_he)), kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he,dm_he), kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, (dm_he,dm_he), kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
@@ -143,7 +155,7 @@ def test_multigrid_krks(self):
         mf = dft.KRKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he, kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm_he, kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm_he, kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
@@ -159,7 +171,7 @@ def test_multigrid_kroks(self):
         dm1 = lib.tag_array(numpy.array([dm1,dm1]), mo_coeff=mo,
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1, kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm1, kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm1, kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -169,7 +181,7 @@ def test_multigrid_uks(self):
         mf = dft.UKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he[0],dm_he[0])))
-        out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he[0], dm_he[0]))
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, (dm_he[0], dm_he[0]))
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -179,7 +191,7 @@ def test_multigrid_rks(self):
         mf = dft.RKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he[0])
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm_he[0])
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm_he[0])
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -195,7 +207,7 @@ def test_multigrid_roks(self):
         dm1 = lib.tag_array(numpy.array([dm1,dm1]), mo_coeff=mo,
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm1)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm1)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -218,8 +230,8 @@ def test_eval_rhoG_orth_kpts(self):
         numpy.random.seed(9)
         dm = numpy.random.random(dm1.shape) + numpy.random.random(dm1.shape) * 1j
         mydf = multigrid.MultiGridFFTDF(cell_orth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=0, kpts=kpts, deriv=0,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=0, kpts=kpts, deriv=0,
+                                              rhog_high_order=True)
         self.assertTrue(rhoG.dtype == numpy.complex128)
 
         mydf = df.FFTDF(cell_orth)
@@ -232,8 +244,8 @@ def test_eval_rhoG_orth_kpts(self):
 
     def test_eval_rhoG_orth_gga(self):
         mydf = multigrid.MultiGridFFTDF(cell_orth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
+                                              rhog_high_order=True)
 
         mydf = df.FFTDF(cell_orth)
         ni = dft.numint.KNumInt()
@@ -245,8 +257,8 @@ def test_eval_rhoG_orth_gga(self):
 
     def test_eval_rhoG_nonorth_gga(self):
         mydf = multigrid.MultiGridFFTDF(cell_nonorth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
+                                              rhog_high_order=True)
 
         mydf = df.FFTDF(cell_nonorth)
         ni = dft.numint.KNumInt()
@@ -273,7 +285,7 @@ def test_gen_rhf_response(self):
                                     hermi=1, kpts=kpts)
         vj = mydf.get_jk(dm1, with_k=False, kpts=kpts)[0]
         ref += vj
-        v = multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 8)
@@ -282,7 +294,7 @@ def test_gen_rhf_response(self):
         ref = dft.numint.nr_rks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                     hermi=1, kpts=kpts)
         ref += vj
-        v = multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 6)
@@ -356,7 +368,7 @@ def test_nr_rks_fxc_st(self):
         mf.xc = 'b88,'
         ref = dft.numint.nr_rks_fxc_st(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                        singlet=True, kpts=kpts)
-        v = multigrid._gen_rhf_response(mf, dm_he, singlet=True)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, singlet=True)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 5)
@@ -364,7 +376,7 @@ def test_nr_rks_fxc_st(self):
         mf.xc = 'lda,'
         ref = dft.numint.nr_rks_fxc_st(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                        singlet=False, kpts=kpts)
-        v = multigrid._gen_rhf_response(mf, dm_he, singlet=False)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, singlet=False)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 4)
@@ -391,7 +403,7 @@ def test_gen_uhf_response(self):
         ref = dft.numint.nr_uks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1, hermi=1)
         vj = mydf.get_jk(dm1, with_k=False)[0]
         ref += vj[0] + vj[1]
-        v = multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 7)
@@ -399,7 +411,7 @@ def test_gen_uhf_response(self):
         mf.xc = 'b88,'
         ref = dft.numint.nr_uks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1, hermi=1)
         ref += vj[0] + vj[1]
-        v = multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 7)
@@ -454,11 +466,11 @@ def test_orth_uks_fxc_hermi0(self):
 
     def test_rcut_vs_ke_cut(self):
         xc = 'lda,'
-        with lib.temporary_env(multigrid, TASKS_TYPE='rcut'):
+        with lib.temporary_env(multigrid.multigrid, TASKS_TYPE='rcut'):
             mg_df = multigrid.MultiGridFFTDF(cell_orth)
             n1, exc1, v1 = multigrid.nr_rks(mg_df, xc, dm1, kpts=kpts)
             self.assertEqual(len(mg_df.tasks), 3)
-        with lib.temporary_env(multigrid, TASKS_TYPE='ke_cut'):
+        with lib.temporary_env(multigrid.multigrid, TASKS_TYPE='ke_cut'):
             mg_df = multigrid.MultiGridFFTDF(cell_orth)
             n2, exc2, v2 = multigrid.nr_rks(mg_df, xc, dm1, kpts=kpts)
             self.assertEqual(len(mg_df.tasks), 6)
diff --git a/pyscf/pbc/dft/test/test_multigrid2.py b/pyscf/pbc/dft/test/test_multigrid2.py
new file mode 100644
index 0000000000..f23c687a48
--- /dev/null
+++ b/pyscf/pbc/dft/test/test_multigrid2.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import unittest
+import numpy
+from pyscf.pbc import gto, dft
+from pyscf.pbc.dft import multigrid
+from pyscf.pbc.grad import rks as rks_grad
+from pyscf.pbc.grad import uks as uks_grad
+from pyscf.pbc.grad import krks as krks_grad
+
+def setUpModule():
+    global cell
+    cell = gto.Cell()
+    boxlen = 5.0
+    cell.a = numpy.array([[boxlen,0.0,0.0],
+                          [0.0,boxlen,0.0],
+                          [0.0,0.0,boxlen]])
+    cell.atom = """
+        O          1.84560        1.21649        1.10372
+        H          2.30941        1.30070        1.92953
+        H          0.91429        1.26674        1.28886
+    """
+    cell.basis = 'gth-szv'
+    cell.ke_cutoff = 200
+    cell.pseudo = 'gth-pade'
+    cell.verbose = 0
+    cell.use_loose_rcut = True
+    cell.build()
+
+def tearDownModule():
+    global cell
+    del cell
+
+def _fftdf_energy_grad(cell, xc):
+    mf = dft.KRKS(cell, kpts=numpy.zeros((1,3)))
+    mf.xc = xc
+    e = mf.kernel()
+    grad = krks_grad.Gradients(mf)
+    g = grad.kernel()
+    return e, g
+
+def _multigrid2_energy_grad(cell, xc, spin=0):
+    if spin == 0:
+        mf = dft.RKS(cell)
+    elif spin == 1:
+        mf = dft.UKS(cell)
+    mf.xc =  xc
+    mf.with_df = multigrid.MultiGridFFTDF2(cell)
+    e = mf.kernel()
+    if spin == 0:
+        g = rks_grad.Gradients(mf).kernel()
+    elif spin == 1:
+        g = uks_grad.Gradients(mf).kernel()
+    return e, g
+
+class KnownValues(unittest.TestCase):
+    def test_orth_lda(self):
+        xc = 'lda, vwn'
+        e0, g0 = _fftdf_energy_grad(cell, xc)
+        e,  g  = _multigrid2_energy_grad(cell, xc, 0)
+        e1, g1 = _multigrid2_energy_grad(cell, xc, 1)
+        assert abs(e-e0) < 1e-8
+        assert abs(e1-e0) < 1e-8
+        assert abs(g-g0).max() < 2e-5
+        assert abs(g1-g0).max() < 2e-5
+
+    def test_orth_gga(self):
+        xc = 'pbe, pbe'
+        e0, g0 = _fftdf_energy_grad(cell, xc)
+        e,  g  = _multigrid2_energy_grad(cell, xc, 0)
+        e1, g1 = _multigrid2_energy_grad(cell, xc, 1)
+        assert abs(e-e0) < 1e-6
+        assert abs(e1-e0) < 1e-6
+        assert abs(g-g0).max() < 1e-4
+        assert abs(g1-g0).max() < 1e-4
+
+if __name__ == '__main__':
+    print("Full Tests for multigrid2")
+    unittest.main()
diff --git a/pyscf/pbc/dft/uks.py b/pyscf/pbc/dft/uks.py
index de72d6452d..20d8d14c71 100644
--- a/pyscf/pbc/dft/uks.py
+++ b/pyscf/pbc/dft/uks.py
@@ -57,7 +57,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
                                        kpt.reshape(1,3), kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -86,7 +86,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpt, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/grad/__init__.py b/pyscf/pbc/grad/__init__.py
index 5408a1eb50..e308bca1e1 100644
--- a/pyscf/pbc/grad/__init__.py
+++ b/pyscf/pbc/grad/__init__.py
@@ -19,7 +19,10 @@
 '''
 Analytical nuclear gradients for PBC
 '''
-
+from pyscf.pbc.grad import rhf
+from pyscf.pbc.grad import rks
+from pyscf.pbc.grad import uhf
+from pyscf.pbc.grad import uks
 from pyscf.pbc.grad import krhf
 from pyscf.pbc.grad import kuhf
 from pyscf.pbc.grad import krks
@@ -30,4 +33,4 @@
 from pyscf.pbc.grad.krks import Gradients as KRKS
 from pyscf.pbc.grad.kuks import Gradients as KUKS
 
-grad_nuc = krhf.grad_nuc
+grad_nuc = rhf.grad_nuc
diff --git a/pyscf/pbc/grad/krhf.py b/pyscf/pbc/grad/krhf.py
index 9fd628882f..0dd6a171e4 100644
--- a/pyscf/pbc/grad/krhf.py
+++ b/pyscf/pbc/grad/krhf.py
@@ -211,6 +211,10 @@ def hcore_deriv(atm_id):
 def grad_nuc(cell, atmlst):
     '''
     Derivatives of nuclear repulsion energy wrt nuclear coordinates
+
+    Notes:
+        An optimized version of this function is available in
+        `pbc.gto.ewald_methods.ewald_nuc_grad`
     '''
     chargs = cell.atom_charges()
     ew_eta, ew_cut = cell.get_ewald_params()
@@ -244,12 +248,14 @@ def grad_nuc(cell, atmlst):
     absG2[absG2==0] = 1e200
     ewg_grad = np.zeros([natom,3])
     SI = cell.get_SI(Gv)
-    if cell.low_dim_ft_type is None or cell.dimension == 3:
+    if cell.dimension != 2 or cell.low_dim_ft_type == 'inf_vacuum':
         coulG = 4*np.pi / absG2
         coulG *= weights
         ZSI = np.einsum("i,ij->j", chargs, SI)
         ZexpG2 = coulG * np.exp(-absG2/(4*ew_eta**2))
         ZexpG2_mod = ZexpG2.reshape(len(ZexpG2),1) * Gv
+    else:
+        raise NotImplementedError
     for i, qi in enumerate(chargs):
         Zfac = np.imag(ZSI * SI[i].conj()) * qi
         ewg_grad[i] = - np.sum(Zfac.reshape((len(Zfac),1)) * ZexpG2_mod, axis = 0)
diff --git a/pyscf/pbc/grad/rhf.py b/pyscf/pbc/grad/rhf.py
new file mode 100644
index 0000000000..720451b719
--- /dev/null
+++ b/pyscf/pbc/grad/rhf.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.grad import rhf as mol_rhf
+from pyscf.grad.rhf import _write
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+SCREEN_VHF_DM_CONTRA = getattr(__config__, 'pbc_rhf_grad_screen_vhf_dm_contract', True)
+libpbc = lib.load_library('libpbc')
+
+def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None,
+              atmlst=None, kpt=np.zeros(3)):
+    mf = mf_grad.base
+    mol = mf_grad.mol
+    if mo_energy is None: mo_energy = mf.mo_energy
+    if mo_occ is None:    mo_occ = mf.mo_occ
+    if mo_coeff is None:  mo_coeff = mf.mo_coeff
+    log = logger.Logger(mf_grad.stdout, mf_grad.verbose)
+
+    s1 = mf_grad.get_ovlp(mol, kpt)
+    dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+    log.debug('Computing Gradients of NR-HF Coulomb repulsion')
+    vhf = mf_grad.get_veff(mol, dm0, kpt)
+    log.timer('gradients of 2e part', *t0)
+
+    dme0 = mf_grad.make_rdm1e(mo_energy, mo_coeff, mo_occ)
+
+    if atmlst is None:
+        atmlst = range(mol.natm)
+
+    de = 0
+    if gamma_point(kpt):
+        de  = mf.with_df.vpploc_part1_nuc_grad(dm0, kpts=kpt.reshape(-1,3))
+        de += pp_int.vpploc_part2_nuc_grad(mol, dm0)
+        de += pp_int.vppnl_nuc_grad(mol, dm0)
+        h1ao = -mol.pbc_intor('int1e_ipkin', kpt=kpt)
+        if getattr(mf.with_df, 'vpplocG_part1', None) is None:
+            h1ao += -mf.with_df.get_vpploc_part1_ip1(kpts=kpt.reshape(-1,3))
+        de += _contract_vhf_dm(mf_grad, np.add(h1ao, vhf), dm0) * 2
+        de += _contract_vhf_dm(mf_grad, s1, dme0) * -2
+        h1ao = s1 = vhf = dm0 = dme0 = None
+        de = de[atmlst]
+    else:
+        raise NotImplementedError
+
+    for k, ia in enumerate(atmlst):
+        de[k] += mf_grad.extra_force(ia, locals())
+
+    if log.verbose >= logger.DEBUG:
+        log.debug('gradients of electronic part')
+        _write(log, mol, de, atmlst)
+    return de
+
+
+def _contract_vhf_dm(mf_grad, vhf, dm, comp=3, atmlst=None,
+                     screen=SCREEN_VHF_DM_CONTRA):
+    from pyscf.gto.mole import ao_loc_nr, ATOM_OF
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+
+    mol = mf_grad.mol
+    natm = mol.natm
+    nbas = mol.nbas
+    shls_slice = np.asarray([0,nbas,0,nbas], order="C", dtype=np.int32)
+    ao_loc = np.asarray(ao_loc_nr(mol), order="C", dtype=np.int32)
+    shls_atm = np.asarray(mol._bas[:,ATOM_OF].copy(), order="C", dtype=np.int32)
+
+    de = np.zeros((natm,comp), order="C")
+    vhf = np.asarray(vhf, order="C")
+    dm = np.asarray(dm, order="C")
+
+    if screen:
+        neighbor_list = build_neighbor_list_for_shlpairs(mol)
+    else:
+        neighbor_list = lib.c_null_ptr()
+    func = getattr(libpbc, "contract_vhf_dm", None)
+    try:
+        func(de.ctypes.data_as(ctypes.c_void_p),
+             vhf.ctypes.data_as(ctypes.c_void_p),
+             dm.ctypes.data_as(ctypes.c_void_p),
+             ctypes.byref(neighbor_list),
+             shls_slice.ctypes.data_as(ctypes.c_void_p),
+             ao_loc.ctypes.data_as(ctypes.c_void_p),
+             shls_atm.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_int(comp), ctypes.c_int(natm),
+             ctypes.c_int(nbas))
+    except RuntimeError:
+        raise
+    free_neighbor_list(neighbor_list)
+
+    if atmlst is not None:
+        de = de[atmlst]
+
+    logger.timer(mf_grad, '_contract_vhf_dm', *t0)
+    return de
+
+
+def get_ovlp(cell, kpt=np.zeros(3)):
+    return -cell.pbc_intor('int1e_ipovlp', kpt=kpt)
+
+
+def get_veff(mf_grad, mol, dm, kpt=np.zeros(3)):
+    mf = mf_grad.base
+    mydf = mf.with_df
+    xc_code = getattr(mf, 'xc', None)
+    kpts = kpt.reshape(-1,3)
+    return -mydf.get_veff_ip1(dm, xc_code=xc_code, kpts=kpts)
+
+
+def grad_nuc(cell, atmlst=None, ew_eta=None, ew_cut=None):
+    from pyscf.pbc.gto import ewald_methods
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+
+    grad = ewald_methods.ewald_nuc_grad(cell, ew_eta, ew_cut)
+    if atmlst is not None:
+        grad = grad[atmlst]
+
+    logger.timer(cell, 'nuclear gradient', *t0)
+    return grad
+
+
+class GradientsBase(mol_rhf.GradientsBase):
+    '''Base class for Gamma-point nuclear gradient'''
+    def grad_nuc(self, mol=None, atmlst=None):
+        if mol is None: mol = self.mol
+        return grad_nuc(mol, atmlst)
+
+    def get_ovlp(self, mol=None, kpt=np.zeros(3)):
+        if mol is None:
+            mol = self.mol
+        return get_ovlp(mol, kpt)
+
+
+class Gradients(GradientsBase):
+    '''Non-relativistic Gamma-point restricted Hartree-Fock gradients'''
+    def get_veff(self, mol=None, dm=None, kpt=np.zeros(3)):
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.base.make_rdm1()
+        return get_veff(self, mol, dm, kpt)
+
+    make_rdm1e = mol_rhf.Gradients.make_rdm1e
+    grad_elec = grad_elec
diff --git a/pyscf/pbc/grad/rks.py b/pyscf/pbc/grad/rks.py
new file mode 100644
index 0000000000..1429050002
--- /dev/null
+++ b/pyscf/pbc/grad/rks.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+from pyscf.pbc.grad import rhf
+
+
+class Gradients(rhf.Gradients):
+    '''Non-relativistic Gamma-point restricted Kohn-Sham DFT gradients'''
+    pass
diff --git a/pyscf/pbc/grad/uhf.py b/pyscf/pbc/grad/uhf.py
new file mode 100644
index 0000000000..fd71aa0920
--- /dev/null
+++ b/pyscf/pbc/grad/uhf.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf.lib import logger
+from pyscf.grad import uhf as mol_uhf
+from pyscf.grad.rhf import _write
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.grad import rhf as rhf_grad
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, kpt=np.zeros(3)):
+    mf = mf_grad.base
+    mol = mf_grad.mol
+    if mo_energy is None: mo_energy = mf.mo_energy
+    if mo_occ is None:    mo_occ = mf.mo_occ
+    if mo_coeff is None:  mo_coeff = mf.mo_coeff
+    log = logger.Logger(mf_grad.stdout, mf_grad.verbose)
+
+    s1 = mf_grad.get_ovlp(mol, kpt)
+    dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+    log.debug('Computing Gradients of NR-HF Coulomb repulsion')
+    vhf = mf_grad.get_veff(mol, dm0, kpt)
+    log.timer('gradients of 2e part', *t0)
+
+    dme0 = mf_grad.make_rdm1e(mo_energy, mo_coeff, mo_occ)
+    dm0_sf = dm0[0] + dm0[1]
+    dme0_sf = dme0[0] + dme0[1]
+
+    if atmlst is None:
+        atmlst = range(mol.natm)
+
+    de = 0
+    if gamma_point(kpt):
+        de  = mf.with_df.vpploc_part1_nuc_grad(dm0_sf, kpts=kpt.reshape(-1,3))
+        de += pp_int.vpploc_part2_nuc_grad(mol, dm0_sf)
+        de += pp_int.vppnl_nuc_grad(mol, dm0_sf)
+        h1ao = -mol.pbc_intor('int1e_ipkin', kpt=kpt)
+        if getattr(mf.with_df, 'vpplocG_part1', None) is None:
+            h1ao += -mf.with_df.get_vpploc_part1_ip1(kpts=kpt.reshape(-1,3))
+        de += rhf_grad._contract_vhf_dm(mf_grad, h1ao, dm0_sf) * 2
+        for s in range(2):
+            de += rhf_grad._contract_vhf_dm(mf_grad, vhf[s], dm0[s]) * 2
+        de += rhf_grad._contract_vhf_dm(mf_grad, s1, dme0_sf) * -2
+        h1ao = s1 = vhf = dm0 = dme0 = dm0_sf = dme0_sf = None
+        de = de[atmlst]
+    else:
+        raise NotImplementedError
+
+    for k, ia in enumerate(atmlst):
+        de[k] += mf_grad.extra_force(ia, locals())
+
+    if log.verbose >= logger.DEBUG:
+        log.debug('gradients of electronic part')
+        _write(log, mol, de, atmlst)
+    return de
+
+def get_veff(mf_grad, mol, dm, kpt=np.zeros(3)):
+    mf = mf_grad.base
+    mydf = mf.with_df
+    xc_code = getattr(mf, 'xc', None)
+    kpts = kpt.reshape(-1,3)
+    return -mydf.get_veff_ip1(dm, xc_code=xc_code, kpts=kpts, spin=1)
+
+class Gradients(rhf_grad.GradientsBase):
+    '''Non-relativistic Gamma-point restricted Hartree-Fock gradients'''
+    def get_veff(self, mol=None, dm=None, kpt=np.zeros(3)):
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.base.make_rdm1()
+        return get_veff(self, mol, dm, kpt)
+
+    make_rdm1e = mol_uhf.Gradients.make_rdm1e
+    grad_elec = grad_elec
diff --git a/pyscf/pbc/grad/uks.py b/pyscf/pbc/grad/uks.py
new file mode 100644
index 0000000000..4a6ce67c1a
--- /dev/null
+++ b/pyscf/pbc/grad/uks.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+from pyscf.pbc.grad import uhf
+
+
+class Gradients(uhf.Gradients):
+    '''Non-relativistic Gamma-point unrestricted Kohn-Sham DFT gradients'''
+    pass
diff --git a/pyscf/pbc/gto/__init__.py b/pyscf/pbc/gto/__init__.py
index dcaaddebbc..769b76c616 100644
--- a/pyscf/pbc/gto/__init__.py
+++ b/pyscf/pbc/gto/__init__.py
@@ -22,6 +22,7 @@
 from pyscf.pbc.gto.basis import parse, load, parse_ecp, load_ecp
 from pyscf.pbc.gto import pseudo
 from pyscf.pbc.gto.cell import *
+from pyscf.pbc.gto.neighborlist import *
 
 parse_pp = parsepp = pseudo.parse
 load_pp = loadpp = pseudo.load
diff --git a/pyscf/pbc/gto/_pbcintor.py b/pyscf/pbc/gto/_pbcintor.py
index f721eb0304..c5b921b2e0 100644
--- a/pyscf/pbc/gto/_pbcintor.py
+++ b/pyscf/pbc/gto/_pbcintor.py
@@ -33,15 +33,21 @@ def __init__(self, cell):
 
     def init_rcut_cond(self, cell, precision=None):
         if precision is None: precision = cell.precision
-        rcut = numpy.array([cell.bas_rcut(ib, precision)
-                            for ib in range(cell.nbas)])
+        if cell.use_loose_rcut:
+            rcut = cell.rcut_by_shells(precision)
+            fn_set_rcut_cond = getattr(libpbc, 'PBCset_rcut_cond_loose')
+        else:
+            rcut = numpy.array([cell.bas_rcut(ib, precision)
+                                for ib in range(cell.nbas)])
+            fn_set_rcut_cond = getattr(libpbc, 'PBCset_rcut_cond')
+
         natm = ctypes.c_int(cell._atm.shape[0])
         nbas = ctypes.c_int(cell._bas.shape[0])
-        libpbc.PBCset_rcut_cond(self._this,
-                                rcut.ctypes.data_as(ctypes.c_void_p),
-                                cell._atm.ctypes.data_as(ctypes.c_void_p), natm,
-                                cell._bas.ctypes.data_as(ctypes.c_void_p), nbas,
-                                cell._env.ctypes.data_as(ctypes.c_void_p))
+        fn_set_rcut_cond(self._this,
+                         rcut.ctypes.data_as(ctypes.c_void_p),
+                         cell._atm.ctypes.data_as(ctypes.c_void_p), natm,
+                         cell._bas.ctypes.data_as(ctypes.c_void_p), nbas,
+                         cell._env.ctypes.data_as(ctypes.c_void_p))
         return self
 
     def del_rcut_cond(self):
@@ -56,4 +62,5 @@ def __del__(self):
 
 class _CPBCOpt(ctypes.Structure):
     _fields_ = [('rrcut', ctypes.c_void_p),
+                ('rcut', ctypes.c_void_p),
                 ('fprescreen', ctypes.c_void_p)]
diff --git a/pyscf/pbc/gto/cell.py b/pyscf/pbc/gto/cell.py
index 87282fbfd4..872fda36e2 100644
--- a/pyscf/pbc/gto/cell.py
+++ b/pyscf/pbc/gto/cell.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,6 +41,9 @@
 WITH_GAMMA = getattr(__config__, 'pbc_gto_cell_make_kpts_with_gamma', True)
 EXP_DELIMITER = getattr(__config__, 'pbc_gto_cell_split_basis_exp_delimiter',
                         [1.0, 0.5, 0.25, 0.1, 0])
+# defined in lib/pbc/cell.h
+RCUT_EPS = 1e-3
+RCUT_MAX_CYCLE = 10
 
 libpbc = _pbcintor.libpbc
 
@@ -281,6 +284,89 @@ def intor_cross(intor, cell1, cell2, comp=None, hermi=0, kpts=None, kpt=None,
         mat = mat[0]
     return mat
 
+def _intor_cross_screened(
+        intor, cell1, cell2, comp=None, hermi=0, kpts=None, kpt=None,
+        shls_slice=None, **kwargs):
+    '''`intor_cross` with prescreening.
+
+    Notes:
+         This function may be subject to change.
+    '''
+    from pyscf.pbc.gto.neighborlist import NeighborListOpt
+    intor, comp = moleintor._get_intor_and_comp(cell1._add_suffix(intor), comp)
+
+    if kpts is None:
+        if kpt is not None:
+            kpts_lst = np.reshape(kpt, (1,3))
+        else:
+            kpts_lst = np.zeros((1,3))
+    else:
+        kpts_lst = np.reshape(kpts, (-1,3))
+    nkpts = len(kpts_lst)
+
+    pcell = cell1.copy(deep=False)
+    pcell.precision = min(cell1.precision, cell2.precision)
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = conc_env(cell1._atm, cell1._bas, cell1._env,
+                                     cell2._atm, cell2._bas, cell2._env)
+    if shls_slice is None:
+        shls_slice = (0, cell1.nbas, 0, cell2.nbas)
+    i0, i1, j0, j1 = shls_slice[:4]
+    j0 += cell1.nbas
+    j1 += cell1.nbas
+    ao_loc = moleintor.make_loc(bas, intor)
+    ni = ao_loc[i1] - ao_loc[i0]
+    nj = ao_loc[j1] - ao_loc[j0]
+    out = np.empty((nkpts,comp,ni,nj), dtype=np.complex128)
+
+    if hermi == 0:
+        aosym = 's1'
+    else:
+        aosym = 's2'
+    fill = getattr(libpbc, 'PBCnr2c_screened_fill_k'+aosym)
+    fintor = getattr(moleintor.libcgto, intor)
+    drv = libpbc.PBCnr2c_screened_drv
+
+    rcut = max(cell1.rcut, cell2.rcut)
+    Ls = cell1.get_lattice_Ls(rcut=rcut)
+    expkL = np.asarray(np.exp(1j*np.dot(kpts_lst, Ls.T)), order='C')
+
+    neighbor_list = kwargs.get('neighbor_list', None)
+    if neighbor_list is None:
+        nlopt = NeighborListOpt(cell1)
+        nlopt.build(cell1, cell2, Ls, set_optimizer=False)
+        neighbor_list = nlopt.nl
+
+    cintopt = lib.c_null_ptr()
+
+    drv(fintor, fill, out.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int(nkpts), ctypes.c_int(comp), ctypes.c_int(len(Ls)),
+        Ls.ctypes.data_as(ctypes.c_void_p),
+        expkL.ctypes.data_as(ctypes.c_void_p),
+        (ctypes.c_int*4)(i0, i1, j0, j1),
+        ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt,
+        atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(pcell.natm),
+        bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(pcell.nbas),
+        env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+        ctypes.byref(neighbor_list))
+
+    nlopt = None
+
+    mat = []
+    for k, kpt in enumerate(kpts_lst):
+        v = out[k]
+        if hermi != 0:
+            for ic in range(comp):
+                lib.hermi_triu(v[ic], hermi=hermi, inplace=True)
+        if comp == 1:
+            v = v[0]
+        if abs(kpt).sum() < 1e-9:  # gamma_point
+            v = v.real
+        mat.append(v)
+
+    if kpts is None or np.shape(kpts) == (3,):  # A single k-point
+        mat = mat[0]
+    return mat
 
 def get_nimgs(cell, precision=None):
     r'''Choose number of basis function images in lattice sums
@@ -339,6 +425,9 @@ def estimate_rcut(cell, precision=None):
         return 0.01
     if precision is None:
         precision = cell.precision
+    if cell.use_loose_rcut:
+        return cell.rcut_by_shells(precision).max()
+
     exps, cs = _extract_pgto_params(cell, 'min')
     ls = cell._bas[:,mole.ANG_OF]
     rcut = _estimate_rcut(exps, ls, cs, precision)
@@ -491,7 +580,24 @@ def get_Gv_weights(cell, mesh=None, **kwargs):
             weights = np.einsum('i,k->ik', wxy, wz).reshape(-1)
 
     Gvbase = (rx, ry, rz)
-    Gv = np.dot(lib.cartesian_prod(Gvbase), b)
+
+    #:Gv = np.dot(lib.cartesian_prod(Gvbase), b)
+    # NOTE mesh can be different from the input mesh
+    mesh = np.asarray((len(rx),len(ry),len(rz)), dtype=np.int32)
+    Gv = np.empty((*mesh,3), order='C', dtype=float)
+    b = np.asarray(b, order='C')
+    rx = np.asarray(rx, order='C')
+    ry = np.asarray(ry, order='C')
+    rz = np.asarray(rz, order='C')
+    fn = libpbc.get_Gv
+    fn(Gv.ctypes.data_as(ctypes.c_void_p),
+       rx.ctypes.data_as(ctypes.c_void_p),
+       ry.ctypes.data_as(ctypes.c_void_p),
+       rz.ctypes.data_as(ctypes.c_void_p),
+       mesh.ctypes.data_as(ctypes.c_void_p),
+       b.ctypes.data_as(ctypes.c_void_p))
+    Gv = Gv.reshape(-1, 3)
+
     # 1/cell.vol == det(b)/(2pi)^3
     weights *= 1/(2*np.pi)**3
     return Gv, Gvbase, weights
@@ -504,7 +610,7 @@ def _non_uniform_Gv_base(n):
     #return np.hstack((0,rs,-rs[::-1])), np.hstack((0,ws,ws[::-1]))
     return np.hstack((rs,-rs[::-1])), np.hstack((ws,ws[::-1]))
 
-def get_SI(cell, Gv=None, mesh=None):
+def get_SI(cell, Gv=None, mesh=None, atmlst=None):
     '''Calculate the structure factor (0D, 1D, 2D, 3D) for all atoms; see MH (3.34).
 
     Args:
@@ -513,11 +619,16 @@ def get_SI(cell, Gv=None, mesh=None):
         Gv : (N,3) array
             G vectors
 
+        atmlst : list of ints, optional
+            Indices of atoms for which the structure factors are computed.
+
     Returns:
         SI : (natm, ngrids) ndarray, dtype=np.complex128
             The structure factor for each atom at each G-vector.
     '''
     coords = cell.atom_coords()
+    if atmlst is not None:
+        coords = coords[np.asarray(atmlst)]
     if Gv is None:
         if mesh is None:
             mesh = cell.mesh
@@ -598,6 +709,10 @@ def ewald(cell, ew_eta=None, ew_cut=None):
     if cell.natm == 0:
         return 0
 
+    if cell.dimension == 3 and cell.use_particle_mesh_ewald:
+        from pyscf.pbc.gto import ewald_methods
+        return ewald_methods.particle_mesh_ewald(cell, ew_eta, ew_cut)
+
     chargs = cell.atom_charges()
 
     if ew_eta is None or ew_cut is None:
@@ -639,7 +754,16 @@ def ewald(cell, ew_eta=None, ew_cut=None):
         # have relatively large error
         coulG = 4*np.pi / absG2
         coulG *= weights
-        ZSI = np.einsum("i,ij->j", chargs, cell.get_SI(Gv))
+
+        #:ZSI = np.einsum('i,ij->j', chargs, cell.get_SI(Gv))
+        ngrids = len(Gv)
+        ZSI = np.empty((ngrids,), dtype=np.complex128)
+        mem_avail = cell.max_memory - lib.current_memory()[0]
+        blksize = int((mem_avail*1e6 - cell.natm*24)/((3+cell.natm*2)*8))
+        blksize = min(ngrids, max(mesh[2], blksize))
+        for ig0, ig1 in lib.prange(0, ngrids, blksize):
+            np.einsum('i,ij->j', chargs, cell.get_SI(Gv[ig0:ig1]), out=ZSI[ig0:ig1])
+
         ZexpG2 = ZSI * np.exp(-absG2/(4*ew_eta**2))
         ewg = .5 * np.einsum('i,i,i', ZSI.conj(), ZexpG2, coulG).real
 
@@ -835,6 +959,59 @@ def _mesh_inf_vaccum(cell):
     # meshz has to be even number due to the symmetry on z+ and z-
     return int(meshz*.5 + .999) * 2
 
+def pgf_rcut(l, alpha, coeff, precision=INTEGRAL_PRECISION,
+             rcut=0, max_cycle=RCUT_MAX_CYCLE, eps=RCUT_EPS):
+    '''Estimate the cutoff radii of primitive Gaussian functions
+    based on their values in real space:
+    `c*rcut^(l+2)*exp(-alpha*rcut^2) ~ precision`.
+    '''
+    c = np.log(coeff / precision)
+
+    rmin = np.sqrt(.5 * (l+2) / alpha) * 2
+    eps = np.minimum(rmin/10, eps)
+    rcut = np.maximum(rcut, rmin+eps)
+    for i in range(max_cycle):
+        rcut_last = rcut
+        rcut = np.sqrt(((l+2) * np.log(rcut) + c) / alpha)
+        if np.all(abs(rcut - rcut_last) < eps):
+            return rcut
+    warnings.warn(f'cell.pgf_rcut failed to converge in {max_cycle} cycles.')
+    return rcut
+
+def rcut_by_shells(cell, precision=None, rcut=0,
+                   return_pgf_radius=False):
+    '''Compute shell and primitive gaussian function radii.
+    '''
+    # TODO the internal implementation loops over all shells,
+    # which can be optimized to loop over atom types.
+    if precision is None:
+        precision = cell.precision
+
+    bas = np.asarray(cell._bas, order='C')
+    env = np.asarray(cell._env, order='C')
+    nbas = len(bas)
+    shell_radius = np.empty((nbas,), order='C', dtype=float)
+    if return_pgf_radius:
+        nprim = bas[:,mole.NPRIM_OF].max()
+        # be careful that the unused memory blocks are not initialized
+        pgf_radius = np.empty((nbas,nprim), order='C', dtype=np.double)
+        ptr_pgf_radius = lib.ndarray_pointer_2d(pgf_radius)
+    else:
+        ptr_pgf_radius = lib.c_null_ptr()
+    fn = getattr(libpbc, 'rcut_by_shells', None)
+    try:
+        fn(shell_radius.ctypes.data_as(ctypes.c_void_p),
+           ptr_pgf_radius,
+           bas.ctypes.data_as(ctypes.c_void_p),
+           env.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(nbas), ctypes.c_double(rcut),
+           ctypes.c_double(precision))
+    except Exception as e:
+        raise RuntimeError(f'Failed to get shell radii.\n{e}')
+    if return_pgf_radius:
+        return shell_radius, pgf_radius
+    return shell_radius
+
 
 class Cell(mole.MoleBase):
     '''A Cell object holds the basic information of a crystal.
@@ -864,6 +1041,14 @@ class Cell(mole.MoleBase):
             infinity vacuum (inf_vacuum) or truncated Coulomb potential
             (analytic_2d_1). Unless explicitly specified, analytic_2d_1 is
             used for 2D system and inf_vacuum is assumed for 1D and 0D.
+        use_loose_rcut : bool
+            If set to True, a loose `rcut` determined by shell radius is used,
+            which is usually accurate enough for pure DFT calculations;
+            otherwise, a tight `rcut` determined by overlap integral is used.
+            Default value is False. Has no effect if `rcut` is set manually.
+        use_particle_mesh_ewald : bool
+            If set to True, use particle-mesh Ewald to compute the nuclear repulsion.
+            Default value is False, meaning to use classical Ewald summation.
         space_group_symmetry : bool
             Whether to consider space group symmetry. Default is False.
         symmorphic : bool
@@ -892,6 +1077,7 @@ class Cell(mole.MoleBase):
         'precision', 'exp_to_discard',
         'a', 'ke_cutoff', 'pseudo', 'dimension', 'low_dim_ft_type',
         'space_group_symmetry', 'symmorphic', 'lattice_symmetry', 'mesh', 'rcut',
+        'use_loose_rcut', 'use_particle_mesh_ewald',
     }
 
     def __init__(self, **kwargs):
@@ -906,6 +1092,8 @@ def __init__(self, **kwargs):
         #       density-fitting class.  This determines how the ewald produces
         #       its energy.
         self.low_dim_ft_type = None
+        self.use_loose_rcut = False
+        self.use_particle_mesh_ewald = False
         self.space_group_symmetry = False
         self.symmorphic = False
         self.lattice_symmetry = None
@@ -1082,7 +1270,9 @@ def build_lattice_symmetry(self, check_mesh_symmetry=True):
     def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
               a=None, mesh=None, ke_cutoff=None, precision=None, nimgs=None,
               h=None, dimension=None, rcut= None, low_dim_ft_type=None,
-              space_group_symmetry=None, symmorphic=None, *args, **kwargs):
+              space_group_symmetry=None, symmorphic=None,
+              use_loose_rcut=None, use_particle_mesh_ewald=None,
+              *args, **kwargs):
         '''Setup Mole molecule and Cell and initialize some control parameters.
         Whenever you change the value of the attributes of :class:`Cell`,
         you need call this function to refresh the internal data of Cell.
@@ -1133,6 +1323,10 @@ def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
         if rcut is not None: self.rcut = rcut
         if ke_cutoff is not None: self.ke_cutoff = ke_cutoff
         if low_dim_ft_type is not None: self.low_dim_ft_type = low_dim_ft_type
+        if use_loose_rcut is not None:
+            self.use_loose_rcut = use_loose_rcut
+        if use_particle_mesh_ewald is not None:
+            self.use_particle_mesh_ewald = use_particle_mesh_ewald
         if space_group_symmetry is not None:
             self.space_group_symmetry = space_group_symmetry
         if symmorphic is not None:
@@ -1265,7 +1459,7 @@ def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
             logger.info(self, 'Cell volume = %g', self.vol)
             # Check atoms coordinates
             if self.dimension > 0 and self.natm > 0:
-                scaled_atom_coords = np.linalg.solve(_a.T, self.atom_coords().T).T
+                scaled_atom_coords = self.get_scaled_atom_coords(_a)
                 atom_boundary_max = scaled_atom_coords[:,:self.dimension].max(axis=0)
                 atom_boundary_min = scaled_atom_coords[:,:self.dimension].min(axis=0)
                 if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)):
@@ -1367,13 +1561,12 @@ def lattice_vectors(self):
         else:
             return a/self.unit
 
-    def get_scaled_positions(self):
-        ''' Get scaled atom positions.
+    def get_scaled_atom_coords(self, a=None):
+        ''' Get scaled atomic coordinates.
         '''
-        a = self.lattice_vectors()
-        atm_pos = self.atom_coords()
-        scaled_atm_pos = np.dot(atm_pos,np.linalg.inv(a))
-        return scaled_atm_pos
+        if a is None:
+            a = self.lattice_vectors()
+        return np.dot(self.atom_coords(), np.linalg.inv(a))
 
     def reciprocal_vectors(self, norm_to=2*np.pi):
         r'''
@@ -1475,6 +1668,7 @@ def loads_(self, molstr):
         return self
 
     bas_rcut = bas_rcut
+    rcut_by_shells = rcut_by_shells
 
     get_lattice_Ls = pbctools.get_lattice_Ls
 
@@ -1511,6 +1705,10 @@ def pbc_intor(self, intor, comp=None, hermi=0, kpts=None, kpt=None,
             # FIXME: Whether to check _built and call build?  ._bas and .basis
             # may not be consistent. calling .build() may leads to wrong intor env.
             #self.build(False, False)
+        if self.use_loose_rcut:
+            return _intor_cross_screened(
+                            intor, self, self, comp, hermi, kpts, kpt,
+                            shls_slice, **kwargs)
         return intor_cross(intor, self, self, comp, hermi, kpts, kpt,
                            shls_slice, **kwargs)
 
@@ -1551,6 +1749,7 @@ def to_mol(self):
         mol = self.view(mole.Mole)
         delattr(mol, 'a')
         delattr(mol, '_mesh')
+        mol.enuc = None #reset nuclear energy
         if mol.symmetry:
             mol._build_symmetry()
         return mol
diff --git a/pyscf/pbc/gto/ewald_methods.py b/pyscf/pbc/gto/ewald_methods.py
new file mode 100644
index 0000000000..75d028a564
--- /dev/null
+++ b/pyscf/pbc/gto/ewald_methods.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+import scipy
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.gto import mole
+from pyscf.pbc import tools
+
+libpbc = lib.load_library('libpbc')
+
+INTERPOLATION_ORDER = getattr(__config__, 'pyscf_pbc_ewald_bspline_order', 10)
+
+def _bspline(u, n=4):
+    fac = 1. / scipy.special.factorial(n-1)
+    M = 0
+    for k in range(n+1):
+        fac1 = ((-1)**k) * scipy.special.binom(n, k)
+        M += fac1 * ((np.maximum(u-k, 0)) ** (n-1))
+    M *= fac
+    return M
+
+def _bspline_grad(u, n=4):
+    r'''
+    ... math::
+        \frac{dM}{du} = M_{n-1}(u) - M_{n-1}(u-1)
+    '''
+    dMdu = _bspline(u, n-1) - _bspline(u-1, n-1)
+    return dMdu
+
+def bspline(u, ng, n=4, deriv=0):
+    u = np.asarray(u).ravel()
+    u_floor = np.floor(u)
+    delta = u - u_floor
+    idx = []
+    val = []
+    for i in range(n):
+        idx.append(np.rint((u_floor - i) % ng).astype(int))
+        val.append(delta + i)
+
+    M = np.zeros((u.size, ng))
+    for i in range(n):
+        M[np.arange(u.size),idx[i]] += _bspline(val[i], n)
+
+    if deriv > 0:
+        if deriv > 1:
+            raise NotImplementedError
+        dM = np.zeros((u.size, ng))
+        for i in range(n):
+            dM[np.arange(u.size),idx[i]] += _bspline_grad(val[i], n)
+        M = [M, dM]
+
+    m = np.arange(ng)
+    b = np.exp(2*np.pi*1j*(n-1)*m/ng)
+    tmp = 0
+    for k in range(n-1):
+        tmp += _bspline(k+1, n) * np.exp(2*np.pi*1j*m*k/ng)
+    b /= tmp
+    if n % 2 > 0 and ng % 2 == 0 :
+        b[ng//2] = 0
+    return M, b, idx
+
+def _get_ewald_direct(cell, ew_eta=None, ew_cut=None):
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+
+    chargs = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    coords = np.asarray(cell.atom_coords(), order='C')
+    Lall = np.asarray(cell.get_lattice_Ls(rcut=ew_cut), order='C')
+
+    natm = len(chargs)
+    nL = len(Lall)
+    ewovrl = np.zeros([1])
+    fun = getattr(libpbc, "get_ewald_direct")
+    fun(ewovrl.ctypes.data_as(ctypes.c_void_p),
+        chargs.ctypes.data_as(ctypes.c_void_p),
+        coords.ctypes.data_as(ctypes.c_void_p),
+        Lall.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_double(ew_eta), ctypes.c_double(ew_cut),
+        ctypes.c_int(natm), ctypes.c_int(nL))
+    return ewovrl[0]
+
+def _get_ewald_direct_nuc_grad(cell, ew_eta=None, ew_cut=None):
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+
+    chargs = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    coords = np.asarray(cell.atom_coords(), order='C')
+    Lall = np.asarray(cell.get_lattice_Ls(rcut=ew_cut), order='C')
+
+    natm = len(chargs)
+    nL = len(Lall)
+    grad = np.zeros([natm,3], order='C', dtype=float)
+    fun = getattr(libpbc, "get_ewald_direct_nuc_grad")
+    fun(grad.ctypes.data_as(ctypes.c_void_p),
+        chargs.ctypes.data_as(ctypes.c_void_p),
+        coords.ctypes.data_as(ctypes.c_void_p),
+        Lall.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_double(ew_eta), ctypes.c_double(ew_cut),
+        ctypes.c_int(natm), ctypes.c_int(nL))
+    return grad
+
+
+# FIXME The default interpolation order may be too high
+def particle_mesh_ewald(cell, ew_eta=None, ew_cut=None,
+                        order=INTERPOLATION_ORDER):
+    if cell.dimension != 3:
+        raise NotImplementedError("Particle mesh ewald only works for 3D.")
+
+    chargs = cell.atom_charges()
+    coords = cell.atom_coords()
+    natm = len(coords)
+
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    ke_cutoff = -2*ew_eta**2*log_precision
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+
+    ewovrl = _get_ewald_direct(cell, ew_eta, ew_cut)
+    ewself  = -.5 * np.dot(chargs,chargs) * 2 * ew_eta / np.sqrt(np.pi)
+    if cell.dimension == 3:
+        ewself += -.5 * np.sum(chargs)**2 * np.pi/(ew_eta**2 * cell.vol)
+
+    b = cell.reciprocal_vectors(norm_to=1)
+    u = np.dot(coords, b.T) * mesh[None,:]
+
+    Mx, bx, idx = bspline(u[:,0], mesh[0], order)
+    My, by, idy = bspline(u[:,1], mesh[1], order)
+    Mz, bz, idz = bspline(u[:,2], mesh[2], order)
+
+    idx = np.asarray(idx).T
+    idy = np.asarray(idy).T
+    idz = np.asarray(idz).T
+    Mx_s = Mx[np.arange(natm)[:,None], idx]
+    My_s = My[np.arange(natm)[:,None], idy]
+    Mz_s = Mz[np.arange(natm)[:,None], idz]
+
+    #:Q = np.einsum('i,ix,iy,iz->xyz', chargs, Mx, My, Mz)
+    Q = np.zeros([*mesh])
+    for ia in range(len(chargs)):
+        Q_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], Mz_s[ia])
+        Q[np.ix_(idx[ia], idy[ia], idz[ia])] += chargs[ia] * Q_s
+
+    B = np.einsum('x,y,z->xyz', bx*bx.conj(), by*by.conj(), bz*bz.conj())
+
+    Gv, Gvbase, weights = cell.get_Gv_weights(mesh)
+    absG2 = np.einsum('ix,ix->i', Gv, Gv)
+    absG2[absG2==0] = 1e200
+    coulG = 4*np.pi / absG2
+    C = weights * coulG * np.exp(-absG2/(4*ew_eta**2))
+    C = C.reshape(*mesh)
+
+    Q_ifft = tools.ifft(Q, mesh).reshape(*mesh)
+    tmp = tools.fft(B * C * Q_ifft, mesh).real.reshape(*mesh)
+    ewg = 0.5 * np.prod(mesh) * np.einsum('xyz,xyz->', Q, tmp)
+
+    logger.debug(cell, 'Ewald components = %.15g, %.15g, %.15g', ewovrl, ewself, ewg)
+    return ewovrl + ewself + ewg
+
+def particle_mesh_ewald_nuc_grad(cell, ew_eta=None, ew_cut=None,
+                                 order=INTERPOLATION_ORDER):
+    if cell.dimension != 3:
+        raise NotImplementedError("Particle mesh ewald only works for 3D.")
+
+    chargs = cell.atom_charges()
+    coords = cell.atom_coords()
+
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    ke_cutoff = -2*ew_eta**2*log_precision
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+
+    grad_dir = _get_ewald_direct_nuc_grad(cell, ew_eta, ew_cut)
+
+    b = cell.reciprocal_vectors(norm_to=1)
+    u = np.dot(coords, b.T) * mesh[None,:]
+
+    [Mx, dMx], bx, idx = bspline(u[:,0], mesh[0], order, deriv=1)
+    [My, dMy], by, idy = bspline(u[:,1], mesh[1], order, deriv=1)
+    [Mz, dMz], bz, idz = bspline(u[:,2], mesh[2], order, deriv=1)
+
+    idx = np.asarray(idx).T
+    idy = np.asarray(idy).T
+    idz = np.asarray(idz).T
+    Mx_s = Mx[np.indices(idx.shape)[0], idx]
+    My_s = My[np.indices(idy.shape)[0], idy]
+    Mz_s = Mz[np.indices(idz.shape)[0], idz]
+    dMx_s = dMx[np.indices(idx.shape)[0], idx]
+    dMy_s = dMy[np.indices(idy.shape)[0], idy]
+    dMz_s = dMz[np.indices(idz.shape)[0], idz]
+
+    Q = np.zeros([*mesh])
+    for ia in range(len(chargs)):
+        Q_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], Mz_s[ia])
+        Q[np.ix_(idx[ia], idy[ia], idz[ia])] += chargs[ia] * Q_s
+
+    B = np.einsum('x,y,z->xyz', bx*bx.conj(), by*by.conj(), bz*bz.conj())
+
+    Gv, Gvbase, weights = cell.get_Gv_weights(mesh)
+    absG2 = np.einsum('ix,ix->i', Gv, Gv)
+    absG2[absG2==0] = 1e200
+    coulG = 4*np.pi / absG2
+    C = weights * coulG * np.exp(-absG2/(4*ew_eta**2))
+    C = C.reshape(*mesh)
+
+    Q_ifft = tools.ifft(Q, mesh).reshape(*mesh)
+    tmp = tools.fft(B * C * Q_ifft, mesh).real.reshape(*mesh)
+
+    ng = np.prod(mesh)
+    bK = b * mesh[:,None]
+    grad_rec = np.zeros_like(grad_dir)
+    for ia in range(len(chargs)):
+        mask = np.ix_(idx[ia], idy[ia], idz[ia])
+        dQ_s = np.einsum('x,y,z->xyz', dMx_s[ia], My_s[ia], Mz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[0], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        dQ_s = np.einsum('x,y,z->xyz', Mx_s[ia], dMy_s[ia], Mz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[1], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        dQ_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], dMz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[2], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        grad_rec[ia] *= chargs[ia] * ng
+
+    # reciprocal space summation does not conserve momentum
+    shift = -np.sum(grad_rec, axis=0) / len(grad_rec)
+    logger.debug(cell, f'Shift ewald nuclear gradient by {shift} to keep momentum conservation.')
+    grad_rec += shift[None,:]
+
+    grad = grad_dir + grad_rec
+    return grad
+
+def ewald_nuc_grad(cell, ew_eta=None, ew_cut=None):
+    chargs = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    coords = np.asarray(cell.atom_coords(), order='C')
+
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    ke_cutoff = -2*ew_eta**2*log_precision
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+
+    if cell.dimension == 3 and cell.use_particle_mesh_ewald:
+        return particle_mesh_ewald_nuc_grad(cell, ew_eta=ew_eta, ew_cut=ew_cut)
+
+    grad_dir = _get_ewald_direct_nuc_grad(cell, ew_eta, ew_cut)
+    grad_rec = np.zeros_like(grad_dir, order="C")
+
+    Gv, _, weights = cell.get_Gv_weights(mesh)
+    fn = getattr(libpbc, "ewald_gs_nuc_grad")
+    if cell.dimension != 2 or cell.low_dim_ft_type == 'inf_vacuum':
+        ngrids = len(Gv)
+        mem_avail = cell.max_memory - lib.current_memory()[0]
+        if mem_avail <= 0:
+            logger.warn(cell, "Not enough memory for computing ewald force.")
+        blksize = min(ngrids, max(mesh[2], int(mem_avail*1e6 / ((2+cell.natm*2)*8))))
+        for ig0, ig1 in lib.prange(0, ngrids, blksize):
+            ngrid_sub = ig1 - ig0
+            Gv_sub = np.asarray(Gv[ig0:ig1], order="C")
+            fn(grad_rec.ctypes.data_as(ctypes.c_void_p),
+               Gv_sub.ctypes.data_as(ctypes.c_void_p),
+               chargs.ctypes.data_as(ctypes.c_void_p),
+               coords.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_double(ew_eta), ctypes.c_double(weights),
+               ctypes.c_int(cell.natm), ctypes.c_size_t(ngrid_sub))
+    else:
+        raise NotImplementedError
+
+    grad = grad_dir + grad_rec
+    return grad
diff --git a/pyscf/pbc/gto/neighborlist.py b/pyscf/pbc/gto/neighborlist.py
new file mode 100644
index 0000000000..f4a0527ee2
--- /dev/null
+++ b/pyscf/pbc/gto/neighborlist.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import lib
+from pyscf.lib import logger
+
+libpbc = lib.load_library('libpbc')
+
+class _CNeighborPair(ctypes.Structure):
+    _fields_ = [("nimgs", ctypes.c_int),
+                ("Ls_list", ctypes.POINTER(ctypes.c_int)),
+                ("q_cond", ctypes.POINTER(ctypes.c_double)),
+                ("center", ctypes.POINTER(ctypes.c_double))]
+
+
+class _CNeighborList(ctypes.Structure):
+    _fields_ = [("nish", ctypes.c_int),
+                ("njsh", ctypes.c_int),
+                ("nimgs", ctypes.c_int),
+                ("pairs", ctypes.POINTER(ctypes.POINTER(_CNeighborPair)))]
+
+
+class _CNeighborListOpt(ctypes.Structure):
+    _fields_ = [("nl", ctypes.POINTER(_CNeighborList)),
+                ('fprescreen', ctypes.c_void_p)]
+
+
+def build_neighbor_list_for_shlpairs(cell, cell1=None, Ls=None,
+                                     ish_rcut=None, jsh_rcut=None, hermi=0,
+                                     precision=None):
+    '''
+    Build the neighbor list of shell pairs for periodic calculations.
+
+    Arguments:
+        cell : :class:`pbc.gto.cell.Cell`
+            The :class:`Cell` instance for the bra basis functions.
+        cell1 : :class:`pbc.gto.cell.Cell`, optional
+            The :class:`Cell` instance for the ket basis functions.
+            If not given, both bra and ket basis functions come from cell.
+        Ls : (*,3) array, optional
+            The cartesian coordinates of the periodic images.
+            Default is calculated by :func:`cell.get_lattice_Ls`.
+        ish_rcut : (nish,) array, optional
+            The cutoff radii of the shells for bra basis functions.
+        jsh_rcut : (njsh,) array, optional
+            The cutoff radii of the shells for ket basis functions.
+        hermi : int, optional
+            If :math:`hermi=1`, the task list is built only for
+            the upper triangle of the matrix. Default is 0.
+        precision : float, optional
+            The integral precision. Default is :attr:`cell.precision`.
+            If both ``ish_rcut`` and ``jsh_rcut`` are given,
+            ``precision`` will be ignored.
+
+    Returns: :class:`ctypes.POINTER`
+        The C pointer of the :class:`NeighborList` structure.
+    '''
+    if cell1 is None:
+        cell1 = cell
+    if Ls is None:
+        Ls = cell.get_lattice_Ls()
+    Ls = np.asarray(Ls, order='C', dtype=float)
+    nimgs = len(Ls)
+
+    if hermi == 1 and cell1 is not cell:
+        logger.warn(cell,
+                    "Set hermi=0 because cell and cell1 are not the same.")
+        hermi = 0
+
+    ish_atm = np.asarray(cell._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell._env, order='C', dtype=float)
+    nish = len(ish_bas)
+    if ish_rcut is None:
+        ish_rcut = cell.rcut_by_shells(precision=precision)
+    assert nish == len(ish_rcut)
+
+    if cell1 is cell:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+        if jsh_rcut is None:
+            jsh_rcut = ish_rcut
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=float)
+        if jsh_rcut is None:
+            jsh_rcut = cell1.rcut_by_shells(precision=precision)
+    njsh = len(jsh_bas)
+    assert njsh == len(jsh_rcut)
+
+    nl = ctypes.POINTER(_CNeighborList)()
+    func = getattr(libpbc, "build_neighbor_list", None)
+    try:
+        func(ctypes.byref(nl),
+             ish_atm.ctypes.data_as(ctypes.c_void_p),
+             ish_bas.ctypes.data_as(ctypes.c_void_p),
+             ish_env.ctypes.data_as(ctypes.c_void_p),
+             ish_rcut.ctypes.data_as(ctypes.c_void_p),
+             jsh_atm.ctypes.data_as(ctypes.c_void_p),
+             jsh_bas.ctypes.data_as(ctypes.c_void_p),
+             jsh_env.ctypes.data_as(ctypes.c_void_p),
+             jsh_rcut.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_int(nish), ctypes.c_int(njsh),
+             Ls.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nimgs),
+             ctypes.c_int(hermi))
+    except Exception as e:
+        raise RuntimeError(f"Failed to build neighbor list for shell pairs.\n{e}")
+    return nl
+
+def free_neighbor_list(nl):
+    func = getattr(libpbc, "del_neighbor_list", None)
+    try:
+        func(ctypes.byref(nl))
+    except Exception as e:
+        raise RuntimeError(f"Failed to free neighbor list.\n{e}")
+
+def neighbor_list_to_ndarray(cell, cell1, nl):
+    '''
+    Returns:
+        Ls_list: (nLtot,) ndarray
+            indices of Ls
+        Ls_idx: (2 x nish x njsh,) ndarray
+            starting and ending indices in Ls_list
+    '''
+    nish = cell.nbas
+    njsh = cell1.nbas
+    Ls_list = []
+    Ls_idx = []
+    nLtot = 0
+    for i in range(nish):
+        for j in range(njsh):
+            pair = nl.contents.pairs[i*njsh+j]
+            nL = pair.contents.nimgs
+            nLtot += nL
+            for iL in range(nL):
+                idx = pair.contents.Ls_list[iL]
+                Ls_list.append(idx)
+            if nL > 0:
+                Ls_idx.extend([nLtot-nL, nLtot])
+            else:
+                Ls_idx.extend([-1,-1])
+    return np.asarray(Ls_list), np.asarray(Ls_idx)
+
+
+class NeighborListOpt():
+    def __init__(self, cell):
+        self.cell = cell
+        self.nl = None
+        self._this = ctypes.POINTER(_CNeighborListOpt)()
+        libpbc.NLOpt_init(ctypes.byref(self._this))
+
+    def build(self, cell=None, cell1=None, Ls=None,
+              ish_rcut=None, jsh_rcut=None,
+              hermi=0, precision=None,
+              set_nl=True, set_optimizer=True):
+        if cell is None:
+            cell = self.cell
+
+        if (set_nl or set_optimizer) and self.nl is None:
+            self.nl = build_neighbor_list_for_shlpairs(
+                            cell, cell1=cell1, Ls=Ls,
+                            ish_rcut=ish_rcut, jsh_rcut=jsh_rcut,
+                            hermi=hermi, precision=precision)
+            libpbc.NLOpt_set_nl(self._this, self.nl)
+
+        if set_optimizer:
+            libpbc.NLOpt_set_optimizer(self._this)
+
+    def reset(self, free_nl=True):
+        if self.nl is not None and free_nl:
+            free_neighbor_list(self.nl)
+        self.nl = None
+        libpbc.NLOpt_reset(self._this)
+
+    def __del__(self):
+        self.reset()
+        try:
+            libpbc.NLOpt_del(ctypes.byref(self._this))
+        except AttributeError:
+            pass
diff --git a/pyscf/pbc/gto/pseudo/pp_int.py b/pyscf/pbc/gto/pseudo/pp_int.py
index 6114fb7f86..2ff3436dbc 100644
--- a/pyscf/pbc/gto/pseudo/pp_int.py
+++ b/pyscf/pbc/gto/pseudo/pp_int.py
@@ -29,6 +29,17 @@
 from pyscf import lib
 from pyscf import gto
 from pyscf import __config__
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+EPS_PPL = getattr(__config__, "pbc_gto_pseudo_eps_ppl", 1e-2)
+HL_TABLE_SLOTS = 7
+ATOM_OF        = 0
+ANG_OF         = 1
+HL_DIM_OF      = 2
+HL_DATA_OF     = 3
+HL_OFFSET0     = 4
+HF_OFFSET1     = 5
+HF_OFFSET2     = 6
 
 libpbc = lib.load_library('libpbc')
 
@@ -106,12 +117,293 @@ def get_gth_vlocG_part1(cell, Gv):
 def get_pp_loc_part2(cell, kpts=None):
     '''PRB, 58, 3641 Eq (1), integrals associated to C1, C2, C3, C4
     '''
-    from pyscf.pbc.df.aft import _IntPPBuilder
-    vpploc = _IntPPBuilder(cell, kpts).get_pp_loc_part2()
+    if kpts is None or gamma_point(kpts):
+        vpploc = [get_pp_loc_part2_gamma(cell)]
+    else:
+        from pyscf.pbc.df.aft import _IntPPBuilder
+        vpploc = _IntPPBuilder(cell, kpts).get_pp_loc_part2()
     if kpts is None or numpy.shape(kpts) == (3,):
         vpploc = vpploc[0]
     return vpploc
 
+
+def get_pp_loc_part2_gamma(cell):
+    from pyscf.pbc.df import incore
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+
+    fake_cells = {}
+    for cn in range(1, 5):
+        fake_cell = fake_cell_vloc(cell, cn)
+        fake_cell.precision = EPS_PPL
+        if fake_cell.nbas > 0:
+            fake_cells[cn] = fake_cell
+
+    if not fake_cells:
+        if any(cell.atom_symbol(ia) in cell._pseudo for ia in range(cell.natm)):
+            pass
+        else:
+            lib.logger.warn(cell, 'cell.pseudo was specified but its elements %s '
+                            'were not found in the system.', cell._pseudo.keys())
+        return 0
+
+    intors = ('int3c2e', 'int3c1e', 'int3c1e_r2_origk',
+              'int3c1e_r4_origk', 'int3c1e_r6_origk')
+    kptij_lst = numpy.zeros((1,2,3))
+    Ls = cell.get_lattice_Ls()
+    buf = None
+    for i, (cn, fake_cell) in enumerate(fake_cells.items()):
+        neighbor_list = build_neighbor_list_for_shlpairs(fake_cell, cell, Ls)
+        v = incore.aux_e2_sum_auxbas(cell, fake_cell, intors[cn], aosym='s2', comp=1,
+                                     kptij_lst=kptij_lst, neighbor_list=neighbor_list)
+        if i == 0:
+            buf = v
+        else:
+            buf = numpy.add(buf, v, out=buf)
+        v = None
+        free_neighbor_list(neighbor_list)
+
+    vpploc = lib.unpack_tril(buf)
+    return vpploc
+
+
+# TODO add k-point sampling
+def vpploc_part2_nuc_grad(cell, dm, kpts=None):
+    '''
+    Nuclear gradients of the 2nd part of the local part of
+    the GTH pseudo potential, contracted with the density matrix.
+    '''
+    from pyscf.pbc.df import incore
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+    if kpts is not None and not gamma_point(kpts):
+        raise NotImplementedError("k-point sampling not available")
+
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+    kptij_lst = numpy.hstack((kpts_lst,kpts_lst)).reshape(-1,2,3)
+
+    intors = ('int3c2e_ip1', 'int3c1e_ip1', 'int3c1e_ip1_r2_origk',
+              'int3c1e_ip1_r4_origk', 'int3c1e_ip1_r6_origk')
+
+    Ls = cell.get_lattice_Ls()
+    count = 0
+    grad = 0
+    for cn in range(1, 5):
+        fakecell = fake_cell_vloc(cell, cn)
+        fakecell.precision = EPS_PPL
+        if fakecell.nbas > 0:
+            neighbor_list = build_neighbor_list_for_shlpairs(fakecell, cell, Ls)
+            buf = incore.int3c1e_nuc_grad(cell, fakecell, dm, intors[cn],
+                                          kptij_lst=kptij_lst, neighbor_list=neighbor_list)
+            if count == 0:
+                grad = buf
+            else:
+                grad = numpy.add(grad, buf, out=grad)
+            buf = None
+            count += 1
+            free_neighbor_list(neighbor_list)
+    grad *= -2
+    return grad
+
+
+def _prepare_hl_data(fakecell, hl_blocks):
+    offset = [0] * 3
+    hl_table = numpy.empty((len(hl_blocks),HL_TABLE_SLOTS), order='C', dtype=numpy.int32)
+    hl_data = []
+    ptr = 0
+    for ib, hl in enumerate(hl_blocks):
+        hl_table[ib,ATOM_OF] = fakecell._bas[ib,0]
+        hl_table[ib,ANG_OF] = l = fakecell.bas_angular(ib)
+        hl_dim = hl.shape[0]
+        hl_table[ib,HL_DIM_OF], hl_table[ib,HL_DATA_OF] = hl_dim, ptr
+        ptr += hl_dim**2
+        hl_data.extend(list(hl.ravel()))
+        nd = 2 * l + 1
+        for i in range(hl_dim):
+            hl_table[ib, i+HL_OFFSET0] = offset[i]
+            offset[i] += nd
+    hl_data = numpy.asarray(hl_data, order='C', dtype=numpy.double)
+    return hl_table, hl_data
+
+
+# TODO add k-point sampling
+def _contract_ppnl(cell, fakecell, hl_blocks, ppnl_half, comp=1, kpts=None):
+    from pyscf.pbc.gto import NeighborListOpt
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    hl_table, hl_data = _prepare_hl_data(fakecell, hl_blocks)
+
+    opt = NeighborListOpt(fakecell)
+    opt.build(fakecell, cell)
+
+    shls_slice = (0, cell.nbas, 0, cell.nbas)
+    key = 'cart' if cell.cart else 'sph'
+    ao_loc = gto.moleintor.make_loc(cell._bas, key)
+
+    ppnl = []
+    nao = cell.nao_nr()
+    nao_pair = nao * (nao+1) // 2
+    for k, kpt in enumerate(kpts_lst):
+        ppnl_half0 = ppnl_half1 = ppnl_half2 = None
+        if len(ppnl_half[0]) > 0:
+            ppnl_half0 = ppnl_half[0][k]
+        if len(ppnl_half[1]) > 0:
+            ppnl_half1 = ppnl_half[1][k]
+        if len(ppnl_half[2]) > 0:
+            ppnl_half2 = ppnl_half[2][k]
+
+        if gamma_point(kpt):
+            if ppnl_half0 is not None:
+                ppnl_half0 = ppnl_half0.real
+            if ppnl_half1 is not None:
+                ppnl_half1 = ppnl_half1.real
+            if ppnl_half2 is not None:
+                ppnl_half2 = ppnl_half2.real
+            buf = numpy.empty([nao_pair], order='C', dtype=numpy.double)
+            fill = getattr(libpbc, 'ppnl_fill_gs2')
+        else:
+            buf = numpy.empty([nao_pair], order='C', dtype=numpy.complex128)
+            raise NotImplementedError
+
+        ppnl_half0 = numpy.asarray(ppnl_half0, order='C')
+        ppnl_half1 = numpy.asarray(ppnl_half1, order='C')
+        ppnl_half2 = numpy.asarray(ppnl_half2, order='C')
+
+        drv = getattr(libpbc, "contract_ppnl", None)
+        try:
+            drv(fill, buf.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half2.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(comp), (ctypes.c_int*4)(*shls_slice),
+                ao_loc.ctypes.data_as(ctypes.c_void_p),
+                hl_table.ctypes.data_as(ctypes.c_void_p),
+                hl_data.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(len(hl_blocks)), opt._this)
+        except Exception as e:
+            raise RuntimeError(f"Failed to compute non-local pseudo-potential.\n{e}")
+
+        ppnl_k = lib.unpack_tril(buf)
+        ppnl.append(ppnl_k)
+
+    if kpts is None or numpy.shape(kpts) == (3,):
+        ppnl = ppnl[0]
+    return ppnl
+
+
+# TODO add k-point sampling
+def _contract_ppnl_nuc_grad(cell, fakecell, dms, hl_blocks, ppnl_half, ppnl_half_ip2,
+                            comp=3, kpts=None, hl_table=None, hl_data=None):
+    from pyscf.pbc.gto import NeighborListOpt
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    if hl_table is None:
+        hl_table, hl_data = _prepare_hl_data(fakecell, hl_blocks)
+
+    opt = NeighborListOpt(fakecell)
+    opt.build(fakecell, cell)
+
+    nkpts = len(kpts_lst)
+    nao = cell.nao
+    dms = dms.reshape(nkpts, nao, nao)
+    shls_slice = (0, cell.nbas, 0, cell.nbas)
+    bas = numpy.asarray(cell._bas, order='C', dtype=numpy.int32)
+    key = 'cart' if cell.cart else 'sph'
+    ao_loc = gto.moleintor.make_loc(bas, key)
+
+    grad = []
+    for k, kpt in enumerate(kpts_lst):
+        dm = dms[k]
+        naux = [0] * 3
+        ppnl_half0 = ppnl_half1 = ppnl_half2 = None
+        if len(ppnl_half[0]) > 0:
+            ppnl_half0 = ppnl_half[0][k]
+            naux[0] = ppnl_half0.shape[0]
+        if len(ppnl_half[1]) > 0:
+            ppnl_half1 = ppnl_half[1][k]
+            naux[1] = ppnl_half1.shape[0]
+        if len(ppnl_half[2]) > 0:
+            ppnl_half2 = ppnl_half[2][k]
+            naux[2] = ppnl_half2.shape[0]
+
+        ppnl_half_ip2_0 = ppnl_half_ip2_1 = ppnl_half_ip2_2 = None
+        if len(ppnl_half_ip2[0]) > 0:
+            ppnl_half_ip2_0 = ppnl_half_ip2[0][k]
+            assert naux[0] == ppnl_half_ip2_0.shape[1]
+        if len(ppnl_half_ip2[1]) > 0:
+            ppnl_half_ip2_1 = ppnl_half_ip2[1][k]
+            assert naux[1] == ppnl_half_ip2_1.shape[1]
+        if len(ppnl_half_ip2[2]) > 0:
+            ppnl_half_ip2_2 = ppnl_half_ip2[2][k]
+            assert naux[2] == ppnl_half_ip2_2.shape[1]
+
+        naux = numpy.asarray(naux, dtype=numpy.int32)
+
+        if gamma_point(kpt):
+            dm = dm.real
+            if ppnl_half0 is not None:
+                ppnl_half0 = ppnl_half0.real
+                ppnl_half_ip2_0 = ppnl_half_ip2_0.real
+            if ppnl_half1 is not None:
+                ppnl_half1 = ppnl_half1.real
+                ppnl_half_ip2_1 = ppnl_half_ip2_1.real
+            if ppnl_half2 is not None:
+                ppnl_half2 = ppnl_half2.real
+                ppnl_half_ip2_2 = ppnl_half_ip2_2.real
+            grad_k = numpy.zeros([cell.natm, comp], order='C', dtype=numpy.double)
+            fill = getattr(libpbc, 'ppnl_nuc_grad_fill_gs1')
+        else:
+            grad_k = numpy.empty([cell.natm, comp], order='C', dtype=numpy.complex128)
+            raise NotImplementedError
+
+        dm = numpy.asarray(dm, order='C')
+        ppnl_half0 = numpy.asarray(ppnl_half0, order='C')
+        ppnl_half1 = numpy.asarray(ppnl_half1, order='C')
+        ppnl_half2 = numpy.asarray(ppnl_half2, order='C')
+        ppnl_half_ip2_0 = numpy.asarray(ppnl_half_ip2_0, order='C')
+        ppnl_half_ip2_1 = numpy.asarray(ppnl_half_ip2_1, order='C')
+        ppnl_half_ip2_2 = numpy.asarray(ppnl_half_ip2_2, order='C')
+
+        drv = getattr(libpbc, "contract_ppnl_nuc_grad", None)
+        try:
+            drv(fill,
+                grad_k.ctypes.data_as(ctypes.c_void_p),
+                dm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(comp),
+                ppnl_half0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half2.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_2.ctypes.data_as(ctypes.c_void_p),
+                hl_table.ctypes.data_as(ctypes.c_void_p),
+                hl_data.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(len(hl_blocks)),
+                naux.ctypes.data_as(ctypes.c_void_p),
+                (ctypes.c_int*4)(*shls_slice),
+                ao_loc.ctypes.data_as(ctypes.c_void_p),
+                bas.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell.natm), opt._this)
+        except Exception as e:
+            raise RuntimeError(f"Failed to compute non-local pp nuclear gradient.\n{e}")
+        grad.append(grad_k)
+
+    grad_tot = 0
+    if nkpts == 1:
+        grad_tot = grad[0]
+    else:
+        for k in range(nkpts):
+            grad_tot += grad[k]
+        grad_tot = grad_tot.real
+    return grad_tot
+
+
 def get_pp_nl(cell, kpts=None):
     if kpts is None:
         kpts_lst = numpy.zeros((1,3))
@@ -122,6 +414,10 @@ def get_pp_nl(cell, kpts=None):
     fakecell, hl_blocks = fake_cell_vnl(cell)
     ppnl_half = _int_vnl(cell, fakecell, hl_blocks, kpts_lst)
     nao = cell.nao_nr()
+
+    if gamma_point(kpts_lst):
+        return _contract_ppnl(cell, fakecell, hl_blocks, ppnl_half, kpts=kpts)
+
     buf = numpy.empty((3*9*nao), dtype=numpy.complex128)
 
     # We set this equal to zeros in case hl_blocks loop is skipped
@@ -148,7 +444,32 @@ def get_pp_nl(cell, kpts=None):
     return ppnl
 
 
-def fake_cell_vloc(cell, cn=0):
+def vppnl_nuc_grad(cell, dm, kpts=None):
+    '''
+    Nuclear gradients of the non-local part of the GTH pseudo potential,
+    contracted with the density matrix.
+    '''
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    fakecell, hl_blocks = fake_cell_vnl(cell)
+    intors = ('int1e_ipovlp', 'int1e_r2_origi_ip2', 'int1e_r4_origi_ip2')
+    ppnl_half = _int_vnl(cell, fakecell, hl_blocks, kpts_lst)
+    ppnl_half_ip2 = _int_vnl(cell, fakecell, hl_blocks, kpts_lst, intors, comp=3)
+    # int1e_ipovlp computes ip1 so multiply -1 to get ip2
+    if len(ppnl_half_ip2[0]) > 0:
+        for k, kpt in enumerate(kpts_lst):
+            ppnl_half_ip2[0][k] *= -1
+
+    grad = _contract_ppnl_nuc_grad(cell, fakecell, dm, hl_blocks,
+                                   ppnl_half, ppnl_half_ip2, kpts=kpts)
+    grad *= -2
+    return grad
+
+
+def fake_cell_vloc(cell, cn=0, atm_id=None):
     '''Generate fake cell for V_{loc}.
 
     Each term of V_{loc} (erf, C_1, C_2, C_3, C_4) is a gaussian type
@@ -158,17 +479,23 @@ def fake_cell_vloc(cell, cn=0):
     The kwarg cn indiciates which term to generate for the fake cell.
     If cn = 0, the erf term is generated.  C_1,..,C_4 are generated with cn = 1..4
     '''
-    fake_env = [cell.atom_coords().ravel()]
-    fake_atm = cell._atm.copy()
-    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, cell.natm*3, 3)
-    ptr = cell.natm * 3
+    if atm_id is None:
+        atm_id = numpy.arange(cell.natm)
+    else:
+        atm_id = numpy.asarray(atm_id)
+    natm = len(atm_id)
+
+    fake_env = [cell.atom_coords()[atm_id].ravel()]
+    fake_atm = cell._atm[atm_id].copy().reshape(natm,-1)
+    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, natm*3, 3)
+    ptr = natm * 3
     fake_bas = []
     half_sph_norm = .5/numpy.pi**.5
-    for ia in range(cell.natm):
-        if cell.atom_charge(ia) == 0:  # pass ghost atoms
+    for ia, atm in enumerate(atm_id):
+        if cell.atom_charge(atm) == 0:  # pass ghost atoms
             continue
 
-        symb = cell.atom_symbol(ia)
+        symb = cell.atom_symbol(atm)
         if cn == 0:
             if symb in cell._pseudo:
                 pp = cell._pseudo[symb]
@@ -196,6 +523,7 @@ def fake_cell_vloc(cell, cn=0):
     fakecell._env = numpy.asarray(numpy.hstack(fake_env), dtype=numpy.double)
     return fakecell
 
+
 # sqrt(Gamma(l+1.5)/Gamma(l+2i+1.5))
 _PLI_FAC = 1/numpy.sqrt(numpy.array((
     (1, 3.75 , 59.0625  ),  # l = 0,
@@ -249,12 +577,14 @@ def fake_cell_vnl(cell):
 
     fakecell = cell.copy(deep=False)
     fakecell._atm = numpy.asarray(fake_atm, dtype=numpy.int32)
-    fakecell._bas = numpy.asarray(fake_bas, dtype=numpy.int32)
+    fakecell._bas = numpy.asarray(fake_bas, dtype=numpy.int32).reshape(-1, gto.BAS_SLOTS)
     fakecell._env = numpy.asarray(numpy.hstack(fake_env), dtype=numpy.double)
     return fakecell, hl_blocks
 
-def _int_vnl(cell, fakecell, hl_blocks, kpts):
+def _int_vnl(cell, fakecell, hl_blocks, kpts, intors=None, comp=1):
     '''Vnuc - Vloc'''
+    if intors is None:
+        intors = ['int1e_ovlp', 'int1e_r2_origi', 'int1e_r4_origi']
     rcut = max(cell.rcut, fakecell.rcut)
     Ls = cell.get_lattice_Ls(rcut=rcut)
     nimgs = len(Ls)
@@ -262,6 +592,7 @@ def _int_vnl(cell, fakecell, hl_blocks, kpts):
     nkpts = len(kpts)
 
     fill = getattr(libpbc, 'PBCnr2c_fill_ks1')
+    # TODO add screening
     cintopt = lib.c_null_ptr()
 
     def int_ket(_bas, intor):
@@ -279,8 +610,10 @@ def int_ket(_bas, intor):
         ao_loc = gto.moleintor.make_loc(bas, intor)
         ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
         nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
-        out = numpy.empty((nkpts,ni,nj), dtype=numpy.complex128)
-        comp = 1
+        if comp == 1:
+            out = numpy.empty((nkpts,ni,nj), dtype=numpy.complex128)
+        else:
+            out = numpy.empty((nkpts,comp,ni,nj), dtype=numpy.complex128)
 
         fintor = getattr(gto.moleintor.libcgto, intor)
 
@@ -297,7 +630,7 @@ def int_ket(_bas, intor):
         return out
 
     hl_dims = numpy.asarray([len(hl) for hl in hl_blocks])
-    out = (int_ket(fakecell._bas[hl_dims>0], 'int1e_ovlp'),
-           int_ket(fakecell._bas[hl_dims>1], 'int1e_r2_origi'),
-           int_ket(fakecell._bas[hl_dims>2], 'int1e_r4_origi'))
+    out = (int_ket(fakecell._bas[hl_dims>0], intors[0]),
+           int_ket(fakecell._bas[hl_dims>1], intors[1]),
+           int_ket(fakecell._bas[hl_dims>2], intors[2]))
     return out
diff --git a/pyscf/pbc/gto/pseudo/test/test_pp.py b/pyscf/pbc/gto/pseudo/test/test_pp.py
index c00057a064..95b343bbf6 100644
--- a/pyscf/pbc/gto/pseudo/test/test_pp.py
+++ b/pyscf/pbc/gto/pseudo/test/test_pp.py
@@ -22,6 +22,7 @@
 from pyscf.pbc.dft import numint
 from pyscf.pbc.gto import pseudo
 from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.data.nist import BOHR
 
 
 def get_pp_loc_part2(cell, kpt=np.zeros(3)):
@@ -244,7 +245,42 @@ def test_pp(self):
         v1 = pseudo.get_pp(cell, k)
         self.assertAlmostEqual(abs(v0-v1).max(), 0, 6)
 
+    def test_pp_nuc_grad(self):
+        cell = pbcgto.Cell()
+        cell.atom = 'H 0 0 0; Na 0 0 0.8'
+        cell.a = np.diag([6,6,6])
+        cell.basis='gth-szv'
+        cell.pseudo='gth-pade'
+        cell.ke_cutoff=200
+        cell.build()
+
+        cellp = cell.copy()
+        cellp.atom = 'H 0 0 0; Na 0 0 0.8001'
+        cellp.build()
 
+        cellm = cell.copy()
+        cellm.atom = 'H 0 0 0; Na 0 0 0.7999'
+        cellm.build()
+
+        np.random.seed(1)
+        dm = np.random.rand(cell.nao, cell.nao)
+        dm = (dm + dm.T) / 2
+
+        # local_part2
+        vp = pp_int.get_pp_loc_part2(cellp)
+        vm = pp_int.get_pp_loc_part2(cellm)
+        v_fd = (vp - vm) / (0.0002 / BOHR)
+        grad = pp_int.vpploc_part2_nuc_grad(cell, dm)[1,2]
+        grad_fd = np.einsum("ij,ij->", v_fd, dm)
+        self.assertAlmostEqual(abs(grad - grad_fd), 0, 7)
+
+        # non-local
+        vp = pp_int.get_pp_nl(cellp)
+        vm = pp_int.get_pp_nl(cellm)
+        v_fd = (vp - vm) / (0.0002 / BOHR)
+        grad = pp_int.vppnl_nuc_grad(cell, dm)[1,2]
+        grad_fd = np.einsum("ij,ij->", v_fd, dm)
+        self.assertAlmostEqual(abs(grad - grad_fd), 0, 7)
 
 if __name__ == '__main__':
     print("Full Tests for pbc.gto.pseudo")
diff --git a/pyscf/pbc/gto/test/test_cell.py b/pyscf/pbc/gto/test/test_cell.py
index 5dee058140..bd7a0e067f 100644
--- a/pyscf/pbc/gto/test/test_cell.py
+++ b/pyscf/pbc/gto/test/test_cell.py
@@ -25,6 +25,7 @@
 from pyscf.pbc import gto as pgto
 from pyscf.pbc.gto import ecp
 from pyscf.pbc.tools import pbc as pbctools
+from pyscf.pbc.gto import ewald_methods
 
 
 def setUpModule():
@@ -252,6 +253,30 @@ def test_ewald_2d(self):
 #        eref = cell.to_mol().energy_nuc()
 #        self.assertAlmostEqual(cell.ewald(), eref, 2)
 
+    def test_particle_mesh_ewald(self):
+        cell = pgto.Cell()
+        cell.a = np.diag([10.,]*3)
+        cell.atom = '''
+            O          5.84560        5.21649        5.10372
+            H          6.30941        5.30070        5.92953
+            H          4.91429        5.26674        5.28886
+        '''
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 0
+        cell.build()
+
+        cell1 = cell.copy()
+        cell1.use_particle_mesh_ewald = True
+        cell1.build()
+
+        e0 = cell.ewald()
+        e1 = cell1.ewald()
+        self.assertAlmostEqual(e0, e1, 6)
+
+        g0 = ewald_methods.ewald_nuc_grad(cell)
+        g1 = ewald_methods.ewald_nuc_grad(cell1)
+        self.assertAlmostEqual(abs(g1-g0).max(), 0, 6)
+
     def test_pbc_intor(self):
         numpy.random.seed(12)
         kpts = numpy.random.random((4,3))
diff --git a/pyscf/pbc/scf/hf.py b/pyscf/pbc/scf/hf.py
index 8225d778b6..f6c91336ed 100644
--- a/pyscf/pbc/scf/hf.py
+++ b/pyscf/pbc/scf/hf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2019 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,23 +53,24 @@ def get_ovlp(cell, kpt=np.zeros(3)):
         # Avoid pbcopt's prescreening in the lattice sum, for better accuracy
         s = cell.pbc_intor('int1e_ovlp', hermi=0, kpts=kpt,
                            pbcopt=lib.c_null_ptr())
-    s = lib.asarray(s)
+    s = np.asarray(s)
     hermi_error = abs(s - np.rollaxis(s.conj(), -1, -2)).max()
     if hermi_error > cell.precision and hermi_error > 1e-12:
         logger.warn(cell, '%.4g error found in overlap integrals. '
                     'cell.precision  or  cell.rcut  can be adjusted to '
                     'improve accuracy.', hermi_error)
 
-    cond = np.max(lib.cond(s))
-    if cond * precision > 1e2:
-        prec = 1e7 / cond
-        rmin = gto.estimate_rcut(cell, prec*1e-5)
-        logger.warn(cell, 'Singularity detected in overlap matrix.  '
-                    'Integral accuracy may be not enough.\n      '
-                    'You can adjust  cell.precision  or  cell.rcut  to '
-                    'improve accuracy.  Recommended settings are\n      '
-                    'cell.precision < %.2g\n      '
-                    'cell.rcut > %.4g', prec, rmin)
+    if cell.verbose >= logger.DEBUG:
+        cond = np.max(lib.cond(s))
+        if cond * precision > 1e2:
+            prec = 1e7 / cond
+            rmin = gto.estimate_rcut(cell, prec*1e-5)
+            logger.warn(cell, 'Singularity detected in overlap matrix.  '
+                        'Integral accuracy may be not enough.\n      '
+                        'You can adjust  cell.precision  or  cell.rcut  to '
+                        'improve accuracy.  Recommended settings are\n      '
+                        'cell.precision < %.2g\n      '
+                        'cell.rcut > %.4g', prec, rmin)
     return s
 
 
@@ -615,11 +616,18 @@ def dump_flags(self, verbose=None):
         return self
 
     def check_sanity(self):
-        mol_hf.SCF.check_sanity(self)
+        lib.StreamObject.check_sanity(self)
         if (isinstance(self.exxdiv, str) and self.exxdiv.lower() != 'ewald' and
             isinstance(self.with_df, df.df.DF)):
             logger.warn(self, 'exxdiv %s is not supported in DF or MDF',
                         self.exxdiv)
+
+        if self.verbose >= logger.DEBUG:
+            s = self.get_ovlp()
+            cond = np.max(lib.cond(s))
+            if cond * 1e-17 > self.conv_tol:
+                logger.warn(self, 'Singularity detected in overlap matrix (condition number = %4.3g). '
+                            'SCF may be inaccurate and hard to converge.', cond)
         return self
 
     def get_hcore(self, cell=None, kpt=None):
@@ -738,7 +746,7 @@ def get_jk_incore(self, cell=None, dm=None, hermi=1, kpt=None, omega=None,
         return self.get_jk(cell, dm, hermi, kpt)
 
     def energy_nuc(self):
-        return self.cell.energy_nuc()
+        return self.cell.enuc
 
     @lib.with_doc(dip_moment.__doc__)
     def dip_moment(self, cell=None, dm=None, unit='Debye', verbose=logger.NOTE,
@@ -758,10 +766,10 @@ def _finalize(self):
             makov_payne_correction(self)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
         if cell is None: cell = self.cell
         dm = mol_hf.SCF.get_init_guess(self, cell, key)
-        dm = normalize_dm_(self, dm)
+        dm = normalize_dm_(self, dm, s1e)
         return dm
 
     def init_guess_by_1e(self, cell=None):
@@ -914,12 +922,14 @@ def _format_jks(vj, dm, kpts_band):
         vj = vj[0]
     return vj
 
-def normalize_dm_(mf, dm):
+def normalize_dm_(mf, dm, s1e=None):
     '''
     Scale density matrix to make it produce the correct number of electrons.
     '''
     cell = mf.cell
-    ne = np.einsum('ij,ji->', dm, mf.get_ovlp(cell)).real
+    if s1e is None:
+        s1e = mf.get_ovlp(cell)
+    ne = lib.einsum('ij,ji->', dm, s1e).real
     if abs(ne - cell.nelectron) > 0.01:
         logger.debug(mf, 'Big error detected in the electron number '
                      'of initial guess density matrix (Ne/cell = %g)!\n'
diff --git a/pyscf/pbc/scf/khf.py b/pyscf/pbc/scf/khf.py
index 1ef2d88908..89124e8af4 100644
--- a/pyscf/pbc/scf/khf.py
+++ b/pyscf/pbc/scf/khf.py
@@ -496,7 +496,7 @@ def dump_flags(self, verbose=None):
             self.with_df.dump_flags(verbose)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
         raise NotImplementedError
 
     def init_guess_by_1e(self, cell=None):
@@ -524,10 +524,10 @@ def get_jk(self, cell=None, dm_kpts=None, hermi=1, kpts=None, kpts_band=None,
         cpu0 = (logger.process_clock(), logger.perf_counter())
         if self.rsjk:
             vj, vk = self.rsjk.get_jk(dm_kpts, hermi, kpts, kpts_band,
-                                      with_j, with_k, omega, self.exxdiv)
+                                      with_j, with_k, omega=omega, exxdiv=self.exxdiv)
         else:
             vj, vk = self.with_df.get_jk(dm_kpts, hermi, kpts, kpts_band,
-                                         with_j, with_k, omega, self.exxdiv)
+                                         with_j, with_k, omega=omega, exxdiv=self.exxdiv)
         logger.timer(self, 'vj and vk', *cpu0)
         return vj, vk
 
@@ -700,7 +700,9 @@ def check_sanity(self):
                         'found in KRHF method.', cell.nelec, nkpts)
         return KSCF.check_sanity(self)
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm = mol_hf.SCF.get_init_guess(self, cell, key)
         nkpts = len(self.kpts)
         if dm.ndim == 2:
@@ -708,7 +710,7 @@ def get_init_guess(self, cell=None, key='minao'):
             dm = np.repeat(dm[None,:,:], nkpts, axis=0)
         dm_kpts = dm
 
-        ne = np.einsum('kij,kji->', dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('kij,kji->', dm_kpts, s1e).real
         # FIXME: consider the fractional num_electron or not? This maybe
         # relate to the charged system.
         nelectron = float(self.cell.tot_electrons(nkpts))
diff --git a/pyscf/pbc/scf/khf_ksymm.py b/pyscf/pbc/scf/khf_ksymm.py
index baaf5543a6..69e4d5c5d1 100644
--- a/pyscf/pbc/scf/khf_ksymm.py
+++ b/pyscf/pbc/scf/khf_ksymm.py
@@ -343,14 +343,16 @@ class KsymAdaptedKRHF(KsymAdaptedKSCF, khf.KRHF):
     to_ks = khf.KRHF.to_ks
     convert_from_ = khf.KRHF.convert_from_
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         if dm_kpts.ndim == 2:
             dm_kpts = np.asarray([dm_kpts]*self.kpts.nkpts_ibz)
         elif len(dm_kpts) != self.kpts.nkpts_ibz:
             dm_kpts = dm_kpts[self.kpts.ibz2bz]
 
-        ne = np.einsum('k,kij,kji', self.kpts.weights_ibz, dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('k,kij,kji', self.kpts.weights_ibz, dm_kpts, s1e).real
         nkpts = self.kpts.nkpts
         ne *= nkpts
         nelectron = float(self.cell.tot_electrons(nkpts))
diff --git a/pyscf/pbc/scf/kuhf.py b/pyscf/pbc/scf/kuhf.py
index af56a2ced3..eae04c0713 100644
--- a/pyscf/pbc/scf/kuhf.py
+++ b/pyscf/pbc/scf/kuhf.py
@@ -416,7 +416,9 @@ def dump_flags(self, verbose=None):
                     'alpha = %d beta = %d', *self.nelec)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         assert dm_kpts.shape[0] == 2
         nkpts = len(self.kpts)
@@ -424,7 +426,7 @@ def get_init_guess(self, cell=None, key='minao'):
             # dm[spin,nao,nao] at gamma point -> dm_kpts[spin,nkpts,nao,nao]
             dm_kpts = np.repeat(dm_kpts[:,None,:,:], nkpts, axis=1)
 
-        ne = np.einsum('xkij,kji->x', dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('xkij,kji->x', dm_kpts, s1e).real
         nelec = np.asarray(self.nelec)
         if np.any(abs(ne - nelec) > 0.01*nkpts):
             logger.debug(self, 'Big error detected in the electron number '
diff --git a/pyscf/pbc/scf/kuhf_ksymm.py b/pyscf/pbc/scf/kuhf_ksymm.py
index 310de63289..4e10ed0fdc 100644
--- a/pyscf/pbc/scf/kuhf_ksymm.py
+++ b/pyscf/pbc/scf/kuhf_ksymm.py
@@ -155,7 +155,9 @@ def dump_flags(self, verbose=None):
                     'alpha = %d beta = %d', *self.nelec)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         assert dm_kpts.shape[0]==2
         if dm_kpts.ndim != 4:
@@ -165,7 +167,7 @@ def get_init_guess(self, cell=None, key='minao'):
         elif dm_kpts.shape[1] != self.kpts.nkpts_ibz:
             dm_kpts = dm_kpts[:,self.kpts.ibz2bz]
 
-        ne = np.einsum('k,xkij,kji->x', self.kpts.weights_ibz, dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('k,xkij,kji->x', self.kpts.weights_ibz, dm_kpts, s1e).real
         nkpts = self.kpts.nkpts
         ne *= nkpts
         nelec = np.asarray(self.nelec)
diff --git a/pyscf/pbc/scf/test/test_hf.py b/pyscf/pbc/scf/test/test_hf.py
index fe3387468b..3e47561cee 100644
--- a/pyscf/pbc/scf/test/test_hf.py
+++ b/pyscf/pbc/scf/test/test_hf.py
@@ -20,6 +20,7 @@
 import tempfile
 import numpy
 from pyscf import lib
+from pyscf.scf import atom_hf
 from pyscf.pbc import gto as pbcgto
 from pyscf.pbc.scf import hf as pbchf
 import pyscf.pbc.scf as pscf
@@ -511,7 +512,7 @@ def test_init_guess_by_1e(self):
         self.assertEqual(dm.ndim, 3)
         self.assertAlmostEqual(lib.fp(dm), 0.025922864381755062, 6)
 
-    def test_init_guess_by_atom(self):
+    def test_init_guess_by_minao(self):
         with lib.temporary_env(cell, dimension=1):
             dm = mf.get_init_guess(key='minao')
             kdm = kmf.get_init_guess(key='minao')
@@ -521,6 +522,29 @@ def test_init_guess_by_atom(self):
         self.assertEqual(kdm.ndim, 3)
         self.assertAlmostEqual(lib.fp(kdm), -1.714952331211208, 8)
 
+    def test_init_guess_by_atom(self):
+        with lib.temporary_env(cell, dimension=1):
+            dm = mf.get_init_guess(key='atom')
+            kdm = kmf.get_init_guess(key='atom')
+
+        self.assertAlmostEqual(lib.fp(dm), 0.18074522075843902, 7)
+
+        self.assertEqual(kdm.ndim, 3)
+        self.assertAlmostEqual(lib.fp(dm), 0.18074522075843902, 7)
+
+    def test_atom_hf_with_pp(self):
+        mol = pbcgto.Cell()
+        mol.build(
+            verbose = 7,
+            output = '/dev/null',
+            atom  = 'O 0 0 0; H 0 0 -1; H 0 0 1',
+            a = [[5, 0, 0], [0, 5, 0], [0, 0, 5]],
+            basis = 'gth-dzvp',
+            pseudo = 'gth-pade')
+        scf_result = atom_hf.get_atm_nrhf(mol)
+        self.assertAlmostEqual(scf_result['O'][0], -15.193243796069835, 9)
+        self.assertAlmostEqual(scf_result['H'][0], -0.49777509423571864, 9)
+
     def test_jk(self):
         nao = cell.nao
         numpy.random.seed(2)
diff --git a/pyscf/pbc/scf/uhf.py b/pyscf/pbc/scf/uhf.py
index b9d9b1407d..0d247f745e 100644
--- a/pyscf/pbc/scf/uhf.py
+++ b/pyscf/pbc/scf/uhf.py
@@ -221,10 +221,13 @@ def dip_moment(self, cell=None, dm=None, unit='Debye', verbose=logger.NOTE,
             rho = self.get_rho(dm)
         return dip_moment(cell, dm, unit, verbose, rho=rho, kpt=self.kpt, **kwargs)
 
-    def get_init_guess(self, cell=None, key='minao'):
-        if cell is None: cell = self.cell
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if cell is None:
+            cell = self.cell
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm = mol_uhf.UHF.get_init_guess(self, cell, key)
-        ne = np.einsum('xij,ji->x', dm, self.get_ovlp(cell)).real
+        ne = np.einsum('xij,ji->x', dm, s1e).real
         nelec = self.nelec
         if np.any(abs(ne - nelec) > 0.01):
             logger.debug(self, 'Big error detected in the electron number '
diff --git a/pyscf/pbc/symm/geom.py b/pyscf/pbc/symm/geom.py
index 74119a4483..ae698d2347 100644
--- a/pyscf/pbc/symm/geom.py
+++ b/pyscf/pbc/symm/geom.py
@@ -77,7 +77,7 @@ def search_space_group_ops(cell, rotations=None, tol=SYMPREC):
     '''
     if rotations is None: rotations = search_point_group_ops(cell, tol=tol)
     a = cell.lattice_vectors()
-    coords = cell.get_scaled_positions()
+    coords = cell.get_scaled_atom_coords()
     atmgrp = mole.atom_types(cell._atom, magmom=cell.magmom)
     atmgrp_spin_inv = {} #spin up and down inverted
     has_spin = False
diff --git a/pyscf/pbc/symm/pyscf_spglib.py b/pyscf/pbc/symm/pyscf_spglib.py
index 3a0d1442cb..f87117a8dd 100644
--- a/pyscf/pbc/symm/pyscf_spglib.py
+++ b/pyscf/pbc/symm/pyscf_spglib.py
@@ -29,7 +29,7 @@ def cell_to_spgcell(cell):
     Convert PySCF Cell object to spglib cell object
     '''
     a = cell.lattice_vectors()
-    atm_pos = cell.get_scaled_positions()
+    atm_pos = cell.get_scaled_atom_coords()
     atm_num = []
     from pyscf.data import elements
     for symbol in cell.elements:
diff --git a/pyscf/pbc/symm/symmetry.py b/pyscf/pbc/symm/symmetry.py
index c79bc81167..ce29e3afac 100644
--- a/pyscf/pbc/symm/symmetry.py
+++ b/pyscf/pbc/symm/symmetry.py
@@ -219,7 +219,7 @@ def dump_info(self):
 
 def _get_phase(cell, op, kpt_scaled, ignore_phase=False, tol=SYMPREC):
     kpt_scaled = op.a2b(cell).dot_rot(kpt_scaled)
-    coords_scaled = cell.get_scaled_positions().reshape(-1,3)
+    coords_scaled = cell.get_scaled_atom_coords().reshape(-1,3)
     natm = coords_scaled.shape[0]
     phase = np.ones((natm,), dtype=np.complex128)
     atm_map = np.arange(natm)
diff --git a/pyscf/pbc/tools/pbc.py b/pyscf/pbc/tools/pbc.py
index 7ca867fd21..20d45fe692 100644
--- a/pyscf/pbc/tools/pbc.py
+++ b/pyscf/pbc/tools/pbc.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import warnings
+import ctypes
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -57,6 +58,44 @@ def _ifftn_blas(g, mesh):
     return out.reshape(-1, *mesh)
 
 if FFT_ENGINE == 'FFTW':
+    try:
+        libfft = lib.load_library('libfft')
+    except OSError:
+        raise RuntimeError("Failed to load libfft")
+
+    def _copy_d2z(a):
+        fn = libfft._copy_d2z
+        out = np.empty(a.shape, dtype=np.complex128)
+        fn(out.ctypes.data_as(ctypes.c_void_p),
+           a.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_size_t(a.size))
+        return out
+
+    def _complex_fftn_fftw(f, mesh, func):
+        if f.dtype == np.double and f.flags.c_contiguous:
+            # np.asarray or np.astype is too slow
+            f = _copy_d2z(f)
+        else:
+            f = np.asarray(f, order='C', dtype=np.complex128)
+        mesh = np.asarray(mesh, order='C', dtype=np.int32)
+        rank = len(mesh)
+        out = np.empty_like(f)
+        fn = getattr(libfft, func)
+        for i, fi in enumerate(f):
+            fn(fi.ctypes.data_as(ctypes.c_void_p),
+               out[i].ctypes.data_as(ctypes.c_void_p),
+               mesh.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_int(rank))
+        return out
+
+    def _fftn_wrapper(a):
+        mesh = a.shape[1:]
+        return _complex_fftn_fftw(a, mesh, 'fft')
+    def _ifftn_wrapper(a):
+        mesh = a.shape[1:]
+        return _complex_fftn_fftw(a, mesh, 'ifft')
+
+elif FFT_ENGINE == 'PYFFTW':
     # pyfftw is slower than np.fft in most cases
     try:
         import pyfftw
@@ -235,8 +274,9 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None,
     else:
         kG = Gv
 
-    equal2boundary = np.zeros(Gv.shape[0], dtype=bool)
+    equal2boundary = None
     if wrap_around and abs(k).sum() > 1e-9:
+        equal2boundary = np.zeros(Gv.shape[0], dtype=bool)
         # Here we 'wrap around' the high frequency k+G vectors into their lower
         # frequency counterparts.  Important if you want the gamma point and k-point
         # answers to agree
@@ -357,7 +397,8 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None,
         if cell.dimension > 0 and exxdiv == 'ewald' and len(G0_idx) > 0:
             coulG[G0_idx] += Nk*cell.vol*madelung(cell, kpts)
 
-    coulG[equal2boundary] = 0
+    if equal2boundary is not None:
+        coulG[equal2boundary] = 0
 
     # Scale the coulG kernel for attenuated Coulomb integrals.
     # * omega is used by RangeSeparatedJKBuilder which requires ewald probe charge
@@ -507,7 +548,7 @@ def get_lattice_Ls(cell, nimgs=None, rcut=None, dimension=None, discard=True):
 
     a = cell.lattice_vectors()
 
-    scaled_atom_coords = np.linalg.solve(a.T, cell.atom_coords().T).T
+    scaled_atom_coords = cell.get_scaled_atom_coords()
     atom_boundary_max = scaled_atom_coords[:,:dimension].max(axis=0)
     atom_boundary_min = scaled_atom_coords[:,:dimension].min(axis=0)
     if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)):
@@ -542,11 +583,12 @@ def find_boundary(a):
                              np.arange(-bounds[2], bounds[2]+1)))
     Ls = np.dot(Ts[:,:dimension], a[:dimension])
 
-    ovlp_penalty += 1e-200  # avoid /0
-    Ts_scaled = (Ts[:,:dimension] + 1e-200) / ovlp_penalty
-    ovlp_penalty_fac = 1. / abs(Ts_scaled).min(axis=1)
-    Ls_mask = np.linalg.norm(Ls, axis=1) * (1-ovlp_penalty_fac) < rcut
-    Ls = Ls[Ls_mask]
+    if discard:
+        ovlp_penalty += 1e-200  # avoid /0
+        Ts_scaled = (Ts[:,:dimension] + 1e-200) / ovlp_penalty
+        ovlp_penalty_fac = 1. / abs(Ts_scaled).min(axis=1)
+        Ls_mask = np.linalg.norm(Ls, axis=1) * (1-ovlp_penalty_fac) < rcut
+        Ls = Ls[Ls_mask]
     return np.asarray(Ls, order='C')
 
 
diff --git a/pyscf/scf/atom_hf.py b/pyscf/scf/atom_hf.py
index 58e0a585c3..4430963493 100644
--- a/pyscf/scf/atom_hf.py
+++ b/pyscf/scf/atom_hf.py
@@ -30,6 +30,7 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
 
     atm_template = mol.copy(deep=False)
     atm_template.charge = 0
+    atm_template.enuc = 0
     atm_template.symmetry = False  # TODO: enable SO3 symmetry here
     atm_template.atom = atm_template._atom = []
     atm_template.cart = False  # AtomSphAverageRHF does not support cartesian basis
@@ -50,7 +51,6 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
         atm._ecpbas[:,0] = 0
         if element in mol._pseudo:
             atm._pseudo = {element: mol._pseudo.get(element)}
-            raise NotImplementedError
         atm.spin = atm.nelectron % 2
 
         nao = atm.nao
@@ -59,6 +59,19 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
             mo_occ = mo_energy = numpy.zeros(nao)
             mo_coeff = numpy.zeros((nao,nao))
             atm_scf_result[element] = (0, mo_energy, mo_coeff, mo_occ)
+        elif atm._pseudo:
+            from pyscf.scf import atom_hf_pp
+            atm.a = None
+            if atm.nelectron == 1:
+                atm_hf = atom_hf_pp.AtomHF1ePP(atm)
+            else:
+                atm_hf = atom_hf_pp.AtomSCFPP(atm)
+                atm_hf.atomic_configuration = atomic_configuration
+
+            atm_hf.verbose = mol.verbose
+            atm_hf.run()
+            atm_scf_result[element] = (atm_hf.e_tot, atm_hf.mo_energy,
+                                       atm_hf.mo_coeff, atm_hf.mo_occ)
         else:
             if atm.nelectron == 1:
                 atm_hf = AtomHF1e(atm)
diff --git a/pyscf/scf/atom_hf_pp.py b/pyscf/scf/atom_hf_pp.py
new file mode 100644
index 0000000000..19a2f73930
--- /dev/null
+++ b/pyscf/scf/atom_hf_pp.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import copy
+import numpy
+from scipy.special import erf
+
+from pyscf import lib
+from pyscf import gto, scf
+from pyscf.dft import gen_grid, numint
+from pyscf.pbc import gto as pbcgto
+from pyscf.scf import atom_hf, rohf
+
+def get_pp_loc_part1_rs(mol, coords):
+    atm_coords = mol.atom_coords()
+    out = 0
+    for ia in range(mol.natm):
+        r0 = atm_coords[ia]
+        r2 = numpy.sum((coords - r0)**2, axis=1)
+        r = numpy.sqrt(r2)
+        Zia = mol.atom_charge(ia)
+        symb = mol.atom_symbol(ia)
+        if symb in mol._pseudo:
+            pp = mol._pseudo[symb]
+            rloc, nexp, cexp = pp[1:3+1]
+        else:
+            rloc = 1e16
+        alpha = 1.0 / (numpy.sqrt(2) * rloc)
+        out += - Zia / r * erf(alpha * r)
+    return out
+
+def _aux_e2(cell, auxcell, intor, aosym='s1', comp=1):
+    intor = cell._add_suffix(intor)
+    pcell = copy.copy(cell)
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                         cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr(auxcell.cart or 'ssc' in intor)
+    ao_loc = numpy.asarray(numpy.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                           dtype=numpy.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+    nbas = cell.nbas
+    shls_slice = (0, nbas, nbas, nbas*2, nbas*2, nbas*2+auxcell.nbas)
+    comp = 1
+    out = gto.moleintor.getints3c(intor, atm, bas, env, shls_slice=shls_slice,
+                                  comp=comp, aosym=aosym, ao_loc=ao_loc)
+    return out
+
+def get_pp_loc_part2(mol):
+    buf = 0
+    intors = ('int3c2e', 'int3c1e', 'int3c1e_r2_origk',
+              'int3c1e_r4_origk', 'int3c1e_r6_origk')
+    for cn in range(1, 5):
+        fakecell = pbcgto.pseudo.pp_int.fake_cell_vloc(mol, cn)
+        if fakecell.nbas > 0:
+            v = _aux_e2(mol, fakecell, intors[cn], aosym='s2', comp=1)
+            buf += numpy.einsum('...i->...', v)
+    if numpy.isscalar(buf):
+        vpp_loc =  buf
+    else:
+        vpp_loc = lib.unpack_tril(buf)
+    return vpp_loc
+
+def get_pp_loc(mol):
+    # TODO use analytic integral
+    grids = gen_grid.Grids(mol)
+    grids.level = 3
+    grids.build(with_non0tab=True)
+    _numint = numint.NumInt()
+
+    vpp = 0
+    for ao, mask, weight, coords in _numint.block_loop(mol, grids):
+        vloc = get_pp_loc_part1_rs(mol, coords)
+        vpp += numpy.einsum("g,g,gi,gj->ij", weight, vloc, ao, ao)
+    vpp += get_pp_loc_part2(mol)
+    return vpp
+
+def get_pp_nl(mol):
+    nao = mol.nao
+    fakecell, hl_blocks = pbcgto.pseudo.pp_int.fake_cell_vnl(mol)
+    ppnl_half = _int_vnl(mol, fakecell, hl_blocks)
+
+    ppnl = numpy.zeros((nao,nao), dtype=numpy.double)
+    offset = [0] * 3
+    for ib, hl in enumerate(hl_blocks):
+        l = fakecell.bas_angular(ib)
+        nd = 2 * l + 1
+        hl_dim = hl.shape[0]
+        ilp = numpy.ndarray((hl_dim,nd,nao), dtype=numpy.double)
+        for i in range(hl_dim):
+            p0 = offset[i]
+            ilp[i] = ppnl_half[i][p0:p0+nd]
+            offset[i] = p0 + nd
+        ppnl += numpy.einsum('ilp,ij,jlq->pq', ilp, hl, ilp)
+    return ppnl
+
+def _int_vnl(cell, fakecell, hl_blocks):
+    intopt = lib.c_null_ptr()
+
+    def int_ket(_bas, intor):
+        if len(_bas) == 0:
+            return []
+        intor = cell._add_suffix(intor)
+        atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                     fakecell._atm, _bas, fakecell._env)
+        atm = numpy.asarray(atm, dtype=numpy.int32)
+        bas = numpy.asarray(bas, dtype=numpy.int32)
+        env = numpy.asarray(env, dtype=numpy.double)
+        nbas = len(bas)
+        shls_slice = (cell.nbas, nbas, 0, cell.nbas)
+        ao_loc = gto.moleintor.make_loc(bas, intor)
+        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
+        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
+        out = numpy.empty((ni,nj), dtype=numpy.double)
+        comp = 1
+        out = gto.moleintor.getints2c(intor, atm, bas, env, shls_slice=shls_slice, comp=comp, hermi=0,
+                                      ao_loc=ao_loc, cintopt=intopt, out=out)
+        return out
+
+    hl_dims = numpy.asarray([len(hl) for hl in hl_blocks])
+    out = (int_ket(fakecell._bas[hl_dims>0], 'int1e_ovlp'),
+           int_ket(fakecell._bas[hl_dims>1], 'int1e_r2_origi'),
+           int_ket(fakecell._bas[hl_dims>2], 'int1e_r4_origi'))
+    return out
+
+class AtomSCFPP(atom_hf.AtomSphAverageRHF):
+    def get_hcore(self, mol=None):
+        if mol is None:
+            mol = self.mol
+        h = mol.intor('int1e_kin', hermi=1)
+        h += get_pp_nl(mol)
+        h += get_pp_loc(mol)
+        return h
+
+class AtomHF1ePP(rohf.HF1e, AtomSCFPP):
+    eig = AtomSCFPP.eig
+    get_hcore = AtomSCFPP.get_hcore
diff --git a/pyscf/scf/dhf.py b/pyscf/scf/dhf.py
index 32d2d0f7f2..6e29d5a450 100644
--- a/pyscf/scf/dhf.py
+++ b/pyscf/scf/dhf.py
@@ -285,14 +285,14 @@ def fproj(mo):
     return dm
 
 
-def get_init_guess(mol, key='minao'):
+def get_init_guess(mol, key='minao', **kwargs):
     '''Generate density matrix for initial guess
 
     Kwargs:
         key : str
             One of 'minao', 'atom', 'huckel', 'mod_huckel', 'hcore', '1e', 'chkfile'.
     '''
-    return UHF(mol).get_init_guess(mol, key)
+    return UHF(mol).get_init_guess(mol, key, **kwargs)
 
 def time_reversal_matrix(mol, mat):
     ''' T(A_ij) = A[T(i),T(j)]^*
diff --git a/pyscf/scf/diis.py b/pyscf/scf/diis.py
index 321f81cdfe..a442f58b9c 100644
--- a/pyscf/scf/diis.py
+++ b/pyscf/scf/diis.py
@@ -72,13 +72,13 @@ def get_num_vec(self):
 def get_err_vec_orig(s, d, f):
     '''error vector = SDF - FDS'''
     if isinstance(f, numpy.ndarray) and f.ndim == 2:
-        sdf = reduce(numpy.dot, (s,d,f))
+        sdf = reduce(lib.dot, (s,d,f))
         errvec = (sdf.conj().T - sdf).ravel()
 
     elif isinstance(f, numpy.ndarray) and f.ndim == 3 and s.ndim == 3:
         errvec = []
         for i in range(f.shape[0]):
-            sdf = reduce(numpy.dot, (s[i], d[i], f[i]))
+            sdf = reduce(lib.dot, (s[i], d[i], f[i]))
             errvec.append((sdf.conj().T - sdf).ravel())
         errvec = numpy.hstack(errvec)
 
@@ -98,7 +98,7 @@ def get_err_vec_orth(s, d, f, Corth):
         sym_forbid = orbsym[:,None] != orbsym
 
     if isinstance(f, numpy.ndarray) and f.ndim == 2:
-        sdf = reduce(numpy.dot, (Corth.conj().T, s, d, f, Corth))
+        sdf = reduce(lib.dot, (Corth.conj().T, s, d, f, Corth))
         if orbsym is not None:
             sdf[sym_forbid] = 0
         errvec = (sdf.conj().T - sdf).ravel()
@@ -106,7 +106,7 @@ def get_err_vec_orth(s, d, f, Corth):
     elif isinstance(f, numpy.ndarray) and f.ndim == 3 and s.ndim == 3:
         errvec = []
         for i in range(f.shape[0]):
-            sdf = reduce(numpy.dot, (Corth[i].conj().T, s[i], d[i], f[i], Corth[i]))
+            sdf = reduce(lib.dot, (Corth[i].conj().T, s[i], d[i], f[i], Corth[i]))
             if orbsym is not None:
                 sdf[sym_forbid] = 0
             errvec.append((sdf.conj().T - sdf).ravel())
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index b6ecb5ace0..7a8c0e8f22 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -115,8 +115,10 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad)
 
     mol = mf.mol
+    s1e = mf.get_ovlp(mol)
+
     if dm0 is None:
-        dm = mf.get_init_guess(mol, mf.init_guess)
+        dm = mf.get_init_guess(mol, mf.init_guess, s1e=s1e)
     else:
         dm = dm0
 
@@ -128,13 +130,6 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
     scf_conv = False
     mo_energy = mo_coeff = mo_occ = None
 
-    s1e = mf.get_ovlp(mol)
-    cond = lib.cond(s1e)
-    logger.debug(mf, 'cond(S) = %s', cond)
-    if numpy.max(cond)*1e-17 > conv_tol:
-        logger.warn(mf, 'Singularity detected in overlap matrix (condition number = %4.3g). '
-                    'SCF may be inaccurate and hard to converge.', numpy.max(cond))
-
     # Skip SCF iterations. Compute only the total energy of the initial density
     if mf.max_cycle <= 0:
         fock = mf.get_fock(h1e, s1e, vhf, dm)  # = h1e + vhf, no DIIS
@@ -722,14 +717,14 @@ def fproj(mo):
     return dm
 
 
-def get_init_guess(mol, key='minao'):
+def get_init_guess(mol, key='minao', **kwargs):
     '''Generate density matrix for initial guess
 
     Kwargs:
         key : str
             One of 'minao', 'atom', 'huckel', 'hcore', '1e', 'chkfile'.
     '''
-    return RHF(mol).get_init_guess(mol, key)
+    return RHF(mol).get_init_guess(mol, key, **kwargs)
 
 
 # eigenvalue of d is 1
@@ -752,7 +747,7 @@ def level_shift(s, d, f, factor):
     Returns:
         New Fock matrix, 2D ndarray
     '''
-    dm_vir = s - reduce(numpy.dot, (s, d, s))
+    dm_vir = s - reduce(lib.dot, (s, d, s))
     return f + dm_vir * factor
 
 
@@ -1570,6 +1565,15 @@ def __init__(self, mol):
         self._opt = {None: None}
         self._eri = None # Note: self._eri requires large amount of memory
 
+    def check_sanity(self):
+        s1e = self.get_ovlp()
+        cond = lib.cond(s1e)
+        logger.debug(self, 'cond(S) = %s', cond)
+        if numpy.max(cond)*1e-17 > self.conv_tol:
+            logger.warn(self, 'Singularity detected in overlap matrix (condition number = %4.3g). '
+                        'SCF may be inaccurate and hard to converge.', numpy.max(cond))
+        return super().check_sanity()
+
     def build(self, mol=None):
         if mol is None: mol = self.mol
         if self.verbose >= logger.WARN:
@@ -1704,7 +1708,7 @@ def from_chk(self, chkfile=None, project=None):
         return self.init_guess_by_chkfile(chkfile, project)
     from_chk.__doc__ = init_guess_by_chkfile.__doc__
 
-    def get_init_guess(self, mol=None, key='minao'):
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
         if not isinstance(key, str):
             return key
 
@@ -1742,7 +1746,7 @@ def get_init_guess(self, mol=None, key='minao'):
     energy_tot = energy_tot
 
     def energy_nuc(self):
-        return self.mol.energy_nuc()
+        return self.mol.enuc
 
     # A hook for overloading convergence criteria in SCF iterations. Assigning
     # a function
@@ -2103,8 +2107,8 @@ def check_sanity(self):
                         mol.nelectron)
         return SCF.check_sanity(self)
 
-    def get_init_guess(self, mol=None, key='minao'):
-        dm = SCF.get_init_guess(self, mol, key)
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
+        dm = SCF.get_init_guess(self, mol, key, **kwargs)
         if self.verbose >= logger.DEBUG1:
             s = self.get_ovlp()
             nelec = numpy.einsum('ij,ji', dm, s).real
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index 4f07335bd6..0afc66d0ba 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -130,8 +130,8 @@ def _break_dm_spin_symm(mol, dm):
             dmb[...,p0:p1,p0:p1] = dma[...,p0:p1,p0:p1]
     return dma, dmb
 
-def get_init_guess(mol, key='minao'):
-    return UHF(mol).get_init_guess(mol, key)
+def get_init_guess(mol, key='minao', **kwargs):
+    return UHF(mol).get_init_guess(mol, key, **kwargs)
 
 def make_rdm1(mo_coeff, mo_occ, **kwargs):
     '''One-particle density matrix in AO representation
@@ -830,8 +830,8 @@ def make_rdm2(self, mo_coeff=None, mo_occ=None, **kwargs):
 
     energy_elec = energy_elec
 
-    def get_init_guess(self, mol=None, key='minao'):
-        dm = hf.SCF.get_init_guess(self, mol, key)
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
+        dm = hf.SCF.get_init_guess(self, mol, key, **kwargs)
         if self.verbose >= logger.DEBUG1:
             s = self.get_ovlp()
             nelec =(numpy.einsum('ij,ji', dm[0], s).real,

From ce69d48e16996f25236fd1ca4b60e062f37c8369 Mon Sep 17 00:00:00 2001
From: sunchong137 <sunchong137@gmail.com>
Date: Sun, 25 Feb 2024 19:26:51 -0600
Subject: [PATCH 16/44] Fix smearing with predefined chemical potential (#2098)

* added hubbard model with UHF example, fixed chemical potential bug in addons.py

* fix pbc smearing

* add test for smearing with mu0

---------

Co-authored-by: fishjojo <zhangxing.nju@gmail.com>
---
 examples/scf/72-hubbard_finite_temp.py | 44 ++++++++++++++++++++++++++
 pyscf/pbc/scf/addons.py                | 16 +++++++---
 pyscf/scf/addons.py                    | 18 +++++++----
 pyscf/scf/test/test_addons.py          | 28 ++++++++++++++++
 4 files changed, 95 insertions(+), 11 deletions(-)
 create mode 100644 examples/scf/72-hubbard_finite_temp.py

diff --git a/examples/scf/72-hubbard_finite_temp.py b/examples/scf/72-hubbard_finite_temp.py
new file mode 100644
index 0000000000..9033191baf
--- /dev/null
+++ b/examples/scf/72-hubbard_finite_temp.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+#
+# Author: Chong Sun <sunchong137@gmail.com>
+#
+
+'''
+Simulate model systems with HF.
+Half-filled Hubbard model.
+'''
+
+from pyscf import gto, scf , ao2mo
+import numpy
+
+def _hubbard_hamilts_pbc(L, U):
+    h1e = numpy.zeros((L, L))
+    g2e = numpy.zeros((L,)*4)
+    for i in range(L):
+        h1e[i, (i+1)%L] = h1e[(i+1)%L, i] = -1 
+        g2e[i, i, i, i] = U
+    return h1e, g2e
+
+L = 10
+U = 4
+
+mol = gto.M()
+mol.nelectron = L
+mol.nao = L
+mol.spin = 0
+mol.incore_anyway = True
+mol.build()
+
+# set hamiltonian
+h1e, eri = _hubbard_hamilts_pbc(L, U)
+mf = scf.UHF(mol)
+mf.get_hcore = lambda *args: h1e
+mf._eri = ao2mo.restore(1, eri, L)
+mf.get_ovlp = lambda *args: numpy.eye(L)
+mf.kernel()
+
+# finite temperature 
+from pyscf.scf import addons
+beta = 1
+mf_ft = addons.smearing(mf, sigma=1./beta, method='fermi', fix_spin=True)
+mf_ft.kernel()
diff --git a/pyscf/pbc/scf/addons.py b/pyscf/pbc/scf/addons.py
index 4a726c25fc..474833fd98 100644
--- a/pyscf/pbc/scf/addons.py
+++ b/pyscf/pbc/scf/addons.py
@@ -121,12 +121,17 @@ def get_occ(self, mo_energy_kpts=None, mo_coeff_kpts=None):
             if self.mu0 is None:
                 mu_a, occa = mol_addons._smearing_optimize(f_occ, mo_es[0], nocc[0], sigma)
                 mu_b, occb = mol_addons._smearing_optimize(f_occ, mo_es[1], nocc[1], sigma)
-                mu = [mu_a, mu_b]
-                mo_occs = [occa, occb]
             else:
-                mu = self.mu0
-                mo_occs = f_occ(mu[0], mo_es[0], sigma)
-                mo_occs = f_occ(mu[1], mo_es[1], sigma)
+                if numpy.isscalar(self.mu0):
+                    mu_a = mu_b = self.mu0
+                elif len(self.mu0) == 2:
+                    mu_a, mu_b = self.mu0
+                else:
+                    raise TypeError(f'Unsupported mu0: {self.mu0}')
+                occa = f_occ(mu_a, mo_es[0], sigma)
+                occb = f_occ(mu_b, mo_es[1], sigma)
+            mu = [mu_a, mu_b]
+            mo_occs = [occa, occb]
             self.entropy  = self._get_entropy(mo_es[0], mo_occs[0], mu[0])
             self.entropy += self._get_entropy(mo_es[1], mo_occs[1], mu[1])
             self.entropy /= nkpts
@@ -163,6 +168,7 @@ def get_occ(self, mo_energy_kpts=None, mo_coeff_kpts=None):
             else:
                 # If mu0 is given, fix mu instead of electron number. XXX -Chong Sun
                 mu = self.mu0
+                assert numpy.isscalar(mu)
                 mo_occs = f_occ(mu, mo_es, sigma)
             self.entropy = self._get_entropy(mo_es, mo_occs, mu) / nkpts
             if is_rhf:
diff --git a/pyscf/scf/addons.py b/pyscf/scf/addons.py
index 213d11721a..a120087f33 100644
--- a/pyscf/scf/addons.py
+++ b/pyscf/scf/addons.py
@@ -140,12 +140,17 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
             if self.mu0 is None:
                 mu_a, occa = _smearing_optimize(f_occ, mo_es[0], nocc[0], sigma)
                 mu_b, occb = _smearing_optimize(f_occ, mo_es[1], nocc[1], sigma)
-                mu = [mu_a, mu_b]
-                mo_occs = [occa, occb]
             else:
-                mu = self.mu0
-                mo_occs = f_occ(mu[0], mo_es[0], sigma)
-                mo_occs = f_occ(mu[1], mo_es[1], sigma)
+                if numpy.isscalar(self.mu0):
+                    mu_a = mu_b = self.mu0
+                elif len(self.mu0) == 2:
+                    mu_a, mu_b = self.mu0
+                else:
+                    raise TypeError(f'Unsupported mu0: {self.mu0}')
+                occa = f_occ(mu_a, mo_es[0], sigma)
+                occb = f_occ(mu_b, mo_es[1], sigma)
+            mu = [mu_a, mu_b]
+            mo_occs = [occa, occb]
             self.entropy  = self._get_entropy(mo_es[0], mo_occs[0], mu[0])
             self.entropy += self._get_entropy(mo_es[1], mo_occs[1], mu[1])
             fermi = (_get_fermi(mo_es[0], nocc[0]), _get_fermi(mo_es[1], nocc[1]))
@@ -163,7 +168,7 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
             if is_rohf:
                 mo_occs = mo_occs[0] + mo_occs[1]
         else: # all orbitals treated with the same fermi level
-            nocc = nelectron = self.mol.tot_electrons()
+            nocc = nelectron = self.mol.nelectron
             if is_uhf:
                 mo_es = numpy.hstack(mo_energy)
             else:
@@ -176,6 +181,7 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
             else:
                 # If mu0 is given, fix mu instead of electron number. XXX -Chong Sun
                 mu = self.mu0
+                assert numpy.isscalar(mu)
                 mo_occs = f_occ(mu, mo_es, sigma)
             self.entropy = self._get_entropy(mo_es, mo_occs, mu)
             if is_rhf:
diff --git a/pyscf/scf/test/test_addons.py b/pyscf/scf/test/test_addons.py
index f0d0da2ec6..88595191cd 100644
--- a/pyscf/scf/test/test_addons.py
+++ b/pyscf/scf/test/test_addons.py
@@ -459,6 +459,34 @@ def test_rohf_smearing(self):
         self.assertAlmostEqual(myhf_s.e_tot, -243.086989253, 5)
         self.assertAlmostEqual(myhf_s.entropy, 17.11431, 4)
 
+    def test_smearing_mu0(self):
+        def _hubbard_hamilts_pbc(L, U):
+            h1e = numpy.zeros((L, L))
+            g2e = numpy.zeros((L,)*4)
+            for i in range(L):
+                h1e[i, (i+1)%L] = h1e[(i+1)%L, i] = -1
+                g2e[i, i, i, i] = U
+            return h1e, g2e
+
+        L = 10
+        U = 4
+
+        mol = gto.M()
+        mol.nelectron = L
+        mol.nao = L
+        mol.incore_anyway = True
+        mol.build()
+
+        h1e, eri = _hubbard_hamilts_pbc(L, U)
+        mf = scf.UHF(mol)
+        mf.get_hcore = lambda *args: h1e
+        mf._eri = eri
+        mf.get_ovlp = lambda *args: numpy.eye(L)
+        mf_ft = addons.smearing(mf, sigma=.1, mu0=2., fix_spin=True)
+        mf_ft.kernel()
+        self.assertAlmostEqual(mf_ft.e_tot, -2.93405853397115, 5)
+        self.assertAlmostEqual(mf_ft.entropy, 0.11867520273160392, 5)
+
 if __name__ == "__main__":
     print("Full Tests for addons")
     unittest.main()

From fb49e40667f5d145074406c46047a0556dc94065 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 26 Feb 2024 14:46:42 -0800
Subject: [PATCH 17/44] Add tests for ndarray_pointer_2d function (#2101)

* Add tests for ndarray_pointer_2d function

* Fix ndarray_pointer_2d
---
 pyscf/lib/numpy_helper.py                 | 10 ++++------
 pyscf/lib/test/test_numpy_helper.py       |  5 +++++
 pyscf/pbc/dft/multigrid/multigrid_pair.py |  4 ++--
 pyscf/pbc/gto/cell.py                     |  2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/pyscf/lib/numpy_helper.py b/pyscf/lib/numpy_helper.py
index 58508d9f8b..e05e687f1e 100644
--- a/pyscf/lib/numpy_helper.py
+++ b/pyscf/lib/numpy_helper.py
@@ -1117,15 +1117,13 @@ def expm(a):
     return y
 
 def ndarray_pointer_2d(array):
-    '''Get the C pointer of a 2D array
+    '''Return an array that contains the addresses of the first element in each
+    row of the input 2d array.
     '''
     assert array.ndim == 2
     assert array.flags.c_contiguous
-
-    ptr = (array.ctypes.data +
-           numpy.arange(array.shape[0])*array.strides[0]).astype(numpy.uintp)
-    ptr = ptr.ctypes.data_as(ctypes.c_void_p)
-    return ptr
+    i = numpy.arange(array.shape[0])
+    return array.ctypes.data + (i * array.strides[0]).astype(numpy.uintp)
 
 class NPArrayWithTag(numpy.ndarray):
     # Initialize kwargs in function tag_array
diff --git a/pyscf/lib/test/test_numpy_helper.py b/pyscf/lib/test/test_numpy_helper.py
index 93e698f354..0b9ca0ec57 100644
--- a/pyscf/lib/test/test_numpy_helper.py
+++ b/pyscf/lib/test/test_numpy_helper.py
@@ -225,6 +225,11 @@ def test_split_reshape(self):
         self.assertRaises(ValueError, lib.split_reshape, numpy.arange(3), ((2,2),))
         self.assertRaises(ValueError, lib.split_reshape, numpy.arange(3), (2,2))
 
+    def test_ndarray_pointer_2d(self):
+        a = numpy.eye(3)
+        addr = lib.ndarray_pointer_2d(a)
+        self.assertTrue(all(addr == a.ctypes.data + numpy.array([0, 24, 48])))
+
 if __name__ == "__main__":
     print("Full Tests for numpy_helper")
     unittest.main()
diff --git a/pyscf/pbc/dft/multigrid/multigrid_pair.py b/pyscf/pbc/dft/multigrid/multigrid_pair.py
index 3ef43b688d..82068b7d93 100644
--- a/pyscf/pbc/dft/multigrid/multigrid_pair.py
+++ b/pyscf/pbc/dft/multigrid/multigrid_pair.py
@@ -313,12 +313,12 @@ def build_task_list(cell, gridlevel_info, cell1=None, Ls=None, hermi=0, precisio
              ish_bas.ctypes.data_as(ctypes.c_void_p),
              ish_env.ctypes.data_as(ctypes.c_void_p),
              ish_rcut.ctypes.data_as(ctypes.c_void_p),
-             ptr_ipgf_rcut,
+             ptr_ipgf_rcut.ctypes,
              jsh_atm.ctypes.data_as(ctypes.c_void_p),
              jsh_bas.ctypes.data_as(ctypes.c_void_p),
              jsh_env.ctypes.data_as(ctypes.c_void_p),
              jsh_rcut.ctypes.data_as(ctypes.c_void_p),
-             ptr_jpgf_rcut,
+             ptr_jpgf_rcut.ctypes,
              ctypes.c_int(nish), ctypes.c_int(njsh),
              Ls.ctypes.data_as(ctypes.c_void_p),
              ctypes.c_double(precision), ctypes.c_int(hermi))
diff --git a/pyscf/pbc/gto/cell.py b/pyscf/pbc/gto/cell.py
index 872fda36e2..15b4fa26d9 100644
--- a/pyscf/pbc/gto/cell.py
+++ b/pyscf/pbc/gto/cell.py
@@ -995,7 +995,7 @@ def rcut_by_shells(cell, precision=None, rcut=0,
         nprim = bas[:,mole.NPRIM_OF].max()
         # be careful that the unused memory blocks are not initialized
         pgf_radius = np.empty((nbas,nprim), order='C', dtype=np.double)
-        ptr_pgf_radius = lib.ndarray_pointer_2d(pgf_radius)
+        ptr_pgf_radius = lib.ndarray_pointer_2d(pgf_radius).ctypes
     else:
         ptr_pgf_radius = lib.c_null_ptr()
     fn = getattr(libpbc, 'rcut_by_shells', None)

From e2cc8c136ed0e57f8597ce3a06e66f97630916ac Mon Sep 17 00:00:00 2001
From: Victor Yu <victor.wz.yu.1012@gmail.com>
Date: Wed, 28 Feb 2024 00:16:25 -0600
Subject: [PATCH 18/44] Fix transform_ci for more than 64 orbitals (#2095)

* Fix transform_ci for more than 64 orbitals

* Separate occ_masks into a function
---
 pyscf/fci/addons.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/pyscf/fci/addons.py b/pyscf/fci/addons.py
index 95fda3f6ce..3f794f1a63 100644
--- a/pyscf/fci/addons.py
+++ b/pyscf/fci/addons.py
@@ -658,22 +658,15 @@ def transform_ci(ci, nelec, u):
     nb_new = cistring.num_strings(norb_new, nelecb)
     ci = ci.reshape(na_old, nb_old)
 
-    one_particle_strs_old = numpy.asarray([1 << i for i in range(norb_old)])
-    one_particle_strs_new = numpy.asarray([1 << i for i in range(norb_new)])
-
     if neleca == 0:
         trans_ci_a = numpy.ones((1, 1))
     else:
         trans_ci_a = numpy.zeros((na_old, na_new), dtype=ua.dtype)
-        strs_old = numpy.asarray(cistring.make_strings(range(norb_old), neleca))
-
-        # Unitary transformation array trans_ci is the overlap between two sets of CI basis.
-        occ_masks_old = (strs_old[:,None] & one_particle_strs_old) != 0
+        occ_masks_old = _init_occ_masks(norb_old, neleca, na_old)
         if norb_old == norb_new:
             occ_masks_new = occ_masks_old
         else:
-            strs_new = numpy.asarray(cistring.make_strings(range(norb_new), neleca))
-            occ_masks_new = (strs_new[:,None] & one_particle_strs_new) != 0
+            occ_masks_new = _init_occ_masks(norb_new, neleca, na_new)
 
         # Perform
         #for i in range(na_old): # old basis
@@ -692,14 +685,11 @@ def transform_ci(ci, nelec, u):
         trans_ci_b = numpy.ones((1, 1))
     else:
         trans_ci_b = numpy.zeros((nb_old, nb_new), dtype=ub.dtype)
-        strs_old = numpy.asarray(cistring.make_strings(range(norb_old), nelecb))
-
-        occ_masks_old = (strs_old[:,None] & one_particle_strs_old) != 0
+        occ_masks_old = _init_occ_masks(norb_old, nelecb, nb_old)
         if norb_old == norb_new:
             occ_masks_new = occ_masks_old
         else:
-            strs_new = numpy.asarray(cistring.make_strings(range(norb_new), nelecb))
-            occ_masks_new = (strs_new[:,None] & one_particle_strs_new) != 0
+            occ_masks_new = _init_occ_masks(norb_new, nelecb, nb_new)
 
         occ_idx_all_strs = numpy.where(occ_masks_new)[1].reshape(nb_new,nelecb)
         for i in range(nb_old):
@@ -725,4 +715,17 @@ def _unpack_nelec(nelec, spin=None):
         nelec = neleca, nelecb
     return nelec
 
+def _init_occ_masks(norb, nelec, nci):
+    one_particle_strs = numpy.asarray(cistring.make_strings(range(norb), 1))
+    strs = numpy.asarray(cistring.make_strings(range(norb), nelec))
+    if norb < 64:
+        occ_masks = (strs[:,None] & one_particle_strs) != 0
+    else:
+        occ_masks = numpy.zeros((nci, norb), dtype=bool)
+        for i in range(nci):
+            for j in range(norb):
+                if one_particle_strs[j][0] in strs[i]:
+                    occ_masks[i,j] = True
+    return occ_masks
+
 del (LARGE_CI_TOL, RETURN_STRS, PENALTY)

From a40064009cd3865bce6315d9f87323340e3f343c Mon Sep 17 00:00:00 2001
From: Xubo Wang <wangxubo0201@outlook.com>
Date: Wed, 28 Feb 2024 14:39:18 -0500
Subject: [PATCH 19/44] fix binomial function in fci_string

---
 pyscf/lib/mcscf/fci_string.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pyscf/lib/mcscf/fci_string.c b/pyscf/lib/mcscf/fci_string.c
index adae90da46..5ef53b5161 100644
--- a/pyscf/lib/mcscf/fci_string.c
+++ b/pyscf/lib/mcscf/fci_string.c
@@ -130,12 +130,7 @@ static int binomial(int n, int m)
                 if (m*2 <= n) {
                         m = n - m;
                 }
-                uint64_t i;
-                uint64_t val = 1;
-                for (i = m; i <= n; i++) {
-                        val *= i;
-                        val /= i - m;
-                }
+		int val = binomial(n-1,m-1) + binomial(n-1,m);
                 return val;
         }
 }

From 93898c2b17c56f8632d1a1c3471df6ab97e88073 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Thu, 7 Mar 2024 10:39:36 -0800
Subject: [PATCH 20/44] Fix xcfun high-order derivatives

---
 pyscf/dft/test/test_xc_deriv.py | 44 ++++++++++++++++++++++++++++++---
 pyscf/dft/xc_deriv.py           |  7 +++---
 pyscf/lib/libxcfun.patch        |  6 ++---
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/pyscf/dft/test/test_xc_deriv.py b/pyscf/dft/test/test_xc_deriv.py
index b6e430ffc2..ee976141b1 100644
--- a/pyscf/dft/test/test_xc_deriv.py
+++ b/pyscf/dft/test/test_xc_deriv.py
@@ -419,11 +419,11 @@ def test_libxc_mgga_deriv3(self):
     def test_libxc_gga_deriv4(self):
         rho1 = rho[:,:4].copy()
         xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=4)
-        self.assertAlmostEqual(xc1.sum(), -920.135878252819, 4)
+        self.assertAlmostEqual(xc1.sum(), -1141.356286780069, 1)
 
         rho1 = rho[1,:4].copy()
         xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=4)
-        self.assertAlmostEqual(xc1.sum(), -869.6617638095072, 4)
+        self.assertAlmostEqual(xc1.sum(), -615.116081052867, 1)
 
     @unittest.skipIf(not hasattr(dft, 'xcfun'), 'xcfun order')
     def test_xcfun_lda_deriv3(self):
@@ -525,11 +525,47 @@ def test_xcfun_mgga_deriv3(self):
     def test_xcfun_gga_deriv4(self):
         rho1 = rho[:,:4].copy()
         xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=4)
-        self.assertAlmostEqual(xc1.sum(), -920.135878252819, 9)
+        self.assertAlmostEqual(xc1.sum(), -1141.356286780069, 9)
 
         rho1 = rho[1,:4].copy()
         xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=4)
-        self.assertAlmostEqual(xc1.sum(), -869.6617638095072, 9)
+        self.assertAlmostEqual(xc1.sum(), -615.116081052867, 9)
+
+    @unittest.skipIf(not (hasattr(dft, 'xcfun') and dft.xcfun.MAX_DERIV_ORDER > 3), 'xcfun order')
+    def test_xcfun_gga_deriv4_finite_diff(self):
+        xctype = 'GGA'
+        deriv = 4
+        nvar = 4
+        delta = 1e-6
+
+        spin = 1
+        rhop = rho[:,:nvar].copy()
+        xcp = dft.xcfun.eval_xc1('pbe,', rhop, spin, deriv=deriv)
+        lxc = xc_deriv.transform_xc(rhop, xcp, xctype, spin,4)
+        for s in (0, 1):
+            for t in range(nvar):
+                rhop = rho[:,:nvar].copy()
+                rhop[s,t] += delta * .5
+                xcp = dft.xcfun.eval_xc1('pbe,', rhop, spin, deriv=deriv-1)
+                kxc0 = xc_deriv.transform_xc(rhop, xcp, xctype, spin, deriv-1)
+                rhop[s,t] -= delta
+                xcp = dft.xcfun.eval_xc1('pbe,', rhop, spin, deriv=deriv-1)
+                kxc1 = xc_deriv.transform_xc(rhop, xcp, xctype, spin, deriv-1)
+                self.assertAlmostEqual(abs((kxc0-kxc1)/delta - lxc[s,t]).max(), 0, 7)
+
+        spin = 0
+        rhop = rho[0,:nvar].copy()
+        xcp = dft.xcfun.eval_xc1('b88,', rhop, spin, deriv=deriv)
+        lxc = xc_deriv.transform_xc(rhop, xcp, xctype, spin,4)
+        for t in range(nvar):
+            rhop = rho[0,:nvar].copy()
+            rhop[t] += delta * .5
+            xcp = dft.xcfun.eval_xc1('b88,', rhop, spin, deriv=deriv-1)
+            kxc0 = xc_deriv.transform_xc(rhop, xcp, xctype, spin, deriv-1)
+            rhop[t] -= delta
+            xcp = dft.xcfun.eval_xc1('b88,', rhop, spin, deriv=deriv-1)
+            kxc1 = xc_deriv.transform_xc(rhop, xcp, xctype, spin, deriv-1)
+            self.assertAlmostEqual(abs((kxc0-kxc1)/delta - lxc[t]).max(), 0, 7)
 
 if __name__ == "__main__":
     print("Test xc_deriv")
diff --git a/pyscf/dft/xc_deriv.py b/pyscf/dft/xc_deriv.py
index 59d4c4d905..dd8df01397 100644
--- a/pyscf/dft/xc_deriv.py
+++ b/pyscf/dft/xc_deriv.py
@@ -580,6 +580,7 @@ def transform_xc(rho, xc_val, xctype, spin, order):
                         [dim_lst[i] for i in pair_comb] + rest_dims)
                     xc_tensor_1[diag_idx] += xc_sub
     else:
+        i3to2x2 = _product_uniq_indices(2, 2)
         for n_pairs in range(1, order//2+1):
             p0, p1 = offsets[order-n_pairs:order-n_pairs+2]
             xc_sub = _unfold_gga(rho, xc_val[p0:p1], spin, order-n_pairs,
@@ -589,9 +590,9 @@ def transform_xc(rho, xc_val, xctype, spin, order):
             for i in range(n_pairs):
                 xc_sub[(slice(None),)*i+(0,)] *= 2
                 xc_sub[(slice(None),)*i+(2,)] *= 2
-            sigma_idx = _product_uniq_indices(2, n_pairs*2)
-            xc_sub = xc_sub.reshape((3**n_pairs,) + xc_sub.shape[n_pairs:])
-            xc_sub = xc_sub[sigma_idx]
+            sigma_idx = (i3to2x2[(slice(None),)*2 + (np.newaxis,)*(i*2)]
+                         for i in reversed(range(n_pairs)))
+            xc_sub = xc_sub[tuple(sigma_idx)]
 
             low_sigmas = itertools.combinations(range(order), n_pairs*2)
             pair_combs = [list(itertools.chain(*p[::-1]))
diff --git a/pyscf/lib/libxcfun.patch b/pyscf/lib/libxcfun.patch
index 04e2a2a245..c49d1425e7 100644
--- a/pyscf/lib/libxcfun.patch
+++ b/pyscf/lib/libxcfun.patch
@@ -36,7 +36,7 @@ index 239cef5..6288e40 100644
 +        ttype in[XC_MAX_INVARS], out = 0;
 +        for (int i = 0; i < inlen; i++)
 +          in[i] = input[i];
-+        int k = 1 + inlen + (inlen * (inlen + 1)) / 2;
++        int k = (inlen + 1) * (inlen + 2) * (inlen + 3) * (inlen + 4) / 24; // comb(deriv-1+inlen, deriv-1);
 +        for (int i = 0; i < inlen; i++) {
 +          in[i].set(VAR0, 1);
 +          for (int j = i; j < inlen; j++) {
@@ -46,7 +46,7 @@ index 239cef5..6288e40 100644
 +              for (int s1 = s; s1 < inlen; s1++) {
 +                in[s1].set(VAR3, 1);
 +                for (int s2 = s1; s2 < inlen; s2++) {
-+                  in[s1].set(VAR4, 1);
++                  in[s2].set(VAR4, 1);
 +                  densvars<ttype> d(fun, in);
 +                  out = 0;
 +                  for (int n = 0; n < fun->nr_active_functionals; n++)
@@ -71,7 +71,7 @@ index 239cef5..6288e40 100644
 +        ttype in[XC_MAX_INVARS], out = 0;
 +        for (int i = 0; i < inlen; i++)
 +          in[i] = input[i];
-+        int k = 1 + inlen + (inlen * (inlen + 1)) / 2;
++        int k = (inlen + 1) * (inlen + 2) * (inlen + 3) / 6; // comb(deriv-1+inlen, deriv-1);
 +        for (int i = 0; i < inlen; i++) {
 +          in[i].set(VAR0, 1);
 +          for (int j = i; j < inlen; j++) {

From bd59bf629bfee6dcafe35969760d582cec79afcf Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 11 Mar 2024 22:24:29 -0700
Subject: [PATCH 21/44] More type checks in fci solver (#2122)

* float type checks

* Update mcscf docstring (issue #2119)

* Fix failed tests
---
 pyscf/fci/direct_nosym.py | 101 +++++++++++++++++---------------------
 pyscf/fci/direct_spin0.py |   6 ++-
 pyscf/fci/direct_spin1.py |  12 +++--
 pyscf/mcscf/__init__.py   |  24 ++-------
 pyscf/mcscf/mc1step.py    |  10 ++--
 5 files changed, 66 insertions(+), 87 deletions(-)

diff --git a/pyscf/fci/direct_nosym.py b/pyscf/fci/direct_nosym.py
index 5befc399cc..fcd67f5664 100644
--- a/pyscf/fci/direct_nosym.py
+++ b/pyscf/fci/direct_nosym.py
@@ -49,7 +49,8 @@ def contract_1e(h1e, fcivec, norb, nelec, link_index=None):
 
     na, nlinka = link_indexa.shape[:2]
     nb, nlinkb = link_indexb.shape[:2]
-    assert (fcivec.size == na*nb)
+    assert fcivec.size == na*nb
+    assert fcivec.dtype == h1e.dtype == numpy.float64
     ci1 = numpy.zeros_like(fcivec)
 
     libfci.FCIcontract_a_1e_nosym(h1e.ctypes.data_as(ctypes.c_void_p),
@@ -95,30 +96,47 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
 
     See also :func:`direct_nosym.absorb_h1e`
     '''
-    fcivec = numpy.asarray(fcivec, order='C')
     link_indexa, link_indexb = _unpack(norb, nelec, link_index)
-
     na, nlinka = link_indexa.shape[:2]
     nb, nlinkb = link_indexb.shape[:2]
-    assert (fcivec.size == na*nb)
-    ci1 = numpy.empty_like(fcivec)
-
-    libfci.FCIcontract_2es1(eri.ctypes.data_as(ctypes.c_void_p),
-                            fcivec.ctypes.data_as(ctypes.c_void_p),
-                            ci1.ctypes.data_as(ctypes.c_void_p),
-                            ctypes.c_int(norb),
-                            ctypes.c_int(na), ctypes.c_int(nb),
-                            ctypes.c_int(nlinka), ctypes.c_int(nlinkb),
-                            link_indexa.ctypes.data_as(ctypes.c_void_p),
-                            link_indexb.ctypes.data_as(ctypes.c_void_p))
-    return ci1.view(direct_spin1.FCIvector)
+    assert fcivec.size == na*nb
+    if fcivec.dtype == eri.dtype == numpy.float64:
+        fcivec = numpy.asarray(fcivec, order='C')
+        eri = numpy.asarray(eri, order='C')
+        ci1 = numpy.empty_like(fcivec)
+        libfci.FCIcontract_2es1(eri.ctypes.data_as(ctypes.c_void_p),
+                                fcivec.ctypes.data_as(ctypes.c_void_p),
+                                ci1.ctypes.data_as(ctypes.c_void_p),
+                                ctypes.c_int(norb),
+                                ctypes.c_int(na), ctypes.c_int(nb),
+                                ctypes.c_int(nlinka), ctypes.c_int(nlinkb),
+                                link_indexa.ctypes.data_as(ctypes.c_void_p),
+                                link_indexb.ctypes.data_as(ctypes.c_void_p))
+        return ci1.view(direct_spin1.FCIvector)
+
+    ciR = numpy.asarray(fcivec.real, order='C')
+    ciI = numpy.asarray(fcivec.imag, order='C')
+    eriR = numpy.asarray(eri.real, order='C')
+    eriI = numpy.asarray(eri.imag, order='C')
+    link_index = (link_indexa, link_indexb)
+    outR  = contract_2e(eriR, ciR, norb, nelec, link_index=link_index)
+    outR -= contract_2e(eriI, ciI, norb, nelec, link_index=link_index)
+    outI  = contract_2e(eriR, ciI, norb, nelec, link_index=link_index)
+    outI += contract_2e(eriI, ciR, norb, nelec, link_index=link_index)
+    out = outR.astype(numpy.complex128)
+    out.imag = outI
+    return outR
 
 def absorb_h1e(h1e, eri, norb, nelec, fac=1):
     '''Modify 2e Hamiltonian to include 1e Hamiltonian contribution.
     '''
     if not isinstance(nelec, (int, numpy.number)):
         nelec = sum(nelec)
-    h2e = ao2mo.restore(1, eri.copy(), norb).astype(h1e.dtype, copy=False)
+    if h1e.dtype == eri.dtype == numpy.float64:
+        h2e = ao2mo.restore(1, eri.copy(), norb)
+    else:
+        assert eri.ndim == 4
+        h2e = eri.astype(dtype=numpy.result_type(h1e, eri), copy=True)
     f1e = h1e - numpy.einsum('jiik->jk', h2e) * .5
     f1e = f1e * (1./(nelec+1e-100))
     for k in range(norb):
@@ -133,7 +151,12 @@ def energy(h1e, eri, fcivec, norb, nelec, link_index=None):
     ci1 = contract_2e(h2e, fcivec, norb, nelec, link_index)
     return numpy.dot(fcivec.reshape(-1), ci1.reshape(-1))
 
-make_hdiag = direct_spin1.make_hdiag
+def make_hdiag(h1e, eri, norb, nelec, compress=False):
+    if h1e.dtype == numpy.complex128:
+        h1e = h1e.real.copy()
+    if eri.dtype == numpy.complex128:
+        eri = eri.real.copy()
+    return direct_spin1.make_hdiag(h1e, eri, norb, nelec, compress)
 
 
 class FCISolver(direct_spin1.FCISolver):
@@ -151,6 +174,10 @@ def contract_2e(self, eri, fcivec, norb, nelec, link_index=None):
     def absorb_h1e(self, h1e, eri, norb, nelec, fac=1):
         return absorb_h1e(h1e, eri, norb, nelec, fac)
 
+    def make_hdiag(self, h1e, eri, norb, nelec, compress=False):
+        nelec = direct_spin1._unpack_nelec(nelec, self.spin)
+        return make_hdiag(h1e, eri, norb, nelec, compress)
+
     def kernel(self, h1e, eri, norb, nelec, ci0=None,
                tol=None, lindep=None, max_cycle=None, max_space=None,
                nroots=None, davidson_only=None, pspace_size=None,
@@ -160,6 +187,7 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
             neleca = nelec - nelecb
         else:
             neleca, nelecb = nelec
+        davidson_only = True
         link_indexa = cistring.gen_linkstr_index(range(norb), neleca)
         link_indexb = cistring.gen_linkstr_index(range(norb), nelecb)
         e, c = direct_spin1.kernel_ms1(self, h1e, eri, norb, nelec, ci0,
@@ -206,42 +234,3 @@ def _unpack(norb, nelec, link_index):
         return link_indexa, link_indexb
     else:
         return link_index
-
-
-if __name__ == '__main__':
-    from functools import reduce
-    from pyscf import gto
-    from pyscf import scf
-
-    mol = gto.Mole()
-    mol.verbose = 0
-    mol.output = None#"out_h2o"
-    mol.atom = [
-        ['H', ( 1.,-1.    , 0.   )],
-        ['H', ( 0.,-1.    ,-1.   )],
-        ['H', ( 1.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-0.   )],
-        ['H', ( 0.,-0.    ,-1.   )],
-        ['H', ( 1.,-0.5   , 0.   )],
-        ['H', ( 0., 1.    , 1.   )],
-    ]
-
-    mol.basis = {'H': 'sto-3g'}
-    mol.build()
-
-    m = scf.RHF(mol)
-    ehf = m.scf()
-
-    cis = FCISolver(mol)
-    norb = m.mo_coeff.shape[1]
-    nelec = mol.nelectron - 2
-    h1e = reduce(numpy.dot, (m.mo_coeff.T, m.get_hcore(), m.mo_coeff))
-    eri = ao2mo.incore.general(m._eri, (m.mo_coeff,)*4, compact=False)
-    eri = eri.reshape(norb,norb,norb,norb)
-    nea = nelec//2 + 1
-    neb = nelec//2 - 1
-    nelec = (nea, neb)
-
-    e1 = cis.kernel(h1e, eri, norb, nelec)[0]
-    print(e1, e1 - -7.7466756526056004)
diff --git a/pyscf/fci/direct_spin0.py b/pyscf/fci/direct_spin0.py
index 46a21fd790..5d80bc87ec 100644
--- a/pyscf/fci/direct_spin0.py
+++ b/pyscf/fci/direct_spin0.py
@@ -59,7 +59,8 @@ def contract_1e(f1e, fcivec, norb, nelec, link_index=None):
         # Handle computability. link_index should be (nparray, nparray)
         link_index = link_index[0]
     na, nlink = link_index.shape[:2]
-    assert (fcivec.size == na**2)
+    assert fcivec.size == na**2
+    assert fcivec.dtype == f1e.dtype == numpy.float64
     ci1 = numpy.empty_like(fcivec)
     f1e_tril = lib.pack_tril(f1e)
     libfci.FCIcontract_1e_spin0(f1e_tril.ctypes.data_as(ctypes.c_void_p),
@@ -92,7 +93,8 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
         # Handle computability. link_index should be (nparray, nparray)
         link_index = link_index[0]
     na, nlink = link_index.shape[:2]
-    assert (fcivec.size == na**2)
+    assert fcivec.size == na**2
+    assert fcivec.dtype == eri.dtype == numpy.float64
     ci1 = numpy.empty((na,na))
 
     libfci.FCIcontract_2e_spin0(eri.ctypes.data_as(ctypes.c_void_p),
diff --git a/pyscf/fci/direct_spin1.py b/pyscf/fci/direct_spin1.py
index 4b5b5a4946..5c63cfdf59 100644
--- a/pyscf/fci/direct_spin1.py
+++ b/pyscf/fci/direct_spin1.py
@@ -63,7 +63,8 @@ def contract_1e(f1e, fcivec, norb, nelec, link_index=None):
     link_indexa, link_indexb = _unpack(norb, nelec, link_index)
     na, nlinka = link_indexa.shape[:2]
     nb, nlinkb = link_indexb.shape[:2]
-    assert (fcivec.size == na*nb)
+    assert fcivec.size == na*nb
+    assert fcivec.dtype == f1e.dtype == numpy.float64
     f1e_tril = lib.pack_tril(f1e)
     ci1 = numpy.zeros_like(fcivec)
     libfci.FCIcontract_a_1e(f1e_tril.ctypes.data_as(ctypes.c_void_p),
@@ -123,11 +124,12 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
     See also :func:`direct_spin1.absorb_h1e`
     '''
     fcivec = numpy.asarray(fcivec, order='C')
-    eri = ao2mo.restore(4, eri, norb)
+    eri = numpy.asarray(ao2mo.restore(4, eri, norb), order='C')
     link_indexa, link_indexb = _unpack(norb, nelec, link_index)
     na, nlinka = link_indexa.shape[:2]
     nb, nlinkb = link_indexb.shape[:2]
-    assert (fcivec.size == na*nb)
+    assert fcivec.size == na*nb
+    assert fcivec.dtype == eri.dtype == numpy.float64
     ci1 = numpy.empty_like(fcivec)
 
     libfci.FCIcontract_2e_spin1(eri.ctypes.data_as(ctypes.c_void_p),
@@ -146,12 +148,12 @@ def make_hdiag(h1e, eri, norb, nelec, compress=False):
     Kwargs:
         compress (bool) : whether to remove symmetry forbidden elements
     '''
-    if h1e.dtype == numpy.complex128 or eri.dtype == numpy.complex128:
+    if not (h1e.dtype == eri.dtype == numpy.float64):
         raise NotImplementedError('Complex Hamiltonian')
 
     neleca, nelecb = _unpack_nelec(nelec)
     h1e = numpy.asarray(h1e, order='C')
-    eri = ao2mo.restore(1, eri, norb)
+    eri = numpy.asarray(ao2mo.restore(1, eri, norb), order='C')
     occslsta = occslstb = cistring.gen_occslst(range(norb), neleca)
     if neleca != nelecb:
         occslstb = cistring.gen_occslst(range(norb), nelecb)
diff --git a/pyscf/mcscf/__init__.py b/pyscf/mcscf/__init__.py
index 8d8b105ee6..82196ded33 100644
--- a/pyscf/mcscf/__init__.py
+++ b/pyscf/mcscf/__init__.py
@@ -81,42 +81,28 @@
         Converge threshold.  Default is 1e-7
     conv_tol_grad : float
         Converge threshold for CI gradients and orbital rotation gradients.
-        Default is 1e-4
+        If not specified, it is set to sqrt(conv_tol).
     max_stepsize : float
         The step size for orbital rotation.  Small step size is prefered.
-        Default is 0.03.  
+        Default is 0.02.  
         (NOTE although the default step size is small enough for many systems,
         it happens that the orbital optimizor crosses the barriar of local
         minimum and converge to the neighbour solution, e.g. the CAS(4,4) for
         C2H4 in the test files.  In these systems, adjusting max_stepsize,
-        max_ci_stepsize and max_cycle_micro, max_cycle_micro_inner and
-        ah_start_tol may be helpful)
+        max_ci_stepsize and max_cycle_micro and ah_start_tol may be helpful)
 
         >>> mc = mcscf.CASSCF(mf, 6, 6)
         >>> mc.max_stepsize = .01
         >>> mc.max_cycle_micro = 1
         >>> mc.max_cycle_macro = 100
-        >>> mc.max_cycle_micro_inner = 1
         >>> mc.ah_start_tol = 1e-6
 
-    max_ci_stepsize : float
-        The max size for approximate CI updates.  The approximate updates are
-        used in 1-step algorithm, to estimate the change of CI wavefunction wrt
-        the orbital rotation.  Small step size is prefered.  Default is 0.01.
     max_cycle_macro : int
         Max number of macro iterations.  Default is 50.
     max_cycle_micro : int
         Max number of micro iterations in each macro iteration.  Depending on
         systems, increasing this value might reduce the total macro
-        iterations.  Generally, 2 - 3 steps should be enough.  Default is 2.
-    max_cycle_micro_inner : int
-        Max number of steps for the orbital rotations allowed for the augmented
-        hessian solver.  It can affect the actual size of orbital rotation.
-        Even with a small max_stepsize, a few max_cycle_micro_inner can
-        accumulate the rotation and leads to a significant change of the CAS
-        space.  Depending on systems, increasing this value migh reduce the
-        total number of macro iterations.  The value between 2 - 8 is preferred.
-        Default is 4.
+        iterations.  Generally, 2 - 5 steps should be enough.  Default is 4.
     frozen : int or list
         If integer is given, the inner-most orbitals are excluded from optimization.
         Given the orbital indices (0-based) in a list, any doubly occupied core
@@ -131,7 +117,7 @@
         Linear dependence threshold for AH solver.  Default is 1e-16.
     ah_start_tol : flat, for AH solver.
         In AH solver, the orbital rotation is started without completely solving the AH problem.
-        This value is to control the start point. Default is 1e-4.
+        This value is to control the start point. Default is 2.5.
     ah_start_cycle : int, for AH solver.
         In AH solver, the orbital rotation is started without completely solving the AH problem.
         This value is to control the start point. Default is 3.
diff --git a/pyscf/mcscf/mc1step.py b/pyscf/mcscf/mc1step.py
index 020d6e6d88..0744ebb372 100644
--- a/pyscf/mcscf/mc1step.py
+++ b/pyscf/mcscf/mc1step.py
@@ -595,16 +595,16 @@ class CASSCF(casci.CASBase):
             Converge threshold.  Default is 1e-7
         conv_tol_grad : float
             Converge threshold for CI gradients and orbital rotation gradients.
-            Default is 1e-4
+            If not specified, it is set to sqrt(conv_tol).
         max_stepsize : float
             The step size for orbital rotation.  Small step (0.005 - 0.05) is prefered.
-            Default is 0.03.
+            Default is 0.02.
         max_cycle_macro : int
             Max number of macro iterations.  Default is 50.
         max_cycle_micro : int
             Max number of micro iterations in each macro iteration.  Depending on
             systems, increasing this value might reduce the total macro
-            iterations.  Generally, 2 - 5 steps should be enough.  Default is 3.
+            iterations.  Generally, 2 - 5 steps should be enough.  Default is 4.
         small_rot_tol : float
             Threshold for orbital rotation to be considered small. If the largest orbital
             rotation is smaller than this value, the CI solver will restart from the
@@ -620,10 +620,10 @@ class CASSCF(casci.CASBase):
             Linear dependence threshold for AH solver.  Default is 1e-14.
         ah_start_tol : flat, for AH solver.
             In AH solver, the orbital rotation is started without completely solving the AH problem.
-            This value is to control the start point. Default is 0.2.
+            This value is to control the start point. Default is 2.5.
         ah_start_cycle : int, for AH solver.
             In AH solver, the orbital rotation is started without completely solving the AH problem.
-            This value is to control the start point. Default is 2.
+            This value is to control the start point. Default is 3.
 
             ``ah_conv_tol``, ``ah_max_cycle``, ``ah_lindep``, ``ah_start_tol`` and ``ah_start_cycle``
             can affect the accuracy and performance of CASSCF solver.  Lower

From 4150f854ff293c89ce3bb1907c9d0710a960e66d Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 11 Mar 2024 22:24:56 -0700
Subject: [PATCH 22/44] Improve to_gpu (#2124)

* Update to_gpu

* Improve to_ks/to_hf

* Update to_gpu for df, solvent, qmm, and other methods

* Fix fci initialization
---
 pyscf/cc/ccsd.py                  |  2 +
 pyscf/cc/dfccsd.py                |  2 +
 pyscf/cc/gccsd.py                 |  2 +
 pyscf/cc/uccsd.py                 |  2 +
 pyscf/ci/cisd.py                  |  2 +
 pyscf/df/addons.py                |  2 +-
 pyscf/df/df.py                    |  4 +-
 pyscf/df/df_jk.py                 |  3 +-
 pyscf/df/grad/casscf.py           |  2 +
 pyscf/df/grad/rhf.py              |  8 ++--
 pyscf/df/grad/rks.py              |  4 +-
 pyscf/df/grad/sacasscf.py         |  2 +
 pyscf/df/grad/uhf.py              |  2 +
 pyscf/df/grad/uks.py              |  2 +
 pyscf/df/hessian/rhf.py           |  5 +--
 pyscf/df/hessian/rks.py           |  5 +--
 pyscf/df/hessian/uhf.py           |  1 +
 pyscf/df/hessian/uks.py           |  1 +
 pyscf/dft/dks.py                  |  2 +
 pyscf/dft/gen_grid.py             |  4 +-
 pyscf/dft/gks.py                  |  3 +-
 pyscf/dft/gks_symm.py             |  3 ++
 pyscf/dft/numint.py               |  5 +--
 pyscf/dft/rks.py                  |  7 +--
 pyscf/dft/rks_symm.py             |  7 ++-
 pyscf/dft/roks.py                 |  9 +---
 pyscf/dft/uks.py                  |  8 +---
 pyscf/dft/uks_symm.py             |  3 ++
 pyscf/fci/direct_spin1.py         |  2 +
 pyscf/grad/casci.py               |  2 +
 pyscf/grad/casscf.py              |  2 +
 pyscf/grad/ccsd.py                |  2 +
 pyscf/grad/cisd.py                |  2 +
 pyscf/grad/dhf.py                 |  2 +
 pyscf/grad/mp2.py                 |  2 +
 pyscf/grad/rhf.py                 |  4 +-
 pyscf/grad/rks.py                 |  4 +-
 pyscf/grad/tdrhf.py               |  2 +
 pyscf/grad/uhf.py                 |  3 +-
 pyscf/grad/uks.py                 |  3 +-
 pyscf/hessian/rhf.py              |  7 +--
 pyscf/hessian/rks.py              |  5 +--
 pyscf/hessian/uhf.py              |  3 +-
 pyscf/hessian/uks.py              |  4 +-
 pyscf/lib/diis.py                 |  2 +
 pyscf/lib/misc.py                 | 74 +++++++++++++++++++++++++++----
 pyscf/mcscf/casci.py              |  4 +-
 pyscf/mcscf/casci_symm.py         |  4 +-
 pyscf/mcscf/mc1step.py            |  4 +-
 pyscf/mcscf/newton_casscf_symm.py |  2 +-
 pyscf/mcscf/ucasci.py             |  2 +-
 pyscf/mcscf/umc1step.py           |  2 +-
 pyscf/mp/dfmp2.py                 |  2 +
 pyscf/mp/dfmp2_native.py          |  2 +
 pyscf/mp/dfump2_native.py         |  2 +
 pyscf/mp/gmp2.py                  |  2 +
 pyscf/mp/mp2.py                   |  2 +
 pyscf/mp/ump2.py                  |  2 +
 pyscf/pbc/cc/kccsd.py             |  2 +
 pyscf/pbc/cc/kccsd_rhf.py         |  2 +
 pyscf/pbc/cc/kccsd_uhf.py         |  2 +
 pyscf/pbc/df/df.py                |  2 +
 pyscf/pbc/df/fft.py               |  2 +
 pyscf/pbc/dft/gen_grid.py         |  4 ++
 pyscf/pbc/dft/gks.py              |  2 +
 pyscf/pbc/dft/kgks.py             |  2 +
 pyscf/pbc/dft/krks.py             |  2 +
 pyscf/pbc/dft/kroks.py            |  2 +
 pyscf/pbc/dft/kuks.py             |  2 +
 pyscf/pbc/dft/numint.py           |  4 ++
 pyscf/pbc/dft/rks.py              |  2 +
 pyscf/pbc/dft/roks.py             |  2 +
 pyscf/pbc/dft/uks.py              |  2 +
 pyscf/pbc/mp/kmp2.py              |  2 +
 pyscf/pbc/scf/ghf.py              |  2 +
 pyscf/pbc/scf/hf.py               |  1 +
 pyscf/pbc/scf/kghf.py             |  2 +
 pyscf/pbc/scf/khf.py              |  1 +
 pyscf/pbc/scf/krohf.py            |  1 +
 pyscf/pbc/scf/kuhf.py             |  1 +
 pyscf/pbc/scf/rohf.py             |  1 +
 pyscf/pbc/scf/rsjk.py             |  2 +
 pyscf/pbc/scf/uhf.py              |  1 +
 pyscf/qmmm/itrf.py                | 12 +++++
 pyscf/scf/dhf.py                  |  8 ++--
 pyscf/scf/diis.py                 |  4 --
 pyscf/scf/ghf.py                  | 10 ++---
 pyscf/scf/ghf_symm.py             |  3 +-
 pyscf/scf/hf.py                   | 15 ++++---
 pyscf/scf/hf_symm.py              |  6 +--
 pyscf/scf/rohf.py                 |  4 +-
 pyscf/scf/uhf.py                  |  7 ++-
 pyscf/scf/uhf_symm.py             |  3 +-
 pyscf/sgx/sgx.py                  |  5 +++
 pyscf/solvent/_attach_solvent.py  | 27 ++++++++++-
 pyscf/solvent/ddcosmo.py          |  2 +
 pyscf/soscf/newton_ah.py          |  3 ++
 pyscf/tdscf/rhf.py                |  7 +++
 pyscf/tdscf/uhf.py                |  4 ++
 pyscf/x2c/sfx2c1e.py              |  4 ++
 pyscf/x2c/x2c.py                  |  8 ++++
 101 files changed, 307 insertions(+), 130 deletions(-)

diff --git a/pyscf/cc/ccsd.py b/pyscf/cc/ccsd.py
index 460237208a..65ee49486f 100644
--- a/pyscf/cc/ccsd.py
+++ b/pyscf/cc/ccsd.py
@@ -1365,6 +1365,8 @@ def get_d2_diagnostic(self, t2=None):
         if t2 is None: t2 = self.t2
         return get_d2_diagnostic(t2)
 
+    to_gpu = lib.to_gpu
+
 CC = RCCSD = CCSD
 
 
diff --git a/pyscf/cc/dfccsd.py b/pyscf/cc/dfccsd.py
index f144734067..e95d4fae6e 100644
--- a/pyscf/cc/dfccsd.py
+++ b/pyscf/cc/dfccsd.py
@@ -48,6 +48,8 @@ def _add_vvvv(self, t1, t2, eris, out=None, with_ovvv=False, t2sym=None):
         assert (not self.direct)
         return ccsd.CCSD._add_vvvv(self, t1, t2, eris, out, with_ovvv, t2sym)
 
+    to_gpu = lib.to_gpu
+
 
 def _contract_vvvv_t2(mycc, mol, vvL, t2, out=None, verbose=None):
     '''Ht2 = numpy.einsum('ijcd,acdb->ijab', t2, vvvv)
diff --git a/pyscf/cc/gccsd.py b/pyscf/cc/gccsd.py
index 2db369cfd5..dda254966d 100644
--- a/pyscf/cc/gccsd.py
+++ b/pyscf/cc/gccsd.py
@@ -289,6 +289,8 @@ def spin2spatial(self, tx, orbspin=None):
                 orbspin = orbspin[self.get_frozen_mask()]
         return spin2spatial(tx, orbspin)
 
+    to_gpu = lib.to_gpu
+
 CCSD = GCCSD
 
 
diff --git a/pyscf/cc/uccsd.py b/pyscf/cc/uccsd.py
index a6cbe05519..5bd7b50a32 100644
--- a/pyscf/cc/uccsd.py
+++ b/pyscf/cc/uccsd.py
@@ -758,6 +758,8 @@ def vector_size(self, nmo=None, nocc=None):
     def amplitudes_from_rccsd(self, t1, t2):
         return amplitudes_from_rccsd(t1, t2)
 
+    to_gpu = lib.to_gpu
+
 CCSD = UCCSD
 
 
diff --git a/pyscf/ci/cisd.py b/pyscf/ci/cisd.py
index cbbab7859f..629b00da6d 100644
--- a/pyscf/ci/cisd.py
+++ b/pyscf/ci/cisd.py
@@ -1131,6 +1131,8 @@ def nuc_grad_method(self):
         from pyscf.grad import cisd
         return cisd.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 class RCISD(CISD):
     pass
 
diff --git a/pyscf/df/addons.py b/pyscf/df/addons.py
index 5cd32e7eae..684204b508 100644
--- a/pyscf/df/addons.py
+++ b/pyscf/df/addons.py
@@ -152,7 +152,7 @@ def make_auxbasis(mol, mp2fit=False):
         _basis.update(mol.basis)
         del (_basis['default'])
     else:
-        _basis = mol._basis
+        _basis = mol._basis or {}
 
     auxbasis = {}
     for k in _basis:
diff --git a/pyscf/df/df.py b/pyscf/df/df.py
index 403dd3c37e..82b0d83daa 100644
--- a/pyscf/df/df.py
+++ b/pyscf/df/df.py
@@ -308,9 +308,7 @@ def range_coulomb(self, omega):
             if auxmol_omega is not None:
                 auxmol.omega = auxmol_omega
 
-    def to_gpu(self):
-        from gpu4pyscf.df.df import DF as DF
-        return lib.to_gpu(self.__class__.reset(self.view(DF)))
+    to_gpu = lib.to_gpu
 
 GDF = DF
 
diff --git a/pyscf/df/df_jk.py b/pyscf/df/df_jk.py
index 8d21db6aa2..2c02564efd 100644
--- a/pyscf/df/df_jk.py
+++ b/pyscf/df/df_jk.py
@@ -228,8 +228,7 @@ def CASSCF(self, ncas, nelecas, auxbasis=None, ncore=None, frozen=None):
 
     def to_gpu(self):
         obj = self.undo_df().to_gpu().density_fit()
-        obj.__dict__.update(self.__dict__)
-        return lib.to_gpu(obj)
+        return lib.to_gpu(self, obj)
 
 
 def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
diff --git a/pyscf/df/grad/casscf.py b/pyscf/df/grad/casscf.py
index 56815f96b7..243a100f83 100644
--- a/pyscf/df/grad/casscf.py
+++ b/pyscf/df/grad/casscf.py
@@ -224,6 +224,8 @@ def _finalize(self):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 #from pyscf import mcscf
diff --git a/pyscf/df/grad/rhf.py b/pyscf/df/grad/rhf.py
index 0c945d8d21..91ccd3e543 100644
--- a/pyscf/df/grad/rhf.py
+++ b/pyscf/df/grad/rhf.py
@@ -482,12 +482,14 @@ class Gradients(rhf_grad.Gradients):
     _keys = {'with_df', 'auxbasis_response'}
 
     def __init__(self, mf):
-        assert isinstance(mf, df.df_jk._DFHF)
         # Whether to include the response of DF auxiliary basis when computing
         # nuclear gradients of J/K matrices
         self.auxbasis_response = True
         rhf_grad.Gradients.__init__(self, mf)
 
+    def check_sanity(self):
+        assert isinstance(self.base, df.df_jk._DFHF)
+
     def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True,
                omega=None):
         if omega is None:
@@ -521,8 +523,6 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    def to_gpu(self):
-        from gpu4pyscf.df.grad.rhf import Gradients
-        return lib.to_gpu(self.view(Gradients))
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
diff --git a/pyscf/df/grad/rks.py b/pyscf/df/grad/rks.py
index aaa8f663ea..a53d3d0ae6 100644
--- a/pyscf/df/grad/rks.py
+++ b/pyscf/df/grad/rks.py
@@ -123,8 +123,6 @@ def extra_force(self, atom_id, envs):
             e1 += envs['vhf'].aux[atom_id]
         return e1
 
-    def to_gpu(self):
-        from gpu4pyscf.df.grad.rks import Gradients
-        return lib.to_gpu(self.view(Gradients))
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
diff --git a/pyscf/df/grad/sacasscf.py b/pyscf/df/grad/sacasscf.py
index 1d985d993c..713f5f8e35 100644
--- a/pyscf/df/grad/sacasscf.py
+++ b/pyscf/df/grad/sacasscf.py
@@ -370,3 +370,5 @@ def kernel (self, **kwargs):
     def get_LdotJnuc (self, Lvec, **kwargs):
         with lib.temporary_env (sacasscf_grad, Lci_dot_dgci_dx=Lci_dot_dgci_dx, Lorb_dot_dgorb_dx=Lorb_dot_dgorb_dx):
             return sacasscf_grad.Gradients.get_LdotJnuc (self, Lvec, **kwargs)
+
+    to_gpu = lib.to_gpu
diff --git a/pyscf/df/grad/uhf.py b/pyscf/df/grad/uhf.py
index 0eec773b0d..af2e048591 100644
--- a/pyscf/df/grad/uhf.py
+++ b/pyscf/df/grad/uhf.py
@@ -60,4 +60,6 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
diff --git a/pyscf/df/grad/uks.py b/pyscf/df/grad/uks.py
index e6de663a95..9fa6f5cdf5 100644
--- a/pyscf/df/grad/uks.py
+++ b/pyscf/df/grad/uks.py
@@ -124,4 +124,6 @@ def extra_force(self, atom_id, envs):
             e1 += envs['vhf'].aux[atom_id]
         return e1
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
diff --git a/pyscf/df/hessian/rhf.py b/pyscf/df/hessian/rhf.py
index 5c9f7a17ac..95bc7f9dcf 100644
--- a/pyscf/df/hessian/rhf.py
+++ b/pyscf/df/hessian/rhf.py
@@ -480,10 +480,7 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-
-    def to_gpu(self):
-        from gpu4pyscf.df.hessian.rhf import Hessian
-        return lib.to_gpu(self.view(Hessian))
+    to_gpu = lib.to_gpu
 
 #TODO: Insert into DF class
 
diff --git a/pyscf/df/hessian/rks.py b/pyscf/df/hessian/rks.py
index 79816d8cee..30b59fc8d1 100644
--- a/pyscf/df/hessian/rks.py
+++ b/pyscf/df/hessian/rks.py
@@ -126,10 +126,7 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-
-    def to_gpu(self):
-        from gpu4pyscf.df.hessian.rks import Hessian
-        return lib.to_gpu(self.view(Hessian))
+    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/df/hessian/uhf.py b/pyscf/df/hessian/uhf.py
index 5cb20240f8..b252f99953 100644
--- a/pyscf/df/hessian/uhf.py
+++ b/pyscf/df/hessian/uhf.py
@@ -531,6 +531,7 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
+    to_gpu = lib.to_gpu
 
 #TODO: Insert into DF class
 
diff --git a/pyscf/df/hessian/uks.py b/pyscf/df/hessian/uks.py
index 1afa995973..92624a128f 100644
--- a/pyscf/df/hessian/uks.py
+++ b/pyscf/df/hessian/uks.py
@@ -139,6 +139,7 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
+    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/dft/dks.py b/pyscf/dft/dks.py
index 783c3c546b..d1f35803b9 100644
--- a/pyscf/dft/dks.py
+++ b/pyscf/dft/dks.py
@@ -143,6 +143,8 @@ def x2c1e(self):
         return x2chf
     x2c = x2c1e
 
+    to_gpu = lib.to_gpu
+
 UKS = UDKS = DKS
 
 class RDKS(DKS, dhf.RDHF):
diff --git a/pyscf/dft/gen_grid.py b/pyscf/dft/gen_grid.py
index 77c1c781fd..b97ae96cde 100644
--- a/pyscf/dft/gen_grid.py
+++ b/pyscf/dft/gen_grid.py
@@ -587,9 +587,7 @@ def prune_by_density_(self, rho, threshold=0):
             self.screen_index = self.non0tab
         return self
 
-    def to_gpu(self):
-        from gpu4pyscf.dft.gen_grid import Grids
-        return lib.to_gpu(self.view(Grids))
+    to_gpu = lib.to_gpu
 
 
 def _default_rad(nuc, level=3):
diff --git a/pyscf/dft/gks.py b/pyscf/dft/gks.py
index edfd29e25b..26c6b902e9 100644
--- a/pyscf/dft/gks.py
+++ b/pyscf/dft/gks.py
@@ -177,8 +177,7 @@ def to_hf(self):
         '''Convert to GHF object.'''
         return self._transfer_attrs_(self.mol.GHF())
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/dft/gks_symm.py b/pyscf/dft/gks_symm.py
index 60ff1c2309..1c599a247f 100644
--- a/pyscf/dft/gks_symm.py
+++ b/pyscf/dft/gks_symm.py
@@ -20,6 +20,7 @@
 Generalized Kohn-Sham
 '''
 
+from pyscf import lib
 from pyscf.lib import logger
 from pyscf.scf import ghf_symm
 from pyscf.dft import gks
@@ -57,6 +58,8 @@ def collinear(self, val):
     def nuc_grad_method(self):
         raise NotImplementedError
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     import numpy
diff --git a/pyscf/dft/numint.py b/pyscf/dft/numint.py
index 0b3cfdc74c..1716042118 100644
--- a/pyscf/dft/numint.py
+++ b/pyscf/dft/numint.py
@@ -2865,10 +2865,7 @@ def make_rho(idm, ao, sindex, xctype):
                                      with_lapl)
         return make_rho, ndms, nao
 
-    def to_gpu(self):
-        from gpu4pyscf.dft.numint import NumInt
-        # Note: gpu4pyscf NumInt initializes additional things in __init__.py
-        return NumInt()
+    to_gpu = lib.to_gpu
 
 _NumInt = NumInt
 
diff --git a/pyscf/dft/rks.py b/pyscf/dft/rks.py
index 89f7e5d6d0..2a5d82c2d1 100644
--- a/pyscf/dft/rks.py
+++ b/pyscf/dft/rks.py
@@ -531,9 +531,4 @@ def to_hf(self):
         '''Convert to RHF object.'''
         return self._transfer_attrs_(self.mol.RHF())
 
-    def to_gpu(self):
-        from gpu4pyscf.dft.rks import RKS
-        obj = lib.to_gpu(hf.SCF.reset(self.view(RKS)))
-        # Attributes only defined in gpu4pyscf.RKS
-        obj.screen_tol = 1e-14
-        return obj
+    to_gpu = lib.to_gpu
diff --git a/pyscf/dft/rks_symm.py b/pyscf/dft/rks_symm.py
index f47071f37e..9956532e43 100644
--- a/pyscf/dft/rks_symm.py
+++ b/pyscf/dft/rks_symm.py
@@ -20,6 +20,7 @@
 Non-relativistic Restricted Kohn-Sham
 '''
 
+from pyscf import lib
 from pyscf.scf import hf_symm
 from pyscf.dft import rks
 from pyscf.dft import uks
@@ -46,12 +47,14 @@ def nuc_grad_method(self):
         from pyscf.grad import rks
         return rks.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 RKS = SymAdaptedRKS
 
 
 class SymAdaptedROKS(rks.KohnShamDFT, hf_symm.SymAdaptedROHF):
     ''' Restricted Kohn-Sham '''
-    def __init__(self, mol, xc='LDA,VWN'):
+    def __init__(self, mol=None, xc='LDA,VWN'):
         hf_symm.ROHF.__init__(self, mol)
         rks.KohnShamDFT.__init__(self, xc)
 
@@ -70,6 +73,8 @@ def nuc_grad_method(self):
         from pyscf.grad import roks
         return roks.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 ROKS = SymAdaptedROKS
 
 
diff --git a/pyscf/dft/roks.py b/pyscf/dft/roks.py
index 3f44d5227e..1247757207 100644
--- a/pyscf/dft/roks.py
+++ b/pyscf/dft/roks.py
@@ -65,14 +65,7 @@ def to_hf(self):
         '''Convert to ROHF object.'''
         return self._transfer_attrs_(self.mol.ROHF())
 
-    def to_gpu(self):
-        from pyscf.scf.hf import SCF
-        from gpu4pyscf.dft.roks import ROKS
-        obj = lib.to_gpu(SCF.reset(self.view(ROKS)))
-        # Attributes only defined in gpu4pyscf.RKS
-        obj.screen_tol = 1e-14
-        obj.disp = None
-        return obj
+    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/dft/uks.py b/pyscf/dft/uks.py
index cf2f7d7486..9eee8202a7 100644
--- a/pyscf/dft/uks.py
+++ b/pyscf/dft/uks.py
@@ -197,10 +197,4 @@ def to_hf(self):
         '''Convert to UHF object.'''
         return self._transfer_attrs_(self.mol.UHF())
 
-    def to_gpu(self):
-        from pyscf.scf.hf import SCF
-        from gpu4pyscf.dft.uks import UKS
-        obj = lib.to_gpu(SCF.reset(self.view(UKS)))
-        # Attributes only defined in gpu4pyscf.RKS
-        obj.screen_tol = 1e-14
-        return obj
+    to_gpu = lib.to_gpu
diff --git a/pyscf/dft/uks_symm.py b/pyscf/dft/uks_symm.py
index c6f9597077..7ffa216b10 100644
--- a/pyscf/dft/uks_symm.py
+++ b/pyscf/dft/uks_symm.py
@@ -20,6 +20,7 @@
 Non-relativistic Unrestricted Kohn-Sham
 '''
 
+from pyscf import lib
 from pyscf.lib import logger
 from pyscf.scf import uhf_symm
 from pyscf.dft import uks
@@ -47,6 +48,8 @@ def nuc_grad_method(self):
         from pyscf.grad import uks
         return uks.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 UKS = SymAdaptedUKS
 
 
diff --git a/pyscf/fci/direct_spin1.py b/pyscf/fci/direct_spin1.py
index 5c63cfdf59..7f8d04f3d3 100644
--- a/pyscf/fci/direct_spin1.py
+++ b/pyscf/fci/direct_spin1.py
@@ -945,6 +945,8 @@ def transform_ci_for_orbital_rotation(self, fcivec, norb, nelec, u):
         nelec = _unpack_nelec(nelec, self.spin)
         return addons.transform_ci_for_orbital_rotation(fcivec, norb, nelec, u)
 
+    to_gpu = lib.to_gpu
+
 FCI = FCISolver
 
 class FCIvector(numpy.ndarray):
diff --git a/pyscf/grad/casci.py b/pyscf/grad/casci.py
index ccc29af896..d88adc891a 100644
--- a/pyscf/grad/casci.py
+++ b/pyscf/grad/casci.py
@@ -342,6 +342,8 @@ def _finalize(self):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 from pyscf import mcscf
diff --git a/pyscf/grad/casscf.py b/pyscf/grad/casscf.py
index 0ce78a82b3..466a0bd007 100644
--- a/pyscf/grad/casscf.py
+++ b/pyscf/grad/casscf.py
@@ -220,6 +220,8 @@ def _finalize(self):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 from pyscf import mcscf
diff --git a/pyscf/grad/ccsd.py b/pyscf/grad/ccsd.py
index 6ab4a5373b..18b67f1e36 100644
--- a/pyscf/grad/ccsd.py
+++ b/pyscf/grad/ccsd.py
@@ -456,6 +456,8 @@ def grad_nuc(self, mol=None, atmlst=None):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 ccsd.CCSD.Gradients = lib.class_as_method(Gradients)
diff --git a/pyscf/grad/cisd.py b/pyscf/grad/cisd.py
index 1a57c86e8f..da8099de23 100644
--- a/pyscf/grad/cisd.py
+++ b/pyscf/grad/cisd.py
@@ -203,6 +203,8 @@ def _finalize(self):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 cisd.CISD.Gradients = lib.class_as_method(Gradients)
diff --git a/pyscf/grad/dhf.py b/pyscf/grad/dhf.py
index 88913bf56b..aefb66dfee 100644
--- a/pyscf/grad/dhf.py
+++ b/pyscf/grad/dhf.py
@@ -217,6 +217,8 @@ def kernel(self, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
 
     as_scanner = rhf_grad.as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 from pyscf import scf
diff --git a/pyscf/grad/mp2.py b/pyscf/grad/mp2.py
index 65ff7c8834..5a767a20c2 100644
--- a/pyscf/grad/mp2.py
+++ b/pyscf/grad/mp2.py
@@ -309,6 +309,8 @@ def grad_nuc(self, mol=None, atmlst=None):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 # Inject to RMP2 class
diff --git a/pyscf/grad/rhf.py b/pyscf/grad/rhf.py
index e5eaf7f7f8..e45e3b5ed9 100644
--- a/pyscf/grad/rhf.py
+++ b/pyscf/grad/rhf.py
@@ -463,9 +463,7 @@ def make_rdm1e(self, mo_energy=None, mo_coeff=None, mo_occ=None):
 
     grad_elec = grad_elec
 
-    def to_gpu(self):
-        from gpu4pyscf.grad.rhf import Gradients
-        return lib.to_gpu(self.view(Gradients))
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
 
diff --git a/pyscf/grad/rks.py b/pyscf/grad/rks.py
index b3c9c31ded..bb0198140d 100644
--- a/pyscf/grad/rks.py
+++ b/pyscf/grad/rks.py
@@ -622,9 +622,7 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    def to_gpu(self):
-        from gpu4pyscf.grad.rks import Gradients
-        return lib.to_gpu(self.view(Gradients))
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
 
diff --git a/pyscf/grad/tdrhf.py b/pyscf/grad/tdrhf.py
index bfb08aedba..2d4c5bbb4a 100644
--- a/pyscf/grad/tdrhf.py
+++ b/pyscf/grad/tdrhf.py
@@ -325,6 +325,8 @@ def _finalize(self):
 
     as_scanner = as_scanner
 
+    to_gpu = lib.to_gpu
+
 Grad = Gradients
 
 from pyscf import tdscf
diff --git a/pyscf/grad/uhf.py b/pyscf/grad/uhf.py
index c56878053b..949b7abf44 100644
--- a/pyscf/grad/uhf.py
+++ b/pyscf/grad/uhf.py
@@ -106,8 +106,7 @@ def make_rdm1e(self, mo_energy=None, mo_coeff=None, mo_occ=None):
 
     grad_elec = grad_elec
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
 
diff --git a/pyscf/grad/uks.py b/pyscf/grad/uks.py
index 2f59f9dcc6..cc73955814 100644
--- a/pyscf/grad/uks.py
+++ b/pyscf/grad/uks.py
@@ -275,8 +275,7 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 Grad = Gradients
 
diff --git a/pyscf/hessian/rhf.py b/pyscf/hessian/rhf.py
index a11fc9d7ff..9736eeff5c 100644
--- a/pyscf/hessian/rhf.py
+++ b/pyscf/hessian/rhf.py
@@ -487,9 +487,9 @@ def __init__(self, scf_method):
         self.verbose = scf_method.verbose
         self.stdout = scf_method.stdout
         self.mol = scf_method.mol
-        self.base = scf_method
         self.chkfile = scf_method.chkfile
         self.max_memory = self.mol.max_memory
+        self.base = scf_method
         self.atmlst = range(self.mol.natm)
         self.de = numpy.zeros((0,0,3,3))  # (A,B,dR_A,dR_B)
 
@@ -608,10 +608,7 @@ class Hessian(HessianBase):
     partial_hess_elec = partial_hess_elec
     hess_elec = hess_elec
     make_h1 = make_h1
-
-    def to_gpu(self):
-        from gpu4pyscf.hessian.rhf import Hessian
-        return lib.to_gpu(self.view(Hessian))
+    to_gpu = lib.to_gpu
 
 # Inject to RHF class
 from pyscf import scf
diff --git a/pyscf/hessian/rks.py b/pyscf/hessian/rks.py
index 41bcb63bd2..31ee13115c 100644
--- a/pyscf/hessian/rks.py
+++ b/pyscf/hessian/rks.py
@@ -590,10 +590,7 @@ def __init__(self, mf):
     partial_hess_elec = partial_hess_elec
     hess_elec = rhf_hess.hess_elec
     make_h1 = make_h1
-
-    def to_gpu(self):
-        from gpu4pyscf.hessian.rks import Hessian
-        return lib.to_gpu(self.view(Hessian))
+    to_gpu = lib.to_gpu
 
 from pyscf import dft
 dft.rks.RKS.Hessian = dft.rks_symm.RKS.Hessian = lib.class_as_method(Hessian)
diff --git a/pyscf/hessian/uhf.py b/pyscf/hessian/uhf.py
index 1b30e264ee..4b97fbf6f9 100644
--- a/pyscf/hessian/uhf.py
+++ b/pyscf/hessian/uhf.py
@@ -454,8 +454,7 @@ def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
                          fx, atmlst, max_memory, verbose,
                          max_cycle=self.max_cycle, level_shift=self.level_shift)
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 from pyscf import scf
 scf.uhf.UHF.Hessian = lib.class_as_method(Hessian)
diff --git a/pyscf/hessian/uks.py b/pyscf/hessian/uks.py
index a911f661b4..2c3941452b 100644
--- a/pyscf/hessian/uks.py
+++ b/pyscf/hessian/uks.py
@@ -667,9 +667,7 @@ def __init__(self, mf):
     solve_mo1 = uhf_hess.Hessian.solve_mo1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 from pyscf import dft
 dft.uks.UKS.Hessian = dft.uks_symm.UKS.Hessian = lib.class_as_method(Hessian)
diff --git a/pyscf/lib/diis.py b/pyscf/lib/diis.py
index f6e1cbd9d9..db6b06d8db 100644
--- a/pyscf/lib/diis.py
+++ b/pyscf/lib/diis.py
@@ -334,6 +334,8 @@ def restore(self, filename, inplace=True):
         self._H[1:nd+1,1:nd+1] = e_mat
         return self
 
+    to_gpu = misc.to_gpu
+
 
 def restore(filename):
     '''Restore/construct diis object based on a diis file'''
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 46ebf5edf0..9819f2a978 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -1360,17 +1360,73 @@ def isintsequence(obj):
             are_ints = are_ints and isinteger(i)
         return are_ints
 
-def to_gpu(method):
-    '''Recursively converts all attributes of a method to cupy objects or
-    gpu4pyscf objects.
+class _OmniObject:
+    '''Class with default attributes. When accessing an attribute that is not
+    initialized, a default value will be returned than raising an AttributeError.
+    '''
+    verbose = 0
+    max_memory = param.MAX_MEMORY
+    stdout = sys.stdout
+
+    def __init__(self, default_factory=None):
+        self._default = default_factory
+
+    def __getattr__(self, key):
+        return self._default
+
+# Many methods requires a mol or mf object in initialization.
+# These objects can be as the default arguments for these methods.
+# Then class can be instantiated easily like cls(omniobj) in the following
+# to_gpu function.
+omniobj = _OmniObject()
+omniobj.mol = omniobj
+omniobj._scf = omniobj
+omniobj.base = omniobj
+
+def to_gpu(method, out=None):
+    '''Convert a method to its corresponding GPU variant, and recursively
+    converts all attributes of a method to cupy objects or gpu4pyscf objects.
     '''
     import cupy
     from pyscf import gto
-    for key, val in method.__dict__.items():
-        if isinstance(val, gto.MoleBase):
-            continue
+
+    # If a GPU class inherits a CPU code, the "to_gpu" method may be resolved
+    # and available in the GPU class. Skip the conversion in this case.
+    if method.__module__.startswith('gpu4pyscf'):
+        return method
+
+    if out is None:
+        try:
+            import gpu4pyscf
+        except ImportError:
+            print('Library gpu4pyscf not found. You can install this package via\n'
+                  '    pip install gpu4pyscf-cuda11x\n'
+                  'See more installation info at https://github.com/pyscf/gpu4pyscf')
+            raise
+
+        # TODO: Is it necessary to implement scanner in gpu4pyscf?
+        if isinstance(method, (SinglePointScanner, GradScanner)):
+            method = method.undo_scanner()
+
+        import import_module
+        mod = import_module(method.__module__.replace('pyscf', 'gpu4pyscf'))
+        cls = getattr(mod, method.__class__.__name__)
+        # A temporary GPU instance. This ensures to initialize private
+        # attributes that are only available for GPU code.
+        out = cls(omniobj)
+
+    # Convert only the keys that are defined in the corresponding GPU class
+    cls_keys = [getattr(cls, '_keys', ()) for cls in out.__class__.__mro__[:-1]]
+    out_keys = set(out.__dict__).union(*cls_keys)
+    # Only overwrite the attributes of the same name.
+    keys = set(method.__dict__).intersection(out_keys)
+
+    for key in keys:
+        val = getattr(method, key)
         if isinstance(val, numpy.ndarray):
-            setattr(method, key, cupy.asarray(val))
+            val = cupy.asarray(val)
         elif hasattr(val, 'to_gpu'):
-            setattr(method, key, val.to_gpu())
-    return method
+            val = val.to_gpu()
+        setattr(out, key, val)
+    out.reset()
+    return out
diff --git a/pyscf/mcscf/casci.py b/pyscf/mcscf/casci.py
index 3f00365b48..2d96bfbc72 100644
--- a/pyscf/mcscf/casci.py
+++ b/pyscf/mcscf/casci.py
@@ -770,7 +770,7 @@ class CASBase(lib.StreamObject):
         'e_tot', 'e_cas', 'ci', 'mo_coeff', 'mo_energy', 'mo_occ', 'converged',
     }
 
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None):
         if isinstance(mf_or_mol, gto.Mole):
             mf = scf.RHF(mf_or_mol)
         else:
@@ -1170,6 +1170,8 @@ def nuc_grad_method(self):
         from pyscf.grad import casci
         return casci.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 scf.hf.RHF.CASCI = scf.rohf.ROHF.CASCI = lib.class_as_method(CASCI)
 scf.uhf.UHF.CASCI = None
 
diff --git a/pyscf/mcscf/casci_symm.py b/pyscf/mcscf/casci_symm.py
index a28e1dab9e..df70b31886 100644
--- a/pyscf/mcscf/casci_symm.py
+++ b/pyscf/mcscf/casci_symm.py
@@ -28,7 +28,7 @@
 from pyscf.scf.hf_symm import map_degeneracy
 
 class SymAdaptedCASCI(casci.CASCI):
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None):
         casci.CASCI.__init__(self, mf_or_mol, ncas, nelecas, ncore)
 
         assert (self.mol.symmetry)
@@ -74,6 +74,8 @@ def sort_mo_by_irrep(self, cas_irrep_nocc,
         return addons.sort_mo_by_irrep(self, mo_coeff, cas_irrep_nocc,
                                        cas_irrep_ncore, s)
 
+    to_gpu = lib.to_gpu
+
 CASCI = SymAdaptedCASCI
 
 def eig(mat, orbsym):
diff --git a/pyscf/mcscf/mc1step.py b/pyscf/mcscf/mc1step.py
index 0744ebb372..4a11427959 100644
--- a/pyscf/mcscf/mc1step.py
+++ b/pyscf/mcscf/mc1step.py
@@ -755,7 +755,7 @@ class CASSCF(casci.CASBase):
         'mo_energy', 'converged',
     }
 
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None, frozen=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None, frozen=None):
         casci.CASBase.__init__(self, mf_or_mol, ncas, nelecas, ncore)
         self.frozen = frozen
 
@@ -1296,6 +1296,8 @@ def reset(self, mol=None):
         casci.CASBase.reset(self, mol=mol)
         self._max_stepsize = None
 
+    to_gpu = lib.to_gpu
+
 scf.hf.RHF.CASSCF = scf.rohf.ROHF.CASSCF = lib.class_as_method(CASSCF)
 scf.uhf.UHF.CASSCF = None
 
diff --git a/pyscf/mcscf/newton_casscf_symm.py b/pyscf/mcscf/newton_casscf_symm.py
index c8419933e1..eb526337b3 100644
--- a/pyscf/mcscf/newton_casscf_symm.py
+++ b/pyscf/mcscf/newton_casscf_symm.py
@@ -27,7 +27,7 @@
 
 class CASSCF(newton_casscf.CASSCF):
     __doc__ = newton_casscf.CASSCF.__doc__
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None, frozen=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None, frozen=None):
         newton_casscf.CASSCF.__init__(self, mf_or_mol, ncas, nelecas, ncore, frozen)
         assert (self.mol.symmetry)
         self.fcisolver = fci.solver(self.mol, False, True)
diff --git a/pyscf/mcscf/ucasci.py b/pyscf/mcscf/ucasci.py
index 307f9e8a9c..1c15c25e42 100644
--- a/pyscf/mcscf/ucasci.py
+++ b/pyscf/mcscf/ucasci.py
@@ -119,7 +119,7 @@ def kernel(casci, mo_coeff=None, ci0=None, verbose=logger.NOTE, envs=None):
 
 class UCASBase(CASBase):
     # nelecas is tuple of (nelecas_alpha, nelecas_beta)
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None):
         #assert ('UHF' == mf.__class__.__name__)
         if isinstance(mf_or_mol, gto.Mole):
             mf = scf.UHF(mf_or_mol)
diff --git a/pyscf/mcscf/umc1step.py b/pyscf/mcscf/umc1step.py
index 53b7d1022a..bb8b8778a9 100644
--- a/pyscf/mcscf/umc1step.py
+++ b/pyscf/mcscf/umc1step.py
@@ -379,7 +379,7 @@ class UCASSCF(ucasci.UCASBase):
         'canonicalization', 'sorting_mo_energy',
     }
 
-    def __init__(self, mf_or_mol, ncas, nelecas, ncore=None, frozen=None):
+    def __init__(self, mf_or_mol, ncas=0, nelecas=0, ncore=None, frozen=None):
         ucasci.UCASBase.__init__(self, mf_or_mol, ncas, nelecas, ncore)
         self.frozen = frozen
 
diff --git a/pyscf/mp/dfmp2.py b/pyscf/mp/dfmp2.py
index d8cffdd57a..6522b21c19 100644
--- a/pyscf/mp/dfmp2.py
+++ b/pyscf/mp/dfmp2.py
@@ -140,6 +140,8 @@ def update_amps(self, t2, eris):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
+    to_gpu = lib.to_gpu
+
 MP2 = DFMP2
 
 from pyscf import scf
diff --git a/pyscf/mp/dfmp2_native.py b/pyscf/mp/dfmp2_native.py
index 92a2ef0627..800c85d102 100644
--- a/pyscf/mp/dfmp2_native.py
+++ b/pyscf/mp/dfmp2_native.py
@@ -236,6 +236,8 @@ def kernel(self):
     def nuc_grad_method(self):
         raise NotImplementedError
 
+    to_gpu = lib.to_gpu
+
 
 MP2 = RMP2 = DFMP2 = DFRMP2
 
diff --git a/pyscf/mp/dfump2_native.py b/pyscf/mp/dfump2_native.py
index 5a3a3b0db8..a99f7dfb52 100644
--- a/pyscf/mp/dfump2_native.py
+++ b/pyscf/mp/dfump2_native.py
@@ -226,6 +226,8 @@ def delete(self):
     def nuc_grad_method(self):
         raise NotImplementedError
 
+    to_gpu = lib.to_gpu
+
 
 MP2 = UMP2 = DFMP2 = DFUMP2
 
diff --git a/pyscf/mp/gmp2.py b/pyscf/mp/gmp2.py
index 7fae2ffdd0..8f068df51b 100644
--- a/pyscf/mp/gmp2.py
+++ b/pyscf/mp/gmp2.py
@@ -214,6 +214,8 @@ def nuc_grad_method(self):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
+    to_gpu = lib.to_gpu
+
 MP2 = GMP2
 
 scf.ghf.GHF.MP2 = lib.class_as_method(MP2)
diff --git a/pyscf/mp/mp2.py b/pyscf/mp/mp2.py
index 0955a18bb7..9d1dc431a5 100644
--- a/pyscf/mp/mp2.py
+++ b/pyscf/mp/mp2.py
@@ -649,6 +649,8 @@ def nuc_grad_method(self):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
+    to_gpu = lib.to_gpu
+
 RMP2 = MP2
 
 from pyscf import scf
diff --git a/pyscf/mp/ump2.py b/pyscf/mp/ump2.py
index adbfc3b278..70c5bd8804 100644
--- a/pyscf/mp/ump2.py
+++ b/pyscf/mp/ump2.py
@@ -450,6 +450,8 @@ def nuc_grad_method(self):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
+    to_gpu = lib.to_gpu
+
 MP2 = UMP2
 
 from pyscf import scf
diff --git a/pyscf/pbc/cc/kccsd.py b/pyscf/pbc/cc/kccsd.py
index 3fa54e7024..b837bb9212 100644
--- a/pyscf/pbc/cc/kccsd.py
+++ b/pyscf/pbc/cc/kccsd.py
@@ -463,6 +463,8 @@ def from_uccsd(self, t1, t2, orbspin=None):
     def to_uccsd(self, t1, t2, orbspin=None):
         return spin2spatial(t1, orbspin), spin2spatial(t2, orbspin)
 
+    to_gpu = lib.to_gpu
+
 CCSD = KCCSD = KGCCSD = GCCSD
 
 
diff --git a/pyscf/pbc/cc/kccsd_rhf.py b/pyscf/pbc/cc/kccsd_rhf.py
index 4d100bdd7d..e4b3fb92c5 100644
--- a/pyscf/pbc/cc/kccsd_rhf.py
+++ b/pyscf/pbc/cc/kccsd_rhf.py
@@ -653,6 +653,8 @@ def eaccsd(self, nroots=1, left=False, koopmans=False, guess=None,
     def ao2mo(self, mo_coeff=None):
         return _ERIS(self, mo_coeff)
 
+    to_gpu = lib.to_gpu
+
 #####################################
 # Wrapper functions for IP/EA-EOM
 #####################################
diff --git a/pyscf/pbc/cc/kccsd_uhf.py b/pyscf/pbc/cc/kccsd_uhf.py
index fbac93bf14..df8d580606 100644
--- a/pyscf/pbc/cc/kccsd_uhf.py
+++ b/pyscf/pbc/cc/kccsd_uhf.py
@@ -761,6 +761,8 @@ def vector_to_amplitudes(self, vec, nmo=None, nocc=None, nkpts=None):
         if nkpts is None: nkpts = self.nkpts
         return vector_to_amplitudes(vec, nmo, nocc, nkpts)
 
+    to_gpu = lib.to_gpu
+
 UCCSD = KUCCSD
 
 
diff --git a/pyscf/pbc/df/df.py b/pyscf/pbc/df/df.py
index a03d058109..55d5bd7f16 100644
--- a/pyscf/pbc/df/df.py
+++ b/pyscf/pbc/df/df.py
@@ -524,6 +524,8 @@ def get_naoaux(self):
                     naux += dat.shape[0]
         return naux
 
+    to_gpu = lib.to_gpu
+
 DF = GDF
 
 class CDERIArray:
diff --git a/pyscf/pbc/df/fft.py b/pyscf/pbc/df/fft.py
index 8a1b982239..1d538ae4ee 100644
--- a/pyscf/pbc/df/fft.py
+++ b/pyscf/pbc/df/fft.py
@@ -355,3 +355,5 @@ def get_naoaux(self):
         return ngrids * 2
 
     range_coulomb = aft.AFTDF.range_coulomb
+
+    to_gpu = lib.to_gpu
diff --git a/pyscf/pbc/dft/gen_grid.py b/pyscf/pbc/dft/gen_grid.py
index 1cc4d9fa91..840a237448 100644
--- a/pyscf/pbc/dft/gen_grid.py
+++ b/pyscf/pbc/dft/gen_grid.py
@@ -134,6 +134,8 @@ def make_mask(self, cell=None, coords=None, relativity=0, shls_slice=None,
         if coords is None: coords = self.coords
         return make_mask(cell, coords, relativity, shls_slice, verbose)
 
+    to_gpu = lib.to_gpu
+
 
 # modified from pyscf.dft.gen_grid.gen_partition
 def get_becke_grids(cell, atom_grid={}, radi_method=dft.radi.gauss_chebyshev,
@@ -257,6 +259,8 @@ def make_mask(self, cell=None, coords=None, relativity=0, shls_slice=None,
         if coords is None: coords = self.coords
         return make_mask(cell, coords, relativity, shls_slice, verbose)
 
+    to_gpu = lib.to_gpu
+
 AtomicGrids = BeckeGrids
 
 
diff --git a/pyscf/pbc/dft/gks.py b/pyscf/pbc/dft/gks.py
index 5536b53daa..a38a57a22b 100644
--- a/pyscf/pbc/dft/gks.py
+++ b/pyscf/pbc/dft/gks.py
@@ -143,3 +143,5 @@ def to_hf(self):
         '''Convert to GHF object.'''
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.GHF(self.cell, self.kpt))
+
+    to_gpu = lib.to_gpu
diff --git a/pyscf/pbc/dft/kgks.py b/pyscf/pbc/dft/kgks.py
index fd97e43cd1..7774f2e36e 100644
--- a/pyscf/pbc/dft/kgks.py
+++ b/pyscf/pbc/dft/kgks.py
@@ -148,3 +148,5 @@ def to_hf(self):
         '''Convert to KGHF object.'''
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.KGHF(self.cell, self.kpts))
+
+    to_gpu = lib.to_gpu
diff --git a/pyscf/pbc/dft/krks.py b/pyscf/pbc/dft/krks.py
index 3cd23636b1..6278605de9 100644
--- a/pyscf/pbc/dft/krks.py
+++ b/pyscf/pbc/dft/krks.py
@@ -184,6 +184,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.KRHF(self.cell, self.kpts))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/dft/kroks.py b/pyscf/pbc/dft/kroks.py
index e83d8e6d3e..1d2a1198f8 100644
--- a/pyscf/pbc/dft/kroks.py
+++ b/pyscf/pbc/dft/kroks.py
@@ -64,6 +64,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.KROHF(self.cell, self.kpts))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/dft/kuks.py b/pyscf/pbc/dft/kuks.py
index 634c99f8ff..ac66f973bc 100644
--- a/pyscf/pbc/dft/kuks.py
+++ b/pyscf/pbc/dft/kuks.py
@@ -160,6 +160,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.KUHF(self.cell, self.kpts))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/dft/numint.py b/pyscf/pbc/dft/numint.py
index 7d36fffb6a..056349f88e 100644
--- a/pyscf/pbc/dft/numint.py
+++ b/pyscf/pbc/dft/numint.py
@@ -1082,6 +1082,8 @@ def eval_rho1(self, cell, ao, dm, screen_index=None, xctype='LDA', hermi=0,
         return self.eval_rho(cell, ao, dm, screen_index, xctype, hermi,
                              with_lapl, verbose)
 
+    to_gpu = lib.to_gpu
+
 _NumInt = NumInt
 
 
@@ -1287,4 +1289,6 @@ def make_rho(idm, ao_kpts, non0tab, xctype):
     cache_xc_kernel1 = cache_xc_kernel1
     get_rho = get_rho
 
+    to_gpu = lib.to_gpu
+
 _KNumInt = KNumInt
diff --git a/pyscf/pbc/dft/rks.py b/pyscf/pbc/dft/rks.py
index d3dc8d1047..bf12735aa6 100644
--- a/pyscf/pbc/dft/rks.py
+++ b/pyscf/pbc/dft/rks.py
@@ -346,6 +346,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.RHF(self.cell, self.kpt))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/dft/roks.py b/pyscf/pbc/dft/roks.py
index f7fe097c10..d422cfc3cb 100644
--- a/pyscf/pbc/dft/roks.py
+++ b/pyscf/pbc/dft/roks.py
@@ -68,6 +68,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.ROHF(self.cell, self.kpt))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/dft/uks.py b/pyscf/pbc/dft/uks.py
index 20d8d14c71..9619ec4d25 100644
--- a/pyscf/pbc/dft/uks.py
+++ b/pyscf/pbc/dft/uks.py
@@ -144,6 +144,8 @@ def to_hf(self):
         from pyscf.pbc import scf
         return self._transfer_attrs_(scf.UHF(self.cell, self.kpt))
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/mp/kmp2.py b/pyscf/pbc/mp/kmp2.py
index 9b7a53b2bb..bcea49ca41 100644
--- a/pyscf/pbc/mp/kmp2.py
+++ b/pyscf/pbc/mp/kmp2.py
@@ -783,6 +783,8 @@ def kernel(self, mo_energy=None, mo_coeff=None, with_t2=WITH_T2):
 
         return self.e_corr, self.t2
 
+    to_gpu = lib.to_gpu
+
 KRMP2 = KMP2
 
 
diff --git a/pyscf/pbc/scf/ghf.py b/pyscf/pbc/scf/ghf.py
index ddff1b850f..e5f12dc901 100644
--- a/pyscf/pbc/scf/ghf.py
+++ b/pyscf/pbc/scf/ghf.py
@@ -165,6 +165,8 @@ def convert_from_(self, mf):
         addons.convert_to_ghf(mf, self)
         return self
 
+    to_gpu = lib.to_gpu
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto
diff --git a/pyscf/pbc/scf/hf.py b/pyscf/pbc/scf/hf.py
index f6c91336ed..3b5de33a5f 100644
--- a/pyscf/pbc/scf/hf.py
+++ b/pyscf/pbc/scf/hf.py
@@ -897,6 +897,7 @@ class RHF(SCF):
     analyze = mol_hf.RHF.analyze
     spin_square = mol_hf.RHF.spin_square
     stability = mol_hf.RHF.stability
+    to_gpu = lib.to_gpu
 
     def nuc_grad_method(self):
         raise NotImplementedError
diff --git a/pyscf/pbc/scf/kghf.py b/pyscf/pbc/scf/kghf.py
index f271bd28da..2ce492067c 100644
--- a/pyscf/pbc/scf/kghf.py
+++ b/pyscf/pbc/scf/kghf.py
@@ -200,6 +200,8 @@ def __init__(self, cell, kpts=np.zeros((1,3)),
     analyze = khf.analyze
     convert_from_ = pbcghf.GHF.convert_from_
 
+    to_gpu = lib.to_gpu
+
     def get_hcore(self, cell=None, kpts=None):
         hcore = khf.KSCF.get_hcore(self, cell, kpts)
         hcore = lib.asarray([scipy.linalg.block_diag(h, h) for h in hcore])
diff --git a/pyscf/pbc/scf/khf.py b/pyscf/pbc/scf/khf.py
index 89124e8af4..49dff73d2b 100644
--- a/pyscf/pbc/scf/khf.py
+++ b/pyscf/pbc/scf/khf.py
@@ -688,6 +688,7 @@ class KRHF(KSCF):
 
     analyze = analyze
     spin_square = mol_hf.RHF.spin_square
+    to_gpu = lib.to_gpu
 
     def check_sanity(self):
         cell = self.cell
diff --git a/pyscf/pbc/scf/krohf.py b/pyscf/pbc/scf/krohf.py
index 6a23588fb7..b6b6da83a3 100644
--- a/pyscf/pbc/scf/krohf.py
+++ b/pyscf/pbc/scf/krohf.py
@@ -274,6 +274,7 @@ class KROHF(khf.KRHF):
     analyze = khf.analyze
     spin_square = pbcrohf.ROHF.spin_square
     canonicalize = canonicalize
+    to_gpu = lib.to_gpu
 
     def __init__(self, cell, kpts=np.zeros((1,3)),
                  exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald')):
diff --git a/pyscf/pbc/scf/kuhf.py b/pyscf/pbc/scf/kuhf.py
index eae04c0713..ff911afac4 100644
--- a/pyscf/pbc/scf/kuhf.py
+++ b/pyscf/pbc/scf/kuhf.py
@@ -384,6 +384,7 @@ class KUHF(khf.KSCF):
     get_rho = get_rho
     analyze = khf.analyze
     canonicalize = canonicalize
+    to_gpu = lib.to_gpu
 
     def __init__(self, cell, kpts=np.zeros((1,3)),
                  exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald')):
diff --git a/pyscf/pbc/scf/rohf.py b/pyscf/pbc/scf/rohf.py
index a34fade115..d45a507982 100644
--- a/pyscf/pbc/scf/rohf.py
+++ b/pyscf/pbc/scf/rohf.py
@@ -62,6 +62,7 @@ class ROHF(pbchf.RHF):
     spin_square = mol_rohf.ROHF.spin_square
     stability = mol_rohf.ROHF.stability
     dip_moment = pbchf.SCF.dip_moment
+    to_gpu = lib.to_gpu
 
     def __init__(self, cell, kpt=np.zeros(3),
                  exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald')):
diff --git a/pyscf/pbc/scf/rsjk.py b/pyscf/pbc/scf/rsjk.py
index 3e15d09162..0efe64e822 100644
--- a/pyscf/pbc/scf/rsjk.py
+++ b/pyscf/pbc/scf/rsjk.py
@@ -1160,6 +1160,8 @@ def merge_dd(Gpq, p0, p1, ki_lst, kj_lst):
         log.timer_debug1('get_lr_k_kpts', *cpu0)
         return vk_kpts
 
+    to_gpu = lib.to_gpu
+
 RangeSeparationJKBuilder = RangeSeparatedJKBuilder
 
 def _purify(mat_kpts, phase):
diff --git a/pyscf/pbc/scf/uhf.py b/pyscf/pbc/scf/uhf.py
index 0d247f745e..3dfd53866b 100644
--- a/pyscf/pbc/scf/uhf.py
+++ b/pyscf/pbc/scf/uhf.py
@@ -128,6 +128,7 @@ class UHF(pbchf.SCF):
     canonicalize = mol_uhf.UHF.canonicalize
     spin_square = mol_uhf.UHF.spin_square
     stability = mol_uhf.UHF.stability
+    to_gpu = lib.to_gpu
 
     def __init__(self, cell, kpt=np.zeros(3),
                  exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald')):
diff --git a/pyscf/qmmm/itrf.py b/pyscf/qmmm/itrf.py
index 3bb79202fb..f787f03309 100644
--- a/pyscf/qmmm/itrf.py
+++ b/pyscf/qmmm/itrf.py
@@ -188,6 +188,11 @@ def energy_nuc(self):
             nuc += q2*(charges/r).sum()
         return nuc
 
+    def to_gpu(self):
+        obj = self.undo_qmmm().to_gpu()
+        obj = qmmm_for_scf(obj, self.mm_mol)
+        return lib.to_gpu(self, obj)
+
     def nuc_grad_method(self):
         scf_grad = super().nuc_grad_method()
         return qmmm_grad_for_scf(scf_grad)
@@ -207,6 +212,8 @@ def undo_qmmm(self):
         obj._scf = self._scf.undo_qmmm()
         return obj
 
+    to_gpu = QMMMSCF.to_gpu
+
 
 def add_mm_charges_grad(scf_grad, atoms_or_coords, charges, radii=None, unit=None):
     '''Apply the MM charges in the QM gradients' method.  It affects both the
@@ -396,6 +403,11 @@ def grad_nuc_mm(self, mol=None):
             g_mm += q1 * numpy.einsum('i,ix,i->ix', charges, r1-coords, 1/r**3)
         return g_mm
 
+    def to_gpu(self):
+        obj = self.undo_qmmm().to_gpu()
+        obj = qmmm_grad_for_scf(obj)
+        return lib.to_gpu(self, obj)
+
 _QMMMGrad = QMMMGrad
 
 # Inject QMMM interface wrapper to other modules
diff --git a/pyscf/scf/dhf.py b/pyscf/scf/dhf.py
index 6e29d5a450..099074c71b 100644
--- a/pyscf/scf/dhf.py
+++ b/pyscf/scf/dhf.py
@@ -454,12 +454,11 @@ class DHF(hf.SCF):
     ssss_approx = getattr(__config__, 'scf_dhf_SCF_ssss_approx', 'Visscher')
 
     _keys = {'conv_tol', 'with_ssss', 'with_gaunt',
-                 'with_breit', 'ssss_approx', 'opt'}
+                 'with_breit', 'ssss_approx'}
 
     def __init__(self, mol):
         hf.SCF.__init__(self, mol)
         self._coulomb_level = 'SSSS' # 'SSSS' ~ LLLL+LLSS+SSSS
-        self.opt = None # (opt_llll, opt_ssll, opt_ssss, opt_gaunt)
 
     def dump_flags(self, verbose=None):
         hf.SCF.dump_flags(self, verbose)
@@ -518,8 +517,6 @@ def init_guess_by_chkfile(self, chkfile=None, project=None):
     def build(self, mol=None):
         if self.verbose >= logger.WARN:
             self.check_sanity()
-        if self.direct_scf:
-            self.opt = self.init_direct_scf(mol)
         return self
 
     def get_occ(self, mo_energy=None, mo_coeff=None):
@@ -686,7 +683,6 @@ def reset(self, mol=None):
             self.mol = mol
         self._coulomb_level = 'SSSS' # 'SSSS' ~ LLLL+LLSS+SSSS
         self._opt = {None: None}
-        self.opt = None # (opt_llll, opt_ssll, opt_ssss, opt_gaunt)
         return self
 
     def stability(self, internal=None, external=None, verbose=None, return_status=False):
@@ -749,6 +745,8 @@ def to_dks(self, xc='HF'):
 
     to_ks = to_dks
 
+    to_gpu = lib.to_gpu
+
 UHF = UDHF = DHF
 
 
diff --git a/pyscf/scf/diis.py b/pyscf/scf/diis.py
index a442f58b9c..0eea4da094 100644
--- a/pyscf/scf/diis.py
+++ b/pyscf/scf/diis.py
@@ -44,10 +44,6 @@ def __init__(self, mf=None, filename=None, Corth=None):
         self.space = 8
         self.Corth = Corth
         self.damp = 0
-        #?self._scf = mf
-        #?if hasattr(self._scf, 'get_orbsym'): # Symmetry adapted SCF objects
-        #?    self.orbsym = mf.get_orbsym(Corth)
-        #?    sym_forbid = self.orbsym[:,None] != self.orbsym
 
     def update(self, s, d, f, *args, **kwargs):
         errvec = get_err_vec(s, d, f, self.Corth)
diff --git a/pyscf/scf/ghf.py b/pyscf/scf/ghf.py
index 08ac67603d..ec35bed657 100644
--- a/pyscf/scf/ghf.py
+++ b/pyscf/scf/ghf.py
@@ -382,16 +382,14 @@ class GHF(hf.SCF):
         mo_coeff[nao:nao*2] are the coefficients of AO with beta spin.
     '''
 
+    with_soc = None
+
     _keys = {'with_soc'}
 
     get_init_guess = hf.RHF.get_init_guess
     get_occ = get_occ
     _finalize = uhf.UHF._finalize
 
-    def __init__(self, mol):
-        hf.SCF.__init__(self, mol)
-        self.with_soc = None
-
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
         hcore = hf.get_hcore(mol)
@@ -541,9 +539,7 @@ def to_ks(self, xc='HF'):
         from pyscf import dft
         return self._transfer_attrs_(dft.GKS(self.mol, xc=xc))
 
-    def to_gpu(self):
-        from gpu4pyscf.scf import GHF
-        return lib.to_gpu(hf.SCF.reset(self.view(GHF)))
+    to_gpu = lib.to_gpu
 
 def _from_rhf_init_dm(dm, breaksym=True):
     dma = dm * .5
diff --git a/pyscf/scf/ghf_symm.py b/pyscf/scf/ghf_symm.py
index 8dc9a29520..3b419bff62 100644
--- a/pyscf/scf/ghf_symm.py
+++ b/pyscf/scf/ghf_symm.py
@@ -281,8 +281,7 @@ def get_orbsym(self, mo_coeff=None, s=None):
         return numpy.asarray(get_orbsym(self.mol, mo_coeff, s))
     orbsym = property(get_orbsym)
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 GHF = SymAdaptedGHF
 
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index 7a8c0e8f22..7119745708 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -1504,6 +1504,7 @@ class SCF(lib.StreamObject):
     conv_tol_grad = getattr(__config__, 'scf_hf_SCF_conv_tol_grad', None)
     max_cycle = getattr(__config__, 'scf_hf_SCF_max_cycle', 50)
     init_guess = getattr(__config__, 'scf_hf_SCF_init_guess', 'minao')
+    disp = None  # for DFT-D3 and DFT-D4
 
     # To avoid diis pollution from previous run, self.diis should not be
     # initialized as DIIS instance here
@@ -1542,7 +1543,6 @@ def __init__(self, mol):
         self.verbose = mol.verbose
         self.max_memory = mol.max_memory
         self.stdout = mol.stdout
-        self.disp = None
 
         # If chkfile is muted, SCF intermediates will not be dumped anywhere.
         if MUTE_CHKFILE:
@@ -2062,8 +2062,12 @@ def _transfer_attrs_(self, dst):
         '''This helper function transfers attributes from one SCF object to
         another SCF object. It is invoked by to_ks and to_hf methods.
         '''
+        # Search for all tracked attributes, including those in base classes
+        cls_keys = [getattr(cls, '_keys', ()) for cls in dst.__class__.__mro__[:-1]]
+        dst_keys = set(dst.__dict__).union(*cls_keys)
+
         loc_dic = self.__dict__
-        keys = dst.__dict__.keys() & loc_dic.keys()
+        keys = set(loc_dic).intersection(dst_keys)
         dst.__dict__.update({k: loc_dic[k] for k in keys})
         dst.converged = False
         return dst
@@ -2197,11 +2201,8 @@ def to_ks(self, xc='HF'):
         from pyscf import dft
         return self._transfer_attrs_(dft.RKS(self.mol, xc=xc))
 
-    def to_gpu(self):
-        # FIXME: consider the density_fit, x2c and soscf decoration
-        from gpu4pyscf.scf import RHF
-        obj = SCF.reset(self.view(RHF))
-        return lib.to_gpu(obj)
+    # FIXME: consider the density_fit, x2c and soscf decoration
+    to_gpu = lib.to_gpu
 
 def _hf1e_scf(mf, *args):
     logger.info(mf, '\n')
diff --git a/pyscf/scf/hf_symm.py b/pyscf/scf/hf_symm.py
index 5da8d7cd2a..966b1bcbaa 100644
--- a/pyscf/scf/hf_symm.py
+++ b/pyscf/scf/hf_symm.py
@@ -573,8 +573,7 @@ def get_orbsym(self, mo_coeff=None, s=None):
 
     canonicalize = canonicalize
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 RHF = SymAdaptedRHF
 
@@ -934,8 +933,7 @@ def canonicalize(self, mo_coeff, mo_occ, fock=None):
     get_wfnsym = get_wfnsym
     wfnsym = property(get_wfnsym)
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 ROHF = SymAdaptedROHF
 
diff --git a/pyscf/scf/rohf.py b/pyscf/scf/rohf.py
index 951e08a526..5ea9a6dc28 100644
--- a/pyscf/scf/rohf.py
+++ b/pyscf/scf/rohf.py
@@ -520,9 +520,7 @@ def to_ks(self, xc='HF'):
         from pyscf import dft
         return self._transfer_attrs_(dft.ROKS(self.mol, xc=xc))
 
-    def to_gpu(self):
-        from gpu4pyscf.scf import ROHF
-        return lib.to_gpu(hf.SCF.reset(self.view(ROHF)))
+    to_gpu = lib.to_gpu
 
 
 class HF1e(ROHF):
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index 0afc66d0ba..0ae13f1eb6 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -763,6 +763,8 @@ class UHF(hf.SCF):
     S^2 = 0.7570150, 2S+1 = 2.0070027
     '''
 
+    init_guess_breaksym = None
+
     _keys = {"init_guess_breaksym"}
 
     def __init__(self, mol):
@@ -771,7 +773,6 @@ def __init__(self, mol):
         # self.mo_occ => [mo_occ_a, mo_occ_b]
         # self.mo_energy => [mo_energy_a, mo_energy_b]
         self.nelec = None
-        self.init_guess_breaksym = None
 
     @property
     def nelec(self):
@@ -1066,9 +1067,7 @@ def to_ks(self, xc='HF'):
         from pyscf import dft
         return self._transfer_attrs_(dft.UKS(self.mol, xc=xc))
 
-    def to_gpu(self):
-        from gpu4pyscf.scf import UHF
-        return lib.to_gpu(hf.SCF.reset(self.view(UHF)))
+    to_gpu = lib.to_gpu
 
 def _hf1e_scf(mf, *args):
     logger.info(mf, '\n')
diff --git a/pyscf/scf/uhf_symm.py b/pyscf/scf/uhf_symm.py
index 1ea38b0956..7253fd96ea 100644
--- a/pyscf/scf/uhf_symm.py
+++ b/pyscf/scf/uhf_symm.py
@@ -565,8 +565,7 @@ def get_orbsym(self, mo_coeff=None, s=None):
 
     canonicalize = canonicalize
 
-    def to_gpu(self):
-        raise NotImplementedError
+    to_gpu = lib.to_gpu
 
 UHF = SymAdaptedUHF
 
diff --git a/pyscf/sgx/sgx.py b/pyscf/sgx/sgx.py
index 25f05e2e9e..c004745117 100644
--- a/pyscf/sgx/sgx.py
+++ b/pyscf/sgx/sgx.py
@@ -214,6 +214,9 @@ def post_kernel(self, envs):
         self._last_vj = 0
         self._last_vk = 0
 
+    def to_gpu(self):
+        raise NotImplementedError
+
     def method_not_implemented(self, *args, **kwargs):
         raise NotImplementedError
     nuc_grad_method = Gradients = method_not_implemented
@@ -374,3 +377,5 @@ def get_jk(self, dm, hermi=1, vhfopt=None, with_j=True, with_k=True,
         else:
             vj, vk = sgx_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol)
         return vj, vk
+
+    to_gpu = lib.to_gpu
diff --git a/pyscf/solvent/_attach_solvent.py b/pyscf/solvent/_attach_solvent.py
index 1f98b9677d..8e0b6b177a 100644
--- a/pyscf/solvent/_attach_solvent.py
+++ b/pyscf/solvent/_attach_solvent.py
@@ -147,6 +147,11 @@ def stability(self, *args, **kwargs):
                                equilibrium_solvation=not self.with_solvent.frozen):
             return super().stability(*args, **kwargs)
 
+    def to_gpu(self):
+        obj = self.undo_solvent().to_gpu()
+        obj = _for_scf(obj, self.with_solvent)
+        return lib.to_gpu(self, obj)
+
 def _for_casscf(mc, solvent_obj, dm=None):
     '''Add solvent model to CASSCF method.
 
@@ -284,6 +289,11 @@ def nuc_grad_method(self):
 
     Gradients = nuc_grad_method
 
+    def to_gpu(self):
+        obj = self.undo_solvent().to_gpu()
+        obj = _for_casscf(obj, self.with_solvent)
+        return lib.to_gpu(self, obj)
+
 
 def _for_casci(mc, solvent_obj, dm=None):
     '''Add solvent model to CASCI method.
@@ -421,6 +431,11 @@ def nuc_grad_method(self):
 
     Gradients = nuc_grad_method
 
+    def to_gpu(self):
+        obj = self.undo_solvent().to_gpu()
+        obj = _for_casci(obj, self.with_solvent)
+        return lib.to_gpu(self, obj)
+
 
 def _for_post_scf(method, solvent_obj, dm=None):
     '''A wrapper of solvent model for post-SCF methods (CC, CI, MP etc.)
@@ -552,6 +567,11 @@ def nuc_grad_method(self):
 
     Gradients = nuc_grad_method
 
+    def to_gpu(self):
+        obj = self.undo_solvent().to_gpu()
+        obj = _for_post_scf(obj, self.with_solvent)
+        return lib.to_gpu(self, obj)
+
 
 def _for_tdscf(method, solvent_obj, dm=None):
     '''Add solvent model in TDDFT calculations.
@@ -583,7 +603,7 @@ def _for_tdscf(method, solvent_obj, dm=None):
 class TDSCFWithSolvent(_Solvation):
     _keys = {'with_solvent'}
 
-    def __init__(self, method, scf_with_solvent):
+    def __init__(self, method, scf_with_solvent=None):
         self.__dict__.update(method.__dict__)
         self._scf = scf_with_solvent
         self.with_solvent = self._scf.with_solvent
@@ -630,3 +650,8 @@ def get_ab(self, mf=None):
     def nuc_grad_method(self):
         grad_method = super().nuc_grad_method()
         return self.with_solvent.nuc_grad_method(grad_method)
+
+    def to_gpu(self):
+        obj = self.undo_solvent().to_gpu()
+        obj = _for_tdscf(obj, self.with_solvent)
+        return lib.to_gpu(self, obj)
diff --git a/pyscf/solvent/ddcosmo.py b/pyscf/solvent/ddcosmo.py
index e0293aae8f..88062c2b48 100644
--- a/pyscf/solvent/ddcosmo.py
+++ b/pyscf/solvent/ddcosmo.py
@@ -869,6 +869,8 @@ def nuc_grad_method(self, grad_method):
         else:
             return ddcosmo_grad.make_grad_object(grad_method)
 
+    to_gpu = lib.to_gpu
+
 DDCOSMO = ddCOSMO
 
 class Grids(gen_grid.Grids):
diff --git a/pyscf/soscf/newton_ah.py b/pyscf/soscf/newton_ah.py
index dca21e6f57..8a2a2dd0d7 100644
--- a/pyscf/soscf/newton_ah.py
+++ b/pyscf/soscf/newton_ah.py
@@ -800,6 +800,9 @@ def rotate_mo(self, mo_coeff, u, log=None):
                       _effective_svd(u[idx][:,idx], 1e-5))
         return mo
 
+    def to_gpu(self):
+        return self.undo_soscf().to_gpu()
+
 class _SecondOrderROHF(_CIAH_SOSCF):
     gen_g_hop = gen_g_hop_rohf
 
diff --git a/pyscf/tdscf/rhf.py b/pyscf/tdscf/rhf.py
index 265d37d110..c90fac184b 100644
--- a/pyscf/tdscf/rhf.py
+++ b/pyscf/tdscf/rhf.py
@@ -764,6 +764,9 @@ def _finalize(self):
         logger.note(self, 'Excited State energies (eV)\n%s', self.e * nist.HARTREE2EV)
         return self
 
+    def to_gpu(self):
+        raise NotImplementedError
+
 class TDA(TDBase):
     '''Tamm-Dancoff approximation
 
@@ -866,6 +869,8 @@ def pickeig(w, v, nroots, envs):
         self._finalize()
         return self.e, self.xy
 
+    to_gpu = lib.to_gpu
+
 CIS = TDA
 
 
@@ -1044,6 +1049,8 @@ def nuc_grad_method(self):
         from pyscf.grad import tdrhf
         return tdrhf.Gradients(self)
 
+    to_gpu = lib.to_gpu
+
 RPA = TDRHF = TDHF
 
 scf.hf.RHF.TDA = lib.class_as_method(TDA)
diff --git a/pyscf/tdscf/uhf.py b/pyscf/tdscf/uhf.py
index 398b777753..17554d1044 100644
--- a/pyscf/tdscf/uhf.py
+++ b/pyscf/tdscf/uhf.py
@@ -690,6 +690,8 @@ def pickeig(w, v, nroots, envs):
         self._finalize()
         return self.e, self.xy
 
+    to_gpu = lib.to_gpu
+
 CIS = TDA
 
 
@@ -858,6 +860,8 @@ def pickeig(w, v, nroots, envs):
         self._finalize()
         return self.e, self.xy
 
+    to_gpu = lib.to_gpu
+
 RPA = TDUHF = TDHF
 
 scf.uhf.UHF.TDA = lib.class_as_method(TDA)
diff --git a/pyscf/x2c/sfx2c1e.py b/pyscf/x2c/sfx2c1e.py
index 6b6d8f8334..c3fde295c1 100644
--- a/pyscf/x2c/sfx2c1e.py
+++ b/pyscf/x2c/sfx2c1e.py
@@ -154,6 +154,10 @@ def _transfer_attrs_(self, dst):
             dst = dst.sfx2c()
         return hf.SCF._transfer_attrs_(self, dst)
 
+    def to_gpu(self):
+        obj = self.undo_x2c().to_gpu().sfx2c1e()
+        return lib.to_gpu(self, obj)
+
 
 class SpinFreeX2CHelper(x2c.X2CHelperBase):
     '''1-component X2c (spin-free part only)
diff --git a/pyscf/x2c/x2c.py b/pyscf/x2c/x2c.py
index bd2eee5fed..29a6517b36 100644
--- a/pyscf/x2c/x2c.py
+++ b/pyscf/x2c/x2c.py
@@ -659,6 +659,8 @@ def to_ks(self, xc='HF'):
         from pyscf.x2c import dft
         return self._transfer_attrs_(dft.UKS(self.mol, xc=xc))
 
+    to_gpu = lib.to_gpu
+
 X2C_UHF = UHF
 
 class RHF(SCF):
@@ -680,6 +682,8 @@ def to_ks(self, xc='HF'):
         from pyscf.x2c import dft
         return self._transfer_attrs_(dft.RKS(self.mol, xc=xc))
 
+    to_gpu = lib.to_gpu
+
 X2C_RHF = RHF
 
 def x2c1e_ghf(mf):
@@ -799,6 +803,10 @@ def _transfer_attrs_(self, dst):
     def to_ks(self, xc='HF'):
         raise NotImplementedError
 
+    def to_gpu(self):
+        obj = self.undo_x2c().to_gpu().x2c1e()
+        return lib.to_gpu(self, obj)
+
 
 def _uncontract_mol(mol, xuncontract=None, exp_drop=0.2):
     '''mol._basis + uncontracted steep functions'''

From d3f622d46eef5d9b8702fa6f4577babfb6c2ccfe Mon Sep 17 00:00:00 2001
From: "Junjie, Yang" <yangjunjie0320@gmail.com>
Date: Fri, 15 Mar 2024 04:26:42 +0800
Subject: [PATCH 23/44] Improve memory usage for RPA (#2115)

* improve memory usage for RPA with outcore ao2mo; fix small bug

* raise something when loading libpbc.get_Gv

* fix flake8 issue

* fix flake8 issue

* used DF-RHF as EXX, changed ref values

* unrestricted case

* unrestricted case

* Restore pyscf/pbc/gto/cell.py
---
 examples/gw/03-drpa.py    |  32 ++++
 pyscf/gto/mole.py         |   5 +-
 pyscf/gw/rpa.py           | 329 +++++++++++++++++++++++++++-----------
 pyscf/gw/test/test_gw.py  |   5 +-
 pyscf/gw/test/test_ugw.py |   5 +-
 pyscf/gw/urpa.py          | 262 +++++++++++++++---------------
 6 files changed, 410 insertions(+), 228 deletions(-)
 create mode 100644 examples/gw/03-drpa.py

diff --git a/examples/gw/03-drpa.py b/examples/gw/03-drpa.py
new file mode 100644
index 0000000000..1c2596684a
--- /dev/null
+++ b/examples/gw/03-drpa.py
@@ -0,0 +1,32 @@
+'''
+Direct RPA correlation energy
+'''
+
+from pyscf import gto, dft, gw
+
+mol = gto.M(
+    atom = """
+O          0.48387       -0.41799       -0.63869
+H          0.58103        0.36034       -0.05009
+H          1.01598       -1.09574       -0.18434
+H          0.68517       -2.88004        0.87771
+O          1.59649       -2.63873        0.61189
+H          1.72242       -3.22647       -0.15071
+H         -2.47665        1.59686       -0.33246
+O         -1.55912        1.35297       -0.13891
+H         -1.25777        0.82058       -0.89427
+H         -1.87830       -2.91357       -0.21825
+O         -1.14269       -2.57648        0.31845
+H         -0.81003       -1.77219       -0.15155
+""",
+    basis = 'ccpvqz', verbose = 5,
+    )
+
+mf = dft.RKS(mol).density_fit()
+mf.xc = 'pbe'
+mf.kernel()
+
+import pyscf.gw.rpa
+rpa = gw.rpa.dRPA(mf)
+rpa.max_memory = 50
+rpa.kernel()
diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index 28d8fd444d..9c0a2d2125 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -2420,8 +2420,8 @@ def enuc(self):
             self._enuc = self.energy_nuc()
         return self._enuc
     @enuc.setter
-    def enuc(self, enuc):
-        self._enuc = enuc
+    def enuc(self, x):
+        self._enuc = x
 
     copy = copy
 
@@ -2798,6 +2798,7 @@ def dump_input(self):
         if self.verbose >= logger.INFO:
             self.stdout.write('\n')
             logger.info(self, 'nuclear repulsion = %.15g', self.enuc)
+
             if self.symmetry:
                 if self.topgroup == self.groupname:
                     logger.info(self, 'point group symmetry = %s', self.topgroup)
diff --git a/pyscf/gw/rpa.py b/pyscf/gw/rpa.py
index 8304432298..c0d0bc306a 100755
--- a/pyscf/gw/rpa.py
+++ b/pyscf/gw/rpa.py
@@ -26,7 +26,8 @@
     X. Ren et al., New J. Phys. 14, 053020 (2012)
 """
 
-import numpy as np
+import numpy as np, scipy
+
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf.ao2mo import _ao2mo
@@ -36,88 +37,123 @@
 einsum = lib.einsum
 
 # ****************************************************************************
-# core routines, kernel, rpa_ecorr, rho_response
+# core routines kernel
 # ****************************************************************************
 
-def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=40, x0=0.5, verbose=logger.NOTE):
+def kernel(rpa, mo_energy, mo_coeff, cderi_ov=None, nw=40, x0=0.5, verbose=logger.NOTE):
     """
     RPA correlation and total energy
 
     Args:
-        Lpq : density fitting 3-center integral in MO basis.
-        nw : number of frequency point on imaginary axis.
-        x0: scaling factor for frequency grid.
+        cderi_ov:
+            Array-like object, Cholesky decomposed ERI in OV subspace.
+        nw:
+            number of frequency point on imaginary axis.
+        x0:
+            scaling factor for frequency grid.
 
     Returns:
-        e_tot : RPA total energy
-        e_hf : EXX energy
-        e_corr : RPA correlation energy
+        e_tot:
+            RPA total energy
+        e_hf:
+            EXX energy
+        e_corr:
+            RPA correlation energy
     """
     mf = rpa._scf
+
     # only support frozen core
     if rpa.frozen is not None:
         assert isinstance(rpa.frozen, int)
-        assert rpa.frozen < rpa.nocc
+        assert rpa.frozen < np.min(rpa.nocc)
+
+    # Get orbital number
+    with_df = rpa.with_df
+    naux = with_df.get_naoaux()
+    norb = rpa._scf.mol.nao_nr()
+
+    # Get memory information
+    max_memory = max(0, rpa.max_memory * 0.9 - lib.current_memory()[0])
+    if max_memory < naux ** 2 / 1e6:
+        logger.warn(
+            rpa, 'Memory may not be enough! Available memory %d MB < %d MB',
+            max_memory, naux ** 2 / 1e6
+                   )
+
+    # AO -> MO transformation
+    if cderi_ov is None:
+        blksize = int(max_memory * 1e6 / (8 * norb ** 2))
+        blksize = min(naux, blksize)
+        blksize = max(1, blksize)
+
+        # logger.debug(rpa, 'cderi    memory: %6d MB', naux * norb ** 2 * 8 / 1e6)
+        # logger.debug(rpa, 'cderi_ov memory: %6d MB', naux * nocc * nvir * 8 / 1e6)
+        logger.debug(rpa, 'ao2mo blksize = %d', blksize)
+        if blksize == 1:
+            logger.warn(rpa, 'Memory too small for ao2mo! blksize = 1')
+
+        cderi_ov = rpa.ao2mo(mo_coeff, blksize=blksize)
+
+    # Compute exact exchange energy (EXX)
+    e_hf = _ene_hf(mf, with_df)
+    e_ov = rpa.make_e_ov(mo_energy)
 
-    if Lpq is None:
-        Lpq = rpa.ao2mo(mo_coeff)
+    # Compute RPA correlation energy
+    e_corr = 0.0
 
-    # Grids for integration on imaginary axis
-    freqs, wts = _get_scaled_legendre_roots(nw, x0)
+    # Determine block size for dielectric matrix
+    blksize = int(max_memory * 1e6 / 8 / naux)
+    blksize = max(blksize, 1)
 
-    # Compute HF exchange energy (EXX)
-    dm = mf.make_rdm1()
-    rhf = scf.RHF(rpa.mol)
-    e_hf = rhf.energy_elec(dm=dm)[0]
-    e_hf += mf.energy_nuc()
+    if blksize == 1:
+        logger.warn(rpa, 'Memory too small for dielectric matrix! blksize = 1')
 
-    # Compute RPA correlation energy
-    e_corr = get_rpa_ecorr(rpa, Lpq, freqs, wts)
+    logger.debug(rpa, 'diel blksize = %d', blksize)
+
+    # Grids for numerical integration on imaginary axis
+    for omega, weigh in zip(*_get_scaled_legendre_roots(nw, x0)):
+        diel = rpa.make_dielectric_matrix(omega, e_ov, cderi_ov, blksize=blksize)
+        factor = weigh / (2.0 * np.pi)
+        e_corr += factor * np.log(np.linalg.det(np.eye(naux) - diel))
+        e_corr += factor * np.trace(diel)
 
     # Compute total energy
     e_tot = e_hf + e_corr
-
     logger.debug(rpa, '  RPA total energy = %s', e_tot)
     logger.debug(rpa, '  EXX energy = %s, RPA corr energy = %s', e_hf, e_corr)
 
     return e_tot, e_hf, e_corr
 
-def get_rpa_ecorr(rpa, Lpq, freqs, wts):
-    """
-    Compute RPA correlation energy
+# ****************************************************************************
+# frequency integral quadrature, legendre, clenshaw_curtis
+# ****************************************************************************
+
+def make_dielectric_matrix(omega, e_ov, cderi_ov, blksize=None):
     """
-    mo_energy = _mo_energy_without_core(rpa, rpa._scf.mo_energy)
-    nocc = rpa.nocc
-    nw = len(freqs)
-    naux = Lpq.shape[0]
+    Compute dielectric matrix at a given frequency omega
 
-    if (mo_energy[nocc] - mo_energy[nocc-1]) < 1e-3:
-        logger.warn(rpa, 'Current RPA code not well-defined for degeneracy!')
+    Args:
+        omega : float, frequency
+        e_ov : 1D array (nocc * nvir), orbital energy differences
+        cderi_ov : 2D array (naux, nocc * nvir), Cholesky decomposed ERI
+                   in OV subspace.
 
-    e_corr = 0.
-    for w in range(nw):
-        Pi = get_rho_response(freqs[w], mo_energy, Lpq[:, :nocc, nocc:])
-        ec_w = np.log(np.linalg.det(np.eye(naux) - Pi))
-        ec_w += np.trace(Pi)
-        e_corr += 1./(2.*np.pi) * ec_w * wts[w]
+    Returns:
+        diel : 2D array (naux, naux), dielectric matrix
+    """
+    assert blksize is not None
 
-    return e_corr
+    naux, nov = cderi_ov.shape
 
-def get_rho_response(omega, mo_energy, Lpq):
-    """
-    Compute density response function in auxiliary basis at freq iw.
-    """
-    naux, nocc, nvir = Lpq.shape
-    eia = mo_energy[:nocc, None] - mo_energy[None, nocc:]
-    eia = eia / (omega**2 + eia * eia)
-    # Response from both spin-up and spin-down density
-    Pia = Lpq * (eia * 4.0)
-    Pi = einsum('Pia, Qia -> PQ', Pia, Lpq)
-    return Pi
+    chi0 = (2.0 * e_ov / (omega ** 2 + e_ov ** 2)).ravel()
+    diel = np.zeros((naux, naux))
 
-# ****************************************************************************
-# frequency integral quadrature, legendre, clenshaw_curtis
-# ****************************************************************************
+    for s in [slice(*x) for x in lib.prange(0, nov, blksize)]:
+        v_ov = cderi_ov[:, s]
+        diel += np.dot(v_ov * chi0[s], v_ov.T)
+        v_ov = None
+
+    return diel
 
 def _get_scaled_legendre_roots(nw, x0=0.5):
     """
@@ -138,6 +174,7 @@ def _get_clenshaw_curtis_roots(nw):
     """
     Clenshaw-Curtis qaudrature on [0,inf)
     Ref: J. Chem. Phys. 132, 234114 (2010)
+
     Returns:
         freqs : 1D array
         wts : 1D array
@@ -149,22 +186,44 @@ def _get_clenshaw_curtis_roots(nw):
         t = (w + 1.0) / nw * np.pi * 0.5
         freqs[w] = a / np.tan(t)
         if w != nw - 1:
-            wts[w] = a*np.pi * 0.5 / nw / (np.sin(t)**2)
+            wts[w] = a * np.pi * 0.50 / nw / (np.sin(t)**2)
         else:
-            wts[w] = a*np.pi * 0.25 / nw / (np.sin(t)**2)
+            wts[w] = a * np.pi * 0.25 / nw / (np.sin(t)**2)
     return freqs[::-1], wts[::-1]
 
+def _ene_hf(mf=None, with_df=None):
+    """
+    Args:
+        mf: converged mean-field object, can be either HF or KS
+        with_df: density fitting object
+
+    Returns:
+        e_hf: float, total Hartree-Fock energy
+    """
+    assert mf.converged
+    hf_obj = mf if not isinstance(mf, scf.hf.KohnShamDFT) else mf.to_hf()
+
+    if not getattr(hf_obj, 'with_df', None):
+        hf_obj = hf_obj.density_fit(with_df=with_df)
+    dm = hf_obj.make_rdm1()
+
+    e_hf  = hf_obj.energy_elec(dm=dm)[0]
+    e_hf += hf_obj.energy_nuc()
+    return e_hf
+
 def _mo_energy_without_core(rpa, mo_energy):
     return mo_energy[get_frozen_mask(rpa)]
 
 def _mo_without_core(rpa, mo):
     return mo[:,get_frozen_mask(rpa)]
 
-class RPA(lib.StreamObject):
+class DirectRPA(lib.StreamObject):
 
     _keys = {
         'mol', 'frozen',
-        'with_df', 'mo_energy', 'mo_coeff', 'mo_occ', 'e_corr', 'e_hf', 'e_tot',
+        'with_df', 'mo_energy',
+        'mo_coeff', 'mo_occ',
+        'e_corr', 'e_hf', 'e_tot',
     }
 
     def __init__(self, mf, frozen=None, auxbasis=None):
@@ -185,8 +244,8 @@ def __init__(self, mf, frozen=None, auxbasis=None):
             else:
                 self.with_df.auxbasis = df.make_auxbasis(mf.mol, mp2fit=True)
 
-##################################################
-# don't modify the following attributes, they are not input options
+        ##################################################
+        # don't modify the following attributes, they are not input options
         self._nocc = None
         self._nmo = None
         self.mo_energy = mf.mo_energy
@@ -226,50 +285,111 @@ def nmo(self, n):
     get_nmo = get_nmo
     get_frozen_mask = get_frozen_mask
 
-    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40, x0=0.5):
+    def kernel(self, mo_energy=None, mo_coeff=None, cderi_ov=None, nw=40, x0=0.5):
         """
-        Args:
-            mo_energy : 1D array (nmo), mean-field mo energy
-            mo_coeff : 2D array (nmo, nmo), mean-field mo coefficient
-            Lpq : 3D array (naux, nmo, nmo), 3-index ERI
-            nw: integer, grid number
-            x0: real, scaling factor for frequency grid
-
-        Returns:
-            self.e_tot : RPA total eenrgy
-            self.e_hf : EXX energy
-            self.e_corr : RPA correlation energy
+        The kernel function for direct RPA
         """
-        if mo_coeff is None:
-            mo_coeff = _mo_without_core(self, self._scf.mo_coeff)
-        if mo_energy is None:
-            mo_energy = _mo_energy_without_core(self, self._scf.mo_energy)
 
         cput0 = (logger.process_clock(), logger.perf_counter())
+
         self.dump_flags()
-        self.e_tot, self.e_hf, self.e_corr = \
-                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, x0=x0, verbose=self.verbose)
+        res = kernel(
+            self, mo_energy, mo_coeff,
+            cderi_ov=cderi_ov, nw=nw, x0=x0,
+            verbose=self.verbose
+                    )
+        self.e_tot, self.e_hf, self.e_corr = res
 
         logger.timer(self, 'RPA', *cput0)
         return self.e_corr
 
-    def ao2mo(self, mo_coeff=None):
+    def make_e_ov(self, mo_energy=None):
+        """
+        Compute orbital energy differences
+        """
+        if mo_energy is None:
+            mo_energy = _mo_energy_without_core(self, self.mo_energy)
+
+        nocc = self.nocc
+        e_ov = (mo_energy[:nocc, None] - mo_energy[None, nocc:]).ravel()
+
+        gap = (-e_ov.max(), )
+        logger.info(self, 'Lowest orbital energy difference: % 6.4e', np.min(gap))
+
+        if (np.min(gap) < 1e-3):
+            logger.warn(rpa, 'RPA code not well-defined for degenerate systems!')
+            logger.warn(rpa, 'Lowest orbital energy difference: % 6.4e', np.min(gap))
+
+        return e_ov
+
+    def make_dielectric_matrix(self, omega, e_ov=None, cderi_ov=None, blksize=None):
+        """
+        Args:
+            omega : float, frequency
+            e_ov : 1D array (nocc * nvir), orbital energy differences
+            mo_coeff :  (nao, nmo), mean-field mo coefficient
+            cderi_ov :  (naux, nocc, nvir), Cholesky decomposed ERI in OV subspace.
+
+        Returns:
+            diel : 2D array (naux, naux), dielectric matrix
+        """
+
+        assert e_ov is not None
+        assert cderi_ov is not None
+
+        blksize = blksize or max(e_ov.size)
+
+        diel = 2.0 * make_dielectric_matrix(
+            omega, e_ov,
+            cderi_ov if isinstance(cderi_ov, np.ndarray) else cderi_ov["cderi_ov"],
+            blksize=blksize
+                                     )
+
+        return diel
+
+    def ao2mo(self, mo_coeff=None, blksize=None):
         if mo_coeff is None:
-            mo_coeff = self.mo_coeff
-        nmo = self.nmo
+            mo_coeff = _mo_without_core(self, self.mo_coeff)
+
+        nocc = self.nocc
+        norb = self.nmo
+        nvir = norb - nocc
         naux = self.with_df.get_naoaux()
-        mem_incore = (2 * nmo**2*naux) * 8 / 1e6
-        mem_now = lib.current_memory()[0]
-
-        mo = np.asarray(mo_coeff, order='F')
-        ijslice = (0, nmo, 0, nmo)
-        Lpq = None
-        if (mem_incore + mem_now < 0.99 * self.max_memory) or self.mol.incore_anyway:
-            Lpq = _ao2mo.nr_e2(self.with_df._cderi, mo, ijslice, aosym='s2', out=Lpq)
-            return Lpq.reshape(naux, nmo, nmo)
+        sov = (0, nocc, nocc, norb) # slice for OV block
+
+        blksize  = naux if blksize is None else blksize
+        cderi_ov = None
+
+        cput0 = (logger.process_clock(), logger.perf_counter())
+        if blksize >= naux or self.mol.incore_anyway:
+            assert isinstance(self.with_df._cderi, np.ndarray)
+            cderi_ov = _ao2mo.nr_e2(
+                self.with_df._cderi, mo_coeff,
+                sov, aosym='s2', out=cderi_ov
+                                    )
+            logger.timer(self, 'incore ao2mo', *cput0)
+
         else:
-            logger.warn(self, 'Memory may not be enough!')
-            raise NotImplementedError
+            fswap = lib.H5TmpFile()
+            fswap.create_dataset('cderi_ov', (naux, nocc * nvir))
+
+            q0 = 0
+            for cderi in self.with_df.loop(blksize=blksize):
+                q1 = q0 + cderi.shape[0]
+                v_ov = _ao2mo.nr_e2(
+                    cderi, mo_coeff,
+                    sov, aosym='s2'
+                                    )
+                fswap['cderi_ov'][q0:q1] = v_ov
+                v_ov = None
+                q0 = q1
+
+            logger.timer(self, 'outcore ao2mo', *cput0)
+            cderi_ov = fswap
+
+        return cderi_ov
+
+RPA = dRPA = DirectRPA
 
 if __name__ == '__main__':
     from pyscf import gto, dft
@@ -279,7 +399,7 @@ def ao2mo(self, mo_coeff=None):
         [8 , (0. , 0.     , 0.)],
         [1 , (0. , -0.7571 , 0.5861)],
         [1 , (0. , 0.7571 , 0.5861)]]
-    mol.basis = 'def2-svp'
+    mol.basis = 'def2svp'
     mol.build()
 
     mf = dft.RKS(mol)
@@ -287,7 +407,28 @@ def ao2mo(self, mo_coeff=None):
     mf.kernel()
 
     rpa = RPA(mf)
-    rpa.kernel()
+    rpa.verbose = 6
+
+    nocc = rpa.nocc
+    nvir = rpa.nmo - nocc
+    norb = rpa.nmo
+    e_ov = - (rpa.mo_energy[:nocc, None] - rpa.mo_energy[None, nocc:]).ravel()
+    v_ov = rpa.ao2mo(rpa.mo_coeff, blksize=1)
+    e_corr_0 = rpa.kernel(cderi_ov=v_ov)
+
     print ('RPA e_tot, e_hf, e_corr = ', rpa.e_tot, rpa.e_hf, rpa.e_corr)
-    assert (abs(rpa.e_corr- -0.30783004035780076) < 1e-6)
-    assert (abs(rpa.e_tot- -76.26428191794182) < 1e-6)
+    assert (abs(rpa.e_corr - -0.307830040357800) < 1e-6)
+    assert (abs(rpa.e_tot  - -76.26651423730257) < 1e-6)
+
+    # Another implementation of direct RPA N^6
+    v_ov = np.array(v_ov["cderi_ov"])
+    a = e_ov * np.eye(nocc * nvir) + 2 * np.dot(v_ov.T, v_ov)
+    b = 2 * np.dot(v_ov.T, v_ov)
+    apb = a + b
+    amb = a - b
+    c = np.dot(amb, apb)
+    e_corr_1 = 0.5 * np.trace(
+        scipy.linalg.sqrtm(c) - a
+    )
+
+    assert abs(e_corr_0 - e_corr_1) < 1e-8
diff --git a/pyscf/gw/test/test_gw.py b/pyscf/gw/test/test_gw.py
index 0f405f9ac2..b5c366061c 100644
--- a/pyscf/gw/test/test_gw.py
+++ b/pyscf/gw/test/test_gw.py
@@ -78,9 +78,8 @@ def test_gw_exact(self):
     def test_rpa(self):
         rpa_obj = rpa.RPA(mf, frozen=0)
         rpa_obj.kernel()
-        self.assertAlmostEqual(rpa_obj.e_tot, -76.26428191794182, 6)
-        self.assertAlmostEqual(rpa_obj.e_corr, -0.30783004035780076, 6)
-
+        self.assertAlmostEqual(rpa_obj.e_tot,  -76.26651423730257, 6)
+        self.assertAlmostEqual(rpa_obj.e_corr, -0.307830040357800, 6)
 
 if __name__ == "__main__":
     print("Full Tests for GW")
diff --git a/pyscf/gw/test/test_ugw.py b/pyscf/gw/test/test_ugw.py
index 2234ced2d6..81ada072ff 100644
--- a/pyscf/gw/test/test_ugw.py
+++ b/pyscf/gw/test/test_ugw.py
@@ -41,8 +41,9 @@ def test_gwac_pade(self):
     def test_rpa(self):
         rpa_obj = urpa.URPA(mf, frozen=0)
         rpa_obj.kernel()
-        self.assertAlmostEqual(rpa_obj.e_tot, -74.98258098665727, 6)
-        self.assertAlmostEqual(rpa_obj.e_corr, -0.18821540003542925, 6)
+
+        self.assertAlmostEqual(rpa_obj.e_tot, -74.98369614250653, 6)
+        self.assertAlmostEqual(rpa_obj.e_corr, -0.1882153685614803, 6)
 
 
 if __name__ == "__main__":
diff --git a/pyscf/gw/urpa.py b/pyscf/gw/urpa.py
index 5d338a7638..576de8c30e 100755
--- a/pyscf/gw/urpa.py
+++ b/pyscf/gw/urpa.py
@@ -32,95 +32,10 @@
 from pyscf.ao2mo import _ao2mo
 from pyscf import df, scf
 from pyscf.mp.ump2 import get_nocc, get_nmo, get_frozen_mask
-from pyscf.gw.rpa import RPA, _get_scaled_legendre_roots
 
-einsum = lib.einsum
+import pyscf.gw.rpa
 
-# ****************************************************************************
-# core routines, kernel, rpa_ecorr, rho_response
-# ****************************************************************************
-
-def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=40, x0=0.5, verbose=logger.NOTE):
-    """
-    RPA correlation and total energy
-
-    Args:
-        Lpq : density fitting 3-center integral in MO basis.
-        nw : number of frequency point on imaginary axis.
-        x0: scaling factor for frequency grid.
-
-    Returns:
-        e_tot : RPA total energy
-        e_hf : EXX energy
-        e_corr : RPA correlation energy
-    """
-    mf = rpa._scf
-    # only support frozen core
-    if rpa.frozen is not None:
-        assert isinstance(rpa.frozen, int)
-        assert (rpa.frozen < rpa.nocc[0] and rpa.frozen < rpa.nocc[1])
-
-    if Lpq is None:
-        Lpq = rpa.ao2mo(mo_coeff)
-
-    # Grids for integration on imaginary axis
-    freqs, wts = _get_scaled_legendre_roots(nw, x0)
-
-    # Compute HF exchange energy (EXX)
-    dm = mf.make_rdm1()
-    uhf = scf.UHF(rpa.mol)
-    e_hf = uhf.energy_elec(dm=dm)[0]
-    e_hf += mf.energy_nuc()
-
-    # Compute RPA correlation energy
-    e_corr = get_rpa_ecorr(rpa, Lpq, freqs, wts)
-
-    # Compute total energy
-    e_tot = e_hf + e_corr
-
-    logger.debug(rpa, '  RPA total energy = %s', e_tot)
-    logger.debug(rpa, '  EXX energy = %s, RPA corr energy = %s', e_hf, e_corr)
-
-    return e_tot, e_hf, e_corr
-
-def get_rpa_ecorr(rpa, Lpq, freqs, wts):
-    """
-    Compute RPA correlation energy
-    """
-    mo_energy = _mo_energy_without_core(rpa, rpa._scf.mo_energy)
-    nocca, noccb = rpa.nocc
-    nw = len(freqs)
-    naux = Lpq[0].shape[0]
-
-    homo = max(mo_energy[0][nocca-1], mo_energy[1][noccb-1])
-    lumo = min(mo_energy[0][nocca], mo_energy[1][noccb])
-    if (lumo-homo) < 1e-3:
-        logger.warn(rpa, 'Current RPA code not well-defined for degeneracy!')
-
-    e_corr = 0.
-    for w in range(nw):
-        Pi = get_rho_response(freqs[w], mo_energy, Lpq[0,:,:nocca,nocca:], Lpq[1,:,:noccb,noccb:])
-        ec_w = np.log(np.linalg.det(np.eye(naux) - Pi))
-        ec_w += np.trace(Pi)
-        e_corr += 1./(2.*np.pi) * ec_w * wts[w]
-
-    return e_corr
-
-def get_rho_response(omega, mo_energy, Lpqa, Lpqb):
-    '''
-    Compute density response function in auxiliary basis at freq iw
-    '''
-    naux, nocca, nvira = Lpqa.shape
-    naux, noccb, nvirb = Lpqb.shape
-    eia_a = mo_energy[0,:nocca,None] - mo_energy[0,None,nocca:]
-    eia_b = mo_energy[1,:noccb,None] - mo_energy[1,None,noccb:]
-    eia_a = eia_a / (omega**2 + eia_a*eia_a)
-    eia_b = eia_b / (omega**2 + eia_b*eia_b)
-    Pia_a = Lpqa * (eia_a * 2.0)
-    Pia_b = Lpqb * (eia_b * 2.0)
-    # Response from both spin-up and spin-down density
-    Pi = einsum('Pia, Qia -> PQ', Pia_a, Lpqa) + einsum('Pia, Qia -> PQ', Pia_b, Lpqb)
-    return Pi
+einsum = lib.einsum
 
 def _mo_energy_without_core(rpa, mo_energy):
     moidx = get_frozen_mask(rpa)
@@ -132,8 +47,7 @@ def _mo_without_core(rpa, mo):
     mo = (mo[0][:,moidx[0]], mo[1][:,moidx[1]])
     return np.asarray(mo)
 
-class URPA(RPA):
-
+class URPA(pyscf.gw.rpa.RPA):
     def dump_flags(self):
         log = logger.Logger(self.stdout, self.verbose)
         log.info('')
@@ -153,57 +67,144 @@ def dump_flags(self):
     get_nmo = get_nmo
     get_frozen_mask = get_frozen_mask
 
-    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40, x0=0.5):
+    def make_e_ov(self, mo_energy=None):
+        """
+        Compute orbital energy differences
+        """
+        if mo_energy is None:
+            mo_energy = _mo_energy_without_core(self, self.mo_energy)
+
+        nocc_a, nocc_b = self.nocc
+        e_ov_a = (mo_energy[0][:nocc_a, None] - mo_energy[0][None, nocc_a:]).ravel()
+        e_ov_b = (mo_energy[1][:nocc_b, None] - mo_energy[1][None, nocc_b:]).ravel()
+
+        gap = (-e_ov_a.max(), -e_ov_b.max())
+        logger.info(self, 'Lowest orbital energy difference: (% 6.4e, % 6.4e)', gap[0], gap[1])
+
+        if (np.min(gap) < 1e-3):
+            logger.warn(self, 'RPA code not well-defined for degenerate systems!')
+            logger.warn(self, 'Lowest orbital energy difference: % 6.4e', np.min(gap))
+
+        return e_ov_a, e_ov_b
+
+    def make_dielectric_matrix(self, omega, e_ov=None, cderi_ov=None, blksize=None):
         """
         Args:
-            mo_energy : 2D array (2, nmo), mean-field mo energy
-            mo_coeff : 3D array (2, nmo, nmo), mean-field mo coefficient
-            Lpq : 4D array (2, naux, nmo, nmo), 3-index ERI
-            nw: integer, grid number
-            x0: real, scaling factor for frequency grid
+            omega : float, frequency
+            mo_energy : (2, nmo), mean-field mo energy
+            mo_coeff :  (2, nao, nmo), mean-field mo coefficient
+            cderi_ov :  (2, naux, nocc, nvir), Cholesky decomposed ERI in OV subspace.
 
         Returns:
-            self.e_tot : RPA total eenrgy
-            self.e_hf : EXX energy
-            self.e_corr : RPA correlation energy
+            diel : 2D array (naux, naux), dielectric matrix
         """
+        assert cderi_ov is not None
+        assert e_ov is not None
+
+        naux = self.with_df.get_naoaux()
+        blksize = blksize or max(e_ov[0].size, e_ov[1].size)
+
+        diel = np.zeros((naux, naux))
+        for s, e_ov_s in enumerate((e_ov[0], e_ov[1])):
+            cderi_ov_s = cderi_ov[s] if isinstance(cderi_ov, tuple) else cderi_ov["cderi_ov_%d" % s]
+            diel += pyscf.gw.rpa.make_dielectric_matrix(omega, e_ov_s, cderi_ov_s, blksize=blksize)
+
+        return diel
+
+    def ao2mo(self, mo_coeff=None, blksize=None):
         if mo_coeff is None:
-            mo_coeff = _mo_without_core(self, self._scf.mo_coeff)
-        if mo_energy is None:
-            mo_energy = _mo_energy_without_core(self, self._scf.mo_energy)
+            mo_coeff = _mo_without_core(self, self.mo_coeff)
 
-        cput0 = (logger.process_clock(), logger.perf_counter())
-        self.dump_flags()
-        self.e_tot, self.e_hf, self.e_corr = \
-                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, x0=x0, verbose=self.verbose)
+        mo_coeff_a = mo_coeff[0]
+        mo_coeff_b = mo_coeff[1]
 
-        logger.timer(self, 'RPA', *cput0)
-        return self.e_corr
+        nocc_a, nocc_b = self.nocc
+        norb_a, norb_b = self.nmo
+        nvir_a, nvir_b = norb_a - nocc_a, norb_b - nocc_b
 
-    def ao2mo(self, mo_coeff=None):
-        nmoa, nmob = self.nmo
-        nao = self.mo_coeff[0].shape[0]
         naux = self.with_df.get_naoaux()
-        mem_incore = (nmoa**2*naux + nmob**2*naux + nao**2*naux) * 8/1e6
-        mem_now = lib.current_memory()[0]
-
-        moa = np.asarray(mo_coeff[0], order='F')
-        mob = np.asarray(mo_coeff[1], order='F')
-        ijslicea = (0, nmoa, 0, nmoa)
-        ijsliceb = (0, nmob, 0, nmob)
-        Lpqa = None
-        Lpqb = None
-        if (mem_incore + mem_now < 0.99*self.max_memory) or self.mol.incore_anyway:
-            Lpqa = _ao2mo.nr_e2(self.with_df._cderi, moa, ijslicea, aosym='s2', out=Lpqa)
-            Lpqb = _ao2mo.nr_e2(self.with_df._cderi, mob, ijsliceb, aosym='s2', out=Lpqb)
-            return np.asarray((Lpqa.reshape(naux,nmoa,nmoa),Lpqb.reshape(naux,nmob,nmob)))
+        sov_a = (0, nocc_a, nocc_a, norb_a)
+        sov_b = (0, nocc_b, nocc_b, norb_b)
+
+        blksize  = naux if blksize is None else blksize
+        cderi_ov = None
+        cderi_ov_a = None
+        cderi_ov_b = None
+
+        cput0 = (logger.process_clock(), logger.perf_counter())
+        if blksize >= naux or self.mol.incore_anyway:
+            assert isinstance(self.with_df._cderi, np.ndarray)
+            cderi_ov_a = _ao2mo.nr_e2(
+                self.with_df._cderi, mo_coeff_a,
+                sov_a, aosym='s2', out=cderi_ov_a
+                                    )
+
+            cderi_ov_b = _ao2mo.nr_e2(
+                self.with_df._cderi, mo_coeff_b,
+                sov_b, aosym='s2', out=cderi_ov_b
+                                    )
+            cderi_ov = (cderi_ov_a, cderi_ov_b)
+
+            logger.timer(self, 'incore ao2mo', *cput0)
+
         else:
-            logger.warn(self, 'Memory may not be enough!')
-            raise NotImplementedError
+            fswap = lib.H5TmpFile()
+            fswap.create_dataset('cderi_ov_0', (naux, nocc_a * nvir_a), 'f8')
+            fswap.create_dataset('cderi_ov_1', (naux, nocc_b * nvir_b), 'f8')
+
+            q0 = 0
+            for cderi in self.with_df.loop(blksize=blksize):
+                q1 = q0 + cderi.shape[0]
+
+                v_ov_a = _ao2mo.nr_e2(
+                    cderi, mo_coeff_a,
+                    sov_a, aosym='s2'
+                                    )
+                fswap['cderi_ov_0'][q0:q1] = v_ov_a
+                v_ov_a = None
+
+                v_ov_b = _ao2mo.nr_e2(
+                    cderi, mo_coeff_b,
+                    sov_b, aosym='s2'
+                                    )
+                fswap['cderi_ov_1'][q0:q1] = v_ov_b
+                v_ov_b = None
+
+                q0 = q1
+
+            logger.timer(self, 'outcore ao2mo', *cput0)
+
+            cderi_ov = fswap
+
+        return cderi_ov
 
 
 if __name__ == '__main__':
     from pyscf import gto, dft
+    # Closed-shell unrestricted RPA
+    mol = gto.Mole()
+    mol.verbose = 4
+    mol.atom = [
+        [8 , (0. , 0.     , 0.)],
+        [1 , (0. , -0.7571 , 0.5861)],
+        [1 , (0. , 0.7571 , 0.5861)]]
+    mol.basis = 'def2svp'
+    mol.build()
+
+    mf = dft.UKS(mol)
+    mf.xc = 'pbe'
+    mf.kernel()
+
+    # Shall be identical to the restricted RPA result
+    rpa = URPA(mf)
+    rpa.max_memory = 0
+    rpa.verbose = 5
+    rpa.kernel()
+    print ('RPA e_tot, e_hf, e_corr = ', rpa.e_tot, rpa.e_hf, rpa.e_corr)
+    assert (abs(rpa.e_corr - -0.307830040357800) < 1e-6)
+    assert (abs(rpa.e_tot  - -76.26651423730257) < 1e-6)
+
+    # Open-shell RPA
     mol = gto.Mole()
     mol.verbose = 4
     mol.atom = 'F 0 0 0'
@@ -213,10 +214,17 @@ def ao2mo(self, mo_coeff=None):
 
     mf = dft.UKS(mol)
     mf.xc = 'pbe0'
+    mf.max_memory = 0
     mf.kernel()
 
     rpa = URPA(mf)
+    rpa.max_memory = 0
+    rpa.verbose = 5
     rpa.kernel()
     print ('RPA e_tot, e_hf, e_corr = ', rpa.e_tot, rpa.e_hf, rpa.e_corr)
-    assert (abs(rpa.e_corr- -0.20980646878974454) < 1e-6)
-    assert (abs(rpa.e_tot- -99.49292565821425) < 1e-6)
+    assert (abs(rpa.e_corr - -0.20980646878974454) < 1e-6)
+    assert (abs(rpa.e_tot  - -99.49455969299747) < 1e-6)
+
+
+
+

From 10f89c376371ed55b99075cd77ba209181629ca2 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Mon, 25 Mar 2024 16:31:40 -0700
Subject: [PATCH 24/44] DFT wrapper supporting conventional DFT methods (#2084)

* added solvent models

* add example for RHF

* cleanup variables

* support casci casscf and ccsd

* uncomment unittests

* change example name

* update reset

* for flake8

* fixed a bug in soscf/newton_ah.py

* updated for recent master changes

* remove whitespace

* remove whitespace

* native support dispersion correction

* fixed a bug in pcm

* Update hf.py

* move dispersion to addons

* remove disp in RKS

* call get_dispersion in kernels

* added unit test for d4

* fixed dispersion correction in testing h2o

* updated ci

* skip unittest if dftd3 or dftd4 is missing

* new high level dft wrapper

* clean up & move dispersion to energy_tot

* remove disp from dft _keys

* fixed import dftd3_xc_map

* fixed typo

* added a wrapper for is_nlc

* replaced more ni.libxc.is_nlc with ni.is_nlc

* added wb97x-d3 as a special case

* added disp_3body

* removesuffix -> replace

* address qiming's comments

* add dft_parser && change back to libxc.is_nlc

* fixed unit test

* disable wb97x-d3, wb97x-d

* skip dftd3 for py3.12 & skip wb97x-d3

* Update dft_parser; Add tests for dft_parser

* Add warning messages

* Update test_h2o.py

---------

Co-authored-by: Qiming Sun <osirpt.sun@gmail.com>
---
 examples/dft/00-simple_dft.py | 10 ++++
 pyscf/dft/dft_parser.py       | 87 +++++++++++++++++++++++++++++++++++
 pyscf/dft/libxc.py            | 12 ++++-
 pyscf/dft/rks.py              |  5 +-
 pyscf/dft/test/test_h2o.py    | 37 ++++++++++++++-
 pyscf/dft/test/test_libxc.py  | 14 ++++++
 pyscf/dft/xcfun.py            | 10 +++-
 pyscf/grad/dispersion.py      | 43 ++++++++++-------
 pyscf/hessian/dispersion.py   | 52 ++++++++++++---------
 pyscf/scf/dispersion.py       | 35 ++++++++------
 pyscf/scf/hf.py               | 17 ++++---
 11 files changed, 255 insertions(+), 67 deletions(-)
 create mode 100644 pyscf/dft/dft_parser.py

diff --git a/examples/dft/00-simple_dft.py b/examples/dft/00-simple_dft.py
index 7ba632fc6f..a84f56110e 100644
--- a/examples/dft/00-simple_dft.py
+++ b/examples/dft/00-simple_dft.py
@@ -34,3 +34,13 @@
 
 # Orbital energies, Mulliken population etc.
 mf.analyze()
+
+# shorten dft names
+mf = mol.KS(xc='b3lyp-d3bj')
+#mf = mol.KS(xc='b3lyp-d3zero')
+#mf = mol.KS(xc='b3lyp-d3bj2b')
+#mf = mol.KS(xc='b3lyp-d3bjatm')
+#mf = mol.KS(xc='b3lyp-d4')
+#mf = mol.KS(xc='wb97x-v')
+#mf = mol.KS(xc='wb97m-d3bj)
+#mf = mol.KS(xc='wb97x-d3)
diff --git a/pyscf/dft/dft_parser.py b/pyscf/dft/dft_parser.py
new file mode 100644
index 0000000000..d9fdd05cb9
--- /dev/null
+++ b/pyscf/dft/dft_parser.py
@@ -0,0 +1,87 @@
+
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+unified dft parser for coordinating dft protocols with
+1. xc functionals
+2. dispersion corrections / nonlocal correction
+3. GTO basis (TODO)
+4. geometrical counterpoise (gCP) correction (TODO)
+'''
+
+from functools import lru_cache
+import warnings
+
+# supported dispersion corrections
+DISP_VERSIONS = ['d3bj', 'd3zero', 'd3bjm', 'd3zerom', 'd3op', 'd4']
+
+@lru_cache(128)
+def parse_dft(dft_method):
+    ''' conventional dft method ->
+    (xc, enable nlc, (xc for dftd3, dispersion version, with 3-body dispersion))
+    '''
+    if not isinstance(dft_method, str):
+        return dft_method, None, (dft_method, None, False)
+    method_lower = dft_method.lower()
+
+    # special cases:
+    # - wb97x-d is not supported yet
+    # - wb97*-d3bj is wb97*-v with d3bj
+    # - wb97x-d3 is not supported yet
+    # - 3c method is not supported yet
+
+    if method_lower == 'wb97x-d':
+        raise NotImplementedError('wb97x-d is not supported yet.')
+
+    if method_lower == 'wb97m-d3bj':
+        return 'wb97m-v', False, ('wb97m', 'd3bj', False)
+    if method_lower == 'b97m-d3bj':
+        return 'b97m-v', False, ('b97m', 'd3bj', False)
+    if method_lower == 'wb97x-d3bj':
+        return 'wb97x-v', False, ('wb97x', 'd3bj', False)
+
+    # J. Chem. Theory Comput. 2013, 9, 1, 263-272
+    if method_lower in ['wb97x-d3']:
+        raise NotImplementedError('wb97x-d3 is not supported yet.')
+
+    if method_lower.endswith('-3c'):
+        raise NotImplementedError('*-3c methods are not supported yet.')
+
+    xc = dft_method
+    disp = None
+    for d in DISP_VERSIONS:
+        if method_lower.endswith(d):
+            disp = d
+            with_3body = False
+            xc = method_lower.replace(f'-{d}','')
+        elif method_lower.endswith(d+'2b'):
+            disp = d
+            with_3body = False
+            xc = method_lower.replace(f'-{d}2b', '')
+        elif method_lower.endswith(d+'atm'):
+            disp = d
+            with_3body = True
+            xc = method_lower.replace(f'-{d}atm', '')
+
+        if disp is not None:
+            if xc in ('b97m', 'wb97m'):
+                warnings.warn(
+                    f'{dft_method} is not a well-defined functional. '
+                    'The XC part is changed to {xc}-v')
+                return xc+'-v', False, (xc, disp, with_3body)
+            else:
+                return xc, None, (xc, disp, with_3body)
+
+    return xc, None, (xc, None, False)
diff --git a/pyscf/dft/libxc.py b/pyscf/dft/libxc.py
index 8e6d51267f..6b2a841a52 100644
--- a/pyscf/dft/libxc.py
+++ b/pyscf/dft/libxc.py
@@ -30,7 +30,7 @@
 from functools import lru_cache
 from pyscf import lib
 from pyscf.dft.xc.utils import remove_dup, format_xc_code
-from pyscf.dft import xc_deriv
+from pyscf.dft import xc_deriv, dft_parser
 from pyscf import __config__
 
 _itrf = lib.load_library('libxc_itrf')
@@ -922,6 +922,10 @@ def is_gga(xc_code):
 
 @lru_cache(100)
 def is_nlc(xc_code):
+    enable_nlc = dft_parser.parse_dft(xc_code)[1]
+    if enable_nlc is False:
+        return False
+    # identify nlc by xc_code itself if enable_nlc is None
     if isinstance(xc_code, str):
         if xc_code.isdigit():
             return _itrf.LIBXC_is_nlc(ctypes.c_int(int(xc_code)))
@@ -1087,6 +1091,7 @@ def parse_xc(description):
         decoded XC description, with the data structure
         (hybrid, alpha, omega), ((libxc-Id, fac), (libxc-Id, fac), ...)
     '''  # noqa: E501
+
     hyb = [0, 0, 0]  # hybrid, alpha, omega (== SR_HF, LR_HF, omega)
     if description is None:
         return tuple(hyb), ()
@@ -1105,6 +1110,8 @@ def parse_xc(description):
                       'To restore the VWN5 definition, you can put the setting '
                       '"B3LYP_WITH_VWN5 = True" in pyscf_conf.py')
 
+    description = dft_parser.parse_dft(description)[0]
+
     def assign_omega(omega, hyb_or_sr, lr=0):
         if hyb[2] == omega or omega == 0:
             hyb[0] += hyb_or_sr
@@ -1231,6 +1238,8 @@ def possible_c_for(key):
             parse_token(token, 'C')
     else:
         for token in description.replace('-', '+-').replace(';+', ';').split('+'):
+            # dftd3 cannot be used in a custom xc description
+            assert '-d3' not in token
             parse_token(token, 'compound XC', search_xc_alias=True)
     if hyb[2] == 0: # No omega is assigned. LR_HF is 0 for normal Coulomb operator
         hyb[1] = 0
@@ -1251,6 +1260,7 @@ def possible_c_for(key):
                    'WB97X-D'  : 'WB97X_D',
                    'WB97X-V'  : 'WB97X_V',
                    'WB97M-V'  : 'WB97M_V',
+                   'WB97X-D3' : 'WB97X_D3',
                    'B97M-V'   : 'B97M_V',
                    'M05-2X'   : 'M05_2X',
                    'M06-L'    : 'M06_L',
diff --git a/pyscf/dft/rks.py b/pyscf/dft/rks.py
index 2a5d82c2d1..5972f98e20 100644
--- a/pyscf/dft/rks.py
+++ b/pyscf/dft/rks.py
@@ -260,7 +260,6 @@ def define_xc_(ks, description, xctype='LDA', hyb=0, rsh=(0,0,0)):
     ks._numint = libxc.define_xc_(ks._numint, description, xctype, hyb, rsh)
     return ks
 
-
 def _dft_common_init_(mf, xc='LDA,VWN'):
     raise DeprecationWarning
 
@@ -321,10 +320,12 @@ class KohnShamDFT:
     -76.415443079840458
     '''
 
-    _keys = {'xc', 'nlc', 'grids', 'nlcgrids', 'small_rho_cutoff'}
+    _keys = {'xc', 'nlc', 'grids', 'disp', 'disp_with_3body', 'nlcgrids', 'small_rho_cutoff'}
 
     def __init__(self, xc='LDA,VWN'):
         self.xc = xc
+        self.disp = None
+        self.disp_with_3body = None
         self.nlc = ''
         self.grids = gen_grid.Grids(self.mol)
         self.grids.level = getattr(
diff --git a/pyscf/dft/test/test_h2o.py b/pyscf/dft/test/test_h2o.py
index 7eee60fe8c..1c1c1ececf 100644
--- a/pyscf/dft/test/test_h2o.py
+++ b/pyscf/dft/test/test_h2o.py
@@ -501,6 +501,41 @@ def test_nr_uks_vv10_high_cost(self):
         method.nlcgrids.atom_grid = {"H": (40, 110), "O": (40, 110),}
         self.assertAlmostEqual(method.scf(), -76.352381513158718, 8)
 
+    @unittest.skipIf('dftd3' not in sys.modules, "requires the dftd3 library")
+    def test_dft_parser(self):
+        from pyscf.scf import dispersion
+        method = dft.RKS(h2o, xc='wb97m-d3bj')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0007551366628786623, 9)
+        assert method._numint.libxc.is_nlc(method.xc) == False
+        fn_facs = method._numint.libxc.parse_xc(method.xc)
+        assert fn_facs[1][0][0] == 531
+
+        method = dft.RKS(h2o, xc='wb97x-d3bj')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0005697890844546384, 9)
+        assert method._numint.libxc.is_nlc(method.xc) == False
+        fn_facs = method._numint.libxc.parse_xc(method.xc)
+        assert fn_facs[1][0][0] == 466
+
+        method = dft.RKS(h2o, xc='b3lyp-d3bj')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0005738788210828446, 9)
+        fn_facs = method._numint.libxc.parse_xc(method.xc)
+        assert fn_facs[1][0][0] == 402
+
+        method = dft.RKS(h2o, xc='b3lyp-d3bjm2b')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0006949127588605776, 9)
+
+        method = dft.RKS(h2o, xc='b3lyp-d3bjmatm')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0006949125270554931, 9)
+
+        method = dft.UKS(h2o, xc='b3lyp-d3bjmatm')
+        e_disp = dispersion.get_dispersion(method)
+        self.assertAlmostEqual(e_disp, -0.0006949125270554931, 9)
+
     def test_camb3lyp_rsh_omega(self):
         mf = dft.RKS(h2o)
         mf.grids.atom_grid = {"H": (50, 194), "O": (50, 194),}
@@ -524,7 +559,7 @@ def test_dispersion(self):
         mf.xc = 'B3LYP'
         mf.disp = 'd3bj'
         mf.run(xc='B3LYP')
-        self.assertAlmostEqual(mf.e_tot, -76.38945547396322, 9)
+        self.assertAlmostEqual(mf.e_tot, -76.38552043811778, 9)
 
     def test_reset(self):
         mf = dft.RKS(h2o).newton()
diff --git a/pyscf/dft/test/test_libxc.py b/pyscf/dft/test/test_libxc.py
index 4bacb402e5..f4223fdeec 100644
--- a/pyscf/dft/test/test_libxc.py
+++ b/pyscf/dft/test/test_libxc.py
@@ -341,6 +341,20 @@ def test_m06(self):
         self.assertAlmostEqual(abs(numpy.hstack([fxc[i] for i in [0,1,2,4,6,9]])-fxc_ref).max(), 0, 7)
         self.assertAlmostEqual(abs(numpy.hstack([kxc[i] for i in [0,1,2,3,5,7,10,12,15,19]])-kxc_ref).max(), 0, 6)
 
+    def test_dft_parser(self):
+        from pyscf.dft.dft_parser import parse_dft
+        self.assertEqual(parse_dft('wb97m-d3bj'), ('wb97m-v', False, ('wb97m', 'd3bj', False)))
+        self.assertEqual(dft.libxc.parse_xc('wb97m-d3bj')[1][0][0], 531)
+        self.assertTrue(not dft.libxc.is_nlc('wb97m-d3bj'))
+
+        self.assertEqual(parse_dft('wb97-d3zerom'), ('wb97', None, ('wb97', 'd3zerom', False)))
+        self.assertTrue(not dft.libxc.is_nlc('wb97-d3zerom'))
+
+        self.assertEqual(parse_dft('wb97m-d3bjatm'), ('wb97m-v', False, ('wb97m', 'd3bj', True)))
+        self.assertTrue(not dft.libxc.is_nlc('wb97m-d3bjatm'))
+
+        self.assertEqual(parse_dft('wb97x-d3zero2b'), ('wb97x', None, ('wb97x', 'd3zero', False)))
+        self.assertTrue(not dft.libxc.is_nlc('wb97x-d3zero2b'))
 
 if __name__ == "__main__":
     print("Test libxc")
diff --git a/pyscf/dft/xcfun.py b/pyscf/dft/xcfun.py
index c49c5e9ab8..d830c87b64 100644
--- a/pyscf/dft/xcfun.py
+++ b/pyscf/dft/xcfun.py
@@ -28,7 +28,7 @@
 import numpy
 from pyscf import lib
 from pyscf.dft.xc.utils import remove_dup, format_xc_code
-from pyscf.dft import xc_deriv
+from pyscf.dft import xc_deriv, dft_parser
 from pyscf import __config__
 
 _itrf = lib.load_library('libxcfun_itrf')
@@ -318,6 +318,9 @@ def is_gga(xc_code):
 VV10_XC.update([(5000+i, VV10_XC[key]) for i, key in enumerate(VV10_XC)])
 
 def is_nlc(xc_code):
+    enable_nlc = dft_parser.parse_dft(xc_code)[1]
+    if enable_nlc is False:
+        return False
     fn_facs = parse_xc(xc_code)[1]
     return any(xid >= 5000 for xid, c in fn_facs)
 
@@ -420,6 +423,8 @@ def parse_xc(description):
     elif not isinstance(description, str): #isinstance(description, (tuple,list)):
         return parse_xc('%s,%s' % tuple(description))
 
+    description = dft_parser.parse_dft(description)[0]
+
     def assign_omega(omega, hyb_or_sr, lr=0):
         if hyb[2] == omega or omega == 0:
             hyb[0] += hyb_or_sr
@@ -430,6 +435,7 @@ def assign_omega(omega, hyb_or_sr, lr=0):
             hyb[2] = omega
         else:
             raise ValueError('Different values of omega found for RSH functionals')
+
     fn_facs = []
     def parse_token(token, suffix, search_xc_alias=False):
         if token:
@@ -503,6 +509,8 @@ def parse_token(token, suffix, search_xc_alias=False):
             parse_token(token, 'C')
     else:
         for token in description.replace('-', '+-').replace(';+', ';').split('+'):
+            # dftd3 cannot be used in a custom xc description
+            assert '-d3' not in token
             parse_token(token, 'XC', search_xc_alias=True)
     if hyb[2] == 0: # No omega is assigned. LR_HF is 0 for normal Coulomb operator
         hyb[1] = 0
diff --git a/pyscf/grad/dispersion.py b/pyscf/grad/dispersion.py
index b4865db0a8..2cd5fe705c 100644
--- a/pyscf/grad/dispersion.py
+++ b/pyscf/grad/dispersion.py
@@ -21,36 +21,45 @@
 '''
 
 import numpy
-from pyscf.scf.hf import KohnShamDFT
+from pyscf.dft.rks import KohnShamDFT
+from pyscf.dft import dft_parser
 
-def get_dispersion(mf_grad, disp_version=None):
+def get_dispersion(mf_grad, disp_version=None, with_3body=False):
     '''gradient of dispersion correction for RHF/RKS'''
+    mf = mf_grad.base
+    mol = mf.mol
+    if isinstance(mf, KohnShamDFT):
+        method = mf.xc
+    else:
+        method = 'hf'
+    method, disp, with_3body = dft_parser.parse_dft(method)[2]
+
+    # priority: args > mf.disp > dft_parser
     if disp_version is None:
-        disp_version = mf_grad.base.disp
-    mol = mf_grad.base.mol
-    disp_version = mf_grad.base.disp
+        disp_version = disp
+        # dispersion version can be customized via mf.disp
+        if hasattr(mf, 'disp') and mf.disp is not None:
+            disp_version = mf.disp
+
     if disp_version is None:
         return numpy.zeros([mol.natm,3])
 
-    if isinstance(mf_grad.base, KohnShamDFT):
-        method = mf_grad.base.xc
-    else:
-        method = 'hf'
+    # 3-body contribution can be disabled with mf.disp_with_3body
+    if hasattr(mf, 'disp_with_3body') and mf.disp_with_3body is not None:
+        with_3body = mf.disp_with_3body
 
     if disp_version[:2].upper() == 'D3':
         # raised error in SCF module, assuming dftd3 installed
         import dftd3.pyscf as disp
-        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version)
+        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
         _, g_d3 = d3.kernel()
         return g_d3
     elif disp_version[:2].upper() == 'D4':
-        from pyscf.data.elements import charge
-        atoms = numpy.array([ charge(a[0]) for a in mol._atom])
-        coords = mol.atom_coords()
-        from dftd4.interface import DampingParam, DispersionModel
-        model = DispersionModel(atoms, coords)
-        res = model.get_dispersion(DampingParam(method=method), grad=True)
-        return res.get("gradient")
+        # raised error in SCF module, assuming dftd3 installed
+        import dftd4.pyscf as disp
+        d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+        _, g_d4 = d4.kernel()
+        return g_d4
     else:
         raise RuntimeError(f'dispersion correction: {disp_version} is not supported.')
 
diff --git a/pyscf/hessian/dispersion.py b/pyscf/hessian/dispersion.py
index 8751da7e1c..728b01e2cf 100644
--- a/pyscf/hessian/dispersion.py
+++ b/pyscf/hessian/dispersion.py
@@ -22,21 +22,33 @@
 
 
 import numpy
-from pyscf.scf.hf import KohnShamDFT
+from pyscf.dft.rks import KohnShamDFT
+from pyscf.dft import dft_parser
 
-def get_dispersion(hessobj, disp_version=None):
+def get_dispersion(hessobj, disp_version=None, with_3body=False):
+    mf = hessobj.base
+    mol = mf.mol
+    if isinstance(mf, KohnShamDFT):
+        method = mf.xc
+    else:
+        method = 'hf'
+    method, disp, with_3body = dft_parser.parse_dft(method)[2]
+
+    # priority: args > mf.disp > dft_parser
     if disp_version is None:
-        disp_version = hessobj.base.disp
-    mol = hessobj.base.mol
+        disp_version = disp
+        # dispersion version can be customized via mf.disp
+        if hasattr(mf, 'disp') and mf.disp is not None:
+            disp_version = mf.disp
+
     natm = mol.natm
-    mf = hessobj.base
     h_disp = numpy.zeros([natm,natm,3,3])
     if disp_version is None:
         return h_disp
-    if isinstance(hessobj.base, KohnShamDFT):
-        method = hessobj.base.xc
-    else:
-        method = 'hf'
+
+    # 3-body contribution can be disabled with mf.disp_with_3body
+    if hasattr(mf, 'disp_with_3body') and mf.disp_with_3body is not None:
+        with_3body = mf.disp_with_3body
 
     if mf.disp[:2].upper() == 'D3':
         import dftd3.pyscf as disp
@@ -47,12 +59,12 @@ def get_dispersion(hessobj, disp_version=None):
             for j in range(3):
                 coords[i,j] += eps
                 mol.set_geom_(coords, unit='Bohr')
-                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp)
+                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
                 _, g1 = d3.kernel()
 
                 coords[i,j] -= 2.0*eps
                 mol.set_geom_(coords, unit='Bohr')
-                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp)
+                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
                 _, g2 = d3.kernel()
 
                 coords[i,j] += eps
@@ -60,27 +72,21 @@ def get_dispersion(hessobj, disp_version=None):
             return h_disp
 
     elif mf.disp[:2].upper() == 'D4':
-        from pyscf.data.elements import charge
-        atoms = numpy.array([ charge(a[0]) for a in mol._atom])
-        coords = mol.atom_coords()
-        natm = mol.natm
-        from dftd4.interface import DampingParam, DispersionModel
-        params = DampingParam(method=method)
+        import dftd4.pyscf as disp
+        coords = hessobj.mol.atom_coords()
         mol = mol.copy()
         eps = 1e-5
         for i in range(natm):
             for j in range(3):
                 coords[i,j] += eps
                 mol.set_geom_(coords, unit='Bohr')
-                model = DispersionModel(atoms, coords)
-                res = model.get_dispersion(params, grad=True)
-                g1 = res.get("gradient")
+                d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+                _, g1 = d4.kernel()
 
                 coords[i,j] -= 2.0*eps
                 mol.set_geom_(coords, unit='Bohr')
-                model = DispersionModel(atoms, coords)
-                res = model.get_dispersion(params, grad=True)
-                g2 = res.get("gradient")
+                d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+                _, g2 = d4.kernel()
 
                 coords[i,j] += eps
                 h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
diff --git a/pyscf/scf/dispersion.py b/pyscf/scf/dispersion.py
index 607ab5c9c7..94e9018ff7 100644
--- a/pyscf/scf/dispersion.py
+++ b/pyscf/scf/dispersion.py
@@ -20,20 +20,29 @@
 dispersion correction for HF and DFT
 '''
 
-
-import numpy
-from pyscf.scf.hf import KohnShamDFT
+from pyscf.dft.rks import KohnShamDFT
+from pyscf.dft import dft_parser
 
 def get_dispersion(mf, disp_version=None):
-    if disp_version is None:
-        disp_version = mf.disp
     mol = mf.mol
-    if disp_version is None:
-        return 0.0
     if isinstance(mf, KohnShamDFT):
         method = mf.xc
     else:
         method = 'hf'
+    method, disp, with_3body = dft_parser.parse_dft(method)[2]
+
+    # priority: args > mf.disp > dft_parser
+    if disp_version is None:
+        disp_version = disp
+        # dispersion version can be customized via mf.disp
+        if hasattr(mf, 'disp') and mf.disp is not None:
+            disp_version = mf.disp
+    if disp_version is None:
+        return 0.0
+
+    # 3-body contribution can be disabled with mf.disp_with_3body
+    if hasattr(mf, 'disp_with_3body') and mf.disp_with_3body is not None:
+        with_3body = mf.disp_with_3body
 
     # for dftd3
     if disp_version[:2].upper() == 'D3':
@@ -47,18 +56,15 @@ def get_dispersion(mf, disp_version=None):
         pip3 install dftd3 \n \
 **************************************")
 
-        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version)
+        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
         e_d3, _ = d3.kernel()
         mf.scf_summary['dispersion'] = e_d3
         return e_d3
 
     # for dftd4
     elif disp_version[:2].upper() == 'D4':
-        from pyscf.data.elements import charge
-        atoms = numpy.array([ charge(a[0]) for a in mol._atom])
-        coords = mol.atom_coords()
         try:
-            from dftd4.interface import DampingParam, DispersionModel
+            import dftd4.pyscf as disp
         except ImportError:
             raise ImportError("\n \
 cannot find dftd4 in the current environment. \n \
@@ -67,9 +73,8 @@ def get_dispersion(mf, disp_version=None):
         pip3 install dftd4 \n \
 ***************************************")
 
-        model = DispersionModel(atoms, coords)
-        res = model.get_dispersion(DampingParam(method=method), grad=False)
-        e_d4 = res.get("energy")
+        d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+        e_d4, _ = d4.kernel()
         mf.scf_summary['dispersion'] = e_d4
         return e_d4
     else:
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index 7119745708..209fa26afe 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -229,12 +229,6 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         if dump_chk:
             mf.dump_chk(locals())
 
-    #FIX DISP!!
-    if mf.disp is not None:
-        e_disp = mf.get_dispersion()
-        mf.scf_summary['dispersion'] = e_disp
-        e_tot += e_disp
-
     logger.timer(mf, 'scf_cycle', *cput0)
     # A post-processing hook before return
     mf.post_kernel(locals())
@@ -298,7 +292,16 @@ def energy_tot(mf, dm=None, h1e=None, vhf=None):
     '''
     nuc = mf.energy_nuc()
     e_tot = mf.energy_elec(dm, h1e, vhf)[0] + nuc
+    if mf.disp is not None:
+        if 'dispersion' in mf.scf_summary:
+            e_tot += mf.scf_summary['dispersion']
+        else:
+            e_disp = mf.get_dispersion()
+            mf.scf_summary['dispersion'] = e_disp
+            e_tot += e_disp
+
     mf.scf_summary['nuc'] = nuc.real
+
     return e_tot
 
 
@@ -1531,7 +1534,7 @@ class SCF(lib.StreamObject):
         'diis_file', 'diis_space_rollback', 'damp', 'level_shift',
         'direct_scf', 'direct_scf_tol', 'conv_check', 'callback',
         'mol', 'chkfile', 'mo_energy', 'mo_coeff', 'mo_occ',
-        'e_tot', 'converged', 'scf_summary', 'opt', 'disp',
+        'e_tot', 'converged', 'scf_summary', 'opt', 'disp', 'disp_with_3body',
     }
 
     def __init__(self, mol):

From 0a17e425e3c3dc28cfba0b54613194909db20548 Mon Sep 17 00:00:00 2001
From: xubwa <xubo.wang@colorado.edu>
Date: Wed, 27 Mar 2024 16:03:37 -0400
Subject: [PATCH 25/44] fix dipole moment in sfx2c1e

---
 pyscf/x2c/sfx2c1e.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyscf/x2c/sfx2c1e.py b/pyscf/x2c/sfx2c1e.py
index c3fde295c1..2cc4d61f72 100644
--- a/pyscf/x2c/sfx2c1e.py
+++ b/pyscf/x2c/sfx2c1e.py
@@ -128,8 +128,9 @@ def dip_moment(self, mol=None, dm=None, unit='Debye', verbose=logger.NOTE,
             if picture_change:
                 xmol = self.with_x2c.get_xmol()[0]
                 nao = xmol.nao
-                prp = xmol.intor_symmetric('int1e_sprsp').reshape(3,4,nao,nao)[:,0]
-                ao_dip = self.with_x2c.picture_change(('int1e_r', prp))
+                prp = xmol.intor_symmetric('int1e_sprsp').reshape(3,4,nao,nao)[:,3]
+                c1 = 0.5/lib.param.LIGHT_SPEED
+                ao_dip = self.with_x2c.picture_change(('int1e_r', prp*c1**2))
             else:
                 ao_dip = mol.intor_symmetric('int1e_r')
 

From 175f787372d55ef3090e46798ca1374ac00c575d Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Wed, 3 Apr 2024 15:14:12 -0700
Subject: [PATCH 26/44] debug and simplify to_gpu (#2149)

* support and simplify to_gpu

* remove comments

* flake8
---
 pyscf/cc/ccsd.py        | 11 +++++++++--
 pyscf/df/grad/rhf.py    |  2 --
 pyscf/df/grad/rks.py    |  2 --
 pyscf/df/grad/uhf.py    |  2 --
 pyscf/df/grad/uks.py    |  2 --
 pyscf/df/hessian/rhf.py |  1 -
 pyscf/df/hessian/rks.py |  1 -
 pyscf/df/hessian/uhf.py |  1 -
 pyscf/df/hessian/uks.py |  1 -
 pyscf/dft/numint.py     |  8 ++++++--
 pyscf/grad/rhf.py       | 10 +++++++---
 pyscf/grad/rks.py       |  2 --
 pyscf/grad/uhf.py       |  2 --
 pyscf/grad/uks.py       |  2 --
 pyscf/hessian/rhf.py    | 10 +++++++---
 pyscf/hessian/rks.py    |  1 -
 pyscf/hessian/uhf.py    |  2 --
 pyscf/hessian/uks.py    |  1 -
 pyscf/lib/misc.py       |  4 +++-
 pyscf/mp/dfmp2.py       |  2 --
 pyscf/mp/mp2.py         |  9 ++++++++-
 21 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/pyscf/cc/ccsd.py b/pyscf/cc/ccsd.py
index 65ee49486f..67aab80aae 100644
--- a/pyscf/cc/ccsd.py
+++ b/pyscf/cc/ccsd.py
@@ -1240,6 +1240,15 @@ def density_fit(self, auxbasis=None, with_df=None):
     def nuc_grad_method(self):
         raise NotImplementedError
 
+    # to_gpu can be reused only when __init__ still takes mf
+    def to_gpu(self):
+        mf = self.base.to_gpu()
+        from importlib import import_module
+        mod = import_module(self.__module__.replace('pyscf', 'gpu4pyscf'))
+        cls = getattr(mod, self.__class__.__name__)
+        obj = cls(mf)
+        return obj
+
 class CCSD(CCSDBase):
     __doc__ = CCSDBase.__doc__
 
@@ -1365,8 +1374,6 @@ def get_d2_diagnostic(self, t2=None):
         if t2 is None: t2 = self.t2
         return get_d2_diagnostic(t2)
 
-    to_gpu = lib.to_gpu
-
 CC = RCCSD = CCSD
 
 
diff --git a/pyscf/df/grad/rhf.py b/pyscf/df/grad/rhf.py
index 91ccd3e543..cfa31375eb 100644
--- a/pyscf/df/grad/rhf.py
+++ b/pyscf/df/grad/rhf.py
@@ -523,6 +523,4 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
diff --git a/pyscf/df/grad/rks.py b/pyscf/df/grad/rks.py
index a53d3d0ae6..adfc7b7080 100644
--- a/pyscf/df/grad/rks.py
+++ b/pyscf/df/grad/rks.py
@@ -123,6 +123,4 @@ def extra_force(self, atom_id, envs):
             e1 += envs['vhf'].aux[atom_id]
         return e1
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
diff --git a/pyscf/df/grad/uhf.py b/pyscf/df/grad/uhf.py
index af2e048591..0eec773b0d 100644
--- a/pyscf/df/grad/uhf.py
+++ b/pyscf/df/grad/uhf.py
@@ -60,6 +60,4 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
diff --git a/pyscf/df/grad/uks.py b/pyscf/df/grad/uks.py
index 9fa6f5cdf5..e6de663a95 100644
--- a/pyscf/df/grad/uks.py
+++ b/pyscf/df/grad/uks.py
@@ -124,6 +124,4 @@ def extra_force(self, atom_id, envs):
             e1 += envs['vhf'].aux[atom_id]
         return e1
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
diff --git a/pyscf/df/hessian/rhf.py b/pyscf/df/hessian/rhf.py
index 95bc7f9dcf..d06fa9f473 100644
--- a/pyscf/df/hessian/rhf.py
+++ b/pyscf/df/hessian/rhf.py
@@ -480,7 +480,6 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 #TODO: Insert into DF class
 
diff --git a/pyscf/df/hessian/rks.py b/pyscf/df/hessian/rks.py
index 30b59fc8d1..74c1bdd6c9 100644
--- a/pyscf/df/hessian/rks.py
+++ b/pyscf/df/hessian/rks.py
@@ -126,7 +126,6 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/df/hessian/uhf.py b/pyscf/df/hessian/uhf.py
index b252f99953..5cb20240f8 100644
--- a/pyscf/df/hessian/uhf.py
+++ b/pyscf/df/hessian/uhf.py
@@ -531,7 +531,6 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 #TODO: Insert into DF class
 
diff --git a/pyscf/df/hessian/uks.py b/pyscf/df/hessian/uks.py
index 92624a128f..1afa995973 100644
--- a/pyscf/df/hessian/uks.py
+++ b/pyscf/df/hessian/uks.py
@@ -139,7 +139,6 @@ def __init__(self, mf):
 
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 
 if __name__ == '__main__':
diff --git a/pyscf/dft/numint.py b/pyscf/dft/numint.py
index 1716042118..c448241587 100644
--- a/pyscf/dft/numint.py
+++ b/pyscf/dft/numint.py
@@ -2865,8 +2865,12 @@ def make_rho(idm, ao, sindex, xctype):
                                      with_lapl)
         return make_rho, ndms, nao
 
-    to_gpu = lib.to_gpu
-
+    def to_gpu(self):
+        try:
+            from gpu4pyscf.dft import numint
+            return numint.NumInt()
+        except ImportError:
+            raise ImportError('Cannot find GPU4PySCF')
 _NumInt = NumInt
 
 
diff --git a/pyscf/grad/rhf.py b/pyscf/grad/rhf.py
index e45e3b5ed9..c89da0e1f6 100644
--- a/pyscf/grad/rhf.py
+++ b/pyscf/grad/rhf.py
@@ -440,8 +440,14 @@ def _tag_rdm1 (self, dm, mo_coeff, mo_occ):
         to be split into alpha,beta in DF-ROHF subclass'''
         return lib.tag_array (dm, mo_coeff=mo_coeff, mo_occ=mo_occ)
 
+    # to_gpu can be reused only when __init__ still takes mf
     def to_gpu(self):
-        raise NotImplementedError
+        mf = self.base.to_gpu()
+        from importlib import import_module
+        mod = import_module(self.__module__.replace('pyscf', 'gpu4pyscf'))
+        cls = getattr(mod, self.__class__.__name__)
+        obj = cls(mf)
+        return obj
 
 # export the symbol GradientsMixin for backward compatibility.
 # GradientsMixin should be dropped in the future.
@@ -463,8 +469,6 @@ def make_rdm1e(self, mo_energy=None, mo_coeff=None, mo_occ=None):
 
     grad_elec = grad_elec
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
 
 from pyscf import scf
diff --git a/pyscf/grad/rks.py b/pyscf/grad/rks.py
index bb0198140d..7aee5fadfc 100644
--- a/pyscf/grad/rks.py
+++ b/pyscf/grad/rks.py
@@ -622,8 +622,6 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
 
 from pyscf import dft
diff --git a/pyscf/grad/uhf.py b/pyscf/grad/uhf.py
index 949b7abf44..0f46458975 100644
--- a/pyscf/grad/uhf.py
+++ b/pyscf/grad/uhf.py
@@ -106,8 +106,6 @@ def make_rdm1e(self, mo_energy=None, mo_coeff=None, mo_occ=None):
 
     grad_elec = grad_elec
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
 
 from pyscf import scf
diff --git a/pyscf/grad/uks.py b/pyscf/grad/uks.py
index cc73955814..644ab01584 100644
--- a/pyscf/grad/uks.py
+++ b/pyscf/grad/uks.py
@@ -275,8 +275,6 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-    to_gpu = lib.to_gpu
-
 Grad = Gradients
 
 from pyscf import dft
diff --git a/pyscf/hessian/rhf.py b/pyscf/hessian/rhf.py
index 9736eeff5c..f40df1bbe8 100644
--- a/pyscf/hessian/rhf.py
+++ b/pyscf/hessian/rhf.py
@@ -599,16 +599,20 @@ def kernel(self, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
 
     gen_hop = gen_hop
 
+    # to_gpu can be reused only when __init__ still takes mf
     def to_gpu(self):
-        raise NotImplementedError
-
+        mf = self.base.to_gpu()
+        from importlib import import_module
+        mod = import_module(self.__module__.replace('pyscf', 'gpu4pyscf'))
+        cls = getattr(mod, self.__class__.__name__)
+        obj = cls(mf)
+        return obj
 
 class Hessian(HessianBase):
 
     partial_hess_elec = partial_hess_elec
     hess_elec = hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 # Inject to RHF class
 from pyscf import scf
diff --git a/pyscf/hessian/rks.py b/pyscf/hessian/rks.py
index 31ee13115c..497b053383 100644
--- a/pyscf/hessian/rks.py
+++ b/pyscf/hessian/rks.py
@@ -590,7 +590,6 @@ def __init__(self, mf):
     partial_hess_elec = partial_hess_elec
     hess_elec = rhf_hess.hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 from pyscf import dft
 dft.rks.RKS.Hessian = dft.rks_symm.RKS.Hessian = lib.class_as_method(Hessian)
diff --git a/pyscf/hessian/uhf.py b/pyscf/hessian/uhf.py
index 4b97fbf6f9..eabbe231d2 100644
--- a/pyscf/hessian/uhf.py
+++ b/pyscf/hessian/uhf.py
@@ -454,8 +454,6 @@ def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile,
                          fx, atmlst, max_memory, verbose,
                          max_cycle=self.max_cycle, level_shift=self.level_shift)
 
-    to_gpu = lib.to_gpu
-
 from pyscf import scf
 scf.uhf.UHF.Hessian = lib.class_as_method(Hessian)
 
diff --git a/pyscf/hessian/uks.py b/pyscf/hessian/uks.py
index 2c3941452b..17a6693461 100644
--- a/pyscf/hessian/uks.py
+++ b/pyscf/hessian/uks.py
@@ -667,7 +667,6 @@ def __init__(self, mf):
     solve_mo1 = uhf_hess.Hessian.solve_mo1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    to_gpu = lib.to_gpu
 
 from pyscf import dft
 dft.uks.UKS.Hessian = dft.uks_symm.UKS.Hessian = lib.class_as_method(Hessian)
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 9819f2a978..65219dacaf 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -1379,6 +1379,7 @@ def __getattr__(self, key):
 # Then class can be instantiated easily like cls(omniobj) in the following
 # to_gpu function.
 omniobj = _OmniObject()
+omniobj._built = True
 omniobj.mol = omniobj
 omniobj._scf = omniobj
 omniobj.base = omniobj
@@ -1408,7 +1409,7 @@ def to_gpu(method, out=None):
         if isinstance(method, (SinglePointScanner, GradScanner)):
             method = method.undo_scanner()
 
-        import import_module
+        from importlib import import_module
         mod = import_module(method.__module__.replace('pyscf', 'gpu4pyscf'))
         cls = getattr(mod, method.__class__.__name__)
         # A temporary GPU instance. This ensures to initialize private
@@ -1430,3 +1431,4 @@ def to_gpu(method, out=None):
         setattr(out, key, val)
     out.reset()
     return out
+
diff --git a/pyscf/mp/dfmp2.py b/pyscf/mp/dfmp2.py
index 6522b21c19..d8cffdd57a 100644
--- a/pyscf/mp/dfmp2.py
+++ b/pyscf/mp/dfmp2.py
@@ -140,8 +140,6 @@ def update_amps(self, t2, eris):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
-    to_gpu = lib.to_gpu
-
 MP2 = DFMP2
 
 from pyscf import scf
diff --git a/pyscf/mp/mp2.py b/pyscf/mp/mp2.py
index 9d1dc431a5..e2d9caaa09 100644
--- a/pyscf/mp/mp2.py
+++ b/pyscf/mp/mp2.py
@@ -649,7 +649,14 @@ def nuc_grad_method(self):
     def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         return kernel(self, mo_energy, mo_coeff, eris, with_t2)
 
-    to_gpu = lib.to_gpu
+    # to_gpu can be reused only when __init__ still takes mf
+    def to_gpu(self):
+        mf = self._scf.to_gpu()
+        from importlib import import_module
+        mod = import_module(self.__module__.replace('pyscf', 'gpu4pyscf'))
+        cls = getattr(mod, self.__class__.__name__)
+        obj = cls(mf)
+        return obj
 
 RMP2 = MP2
 

From eafc3575234aca3832d270f4e1193bec2119d2b4 Mon Sep 17 00:00:00 2001
From: Hong-Zhou Ye <hzyechem@gmail.com>
Date: Wed, 3 Apr 2024 18:14:46 -0400
Subject: [PATCH 27/44] bug fix for RCCSD(T) with complex orbitals (#2141)

* bug fix for vooo order

* fix flake8

---------

Co-authored-by: hongzhouye <>
---
 pyscf/cc/ccsd_t.py                      |  2 +-
 pyscf/cc/ccsd_t_slow.py                 |  2 +-
 pyscf/cc/qcisd_t_slow.py                |  2 +-
 pyscf/pbc/cc/test/test_rccsd_t_shift.py | 65 +++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 3 deletions(-)
 create mode 100644 pyscf/pbc/cc/test/test_rccsd_t_shift.py

diff --git a/pyscf/cc/ccsd_t.py b/pyscf/cc/ccsd_t.py
index 52e0b972c8..4cb06abf84 100644
--- a/pyscf/cc/ccsd_t.py
+++ b/pyscf/cc/ccsd_t.py
@@ -176,7 +176,7 @@ def _sort_eri(mycc, eris, nocc, nvir, vvop, log):
 
 def _sort_t2_vooo_(mycc, orbsym, t1, t2, eris):
     assert (t2.flags.c_contiguous)
-    vooo = numpy.asarray(eris.ovoo).transpose(1,0,3,2).conj().copy()
+    vooo = numpy.asarray(eris.ovoo).transpose(1,0,2,3).conj().copy()
     nocc, nvir = t1.shape
     if mycc.mol.symmetry:
         orbsym = numpy.asarray(orbsym, dtype=numpy.int32)
diff --git a/pyscf/cc/ccsd_t_slow.py b/pyscf/cc/ccsd_t_slow.py
index 32d6d2960a..b22e36497c 100644
--- a/pyscf/cc/ccsd_t_slow.py
+++ b/pyscf/cc/ccsd_t_slow.py
@@ -45,7 +45,7 @@ def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
     eijk = lib.direct_sum('i,j,k->ijk', e_occ, e_occ, e_occ)
 
     eris_vvov = eris.get_ovvv().conj().transpose(1,3,0,2)
-    eris_vooo = numpy.asarray(eris.ovoo).conj().transpose(1,0,3,2)
+    eris_vooo = numpy.asarray(eris.ovoo).conj().transpose(1,0,2,3)
     eris_vvoo = numpy.asarray(eris.ovov).conj().transpose(1,3,0,2)
     fvo = eris.fock[nocc:,:nocc]
     def get_w(a, b, c):
diff --git a/pyscf/cc/qcisd_t_slow.py b/pyscf/cc/qcisd_t_slow.py
index 19a02779bc..8968449333 100644
--- a/pyscf/cc/qcisd_t_slow.py
+++ b/pyscf/cc/qcisd_t_slow.py
@@ -47,7 +47,7 @@ def kernel(mycc, eris, t1=None, t2=None, verbose=logger.NOTE):
     eijk = lib.direct_sum('i,j,k->ijk', e_occ, e_occ, e_occ)
 
     eris_vvov = eris.get_ovvv().conj().transpose(1,3,0,2)
-    eris_vooo = numpy.asarray(eris.ovoo).conj().transpose(1,0,3,2)
+    eris_vooo = numpy.asarray(eris.ovoo).conj().transpose(1,0,2,3)
     eris_vvoo = numpy.asarray(eris.ovov).conj().transpose(1,3,0,2)
     fvo = eris.fock[nocc:,:nocc]
     def get_w(a, b, c):
diff --git a/pyscf/pbc/cc/test/test_rccsd_t_shift.py b/pyscf/pbc/cc/test/test_rccsd_t_shift.py
new file mode 100644
index 0000000000..92178ab5ad
--- /dev/null
+++ b/pyscf/pbc/cc/test/test_rccsd_t_shift.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Authors: Hong-Zhou Ye <hzyechem@gmail.com>
+#
+
+import unittest
+import numpy as np
+
+from pyscf.pbc import gto, scf, cc
+from pyscf.cc.ccsd_t import kernel as CCSD_T
+
+
+def run_cell(cell, scaled_center):
+    kpt = cell.make_kpts([1,1,1], scaled_center=scaled_center)[0]
+
+    mf = scf.RHF(cell, kpt=kpt).rs_density_fit()
+    mf.with_df.omega = 0.1
+    mf.kernel()
+
+    mcc = cc.RCCSD(mf)
+    eris = mcc.ao2mo()
+    mcc.kernel(eris=eris)
+    eccsd = mcc.e_corr
+
+    et = CCSD_T(mcc, eris)
+
+    return eccsd, et
+
+
+class KnownValues(unittest.TestCase):
+    def test_water(self):
+        atom = '''
+        O          0.00000        0.00000        0.11779
+        H          0.00000        0.75545       -0.47116
+        H          0.00000       -0.75545       -0.47116
+        '''
+        basis = 'gth-dzvp'
+        pseudo = 'gth-hf-rev'
+        a = np.eye(3) * 30
+        cell = gto.M(atom=atom, basis=basis, a=a, pseudo=pseudo)
+
+        eccsd_gamma, et_gamma = run_cell(cell, [0,0,0])
+        self.assertAlmostEqual(eccsd_gamma, -0.2082317212, 8)
+        self.assertAlmostEqual(et_gamma   , -0.0033716894, 8)
+
+        eccsd_shifted, et_shifted = run_cell(cell, [0.1,0.1,0.1])
+        self.assertAlmostEqual(eccsd_gamma, eccsd_shifted, 8)
+        self.assertAlmostEqual(et_gamma   , et_shifted   , 8)
+
+if __name__ == '__main__':
+    print("RCCSD(T) with shift k-point test")
+    unittest.main()

From 9a152a9953f58bc632cb873bc6f9971e36216015 Mon Sep 17 00:00:00 2001
From: Michal Krompiec <michal.krompiec@gmail.com>
Date: Thu, 4 Apr 2024 15:28:07 +0100
Subject: [PATCH 28/44] Fix if mf.istype('UHF') for to_uhf() conversion

---
 pyscf/mp/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf/mp/__init__.py b/pyscf/mp/__init__.py
index ae700bb3b0..3b084823c3 100644
--- a/pyscf/mp/__init__.py
+++ b/pyscf/mp/__init__.py
@@ -54,7 +54,7 @@ def RMP2(mf, frozen=None, mo_coeff=None, mo_occ=None):
 
 def UMP2(mf, frozen=None, mo_coeff=None, mo_occ=None):
     mf = mf.remove_soscf()
-    if mf.istype('UHF'):
+    if not mf.istype('UHF'):
         mf = mf.to_uhf()
 
     if getattr(mf, 'with_df', None):

From 095130d0c0828adb58c9a5a40ede7ee95107cec7 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 11 Mar 2024 22:22:55 -0700
Subject: [PATCH 29/44] Solve conflicts between @property and __getattr__

---
 pyscf/gto/mole.py     | 13 ++++++-------
 pyscf/pbc/gto/cell.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index 9c0a2d2125..5f97ba0dc2 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -3709,12 +3709,11 @@ def __getattr__(self, key):
         from Mole object.
         '''
         if key[0] == '_':  # Skip private attributes and Python builtins
-            raise AttributeError('Mole object does not have attribute %s' % key)
-        elif key in ('_ipython_canary_method_should_not_exist_',
-                     '_repr_mimebundle_'):
-            # https://github.com/mewwts/addict/issues/26
-            # https://github.com/jupyter/notebook/issues/2014
-            raise AttributeError(f'Mole object has no attribute {key}')
+            # https://bugs.python.org/issue45985
+            # https://github.com/python/cpython/issues/103936
+            # @property and __getattr__ conflicts. As a temporary fix, call
+            # object.__getattribute__ method to re-raise AttributeError
+            return object.__getattribute__(self, key)
 
         # Import all available modules. Some methods are registered to other
         # classes/modules when importing modules in __all__.
@@ -3738,7 +3737,7 @@ def __getattr__(self, key):
         elif 'CI' in key or 'CC' in key or 'CAS' in key or 'MP' in key:
             mf = scf.HF(self)
         else:
-            raise AttributeError(f'Mole object has no attribute {key}')
+            return object.__getattribute__(self, key)
 
         method = getattr(mf, key)
 
diff --git a/pyscf/pbc/gto/cell.py b/pyscf/pbc/gto/cell.py
index 15b4fa26d9..713916526e 100644
--- a/pyscf/pbc/gto/cell.py
+++ b/pyscf/pbc/gto/cell.py
@@ -1157,12 +1157,11 @@ def __getattr__(self, key):
         from Cell object.
         '''
         if key[0] == '_':  # Skip private attributes and Python builtins
-            raise AttributeError('Cell object does not have attribute %s' % key)
-        elif key in ('_ipython_canary_method_should_not_exist_',
-                     '_repr_mimebundle_'):
-            # https://github.com/mewwts/addict/issues/26
-            # https://github.com/jupyter/notebook/issues/2014
-            raise AttributeError(f'Cell object has no attribute {key}')
+            # https://bugs.python.org/issue45985
+            # https://github.com/python/cpython/issues/103936
+            # @property and __getattr__ conflicts. As a temporary fix, call
+            # object.__getattribute__ method to re-raise AttributeError
+            return object.__getattribute__(self, key)
 
         # Import all available modules. Some methods are registered to other
         # classes/modules when importing modules in __all__.
@@ -1188,7 +1187,7 @@ def __getattr__(self, key):
             elif 'CI' in key or 'CC' in key or 'MP' in key:
                 mf = scf.KHF(self)
             else:
-                raise AttributeError(f'Cell object has no attribute {key}')
+                return object.__getattribute__(self, key)
             # Remove prefix 'K' because methods are registered without the leading 'K'
             key = key[1:]
         else:
@@ -1204,7 +1203,7 @@ def __getattr__(self, key):
             elif 'CI' in key or 'CC' in key or 'MP' in key:
                 mf = scf.HF(self)
             else:
-                raise AttributeError(f'Cell object has no attribute {key}')
+                return object.__getattribute__(self, key)
 
         method = getattr(mf, key)
 

From fdd4e487dafc605fbeaa4c63817138681c726ea1 Mon Sep 17 00:00:00 2001
From: chillenb <chillenbrand15@gmail.com>
Date: Tue, 9 Apr 2024 11:54:48 -0400
Subject: [PATCH 30/44] move static configurations from setup.py to
 pyproject.toml (#2144)

* move testing options to pytest.ini

* move static configurations from setup.py to pyproject.toml

* Update CI release jobs

---------

Co-authored-by: Qiming Sun <osirpt.sun@gmail.com>
---
 .github/workflows/ci.yml       |  2 +-
 .github/workflows/publish.yml  | 11 ++++-
 .github/workflows/run_tests.sh |  4 +-
 NOTICE                         |  1 +
 conda/build.sh                 |  2 +-
 pyproject.toml                 | 68 +++++++++++++++++++++++++++++++
 setup.cfg => pytest.ini        |  8 +---
 setup.py                       | 74 +++-------------------------------
 8 files changed, 89 insertions(+), 81 deletions(-)
 create mode 100644 pyproject.toml
 rename setup.cfg => pytest.ini (80%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81db232136..1da9486bb1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -72,7 +72,7 @@ jobs:
             echo 'pbc_tools_pbc_fft_engine = "NUMPY"' > .pyscf_conf.py && \
             echo "dftd3_DFTD3PATH = './pyscf/lib/deps/lib'" >> .pyscf_conf.py && \
             echo "scf_hf_SCF_mute_chkfile = True" >> .pyscf_conf.py && \
-            ulimit -s 20000 && /opt/python/${{ matrix.pyver }}/bin/pytest pyscf/ --ignore=pyscf/adc --ignore=pyscf/pbc/df --ignore=pyscf/pbc/cc -s -c setup.cfg pyscf'
+            ulimit -s 20000 && /opt/python/${{ matrix.pyver }}/bin/pytest pyscf/ --ignore=pyscf/adc --ignore=pyscf/pbc/df --ignore=pyscf/pbc/cc -s -c pytest.ini pyscf'
 
   macos-build:
     runs-on: macos-latest
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index a792c40d0f..5f26d085da 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -16,7 +16,9 @@ jobs:
       - uses: actions/checkout@v3
       - name: Build wheels
         run: |
-            docker run --rm -v ${{ github.workspace }}:/src/pyscf pyscf/pyscf-pypa-env:latest \
+            docker run --rm -v ${{ github.workspace }}:/src/pyscf \
+            -e CMAKE_BUILD_PARALLEL_LEVEL=4 \
+            pyscf/pyscf-pypa-env:latest \
             bash /src/pyscf/docker/pypa-env/build-wheels.sh
       - name: List available wheels
         run: |
@@ -57,6 +59,7 @@ jobs:
             export src=${GITHUB_WORKSPACE:-/src/pyscf} && \
             export dst=${GITHUB_WORKSPACE:-/src/pyscf}/linux-wheels && \
             export CMAKE_CONFIGURE_ARGS="-DWITH_F12=OFF" && \
+            export CMAKE_BUILD_PARALLEL_LEVEL=4 && \
             mkdir -p /root/wheelhouse $src/linux-wheels && \
             sed -i "/            if basename(fn) not in needed_libs:/s/basename.*libs/1/" /opt/_internal/pipx/venvs/auditwheel/lib/python*/site-packages/auditwheel/wheel_abi.py && \
             /opt/python/${{ matrix.pyver }}/bin/pip wheel -v --no-deps --no-clean -w /root/wheelhouse $src && \
@@ -82,7 +85,8 @@ jobs:
       - uses: actions/checkout@v3
       - name: Build sdist
         run: |
-          python3 setup.py sdist
+          pip install build
+          python3 -m build -s
       - name: List available sdist
         run: |
           ls ${{ github.workspace }}/dist
@@ -108,6 +112,7 @@ jobs:
           CIBW_BUILD: cp311-macosx_x86_64
           CIBW_BUILD_VERBOSITY: "1"
           CMAKE_CONFIGURE_ARGS: "-DWITH_F12=OFF"
+          CMAKE_BUILD_PARALLEL_LEVEL: "4"
         with:
           output-dir: mac-wheels
       - name: List available wheels
@@ -133,6 +138,7 @@ jobs:
           # Cross-platform build for arm64 wheels on x86 platform
           CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
           CMAKE_CONFIGURE_ARGS: "-DWITH_F12=OFF"
+          CMAKE_BUILD_PARALLEL_LEVEL: "4"
           CMAKE_OSX_ARCHITECTURES: arm64
         with:
           output-dir: mac-wheels
@@ -161,6 +167,7 @@ jobs:
       - run: which python
       - name: Publish to conda
         run: |
+          export CMAKE_BUILD_PARALLEL_LEVEL=4
           export ANACONDA_API_TOKEN=${{ secrets.ANACONDA_TOKEN }}
           conda install -y anaconda-client conda-build
           conda config --set anaconda_upload yes
diff --git a/.github/workflows/run_tests.sh b/.github/workflows/run_tests.sh
index ec53aa7d24..707313c379 100755
--- a/.github/workflows/run_tests.sh
+++ b/.github/workflows/run_tests.sh
@@ -10,8 +10,8 @@ echo "scf_hf_SCF_mute_chkfile = True" >> .pyscf_conf.py
 version=$(python -c 'import sys; print("{0}.{1}".format(*sys.version_info[:2]))')
 # pytest-cov on Python 3.12 consumes huge memory
 if [ "$RUNNER_OS" == "Linux" ] && [ $version != "3.12" ]; then
-  pytest pyscf/ -s -c setup.cfg \
+  pytest pyscf/ -s -c pytest.ini \
     --cov-report xml --cov-report term --cov-config .coveragerc --cov pyscf
 else
-  pytest pyscf/ -s -c setup.cfg pyscf
+  pytest pyscf/ -s -c pytest.ini pyscf
 fi
diff --git a/NOTICE b/NOTICE
index dc52a6294c..327f4732e1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -106,6 +106,7 @@ Jiachen Li
 Felipe S. S. Schneider
 Aniruddha Seal
 Peter Reinholdt
+Christopher Hillenbrand
 
 
 ---
diff --git a/conda/build.sh b/conda/build.sh
index 4fe6afe44b..c84e5f22f6 100755
--- a/conda/build.sh
+++ b/conda/build.sh
@@ -17,4 +17,4 @@ export CMAKE_CONFIGURE_ARGS="-DWITH_F12=OFF -DBLA_VENDOR=Intel10_64lp_seq"
 
 # env PYTHON not defined in certain conda-build version
 # $PYTHON -m pip install . -vv
-pip install -v --prefix=$PREFIX .
+MAKEFLAGS="-j4" pip install -v --prefix=$PREFIX .
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..7ba6770e34
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,68 @@
+[build-system]
+requires = ["setuptools >= 61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+
+[project]
+name = "pyscf"
+dynamic = ["version"]
+description = "PySCF: Python-based Simulations of Chemistry Framework"
+readme = "README.md"
+classifiers = [
+  'Development Status :: 5 - Production/Stable',
+  'Intended Audience :: Science/Research',
+  'Intended Audience :: Developers',
+  'License :: OSI Approved :: Apache Software License',
+  'Programming Language :: C',
+  'Programming Language :: Python',
+  'Programming Language :: Python :: 3.7',
+  'Programming Language :: Python :: 3.8',
+  'Programming Language :: Python :: 3.9',
+  'Programming Language :: Python :: 3.10',
+  'Programming Language :: Python :: 3.11',
+  'Programming Language :: Python :: 3.12',
+  'Topic :: Software Development',
+  'Topic :: Scientific/Engineering',
+  'Operating System :: POSIX',
+  'Operating System :: Unix',
+  'Operating System :: MacOS',
+]
+
+maintainers = [{ name = "Qiming Sun", email = "osirpt.sun@gmail.com" }]
+
+authors = [{ name = "Qiming Sun", email = "osirpt.sun@gmail.com" }]
+
+license = { text = "Apache-2.0" }
+
+dependencies = [
+  'numpy>=1.13,!=1.16,!=1.17',
+  'scipy!=1.5.0,!=1.5.1',
+  'h5py>=2.7',
+  'setuptools',
+]
+
+[project.urls]
+Homepage = "http://www.pyscf.org"
+Repository = "https://github.com/pyscf/pyscf"
+Documentation = "http://www.pyscf.org"
+
+[project.optional-dependencies]
+
+geomopt = ["pyberny>=0.6.2", "geometric>=0.9.7.2", "pyscf-qsdopt"]
+doci = ["pyscf-doci"]
+properties = ["pyscf-properties"]
+semiempirical = ['pyscf-semiempirical']
+cppe = ["cppe"]
+pyqmc = ["pyqmc"]
+mcfun = ["mcfun>=0.2.1"]
+bse = ["basis-set-exchange"]
+
+all = ["pyscf[geomopt,doci,properties,semiempirical,cppe,pyqmc,mcfun,bse]"]
+
+# extras which should not be installed by "all" components
+cornell_shci = ["pyscf-cornell-shci"]
+nao = ["pyscf-nao"]
+fciqmcscf = ["pyscf-fciqmc"]
+tblis = ["pyscf-tblis"]
+icmpspt = ["pyscf-icmpspt"] # broken
+shciscf = ["pyscf-shciscf"] # broken
diff --git a/setup.cfg b/pytest.ini
similarity index 80%
rename from setup.cfg
rename to pytest.ini
index dfd113a2dd..b4ab584dd6 100644
--- a/setup.cfg
+++ b/pytest.ini
@@ -1,9 +1,4 @@
-[egg_info]
-tag_build = 
-tag_date = 0
-tag_svn_revision = 0
-
-[tool:pytest]
+[pytest]
 addopts = --import-mode=importlib
   -k "not _high_cost and not _skip"
   --ignore=examples
@@ -13,3 +8,4 @@ addopts = --import-mode=importlib
   --ignore-glob="*test_bz*"
   --ignore-glob="*pbc/cc/test/*test_h_*.py"
   --ignore-glob="*test_ks_noimport*.py"
+
diff --git a/setup.py b/setup.py
index e23d90fb58..cabb82e2ee 100755
--- a/setup.py
+++ b/setup.py
@@ -18,37 +18,6 @@
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_py import build_py
 
-CLASSIFIERS = [
-'Development Status :: 5 - Production/Stable',
-'Intended Audience :: Science/Research',
-'Intended Audience :: Developers',
-'License :: OSI Approved :: Apache Software License',
-'Programming Language :: C',
-'Programming Language :: Python',
-'Programming Language :: Python :: 3.7',
-'Programming Language :: Python :: 3.8',
-'Programming Language :: Python :: 3.9',
-'Programming Language :: Python :: 3.10',
-'Programming Language :: Python :: 3.11',
-'Programming Language :: Python :: 3.12',
-'Topic :: Software Development',
-'Topic :: Scientific/Engineering',
-'Operating System :: POSIX',
-'Operating System :: Unix',
-'Operating System :: MacOS',
-]
-
-NAME             = 'pyscf'
-MAINTAINER       = 'Qiming Sun'
-MAINTAINER_EMAIL = 'osirpt.sun@gmail.com'
-DESCRIPTION      = 'PySCF: Python-based Simulations of Chemistry Framework'
-#LONG_DESCRIPTION = ''
-URL              = 'http://www.pyscf.org'
-DOWNLOAD_URL     = 'http://github.com/pyscf/pyscf'
-LICENSE          = 'Apache License 2.0'
-AUTHOR           = 'Qiming Sun'
-AUTHOR_EMAIL     = 'osirpt.sun@gmail.com'
-PLATFORMS        = ['Linux', 'Mac OS-X', 'Unix']
 def get_version():
     topdir = os.path.abspath(os.path.join(__file__, '..'))
     with open(os.path.join(topdir, 'pyscf', '__init__.py'), 'r') as f:
@@ -59,25 +28,6 @@ def get_version():
     raise ValueError("Version string not found")
 VERSION = get_version()
 
-EXTRAS = {
-    'geomopt': ['pyberny>=0.6.2', 'geometric>=0.9.7.2', 'pyscf-qsdopt'],
-    #'dmrgscf': ['pyscf-dmrgscf'],
-    'doci': ['pyscf-doci'],
-    'icmpspt': ['pyscf-icmpspt'],
-    'properties': ['pyscf-properties'],
-    'semiempirical': ['pyscf-semiempirical'],
-    'shciscf': ['pyscf-shciscf'],
-    'cppe': ['cppe'],
-    'pyqmc': ['pyqmc'],
-    'mcfun': ['mcfun>=0.2.1'],
-    'bse': ['basis-set-exchange'],
-}
-EXTRAS['all'] = [p for extras in EXTRAS.values() for p in extras]
-# extras which should not be installed by "all" components
-EXTRAS['cornell_shci'] = ['pyscf-cornell-shci']
-EXTRAS['nao'] = ['pyscf-nao']
-EXTRAS['fciqmcscf'] = ['pyscf-fciqmc']
-EXTRAS['tblis'] = ['pyscf-tblis']
 
 def get_platform():
     from distutils.util import get_platform
@@ -117,9 +67,11 @@ def run(self):
         self.spawn(cmd)
 
         self.announce('Building binaries', level=3)
-        # Do not use high level parallel compilation. OOM may be triggered
-        # when compiling certain functionals in libxc.
-        cmd = ['cmake', '--build', self.build_temp, '-j', '2']
+        # By default do not use high level parallel compilation.
+        # OOM may be triggered when compiling certain functionals in libxc.
+        # Set the shell variable CMAKE_BUILD_PARALLEL_LEVEL=n to enable
+        # parallel compilation.
+        cmd = ['cmake', '--build', self.build_temp]
         build_args = os.getenv('CMAKE_BUILD_ARGS')
         if build_args:
             cmd.extend(build_args.split(' '))
@@ -150,27 +102,11 @@ def initialize_with_default_plat_name(self):
               'https://github.com/scipy/scipy/issues/16151)')
 
 setup(
-    name=NAME,
     version=VERSION,
-    description=DESCRIPTION,
-    long_description_content_type="text/markdown",
-    long_description=DESCRIPTION,
-    url=URL,
-    download_url=DOWNLOAD_URL,
-    license=LICENSE,
-    classifiers=CLASSIFIERS,
-    author=AUTHOR,
-    author_email=AUTHOR_EMAIL,
-    platforms=PLATFORMS,
     #package_dir={'pyscf': 'pyscf'},  # packages are under directory pyscf
     #include *.so *.dat files. They are now placed in MANIFEST.in
     #package_data={'': ['*.so', '*.dylib', '*.dll', '*.dat']},
     include_package_data=True,  # include everything in source control
     packages=find_packages(exclude=['*test*', '*examples*']),
     cmdclass={'build_py': CMakeBuildPy},
-    install_requires=['numpy>=1.13,!=1.16,!=1.17',
-                      _scipy_version,
-                      'h5py>=2.7',
-                      'setuptools'],
-    extras_require=EXTRAS,
 )

From d57f1d6c89c723e11a7f0933380a6139ba372554 Mon Sep 17 00:00:00 2001
From: fishjojo <zhangxing.nju@gmail.com>
Date: Mon, 8 Apr 2024 15:38:38 -0700
Subject: [PATCH 31/44] fix pbc df with KPoints input

---
 pyscf/pbc/df/df.py           |  3 +++
 pyscf/pbc/df/df_jk.py        |  3 ---
 pyscf/pbc/df/fft.py          |  3 +++
 pyscf/pbc/df/mdf.py          |  3 +++
 pyscf/pbc/df/mdf_jk.py       |  3 ---
 pyscf/pbc/df/test/test_df.py | 17 ++++++++++++++++-
 6 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/pyscf/pbc/df/df.py b/pyscf/pbc/df/df.py
index 55d5bd7f16..3791dacf8a 100644
--- a/pyscf/pbc/df/df.py
+++ b/pyscf/pbc/df/df.py
@@ -53,6 +53,7 @@
 from pyscf.pbc.df import df_ao2mo
 from pyscf.pbc.df.aft import estimate_eta, _check_kpts
 from pyscf.pbc.df.df_jk import zdotCN
+from pyscf.pbc.lib.kpts import KPoints
 from pyscf.pbc.lib.kpts_helper import (is_zero, gamma_point, member, unique,
                                        KPT_DIFF_TOL)
 from pyscf.pbc.df.gdf_builder import libpbc, _CCGDFBuilder, _CCNucBuilder
@@ -146,6 +147,8 @@ def __init__(self, cell, kpts=numpy.zeros((1,3))):
         self.verbose = cell.verbose
         self.max_memory = cell.max_memory
 
+        if isinstance(kpts, KPoints):
+            kpts = kpts.kpts
         self.kpts = kpts  # default is gamma point
         self.kpts_band = None
         self._auxbasis = None
diff --git a/pyscf/pbc/df/df_jk.py b/pyscf/pbc/df/df_jk.py
index 88c782b311..b556fd512a 100644
--- a/pyscf/pbc/df/df_jk.py
+++ b/pyscf/pbc/df/df_jk.py
@@ -28,7 +28,6 @@
 from pyscf import lib
 from pyscf.lib import logger, zdotNN, zdotCN, zdotNC
 from pyscf.pbc import tools
-from pyscf.pbc.lib.kpts import KPoints
 from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member, get_kconserv_ria
 from pyscf import __config__
 
@@ -53,8 +52,6 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
         else:
             kpts = numpy.reshape(mf.kpt, (1,3))
 
-        if isinstance(kpts, KPoints):
-            kpts = kpts.kpts
         with_df = df.DF(mf.cell, kpts)
         with_df.max_memory = mf.max_memory
         with_df.stdout = mf.stdout
diff --git a/pyscf/pbc/df/fft.py b/pyscf/pbc/df/fft.py
index 1d538ae4ee..382f317990 100644
--- a/pyscf/pbc/df/fft.py
+++ b/pyscf/pbc/df/fft.py
@@ -30,6 +30,7 @@
 from pyscf.pbc.df import fft_jk
 from pyscf.pbc.df import aft
 from pyscf.pbc.df.aft import _check_kpts
+from pyscf.pbc.lib.kpts import KPoints
 from pyscf.pbc.lib.kpts_helper import is_zero
 from pyscf import __config__
 
@@ -168,6 +169,8 @@ def __init__(self, cell, kpts=numpy.zeros((1,3))):
         self.verbose = cell.verbose
         self.max_memory = cell.max_memory
 
+        if isinstance(kpts, KPoints):
+            kpts = kpts.kpts
         self.kpts = kpts
 
         self.grids = gen_grid.UniformGrids(cell)
diff --git a/pyscf/pbc/df/mdf.py b/pyscf/pbc/df/mdf.py
index 741f349410..6cb21cf649 100644
--- a/pyscf/pbc/df/mdf.py
+++ b/pyscf/pbc/df/mdf.py
@@ -38,6 +38,7 @@
 from pyscf.pbc.df.gdf_builder import _CCGDFBuilder
 from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder
 from pyscf.pbc.df.incore import libpbc, make_auxcell
+from pyscf.pbc.lib.kpts import KPoints
 from pyscf.pbc.lib.kpts_helper import is_zero, member, unique
 from pyscf.pbc.df import mdf_jk
 from pyscf.pbc.df import mdf_ao2mo
@@ -55,6 +56,8 @@ def __init__(self, cell, kpts=np.zeros((1,3))):
         self.verbose = cell.verbose
         self.max_memory = cell.max_memory
 
+        if isinstance(kpts, KPoints):
+            kpts = kpts.kpts
         self.kpts = kpts  # default is gamma point
         self.kpts_band = None
         self._auxbasis = None
diff --git a/pyscf/pbc/df/mdf_jk.py b/pyscf/pbc/df/mdf_jk.py
index de305a736c..f3fe0e2ee4 100644
--- a/pyscf/pbc/df/mdf_jk.py
+++ b/pyscf/pbc/df/mdf_jk.py
@@ -26,7 +26,6 @@
 from pyscf.lib import logger
 from pyscf.pbc.df import df_jk
 from pyscf.pbc.df import aft_jk
-from pyscf.pbc.lib.kpts import KPoints
 
 #
 # Divide the Coulomb potential to two parts.  Computing short range part in
@@ -52,8 +51,6 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
         else:
             kpts = numpy.reshape(mf.kpt, (1,3))
 
-        if isinstance(kpts, KPoints):
-            kpts = kpts.kpts
         with_df = mdf.MDF(mf.cell, kpts)
         with_df.max_memory = mf.max_memory
         with_df.stdout = mf.stdout
diff --git a/pyscf/pbc/df/test/test_df.py b/pyscf/pbc/df/test/test_df.py
index 62a14500a2..20036dee82 100644
--- a/pyscf/pbc/df/test/test_df.py
+++ b/pyscf/pbc/df/test/test_df.py
@@ -20,7 +20,7 @@
 from pyscf import ao2mo, gto
 from pyscf.pbc import gto as pgto
 from pyscf.pbc import scf as pscf
-from pyscf.pbc.df import df, aug_etb, FFTDF
+from pyscf.pbc.df import df, aug_etb, FFTDF, mdf
 from pyscf.pbc.df import gdf_builder
 #from mpi4pyscf.pbc.df import df
 pyscf.pbc.DEBUG = False
@@ -250,6 +250,21 @@ def test_cell_with_cart(self):
         eri1 = df.GDF(cell).set(auxbasis=aug_etb(cell)).get_eri()
         self.assertAlmostEqual(abs(eri1-eri0).max(), 0, 2)
 
+    def test_kpoints_input(sef):
+        cell.space_group_symmetry = True
+        cell.build()
+        kpts = cell.make_kpts([2,2,2],
+                              space_group_symmetry=True,
+                              time_reversal_symmetry=True)
+
+        mydf = df.GDF(cell, kpts=kpts)
+        assert mydf.kpts.shape == (8,3)
+
+        mydf = FFTDF(cell, kpts=kpts)
+        assert mydf.kpts.shape == (8,3)
+
+        mydf = mdf.MDF(cell, kpts=kpts)
+        assert mydf.kpts.shape == (8,3)
 
 if __name__ == '__main__':
     print("Full Tests for df")

From 92defdf2b3efeaee02233ae9836904c5c52234e6 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Fri, 12 Apr 2024 18:49:19 -0700
Subject: [PATCH 32/44] Fix pip builder

---
 .github/workflows/ci.yml | 2 +-
 pyproject.toml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1da9486bb1..6b74a10a5f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,7 +59,7 @@ jobs:
         run: |
             docker run --rm -v ${{ github.workspace }}:/src/pyscf:rw --workdir=/src/pyscf ${{ env.img }} \
             bash -exc '/opt/python/${{ matrix.pyver }}/bin/pip install --upgrade pip setuptools && \
-            /opt/python/${{ matrix.pyver }}/bin/pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py pytest pytest-cov pytest-timer pyberny geometric && \
+            /opt/python/${{ matrix.pyver }}/bin/pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py==3.10 pytest pytest-cov pytest-timer pyberny geometric && \
             yum install -y epel-release && \
             yum-config-manager --enable epel && \
             yum install -y openblas-devel gcc cmake curl && \
diff --git a/pyproject.toml b/pyproject.toml
index 7ba6770e34..926ba415d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools >= 61.0", "wheel"]
+requires = ["setuptools >= 61.0", "wheel", "cmake"]
 build-backend = "setuptools.build_meta"
 
 

From 7f0a1c3bca2bd97925547b4edcc34f722a26832a Mon Sep 17 00:00:00 2001
From: Hong-Zhou Ye <hzyechem@gmail.com>
Date: Sat, 13 Apr 2024 01:12:15 -0400
Subject: [PATCH 33/44] reset enuc in _build_supcell_ (#2164)

* reset enuc in _build_supcell_

* add test for enuc reset

---------

Co-authored-by: hongzhouye <>
---
 pyscf/pbc/tools/pbc.py           | 1 +
 pyscf/pbc/tools/test/test_pbc.py | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pyscf/pbc/tools/pbc.py b/pyscf/pbc/tools/pbc.py
index 20d45fe692..5151d4d4d5 100644
--- a/pyscf/pbc/tools/pbc.py
+++ b/pyscf/pbc/tools/pbc.py
@@ -676,6 +676,7 @@ def _build_supcell_(supcell, cell, Ls):
     x, y, z = coords.T
     supcell.atom = supcell._atom = list(zip(symbs, zip(x, y, z)))
     supcell.unit = 'B'
+    supcell.enuc = None # reset nuclear energy
 
     # Do not call supcell.build() to initialize supcell since it may normalize
     # the basis contraction coefficients
diff --git a/pyscf/pbc/tools/test/test_pbc.py b/pyscf/pbc/tools/test/test_pbc.py
index 3a8774f78e..52251aac79 100644
--- a/pyscf/pbc/tools/test/test_pbc.py
+++ b/pyscf/pbc/tools/test/test_pbc.py
@@ -144,9 +144,13 @@ def test_super_cell(self):
                        mesh = [3]*3,
                        atom ='''He .1 .0 .0''',
                        basis = 'ccpvdz')
-        cl2 = tools.super_cell(cl1, [2,3,4])
+        _ = cl1.enuc
+        ncopy = [2,3,4]
+        ncell = ncopy[0]*ncopy[1]*ncopy[2]
+        cl2 = tools.super_cell(cl1, ncopy)
         self.assertAlmostEqual(lib.fp(cl2.atom_coords()), -18.946080642714836, 9)
         self.assertAlmostEqual(lib.fp(cl2._bas[:,gto.ATOM_OF]), 16.515144238434807, 9)
+        self.assertAlmostEqual(cl1.enuc, cl2.enuc / ncell, 9)
 
     def test_super_cell_with_symm(self):
         cl1 = pbcgto.M(a = 1.4 * numpy.eye(3),

From 77e13d0de49dde3bf3ee11a9a3ea5f9a86705a24 Mon Sep 17 00:00:00 2001
From: Hong-Zhou Ye <hzyechem@gmail.com>
Date: Sat, 13 Apr 2024 01:16:56 -0400
Subject: [PATCH 34/44] Bug fix for single k-point JK-build in PBC DF (#2165)

* bug fix for j-build in get_jk

* bug fix for k-build in get_jk

---------

Co-authored-by: hongzhouye <>
---
 pyscf/pbc/df/aft_jk.py           |  2 +-
 pyscf/pbc/df/df_jk.py            |  8 ++++----
 pyscf/pbc/df/test/test_aft_jk.py | 20 ++++++++++++++++++++
 pyscf/pbc/df/test/test_df_jk.py  | 15 +++++++++++++++
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/pyscf/pbc/df/aft_jk.py b/pyscf/pbc/df/aft_jk.py
index d6627545bd..f0f8acb229 100644
--- a/pyscf/pbc/df/aft_jk.py
+++ b/pyscf/pbc/df/aft_jk.py
@@ -732,7 +732,7 @@ def get_jk(mydf, dm, hermi=1, kpt=numpy.zeros(3),
                     iLkI *= vkcoulG[p0:p1].reshape(1,nG,1)
                     zdotNC(iLkR.reshape(nao,-1), iLkI.reshape(nao,-1),
                            pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T,
-                           1, vkR[i], vkI[i])
+                           1, vkR[i], vkI[i], 1)
             #t2 = log.timer_debug1('        with_k', *t2)
         pqkR = pqkI = pLqR = pLqI = iLkR = iLkI = None
         #t2 = log.timer_debug1('%d:%d'%(p0,p1), *t2)
diff --git a/pyscf/pbc/df/df_jk.py b/pyscf/pbc/df/df_jk.py
index b556fd512a..68bf20a010 100644
--- a/pyscf/pbc/df/df_jk.py
+++ b/pyscf/pbc/df/df_jk.py
@@ -1261,12 +1261,12 @@ def contract_k(pLqR, pLqI, sign):
         if with_j:
             #:rho_coeff = numpy.einsum('Lpq,xqp->xL', Lpq, dms)
             #:vj += numpy.dot(rho_coeff, Lpq.reshape(-1,nao**2))
-            rhoR  = numpy.einsum('Lpq,xpq->xL', LpqR, dmsR)
+            rhoR  = numpy.einsum('Lpq,xqp->xL', LpqR, dmsR)
             if not j_real:
                 LpqI = LpqI.reshape(-1,nao,nao)
-                rhoR -= numpy.einsum('Lpq,xpq->xL', LpqI, dmsI)
-                rhoI  = numpy.einsum('Lpq,xpq->xL', LpqR, dmsI)
-                rhoI += numpy.einsum('Lpq,xpq->xL', LpqI, dmsR)
+                rhoR -= numpy.einsum('Lpq,xqp->xL', LpqI, dmsI)
+                rhoI  = numpy.einsum('Lpq,xqp->xL', LpqR, dmsI)
+                rhoI += numpy.einsum('Lpq,xqp->xL', LpqI, dmsR)
             vjR += sign * numpy.einsum('xL,Lpq->xpq', rhoR, LpqR)
             if not j_real:
                 vjR -= sign * numpy.einsum('xL,Lpq->xpq', rhoI, LpqI)
diff --git a/pyscf/pbc/df/test/test_aft_jk.py b/pyscf/pbc/df/test/test_aft_jk.py
index 7bd6d6915d..697c07f4b5 100644
--- a/pyscf/pbc/df/test/test_aft_jk.py
+++ b/pyscf/pbc/df/test/test_aft_jk.py
@@ -119,6 +119,26 @@ def test_jk(self):
         self.assertAlmostEqual(ej1, 12.233546641482697, 8)
         self.assertAlmostEqual(ek1, 43.946958026023722, 7)
 
+    def test_jk_complex_dm(self):
+        scaled_center = [0.3728,0.5524,0.7672]
+        kpt = cell.make_kpts([1,1,1], scaled_center=scaled_center)[0]
+        mf = scf.RHF(cell, kpt=kpt)
+        dm = mf.init_guess_by_1e()
+
+        mydf = aft.AFTDF(cell, kpts=[kpt])
+        vj1, vk1 = mydf.get_jk(dm, kpts=kpt, exxdiv='ewald')
+        vjs, vks = mydf.get_jk([dm], kpts=[kpt], exxdiv='ewald')
+        vj , vk  = vjs[0], vks[0]
+
+        ej1 = numpy.einsum('ij,ji->', vj1, dm)
+        ek1 = numpy.einsum('ij,ji->', vk1, dm)
+        ej  = numpy.einsum('ij,ji->', vj , dm)
+        ek  = numpy.einsum('ij,ji->', vk , dm)
+
+        # kpts and single kpt AFTDF must match exactly
+        self.assertAlmostEqual(ej1, ej, 10)
+        self.assertAlmostEqual(ek1, ek, 10)
+
     def test_aft_j(self):
         numpy.random.seed(1)
         nao = cell.nao_nr()
diff --git a/pyscf/pbc/df/test/test_df_jk.py b/pyscf/pbc/df/test/test_df_jk.py
index e9b5fa1e52..ed8c37a5d9 100644
--- a/pyscf/pbc/df/test/test_df_jk.py
+++ b/pyscf/pbc/df/test/test_df_jk.py
@@ -81,6 +81,21 @@ def test_jk_single_kpt(self):
         self.assertAlmostEqual(ej1, 25.8129854469354, 6)
         self.assertAlmostEqual(ek1, 72.6088517709998, 6)
 
+    def test_jk_single_kpt_complex_dm(self):
+        scaled_center = [0.3728,0.5524,0.7672]
+        kpt = cell0.make_kpts([1,1,1], scaled_center=scaled_center)[0]
+        mf = pscf.RHF(cell0, kpt=kpt).density_fit('weigend')
+        dm = mf.init_guess_by_1e()
+        with lib.temporary_env(mf.cell, incore_anyway=True):
+            vj1, vk1 = mf.get_jk(dm=dm) # from mol_hf.dot_eri_dm
+        ej1 = numpy.einsum('ij,ji->', vj1, dm)
+        ek1 = numpy.einsum('ij,ji->', vk1, dm)
+        vj, vk = mf.with_df.get_jk(dm=dm, kpts=kpt, exxdiv=mf.exxdiv)
+        ej = numpy.einsum('ij,ji->', vj, dm)
+        ek = numpy.einsum('ij,ji->', vk, dm)
+        self.assertAlmostEqual(ej1, ej, 10)
+        self.assertAlmostEqual(ek1, ek, 10)
+
     def test_jk_single_kpt_high_cost(self):
         mf0 = pscf.RHF(cell)
         mf0.exxdiv = None

From c549c46be5963f657a48da6ee908fb5ed3bdc870 Mon Sep 17 00:00:00 2001
From: Maximilian Scheurer <max.scheurer@me.com>
Date: Fri, 12 Apr 2024 12:59:02 +0200
Subject: [PATCH 35/44] add missing init file

---
 pyscf/solvent/grad/__init__.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 pyscf/solvent/grad/__init__.py

diff --git a/pyscf/solvent/grad/__init__.py b/pyscf/solvent/grad/__init__.py
new file mode 100644
index 0000000000..2b02f141a1
--- /dev/null
+++ b/pyscf/solvent/grad/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Qiming Sun <osirpt.sun@gmail.com>
+#
\ No newline at end of file

From 4f08ae58d842cfafc8a6bf1144d19d4a478958b9 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Mon, 15 Apr 2024 12:35:03 -0700
Subject: [PATCH 36/44] DFTD3 & DFTD4 builder (#2161)

* local build dftd3/dftd4

* flake8

* fixed issues apis

* add compilation tool for mac

* fixed bugs in unittest

* fixed an issue in unit test

* add fortran compiler

* fortran compiler

* Move dftd3 and dftd4 builder to a separated repo (pyscf-dispersion)

* enable dftd3 & dftd4 for python3.12

* Adjust tests

---------

Co-authored-by: Qiming Sun <osirpt.sun@gmail.com>
---
 .github/workflows/ci_linux/python_deps.sh |  3 +-
 .github/workflows/ci_macos/deps_apt.sh    |  1 -
 pyscf/dft/test/test_h2o.py                | 16 ++---
 pyscf/grad/dispersion.py                  | 19 +++---
 pyscf/grad/test/test_rhf.py               | 14 ++---
 pyscf/grad/test/test_rks.py               | 55 +++++++++++++++++
 pyscf/grad/test/test_uhf.py               | 73 +++++++++++++++++++++++
 pyscf/grad/test/test_uks.py               | 32 ++++++++++
 pyscf/hessian/dispersion.py               | 27 +++++----
 pyscf/hessian/test/test_rhf.py            | 33 ++++++++++
 pyscf/hessian/test/test_rks.py            | 25 +++-----
 pyscf/hessian/test/test_uhf.py            | 34 ++++++++++-
 pyscf/hessian/test/test_uks.py            | 62 +++++++++++++++----
 pyscf/lib/CMakeLists.txt                  |  2 +-
 pyscf/scf/dispersion.py                   | 35 ++++-------
 15 files changed, 334 insertions(+), 97 deletions(-)

diff --git a/.github/workflows/ci_linux/python_deps.sh b/.github/workflows/ci_linux/python_deps.sh
index 3f52c7cb8a..d772db29e6 100755
--- a/.github/workflows/ci_linux/python_deps.sh
+++ b/.github/workflows/ci_linux/python_deps.sh
@@ -2,13 +2,12 @@
 python -m pip install --upgrade pip
 pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py pytest pytest-cov pytest-timer
 pip install pyberny
+pip install --no-deps pyscf-dispersion
 
 version=$(python -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))')
 if [ $version != '3.12' ]; then
     pip install geometric
     pip install spglib
-    pip install dftd3
-    pip install dftd4
 fi
 
 #cppe
diff --git a/.github/workflows/ci_macos/deps_apt.sh b/.github/workflows/ci_macos/deps_apt.sh
index 742e13d6fd..f1f641af19 100755
--- a/.github/workflows/ci_macos/deps_apt.sh
+++ b/.github/workflows/ci_macos/deps_apt.sh
@@ -1,2 +1 @@
 #!/usr/bin/env bash
-exit 0
diff --git a/pyscf/dft/test/test_h2o.py b/pyscf/dft/test/test_h2o.py
index 1c1c1ececf..c12b295ead 100644
--- a/pyscf/dft/test/test_h2o.py
+++ b/pyscf/dft/test/test_h2o.py
@@ -18,18 +18,10 @@
 from pyscf import gto
 from pyscf import lib
 from pyscf import dft
-
-
-import sys
-try:
-    import dftd3
-except ImportError:
-    pass
-
 try:
-    import dftd4
+    from pyscf.dispersion import dftd3, dftd4
 except ImportError:
-    pass
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global h2o, h2osym, h2o_cation, h2osym_cation
@@ -501,7 +493,7 @@ def test_nr_uks_vv10_high_cost(self):
         method.nlcgrids.atom_grid = {"H": (40, 110), "O": (40, 110),}
         self.assertAlmostEqual(method.scf(), -76.352381513158718, 8)
 
-    @unittest.skipIf('dftd3' not in sys.modules, "requires the dftd3 library")
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
     def test_dft_parser(self):
         from pyscf.scf import dispersion
         method = dft.RKS(h2o, xc='wb97m-d3bj')
@@ -553,7 +545,7 @@ def test_camb3lyp_rsh_omega(self):
         mf2.kernel()
         self.assertAlmostEqual(mf1.e_tot, -76.36649222362115, 9)
 
-    @unittest.skipIf('dftd3' not in sys.modules, "requires the dftd3 library")
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
     def test_dispersion(self):
         mf = dft.RKS(h2o)
         mf.xc = 'B3LYP'
diff --git a/pyscf/grad/dispersion.py b/pyscf/grad/dispersion.py
index 2cd5fe705c..dc8ca7965c 100644
--- a/pyscf/grad/dispersion.py
+++ b/pyscf/grad/dispersion.py
@@ -26,6 +26,11 @@
 
 def get_dispersion(mf_grad, disp_version=None, with_3body=False):
     '''gradient of dispersion correction for RHF/RKS'''
+    try:
+        from pyscf.dispersion import dftd3, dftd4
+    except ImportError:
+        print('dftd3 and dftd4 not available. Install them with `pip install pyscf-dispersion`')
+        raise
     mf = mf_grad.base
     mol = mf.mol
     if isinstance(mf, KohnShamDFT):
@@ -49,16 +54,14 @@ def get_dispersion(mf_grad, disp_version=None, with_3body=False):
         with_3body = mf.disp_with_3body
 
     if disp_version[:2].upper() == 'D3':
-        # raised error in SCF module, assuming dftd3 installed
-        import dftd3.pyscf as disp
-        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
-        _, g_d3 = d3.kernel()
+        d3_model = dftd3.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
+        res = d3_model.get_dispersion(grad=True)
+        g_d3 = res.get('gradient')
         return g_d3
     elif disp_version[:2].upper() == 'D4':
-        # raised error in SCF module, assuming dftd3 installed
-        import dftd4.pyscf as disp
-        d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
-        _, g_d4 = d4.kernel()
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+        res = d4_model.get_dispersion(grad=True)
+        g_d4 = res.get('gradient')
         return g_d4
     else:
         raise RuntimeError(f'dispersion correction: {disp_version} is not supported.')
diff --git a/pyscf/grad/test/test_rhf.py b/pyscf/grad/test/test_rhf.py
index 263be449b5..087e6cf40e 100644
--- a/pyscf/grad/test/test_rhf.py
+++ b/pyscf/grad/test/test_rhf.py
@@ -18,16 +18,10 @@
 from pyscf import gto, scf, lib
 from pyscf import grad
 
-import sys
 try:
-    import dftd3
+    from pyscf.dispersion import dftd3, dftd4
 except ImportError:
-    pass
-
-try:
-    import dftd4
-except ImportError:
-    pass
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global mol
@@ -82,7 +76,7 @@ def test_df_rhf_grad(self):
         e2 = mfs('O  0.  0.  0.001; H  0.  -0.757  0.587; H  0.  0.757   0.587')
         self.assertAlmostEqual(g[0,2], (e2-e1)/0.002*lib.param.BOHR, 5)
 
-    @unittest.skipIf('dftd3' not in sys.modules, "requires the dftd3 library")
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
     def test_rhf_d3_grad(self):
         mf = scf.RHF(mol)
         mf.disp = 'd3bj'
@@ -94,7 +88,7 @@ def test_rhf_d3_grad(self):
         e2 = mf_scan('O  0.  0.  0.001; H  0.  -0.757  0.587; H  0.  0.757   0.587')
         self.assertAlmostEqual((e2-e1)/0.002*lib.param.BOHR, g[0,2], 5)
 
-    @unittest.skipIf('dftd4' not in sys.modules, "requires the dftd4 library")
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
     def test_rhf_d4_grad(self):
         mf = scf.RHF(mol)
         mf.disp = 'd4'
diff --git a/pyscf/grad/test/test_rks.py b/pyscf/grad/test/test_rks.py
index fe760651de..456e11cdc4 100644
--- a/pyscf/grad/test/test_rks.py
+++ b/pyscf/grad/test/test_rks.py
@@ -18,6 +18,11 @@
 from pyscf import gto, dft, lib
 from pyscf.dft import radi
 from pyscf.grad import rks
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
+
 
 def grids_response(grids):
     # JCP 98, 5612 (1993); DOI:10.1063/1.464906
@@ -189,6 +194,30 @@ def test_finite_diff_rks_grad(self):
         e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
+    def test_fnite_diff_rks_d3_grad(self):
+        mol1 = mol.copy()
+        mf = dft.RKS(mol)
+        mf.conv_tol = 1e-14
+        mf.kernel()
+        g = mf.nuc_grad_method().set(grid_response=True).kernel()
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
+    def test_fnite_diff_rks_d4_grad(self):
+        mol1 = mol.copy()
+        mf = dft.RKS(mol)
+        mf.conv_tol = 1e-14
+        mf.kernel()
+        g = mf.nuc_grad_method().set(grid_response=True).kernel()
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
     def test_finite_diff_df_rks_grad(self):
         mf1 = mf.density_fit ().run ()
         g = mf1.nuc_grad_method ().set (grid_response=True).kernel ()
@@ -200,6 +229,32 @@ def test_finite_diff_df_rks_grad(self):
         e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_df_rks_d3_grad(self):
+        mf1 = mf.density_fit ()
+        mf1.disp = 'd3bj'
+        mf1.kernel()
+        g = mf1.nuc_grad_method ().set (grid_response=True).kernel ()
+
+        mol1 = mol.copy()
+        mf_scanner = mf1.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_df_rks_d4_grad(self):
+        mf1 = mf.density_fit ()
+        mf1.disp = 'd4'
+        mf1.kernel()
+        g = mf1.nuc_grad_method ().set (grid_response=True).kernel ()
+
+        mol1 = mol.copy()
+        mf_scanner = mf1.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
     def test_rks_grad_lda(self):
         mol_hf = gto.Mole()
         mol_hf.atom = [
diff --git a/pyscf/grad/test/test_uhf.py b/pyscf/grad/test/test_uhf.py
index c6c055bf05..25d3885031 100644
--- a/pyscf/grad/test/test_uhf.py
+++ b/pyscf/grad/test/test_uhf.py
@@ -17,6 +17,11 @@
 import numpy
 from pyscf import gto, scf, lib
 from pyscf import grad
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
+
 
 def setUpModule():
     global mol, mol1
@@ -104,6 +109,40 @@ def test_finite_diff_uhf_grad(self):
 H             -0.43459905    0.65805058   -0.00861418''')
         self.assertAlmostEqual(g[2,1], (e2-e1)/2e-4*lib.param.BOHR, 7)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_uhf_d3_grad(self):
+        mf = scf.UHF(mol)
+        mf.disp = 'd3bj'
+        mf.conv_tol = 1e-14
+        e0 = mf.kernel()
+        g = grad.UHF(mf).kernel()
+        mf_scanner = mf.as_scanner()
+
+        e1 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.758   0.587
+                        1    0.   0.757    0.587''')
+        e2 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.756   0.587
+                        1    0.   0.757    0.587''')
+        self.assertAlmostEqual(g[1,1], (e2-e1)/2e-3*lib.param.BOHR, 5)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_uhf_d4_grad(self):
+        mf = scf.UHF(mol)
+        mf.disp = 'd4'
+        mf.conv_tol = 1e-14
+        e0 = mf.kernel()
+        g = grad.UHF(mf).kernel()
+        mf_scanner = mf.as_scanner()
+
+        e1 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.758   0.587
+                        1    0.   0.757    0.587''')
+        e2 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.756   0.587
+                        1    0.   0.757    0.587''')
+        self.assertAlmostEqual(g[1,1], (e2-e1)/2e-3*lib.param.BOHR, 5)
+
     def test_finite_diff_df_uhf_grad(self):
         mf = scf.UHF(mol).density_fit ()
         mf.conv_tol = 1e-14
@@ -157,6 +196,40 @@ def test_finite_diff_df_uhf_grad(self):
 H             -0.43459905    0.65805058   -0.00861418''')
         self.assertAlmostEqual(g[2,1], (e2-e1)/2e-4*lib.param.BOHR, 7)
 
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_df_uhf_d4_grad(self):
+        mf = scf.UHF(mol).density_fit ()
+        mf.conv_tol = 1e-14
+        mf.disp = 'd3bj'
+        e0 = mf.kernel()
+        g = mf.nuc_grad_method ().kernel()
+        mf_scanner = mf.as_scanner()
+
+        e1 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.758   0.587
+                        1    0.   0.757    0.587''')
+        e2 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.756   0.587
+                        1    0.   0.757    0.587''')
+        self.assertAlmostEqual(g[1,1], (e2-e1)/2e-3*lib.param.BOHR, 5)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_df_uhf_d4_grad(self):
+        mf = scf.UHF(mol).density_fit ()
+        mf.conv_tol = 1e-14
+        mf.disp = 'd4'
+        e0 = mf.kernel()
+        g = mf.nuc_grad_method ().kernel()
+        mf_scanner = mf.as_scanner()
+
+        e1 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.758   0.587
+                        1    0.   0.757    0.587''')
+        e2 = mf_scanner('''O    0.   0.       0.
+                        1    0.   -0.756   0.587
+                        1    0.   0.757    0.587''')
+        self.assertAlmostEqual(g[1,1], (e2-e1)/2e-3*lib.param.BOHR, 5)
+
     def test_uhf_grad_one_atom(self):
         mol = gto.Mole()
         mol.atom = [['He', (0.,0.,0.)], ]
diff --git a/pyscf/grad/test/test_uks.py b/pyscf/grad/test/test_uks.py
index effd7218a7..9a763444ba 100644
--- a/pyscf/grad/test/test_uks.py
+++ b/pyscf/grad/test/test_uks.py
@@ -18,6 +18,10 @@
 from pyscf import gto, dft, lib
 from pyscf.dft import radi
 from pyscf.grad import uks
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
 
 
 def setUpModule():
@@ -73,6 +77,34 @@ def test_finite_diff_df_uks_grad(self):
         e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_fnite_diff_uks_d3_grad(self):
+        mol1 = mol.copy()
+        mf = dft.UKS(mol)
+        mf.disp = 'd3bj'
+        mf.conv_tol = 1e-14
+        mf.kernel()
+        g = mf.nuc_grad_method().set(grid_response=True).kernel()
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_fnite_diff_uks_d4_grad(self):
+        mol1 = mol.copy()
+        mf = dft.UKS(mol)
+        mf.disp = 'd4'
+        mf.conv_tol = 1e-14
+        mf.kernel()
+        g = mf.nuc_grad_method().set(grid_response=True).kernel()
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
+        self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
+
     def test_uks_grad_lda(self):
         mol = gto.Mole()
         mol.atom = [
diff --git a/pyscf/hessian/dispersion.py b/pyscf/hessian/dispersion.py
index 728b01e2cf..060d5b1a10 100644
--- a/pyscf/hessian/dispersion.py
+++ b/pyscf/hessian/dispersion.py
@@ -26,6 +26,11 @@
 from pyscf.dft import dft_parser
 
 def get_dispersion(hessobj, disp_version=None, with_3body=False):
+    try:
+        from pyscf.dispersion import dftd3, dftd4
+    except ImportError:
+        print('dftd3 and dftd4 not available. Install them with `pip install pyscf-dispersion`')
+        raise
     mf = hessobj.base
     mol = mf.mol
     if isinstance(mf, KohnShamDFT):
@@ -51,7 +56,6 @@ def get_dispersion(hessobj, disp_version=None, with_3body=False):
         with_3body = mf.disp_with_3body
 
     if mf.disp[:2].upper() == 'D3':
-        import dftd3.pyscf as disp
         coords = hessobj.mol.atom_coords()
         mol = mol.copy()
         eps = 1e-5
@@ -59,20 +63,21 @@ def get_dispersion(hessobj, disp_version=None, with_3body=False):
             for j in range(3):
                 coords[i,j] += eps
                 mol.set_geom_(coords, unit='Bohr')
-                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
-                _, g1 = d3.kernel()
+                d3_model = dftd3.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
+                res = d3_model.get_dispersion(grad=True)
+                g1 = res.get('gradient')
 
                 coords[i,j] -= 2.0*eps
                 mol.set_geom_(coords, unit='Bohr')
-                d3 = disp.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
-                _, g2 = d3.kernel()
+                d3_model = dftd3.DFTD3Dispersion(mol, xc=method, version=mf.disp, atm=with_3body)
+                res = d3_model.get_dispersion(grad=True)
+                g2 = res.get('gradient')
 
                 coords[i,j] += eps
                 h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
             return h_disp
 
     elif mf.disp[:2].upper() == 'D4':
-        import dftd4.pyscf as disp
         coords = hessobj.mol.atom_coords()
         mol = mol.copy()
         eps = 1e-5
@@ -80,13 +85,15 @@ def get_dispersion(hessobj, disp_version=None, with_3body=False):
             for j in range(3):
                 coords[i,j] += eps
                 mol.set_geom_(coords, unit='Bohr')
-                d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
-                _, g1 = d4.kernel()
+                d4_model = dftd4.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+                res = d4_model.get_dispersion(grad=True)
+                g1 = res.get('gradient')
 
                 coords[i,j] -= 2.0*eps
                 mol.set_geom_(coords, unit='Bohr')
-                d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
-                _, g2 = d4.kernel()
+                d4_model = dftd4.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+                res = d4_model.get_dispersion(grad=True)
+                g2 = res.get('gradient')
 
                 coords[i,j] += eps
                 h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
diff --git a/pyscf/hessian/test/test_rhf.py b/pyscf/hessian/test/test_rhf.py
index 7f3bfdb2f9..b0c3cbbeda 100644
--- a/pyscf/hessian/test/test_rhf.py
+++ b/pyscf/hessian/test/test_rhf.py
@@ -17,6 +17,10 @@
 import numpy
 from pyscf import gto, scf, lib
 from pyscf import grad, hessian
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global mol
@@ -32,6 +36,7 @@ def setUpModule():
 
 def tearDownModule():
     global mol
+    mol.stdout.close()
     del mol
 
 class KnownValues(unittest.TestCase):
@@ -86,6 +91,34 @@ def test_finite_diff_rhf_hess(self):
         e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
         self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_rhf_d3_hess(self):
+        mf = scf.RHF(mol)
+        mf.conv_tol = 1e-14
+        mf.disp = 'd3bj'
+        e0 = mf.kernel()
+        hess = hessian.RHF(mf).kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_rhf_d4_hess_high_cost(self):
+        mf = scf.RHF(mol)
+        mf.conv_tol = 1e-14
+        mf.disp = 'd4'
+        e0 = mf.kernel()
+        hess = hessian.RHF(mf).kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
+
 #        e1 = g_scanner(pmol.set_geom_('O  0. 0.0001 0.; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
 #        e2 = g_scanner(pmol.set_geom_('O  0. -.0001 0.; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
 #        self.assertAlmostEqual(abs(hess[0,:,1] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
diff --git a/pyscf/hessian/test/test_rks.py b/pyscf/hessian/test/test_rks.py
index ab7389f015..5d6a976ad8 100644
--- a/pyscf/hessian/test/test_rks.py
+++ b/pyscf/hessian/test/test_rks.py
@@ -17,17 +17,10 @@
 import numpy
 from pyscf import gto, dft, lib
 from pyscf import grad, hessian
-
-import sys
-try:
-    import dftd3
-except ImportError:
-    pass
-
 try:
-    import dftd4
+    from pyscf.dispersion import dftd3, dftd4
 except ImportError:
-    pass
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global mol, h4
@@ -43,6 +36,7 @@ def setUpModule():
 
     h4 = gto.Mole()
     h4.verbose = 0
+    h4.output = '/dev/null'
     h4.atom = [
         [1 , (1. ,  0.     , 0.000)],
         [1 , (0. ,  1.     , 0.000)],
@@ -55,6 +49,7 @@ def setUpModule():
 def tearDownModule():
     global mol, h4
     mol.stdout.close()
+    h4.stdout.close()
     del mol, h4
 
 def finite_diff(mf):
@@ -127,15 +122,14 @@ def test_finite_diff_b3lyp_hess(self):
         #FIXME: errors seems too big
         self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 3)
 
-    @unittest.skipIf('dftd3' not in sys.modules, "requires the dftd3 library")
-    def test_finite_diff_b3lyp_d3_hess(self):
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_b3lyp_d3_hess_high_cost(self):
         mf = dft.RKS(mol)
         mf.conv_tol = 1e-14
         mf.xc = 'b3lyp'
         mf.disp = 'd3bj'
-        e0 = mf.kernel()
+        mf.kernel()
         hess = mf.Hessian().kernel()
-        self.assertAlmostEqual(lib.fp(hess), -0.7586078053657133, 6)
 
         g_scanner = mf.nuc_grad_method().as_scanner()
         pmol = mol.copy()
@@ -144,15 +138,14 @@ def test_finite_diff_b3lyp_d3_hess(self):
         #FIXME: errors seems too big
         self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 3)
 
-    @unittest.skipIf('dftd4' not in sys.modules, "requires the dftd4 library")
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
     def test_finite_diff_b3lyp_d4_hess(self):
         mf = dft.RKS(mol)
         mf.conv_tol = 1e-14
         mf.xc = 'b3lyp'
         mf.disp = 'd4'
-        e0 = mf.kernel()
+        mf.kernel()
         hess = mf.Hessian().kernel()
-        self.assertAlmostEqual(lib.fp(hess), -0.7588415571313422, 6)
 
         g_scanner = mf.nuc_grad_method().as_scanner()
         pmol = mol.copy()
diff --git a/pyscf/hessian/test/test_uhf.py b/pyscf/hessian/test/test_uhf.py
index 06d32b38ad..64b7765603 100644
--- a/pyscf/hessian/test/test_uhf.py
+++ b/pyscf/hessian/test/test_uhf.py
@@ -17,6 +17,10 @@
 import numpy
 from pyscf import gto, scf, lib
 from pyscf import grad, hessian
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global mol
@@ -33,6 +37,7 @@ def setUpModule():
 
 def tearDownModule():
     global mol
+    mol.stdout.close()
     del mol
 
 class KnownValues(unittest.TestCase):
@@ -45,7 +50,7 @@ def test_uhf_hess(self):
         hess = hobj.kernel()
         self.assertAlmostEqual(lib.fp(hess), -0.20243405976628576, 5)
 
-    def test_finite_diff_rhf_hess(self):
+    def test_finite_diff_uhf_hess(self):
         mf = scf.UHF(mol)
         mf.conv_tol = 1e-14
         e0 = mf.kernel()
@@ -58,6 +63,33 @@ def test_finite_diff_rhf_hess(self):
         e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
         self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_uhf_d3_hess(self):
+        mf = scf.UHF(mol)
+        mf.conv_tol = 1e-14
+        mf.disp = 'd3bj'
+        e0 = mf.kernel()
+        hess = mf.Hessian().kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_uhf_d4_hess(self):
+        mf = scf.UHF(mol)
+        mf.conv_tol = 1e-14
+        mf.disp = 'd4'
+        e0 = mf.kernel()
+        hess = mf.Hessian().kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 4)
 
 if __name__ == "__main__":
     print("Full Tests for UHF Hessian")
diff --git a/pyscf/hessian/test/test_uks.py b/pyscf/hessian/test/test_uks.py
index e2394bef2e..f35ef29870 100644
--- a/pyscf/hessian/test/test_uks.py
+++ b/pyscf/hessian/test/test_uks.py
@@ -17,9 +17,13 @@
 import numpy
 from pyscf import gto, dft, lib
 from pyscf import grad, hessian
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except ImportError:
+    dftd3 = dftd4 = None
 
 def setUpModule():
-    global mol
+    global mol, h4
     mol = gto.Mole()
     mol.verbose = 5
     mol.output = '/dev/null'
@@ -32,21 +36,23 @@ def setUpModule():
     mol.spin = 1
     mol.build()
 
-h4 = gto.Mole()
-h4.verbose = 0
-h4.atom = [
-    [1 , (1. ,  0.     , 0.000)],
-    [1 , (0. ,  1.     , 0.000)],
-    [1 , (0. , -1.517  , 1.177)],
-    [1 , (0. ,  1.517  , 1.177)]]
-h4.basis = '631g'
-h4.spin = 2
-h4.unit = 'B'
-h4.build()
+    h4 = gto.Mole()
+    h4.verbose = 0
+    h4.output = '/dev/null'
+    h4.atom = [
+        [1 , (1. ,  0.     , 0.000)],
+        [1 , (0. ,  1.     , 0.000)],
+        [1 , (0. , -1.517  , 1.177)],
+        [1 , (0. ,  1.517  , 1.177)]]
+    h4.basis = '631g'
+    h4.spin = 2
+    h4.unit = 'B'
+    h4.build()
 
 def tearDownModule():
     global mol, h4
     mol.stdout.close()
+    h4.stdout.close()
     del mol, h4
 
 def finite_diff(mf):
@@ -119,6 +125,38 @@ def test_finite_diff_b3lyp_hess(self):
         #FIXME: errors seems too big
         self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 3)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_b3lyp_d3_hess_high_cost(self):
+        mf = dft.UKS(mol)
+        mf.conv_tol = 1e-14
+        mf.xc = 'b3lyp'
+        mf.disp = 'd3bj'
+        mf.kernel()
+        hess = mf.Hessian().kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        #FIXME: errors seems too big
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 3)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_b3lyp_d4_hess_high_cost(self):
+        mf = dft.UKS(mol)
+        mf.conv_tol = 1e-14
+        mf.xc = 'b3lyp'
+        mf.disp = 'd4'
+        mf.kernel()
+        hess = mf.Hessian().kernel()
+
+        g_scanner = mf.nuc_grad_method().as_scanner()
+        pmol = mol.copy()
+        e1 = g_scanner(pmol.set_geom_('O  0. 0. 0.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        e2 = g_scanner(pmol.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))[1]
+        #FIXME: errors seems too big
+        self.assertAlmostEqual(abs(hess[0,:,2] - (e1-e2)/2e-4*lib.param.BOHR).max(), 0, 3)
+
     def test_finite_diff_wb97x_hess(self):
         mf = dft.UKS(mol)
         mf.conv_tol = 1e-14
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 4b7236535e..84e7622019 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -168,7 +168,7 @@ if(BUILD_LIBCINT)
     set(LIBCINT_GIT https://github.com/sunqm/qcint.git) # qcint is an optimized implementation for x86-64 architecture
     set(LIBCINT_VERSION v6.1.2)
     if(NOT BUILD_MARCH_NATIVE)
-      message(WARNING "The BUILD_MARCH_NATIVE option is not specified! qcint may not compile unless you explicitly pass compiler flags that turn on vectorization!")    
+      message(WARNING "The BUILD_MARCH_NATIVE option is not specified! qcint may not compile unless you explicitly pass compiler flags that turn on vectorization!")
     endif()
   endif()
 
diff --git a/pyscf/scf/dispersion.py b/pyscf/scf/dispersion.py
index 94e9018ff7..5b5af27831 100644
--- a/pyscf/scf/dispersion.py
+++ b/pyscf/scf/dispersion.py
@@ -24,6 +24,11 @@
 from pyscf.dft import dft_parser
 
 def get_dispersion(mf, disp_version=None):
+    try:
+        from pyscf.dispersion import dftd3, dftd4
+    except ImportError:
+        print('dftd3 and dftd4 not available. Install them with `pip install pyscf-dispersion`')
+        raise
     mol = mf.mol
     if isinstance(mf, KohnShamDFT):
         method = mf.xc
@@ -46,35 +51,17 @@ def get_dispersion(mf, disp_version=None):
 
     # for dftd3
     if disp_version[:2].upper() == 'D3':
-        try:
-            import dftd3.pyscf as disp
-        except ImportError:
-            raise ImportError("\n \
-cannot find dftd3 in the current environment.\n \
-please install dftd3 via \n \
-**************************************\n\
-        pip3 install dftd3 \n \
-**************************************")
-
-        d3 = disp.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
-        e_d3, _ = d3.kernel()
+        d3_model = dftd3.DFTD3Dispersion(mol, xc=method, version=disp_version, atm=with_3body)
+        res = d3_model.get_dispersion()
+        e_d3 = res.get('energy')
         mf.scf_summary['dispersion'] = e_d3
         return e_d3
 
     # for dftd4
     elif disp_version[:2].upper() == 'D4':
-        try:
-            import dftd4.pyscf as disp
-        except ImportError:
-            raise ImportError("\n \
-cannot find dftd4 in the current environment. \n \
-please install dftd4 via \n \
-***************************************\n \
-        pip3 install dftd4 \n \
-***************************************")
-
-        d4 = disp.DFTD4Dispersion(mol, xc=method, atm=with_3body)
-        e_d4, _ = d4.kernel()
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=method, atm=with_3body)
+        res = d4_model.get_dispersion()
+        e_d4 = res.get('energy')
         mf.scf_summary['dispersion'] = e_d4
         return e_d4
     else:

From 16d547143f3ab5d8733a177e4f70d2b01b370da5 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Tue, 9 Apr 2024 16:44:03 -0700
Subject: [PATCH 37/44] Kpoint dft to_hf error (fix issue #2157)

---
 pyscf/pbc/dft/kgks.py           | 11 +++++++++--
 pyscf/pbc/dft/krks.py           | 28 ++++++++--------------------
 pyscf/pbc/dft/kroks.py          | 28 ++++++++--------------------
 pyscf/pbc/dft/kuks.py           | 28 ++++++++--------------------
 pyscf/pbc/dft/test/test_kgks.py | 14 ++++++++++++++
 pyscf/pbc/dft/test/test_krks.py | 26 ++++++++++++++++++++++++++
 pyscf/pbc/dft/test/test_kuks.py | 14 ++++++++++++++
 7 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/pyscf/pbc/dft/kgks.py b/pyscf/pbc/dft/kgks.py
index 7774f2e36e..fb9e1f2f68 100644
--- a/pyscf/pbc/dft/kgks.py
+++ b/pyscf/pbc/dft/kgks.py
@@ -146,7 +146,14 @@ def nuc_grad_method(self):
 
     def to_hf(self):
         '''Convert to KGHF object.'''
-        from pyscf.pbc import scf
-        return self._transfer_attrs_(scf.KGHF(self.cell, self.kpts))
+        from pyscf.pbc import scf, df
+        out = self._transfer_attrs_(scf.KGHF(self.cell, self.kpts))
+
+        # Pure functionals only construct J-type integrals. Enable all integrals for KHF.
+        if (not self._numint.libxc.is_hybrid_xc(self.xc) and
+            len(self.kpts) > 1 and getattr(self.with_df, '_j_only', False)):
+            out.with_df._j_only = False
+            out.with_df.reset()
+        return out
 
     to_gpu = lib.to_gpu
diff --git a/pyscf/pbc/dft/krks.py b/pyscf/pbc/dft/krks.py
index 6278605de9..d690647879 100644
--- a/pyscf/pbc/dft/krks.py
+++ b/pyscf/pbc/dft/krks.py
@@ -181,25 +181,13 @@ def nuc_grad_method(self):
 
     def to_hf(self):
         '''Convert to KRHF object.'''
-        from pyscf.pbc import scf
-        return self._transfer_attrs_(scf.KRHF(self.cell, self.kpts))
+        from pyscf.pbc import scf, df
+        out = self._transfer_attrs_(scf.KRHF(self.cell, self.kpts))
+        # Pure functionals only construct J-type integrals. Enable all integrals for KHF.
+        if (not self._numint.libxc.is_hybrid_xc(self.xc) and
+            len(self.kpts) > 1 and getattr(self.with_df, '_j_only', False)):
+            out.with_df._j_only = False
+            out.with_df.reset()
+        return out
 
     to_gpu = lib.to_gpu
-
-
-if __name__ == '__main__':
-    from pyscf.pbc import gto
-    cell = gto.Cell()
-    cell.unit = 'A'
-    cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
-    cell.a = '''0.      1.7834  1.7834
-                1.7834  0.      1.7834
-                1.7834  1.7834  0.    '''
-
-    cell.basis = 'gth-szv'
-    cell.pseudo = 'gth-pade'
-    cell.verbose = 7
-    cell.output = '/dev/null'
-    cell.build()
-    mf = KRKS(cell, cell.make_kpts([2,1,1]))
-    print(mf.kernel())
diff --git a/pyscf/pbc/dft/kroks.py b/pyscf/pbc/dft/kroks.py
index 1d2a1198f8..640bf05e97 100644
--- a/pyscf/pbc/dft/kroks.py
+++ b/pyscf/pbc/dft/kroks.py
@@ -61,25 +61,13 @@ def dump_flags(self, verbose=None):
 
     def to_hf(self):
         '''Convert to KROHF object.'''
-        from pyscf.pbc import scf
-        return self._transfer_attrs_(scf.KROHF(self.cell, self.kpts))
+        from pyscf.pbc import scf, df
+        out = self._transfer_attrs_(scf.KROHF(self.cell, self.kpts))
+        # Pure functionals only construct J-type integrals. Enable all integrals for KHF.
+        if (not self._numint.libxc.is_hybrid_xc(self.xc) and
+            len(self.kpts) > 1 and getattr(self.with_df, '_j_only', False)):
+            out.with_df._j_only = False
+            out.with_df.reset()
+        return out
 
     to_gpu = lib.to_gpu
-
-
-if __name__ == '__main__':
-    from pyscf.pbc import gto
-    cell = gto.Cell()
-    cell.unit = 'A'
-    cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
-    cell.a = '''0.      1.7834  1.7834
-                1.7834  0.      1.7834
-                1.7834  1.7834  0.    '''
-
-    cell.basis = 'gth-szv'
-    cell.pseudo = 'gth-pade'
-    cell.verbose = 7
-    cell.output = '/dev/null'
-    cell.build()
-    mf = KROKS(cell, cell.make_kpts([2,1,1]))
-    print(mf.kernel())
diff --git a/pyscf/pbc/dft/kuks.py b/pyscf/pbc/dft/kuks.py
index ac66f973bc..732adcfa32 100644
--- a/pyscf/pbc/dft/kuks.py
+++ b/pyscf/pbc/dft/kuks.py
@@ -157,25 +157,13 @@ def nuc_grad_method(self):
 
     def to_hf(self):
         '''Convert to KUHF object.'''
-        from pyscf.pbc import scf
-        return self._transfer_attrs_(scf.KUHF(self.cell, self.kpts))
+        from pyscf.pbc import scf, df
+        out = self._transfer_attrs_(scf.KUHF(self.cell, self.kpts))
+        # Pure functionals only construct J-type integrals. Enable all integrals for KHF.
+        if (not self._numint.libxc.is_hybrid_xc(self.xc) and
+            len(self.kpts) > 1 and getattr(self.with_df, '_j_only', False)):
+            out.with_df._j_only = False
+            out.with_df.reset()
+        return out
 
     to_gpu = lib.to_gpu
-
-
-if __name__ == '__main__':
-    from pyscf.pbc import gto
-    cell = gto.Cell()
-    cell.unit = 'A'
-    cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
-    cell.a = '''0.      1.7834  1.7834
-                1.7834  0.      1.7834
-                1.7834  1.7834  0.    '''
-
-    cell.basis = 'gth-szv'
-    cell.pseudo = 'gth-pade'
-    cell.verbose = 7
-    cell.output = '/dev/null'
-    cell.build()
-    mf = KUKS(cell, cell.make_kpts([2,1,1]))
-    print(mf.kernel())
diff --git a/pyscf/pbc/dft/test/test_kgks.py b/pyscf/pbc/dft/test/test_kgks.py
index a4fba351c4..6ebd1165ea 100644
--- a/pyscf/pbc/dft/test/test_kgks.py
+++ b/pyscf/pbc/dft/test/test_kgks.py
@@ -22,6 +22,7 @@
 from pyscf import lib
 from pyscf.pbc import gto as gto
 from pyscf.pbc import dft as dft
+from pyscf.pbc import scf as pbcscf
 from pyscf.pbc.df import rsdf_builder, gdf_builder
 try:
     import mcfun
@@ -189,6 +190,19 @@ def test_mcol_x2c_kgks_lda(self):
         mf.run()
         self.assertAlmostEqual(mf.e_tot, -1.4910121442258883, 6)
 
+    def test_to_hf(self):
+        mf = dft.KGKS(cell).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.kghf.KGHF))
+
+        mf = dft.KGKS(cell, kpts=cell.make_kpts([2,1,1])).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(not a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.kghf.KGHF))
+
 
 if __name__ == '__main__':
     print("Full Tests for pbc.dft.kgks")
diff --git a/pyscf/pbc/dft/test/test_krks.py b/pyscf/pbc/dft/test/test_krks.py
index 090711ec54..e8418cb972 100644
--- a/pyscf/pbc/dft/test/test_krks.py
+++ b/pyscf/pbc/dft/test/test_krks.py
@@ -22,6 +22,7 @@
 
 from pyscf.pbc import gto as pbcgto
 from pyscf.pbc import dft as pbcdft
+from pyscf.pbc import scf as pbcscf
 
 
 def build_cell(mesh):
@@ -143,6 +144,31 @@ def test_rsh_df(self):
         mf.kernel()
         self.assertAlmostEqual(mf.e_tot, -2.4766238116030683, 5)
 
+    def test_to_hf(self):
+        mf = pbcdft.KRKS(cell).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.khf.KRHF))
+
+        mf = pbcdft.KRKS(cell, kpts=cell.make_kpts([2,1,1])).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(not a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.khf.KRHF))
+
+        mf = pbcdft.KROKS(cell).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.krohf.KROHF))
+
+        mf = pbcdft.KROKS(cell, kpts=cell.make_kpts([2,1,1])).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(not a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.krohf.KROHF))
+
 # TODO: test the reset method of pbcdft.KRKS, pbcdft.RKS whether the reset
 # methods of all subsequent objects are called
 
diff --git a/pyscf/pbc/dft/test/test_kuks.py b/pyscf/pbc/dft/test/test_kuks.py
index d99fddf7fa..a81ae3051f 100644
--- a/pyscf/pbc/dft/test/test_kuks.py
+++ b/pyscf/pbc/dft/test/test_kuks.py
@@ -19,6 +19,7 @@
 import unittest
 import numpy as np
 from pyscf.pbc import gto as pbcgto
+from pyscf.pbc import scf as pbcscf
 from pyscf.pbc import dft as pbcdft
 
 
@@ -91,6 +92,19 @@ def test_rsh_df(self):
         mf.kernel()
         self.assertAlmostEqual(mf.e_tot, -2.4766238116030683, 7)
 
+    def test_to_hf(self):
+        mf = pbcdft.KUKS(cell).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.kuhf.KUHF))
+
+        mf = pbcdft.KUKS(cell, kpts=cell.make_kpts([2,1,1])).density_fit()
+        mf.with_df._j_only = True
+        a_hf = mf.to_hf()
+        self.assertTrue(not a_hf.with_df._j_only)
+        self.assertTrue(isinstance(a_hf, pbcscf.kuhf.KUHF))
+
 
 if __name__ == '__main__':
     print("Full Tests for pbc.dft.kuks")

From 25a24fed24229a250cb2b0fd9c571666bfc010ce Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Sat, 13 Apr 2024 23:11:57 -0700
Subject: [PATCH 38/44] ancient GCC compatibility

---
 pyscf/lib/dft/utils.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/pyscf/lib/dft/utils.c b/pyscf/lib/dft/utils.c
index 04ef8e5b2f..fc27b4a48f 100644
--- a/pyscf/lib/dft/utils.c
+++ b/pyscf/lib/dft/utils.c
@@ -52,11 +52,20 @@ void get_gga_vrho_gs(double complex *out, double complex *vrho_gs, double comple
     int i;
     int ngrid2 = 2 * ngrid;
     double complex fac = -2. * _Complex_I;
-    #pragma omp parallel for simd schedule(static)
+#pragma omp parallel
+{
+    double complex v;
+// ensure OpenMP 4.0
+#if defined _OPENMP && _OPENMP >= 201307
+    #pragma omp for simd schedule(static)
+#else
+    #pragma omp for schedule(static)
+#endif
     for (i = 0; i < ngrid; i++) {
-        out[i] = ( Gv[i*3]   * vsigma1_gs[i]
-                  +Gv[i*3+1] * vsigma1_gs[i+ngrid]
-                  +Gv[i*3+2] * vsigma1_gs[i+ngrid2]) * fac + vrho_gs[i];
-        out[i] *= weight;
+        v = ( Gv[i*3]   * vsigma1_gs[i]
+             +Gv[i*3+1] * vsigma1_gs[i+ngrid]
+             +Gv[i*3+2] * vsigma1_gs[i+ngrid2]) * fac + vrho_gs[i];
+        out[i] = v * weight;
     }
 }
+}

From 6d3b24bb64e2a5edb7990b6e3304068981a33f54 Mon Sep 17 00:00:00 2001
From: Matthew Hennefarth <matthew.hennefarth@gmail.com>
Date: Mon, 15 Apr 2024 13:42:13 -0500
Subject: [PATCH 39/44] fix

---
 pyscf/df/grad/casdm2_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf/df/grad/casdm2_util.py b/pyscf/df/grad/casdm2_util.py
index 6149e9eb51..4f6a5efbb0 100644
--- a/pyscf/df/grad/casdm2_util.py
+++ b/pyscf/df/grad/casdm2_util.py
@@ -476,7 +476,7 @@ def grad_elec_dferi (mc_grad, mo_cas=None, ci=None, dfcasdm2=None, casdm2=None,
 
     aoslices = mol.aoslice_by_atom ()
     dE = np.array ([dE[:,p0:p1].sum (axis=1) for p0, p1 in aoslices[:,2:]]).transpose (1,0,2)
-    return np.ascontiguousarray (dE)
+    return np.ascontiguousarray (dE)[:,atmlst,:]
 
 if __name__ == '__main__':
     from pyscf.tools import molden

From 7d3caf1e14d05f0e2a9bfaf904507e91a30b4034 Mon Sep 17 00:00:00 2001
From: Zhenyu Zhu ajz34 <ajz34@outlook.com>
Date: Mon, 29 Apr 2024 05:16:18 +0800
Subject: [PATCH 40/44] fix: infinite recursion atom_hf call when ECP with
 super-heavy atoms (>Cm) (#2183)

* bugfix: resolve recursive initial guess when atomic charge > 96 (> Cm) and ecp activated

* Adjust AGF2 tests

---------

Co-authored-by: Qiming Sun <osirpt.sun@gmail.com>
---
 pyscf/agf2/test/test_c_agf2.py | 16 ++++++++--------
 pyscf/scf/atom_hf.py           |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pyscf/agf2/test/test_c_agf2.py b/pyscf/agf2/test/test_c_agf2.py
index b77551aa96..54325599ff 100644
--- a/pyscf/agf2/test/test_c_agf2.py
+++ b/pyscf/agf2/test/test_c_agf2.py
@@ -42,8 +42,8 @@ def test_c_ragf2(self):
         gf_vir = aux.GreensFunction(np.random.random(self.nvir), np.eye(self.nmo, self.nvir))
         vv1, vev1 = _agf2.build_mats_ragf2_outcore(xija, gf_occ.energy, gf_vir.energy)
         vv2, vev2 = _agf2.build_mats_ragf2_incore(xija, gf_occ.energy, gf_vir.energy)
-        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 10)
-        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 10)
+        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 8)
+        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 8)
 
     def test_c_dfragf2(self):
         qxi = np.random.random((self.naux, self.nmo*self.nocc)) / self.naux
@@ -52,8 +52,8 @@ def test_c_dfragf2(self):
         gf_vir = aux.GreensFunction(np.random.random(self.nvir), np.eye(self.nmo, self.nvir))
         vv1, vev1 = _agf2.build_mats_dfragf2_outcore(qxi, qja, gf_occ.energy, gf_vir.energy)
         vv2, vev2 = _agf2.build_mats_dfragf2_incore(qxi, qja, gf_occ.energy, gf_vir.energy)
-        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 10)
-        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 10)
+        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 8)
+        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 8)
 
     def test_c_uagf2(self):
         xija = np.random.random((2, self.nmo, self.nocc, self.nocc, self.nvir))
@@ -63,8 +63,8 @@ def test_c_uagf2(self):
                   aux.GreensFunction(np.random.random(self.nvir), np.eye(self.nmo, self.nvir)))
         vv1, vev1 = _agf2.build_mats_uagf2_outcore(xija, (gf_occ[0].energy, gf_occ[1].energy), (gf_vir[0].energy, gf_vir[1].energy))
         vv2, vev2 = _agf2.build_mats_uagf2_incore(xija, (gf_occ[0].energy, gf_occ[1].energy), (gf_vir[0].energy, gf_vir[1].energy))
-        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 10)
-        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 10)
+        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 8)
+        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 8)
 
     def test_c_dfuagf2(self):
         qxi = np.random.random((2, self.naux, self.nmo*self.nocc)) / self.naux
@@ -75,8 +75,8 @@ def test_c_dfuagf2(self):
                   aux.GreensFunction(np.random.random(self.nvir), np.eye(self.nmo, self.nvir)))
         vv1, vev1 = _agf2.build_mats_dfuagf2_outcore(qxi, qja, (gf_occ[0].energy, gf_occ[1].energy), (gf_vir[0].energy, gf_vir[1].energy))
         vv2, vev2 = _agf2.build_mats_dfuagf2_incore(qxi, qja, (gf_occ[0].energy, gf_occ[1].energy), (gf_vir[0].energy, gf_vir[1].energy))
-        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 10)
-        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 10)
+        self.assertAlmostEqual(np.max(np.absolute(vv1-vv2)), 0.0, 8)
+        self.assertAlmostEqual(np.max(np.absolute(vev1-vev2)), 0.0, 8)
 
 
 if __name__ == '__main__':
diff --git a/pyscf/scf/atom_hf.py b/pyscf/scf/atom_hf.py
index 4430963493..56dca05b28 100644
--- a/pyscf/scf/atom_hf.py
+++ b/pyscf/scf/atom_hf.py
@@ -93,7 +93,7 @@ def __init__(self, mol):
         hf.SCF.__init__(self, mol)
 
         # The default initial guess minao does not have super-heavy elements
-        if mol.atom_charge(0) > 96:
+        if gto.charge(mol.atom_symbol(0)) > 96:
             self.init_guess = '1e'
 
         self = self.apply(addons.remove_linear_dep_)

From 02469ebfd6e6137d7f69baf4461cc8d40c85f462 Mon Sep 17 00:00:00 2001
From: jeanwsr <srwang20@fudan.edu.cn>
Date: Fri, 26 Apr 2024 18:34:22 +0800
Subject: [PATCH 41/44] fix x_id in parse_token

---
 pyscf/dft/libxc.py           | 4 ++--
 pyscf/dft/test/test_libxc.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pyscf/dft/libxc.py b/pyscf/dft/libxc.py
index 6b2a841a52..2c9aec1ebe 100644
--- a/pyscf/dft/libxc.py
+++ b/pyscf/dft/libxc.py
@@ -1189,8 +1189,8 @@ def parse_token(token, ftype, search_xc_alias=False):
                     else:
                         # Some libxc functionals may not be listed in the
                         # XC_CODES table. Query libxc directly
-                        func_id = _itrf.xc_functional_get_number(ctypes.c_char_p(key.encode()))
-                        if func_id == -1:
+                        x_id = _itrf.xc_functional_get_number(ctypes.c_char_p(key.encode()))
+                        if x_id == -1:
                             raise KeyError(f"LibXCFunctional: name '{key}' not found.")
                 if isinstance(x_id, str):
                     hyb1, fn_facs1 = parse_xc(x_id)
diff --git a/pyscf/dft/test/test_libxc.py b/pyscf/dft/test/test_libxc.py
index f4223fdeec..373b81b119 100644
--- a/pyscf/dft/test/test_libxc.py
+++ b/pyscf/dft/test/test_libxc.py
@@ -123,6 +123,9 @@ def test_parse_xc(self):
 
         self.assertEqual(dft.libxc.parse_xc('Xpbe,')[1], ((123,1),))
         self.assertEqual(dft.libxc.parse_xc('pbe,' )[1], ((101,1),))
+        self.assertEqual(dft.libxc.parse_xc('gga_x_pbe_gaussian' )[1], ((321,1),))
+
+
         hyb, fn_facs = dft.libxc.parse_xc('PBE*.4+LDA')
         self.assertEqual(fn_facs, ((101, 0.4), (130, 0.4), (1, 1)))
         self.assertRaises(KeyError, dft.libxc.parse_xc, 'PBE+VWN')

From a343760d0b9bf0adc22c1d7daf5a9a3d4477bb8d Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Thu, 2 May 2024 22:31:46 -0700
Subject: [PATCH 42/44] fixing unit test for dftd3 and dftd4

---
 pyscf/grad/test/test_rks.py | 14 ++++++++------
 pyscf/grad/test/test_uks.py |  8 ++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pyscf/grad/test/test_rks.py b/pyscf/grad/test/test_rks.py
index 456e11cdc4..cbee809485 100644
--- a/pyscf/grad/test/test_rks.py
+++ b/pyscf/grad/test/test_rks.py
@@ -194,9 +194,10 @@ def test_finite_diff_rks_grad(self):
         e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
-    def test_fnite_diff_rks_d3_grad(self):
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_rks_d3_grad(self):
         mol1 = mol.copy()
-        mf = dft.RKS(mol)
+        mf = dft.RKS(mol, xc='b3lyp')
         mf.conv_tol = 1e-14
         mf.kernel()
         g = mf.nuc_grad_method().set(grid_response=True).kernel()
@@ -206,9 +207,10 @@ def test_fnite_diff_rks_d3_grad(self):
         e2 = mf_scanner(mol1.set_geom_('O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587'))
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
-    def test_fnite_diff_rks_d4_grad(self):
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_finite_diff_rks_d4_grad(self):
         mol1 = mol.copy()
-        mf = dft.RKS(mol)
+        mf = dft.RKS(mol, xc='b3lyp')
         mf.conv_tol = 1e-14
         mf.kernel()
         g = mf.nuc_grad_method().set(grid_response=True).kernel()
@@ -231,7 +233,7 @@ def test_finite_diff_df_rks_grad(self):
 
     @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
     def test_finite_diff_df_rks_d3_grad(self):
-        mf1 = mf.density_fit ()
+        mf1 = dft.RKS(mol, xc='b3lyp').density_fit ()
         mf1.disp = 'd3bj'
         mf1.kernel()
         g = mf1.nuc_grad_method ().set (grid_response=True).kernel ()
@@ -244,7 +246,7 @@ def test_finite_diff_df_rks_d3_grad(self):
 
     @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
     def test_finite_diff_df_rks_d4_grad(self):
-        mf1 = mf.density_fit ()
+        mf1 = dft.RKS(mol, xc='b3lyp').density_fit ()
         mf1.disp = 'd4'
         mf1.kernel()
         g = mf1.nuc_grad_method ().set (grid_response=True).kernel ()
diff --git a/pyscf/grad/test/test_uks.py b/pyscf/grad/test/test_uks.py
index 9a763444ba..5a082a9621 100644
--- a/pyscf/grad/test/test_uks.py
+++ b/pyscf/grad/test/test_uks.py
@@ -78,9 +78,9 @@ def test_finite_diff_df_uks_grad(self):
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
     @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
-    def test_fnite_diff_uks_d3_grad(self):
+    def test_finite_diff_uks_d3_grad(self):
         mol1 = mol.copy()
-        mf = dft.UKS(mol)
+        mf = dft.UKS(mol, xc='b3lyp')
         mf.disp = 'd3bj'
         mf.conv_tol = 1e-14
         mf.kernel()
@@ -92,9 +92,9 @@ def test_fnite_diff_uks_d3_grad(self):
         self.assertAlmostEqual(g[0,2], (e1-e2)/2e-4*lib.param.BOHR, 6)
 
     @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
-    def test_fnite_diff_uks_d4_grad(self):
+    def test_finite_diff_uks_d4_grad(self):
         mol1 = mol.copy()
-        mf = dft.UKS(mol)
+        mf = dft.UKS(mol, xc='b3lyp')
         mf.disp = 'd4'
         mf.conv_tol = 1e-14
         mf.kernel()

From 9a48cd1d29068c9051e154aed4bb965bde9a5197 Mon Sep 17 00:00:00 2001
From: Michal Krompiec <michal.krompiec@gmail.com>
Date: Fri, 3 May 2024 20:32:31 +0100
Subject: [PATCH 43/44] Run CASCI and CASSCF without symmetry if symmetry is C1
 (#2195)

* Run CASCI and CASSCF without symmetry if symmetry is C1

* groupname, not symmetry!
---
 pyscf/mcscf/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyscf/mcscf/__init__.py b/pyscf/mcscf/__init__.py
index 82196ded33..db8a6e53bd 100644
--- a/pyscf/mcscf/__init__.py
+++ b/pyscf/mcscf/__init__.py
@@ -190,7 +190,7 @@ def CASSCF(mf_or_mol, ncas, nelecas, ncore=None, frozen=None):
     if isinstance(mf, _DFHF) and mf.with_df:
         return DFCASSCF(mf, ncas, nelecas, ncore, frozen)
 
-    if mf.mol.symmetry:
+    if mf.mol.symmetry and mf.mol.groupname != 'C1':
         mc = mc1step_symm.CASSCF(mf, ncas, nelecas, ncore, frozen)
     else:
         mc = mc1step.CASSCF(mf, ncas, nelecas, ncore, frozen)
@@ -214,7 +214,7 @@ def CASCI(mf_or_mol, ncas, nelecas, ncore=None):
     if isinstance(mf, _DFHF) and mf.with_df:
         return DFCASCI(mf, ncas, nelecas, ncore)
 
-    if mf.mol.symmetry:
+    if mf.mol.symmetry and mf.mol.groupname != 'C1':
         mc = casci_symm.CASCI(mf, ncas, nelecas, ncore)
     else:
         mc = casci.CASCI(mf, ncas, nelecas, ncore)
@@ -279,7 +279,7 @@ def DFCASSCF(mf_or_mol, ncas, nelecas, auxbasis=None, ncore=None,
     if isinstance(mf, scf.uhf.UHF):
         mf = mf.to_rhf()
 
-    if mf.mol.symmetry:
+    if mf.mol.symmetry and mf.mol.groupname != 'C1':
         mc = mc1step_symm.CASSCF(mf, ncas, nelecas, ncore, frozen)
     else:
         mc = mc1step.CASSCF(mf, ncas, nelecas, ncore, frozen)
@@ -296,7 +296,7 @@ def DFCASCI(mf_or_mol, ncas, nelecas, auxbasis=None, ncore=None):
     if isinstance(mf, scf.uhf.UHF):
         mf = mf.to_rhf()
 
-    if mf.mol.symmetry:
+    if mf.mol.symmetry and mf.mol.groupname != 'C1':
         mc = casci_symm.CASCI(mf, ncas, nelecas, ncore)
     else:
         mc = casci.CASCI(mf, ncas, nelecas, ncore)

From 940e4ac16f02eeef3fc944eae190d7f0609a60e7 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Fri, 3 May 2024 23:42:36 -0700
Subject: [PATCH 44/44] move auxbasis_response out of __init__ (#2192)

* move auxbasis_response out of __init__

* relax unit test in test_c_agf2

* import hessian

* Update __init__.py
---
 pyscf/df/grad/rhf.py    | 7 ++++---
 pyscf/df/grad/rks.py    | 7 ++++---
 pyscf/df/grad/uhf.py    | 7 ++++---
 pyscf/df/grad/uks.py    | 6 +++---
 pyscf/df/hessian/rhf.py | 2 +-
 pyscf/df/hessian/rks.py | 2 +-
 pyscf/df/hessian/uhf.py | 2 +-
 pyscf/df/hessian/uks.py | 2 +-
 8 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/pyscf/df/grad/rhf.py b/pyscf/df/grad/rhf.py
index cfa31375eb..94aad0be24 100644
--- a/pyscf/df/grad/rhf.py
+++ b/pyscf/df/grad/rhf.py
@@ -482,11 +482,12 @@ class Gradients(rhf_grad.Gradients):
     _keys = {'with_df', 'auxbasis_response'}
 
     def __init__(self, mf):
-        # Whether to include the response of DF auxiliary basis when computing
-        # nuclear gradients of J/K matrices
-        self.auxbasis_response = True
         rhf_grad.Gradients.__init__(self, mf)
 
+    # Whether to include the response of DF auxiliary basis when computing
+    # nuclear gradients of J/K matrices
+    auxbasis_response = True
+
     def check_sanity(self):
         assert isinstance(self.base, df.df_jk._DFHF)
 
diff --git a/pyscf/df/grad/rks.py b/pyscf/df/grad/rks.py
index adfc7b7080..1802299b68 100644
--- a/pyscf/df/grad/rks.py
+++ b/pyscf/df/grad/rks.py
@@ -107,11 +107,12 @@ class Gradients(rks_grad.Gradients):
     _keys = {'with_df', 'auxbasis_response'}
 
     def __init__(self, mf):
-        # Whether to include the response of DF auxiliary basis when computing
-        # nuclear gradients of J/K matrices
-        self.auxbasis_response = True
         rks_grad.Gradients.__init__(self, mf)
 
+    # Whether to include the response of DF auxiliary basis when computing
+    # nuclear gradients of J/K matrices
+    auxbasis_response = True
+
     get_jk = df_rhf_grad.Gradients.get_jk
     get_j = df_rhf_grad.Gradients.get_j
     get_k = df_rhf_grad.Gradients.get_k
diff --git a/pyscf/df/grad/uhf.py b/pyscf/df/grad/uhf.py
index 0eec773b0d..6295633ede 100644
--- a/pyscf/df/grad/uhf.py
+++ b/pyscf/df/grad/uhf.py
@@ -35,11 +35,12 @@ class Gradients(uhf_grad.Gradients):
     _keys = {'with_df', 'auxbasis_response'}
 
     def __init__(self, mf):
-        # Whether to include the response of DF auxiliary basis when computing
-        # nuclear gradients of J/K matrices
-        self.auxbasis_response = True
         uhf_grad.Gradients.__init__(self, mf)
 
+    # Whether to include the response of DF auxiliary basis when computing
+    # nuclear gradients of J/K matrices
+    auxbasis_response = True
+
     get_jk = df_rhf_grad.Gradients.get_jk
     get_j = df_rhf_grad.Gradients.get_j
     get_k = df_rhf_grad.Gradients.get_k
diff --git a/pyscf/df/grad/uks.py b/pyscf/df/grad/uks.py
index e6de663a95..30fe60b361 100644
--- a/pyscf/df/grad/uks.py
+++ b/pyscf/df/grad/uks.py
@@ -108,11 +108,11 @@ class Gradients(uks_grad.Gradients):
     _keys = {'with_df', 'auxbasis_response'}
 
     def __init__(self, mf):
-        # Whether to include the response of DF auxiliary basis when computing
-        # nuclear gradients of J/K matrices
-        self.auxbasis_response = True
         uks_grad.Gradients.__init__(self, mf)
 
+    # Whether to include the response of DF auxiliary basis when computing
+    # nuclear gradients of J/K matrices
+    auxbasis_response = True
     get_jk = df_rhf_grad.Gradients.get_jk
     get_j = df_rhf_grad.Gradients.get_j
     get_k = df_rhf_grad.Gradients.get_k
diff --git a/pyscf/df/hessian/rhf.py b/pyscf/df/hessian/rhf.py
index d06fa9f473..6c4cd691e5 100644
--- a/pyscf/df/hessian/rhf.py
+++ b/pyscf/df/hessian/rhf.py
@@ -475,9 +475,9 @@ def _load_dim0(dat, p0, p1):
 class Hessian(rhf_hess.Hessian):
     '''Non-relativistic restricted Hartree-Fock hessian'''
     def __init__(self, mf):
-        self.auxbasis_response = 1
         rhf_hess.Hessian.__init__(self, mf)
 
+    auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
 
diff --git a/pyscf/df/hessian/rks.py b/pyscf/df/hessian/rks.py
index 74c1bdd6c9..1d1073d657 100644
--- a/pyscf/df/hessian/rks.py
+++ b/pyscf/df/hessian/rks.py
@@ -121,9 +121,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 class Hessian(rks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     def __init__(self, mf):
-        self.auxbasis_response = 1
         rks_hess.Hessian.__init__(self, mf)
 
+    auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
 
diff --git a/pyscf/df/hessian/uhf.py b/pyscf/df/hessian/uhf.py
index 5cb20240f8..be863e87ec 100644
--- a/pyscf/df/hessian/uhf.py
+++ b/pyscf/df/hessian/uhf.py
@@ -526,9 +526,9 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 class Hessian(uhf_hess.Hessian):
     '''Non-relativistic UHF hessian'''
     def __init__(self, mf):
-        self.auxbasis_response = 1
         uhf_hess.Hessian.__init__(self, mf)
 
+    auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
 
diff --git a/pyscf/df/hessian/uks.py b/pyscf/df/hessian/uks.py
index 1afa995973..3c86207f36 100644
--- a/pyscf/df/hessian/uks.py
+++ b/pyscf/df/hessian/uks.py
@@ -134,9 +134,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 class Hessian(uks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     def __init__(self, mf):
-        self.auxbasis_response = 1
         uks_hess.Hessian.__init__(self, mf)
 
+    auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1