LCOV - CP2K Regtests (git:561f475) - src/grid/ref/grid_ref_task

LCOV - code coverage report

Current view:	top level - src/grid/ref - grid_ref_task_list.c (source / functions)		Coverage	Total	Hit
Test:	CP2K Regtests (git:561f475)	Lines:	99.5 %	191	190
Test Date:	2026-06-21 06:48:54	Functions:	100.0 %	10	10

            Line data    Source code

       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : 
       8              : #include <assert.h>
       9              : #include <math.h>
      10              : #include <omp.h>
      11              : #include <stdint.h>
      12              : #include <stdio.h>
      13              : #include <stdlib.h>
      14              : #include <string.h>
      15              : 
      16              : #include "../common/grid_common.h"
      17              : #include "grid_ref_collocate.h"
      18              : #include "grid_ref_integrate.h"
      19              : #include "grid_ref_task_list.h"
      20              : #include "grid_ref_task_list_internal.h"
      21              : 
      22              : /*******************************************************************************
      23              :  * \brief Comperator passed to qsort to compare two tasks.
      24              :  * \author Ole Schuett
      25              :  ******************************************************************************/
      26     66802701 : static int compare_tasks(const void *a, const void *b) {
      27     66802701 :   const grid_ref_task *task_a = a, *task_b = b;
      28     66802701 :   if (task_a->level != task_b->level) {
      29      4773005 :     return task_a->level - task_b->level;
      30     62029696 :   } else if (task_a->block_num != task_b->block_num) {
      31     34322844 :     return task_a->block_num - task_b->block_num;
      32     27706852 :   } else if (task_a->iset != task_b->iset) {
      33      3236146 :     return task_a->iset - task_b->iset;
      34              :   } else {
      35     24470706 :     return task_a->jset - task_b->jset;
      36              :   }
      37              : }
      38              : /*******************************************************************************
      39              :  * \brief Allocates a task list for the reference backend.
      40              :  *        See grid_task_list.h for details.
      41              :  * \author Ole Schuett
      42              :  ******************************************************************************/
      43        15929 : void grid_ref_create_task_list(
      44              :     const bool orthorhombic, const int ntasks, const int nlevels,
      45              :     const int natoms, const int nkinds, const int nblocks,
      46              :     const int block_offsets[nblocks], const double atom_positions[natoms][3],
      47              :     const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
      48              :     const int level_list[ntasks], const int iatom_list[ntasks],
      49              :     const int jatom_list[ntasks], const int iset_list[ntasks],
      50              :     const int jset_list[ntasks], const int ipgf_list[ntasks],
      51              :     const int jpgf_list[ntasks], const int border_mask_list[ntasks],
      52              :     const int block_num_list[ntasks], const double radius_list[ntasks],
      53              :     const double rab_list[ntasks][3], const int npts_global[nlevels][3],
      54              :     const int npts_local[nlevels][3], const int shift_local[nlevels][3],
      55              :     const int border_width[nlevels][3], const double dh[nlevels][3][3],
      56              :     const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out) {
      57              : 
      58        15929 :   if (*task_list_out != NULL) {
      59              :     // This is actually an opportunity to reuse some buffers.
      60         6312 :     grid_ref_free_task_list(*task_list_out);
      61              :   }
      62              : 
      63        15929 :   grid_ref_task_list_internal *task_list =
      64        15929 :       malloc(sizeof(grid_ref_task_list_internal));
      65        15929 :   assert(task_list != NULL);
      66              : 
      67        15929 :   task_list->orthorhombic = orthorhombic;
      68        15929 :   task_list->ntasks = ntasks;
      69        15929 :   task_list->nlevels = nlevels;
      70        15929 :   task_list->natoms = natoms;
      71        15929 :   task_list->nkinds = nkinds;
      72        15929 :   task_list->nblocks = nblocks;
      73              : 
      74        15929 :   size_t size = nblocks * sizeof(int);
      75        15929 :   task_list->block_offsets = malloc(size);
      76        15929 :   assert(task_list->block_offsets != NULL || size == 0);
      77        15929 :   if (size != 0) {
      78        15929 :     memcpy(task_list->block_offsets, block_offsets, size);
      79              :   }
      80              : 
      81        15929 :   size = 3 * natoms * sizeof(double);
      82        15929 :   task_list->atom_positions = malloc(size);
      83        15929 :   assert(task_list->atom_positions != NULL || size == 0);
      84        15929 :   if (size != 0) {
      85        15929 :     memcpy(task_list->atom_positions, atom_positions, size);
      86              :   }
      87              : 
      88        15929 :   size = natoms * sizeof(int);
      89        15929 :   task_list->atom_kinds = malloc(size);
      90        15929 :   assert(task_list->atom_kinds != NULL || size == 0);
      91        15929 :   if (size != 0) {
      92        15929 :     memcpy(task_list->atom_kinds, atom_kinds, size);
      93              :   }
      94              : 
      95        15929 :   size = nkinds * sizeof(grid_basis_set *);
      96        15929 :   task_list->basis_sets = malloc(size);
      97        15929 :   assert(task_list->basis_sets != NULL || size == 0);
      98        15929 :   if (size != 0) {
      99        15929 :     memcpy(task_list->basis_sets, basis_sets, size);
     100              :   }
     101              : 
     102        15929 :   size = ntasks * sizeof(grid_ref_task);
     103        15929 :   task_list->tasks = malloc(size);
     104        15929 :   assert(task_list->tasks != NULL || size == 0);
     105        15929 : #pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
     106              :   for (int i = 0; i < ntasks; i++) {
     107              :     task_list->tasks[i].level = level_list[i];
     108              :     task_list->tasks[i].iatom = iatom_list[i];
     109              :     task_list->tasks[i].jatom = jatom_list[i];
     110              :     task_list->tasks[i].iset = iset_list[i];
     111              :     task_list->tasks[i].jset = jset_list[i];
     112              :     task_list->tasks[i].ipgf = ipgf_list[i];
     113              :     task_list->tasks[i].jpgf = jpgf_list[i];
     114              :     task_list->tasks[i].border_mask = border_mask_list[i];
     115              :     task_list->tasks[i].block_num = block_num_list[i];
     116              :     task_list->tasks[i].radius = radius_list[i];
     117              :     task_list->tasks[i].rab[0] = rab_list[i][0];
     118              :     task_list->tasks[i].rab[1] = rab_list[i][1];
     119              :     task_list->tasks[i].rab[2] = rab_list[i][2];
     120              :   }
     121              : 
     122              :   // Store grid layouts.
     123        15929 :   size = nlevels * sizeof(grid_ref_layout);
     124        15929 :   task_list->layouts = malloc(size);
     125        15929 :   assert(task_list->layouts != NULL || size == 0);
     126        78906 :   for (int level = 0; level < nlevels; level++) {
     127       251908 :     for (int i = 0; i < 3; i++) {
     128       188931 :       task_list->layouts[level].npts_global[i] = npts_global[level][i];
     129       188931 :       task_list->layouts[level].npts_local[i] = npts_local[level][i];
     130       188931 :       task_list->layouts[level].shift_local[i] = shift_local[level][i];
     131       188931 :       task_list->layouts[level].border_width[i] = border_width[level][i];
     132       755724 :       for (int j = 0; j < 3; j++) {
     133       566793 :         task_list->layouts[level].dh[i][j] = dh[level][i][j];
     134       566793 :         task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
     135              :       }
     136              :     }
     137              :   }
     138              : 
     139              :   // Sort tasks by level, block_num, iset, and jset.
     140        15929 :   qsort(task_list->tasks, ntasks, sizeof(grid_ref_task), &compare_tasks);
     141              : 
     142              :   // Find first and last task for each level and block.
     143        15929 :   size = nlevels * nblocks * sizeof(int);
     144        15929 :   task_list->first_level_block_task = malloc(size);
     145        15929 :   assert(task_list->first_level_block_task != NULL || size == 0);
     146        15929 :   task_list->last_level_block_task = malloc(size);
     147        15929 :   assert(task_list->last_level_block_task != NULL || size == 0);
     148        15929 :   const int nlevel_blocks = nlevels * nblocks;
     149        15929 : #pragma omp parallel for schedule(static) if (nlevel_blocks >                  \
     150              :                                                   GRID_OMP_MIN_ITERATIONS)
     151              :   for (int i = 0; i < nlevel_blocks; i++) {
     152              :     task_list->first_level_block_task[i] = 0;
     153              :     task_list->last_level_block_task[i] = -1; // last < first means no tasks
     154              :   }
     155      9920329 :   for (int itask = 0; itask < ntasks; itask++) {
     156      9904400 :     const int level = task_list->tasks[itask].level - 1;
     157      9904400 :     const int block_num = task_list->tasks[itask].block_num - 1;
     158      9904400 :     if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
     159      9860126 :         task_list->tasks[itask - 1].block_num - 1 != block_num) {
     160      1122144 :       task_list->first_level_block_task[level * nblocks + block_num] = itask;
     161              :     }
     162      9904400 :     task_list->last_level_block_task[level * nblocks + block_num] = itask;
     163              :   }
     164              : 
     165              :   // Find largest Cartesian subblock size.
     166        15929 :   task_list->maxco = 0;
     167        44409 :   for (int i = 0; i < nkinds; i++) {
     168        28480 :     task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
     169              :   }
     170              : 
     171              :   // Initialize thread-local storage.
     172        15929 :   size = omp_get_max_threads() * sizeof(double *);
     173        15929 :   task_list->threadlocals = malloc(size);
     174        15929 :   assert(task_list->threadlocals != NULL);
     175        15929 :   memset(task_list->threadlocals, 0, size);
     176        15929 :   size = omp_get_max_threads() * sizeof(size_t);
     177        15929 :   task_list->threadlocal_sizes = malloc(size);
     178        15929 :   assert(task_list->threadlocal_sizes != NULL);
     179        15929 :   memset(task_list->threadlocal_sizes, 0, size);
     180              : 
     181        15929 :   *task_list_out = task_list;
     182        15929 : }
     183              : 
     184              : /*******************************************************************************
     185              :  * \brief Deallocates given task list, basis_sets have to be freed separately.
     186              :  * \author Ole Schuett
     187              :  ******************************************************************************/
     188        15929 : void grid_ref_free_task_list(grid_ref_task_list *ptr) {
     189        15929 :   if (ptr == NULL)
     190              :     return;
     191              : 
     192        15929 :   grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
     193              : 
     194        15929 :   free(task_list->block_offsets);
     195        15929 :   free(task_list->atom_positions);
     196        15929 :   free(task_list->atom_kinds);
     197        15929 :   free(task_list->basis_sets);
     198        15929 :   free(task_list->tasks);
     199        15929 :   free(task_list->layouts);
     200        15929 :   free(task_list->first_level_block_task);
     201        15929 :   free(task_list->last_level_block_task);
     202        31858 :   for (int i = 0; i < omp_get_max_threads(); i++) {
     203        15929 :     if (task_list->threadlocals[i] != NULL) {
     204            6 :       free(task_list->threadlocals[i]);
     205              :     }
     206              :   }
     207        15929 :   free(task_list->threadlocals);
     208        15929 :   free(task_list->threadlocal_sizes);
     209        15929 :   free(task_list);
     210              : }
     211              : 
     212              : /*******************************************************************************
     213              :  * \brief Prototype for BLAS dgemm.
     214              :  * \author Ole Schuett
     215              :  ******************************************************************************/
     216              : void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
     217              :             const int *k, const double *alpha, const double *a, const int *lda,
     218              :             const double *b, const int *ldb, const double *beta, double *c,
     219              :             const int *ldc);
     220              : 
     221              : /*******************************************************************************
     222              :  * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
     223              :  * \author Ole Schuett
     224              :  ******************************************************************************/
     225         1928 : static void dgemm(const char transa, const char transb, const int m,
     226              :                   const int n, const int k, const double alpha, const double *a,
     227              :                   const int lda, const double *b, const int ldb,
     228              :                   const double beta, double *c, const int ldc) {
     229         1928 :   dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
     230              :          &ldc);
     231         1928 : }
     232              : 
     233              : /*******************************************************************************
     234              :  * \brief Transforms pab from contracted spherical to prim. cartesian basis.
     235              :  * \author Ole Schuett
     236              :  ******************************************************************************/
     237          580 : static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
     238              :                      const int iset, const int jset, const bool transpose,
     239          580 :                      const double *block, double *pab) {
     240              : 
     241              :   // Define some more convenient aliases.
     242          580 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
     243          580 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
     244          580 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     245          580 :   const int ncob = jbasis->npgf[jset] * ncosetb;
     246              : 
     247          580 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
     248          580 :   const int nsgf_setb = jbasis->nsgf_set[jset];
     249          580 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     250          580 :   const int nsgfb = jbasis->nsgf;
     251          580 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     252          580 :   const int sgfb = jbasis->first_sgf[jset] - 1;
     253          580 :   const int maxcoa = ibasis->maxco;
     254          580 :   const int maxcob = jbasis->maxco;
     255              : 
     256          580 :   double work[nsgf_setb * ncoa];
     257          580 :   if (transpose) {
     258              :     // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
     259          412 :     dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
     260          412 :           &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
     261              :           maxcoa, 0.0, work, ncoa);
     262              :   } else {
     263              :     // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
     264          168 :     dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
     265          168 :           &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
     266              :           maxcoa, 0.0, work, ncoa);
     267              :   }
     268              :   // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
     269          580 :   dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
     270              :         maxcob, work, ncoa, 0.0, pab, ncoa);
     271          580 : }
     272              : 
     273              : /*******************************************************************************
     274              :  * \brief Collocate a range of tasks which are destined for the same grid level.
     275              :  * \author Ole Schuett
     276              :  ******************************************************************************/
     277          168 : static void collocate_one_grid_level(
     278              :     const grid_ref_task_list *ptr, const int *first_block_task,
     279              :     const int *last_block_task, const enum grid_func func,
     280              :     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     281              :     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     282              :     const double *pab_blocks, offload_buffer *grid) {
     283              : 
     284          168 :   if (ptr == NULL)
     285              :     return;
     286              : 
     287          168 :   grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
     288              : 
     289              : // Using default(shared) because with GCC 9 the behavior around const changed:
     290              : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     291          168 : #pragma omp parallel default(shared)
     292              :   {
     293              :     const int ithread = omp_get_thread_num();
     294              :     const int nthreads = omp_get_num_threads();
     295              : 
     296              :     // Initialize variables to detect when a new subblock has to be fetched.
     297              :     int old_offset = -1, old_iset = -1, old_jset = -1;
     298              : 
     299              :     // Matrix pab is re-used across tasks.
     300              :     double pab[imax(task_list->maxco * task_list->maxco, 1)];
     301              : 
     302              :     // Ensure that grid can fit into thread-local storage, reallocate if needed.
     303              :     const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
     304              :     const size_t grid_size = npts_local_total * sizeof(double);
     305              :     if (task_list->threadlocal_sizes[ithread] < grid_size) {
     306              :       if (task_list->threadlocals[ithread] != NULL) {
     307              :         free(task_list->threadlocals[ithread]);
     308              :       }
     309              :       task_list->threadlocals[ithread] = malloc(grid_size);
     310              :       assert(task_list->threadlocals[ithread] != NULL);
     311              :       task_list->threadlocal_sizes[ithread] = grid_size;
     312              :     }
     313              : 
     314              :     // Zero thread-local copy of the grid.
     315              :     double *const my_grid = task_list->threadlocals[ithread];
     316              :     memset(my_grid, 0, grid_size);
     317              : 
     318              :     // Parallelize over blocks to avoid unnecessary calls to load_pab.
     319              :     const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
     320              : #pragma omp for schedule(dynamic, chunk_size)
     321              :     for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
     322              :       const int first_task = first_block_task[block_num];
     323              :       const int last_task = last_block_task[block_num];
     324              : 
     325              :       for (int itask = first_task; itask <= last_task; itask++) {
     326              :         // Define some convenient aliases.
     327              :         const grid_ref_task *task = &task_list->tasks[itask];
     328              :         const int iatom = task->iatom - 1;
     329              :         const int jatom = task->jatom - 1;
     330              :         const int iset = task->iset - 1;
     331              :         const int jset = task->jset - 1;
     332              :         const int ipgf = task->ipgf - 1;
     333              :         const int jpgf = task->jpgf - 1;
     334              :         const int ikind = task_list->atom_kinds[iatom] - 1;
     335              :         const int jkind = task_list->atom_kinds[jatom] - 1;
     336              :         const grid_basis_set *ibasis = task_list->basis_sets[ikind];
     337              :         const grid_basis_set *jbasis = task_list->basis_sets[jkind];
     338              :         const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
     339              :         const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
     340              :         const int ncoseta = ncoset(ibasis->lmax[iset]);
     341              :         const int ncosetb = ncoset(jbasis->lmax[jset]);
     342              :         const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     343              :         const int ncob = jbasis->npgf[jset] * ncosetb;
     344              :         const int block_num = task->block_num - 1;
     345              :         const int block_offset = task_list->block_offsets[block_num];
     346              :         const double *block = &pab_blocks[block_offset];
     347              :         const bool transpose = (iatom <= jatom);
     348              : 
     349              :         // Load subblock from buffer and decontract into Cartesian sublock pab.
     350              :         // The previous pab can be reused when only ipgf or jpgf has changed.
     351              :         if (block_offset != old_offset || iset != old_iset ||
     352              :             jset != old_jset) {
     353              :           old_offset = block_offset;
     354              :           old_iset = iset;
     355              :           old_jset = jset;
     356              :           load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
     357              :         }
     358              : 
     359              :         grid_ref_collocate_pgf_product(
     360              :             /*orthorhombic=*/task_list->orthorhombic,
     361              :             /*border_mask=*/task->border_mask,
     362              :             /*func=*/func,
     363              :             /*la_max=*/ibasis->lmax[iset],
     364              :             /*la_min=*/ibasis->lmin[iset],
     365              :             /*lb_max=*/jbasis->lmax[jset],
     366              :             /*lb_min=*/jbasis->lmin[jset],
     367              :             /*zeta=*/zeta,
     368              :             /*zetb=*/zetb,
     369              :             /*rscale=*/(iatom == jatom) ? 1 : 2,
     370              :             /*dh=*/dh,
     371              :             /*dh_inv=*/dh_inv,
     372              :             /*ra=*/&task_list->atom_positions[3 * iatom],
     373              :             /*rab=*/task->rab,
     374              :             /*npts_global=*/npts_global,
     375              :             /*npts_local=*/npts_local,
     376              :             /*shift_local=*/shift_local,
     377              :             /*border_width=*/border_width,
     378              :             /*radius=*/task->radius,
     379              :             /*o1=*/ipgf * ncoseta,
     380              :             /*o2=*/jpgf * ncosetb,
     381              :             /*n1=*/ncoa,
     382              :             /*n2=*/ncob,
     383              :             /*pab=*/(const double(*)[ncoa])pab,
     384              :             /*grid=*/my_grid);
     385              : 
     386              :       } // end of task loop
     387              :     } // end of block loop
     388              : 
     389              : // While there should be an implicit barrier at the end of the block loop, this
     390              : // explicit barrier eliminates occasional seg faults with icc compiled binaries.
     391              : #pragma omp barrier
     392              : 
     393              :     // Merge thread-local grids via an efficient tree reduction.
     394              :     const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
     395              :     for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
     396              :       // Threads are divided into groups, whose size doubles with each cycle.
     397              :       // After a cycle the reduced data is stored at first thread of each group.
     398              :       const int group_size = 1 << icycle; // 2**icycle
     399              :       const int igroup = ithread / group_size;
     400              :       const int dest_thread = igroup * group_size;
     401              :       const int src_thread = dest_thread + group_size / 2;
     402              :       // The last group might actually be a bit smaller.
     403              :       const int actual_group_size = imin(group_size, nthreads - dest_thread);
     404              :       // Parallelize summation by dividing grid points across group members.
     405              :       const int rank = modulo(ithread, group_size); // position within the group
     406              :       const int lb = (npts_local_total * rank) / actual_group_size;
     407              :       const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
     408              :       if (src_thread < nthreads) {
     409              :         GRID_PRAGMA_SIMD_LOOP
     410              :         for (int i = lb; i < ub; i++) {
     411              :           task_list->threadlocals[dest_thread][i] +=
     412              :               task_list->threadlocals[src_thread][i];
     413              :         }
     414              :       }
     415              : #pragma omp barrier
     416              :     }
     417              : 
     418              :     // Copy final result from first thread into shared grid.
     419              :     const int lb = (npts_local_total * ithread) / nthreads;
     420              :     const int ub = (npts_local_total * (ithread + 1)) / nthreads;
     421              :     GRID_PRAGMA_SIMD_LOOP
     422              :     for (int i = lb; i < ub; i++) {
     423              :       grid->host_buffer[i] = task_list->threadlocals[0][i];
     424              :     }
     425              : 
     426              :   } // end of omp parallel region
     427              : }
     428              : 
     429              : /*******************************************************************************
     430              :  * \brief Collocate all tasks of in given list onto given grids.
     431              :  *        See grid_task_list.h for details.
     432              :  * \author Ole Schuett
     433              :  ******************************************************************************/
     434           42 : void grid_ref_collocate_task_list(const grid_ref_task_list *ptr,
     435              :                                   const enum grid_func func, const int nlevels,
     436              :                                   const offload_buffer *pab_blocks,
     437              :                                   offload_buffer *grids[nlevels]) {
     438              : 
     439           42 :   if (ptr == NULL)
     440              :     return;
     441              : 
     442           42 :   grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
     443              : 
     444           42 :   assert(task_list->nlevels == nlevels);
     445              : 
     446          210 :   for (int level = 0; level < task_list->nlevels; level++) {
     447          168 :     const int idx = level * task_list->nblocks;
     448          168 :     const int *first_block_task = &task_list->first_level_block_task[idx];
     449          168 :     const int *last_block_task = &task_list->last_level_block_task[idx];
     450          168 :     const grid_ref_layout *layout = &task_list->layouts[level];
     451          168 :     collocate_one_grid_level(
     452          168 :         task_list, first_block_task, last_block_task, func, layout->npts_global,
     453          168 :         layout->npts_local, layout->shift_local, layout->border_width,
     454          168 :         layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
     455              :   }
     456              : }
     457              : 
     458              : /*******************************************************************************
     459              :  * \brief Transforms hab from prim. cartesian to contracted spherical basis.
     460              :  * \author Ole Schuett
     461              :  ******************************************************************************/
     462          384 : static inline void store_hab(const grid_basis_set *ibasis,
     463              :                              const grid_basis_set *jbasis, const int iset,
     464              :                              const int jset, const bool transpose,
     465          384 :                              const double *hab, double *block) {
     466              : 
     467              :   // Define some more convenient aliases.
     468          384 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
     469          384 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
     470          384 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     471          384 :   const int ncob = jbasis->npgf[jset] * ncosetb;
     472              : 
     473          384 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
     474          384 :   const int nsgf_setb = jbasis->nsgf_set[jset];
     475          384 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     476          384 :   const int nsgfb = jbasis->nsgf;
     477          384 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     478          384 :   const int sgfb = jbasis->first_sgf[jset] - 1;
     479          384 :   const int maxcoa = ibasis->maxco;
     480          384 :   const int maxcob = jbasis->maxco;
     481              : 
     482          384 :   double work[nsgf_setb * ncoa];
     483              : 
     484              :   // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
     485          384 :   dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
     486              :         maxcob, hab, ncoa, 0.0, work, ncoa);
     487              : 
     488          384 :   if (transpose) {
     489              :     // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
     490          288 :     dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
     491          288 :           &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
     492          288 :           &block[sgfb * nsgfa + sgfa], nsgfa);
     493              :   } else {
     494              :     // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
     495           96 :     dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
     496           96 :           &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
     497           96 :           &block[sgfa * nsgfb + sgfb], nsgfb);
     498              :   }
     499          384 : }
     500              : 
     501              : /*******************************************************************************
     502              :  * \brief Integrate a range of tasks that belong to the same grid level.
     503              :  * \author Ole Schuett
     504              :  ******************************************************************************/
     505          136 : static void integrate_one_grid_level(
     506              :     const grid_ref_task_list *ptr, const int *first_block_task,
     507              :     const int *last_block_task, const bool compute_tau, const int natoms,
     508              :     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     509              :     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     510              :     const offload_buffer *pab_blocks, const offload_buffer *grid,
     511              :     offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
     512              : 
     513          136 :   if (ptr == NULL)
     514              :     return;
     515              : 
     516          136 :   grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
     517              : 
     518              : // Using default(shared) because with GCC 9 the behavior around const changed:
     519              : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     520          136 : #pragma omp parallel default(shared)
     521              :   {
     522              :     // Initialize variables to detect when a new subblock has to be fetched.
     523              :     int old_offset = -1, old_iset = -1, old_jset = -1;
     524              :     grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
     525              :     bool old_transpose = false;
     526              : 
     527              :     // Matrix pab and hab are re-used across tasks.
     528              :     double pab[imax(task_list->maxco * task_list->maxco, 1)];
     529              :     double hab[imax(task_list->maxco * task_list->maxco, 1)];
     530              : 
     531              :     // Parallelize over blocks to avoid concurred access to hab_blocks.
     532              :     const int nthreads = omp_get_num_threads();
     533              :     const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
     534              : #pragma omp for schedule(dynamic, chunk_size)
     535              :     for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
     536              :       const int first_task = first_block_task[block_num];
     537              :       const int last_task = last_block_task[block_num];
     538              : 
     539              :       // Accumulate forces per block as it corresponds to a pair of atoms.
     540              :       const int iatom = task_list->tasks[first_task].iatom - 1;
     541              :       const int jatom = task_list->tasks[first_task].jatom - 1;
     542              :       double my_forces[2][3] = {0};
     543              :       double my_virials[2][3][3] = {0};
     544              : 
     545              :       for (int itask = first_task; itask <= last_task; itask++) {
     546              :         // Define some convenient aliases.
     547              :         const grid_ref_task *task = &task_list->tasks[itask];
     548              :         assert(task->block_num - 1 == block_num);
     549              :         assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
     550              :         const int ikind = task_list->atom_kinds[iatom] - 1;
     551              :         const int jkind = task_list->atom_kinds[jatom] - 1;
     552              :         grid_basis_set *ibasis = task_list->basis_sets[ikind];
     553              :         grid_basis_set *jbasis = task_list->basis_sets[jkind];
     554              :         const int iset = task->iset - 1;
     555              :         const int jset = task->jset - 1;
     556              :         const int ipgf = task->ipgf - 1;
     557              :         const int jpgf = task->jpgf - 1;
     558              :         const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
     559              :         const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
     560              :         const int ncoseta = ncoset(ibasis->lmax[iset]);
     561              :         const int ncosetb = ncoset(jbasis->lmax[jset]);
     562              :         const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     563              :         const int ncob = jbasis->npgf[jset] * ncosetb;
     564              :         const int block_offset = task_list->block_offsets[block_num];
     565              :         const bool transpose = (iatom <= jatom);
     566              :         const bool pab_required = (forces != NULL || virial != NULL);
     567              : 
     568              :         // Load pab and store hab subblocks when needed.
     569              :         // Previous hab and pab can be reused when only ipgf or jpgf changed.
     570              :         if (block_offset != old_offset || iset != old_iset ||
     571              :             jset != old_jset) {
     572              :           if (pab_required) {
     573              :             load_pab(ibasis, jbasis, iset, jset, transpose,
     574              :                      &pab_blocks->host_buffer[block_offset], pab);
     575              :           }
     576              :           if (old_offset >= 0) { // skip first iteration
     577              :             store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
     578              :                       hab, &hab_blocks->host_buffer[old_offset]);
     579              :           }
     580              :           memset(hab, 0, ncoa * ncob * sizeof(double));
     581              :           old_offset = block_offset;
     582              :           old_iset = iset;
     583              :           old_jset = jset;
     584              :           old_ibasis = ibasis;
     585              :           old_jbasis = jbasis;
     586              :           old_transpose = transpose;
     587              :         }
     588              : 
     589              :         grid_ref_integrate_pgf_product(
     590              :             /*orthorhombic=*/task_list->orthorhombic,
     591              :             /*compute_tau=*/compute_tau,
     592              :             /*border_mask=*/task->border_mask,
     593              :             /*la_max=*/ibasis->lmax[iset],
     594              :             /*la_min=*/ibasis->lmin[iset],
     595              :             /*lb_max=*/jbasis->lmax[jset],
     596              :             /*lb_min=*/jbasis->lmin[jset],
     597              :             /*zeta=*/zeta,
     598              :             /*zetb=*/zetb,
     599              :             /*dh=*/dh,
     600              :             /*dh_inv=*/dh_inv,
     601              :             /*ra=*/&task_list->atom_positions[3 * iatom],
     602              :             /*rab=*/task->rab,
     603              :             /*npts_global=*/npts_global,
     604              :             /*npts_local=*/npts_local,
     605              :             /*shift_local=*/shift_local,
     606              :             /*border_width=*/border_width,
     607              :             /*radius=*/task->radius,
     608              :             /*o1=*/ipgf * ncoseta,
     609              :             /*o2=*/jpgf * ncosetb,
     610              :             /*n1=*/ncoa,
     611              :             /*n2=*/ncob,
     612              :             /*grid=*/grid->host_buffer,
     613              :             /*hab=*/(double(*)[ncoa])hab,
     614              :             /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
     615              :             /*forces=*/(forces != NULL) ? my_forces : NULL,
     616              :             /*virials=*/(virial != NULL) ? my_virials : NULL,
     617              :             /*hdab=*/NULL,
     618              :             /*hadb=*/NULL,
     619              :             /*a_hdab=*/NULL);
     620              : 
     621              :       } // end of task loop
     622              : 
     623              :       // Merge thread-local forces and virial into shared ones.
     624              :       // It does not seem worth the trouble to accumulate them thread-locally.
     625              :       const double scalef = (iatom == jatom) ? 1.0 : 2.0;
     626              :       if (forces != NULL) {
     627              : #pragma omp critical(forces)
     628              :         for (int i = 0; i < 3; i++) {
     629              :           forces[iatom][i] += scalef * my_forces[0][i];
     630              :           forces[jatom][i] += scalef * my_forces[1][i];
     631              :         }
     632              :       }
     633              :       if (virial != NULL) {
     634              : #pragma omp critical(virial)
     635              :         for (int i = 0; i < 3; i++) {
     636              :           for (int j = 0; j < 3; j++) {
     637              :             virial[i][j] += scalef * my_virials[0][i][j];
     638              :             virial[i][j] += scalef * my_virials[1][i][j];
     639              :           }
     640              :         }
     641              :       }
     642              : 
     643              :     } // end of block loop
     644              : 
     645              :     // store final hab
     646              :     if (old_offset >= 0) {
     647              :       store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
     648              :                 &hab_blocks->host_buffer[old_offset]);
     649              :     }
     650              : 
     651              :   } // end of omp parallel region
     652              : }
     653              : 
     654              : /*******************************************************************************
     655              :  * \brief Integrate all tasks of in given list from given grids.
     656              :  *        See grid_task_list.h for details.
     657              :  * \author Ole Schuett
     658              :  ******************************************************************************/
     659           34 : void grid_ref_integrate_task_list(
     660              :     const grid_ref_task_list *ptr, const bool compute_tau, const int natoms,
     661              :     const int nlevels, const offload_buffer *pab_blocks,
     662              :     const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
     663              :     double forces[natoms][3], double virial[3][3]) {
     664           34 :   if (ptr == NULL)
     665              :     return;
     666              : 
     667           34 :   grid_ref_task_list_internal *task_list = (grid_ref_task_list_internal *)ptr;
     668           34 :   assert(task_list->nlevels == nlevels);
     669           34 :   assert(task_list->natoms == natoms);
     670              : 
     671              :   // Zero result arrays.
     672           34 :   memset(hab_blocks->host_buffer, 0, hab_blocks->size);
     673           34 :   if (forces != NULL) {
     674            6 :     memset(forces, 0, natoms * 3 * sizeof(double));
     675              :   }
     676           34 :   if (virial != NULL) {
     677            0 :     memset(virial, 0, 9 * sizeof(double));
     678              :   }
     679              : 
     680          170 :   for (int level = 0; level < task_list->nlevels; level++) {
     681          136 :     const int idx = level * task_list->nblocks;
     682          136 :     const int *first_block_task = &task_list->first_level_block_task[idx];
     683          136 :     const int *last_block_task = &task_list->last_level_block_task[idx];
     684          136 :     const grid_ref_layout *layout = &task_list->layouts[level];
     685          136 :     integrate_one_grid_level(
     686              :         task_list, first_block_task, last_block_task, compute_tau, natoms,
     687          136 :         layout->npts_global, layout->npts_local, layout->shift_local,
     688          136 :         layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
     689          136 :         grids[level], hab_blocks, forces, virial);
     690              :   }
     691              : }
     692              : 
     693              : // EOF

Generated by: LCOV version 2.0-1