Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <math.h>
10 : #include <omp.h>
11 : #include <stdint.h>
12 : #include <stdio.h>
13 : #include <stdlib.h>
14 : #include <string.h>
15 :
16 : #include "../common/grid_common.h"
17 : #include "grid_cpu_collocate.h"
18 : #include "grid_cpu_integrate.h"
19 : #include "grid_cpu_task_list.h"
20 :
21 : /*******************************************************************************
22 : * \brief Comparator passed to qsort to compare two tasks.
23 : * \author Ole Schuett
24 : ******************************************************************************/
25 59954441 : static int compare_tasks(const void *a, const void *b) {
26 59954441 : const grid_cpu_task *task_a = a, *task_b = b;
27 59954441 : if (task_a->level != task_b->level) {
28 4371954 : return task_a->level - task_b->level;
29 55582487 : } else if (task_a->block_num != task_b->block_num) {
30 28890407 : return task_a->block_num - task_b->block_num;
31 26692080 : } else if (task_a->iset != task_b->iset) {
32 3201040 : return task_a->iset - task_b->iset;
33 : } else {
34 23491040 : return task_a->jset - task_b->jset;
35 : }
36 : }
37 : /*******************************************************************************
38 : * \brief Allocates a task list for the cpu backend.
39 : * See grid_task_list.h for details.
40 : * \author Ole Schuett
41 : ******************************************************************************/
42 15580 : void grid_cpu_create_task_list(
43 : const bool orthorhombic, const int ntasks, const int nlevels,
44 : const int natoms, const int nkinds, const int nblocks,
45 : const int block_offsets[nblocks], const double atom_positions[natoms][3],
46 : const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
47 : const int level_list[ntasks], const int iatom_list[ntasks],
48 : const int jatom_list[ntasks], const int iset_list[ntasks],
49 : const int jset_list[ntasks], const int ipgf_list[ntasks],
50 : const int jpgf_list[ntasks], const int border_mask_list[ntasks],
51 : const int block_num_list[ntasks], const double radius_list[ntasks],
52 : const double rab_list[ntasks][3], const int npts_global[nlevels][3],
53 : const int npts_local[nlevels][3], const int shift_local[nlevels][3],
54 : const int border_width[nlevels][3], const double dh[nlevels][3][3],
55 : const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {
56 :
57 15580 : if (*task_list_out != NULL) {
58 : // This is actually an opportunity to reuse some buffers.
59 6202 : grid_cpu_free_task_list(*task_list_out);
60 : }
61 :
62 15580 : grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));
63 15580 : assert(task_list != NULL);
64 :
65 15580 : task_list->orthorhombic = orthorhombic;
66 15580 : task_list->ntasks = ntasks;
67 15580 : task_list->nlevels = nlevels;
68 15580 : task_list->natoms = natoms;
69 15580 : task_list->nkinds = nkinds;
70 15580 : task_list->nblocks = nblocks;
71 :
72 15580 : size_t size = nblocks * sizeof(int);
73 15580 : task_list->block_offsets = malloc(size);
74 15580 : assert(task_list->block_offsets != NULL || size == 0);
75 15580 : if (size != 0) {
76 15334 : memcpy(task_list->block_offsets, block_offsets, size);
77 : }
78 :
79 15580 : size = 3 * natoms * sizeof(double);
80 15580 : task_list->atom_positions = malloc(size);
81 15580 : assert(task_list->atom_positions != NULL || size == 0);
82 15580 : if (size != 0) {
83 15580 : memcpy(task_list->atom_positions, atom_positions, size);
84 : }
85 :
86 15580 : size = natoms * sizeof(int);
87 15580 : task_list->atom_kinds = malloc(size);
88 15580 : assert(task_list->atom_kinds != NULL || size == 0);
89 15580 : if (size != 0) {
90 15580 : memcpy(task_list->atom_kinds, atom_kinds, size);
91 : }
92 :
93 15580 : size = nkinds * sizeof(grid_basis_set *);
94 15580 : task_list->basis_sets = malloc(size);
95 15580 : assert(task_list->basis_sets != NULL || size == 0);
96 15580 : if (size != 0) {
97 15580 : memcpy(task_list->basis_sets, basis_sets, size);
98 : }
99 :
100 15580 : size = ntasks * sizeof(grid_cpu_task);
101 15580 : task_list->tasks = malloc(size);
102 15580 : assert(task_list->tasks != NULL || size == 0);
103 15580 : #pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
104 : for (int i = 0; i < ntasks; i++) {
105 : task_list->tasks[i].level = level_list[i];
106 : task_list->tasks[i].iatom = iatom_list[i];
107 : task_list->tasks[i].jatom = jatom_list[i];
108 : task_list->tasks[i].iset = iset_list[i];
109 : task_list->tasks[i].jset = jset_list[i];
110 : task_list->tasks[i].ipgf = ipgf_list[i];
111 : task_list->tasks[i].jpgf = jpgf_list[i];
112 : task_list->tasks[i].border_mask = border_mask_list[i];
113 : task_list->tasks[i].block_num = block_num_list[i];
114 : task_list->tasks[i].radius = radius_list[i];
115 : task_list->tasks[i].rab[0] = rab_list[i][0];
116 : task_list->tasks[i].rab[1] = rab_list[i][1];
117 : task_list->tasks[i].rab[2] = rab_list[i][2];
118 : }
119 :
120 : // Store grid layouts.
121 15580 : size = nlevels * sizeof(grid_cpu_layout);
122 15580 : task_list->layouts = malloc(size);
123 15580 : assert(task_list->layouts != NULL || size == 0);
124 77228 : for (int level = 0; level < nlevels; level++) {
125 246592 : for (int i = 0; i < 3; i++) {
126 184944 : task_list->layouts[level].npts_global[i] = npts_global[level][i];
127 184944 : task_list->layouts[level].npts_local[i] = npts_local[level][i];
128 184944 : task_list->layouts[level].shift_local[i] = shift_local[level][i];
129 184944 : task_list->layouts[level].border_width[i] = border_width[level][i];
130 739776 : for (int j = 0; j < 3; j++) {
131 554832 : task_list->layouts[level].dh[i][j] = dh[level][i][j];
132 554832 : task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
133 : }
134 : }
135 : }
136 :
137 : // Sort tasks by level, block_num, iset, and jset.
138 15580 : qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);
139 :
140 : // Find first and last task for each level and block.
141 15580 : size = nlevels * nblocks * sizeof(int);
142 15580 : task_list->first_level_block_task = malloc(size);
143 15580 : assert(task_list->first_level_block_task != NULL || size == 0);
144 15580 : task_list->last_level_block_task = malloc(size);
145 15580 : assert(task_list->last_level_block_task != NULL || size == 0);
146 15580 : const int nlevel_blocks = nlevels * nblocks;
147 15580 : #pragma omp parallel for schedule(static) if (nlevel_blocks > \
148 : GRID_OMP_MIN_ITERATIONS)
149 : for (int i = 0; i < nlevel_blocks; i++) {
150 : task_list->first_level_block_task[i] = 0;
151 : task_list->last_level_block_task[i] = -1; // last < first means no tasks
152 : }
153 9137239 : for (int itask = 0; itask < ntasks; itask++) {
154 9121659 : const int level = task_list->tasks[itask].level - 1;
155 9121659 : const int block_num = task_list->tasks[itask].block_num - 1;
156 9121659 : if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
157 9079188 : task_list->tasks[itask - 1].block_num - 1 != block_num) {
158 901784 : task_list->first_level_block_task[level * nblocks + block_num] = itask;
159 : }
160 9121659 : task_list->last_level_block_task[level * nblocks + block_num] = itask;
161 : }
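                 : // Together the two arrays form a (level, block) -> task-range index. For
                 : // example, the tasks of one level and block can be traversed as
                 : //   const int idx = level * nblocks + block_num;
                 : //   for (int i = first_level_block_task[idx]; i <= last_level_block_task[idx]; i++)
                 : // which is how collocate_one_grid_level and integrate_one_grid_level below
                 : // consume the index.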
162 :
163 : // Find largest Cartesian subblock size.
164 15580 : task_list->maxco = 0;
165 43475 : for (int i = 0; i < nkinds; i++) {
166 27895 : task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
167 : }
168 :
169 : // Initialize thread-local storage.
170 15580 : size = omp_get_max_threads() * sizeof(double *);
171 15580 : task_list->threadlocals = malloc(size);
172 15580 : assert(task_list->threadlocals != NULL);
173 15580 : memset(task_list->threadlocals, 0, size);
174 15580 : size = omp_get_max_threads() * sizeof(size_t);
175 15580 : task_list->threadlocal_sizes = malloc(size);
176 15580 : assert(task_list->threadlocal_sizes != NULL);
177 15580 : memset(task_list->threadlocal_sizes, 0, size);
178 :
179 15580 : *task_list_out = task_list;
180 15580 : }
181 :
182 : /*******************************************************************************
183 : * \brief Deallocates the given task list; basis_sets have to be freed separately.
184 : * \author Ole Schuett
185 : ******************************************************************************/
186 15580 : void grid_cpu_free_task_list(grid_cpu_task_list *task_list) {
187 15580 : free(task_list->block_offsets);
188 15580 : free(task_list->atom_positions);
189 15580 : free(task_list->atom_kinds);
190 15580 : free(task_list->basis_sets);
191 15580 : free(task_list->tasks);
192 15580 : free(task_list->layouts);
193 15580 : free(task_list->first_level_block_task);
194 15580 : free(task_list->last_level_block_task);
195 31160 : for (int i = 0; i < omp_get_max_threads(); i++) {
196 15580 : if (task_list->threadlocals[i] != NULL) {
197 13650 : free(task_list->threadlocals[i]);
198 : }
199 : }
200 15580 : free(task_list->threadlocals);
201 15580 : free(task_list->threadlocal_sizes);
202 15580 : free(task_list);
203 15580 : }
204 :
205 : /*******************************************************************************
206 : * \brief Prototype for BLAS dgemm.
207 : * \author Ole Schuett
208 : ******************************************************************************/
209 : void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
210 : const int *k, const double *alpha, const double *a, const int *lda,
211 : const double *b, const int *ldb, const double *beta, double *c,
212 : const int *ldc);
213 :
214 : /*******************************************************************************
215 : * \brief Convenient wrapper to hide the Fortran nature of dgemm_ by swapping a and b.
216 : * \author Ole Schuett
217 : ******************************************************************************/
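                 : // A minimal illustration of the swap: a row-major call
                 : //   dgemm('N', 'N', m, n, k, 1.0, a, k, b, n, 0.0, c, n);  // c[m][n] = a[m][k] * b[k][n]
                 : // is forwarded to the column-major Fortran routine as c^T = b^T * a^T, so the
                 : // result arrives in c already in row-major order, without explicit transposes.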
218 54489318 : static void dgemm(const char transa, const char transb, const int m,
219 : const int n, const int k, const double alpha, const double *a,
220 : const int lda, const double *b, const int ldb,
221 : const double beta, double *c, const int ldc) {
222 54489318 : dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
223 : &ldc);
224 54489318 : }
225 :
226 : /*******************************************************************************
227 : * \brief Transforms pab from contracted spherical to prim. Cartesian basis.
228 : * \author Ole Schuett
229 : ******************************************************************************/
230 14499397 : static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
231 : const int iset, const int jset, const bool transpose,
232 14499397 : const double *block, double *pab) {
233 :
234 : // Define some more convenient aliases.
235 14499397 : const int ncoseta = ncoset(ibasis->lmax[iset]);
236 14499397 : const int ncosetb = ncoset(jbasis->lmax[jset]);
237 14499397 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
238 14499397 : const int ncob = jbasis->npgf[jset] * ncosetb;
239 :
240 14499397 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
241 14499397 : const int nsgf_setb = jbasis->nsgf_set[jset];
242 14499397 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
243 14499397 : const int nsgfb = jbasis->nsgf;
244 14499397 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
245 14499397 : const int sgfb = jbasis->first_sgf[jset] - 1;
246 14499397 : const int maxcoa = ibasis->maxco;
247 14499397 : const int maxcob = jbasis->maxco;
248 :
249 14499397 : double work[nsgf_setb * ncoa];
250 14499397 : if (transpose) {
251 : // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
252 8670438 : dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
253 8670438 : &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
254 : maxcoa, 0.0, work, ncoa);
255 : } else {
256 : // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
257 5828959 : dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
258 5828959 : &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
259 : maxcoa, 0.0, work, ncoa);
260 : }
261 : // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
262 14499397 : dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
263 : maxcob, work, ncoa, 0.0, pab, ncoa);
264 14499397 : }
265 :
266 : /*******************************************************************************
267 : * \brief Collocate a range of tasks which are destined for the same grid level.
268 : * \author Ole Schuett
269 : ******************************************************************************/
270 898936 : static void collocate_one_grid_level(
271 : const grid_cpu_task_list *task_list, const int *first_block_task,
272 : const int *last_block_task, const enum grid_func func,
273 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
274 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
275 : const double *pab_blocks, offload_buffer *grid) {
276 :
277 : // Using default(shared) because with GCC 9 the behavior around const changed:
278 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
279 898936 : #pragma omp parallel default(shared)
280 : {
281 : const int ithread = omp_get_thread_num();
282 : const int nthreads = omp_get_num_threads();
283 :
284 : // Initialize variables to detect when a new subblock has to be fetched.
285 : int old_offset = -1, old_iset = -1, old_jset = -1;
286 :
287 : // Matrix pab is re-used across tasks.
288 : double pab[imax(task_list->maxco * task_list->maxco, 1)];
289 :
290 : // Ensure that the grid fits into thread-local storage; reallocate if needed.
291 : const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
292 : const size_t grid_size = npts_local_total * sizeof(double);
293 : if (task_list->threadlocal_sizes[ithread] < grid_size) {
294 : if (task_list->threadlocals[ithread] != NULL) {
295 : free(task_list->threadlocals[ithread]);
296 : }
297 : task_list->threadlocals[ithread] = malloc(grid_size);
298 : assert(task_list->threadlocals[ithread] != NULL);
299 : task_list->threadlocal_sizes[ithread] = grid_size;
300 : }
301 :
302 : // Zero thread-local copy of the grid.
303 : double *const my_grid = task_list->threadlocals[ithread];
304 : memset(my_grid, 0, grid_size);
305 :
306 : // Parallelize over blocks to avoid unnecessary calls to load_pab.
307 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
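                 : // i.e. roughly 50 dynamically scheduled chunks per thread, presumably to trade
                 : // scheduling overhead against load balance.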
308 : #pragma omp for schedule(dynamic, chunk_size)
309 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
310 : const int first_task = first_block_task[block_num];
311 : const int last_task = last_block_task[block_num];
312 :
313 : for (int itask = first_task; itask <= last_task; itask++) {
314 : // Define some convenient aliases.
315 : const grid_cpu_task *task = &task_list->tasks[itask];
316 : const int iatom = task->iatom - 1;
317 : const int jatom = task->jatom - 1;
318 : const int iset = task->iset - 1;
319 : const int jset = task->jset - 1;
320 : const int ipgf = task->ipgf - 1;
321 : const int jpgf = task->jpgf - 1;
322 : const int ikind = task_list->atom_kinds[iatom] - 1;
323 : const int jkind = task_list->atom_kinds[jatom] - 1;
324 : const grid_basis_set *ibasis = task_list->basis_sets[ikind];
325 : const grid_basis_set *jbasis = task_list->basis_sets[jkind];
326 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
327 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
328 : const int ncoseta = ncoset(ibasis->lmax[iset]);
329 : const int ncosetb = ncoset(jbasis->lmax[jset]);
330 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
331 : const int ncob = jbasis->npgf[jset] * ncosetb;
332 : const int block_num = task->block_num - 1;
333 : const int block_offset = task_list->block_offsets[block_num];
334 : const double *block = &pab_blocks[block_offset];
335 : const bool transpose = (iatom <= jatom);
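                 : // Judging from load_pab/store_hab, transpose means the stored subblock is
                 : // indexed [jatom sgf][iatom sgf] rather than [iatom sgf][jatom sgf].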
336 :
337 : // Load subblock from buffer and decontract into Cartesian subblock pab.
338 : // The previous pab can be reused when only ipgf or jpgf has changed.
339 : if (block_offset != old_offset || iset != old_iset ||
340 : jset != old_jset) {
341 : old_offset = block_offset;
342 : old_iset = iset;
343 : old_jset = jset;
344 : load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
345 : }
346 :
347 : grid_cpu_collocate_pgf_product(
348 : /*orthorhombic=*/task_list->orthorhombic,
349 : /*border_mask=*/task->border_mask,
350 : /*func=*/func,
351 : /*la_max=*/ibasis->lmax[iset],
352 : /*la_min=*/ibasis->lmin[iset],
353 : /*lb_max=*/jbasis->lmax[jset],
354 : /*lb_min=*/jbasis->lmin[jset],
355 : /*zeta=*/zeta,
356 : /*zetb=*/zetb,
357 : /*rscale=*/(iatom == jatom) ? 1 : 2,
358 : /*dh=*/dh,
359 : /*dh_inv=*/dh_inv,
360 : /*ra=*/&task_list->atom_positions[3 * iatom],
361 : /*rab=*/task->rab,
362 : /*npts_global=*/npts_global,
363 : /*npts_local=*/npts_local,
364 : /*shift_local=*/shift_local,
365 : /*border_width=*/border_width,
366 : /*radius=*/task->radius,
367 : /*o1=*/ipgf * ncoseta,
368 : /*o2=*/jpgf * ncosetb,
369 : /*n1=*/ncoa,
370 : /*n2=*/ncob,
371 : /*pab=*/(const double(*)[ncoa])pab,
372 : /*grid=*/my_grid);
373 :
374 : } // end of task loop
375 : } // end of block loop
376 :
377 : // While there should be an implicit barrier at the end of the block loop, this
378 : // explicit barrier eliminates occasional seg faults with icc-compiled binaries.
379 : #pragma omp barrier
380 :
381 : // Merge thread-local grids via an efficient tree reduction.
382 : const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
383 : for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
384 : // Threads are divided into groups, whose size doubles with each cycle.
385 : // After a cycle the reduced data is stored at first thread of each group.
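                 : // For example, with nthreads = 8 there are ceil(log2(8)) = 3 cycles:
                 : // cycle 1 adds grids 1->0, 3->2, 5->4, 7->6; cycle 2 adds 2->0 and 6->4;
                 : // cycle 3 adds 4->0, leaving the complete sum in threadlocals[0].
                 : // Within a cycle, all members of a group split the element-wise summation.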
386 : const int group_size = 1 << icycle; // 2**icycle
387 : const int igroup = ithread / group_size;
388 : const int dest_thread = igroup * group_size;
389 : const int src_thread = dest_thread + group_size / 2;
390 : // The last group might actually be a bit smaller.
391 : const int actual_group_size = imin(group_size, nthreads - dest_thread);
392 : // Parallelize summation by dividing grid points across group members.
393 : const int rank = modulo(ithread, group_size); // position within the group
394 : const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
395 : const int64_t ub =
396 : ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
397 : if (src_thread < nthreads) {
398 : GRID_PRAGMA_SIMD_LOOP
399 : for (int i = (int)lb; i < (int)ub; i++) {
400 : task_list->threadlocals[dest_thread][i] +=
401 : task_list->threadlocals[src_thread][i];
402 : }
403 : }
404 : #pragma omp barrier
405 : }
406 :
407 : // Copy final result from first thread into shared grid.
408 : const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
409 : const int64_t ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
410 : GRID_PRAGMA_SIMD_LOOP
411 : for (int i = (int)lb; i < (int)ub; i++) {
412 : grid->host_buffer[i] = task_list->threadlocals[0][i];
413 : }
414 :
415 : } // end of omp parallel region
416 898936 : }
417 :
418 : /*******************************************************************************
419 : * \brief Collocate all tasks of the given list onto the given grids.
420 : * See grid_task_list.h for details.
421 : * \author Ole Schuett
422 : ******************************************************************************/
423 226926 : void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list,
424 : const enum grid_func func, const int nlevels,
425 : const offload_buffer *pab_blocks,
426 : offload_buffer *grids[nlevels]) {
427 :
428 226926 : assert(task_list->nlevels == nlevels);
429 :
430 1125862 : for (int level = 0; level < task_list->nlevels; level++) {
431 898936 : const int idx = level * task_list->nblocks;
432 898936 : const int *first_block_task = &task_list->first_level_block_task[idx];
433 898936 : const int *last_block_task = &task_list->last_level_block_task[idx];
434 898936 : const grid_cpu_layout *layout = &task_list->layouts[level];
435 898936 : collocate_one_grid_level(
436 898936 : task_list, first_block_task, last_block_task, func, layout->npts_global,
437 898936 : layout->npts_local, layout->shift_local, layout->border_width,
438 898936 : layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
439 : }
440 226926 : }
441 :
442 : /*******************************************************************************
443 : * \brief Transforms hab from prim. Cartesian to contracted spherical basis.
444 : * \author Ole Schuett
445 : ******************************************************************************/
446 12745262 : static inline void store_hab(const grid_basis_set *ibasis,
447 : const grid_basis_set *jbasis, const int iset,
448 : const int jset, const bool transpose,
449 12745262 : const double *hab, double *block) {
450 :
451 : // Define some more convenient aliases.
452 12745262 : const int ncoseta = ncoset(ibasis->lmax[iset]);
453 12745262 : const int ncosetb = ncoset(jbasis->lmax[jset]);
454 12745262 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
455 12745262 : const int ncob = jbasis->npgf[jset] * ncosetb;
456 :
457 12745262 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
458 12745262 : const int nsgf_setb = jbasis->nsgf_set[jset];
459 12745262 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
460 12745262 : const int nsgfb = jbasis->nsgf;
461 12745262 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
462 12745262 : const int sgfb = jbasis->first_sgf[jset] - 1;
463 12745262 : const int maxcoa = ibasis->maxco;
464 12745262 : const int maxcob = jbasis->maxco;
465 :
466 12745262 : double work[nsgf_setb * ncoa];
467 :
468 : // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
469 12745262 : dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
470 : maxcob, hab, ncoa, 0.0, work, ncoa);
471 :
472 12745262 : if (transpose) {
473 : // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
474 7606145 : dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
475 7606145 : &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
476 7606145 : &block[sgfb * nsgfa + sgfa], nsgfa);
477 : } else {
478 : // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
479 5139117 : dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
480 5139117 : &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
481 5139117 : &block[sgfa * nsgfb + sgfb], nsgfb);
482 : }
483 12745262 : }
484 :
485 : /*******************************************************************************
486 : * \brief Integrate a range of tasks that belong to the same grid level.
487 : * \author Ole Schuett
488 : ******************************************************************************/
489 819788 : static void integrate_one_grid_level(
490 : const grid_cpu_task_list *task_list, const int *first_block_task,
491 : const int *last_block_task, const bool compute_tau, const int natoms,
492 : const int npts_global[3], const int npts_local[3], const int shift_local[3],
493 : const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
494 : const offload_buffer *pab_blocks, const offload_buffer *grid,
495 : offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
496 :
497 : // Using default(shared) because with GCC 9 the behavior around const changed:
498 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
499 819788 : #pragma omp parallel default(shared)
500 : {
501 : // Initialize variables to detect when a new subblock has to be fetched.
502 : int old_offset = -1, old_iset = -1, old_jset = -1;
503 : grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
504 : bool old_transpose = false;
505 :
506 : // Matrix pab and hab are re-used across tasks.
507 : double pab[imax(task_list->maxco * task_list->maxco, 1)];
508 : double hab[imax(task_list->maxco * task_list->maxco, 1)];
509 :
510 : // Parallelize over blocks to avoid concurrent access to hab_blocks.
511 : const int nthreads = omp_get_num_threads();
512 : const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
513 : #pragma omp for schedule(dynamic, chunk_size)
514 : for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
515 : const int first_task = first_block_task[block_num];
516 : const int last_task = last_block_task[block_num];
517 :
518 : // Accumulate forces per block, since each block corresponds to a pair of atoms.
519 : const int iatom = task_list->tasks[first_task].iatom - 1;
520 : const int jatom = task_list->tasks[first_task].jatom - 1;
521 : double my_forces[2][3] = {0};
522 : double my_virials[2][3][3] = {0};
523 :
524 : for (int itask = first_task; itask <= last_task; itask++) {
525 : // Define some convenient aliases.
526 : const grid_cpu_task *task = &task_list->tasks[itask];
527 : assert(task->block_num - 1 == block_num);
528 : assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
529 : const int ikind = task_list->atom_kinds[iatom] - 1;
530 : const int jkind = task_list->atom_kinds[jatom] - 1;
531 : grid_basis_set *ibasis = task_list->basis_sets[ikind];
532 : grid_basis_set *jbasis = task_list->basis_sets[jkind];
533 : const int iset = task->iset - 1;
534 : const int jset = task->jset - 1;
535 : const int ipgf = task->ipgf - 1;
536 : const int jpgf = task->jpgf - 1;
537 : const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
538 : const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
539 : const int ncoseta = ncoset(ibasis->lmax[iset]);
540 : const int ncosetb = ncoset(jbasis->lmax[jset]);
541 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
542 : const int ncob = jbasis->npgf[jset] * ncosetb;
543 : const int block_offset = task_list->block_offsets[block_num];
544 : const bool transpose = (iatom <= jatom);
545 : const bool pab_required = (forces != NULL || virial != NULL);
546 :
547 : // Load pab and store hab subblocks when needed.
548 : // Previous hab and pab can be reused when only ipgf or jpgf changed.
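                 : // hab accumulates contributions from all primitive pairs (ipgf, jpgf) of the
                 : // current (block, iset, jset) subblock and is flushed to hab_blocks when the
                 : // subblock changes, and once more after the loop over blocks.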
549 : if (block_offset != old_offset || iset != old_iset ||
550 : jset != old_jset) {
551 : if (pab_required) {
552 : load_pab(ibasis, jbasis, iset, jset, transpose,
553 : &pab_blocks->host_buffer[block_offset], pab);
554 : }
555 : if (old_offset >= 0) { // skip first iteration
556 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
557 : hab, &hab_blocks->host_buffer[old_offset]);
558 : }
559 : memset(hab, 0, ncoa * ncob * sizeof(double));
560 : old_offset = block_offset;
561 : old_iset = iset;
562 : old_jset = jset;
563 : old_ibasis = ibasis;
564 : old_jbasis = jbasis;
565 : old_transpose = transpose;
566 : }
567 :
568 : grid_cpu_integrate_pgf_product(
569 : /*orthorhombic=*/task_list->orthorhombic,
570 : /*compute_tau=*/compute_tau,
571 : /*border_mask=*/task->border_mask,
572 : /*la_max=*/ibasis->lmax[iset],
573 : /*la_min=*/ibasis->lmin[iset],
574 : /*lb_max=*/jbasis->lmax[jset],
575 : /*lb_min=*/jbasis->lmin[jset],
576 : /*zeta=*/zeta,
577 : /*zetb=*/zetb,
578 : /*dh=*/dh,
579 : /*dh_inv=*/dh_inv,
580 : /*ra=*/&task_list->atom_positions[3 * iatom],
581 : /*rab=*/task->rab,
582 : /*npts_global=*/npts_global,
583 : /*npts_local=*/npts_local,
584 : /*shift_local=*/shift_local,
585 : /*border_width=*/border_width,
586 : /*radius=*/task->radius,
587 : /*o1=*/ipgf * ncoseta,
588 : /*o2=*/jpgf * ncosetb,
589 : /*n1=*/ncoa,
590 : /*n2=*/ncob,
591 : /*grid=*/grid->host_buffer,
592 : /*hab=*/(double(*)[ncoa])hab,
593 : /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
594 : /*forces=*/(forces != NULL) ? my_forces : NULL,
595 : /*virials=*/(virial != NULL) ? my_virials : NULL,
596 : /*hdab=*/NULL,
597 : /*hadb=*/NULL,
598 : /*a_hdab=*/NULL);
599 :
600 : } // end of task loop
601 :
602 : // Merge thread-local forces and virial into shared ones.
603 : // It does not seem worth the trouble to accumulate them thread-locally.
604 : const double scalef = (iatom == jatom) ? 1.0 : 2.0;
605 : if (forces != NULL) {
606 : #pragma omp critical(forces)
607 : for (int i = 0; i < 3; i++) {
608 : forces[iatom][i] += scalef * my_forces[0][i];
609 : forces[jatom][i] += scalef * my_forces[1][i];
610 : }
611 : }
612 : if (virial != NULL) {
613 : #pragma omp critical(virial)
614 : for (int i = 0; i < 3; i++) {
615 : for (int j = 0; j < 3; j++) {
616 : virial[i][j] += scalef * my_virials[0][i][j];
617 : virial[i][j] += scalef * my_virials[1][i][j];
618 : }
619 : }
620 : }
621 :
622 : } // end of block loop
623 :
624 : // Store the last accumulated hab subblock; the loop above only flushes on a subblock change.
625 : if (old_offset >= 0) {
626 : store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
627 : &hab_blocks->host_buffer[old_offset]);
628 : }
629 :
630 : } // end of omp parallel region
631 819788 : }
632 :
633 : /*******************************************************************************
634 : * \brief Integrate all tasks of the given list from the given grids.
635 : * See grid_task_list.h for details.
636 : * \author Ole Schuett
637 : ******************************************************************************/
638 206926 : void grid_cpu_integrate_task_list(
639 : const grid_cpu_task_list *task_list, const bool compute_tau,
640 : const int natoms, const int nlevels, const offload_buffer *pab_blocks,
641 : const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
642 : double forces[natoms][3], double virial[3][3]) {
643 :
644 206926 : assert(task_list->nlevels == nlevels);
645 206926 : assert(task_list->natoms == natoms);
646 206926 : assert(hab_blocks != NULL);
647 :
648 : // Zero result arrays.
649 206926 : if (hab_blocks->size != 0) {
650 204915 : memset(hab_blocks->host_buffer, 0, hab_blocks->size);
651 : }
652 206926 : if (forces != NULL) {
653 24695 : memset(forces, 0, natoms * 3 * sizeof(double));
654 : }
655 206926 : if (virial != NULL) {
656 3761 : memset(virial, 0, 9 * sizeof(double));
657 : }
658 :
659 1026714 : for (int level = 0; level < task_list->nlevels; level++) {
660 819788 : const int idx = level * task_list->nblocks;
661 819788 : const int *first_block_task = &task_list->first_level_block_task[idx];
662 819788 : const int *last_block_task = &task_list->last_level_block_task[idx];
663 819788 : const grid_cpu_layout *layout = &task_list->layouts[level];
664 819788 : integrate_one_grid_level(
665 : task_list, first_block_task, last_block_task, compute_tau, natoms,
666 819788 : layout->npts_global, layout->npts_local, layout->shift_local,
667 819788 : layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
668 819788 : grids[level], hab_blocks, forces, virial);
669 : }
670 206926 : }
671 :
672 : // EOF