Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 : #include "offload_mempool.h"
8 : #include "../mpiwrap/cp_mpi.h"
9 : #include "offload_library.h"
10 : #include "offload_runtime.h"
11 :
12 : #include <assert.h>
13 : #include <inttypes.h>
14 : #include <omp.h>
15 : #include <stdbool.h>
16 : #include <stdio.h>
17 : #include <stdlib.h>
18 : #include <string.h>
19 :
20 : #if defined(__parallel)
21 : #include <mpi.h>
22 : #endif
23 :
24 : #if defined(__LIBXSTREAM)
25 : #include <libxstream/libxstream.h>
26 : #include <libxstream/libxstream_opencl.h>
27 : #elif defined(__LIBXS)
28 : #include <libxs/libxs_malloc.h>
29 : #endif
30 :
/* Calls the print callback FN with MSG, the length of MSG in bytes (without
 * the terminating NUL, converted to int), and OUTPUT_UNIT.
 * MSG is expanded twice, so it must not have side effects. */
#define OFFLOAD_MEMPOOL_PRINT(FN, MSG, OUTPUT_UNIT)                            \
  ((FN)((MSG), (int)strlen(MSG), (OUTPUT_UNIT)))
/* Non-zero: use omp_alloc/omp_free when OpenMP 5.0 or newer is available.
 * Zero: allow MPI_Alloc_mem/MPI_Free_mem in __parallel builds.
 * Only consulted when __OFFLOAD is not defined. */
#define OFFLOAD_MEMPOOL_OMPALLOC 1
34 :
35 : #if !defined(__LIBXSTREAM)
/*******************************************************************************
 * \brief Private struct describing one pooled buffer. Chunks are chained into
 *        single-linked lists through their next pointer.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct offload_memchunk {
  void *mem; // must stay first: allows to cast memchunk into mem-ptr...
  struct offload_memchunk *next; // following chunk in the list, NULL at end
  size_t size;                   // number of bytes allocated for mem
  size_t used;                   // size of the most recent request served
} offload_memchunk_t;
45 :
46 : /*******************************************************************************
47 : * \brief Private struct for storing a memory pool.
48 : * \author Ole Schuett
49 : ******************************************************************************/
50 : typedef struct offload_mempool {
51 : offload_memchunk_t *available_head, *allocated_head; // single-linked lists
52 : } offload_mempool_t;
53 :
54 : /*******************************************************************************
55 : * \brief Private pools for host and device memory.
56 : * \author Ole Schuett
57 : ******************************************************************************/
58 : static offload_mempool_t mempool_host = {0};
59 : static offload_mempool_t mempool_device = {0};
60 :
/*******************************************************************************
 * \brief Private counters for statistics, kept separately for host and device.
 * \author Hans Pabst
 ******************************************************************************/
static struct {
  uint64_t mallocs; // number of actual (non-pooled) allocations performed
  uint64_t mempeak; // largest total chunk size seen when clearing the pool
} host_stats = {.mallocs = 0, .mempeak = 0};
static struct {
  uint64_t mallocs; // number of actual (non-pooled) allocations performed
  uint64_t mempeak; // largest total chunk size seen when clearing the pool
} device_stats = {.mallocs = 0, .mempeak = 0};
71 :
72 : /*******************************************************************************
73 : * \brief Private routine for actually allocating system memory.
74 : * \author Ole Schuett
75 : ******************************************************************************/
76 147457 : static void *actual_malloc(const size_t size, const bool on_device) {
77 147457 : if (size == 0) {
78 : return NULL;
79 : }
80 :
81 147457 : void *memory = NULL;
82 : #if defined(__OFFLOAD)
83 : if (on_device) {
84 : offload_activate_chosen_device();
85 : offloadMalloc(&memory, size);
86 : } else {
87 : offload_activate_chosen_device();
88 : offloadMallocHost(&memory, size);
89 : }
90 : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
91 : memory = omp_alloc(size, omp_null_allocator);
92 : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
93 : if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
94 : fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", name, __FILE__,
95 : __LINE__);
96 : MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
97 : }
98 : #else
99 147457 : memory = malloc(size);
100 : #endif
101 :
102 : // Update statistics.
103 147457 : if (on_device) {
104 50141 : #pragma omp atomic
105 : ++device_stats.mallocs;
106 : } else {
107 97316 : #pragma omp atomic
108 : ++host_stats.mallocs;
109 : }
110 :
111 147457 : assert(memory != NULL);
112 : return memory;
113 : }
114 :
/*******************************************************************************
 * \brief Private routine for actually freeing system memory.
 *        Uses the deallocator that matches the compile-time choice made for
 *        allocation (offload runtime, omp_free, MPI_Free_mem, or free).
 * \param memory Pointer to release; NULL is accepted and ignored.
 * \param on_device Selects device memory in __OFFLOAD builds; unused otherwise.
 * \author Ole Schuett
 ******************************************************************************/
static void actual_free(void *memory, const bool on_device) {
  if (NULL == memory) {
    return;
  }

#if defined(__OFFLOAD)
  offload_activate_chosen_device();
  if (on_device) {
    offloadFree(memory);
  } else {
    offloadFreeHost(memory);
  }
#elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
  (void)on_device; // mark used
  omp_free(memory, omp_null_allocator);
#elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
  (void)on_device; // mark used
  if (MPI_SUCCESS != MPI_Free_mem(memory)) {
    // Format takes exactly two arguments: file name and line number.
    fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
            __LINE__);
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
  }
#else
  (void)on_device; // mark used
  free(memory);
#endif
}
147 :
148 : /*******************************************************************************
149 : * \brief Private routine for allocating host or device memory from the pool.
150 : * \author Ole Schuett and Hans Pabst
151 : ******************************************************************************/
152 4124866 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
153 : const bool on_device) {
154 4124866 : if (size == 0) {
155 : return NULL;
156 : }
157 :
158 4071166 : offload_memchunk_t *chunk;
159 :
160 8142332 : #pragma omp critical(offload_mempool_modify)
161 : {
162 : // Find a possible chunk to reuse or reclaim in available list.
163 4071166 : offload_memchunk_t **reuse = NULL,
164 4071166 : **reclaim = NULL; // ** for easy list removal
165 4071166 : offload_memchunk_t **indirect = &pool->available_head;
166 83156103 : while (*indirect != NULL) {
167 80127017 : const size_t s = (*indirect)->size;
168 80127017 : if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
169 6012298 : reuse = indirect; // reuse smallest suitable chunk
170 6012298 : if (s == size) {
171 : break; // perfect match, exit early
172 : }
173 74114719 : } else if (reclaim == NULL || (*reclaim)->size < s) {
174 7530898 : reclaim = indirect; // reclaim largest unsuitable chunk
175 : }
176 79084937 : indirect = &(*indirect)->next;
177 : }
178 :
179 : // Select an existing chunk or allocate a new one.
180 4071166 : if (reuse != NULL) {
181 : // Reusing an exising chunk that's already large enough.
182 3923709 : chunk = *reuse;
183 3923709 : *reuse = chunk->next; // remove chunk from available list.
184 147457 : } else if (reclaim != NULL) {
185 : // Reclaiming an existing chunk (resize will happen outside crit. region).
186 24394 : chunk = *reclaim;
187 24394 : *reclaim = chunk->next; // remove chunk from available list.
188 : } else {
189 : // Found no available chunk, allocate a new one.
190 123063 : chunk = calloc(1, sizeof(offload_memchunk_t));
191 123063 : assert(chunk != NULL);
192 : }
193 : }
194 :
195 : // Resize chunk outside of critical region before adding it to allocated list.
196 4071166 : if (chunk->size < size) {
197 147457 : actual_free(chunk->mem, on_device);
198 147457 : chunk->mem = actual_malloc(size, on_device);
199 147457 : chunk->size = size;
200 : }
201 :
202 4071166 : chunk->used = size; // for statistics
203 :
204 : // Insert chunk into allocated list.
205 4071166 : #pragma omp critical(offload_mempool_modify)
206 : {
207 4071166 : chunk->next = pool->allocated_head;
208 4071166 : pool->allocated_head = chunk;
209 : }
210 :
211 4071166 : return chunk->mem;
212 : }
213 :
214 : /*******************************************************************************
215 : * \brief Private routine for releasing memory back to the pool.
216 : * \author Ole Schuett
217 : ******************************************************************************/
218 4830207 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
219 4830207 : if (mem == NULL) {
220 : return;
221 : }
222 :
223 8142332 : #pragma omp critical(offload_mempool_modify)
224 : {
225 4071166 : offload_memchunk_t **indirect = &pool->allocated_head;
226 15471188 : while (*indirect != NULL && (*indirect)->mem != mem) {
227 11400022 : indirect = &(*indirect)->next;
228 : }
229 4071166 : offload_memchunk_t *chunk = *indirect;
230 4071166 : assert(chunk != NULL && chunk->mem == mem);
231 4071166 : *indirect = chunk->next;
232 4071166 : chunk->next = pool->available_head;
233 4071166 : pool->available_head = chunk;
234 : }
235 : }
236 :
237 : /*******************************************************************************
238 : * \brief Private routine for freeing all memory in the pool.
239 : * \author Ole Schuett and Hans Pabst
240 : ******************************************************************************/
241 20688 : static void internal_mempool_clear(offload_mempool_t *pool,
242 : const bool on_device) {
243 41376 : #pragma omp critical(offload_mempool_modify)
244 : {
245 20688 : assert(pool->allocated_head == NULL);
246 143735 : while (pool->available_head != NULL) {
247 123047 : offload_memchunk_t *chunk = pool->available_head;
248 123047 : pool->available_head = chunk->next;
249 123047 : actual_free(chunk->mem, on_device);
250 123047 : free(chunk);
251 : }
252 : }
253 20688 : }
254 :
255 : /*******************************************************************************
256 : * \brief Private routine for summing alloc sizes of all chunks in given list.
257 : * \author Ole Schuett and Hans Pabst
258 : ******************************************************************************/
259 125056 : static uint64_t sum_chunks_size(const offload_memchunk_t *head, size_t offset) {
260 125056 : uint64_t result = 0;
261 495917 : for (const offload_memchunk_t *chunk = head; chunk != NULL;
262 370861 : chunk = chunk->next) {
263 370861 : result += *(const size_t *)((const char *)chunk + offset);
264 : }
265 125056 : return result;
266 : }
267 : #endif /* !defined(__LIBXSTREAM) */
268 :
269 : /*******************************************************************************
270 : * \brief Internal routine for allocating host memory from the pool.
271 : * \author Ole Schuett
272 : ******************************************************************************/
273 3830407 : void *offload_mempool_host_malloc(const size_t size) {
274 : #if defined(__LIBXSTREAM)
275 : return libxs_malloc(libxstream_opencl_config.pool_hst, size,
276 : LIBXS_MALLOC_AUTO);
277 : #else
278 3830407 : return internal_mempool_malloc(&mempool_host, size, false);
279 : #endif
280 : }
281 :
282 : /*******************************************************************************
283 : * \brief Internal routine for allocating device memory from the pool
284 : * \author Ole Schuett
285 : ******************************************************************************/
286 294459 : void *offload_mempool_device_malloc(const size_t size) {
287 : #if defined(__LIBXSTREAM)
288 : void *memory = NULL;
289 : const int result = libxstream_mem_allocate(&memory, size);
290 : assert(EXIT_SUCCESS == result);
291 : return memory;
292 : #else
293 294459 : return internal_mempool_malloc(&mempool_device, size, true);
294 : #endif
295 : }
296 :
297 : /*******************************************************************************
298 : * \brief Internal routine for releasing memory back to the pool.
299 : * \author Ole Schuett
300 : ******************************************************************************/
301 4535748 : void offload_mempool_host_free(const void *memory) {
302 : #if defined(__LIBXSTREAM)
303 : libxs_free((void *)memory);
304 : #else
305 4535748 : internal_mempool_free(&mempool_host, memory);
306 : #endif
307 4535748 : }
308 :
309 : /*******************************************************************************
310 : * \brief Internal routine for releasing memory back to the pool.
311 : * \author Ole Schuett
312 : ******************************************************************************/
313 294459 : void offload_mempool_device_free(const void *memory) {
314 : #if defined(__LIBXSTREAM)
315 : const int result = libxstream_mem_deallocate((void *)memory);
316 : assert(EXIT_SUCCESS == result);
317 : #else
318 294459 : internal_mempool_free(&mempool_device, memory);
319 : #endif
320 294459 : }
321 :
322 : /*******************************************************************************
323 : * \brief Internal routine for freeing all memory in the pool.
324 : * \author Ole Schuett
325 : ******************************************************************************/
326 10344 : void offload_mempool_clear(void) {
327 : #if defined(__LIBXSTREAM)
328 : (void)0;
329 : #else
330 : {
331 10344 : const uint64_t hsize = sum_chunks_size(mempool_host.available_head,
332 : offsetof(offload_memchunk_t, size)) +
333 10344 : sum_chunks_size(mempool_host.allocated_head,
334 : offsetof(offload_memchunk_t, size));
335 10344 : const uint64_t dsize = sum_chunks_size(mempool_device.available_head,
336 : offsetof(offload_memchunk_t, size)) +
337 10344 : sum_chunks_size(mempool_device.allocated_head,
338 : offsetof(offload_memchunk_t, size));
339 10344 : if (host_stats.mempeak < hsize)
340 8959 : host_stats.mempeak = hsize;
341 10344 : if (device_stats.mempeak < dsize)
342 8941 : device_stats.mempeak = dsize;
343 : }
344 10344 : internal_mempool_clear(&mempool_host, false);
345 10344 : internal_mempool_clear(&mempool_device, true);
346 : #endif
347 10344 : }
348 :
349 : /*******************************************************************************
350 : * \brief Internal routine to query statistics.
351 : * \author Hans Pabst
352 : ******************************************************************************/
353 10460 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) {
354 10460 : assert(NULL != memstats);
355 20920 : #pragma omp critical(offload_mempool_modify)
356 : {
357 : #if defined(__LIBXSTREAM)
358 : if (NULL != libxstream_opencl_config.pool_hst) {
359 : libxs_malloc_pool_info_t info;
360 : libxs_malloc_pool_info(libxstream_opencl_config.pool_hst, &info);
361 : memstats->host_mallocs = info.nmallocs;
362 : memstats->host_used = info.used;
363 : memstats->host_size = info.size;
364 : memstats->host_peak = info.peak;
365 : } else {
366 : memstats->host_mallocs = 0;
367 : memstats->host_used = 0;
368 : memstats->host_size = 0;
369 : memstats->host_peak = 0;
370 : }
371 : if (NULL != libxstream_opencl_config.pool_dev) {
372 : libxs_malloc_pool_info_t info;
373 : libxs_malloc_pool_info(libxstream_opencl_config.pool_dev, &info);
374 : memstats->device_mallocs = info.nmallocs;
375 : memstats->device_used = info.used;
376 : memstats->device_size = info.size;
377 : memstats->device_peak = info.peak;
378 : } else {
379 : memstats->device_mallocs = 0;
380 : memstats->device_used = 0;
381 : memstats->device_size = 0;
382 : memstats->device_peak = 0;
383 : }
384 : #else
385 10460 : memstats->host_mallocs = host_stats.mallocs;
386 10460 : memstats->host_used = sum_chunks_size(mempool_host.available_head,
387 10460 : offsetof(offload_memchunk_t, used)) +
388 10460 : sum_chunks_size(mempool_host.allocated_head,
389 : offsetof(offload_memchunk_t, used));
390 10460 : memstats->host_size = sum_chunks_size(mempool_host.available_head,
391 10460 : offsetof(offload_memchunk_t, size)) +
392 10460 : sum_chunks_size(mempool_host.allocated_head,
393 : offsetof(offload_memchunk_t, size));
394 10460 : memstats->host_peak = memstats->host_size < host_stats.mempeak
395 : ? host_stats.mempeak
396 10460 : : memstats->host_size;
397 10460 : memstats->device_mallocs = device_stats.mallocs;
398 20920 : memstats->device_used =
399 10460 : sum_chunks_size(mempool_device.available_head,
400 10460 : offsetof(offload_memchunk_t, used)) +
401 10460 : sum_chunks_size(mempool_device.allocated_head,
402 : offsetof(offload_memchunk_t, used));
403 20920 : memstats->device_size =
404 10460 : sum_chunks_size(mempool_device.available_head,
405 10460 : offsetof(offload_memchunk_t, size)) +
406 10460 : sum_chunks_size(mempool_device.allocated_head,
407 : offsetof(offload_memchunk_t, size));
408 10460 : memstats->device_peak = memstats->device_size < device_stats.mempeak
409 : ? device_stats.mempeak
410 10460 : : memstats->device_size;
411 : #endif
412 : }
413 10460 : }
414 :
415 : /*******************************************************************************
416 : * \brief Print allocation statistics..
417 : * \author Hans Pabst
418 : ******************************************************************************/
419 10460 : void offload_mempool_stats_print(int fortran_comm,
420 : void (*print_func)(const char *, int, int),
421 : int output_unit) {
422 10460 : assert(omp_get_num_threads() == 1);
423 :
424 10460 : char buffer[100];
425 10460 : const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
426 10460 : offload_mempool_stats_t memstats;
427 10460 : offload_mempool_stats_get(&memstats);
428 10460 : cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm);
429 10460 : cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
430 :
431 10460 : if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
432 9082 : OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
433 9082 : OFFLOAD_MEMPOOL_PRINT(
434 : print_func,
435 : " ----------------------------------------------------------------"
436 : "---------------\n",
437 : output_unit);
438 9082 : OFFLOAD_MEMPOOL_PRINT(
439 : print_func,
440 : " - "
441 : " -\n",
442 : output_unit);
443 :
444 9082 : OFFLOAD_MEMPOOL_PRINT(
445 : print_func,
446 : " - OFFLOAD MEMPOOL STATISTICS "
447 : " -\n",
448 : output_unit);
449 9082 : OFFLOAD_MEMPOOL_PRINT(
450 : print_func,
451 : " - "
452 : " -\n",
453 : output_unit);
454 9082 : OFFLOAD_MEMPOOL_PRINT(
455 : print_func,
456 : " ----------------------------------------------------------------"
457 : "---------------\n",
458 : output_unit);
459 9082 : OFFLOAD_MEMPOOL_PRINT(print_func,
460 : " Memory consumption "
461 : " Number of allocations Used [MiB] Size [MiB]\n",
462 : output_unit);
463 : }
464 10460 : if (0 < memstats.device_mallocs) {
465 9068 : cp_mpi_max_uint64(&memstats.device_peak, 1, comm);
466 9068 : snprintf(buffer, sizeof(buffer),
467 : " Device "
468 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
469 9068 : (uintptr_t)memstats.device_mallocs,
470 9068 : (uintptr_t)((memstats.device_used + (512U << 10)) >> 20),
471 9068 : (uintptr_t)((memstats.device_peak + (512U << 10)) >> 20));
472 9068 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
473 : }
474 10460 : if (0 < memstats.host_mallocs) {
475 9082 : cp_mpi_max_uint64(&memstats.host_peak, 1, comm);
476 9082 : snprintf(buffer, sizeof(buffer),
477 : " Host "
478 : " %20" PRIuPTR " %10" PRIuPTR " %10" PRIuPTR "\n",
479 9082 : (uintptr_t)memstats.host_mallocs,
480 9082 : (uintptr_t)((memstats.host_used + (512U << 10)) >> 20),
481 9082 : (uintptr_t)((memstats.host_peak + (512U << 10)) >> 20));
482 9082 : OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
483 : }
484 10460 : if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
485 9082 : OFFLOAD_MEMPOOL_PRINT(
486 : print_func,
487 : " ----------------------------------------------------------------"
488 : "---------------\n",
489 : output_unit);
490 : }
491 10460 : }
492 :
493 : // EOF
|