LCOV - code coverage report
Current view: top level - src/offload - offload_mempool.c (source / functions) Coverage Total Hit
Test: CP2K Regtests (git:561f475) Lines: 100.0 % 149 149
Test Date: 2026-06-21 06:48:54 Functions: 100.0 % 13 13

            Line data    Source code
       1              : /*----------------------------------------------------------------------------*/
       2              : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3              : /*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */
       4              : /*                                                                            */
       5              : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6              : /*----------------------------------------------------------------------------*/
       7              : #include "offload_mempool.h"
       8              : #include "../mpiwrap/cp_mpi.h"
       9              : #include "offload_library.h"
      10              : #include "offload_runtime.h"
      11              : 
      12              : #include <assert.h>
      13              : #include <inttypes.h>
      14              : #include <omp.h>
      15              : #include <stdbool.h>
      16              : #include <stdio.h>
      17              : #include <stdlib.h>
      18              : #include <string.h>
      19              : 
      20              : #if defined(__parallel)
      21              : #include <mpi.h>
      22              : #endif
      23              : 
      24              : #if defined(__LIBXSTREAM)
      25              : #include <libxstream/libxstream.h>
      26              : #include <libxstream/libxstream_opencl.h>
      27              : #elif defined(__LIBXS)
      28              : #include <libxs/libxs_malloc.h>
      29              : #endif
      30              : 
      31              : #define OFFLOAD_MEMPOOL_PRINT(FN, MSG, OUTPUT_UNIT)                            \
      32              :   ((FN)(MSG, (int)strlen(MSG), OUTPUT_UNIT)) // calls FN(text, strlen(text) as int, unit); MSG is expanded twice
      33              : #define OFFLOAD_MEMPOOL_OMPALLOC 1 // nonzero: the #elif branches in actual_malloc/actual_free may select omp_alloc/omp_free
      34              : 
      35              : #if !defined(__LIBXSTREAM)
      36              : /*******************************************************************************
      37              :  * \brief Private struct for storing a chunk of memory (one node of a pool list).
      38              :  * \author Ole Schuett
      39              :  ******************************************************************************/
      40              : typedef struct offload_memchunk {
      41              :   void *mem; // first: allows to cast memchunk into mem-ptr...
      42              :   struct offload_memchunk *next; // next chunk in the same singly linked list
      43              :   size_t size, used; // size: bytes allocated for mem; used: bytes of the latest request served
      44              : } offload_memchunk_t;
      45              : 
      46              : /*******************************************************************************
      47              :  * \brief Private struct for storing a memory pool as two lists of chunks.
      48              :  * \author Ole Schuett
      49              :  ******************************************************************************/
      50              : typedef struct offload_mempool {
      51              :   offload_memchunk_t *available_head, *allocated_head; // single-linked lists: chunks free for reuse / chunks handed out
      52              : } offload_mempool_t;
      53              : 
      54              : /*******************************************************************************
      55              :  * \brief Private pools for host and device memory; both start out empty.
      56              :  * \author Ole Schuett
      57              :  ******************************************************************************/
      58              : static offload_mempool_t mempool_host = {0};
      59              : static offload_mempool_t mempool_device = {0};
      60              : 
      61              : /*******************************************************************************
      62              :  * \brief Private counters for statistics, kept separately for host and device.
      63              :  * \author Hans Pabst
      64              :  ******************************************************************************/
      65              : static struct {
      66              :   uint64_t mallocs, mempeak; // mallocs: incremented in actual_malloc; mempeak: raised in offload_mempool_clear
      67              : } host_stats = {0, 0};
      68              : static struct {
      69              :   uint64_t mallocs, mempeak; // same meaning as in host_stats, for device memory
      70              : } device_stats = {0, 0};
      71              : 
      72              : /*******************************************************************************
      73              :  * \brief Private routine for actually allocating system memory. Returns NULL when size is 0; each other call is counted in host_stats or device_stats.
      74              :  * \author Ole Schuett
      75              :  ******************************************************************************/
      76       147457 : static void *actual_malloc(const size_t size, const bool on_device) {
      77       147457 :   if (size == 0) {
      78              :     return NULL; // nothing to allocate, and not counted in the statistics
      79              :   }
      80              : 
      81       147457 :   void *memory = NULL;
      82              : #if defined(__OFFLOAD)
      83              :   if (on_device) {
      84              :     offload_activate_chosen_device();
      85              :     offloadMalloc(&memory, size);
      86              :   } else {
      87              :     offload_activate_chosen_device();
      88              :     offloadMallocHost(&memory, size);
      89              :   }
      90              : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
      91              :   memory = omp_alloc(size, omp_null_allocator);
      92              : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
      93              :   if (MPI_SUCCESS != MPI_Alloc_mem((MPI_Aint)size, MPI_INFO_NULL, &memory)) {
      94              :     fprintf(stderr, "ERROR: MPI_Alloc_mem failed at %s:%i\n", __FILE__,
      95              :             __LINE__); // exactly one argument per conversion in "%s:%i"
      96              :     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      97              :   }
      98              : #else
      99       147457 :   memory = malloc(size);
     100              : #endif
     101              : 
     102              :   // Update statistics.
     103       147457 :   if (on_device) {
     104        50141 : #pragma omp atomic
     105              :     ++device_stats.mallocs;
     106              :   } else {
     107        97316 : #pragma omp atomic
     108              :     ++host_stats.mallocs;
     109              :   }
     110              : 
     111       147457 :   assert(memory != NULL); // only checked when assertions are enabled
     112              :   return memory;
     113              : }
     114              : 
     115              : /*******************************************************************************
     116              :  * \brief Private routine for actually freeing system memory obtained from actual_malloc; a NULL pointer is ignored.
     117              :  * \author Ole Schuett
     118              :  ******************************************************************************/
     119       270504 : static void actual_free(void *memory, const bool on_device) {
     120       270504 :   if (NULL == memory) {
     121              :     return; // nothing to release
     122              :   }
     123              : 
     124              : #if defined(__OFFLOAD)
     125              :   if (on_device) {
     126              :     offload_activate_chosen_device();
     127              :     offloadFree(memory);
     128              :   } else {
     129              :     offload_activate_chosen_device();
     130              :     offloadFreeHost(memory);
     131              :   }
     132              : #elif OFFLOAD_MEMPOOL_OMPALLOC && (201811 /*v5.0*/ <= _OPENMP)
     133              :   (void)on_device; // mark used
     134              :   omp_free(memory, omp_null_allocator);
     135              : #elif defined(__parallel) && !OFFLOAD_MEMPOOL_OMPALLOC
     136              :   (void)on_device; // mark used
     137              :   if (MPI_SUCCESS != MPI_Free_mem(memory)) {
     138              :     fprintf(stderr, "ERROR: MPI_Free_mem failed at %s:%i\n", __FILE__,
     139              :             __LINE__); // exactly one argument per conversion in "%s:%i"
     140              :     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
     141              :   }
     142              : #else
     143       147441 :   (void)on_device; // mark used
     144       147441 :   free(memory);
     145              : #endif
     146              : }
     147              : 
     148              : /*******************************************************************************
     149              :  * \brief Private routine for allocating host or device memory from the pool. Returns NULL for size 0; otherwise reuses the smallest fitting available chunk, else regrows the largest available one, else creates a new chunk.
     150              :  * \author Ole Schuett and Hans Pabst
     151              :  ******************************************************************************/
     152      4124866 : static void *internal_mempool_malloc(offload_mempool_t *pool, const size_t size,
     153              :                                      const bool on_device) {
     154      4124866 :   if (size == 0) {
     155              :     return NULL;
     156              :   }
     157              : 
     158      4071166 :   offload_memchunk_t *chunk;
     159              : 
     160      8142332 : #pragma omp critical(offload_mempool_modify)
     161              :   {
     162              :     // Find a possible chunk to reuse or reclaim in available list.
     163      4071166 :     offload_memchunk_t **reuse = NULL,
     164      4071166 :                        **reclaim = NULL; // ** for easy list removal
     165      4071166 :     offload_memchunk_t **indirect = &pool->available_head;
     166     83156103 :     while (*indirect != NULL) {
     167     80127017 :       const size_t s = (*indirect)->size;
     168     80127017 :       if (size <= s && (reuse == NULL || s < (*reuse)->size)) {
     169      6012298 :         reuse = indirect; // reuse smallest suitable chunk
     170      6012298 :         if (s == size) {
     171              :           break; // perfect match, exit early
     172              :         }
     173     74114719 :       } else if (reclaim == NULL || (*reclaim)->size < s) {
     174      7530898 :         reclaim = indirect; // reclaim largest unsuitable chunk
     175              :       }
     176     79084937 :       indirect = &(*indirect)->next;
     177              :     }
     178              : 
     179              :     // Select an existing chunk or allocate a new one.
     180      4071166 :     if (reuse != NULL) {
     181              :       // Reusing an existing chunk that's already large enough.
     182      3923709 :       chunk = *reuse;
     183      3923709 :       *reuse = chunk->next; // remove chunk from available list.
     184       147457 :     } else if (reclaim != NULL) {
     185              :       // Reclaiming an existing chunk (resize will happen outside crit. region).
     186        24394 :       chunk = *reclaim;
     187        24394 :       *reclaim = chunk->next; // remove chunk from available list.
     188              :     } else {
     189              :       // Found no available chunk, allocate a new one.
     190       123063 :       chunk = calloc(1, sizeof(offload_memchunk_t)); // zeroed: mem == NULL and size == 0
     191       123063 :       assert(chunk != NULL);
     192              :     }
     193              :   }
     194              : 
     195              :   // Resize chunk outside of critical region before adding it to allocated list.
     196      4071166 :   if (chunk->size < size) { // true for reclaimed (too small) and brand-new (size 0) chunks
     197       147457 :     actual_free(chunk->mem, on_device); // old contents are dropped, not copied
     198       147457 :     chunk->mem = actual_malloc(size, on_device);
     199       147457 :     chunk->size = size;
     200              :   }
     201              : 
     202      4071166 :   chunk->used = size; // for statistics
     203              : 
     204              :   // Insert chunk into allocated list.
     205      4071166 : #pragma omp critical(offload_mempool_modify)
     206              :   {
     207      4071166 :     chunk->next = pool->allocated_head;
     208      4071166 :     pool->allocated_head = chunk;
     209              :   }
     210              : 
     211      4071166 :   return chunk->mem;
     212              : }
     213              : 
     214              : /*******************************************************************************
     215              :  * \brief Private routine for releasing memory back to the pool: moves the chunk owning mem from the allocated list to the available list. NULL is ignored.
     216              :  * \author Ole Schuett
     217              :  ******************************************************************************/
     218      4830207 : static void internal_mempool_free(offload_mempool_t *pool, const void *mem) {
     219      4830207 :   if (mem == NULL) {
     220              :     return;
     221              :   }
     222              : 
     223      8142332 : #pragma omp critical(offload_mempool_modify)
     224              :   {
     225      4071166 :     offload_memchunk_t **indirect = &pool->allocated_head;
     226     15471188 :     while (*indirect != NULL && (*indirect)->mem != mem) { // linear search by pointer value
     227     11400022 :       indirect = &(*indirect)->next;
     228              :     }
     229      4071166 :     offload_memchunk_t *chunk = *indirect;
     230      4071166 :     assert(chunk != NULL && chunk->mem == mem); // mem must have come from this pool
     231      4071166 :     *indirect = chunk->next; // unlink from allocated list
     232      4071166 :     chunk->next = pool->available_head; // push onto front of available list
     233      4071166 :     pool->available_head = chunk;
     234              :   }
     235              : }
     236              : 
     237              : /*******************************************************************************
     238              :  * \brief Private routine for freeing all memory in the pool: releases every chunk of the available list and asserts that no chunk is still allocated.
     239              :  * \author Ole Schuett and Hans Pabst
     240              :  ******************************************************************************/
     241        20688 : static void internal_mempool_clear(offload_mempool_t *pool,
     242              :                                    const bool on_device) {
     243        41376 : #pragma omp critical(offload_mempool_modify)
     244              :   {
     245        20688 :     assert(pool->allocated_head == NULL);
     246       143735 :     while (pool->available_head != NULL) {
     247       123047 :       offload_memchunk_t *chunk = pool->available_head;
     248       123047 :       pool->available_head = chunk->next; // pop front before freeing the node
     249       123047 :       actual_free(chunk->mem, on_device); // the chunk's memory
     250       123047 :       free(chunk); // the chunk descriptor itself (calloc'ed in internal_mempool_malloc)
     251              :     }
     252              :   }
     253        20688 : }
     254              : 
     255              : /*******************************************************************************
     256              :  * \brief Private routine for summing alloc sizes of all chunks in given list; offset is the byte offset of the size_t member to add up (callers pass offsetof of size or used).
     257              :  * \author Ole Schuett and Hans Pabst
     258              :  ******************************************************************************/
     259       125056 : static uint64_t sum_chunks_size(const offload_memchunk_t *head, size_t offset) {
     260       125056 :   uint64_t result = 0;
     261       495917 :   for (const offload_memchunk_t *chunk = head; chunk != NULL;
     262       370861 :        chunk = chunk->next) {
     263       370861 :     result += *(const size_t *)((const char *)chunk + offset); // read the size_t found offset bytes into the chunk
     264              :   }
     265       125056 :   return result; // 0 for an empty list
     266              : }
     267              : #endif /* !defined(__LIBXSTREAM) */
     268              : 
     269              : /*******************************************************************************
     270              :  * \brief Internal routine for allocating host memory from the pool (libxs pool under __LIBXSTREAM, mempool_host otherwise).
     271              :  * \author Ole Schuett
     272              :  ******************************************************************************/
     273      3830407 : void *offload_mempool_host_malloc(const size_t size) {
     274              : #if defined(__LIBXSTREAM)
     275              :   return libxs_malloc(libxstream_opencl_config.pool_hst, size,
     276              :                       LIBXS_MALLOC_AUTO);
     277              : #else
     278      3830407 :   return internal_mempool_malloc(&mempool_host, size, false); // false: host memory
     279              : #endif
     280              : }
     281              : 
     282              : /*******************************************************************************
     283              :  * \brief Internal routine for allocating device memory from the pool (libxstream_mem_allocate under __LIBXSTREAM, mempool_device otherwise).
     284              :  * \author Ole Schuett
     285              :  ******************************************************************************/
     286       294459 : void *offload_mempool_device_malloc(const size_t size) {
     287              : #if defined(__LIBXSTREAM)
     288              :   void *memory = NULL;
     289              :   const int result = libxstream_mem_allocate(&memory, size);
     290              :   assert(EXIT_SUCCESS == result); // result is only inspected by this assertion
     291              :   return memory;
     292              : #else
     293       294459 :   return internal_mempool_malloc(&mempool_device, size, true); // true: device memory
     294              : #endif
     295              : }
     296              : 
     297              : /*******************************************************************************
     298              :  * \brief Internal routine for releasing host memory back to the pool.
     299              :  * \author Ole Schuett
     300              :  ******************************************************************************/
     301      4535748 : void offload_mempool_host_free(const void *memory) {
     302              : #if defined(__LIBXSTREAM)
     303              :   libxs_free((void *)memory); // cast drops const for the libxs call
     304              : #else
     305      4535748 :   internal_mempool_free(&mempool_host, memory); // NULL is ignored there
     306              : #endif
     307      4535748 : }
     308              : 
     309              : /*******************************************************************************
     310              :  * \brief Internal routine for releasing device memory back to the pool.
     311              :  * \author Ole Schuett
     312              :  ******************************************************************************/
     313       294459 : void offload_mempool_device_free(const void *memory) {
     314              : #if defined(__LIBXSTREAM)
     315              :   const int result = libxstream_mem_deallocate((void *)memory);
     316              :   assert(EXIT_SUCCESS == result); // result is only inspected by this assertion
     317              : #else
     318       294459 :   internal_mempool_free(&mempool_device, memory); // NULL is ignored there
     319              : #endif
     320       294459 : }
     321              : 
     322              : /*******************************************************************************
     323              :  * \brief Internal routine for freeing all memory in the pool; records the peak total chunk size of both pools first. Does nothing under __LIBXSTREAM.
     324              :  * \author Ole Schuett
     325              :  ******************************************************************************/
     326        10344 : void offload_mempool_clear(void) {
     327              : #if defined(__LIBXSTREAM)
     328              :   (void)0;
     329              : #else
     330              :   { // NOTE(review): the lists are read here without the offload_mempool_modify critical section - confirm callers are single-threaded
     331        10344 :     const uint64_t hsize = sum_chunks_size(mempool_host.available_head,
     332              :                                            offsetof(offload_memchunk_t, size)) +
     333        10344 :                            sum_chunks_size(mempool_host.allocated_head,
     334              :                                            offsetof(offload_memchunk_t, size));
     335        10344 :     const uint64_t dsize = sum_chunks_size(mempool_device.available_head,
     336              :                                            offsetof(offload_memchunk_t, size)) +
     337        10344 :                            sum_chunks_size(mempool_device.allocated_head,
     338              :                                            offsetof(offload_memchunk_t, size));
     339        10344 :     if (host_stats.mempeak < hsize)
     340         8959 :       host_stats.mempeak = hsize; // remember the largest host total seen so far
     341        10344 :     if (device_stats.mempeak < dsize)
     342         8941 :       device_stats.mempeak = dsize; // remember the largest device total seen so far
     343              :   }
     344        10344 :   internal_mempool_clear(&mempool_host, false);
     345        10344 :   internal_mempool_clear(&mempool_device, true);
     346              : #endif
     347        10344 : }
     348              : 
     349              : /*******************************************************************************
     350              :  * \brief Internal routine to query statistics: fills all host_* and device_* fields of memstats (must not be NULL).
     351              :  * \author Hans Pabst
     352              :  ******************************************************************************/
     353        10460 : void offload_mempool_stats_get(offload_mempool_stats_t *memstats) {
     354        10460 :   assert(NULL != memstats);
     355        20920 : #pragma omp critical(offload_mempool_modify)
     356              :   {
     357              : #if defined(__LIBXSTREAM)
     358              :     if (NULL != libxstream_opencl_config.pool_hst) {
     359              :       libxs_malloc_pool_info_t info;
     360              :       libxs_malloc_pool_info(libxstream_opencl_config.pool_hst, &info);
     361              :       memstats->host_mallocs = info.nmallocs;
     362              :       memstats->host_used = info.used;
     363              :       memstats->host_size = info.size;
     364              :       memstats->host_peak = info.peak;
     365              :     } else { // no host pool: report zeros
     366              :       memstats->host_mallocs = 0;
     367              :       memstats->host_used = 0;
     368              :       memstats->host_size = 0;
     369              :       memstats->host_peak = 0;
     370              :     }
     371              :     if (NULL != libxstream_opencl_config.pool_dev) {
     372              :       libxs_malloc_pool_info_t info;
     373              :       libxs_malloc_pool_info(libxstream_opencl_config.pool_dev, &info);
     374              :       memstats->device_mallocs = info.nmallocs;
     375              :       memstats->device_used = info.used;
     376              :       memstats->device_size = info.size;
     377              :       memstats->device_peak = info.peak;
     378              :     } else { // no device pool: report zeros
     379              :       memstats->device_mallocs = 0;
     380              :       memstats->device_used = 0;
     381              :       memstats->device_size = 0;
     382              :       memstats->device_peak = 0;
     383              :     }
     384              : #else
     385        10460 :     memstats->host_mallocs = host_stats.mallocs;
     386        10460 :     memstats->host_used = sum_chunks_size(mempool_host.available_head,
     387        10460 :                                           offsetof(offload_memchunk_t, used)) +
     388        10460 :                           sum_chunks_size(mempool_host.allocated_head,
     389              :                                           offsetof(offload_memchunk_t, used)); // sums 'used' over both lists
     390        10460 :     memstats->host_size = sum_chunks_size(mempool_host.available_head,
     391        10460 :                                           offsetof(offload_memchunk_t, size)) +
     392        10460 :                           sum_chunks_size(mempool_host.allocated_head,
     393              :                                           offsetof(offload_memchunk_t, size)); // sums 'size' over both lists
     394        10460 :     memstats->host_peak = memstats->host_size < host_stats.mempeak
     395              :                               ? host_stats.mempeak
     396        10460 :                               : memstats->host_size; // max(current size, recorded peak)
     397        10460 :     memstats->device_mallocs = device_stats.mallocs;
     398        20920 :     memstats->device_used =
     399        10460 :         sum_chunks_size(mempool_device.available_head,
     400        10460 :                         offsetof(offload_memchunk_t, used)) +
     401        10460 :         sum_chunks_size(mempool_device.allocated_head,
     402              :                         offsetof(offload_memchunk_t, used));
     403        20920 :     memstats->device_size =
     404        10460 :         sum_chunks_size(mempool_device.available_head,
     405        10460 :                         offsetof(offload_memchunk_t, size)) +
     406        10460 :         sum_chunks_size(mempool_device.allocated_head,
     407              :                         offsetof(offload_memchunk_t, size));
     408        10460 :     memstats->device_peak = memstats->device_size < device_stats.mempeak
     409              :                                 ? device_stats.mempeak
     410        10460 :                                 : memstats->device_size; // max(current size, recorded peak)
     411              : #endif
     412              :   }
     413        10460 : }
     414              : 
     415              : /*******************************************************************************
     416              :  * \brief Print allocation statistics through print_func(text, length, output_unit); prints nothing when both malloc counters are zero.
     417              :  * \author Hans Pabst
     418              :  ******************************************************************************/
     419        10460 : void offload_mempool_stats_print(int fortran_comm,
     420              :                                  void (*print_func)(const char *, int, int),
     421              :                                  int output_unit) {
     422        10460 :   assert(omp_get_num_threads() == 1); // expects a team of exactly one thread
     423              : 
     424        10460 :   char buffer[100];
     425        10460 :   const cp_mpi_comm_t comm = cp_mpi_comm_f2c(fortran_comm);
     426        10460 :   offload_mempool_stats_t memstats;
     427        10460 :   offload_mempool_stats_get(&memstats);
     428        10460 :   cp_mpi_max_uint64(&memstats.device_mallocs, 1, comm); // presumably an in-place max over comm - TODO confirm in cp_mpi.h
     429        10460 :   cp_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
     430              : 
     431        10460 :   if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
     432         9082 :     OFFLOAD_MEMPOOL_PRINT(print_func, "\n", output_unit);
     433         9082 :     OFFLOAD_MEMPOOL_PRINT(
     434              :         print_func,
     435              :         " ----------------------------------------------------------------"
     436              :         "---------------\n",
     437              :         output_unit);
     438         9082 :     OFFLOAD_MEMPOOL_PRINT(
     439              :         print_func,
     440              :         " -                                                               "
     441              :         "              -\n",
     442              :         output_unit);
     443              : 
     444         9082 :     OFFLOAD_MEMPOOL_PRINT(
     445              :         print_func,
     446              :         " -                          OFFLOAD MEMPOOL STATISTICS           "
     447              :         "              -\n",
     448              :         output_unit);
     449         9082 :     OFFLOAD_MEMPOOL_PRINT(
     450              :         print_func,
     451              :         " -                                                               "
     452              :         "              -\n",
     453              :         output_unit);
     454         9082 :     OFFLOAD_MEMPOOL_PRINT(
     455              :         print_func,
     456              :         " ----------------------------------------------------------------"
     457              :         "---------------\n",
     458              :         output_unit);
     459         9082 :     OFFLOAD_MEMPOOL_PRINT(print_func,
     460              :                           " Memory consumption               "
     461              :                           " Number of allocations  Used [MiB]  Size [MiB]\n",
     462              :                           output_unit);
     463              :   }
     464        10460 :   if (0 < memstats.device_mallocs) {
     465         9068 :     cp_mpi_max_uint64(&memstats.device_peak, 1, comm);
     466         9068 :     snprintf(buffer, sizeof(buffer),
     467              :              " Device                            "
     468              :              " %20" PRIuPTR "  %10" PRIuPTR "  %10" PRIuPTR "\n",
     469         9068 :              (uintptr_t)memstats.device_mallocs,
     470         9068 :              (uintptr_t)((memstats.device_used + (512U << 10)) >> 20), // + half a MiB, then >> 20: rounds bytes to nearest MiB
     471         9068 :              (uintptr_t)((memstats.device_peak + (512U << 10)) >> 20)); // the "Size [MiB]" column shows the peak value
     472         9068 :     OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
     473              :   }
     474        10460 :   if (0 < memstats.host_mallocs) {
     475         9082 :     cp_mpi_max_uint64(&memstats.host_peak, 1, comm);
     476         9082 :     snprintf(buffer, sizeof(buffer),
     477              :              " Host                              "
     478              :              " %20" PRIuPTR "  %10" PRIuPTR "  %10" PRIuPTR "\n",
     479         9082 :              (uintptr_t)memstats.host_mallocs,
     480         9082 :              (uintptr_t)((memstats.host_used + (512U << 10)) >> 20), // + half a MiB, then >> 20: rounds bytes to nearest MiB
     481         9082 :              (uintptr_t)((memstats.host_peak + (512U << 10)) >> 20)); // the "Size [MiB]" column shows the peak value
     482         9082 :     OFFLOAD_MEMPOOL_PRINT(print_func, buffer, output_unit);
     483              :   }
     484        10460 :   if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
     485         9082 :     OFFLOAD_MEMPOOL_PRINT(
     486              :         print_func,
     487              :         " ----------------------------------------------------------------"
     488              :         "---------------\n",
     489              :         output_unit);
     490              :   }
     491        10460 : }
     492              : 
     493              : // EOF
        

Generated by: LCOV version 2.0-1