2 * University of Illinois Open Source License
 
    3 * Copyright 2024 Luthey-Schulten Group,
 
    6 * CUDA kernels for particle size-aware RDME operations
 
    9#ifndef LM_CUDA_PARTICLE_SIZE_KERNELS_CUH
 
   10#define LM_CUDA_PARTICLE_SIZE_KERNELS_CUH
 
   13#include "core/Types.h"
 
   14#include "cuda/constant.cuh"
 
   16// Ensure particle_t is defined for CUDA compilation
 
   18typedef uint32_t particle_t;
 
   22typedef uint32_t lattice_size_t;
 
   25// Maximum number of particle types supported in constant memory
 
   26#define MAX_PARTICLE_TYPES_CONSTANT 256
 
   28// Constant memory for particle sizes (accessible from all kernels) - must be global
 
   29extern __constant__ uint32_t particle_sizes_constant[MAX_PARTICLE_TYPES_CONSTANT];
 
   35 * @brief Device function to get particle size from constant memory
 
   36 * @param particleType Particle type ID
 
   37 * @return Size of the particle (default: 1)
 
   39__device__ __forceinline__ uint32_t getParticleSize(particle_t particleType) {
 
   40    return (particleType < MAX_PARTICLE_TYPES_CONSTANT) ?
 
   41           particle_sizes_constant[particleType] : 1;
 
   45 * @brief Template device function to calculate total size occupancy at a site
 
   46 * @param particles Array of particles at the site
 
   47 * @param count Number of particles
 
   48 * @return Total size of all particles
 
   50template<typename ParticleType>
 
   51__device__ uint32_t calculateSiteOccupancy(const ParticleType* particles, uint32_t count) {
 
   52    uint32_t totalSize = 0;
 
   53    for (uint32_t i = 0; i < count; ++i) {
 
   54        totalSize += getParticleSize(particles[i]);
 
   59__device__ void validateKernelParams(uint32_t latticeXSize, uint32_t latticeYSize, 
 
   60    uint32_t latticeZSize, uint32_t particlesPerSite);
 
   64 * @brief Memory-safe site occupancy calculation with overflow protection
 
   65 * @param particles Array of particles at the site
 
   66 * @param count Number of particles
 
   67 * @param maxCount Maximum safe count to prevent buffer overflow
 
   68 * @return Total size of all particles, UINT32_MAX if overflow detected
 
   70template<typename ParticleType>
 
   71__device__ uint32_t calculateSiteOccupancySafe(const ParticleType* particles, uint32_t count, uint32_t maxCount);
 
   74 * @brief Template device function to check if adding a particle would exceed capacity
 
   75 * @param particles Current particles at site
 
   76 * @param count Current particle count
 
   77 * @param newParticle Particle to add
 
   78 * @param maxCapacity Maximum site capacity
 
   79 * @return True if particle can be added
 
   81template<typename ParticleType>
 
   82__device__ bool canAddParticleDevice(const ParticleType* particles, uint32_t count,
 
   83                                    particle_t newParticle, uint32_t maxCapacity) {
 
   84    uint32_t currentOccupancy = calculateSiteOccupancy(particles, count);
 
   85    uint32_t newParticleSize = getParticleSize(newParticle);
 
   86    return (currentOccupancy + newParticleSize) <= maxCapacity;
 
   89// Explicit instantiations for common types
 
   90__device__ uint32_t calculateSiteOccupancy(const uint8_t* particles, uint32_t count);
 
   91__device__ uint32_t calculateSiteOccupancy(const uint32_t* particles, uint32_t count);
 
   92__device__ bool canAddParticleDevice(const uint8_t* particles, uint32_t count,
 
   93                                    particle_t newParticle, uint32_t maxCapacity);
 
   94__device__ bool canAddParticleDevice(const uint32_t* particles, uint32_t count,
 
   95                                    particle_t newParticle, uint32_t maxCapacity);
 
   98 * @brief Kernel to validate size constraints across entire lattice
 
   99 * @param lattice Particle lattice data
 
  100 * @param siteLattice Site type lattice
 
  101 * @param violationFlags Output array for violation flags
 
  102 * @param latticeSize Total lattice size
 
  103 * @param particlesPerSite Maximum particles per site
 
  105__global__ void validateSizeConstraintsKernel(
 
  106    const uint8_t* lattice,
 
  107    const uint8_t* siteLattice,
 
  108    uint32_t* violationFlags,
 
  109    uint32_t latticeSize,
 
  110    uint32_t particlesPerSite
 
  114 * @brief Kernel to calculate occupancy statistics
 
  115 * @param lattice Particle lattice data
 
  116 * @param occupancyCounts Output array for occupancy histogram
 
  117 * @param latticeSize Total lattice size
 
  118 * @param particlesPerSite Maximum particles per site
 
  119 * @param maxOccupancy Maximum occupancy value to track
 
  121__global__ void calculateOccupancyStatsKernel(
 
  122    const uint8_t* lattice,
 
  123    uint32_t* occupancyCounts,
 
  124    uint32_t latticeSize,
 
  125    uint32_t particlesPerSite,
 
  126    uint32_t maxOccupancy
 
  131 * @brief Host function to copy particle sizes to constant memory
 
  132 * @param particleSizes Host array of particle sizes
 
  133 * @param numTypes Number of particle types
 
  134 * @return cudaError_t error code
 
  136cudaError_t copyParticleSizesToConstantMemory(const uint32_t* particleSizes, size_t numTypes);
 
  139 * @brief Host function to validate lattice size constraints (ByteLattice)
 
  140 * @param lattice Device pointer to lattice data
 
  141 * @param siteLattice Device pointer to site data
 
  142 * @param latticeSize Total lattice size
 
  143 * @param particlesPerSite Maximum particles per site
 
  144 * @param stream CUDA stream for async execution
 
  145 * @return Number of sites violating constraints
 
  147uint32_t validateLatticeConstraints(const uint8_t* lattice, const uint8_t* siteLattice,
 
  148                                   uint32_t latticeSize, uint32_t particlesPerSite,
 
  149                                   cudaStream_t stream = 0);
 
  152 * @brief Host function to validate lattice size constraints (IntLattice)
 
  153 * @param lattice Device pointer to lattice data
 
  154 * @param siteLattice Device pointer to site data
 
  155 * @param latticeSize Total lattice size
 
  156 * @param particlesPerSite Maximum particles per site
 
  157 * @param stream CUDA stream for async execution
 
  158 * @return Number of sites violating constraints
 
  160uint32_t validateLatticeConstraints(const uint32_t* lattice, const uint8_t* siteLattice,
 
  161                                   uint32_t latticeSize, uint32_t particlesPerSite,
 
  162                                   cudaStream_t stream = 0);
 
  165 * @brief Host function to get occupancy statistics from GPU (ByteLattice)
 
  166 * @param lattice Device pointer to lattice data
 
  167 * @param latticeSize Total lattice size
 
  168 * @param particlesPerSite Maximum particles per site
 
  169 * @param occupancyCounts Host output array for histogram
 
  170 * @param maxOccupancy Maximum occupancy to track
 
  171 * @param stream CUDA stream for async execution
 
  172 * @return cudaError_t error code
 
  174cudaError_t getOccupancyStatistics(const uint8_t* lattice, uint32_t latticeSize,
 
  175                                  uint32_t particlesPerSite, uint32_t* occupancyCounts,
 
  176                                  uint32_t maxOccupancy, cudaStream_t stream = 0);
 
  179 * @brief Host function to get occupancy statistics from GPU (IntLattice)
 
  180 * @param lattice Device pointer to lattice data
 
  181 * @param latticeSize Total lattice size
 
  182 * @param particlesPerSite Maximum particles per site
 
  183 * @param occupancyCounts Host output array for histogram
 
  184 * @param maxOccupancy Maximum occupancy to track
 
  185 * @param stream CUDA stream for async execution
 
  186 * @return cudaError_t error code
 
  188cudaError_t getOccupancyStatistics(const uint32_t* lattice, uint32_t latticeSize,
 
  189                                  uint32_t particlesPerSite, uint32_t* occupancyCounts,
 
  190                                  uint32_t maxOccupancy, cudaStream_t stream = 0);
 
  194// =============== NEW EFFICIENT SIZE TRACKING KERNELS ===============
 
  197 * @brief Initialize site sizes lattice from current particle lattice
 
  198 * @param particles Particle lattice data
 
  199 * @param siteSizes Output site sizes lattice (uint32_t per site)
 
  200 * @param particleSizes Particle sizes lookup table
 
  201 * @param latticeSize Total number of sites
 
  202 * @param particlesPerSite Maximum particles per site
 
  204__global__ void initializeSiteSizesKernel(
 
  205    const uint8_t* particles,
 
  207    const uint32_t* particleSizes,
 
  208    uint32_t latticeSize,
 
  209    uint32_t particlesPerSite
 
  213 * @brief Check if particles can be added to sites (batch operation)
 
  214 * @param siteSizes Current site sizes lattice
 
  215 * @param particleSizes Particle sizes lookup table
 
  216 * @param subvolumes Array of subvolume indices to check
 
  217 * @param particleTypes Array of particle types to add
 
  218 * @param results Output array of boolean results
 
  219 * @param maxCapacity Maximum capacity per site
 
  220 * @param count Number of operations to perform
 
  222__global__ void canAddParticlesBatchKernel(
 
  223    const uint32_t* siteSizes,
 
  224    const uint32_t* particleSizes,
 
  225    const lattice_size_t* subvolumes,
 
  226    const particle_t* particleTypes,
 
  228    uint32_t maxCapacity,
 
  233 * @brief Add particles to sites and update size lattice (batch operation)
 
  234 * @param siteSizes Site sizes lattice to update
 
  235 * @param particleSizes Particle sizes lookup table
 
  236 * @param subvolumes Array of subvolume indices
 
  237 * @param particleTypes Array of particle types to add
 
  238 * @param maxCapacity Maximum capacity per site
 
  239 * @param count Number of operations to perform
 
  240 * @param results Output array indicating success/failure
 
  242__global__ void addParticlesBatchUpdateSizesKernel(
 
  244    const uint32_t* particleSizes,
 
  245    const lattice_size_t* subvolumes,
 
  246    const particle_t* particleTypes,
 
  247    uint32_t maxCapacity,
 
  253 * @brief Remove particles from sites and update size lattice (batch operation)
 
  254 * @param siteSizes Site sizes lattice to update
 
  255 * @param particleSizes Particle sizes lookup table
 
  256 * @param subvolumes Array of subvolume indices
 
  257 * @param particleTypes Array of particle types to remove
 
  258 * @param count Number of operations to perform
 
  260__global__ void removeParticlesBatchUpdateSizesKernel(
 
  262    const uint32_t* particleSizes,
 
  263    const lattice_size_t* subvolumes,
 
  264    const particle_t* particleTypes,
 
  269 * @brief Fast size-aware diffusion with dedicated site size lattice
 
  270 * @param inParticles Input particle lattice
 
  271 * @param inSiteSizes Input site sizes lattice
 
  272 * @param outParticles Output particle lattice
 
  273 * @param outSiteSizes Output site sizes lattice
 
  274 * @param inSites Site types
 
  275 * @param particleSizes Particle sizes lookup table
 
  276 * @param timestepHash Random seed
 
  277 * @param latticeXSize X dimension
 
  278 * @param latticeYSize Y dimension
 
  279 * @param latticeZSize Z dimension
 
  280 * @param particlesPerSite Maximum particles per site
 
  281 * @param maxCapacity Maximum site capacity
 
  283__global__ void fastSizeAwareDiffusionKernel(
 
  284    const uint8_t* inParticles,
 
  285    const uint32_t* inSiteSizes,
 
  286    uint8_t* outParticles,
 
  287    uint32_t* outSiteSizes,
 
  288    const uint8_t* inSites,
 
  289    const uint32_t* particleSizes,
 
  290    unsigned long long timestepHash,
 
  291    uint32_t latticeXSize,
 
  292    uint32_t latticeYSize,
 
  293    uint32_t latticeZSize,
 
  294    uint32_t particlesPerSite,
 
  298 * @brief Perform size-aware propagation
 
  299 * @param outLattice Output lattice
 
  300 * @param outSizeLattice Output size lattice
 
  301 * @param window Window
 
  302 * @param sizeWindow Size window
 
  303 * @param choices Choices
 
  304 * @param latticeIndex Lattice index
 
  305 * @param latticeXYSize Lattice XY size
 
  306 * @param latticeXYZSize Lattice XYZ size
 
  307 * @param windowIndex Window index
 
  308 * @param particlesPerSite Maximum particles per site
 
  309 * @param siteOverflowList Site overflow list
 
  311__device__ void performSizeAwarePropagation(
 
  312    unsigned int* __restrict__ outLattice,
 
  313    uint32_t* __restrict__ outSizeLattice,
 
  314    const unsigned int* __restrict__ window,
 
  315    const uint32_t* __restrict__ sizeWindow,
 
  316    const uint8_t* __restrict__ choices,
 
  317    const uint32_t latticeIndex,
 
  318    const uint32_t latticeXYSize,
 
  319    const uint32_t latticeXYZSize,
 
  320    const uint32_t windowIndex,
 
  321    const uint32_t particlesPerSite,
 
  322    uint32_t* __restrict__ siteOverflowList);
 
  324// Host wrapper functions for the new kernels
 
  325cudaError_t initializeSiteSizes(const uint8_t* particles, uint32_t* siteSizes,
 
  326                               const uint32_t* particleSizes, uint32_t latticeSize,
 
  327                               uint32_t particlesPerSite, cudaStream_t stream = 0);
 
  329cudaError_t canAddParticlesBatch(const uint32_t* siteSizes, const uint32_t* particleSizes,
 
  330                                const lattice_size_t* subvolumes, const particle_t* particleTypes,
 
  331                                bool* results, uint32_t maxCapacity, uint32_t count,
 
  332                                cudaStream_t stream = 0);
 
  334cudaError_t addParticlesBatchUpdateSizes(uint32_t* siteSizes, const uint32_t* particleSizes,
 
  335                                        const lattice_size_t* subvolumes, const particle_t* particleTypes,
 
  336                                        uint32_t maxCapacity, uint32_t count, bool* results,
 
  337                                        cudaStream_t stream = 0);
 
  339cudaError_t removeParticlesBatchUpdateSizes(uint32_t* siteSizes, const uint32_t* particleSizes,
 
  340                                           const lattice_size_t* subvolumes, const particle_t* particleTypes,
 
  341                                           uint32_t count, cudaStream_t stream = 0);
 
  347 * @brief Kernel for windowed size-aware diffusion
 
  348 * @param inLattice Input lattice state
 
  349 * @param inSites Site types
 
  350 * @param inSizeLattice Input size lattice
 
  351 * @param outLattice Output lattice state
 
  352 * @param outSizeLattice Output size lattice
 
  353 * @param timestepHash Random seed
 
  354 * @param siteOverflowList List for tracking overflow sites
 
  355 * @param latticeXSize X dimension
 
  356 * @param latticeYSize Y dimension
 
  357 * @param latticeZSize Z dimension
 
  358 * @param particlesPerSite Maximum particles per site
 
  360__global__ void sizeAwareWindowedDiffusionKernel(
 
  361    const unsigned int* inLattice,
 
  362    const uint8_t* inSites,
 
  363    const uint32_t* inSizeLattice,
 
  364    unsigned int* outLattice,
 
  365    uint32_t* outSizeLattice,
 
  366    const unsigned long long timestepHash,
 
  367    uint32_t* siteOverflowList,
 
  368    uint32_t latticeXSize,
 
  369    uint32_t latticeYSize,
 
  370    uint32_t latticeZSize,
 
  371    uint32_t particlesPerSite);
 
  374 * @brief Kernel for size-aware reaction processing
 
  375 * @param inLattice Input lattice state
 
  376 * @param inSites Site types
 
  377 * @param outLattice Output lattice state
 
  378 * @param timestepHash Random seed for this timestep
 
  379 * @param siteOverflowList List for tracking overflow sites
 
  380 * @param reactionMatrix Reaction stoichiometry matrix
 
  381 * @param reactionLocationMatrix Reaction location matrix
 
  382 * @param latticeXSize X dimension
 
  383 * @param latticeYSize Y dimension
 
  384 * @param latticeZSize Z dimension
 
  385 * @param particlesPerSite Maximum particles per site
 
  386 * @param numReactions Number of reactions
 
  387 * @param numSpecies Number of species
 
  389__global__ void sizeAwareReactionKernel(
 
  390    const uint8_t* inLattice,
 
  391    const uint32_t* inSizeLattice,
 
  392    const uint8_t* inSites,
 
  393    uint32_t* outSizeLattice,
 
  395    unsigned long long timestepHash,
 
  396    uint32_t* siteOverflowList,
 
  397    const int8_t* reactionMatrix,
 
  398    const uint8_t* reactionLocationMatrix,
 
  399    uint32_t latticeXSize,
 
  400    uint32_t latticeYSize,
 
  401    uint32_t latticeZSize,
 
  402    uint32_t particlesPerSite,
 
  403    uint32_t numReactions,
 
  405    uint32_t numSiteTypes
 
  409 * Optimized size-aware reaction kernel with precomputed propensities
 
  410 * Multiple signatures based on compile-time macros (MPD_GLOBAL_S_MATRIX, MPD_GLOBAL_R_MATRIX)
 
  412#ifdef MPD_GLOBAL_S_MATRIX
 
  413#ifdef MPD_GLOBAL_R_MATRIX
 
  414__global__ void sizeAwarePrecompReactionKernel(
 
  415    const unsigned int* inLattice,
 
  416    const uint32_t* inSizeLattice,
 
  417    const uint8_t* inSites,
 
  418    uint32_t* outSizeLattice,
 
  419    unsigned int* outLattice,
 
  420    const unsigned long long timestepHash,
 
  421    uint32_t* siteOverflowList,
 
  422    const int8_t* __restrict__ SG,
 
  423    const uint8_t* __restrict__ RLG,
 
  424    const unsigned int* __restrict__ reactionOrdersG,
 
  425    const unsigned int* __restrict__ reactionSitesG,
 
  426    const unsigned int* __restrict__ D1G,
 
  427    const unsigned int* __restrict__ D2G,
 
  428    const float* __restrict__ reactionRatesG,
 
  429    const float* __restrict__ qp0,
 
  430    const float* __restrict__ qp1,
 
  431    const float* __restrict__ qp2,
 
  432    uint32_t latticeXSize,
 
  433    uint32_t latticeYSize,
 
  434    uint32_t latticeZSize,
 
  435    uint32_t particlesPerSite,
 
  436    uint32_t numSpecies);
 
  438__global__ void sizeAwarePrecompReactionKernel(
 
  439    const uint8_t* inLattice,
 
  440    const uint32_t* inSizeLattice,
 
  441    const uint8_t* inSites,
 
  442    uint32_t* outSizeLattice,
 
  444    unsigned long long timestepHash,
 
  445    uint32_t* siteOverflowList,
 
  446    const int8_t* __restrict__ SG,
 
  447    const uint8_t* __restrict__ RLG,
 
  448    const unsigned int* __restrict__ reactionOrdersG,
 
  449    const unsigned int* __restrict__ reactionSitesG,
 
  450    const unsigned int* __restrict__ D1G,
 
  451    const unsigned int* __restrict__ D2G,
 
  452    const float* __restrict__ reactionRatesG,
 
  453    const float* __restrict__ qp0,
 
  454    const float* __restrict__ qp1,
 
  455    const float* __restrict__ qp2,
 
  456    uint32_t latticeXSize,
 
  457    uint32_t latticeYSize,
 
  458    uint32_t latticeZSize,
 
  459    uint32_t particlesPerSite,
 
  460    uint32_t numSpecies);
 
  463__global__ void sizeAwarePrecompReactionKernel(
 
  464    const uint8_t* inLattice,
 
  465    const uint32_t* inSizeLattice,
 
  466    const uint8_t* inSites,
 
  467    uint32_t* outSizeLattice,
 
  469    unsigned long long timestepHash,
 
  470    uint32_t* siteOverflowList,
 
  471    const float* __restrict__ qp0,
 
  472    const float* __restrict__ qp1,
 
  473    const float* __restrict__ qp2,
 
  474    uint32_t latticeXSize,
 
  475    uint32_t latticeYSize,
 
  476    uint32_t latticeZSize,
 
  477    uint32_t particlesPerSite,
 
  478    uint32_t numSpecies);
 
  481// =============== BYTE LATTICE OVERFLOW CORRECTION KERNEL ===============
 
  484 * Byte-lattice-compatible overflow correction kernel
 
  485 * Handles overflows for uint8_t lattices instead of unsigned int lattices
 
  487__global__ void correct_byte_overflows(
 
  489    uint32_t* siteOverflowList,
 
  490    uint32_t latticeXSize,
 
  491    uint32_t latticeYSize,
 
  492    uint32_t latticeZSize,
 
  493    uint32_t particlesPerSite);
 
  495// =============== NEW 1D SIZE-AWARE DIFFUSION KERNELS ===============
 
  498 * @brief Size-aware Z-direction diffusion kernel (1D approach)
 
  500__global__ void sizeAwareZDiffusionKernel(
 
  501    const uint8_t* inLattice,
 
  502    const uint8_t* inSites,
 
  503    const uint32_t* inSizeLattice,
 
  505    uint32_t* outSizeLattice,
 
  506    const unsigned long long timestepHash,
 
  507    uint32_t* siteOverflowList,
 
  508    uint32_t latticeXSize,
 
  509    uint32_t latticeYSize,
 
  510    uint32_t latticeZSize,
 
  511    uint32_t particlesPerSite
 
  515 * @brief Size-aware Y-direction diffusion kernel (1D approach)
 
  517__global__ void sizeAwareYDiffusionKernel(
 
  518    const uint8_t* inLattice,
 
  519    const uint8_t* inSites,
 
  520    const uint32_t* inSizeLattice,
 
  522    uint32_t* outSizeLattice,
 
  523    const unsigned long long timestepHash,
 
  524    uint32_t* siteOverflowList,
 
  525    uint32_t latticeXSize,
 
  526    uint32_t latticeYSize,
 
  527    uint32_t latticeZSize,
 
  528    uint32_t particlesPerSite
 
  532 * @brief Size-aware X-direction diffusion kernel (1D approach)
 
  534__global__ void sizeAwareXDiffusionKernel(
 
  535    const uint8_t* inLattice,
 
  536    const uint8_t* inSites,
 
  537    const uint32_t* inSizeLattice,
 
  539    uint32_t* outSizeLattice,
 
  540    const unsigned long long timestepHash,
 
  541    uint32_t* siteOverflowList,
 
  542    uint32_t latticeXSize,
 
  543    uint32_t latticeYSize,
 
  544    uint32_t latticeZSize,
 
  545    uint32_t particlesPerSite
 
  549#endif // LM_CUDA_PARTICLE_SIZE_KERNELS_CUH