Commit c77268ee authored by Michael Kuron

successfully compiles for AMD GPU

parent d13ef9a7
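The bulk of this commit is a mechanical translation: every CUDA triple-chevron launch (`kernel<<<grid, block>>>(args)`) becomes the `hipLaunchKernelGGL(kernel, grid, block, sharedBytes, stream, args...)` form that hipcc accepts on AMD targets, the `KERNELCALL*` macros become variadic to match, and CMake gains detection of a HIP/ROCm toolchain. A minimal sketch of the launch-syntax mapping, with an illustrative kernel that is not part of the patch:

```cpp
#include <hip/hip_runtime.h>

// Illustrative kernel: scales n floats in place.
__global__ void scale_kernel(float *data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= factor;
}

void launch_scale(float *data, float factor, int n) {
  dim3 grid((n + 255) / 256), block(256);
  // CUDA-only launch syntax, rejected when targeting AMD:
  //   scale_kernel<<<grid, block>>>(data, factor, n);
  // Portable HIP form: kernel, grid, block, dynamic shared-memory bytes,
  // stream, then the kernel arguments.
  hipLaunchKernelGGL(scale_kernel, grid, block, 0, 0, data, factor, n);
}
```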
@@ -179,27 +179,46 @@ if (WITH_CUDA)
target_link_libraries(${target} "cufft")
endfunction()
else()
find_package(CUDA 7.0)
if(CUDA_FOUND)
list(APPEND LIBRARIES ${CUDA_CUFFT_LIBRARIES})
list(APPEND LIBRARIES ${CUDA_LIBRARIES})
list(APPEND CMAKE_MODULE_PATH "/opt/rocm/hip/cmake")
find_package(HIP QUIET MODULE)
if(HIP_FOUND)
find_package(HIP MODULE)
message(STATUS "Found HIP compiler: ${HIP_HIPCC_EXECUTABLE}")
set(CUDA 1)
set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -g -G")
set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE} -O3 -DNDEBUG")
set(CUDA_NVCC_FLAGS_MINSIZEREL "${CUDA_NVCC_FLAGS_MINSIZEREL} -Os -DNDEBUG")
set(CUDA_NVCC_FLAGS_RELWITHDEBINFO "${CUDA_NVCC_FLAGS_RELWITHDEBINFO} -g -G -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_52,code=compute_52")
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
if (CUDA_VERSION VERSION_LESS "8.0" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0")
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
endif()
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
if (CMAKE_OSX_SYSROOT)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -isysroot -Xcompiler ${CMAKE_OSX_SYSROOT}")
endif()
endif(CUDA_FOUND)
HIP_INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/src/core" "${CMAKE_SOURCE_DIR}/src/core")
function(cuda_include_directories)
HIP_INCLUDE_DIRECTORIES(${ARGV})
endfunction()
function(cuda_add_library)
HIP_ADD_LIBRARY(${ARGV})
endfunction()
find_library(ROCFFT_LIB name "rocfft" PATHS "${HIP_ROOT_DIR}/lib")
function(CUDA_ADD_CUFFT_TO_TARGET target)
target_link_libraries(${target} "${ROCFFT_LIB}")
endfunction()
else()
find_package(CUDA 7.0)
if(CUDA_FOUND)
list(APPEND LIBRARIES ${CUDA_CUFFT_LIBRARIES})
list(APPEND LIBRARIES ${CUDA_LIBRARIES})
set(CUDA 1)
set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -g -G")
set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE} -O3 -DNDEBUG")
set(CUDA_NVCC_FLAGS_MINSIZEREL "${CUDA_NVCC_FLAGS_MINSIZEREL} -Os -DNDEBUG")
set(CUDA_NVCC_FLAGS_RELWITHDEBINFO "${CUDA_NVCC_FLAGS_RELWITHDEBINFO} -g -G -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_52,code=compute_52")
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
if (CUDA_VERSION VERSION_LESS "8.0" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0")
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
endif()
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
if (CMAKE_OSX_SYSROOT)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -isysroot -Xcompiler ${CMAKE_OSX_SYSROOT}")
endif()
endif(CUDA_FOUND)
endif()
endif()
endif(WITH_CUDA)
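A note on the strategy in the hunk above: instead of teaching every caller about HIP, the new branch shims the FindCUDA-style entry points — `cuda_include_directories`, `cuda_add_library`, and `CUDA_ADD_CUFFT_TO_TARGET` — as thin wrappers that forward to `HIP_INCLUDE_DIRECTORIES`, `HIP_ADD_LIBRARY`, and rocFFT linkage, so the rest of the build system keeps calling the CUDA-named functions unchanged on either toolchain. When no HIP installation is found, the previous `find_package(CUDA 7.0)` path still applies.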
@@ -275,8 +294,7 @@ endif()
# Check for the HIP submodule
# and try to check it out if not found or update it if found.
# TODO: don't do this if hipcc is used
if(CUDA AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
if(CUDA AND NOT HIP_HIPCC_EXECUTABLE AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
# Try to find git
find_package(Git)
if(GIT_FOUND)
@@ -179,27 +179,27 @@ void EspressoSystemInterface::split_particle_struct() {
dim3 block(512, 1, 1);
if (m_needsQGpu && m_needsRGpu)
split_kernel_rq<<<grid, block>>>(gpu_get_particle_pointer(), m_r_gpu_begin,
hipLaunchKernelGGL(split_kernel_rq, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(), m_r_gpu_begin,
m_q_gpu_begin, n);
if (m_needsQGpu && !m_needsRGpu)
split_kernel_q<<<grid, block>>>(gpu_get_particle_pointer(), m_q_gpu_begin,
hipLaunchKernelGGL(split_kernel_q, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(), m_q_gpu_begin,
n);
if (!m_needsQGpu && m_needsRGpu)
split_kernel_r<<<grid, block>>>(gpu_get_particle_pointer(), m_r_gpu_begin,
hipLaunchKernelGGL(split_kernel_r, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(), m_r_gpu_begin,
n);
#ifdef LB_GPU
if (m_needsVGpu)
split_kernel_v<<<grid, block>>>(gpu_get_particle_pointer(), m_v_gpu_begin,
hipLaunchKernelGGL(split_kernel_v, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(), m_v_gpu_begin,
n);
#endif
#ifdef DIPOLES
if (m_needsDipGpu)
split_kernel_dip<<<grid, block>>>(gpu_get_particle_pointer(),
hipLaunchKernelGGL(split_kernel_dip, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(),
m_dip_gpu_begin, n);
#endif
if (m_needsQuatuGpu)
split_kernel_quatu<<<grid, block>>>(gpu_get_particle_pointer(),
hipLaunchKernelGGL(split_kernel_quatu, dim3(grid), dim3(block), 0, 0, gpu_get_particle_pointer(),
m_quatu_gpu_begin, n);
}
@@ -993,7 +993,7 @@ void initBHgpu(int blocks) {
grid.x = blocks * FACTOR5;
block.x = THREADS5;
KERNELCALL(initializationKernel, grid, block, ());
KERNELCALL(initializationKernel, grid, block);
#ifdef __CUDACC__
// According to the experimental performance optimization:
@@ -1017,7 +1017,7 @@ void buildBoxBH(int blocks) {
block.x = THREADS1;
hipDeviceSynchronize();
KERNELCALL(boundingBoxKernel, grid, block, ());
KERNELCALL(boundingBoxKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
}
@@ -1030,7 +1030,7 @@ void buildTreeBH(int blocks) {
grid.x = blocks * FACTOR2;
block.x = THREADS2;
KERNELCALL(treeBuildingKernel, grid, block, ());
KERNELCALL(treeBuildingKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
}
@@ -1044,7 +1044,7 @@ void summarizeBH(int blocks) {
grid.x = blocks * FACTOR3;
block.x = THREADS3;
KERNELCALL(summarizationKernel, grid, block, ());
KERNELCALL(summarizationKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
}
@@ -1058,7 +1058,7 @@ void sortBH(int blocks) {
grid.x = blocks * FACTOR4;
block.x = THREADS4;
KERNELCALL(sortKernel, grid, block, ());
KERNELCALL(sortKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
}
@@ -1071,7 +1071,7 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
grid.x = bh_data->blocks * FACTOR5;
block.x = THREADS5;
KERNELCALL(forceCalculationKernel, grid, block, (k, f, torque));
KERNELCALL(forceCalculationKernel, grid, block, k, f, torque);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(hipMemcpy(&error_code, bh_data->err, sizeof(int),
@@ -1094,7 +1094,7 @@ int energyBH(BHData *bh_data, dds_float k, float *E) {
cuda_safe_mem(hipMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
KERNELCALL_shared(energyCalculationKernel, grid, block,
block.x * sizeof(dds_float), (k, energySum));
block.x * sizeof(dds_float), k, energySum);
cuda_safe_mem(hipDeviceSynchronize());
// Sum the results of all blocks
@@ -306,7 +306,7 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
// printf("box_l: %f %f %f\n",box_l[0],box_l[1],box_l[2]);
KERNELCALL(DipolarDirectSum_kernel_force, grid, block,
(k, n, pos, dip, f, torque, box_l_gpu, periodic_gpu));
k, n, pos, dip, f, torque, box_l_gpu, periodic_gpu);
hipFree(box_l_gpu);
hipFree(periodic_gpu);
}
@@ -347,12 +347,12 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
// This will sum the energies up to the block level
KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block,
bs * sizeof(dds_float),
(k, n, pos, dip, box_l_gpu, periodic_gpu, energySum));
k, n, pos, dip, box_l_gpu, periodic_gpu, energySum);
// printf(" Still here after energy kernel\n");
// Sum the results of all blocks
// One thread per block in the prev kernel
// KERNELCALL(sumKernel,1,1,(energySum,block.x,E));
// KERNELCALL(sumKernel,1,1,energySum,block.x,E);
thrust::device_ptr<dds_float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(hipMemcpy(E, &x, sizeof(float), hipMemcpyHostToDevice));
@@ -79,5 +79,5 @@ void HarmonicOrientationWell_kernel_wrapper(float x, float y, float z, float k,
}
KERNELCALL(HarmonicOrientationWell_kernel, grid, block,
(x, y, z, k, n, quatu, torque))
x, y, z, k, n, quatu, torque)
}
@@ -57,5 +57,5 @@ void HarmonicWell_kernel_wrapper(float x, float y, float z, float k, int n,
block.x = 512;
}
KERNELCALL(HarmonicWell_kernel, grid, block, (x, y, z, k, n, pos, f))
KERNELCALL(HarmonicWell_kernel, grid, block, x, y, z, k, n, pos, f)
}
@@ -232,7 +232,7 @@ void Mmm1dgpuForce::tune(SystemInterface &s, mmm1dgpu_real _maxPWerror,
int *dev_cutoff;
int maxCut = 30;
cuda_safe_mem(hipMalloc((void **)&dev_cutoff, sizeof(int)));
besselTuneKernel<<<1, 1>>>(dev_cutoff, far_switch_radius, maxCut);
hipLaunchKernelGGL(besselTuneKernel, dim3(1), dim3(1), 0, 0, dev_cutoff, far_switch_radius, maxCut);
cuda_safe_mem(hipMemcpy(&bessel_cutoff, dev_cutoff, sizeof(int),
hipMemcpyDeviceToHost));
hipFree(dev_cutoff);
@@ -506,13 +506,13 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) {
int blocksRed = s.npart_gpu() / numThreads + 1;
KERNELCALL(
forcesKernel, numBlocks(s), numThreads,
(s.rGpuBegin(), s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs))
s.rGpuBegin(), s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs)
KERNELCALL(vectorReductionKernel, blocksRed, numThreads,
(dev_forcePairs, s.fGpuBegin(), s.npart_gpu()))
dev_forcePairs, s.fGpuBegin(), s.npart_gpu())
} else {
KERNELCALL(
forcesKernel, numBlocks(s), numThreads,
(s.rGpuBegin(), s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs))
s.rGpuBegin(), s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs)
}
}
@@ -543,12 +543,12 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) {
KERNELCALL_shared(
energiesKernel, numBlocks(s), numThreads, shared,
(s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, s.npart_gpu(), 0));
s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, s.npart_gpu(), 0);
KERNELCALL_shared(sumKernel, 1, numThreads, shared,
(dev_energyBlocks, numBlocks(s)));
dev_energyBlocks, numBlocks(s));
KERNELCALL(scaleAndAddKernel, 1, 1,
(&(((CUDA_energy *)s.eGpu())->coulomb), &dev_energyBlocks[0], 1,
0.5)); // we have counted every interaction twice, so halve the
&(((CUDA_energy *)s.eGpu())->coulomb), &dev_energyBlocks[0], 1,
0.5); // we have counted every interaction twice, so halve the
// total energy
}
@@ -563,7 +563,7 @@ float Mmm1dgpuForce::force_benchmark(SystemInterface &s) {
cuda_safe_mem(hipEventCreate(&eventStop));
cuda_safe_mem(hipEventRecord(eventStart, stream[0]));
KERNELCALL(forcesKernel, numBlocks(s), numThreads,
(s.rGpuBegin(), s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0))
s.rGpuBegin(), s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0)
cuda_safe_mem(hipEventRecord(eventStop, stream[0]));
cuda_safe_mem(hipEventSynchronize(eventStop));
cuda_safe_mem(hipEventElapsedTime(&elapsedTime, eventStart, eventStop));
@@ -324,12 +324,12 @@ void gpu_change_number_of_part_to_comm() {
KERNELCALL(init_particle_force, dim_grid_particles,
threads_per_block_particles,
(particle_forces_device, particle_torques_device,
particle_seeds_device));
particle_forces_device, particle_torques_device,
particle_seeds_device);
#ifdef SHANCHEN
KERNELCALL(init_fluid_composition, dim_grid_particles,
threads_per_block_particles, (fluid_composition_device));
threads_per_block_particles, fluid_composition_device);
#endif
}
}
@@ -448,7 +448,7 @@ void copy_forces_from_GPU(ParticleRange particles) {
KERNELCALL(reset_particle_force, dim_grid_particles,
threads_per_block_particles,
(particle_forces_device, particle_torques_device));
particle_forces_device, particle_torques_device);
hipDeviceSynchronize();
}
@@ -29,6 +29,10 @@ extern hipStream_t stream[1];
extern hipError_t CU_err;
extern hipError_t _err;
#ifndef __CUDACC__
#define make_uint3 dim3
#endif
/** error output for memory allocation and memory copy
* @param err cuda error code
* @param *file .cu file where the error took place
@@ -43,15 +47,17 @@ void _cuda_check_errors(const dim3 &block, const dim3 &grid,
#define cuda_safe_mem(a) _cuda_safe_mem((a), __FILE__, __LINE__)
#define KERNELCALL_shared(_f, _a, _b, _s, _params) \
_f<<<_a, _b, _s, stream[0]>>> _params; \
#define KERNELCALL_shared(_f, _a, _b, _s, ...) \
hipLaunchKernelGGL(_f, _a, _b, _s, stream[0], ##__VA_ARGS__); \
_cuda_check_errors(_a, _b, #_f, __FILE__, __LINE__);
#define KERNELCALL_stream(_function, _grid, _block, _stream, _params) \
_function<<<_grid, _block, 0, _stream>>> _params; \
#define KERNELCALL_stream(_function, _grid, _block, _stream, ...) \
hipLaunchKernelGGL(_function, _grid, _block, 0, _stream, ##__VA_ARGS__); \
_cuda_check_errors(_grid, _block, #_function, __FILE__, __LINE__);
#define KERNELCALL(_f, _a, _b, _params) \
KERNELCALL_shared(_f, _a, _b, 0, _params)
#define KERNELCALL(_f, _a, _b, ...) \
KERNELCALL_shared(_f, _a, _b, 0, ##__VA_ARGS__)
#endif
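The macro change above is what enables the call-site edits throughout this commit: the old `_f<<<_a, _b, _s, stream[0]>>> _params` form could take the argument list as a single parenthesized token, but `hipLaunchKernelGGL` is itself variadic, so `KERNELCALL` and friends now forward `__VA_ARGS__` (the `##` keeps zero-argument kernels legal). A minimal compilable sketch of the new pattern, using a made-up kernel and the default stream in place of ESPResSo's `stream[0]`:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void fill_kernel(int *buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    buf[i] = i;
}

// Variadic launch macro in the spirit of the patch; error checking omitted.
#define KERNELCALL_demo(_f, _grid, _block, ...)                               \
  hipLaunchKernelGGL(_f, _grid, _block, 0, 0, ##__VA_ARGS__);

int main() {
  int n = 256, *buf;
  hipMalloc((void **)&buf, n * sizeof(int));
  // Arguments are plain macro parameters now, not a parenthesized tuple:
  KERNELCALL_demo(fill_kernel, dim3(1), dim3(256), buf, n);
  hipDeviceSynchronize();
  hipFree(buf);
  printf("launch done\n");
  return 0;
}
```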
@@ -473,34 +473,29 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) {
switch (cao) {
case 1:
assign_charge_kernel<1, false><<<grid, block>>>(pdata, p, parts_per_block);
hipLaunchKernelGGL((assign_charge_kernel<1, false>), dim3(grid), dim3(block), 0, 0, pdata, p, parts_per_block);
break;
case 2:
assign_charge_kernel<2, false><<<grid, block>>>(pdata, p, parts_per_block);
hipLaunchKernelGGL((assign_charge_kernel<2, false>), dim3(grid), dim3(block), 0, 0, pdata, p, parts_per_block);
break;
case 3:
assign_charge_kernel<3, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
hipLaunchKernelGGL((assign_charge_kernel<3, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(REAL_TYPE), 0,
pdata, p, parts_per_block);
break;
case 4:
assign_charge_kernel<4, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
hipLaunchKernelGGL((assign_charge_kernel<4, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(REAL_TYPE), 0,
pdata, p, parts_per_block);
break;
case 5:
assign_charge_kernel<5, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
hipLaunchKernelGGL((assign_charge_kernel<5, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(REAL_TYPE), 0,
pdata, p, parts_per_block);
break;
case 6:
assign_charge_kernel<6, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
hipLaunchKernelGGL((assign_charge_kernel<6, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(REAL_TYPE), 0,
pdata, p, parts_per_block);
break;
case 7:
assign_charge_kernel<7, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
hipLaunchKernelGGL((assign_charge_kernel<7, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(REAL_TYPE), 0,
pdata, p, parts_per_block);
break;
default:
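A preprocessor subtlety in the hunk above: `assign_charge_kernel<3, true>` contains a comma, which would otherwise be parsed as a macro-argument separator, so each templated kernel name is wrapped in parentheses to keep the template-id a single macro argument. The one-parameter templates further down (`calculate_influence_function_device<N>`, `p3m_k_space_error_gpu_kernel_ik<N>`) contain no comma and pass through `KERNELCALL` unwrapped. A sketch with a hypothetical kernel:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical two-parameter template kernel, for illustration only.
template <int cao, bool use_shared>
__global__ void demo_kernel(float *mesh) {}

void launch_demo(float *mesh) {
  // Without the parentheses, the comma in <2, false> would split the kernel
  // name into two arguments of the hipLaunchKernelGGL macro.
  hipLaunchKernelGGL((demo_kernel<2, false>), dim3(16), dim3(64), 0, 0, mesh);
}
```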
@@ -608,36 +603,31 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
* > 2 */
switch (cao) {
case 1:
assign_forces_kernel<1, false><<<grid, block>>>(
hipLaunchKernelGGL((assign_forces_kernel<1, false>), dim3(grid), dim3(block), 0, 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 2:
assign_forces_kernel<2, false><<<grid, block>>>(
hipLaunchKernelGGL((assign_forces_kernel<2, false>), dim3(grid), dim3(block), 0, 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 3:
assign_forces_kernel<3, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
hipLaunchKernelGGL((assign_forces_kernel<3, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(float), 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 4:
assign_forces_kernel<4, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
hipLaunchKernelGGL((assign_forces_kernel<4, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(float), 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 5:
assign_forces_kernel<5, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
hipLaunchKernelGGL((assign_forces_kernel<5, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(float), 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 6:
assign_forces_kernel<6, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
hipLaunchKernelGGL((assign_forces_kernel<6, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(float), 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
case 7:
assign_forces_kernel<7, true>
<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
hipLaunchKernelGGL((assign_forces_kernel<7, true>), dim3(grid), dim3(block), 3 * parts_per_block * cao * sizeof(float), 0,
pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
break;
default:
@@ -754,31 +744,31 @@ void p3m_gpu_init(int cao, int mesh[3], double alpha) {
switch (p3m_gpu_data.cao) {
case 1:
KERNELCALL(calculate_influence_function_device<1>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 2:
KERNELCALL(calculate_influence_function_device<2>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 3:
KERNELCALL(calculate_influence_function_device<3>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 4:
KERNELCALL(calculate_influence_function_device<4>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 5:
KERNELCALL(calculate_influence_function_device<5>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 6:
KERNELCALL(calculate_influence_function_device<6>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
case 7:
KERNELCALL(calculate_influence_function_device<7>, grid, block,
(p3m_gpu_data));
p3m_gpu_data);
break;
}
}
@@ -825,10 +815,10 @@ void p3m_gpu_add_farfield_force() {
}
/** Do convolution */
KERNELCALL(apply_influence_function, gridConv, threadsConv, (p3m_gpu_data));
KERNELCALL(apply_influence_function, gridConv, threadsConv, p3m_gpu_data);
/** Take derivative */
KERNELCALL(apply_diff_op, gridConv, threadsConv, (p3m_gpu_data));
KERNELCALL(apply_diff_op, gridConv, threadsConv, p3m_gpu_data);
/** Transform the components of the electric field back */
FFT_BACK_FFT(p3m_gpu_fft_plans.back_plan, p3m_gpu_data.force_mesh_x,
@@ -292,31 +292,31 @@ double p3m_k_space_error_gpu(double prefactor, int *mesh, int cao, int npart,
switch (cao) {
case 1:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<1>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 2:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<2>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 3:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<3>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 4:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<4>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 5:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<5>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 6:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<6>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
case 7:
KERNELCALL(p3m_k_space_error_gpu_kernel_ik<7>, grid, block,
(mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data())));
mesh3, meshi, alpha_L, thrust::raw_pointer_cast(he_q.data()));
break;
}
@@ -2203,8 +2203,8 @@ void ek_calculate_electrostatic_coupling() {
dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_spread_particle_force, dim_grid, threads_per_block,
(gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
ek_lbparameters_gpu));
gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
ek_lbparameters_gpu);
}
#endif
@@ -2217,7 +2217,7 @@ void ek_integrate_electrostatics() {
(threads_per_block * blocks_per_grid_y);
dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_gather_species_charge_density, dim_grid, threads_per_block, ());
KERNELCALL(ek_gather_species_charge_density, dim_grid, threads_per_block);
#ifdef EK_ELECTROSTATIC_COUPLING
if (ek_parameters.es_coupling) {
@@ -2228,7 +2228,7 @@
electrostatics->calculatePotential(
(hipfftComplex *)ek_parameters.charge_potential_buffer);
KERNELCALL(ek_calc_electric_field, dim_grid, threads_per_block,
(ek_parameters.charge_potential_buffer));
ek_parameters.charge_potential_buffer);
}
#endif
@@ -2243,7 +2243,7 @@
particle_data_gpu = gpu_get_particle_pointer();
KERNELCALL(ek_gather_particle_charge_density, dim_grid, threads_per_block,
(particle_data_gpu, ek_lbparameters_gpu));
particle_data_gpu, ek_lbparameters_gpu);
}
electrostatics->calculatePotential();
@@ -2265,25 +2265,25 @@ void ek_integrate() {
(in ek_calculate_quantities / ek_displacement), which is copied in this
routine */
// KERNELCALL( ek_clear_node_force, dim_grid, threads_per_block, ( node_f ) );
// KERNELCALL( ek_clear_node_force, dim_grid, threads_per_block, node_f );
/* Integrate diffusion-advection */
for (int i = 0; i < ek_parameters.number_of_species; i++) {
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block, ());
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
KERNELCALL(
ek_calculate_quantities, dim_grid, threads_per_block,
(i, *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values));
i, *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values);
#ifdef EK_BOUNDARIES
if (ek_parameters.stencil == 1) {
KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
(i, *current_nodes, node_f));
i, *current_nodes, node_f);
}
#endif
KERNELCALL(ek_propagate_densities, dim_grid, threads_per_block, (i));
KERNELCALL(ek_propagate_densities, dim_grid, threads_per_block, i);
}
/* Integrate electrostatics */
@@ -2305,10 +2305,9 @@ void ek_init_species_density_wallcharge(ekfloat *wallcharge_species_density,
(threads_per_block * blocks_per_grid_y);
dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_init_species_density_homogeneous, dim_grid, threads_per_block,
());
KERNELCALL(ek_init_species_density_homogeneous, dim_grid, threads_per_block);
KERNELCALL(ek_clear_boundary_densities, dim_grid, threads_per_block,
(*current_nodes));
*current_nodes);
if (wallcharge_species != -1) {
cuda_safe_mem(hipMemcpy(ek_parameters.rho[wallcharge_species],
@@ -2347,6 +2346,19 @@ void ek_init_species(int species) {
}
}
__global__ void copy_ek_parameters_pointer(EK_parameters** ptr) {
*ptr = &ek_parameters_gpu[0];
}
static void fetch_ek_parameters_pointer() {
EK_parameters** ptr_gpu;
cuda_safe_mem(hipMalloc((void **)&ptr_gpu, sizeof(EK_parameters*)));
KERNELCALL(copy_ek_parameters_pointer, 1, 1, ptr_gpu);
cuda_safe_mem(hipMemcpy(ek_parameters_gpu_pointer, ptr_gpu,
sizeof(EK_parameters*), hipMemcpyDeviceToHost));
hipFree(ptr_gpu);
}
int ek_init() {
if (ek_parameters.agrid < 0.0 || ek_parameters.viscosity < 0.0 ||
ek_parameters.T < 0.0 || ek_parameters.prefactor < 0.0) {
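The `fetch_ek_parameters_pointer` helper added above replaces the `cudaGetSymbolAddress` lookup removed in the next hunk: a one-thread kernel writes the device-side address of the `ek_parameters_gpu` symbol into a temporary buffer, and the host copies it back, trading a CUDA-specific runtime call for a tiny launch plus memcpy that compile under both toolchains. A self-contained sketch of the same pattern, with a hypothetical symbol:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Hypothetical device-resident table standing in for ek_parameters_gpu.
__device__ float device_table[4];

__global__ void publish_table_address(float **out) { *out = &device_table[0]; }

int main() {
  float **addr_gpu, *table_addr = nullptr;
  hipMalloc((void **)&addr_gpu, sizeof(float *));
  // A one-thread kernel writes the symbol's device address...
  hipLaunchKernelGGL(publish_table_address, dim3(1), dim3(1), 0, 0, addr_gpu);
  // ...and the host reads it back for use in later device-memory accesses.
  hipMemcpy(&table_addr, addr_gpu, sizeof(float *), hipMemcpyDeviceToHost);
  hipFree(addr_gpu);
  printf("device_table is at %p on the device\n", (void *)table_addr);
  return 0;
}
```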
@@ -2362,12 +2374,7 @@ int ek_init() {
dim3 dim_grid;
if (!ek_initialized) {
if (cudaGetSymbolAddress((void **)&ek_parameters_gpu_pointer,
ek_parameters_gpu) != cudaSuccess) {
fprintf(stderr, "ERROR: Fetching constant memory pointer\n");
return 1;
}
fetch_ek_parameters_pointer();
for (int i = 0; i < MAX_NUMBER_OF_SPECIES; i++) {
ek_parameters.species_index[i] = -1;
@@ -2492,7 +2499,7 @@ int ek_init() {
threads_per_block * blocks_per_grid_y - 1) /
(threads_per_block * blocks_per_grid_y);
dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_clear_node_force, dim_grid, threads_per_block, (node_f));
KERNELCALL(ek_clear_node_force, dim_grid, threads_per_block, node_f);
ek_initialized = true;
} else {
@@ -2523,7 +2530,7 @@ int ek_init() {
dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_init_species_density_homogeneous, dim_grid,
threads_per_block, ());
threads_per_block);
#endif
ek_integrate_electrostatics();
@@ -2781,15 +2788,15 @@ int ek_node_print_flux(int species, int x, int y, int z, double *flux) {
(threads_per_block * blocks_per_grid_y);
dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block, ());
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
(ek_parameters.species_index[species], *current_nodes, node_f,
ek_lbparameters_gpu, ek_lb_device_values));
ek_parameters.species_index[species], *current_nodes, node_f,
ek_lbparameters_gpu, ek_lb_device_values);
reset_LB_force_densities_GPU(false);
#ifdef EK_BOUNDARIES
KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
(ek_parameters.species_index[species], *current_nodes, node_f));
ek_parameters.species_index[species], *current_nodes, node_f);
#endif
cuda_safe_mem(
@@ -3008,15 +3015,15 @@ int ek_print_vtk_flux(int species, char *filename) {
(threads_per_block * blocks_per_grid_y);
dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block, ());
KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
(ek_parameters.species_index[species], *current_nodes, node_f,
ek_lbparameters_gpu, ek_lb_device_values));
ek_parameters.species_index[species], *current_nodes, node_f,
ek_lbparameters_gpu, ek_lb_device_values);
reset_LB_force_densities_GPU(false);
#ifdef EK_BOUNDARIES
KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
(ek_parameters.species_index[species], *current_nodes, node_f));
ek_parameters.species_index[species], *current_nodes, node_f);
#endif
cuda_safe_mem(
@@ -3698,7 +3705,7 @@ ekfloat ek_calculate_net_charge() {
(threads_per_block * blocks_per_grid_y);
dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
KERNELCALL(ek_calculate_system_charge, dim_grid, threads_per_block, (charge_gpu));