Commit d643d10b authored by Michael Kuron's avatar Michael Kuron 💡

Remove HIP dependency on Nvidia platform

parent f13769b9
Pipeline #3616 failed with stages
in 80 minutes and 4 seconds
[submodule "libs/h5xx"]
path = libs/h5xx
url = https://github.com/h5md/h5xx.git
[submodule "libs/HIP"]
path = libs/HIP
url = https://github.com/ROCm-Developer-Tools/HIP
[submodule "libs/rocFFT"]
path = libs/rocFFT
url = https://github.com/ROCmSoftwarePlatform/rocFFT
......@@ -294,26 +294,6 @@ if(WITH_HDF5 AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
endif()
endif()
# Check for the HIP submodule
# and try to check it out if not found or update it if found.
if(CUDA AND NOT HIP_HIPCC_EXECUTABLE AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
# Try to find git
find_package(Git)
if(GIT_FOUND)
foreach(sm IN ITEMS HIP rocFFT)
if (NOT EXISTS "${CMAKE_SOURCE_DIR}/libs/HIP/.git")
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init -- libs/${sm}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
else()
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update -- libs/${sm}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
endif()
endforeach(sm)
endif()
include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/libs/HIP/include")
include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/libs/rocFFT/library/include")
endif()
if(WITH_SCAFACOS)
find_package(PkgConfig)
pkg_check_modules(SCAFACOS scafacos)
......
Subproject commit 473fc17d0a546abebf9488e3fab53716f9c4b89d
Subproject commit 07e8086949849f42c0b2912e616c6f6c0311a5d4
......@@ -17,7 +17,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "EspressoSystemInterface.hpp"
#include "cuda_init.hpp"
......@@ -132,36 +132,36 @@ __global__ void split_kernel_quatu(CUDA_particle_data *particles, float *quatu,
void EspressoSystemInterface::reallocDeviceMemory(int n) {
if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == 0))) {
if (m_r_gpu_begin != 0)
cuda_safe_mem(hipFree(m_r_gpu_begin));
cuda_safe_mem(hipMalloc(&m_r_gpu_begin, 3 * n * sizeof(float)));
cuda_safe_mem(cudaFree(m_r_gpu_begin));
cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3 * n * sizeof(float)));
m_r_gpu_end = m_r_gpu_begin + 3 * n;
}
#ifdef DIPOLES
if (m_needsDipGpu && ((n != m_gpu_npart) || (m_dip_gpu_begin == 0))) {
if (m_dip_gpu_begin != 0)
cuda_safe_mem(hipFree(m_dip_gpu_begin));
cuda_safe_mem(hipMalloc(&m_dip_gpu_begin, 3 * n * sizeof(float)));
cuda_safe_mem(cudaFree(m_dip_gpu_begin));
cuda_safe_mem(cudaMalloc(&m_dip_gpu_begin, 3 * n * sizeof(float)));
m_dip_gpu_end = m_dip_gpu_begin + 3 * n;
}
#endif
if (m_needsVGpu && ((n != m_gpu_npart) || (m_v_gpu_begin == 0))) {
if (m_v_gpu_begin != 0)
cuda_safe_mem(hipFree(m_v_gpu_begin));
cuda_safe_mem(hipMalloc(&m_v_gpu_begin, 3 * n * sizeof(float)));
cuda_safe_mem(cudaFree(m_v_gpu_begin));
cuda_safe_mem(cudaMalloc(&m_v_gpu_begin, 3 * n * sizeof(float)));
m_v_gpu_end = m_v_gpu_begin + 3 * n;
}
if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == 0))) {
if (m_q_gpu_begin != 0)
cuda_safe_mem(hipFree(m_q_gpu_begin));
cuda_safe_mem(hipMalloc(&m_q_gpu_begin, 3 * n * sizeof(float)));
cuda_safe_mem(cudaFree(m_q_gpu_begin));
cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float)));
m_q_gpu_end = m_q_gpu_begin + 3 * n;
}
if (m_needsQuatuGpu && ((n != m_gpu_npart) || (m_quatu_gpu_begin == 0))) {
if (m_quatu_gpu_begin != 0)
cuda_safe_mem(hipFree(m_quatu_gpu_begin));
cuda_safe_mem(hipMalloc(&m_quatu_gpu_begin, 3 * n * sizeof(float)));
cuda_safe_mem(cudaFree(m_quatu_gpu_begin));
cuda_safe_mem(cudaMalloc(&m_quatu_gpu_begin, 3 * n * sizeof(float)));
m_quatu_gpu_end = m_quatu_gpu_begin + 3 * n;
}
......
......@@ -18,7 +18,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "../cuda_init.hpp"
#include "../cuda_utils.hpp"
......@@ -995,7 +995,6 @@ void initBHgpu(int blocks) {
KERNELCALL(initializationKernel, grid, block);
#ifdef __CUDACC__
// According to the experimental performance optimization:
cudaFuncSetCacheConfig(boundingBoxKernel, cudaFuncCachePreferShared);
cudaFuncSetCacheConfig(treeBuildingKernel, cudaFuncCachePreferL1);
......@@ -1003,9 +1002,8 @@ void initBHgpu(int blocks) {
cudaFuncSetCacheConfig(sortKernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(forceCalculationKernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(energyCalculationKernel, cudaFuncCachePreferL1);
#endif
hipGetLastError(); // reset error value
cudaGetLastError(); // reset error value
}
// Building Barnes-Hut spatial min/max position box
......@@ -1016,9 +1014,9 @@ void buildBoxBH(int blocks) {
grid.x = blocks * FACTOR1;
block.x = THREADS1;
hipDeviceSynchronize();
cudaThreadSynchronize();
KERNELCALL(boundingBoxKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
}
// Building Barnes-Hut tree in a linear childd array representation
......@@ -1031,7 +1029,7 @@ void buildTreeBH(int blocks) {
block.x = THREADS2;
KERNELCALL(treeBuildingKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
}
// Calculate octant cells masses and cell index counts.
......@@ -1045,7 +1043,7 @@ void summarizeBH(int blocks) {
block.x = THREADS3;
KERNELCALL(summarizationKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
}
// Sort particle indexes according to the BH tree representation.
......@@ -1059,7 +1057,7 @@ void sortBH(int blocks) {
block.x = THREADS4;
KERNELCALL(sortKernel, grid, block);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
}
// Force calculation.
......@@ -1072,10 +1070,10 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
block.x = THREADS5;
KERNELCALL(forceCalculationKernel, grid, block, k, f, torque);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
cuda_safe_mem(hipMemcpy(&error_code, bh_data->err, sizeof(int),
hipMemcpyDeviceToHost));
cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
cudaMemcpyDeviceToHost));
return error_code;
}
......@@ -1089,32 +1087,32 @@ int energyBH(BHData *bh_data, dds_float k, float *E) {
block.x = THREADS5;
dds_float *energySum;
cuda_safe_mem(hipMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
// cleanup the memory for the energy sum
cuda_safe_mem(hipMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
KERNELCALL_shared(energyCalculationKernel, grid, block,
block.x * sizeof(dds_float), k, energySum);
cuda_safe_mem(hipDeviceSynchronize());
cuda_safe_mem(cudaThreadSynchronize());
// Sum the results of all blocks
// One energy part per block in the prev kernel
thrust::device_ptr<dds_float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(hipMemcpy(E, &x, sizeof(float), hipMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(hipFree(energySum));
cuda_safe_mem(hipMemcpy(&error_code, bh_data->err, sizeof(int),
hipMemcpyDeviceToHost));
cuda_safe_mem(cudaFree(energySum));
cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
cudaMemcpyDeviceToHost));
return error_code;
}
// Function to set the BH method parameters.
void setBHPrecision(float *epssq, float *itolsq) {
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(epssqd), epssq, sizeof(float), 0,
hipMemcpyHostToDevice));
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(itolsqd), itolsq, sizeof(float), 0,
hipMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(epssqd), epssq, sizeof(float), 0,
cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(itolsqd), itolsq, sizeof(float), 0,
cudaMemcpyHostToDevice));
}
// An allocation of the GPU device memory and an initialization where it is
......@@ -1139,33 +1137,33 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) {
bh_data->nnodes--;
if (bh_data->err != 0)
cuda_safe_mem(hipFree(bh_data->err));
cuda_safe_mem(hipMalloc((void **)&(bh_data->err), sizeof(int)));
cuda_safe_mem(cudaFree(bh_data->err));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->err), sizeof(int)));
if (bh_data->child != 0)
cuda_safe_mem(hipFree(bh_data->child));
cuda_safe_mem(hipMalloc((void **)&(bh_data->child),
cuda_safe_mem(cudaFree(bh_data->child));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->child),
sizeof(int) * (bh_data->nnodes + 1) * 8));
if (bh_data->count != 0)
cuda_safe_mem(hipFree(bh_data->count));
cuda_safe_mem(hipMalloc((void **)&(bh_data->count),
cuda_safe_mem(cudaFree(bh_data->count));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->count),
sizeof(int) * (bh_data->nnodes + 1)));
if (bh_data->start != 0)
cuda_safe_mem(hipFree(bh_data->start));
cuda_safe_mem(hipMalloc((void **)&(bh_data->start),
cuda_safe_mem(cudaFree(bh_data->start));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->start),
sizeof(int) * (bh_data->nnodes + 1)));
if (bh_data->sort != 0)
cuda_safe_mem(hipFree(bh_data->sort));
cuda_safe_mem(hipMalloc((void **)&(bh_data->sort),
cuda_safe_mem(cudaFree(bh_data->sort));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->sort),
sizeof(int) * (bh_data->nnodes + 1)));
// Weight coefficients of m_bhnnodes nodes: both particles and octant cells
if (bh_data->mass != 0)
cuda_safe_mem(hipFree(bh_data->mass));
cuda_safe_mem(hipMalloc((void **)&(bh_data->mass),
cuda_safe_mem(cudaFree(bh_data->mass));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->mass),
sizeof(float) * (bh_data->nnodes + 1)));
// n particles have unitary weight coefficients.
......@@ -1174,47 +1172,47 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) {
for (int i = 0; i < bh_data->nbodies; i++) {
mass_tmp[i] = 1.0f;
}
cuda_safe_mem(hipMemcpy(bh_data->mass, mass_tmp,
cuda_safe_mem(cudaMemcpy(bh_data->mass, mass_tmp,
sizeof(float) * bh_data->nbodies,
hipMemcpyHostToDevice));
cudaMemcpyHostToDevice));
delete[] mass_tmp;
// (max[3*i], max[3*i+1], max[3*i+2])
// are the octree box dynamical spatial constraints
// this array is updating per each block at each interaction calculation
// within the boundingBoxKernel
if (bh_data->maxp != 0)
cuda_safe_mem(hipFree(bh_data->maxp));
cuda_safe_mem(hipMalloc((void **)&(bh_data->maxp),
cuda_safe_mem(cudaFree(bh_data->maxp));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->maxp),
sizeof(float) * bh_data->blocks * FACTOR1 * 3));
// (min[3*i], min[3*i+1], min[3*i+2])
// are the octree box dynamical spatial constraints
// this array is updating per each block at each interaction calculation
// within the boundingBoxKernel
if (bh_data->minp != 0)
cuda_safe_mem(hipFree(bh_data->minp));
cuda_safe_mem(hipMalloc((void **)&(bh_data->minp),
cuda_safe_mem(cudaFree(bh_data->minp));
cuda_safe_mem(cudaMalloc((void **)&(bh_data->minp),
sizeof(float) * bh_data->blocks * FACTOR1 * 3));
if (bh_data->r != 0)
cuda_safe_mem(hipFree(bh_data->r));
cuda_safe_mem(cudaFree(bh_data->r));
cuda_safe_mem(
hipMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float)));
cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float)));
if (bh_data->u != 0)
cuda_safe_mem(hipFree(bh_data->u));
cuda_safe_mem(cudaFree(bh_data->u));
cuda_safe_mem(
hipMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float)));
cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float)));
}
// Populating of array pointers allocated in GPU device before.
// Copy the particle data to the Barnes-Hut related arrays.
void fillConstantPointers(float *r, float *dip, BHData bh_data) {
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(para), &bh_data, sizeof(BHData), 0,
hipMemcpyHostToDevice));
cuda_safe_mem(hipMemcpy(bh_data.r, r, 3 * bh_data.nbodies * sizeof(float),
hipMemcpyDeviceToDevice));
cuda_safe_mem(hipMemcpy(bh_data.u, dip, 3 * bh_data.nbodies * sizeof(float),
hipMemcpyDeviceToDevice));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(para), &bh_data, sizeof(BHData), 0,
cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(bh_data.r, r, 3 * bh_data.nbodies * sizeof(float),
cudaMemcpyDeviceToDevice));
cuda_safe_mem(cudaMemcpy(bh_data.u, dip, 3 * bh_data.nbodies * sizeof(float),
cudaMemcpyDeviceToDevice));
}
#endif // BARNES_HUT
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "config.hpp"
#include <thrust/device_ptr.h>
......@@ -297,18 +297,18 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
dds_float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(hipMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(hipMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(hipMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float),
hipMemcpyHostToDevice));
cuda_safe_mem(hipMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
hipMemcpyHostToDevice));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float),
cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));
// printf("box_l: %f %f %f\n",box_l[0],box_l[1],box_l[2]);
KERNELCALL(DipolarDirectSum_kernel_force, grid, block,
k, n, pos, dip, f, torque, box_l_gpu, periodic_gpu);
hipFree(box_l_gpu);
hipFree(periodic_gpu);
cudaFree(box_l_gpu);
cudaFree(periodic_gpu);
}
void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
......@@ -332,15 +332,15 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
dds_float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(hipMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(hipMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(
hipMemcpy(box_l_gpu, box_l, 3 * sizeof(float), hipMemcpyHostToDevice));
cuda_safe_mem(hipMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
hipMemcpyHostToDevice));
cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));
dds_float *energySum;
cuda_safe_mem(hipMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
// printf("box_l: %f %f %f\n",box_l[0],box_l[1],box_l[2]);
......@@ -355,11 +355,11 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
// KERNELCALL(sumKernel,1,1,energySum,block.x,E);
thrust::device_ptr<dds_float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(hipMemcpy(E, &x, sizeof(float), hipMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(hipFree(energySum));
cuda_safe_mem(hipFree(box_l_gpu));
cuda_safe_mem(hipFree(periodic_gpu));
cuda_safe_mem(cudaFree(energySum));
cuda_safe_mem(cudaFree(box_l_gpu));
cuda_safe_mem(cudaFree(periodic_gpu));
}
#endif
......@@ -17,7 +17,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "HarmonicOrientationWell.hpp"
......
......@@ -17,7 +17,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "HarmonicWell.hpp"
......
......@@ -17,7 +17,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "actor/Mmm1dgpuForce.hpp"
#include "cuda_utils.hpp"
......@@ -31,7 +31,7 @@
// the code is mostly multi-GPU capable, but Espresso is not yet
const int deviceCount = 1;
float multigpu_factors[] = {1.0};
#define hipSetDevice(d)
#define cudaSetDevice(d)
#include "EspressoSystemInterface.hpp"
#include "electrostatics_magnetostatics/mmm1d.hpp"
......@@ -101,10 +101,10 @@ void Mmm1dgpuForce::setup(SystemInterface &s) {
// do the latter.
pairs = 2;
for (int d = 0; d < deviceCount; d++) {
hipSetDevice(d);
cudaSetDevice(d);
size_t freeMem, totalMem;
hipMemGetInfo(&freeMem, &totalMem);
cudaMemGetInfo(&freeMem, &totalMem);
if (freeMem / 2 <
3 * s.npart_gpu() * s.npart_gpu() *
sizeof(
......@@ -117,16 +117,16 @@ void Mmm1dgpuForce::setup(SystemInterface &s) {
}
}
if (dev_forcePairs)
hipFree(dev_forcePairs);
cudaFree(dev_forcePairs);
if (pairs) // we need memory to store force pairs
{
cuda_safe_mem(
hipMalloc((void **)&dev_forcePairs,
cudaMalloc((void **)&dev_forcePairs,
3 * s.npart_gpu() * s.npart_gpu() * sizeof(mmm1dgpu_real)));
}
if (dev_energyBlocks)
hipFree(dev_energyBlocks);
cuda_safe_mem(hipMalloc((void **)&dev_energyBlocks,
cudaFree(dev_energyBlocks);
cuda_safe_mem(cudaMalloc((void **)&dev_energyBlocks,
numBlocks(s) * sizeof(mmm1dgpu_real)));
host_npart = s.npart_gpu();
}
......@@ -140,7 +140,7 @@ unsigned int Mmm1dgpuForce::numBlocks(SystemInterface &s) {
Mmm1dgpuForce::~Mmm1dgpuForce() {
modpsi_destroy();
hipFree(dev_forcePairs);
cudaFree(dev_forcePairs);
}
__forceinline__ __device__ mmm1dgpu_real sqpow(mmm1dgpu_real x) {
......@@ -231,11 +231,11 @@ void Mmm1dgpuForce::tune(SystemInterface &s, mmm1dgpu_real _maxPWerror,
{
int *dev_cutoff;
int maxCut = 30;
cuda_safe_mem(hipMalloc((void **)&dev_cutoff, sizeof(int)));
cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int)));
hipLaunchKernelGGL(besselTuneKernel, dim3(1), dim3(1), 0, 0, dev_cutoff, far_switch_radius, maxCut);
cuda_safe_mem(hipMemcpy(&bessel_cutoff, dev_cutoff, sizeof(int),
hipMemcpyDeviceToHost));
hipFree(dev_cutoff);
cuda_safe_mem(cudaMemcpy(&bessel_cutoff, dev_cutoff, sizeof(int),
cudaMemcpyDeviceToHost));
cudaFree(dev_cutoff);
if (_bessel_cutoff != -2 &&
bessel_cutoff >=
maxCut) // we already have our switching radius and only need to
......@@ -265,7 +265,7 @@ void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz,
for (int d = 0; d < deviceCount; d++) {
// double colons are needed to access the constant memory variables because
// they are file globals and we have identically named class variables
hipSetDevice(d);
cudaSetDevice(d);
if (manual) // tuning needs to be performed again
{
far_switch_radius = _far_switch_radius;
......@@ -274,28 +274,28 @@ void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz,
if (_far_switch_radius >= 0) {
mmm1d_params.far_switch_radius_2 =
_far_switch_radius * _far_switch_radius;
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(::far_switch_radius_2), &_far_switch_radius_2, sizeof(mmm1dgpu_real)));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(::far_switch_radius_2), &_far_switch_radius_2, sizeof(mmm1dgpu_real)));
far_switch_radius = _far_switch_radius;
}
if (_boxz > 0) {
host_boxz = _boxz;
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(::boxz), &_boxz, sizeof(mmm1dgpu_real)));
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(::uz), &_uz, sizeof(mmm1dgpu_real)));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(::boxz), &_boxz, sizeof(mmm1dgpu_real)));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(::uz), &_uz, sizeof(mmm1dgpu_real)));
}
if (_coulomb_prefactor != 0) {
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(::coulomb_prefactor), &_coulomb_prefactor,
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(::coulomb_prefactor), &_coulomb_prefactor,
sizeof(mmm1dgpu_real)));
coulomb_prefactor = _coulomb_prefactor;
}
if (_bessel_cutoff > 0) {
mmm1d_params.bessel_cutoff = _bessel_cutoff;
cuda_safe_mem(
hipMemcpyToSymbol(HIP_SYMBOL(::bessel_cutoff), &_bessel_cutoff, sizeof(int)));
cudaMemcpyToSymbol(HIP_SYMBOL(::bessel_cutoff), &_bessel_cutoff, sizeof(int)));
bessel_cutoff = _bessel_cutoff;
}
if (_maxPWerror > 0) {
mmm1d_params.maxPWerror = _maxPWerror;
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(::maxPWerror), &_maxPWerror,
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(::maxPWerror), &_maxPWerror,
sizeof(mmm1dgpu_real)));
maxPWerror = _maxPWerror;
}
......@@ -553,23 +553,23 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) {
}
float Mmm1dgpuForce::force_benchmark(SystemInterface &s) {
hipEvent_t eventStart, eventStop;
cudaEvent_t eventStart, eventStop;
float elapsedTime;
mmm1dgpu_real *dev_f_benchmark;
cuda_safe_mem(hipMalloc((void **)&dev_f_benchmark,
cuda_safe_mem(cudaMalloc((void **)&dev_f_benchmark,
3 * s.npart_gpu() * sizeof(mmm1dgpu_real)));
cuda_safe_mem(hipEventCreate(&eventStart));
cuda_safe_mem(hipEventCreate(&eventStop));
cuda_safe_mem(hipEventRecord(eventStart, stream[0]));
cuda_safe_mem(cudaEventCreate(&eventStart));
cuda_safe_mem(cudaEventCreate(&eventStop));
cuda_safe_mem(cudaEventRecord(eventStart, stream[0]));
KERNELCALL(forcesKernel, numBlocks(s), numThreads,
s.rGpuBegin(), s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0)
cuda_safe_mem(hipEventRecord(eventStop, stream[0]));
cuda_safe_mem(hipEventSynchronize(eventStop));
cuda_safe_mem(hipEventElapsedTime(&elapsedTime, eventStart, eventStop));
cuda_safe_mem(hipEventDestroy(eventStart));
cuda_safe_mem(hipEventDestroy(eventStop));
cuda_safe_mem(hipFree(dev_f_benchmark));
cuda_safe_mem(cudaEventRecord(eventStop, stream[0]));
cuda_safe_mem(cudaEventSynchronize(eventStop));
cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop));
cuda_safe_mem(cudaEventDestroy(eventStart));
cuda_safe_mem(cudaEventDestroy(eventStop));
cuda_safe_mem(cudaFree(dev_f_benchmark));
return elapsedTime;
}
......
......@@ -67,7 +67,7 @@ int modpsi_init() {
}
for (int d = 0; d < deviceCount; d++) {
hipSetDevice(d);
cudaSetDevice(d);
// copy to GPU
int linModPsiSize = linModPsi_offsets[2 * n_modPsi - 1] +
......@@ -76,15 +76,15 @@ int modpsi_init() {
printf("ERROR: __constant__ device_linModPsi[] is not large enough\n");
exit(EXIT_FAILURE);
}
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(device_linModPsi_offsets),
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(device_linModPsi_offsets),
linModPsi_offsets,
2 * n_modPsi * sizeof(int)));
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(device_linModPsi_lengths),
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(device_linModPsi_lengths),
linModPsi_lengths,
2 * n_modPsi * sizeof(int)));
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(device_linModPsi), linModPsi,
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(device_linModPsi), linModPsi,
linModPsiSize * sizeof(mmm1dgpu_real)));
cuda_safe_mem(hipMemcpyToSymbol(HIP_SYMBOL(device_n_modPsi), &n_modPsi, sizeof(int)));
cuda_safe_mem(cudaMemcpyToSymbol(HIP_SYMBOL(device_n_modPsi), &n_modPsi, sizeof(int)));
}
return 0;
......
This diff is collapsed.
......@@ -17,7 +17,7 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <hip/hip_runtime.h>
#include "cuda_wrapper.hpp"
#include "cuda_init.hpp"
#include "cuda_utils.hpp"
......@@ -38,24 +38,24 @@ static const int computeCapabilityMinMinor = 1;
const char *cuda_error;
void cuda_init() { hipStreamCreate(&stream[0]); }
void cuda_init() { cudaStreamCreate(&stream[0]); }
/// get the number of CUDA devices.
int cuda_get_n_gpus() {
int deviceCount;
hipError_t error = hipGetDeviceCount(&deviceCount);
if (error != hipSuccess) {
cuda_error = hipGetErrorString(error);
cudaError_t error = cudaGetDeviceCount(&deviceCount);
if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
return -1;
}
return deviceCount;
}
int cuda_check_gpu(int dev) {
hipDeviceProp_t deviceProp;
hipError_t error = hipGetDeviceProperties(&deviceProp, dev);
if (error != hipSuccess) {
cuda_error = hipGetErrorString(error);
cudaDeviceProp deviceProp;
cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
return ES_ERROR;
}
if (deviceProp.major < computeCapabilityMinMajor ||
......@@ -68,10 +68,10 @@ int cuda_check_gpu(int dev) {
}
void cuda_get_gpu_name(int dev, char name[64]) {
hipDeviceProp_t deviceProp;
hipError_t error = hipGetDeviceProperties(&deviceProp, dev);
if (error != hipSuccess) {
cuda_error = hipGetErrorString(error);
cudaDeviceProp deviceProp;
cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
strcpy(name, "no GPU");
return;
}
......@@ -80,10 +80,10 @@ void cuda_get_gpu_name(int dev, char name[64]) {
}
int cuda_get_device_props(const int dev, EspressoGpuDevice &d) {
hipDeviceProp_t deviceProp;
hipError_t error = hipGetDeviceProperties(&deviceProp, dev);
if (error != hipSuccess) {
cuda_error = hipGetErrorString(error);
cudaDeviceProp deviceProp;
cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
return ES_ERROR;
}
strncpy(d.name, deviceProp.name, 64);
......@@ -98,12 +98,12 @@ int cuda_get_device_props(const int dev, EspressoGpuDevice &d) {
}
int cuda_set_device(int dev) {
hipSetDevice(dev);
hipStreamDestroy(stream[0]);
hipError_t error = hipStreamCreate(&stream[0]);
cudaSetDevice(dev);
cudaStreamDestroy(stream[0]);
cudaError_t error = cudaStreamCreate(&stream[0]);
if (error != hipSuccess) {
cuda_error = hipGetErrorString(error);
if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
throw std::runtime_error(cuda_error);
}
......@@ -112,9 +112,9 @@ int cuda_set_device(int dev) {
int cuda_get_device() {
int dev;