/*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software.  Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <helper_cuda.h>
#include <math.h>

#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif

// CUDA standard includes
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

#include "bodysystem.h"

__constant__ float  softeningSquared;
__constant__ double softeningSquared_fp64;

cudaError_t setSofteningSquared(float softeningSq)
{
    return cudaMemcpyToSymbol(softeningSquared,
                              &softeningSq,
                              sizeof(float), 0,
                              cudaMemcpyHostToDevice);
}

cudaError_t setSofteningSquared(double softeningSq)
{
    return cudaMemcpyToSymbol(softeningSquared_fp64,
                              &softeningSq,
                              sizeof(double), 0,
                              cudaMemcpyHostToDevice);
}

template<class T>
struct SharedMemory
{
    __device__ inline operator T *()
    {
        extern __shared__ int __smem[];
        return (T *)__smem;
    }

    __device__ inline operator const T *() const
    {
        extern __shared__ int __smem[];
        return (T *)__smem;
    }
};

template<typename T>
__device__ T rsqrt_T(T x)
{
    return rsqrt(x);
}

template<>
__device__ float rsqrt_T<float>(float x)
{
    return rsqrtf(x);
}

template<>
__device__ double rsqrt_T<double>(double x)
{
    return rsqrt(x);
}


// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]
// This macro is only used when multithreadBodies is true (below)
#define SX_SUM(i,j) sharedPos[i+blockDim.x*j]

template <typename T>
__device__ T getSofteningSquared()
{
    return softeningSquared;
}

template <>
__device__ double getSofteningSquared<double>()
{
    return softeningSquared_fp64;
}

template <typename T>
struct DeviceData
{
    T *dPos[2]; // mapped host pointers
    T *dVel;
    cudaEvent_t  event;
    unsigned int offset;
    unsigned int numBodies;
};

template <typename T>
__device__ typename vec3<T>::Type
bodyBodyInteraction(typename vec3<T>::Type ai,
                    typename vec4<T>::Type bi,
                    typename vec4<T>::Type bj)
{
    typename vec3<T>::Type r;

    // r_ij  [3 FLOPS]
    r.x = bj.x - bi.x;
    r.y = bj.y - bi.y;
    r.z = bj.z - bi.z;

    // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
    T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
    distSqr += getSofteningSquared<T>();

    // invDistCube = 1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
    T invDist = rsqrt_T(distSqr);
    T invDistCube = invDist * invDist * invDist;

    // s = m_j * invDistCube [1 FLOP]
    T s = bj.w * invDistCube;

    // a_i = a_i + s * r_ij [6 FLOPS]
    ai.x += r.x * s;
    ai.y += r.y * s;
    ai.z += r.z * s;

    return ai;
}

template <typename T>
__device__ typename vec3<T>::Type
computeBodyAccel(typename vec4<T>::Type bodyPos,
                 typename vec4<T>::Type *positions,
                 int numTiles)
{
    typename vec4<T>::Type *sharedPos = SharedMemory<typename vec4<T>::Type>();

    typename vec3<T>::Type acc = {0.0f, 0.0f, 0.0f};

    for (int tile = 0; tile < numTiles; tile++)
    {
        sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];

        __syncthreads();

        // This is the "tile_calculation" from the GPUG3 article.
#pragma unroll 128

        for (unsigned int counter = 0; counter < blockDim.x; counter++)
        {
            acc = bodyBodyInteraction<T>(acc, bodyPos, sharedPos[counter]);
        }

        __syncthreads();
    }

    return acc;
}

template<typename T>
__global__ void
integrateBodies(typename vec4<T>::Type *__restrict__ newPos,
                typename vec4<T>::Type *__restrict__ oldPos,
                typename vec4<T>::Type *vel,
                unsigned int deviceOffset, unsigned int deviceNumBodies,
                float deltaTime, float damping, int numTiles)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index >= deviceNumBodies)
    {
        return;
    }

    typename vec4<T>::Type position = oldPos[deviceOffset + index];

    typename vec3<T>::Type accel = computeBodyAccel<T>(position, oldPos, numTiles);

    // acceleration = force / mass;
    // new velocity = old velocity + acceleration * deltaTime
    // note we factor out the body's mass from the equation, here and in bodyBodyInteraction
    // (because they cancel out).  Thus here force == acceleration
    typename vec4<T>::Type velocity = vel[deviceOffset + index];

    velocity.x += accel.x * deltaTime;
    velocity.y += accel.y * deltaTime;
    velocity.z += accel.z * deltaTime;

    velocity.x *= damping;
    velocity.y *= damping;
    velocity.z *= damping;

    // new position = old position + velocity * deltaTime
    position.x += velocity.x * deltaTime;
    position.y += velocity.y * deltaTime;
    position.z += velocity.z * deltaTime;

    // store new position and velocity
    newPos[deviceOffset + index] = position;
    vel[deviceOffset + index]    = velocity;
}

template <typename T>
void integrateNbodySystem(DeviceData<T> *deviceData,
                          cudaGraphicsResource **pgres,
                          unsigned int currentRead,
                          float deltaTime,
                          float damping,
                          unsigned int numBodies,
                          unsigned int numDevices,
                          int blockSize,
                          bool bUsePBO)
{
    if (bUsePBO)
    {
        checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
        checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[1-currentRead], cudaGraphicsMapFlagsWriteDiscard));
        checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0));
        size_t bytes;
        checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&(deviceData[0].dPos[currentRead]), &bytes, pgres[currentRead]));
        checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&(deviceData[0].dPos[1-currentRead]), &bytes, pgres[1-currentRead]));
    }

    for (unsigned int dev = 0; dev != numDevices; dev++)
    {
        if (numDevices > 1)
        {
            cudaSetDevice(dev);
        }

        int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize;
        int numTiles = (numBodies + blockSize - 1) / blockSize;
        int sharedMemSize = blockSize * 4 * sizeof(T); // 4 components of type T (pos + mass) per body

        integrateBodies<T><<< numBlocks, blockSize, sharedMemSize >>>
        ((typename vec4<T>::Type *)deviceData[dev].dPos[1-currentRead],
         (typename vec4<T>::Type *)deviceData[dev].dPos[currentRead],
         (typename vec4<T>::Type *)deviceData[dev].dVel,
         deviceData[dev].offset, deviceData[dev].numBodies,
         deltaTime, damping, numTiles);

        if (numDevices > 1)
        {
            checkCudaErrors(cudaEventRecord(deviceData[dev].event));
            // MJH: Hack on older driver versions to force kernel launches to flush!
            cudaStreamQuery(0);
        }

        // check if kernel invocation generated an error
        getLastCudaError("Kernel execution failed");
    }

    if (numDevices > 1)
    {
        for (unsigned int dev = 0; dev < numDevices; dev++)
        {
            checkCudaErrors(cudaEventSynchronize(deviceData[dev].event));
        }
    }

    if (bUsePBO)
    {
        checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0));
    }
}

// Explicit instantiations needed to generate code for float and double
template void integrateNbodySystem<float>(DeviceData<float> *deviceData,
                                          cudaGraphicsResource **pgres,
                                          unsigned int currentRead,
                                          float deltaTime,
                                          float damping,
                                          unsigned int numBodies,
                                          unsigned int numDevices,
                                          int blockSize,
                                          bool bUsePBO);

template void integrateNbodySystem<double>(DeviceData<double> *deviceData,
                                           cudaGraphicsResource **pgres,
                                           unsigned int currentRead,
                                           float deltaTime,
                                           float damping,
                                           unsigned int numBodies,
                                           unsigned int numDevices,
                                           int blockSize,
                                           bool bUsePBO);
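
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original sample). In the shipped
// nbody sample the host-side BodySystemCUDA class normally owns the
// DeviceData array and the position/velocity buffers; the function below is a
// minimal, hypothetical single-GPU, non-PBO sketch under that assumption:
// allocate the double-buffered positions and the velocities, upload the
// softening term, and ping-pong the buffers across a few integration steps.
// The name exampleSingleGpuSteps and all constants here are illustrative only.
void exampleSingleGpuSteps(unsigned int numBodies, int blockSize)
{
    DeviceData<float> deviceData[1];
    deviceData[0].offset    = 0;
    deviceData[0].numBodies = numBodies;

    // Each body is a 4-component vector: (x, y, z, mass) for position,
    // (vx, vy, vz, unused) for velocity.
    size_t vec4Bytes = numBodies * 4 * sizeof(float);
    checkCudaErrors(cudaMalloc((void **)&deviceData[0].dPos[0], vec4Bytes));
    checkCudaErrors(cudaMalloc((void **)&deviceData[0].dPos[1], vec4Bytes));
    checkCudaErrors(cudaMalloc((void **)&deviceData[0].dVel,    vec4Bytes));
    // ... initial positions/velocities would be copied into dPos[0] and dVel here ...

    checkCudaErrors(setSofteningSquared(0.00125f));

    unsigned int currentRead = 0;

    for (int step = 0; step < 10; step++)
    {
        integrateNbodySystem<float>(deviceData, /*pgres=*/0, currentRead,
                                    /*deltaTime=*/0.016f, /*damping=*/1.0f,
                                    numBodies, /*numDevices=*/1, blockSize,
                                    /*bUsePBO=*/false);
        currentRead = 1 - currentRead; // swap read/write position buffers each step
    }

    checkCudaErrors(cudaFree(deviceData[0].dPos[0]));
    checkCudaErrors(cudaFree(deviceData[0].dPos[1]));
    checkCudaErrors(cudaFree(deviceData[0].dVel));
}
// ---------------------------------------------------------------------------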