// create the DPD pair potential and read the computed forces back on the host
std::shared_ptr<PotentialPairDPD> dpdc(new PP_DPD(sysdef, nlist));
GPUArray<Scalar4>& force_array_1 = dpdc->getForceArray();
ArrayHandle<Scalar4> h_force_1(force_array_1, access_location::host, access_mode::read);
// the same conservative-force test, instantiated with the GPU implementation of the pair potential
dpd_conservative_force_test< PotentialPairGPU<EvaluatorPairDPDThermo, gpu_compute_dpdthermo_forces> >(
    std::shared_ptr<ExecutionConfiguration>(new ExecutionConfiguration(ExecutionConfiguration::GPU)));
ForceCompute.h
//! Get the array of computed forces
GlobalArray<Scalar4>& getForceArray()
{
return m_force;
}
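getForceArray() hands back the per-particle force buffer that the CPU and GPU implementations fill in; the test above reads it on the host through an ArrayHandle. Below is a minimal sketch of that access pattern, assuming HOOMD's usual layout where the x, y, z components of each Scalar4 are the force and the w component carries the per-particle potential energy (fc and N are hypothetical placeholders for a ForceCompute pointer and the particle count):
// sketch: read the computed forces back on the host (fc and N are placeholders)
ArrayHandle<Scalar4> h_force(fc->getForceArray(), access_location::host, access_mode::read);
Scalar3 total = make_scalar3(0, 0, 0);
for (unsigned int i = 0; i < N; i++)
    {
    total.x += h_force.data[i].x;
    total.y += h_force.data[i].y;
    total.z += h_force.data[i].z;
    // by convention .w carries the per-particle potential energy
    }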
HOOMDMath.h
// Handle both single and double precision through a define
#ifdef SINGLE_PRECISION
//! Floating point type (single precision)
typedef float Scalar;
//! Floating point type with x,y elements (single precision)
typedef float2 Scalar2;
//! Floating point type with x,y,z elements (single precision)
typedef float3 Scalar3;
//! Floating point type with x,y,z,w elements (single precision)
typedef float4 Scalar4;
#else
//! Floating point type (double precision)
typedef double Scalar;
//! Floating point type with x,y elements (double precision)
typedef double2 Scalar2;
//! Floating point type with x,y,z elements (double precision)
typedef double3 Scalar3;
//! Floating point type with x,y,z,w elements (double precision)
typedef double4 Scalar4;
#endif
//! make a scalar2 value
HOSTDEVICE inline Scalar2 make_scalar2(Scalar x, Scalar y)
{
Scalar2 retval;
retval.x = x;
retval.y = y;
return retval;
}
//! make a scalar3 value
HOSTDEVICE inline Scalar3 make_scalar3(Scalar x, Scalar y, Scalar z)
{
Scalar3 retval;
retval.x = x;
retval.y = y;
retval.z = z;
return retval;
}
//! make a scalar4 value
HOSTDEVICE inline Scalar4 make_scalar4(Scalar x, Scalar y, Scalar z, Scalar w)
{
Scalar4 retval;
retval.x = x;
retval.y = y;
retval.z = z;
retval.w = w;
return retval;
}
PotentialPairDPDThermoGPU.cuh
gpu_compute_dpd_forces -> DPDForceComputeKernel -> launch
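In other words, the host-side GPU pair potential does not launch the kernel directly: it calls the templated driver gpu_compute_dpd_forces(), which translates the runtime options (shift mode, whether the virial is needed, and whether the neighbor list has to stay in global memory) into template parameters of DPDForceComputeKernel, whose static launch() finally launches gpu_compute_dpd_forces_kernel on the device.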
//! Kernel driver that computes pair DPD thermo forces on the GPU
/*! \param args Additional options
\param d_params Per type-pair parameters for the evaluator
This is just a driver function for gpu_compute_dpd_forces_kernel(), see it for details.
*/
template< class evaluator >
cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args,
const typename evaluator::param_type *d_params)
{
assert(d_params);
assert(args.d_rcutsq);
assert(args.ntypes > 0);
// run the kernel
if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width)
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
else
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
return cudaSuccess;
}
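The nested if/switch above is a common CUDA idiom: the runtime flags are converted once, at the dispatch point, into compile-time template parameters, so every combination of shift_mode, compute_virial and neighbor-list storage gets its own specialized kernel with the unused branches compiled away. A small standalone illustration of the idiom (all names here are invented for the example, not HOOMD code):
#include <cstdio>
// each flag combination becomes its own fully specialized function
template<bool shift, bool virial>
void run_impl()
    {
    std::printf("shift=%d virial=%d\n", shift, virial);
    }
// the runtime flags are turned into template arguments exactly once
void run(bool shift, bool virial)
    {
    if (shift)
        virial ? run_impl<true, true>() : run_impl<true, false>();
    else
        virial ? run_impl<false, true>() : run_impl<false, false>();
    }
int main()
    {
    run(true, false); // prints "shift=1 virial=0"
    return 0;
    }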
PotentialPairDPDThermoGPU.cuh
template<class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp>
struct DPDForceComputeKernel
{
//! Launcher for the DPD force kernel
/*!
* \param args Other arguments to pass onto the kernel
* \param d_params Parameters for the potential, stored per type pair
*/
static void launch(const dpd_pair_args_t& args, const typename evaluator::param_type *d_params)
{
if (tpp == args.threads_per_particle)
{
// setup the grid to run the kernel
unsigned int block_size = args.block_size;
Index2D typpair_idx(args.ntypes);
unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type))
* typpair_idx.getNumElements();
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>);
if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args);
block_size = block_size < max_block_size ? block_size : max_block_size;
dim3 grid(args.N / (block_size/tpp) + 1, 1, 1);
gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>
<<<grid, block_size, shared_bytes>>>
(args.d_force,
args.d_virial,
args.virial_pitch,
args.N,
args.d_pos,
args.d_vel,
args.d_tag,
args.box,
args.d_n_neigh,
args.d_nlist,
args.d_head_list,
d_params,
args.d_rcutsq,
args.seed,
args.timestep,
args.deltaT,
args.T,
args.ntypes);
}
else
{
DPDForceComputeKernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp/2>::launch(args, d_params);
}
}
};
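launch() deals with threads_per_particle the same way: the driver always instantiates the launcher with the maximum value (gpu_dpd_pair_force_max_tpp), and if that does not match the runtime args.threads_per_particle the launcher delegates to the tpp/2 instantiation until it does (presumably a terminating specialization for the smallest tpp stops the recursion, as in the sketch below). Note also that the grid is sized as N / (block_size/tpp) + 1 blocks because tpp threads cooperate on each particle. A self-contained sketch of this recursive dispatch pattern, with invented names:
#include <cstdio>
template<int tpp>
struct Launcher
    {
    static void launch(int requested)
        {
        if (tpp == requested)
            std::printf("launching with %d threads per particle\n", tpp);
        else
            Launcher<tpp / 2>::launch(requested); // try the next smaller power of two
        }
    };
// terminating specialization: ends the compile-time recursion
template<>
struct Launcher<0>
    {
    static void launch(int) { }
    };
int main()
    {
    Launcher<32>::launch(8); // prints "launching with 8 threads per particle"
    return 0;
    }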
// positions
GlobalArray< Scalar4 > pos(N, m_exec_conf);
m_pos.swap(pos);
GlobalArray<Scalar4> m_pos;
ArrayHandle< Scalar4 > h_pos(m_pos, access_location::host, access_mode::readwrite);
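Note that positions are stored as one Scalar4 per particle even though a coordinate only needs three components: the x, y, z fields hold the position, and by HOOMD convention the w field is used to carry the particle type, so both can be fetched in a single 16-byte load.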
GlobalArray.h
//! Definition of GlobalArray using CRTP
template<class T>
class GlobalArray : public GPUArrayBase<T, GlobalArray<T> >
{
public:
//! Empty constructor
GlobalArray()
: m_num_elements(0), m_pitch(0), m_height(0), m_acquired(false), m_align_bytes(0)
{ }
/*! Allocate a 1D array in managed memory
\param num_elements Number of elements in array
\param exec_conf The current execution configuration
*/
GlobalArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf,
const std::string& tag = std::string() )
: m_exec_conf(exec_conf),
#ifndef ALWAYS_USE_MANAGED_MEMORY
// explicit copy should be elided
m_fallback(exec_conf->allConcurrentManagedAccess() ?
GPUArray<T>() : GPUArray<T>(num_elements, exec_conf)),
#endif
m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_tag(tag),
m_align_bytes(0)
{
#ifndef ALWAYS_USE_MANAGED_MEMORY
if (!this->m_exec_conf->allConcurrentManagedAccess())
return;
#endif
assert(this->m_exec_conf);
#ifdef ENABLE_CUDA
if (this->m_exec_conf->isCUDAEnabled())
{
// use OS page size as minimum alignment
m_align_bytes = getpagesize();
}
#endif
if (m_num_elements > 0)
allocate();
}
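Two details of this constructor matter for the rest of the trace: unless the build always uses managed memory, a GlobalArray constructed on a system where the GPUs do not all support concurrent managed access simply falls back to a plain GPUArray (m_fallback) and returns early, skipping the managed-memory path; and when CUDA is enabled the allocation alignment is set to the OS page size, presumably because managed memory is migrated at page granularity.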
GPUArray.h
//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
template<class T, class Derived>
class GPUArrayBase
{
public:
//! Get the number of elements
/*!
- For 1-D allocated GPUArrays, this is the number of elements allocated.
- For 2-D allocated GPUArrays, this is the \b total number of elements (\a pitch * \a height) allocated
*/
unsigned int getNumElements() const
{
return static_cast<Derived const&>(*this).getNumElements();
}
//! Test if the GPUArray is NULL
bool isNull() const
{
return static_cast<Derived const&>(*this).isNull();
}
//! Get the width of the allocated rows in elements
/*!
- For 2-D allocated GPUArrays, this is the total width of a row in memory (including the padding added for coalescing)
- For 1-D allocated GPUArrays, this is the simply the number of elements allocated.
*/
unsigned int getPitch() const
{
return static_cast<Derived const&>(*this).getPitch();
}
//! Get the number of rows allocated
/*!
- For 2-D allocated GPUArrays, this is the height given to the constructor
- For 1-D allocated GPUArrays, this is the simply 1.
*/
unsigned int getHeight() const
{
return static_cast<Derived const&>(*this).getHeight();
}
//! Resize the GPUArray
void resize(unsigned int num_elements)
{
static_cast<Derived&>(*this).resize(num_elements);
}
//! Resize a 2D GPUArray
void resize(unsigned int width, unsigned int height)
{
static_cast<Derived&>(*this).resize(width,height);
}
protected:
//! Acquires the data pointer for use
inline ArrayHandleDispatch<T> acquire(const access_location::Enum location, const access_mode::Enum mode
#ifdef ENABLE_CUDA
, bool async = false
#endif
) const
{
return static_cast<Derived const&>(*this).acquire(location, mode
#ifdef ENABLE_CUDA
, async
#endif
);
}
//! Release the data pointer
inline void release() const
{
return static_cast<Derived const&>(*this).release();
}
//! Returns the acquire state
inline bool isAcquired() const
{
return static_cast<Derived const&>(*this).isAcquired();
}
// need to be friend of the ArrayHandle class
friend class ArrayHandle<T>;
friend class ArrayHandleAsync<T>;
private:
// Make constructor private to prevent mistakes
GPUArrayBase() {};
friend Derived;
};
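GPUArrayBase is a textbook CRTP base: it is templated on the derived class and forwards every call through a static_cast, so GPUArray and GlobalArray can share one interface (and be accepted by the same ArrayHandle) without virtual dispatch; the private default constructor together with friend Derived ensures only the intended derived class can construct it. A minimal standalone illustration of the pattern (not HOOMD code):
#include <cstdio>
template<class Derived>
class ArrayBase
    {
    public:
    // forwards to the derived class without a virtual call
    unsigned int getNumElements() const
        {
        return static_cast<Derived const&>(*this).getNumElements();
        }
    private:
    ArrayBase() { } // only Derived may construct the base
    friend Derived;
    };
class FixedArray : public ArrayBase<FixedArray>
    {
    public:
    unsigned int getNumElements() const { return 42; }
    };
int main()
    {
    FixedArray a;
    const ArrayBase<FixedArray>& base = a;
    std::printf("%u\n", base.getNumElements()); // prints 42
    return 0;
    }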
GPUArray.h
template<class T>
class ArrayHandle
{
public:
template<class Derived>
inline ArrayHandle(const GPUArrayBase<T, Derived>& gpu_array, const access_location::Enum location = access_location::host,
const access_mode::Enum mode = access_mode::readwrite);
//! Notifies the containing GPUArray that the handle has been released
virtual inline ~ArrayHandle() = default;
private:
ArrayHandleDispatch<T> dispatch; //!< Reference to the dispatch object that manages the acquire/release
public:
T* const data; //!< Pointer to data
};
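ArrayHandle is the RAII accessor seen throughout the snippets above (h_force_1, h_pos): constructing it acquires the array at the requested location and access mode through the protected acquire() of GPUArrayBase, data then points at the raw elements, and when the handle is destroyed the dispatch member releases the array again. A short usage sketch, reusing m_pos and N from the position snippet above:
// scope the handle so the array is released as soon as we are done with it
    {
    ArrayHandle<Scalar4> h_pos(m_pos, access_location::host, access_mode::readwrite);
    for (unsigned int i = 0; i < N; i++)
        {
        h_pos.data[i].x += Scalar(1.0); // shift every particle by +1 in x
        }
    } // ~ArrayHandle() releases m_pos here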