.....
//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
//! \tparam T element type stored in the array
//! \tparam Derived the concrete array class (CRTP: static dispatch, no vtable)
template<class T, class Derived>
class GPUArrayBase
{
public:
.....
//! Get the number of rows allocated
/*!
- For 2-D allocated GPUArrays, this is the height given to the constructor
- For 1-D allocated GPUArrays, this is simply 1.
*/
unsigned int getHeight() const
{
return m_height; // NOTE(review): m_height is declared outside this excerpt — presumably set at construction
}
.....
}
.....
namespace hoomd
{
namespace detail
{
template<class T>
//! Deleter functor for device-side allocations, intended for use with smart pointers.
class cuda_deleter
{
public:
//! Default constructor
//! Produces an inert deleter (no execution configuration, device path disabled).
cuda_deleter()
: m_use_device(false), m_N(0), m_mapped(false)
{}
//! Ctor
/*! \param exec_conf Execution configuration
\param use_device whether the array is managed or on the host
\param N number of elements in the allocation (kept for bookkeeping)
\param mapped true if the pointer aliases host-mapped pinned memory (then no device-side free is performed here)
*/
cuda_deleter(std::shared_ptr<const ExecutionConfiguration> exec_conf, bool use_device, const unsigned int N,
bool mapped)
: m_exec_conf(exec_conf), m_use_device(use_device), m_N(N), m_mapped(mapped)
{ }
//! Delete the device array
/*! Frees the device allocation with cudaFree; mapped pointers alias pinned
host memory, which is owned and released by host_deleter instead.
\param ptr Start of aligned memory allocation
*/
void operator()(T *ptr)
{
if (ptr == nullptr)
return;
#ifdef ENABLE_CUDA
if (m_use_device && ! m_mapped)
{
assert(m_exec_conf);
.....
cudaFree(ptr);
CHECK_CUDA_ERROR();
}
#endif
}
private:
std::shared_ptr<const ExecutionConfiguration> m_exec_conf; //!< The execution configuration
bool m_use_device; //!< True if the device path (cudaFree) should be used
unsigned int m_N; //!< Number of elements in array
bool m_mapped; //!< True if this is host-mapped memory
};
template<class T>
//! Deleter functor for host-side allocations, intended for use with smart pointers.
class host_deleter
{
public:
//! Default constructor
//! Produces a deleter that skips the CUDA unregister step but still calls free().
host_deleter()
: m_use_device(false), m_N(0)
{}
//! Ctor
/*! \param exec_conf Execution configuration
\param use_device whether the host allocation was pinned/registered with the CUDA driver
\param N number of elements in the allocation (kept for bookkeeping)
*/
host_deleter(std::shared_ptr<const ExecutionConfiguration> exec_conf, bool use_device, const unsigned int N)
: m_exec_conf(exec_conf), m_use_device(use_device), m_N(N)
{ }
//! Delete the host array
/*! Unregisters the memory from the CUDA driver (when it was registered) and
releases the underlying allocation with free().
\param ptr Start of aligned memory allocation
*/
void operator()(T *ptr)
{
if (ptr == nullptr)
return;
.....
#ifdef ENABLE_CUDA
if (m_use_device)
{
assert(m_exec_conf);
// unregister host memory from CUDA driver
cudaHostUnregister(ptr);
CHECK_CUDA_ERROR();
}
#endif
// free the allocation (allocated with posix_memalign in allocate(), so plain free() is correct)
free(ptr);
}
private:
std::shared_ptr<const ExecutionConfiguration> m_exec_conf; //!< The execution configuration
bool m_use_device; //!< True if the memory was registered with the CUDA driver
unsigned int m_N; //!< Number of elements in array
};
} // end namespace detail
} // end namespace hoomd
.....
//! Array mirrored between host and device memory (only part of the class is visible in this excerpt)
template<class T>
class GPUArray : public GPUArrayBase<T, GPUArray<T> >
{
...
protected:
//! Clear memory starting from a given element
/*! \param first The first element to clear
*/
inline void memclear(unsigned int first=0);
.....
private:
//! Helper function to allocate memory
inline void allocate();
#ifdef ENABLE_CUDA
//! Helper function to copy memory from the device to host
inline void memcpyDeviceToHost(bool async) const;
//! Helper function to copy memory from the host to device
inline void memcpyHostToDevice(bool async) const;
#endif
// NOTE(review): the terminating ';' after this closing brace appears to have been lost in excerpting
}
.....
//******************************************
// GPUArray implementation
// *****************************************
.....
\param exec_conf Shared pointer to the execution configuration for managing CUDA initialization and shutdown
*/
//! 1-D constructor: allocates \a num_elements elements on the host (and on the
//! device when CUDA is enabled) and zeroes the memory.
template<class T> GPUArray<T>::GPUArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf) :
m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_data_location(data_location::host),
#ifdef ENABLE_CUDA
m_mapped(false), // this overload never uses mapped-pinned memory
#endif
m_exec_conf(exec_conf)
{
// allocate and clear memory
allocate();
memclear();
}
.....
#ifdef ENABLE_CUDA
/*! \param num_elements Number of elements to allocate in the array
\param exec_conf Shared pointer to the execution configuration for managing CUDA initialization and shutdown
\param mapped True if we are using mapped-pinned memory
*/
template<class T> GPUArray<T>::GPUArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf, bool mapped) :
m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_data_location(data_location::host),
m_mapped(mapped), // when true, the device pointer aliases the pinned host allocation (see allocate())
m_exec_conf(exec_conf)
{
// allocate and clear memory
allocate();
memclear();
}
.....
//! Allocates storage for the array.
/*! Host memory is 32-byte aligned; when CUDA is enabled the host allocation is
pinned via cudaHostRegister and either mirrored by a cudaMalloc'd device
buffer, or (mapped mode) aliased through a device pointer to the same pinned
memory. Parts of this function are elided in this excerpt.
*/
template<class T> void GPUArray<T>::allocate()
{
.....
void *host_ptr = nullptr;
// allocate host memory
// at minimum, alignment needs to be 32 bytes for AVX
int retval = posix_memalign(&host_ptr, 32, m_num_elements*sizeof(T));
// NOTE(review): handling of retval is elided here — confirm allocation failure is reported
.....
#ifdef ENABLE_CUDA
void *device_ptr = nullptr;
if (use_device)
{
// register pointer for DMA
cudaHostRegister(host_ptr,m_num_elements*sizeof(T), m_mapped ? cudaHostRegisterMapped : cudaHostRegisterDefault);
CHECK_CUDA_ERROR();
}
#endif
.....
#ifdef ENABLE_CUDA
assert(!d_data);
if (m_exec_conf && m_exec_conf->isCUDAEnabled())
{
// allocate and/or map host memory
if (m_mapped)
{
// device pointer is an alias of the pinned host allocation — no separate device buffer
cudaHostGetDevicePointer(&device_ptr, h_data.get(), 0);
CHECK_CUDA_ERROR();
}
else
{
// separate device buffer, kept in sync with the host via the memcpy helpers
cudaMalloc(&device_ptr, m_num_elements*sizeof(T));
CHECK_CUDA_ERROR();
}
.....
}
#endif
/*! \pre allocate() has been called
    \post All allocated memory from element \a first onward is set to 0
    \param first Index of the first element to clear (declaration defaults it to 0)
*/
template<class T> void GPUArray<T>::memclear(unsigned int first)
    {
    // don't do anything if there are no elements
    if (! h_data.get())
        return;

    assert(h_data);
    assert(first < m_num_elements);

    // clear the host copy
    memset((void *)(h_data.get()+first), 0, sizeof(T)*(m_num_elements-first));

#ifdef ENABLE_CUDA
    if (m_exec_conf && m_exec_conf->isCUDAEnabled())
        {
        assert(d_data);
        // mapped memory aliases the host allocation, so only non-mapped arrays need a device-side clear
        if (! m_mapped)
            {
            cudaMemset(d_data.get()+first, 0, (m_num_elements-first)*sizeof(T));
            // bug fix: the cudaMemset result was silently ignored; check it the same
            // way the memcpy helpers do when error checking is enabled
            if (m_exec_conf->isCUDAErrorCheckingEnabled())
                CHECK_CUDA_ERROR();
            }
        }
#endif
    }
#ifdef ENABLE_CUDA
/*! \post All memory on the device is copied to the host array
\param async if true, the copy is issued with cudaMemcpyAsync (default stream)
and the caller is responsible for synchronizing before reading h_data
*/
template<class T> void GPUArray<T>::memcpyDeviceToHost(bool async) const
{
// don't do anything if there are no elements
if (!h_data.get())
return;
if (m_mapped)
{
// if we are using mapped pinned memory, no need to copy, only synchronize
if (!async) cudaDeviceSynchronize();
return;
}
.....
if (async)
cudaMemcpyAsync(h_data.get(), d_data.get(), sizeof(T)*m_num_elements, cudaMemcpyDeviceToHost);
else
cudaMemcpy(h_data.get(), d_data.get(), sizeof(T)*m_num_elements, cudaMemcpyDeviceToHost);
// errors are only checked when error checking is enabled (matches memcpyHostToDevice)
if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();
}
/*! \post All memory on the host is copied to the device array
\param async if true, the copy is issued with cudaMemcpyAsync (default stream);
note that async H2D copies from pageable memory may still block — the host
buffer here is pinned by allocate() when CUDA is enabled
*/
template<class T> void GPUArray<T>::memcpyHostToDevice(bool async) const
{
// don't do anything if there are no elements
if (!h_data.get())
return;
if (m_mapped)
{
// if we are using mapped pinned memory, no need to copy
// rely on CUDA's implicit synchronization
return;
}
.....
if (async)
cudaMemcpyAsync(d_data.get(), h_data.get(), sizeof(T)*m_num_elements, cudaMemcpyHostToDevice);
else
cudaMemcpy(d_data.get(), h_data.get(), sizeof(T)*m_num_elements, cudaMemcpyHostToDevice);
// errors are only checked when error checking is enabled (matches memcpyDeviceToHost)
if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();
}
#endif
}
Example
1. Using cudaHostAlloc (allocating pinned host memory directly)
#include <iostream>
#include <numeric>
#include <stdlib.h>

//! Adds each thread's index to its element.
/*! Launch layout: one block, one thread per element. */
__global__ void add1(float* input){
    int idx = threadIdx.x;
    input[idx] += idx;
}

int main(void)
{
    float* temp;
    // allocate pinned (page-locked) host memory; with unified virtual addressing
    // the returned pointer can be passed directly to the kernel
    cudaHostAlloc(&temp, sizeof(float)*12, cudaHostAllocDefault);
    for(int i = 0; i < 12; ++i){
        temp[i] = i;
    }
    add1<<<1,12>>>(temp);
    // bug fix: kernel launches are asynchronous — synchronize before the host
    // reads the results, otherwise the loop below races with the kernel
    cudaDeviceSynchronize();
    for(int i = 0; i < 12; ++i){
        std::cout << temp[i] << std::endl;
    }
    cudaFreeHost(temp);
    return 0;
}
2. Using cudaHostRegister together with cudaHostGetDevicePointer (pinning an existing malloc'd buffer)
#include <iostream>
#include <numeric>
#include <stdlib.h>

//! Adds each thread's index to its element.
/*! Launch layout: one block, one thread per element. */
__global__ void add1(float* input){
    int idx = threadIdx.x;
    input[idx] += idx;
}

int main(void)
{
    float* temp = (float*)malloc(sizeof(float)*12);
    // pin + map the existing host allocation so the device can access it
    cudaHostRegister(temp, sizeof(float)*12, cudaHostRegisterMapped);
    for(int i = 0; i < 12; ++i){
        temp[i] = i;
    }
    float* device;
    // obtain the device-side alias of the mapped host pointer
    cudaHostGetDevicePointer(&device, temp, 0);
    add1<<<1,12>>>(device);
    // bug fix: synchronize so the host does not read results while the kernel
    // may still be running (mapped memory gives no implicit ordering here)
    cudaDeviceSynchronize();
    for(int i = 0; i < 12; ++i){
        std::cout << temp[i] << std::endl;
    }
    cudaHostUnregister(temp);
    // bug fix: release the malloc'd buffer (it was leaked in the original)
    free(temp);
    return 0;
}
Reference:
https://blog.csdn.net/u012235274/article/details/52474504
沒有留言:
張貼留言