
Saturday, March 2, 2019

Allocating and deleting a 1-D array

Defines the GPUArray class



.....

//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
template<class T, class Derived>
class GPUArrayBase
{
public:
.....

        //! Get the number of rows allocated
        /*!
         - For 2-D allocated GPUArrays, this is the height given to the constructor
         - For 1-D allocated GPUArrays, this is simply 1.
        */
        unsigned int getHeight() const
            {
            return m_height;
            }

.....

};

.....
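For context, the CRTP interface mentioned above lets GPUArray and GlobalArray share one interface without virtual dispatch: the base class template knows its derived type at compile time and forwards calls to it. A minimal standalone sketch of the pattern (the names below are illustrative, not HOOMD's):

#include <iostream>

// Minimal CRTP sketch: the base class static_casts itself to the derived
// type, so shared interface methods resolve at compile time with no vtable.
template<class Derived>
class ArrayBase
    {
    public:
        unsigned int getNumElements() const
            {
            return static_cast<const Derived&>(*this).getNumElementsImpl();
            }
    };

class SmallArray : public ArrayBase<SmallArray>
    {
    public:
        unsigned int getNumElementsImpl() const
            {
            return 12;
            }
    };

int main()
    {
    SmallArray a;
    std::cout << a.getNumElements() << std::endl;   // prints 12
    return 0;
    }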

namespace hoomd
{
namespace detail
{

template<class T>
class cuda_deleter
    {
    public:
        //! Default constructor
        cuda_deleter()
            : m_use_device(false), m_N(0), m_mapped(false)
            {}

        //! Ctor
        /*! \param exec_conf Execution configuration
            \param use_device whether the array is allocated on the device
            \param N number of elements in the allocation
            \param mapped true if this is mapped host memory
         */
        cuda_deleter(std::shared_ptr<const ExecutionConfiguration> exec_conf, bool use_device, const unsigned int N,
            bool mapped)
            : m_exec_conf(exec_conf), m_use_device(use_device), m_N(N), m_mapped(mapped)
            { }

        //! Delete the device array
        /*! \param ptr Start of aligned memory allocation
         */
        void operator()(T *ptr)
            {
            if (ptr == nullptr)
                return;

            #ifdef ENABLE_CUDA
            if (m_use_device && ! m_mapped)
                {
                assert(m_exec_conf);
               
.....

                cudaFree(ptr);
                CHECK_CUDA_ERROR();
                }
            #endif
            }

    private:
        std::shared_ptr<const ExecutionConfiguration> m_exec_conf; //!< The execution configuration
        bool m_use_device;     //!< Whether the array is allocated on the device
        unsigned int m_N;      //!< Number of elements in array
        bool m_mapped;         //!< True if this is host-mapped memory
    };

template<class T>
class host_deleter
    {
    public:
        //! Default constructor
        host_deleter()
            : m_use_device(false), m_N(0)
            {}

        //! Ctor
        /*! \param exec_conf Execution configuration
            \param use_device whether the memory is registered with the CUDA driver
            \param N number of elements in the allocation
         */
        host_deleter(std::shared_ptr<const ExecutionConfiguration> exec_conf, bool use_device, const unsigned int N)
            : m_exec_conf(exec_conf), m_use_device(use_device), m_N(N)
            { }

        //! Delete the host array
        /*! \param ptr Start of aligned memory allocation
         */
        void operator()(T *ptr)
            {
            if (ptr == nullptr)
                return;

.....

            #ifdef ENABLE_CUDA
            if (m_use_device)
                {
                assert(m_exec_conf);

                // unregister host memory from CUDA driver
                cudaHostUnregister(ptr);
                CHECK_CUDA_ERROR();
                }
            #endif

            // free the allocation
            free(ptr);
            }

    private:
        std::shared_ptr<const ExecutionConfiguration> m_exec_conf; //!< The execution configuration
        bool m_use_device;     //!< Whether the memory is registered with the CUDA driver
        unsigned int m_N;      //!< Number of elements in array
    };
} // end namespace detail

} // end namespace hoomd
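These deleters are meant to be attached to smart pointers so that the matching release path (cudaFree, or cudaHostUnregister followed by free) runs automatically when the array goes away. A minimal sketch of the same pattern with std::unique_ptr (free_deleter below is a stand-in for illustration, not HOOMD code):

#include <memory>
#include <stdlib.h>

// Stand-in deleter: releases an allocation made with posix_memalign/malloc.
struct free_deleter
    {
    void operator()(float *ptr) const
        {
        if (ptr != nullptr)
            free(ptr);
        }
    };

int main()
    {
    void *raw = nullptr;
    // 32-byte alignment, as in GPUArray<T>::allocate()
    if (posix_memalign(&raw, 32, 12*sizeof(float)) != 0)
        return 1;

    // the deleter travels with the pointer; no manual free needed
    std::unique_ptr<float, free_deleter> data(static_cast<float *>(raw));
    data.get()[0] = 1.0f;
    return 0;
    }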

.....

template<class T>
class GPUArray : public GPUArrayBase<T, GPUArray<T> >
{

.....

protected:
        //! Clear memory starting from a given element
        /*! \param first The first element to clear
         */
        inline void memclear(unsigned int first=0);

.....

private:
        //! Helper function to allocate memory
        inline void allocate();
#ifdef ENABLE_CUDA
        //! Helper function to copy memory from the device to host
        inline void memcpyDeviceToHost(bool async) const;
        //! Helper function to copy memory from the host to device
        inline void memcpyHostToDevice(bool async) const;
#endif
};

.....

//******************************************
// GPUArray implementation
//******************************************


.....

/*! \param num_elements Number of elements to allocate in the array
    \param exec_conf Shared pointer to the execution configuration for managing CUDA initialization and shutdown
*/
template<class T> GPUArray<T>::GPUArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf) :
        m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_data_location(data_location::host),
#ifdef ENABLE_CUDA
        m_mapped(false),
#endif
        m_exec_conf(exec_conf)
    {
    // allocate and clear memory
    allocate();
    memclear();
    }

.....

#ifdef ENABLE_CUDA
/*! \param num_elements Number of elements to allocate in the array
    \param exec_conf Shared pointer to the execution configuration for managing CUDA initialization and shutdown
    \param mapped True if we are using mapped-pinned memory
*/
template<class T> GPUArray<T>::GPUArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf, bool mapped) :
        m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_data_location(data_location::host),
        m_mapped(mapped),
        m_exec_conf(exec_conf)
    {
    // allocate and clear memory
    allocate();
    memclear();
    }

.....

template<class T> void GPUArray<T>::allocate()
    {

.....

    void *host_ptr = nullptr;

    // allocate host memory
    // at minimum, alignment needs to be 32 bytes for AVX
    int retval = posix_memalign(&host_ptr, 32, m_num_elements*sizeof(T));

.....

#ifdef ENABLE_CUDA
    void *device_ptr = nullptr;
    if (use_device)
        {
        // register pointer for DMA
        cudaHostRegister(host_ptr, m_num_elements*sizeof(T), m_mapped ? cudaHostRegisterMapped : cudaHostRegisterDefault);
        CHECK_CUDA_ERROR();
        }
#endif

.....

#ifdef ENABLE_CUDA
    assert(!d_data);
    if (m_exec_conf && m_exec_conf->isCUDAEnabled())
        {
        // allocate and/or map host memory
        if (m_mapped)
            {
            cudaHostGetDevicePointer(&device_ptr, h_data.get(), 0);
            CHECK_CUDA_ERROR();
            }
        else
            {
            cudaMalloc(&device_ptr, m_num_elements*sizeof(T));
            CHECK_CUDA_ERROR();
            }

.....

        }
#endif
    }

/*! \pre allocate() has been called
    \post All allocated memory is set to 0
*/
template<class T> void GPUArray<T>::memclear(unsigned int first)
    {
    // don't do anything if there are no elements
    if (! h_data.get())
        return;

    assert(h_data);
    assert(first < m_num_elements);

    // clear memory
    memset((void *)(h_data.get()+first), 0, sizeof(T)*(m_num_elements-first));

#ifdef ENABLE_CUDA
    if (m_exec_conf && m_exec_conf->isCUDAEnabled())
        {
        assert(d_data);
        if (! m_mapped) cudaMemset(d_data.get()+first, 0, (m_num_elements-first)*sizeof(T));
        }
#endif
    }


#ifdef ENABLE_CUDA
/*! \post All memory on the device is copied to the host array
*/
template<class T> void GPUArray<T>::memcpyDeviceToHost(bool async) const
    {
    // don't do anything if there are no elements
    if (!h_data.get())
        return;

    if (m_mapped)
        {
        // if we are using mapped pinned memory, no need to copy, only synchronize
        if (!async) cudaDeviceSynchronize();
        return;
        }

.....

    if (async)
        cudaMemcpyAsync(h_data.get(), d_data.get(), sizeof(T)*m_num_elements, cudaMemcpyDeviceToHost);
    else
        cudaMemcpy(h_data.get(), d_data.get(), sizeof(T)*m_num_elements, cudaMemcpyDeviceToHost);

    if (m_exec_conf->isCUDAErrorCheckingEnabled())
        CHECK_CUDA_ERROR();
    }

/*! \post All memory on the host is copied to the device array
*/
template<class T> void GPUArray<T>::memcpyHostToDevice(bool async) const
    {
    // don't do anything if there are no elements
    if (!h_data.get())
        return;

    if (m_mapped)
        {
        // if we are using mapped pinned memory, no need to copy
        // rely on CUDA's implicit synchronization
        return;
        }

.....

    if (async)
        cudaMemcpyAsync(d_data.get(), h_data.get(), sizeof(T)*m_num_elements, cudaMemcpyHostToDevice);
    else
        cudaMemcpy(d_data.get(), h_data.get(), sizeof(T)*m_num_elements, cudaMemcpyHostToDevice);

    if (m_exec_conf->isCUDAErrorCheckingEnabled())
        CHECK_CUDA_ERROR();
    }
#endif
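Note that cudaMemcpyAsync is only asynchronous with respect to the host when the host buffer is page-locked; allocate() guarantees this by registering the host allocation with cudaHostRegister, so the asynchronous paths above are valid.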


Examples
1. Using cudaHostAlloc

#include <iostream>
#include <numeric>
#include <stdlib.h>

__global__ void add1(float* input){
    int idx = threadIdx.x;
    input[idx] += idx;
}
int main(void)
{
    float* temp;
    cudaHostAlloc(&temp, sizeof(float)*12, cudaHostAllocDefault);
    for(int i = 0; i < 12; ++i){
        temp[i] = i;
    }
    add1<<<1,12>>>(temp);
    // kernel launch is asynchronous; wait for it to finish before the host
    // reads the results
    cudaDeviceSynchronize();
    for(int i = 0; i < 12; ++i){
        std::cout<< temp[i] << std::endl;
    }
    cudaFreeHost(temp);
    return 0;
}
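cudaHostAlloc returns page-locked host memory, and on systems with unified virtual addressing the same pointer is also valid on the device, which is why the kernel can take temp directly. The cudaDeviceSynchronize() call matters: kernel launches return immediately, so without it the host loop could print the array before add1 has finished.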

2. Using cudaHostRegister and cudaHostGetDevicePointer

#include <iostream>
#include <numeric>
#include <stdlib.h>

__global__ void add1(float* input){
    int idx = threadIdx.x;
    input[idx] += idx;
}
int main(void)
{
    float* temp = (float*)malloc(sizeof(float)*12);
    cudaHostRegister(temp, sizeof(float)*12, cudaHostRegisterMapped);
    for(int i = 0; i < 12; ++i){
        temp[i] = i;
    }
    float* device;
    cudaHostGetDevicePointer(&device, temp, 0);
    add1<<<1,12>>>(device);
    // wait for the kernel to finish before reading the mapped host memory
    cudaDeviceSynchronize();
    for(int i = 0; i < 12; ++i){
        std::cout<< temp[i] << std::endl;
    }
    cudaHostUnregister(temp);
    free(temp);   // unregistering does not free the malloc'd memory
    return 0;
}
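Here the memory comes from an ordinary malloc and is then pinned with cudaHostRegisterMapped; cudaHostGetDevicePointer returns the device-side alias for the same pages, which can differ from the host address on non-UVA systems, so the kernel is launched on device rather than temp. Also note that cudaHostUnregister only unpins the memory; the final free(temp) is still required to release it.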

Reference
https://blog.csdn.net/u012235274/article/details/52474504
