
Wednesday, February 6, 2019

HOOMD source notes: DPD force computation

test_dpd_integrator.cc

// construct the DPD conservative pair potential from a system definition and neighbor list
std::shared_ptr<PotentialPairDPD> dpdc(new PP_DPD(sysdef, nlist));

// fetch the computed forces and acquire them for reading on the host
// (note: in HOOMD versions where getForceArray() returns GlobalArray<Scalar4>&, bind it as such)
GPUArray<Scalar4>& force_array_1 = dpdc->getForceArray();

ArrayHandle<Scalar4> h_force_1(force_array_1, access_location::host, access_mode::read);

// run the same test against the GPU implementation of the pair potential
dpd_conservative_force_test< PotentialPairGPU<EvaluatorPairDPDThermo, gpu_compute_dpdthermo_forces> >(
    std::shared_ptr<ExecutionConfiguration>(new ExecutionConfiguration(ExecutionConfiguration::GPU)));
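Putting the pieces together, the test's basic pattern looks roughly like this (a minimal sketch, not the full test; the expected value and tolerance are placeholders, and MY_CHECK_CLOSE stands in for the test suite's comparison macro):

dpdc->compute(0);  // compute forces at timestep 0

GPUArray<Scalar4>& force_array = dpdc->getForceArray();
ArrayHandle<Scalar4> h_force(force_array, access_location::host, access_mode::read);

// x, y, z hold the force on particle 0; w holds its potential energy
MY_CHECK_CLOSE(h_force.data[0].x, expected_fx, tol);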

ForceCompute.h
//! Get the array of computed forces
/*! Each Scalar4 element stores the force on a particle in x, y, z and its potential energy in w. */
GlobalArray<Scalar4>& getForceArray()
{
    return m_force;
}



ExecutionConfiguration.h
ExecutionConfiguration selects where the simulation executes (CPU or GPU) and holds the device state; passing ExecutionConfiguration::GPU, as in the test above, forces GPU execution.

HOOMDMath.h
// Handle both single and double precision through a define
#ifdef SINGLE_PRECISION
//! Floating point type (single precision)
typedef float Scalar;
//! Floating point type with x,y elements (single precision)
typedef float2 Scalar2;
//! Floating point type with x,y,z elements (single precision)
typedef float3 Scalar3;
//! Floating point type with x,y,z,w elements (single precision)
typedef float4 Scalar4;
#else
//! Floating point type (double precision)
typedef double Scalar;
//! Floating point type with x,y elements (double precision)
typedef double2 Scalar2;
//! Floating point type with x,y,z elements (double precision)
typedef double3 Scalar3;
//! Floating point type with x,y,z,w elements (double precision)
typedef double4 Scalar4;
#endif

//! make a scalar2 value
HOSTDEVICE inline Scalar2 make_scalar2(Scalar x, Scalar y)
    {
    Scalar2 retval;
    retval.x = x;
    retval.y = y;
    return retval;
    }

//! make a scalar3 value
HOSTDEVICE inline Scalar3 make_scalar3(Scalar x, Scalar y, Scalar z)
    {
    Scalar3 retval;
    retval.x = x;
    retval.y = y;
    retval.z = z;
    return retval;
    }

//! make a scalar4 value
HOSTDEVICE inline Scalar4 make_scalar4(Scalar x, Scalar y, Scalar z, Scalar w)
    {
    Scalar4 retval;
    retval.x = x;
    retval.y = y;
    retval.z = z;
    retval.w = w;
    return retval;
    }
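These helpers let the same arithmetic compile in either precision. A trivial example:

// compiles unchanged whether Scalar is float or double
Scalar3 r = make_scalar3(Scalar(1.0), Scalar(2.0), Scalar(3.0));
Scalar r2 = r.x*r.x + r.y*r.y + r.z*r.z;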


PotentialPairDPDThermoGPU.cuh
Call chain: gpu_compute_dpd_forces() -> DPDForceComputeKernel<...>::launch() -> gpu_compute_dpd_forces_kernel<<<grid, block_size, shared_bytes>>>


//! Kernel driver that computes pair DPD thermo forces on the GPU
/*! \param args Additional options
    \param d_params Per type-pair parameters for the evaluator
    This is just a driver function for gpu_compute_dpd_forces_kernel(), see it for details.
*/
template< class evaluator >
cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args,
                                   const typename evaluator::param_type *d_params)
    {
    assert(d_params);
    assert(args.d_rcutsq);
    assert(args.ntypes > 0);

    // on pre-sm_35 devices the neighbor list is read through 1D textures, which have a
    // limited maximum width; use the global-memory neighbor list path when it is too large
    if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width)
        {
        if (args.compute_virial)
            {
            switch (args.shift_mode)
                {
                case 0:
                    {
                    DPDForceComputeKernel<evaluator, 0, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                case 1:
                    {
                    DPDForceComputeKernel<evaluator, 1, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                default:
                    return cudaErrorUnknown;
                }
            }
        else
            {
            switch (args.shift_mode)
                {
                case 0:
                    {
                    DPDForceComputeKernel<evaluator, 0, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                case 1:
                    {
                    DPDForceComputeKernel<evaluator, 1, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                default:
                    return cudaErrorUnknown;
                }
            }
        }
    else
        {
        if (args.compute_virial)
            {
            switch (args.shift_mode)
                {
                case 0:
                    {
                    DPDForceComputeKernel<evaluator, 0, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                case 1:
                    {
                    DPDForceComputeKernel<evaluator, 1, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                default:
                    return cudaErrorUnknown;
                }
            }
        else
            {
            switch (args.shift_mode)
                {
                case 0:
                    {
                    DPDForceComputeKernel<evaluator, 0, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                case 1:
                    {
                    DPDForceComputeKernel<evaluator, 1, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
                    break;
                    }
                default:
                    return cudaErrorUnknown;
                }
            }
        }

    return cudaSuccess;
    }
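The nested branches look repetitive, but they do real work: they translate the runtime flags (shift_mode, compute_virial, and the texture-vs-global-memory neighbor list choice) into compile-time template parameters, so each kernel variant is compiled fully specialized, with no per-thread branching. Reduced to a sketch with illustrative names (not HOOMD's):

// illustrative only: promote a runtime flag to a compile-time template parameter
template<unsigned int compute_virial>
void launch_variant()
    {
    // the real code launches a kernel fully specialized on compute_virial here
    }

void dispatch(bool compute_virial)
    {
    if (compute_virial)
        launch_variant<1>();
    else
        launch_variant<0>();
    }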



PotentialPairDPDThermoGPU.cuh
template<class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp>
struct DPDForceComputeKernel
    {
    //! Launcher for the DPD force kernel
    /*!
     * \param args Other arguments to pass onto the kernel
     * \param d_params Parameters for the potential, stored per type pair
     */
    static void launch(const dpd_pair_args_t& args, const typename evaluator::param_type *d_params)
        {
        if (tpp == args.threads_per_particle)
            {
            // setup the grid to run the kernel
            unsigned int block_size = args.block_size;

            Index2D typpair_idx(args.ntypes);
            unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type))
                                        * typpair_idx.getNumElements();

            // query the largest block size this kernel instantiation supports (cached in a static)
            static unsigned int max_block_size = UINT_MAX;
            if (max_block_size == UINT_MAX)
                max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>);

            // pre-sm_35 devices read particle data through textures; bind them before launch
            if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args);

            // clamp the requested block size and size the grid so each particle gets tpp threads
            block_size = block_size < max_block_size ? block_size : max_block_size;
            dim3 grid(args.N / (block_size/tpp) + 1, 1, 1);

            gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>
                                <<<grid, block_size, shared_bytes>>>
                                (args.d_force,
                                args.d_virial,
                                args.virial_pitch,
                                args.N,
                                args.d_pos,
                                args.d_vel,
                                args.d_tag,
                                args.box,
                                args.d_n_neigh,
                                args.d_nlist,
                                args.d_head_list,
                                d_params,
                                args.d_rcutsq,
                                args.seed,
                                args.timestep,
                                args.deltaT,
                                args.T,
                                args.ntypes);
            }
        else
            {
            // the requested threads-per-particle does not match this instantiation:
            // recurse with tpp/2 until it does (a tpp specialization ends the recursion)
            DPDForceComputeKernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp/2>::launch(args, d_params);
            }
        }
    };
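This recursion selects, at run time, one of a fixed set of compile-time threads-per-particle values, halving tpp until it matches the requested value; a base-case specialization terminates the template recursion. The idiom, reduced to a sketch with illustrative names:

// illustrative sketch of the recursive-template dispatch over threads-per-particle
template<int tpp>
struct Launcher
    {
    static void launch(int requested_tpp)
        {
        if (tpp == requested_tpp)
            {
            // launch the kernel instantiated with this tpp
            }
        else
            {
            Launcher<tpp/2>::launch(requested_tpp);  // try the next smaller power of two
            }
        }
    };

// base-case specialization terminates the compile-time recursion
template<>
struct Launcher<0>
    {
    static void launch(int) { }
    };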

// positions (ParticleData): allocate, install via swap, and access on the host
GlobalArray<Scalar4> pos(N, m_exec_conf);   // allocate storage for N particles
m_pos.swap(pos);                            // swap the fresh allocation into the member
GlobalArray<Scalar4> m_pos;                 // the member declaration (x, y, z position; w stores the type)
ArrayHandle<Scalar4> h_pos(m_pos, access_location::host, access_mode::readwrite);
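The allocate-then-swap idiom above is how member arrays are (re)allocated: build a fresh array, swap it into the member, and let the temporary release the old storage when it goes out of scope. A minimal sketch (the function name is hypothetical):

// hypothetical helper illustrating the allocate-then-swap idiom
void allocate_positions(unsigned int N)
    {
    GlobalArray<Scalar4> pos(N, m_exec_conf);  // fresh allocation
    m_pos.swap(pos);                           // m_pos takes ownership
    }                                          // 'pos' (now holding the old storage) is destroyed here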



GlobalArray.h
//! Definition of GlobalArray using CRTP
template<class T>
class GlobalArray : public GPUArrayBase<T, GlobalArray<T> >
    {
    public:
        //! Empty constructor
        GlobalArray()
            : m_num_elements(0), m_pitch(0), m_height(0), m_acquired(false), m_align_bytes(0)
            { }
        /*! Allocate a 1D array in managed memory
            \param num_elements Number of elements in array
            \param exec_conf The current execution configuration
         */
        GlobalArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf,
            const std::string& tag = std::string() )
            : m_exec_conf(exec_conf),
            #ifndef ALWAYS_USE_MANAGED_MEMORY
            // explicit copy should be elided
            m_fallback(exec_conf->allConcurrentManagedAccess() ?
                GPUArray<T>() : GPUArray<T>(num_elements, exec_conf)),
            #endif
            m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_tag(tag),
            m_align_bytes(0)
            {
            #ifndef ALWAYS_USE_MANAGED_MEMORY
            if (!this->m_exec_conf->allConcurrentManagedAccess())
                return;
            #endif

            assert(this->m_exec_conf);
            #ifdef ENABLE_CUDA
            if (this->m_exec_conf->isCUDAEnabled())
                {
                // use OS page size as minimum alignment
                m_align_bytes = getpagesize();
                }
            #endif

            if (m_num_elements > 0)
                allocate();
            }
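Per the constructor signature above, a 1D array can be allocated with an optional tag (the tag string here is illustrative):

// 1D allocation, optionally tagged for debugging
GlobalArray<Scalar4> vel(N, m_exec_conf, "velocities");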



GPUArray.h
template<class T>
class ArrayHandle
{
public:
    template<class Derived>
    inline ArrayHandle(const GPUArrayBase<T, Derived>& gpu_array, const access_location::Enum location = access_location::host,
    const access_mode::Enum mode = access_mode::readwrite);
    //! Notifies the containing GPUArray that the handle has been released
    virtual inline ~ArrayHandle() = default;

private:
    ArrayHandleDispatch<T> dispatch; //!< Reference to the dispatch object that manages the acquire/release

public:
    T* const data; //!< Pointer to data
};
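ArrayHandle is an RAII wrapper: constructing one acquires the array's data at the requested location and access mode, and the dispatch member releases it when the handle is destroyed, so access is naturally scoped:

    {
    // acquire for read/write on the host; any required device-to-host copy happens here
    ArrayHandle<Scalar4> h_pos(m_pos, access_location::host, access_mode::readwrite);
    h_pos.data[0].x += Scalar(1.0);
    }   // the handle is destroyed here and the array is released for other consumers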



GPUArray.h
//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
template<class T, class Derived>
class GPUArrayBase
    {
    public:
        //! Get the number of elements
        /*!
         - For 1-D allocated GPUArrays, this is the number of elements allocated.
         - For 2-D allocated GPUArrays, this is the \b total number of elements (\a pitch * \a height) allocated
        */
        unsigned int getNumElements() const
            {
            return static_cast<Derived const&>(*this).getNumElements();
            }

        //! Test if the GPUArray is NULL
        bool isNull() const
            {
            return static_cast<Derived const&>(*this).isNull();
            }

        //! Get the width of the allocated rows in elements
        /*!
         - For 2-D allocated GPUArrays, this is the total width of a row in memory (including the padding added for coalescing)
         - For 1-D allocated GPUArrays, this is simply the number of elements allocated.
        */
        unsigned int getPitch() const
            {
            return static_cast<Derived const&>(*this).getPitch();
            }

        //! Get the number of rows allocated
        /*!
         - For 2-D allocated GPUArrays, this is the height given to the constructor
         - For 1-D allocated GPUArrays, this is simply 1.
        */
        unsigned int getHeight() const
            {
            return static_cast<Derived const&>(*this).getHeight();
            }

        //! Resize the GPUArray
        void resize(unsigned int num_elements)
            {
            static_cast<Derived&>(*this).resize(num_elements);
            }

        //! Resize a 2D GPUArray
        void resize(unsigned int width, unsigned int height)
            {
            static_cast<Derived&>(*this).resize(width,height);
            }

    protected:
        //! Acquires the data pointer for use
        inline ArrayHandleDispatch<T> acquire(const access_location::Enum location, const access_mode::Enum mode
        #ifdef ENABLE_CUDA
                         , bool async = false
        #endif
                        ) const
            {
            return static_cast<Derived const&>(*this).acquire(location, mode
            #ifdef ENABLE_CUDA
                , async
            #endif
                );
            }

        //! Release the data pointer
        inline void release() const
            {
            return static_cast<Derived const&>(*this).release();
            }

        //! Returns the acquire state
        inline bool isAcquired() const
            {
            return static_cast<Derived const&>(*this).isAcquired();
            }

        // need to be friend of the ArrayHandle class
        friend class ArrayHandle<T>;
        friend class ArrayHandleAsync<T>;

    private:
        // Make constructor private to prevent mistakes
        GPUArrayBase() {};
        friend Derived;
    };
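Because every call is forwarded to Derived through a static_cast, ArrayHandle can accept either a GPUArray or a GlobalArray via GPUArrayBase<T, Derived> with no virtual dispatch. The pattern, reduced to a self-contained sketch (illustrative names, not HOOMD's):

#include <cstdio>

// illustrative CRTP sketch: the base forwards calls to the derived class at compile time
template<class Derived>
struct ArrayBase
    {
    unsigned int getNumElements() const
        {
        return static_cast<Derived const&>(*this).getNumElements();
        }
    };

struct MyArray : ArrayBase<MyArray>
    {
    unsigned int getNumElements() const { return 42; }
    };

// works for any array type deriving from ArrayBase; resolved statically, no vtable
template<class Derived>
void print_size(const ArrayBase<Derived>& a)
    {
    printf("%u\n", a.getNumElements());
    }

int main()
    {
    MyArray arr;
    print_size(arr);  // prints 42
    return 0;
    }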
