Saturday, February 9, 2019
Installing cereal
cereal is a header-only C++11 serialization library. cereal takes arbitrary data types and reversibly turns them into different representations, such as compact binary encodings, XML, or JSON. cereal was designed to be fast, lightweight, and easy to extend; it has no external dependencies and can easily be bundled with other code or used standalone.
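A minimal usage sketch (the MyRecord type and its fields are made up for illustration; only the cereal headers and archive types come from the library):

#include <cereal/archives/json.hpp>
#include <fstream>

// Hypothetical record type; serialize() tells cereal which members to archive.
struct MyRecord
{
    int id;
    double value;

    template <class Archive>
    void serialize(Archive& ar)
    {
        ar(id, value);
    }
};

int main()
{
    std::ofstream os("record.json");
    cereal::JSONOutputArchive archive(os); // could also be BinaryOutputArchive or XMLOutputArchive
    MyRecord r{42, 3.14};
    archive(cereal::make_nvp("record", r)); // writes the record as JSON
    return 0;
}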
Friday, February 8, 2019
Page-Locked Host Memory
Transfers from locked (pinned) pages are faster than from unlocked (pageable) pages. The "Lock Pages in Memory" security setting determines which accounts can use a process to keep data in physical memory, preventing the system from paging that data out to virtual memory on disk; this reduces the amount of available random access memory (RAM) and can significantly affect system performance.
CUDA provides the cudaHostAlloc and cudaHostRegister calls to allocate or register page-locked memory. When host memory is pinned, the NVIDIA driver detects this on memory transfers and issues the copy along the corresponding (faster) code path.
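A minimal sketch of allocating pinned host memory with cudaHostAlloc and using it for an asynchronous copy (the buffer size and the single stream are arbitrary choices for illustration):

#include <cuda_runtime.h>

int main()
{
    const size_t n = 1 << 20;
    float* h_pinned = nullptr;
    float* d_data = nullptr;

    // Allocate page-locked (pinned) host memory instead of using malloc().
    cudaHostAlloc(reinterpret_cast<void**>(&h_pinned), n * sizeof(float), cudaHostAllocDefault);
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Because the host buffer is pinned, this copy can truly overlap other work.
    cudaMemcpyAsync(d_data, h_pinned, n * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(d_data);
    cudaFreeHost(h_pinned);
    return 0;
}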
Wednesday, February 6, 2019
CUDA syntax
Source code is in .cu files, which contain mixture of host (CPU) and device (GPU) code.
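A minimal sketch of such a mixed .cu file, compiled with nvcc (the kernel name and launch configuration are arbitrary):

#include <cuda_runtime.h>

// Device code: runs on the GPU.
__global__ void scale(float* data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

// Host code: runs on the CPU and launches the kernel.
int main()
{
    const int n = 256;
    float* d_data = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));
    cudaMemset(d_data, 0, n * sizeof(float));

    scale<<<(n + 127) / 128, 128>>>(d_data, 2.0f, n);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}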
HOOMD notes
test_dpd_integrator.cc
std::shared_ptr<PotentialPairDPD> dpdc(new PP_DPD(sysdef,nlist));
GPUArray<Scalar4>& force_array_1 = dpdc->getForceArray();
ArrayHandle<Scalar4> h_force_1(force_array_1,access_location::host,access_mode::read);
dpd_conservative_force_test< PotentialPairGPU<EvaluatorPairDPDThermo, gpu_compute_dpdthermo_forces > >(std::shared_ptr<ExecutionConfiguration>(new ExecutionConfiguration(ExecutionConfiguration::GPU)));
ForceCompute.h
//! Get the array of computed forces
GlobalArray<Scalar4>& getForceArray()
{
return m_force;
}
ExecutionConfiguration.h
HOOMDMath.h
// Handle both single and double precision through a define
#ifdef SINGLE_PRECISION
//! Floating point type (single precision)
typedef float Scalar;
//! Floating point type with x,y elements (single precision)
typedef float2 Scalar2;
//! Floating point type with x,y,z elements (single precision)
typedef float3 Scalar3;
//! Floating point type with x,y,z,w elements (single precision)
typedef float4 Scalar4;
#else
//! Floating point type (double precision)
typedef double Scalar;
//! Floating point type with x,y elements (double precision)
typedef double2 Scalar2;
//! Floating point type with x,y,z elements (double precision)
typedef double3 Scalar3;
//! Floating point type with x,y,z,w elements (double precision)
typedef double4 Scalar4;
#endif
//! make a scalar2 value
HOSTDEVICE inline Scalar2 make_scalar2(Scalar x, Scalar y)
{
Scalar2 retval;
retval.x = x;
retval.y = y;
return retval;
}
//! make a scalar3 value
HOSTDEVICE inline Scalar3 make_scalar3(Scalar x, Scalar y, Scalar z)
{
Scalar3 retval;
retval.x = x;
retval.y = y;
retval.z = z;
return retval;
}
//! make a scalar4 value
HOSTDEVICE inline Scalar4 make_scalar4(Scalar x, Scalar y, Scalar z, Scalar w)
{
Scalar4 retval;
retval.x = x;
retval.y = y;
retval.z = z;
retval.w = w;
return retval;
}
PotentialPairDPDThermoGPU.cuh
gpu_compute_dpd_forces -> DPDForceComputeKernel -> launch
//! Kernel driver that computes pair DPD thermo forces on the GPU
/*! \param args Additional options
\param d_params Per type-pair parameters for the evaluator
This is just a driver function for gpu_compute_dpd_forces_kernel(), see it for details.
*/
template< class evaluator >
cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args,
const typename evaluator::param_type *d_params)
{
assert(d_params);
assert(args.d_rcutsq);
assert(args.ntypes > 0);
// run the kernel
if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width)
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
else
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
return cudaSuccess;
}
PotentialPairDPDThermoGPU.cuh
template<class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp>
struct DPDForceComputeKernel
{
//! Launcher for the DPD force kernel
/*!
* \param args Other arguments to pass onto the kernel
* \param d_params Parameters for the potential, stored per type pair
*/
static void launch(const dpd_pair_args_t& args, const typename evaluator::param_type *d_params)
{
if (tpp == args.threads_per_particle)
{
// setup the grid to run the kernel
unsigned int block_size = args.block_size;
Index2D typpair_idx(args.ntypes);
unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type))
* typpair_idx.getNumElements();
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>);
if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args);
block_size = block_size < max_block_size ? block_size : max_block_size;
dim3 grid(args.N / (block_size/tpp) + 1, 1, 1);
gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>
<<<grid, block_size, shared_bytes>>>
(args.d_force,
args.d_virial,
args.virial_pitch,
args.N,
args.d_pos,
args.d_vel,
args.d_tag,
args.box,
args.d_n_neigh,
args.d_nlist,
args.d_head_list,
d_params,
args.d_rcutsq,
args.seed,
args.timestep,
args.deltaT,
args.T,
args.ntypes);
}
else
{
DPDForceComputeKernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp/2>::launch(args, d_params);
}
}
};
// positions
GlobalArray< Scalar4 > pos(N, m_exec_conf);
m_pos.swap(pos);
GlobalArray<Scalar4> m_pos;
ArrayHandle< Scalar4 > h_pos(m_pos, access_location::host, access_mode::readwrite);
GlobalArray.h
//! Definition of GlobalArray using CRTP
template<class T>
class GlobalArray : public GPUArrayBase<T, GlobalArray<T> >
{
public:
//! Empty constructor
GlobalArray()
: m_num_elements(0), m_pitch(0), m_height(0), m_acquired(false), m_align_bytes(0)
{ }
/*! Allocate a 1D array in managed memory
\param num_elements Number of elements in array
\param exec_conf The current execution configuration
*/
GlobalArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf,
const std::string& tag = std::string() )
: m_exec_conf(exec_conf),
#ifndef ALWAYS_USE_MANAGED_MEMORY
// explicit copy should be elided
m_fallback(exec_conf->allConcurrentManagedAccess() ?
GPUArray<T>() : GPUArray<T>(num_elements, exec_conf)),
#endif
m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_tag(tag),
m_align_bytes(0)
{
#ifndef ALWAYS_USE_MANAGED_MEMORY
if (!this->m_exec_conf->allConcurrentManagedAccess())
return;
#endif
assert(this->m_exec_conf);
#ifdef ENABLE_CUDA
if (this->m_exec_conf->isCUDAEnabled())
{
// use OS page size as minimum alignment
m_align_bytes = getpagesize();
}
#endif
if (m_num_elements > 0)
allocate();
}
GPUArray.h
//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
template<class T, class Derived>
class GPUArrayBase
{
public:
//! Get the number of elements
/*!
- For 1-D allocated GPUArrays, this is the number of elements allocated.
- For 2-D allocated GPUArrays, this is the \b total number of elements (\a pitch * \a height) allocated
*/
unsigned int getNumElements() const
{
return static_cast<Derived const&>(*this).getNumElements();
}
//! Test if the GPUArray is NULL
bool isNull() const
{
return static_cast<Derived const&>(*this).isNull();
}
//! Get the width of the allocated rows in elements
/*!
- For 2-D allocated GPUArrays, this is the total width of a row in memory (including the padding added for coalescing)
- For 1-D allocated GPUArrays, this is the simply the number of elements allocated.
*/
unsigned int getPitch() const
{
return static_cast<Derived const&>(*this).getPitch();
}
//! Get the number of rows allocated
/*!
- For 2-D allocated GPUArrays, this is the height given to the constructor
- For 1-D allocated GPUArrays, this is the simply 1.
*/
unsigned int getHeight() const
{
return static_cast<Derived const&>(*this).getHeight();
}
//! Resize the GPUArray
void resize(unsigned int num_elements)
{
static_cast<Derived&>(*this).resize(num_elements);
}
//! Resize a 2D GPUArray
void resize(unsigned int width, unsigned int height)
{
static_cast<Derived&>(*this).resize(width,height);
}
protected:
//! Acquires the data pointer for use
inline ArrayHandleDispatch<T> acquire(const access_location::Enum location, const access_mode::Enum mode
#ifdef ENABLE_CUDA
, bool async = false
#endif
) const
{
return static_cast<Derived const&>(*this).acquire(location, mode
#ifdef ENABLE_CUDA
, async
#endif
);
}
//! Release the data pointer
inline void release() const
{
return static_cast<Derived const&>(*this).release();
}
//! Returns the acquire state
inline bool isAcquired() const
{
return static_cast<Derived const&>(*this).isAcquired();
}
// need to be friend of the ArrayHandle class
friend class ArrayHandle<T>;
friend class ArrayHandleAsync<T>;
private:
// Make constructor private to prevent mistakes
GPUArrayBase() {};
friend Derived;
};
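The GPUArrayBase methods above forward every call to the derived class through a static_cast; the following stripped-down sketch (names are illustrative, not HOOMD's) shows the CRTP dispatch idea in isolation:

#include <cstdio>

// Base exposes a common interface and forwards to Derived at compile time.
template <class Derived>
class ArrayBase
{
public:
    unsigned int getNumElements() const
    {
        // No virtual call: the concrete type is known at compile time.
        return static_cast<Derived const&>(*this).getNumElements();
    }
};

class HostArray : public ArrayBase<HostArray>
{
public:
    explicit HostArray(unsigned int n) : m_n(n) {}
    unsigned int getNumElements() const { return m_n; }

private:
    unsigned int m_n;
};

template <class Derived>
void printSize(const ArrayBase<Derived>& arr)
{
    std::printf("%u elements\n", arr.getNumElements());
}

int main()
{
    HostArray a(16);
    printSize(a); // prints "16 elements"
    return 0;
}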
GPUArray.h
template<class T>
class ArrayHandle
{
    public:
        template<class Derived>
        inline ArrayHandle(const GPUArrayBase<T, Derived>& gpu_array, const access_location::Enum location = access_location::host,
                           const access_mode::Enum mode = access_mode::readwrite);

        //! Notifies the containing GPUArray that the handle has been released
        virtual inline ~ArrayHandle() = default;

    private:
        ArrayHandleDispatch<T> dispatch; //!< Reference to the dispatch object that manages the acquire/release

    public:
        T* const data; //!< Pointer to data
};
Template non-type parameters
Besides type parameters, template classes and functions can use another kind of template parameter known as a template non-type parameter: a compile-time constant value (such as an integer) rather than a type.
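A small sketch of a non-type template parameter (the fixed-size buffer type is invented for illustration); this is the same mechanism HOOMD uses for the int tpp parameter of DPDForceComputeKernel above:

#include <cstddef>
#include <cstdio>

// N is a non-type template parameter: a compile-time constant, not a type.
template <typename T, std::size_t N>
struct FixedBuffer
{
    T data[N];
    constexpr std::size_t size() const { return N; }
};

int main()
{
    FixedBuffer<float, 8> buf{};      // N = 8 is baked into the type
    std::printf("%zu\n", buf.size()); // prints 8
    return 0;
}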
5.3.2. Device Memory Accesses (to be continued)
A memory access may require multiple transactions, depending on how the memory addresses accessed by the threads within a warp are distributed. As a general rule for global memory, the more scattered the addresses, the lower the throughput.
Global Memory
Global memory resides in device memory, and device memory is accessed via 32-, 64-, or 128-byte memory transactions. For example, with 4-byte accesses per thread, a warp's request may be served as a single coalesced transaction, or broken into several 32-byte memory transactions when the addresses are not contiguous, wasting bandwidth.
Size and Alignment Requirement
Any access (via a variable or a pointer) to data residing in global memory compiles to a single global memory instruction if and only if the size of the data type is 1, 2, 4, 8, or 16 bytes and the data is naturally aligned (i.e., its address is a multiple of that size).
For structures, the size and alignment requirements can be enforced by the compiler using the alignment specifiers __align__(8) or __align__(16), such as
struct __align__(8) { float x; float y; };
or
struct __align__(16) { float x; float y; float z; };
Any address of a variable residing in global memory, or returned by one of the memory allocation routines from the driver or runtime API, is always aligned to at least 256 bytes.
Reading non-naturally-aligned 8-byte or 16-byte words produces incorrect results, so special care must be taken to keep the starting address of any value or array of these types aligned.
A typical case where this is easy to overlook is when using some custom global memory allocation scheme, where the allocations of multiple arrays (with multiple calls to cudaMalloc() or cuMemAlloc()) are replaced by the allocation of a single large block of memory that is then partitioned into the individual arrays; in this case, the starting address of each array is offset from the starting address of the block, and that offset may break the alignment.
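A small sketch of the pitfall just described: carving arrays out of one large cudaMalloc'd block by hand can break the 16-byte alignment that float4 accesses require (the offsets are illustrative):

#include <cuda_runtime.h>

int main()
{
    char* block = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&block), 1024); // block itself is aligned to at least 256 bytes

    // Carving arrays out of the block by hand:
    float*  a = reinterpret_cast<float*>(block);         // OK: offset 0
    float4* b = reinterpret_cast<float4*>(block + 100);  // BAD: offset 100 is not a multiple of 16,
                                                          // so 16-byte float4 accesses are misaligned
    float4* c = reinterpret_cast<float4*>(block + 112);  // OK: offset 112 is a multiple of 16

    (void)a; (void)b; (void)c;
    cudaFree(block);
    return 0;
}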
Tuesday, February 5, 2019
Chapter 3. PROGRAMMING INTERFACE
CUDA C provides a simple path for users familiar with the C programming language to write programs that execute on the device. It consists of extensions to the C language and a runtime library.
The core language extensions have been introduced in the programming model. They let programmers define a kernel as a C function and use some new syntax to specify the grid and block dimensions each time the function is called. A complete description of all extensions can be found in the C Language Extensions chapter; any source file that contains any of these extensions must be compiled with nvcc.
The runtime is introduced in Compilation Workflow. It provides C functions that execute on the host to
1. allocate and deallocate device memory,
2. transfer data between host memory and device memory,
3. manage systems with multiple GPUs, etc. (see the sketch after this list).
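A minimal sketch of these runtime calls (sizes and the device index are arbitrary):

#include <cuda_runtime.h>
#include <vector>

int main()
{
    const size_t n = 1024;
    std::vector<float> h_data(n, 1.0f);

    // 1. Allocate device memory from the host.
    float* d_data = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));

    // 2. Transfer data between host and device memory.
    cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_data.data(), d_data, n * sizeof(float), cudaMemcpyDeviceToHost);

    // 3. Query / select among multiple GPUs.
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    cudaSetDevice(0);

    cudaFree(d_data);
    return 0;
}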
Structure Packing
The first thing to understand is that, to let modern processors access memory quickly, compilers are constrained in how they lay out the basic data types: each type is aligned to its natural boundary. The discussion below uses C, but other programming languages are subject to the same constraints.
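A small sketch of how member order changes padding; the sizes shown are typical for a 64-bit platform but are implementation-defined:

#include <cstdio>

// Padded: the char is followed by 7 bytes of padding so the double is 8-byte aligned,
// and the struct is padded at the end to a multiple of 8.
struct Bad  { char c; double d; int i; };   // typically sizeof == 24

// Reordered from largest to smallest member: much less padding.
struct Good { double d; int i; char c; };   // typically sizeof == 16

int main()
{
    std::printf("Bad:  %zu bytes\n", sizeof(Bad));
    std::printf("Good: %zu bytes\n", sizeof(Good));
    return 0;
}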
CUDA B.5. Memory Fence Functions
The CUDA programming model assumes a weakly-ordered memory model; that is, the order in which a thread writes data to shared memory, global memory, page-locked host memory, or the memory of a peer device is not necessarily the order in which another thread observes those writes. For example:
__device__ volatile int X = 1, Y = 2;

__device__ void writeXY()
{
    X = 10;
    Y = 20;
}

__device__ void readXY()
{
    int A = X;
    int B = Y;
}
If thread 1 executes writeXY() while thread 2 executes readXY(), thread 2 may observe any of the following:
A=1,B=2
A=10,B=2
A=10,B=20
To impose an ordering on these memory operations, the following memory fence functions can be used:
1.void __threadfence_block()
2.void __threadfence()
3.void __threadfence_system()
void __threadfence_block()
1. All writes to all memory made by the calling thread before the call to __threadfence_block() are observed by all threads in the calling thread's block as occurring before all writes to all memory made by the calling thread after the call.
2. All reads from all memory made by the calling thread before the call to __threadfence_block() are ordered before all reads from all memory made by the calling thread after the call.
void __threadfence()
1. __threadfence() provides, for all threads in the block, everything that __threadfence_block() provides.
2. It additionally ensures that no writes to all memory made by the calling thread after the call to __threadfence() are observed by any thread in the device as occurring before any write to all memory made by the calling thread before the call.
Note that for this ordering guarantee to hold, the observing threads must truly observe memory and not a cached copy of it, which is why the example declares the variables volatile.
void __threadfence_system()
1. __threadfence_system() provides, for all threads in the block, everything that __threadfence_block() provides.
2. It additionally ensures that all writes to all memory made by the calling thread before the call to __threadfence_system() are observed as occurring before all writes made after the call by all threads in the device, by host threads, and by all threads in peer devices.
In the example above, inserting a fence function call between X = 10; and Y = 20; and between int A = X; and int B = Y; guarantees that whenever thread 2 observes B = 20 it also observes A = 10. If thread 1 and thread 2 belong to the same block, __threadfence_block() is sufficient. If they belong to the same device but to different blocks, __threadfence() must be used. If they belong to different devices, __threadfence_system() must be used.
In the example below, if __threadfence() were not executed between result[blockIdx.x] = partialSum and unsigned int value = atomicInc(&count, gridDim.x), the increment of count could become visible before the partial sum is written, so the last block could race ahead to float totalSum = calculateTotalSum(result) and read stale partial sums.
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;

__global__ void sum(const float* array, unsigned int N,
                    volatile float* result)
{
    // Each block sums a subset of the input array.
    float partialSum = calculatePartialSum(array, N);

    if (threadIdx.x == 0) {

        // Thread 0 of each block stores the partial sum
        // to global memory. The compiler will use
        // a store operation that bypasses the L1 cache
        // since the "result" variable is declared as
        // volatile. This ensures that the threads of
        // the last block will read the correct partial
        // sums computed by all other blocks.
        result[blockIdx.x] = partialSum;

        // Thread 0 makes sure that the incrementation
        // of the "count" variable is only performed after
        // the partial sum has been written to global memory.
        __threadfence();

        // Thread 0 signals that it is done.
        unsigned int value = atomicInc(&count, gridDim.x);

        // Thread 0 determines if its block is the last
        // block to be done.
        isLastBlockDone = (value == (gridDim.x - 1));
    }

    // Synchronize to make sure that each thread reads
    // the correct value of isLastBlockDone.
    __syncthreads();

    if (isLastBlockDone) {

        // The last block sums the partial sums
        // stored in result[0 .. gridDim.x-1]
        float totalSum = calculateTotalSum(result);

        if (threadIdx.x == 0) {

            // Thread 0 of last block stores the total sum
            // to global memory and resets the count
            // variable, so that the next kernel call
            // works properly.
            result[0] = totalSum;
            count = 0;
        }
    }
}
B.6. Synchronization Functions
void __syncthreads()
1. Waits until all threads in the thread block have reached this point, and until all global and shared memory accesses made by these threads prior to __syncthreads() are visible to all threads in the block.
2. __syncthreads() is used to coordinate communication between the threads of the same block. When some threads within a block access the same addresses in shared or global memory, there are potential read-after-write, write-after-read, or write-after-write hazards for some of these memory accesses. These data hazards can be avoided by synchronizing the threads in between these accesses.
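A small sketch of the read-after-write hazard: each thread writes one element of a shared tile and then reads its neighbour's element, so a barrier is required between the write and the read (kernel name and tile size are illustrative):

// Assumes gridDim.x * blockDim.x == n and blockDim.x <= 256.
__global__ void shiftLeft(const float* in, float* out, int n)
{
    __shared__ float tile[256];
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    tile[threadIdx.x] = in[i];

    // Without this barrier, the read below could observe a neighbour's slot
    // before that neighbour has written it (read-after-write hazard).
    __syncthreads();

    out[i] = tile[(threadIdx.x + 1) % blockDim.x];
}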
Sunday, February 3, 2019
CUDA
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
https://blog.csdn.net/u012033124/article/details/70792877
https://yq.aliyun.com/articles/444192?spm=a2c4e.11153940.blogcont437440.16.2f4aaf64F2T67U
cudaMalloc() cudaMallocPitch() cudaMalloc3D()
https://yq.aliyun.com/articles/444192?spm=a2c4e.11153940.blogcont444205.14.42e71425hoeObL
shared memory
https://yq.aliyun.com/articles/444205?spm=a2c4e.11153940.blogcont444192.14.188634e15E3RtT
https://blog.csdn.net/full_speed_turbo/article/details/73733290
cudaHostAlloc()和cudaFreeHost()
https://yq.aliyun.com/articles/444231?spm=a2c4e.11153940.blogcont444205.15.5aa614255iM6tp
https://yq.aliyun.com/articles/448549?spm=a2c4e.11153940.blogcont444231.15.d1fc5034TYekrN
stream
3.2.5.5. Streams
https://yq.aliyun.com/articles/448557?spm=a2c4e.11153940.blogcont448554.14.28344c6dV7YOto
https://yq.aliyun.com/articles/448561?spm=a2c4e.11153940.blogcont448557.17.2c52d49dVV9Scq
https://yq.aliyun.com/articles/448566?spm=a2c4e.11153940.blogcont448561.15.4e907b5ezol2m0
https://yq.aliyun.com/articles/484099?spm=a2c4e.11153940.blogcont484097.17.76b66293aEsr2M
multipleGPU
https://yq.aliyun.com/articles/448571?spm=a2c4e.11153940.blogcont448566.17.1e614788cIHHoR
https://yq.aliyun.com/articles/448576?spm=a2c4e.11153940.blogcont448571.13.30d852afZP1di4
Texture
https://yq.aliyun.com/articles/448580?spm=a2c4e.11153940.blogcont448576.16.4ebf63040D2e77
https://yq.aliyun.com/articles/448584?spm=a2c4e.11153940.blogcont448580.14.7f87cd4bHTF7ZU
https://yq.aliyun.com/articles/460312?spm=a2c4e.11153940.blogcont460310.12.473b2f49JS6MDy
https://yq.aliyun.com/articles/471830?spm=a2c4e.11153940.blogcont471829.16.112938b3O8iK8A
https://yq.aliyun.com/articles/471831?spm=a2c4e.11153940.blogcont471830.14.26e13209b9Exln
https://yq.aliyun.com/articles/471833?spm=a2c4e.11153940.blogcont471831.15.11a22111JCojTg
Surface Memory
https://yq.aliyun.com/articles/460313?spm=a2c4e.11153940.blogcont460312.16.70e27216o0ECHA
https://yq.aliyun.com/articles/460355?spm=a2c4e.11153940.blogcont460313.15.59016e33CVlIhQ
https://yq.aliyun.com/articles/471835?spm=a2c4e.11153940.blogcont471833.16.44f347d0aJFIGt
https://yq.aliyun.com/articles/471840?spm=a2c4e.11153940.blogcont471835.15.51882fadx8OSyx
OpenGL Interoperability
https://yq.aliyun.com/articles/460359?spm=a2c4e.11153940.blogcont460355.15.4e464750Cq0KhG
Direct3D Interoperability
https://yq.aliyun.com/articles/460363?spm=a2c4e.11153940.blogcont460359.14.6a2b73b6QeWWcV
__align__
https://yq.aliyun.com/articles/463136?spm=a2c4e.11153940.blogcont463135.17.42c31b192cve2G
__fdividef(x,y) rsqrtf() sinf(x) cosf(x) tanf(x) sincosf(x) 3.141592653589793f
https://yq.aliyun.com/articles/467251?spm=a2c4e.11153940.blogcont463137.14.6e6d1e7cASdFfX
https://docs.nvidia.com/cuda/cuda-math-api/modules.html#modules
http://www.cplusplus.com/doc/tutorial/constants/
extern __shared__ __restrict__
https://yq.aliyun.com/articles/467260?spm=a2c4e.11153940.blogcont467254.18.768e5c37fWyzGi
https://www.itread01.com/content/1544596582.html
__threadfence_block() __threadfence() __threadfence_system()
https://yq.aliyun.com/articles/467266?spm=a2c4e.11153940.blogcont467260.18.75ad33c2Kq3B2f
__syncthreads() __syncthreads_count(int predicate) __syncthreads_and(int predicate)
__syncthreads_or(int predicate) __syncwarp(unsigned mask=0xffffffff)
https://yq.aliyun.com/articles/471829?spm=a2c4e.11153940.blogcont469057.15.76b12503xox7AI
__ldg atomicAdd() atomicAdd_system() atomicAdd_block()
https://yq.aliyun.com/articles/474404?spm=a2c4e.11153940.blogcont471840.17.5ce65ba0DOLjFO
https://yq.aliyun.com/articles/474407?spm=a2c4e.11153940.blogcont474404.14.588e720dS642ER
https://yq.aliyun.com/articles/474408?spm=a2c4e.11153940.blogcont474407.12.39865848yNv7FY
Warp Match Warp Shuffle
https://yq.aliyun.com/articles/474409?spm=a2c4e.11153940.blogcont474408.13.368d798bHn9bQs
bcast scan4() warpReduce()
https://yq.aliyun.com/articles/474410?spm=a2c4e.11153940.blogcont474409.14.78a84e35xe8b0w
https://yq.aliyun.com/articles/474412?spm=a2c4e.11153940.blogcont474410.17.263f7661imqauw
__prof_trigger
https://yq.aliyun.com/articles/474414?spm=a2c4e.11153940.blogcont474412.17.6faaeef6JkKvoN
printf()
https://yq.aliyun.com/articles/474416?spm=a2c4e.11153940.blogcont474414.18.28806b5ajPLbeK
void* malloc void* memcpy void* memset
https://yq.aliyun.com/articles/479275?spm=a2c4e.11153940.blogcont474416.16.1e0a54a3qnJWkr
https://yq.aliyun.com/articles/479277?spm=a2c4e.11153940.blogcont479275.19.b33a14aeUyX6om
__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)
https://yq.aliyun.com/articles/479279?spm=a2c4e.11153940.blogcont479277.16.2d63e72fUMEoBY
cooperative_groups
https://yq.aliyun.com/articles/479280?spm=a2c4e.11153940.blogcont479279.16.94025904Av0cxE
thread_block
https://yq.aliyun.com/articles/479281?spm=a2c4e.11153940.blogcont479280.20.3ed33005DzECgg
cg::coalesced_group
https://yq.aliyun.com/articles/484085?spm=a2c4e.11153940.blogcont479291.17.302b35e8gEFQge
grid_group
https://yq.aliyun.com/articles/484087?spm=a2c4e.11153940.blogcont484085.18.71af55c5q63JYF
CUDA Dynamic Parallelism
D.1. Introduction
https://yq.aliyun.com/articles/484090?spm=a2c4e.11153940.blogcont484087.15.2fce1461iUJifm
D.2.1.1. Parent and Child Grids
https://yq.aliyun.com/articles/484092?spm=a2c4e.11153940.blogcont484090.19.5e44102aBBlwGm
D.2.1.5. Ordering and Concurrency
https://yq.aliyun.com/articles/484094?spm=a2c4e.11153940.blogcont484092.17.bc5f299a8BGZxY
D.3.1.1. Device-Side Kernel Launch
https://yq.aliyun.com/articles/484097?spm=a2c4e.11153940.blogcont484094.16.69f149cbZtauoH
D.3.1.6.3. Shared Memory Variable Declarations
https://yq.aliyun.com/articles/484101?spm=a2c4e.11153940.blogcont484099.16.a2945f3a5o0NFK
D.3.1.2. Streams
https://yq.aliyun.com/articles/484099?spm=a2c4e.11153940.blogcont484097.17.76b66293aEsr2M
D.3.1.6.1. Device and Constant Memory
https://yq.aliyun.com/articles/484101?spm=a2c4e.11153940.blogcont484099.16.4e265f3aV8oPZu
D.3.1.7. API Errors and Launch Failures
https://yq.aliyun.com/articles/484103?spm=a2c4e.11153940.blogcont484101.14.767b2d03UcvCUw
D.3.3.1. Including Device Runtime API in CUDA Code
https://yq.aliyun.com/articles/484106?spm=a2c4e.11153940.blogcont484103.18.3215bfd6Lsbfkg
D.4.3. Implementation Restrictions and Limitations
https://yq.aliyun.com/articles/486308?spm=a2c4e.11153940.blogcont484106.18.1ba91458umVT2z
E. Mathematical Functions
https://yq.aliyun.com/articles/486309?spm=a2c4e.11153940.blogcont486308.18.66bc5a8b3QuRyV
F. C/C++ Language Support
https://yq.aliyun.com/articles/486310?spm=a2c4e.11153940.blogcont486309.14.a34674f4T716E2
F.3.3. Qualifiers
https://yq.aliyun.com/articles/486311?spm=a2c4e.11153940.blogcont486310.15.5a566a59McSA6W
F.3.9.3. Function Parameters F.3.9.4. Static Variables within Function F.3.9.5. Function Pointers F.3.9.6. Function Recursion F.3.9.7. Friend Functions F.3.9.8. Operator Function F.3.10. Classes
https://yq.aliyun.com/articles/486313?spm=a2c4e.11153940.blogcont486312.17.4e2d7106OAStzs
F.3.11. Templates F.3.12. Trigraphs and Digraphs F.3.13. Const-qualified variables F.3.14. Long Double F.3.15. Deprecation Annotation
https://yq.aliyun.com/articles/486317?spm=a2c4e.11153940.blogcont486313.16.286b5ee5OW5dgN
grid/block allocation
https://www.itread01.com/content/1541990407.html
cudaMallocPitch and cudaMemcpy2D
https://stackoverflow.com/questions/35771430/cudamallocpitch-and-cudamemcpy2d
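A short sketch of the pitched allocation pattern from the links above (width and height are arbitrary):

#include <cuda_runtime.h>
#include <vector>

int main()
{
    const size_t width = 100, height = 64;   // elements per row, number of rows
    std::vector<float> h_data(width * height, 1.0f);

    // cudaMallocPitch pads each row so rows start on aligned addresses;
    // 'pitch' is the actual row stride in bytes.
    float* d_data = nullptr;
    size_t pitch = 0;
    cudaMallocPitch(reinterpret_cast<void**>(&d_data), &pitch, width * sizeof(float), height);

    // cudaMemcpy2D copies row by row, honouring both the host and device strides.
    cudaMemcpy2D(d_data, pitch,
                 h_data.data(), width * sizeof(float),   // host rows are tightly packed
                 width * sizeof(float), height,
                 cudaMemcpyHostToDevice);

    cudaFree(d_data);
    return 0;
}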
Avoiding strict aliasing
void foo(const float* __restrict__ a, const float* __restrict__ b, float* __restrict__ c)
https://www.oschina.net/question/234345_52682
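A small sketch of the pattern from the link above: marking the pointers __restrict__ promises the compiler that they do not alias, so loaded values can stay in registers instead of being reloaded after every store (the kernel is illustrative):

// Without __restrict__, the compiler must assume a store to c[] may change a[] or b[],
// forcing it to reload a[i] and b[i] for every use.
__global__ void foo(const float* __restrict__ a,
                    const float* __restrict__ b,
                    float*       __restrict__ c,
                    int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float ai = a[i];    // can safely be cached in a register
        float bi = b[i];
        c[i]  = ai * bi;
        c[i] += ai + bi;    // no reload of a[i] or b[i] needed
    }
}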
Saturday, February 2, 2019
Numpy
NumPy stands for Numerical Python. Compared with the built-in Python sequences (lists, sets, tuples), NumPy is widely used for numerical computation; the following is a brief introduction to its features.