Saturday, February 9, 2019
Installing cereal
cereal is a header-only C++11 serialization library. cereal takes arbitrary data types and reversibly turns them into different representations, such as compact binary encodings, XML, or JSON. cereal was designed to be fast, lightweight, and easy to extend; it has no external dependencies and can easily be bundled with other code or used standalone.
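A minimal usage sketch (the MyRecord type and its fields are made up for illustration; only the cereal headers and archive types come from the library):

#include <cereal/archives/json.hpp>
#include <fstream>

// Hypothetical record type; serialize() tells cereal which members to archive.
struct MyRecord
{
    int id;
    double value;

    template <class Archive>
    void serialize(Archive& ar)
    {
        ar(id, value);
    }
};

int main()
{
    std::ofstream os("record.json");
    cereal::JSONOutputArchive archive(os); // could also be BinaryOutputArchive or XMLOutputArchive
    MyRecord r{42, 3.14};
    archive(cereal::make_nvp("record", r)); // writes the record as JSON
    return 0;
}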
Friday, February 8, 2019
Page-Locked Host Memory
Transfers from locked (pinned) pages are faster than from unlocked (pageable) pages. The "Lock Pages in Memory" security setting determines which accounts can use a process to keep data in physical memory, preventing the system from paging that data out to virtual memory on disk; this reduces the amount of available random access memory (RAM) and can significantly affect system performance.
CUDA provides the cudaHostAlloc and cudaHostRegister calls to allocate or register page-locked memory. When host memory is pinned, the NVIDIA driver detects this on memory transfers and issues the copy along the corresponding (faster) code path.
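A minimal sketch of allocating pinned host memory with cudaHostAlloc and using it for an asynchronous copy (the buffer size and the single stream are arbitrary choices for illustration):

#include <cuda_runtime.h>

int main()
{
    const size_t n = 1 << 20;
    float* h_pinned = nullptr;
    float* d_data = nullptr;

    // Allocate page-locked (pinned) host memory instead of using malloc().
    cudaHostAlloc(reinterpret_cast<void**>(&h_pinned), n * sizeof(float), cudaHostAllocDefault);
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Because the host buffer is pinned, this copy can truly overlap other work.
    cudaMemcpyAsync(d_data, h_pinned, n * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(d_data);
    cudaFreeHost(h_pinned);
    return 0;
}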
Wednesday, February 6, 2019
CUDA syntax
Source code is in .cu files, which contain mixture of host (CPU) and device (GPU) code.
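A minimal sketch of such a mixed .cu file, compiled with nvcc (the kernel name and launch configuration are arbitrary):

#include <cuda_runtime.h>

// Device code: runs on the GPU.
__global__ void scale(float* data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

// Host code: runs on the CPU and launches the kernel.
int main()
{
    const int n = 256;
    float* d_data = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));
    cudaMemset(d_data, 0, n * sizeof(float));

    scale<<<(n + 127) / 128, 128>>>(d_data, 2.0f, n);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}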
HOOMD notes
test_dpd_integrator.cc
std::shared_ptr<PotentialPairDPD> dpdc(new PP_DPD(sysdef,nlist));
GPUArray<Scalar4>& force_array_1 = dpdc->getForceArray();
ArrayHandle<Scalar4> h_force_1(force_array_1,access_location::host,access_mode::read);
dpd_conservative_force_test< PotentialPairGPU<EvaluatorPairDPDThermo, gpu_compute_dpdthermo_forces > >(std::shared_ptr<ExecutionConfiguration>(new ExecutionConfiguration(ExecutionConfiguration::GPU)));
ForceCompute.h
//! Get the array of computed forces
GlobalArray<Scalar4>& getForceArray()
{
return m_force;
}
ExecutionConfiguration.h
HOOMDMath.h
// Handle both single and double precision through a define
#ifdef SINGLE_PRECISION
//! Floating point type (single precision)
typedef float Scalar;
//! Floating point type with x,y elements (single precision)
typedef float2 Scalar2;
//! Floating point type with x,y,z elements (single precision)
typedef float3 Scalar3;
//! Floating point type with x,y,z,w elements (single precision)
typedef float4 Scalar4;
#else
//! Floating point type (double precision)
typedef double Scalar;
//! Floating point type with x,y elements (double precision)
typedef double2 Scalar2;
//! Floating point type with x,y,z elements (double precision)
typedef double3 Scalar3;
//! Floating point type with x,y,z,w elements (double precision)
typedef double4 Scalar4;
#endif
//! make a scalar2 value
HOSTDEVICE inline Scalar2 make_scalar2(Scalar x, Scalar y)
{
Scalar2 retval;
retval.x = x;
retval.y = y;
return retval;
}
//! make a scalar3 value
HOSTDEVICE inline Scalar3 make_scalar3(Scalar x, Scalar y, Scalar z)
{
Scalar3 retval;
retval.x = x;
retval.y = y;
retval.z = z;
return retval;
}
//! make a scalar4 value
HOSTDEVICE inline Scalar4 make_scalar4(Scalar x, Scalar y, Scalar z, Scalar w)
{
Scalar4 retval;
retval.x = x;
retval.y = y;
retval.z = z;
retval.w = w;
return retval;
}
PotentialPairDPDThermoGPU.cuh
gpu_compute_dpd_forces -> DPDForceComputeKernel -> launch
//! Kernel driver that computes pair DPD thermo forces on the GPU
/*! \param args Additional options
\param d_params Per type-pair parameters for the evaluator
This is just a driver function for gpu_compute_dpd_forces_kernel(), see it for details.
*/
template< class evaluator >
cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args,
const typename evaluator::param_type *d_params)
{
assert(d_params);
assert(args.d_rcutsq);
assert(args.ntypes > 0);
// run the kernel
if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width)
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 1, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
else
{
if (args.compute_virial)
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 1, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
else
{
switch (args.shift_mode)
{
case 0:
{
DPDForceComputeKernel<evaluator, 0, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
case 1:
{
DPDForceComputeKernel<evaluator, 1, 0, 0, gpu_dpd_pair_force_max_tpp>::launch(args, d_params);
break;
}
default:
return cudaErrorUnknown;
}
}
}
return cudaSuccess;
}
PotentialPairDPDThermoGPU.cuh
template<class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp>
struct DPDForceComputeKernel
{
//! Launcher for the DPD force kernel
/*!
* \param args Other arguments to pass onto the kernel
* \param d_params Parameters for the potential, stored per type pair
*/
static void launch(const dpd_pair_args_t& args, const typename evaluator::param_type *d_params)
{
if (tpp == args.threads_per_particle)
{
// setup the grid to run the kernel
unsigned int block_size = args.block_size;
Index2D typpair_idx(args.ntypes);
unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type))
* typpair_idx.getNumElements();
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>);
if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args);
block_size = block_size < max_block_size ? block_size : max_block_size;
dim3 grid(args.N / (block_size/tpp) + 1, 1, 1);
gpu_compute_dpd_forces_kernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp>
<<<grid, block_size, shared_bytes>>>
(args.d_force,
args.d_virial,
args.virial_pitch,
args.N,
args.d_pos,
args.d_vel,
args.d_tag,
args.box,
args.d_n_neigh,
args.d_nlist,
args.d_head_list,
d_params,
args.d_rcutsq,
args.seed,
args.timestep,
args.deltaT,
args.T,
args.ntypes);
}
else
{
DPDForceComputeKernel<evaluator, shift_mode, compute_virial, use_gmem_nlist, tpp/2>::launch(args, d_params);
}
}
};
// positions
GlobalArray< Scalar4 > pos(N, m_exec_conf);
m_pos.swap(pos);
GlobalArray<Scalar4> m_pos;
ArrayHandle< Scalar4 > h_pos(m_pos, access_location::host, access_mode::readwrite);
GlobalArray.h
//! Definition of GlobalArray using CRTP
template<class T>
class GlobalArray : public GPUArrayBase<T, GlobalArray<T> >
{
public:
//! Empty constructor
GlobalArray()
: m_num_elements(0), m_pitch(0), m_height(0), m_acquired(false), m_align_bytes(0)
{ }
/*! Allocate a 1D array in managed memory
\param num_elements Number of elements in array
\param exec_conf The current execution configuration
*/
GlobalArray(unsigned int num_elements, std::shared_ptr<const ExecutionConfiguration> exec_conf,
const std::string& tag = std::string() )
: m_exec_conf(exec_conf),
#ifndef ALWAYS_USE_MANAGED_MEMORY
// explicit copy should be elided
m_fallback(exec_conf->allConcurrentManagedAccess() ?
GPUArray<T>() : GPUArray<T>(num_elements, exec_conf)),
#endif
m_num_elements(num_elements), m_pitch(num_elements), m_height(1), m_acquired(false), m_tag(tag),
m_align_bytes(0)
{
#ifndef ALWAYS_USE_MANAGED_MEMORY
if (!this->m_exec_conf->allConcurrentManagedAccess())
return;
#endif
assert(this->m_exec_conf);
#ifdef ENABLE_CUDA
if (this->m_exec_conf->isCUDAEnabled())
{
// use OS page size as minimum alignment
m_align_bytes = getpagesize();
}
#endif
if (m_num_elements > 0)
allocate();
}
GPUArray.h
//! CRTP (Curiously recurring template pattern) interface for GPUArray/GlobalArray
template<class T, class Derived>
class GPUArrayBase
{
public:
//! Get the number of elements
/*!
- For 1-D allocated GPUArrays, this is the number of elements allocated.
- For 2-D allocated GPUArrays, this is the \b total number of elements (\a pitch * \a height) allocated
*/
unsigned int getNumElements() const
{
return static_cast<Derived const&>(*this).getNumElements();
}
//! Test if the GPUArray is NULL
bool isNull() const
{
return static_cast<Derived const&>(*this).isNull();
}
//! Get the width of the allocated rows in elements
/*!
- For 2-D allocated GPUArrays, this is the total width of a row in memory (including the padding added for coalescing)
- For 1-D allocated GPUArrays, this is the simply the number of elements allocated.
*/
unsigned int getPitch() const
{
return static_cast<Derived const&>(*this).getPitch();
}
//! Get the number of rows allocated
/*!
- For 2-D allocated GPUArrays, this is the height given to the constructor
- For 1-D allocated GPUArrays, this is the simply 1.
*/
unsigned int getHeight() const
{
return static_cast<Derived const&>(*this).getHeight();
}
//! Resize the GPUArray
void resize(unsigned int num_elements)
{
static_cast<Derived&>(*this).resize(num_elements);
}
//! Resize a 2D GPUArray
void resize(unsigned int width, unsigned int height)
{
static_cast<Derived&>(*this).resize(width,height);
}
protected:
//! Acquires the data pointer for use
inline ArrayHandleDispatch<T> acquire(const access_location::Enum location, const access_mode::Enum mode
#ifdef ENABLE_CUDA
, bool async = false
#endif
) const
{
return static_cast<Derived const&>(*this).acquire(location, mode
#ifdef ENABLE_CUDA
, async
#endif
);
}
//! Release the data pointer
inline void release() const
{
return static_cast<Derived const&>(*this).release();
}
//! Returns the acquire state
inline bool isAcquired() const
{
return static_cast<Derived const&>(*this).isAcquired();
}
// need to be friend of the ArrayHandle class
friend class ArrayHandle<T>;
friend class ArrayHandleAsync<T>;
private:
// Make constructor private to prevent mistakes
GPUArrayBase() {};
friend Derived;
};
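The GPUArrayBase methods above forward every call to the derived class through a static_cast; the following stripped-down sketch (names are illustrative, not HOOMD's) shows the CRTP dispatch idea in isolation:

#include <cstdio>

// Base exposes a common interface and forwards to Derived at compile time.
template <class Derived>
class ArrayBase
{
public:
    unsigned int getNumElements() const
    {
        // No virtual call: the concrete type is known at compile time.
        return static_cast<Derived const&>(*this).getNumElements();
    }
};

class HostArray : public ArrayBase<HostArray>
{
public:
    explicit HostArray(unsigned int n) : m_n(n) {}
    unsigned int getNumElements() const { return m_n; }

private:
    unsigned int m_n;
};

template <class Derived>
void printSize(const ArrayBase<Derived>& arr)
{
    std::printf("%u elements\n", arr.getNumElements());
}

int main()
{
    HostArray a(16);
    printSize(a); // prints "16 elements"
    return 0;
}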
GPUArray.h
template<class T>
class ArrayHandle
{
    public:
        template<class Derived>
        inline ArrayHandle(const GPUArrayBase<T, Derived>& gpu_array, const access_location::Enum location = access_location::host,
                           const access_mode::Enum mode = access_mode::readwrite);

        //! Notifies the containing GPUArray that the handle has been released
        virtual inline ~ArrayHandle() = default;

    private:
        ArrayHandleDispatch<T> dispatch; //!< Reference to the dispatch object that manages the acquire/release

    public:
        T* const data; //!< Pointer to data
};
Template non-type parameters
Besides type parameters, template classes and functions can use another kind of template parameter known as a template non-type parameter: a compile-time constant value (such as an integer) rather than a type.
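A small sketch of a non-type template parameter (the fixed-size buffer type is invented for illustration); this is the same mechanism HOOMD uses for the int tpp parameter of DPDForceComputeKernel above:

#include <cstddef>
#include <cstdio>

// N is a non-type template parameter: a compile-time constant, not a type.
template <typename T, std::size_t N>
struct FixedBuffer
{
    T data[N];
    constexpr std::size_t size() const { return N; }
};

int main()
{
    FixedBuffer<float, 8> buf{};      // N = 8 is baked into the type
    std::printf("%zu\n", buf.size()); // prints 8
    return 0;
}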
5.3.2. Device Memory Accesses (to be continued)
A memory access may require multiple transactions, depending on how the memory addresses accessed by the threads within a warp are distributed. As a general rule for global memory, the more scattered the addresses, the lower the throughput.
Global Memory
Global memory resides in device memory, and device memory is accessed via 32-, 64-, or 128-byte memory transactions. For example, with 4-byte accesses per thread, a warp's request may be served as a single coalesced transaction, or broken into several 32-byte memory transactions when the addresses are not contiguous, wasting bandwidth.
Size and Alignment Requirement
Any access (via a variable or a pointer) to data residing in global memory compiles to a single global memory instruction if and only if the size of the data type is 1, 2, 4, 8, or 16 bytes and the data is naturally aligned (i.e., its address is a multiple of that size).
For structures, the size and alignment requirements can be enforced by the compiler using the alignment specifiers __align__(8) or __align__(16), such as
struct __align__(8) { float x; float y; };
or
struct __align__(16) { float x; float y; float z; };
Any address of a variable residing in global memory, or returned by one of the memory allocation routines from the driver or runtime API, is always aligned to at least 256 bytes.
Reading non-naturally-aligned 8-byte or 16-byte words produces incorrect results, so special care must be taken to keep the starting address of any value or array of these types aligned.
A typical case where this is easy to overlook is when using some custom global memory allocation scheme, where the allocations of multiple arrays (with multiple calls to cudaMalloc() or cuMemAlloc()) are replaced by the allocation of a single large block of memory that is then partitioned into the individual arrays; in this case, the starting address of each array is offset from the starting address of the block, and that offset may break the alignment.
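A small sketch of the pitfall just described: carving arrays out of one large cudaMalloc'd block by hand can break the 16-byte alignment that float4 accesses require (the offsets are illustrative):

#include <cuda_runtime.h>

int main()
{
    char* block = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&block), 1024); // block itself is aligned to at least 256 bytes

    // Carving arrays out of the block by hand:
    float*  a = reinterpret_cast<float*>(block);         // OK: offset 0
    float4* b = reinterpret_cast<float4*>(block + 100);  // BAD: offset 100 is not a multiple of 16,
                                                          // so 16-byte float4 accesses are misaligned
    float4* c = reinterpret_cast<float4*>(block + 112);  // OK: offset 112 is a multiple of 16

    (void)a; (void)b; (void)c;
    cudaFree(block);
    return 0;
}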
Tuesday, February 5, 2019
Chapter 3. PROGRAMMING INTERFACE
CUDA C provides a simple path for users familiar with the C programming language to write programs that execute on the device. It consists of extensions to the C language and a runtime library.
The core language extensions have been introduced in the programming model. They let programmers define a kernel as a C function and use some new syntax to specify the grid and block dimensions each time the function is called. A complete description of all extensions can be found in the C Language Extensions chapter; any source file that contains any of these extensions must be compiled with nvcc.
The runtime is introduced in Compilation Workflow. It provides C functions that execute on the host to
1. allocate and deallocate device memory,
2. transfer data between host memory and device memory,
3. manage systems with multiple GPUs, etc. (see the sketch after this list).
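A minimal sketch of these runtime calls (sizes and the device index are arbitrary):

#include <cuda_runtime.h>
#include <vector>

int main()
{
    const size_t n = 1024;
    std::vector<float> h_data(n, 1.0f);

    // 1. Allocate device memory from the host.
    float* d_data = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));

    // 2. Transfer data between host and device memory.
    cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_data.data(), d_data, n * sizeof(float), cudaMemcpyDeviceToHost);

    // 3. Query / select among multiple GPUs.
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    cudaSetDevice(0);

    cudaFree(d_data);
    return 0;
}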
Structure Packing
The first thing to understand is that, to let modern processors access memory quickly, compilers are constrained in how they lay out the basic data types: each type is aligned to its natural boundary. The discussion below uses C, but other programming languages are subject to the same constraints.
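A small sketch of how member order changes padding; the sizes shown are typical for a 64-bit platform but are implementation-defined:

#include <cstdio>

// Padded: the char is followed by 7 bytes of padding so the double is 8-byte aligned,
// and the struct is padded at the end to a multiple of 8.
struct Bad  { char c; double d; int i; };   // typically sizeof == 24

// Reordered from largest to smallest member: much less padding.
struct Good { double d; int i; char c; };   // typically sizeof == 16

int main()
{
    std::printf("Bad:  %zu bytes\n", sizeof(Bad));
    std::printf("Good: %zu bytes\n", sizeof(Good));
    return 0;
}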
CUDA B.5. Memory Fence Functions
The CUDA programming model assumes a weakly-ordered memory model; that is, the order in which a thread writes data to shared memory, global memory, page-locked host memory, or the memory of a peer device is not necessarily the order in which another thread observes those writes. For example:
__device__ volatile int X = 1, Y = 2;

__device__ void writeXY()
{
    X = 10;
    Y = 20;
}

__device__ void readXY()
{
    int A = X;
    int B = Y;
}
If thread 1 executes writeXY() while thread 2 executes readXY(), thread 2 may observe any of the following:
A=1,B=2
A=10,B=2
A=10,B=20
To impose an ordering on these memory operations, the following memory fence functions can be used:
1.void __threadfence_block()
2.void __threadfence()
3.void __threadfence_system()
void __threadfence_block()
1. All writes to all memory made by the calling thread before the call to __threadfence_block() are observed by all threads in the calling thread's block as occurring before all writes to all memory made by the calling thread after the call.
2. All reads from all memory made by the calling thread before the call to __threadfence_block() are ordered before all reads from all memory made by the calling thread after the call.
void __threadfence()
1. __threadfence() provides, for all threads in the block, everything that __threadfence_block() provides.
2. It additionally ensures that no writes to all memory made by the calling thread after the call to __threadfence() are observed by any thread in the device as occurring before any write to all memory made by the calling thread before the call.
Note that for this ordering guarantee to hold, the observing threads must truly observe memory and not a cached copy of it, which is why the example declares the variables volatile.
void __threadfence_system()
1. __threadfence_system() provides, for all threads in the block, everything that __threadfence_block() provides.
2. It additionally ensures that all writes to all memory made by the calling thread before the call to __threadfence_system() are observed as occurring before all writes made after the call by all threads in the device, by host threads, and by all threads in peer devices.
In the example above, inserting a fence function call between X = 10; and Y = 20; and between int A = X; and int B = Y; guarantees that whenever thread 2 observes B = 20 it also observes A = 10. If thread 1 and thread 2 belong to the same block, __threadfence_block() is sufficient. If they belong to the same device but to different blocks, __threadfence() must be used. If they belong to different devices, __threadfence_system() must be used.
In the example below, if __threadfence() were not executed between result[blockIdx.x] = partialSum and unsigned int value = atomicInc(&count, gridDim.x), the increment of count could become visible before the partial sum is written, so the last block could race ahead to float totalSum = calculateTotalSum(result) and read stale partial sums.
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;

__global__ void sum(const float* array, unsigned int N,
                    volatile float* result)
{
    // Each block sums a subset of the input array.
    float partialSum = calculatePartialSum(array, N);

    if (threadIdx.x == 0) {

        // Thread 0 of each block stores the partial sum
        // to global memory. The compiler will use
        // a store operation that bypasses the L1 cache
        // since the "result" variable is declared as
        // volatile. This ensures that the threads of
        // the last block will read the correct partial
        // sums computed by all other blocks.
        result[blockIdx.x] = partialSum;

        // Thread 0 makes sure that the incrementation
        // of the "count" variable is only performed after
        // the partial sum has been written to global memory.
        __threadfence();

        // Thread 0 signals that it is done.
        unsigned int value = atomicInc(&count, gridDim.x);

        // Thread 0 determines if its block is the last
        // block to be done.
        isLastBlockDone = (value == (gridDim.x - 1));
    }

    // Synchronize to make sure that each thread reads
    // the correct value of isLastBlockDone.
    __syncthreads();

    if (isLastBlockDone) {

        // The last block sums the partial sums
        // stored in result[0 .. gridDim.x-1]
        float totalSum = calculateTotalSum(result);

        if (threadIdx.x == 0) {

            // Thread 0 of last block stores the total sum
            // to global memory and resets the count
            // variable, so that the next kernel call
            // works properly.
            result[0] = totalSum;
            count = 0;
        }
    }
}
B.6. Synchronization Functions
void __syncthreads()
1. Waits until all threads in the thread block have reached this point, and until all global and shared memory accesses made by these threads prior to __syncthreads() are visible to all threads in the block.
2. __syncthreads() is used to coordinate communication between the threads of the same block. When some threads within a block access the same addresses in shared or global memory, there are potential read-after-write, write-after-read, or write-after-write hazards for some of these memory accesses. These data hazards can be avoided by synchronizing the threads in between these accesses.
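A small sketch of the read-after-write hazard: each thread writes one element of a shared tile and then reads its neighbour's element, so a barrier is required between the write and the read (kernel name and tile size are illustrative):

// Assumes gridDim.x * blockDim.x == n and blockDim.x <= 256.
__global__ void shiftLeft(const float* in, float* out, int n)
{
    __shared__ float tile[256];
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    tile[threadIdx.x] = in[i];

    // Without this barrier, the read below could observe a neighbour's slot
    // before that neighbour has written it (read-after-write hazard).
    __syncthreads();

    out[i] = tile[(threadIdx.x + 1) % blockDim.x];
}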
Sunday, February 3, 2019
CUDA
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
https://blog.csdn.net/u012033124/article/details/70792877
https://yq.aliyun.com/articles/444192?spm=a2c4e.11153940.blogcont437440.16.2f4aaf64F2T67U
cudaMalloc() cudaMallocPitch() cudaMalloc3D()
https://yq.aliyun.com/articles/444192?spm=a2c4e.11153940.blogcont444205.14.42e71425hoeObL
shared memory
https://yq.aliyun.com/articles/444205?spm=a2c4e.11153940.blogcont444192.14.188634e15E3RtT
https://blog.csdn.net/full_speed_turbo/article/details/73733290
cudaHostAlloc()和cudaFreeHost()
https://yq.aliyun.com/articles/444231?spm=a2c4e.11153940.blogcont444205.15.5aa614255iM6tp
https://yq.aliyun.com/articles/448549?spm=a2c4e.11153940.blogcont444231.15.d1fc5034TYekrN
stream
3.2.5.5. Streams
https://yq.aliyun.com/articles/448557?spm=a2c4e.11153940.blogcont448554.14.28344c6dV7YOto
https://yq.aliyun.com/articles/448561?spm=a2c4e.11153940.blogcont448557.17.2c52d49dVV9Scq
https://yq.aliyun.com/articles/448566?spm=a2c4e.11153940.blogcont448561.15.4e907b5ezol2m0
https://yq.aliyun.com/articles/484099?spm=a2c4e.11153940.blogcont484097.17.76b66293aEsr2M
multipleGPU
https://yq.aliyun.com/articles/448571?spm=a2c4e.11153940.blogcont448566.17.1e614788cIHHoR
https://yq.aliyun.com/articles/448576?spm=a2c4e.11153940.blogcont448571.13.30d852afZP1di4
Texture
https://yq.aliyun.com/articles/448580?spm=a2c4e.11153940.blogcont448576.16.4ebf63040D2e77
https://yq.aliyun.com/articles/448584?spm=a2c4e.11153940.blogcont448580.14.7f87cd4bHTF7ZU
https://yq.aliyun.com/articles/460312?spm=a2c4e.11153940.blogcont460310.12.473b2f49JS6MDy
https://yq.aliyun.com/articles/471830?spm=a2c4e.11153940.blogcont471829.16.112938b3O8iK8A
https://yq.aliyun.com/articles/471831?spm=a2c4e.11153940.blogcont471830.14.26e13209b9Exln
https://yq.aliyun.com/articles/471833?spm=a2c4e.11153940.blogcont471831.15.11a22111JCojTg
Surface Memory
https://yq.aliyun.com/articles/460313?spm=a2c4e.11153940.blogcont460312.16.70e27216o0ECHA
https://yq.aliyun.com/articles/460355?spm=a2c4e.11153940.blogcont460313.15.59016e33CVlIhQ
https://yq.aliyun.com/articles/471835?spm=a2c4e.11153940.blogcont471833.16.44f347d0aJFIGt
https://yq.aliyun.com/articles/471840?spm=a2c4e.11153940.blogcont471835.15.51882fadx8OSyx
OpenGL Interoperability
https://yq.aliyun.com/articles/460359?spm=a2c4e.11153940.blogcont460355.15.4e464750Cq0KhG
Direct3D Interoperability
https://yq.aliyun.com/articles/460363?spm=a2c4e.11153940.blogcont460359.14.6a2b73b6QeWWcV
__align__
https://yq.aliyun.com/articles/463136?spm=a2c4e.11153940.blogcont463135.17.42c31b192cve2G
__fdividef(x,y) rsqrtf() sinf(x) cosf(x) tanf(x) sincosf(x) 3.141592653589793f
https://yq.aliyun.com/articles/467251?spm=a2c4e.11153940.blogcont463137.14.6e6d1e7cASdFfX
https://docs.nvidia.com/cuda/cuda-math-api/modules.html#modules
http://www.cplusplus.com/doc/tutorial/constants/
extern __shared__ __restrict__
https://yq.aliyun.com/articles/467260?spm=a2c4e.11153940.blogcont467254.18.768e5c37fWyzGi
https://www.itread01.com/content/1544596582.html
__threadfence_block() __threadfence() __threadfence_system()
https://yq.aliyun.com/articles/467266?spm=a2c4e.11153940.blogcont467260.18.75ad33c2Kq3B2f
__syncthreads() __syncthreads_count(int predicate) __syncthreads_and(int predicate)
__syncthreads_or(int predicate) __syncwarp(unsigned mask=0xffffffff)
https://yq.aliyun.com/articles/471829?spm=a2c4e.11153940.blogcont469057.15.76b12503xox7AI
__ldg atomicAdd() atomicAdd_system() atomicAdd_block()
https://yq.aliyun.com/articles/474404?spm=a2c4e.11153940.blogcont471840.17.5ce65ba0DOLjFO
https://yq.aliyun.com/articles/474407?spm=a2c4e.11153940.blogcont474404.14.588e720dS642ER
https://yq.aliyun.com/articles/474408?spm=a2c4e.11153940.blogcont474407.12.39865848yNv7FY
Warp Match Warp Shuffle
https://yq.aliyun.com/articles/474409?spm=a2c4e.11153940.blogcont474408.13.368d798bHn9bQs
bcast scan4() warpReduce()
https://yq.aliyun.com/articles/474410?spm=a2c4e.11153940.blogcont474409.14.78a84e35xe8b0w
https://yq.aliyun.com/articles/474412?spm=a2c4e.11153940.blogcont474410.17.263f7661imqauw
__prof_trigger
https://yq.aliyun.com/articles/474414?spm=a2c4e.11153940.blogcont474412.17.6faaeef6JkKvoN
printf()
https://yq.aliyun.com/articles/474416?spm=a2c4e.11153940.blogcont474414.18.28806b5ajPLbeK
void* malloc void* memcpy void* memset
https://yq.aliyun.com/articles/479275?spm=a2c4e.11153940.blogcont474416.16.1e0a54a3qnJWkr
https://yq.aliyun.com/articles/479277?spm=a2c4e.11153940.blogcont479275.19.b33a14aeUyX6om
__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)
https://yq.aliyun.com/articles/479279?spm=a2c4e.11153940.blogcont479277.16.2d63e72fUMEoBY
cooperative_groups
https://yq.aliyun.com/articles/479280?spm=a2c4e.11153940.blogcont479279.16.94025904Av0cxE
thread_block
https://yq.aliyun.com/articles/479281?spm=a2c4e.11153940.blogcont479280.20.3ed33005DzECgg
cg::coalesced_group
https://yq.aliyun.com/articles/484085?spm=a2c4e.11153940.blogcont479291.17.302b35e8gEFQge
grid_group
https://yq.aliyun.com/articles/484087?spm=a2c4e.11153940.blogcont484085.18.71af55c5q63JYF
CUDA Dynamic Parallelism
D.1. Introduction
https://yq.aliyun.com/articles/484090?spm=a2c4e.11153940.blogcont484087.15.2fce1461iUJifm
D.2.1.1. Parent and Child Grids
https://yq.aliyun.com/articles/484092?spm=a2c4e.11153940.blogcont484090.19.5e44102aBBlwGm
D.2.1.5. Ordering and Concurrency
https://yq.aliyun.com/articles/484094?spm=a2c4e.11153940.blogcont484092.17.bc5f299a8BGZxY
D.3.1.1. Device-Side Kernel Launch
https://yq.aliyun.com/articles/484097?spm=a2c4e.11153940.blogcont484094.16.69f149cbZtauoH
D.3.1.6.3. Shared Memory Variable Declarations
https://yq.aliyun.com/articles/484101?spm=a2c4e.11153940.blogcont484099.16.a2945f3a5o0NFK
D.3.1.2. Streams
https://yq.aliyun.com/articles/484099?spm=a2c4e.11153940.blogcont484097.17.76b66293aEsr2M
D.3.1.6.1. Device and Constant Memory
https://yq.aliyun.com/articles/484101?spm=a2c4e.11153940.blogcont484099.16.4e265f3aV8oPZu
D.3.1.7. API Errors and Launch Failures
https://yq.aliyun.com/articles/484103?spm=a2c4e.11153940.blogcont484101.14.767b2d03UcvCUw
D.3.3.1. Including Device Runtime API in CUDA Code
https://yq.aliyun.com/articles/484106?spm=a2c4e.11153940.blogcont484103.18.3215bfd6Lsbfkg
D.4.3. Implementation Restrictions and Limitations
https://yq.aliyun.com/articles/486308?spm=a2c4e.11153940.blogcont484106.18.1ba91458umVT2z
E. Mathematical Functions
https://yq.aliyun.com/articles/486309?spm=a2c4e.11153940.blogcont486308.18.66bc5a8b3QuRyV
F. C/C++ Language Support
https://yq.aliyun.com/articles/486310?spm=a2c4e.11153940.blogcont486309.14.a34674f4T716E2
F.3.3. Qualifiers
https://yq.aliyun.com/articles/486311?spm=a2c4e.11153940.blogcont486310.15.5a566a59McSA6W
F.3.9.3. Function Parameters F.3.9.4. Static Variables within Function F.3.9.5. Function Pointers F.3.9.6. Function Recursion F.3.9.7. Friend Functions F.3.9.8. Operator Function F.3.10. Classes
https://yq.aliyun.com/articles/486313?spm=a2c4e.11153940.blogcont486312.17.4e2d7106OAStzs
F.3.11. Templates F.3.12. Trigraphs and Digraphs F.3.13. Const-qualified variables F.3.14. Long Double F.3.15. Deprecation Annotation
https://yq.aliyun.com/articles/486317?spm=a2c4e.11153940.blogcont486313.16.286b5ee5OW5dgN
grid/block allocation
https://www.itread01.com/content/1541990407.html
cudaMallocPitch and cudaMemcpy2D
https://stackoverflow.com/questions/35771430/cudamallocpitch-and-cudamemcpy2d
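A short sketch of the pitched allocation pattern from the links above (width and height are arbitrary):

#include <cuda_runtime.h>
#include <vector>

int main()
{
    const size_t width = 100, height = 64;   // elements per row, number of rows
    std::vector<float> h_data(width * height, 1.0f);

    // cudaMallocPitch pads each row so rows start on aligned addresses;
    // 'pitch' is the actual row stride in bytes.
    float* d_data = nullptr;
    size_t pitch = 0;
    cudaMallocPitch(reinterpret_cast<void**>(&d_data), &pitch, width * sizeof(float), height);

    // cudaMemcpy2D copies row by row, honouring both the host and device strides.
    cudaMemcpy2D(d_data, pitch,
                 h_data.data(), width * sizeof(float),   // host rows are tightly packed
                 width * sizeof(float), height,
                 cudaMemcpyHostToDevice);

    cudaFree(d_data);
    return 0;
}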
Avoiding strict aliasing
void foo(const float* __restrict__ a, const float* __restrict__ b, float* __restrict__ c)
https://www.oschina.net/question/234345_52682
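A small sketch of the pattern from the link above: marking the pointers __restrict__ promises the compiler that they do not alias, so loaded values can stay in registers instead of being reloaded after every store (the kernel is illustrative):

// Without __restrict__, the compiler must assume a store to c[] may change a[] or b[],
// forcing it to reload a[i] and b[i] for every use.
__global__ void foo(const float* __restrict__ a,
                    const float* __restrict__ b,
                    float*       __restrict__ c,
                    int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float ai = a[i];    // can safely be cached in a register
        float bi = b[i];
        c[i]  = ai * bi;
        c[i] += ai + bi;    // no reload of a[i] or b[i] needed
    }
}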
Saturday, February 2, 2019
Numpy
NumPy stands for Numerical Python. Compared with the built-in Python sequences (lists, sets, tuples), NumPy is widely used for numerical computation; the following is a brief introduction to its features.