Chiustin: Thread與Warp

__ballot(int predicate)：指的是當前線程所在的Wrap中第N個線程對應的predicate值不爲零，則將整數零的第N位進行置位

__popc(ballot(int predicate))：返回warp中bool不爲零的線程數目

asm("mov.u32 %0, %laneid;" : "=r"(ret))：獲得ret爲當前線程在所在Warp中的ID

%lanemask_lt：32-bit mask with bits set in positions less than the thread's lane number in the warp
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html.

__popc(ret & __ballot(int predicate))：返回的值爲當前線程在所在的Warp中是第幾個滿足條件的

__popc ( unsigned int x ): Count the number of bits that are set to 1 in x.
https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html

Example:

nvcc -arch=sm_61 -o popc popc.cu

#include <iostream>
#include <cstdio>
using namespace std;

__device__ __forceinline__ int laneId()
{
 unsigned int ret;
 asm("mov.u32 %0, %laneid;" : "=r"(ret));
 return ret;
}

__device__ __forceinline__ int laneMaskLt()
{
 unsigned int ret;
 asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
 return ret;
}

__global__ void testKernel(int *a, int *b, int *c, int *d, int *e, int n)
{
 int x = threadIdx.x + blockIdx.x * blockDim.x;
 if (x >= n)
 {
  return;
 }
 a[x] = __ballot(x > 10);
 b[x] = laneMaskLt();
 d[x] = __popc(b[x] & a[x]);
 c[x] = __popc(a[x]);
 e[x] = laneId();
}

int main()
{
 int *a, *b, *c, *d, *e, *dev_a, *dev_b, *dev_c, *dev_d, *dev_e;
 int n = 64;
 int size = n * sizeof(int);
 a = (int *)malloc(size);
 b = (int *)malloc(size);
 c = (int *)malloc(size);
 d = (int *)malloc(size);
 e = (int *)malloc(size);
 cudaMalloc(&dev_a, size);
 cudaMalloc(&dev_b, size);
 cudaMalloc(&dev_c, size);
 cudaMalloc(&dev_d, size);
 cudaMalloc(&dev_e, size);

 testKernel<<<1, n>>>(dev_a, dev_b, dev_c, dev_d, dev_e, n);

 cudaMemcpy(a, dev_a, size, cudaMemcpyDeviceToHost);
 cudaMemcpy(b, dev_b, size, cudaMemcpyDeviceToHost);
 cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
 cudaMemcpy(d, dev_d, size, cudaMemcpyDeviceToHost);
 cudaMemcpy(e, dev_e, size, cudaMemcpyDeviceToHost);

 for (int i = 0; i < n; ++i)
 {
  printf("%d    %d    %d    %d    %d\n", a[i], b[i], c[i], d[i], e[i]);
 }
 cudaFree(dev_a);
 cudaFree(dev_b);
 cudaFree(dev_c);
 cudaFree(dev_d);
 cudaFree(dev_e);
 free(a);
 free(b);
 free(c);
 free(d);
 free(e);
}

n=64代表這64個thread會被分成兩個Warp ，第一個Warp會印出

由於__ballot(x > 10)，所以__popc(a[x])會累計21個thread邏輯成立，__popc(b[x] & a[x])是當前線程在所在的Warp中是第幾個滿足條件

由於__ballot(x > 10)，所以__popc(a[x])會累計32個thread邏輯成立，__popc(b[x] & a[x])是當前線程在所在的Warp中是第幾個滿足條件

參考

https://blog.csdn.net/u010646276/article/details/46830665

Chiustin

網頁

2019年1月2日星期三

Thread與Warp

沒有留言:

張貼留言

總網頁瀏覽量

關於我自己

網頁

2019年1月2日 星期三

Thread與Warp

沒有留言:

張貼留言

2019年1月2日星期三