__popc(ballot(int predicate)):返回warp中bool不爲零的線程數目
asm("mov.u32 %0, %laneid;" : "=r"(ret)):獲得ret爲當前線程在所在Warp中的ID
%lanemask_lt:32-bit mask with bits set in positions less than the thread's lane number in the warp
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html.
__popc(ret & __ballot(int predicate)):返回的值爲當前線程在所在的Warp中是第幾個滿足條件的
__popc ( unsigned int x ): Count the number of bits that are set to 1 in x.
https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html
Example:
nvcc -arch=sm_61 -o popc popc.cu
由於__ballot(x > 10),所以__popc(a[x])會累計32個thread邏輯成立,__popc(b[x] & a[x])是當前線程在所在的Warp中是第幾個滿足條件
參考
#include <iostream>
#include <cstdio>
using namespace std;
__device__ __forceinline__ int laneId()
{
unsigned int ret;
asm("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__device__ __forceinline__ int laneMaskLt()
{
unsigned int ret;
asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
return ret;
}
__global__ void testKernel(int *a, int *b, int *c, int *d, int *e, int n)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
if (x >= n)
{
return;
}
a[x] = __ballot(x > 10);
b[x] = laneMaskLt();
d[x] = __popc(b[x] & a[x]);
c[x] = __popc(a[x]);
e[x] = laneId();
}
int main()
{
int *a, *b, *c, *d, *e, *dev_a, *dev_b, *dev_c, *dev_d, *dev_e;
int n = 64;
int size = n * sizeof(int);
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
d = (int *)malloc(size);
e = (int *)malloc(size);
cudaMalloc(&dev_a, size);
cudaMalloc(&dev_b, size);
cudaMalloc(&dev_c, size);
cudaMalloc(&dev_d, size);
cudaMalloc(&dev_e, size);
testKernel<<<1, n>>>(dev_a, dev_b, dev_c, dev_d, dev_e, n);
cudaMemcpy(a, dev_a, size, cudaMemcpyDeviceToHost);
cudaMemcpy(b, dev_b, size, cudaMemcpyDeviceToHost);
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
cudaMemcpy(d, dev_d, size, cudaMemcpyDeviceToHost);
cudaMemcpy(e, dev_e, size, cudaMemcpyDeviceToHost);
for (int i = 0; i < n; ++i)
{
printf("%d %d %d %d %d\n", a[i], b[i], c[i], d[i], e[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
cudaFree(dev_d);
cudaFree(dev_e);
free(a);
free(b);
free(c);
free(d);
free(e);
}
n=64代表這64個thread會被分成兩個Warp ,第一個Warp會印出由於__ballot(x > 10),所以__popc(a[x])會累計32個thread邏輯成立,__popc(b[x] & a[x])是當前線程在所在的Warp中是第幾個滿足條件
沒有留言:
張貼留言