CUDA warp-level primitives: __shfl_xor(), __shfl(), __shfl_up(), __shfl_down()
Starting with compute capability 3.0, the shuffle instruction is supported, which allows a thread to directly read another thread's register value, as long as both threads are in the same warp. This works better than inter-thread communication through shared memory: the latency is lower, and no extra memory resources are consumed to perform the data exchange.
Here we introduce a concept within the warp: the lane. A lane is a single thread in a warp; each lane is uniquely identified within its warp by a lane index, so the index ranges over [0, 31]. In a one-dimensional block, the two indices can be computed with the following formulas:
laneID = threadIdx.x % 32
warpID = threadIdx.x / 32
For example, threads 1 and 33 in the same block both have lane index 1.
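As a quick sanity check, here is a minimal kernel of our own (the kernel name and output format are not from the original article) that prints each thread's warpID and laneID:

#include <cstdio>

// Each thread reports which warp and lane it belongs to (1-D block).
__global__ void print_lane_ids() {
    int laneID = threadIdx.x % 32;
    int warpID = threadIdx.x / 32;
    printf("thread %d -> warp %d, lane %d\n", threadIdx.x, warpID, laneID);
}

int main() {
    print_lane_ids<<<1, 64>>>();  // two warps: threads 0-31 and 32-63
    cudaDeviceSynchronize();      // wait so the device printf output is flushed
    return 0;
}

Threads 1 and 33 both report lane 1, in warps 0 and 1 respectively.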
Variants of the Warp Shuffle Instruction
There are two families of shuffle instructions: one for integer variables and one for floating-point variables. Each family contains four shuffle variants. To exchange integer variables, the following functions are used:
Reference: The CUDA Handbook: A Comprehensive Guide to GPU Programming (《CUDA專家手冊:GPU編程權威指南》)
1. __shfl_xor()
First, let's look at __shfl_xor, since it is the one used first here.
__shfl_xor(int var, int laneMask): copy from a lane based on the bitwise XOR of the caller's own lane ID.
In other words, the caller's lane ID is XORed with laneMask, the result is taken as the source lane number, and the var value of that lane is fetched.
For example:
tid = 0
laneMask = 16
tid XOR laneMask = (0b00000 XOR 0b10000) = 0b10000 = 16
So the value fetched is the var of thread 16.
Now let's look at the complete test code:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

__global__ void test_shfl_xor(int A[], int B[]) {
    int tid = threadIdx.x;
    int best = B[tid];
    best = __shfl_xor(best, 16);  // read var from lane (tid XOR 16)
    A[tid] = best;
}

int main() {
    int *A, *Ad, *B, *Bd;
    int n = 32;
    int size = n * sizeof(int);
    // allocate host memory
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++) {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;
    // allocate device memory
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
    // launch a single block of 32 threads, i.e. exactly one warp
    test_shfl_xor<<<1, 32>>>(Ad, Bd);
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) {
        std::cout << A[i] << std::endl;
    }
    // free host and device memory
    free(A); cudaFree(Ad);
    free(B); cudaFree(Bd);
    return 0;
}

Run results:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11 41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87
--------------------------------
A more advanced use: computing the minimum value across the whole warp:
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <cuda.h>          // for CUDA_VERSION
#include <cuda_runtime.h>

// Recursive butterfly (XOR) reduction over a group of GROUP_SIZE lanes:
// each step exchanges values across a stride of STEP/2 and keeps the min.
template <typename T, unsigned int GROUP_SIZE, unsigned int STEP>
struct subgroup_min_impl {
    static __device__ T call(T x, uint32_t mask) {
#if CUDA_VERSION >= 9000
        x = min(x, __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE));
#else
        x = min(x, __shfl_xor(x, STEP / 2, GROUP_SIZE));
#endif
        return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(x, mask);
    }
};

// Recursion terminates once the step reaches 1.
template <typename T, unsigned int GROUP_SIZE>
struct subgroup_min_impl<T, GROUP_SIZE, 1u> {
    static __device__ T call(T x, uint32_t) { return x; }
};

template <unsigned int GROUP_SIZE, typename T>
__device__ inline T subgroup_min(T x, uint32_t mask) {
    return subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE>::call(x, mask);
}

__global__ void test_shfl_xor_min(int A[], int B[]) {
    int tid = threadIdx.x;
    int best = B[tid];
    best = subgroup_min<32>(best, 0xffffffffu);
    A[tid] = best;
}

int main() {
    int *A, *Ad, *B, *Bd;
    int n = 32;
    int size = n * sizeof(int);
    // allocate host memory
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++) {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;
    // allocate device memory
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
    // launch a single warp
    test_shfl_xor_min<<<1, 32>>>(Ad, Bd);
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) {
        std::cout << A[i] << std::endl;
    }
    // free host and device memory
    free(A); cudaFree(Ad);
    free(B); cudaFree(Bd);
    return 0;
}

Run results:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
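On CUDA 9.0 and later, where the *_sync variants are required, the same butterfly reduction is usually written as a plain loop instead of template recursion. A minimal sketch of our own (the function name warp_min is an assumption, not from the original code):

// Warp-wide minimum via XOR butterfly (CUDA 9.0+).
// After the loop, every lane holds the minimum of all 32 lanes.
__device__ int warp_min(int x) {
    for (int offset = 16; offset > 0; offset >>= 1)
        x = min(x, __shfl_xor_sync(0xffffffffu, x, offset));
    return x;
}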
That concludes the introduction to __shfl_xor.
--------------------------------------
2. __shfl()
__shfl(int var, int srcLane, int width = 32): direct copy from the indexed lane, i.e. every caller receives the var of lane srcLane.
This one is fairly simple, so let's go straight to the code:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

__global__ void test_shfl(int A[], int B[]) {
    int tid = threadIdx.x;
    int best = B[tid];
    best = __shfl(best, 3);  // every lane reads lane 3's value
    A[tid] = best;
}

int main() {
    int *A, *Ad, *B, *Bd;
    int n = 32;
    int size = n * sizeof(int);
    // allocate host memory
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++) {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;
    // allocate device memory
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
    // launch a single block of 32 threads, i.e. exactly one warp
    test_shfl<<<1, 32>>>(Ad, Bd);
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) {
        std::cout << A[i] << std::endl;
    }
    // free host and device memory
    free(A); cudaFree(Ad);
    free(B); cudaFree(Bd);
    return 0;
}

With this logic, every thread retrieves thread 3's value:
Run results:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
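A note on the width parameter of __shfl: when width is less than 32, the warp is split into independent subsections of width lanes each, and srcLane is interpreted relative to the start of each subsection. A minimal sketch of our own (the function name is an assumption), using the CUDA 9.0+ syntax:

// Broadcast lane 0 of each 8-lane subsection to its other 7 lanes:
// lanes 0-7 receive lane 0's value, lanes 8-15 receive lane 8's, and so on.
__device__ int broadcast_per_group8(int x) {
    return __shfl_sync(0xffffffffu, x, 0, 8);
}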
--------------------------------------------------------------------------------------------------------------------------------
3. __shfl_up()
__shfl_up(int var, unsigned int delta, int width = 32): copy from a lane with a lower ID relative to the caller.
The var of thread (tid - delta) is copied to thread tid's var; if tid - delta < 0, var keeps its original value.
See the code:
__global__ void test_shfl_up(int A[], int B[]) {
    int tid = threadIdx.x;
    int best = B[tid];
    best = __shfl_up(best, 3);  // read from lane (tid - 3); lanes 0-2 keep their own value
    A[tid] = best;
}

Run results:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
41 85 72 41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23
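A classic application of __shfl_up is a warp-level inclusive prefix sum (scan). A minimal sketch of our own, assuming CUDA 9.0+ and a full 32-thread warp (the function name is an assumption):

// Inclusive scan within a warp: lane i ends up with x[0] + ... + x[i].
__device__ int warp_inclusive_scan(int x) {
    int lane = threadIdx.x % 32;
    for (int d = 1; d < 32; d <<= 1) {
        int y = __shfl_up_sync(0xffffffffu, x, d);
        if (lane >= d) x += y;  // lanes below d have nothing to add yet
    }
    return x;
}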
--------------------------------------------------------------------------------------
4. __shfl_down()
__shfl_down(int var, unsigned int delta, int width = 32): copy from a lane with a higher ID relative to the caller.
The var of thread (tid + delta) is copied to thread tid's var; if tid + delta >= 32, var keeps its original value.
Test code:
__global__ void test_shfl_down(int A[], int B[]) {
    int tid = threadIdx.x;
    int best = B[tid];
    best = __shfl_down(best, 3);  // read from lane (tid + 3); lanes 29-31 keep their own value
    A[tid] = best;
}

Run results:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11 99 94 11
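The most common application of __shfl_down is a warp-level sum reduction. A minimal sketch of our own, assuming CUDA 9.0+ and a full 32-thread warp (the function name is an assumption):

// Warp sum reduction: after the loop, lane 0 holds the sum of all 32 lanes.
__device__ int warp_sum(int x) {
    for (int offset = 16; offset > 0; offset >>= 1)
        x += __shfl_down_sync(0xffffffffu, x, offset);
    return x;  // only lane 0's value is the complete sum
}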
Summary
These four warp shuffle primitives (__shfl, __shfl_up, __shfl_down, __shfl_xor) let threads in the same warp exchange register values directly, with lower latency than shared memory and no extra memory cost. Note that since CUDA 9.0 the *_sync variants such as __shfl_xor_sync should be used instead.