Say I have 8 threads. For the thread with lane ID `laneID`, I want to fetch the array element `tsum[laneID % 8]` held by lane 0 (or lane 8).
One possible way is to use a sequence of DPP move instructions with row shifts (`row_shr`), one per array element (eight in the snippet below), like this:
/*
 * Inline assembly that copies tsum[0..7] into s[0..7], one move per element:
 *   - "s_nop 1" before and after the moves (presumably padding for a
 *     DPP-related data hazard -- TODO confirm against the target ISA manual);
 *   - a plain v_mov_b32 for element 0 (dst0 <- src0);
 *   - v_mov_b32_dpp with row_shr:k for elements k = 1..7 (dstk <- srck).
 *
 * Operands:
 *   outputs  dst0..dst7 -> s[0..7],    constraint "=&v" (early-clobber output)
 *   inputs   src0..src7 -> tsum[0..7], constraint "v"
 *
 * NOTE(review): row_shr:k presumably makes each lane read the source register
 * of the lane k positions below it within its row, so that lane k ends up with
 * lane 0's tsum[k] in s[k] -- verify against the ISA documentation.
 * NOTE(review): the " \ " sequences look like macro line-continuation markers
 * that were collapsed onto a single line; they must each end a physical line
 * (inside a #define) for this statement to compile.
 */
__asm ( \ "s_nop 1\n" \ "v_mov_b32 %[dst0], %[src0]\n" \ "v_mov_b32_dpp %[dst1], %[src1] row_shr:1\n" \ "v_mov_b32_dpp %[dst2], %[src2] row_shr:2\n" \ "v_mov_b32_dpp %[dst3], %[src3] row_shr:3\n" \ "v_mov_b32_dpp %[dst4], %[src4] row_shr:4\n" \ "v_mov_b32_dpp %[dst5], %[src5] row_shr:5\n" \ "v_mov_b32_dpp %[dst6], %[src6] row_shr:6\n" \ "v_mov_b32_dpp %[dst7], %[src7] row_shr:7\n" \ "s_nop 1\n" \ : [dst0] "=&v" (s[0]), \ [dst1] "=&v" (s[1]), \ [dst2] "=&v" (s[2]), \ [dst3] "=&v" (s[3]), \ [dst4] "=&v" (s[4]), \ [dst5] "=&v" (s[5]), \ [dst6] "=&v" (s[6]), \ [dst7] "=&v" (s[7]) \ : [src0] "v" (tsum[0]), \ [src1] "v" (tsum[1]), \ [src2] "v" (tsum[2]), \ [src3] "v" (tsum[3]), \ [src4] "v" (tsum[4]), \ [src5] "v" (tsum[5]), \ [src6] "v" (tsum[6]), \ [src7] "v" (tsum[7])); \
Can I do this job in a single instruction that selects a different source register from lane 0, based on my thread ID?