Hi all,
GCC generates this code for the shuffle() below:
movaps xmm0,XMMWORD PTR [rip+0x125]
pshufb xmm4,xmm0
Ideally this should be:
pshufb xmm4,XMMWORD PTR [rip+0x125]
What is the extended ASM syntax to generate this single instruction?
Many thanks, Adam
PS: The commented-out intrinsic generates the optimal code for this example, but that approach doesn't work in general: GCC is likely to generate unnecessary register copies in the presence of global register variables.
#include <stdint.h>
/* 16-byte vector of signed 8-bit lanes (GCC vector extension). */
typedef int8_t xmm_t __attribute__ ((vector_size (16)));
/*
 * PSHUFB control mask used by shuffle().
 *
 * Lane 0 is 0x80: a control byte with its top bit set makes PSHUFB zero the
 * corresponding destination byte.  It is spelled -128 because the lanes are
 * signed; 128 does not fit in int8_t and would rely on an implementation-
 * defined narrowing conversion.  Lanes 1..7 select source bytes 0..6 and
 * lanes 8..15 select source bytes 8..15.
 */
const xmm_t xmm_shuf = {-128, 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15};
/* Global register variable (GCC extension): binds `xmm` to SSE register xmm4. */
register xmm_t xmm __asm__("xmm4");
/*
 * Assembler directives meant to be pasted around an inline-asm template:
 * NTL switches the assembler to Intel syntax without register prefixes,
 * ATT switches it back to AT&T syntax afterwards.
 */
#define NTL ".intel_syntax noprefix\n"
#define ATT ".att_syntax\n"
void shuffle() {
//xmm=__builtin_ia32_pshufb128(xmm, xmm_shuf);
__asm__(NTL"pshufb %0, %1\n"ATT : "=x" (xmm) : "x" (xmm_shuf));
}
/* Empty entry point; present only so the test case links into an executable. */
int main(void) {
    return 0;
}
$ gcc -Os -std=gnu99 -msse4.1 -flax-vector-conversions pshufb_128bit_constant.c && objdump -d -m i386:x86-64:intel a.out |less
0000000000400494 <shuffle>:
400494: 0f 28 05 25 01 00 00 movaps xmm0,XMMWORD PTR [rip+0x125] # 4005c0 <xmm_shuf+0x10>
40049b: 66 0f 38 00 e0 pshufb xmm4,xmm0
4004a0: c3 ret