Coding Memorandum

プログラミングに関する備忘録

スポンサーサイト

上記の広告は1ヶ月以上更新のないブログに表示されています。
新しい記事を書く事で広告が消せます。

SSSE3 / SSE4 命令一覧

SSSE3,SSE4について,技術のキャッチアップのために命令を書き出してみました。

SSSE3

▼命令

Packed Absolute Value PABSB
PABSW
PABSD
PABSB with 128 bit operands:
Unsigned DEST[7:0] ← ABS(SRC[7:.0])
Repeat operation for 2nd through 15th bytes
Unsigned DEST[127:120] ← ABS(SRC[127:120])
Packed Aline Right PALIGNR PALIGNR with 128-bit operands:
temp1[255:0] = CONCATENATE(DEST,SRC)>>(imm8*8)
DEST[127:0] = temp1[127:0]
Packed Horizontal Add PHADDW
PHADDD
PHADDD with 128-bit operands:
xmm1[31-0] = xmm1[63-32] + xmm1[31-0];
xmm1[63-32] = xmm1[127-96] + xmm1[95-64];
xmm1[95-64] = xmm2[63-32] + xmm2[31-0];
xmm1[127-96] = xmm2[127-96] + xmm2[95-64];
Packed Horizontal Add and Saturate PHADDSW PHADDSW with 128-bit operands :
xmm1[15-0]= Sat(xmm1[31-16] + xmm1[15-0]);
xmm1[31-16] = Sat(xmm1[63-48] + xmm1[47-32]);
xmm1[47-32] = Sat(xmm1[95-80] + xmm1[79-64]);
xmm1[63-48] = Sat(xmm1[127-112] + xmm1[111-96]);
xmm1[79-64] = Sat(xmm2[31-16] + xmm2[15-0]);
xmm1[95-80] = Sat(xmm2[63-48] + xmm2[47-32]);
xmm1[111-96] = Sat(xmm2[95-80] + xmm2[79-64]);
xmm1[127-112] = Sat(xmm2[127-112] + xmm2[111-96]);
※Sat : SaturateToSignedWord
Packed Horizontal Subtract PHSUBW
PHSUBD
PHSUBD with 128-bit operands:
xmm1[31-0] = xmm1[31-0] - xmm1[63-32];
xmm1[63-32] = xmm1[95-64] - xmm1[127-96];
xmm1[95-64] = xmm2[31-0] - xmm2[63-32];
xmm1[127-96] = xmm2[95-64] - xmm2[127-96];
Packed Horizontal Subtract and Saturate PHSUBSW PHSUBSW with 128-bit operands:
xmm1[15-0] = Sat(xmm1[15-0] - xmm1[31-16]);
xmm1[31-16] = Sat(xmm1[47-32] - xmm1[63-48]);
xmm1[47-32] = Sat(xmm1[79-64] - xmm1[95-80]);
xmm1[63-48] = Sat(xmm1[111-96] - xmm1[127-112]);
xmm1[79-64] = Sat(xmm2[15-0] - xmm2[31-16]);
xmm1[95-80] =Sat(xmm2[47-32] - xmm2[63-48]);
xmm1[111-96] =Sat(xmm2[79-64] - xmm2[95-80]);
xmm1[127-112]= Sat(xmm2[111-96] - xmm2[127-112]);
Multiply and Add Packed Signed and Unsigned Bytes PMADDUBSW PMADDUBSW with 128 bit operands:
DEST[15-0] =
     Sat(SRC[15-8]* DEST[15-8]+SRC[7-0]*DEST[7-0]);
// Repeat operation for 2nd through 7th word
SRC1/DEST[127-112] =
     Sat(SRC[127-120]*DEST[127-120]+
           SRC[119-112]* DEST[119-112]);
Packed Multiply High With Round and Scale PMULHRSW PMULHRSW with 128-bit operand:
temp0[31:0] = INT((DEST[15:0] * SRC[15:0])>>14) + 1;
temp1[31:0] = INT((DEST[31:16] * SRC[31:16])>>14) + 1;
temp2[31:0] = INT((DEST[47:32] * SRC[47:32])>>14) + 1;
temp3[31:0] = INT((DEST[63:48] * SRC[63:48])>>14) + 1;
temp4[31:0] = INT((DEST[79:64] * SRC[79:64])>>14) + 1;
temp5[31:0] = INT((DEST[95:80] * SRC[95:80])>>14) + 1;
temp6[31:0] = INT((DEST[111:96] * SRC[111:96])>>14)+1;
temp7[31:0] = INT((DEST[127:112] * SRC[127:112)>>14)+1;
DEST[15:0] = temp0[16:1];
DEST[31:16] = temp1[16:1];
DEST[47:32] = temp2[16:1];
DEST[63:48] = temp3[16:1];
DEST[79:64] = temp4[16:1];
DEST[95:80] = temp5[16:1];
DEST[111:96] = temp6[16:1];
DEST[127:112] = temp7[16:1];
Packed Shuffle Bytes PSHUFB PSHUFB with 128 bit operands:
for i = 0 to 15 {
  if (SRC[(i * 8)+7] == 1 ) then
    DEST[(i*8)+7..(i*8)+0] ← 0;
  else
    index[3..0] ← SRC[(i*8)+3 .. (i*8)+0];
    DEST[(i*8)+7..(i*8)+0]←DEST[(index*8+7)..(index*8+0)];
  endif
}
Packed Sign PSIGNB
PSIGNW
PSIGND
PSIGNB with 128 bit operands:
IF (SRC[7:0] < 0 )
    DEST[7:0] ← Neg(DEST[7:0])
ELSEIF (SRC[7:0] == 0 )
    DEST[7:0] ← 0
ELSEIF (SRC[7:0] > 0 )
    DEST[7:0] ← DEST[7:0]
Repeat operation for 2nd through 15th bytes
IF (SRC[127:120] < 0 )
    DEST[127:120] ← Neg(DEST[127:120])
ELSEIF (SRC[127:120] == 0 )
    DEST[127:120] ← 0
ELSEIF (SRC[127:120] > 0 )
    DEST[127:120] ← DEST[127:120]

▼C/C++ Intrinsic

Packed Absolute Value PABSB __m64 _mm_abs_pi8 (__m64 a)
PABSB __m128i _mm_abs_epi8 (__m128i a)
PABSW __m64 _mm_abs_pi16 (__m64 a)
PABSW __m128i _mm_abs_epi16 (__m128i a)
PABSD __m64 _mm_abs_pi32 (__m64 a)
PABSD __m128i _mm_abs_epi32 (__m128i a)
Packed Aline Right PALIGNR __m64 _mm_alignr_pi8 (__m64 a, __m64 b, int n)
PALIGNR __m128i _mm_alignr_epi8 (__m128i a, __m128i b, int n)
Packed Horizontal Add PHADDW __m64 _mm_hadd_pi16 (__m64 a, __m64 b)
PHADDW __m128i _mm_hadd_epi16 (__m128i a, __m128i b)
PHADDD __m64 _mm_hadd_pi32 (__m64 a, __m64 b)
PHADDD __m128i _mm_hadd_epi32 (__m128i a, __m128i b)
Packed Horizontal Add and Saturate PHADDSW __m64 _mm_hadds_pi16 (__m64 a, __m64 b)
PHADDSW __m128i _mm_hadds_epi16 (__m128i a, __m128i b)
Packed Horizontal Subtract PHSUBW __m64 _mm_hsub_pi16 (__m64 a, __m64 b)
PHSUBW __m128i _mm_hsub_epi16 (__m128i a, __m128i b)
PHSUBD __m64 _mm_hsub_pi32 (__m64 a, __m64 b)
PHSUBD __m128i _mm_hsub_epi32 (__m128i a, __m128i b)
Packed Horizontal Subtract and Saturate PHSUBSW __m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
PHSUBSW __m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
Multiply and Add Packed Signed and Unsigned Bytes PMADDUBSW __m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
PMADDUBSW __m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
Packed Multiply High With Round and Scale PMULHRSW __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
PMULHRSW __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
Packed Shuffle Bytes PSHUFB __m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
PSHUFB __m128i _mm_shuffle_epi8 (__m128i a, __m128i b)
Packed Sign PSIGNB __m64 _mm_sign_pi8 (__m64 a, __m64 b)
PSIGNB __m128i _mm_sign_epi8 (__m128i a, __m128i b)
PSIGNW __m64 _mm_sign_pi16 (__m64 a, __m64 b)
PSIGNW __m128i _mm_sign_epi16 (__m128i a, __m128i b)
PSIGND __m64 _mm_sign_pi32 (__m64 a, __m64 b)
PSIGND __m128i _mm_sign_epi32 (__m128i a, __m128i b)

SSE4.1

▼命令

Blend BLENDPD
BLENDPS
PBLENDW
BLENDPS:
IF (imm8[0] == 1)
  THEN DEST[31:0] ← D3 SRC[31:0];
  ELSE DEST[31:0] ← DEST[31:0]; FI;
IF (imm8[1] == 1)
  THEN DEST[63:32] ← SRC[63:32];
  ELSE DEST[63:32] ← DEST[63:32]; FI;
IF (imm8[2] == 1)
  THEN DEST[95:64] ← SRC[95:64];
  ELSE DEST[95:64] ← DEST[95:64]; FI;
IF (imm8[3] == 1)
  THEN DEST[127:96] ← SRC[127:96];
  ELSE DEST[127:96] ← DEST[127:96]; FI;
Variable Blend BLENDVPD
BLENDVPS
PBLENDVB
BLENDVPS:
MASK ← XMM0;
IF (MASK[31] == 1)
  THEN DEST[31:0] ← SRC[31:0];
  ELSE DEST[31:0] ← DEST[31:0]); FI;
IF (MASK[63] == 1)
  THEN DEST[63:32] ← SRC[63:32]);
  ELSE DEST[63:32] ← DEST[63:32]); FI;
IF (MASK[95] == 1)
  THEN DEST[95:64] ← SRC[95:64]);
  ELSE DEST[95:64] ← DEST[95:64]); FI;
IF (MASK[127] == 1)
  THEN DEST[127:96] ← SRC[127:96]);
  ELSE DEST[127:96] ← DEST[127:96]); FI;
Dot Products DPPD
DPPS
DPPD:
IF (imm8[4] = 1)
  THEN Temp1[63:0] ← DEST[63:0] * SRC[63:0];
  ELSE Temp1[63:0] ← +0.0; FI;
IF (imm8[5] = 1)
  THEN Temp1[127:64] ← DEST[127:64] * SRC[127:64];
  ELSE Temp1[127:64] ← +0.0; FI;
Temp2[63:0] ← Temp1[63:0] + Temp1[127:64];
IF (imm8[0] = 1)
  THEN DEST[63:0] ← Temp2[63:0];
  ELSE DEST[63:0] ← +0.0; FI;
IF (imm8[1] = 1)
  THEN DEST[127:64] ← Temp2[63:0];
  ELSE DEST[127:64] ← +0.0; FI;
Extract Packed Single Precision FP EXTRACTPS IF (64-Bit Mode and the destination is a GPR )
  THEN
    DEST[31:0]←(SRC>>(32*imm8[1:0])) AND 0FFFFFFFFh;
    DEST[63:32] ← ZERO_FILL;
  ELSE
    DEST[31:0]←(SRC>>(32*imm8[1:0])) AND 0FFFFFFFFh;
FI;
Insert Packed Single Precision FP INSERTPS IF (SRC == REG) THEN COUNT_S ← imm8[7:6];
  ELSE COUNT_S ← 0; FI;
COUNT_D ← imm8[5:4];
ZMASK ← imm8[3:0]; CASE (COUNT_S) OF
  0: TMP ← SRC[31:0];
  1: TMP ← SRC[63:32];
  2: TMP ← SRC[95:64];
  3: TMP ← SRC[127:96]; CASE (COUNT_D) OF
  0: TMP2[31:0] ← TMP;
     TMP2[127:32] ← DEST[127:32];
  1: TMP2[63:32] ← TMP;
     TMP2[31:0] ← DEST[31:0];
     TMP2[127:64] ← DEST[127:64];
  2: TMP2[95:64] ← TMP;
     TMP2[63:0] ← DEST[63:0];
     TMP2[127:96] ← DEST[127:96];
  3: TMP2[127:96] ← TMP;
     TMP2[95:0] ← DEST[95:0];
IF (ZMASK[0] == 1) THEN DEST[31:0] ← 00000000H;
  ELSE DEST[31:0] ← TMP2[31:0];
  IF (ZMASK[1] == 1) THEN DEST[63:32] ← 00000000H;
    ELSE DEST[63:32] ← TMP2[63:32];
    IF (ZMASK[2] == 1) THEN DEST[95:64] ← 00000000H;
      ELSE DEST[95:64] ← TMP2[95:64];
      IF (ZMASK[3] == 1) THEN DEST[127:96] ← 00000000H;
        ELSE DEST[127:96] ← TMP2[127:96];
      FI;
    FI;
  FI;
FI;
Load Double
Quadword Non-temporal Aligned
MOVNTDQA DST ← SRC;
Multiple Packed Sums of
Absolute Difference
MPSADBW SRC_OFFSET ← imm8[1:0]*32
DEST_OFFSET ← imm8[2]*32
DEST_BYTE0 ← DEST[DEST_OFFSET+7:DEST_OFFSET]
DEST_BYTE1 ← DEST[DEST_OFFSET+15:DEST_OFFSET+8]
DEST_BYTE2 ← DEST[DEST_OFFSET+23:DEST_OFFSET+16]
DEST_BYTE3 ← DEST[DEST_OFFSET+31:DEST_OFFSET+24]
DEST_BYTE4 ← DEST[DEST_OFFSET+39:DEST_OFFSET+32]
DEST_BYTE5 ← DEST[DEST_OFFSET+47:DEST_OFFSET+40]
DEST_BYTE6 ← DEST[DEST_OFFSET+55:DEST_OFFSET+48]
DEST_BYTE7 ← DEST[DEST_OFFSET+63:DEST_OFFSET+56]
DEST_BYTE8 ← DEST[DEST_OFFSET+71:DEST_OFFSET+64]
DEST_BYTE9 ← DEST[DEST_OFFSET+79:DEST_OFFSET+72]
DEST_BYTE10 ← DEST[DEST_OFFSET+87:DEST_OFFSET+80] SRC_BYTE0 ← SRC[SRC_OFFSET+7:SRC_OFFSET]
SRC_BYTE1 ← SRC[SRC_OFFSET+15:SRC_OFFSET+8]
SRC_BYTE2 ← SRC[SRC_OFFSET+23:SRC_OFFSET+16]
SRC_BYTE3 ← SRC[SRC_OFFSET+31:SRC_OFFSET+24] TEMP0 ← ABS( DEST_BYTE0 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE1 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE2 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE3 - SRC_BYTE3)
DEST[15:0] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE1 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE2 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE3 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE4 - SRC_BYTE3)
DEST[31:16] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE2 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE3 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE4 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE5 - SRC_BYTE3)
DEST[47:32] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE3 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE4 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE5 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE6 - SRC_BYTE3)
DEST[63:48] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE4 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE5 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE6 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE7 - SRC_BYTE3)
DEST[79:64] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE5 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE6 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE7 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE8 - SRC_BYTE3)
DEST[95:80] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE6 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE7 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE8 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE9 - SRC_BYTE3)
DEST[111:96] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
TEMP0 ← ABS( DEST_BYTE7 - SRC_BYTE0)
TEMP1 ← ABS( DEST_BYTE8 - SRC_BYTE1)
TEMP2 ← ABS( DEST_BYTE9 - SRC_BYTE2)
TEMP3 ← ABS( DEST_BYTE10 - SRC_BYTE3)
DEST[127:112] ← TEMP0 + TEMP1 + TEMP2 + TEMP3
Pack with Unsigned
Saturation
PACKUSDW TMP[15:0] ← (DEST[31:0] < 0) ? 0 : DEST[15:0];
DEST[15:0] ← (DEST[31:0] > FFFFH) ? FFFFH : TMP[15:0] ;
TMP[31:16] ← (DEST[63:32] < 0) ? 0 : DEST[47:32];
DEST[31:16]←(DEST[63:32]>FFFFH)?FFFFH : TMP[31:16] ;
TMP[47:32] ← (DEST[95:64] < 0) ? 0 : DEST[79:64];
DEST[47:32]←(DEST[95:64]>FFFFH)?FFFFH : TMP[47:32] ;
TMP[63:48] ← (DEST[127:96] < 0) ? 0 : DEST[111:96];
DEST[63:48]←(DEST[127:96]>FFFFH)?FFFFH : TMP[63:48] ;
TMP[63:48] ← (DEST[127:96] < 0) ? 0 : DEST[111:96];
DEST[63:48]←(DEST[127:96]>FFFFH)?FFFFH : TMP[63:48] ;
TMP[79:64] ← (SRC[31:0] < 0) ? 0 : SRC[15:0];
DEST[63:48] ← (SRC[31:0] > FFFFH) ? FFFFH : TMP[79:64] ;
TMP[95:80] ← (SRC[63:32] < 0) ? 0 : SRC[47:32];
DEST[95:80]←(SRC[63:32]>FFFFH)?FFFFH : TMP[95:80] ;
TMP[111:96] ← (SRC[95:64] < 0) ? 0 : SRC[79:64];
DEST[111:96]←(SRC[95:64]>FFFFH)?FFFFH : TMP[111:96] ;
TMP[127:112] ← (SRC[127:96] < 0) ? 0 : SRC[111:96];
DEST[128:112]←(SRC[127:96]>FFFFH)?FFFFH:TMP[127:112];
Compare Packed Qword
Data of Equal
PCMPEQQ IF (DEST[63:0] = SRC[63:0])
  THEN DEST[63:0] ← FFFFFFFFFFFFFFFFH;
  ELSE DEST[63:0] ← 0; FI;
IF (DEST[127:64] = SRC[127:64])
  THEN DEST[127:64] ← FFFFFFFFFFFFFFFFH;
  ELSE DEST[127:64] ← 0; FI;
Extract PEXTRB
PEXTRD
PEXTRQ
PEXTRW
PEXTRD:
SEL ← COUNT[1:0];
TEMP ← (Src >> SEL*32) AND FFFF_FFFFH;
DEST ← TEMP;
PEXTRW instruction with 128-bit source operand:
SEL ← COUNT[2:0];
TEMP ← (SRC >> (SEL ? 16)) AND FFFFH;
r32[15:0] ← TEMP[15:0];
r32[31:16] ← ZERO_FILL;
Packed Horizontal
Word Minimum
PHMINPOSUW INDEX ← 0;
MIN ← SRC[15:0]
IF (SRC[31:16] < MIN)
  THEN INDEX ← 1; MIN ← SRC[31:16]; FI;
IF (SRC[47:32] < MIN)
  THEN INDEX ← 2; MIN ← SRC[47:32]; FI;
* Repeat operation for words 3 through 6
IF (SRC[127:112] < MIN)
  THEN INDEX ← 7; MIN ← SRC[127:112]; FI;
DEST[15:0] ← MIN;
DEST[18:16] ← INDEX;
DEST[127:19] ← 0000000000000000000000000000H;
Insert PINSRB
PINSRD
PINSRQ
PINSRB:
SEL ← COUNT[3:0];
MASK ← (0FFH << (SEL * 8));
TEMP ← (((SRC[7:0] << (SEL *8)) AND MASK);
DEST ← ((DEST AND NOT MASK) OR TEMP);
Maximum of Packed
Signed Byte/Dword
PMAXSB
PMAXSD
PMAXSB:
IF (DEST[7:0] > SRC[7:0])
  THEN DEST[7:0] ← DEST[7:0];
  ELSE DEST[7:0] ← SRC[7:0]; FI;
・・・
IF (DEST[127:120] > SRC[127:120])
  THEN DEST[127:120] ← DEST[127:120];
  ELSE DEST[127:120] ← SRC[127:120]; FI;
Maximum of Packed
Unsigned Dowrd/Word
PMAXUD
PMAXUW
PMAXUD:
IF (DEST[31:0] > SRC[31:0])
  THEN DEST[31:0] ← DEST[31:0];
  ELSE DEST[31:0] ← SRC[31:0]; FI;
・・・
IF (DEST[127:96] > SRC[127:96])
  THEN DEST[127:96] ← DEST[127:96];
  ELSE DEST[127:96] ← SRC[127:96]; FI;
Minimum of Packed Signed
Byte
PMINSB

IF (DEST[7:0] < SRC[7:0])
  THEN DEST[7:0] ← DEST[7:0];
  ELSE DEST[7:0] ← SRC[7:0]; FI;
・・・
IF (DEST[127:120] < SRC[127:120])
  THEN DEST[127:120] ← DEST[127:120];
  ELSE DEST[127:120] ← SRC[127:120]; FI;
Minimum of Packed Dword PMINSD
PMINUD
PMINSBと同機能
Minimum of Packed Word PMINUW PMINSBと同機能
Packed Move with Sign Extend PMOVSXBW
PMOVSXBD
PMOVSXBQ
PMOVSXWD
PMOVSXWQ
PMOVSXDQ
PMOVSXWD:
DEST[31:0] ← SignExtend(SRC[15:0]);
DEST[63:32] ← SignExtend(SRC[31:16]);
DEST[95:64] ← SignExtend(SRC[47:32]);
DEST[127:96] ← SignExtend(SRC[63:48]);
Packed Move with Zero
Extend
PMOVZXBW
PMOVZXBD
PMOVZXBQ
PMOVZXWD
PMOVZXWQ
PMOVZXDQ
PMOVZXWD:
DEST[31:0] ← ZeroExtend(SRC[15:0]);
DEST[63:32] ← ZeroExtend(SRC[31:16]);
DEST[95:64] ← ZeroExtend(SRC[47:32]);
DEST[127:96] ← ZeroExtend(SRC[63:48]);
Multiply Packed Unsigned Dword PMULDQ
DEST[63:0] = DEST[31:0] * SRC[31:0];
DEST[127:64] = DEST[95:64] * SRC[95:64];
Multiply Packed Unsigned Dword Integers and Store Low Result PMULLD Temp0[63:0] ← DEST[31:0] * SRC[31:0];
Temp1[63:0] ← DEST[63:32] * SRC[63:32];
Temp2[63:0] ← DEST[95:64] * SRC[95:64];
Temp3[63:0] ← DEST[127:96] * SRC[127:96];
DEST[31:0] ← Temp0[31:0];
DEST[63:32] ← Temp1[31:0];
DEST[95:64] ← Temp2[31:0];
DEST[127:96] ← Temp3[31:0];
Logical Compare PTEST IF (SRC[127:0] bitwiseAND DEST[127:0] = 0)
  THEN ZF ← 1;
  ELSE ZF ← 0; FI;
IF (SRC[127:0] bitwiseAND (bitwiseNOT DEST[127:0] ) = 0)
  THEN CF ← 1;
  ELSE CF ← 0; FI;
DEST[127:0] Unmodified;
AF = OF = PF = SF ← 0;
Round Packed Double/Single Precision FP ROUNDPD
ROUNDPS
ROUNDPD:
IF (imm[2] == ‘1)
  THEN // rounding mode is determined by MXCSR.RC
    DEST[63:0] ← ConvertDPFPToInteger_M(SRC[63:0]);
    DEST[127:64] ← ConvertDPFPToInteger_M(SRC[127:64]);
  ELSE // rounding mode is determined by IMM8.RC
    DEST[63:0] ← ConvertDPFPToInteger_Imm(SRC[63:0]);
    DEST[127:64]←ConvertDPFPToInteger_Imm(SRC[127:64]);
FI
Round Scalar Double/Single Precision FP ROUNDSD
ROUNDSS
ROUNDSD:
IF (imm[2] == ‘1)
  THEN // rounding mode is determined by MXCSR.RC
    DEST[63:0] ← ConvertDPFPToInteger_M(SRC[63:0]);
  ELSE // rounding mode is determined by IMM8.RC
    DEST[63:0] ← ConvertDPFPToInteger_Imm(SRC[63:0]);
FI;
DEST[127:63] remains unchanged ;

▼C/C++ Intrinsic

Blend BLENDPD __m128d _mm_blend_pd (__m128d v1, __m128d v2, const int mask);
BLENDPS __m128 _mm_blend_ps (__m128 v1, __m128 v2, const int mask);
PBLENDW __m128i _mm_blend_epi16 (__m128i v1, __m128i v2, const int mask);
Variable Blend BLENDVPD __m128d _mm_blendv_pd(__m128d v1, __m128d v2, __m128d v3);
BLENDVPS __m128 _mm_blendv_ps(__m128 v1, __m128 v2, __m128 v3);
PBLENDVB __m128i _mm_blendv_epi8 (__m128i v1, __m128i v2, __m128i mask);
Dot Products DPPD __m128d _mm_dp_pd ( __m128d a, __m128d b, const int mask);
DPPS __m128 _mm_dp_ps ( __m128 a, __m128 b, const int mask);
Extract Packed Single Precision FP EXTRACTPS int _mm_extract_ps(__m128 src, const int ndx);
Insert Packed Single Precision FP INSERTPS __m128 _mm_insert_ps(__m128 dst, __m128 src, const int ndx);
Load Double
Quadword Non-temporal Aligned
MOVNTDQA __m128i _mm_stream_load_si128 (__m128i *p);
Multiple Packed Sums of
Absolute Difference
MPSADBW __m128i _mm_mpsadbw_epu8 (__m128i s1, __m128i s2, const int mask);
Pack with Unsigned
Saturation
PACKUSDW __m128i _mm_packus_epi32(__m128i m1, __m128i m2);
Compare Packed Qword
Data of Equal
PCMPEQQ __m128i _mm_cmpeq_epi64(__m128i a, __m128i b);
Extract PEXTRB int _mm_extract_epi8 (__m128i src, const int ndx);
PEXTRD int _mm_extract_epi32 (__m128i src, const int ndx);
PEXTRQ __int64 _mm_extract_epi64 (__m128i src, const int ndx);
PEXTRW int _mm_extract_pi16 (__m64 a, int n)
PEXTRW int _mm_extract_epi16 ( __m128i a, int imm)
Packed Horizontal
Word Minimum
PHMINPOSUW __m128i _mm_minpos_epu16( __m128i packed_words);
Insert PINSRB __m128i _mm_insert_epi8 (__m128i s1, int s2, const int ndx);
PINSRD __m128i _mm_insert_epi32 (__m128i s2, int s, const int ndx);
PINSRQ __m128i _mm_insert_epi64(__m128i s2, __int64 s, const int ndx);
Maximum of Packed
Signed Byte/Dword
PMAXSB __m128i _mm_max_epi8 ( __m128i a, __m128i b);
PMAXSD __m128i _mm_max_epi32 ( __m128i a, __m128i b);
Maximum of Packed
Unsigned Dowrd/Word
PMAXUD __m128i _mm_max_epu32 ( __m128i a, __m128i b);
PMAXUW__m128i _mm_max_epu16 ( __m128i a, __m128i b);
Minimum of Packed Signed
Byte
PMINSB __m128i _mm_min_epi8 ( __m128i a, __m128i b);
Minimum of Packed Dword PMINSD __m128i _mm_min_epi32 ( __m128i a, __m128i b);
PMINUD __m128i _mm_min_epu32 ( __m128i a, __m128i b);
Minimum of Packed Word PMINUW __m128i _mm_min_epu16 ( __m128i a, __m128i b);
Packed Move with Sign Extend PMOVSXBW __m128i _mm_ cvtepi8_epi16 ( __m128i a);
PMOVSXBD __m128i _mm_ cvtepi8_epi32 ( __m128i a);
PMOVSXBQ __m128i _mm_ cvtepi8_epi64 ( __m128i a);
PMOVSXWD __m128i _mm_ cvtepi16_epi32 ( __m128i a);
PMOVSXWQ __m128i _mm_ cvtepi16_epi64 ( __m128i a);
PMOVSXDQ __m128i _mm_ cvtepi32_epi64 ( __m128i a);
Packed Move with Zero
Extend
PMOVZXBW __m128i _mm_ cvtepu8_epi16 ( __m128i a);
PMOVZXBD __m128i _mm_ cvtepu8_epi32 ( __m128i a);
PMOVZXBQ __m128i _mm_ cvtepu8_epi64 ( __m128i a);
PMOVZXWD __m128i _mm_ cvtepu16_epi32 ( __m128i a);
PMOVZXWQ __m128i _mm_ cvtepu16_epi64 ( __m128i a);
PMOVZXDQ __m128i _mm_ cvtepu32_epi64 ( __m128i a);
Multiply Packed Unsigned Dword PMULDQ __m128i _mm_mul_epi32( __m128i a, __m128i b);
Multiply Packed Unsigned Dword Integers and Store Low Result PMULLUD __m128i _mm_mullo_epi32(__m128i a, __m128i b);
Logical Compare PTEST int _mm_testz_si128 (__m128i s1, __m128i s2);
int _mm_testc_si128 (__m128i s1, __m128i s2);
int _mm_testnzc_si128 (__m128i s1, __m128i s2);
Round Packed Double/Single Precision FP ROUNDPD __m128 mm_round_pd(__m128d s1, int iRoundMode);
              __m128 _mm_floor_pd(__m128d s1);
              __m128 _mm_ceil_pd(__m128d s1);
ROUNDPS __m128 _mm_round_ps(__m128 s1, int iRoundMode);
              __m128 _mm_floor_ps(__m128 s1);
              __m128 _mm_ceil_ps(__m128 s1);
Round Scalar Double/Single Precision FP ROUNDSD __m128d mm_round_sd(__m128d dst, __m128d s1, int iRoundMode);
              __m128d mm_floor_sd(__m128d dst, __m128d s1);
              __m128d mm_ceil_sd(__m128d dst, __m128d s1);
ROUNDSS __m128 mm_round_ss(__m128 dst, __m128 s1, int iRoundMode);
              __m128 mm_floor_ss(__m128 dst, __m128 s1);
              __m128 mm_ceil_ss(__m128 dst, __m128 s1);

SSE4.2

▼命令

Accumulate CRC32 CRC32 CRC32 instruction for 8-bit source operand and 32-bit destination operand:
TEMP1[7-0] ← BIT_REFLECT8(SRC[7-0]) //ビット反転
TEMP2[31-0] ← BIT_REFLECT32 (DEST[31-0])
TEMP3[39-0] ← TEMP1[7-0] << 32
TEMP4[39-0] ← TEMP2[31-0] << 8
TEMP5[39-0] ← TEMP3[39-0] XOR TEMP4[39-0]
TEMP6[31-0] ← TEMP5[39-0] MOD2 11EDC6F41H
DEST[31-0] ← BIT_REFLECT (TEMP6[31-0])
Packed Compare
Explicit Length Strings To Index
PCMPESTRI 文字列比較
Packed Compare
Explicit Length Strings To Mask
PCMPESTRM 文字列比較
Packed Compare
Implicit Length String To Index
PCMPISTRI 文字列比較
Packed Compare
Implicit Length Strings To Mask
PCMPISTRM 文字列比較
Packed Compare Greater
Than
PCMPGTQ IF (DEST[63-0] > SRC[63-0])
  THEN DEST[63-0] ← FFFFFFFFFFFFFFFFH;
  ELSE DEST[63-0] ← 0; FI
IF (DEST[127-64] > SRC[127-64])
  THEN DEST[127-64] ← FFFFFFFFFFFFFFFFH;
  ELSE DEST[127-64] ← 0; FI
Return Number of Bits Set to
1
POPCNT Count = 0;
For (i=0; i < OperandSize; i++)
{   IF (SRC[ i] = 1) // i’th bit
  THEN Count++; FI;
}
DEST ← Count;

▼C/C++ Intrinsic

Accumulate CRC32 unsigned int _mm_crc32_u8( unsigned int crc, unsigned char data )
unsigned int _mm_crc32_u16( unsigned int crc, unsigned short data )
unsigned int _mm_crc32_u32( unsigned int crc, unsigned int data )
unsinged __int64 _mm_crc32_u64( unsinged __int64 crc, unsigned __int64 data )
Packed Compare
Explicit Length Strings To Index
int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int mode);
Packed Compare
Explicit Length Strings To Mask
__m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int mode);
int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int mode);
Packed Compare
Implicit Length String To Index
int _mm_cmpistri (__m128i a, __m128i b, const int mode);
int _mm_cmpistra (__m128i a, __m128i b, const int mode);
int _mm_cmpistrc (__m128i a, __m128i b, const int mode);
int _mm_cmpistro (__m128i a, __m128i b, const int mode);
int _mm_cmpistrs (__m128i a, __m128i b, const int mode);
int _mm_cmpistrz (__m128i a, __m128i b, const int mode);
Packed Compare
Implicit Length Strings To Mask
__m128i _mm_cmpistrm (__m128i a, __m128i b, const int mode);
int _mm_cmpistra (__m128i a, __m128i b, const int mode);
int _mm_cmpistrc (__m128i a, __m128i b, const int mode);
int _mm_cmpistro (__m128i a, __m128i b, const int mode);
int _mm_cmpistrs (__m128i a, __m128i b, const int mode);
int _mm_cmpistrz (__m128i a, __m128i b, const int mode);
Packed Compare Greater
Than
PCMPGTQ __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Return Number of Bits Set to
1
POPCNT int _mm_popcnt_u32(unsigned int a);
POPCNT int64_t _mm_popcnt_u64(unsigned __int64 a);

コメント

コメントの投稿


管理者にだけ表示を許可する

トラックバック

トラックバック URL
http://msirocoder.blog35.fc2.com/tb.php/58-5c1606ac
この記事にトラックバックする(FC2ブログユーザー)

FC2Ad

上記広告は1ヶ月以上更新のないブログに表示されています。新しい記事を書くことで広告を消せます。