x86 性能优化

x86 性能优化

x86性能优化

- CPU架构

- x86汇编和SIMD指令

- 工具和参考资料


CPU简化图

X86处理器结构图

  • 流水线
  • 分支预测BPU
  • 复杂指令被拆分成uops
  • 若干个decode单元
  • 若干个执行单元
  • 多发射,乱序执行
  • 寄存器重命名消除依赖
    1
    2
    3
    4
    5
    6
    mov eax, [mem1]        mov eax, [mem1]
    imul eax, 6 imul eax, 6
    mov [mem2], eax mov [mem2], eax
    mov ebx, [mem3] mov eax, [mem3]; cpu给eax分配一个物理寄存器,消除前后指令对于eax的依赖
    add ebx, 2 add eax, 2
    mov [mem4], ebx mov [mem4], eax

优化基本原则

  • 尽量使用简单指令,复杂指令解码器只有一个

  • 使用延时小,吞吐量高的指令

    • 避免除法
    • 避免浮点计算
    • 使用shift替代乘法除法
  • 减少分支跳转。分支预测失败造成性能损失(20-50cycles)。

1
2
3
4
5
; if (b > a) b = a
sub eax, ebx ; = a-b
sbb edx,edx ;=(b>a)?0xFFFFFFFF:0
and edx,eax ;=(b>a)?a-b:0
add ebx, edx ; Result is in ebx
1
2
3
4
// 使用likely告诉编译器,func()的二进制尽量靠近调用,提高cache命中率
if (likely(value)) {
func();
}
  • 尽量使前后若干条指令走不同port,使用不同的执行单元
  • 打断前后计算的依赖
    1
    2
    优化前
    float a, b, c, d, y; y = a + b + c + d;
    1
    2
    3
    4
    优化后
    float a, b, c, d, y;
    y = (a + b) + (c + d);
    a + b, c + d可独立计算
  • 数据对齐
    1
    2
    3
    4
    5
    6
    7
    8
    9
    __declspec(align(16))
    __attribute__((aligned(16)))
    /**
    * Motion vector cache.
    */
    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5 * 8][2];
    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5 * 8];
    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5 * 8][2];
    uint8_t direct_cache[5 * 8];

X86寄存器

通用寄存器

bit 0 - 63 bit 0 - 31 bit 0 - 15 bit8-15 bit 0 - 7
RAX EAX AX AH AL
RBX EBX BX BH BL
RCX ECX CX CH CL
RDX EDX DX DH DL
RSI ESI SI
RDI EDI DI
RBP EBP BP
RSP ESP SP
RFlags EFlags Flags
RIP EIP IP
R8-R15 R8D-R15D R8W-R15W R8B-R15B

⚠️ R8-R15仅存在于64位CPU上


浮点寄存器和64bit向量寄存器

bit 0 - 79 bit 0 - 63
ST(0) MM0
ST(1) MM1
ST(2) MM2
ST(3) MM3
ST(4) MM4
ST(5) MM5
ST(6) MM6
ST(7) MM7

向量寄存器

bit 0 - 128 bit 0 - 255 bit 0 - 511
XMM0 YMM0 ZMM0
XMM1 YMM1 ZMM1
XMM2 YMM2 ZMM2
XMM3 YMM3 ZMM3
XMM4 YMM4 ZMM4
XMM5 YMM5 ZMM5
XMM6 YMM6 ZMM6
XMM7 YMM7 ZMM7
XMM8 YMM8 ZMM8
XMM9 YMM9 ZMM9
XMM10 YMM10 ZMM10
XMM11 YMM11 ZMM11
XMM12 YMM12 ZMM12
XMM13 YMM13 ZMM13
XMM14 YMM14 ZMM14
XMM15 YMM15 ZMM15
ZMM16
ZMM17
ZMM18
ZMM19
ZMM20
ZMM21
ZMM22
ZMM23
ZMM24
ZMM25
ZMM26
ZMM27
ZMM28
ZMM29
ZMM30
ZMM31

⚠️ 32位CPU上仅存在XMM0-XMM7,YMM0-YMM7,ZMM0-ZMM7


汇编开发方式

内联汇编

优点

  • 直接访问c/c++变量
  • 快速针对局部代码进行优化
    缺点
  • 没有宏
  • 格式不统一,intel/AT&T
Intel Code AT&T Code
mov eax,1 movl $1,%eax
mov ebx,0ffh movl $0xff,%ebx
int 80h int $0x80
mov ebx, eax movl %eax, %ebx
mov eax,[ecx] movl (%ecx),%eax
mov eax,[ebx+3] movl 3(%ebx),%eax
mov eax,[ebx+20h] movl 0x20(%ebx),%eax
add eax,[ebx+ecx*2h] addl (%ebx,%ecx,0x2),%eax
lea eax,[ebx+ecx] l eal (%ebx,%ecx),%eax
sub eax,[ebx+ecx*4h-20h] subl -0x20(%ebx,%ecx,0x4),%eax

AT&T格式

  • 每条指令放在一个双引号内,或者将所有的指令都放着一个双引号内。
  • 每条指令都要包含一个分隔符。合法的分隔符是换行符(\n)或者分号。用换行符的时候通常后面放一个制表符\t。对此前文已经有所说明。
  • 访问C语言变量用%0,%1…等等。

Intel

  • 可以直接使用数组地址

Intrinsics

优点

  • 像调函数一样使用指令,易上手

缺点

  • 无法精确控制数据的load/save,可能造成额外的开销
1
2
3
4
5
6
7
8
#include <xmmintrin.h>
void add(float *a, float *b, float *c) {
__m128 t0, t1;
t0 = _mm_load_ps(a);
t1 = _mm_load_ps(b);
t0 = _mm_add_ps(t0, t1);
_mm_store_ps(c, t0);
}

nasm汇编

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
; -----------------------------------------------------------------------------
; 一个输出 Fibonacci 数列前 90 项的 64 位 Linux 程序。
; 如何编译执行:
;
; nasm -felf64 fib.asm && gcc fib.o && ./a.out
; -----------------------------------------------------------------------------

global main
extern printf

section .text
main:
push rbx ; 因为需要用 rbx 寄存器所以需要保存

mov ecx, 90 ; ecx 作为计数器直至减到 0
xor rax, rax ; rax 将记录当前的数字
xor rbx, rbx ; rbx 将记录下一个的数字
inc rbx ; rbx 初始值 1
print:
; 我们需要调用 printf 函数, 但是我们也同时在使用 rax,rbx 和 rcx 这三个寄存器。
; 调用 printf 函数会破坏 rax 和 rcx 这两个寄存器的值, 所以我们要在调用前保存
; 并且在调用后恢复这两个寄存器中的数据。

push rax ; 调用者保存寄存器
push rcx ; 调用者保存寄存器

mov rdi, format ; 设置第一个参数 (format)
mov rsi, rax ; 设置第二个参数 (current_number)
xor rax, rax ; 因为 printf 是多参数的

; 栈内已经对齐, 因为我们压入了三个 8 字节的数据。
call printf ; printf(format, current_number)

pop rcx ; 恢复调用者所保存的寄存器
pop rax ; 恢复调用者所保存的寄存器

mov rdx, rax ; 保存当前的数字
mov rax, rbx ; 下一个数字保存在当前数字的位置
add rbx, rdx ; 计算得到下一个数字
dec ecx ; ecx 减 1
jnz print ; 如果不是 0, 继续循环

pop rbx ; 返回前恢复 rbx 的值
ret
format:
db "%20ld", 10, 0

多行宏
%macro 宏名 1-4 参数名列表
宏体
%endmacro
%0为参数个数 %?为宏名

%macro multipush 1-*
%rep %0
push %1
%rotate 1
%endrep
%endmacro

%define 单行
%assign 定义常数

%if
; some code which only appears if is met
%elif
; only appears if is not met but is
%else
; this appears if neither nor was met
%endif

macro的奇技淫巧参见x86inc.asm和x86util.asm


SIMD发展历史

MMX™ 1996 - Intel® Pentium Pentium® with MMX™ and Pentium with MMX™ and Pentium® II processors processors

  • Introduced 64-bit MMX registers for SIMD integer operations
  • Supports SIMD operations on packed byte word and double Supports SIMD operations on packed byte, word, and double-word integers word integers
  • Useful for multimedia and communications software

SSE(Streaming SIMD extensions) 1999 – Intel® Pentium Pentium® III processor III processor

  • Introduced 128 Introduced 128-bit extended memory manager (XMM) registers for SIMD integers and FP-SP operands
  • Executes FP and SIMD simultaneously
  • Introduced data prefetch instructions
  • Useful for 3D g y, g, g/ g eometry, 3D rendering, and video encoding/decoding

SSE2 2000 – Intel® Pentium Pentium® 4 and Intel 4 and Intel® Xeon processors ™ processors

  • Added extra 64-bit SIMD integ pp er support
  • Has same XMM reg g gp p ( isters for SIMD integer and floating point double precision (FP-DP)
  • Has 144 new instructions for data support (no new registers)
  • Adds support for cacheability and memory ordering operations
  • Useful for 3D graphics, video encoding/decoding and encryption

SSE3 2004 – Intel® Pentium® 4 Processor

  • Accelerates performance of Streaming SIMD Extensions technology, Streaming SIMD Extensions 2 technology and X87 technology, and X87-FP math capabilities.
  • Useful in some 3D operations (Quaternions) complex arithmetic and video codec algorithms Useful in some 3D operations (Quaternions), complex arithmetic and video codec algorithms

SSSE3 2006 – Itl n e ® Core® 2 Processor

  • application performance improvement.
  • potential for specifc application domains

SSE4 2007

  • Efficient Accelerated String and Text Processing.

AVX(Advance vector extensions) 2011 - 2nd and 3rd generation Intel® Core™ Processors

  • Introduced 256 bit registers YMM
  • Flexible unaligned memory access support

AVX2 2013 - 4th generation Intel® Core™ Processors

  • Vector-vector shifts
  • Bit Manipulation Instructions (BMI)

    AVX512 2016

  • Introduced 512 bit registers ZMM
  • 512 bit vector instruction set extension

SIMD数据类型

64位SIMD数据


SIMD指令

Packed Arithmetic

paddb[w|d|q|sb|usb]

  • p packed
  • b byte; w 2bytes; d 4bytes; q 8bytes
  • u unsigned byte integers
  • s saturation

Comparison

Intel/AMD Mnemonic | Description
|—–|——|
PCMPEQB mm, mm/m64 | Compare packed bytes for equality
PCMPEQW mm, mm/m64 | Compare packed words for equality
PCMPEQD mm, mm/m64 | Compare packed doublewords for equality
PCMPGTB mm, mm/m64 | Compare packed signed byte integers for greater than
PCMPGTW mm, mm/m64 | Compare packed signed word integers for greater than
PCMPGTD mm, mm/m64 | Compare packed signed doubleword integers for greater than


Conversion

Intel/AMD Mnemonic | Description
|—–|——|
PACKSSDW | pack doublewords into words with signed saturation
PACKSSWB |pack words into bytes with signed saturation
PACKUSWB |pack words into bytes with unsigned saturation
PUNPCKHBW | unpack high-order bytes
PUNPCKHDQ |unpack high-order doublewords
PUNPCKHWD | unpack high-order words
PUNPCKLBW |unpack low-order bytes
PUNPCKLDQ |unpack low-order doublewords
PUNPCKLWD |unpack low-order words


Data Transfer Instructions

Intel/AMD Mnemonic | Description
|—–|——|
MOVD mm, r/m32 | Move doubleword
MOVD r/m32, mm | Move doubleword
MOVQ mm/m64, mm | Move quadword
MOVQ mm, mm/m64 | Move quadword
MOVQ mm, r/m64 | Move quadword
MOVQ r/m64, mm | Move quadword
MOVDQ2Q mm, xmm | move quadword integer from XMM to MMX registers
MOVDQA xmm1, xmm2/m128 | Move aligned double quadword
MOVDQA xmm2/m128, xmm1 | Move aligned double quadword
MOVDQU xmm1, xmm2/m128 | Move unaligned double quadword
MOVDQU xmm2/m128, xmm1 | Move unaligned double quadword
MOVQ2DQ xmm, mm | Move quadword from MMX register to low quadword of XMM register
LDDQU xmm1, mem | Load unaligned data and return double quadword


Logic Instructions

Intel/AMD Mnemonic | Description
|—–|——|
PAND mm, mm/m64 | Bitwise AND
PANDN mm, mm/m64 | Bitwise AND NOT
POR mm, mm/m64 | Bitwise OR
PXOR mm, mm/m64 | Bitwise XOR


Shift and Rotate

用来替代2的乘法和除法

Intel/AMD Mnemonic | Description
|—–|——|
PSLLD |shift packed doublewords left logical
PSLLQ |shift packed quadword left logical
PSLLW |shift packed words left logical
PSRAD |shift packed doublewords right arithmetic
PSRAW |shift packed words right arithmetic
PSRLD |shift packed doublewords right logical
PSRLQ |shift packed quadword right logical
PSRLW |shift packed words right logical


shuffle

broadcast
reverse
rotate


常用汇编snippet

产生常数向量

优点,执行时间固定。从内存load常量数据,如果cache命中了数据,速度也不慢,如果数据不在cache中,需要从memory读入,速度就慢很多

1
2
3
4
5
6
pxor mm0, mm0   ; generate a zero register in MM0
pcmpeq mm1, mm1 ; Generate all 1's in register MM1, which is -1 in each of the packed data type fields

psubb mm0, mm1 [psubw mm0, mm1] (psubd mm0, mm1)
psrlw mm1, 16-n(psrld mm1, 32-n)


无符号数绝对值差

1
2
3
4
movq mm2, mm0     ; make a copy of mm0
psubusb mm0, mm1 ; compute difference one way
psubusb mm1, mm2 ; compute difference the other way
por mm0, mm1 ; OR them together

有符号数绝对值差

1
2
psubw xmm0, xmm1 ; subtract words
pabsw xmm1, xmm0 ; results in XMM1

绝对值

1
2
3
pxor mm1, mm1
psubw mm1, mm0
pmaxsw mm1, mm0

以上取绝对值的算法优点在于,消除了比较判断,避免BTB预测失败造成的开销


16位扩展成32位

1
2
3
movdqa     xmm1, xmm0
punpcklwd xmm0, xmm7 ; unpack the 4 low-end words into 4 32-bit doubleword
punpckhwd xmm1, xmm7 ; unpackthe4high-endwords into 4 32-bit doublewords

数位的扩展和压缩是常见的操作,计算中间值往往超过原始像素8bit的数值范围


Cache

数据局部性

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
int colsarray_bad(int a[M][N])
{
int i,j,sum=0;
for(j = 0; j<N; i++) {
for(i = 0; i<N; j++) {
sum += a[i][j];
}
}
return sum;
}

int colsarray_good(int a[M][N])
{
int i,j,sum=0;
for(i = 0; i<M; i++) {
for(j = 0; j<N; j++) {
sum += a[i][j];
}
}
return sum;
}

hot-cold分离


Loop优化

将Loop无关代码移到Loop体外,避免冗余计算

1
2
3
4
5
6
7
8
for (int i = 0; i < strlen(str); i++>) {
;;;;
}

int len = strlen(str);
for (int i = 0; i < len; i++>) {
;;;;
}

Loop中避免判断

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
int i;
for (i = 0; i < 20; i++) {
if (i % 2 == 0) {
FuncA(i);
} else {
FuncB(i);
}
FuncC(i);
}

int i;
for (i = 0; i < 20; i += 2) {
FuncA(i);
FuncC(i);
FuncB(i+1);
FuncC(i+1);
}

循环展开

1
2
double list[100], sum = 0.;
for (int i = 0; i < 100; i++) sum += list[i];
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
lea     esi,  list     ; list must be aligned by 16
movapd xmm0, [esi] ; list[0], list[1]
movapd xmm1, [esi+16] ; list[2], list[3]
add esi, 800 ; Point to end of list
mov eax, 32-800 ; Index to list[4] from end of list
L1:
addpd xmm0, [esi+eax] ; Add list[i], list[i+1]
addpd xmm1, [esi+eax+16];Add list[i+2], list[i+3]
add eax, 32 ; i+=4
js
addpd xmm0, xmm1 ;Add the two accumulators together
movhlps xmm1, xmm0
addsd xmm0, xmm1 ;Add the two vector elements
movsd [sum], xmm0



问题指令集

  • XCHG 强制cpu之间同步
  • Division=慢
  • MASKMOV 跳过cache直接写memory
  • INC/DEC 不会修改carry flag

边界检查

1
2
3
4
float list[16];
int i;
....
list[i & 15] += 1.0f;
1
2
3
4
const int min = 100, max = 110; int i; ...
if (i >= min && i <= max)

if ((unsigned int)(i - min) <= (unsigned int)(max - min))

练习

1
2
float a; bool b;
a = b ? 1.5f : 2.6f;
1
2
3
4
5
6
int list[300];
int i;
for(i=0; i<300; i++) {
list[i] = i % 3;
}

1
2
float a, b;
a = b * 1.2;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
for(i=0; i<n; i++)
{
if( (lpDRV->dpCAPS) & CLIPPING )
{
DoOneThing(i);
}
else if( (lpDRV->dpCAPS) & ALREADYCULLED )
{
DoSomethingElse(i);
}
else
{
DoYetAnotherThing(i);
}
}


if( (lpDRV->dpCAPS) & CLIPPING )
{
for(i=0; i<n; i++)
{
DoOneThing(i);
}
}
else if( (lpDRV->dpCAPS) & ALREADYCULLED )
{
for(i=0; i<n; i++)
{
DoSomethingElse(i);
}
}
else
{
for(i=0; i<n; i++)
{
DoYetAnotherThing(i);
}
}



汇编开发常见问题

  • 使用mmx指令后用emms清mm寄存器状态,因为mm寄存器和浮点寄存器是一套
  • 寄存器状态。部分寄存器使用前必须保存状态,使用后恢复。EBX, ESI, EDI, EBP
  • push/pop操作必须匹配
  • 计算前必须先统计数值范围,8bit/16bit/32bit,有符号/无符号
    • [1 -5 20 20 -5 1] / 32

参考资料

intel性能优化官方指南

agner个人主页

微指令表

vtune性能分析工具

gnu内联汇编

X86指令集

Intel Architecture Code Analyzer
分析结果
使用example

bithack

Computer Architecture and Organization

magic