2020-09-14

x86 性能优化

x86性能优化

- CPU架构

- x86汇编和SIMD指令

- 工具和参考资料

CPU简化图

X86处理器结构图

流水线
分支预测BPU
复杂指令被拆分成uops
若干个decode单元
若干个执行单元
多发射，乱序执行

寄存器重命名消除依赖

mov eax, [mem1]        mov eax, [mem1]
imul eax, 6            imul eax, 6
mov [mem2], eax        mov [mem2], eax
mov  ebx, [mem3]       mov  eax, [mem3]; cpu给eax分配一个物理寄存器，消除前后指令对于eax的依赖
add  ebx, 2            add  eax, 2
mov  [mem4], ebx       mov  [mem4], eax

优化基本原则

尽量使用简单指令，复杂指令解码器只有一个
使用延时小，吞吐量高的指令
- 避免除法
- 避免浮点计算
- 使用shift替代乘法除法
减少分支跳转。分支预测失败造成性能损失（20-50cycles）。

; if (b > a) b = a
sub eax, ebx ; = a-b
sbb edx,edx  ;=(b>a)?0xFFFFFFFF:0
and edx,eax  ;=(b>a)?a-b:0
add ebx, edx ; Result is in ebx

// 使用likely告诉编译器，func()的二进制尽量靠近调用，提高cache命中率
if (likely(value)) {
  func();
}

尽量使前后若干条指令走不同port，使用不同的执行单元

打断前后计算的依赖

1 2	优化前 float a, b, c, d, y; y = a + b + c + d;

优化后
float a, b, c, d, y;
y = (a + b) + (c + d);
a + b, c + d可独立计算

数据对齐

__declspec(align(16))
__attribute__((aligned(16)))
/**
* Motion vector cache.
*/
DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5 * 8][2];
DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5 * 8];
DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5 * 8][2];
uint8_t direct_cache[5 * 8];

X86寄存器

通用寄存器

bit 0 - 63	bit 0 - 31	bit 0 - 15	bit8-15	bit 0 - 7
RAX	EAX	AX	AH	AL
RBX	EBX	BX	BH	BL
RCX	ECX	CX	CH	CL
RDX	EDX	DX	DH	DL
RSI	ESI	SI
RDI	EDI	DI
RBP	EBP	BP
RSP	ESP	SP
RFlags	EFlags	Flags
RIP	EIP	IP
R8-R15	R8D-R15D	R8W-R15W		R8B-R15B

⚠️ R8-R15仅存在于64位CPU上

浮点寄存器和64bit向量寄存器

bit 0 - 79	bit 0 - 63
ST(0)	MM0
ST(1)	MM1
ST(2)	MM2
ST(3)	MM3
ST(4)	MM4
ST(5)	MM5
ST(6)	MM6
ST(7)	MM7

向量寄存器

bit 0 - 128	bit 0 - 255	bit 0 - 511
XMM0	YMM0	ZMM0
XMM1	YMM1	ZMM1
XMM2	YMM2	ZMM2
XMM3	YMM3	ZMM3
XMM4	YMM4	ZMM4
XMM5	YMM5	ZMM5
XMM6	YMM6	ZMM6
XMM7	YMM7	ZMM7
XMM8	YMM8	ZMM8
XMM9	YMM9	ZMM9
XMM10	YMM10	ZMM10
XMM11	YMM11	ZMM11
XMM12	YMM12	ZMM12
XMM13	YMM13	ZMM13
XMM14	YMM14	ZMM14
XMM15	YMM15	ZMM15
		ZMM16
		ZMM17
		ZMM18
		ZMM19
		ZMM20
		ZMM21
		ZMM22
		ZMM23
		ZMM24
		ZMM25
		ZMM26
		ZMM27
		ZMM28
		ZMM29
		ZMM30
		ZMM31

⚠️ 32位CPU上仅存在XMM0-XMM7，YMM0-YMM7，ZMM0-ZMM7

汇编开发方式

内联汇编

优点

直接访问c/c++变量
快速针对局部代码进行优化
缺点
没有宏
格式不统一，intel/AT&T

Intel Code	AT&T Code
mov eax,1	movl $1,%eax
mov ebx,0ffh	movl $0xff,%ebx
int 80h	int $0x80
mov ebx, eax	movl %eax, %ebx
mov eax,[ecx]	movl (%ecx),%eax
mov eax,[ebx+3]	movl 3(%ebx),%eax
mov eax,[ebx+20h]	movl 0x20(%ebx),%eax
add eax,[ebx+ecx*2h]	addl (%ebx,%ecx,0x2),%eax
lea eax,[ebx+ecx] l eal	(%ebx,%ecx),%eax
sub eax,[ebx+ecx*4h-20h]	subl -0x20(%ebx,%ecx,0x4),%eax

AT&T格式

每条指令放在一个双引号内，或者将所有的指令都放着一个双引号内。
每条指令都要包含一个分隔符。合法的分隔符是换行符(\n)或者分号。用换行符的时候通常后面放一个制表符\t。对此前文已经有所说明。
访问C语言变量用%0,%1…等等。

Intel

可以直接使用数组地址

Intrinsics

优点

像调函数一样使用指令，易上手

缺点

无法精确控制数据的load/save，可能造成额外的开销

#include <xmmintrin.h>
void add(float *a, float *b, float *c) {
  __m128 t0, t1;
  t0 = _mm_load_ps(a);
  t1 = _mm_load_ps(b);
  t0 = _mm_add_ps(t0, t1);
  _mm_store_ps(c, t0);
}

nasm汇编

; -----------------------------------------------------------------------------
; 一个输出 Fibonacci 数列前 90 项的 64 位 Linux 程序。
; 如何编译执行:
;
;     nasm -felf64 fib.asm && gcc fib.o && ./a.out
; -----------------------------------------------------------------------------

        global  main
        extern  printf

        section .text
main:
        push    rbx                     ; 因为需要用 rbx 寄存器所以需要保存

        mov     ecx, 90                 ; ecx 作为计数器直至减到 0
        xor     rax, rax                ; rax 将记录当前的数字
        xor     rbx, rbx                ; rbx 将记录下一个的数字
        inc     rbx                     ; rbx 初始值 1
print:
        ; 我们需要调用 printf 函数, 但是我们也同时在使用 rax,rbx 和 rcx 这三个寄存器。
        ; 调用 printf 函数会破坏 rax 和 rcx 这两个寄存器的值, 所以我们要在调用前保存
        ; 并且在调用后恢复这两个寄存器中的数据。

        push    rax                     ; 调用者保存寄存器
        push    rcx                     ; 调用者保存寄存器

        mov     rdi, format             ; 设置第一个参数 (format)
        mov     rsi, rax                ; 设置第二个参数 (current_number)
        xor     rax, rax                ; 因为 printf 是多参数的

        ; 栈内已经对齐, 因为我们压入了三个 8 字节的数据。
        call    printf                  ; printf(format, current_number)

        pop     rcx                     ; 恢复调用者所保存的寄存器
        pop     rax                     ; 恢复调用者所保存的寄存器

        mov     rdx, rax                ; 保存当前的数字
        mov     rax, rbx                ; 下一个数字保存在当前数字的位置
        add     rbx, rdx                ; 计算得到下一个数字
        dec     ecx                     ; ecx 减 1
        jnz     print                   ; 如果不是 0, 继续循环

        pop     rbx                     ; 返回前恢复 rbx 的值
        ret
format:
        db  "%20ld", 10, 0

多行宏
%macro 宏名 1-4 参数名列表
宏体
%endmacro
%0为参数个数 %?为宏名

%macro multipush 1-*
%rep %0
push %1
%rotate 1
%endrep
%endmacro

%define 单行
%assign 定义常数

%if
; some code which only appears if is met
%elif
; only appears if is not met but is
%else
; this appears if neither nor was met
%endif

macro的奇技淫巧参见x86inc.asm和x86util.asm

SIMD发展历史

MMX™ 1996 - Intel® Pentium Pentium® with MMX™ and Pentium with MMX™ and Pentium® II processors processors

Introduced 64-bit MMX registers for SIMD integer operations
Supports SIMD operations on packed byte word and double Supports SIMD operations on packed byte, word, and double-word integers word integers
Useful for multimedia and communications software

SSE(Streaming SIMD extensions) 1999 – Intel® Pentium Pentium® III processor III processor

Introduced 128 Introduced 128-bit extended memory manager (XMM) registers for SIMD integers and FP-SP operands
Executes FP and SIMD simultaneously
Introduced data prefetch instructions
Useful for 3D g y, g, g/ g eometry, 3D rendering, and video encoding/decoding

SSE2 2000 – Intel® Pentium Pentium® 4 and Intel 4 and Intel® Xeon processors ™ processors

Added extra 64-bit SIMD integ pp er support
Has same XMM reg g gp p ( isters for SIMD integer and floating point double precision (FP-DP)
Has 144 new instructions for data support (no new registers)
Adds support for cacheability and memory ordering operations
Useful for 3D graphics, video encoding/decoding and encryption

SSE3 2004 – Intel® Pentium® 4 Processor

Accelerates performance of Streaming SIMD Extensions technology, Streaming SIMD Extensions 2 technology and X87 technology, and X87-FP math capabilities.
Useful in some 3D operations (Quaternions) complex arithmetic and video codec algorithms Useful in some 3D operations (Quaternions), complex arithmetic and video codec algorithms

SSSE3 2006 – Itl n e ® Core® 2 Processor

application performance improvement.
potential for specifc application domains

SSE4 2007

Efficient Accelerated String and Text Processing.

AVX(Advance vector extensions) 2011 - 2nd and 3rd generation Intel® Core™ Processors

Introduced 256 bit registers YMM
Flexible unaligned memory access support

AVX2 2013 - 4th generation Intel® Core™ Processors

Vector-vector shifts
Bit Manipulation Instructions (BMI)
AVX512 2016
Introduced 512 bit registers ZMM
512 bit vector instruction set extension

SIMD数据类型

64位SIMD数据

SIMD指令

Packed Arithmetic

paddb[w|d|q|sb|usb]

p packed
b byte; w 2bytes; d 4bytes; q 8bytes
u unsigned byte integers
s saturation

Comparison

Conversion

Data Transfer Instructions

Logic Instructions

Shift and Rotate

用来替代2的乘法和除法

shuffle

broadcast
reverse
rotate

常用汇编snippet

产生常数向量

优点，执行时间固定。从内存load常量数据，如果cache命中了数据，速度也不慢，如果数据不在cache中，需要从memory读入，速度就慢很多

pxor mm0, mm0   ; generate a zero register in MM0
pcmpeq mm1, mm1 ; Generate all 1's in register MM1, which is -1 in each of the packed data type fields

psubb mm0, mm1 [psubw mm0, mm1] (psubd mm0, mm1)
psrlw mm1, 16-n(psrld mm1, 32-n)

无符号数绝对值差

movq mm2, mm0     ; make a copy of mm0
psubusb mm0, mm1  ; compute difference one way
psubusb mm1, mm2  ; compute difference the other way
por mm0, mm1      ; OR them together

有符号数绝对值差

1 2	psubw xmm0, xmm1 ; subtract words pabsw xmm1, xmm0 ; results in XMM1

绝对值

1
2
3

pxor mm1, mm1
psubw mm1, mm0
pmaxsw mm1, mm0

以上取绝对值的算法优点在于，消除了比较判断，避免BTB预测失败造成的开销

16位扩展成32位

1
2
3

movdqa     xmm1, xmm0
punpcklwd  xmm0, xmm7 ; unpack the 4 low-end words into 4 32-bit doubleword
punpckhwd  xmm1, xmm7 ; unpackthe4high-endwords into 4 32-bit doublewords

数位的扩展和压缩是常见的操作，计算中间值往往超过原始像素8bit的数值范围

Cache

数据局部性

int colsarray_bad(int a[M][N])
{
  int i,j,sum=0;
  for(j = 0; j<N; i++) {
    for(i = 0; i<N; j++) {
      sum += a[i][j];
    }
  }
  return sum;
}

int colsarray_good(int a[M][N])
{
  int i,j,sum=0;
  for(i = 0; i<M; i++) {
    for(j = 0; j<N; j++) {
      sum += a[i][j];
    }
  }
  return sum;
}

hot-cold分离

Loop优化

将Loop无关代码移到Loop体外，避免冗余计算

for (int i = 0; i < strlen(str); i++>) {
;;;;
}

int len = strlen(str);
for (int i = 0; i < len; i++>) {
;;;;
}

Loop中避免判断

int i;
for (i = 0; i < 20; i++) {
  if (i % 2 == 0) {
    FuncA(i);
  } else {
    FuncB(i);
  }
  FuncC(i);
}

int i;
for (i = 0; i < 20; i += 2) {
  FuncA(i);
  FuncC(i);
  FuncB(i+1);
  FuncC(i+1);
}

循环展开

1 2	double list[100], sum = 0.; for (int i = 0; i < 100; i++) sum += list[i];

lea     esi,  list     ; list must be aligned by 16
movapd  xmm0, [esi]    ; list[0], list[1]
movapd  xmm1, [esi+16] ; list[2], list[3]
add   esi, 800         ; Point to end of list
mov eax, 32-800        ; Index to list[4] from end of list
L1:
addpd xmm0, [esi+eax]  ; Add list[i], list[i+1]
addpd xmm1, [esi+eax+16];Add list[i+2], list[i+3]
add eax,  32            ; i+=4
js
addpd xmm0, xmm1        ;Add the two accumulators together
movhlps xmm1, xmm0
addsd xmm0, xmm1        ;Add the two vector elements
movsd [sum], xmm0

问题指令集

XCHG 强制cpu之间同步
Division=慢
MASKMOV 跳过cache直接写memory
INC/DEC 不会修改carry flag

边界检查

float list[16];
int i;
....
list[i & 15] += 1.0f;

const int min = 100, max = 110; int i; ...
if (i >= min && i <= max)

if ((unsigned int)(i - min) <= (unsigned int)(max - min))

练习

1 2	float a; bool b; a = b ? 1.5f : 2.6f;

int list[300];
int i;
for(i=0; i<300; i++) {
  list[i] = i % 3;
}

1 2	float a, b; a = b * 1.2;

for(i=0; i<n; i++)
{
    if( (lpDRV->dpCAPS) & CLIPPING )
    {
        DoOneThing(i);
    }
    else if( (lpDRV->dpCAPS) & ALREADYCULLED )
    {
        DoSomethingElse(i);
    }
    else
    {
        DoYetAnotherThing(i);
    }
}


    if( (lpDRV->dpCAPS) & CLIPPING )
    {
        for(i=0; i<n; i++)
        {
          DoOneThing(i);
        }
    }
    else if( (lpDRV->dpCAPS) & ALREADYCULLED )
    {
        for(i=0; i<n; i++)
        {
          DoSomethingElse(i);
        }
    }
    else
    {
        for(i=0; i<n; i++)
        {
          DoYetAnotherThing(i);
        }
    }