Optimizing C63 for x86
Group 9
Outline
Bird’s-eye view: gprof of reference encoder
Optimizing SAD
Results
gprof: reference, foreman

Each sample counts as 0.01 seconds.
  %   cumulative   self               self     total
 time   seconds   seconds      calls  us/call  us/call  name
93.05     37.36    37.36  500214528     0.07     0.07  sad_block_8x8
 3.04     38.58     1.22     705672     1.73    54.67  me_block_8x8
 1.77     39.29     0.71     712800     1.00     1.00  dequant_idct_block_8x8
 1.25     39.79     0.50     712800     0.70     0.70  dct_quant_block_8x8
 0.35     39.93     0.14     713100     0.20     0.29  flush_bits
 0.17     40.00     0.07   16398942     0.00     0.00  put_bits
granularity: each sample hit covers 2 byte(s) for 0.02% of 40.16 seconds
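For context, the function at the top of the profile is essentially a plain double loop over an 8x8 block. The sketch below is only an illustration of such a scalar baseline; the exact reference implementation in the C63 handout may differ, but it shows why sad_block_8x8 dominates when it is called roughly 500 million times.

#include <stdint.h>
#include <stdlib.h>

/* Scalar 8x8 SAD sketch (illustrative, not the verbatim reference code) */
static void sad_block_8x8_scalar(uint8_t *block1, uint8_t *block2,
                                 int stride, int *result)
{
  int u, v, sum = 0;

  for (v = 0; v < 8; ++v)          /* 8 rows */
    for (u = 0; u < 8; ++u)        /* 8 pixels per row */
      sum += abs(block1[v*stride + u] - block2[v*stride + u]);

  *result = sum;
}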
Optimizing SAD
#include <stdint.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

void sad_block_8x8(uint8_t *block1, uint8_t *block2, int stride, int *result)
{
  int v;
  __m128i r = _mm_setzero_si128();

  for (v = 0; v < 8; v += 2)
  {
    /* Pack two 8-byte rows of each block into one 128-bit register */
    const __m128i b1 = _mm_set_epi64(*(__m64 *) &block1[(v+0)*stride],
                                     *(__m64 *) &block1[(v+1)*stride]);
    const __m128i b2 = _mm_set_epi64(*(__m64 *) &block2[(v+0)*stride],
                                     *(__m64 *) &block2[(v+1)*stride]);

    /* PSADBW: one SAD per 8-byte half, accumulated in r */
    r = _mm_add_epi16(r, _mm_sad_epu8(b2, b1));
  }

  /* The two partial sums end up in 16-bit lanes 0 and 4 */
  *result =
    _mm_extract_epi16(r, 0) +
    _mm_extract_epi16(r, 4);
}
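A quick way to sanity-check the intrinsics is to compare them against the scalar sketch shown earlier on random data. The harness below is illustrative only and assumes both functions from this transcript are compiled into the same file.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Illustrative test harness: compare the SSE2 SAD against the scalar sketch
 * on random 8x8 blocks. */
int main(void)
{
  enum { STRIDE = 16 };
  uint8_t a[8 * STRIDE], b[8 * STRIDE];
  int i, t;

  for (t = 0; t < 1000; ++t)
  {
    int simd, scalar;

    for (i = 0; i < 8 * STRIDE; ++i) { a[i] = rand() & 0xff; b[i] = rand() & 0xff; }

    sad_block_8x8(a, b, STRIDE, &simd);
    sad_block_8x8_scalar(a, b, STRIDE, &scalar);

    if (simd != scalar) { printf("mismatch: %d vs %d\n", simd, scalar); return 1; }
  }

  printf("SSE2 SAD matches scalar SAD\n");
  return 0;
}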
How well does this perform?

SAD SSE2 (PSADBW)
Gprof: improved SSE2 SAD uses 3.69 s vs. 38.50 s*
Cachegrind: lots of branch prediction misses in me_block_8x8

*) Foreman sequence on gpu-7
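The branch misses come from the search loop around the SAD call. The sketch below is a hedged illustration of such a full-search loop; the real me_block_8x8 in the C63 handout differs in details such as range clamping and motion-vector bookkeeping. The point is the data-dependent "new best?" compare taken once per candidate, which the branch predictor handles poorly.

#include <limits.h>
#include <stdint.h>

/* Illustrative full-search sketch around sad_block_8x8 (not the original
 * me_block_8x8). The data-dependent branch runs once per candidate. */
static void me_block_sketch(uint8_t *orig, uint8_t *ref, int stride,
                            int range, int *mv_x, int *mv_y)
{
  int x, y, best_sad = INT_MAX;

  for (y = -range; y < range; ++y)
  {
    for (x = -range; x < range; ++x)
    {
      int sad;
      sad_block_8x8(orig, ref + y*stride + x, stride, &sad);

      if (sad < best_sad)        /* hard-to-predict branch */
      {
        best_sad = sad;
        *mv_x = x;
        *mv_y = y;
      }
    }
  }
}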
#include <stdint.h>
#include <smmintrin.h>   /* SSE4.1 intrinsics */

/* Computes the SADs of one 8x8 source block against the 8 reference blocks
 * at horizontal offsets 0..7, then picks the minimum. */
void sad_block_8x8x8(uint8_t *block1, uint8_t *block2, int stride, int *best,
                     int *result)
{
  int v;
  __m128i r = _mm_setzero_si128();
  union {
    __m128i v;
    struct {
      uint16_t sad;             /* PHMINPOSUW: minimum value in bits 15:0 */
      unsigned int index : 3;   /* ...and its lane index in bits 18:16    */
    } minpos;
  } mp;

  for (v = 0; v < 8; v += 2) {
    /* Two source rows packed into one register (row v low, row v+1 high) */
    const __m128i b1 = _mm_set_epi64(*(__m64 *) &block1[(v+1)*stride],
                                     *(__m64 *) &block1[(v+0)*stride]);
    /* 16 reference bytes per row cover all 8 candidate x offsets */
    const __m128i b2 = _mm_loadu_si128((__m128i *) &block2[(v+0)*stride]);
    const __m128i b3 = _mm_loadu_si128((__m128i *) &block2[(v+1)*stride]);

    /* MPSADBW: 8 partial SADs per call; the immediate selects the 4-byte
     * source group and the starting offset within the reference row */
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b2, b1, 0b000));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b2, b1, 0b101));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b3, b1, 0b010));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b3, b1, 0b111));
  }

  mp.v = _mm_minpos_epu16(r);
  *result = mp.minpos.sad;
  *best   = mp.minpos.index;
}
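For reference, the scalar code below spells out what the MPSADBW/PHMINPOSUW version computes: the SADs of one source block against the 8 reference blocks at horizontal offsets 0..7, returning the smallest SAD and its offset. It is an illustrative equivalent, useful for validating the intrinsics, and not part of the original encoder.

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of sad_block_8x8x8 (illustrative check): try 8
 * horizontal candidate offsets and keep the best. */
static void sad_block_8x8x8_scalar(uint8_t *block1, uint8_t *block2,
                                   int stride, int *best, int *result)
{
  int x, u, v, min_sad = 1 << 30, min_x = 0;

  for (x = 0; x < 8; ++x)                     /* candidate x offsets 0..7 */
  {
    int sad = 0;
    for (v = 0; v < 8; ++v)
      for (u = 0; u < 8; ++u)
        sad += abs(block1[v*stride + u] - block2[v*stride + x + u]);

    if (sad < min_sad) { min_sad = sad; min_x = x; }
  }

  *result = min_sad;   /* smallest of the 8 SADs */
  *best   = min_x;     /* its horizontal offset  */
}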
How well does this perform?

SAD SSE4.1 (MPSADBW + PHMINPOSUW)
Gprof: improved SSE4.1 SAD uses 0.90 s vs. 3.69 s
Cachegrind: branch prediction misses reduced by a factor of 8
Intel’s IACA tool: the CPU pipeline appears to be filled!
Both source and reference block loads appear to compete with (V)MPSADBW for the CPU’s execution ports

Assembly (next step):
Better utilize AVX’s non-destructive instructions (fewer register copies)
Better reuse loaded data for SAD computations that share the same source block
Load source block once:

.macro .load_src
    vmovq   (%rdi),           %xmm0          # src[0]
    vpinsrq $1, (%rdi,%rdx),  %xmm0, %xmm0   # src[1]
    vmovq   (%rdi,%rdx,2),    %xmm1          # src[2]
    vmovhps (%rdi,%r8),       %xmm1, %xmm1   # src[3]
    vmovq   (%rdi,%rdx,4),    %xmm2          # src[4]
    vmovhps (%rdi,%r9),       %xmm2, %xmm2   # src[5]
    vmovq   (%rdi,%r8,2),     %xmm3          # src[6]
    vmovhps (%rdi,%rax),      %xmm3, %xmm3   # src[7]
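For readers more comfortable with intrinsics, the sketch below shows roughly what .load_src does: pack the eight 8-byte source rows into four XMM registers so the source block is loaded only once per search. It is illustrative only, not code from the project; in the assembly, %rdx holds the stride, and %r8, %r9 and %rax apparently hold stride multiples (3x, 5x, 7x), judging by the src[3]/src[5]/src[7] comments.

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>

/* Illustrative intrinsics version of .load_src: eight 8-byte source rows
 * packed into four 128-bit registers, loaded once per search. */
typedef struct { __m128i r01, r23, r45, r67; } src_block;

static src_block load_src_block(const uint8_t *src, ptrdiff_t stride)
{
  src_block s;
  /* _mm_loadl_epi64 reads one 8-byte row; _mm_unpacklo_epi64 glues two
   * rows into one register, like the vmovq + vpinsrq/vmovhps pairs above */
  s.r01 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(src + 0*stride)),
                             _mm_loadl_epi64((const __m128i *)(src + 1*stride)));
  s.r23 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(src + 2*stride)),
                             _mm_loadl_epi64((const __m128i *)(src + 3*stride)));
  s.r45 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(src + 4*stride)),
                             _mm_loadl_epi64((const __m128i *)(src + 5*stride)));
  s.r67 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(src + 6*stride)),
                             _mm_loadl_epi64((const __m128i *)(src + 7*stride)));
  return s;
}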
Do SAD for 8x8 - 8x8 blocks (relative y = 0…8, x = 0…8):

.macro .8x8x8x8
    vmovdqu (%rsi),      %xmm12   # ref[0]
    .8x8x1  0, 0, 12, 4 0
    vmovdqu (%rsi,%rdx), %xmm13   # ref[1]
    .8x8x1  0, 1, 13, 4 0
    .8x8x1  1, 0, 13, 5 0
How well does this perform?

SAD 8x8x8x8: fewer source loads and branches
Gprof: improved SAD 8x8x8x8 uses 0.53 s vs. 0.90 s
Valgrind: even less branching
sad_4x8x8x8x8:
    .load_src                     # Load source block from %rdi
    mov     %rsi, %rdi
    .8x8x8x8                      # Reference block x, y
    lea     8(%rdi), %rsi
    .8x8x8x8                      # Reference block x+8, y
    lea     (%rdi,%rdx,8), %rsi
    .8x8x8x8
    lea     8(%rdi,%rdx,8), %rsi  #
    .8x8x8x8
    …
    vphminposuw …
    …
    ret
How well does this perform?

SAD 4x8x8x8x8: branchless UV-plane ME
Gprof: improved 4x8x8x8x8 SAD uses 0.47 s vs. 0.53 s
Valgrind: even less branching!
Total runtime of the Foreman sequence reduced from ~40.1 s to ~1.6 s (a factor of 25)
SAD assembly code: iterative improvement

Instruction   Function variant    Cycle mean, adjusted for single 8x8 block
MPSADBW       SAD 8x 8x8          7.5
VMPSADBW      SAD 2x8x 8x8        3.6
              SAD 4x8x 8x8        4.3
              SAD 8x8x 8x8 v1     2.9
              SAD 8x8x 8x8 v2     2.8
Future research (even less branching?):
              SAD 4x 8x8x 8x8     2.7
              SAD 16x8x8x8x8      2.6?
Image quality: comparable

                    PSNR Mean   PSNR 95%   SSIM Mean   SSIM 95%
Tractor reference   33.13045    34.46549   0.96144     0.97672
Tractor             33.12339    34.46587   0.96146     0.97672
Foreman reference   33.46215    35.36280   0.95153     0.96476
Foreman             33.44806    35.36217   0.95139     0.96436
gprof: reference, tractor 50 frames (-O3)
NOT ON NDLAB!

Each sample counts as 0.01 seconds.
  %   cumulative   self                self     total
 time   seconds   seconds       calls   s/call   s/call  name
92.90    124.66   124.66  1809584896     0.00     0.00  sad_block_8x8
 3.06    128.76     4.10          49     0.08     2.63  c63_motion_estimate
 1.58    130.89     2.13     2448000     0.00     0.00  dct_quant_block_8x8
 1.58    133.01     2.13     2448000     0.00     0.00  dequant_idct_block_8x8
 0.26    133.36     0.35     2448000     0.00     0.00  write_block
 0.18    133.60     0.24         150     0.00     0.02  dequantize_idct
 0.16    133.82     0.22          49     0.00     0.00  c63_motion_compensate
 0.13    133.99     0.17         150     0.00     0.02  dct_quantize
 0.11    134.14     0.15    55939437     0.00     0.00  put_bits
granularity: each sample hit covers 2 byte(s) for 0.01% of 134.21 seconds
gprof: improved, tractor 50 frames (-O3)
NOT ON NDLAB!

Each sample counts as 0.01 seconds.
  %   cumulative   self               self     total
 time   seconds   seconds      calls  ms/call  ms/call  name
31.15      1.83     1.83    6869016     0.00     0.00  sad_4x8x8x8x8
24.75      3.28     1.45    2448000     0.00     0.00  dct_quant_block_8x8
23.81      4.67     1.40    2448000     0.00     0.00  dequant_idct_block_8x8
 7.17      5.09     0.42         50     8.40    12.20  write_frame
 4.44      5.35     0.26        150     1.73    11.34  dequantize_idct
 3.24      5.54     0.19   55937692     0.00     0.00  put_bits
 1.71      5.64     0.10        150     0.67    10.64  dct_quantize
 1.54      5.73     0.09    9792000     0.00     0.00  transpose_block_avx
 0.85      5.78     0.05     798700     0.00     0.00  sad_8x8x8x8
 0.68      5.82     0.04         49     0.82    39.09  c63_motion_estimate
 0.51      5.85     0.03         49     0.61     0.61  c63_motion_compensate
granularity: each sample hit covers 2 byte(s) for 0.17% of 5.86 seconds

134.2 s / 5.86 s ≈ 24x speedup