[Flac-dev] An assembly optimization and fix
Miroslav Lichvar
lichvarm at phoenix.inf.upol.cz
Tue Sep 17 09:05:01 PDT 2002
I have optimized the FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
function and fixed a bug that occurred when data_len == 0. The function is now
about 50% faster, and flac -5 is about 5% faster on my box. I have tested it
thoroughly, and I think it can go into flac 1.0.4.
--
Miroslav Lichvar
-------------- next part --------------
--- src/libFLAC/ia32/fixed_asm.nasm.orig 2002-01-26 19:05:12.000000000 +0100
+++ src/libFLAC/ia32/fixed_asm.nasm 2002-09-17 16:19:08.000000000 +0200
@@ -76,107 +76,73 @@
push edi
sub esp, byte 16
; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
- ; dword [esp] == last_error_0
- ; dword [esp + 4] == last_error_1
- ; dword [esp + 8] == last_error_2
- ; dword [esp + 12] == last_error_3
- ; eax == error
; ebx == &data[i]
; ecx == loop counter (i)
- ; edx == temp
- ; edi == save
; ebp == order
; mm0 == total_error_1:total_error_0
- ; mm1 == total_error_3:total_error_2
- ; mm2 == 0:total_error_4
- ; mm3/4 == 0:unpackarea
- ; mm5 == abs(error_1):abs(error_0)
- ; mm5 == abs(error_3):abs(error_2)
+ ; mm1 == total_error_2:total_error_3
+ ; mm2 == :total_error_4
+ ; mm3 == last_error_1:last_error_0
+ ; mm4 == last_error_2:last_error_3
- pxor mm0, mm0 ; total_error_1 = total_error_0 = 0
- pxor mm1, mm1 ; total_error_3 = total_error_2 = 0
- pxor mm2, mm2 ; total_error_4 = 0
- mov ebx, [esp + 36] ; ebx = data[]
- mov ecx, [ebx - 4] ; ecx == data[-1] last_error_0 = data[-1]
- mov eax, [ebx - 8] ; eax == data[-2]
- mov ebp, [ebx - 16] ; ebp == data[-4]
- mov ebx, [ebx - 12] ; ebx == data[-3]
- mov edx, ecx
- sub edx, eax ; last_error_1 = data[-1] - data[-2]
- mov esi, edx
- sub esi, eax
- add esi, ebx ; last_error_2 = last_error_1 - (data[-2] - data[-3])
- shl ebx, 1
- mov edi, esi
- sub edi, eax
- add edi, ebx
- sub edi, ebp ; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
- mov ebx, [esp + 36] ; ebx = data[]
- mov [esp], ecx ; [esp] = last_error_0
- mov [esp + 4], edx ; [esp + 4] = last_error_1
- mov [esp + 8], esi ; [esp + 8] = last_error_2
- mov [esp + 12], edi ; [esp + 12] = last_error_3
mov ecx, [esp + 40] ; ecx = data_len
+ test ecx, ecx
+ jz near .data_len_is_0
+
+ mov ebx, [esp + 36] ; ebx = data[]
+ movd mm3, [ebx - 4] ; mm3 = 0:last_error_0
+ movd mm2, [ebx - 8] ; mm2 = 0:data[-2]
+ movd mm1, [ebx - 12] ; mm1 = 0:data[-3]
+ movd mm0, [ebx - 16] ; mm0 = 0:data[-4]
+ movq mm5, mm3 ; mm5 = 0:last_error_0
+ psubd mm5, mm2 ; mm5 = 0:last_error_1
+ punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_0
+ psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3]
+ psubd mm5, mm2 ; mm5 = 0:last_error_2
+ movq mm4, mm5 ; mm4 = 0:last_error_2
+ psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-2] - data[-3])
+ paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
+ psubd mm4, mm0 ; mm4 = 0:last_error_3
+ punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_3
+ pxor mm0, mm0 ; mm0 = total_error_1:total_error_0
+ pxor mm1, mm1 ; mm1 = total_error_2:total_error_3
+ pxor mm2, mm2 ; mm2 = 0:total_error_4
- ; for(i = 0; i < data_len; i++) {
- ; error_0 = data[i] ; save = error_0; total_error_0 += local_abs(error_0);
- ; error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1);
- ; error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2);
- ; error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3);
- ; error_4 -= last_error_3; last_error_3 = save; total_error_4 += local_abs(error_4);
- ; }
ALIGN 16
.loop:
- mov eax, [ebx] ; eax = error_0 = data[i]
- add ebx, 4
- mov edi, eax ; edi == save = error_0
- mov edx, eax ; edx = error_0
- neg edx ; edx = -error_0
- cmovns eax, edx ; eax = abs(error_0)
- movd mm5, eax ; mm5 = 0:abs(error_0)
- mov edx, [esp] ; edx = last_error_0
- mov eax, edi ; eax = error(error_0)
- mov [esp], edi ; [esp] == last_error_0 = save
- sub eax, edx ; error -= last_error_0
- mov edi, eax ; edi == save = error_1
- mov edx, eax ; edx = error_1
- neg edx ; edx = -error_1
- cmovns eax, edx ; eax = abs(error_1)
- movd mm4, eax ; mm4 = 0:abs(error_1)
- punpckldq mm5, mm4 ; mm5 = abs(error_1):abs(error_0)
- mov edx, [esp + 4] ; edx = last_error_1
- mov eax, edi ; eax = error(error_1)
- mov [esp + 4], edi ; [esp + 4] == last_error_1 = save
- sub eax, edx ; error -= last_error_1
- mov edi, eax ; edi == save = error_2
- mov edx, eax ; edx = error_2
- paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0)
- neg edx ; edx = -error_2
- cmovns eax, edx ; eax = abs(error_2)
- movd mm5, eax ; mm5 = 0:abs(error_2)
- mov edx, [esp + 8] ; edx = last_error_2
- mov eax, edi ; eax = error(error_2)
- mov [esp + 8], edi ; [esp + 8] == last_error_2 = save
- sub eax, edx ; error -= last_error_2
- mov edi, eax ; edi == save = error_3
- mov edx, eax ; edx = error_3
- neg edx ; edx = -error_3
- cmovns eax, edx ; eax = abs(error_3)
- movd mm4, eax ; mm4 = 0:abs(error_3)
- punpckldq mm5, mm4 ; mm5 = abs(error_3):abs(error_2)
- mov edx, [esp + 12] ; edx = last_error_3
- mov eax, edi ; eax = error(error_3)
- mov [esp + 12], edi ; [esp + 12] == last_error_3 = save
- sub eax, edx ; error -= last_error_3
- mov edx, eax ; edx = error_4
- paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2)
- neg edx ; edx = -error_4
- cmovns eax, edx ; eax = abs(error_4)
- movd mm5, eax ; mm5 = 0:abs(error_4)
- paddd mm2, mm5 ; total_error_4 += abs(error_4)
+ movd mm7, [ebx] ; mm7 = 0:error_0
+ add ebx, byte 4
+ movq mm6, mm7 ; mm6 = 0:error_0
+ psubd mm7, mm3 ; mm7 = :error_1
+ punpckldq mm6, mm7 ; mm6 = error_1:error_0
+ movq mm5, mm6 ; mm5 = error_1:error_0
+ movq mm7, mm6 ; mm7 = error_1:error_0
+ psubd mm5, mm3 ; mm5 = error_2:
+ movq mm3, mm6 ; mm3 = error_1:error_0
+ psrad mm6, 31
+ pxor mm7, mm6
+ psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0)
+ paddd mm0, mm7 ; mm0 = total_error_1:total_error_0
+ movq mm6, mm5 ; mm6 = error_2:
+ psubd mm5, mm4 ; mm5 = error_3:
+ punpckhdq mm5, mm6 ; mm5 = error_2:error_3
+ movq mm7, mm5 ; mm7 = error_2:error_3
+ movq mm6, mm5 ; mm6 = error_2:error_3
+ psubd mm5, mm4 ; mm5 = :error_4
+ movq mm4, mm6 ; mm4 = error_2:error_3
+ psrad mm6, 31
+ pxor mm7, mm6
+ psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3)
+ paddd mm1, mm7 ; mm1 = total_error_2:total_error_3
+ movq mm6, mm5 ; mm6 = :error_4
+ psrad mm5, 31
+ pxor mm6, mm5
+ psubd mm6, mm5 ; mm6 = :abs(error_4)
+ paddd mm2, mm6 ; mm2 = :total_error_4
+
dec ecx
- jnz near .loop
+ jnz short .loop
; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
; order = 0;
@@ -188,56 +154,42 @@
; order = 3;
; else
; order = 4;
+ movq mm3, mm0 ; mm3 = total_error_1:total_error_0
movd edi, mm2 ; edi = total_error_4
- movq mm4, mm1 ; mm4 = total_error_3:total_error_2
- psrlq mm4, 32 ; mm4 = 0:total_error_3
+ movd esi, mm1 ; esi = total_error_3
+ movd eax, mm0 ; eax = total_error_0
+ punpckhdq mm1, mm1 ; mm1 = total_error_2:total_error_2
+ punpckhdq mm3, mm3 ; mm3 = total_error_1:total_error_1
movd edx, mm1 ; edx = total_error_2
- movd esi, mm4 ; esi = total_error_3
- movq mm3, mm0 ; mm3 = total_error_1:total_error_0
- psrlq mm3, 32 ; mm3 = 0:total_error_1
- movd ebx, mm0 ; ebx = total_error_0
movd ecx, mm3 ; ecx = total_error_1
- emms
- mov eax, ebx ; eax = total_error_0
- cmp ecx, ebx
+
+ xor ebx, ebx
+ xor ebp, ebp
+ inc ebx
+ cmp ecx, eax
cmovb eax, ecx ; eax = min(total_error_0, total_error_1)
+ cmovbe ebp, ebx
+ inc ebx
cmp edx, eax
cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2)
+ cmovbe ebp, ebx
+ inc ebx
cmp esi, eax
cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
+ cmovbe ebp, ebx
+ inc ebx
cmp edi, eax
cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
+ cmovbe ebp, ebx
+ movd ebx, mm0 ; ebx = total_error_0
+ emms
- cmp eax, ebx
- jne .not_order_0
- xor ebp, ebp
- jmp short .got_order
-.not_order_0:
- cmp eax, ecx
- jne .not_order_1
- mov ebp, 1
- jmp short .got_order
-.not_order_1:
- cmp eax, edx
- jne .not_order_2
- mov ebp, 2
- jmp short .got_order
-.not_order_2:
- cmp eax, esi
- jne .not_order_3
- mov ebp, 3
- jmp short .got_order
-.not_order_3:
- mov ebp, 4
-.got_order:
; residual_bits_per_sample[0] = (FLAC__real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[1] = (FLAC__real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[2] = (FLAC__real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[3] = (FLAC__real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[4] = (FLAC__real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
xor eax, eax
- cmp eax, [esp + 40]
- je near .data_len_is_0
fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs)
.rbps_0:
test ebx, ebx
@@ -321,9 +273,14 @@
jmp short .end
.data_len_is_0:
; data_len == 0, so residual_bits_per_sample[*] = 0.0
- mov ecx, 5 ; eax still == 0, ecx = # of dwords of 0 to store
+ xor ebp, ebp
mov edi, [esp + 44]
- rep stosd
+ mov [edi], ebp
+ mov [edi + 4], ebp
+ mov [edi + 8], ebp
+ mov [edi + 12], ebp
+ mov [edi + 16], ebp
+ add ebp, byte 4 ; order = 4
.end:
mov eax, ebp ; return order
More information about the Flac-dev
mailing list