[Theora-dev] MMX IDCT for theora-exp
Rudolf Marek
r.marek at sh.cvut.cz
Wed Jul 20 08:22:55 PDT 2005
Hello,
I'm attaching an MMX IDCT patch. I reused the IDCT from theora-a3-MMXd.zip.
It should work on 64-bit x86 platforms too.
Here are the most heavily used functions when playing a video of jet aircraft (Gripen):
Ogg logical stream 310b2968 is Theora 720x480 29.97 fps video
Encoded frame content is 720x480 with 0x0 offset
I can play this video with roughly 200-300 dropped frames on an Athlon XP 1700+.
CPU load (with music, while playing) ranges from 50% to 100%, mostly around 60%.
CPU: Athlon, speed 1466.91 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 400000
Counted DATA_CACHE_MISSES events (Data cache misses) with a unit mask of 0x00 (No unit mask) count 2000
samples % samples % image name symbol name
124337 22.0173 91089 23.4683 dump theora_decode_packetin
83446 14.7764 114246 29.4345 libc-2.3.2.so (no symbols)
74011 13.1057 33746 8.6944 dump oc_state_loop_filter_frag_rows
57706 10.2185 9204 2.3713 libogg.so.0.5.2 (no symbols)
39182 6.9383 10146 2.6140 dump oc_state_frag_recon_mmx
31095 5.5062 38650 9.9578 dump oc_frag_recon_inter2_mmx
24133 4.2734 12945 3.3352 dump oc_frag_pred_dc
22053 3.9051 11120 2.8650 dump oc_huff_token_decode
12497 2.2129 163 0.0420 dump oc_idct8x8_mmx
10376 1.8374 22113 5.6972 dump oc_frag_recon_inter_mmx
9553 1.6916 266 0.0685 dump oc_idct8x8_10_mmx
9489 1.6803 1123 0.2893 dump theora_look
9401 1.6647 3291 0.8479 dump oc_token_expand_run_cat1a
7733 1.3693 2563 0.6603 dump oc_token_expand_const
6488 1.1489 12456 3.2092 dump oc_state_frag_copy_mmx
5488 0.9718 3577 0.9216 dump oc_state_get_mv_offsets
5334 0.9445 2286 0.5890 dump oc_token_expand_run
3351 0.5934 1794 0.4622 dump anonymous symbol from section .plt
3158 0.5592 393 0.1013 dump oc_token_skip_val
2836 0.5022 786 0.2025 dump oc_token_skip_run_cat1a
2572 0.4554 982 0.2530 dump oc_token_expand_cat2
2087 0.3696 16 0.0041 dump oc_dct_token_skip
1884 0.3336 658 0.1695 dump oc_token_skip_eob
1788 0.3166 5600 1.4428 dump oc_frag_recon_intra_mmx
1542 0.2731 12 0.0031 dump oc_frag_recon_inter2
1514 0.2681 1578 0.4066 dump oc_vlc_mode_unpack
1441 0.2552 1055 0.2718 dump oc_state_frag_recon
1411 0.2499 382 0.0984 dump oc_token_skip_run
1275 0.2258 1598 0.4117 dump oc_dec_mb_modes_unpack
1008 0.1785 460 0.1185 dump oc_token_expand_cati
999 0.1769 1538 0.3963 dump oc_restore_fpu
884 0.1565 581 0.1497 dump oc_token_dec1val_const
614 0.1087 145 0.0374 dump oc_token_expand_zrl
543 0.0962 1 2.6e-04 dump oc_frag_recon_inter
509 0.0901 15 0.0039 dump oc_frag_recon_intra
470 0.0832 505 0.1301 dump oc_token_dec1val_cat2
445 0.0788 96 0.0247 dump oc_token_dec1val_zrl
344 0.0609 81 0.0209 dump oc_state_borders_fill_rows
324 0.0574 237 0.0611 dump oc_restore_fpu_mmx
260 0.0460 36 0.0093 dump main
218 0.0386 83 0.0214 dump stripe_decoded
213 0.0377 67 0.0173 dump oc_token_skip_zrl
212 0.0375 237 0.0611 dump oc_token_dec1val_cati
209 0.0370 57 0.0147 dump oc_sb_run_unpack
170 0.0301 11 0.0028 dump oc_vlc_mv_comp_unpack
30 0.0053 11 0.0028 dump oc_state_borders_fill_caps
26 0.0046 4 0.0010 dump oc_clc_mv_comp_unpack
12 0.0021 6 0.0015 dump oc_ycbcr_buffer_flip
11 0.0019 13 0.0033 dump theora_granule_time
10 0.0018 1 2.6e-04 dump oc_state_loop_filter_init
6 0.0011 100 0.0258 dump oc_state_frag_copy
6 0.0011 5 0.0013 dump oc_token_skip_eob6
5 8.9e-04 2 5.2e-04 dump oc_clc_mode_unpack
5 8.9e-04 5 0.0013 dump oc_state_init
3 5.3e-04 0 0 dump oc_set_chroma_mvs00
2 3.5e-04 0 0 dump oc_dequant_tables_init
2 3.5e-04 2 5.2e-04 ld-2.3.2.so (no symbols)
1 1.8e-04 0 0 dump oc_huff_tree_mindepth
1 1.8e-04 0 0 dump oc_huff_tree_occupancy
As you can see, the loop filter needs some work. After the loop filter optimizations I will take another look at
oc_state_frag_recon_mmx and oc_frag_recon_inter2_mmx, and maybe I can add quantization in MMX too.
Derf, do you have any objections against the patch?
Thanks
regards
Rudolf
-------------- next part --------------
diff -Naur a/lib/idct.c b/lib/idct.c
--- a/lib/idct.c 2005-07-20 11:39:30.355887750 +0200
+++ b/lib/idct.c 2005-07-20 11:25:26.451147000 +0200
@@ -208,7 +208,7 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients. */
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
const ogg_int16_t *in;
ogg_int16_t *end;
ogg_int16_t *out;
@@ -236,7 +236,7 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients. */
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
const ogg_int16_t *in;
ogg_int16_t *end;
ogg_int16_t *out;
diff -Naur a/lib/idct.h b/lib/idct.h
--- a/lib/idct.h 2005-07-20 11:39:30.383889500 +0200
+++ b/lib/idct.h 2005-07-20 11:25:36.019745000 +0200
@@ -3,7 +3,7 @@
#if !defined(_idct_H)
# define _idct_H (1)
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
#endif
diff -Naur a/lib/internal.h b/lib/internal.h
--- a/lib/internal.h 2005-07-20 11:39:30.355887750 +0200
+++ b/lib/internal.h 2005-07-20 11:46:53.083556500 +0200
@@ -239,6 +239,9 @@
void (*state_frag_copy)(const oc_theora_state *_state,
const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
void (*restore_fpu)(void);
+ void (*oc_state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
}oc_base_opt_vtable;
@@ -385,9 +388,6 @@
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offset0,
int *_offset1,int _dx,int _dy,int _ystride,int _pli);
-void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
@@ -421,5 +421,8 @@
void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
int _nfragis,int _dst_frame,int _src_frame,int _pli);
void oc_restore_fpu_c(void);
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
#endif
diff -Naur a/lib/Makefile.am b/lib/Makefile.am
--- a/lib/Makefile.am 2005-07-20 11:39:30.383889500 +0200
+++ b/lib/Makefile.am 2005-07-20 11:57:27.867228000 +0200
@@ -5,6 +5,7 @@
EXTRA_DIST = \
x86/cpu.c \
+ x86/mmxidct.c \
x86/mmxfrag.c \
x86/mmxstate.c \
x86/x86state.c
@@ -12,6 +13,7 @@
if OC_X86ASM
X86ASM_FILES = \
x86/cpu.c \
+ x86/mmxidct.c \
x86/mmxfrag.c \
x86/mmxstate.c \
x86/x86state.c
diff -Naur a/lib/state.c b/lib/state.c
--- a/lib/state.c 2005-07-20 11:39:30.351887500 +0200
+++ b/lib/state.c 2005-07-20 11:56:57.753346000 +0200
@@ -508,6 +508,7 @@
void oc_state_vtable_init_c(oc_theora_state *_state){
+ _state->opt_vtable.oc_state_frag_recon=oc_state_frag_recon_c;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
@@ -787,6 +788,14 @@
void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+_state->opt_vtable.oc_state_frag_recon(_state,_frag,_pli,_dct_coeffs,
+ _last_zzi,_ncoefs,_dc_iquant,
+ _ac_iquant);
+}
+
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
ogg_int16_t dct_buf[64];
ogg_int16_t res_buf[64];
int dst_framei;
@@ -837,11 +846,11 @@
the iDCT.*/
if(_last_zzi<10){
for(;zzi<10;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
- oc_idct8x8_10(res_buf,dct_buf);
+ oc_idct8x8_10_c(res_buf,dct_buf);
}
else{
for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
- oc_idct8x8(res_buf,dct_buf);
+ oc_idct8x8_c(res_buf,dct_buf);
}
}
/*Fill in the target buffer.*/
diff -Naur a/lib/x86/cpu.c b/lib/x86/cpu.c
--- a/lib/x86/cpu.c 2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/cpu.c 2005-07-20 11:43:38.231379000 +0200
@@ -13,6 +13,7 @@
********************************************************************/
#include "cpu.h"
+#include "x86int.h"
ogg_uint32_t oc_cpu_flags_get(void){
ogg_uint32_t eax;
@@ -95,5 +96,7 @@
/*Implement me.*/
flags=0;
}
+ /* Hack: fill the IDCT constants table here. */
+ fillidctconstants_mmx();
return flags;
}
diff -Naur a/lib/x86/mmxidct.c b/lib/x86/mmxidct.c
--- a/lib/x86/mmxidct.c 1970-01-01 01:00:00.000000000 +0100
+++ b/lib/x86/mmxidct.c 2005-07-20 16:49:45.187242000 +0200
@@ -0,0 +1,650 @@
+#include <ogg/ogg.h>
+#include "dct.h"
+#include "idct.h"
+
+#define ASM asm
+/* 4 masks come in order low word to high */
+#define MaskOffset 0
+/* 7 cosines come in order pi/16 * (1 ... 7) */
+#define CosineOffset 32
+#define EightOffset 88
+#define IdctAdjustBeforeShift 8
+
+/* idctconstants: 8-byte aligned table read by the MMX IDCT code below.
+   It holds (4+7+1) groups of four 16-bit words: 4 mask groups, 7 cosine
+   groups and one rounding group.  fillidctconstants_mmx() fills it in and
+   must be called before the table is used.
+   idctcosTbl: the 7 cosine constants that fillidctconstants_mmx() replicates
+   into the table (per the CosineOffset comment above they are in order
+   pi/16 * (1 ... 7); presumably scaled by 65536 - TODO confirm). */
+ogg_uint16_t __attribute__((aligned(8),used)) idctconstants[(4+7+1) * 4];
+ogg_uint16_t idctcosTbl[ 7] =
+{
+ 64277, 60547, 54491, 46341, 36410, 25080, 12785
+};
+
+/* I'm leaving the original VP3 table in place even though I'm not using all
+   of it. It can be used for MMX quantization later - Ruik */
+
+/* Populates idctconstants[]:
+     words  0..15  four 4-word masks, mask k having only word k set to 65535;
+     words 16..43  each of the 7 idctcosTbl[] entries repeated four times;
+     words 44..47  the rounding constant IdctAdjustBeforeShift. */
+void fillidctconstants_mmx(void)
+{
+  int grp;
+  int lane;
+
+  /* Mask group g (words 4*g .. 4*g+3) has 65535 in lane g and 0 elsewhere. */
+  for (grp = 0; grp < 4; grp++)
+    for (lane = 0; lane < 4; lane++)
+      idctconstants[4 * grp + lane] = (lane == grp) ? 65535 : 0;
+
+  /* The seven cosine groups follow directly after the four mask groups. */
+  for (grp = 0; grp < 7; grp++)
+    for (lane = 0; lane < 4; lane++)
+      idctconstants[4 * (grp + 4) + lane] = idctcosTbl[grp];
+
+  /* Last group: the term ColumnIDCT adds before its right shift. */
+  for (lane = 0; lane < 4; lane++)
+    idctconstants[44 + lane] = IdctAdjustBeforeShift;
+}
+
+
+#define MtoSTR(s) #s
+
+#define Dump "call MMX_dump\n"
+/* BeginIDCT: instruction string for the first stage shared by RowIDCT and
+   ColumnIDCT.  It is pasted into an ASM() statement, so the macros I(), J(),
+   C() and r0..r7 must be defined at the point of expansion.
+   Reads I(0)..I(3) and J(4)..J(7), multiplies by the C(1)..C(7) constants
+   with pmulhw, spills two intermediate values to I(1) and I(2), and reloads
+   I(1) into r0 at the end.  The including macro performs the final stage. */
+#define BeginIDCT "#BeginIDCT\n"\
+ \
+ " movq " I(3)","r2"\n" \
+ \
+ " movq " C(3)","r6"\n" \
+ " movq " r2","r4"\n" \
+ " movq " J(5)","r7"\n" \
+ " pmulhw " r6","r4"\n" \
+ " movq " C(5)","r1"\n" \
+ " pmulhw " r7","r6"\n" \
+ " movq " r1","r5"\n" \
+ " pmulhw " r2","r1"\n" \
+ " movq " I(1)","r3"\n" \
+ " pmulhw " r7","r5"\n" \
+ " movq " C(1)","r0"\n" \
+ " paddw " r2","r4"\n" \
+ " paddw " r7","r6"\n" \
+ " paddw " r1","r2"\n" \
+ " movq " J(7)","r1"\n" \
+ " paddw " r5","r7"\n" \
+ " movq " r0","r5"\n" \
+ " pmulhw " r3","r0"\n" \
+ " paddsw " r7","r4"\n" \
+ " pmulhw " r1","r5"\n" \
+ " movq " C(7)","r7"\n" \
+ " psubsw " r2","r6"\n" \
+ " paddw " r3","r0"\n" \
+ " pmulhw " r7","r3"\n" \
+ " movq " I(2)","r2"\n" \
+ " pmulhw " r1","r7"\n" \
+ " paddw " r1","r5"\n" \
+ " movq " r2","r1"\n" \
+ " pmulhw " C(2)","r2"\n" \
+ " psubsw " r5","r3"\n" \
+ " movq " J(6)","r5"\n" \
+ " paddsw " r7","r0"\n" \
+ " movq " r5","r7"\n" \
+ " psubsw " r4","r0"\n" \
+ " pmulhw " C(2)","r5"\n" \
+ " paddw " r1","r2"\n" \
+ " pmulhw " C(6)","r1"\n" \
+ " paddsw " r4","r4"\n" \
+ " paddsw " r0","r4"\n" \
+ " psubsw " r6","r3"\n" \
+ " paddw " r7","r5"\n" \
+ " paddsw " r6","r6"\n" \
+ " pmulhw " C(6)","r7"\n" \
+ " paddsw " r3","r6"\n" \
+ " movq " r4","I(1)"\n" \
+ " psubsw " r5","r1"\n" \
+ " movq " C(4)","r4"\n" \
+ " movq " r3","r5"\n" \
+ " pmulhw " r4","r3"\n" \
+ " paddsw " r2","r7"\n" \
+ " movq " r6","I(2)"\n" \
+ " movq " r0","r2"\n" \
+ " movq " I(0)","r6"\n" \
+ " pmulhw " r4","r0"\n" \
+ " paddw " r3","r5"\n" \
+ "\n"\
+ " movq " J(4)","r3"\n" \
+ " psubsw " r1","r5"\n" \
+ " paddw " r0","r2"\n" \
+ " psubsw " r3","r6"\n" \
+ " movq " r6","r0"\n" \
+ " pmulhw " r4","r6"\n" \
+ " paddsw " r3","r3"\n" \
+ " paddsw " r1","r1"\n" \
+ " paddsw " r0","r3"\n" \
+ " paddsw " r5","r1"\n" \
+ " pmulhw " r3","r4"\n" \
+ " paddsw " r0","r6"\n" \
+ " psubsw " r2","r6"\n" \
+ " paddsw " r2","r2"\n" \
+ " movq " I(1)","r0"\n" \
+ " paddsw " r6","r2"\n" \
+ " paddw " r3","r4"\n" \
+ " psubsw " r1","r2"\n" \
+ "#end BeginIDCT\n"
+// end BeginIDCT macro (38 cycles).
+/* RowIDCT: a complete ASM() statement made of BeginIDCT followed by the
+   final add/subtract stage.  Per the inline notes, results R0 and R3..R7 are
+   left in r0 and r3..r7 and R1 is stored to I(1); nothing else is written
+   back here, the Transpose macro that follows does the stores. */
+#define RowIDCT ASM("\n"\
+ "#RowIDCT\n"\
+ BeginIDCT \
+ "\n"\
+ " movq "I(2)","r3"\n" /* r3 = D. */ \
+ " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
+ " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
+ " paddsw "r7","r7"\n" /* r7 = G + G */ \
+ " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */ \
+ " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
+ " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "r3","r3"\n" \
+ " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */ \
+ " paddsw "r5","r5"\n" \
+ " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
+ " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */ \
+ " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "r0","r0"\n" \
+ " movq "r1","I(1)"\n" /* save R1 */ \
+ " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT"\
+);
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+
+/* Following macro does two 4x4 transposes in place.
+
+ At entry (we assume):
+
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
+
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
+
+ At exit, we have:
+
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
+
+ I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+ J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+
+ Since r1 is free at entry, we calculate the Js first. */
+
+
+/* Transpose: a complete ASM() statement implementing the two 4x4 transposes
+   described in the comment above.  r1 and the I(0) slot serve as scratch:
+   r0 is parked in I(0) while the J half is computed and is reloaded into r4
+   before the I half is processed. */
+#define Transpose ASM("\n#Transpose\n" \
+ \
+ " movq "r4","r1"\n" \
+ " punpcklwd "r5","r4"\n" \
+ " movq "r0","I(0)"\n" \
+ " punpckhwd "r5","r1"\n" \
+ " movq "r6","r0"\n" \
+ " punpcklwd "r7","r6"\n" \
+ " movq "r4","r5"\n" \
+ " punpckldq "r6","r4"\n" \
+ " punpckhdq "r6","r5"\n" \
+ " movq "r1","r6"\n" \
+ " movq "r4","J(4)"\n" \
+ " punpckhwd "r7","r0"\n" \
+ " movq "r5","J(5)"\n" \
+ " punpckhdq "r0","r6"\n" \
+ " movq "I(0)","r4"\n" \
+ " punpckldq "r0","r1"\n" \
+ " movq "I(1)","r5"\n" \
+ " movq "r4","r0"\n" \
+ " movq "r6","J(7)"\n" \
+ " punpcklwd "r5","r0"\n" \
+ " movq "r1","J(6)"\n" \
+ " punpckhwd "r5","r4"\n" \
+ " movq "r2","r5"\n" \
+ " punpcklwd "r3","r2"\n" \
+ " movq "r0","r1"\n" \
+ " punpckldq "r2","r0"\n" \
+ " punpckhdq "r2","r1"\n" \
+ " movq "r4","r2"\n" \
+ " movq "r0","I(0)"\n" \
+ " punpckhwd "r3","r5"\n" \
+ " movq "r1","I(1)"\n" \
+ " punpckhdq "r5","r4"\n" \
+ " punpckldq "r5","r2"\n" \
+ \
+ " movq "r4","I(3)"\n" \
+ \
+ " movq "r2","I(2)"\n" \
+ "#end Transpose\n"\
+);
+// end Transpose macro (19 cycles).
+/* ColumnIDCT: a complete ASM() statement made of BeginIDCT followed by the
+   final add/subtract stage with rounding.  The Eight constant is folded into
+   every result, each result is shifted right arithmetically by 4 (psraw $4)
+   and the eight results are stored to I(0)..I(3) and J(4)..J(7). */
+#define ColumnIDCT ASM("\n"\
+ "#ColumnIDCT\n"\
+ BeginIDCT \
+ "\n"\
+ " paddsw "Eight","r2"\n" \
+ " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
+ " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */ \
+ " psraw ""$4"","r2"\n" /* r2 = NR2 */ \
+ " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
+ " psraw ""$4"","r1"\n" /* r1 = NR1 */ \
+ " movq "I(2)","r3"\n" /* r3 = D. */ \
+ " paddsw "r7","r7"\n" /* r7 = G + G */ \
+ " movq "r2","I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
+ " movq "r1","I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight","r4"\n" \
+ " paddsw "r3","r3"\n" /* r3 = D. + D. */ \
+ " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
+ " psraw ""$4"","r4"\n" /* r4 = NR4 */ \
+ " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */ \
+ " psraw ""$4"","r3"\n" /* r3 = NR3 */ \
+ " paddsw "Eight","r6"\n" \
+ " paddsw "r5","r5"\n" /* r5 = B.. + B.. */ \
+ " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */ \
+ " psraw ""$4"","r6"\n" /* r6 = NR6 */ \
+ " movq "r4","J(4)"\n" /* store NR4 at J4 */ \
+ " psraw ""$4"","r5"\n" /* r5 = NR5 */ \
+ " movq "r3","I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight","r7"\n" \
+ " paddsw "r0","r0"\n" /* r0 = C. + C. */ \
+ " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
+ " psraw ""$4"","r7"\n" /* r7 = NR7 */ \
+ " movq "r6","J(6)"\n" /* store NR6 at J6 */ \
+ " psraw ""$4"","r0"\n" /* r0 = NR0 */ \
+ " movq "r5","J(5)"\n" /* store NR5 at J5 */ \
+ " movq "r7","J(7)"\n" /* store NR7 at J7 */ \
+ " movq "r0","I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT\n"\
+);
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+
+/* Debug helper: stores mm0..mm7 to the 64 bytes addressed by %edi.
+   It is meant to be reached from assembly through the Dump macro
+   ("call MMX_dump"), hence the explicit "ret" at the end of the asm.
+   NOTE(review): that "ret" bypasses whatever prologue/epilogue the compiler
+   emits for this function, and %edi is a 32-bit address register - confirm
+   both are acceptable before using this in unoptimized or x86-64 builds. */
+void MMX_dump()
+{
+ ASM
+ ("\
+ movq %mm0,(%edi)\n\
+ movq %mm1,8(%edi)\n\
+ movq %mm2,16(%edi)\n\
+ movq %mm3,24(%edi)\n\
+ movq %mm4,32(%edi)\n\
+ movq %mm5,40(%edi)\n\
+ movq %mm6,48(%edi)\n\
+ movq %mm7,56(%edi)\n\
+ ret"
+ );
+}
+/* Full 8x8 inverse DCT, performed in place on _y with MMX instructions.
+   Flow: RowIDCT+Transpose on the first 64 bytes of _y, RowIDCT+Transpose on
+   the second 64 bytes, then ColumnIDCT over all eight 16-byte rows at byte
+   offset 0 and again at byte offset 8, and finally emms.
+   The constants are read from idctconstants, so fillidctconstants_mmx() must
+   have been called beforehand.
+   NOTE(review): _y and idctconstants are placed in edx/ecx (rdx/rcx) by the
+   first asm statement, and the later ASM() statements use those registers
+   and mm0..mm7 without declaring any operands or clobbers.  This relies on
+   the compiler leaving the registers untouched between statements - confirm
+   that holds for the compilers and optimization levels in use. */
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+
+/* this routine accepts 8x8 matrix but in transposed form
+ every 4x4 submatrix is transposed */
+
+# define r0 "%mm0"
+# define r1 "%mm1"
+# define r2 "%mm2"
+# define r3 "%mm3"
+# define r4 "%mm4"
+# define r5 "%mm5"
+# define r6 "%mm6"
+# define r7 "%mm7"
+
+
+# undef M
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+ __asm__ __volatile__ ("\n"
+ "#lea %1,%%rcx\n"
+ :
+ : "d" (_y),
+ "c" (idctconstants)
+
+ );
+
+# define MIDM(M,I) MtoSTR(M+I*8(%rcx))
+# define M(I) MIDM( MaskOffset , I )
+# define MIDC(M,I) MtoSTR(M+(I-1)*8(%rcx))
+# define C(I) MIDC( CosineOffset , I )
+# define MIDEight(M) MtoSTR(M(%rcx))
+# define Eight MIDEight(EightOffset)
+
+# define I( K) MtoSTR((K*16)(%rdx))
+# define J( K) MtoSTR(((K - 4) * 16)+8(%rdx))
+
+
+#else
+ __asm__ __volatile__ ("\n"
+ "movl $idctconstants,%%ecx\n"
+ :
+ : "d" (_y)
+ : "ecx"
+ );
+
+# define MIDM(M,I) MtoSTR(M+I*8(%ecx))
+# define M(I) MIDM( MaskOffset , I )
+# define MIDC(M,I) MtoSTR(M+(I-1)*8(%ecx))
+# define C(I) MIDC( CosineOffset , I )
+# define MIDEight(M) MtoSTR(M(%ecx))
+# define Eight MIDEight(EightOffset)
+
+# define I( K) MtoSTR((K*16)(%edx))
+# define J( K) MtoSTR(((K - 4) * 16)+8(%edx))
+
+#endif
+
+ RowIDCT
+ Transpose
+
+# undef I
+# undef J
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define I( K) MtoSTR((K * 16)+64(%rdx))
+# define J( K) MtoSTR(((K - 4)*16)+72(%rdx))
+#else
+# define I( K) MtoSTR((K * 16)+64(%edx))
+# define J( K) MtoSTR(((K - 4)*16)+72(%edx))
+#endif
+ RowIDCT
+ Transpose
+
+# undef I
+# undef J
+
+
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define I( K) MtoSTR((K * 16)(%rdx))
+#else
+# define I( K) MtoSTR((K * 16)(%edx))
+#endif
+
+# define J( K) I( K)
+
+ ColumnIDCT
+
+# undef I
+# undef J
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define I( K) MtoSTR((K * 16)+8(%rdx))
+#else
+# define I( K) MtoSTR((K * 16)+8(%edx))
+
+#endif
+
+# define J( K) I( K)
+
+ ColumnIDCT
+
+# undef I
+# undef J
+
+ ASM("\n"
+ " emms\n"
+ );
+
+}
+
+
+/* BeginIDCT_10: reduced variant of BeginIDCT used by RowIDCT_10 and
+   ColumnIDCT_10.  It reads only I(0)..I(3) (no J() inputs) and, like
+   BeginIDCT, spills intermediates to I(1) and I(2) and reloads I(1) into r0
+   at the end.  I(), C() and r0..r7 must be defined where it is expanded. */
+#define BeginIDCT_10 "#BeginIDCT_10\n"\
+ " movq "I(3)","r2"\n" \
+ " nop\n" \
+\
+ " movq "C(3)","r6"\n" \
+ " movq "r2","r4"\n" \
+\
+ " movq "C(5)","r1"\n" \
+ " pmulhw "r6","r4"\n" \
+\
+ " movq "I(1)","r3"\n" \
+ " pmulhw "r2","r1"\n" \
+\
+ " movq "C(1)","r0"\n" \
+ " paddw "r2","r4"\n" \
+\
+ " pxor "r6","r6"\n" \
+ " paddw "r1","r2"\n" \
+\
+ " movq "I(2)","r5"\n" \
+ " pmulhw "r3","r0"\n" \
+\
+ " movq "r5","r1"\n" \
+ " paddw "r3","r0"\n" \
+\
+ " pmulhw "C(7)","r3"\n" \
+ " psubsw "r2","r6"\n" \
+\
+ " pmulhw "C(2)","r5"\n" \
+ " psubsw "r4","r0"\n" \
+\
+ " movq "I(2)","r7"\n"\
+ " paddsw "r4","r4"\n" \
+\
+ " paddw "r5","r7"\n" \
+ " paddsw "r0","r4"\n" \
+\
+ " pmulhw "C(6)","r1"\n" \
+ " psubsw "r6","r3"\n" \
+\
+ " movq "r4","I(1)"\n" \
+ " paddsw "r6","r6"\n" \
+\
+ " movq "C(4)","r4"\n" \
+ " paddsw "r3","r6"\n" \
+\
+ " movq "r3","r5"\n" \
+ " pmulhw "r4","r3"\n" \
+\
+ " movq "r6","I(2)"\n" \
+ " movq "r0","r2"\n" \
+\
+ " movq "I(0)","r6"\n" \
+ " pmulhw "r4","r0"\n" \
+\
+ " paddw "r3","r5"\n" \
+ " paddw "r0","r2"\n" \
+\
+ " psubsw "r1","r5"\n" \
+ " pmulhw "r4","r6"\n" \
+\
+ " paddw "I(0)","r6"\n" \
+ " paddsw "r1","r1"\n" \
+\
+ " movq "r6","r4"\n" \
+ " paddsw "r5","r1"\n" \
+\
+ " psubsw "r2","r6"\n" \
+ " paddsw "r2","r2"\n" \
+\
+ " movq "I(1)","r0"\n" \
+ " paddsw "r6","r2"\n" \
+\
+ " psubsw "r1","r2"\n" \
+ " nop\n" \
+ "#end BeginIDCT_10\n"
+// end BeginIDCT_10 macro (25 cycles).
+
+/* RowIDCT_10: a complete ASM() statement made of BeginIDCT_10 followed by
+   the same final add/subtract stage as RowIDCT.  Per the inline notes,
+   results R0 and R3..R7 are left in r0 and r3..r7 and R1 is stored to I(1);
+   the Transpose macro that follows does the remaining stores. */
+#define RowIDCT_10 ASM("\n"\
+ "#RowIDCT_10\n"\
+ BeginIDCT_10 \
+ "\n"\
+ " movq "I(2)","r3"\n" /* r3 = D. */ \
+ " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
+ " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
+ " paddsw "r7","r7"\n" /* r7 = G + G */ \
+ " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */ \
+ " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
+ " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "r3","r3"\n" \
+ " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */ \
+ " paddsw "r5","r5"\n" \
+ " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
+ " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */ \
+ " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "r0","r0"\n" \
+ " movq "r1","I(1)"\n" /* save R1 */ \
+ " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT_10\n"\
+);
+// end RowIDCT_10 macro (the "8 + 38 = 46 cycles" figure was copied from RowIDCT; BeginIDCT_10 is listed as 25 cycles)
+
+/* ColumnIDCT_10: a complete ASM() statement made of BeginIDCT_10 followed by
+   the same rounding final stage as ColumnIDCT: the Eight constant is folded
+   into every result, each result is shifted right arithmetically by 4 and
+   the eight results are stored to I(0)..I(3) and J(4)..J(7). */
+#define ColumnIDCT_10 ASM("\n"\
+ "#ColumnIDCT_10\n"\
+ BeginIDCT_10 \
+ "\n"\
+ " paddsw "Eight","r2"\n" \
+ " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
+ " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */ \
+ " psraw ""$4"","r2"\n" /* r2 = NR2 */ \
+ " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
+ " psraw ""$4"","r1"\n" /* r1 = NR1 */ \
+ " movq "I(2)","r3"\n" /* r3 = D. */ \
+ " paddsw "r7","r7"\n" /* r7 = G + G */ \
+ " movq "r2","I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
+ " movq "r1","I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight","r4"\n" \
+ " paddsw "r3","r3"\n" /* r3 = D. + D. */ \
+ " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
+ " psraw ""$4"","r4"\n" /* r4 = NR4 */ \
+ " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */ \
+ " psraw ""$4"","r3"\n" /* r3 = NR3 */ \
+ " paddsw "Eight","r6"\n" \
+ " paddsw "r5","r5"\n" /* r5 = B.. + B.. */ \
+ " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */ \
+ " psraw ""$4"","r6"\n" /* r6 = NR6 */ \
+ " movq "r4","J(4)"\n" /* store NR4 at J4 */ \
+ " psraw ""$4"","r5"\n" /* r5 = NR5 */ \
+ " movq "r3","I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight","r7"\n" \
+ " paddsw "r0","r0"\n" /* r0 = C. + C. */ \
+ " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
+ " psraw ""$4"","r7"\n" /* r7 = NR7 */ \
+ " movq "r6","J(6)"\n" /* store NR6 at J6 */ \
+ " psraw ""$4"","r0"\n" /* r0 = NR0 */ \
+ " movq "r5","J(5)"\n" /* store NR5 at J5 */ \
+ \
+ " movq "r7","J(7)"\n" /* store NR7 at J7 */ \
+ \
+ " movq "r0","I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT_10\n"\
+);
+// end ColumnIDCT_10 macro (the "38 + 19 = 57 cycles" figure was copied from ColumnIDCT; BeginIDCT_10 is listed as 25 cycles)
+
+
+/* Reduced 8x8 inverse DCT, performed in place on _y with MMX instructions.
+   oc_state_frag_recon_mmx() selects it when _last_zzi<10.
+   Flow: one RowIDCT_10+Transpose pass over the first 64 bytes of _y, then
+   ColumnIDCT_10 over the eight 16-byte rows at byte offset 0 and again at
+   byte offset 8, and finally emms.
+   The signature takes the single in-place buffer, matching the prototype in
+   x86int.h and the call site; the former second parameter was never read.
+   The constants are read from idctconstants, so fillidctconstants_mmx() must
+   have been called beforehand.
+   NOTE(review): as in oc_idct8x8_mmx, the pointer registers are loaded by
+   the first asm statement and then assumed to stay live across the following
+   ASM() statements, which declare no operands or clobbers - confirm. */
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64])
+{
+
+# define r0 "%mm0"
+# define r1 "%mm1"
+# define r2 "%mm2"
+# define r3 "%mm3"
+# define r4 "%mm4"
+# define r5 "%mm5"
+# define r6 "%mm6"
+# define r7 "%mm7"
+
+# undef M
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+# define M(I) MIDM( MaskOffset , I )
+# define MIDC(M,I) MtoSTR(M+(I-1)*8(%rcx))
+# define C(I) MIDC( CosineOffset , I )
+# define MIDEight(M) MtoSTR(M(%rcx))
+# define Eight MIDEight(EightOffset)
+
+ /* Bind _y to rdx and the constants table to rcx for the code below. */
+ __asm__ __volatile__ ("\n"
+ :
+ : "d" (_y),
+ "c" (idctconstants)
+
+ );
+
+# define I( K) MtoSTR((K*16)(%rdx))
+# define J( K) MtoSTR(((K - 4) * 16)+8(%rdx))
+
+#else
+# define M(I) MIDM( MaskOffset , I )
+# define MIDC(M,I) MtoSTR(M+(I-1)*8(%ecx))
+# define C(I) MIDC( CosineOffset , I )
+# define MIDEight(M) MtoSTR(M(%ecx))
+# define Eight MIDEight(EightOffset)
+
+ /* Bind _y to edx and load the constants table address into ecx. */
+ __asm__ __volatile__ ("\n"
+ "mov $idctconstants,%%ecx\n"
+ :
+ : "d" (_y)
+ : "ecx"
+ );
+
+# define I( K) MtoSTR((K*16)(%edx))
+# define J( K) MtoSTR(((K - 4) * 16)+8(%edx))
+
+#endif
+
+
+ /* Dequantization, descrambling and the partial transpose were done by the
+    caller; now do the IDCT itself. */
+
+
+ RowIDCT_10
+ Transpose
+
+# undef I
+# undef J
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define I( K) MtoSTR((K * 16)(%rdx))
+#else
+# define I( K) MtoSTR((K * 16)(%edx))
+#endif
+# define J( K) I( K)
+
+ ColumnIDCT_10
+
+# undef I
+# undef J
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define I( K) MtoSTR((K * 16)+8(%rdx))
+#else
+# define I( K) MtoSTR((K * 16)+8(%edx))
+#endif
+# define J( K) I( K)
+
+ ColumnIDCT_10
+
+# undef I
+# undef J
+
+ ASM("\n"
+ " emms\n"
+ );
+}
+
+
diff -Naur a/lib/x86/mmxstate.c b/lib/x86/mmxstate.c
--- a/lib/x86/mmxstate.c 2005-07-20 11:39:29.059806750 +0200
+++ b/lib/x86/mmxstate.c 2005-07-20 16:48:32.718713000 +0200
@@ -10,7 +10,171 @@
* *
********************************************************************
*/
+#include <ogg/ogg.h>
#include "x86int.h"
+#include "../internal.h"
+
+/* Maps a zig-zag index to the position in res_buf at which
+   oc_state_frag_recon_mmx() stores the dequantized coefficient, i.e. the
+   input layout expected by the MMX IDCT routines (each 4x4 sub-block
+   transposed, per the comment in oc_idct8x8_mmx).
+   Entries 64..127 all hold 64.
+   NOTE(review): 64 is one past the end of the 64-entry res_buf, so indices
+   of 64 and above must never be used to store into it; the only visible
+   user stops at zzi<_ncoefs. */
+static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[128]={
+0, 8, 1, 2, 9, 16, 24, 17,
+10, 3, 32, 11, 18, 25, 4, 12,
+5, 26, 19, 40, 33, 34, 41, 48,
+27, 6, 13, 20, 28, 21, 14, 7,
+56, 49, 42, 35, 43, 50, 57, 36,
+15, 22, 29, 30, 23, 44, 37, 58,
+51, 59, 38, 45, 52, 31, 60, 53,
+46, 39, 47, 54, 61, 62, 55, 63,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64
+};
+
+/*MMX variant of fragment reconstruction.
+  Dequantizes the fragment's coefficients into res_buf (in the layout given by
+   OC_FZIG_ZAGMMX), runs the in-place MMX inverse DCT on it, and passes the
+   residue to the intra/inter reconstruction routine selected by the
+   fragment's coding mode.
+  _state:      The decoder state (reference frame buffers and indices).
+  _frag:       The fragment to reconstruct; supplies dc, mbmode, mv and the
+                per-frame buffer pointers.
+  _pli:        The color plane index.
+  _dct_coeffs: The quantized coefficients.
+  _last_zzi:   The value of zzi before the final token of the block was
+                decoded (see the long note below).
+  _ncoefs:     The number of coefficients to dequantize.
+  _dc_iquant:  The DC dequantization factor.
+  _ac_iquant:  The AC dequantization factors.
+  The buffer fills are written in plain C: the earlier inline-assembly fills
+   modified their input registers without declaring it, and the x86-64
+   variant stored a 16-bit value with 64-bit stores, so the DC-only case did
+   not replicate the value into every entry.*/
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  ogg_int16_t __attribute__((aligned(8))) res_buf[64];
+  int dst_framei;
+  int dst_ystride;
+  int zzi;
+  int ci;
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     dequantize fewer coefficients and use a smaller transform when the block
+     ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    /*Why is the iquant product rounded in this case and no others?
+      Who knows.*/
+    p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /*Every residue value is the same, so no transform is needed.*/
+    for(ci=0;ci<64;ci++)res_buf[ci]=p;
+  }
+  else{
+    /*Start from an all-zero block so that every coefficient we do not
+       dequantize below is 0 when the iDCT runs.*/
+    for(ci=0;ci<64;ci++)res_buf[ci]=0;
+    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+    /*Dequantize the AC coefficients straight into the layout the MMX iDCT
+       expects.
+      NOTE(review): _dct_coeffs is indexed with ci (the de-zig-zagged index)
+       here; confirm this matches how oc_state_frag_recon_c indexes it.*/
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      ci=OC_FZIG_ZAG[zzi];
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=
+       (ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]);
+    }
+    /*Both transforms work in place on res_buf.*/
+    if(_last_zzi<10){
+      oc_idct8x8_10_mmx(res_buf);
+    }
+    else{
+      oc_idct8x8_mmx(res_buf);
+    }
+  }
+  /*Fill in the target buffer.*/
+  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  /*For now ystride values in all ref frames assumed to be equal.*/
+  if(_frag->mbmode==OC_MODE_INTRA){
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
+  }
+  else{
+    int ref_framei;
+    int ref_ystride;
+    int mvoffset0;
+    int mvoffset1;
+    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    /*A return value above 1 means two reference offsets were produced and
+       the two-reference reconstruction routine is used.*/
+    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+     _frag->mv[1],ref_ystride,_pli)>1){
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    }
+    else{
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+    }
+  }
+  oc_restore_fpu(_state);
+}
+
/*Copies the fragments specified by the lists of fragment indices from one
frame to another.
diff -Naur a/lib/x86/x86int.h b/lib/x86/x86int.h
--- a/lib/x86/x86int.h 2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/x86int.h 2005-07-20 13:30:16.883269750 +0200
@@ -3,7 +3,6 @@
# include "../internal.h"
void oc_state_vtable_init_x86(oc_theora_state *_state);
-
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
@@ -14,5 +13,11 @@
void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
int _nfragis,int _dst_frame,int _src_frame,int _pli);
void oc_restore_fpu_mmx(void);
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void fillidctconstants_mmx(void);
#endif
diff -Naur a/lib/x86/x86state.c b/lib/x86/x86state.c
--- a/lib/x86/x86state.c 2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/x86state.c 2005-07-20 11:32:49.138813250 +0200
@@ -4,6 +4,7 @@
void oc_state_vtable_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
+ _state->opt_vtable.oc_state_frag_recon=oc_state_frag_recon_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
More information about the Theora-dev
mailing list