[xiph-commits] r16954 - in branches/theorarm-merge-branch: . lib lib/arm
robin at svn.xiph.org
Sun Mar 7 09:59:26 PST 2010
Author: robin
Date: 2010-03-07 09:59:26 -0800 (Sun, 07 Mar 2010)
New Revision: 16954
Added:
branches/theorarm-merge-branch/lib/arm/
branches/theorarm-merge-branch/lib/arm/ARMdecode.s
branches/theorarm-merge-branch/lib/arm/ARMfrag.s
branches/theorarm-merge-branch/lib/arm/ARMidct.s
branches/theorarm-merge-branch/lib/arm/ARMint.h
branches/theorarm-merge-branch/lib/arm/ARMoffsets.s
branches/theorarm-merge-branch/lib/arm/ARMoptions.s
branches/theorarm-merge-branch/lib/arm/ARMpp.s
branches/theorarm-merge-branch/lib/arm/ARMstate.c
Modified:
branches/theorarm-merge-branch/COPYING
branches/theorarm-merge-branch/lib/Makefile.am
branches/theorarm-merge-branch/lib/decode.c
branches/theorarm-merge-branch/lib/internal.c
branches/theorarm-merge-branch/lib/internal.h
branches/theorarm-merge-branch/lib/state.c
Log:
First drop of Theorarm code, tidied up a bit to be more suitable for merging.
This code is still under GNU GPL, not BSD, but this is anticipated to change
within a month or so.
This merged version of the code is currently untested, as I haven't got
cross-compiling working with the standard libTheora makefiles yet, but it
is (I hope) a reasonable first step.
The ARM code is dropped in pretty much unchanged from Theorarm; the C has
been tweaked a bit. The bitstream handling code from Theorarm is not in this
code yet, as libTheora is written to work with the current libOgg, whereas my
bitreading code is written to work with the version of libOgg present in
Tremor/Tremolo.
The C changes here make sense for ARM, but may not be ideal for other
architectures - additional work will be required to resolve this. It
may be that we need to discard some of these changes, or fork routines
for different architectures.
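
For reference, the unpack routines in ARMdecode.s below are written against
libTheora's bit-reader interface (the oc_pack_* symbols they IMPORT). Here is
a minimal sketch of that contract, assuming the signatures from libTheora's
bitpack.h; the Tremor/Tremolo-style reader mentioned above differs in the
oc_pack_buf internals, not in this interface:

/* Bit-reader contract assumed by ARMdecode.s; sketch only, see bitpack.h. */
typedef struct oc_pack_buf oc_pack_buf;
long oc_pack_look(oc_pack_buf *_b,int _bits); /* peek at _bits, don't consume */
void oc_pack_adv(oc_pack_buf *_b,int _bits);  /* consume _bits */
int  oc_pack_read1(oc_pack_buf *_b);          /* read a single bit */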
Modified: branches/theorarm-merge-branch/COPYING
===================================================================
--- branches/theorarm-merge-branch/COPYING 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/COPYING 2010-03-07 17:59:26 UTC (rev 16954)
@@ -1,3 +1,16 @@
+The license for this branch is currently 'in flux'. Currently all the
+changes from the branch on which this is based are Copyright (C) 2009-2010
+Robin Watts for Pinknoise Productions Ltd, and can be used either under
+license from Pinknoise Productions, or under the standard GNU GPL.
+
+It is anticipated that this will change back to the simple BSD license
+used by the original branch within a month or so, whereupon this
+branch will be merged back to the trunk/branch it forked from.
+
+The original copyright statement follows:
+-----------------------------------------
+
+
Copyright (C) 2002-2009 Xiph.org Foundation
Redistribution and use in source and binary forms, with or without
Modified: branches/theorarm-merge-branch/lib/Makefile.am
===================================================================
--- branches/theorarm-merge-branch/lib/Makefile.am 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/lib/Makefile.am 2010-03-07 17:59:26 UTC (rev 16954)
@@ -17,7 +17,8 @@
x86/mmxstate.c \
x86/x86int.h \
x86/x86state.c \
- x86_vc
+ x86_vc \
+ arm
lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
@@ -94,15 +95,26 @@
x86/mmxfrag.c \
x86/mmxstate.c \
x86/x86state.c
+
+decoder_arm_sources = \
+ arm/ARMfrag.s \
+ arm/ARMidct.s \
+ arm/ARMpp.s \
+ arm/ARMstate.c
+
if CPU_x86_64
decoder_arch_sources = $(decoder_x86_sources)
else
if CPU_x86_32
decoder_arch_sources = $(decoder_x86_sources)
else
+if CPU_ARM
+decoder_arch_sources = $(decoder_arm_sources)
+else
decoder_arch_sources =
endif
endif
+endif
decoder_sources = \
apiwrapper.c \
@@ -141,7 +153,9 @@
x86/mmxfrag.h \
x86/mmxloop.h \
x86/x86int.h \
- x86/sse2trans.h
+ x86/sse2trans.h \
+ arm/ARMint.h \
+ arm/ARMoptions.s
libtheoradec_la_SOURCES = \
$(decoder_sources) \
Added: branches/theorarm-merge-branch/lib/arm/ARMdecode.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMdecode.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMdecode.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,342 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+ AREA |.text|, CODE, READONLY
+
+ GET common.s
+ GET ARMoffsets.s
+
+ EXPORT oc_sb_run_unpack
+ EXPORT oc_block_run_unpack
+ EXPORT oc_dec_partial_sb_flags_unpack
+ EXPORT oc_dec_coded_sb_flags_unpack
+ EXPORT oc_dec_coded_flags_unpack
+
+ IMPORT oc_pack_look
+ IMPORT oc_pack_read1
+ IMPORT oc_pack_adv
+
+oc_sb_run_unpack
+ STMFD r13!,{r0,r4,r14}
+
+ MOV r1, #18
+ BL oc_pack_look
+ MOV r1, #1 ; r1 = adv = 1
+ MVN r2, #0 ; r2 = sub = -1
+ TST r0, #0x20000 ; if (bits&0x20000)
+ MOVNE r1, #3 ; r1 = adv = 3
+ MOVNE r2, #2 ; r2 = sub = 2
+ TSTNE r0, #0x10000 ; if (bits&0x10000)
+ MOVNE r1, #4 ; r1 = adv = 4
+ MOVNE r2, #8 ; r2 = sub = 8
+ TSTNE r0, #0x08000 ; if (bits&0x08000)
+ MOVNE r1, #6 ; r1 = adv = 6
+ MOVNE r2, #50 ; r2 = sub = 50
+ TSTNE r0, #0x04000 ; if (bits&0x04000)
+ MOVNE r1, #8 ; r1 = adv = 8
+ MOVNE r2, #230 ; r2 = sub = 230
+ TSTNE r0, #0x02000 ; if (bits&0x02000)
+ MOVNE r1, #10 ; r1 = adv = 10
+ ADDNE r2, r2, #974-230; r2 = sub = 974
+ TSTNE r0, #0x01000 ; if (bits&0x01000)
+ MOVNE r1, #18 ; r1 = adv = 18
+ LDRNE r2, =258014 ; r2 = sub = 258014
+ RSB r4, r1, #18
+ RSB r4, r2, r0, LSR r4 ; r4 = (bits>>(18-adv))-sub
+ LDR r0, [r13],#4
+ BL oc_pack_adv
+ MOV r0, r4
+
+ LDMFD r13!,{r4,PC}
+
+oc_block_run_unpack
+ STMFD r13!,{r0,r4,r14}
+
+ MOV r1, #9
+ BL oc_pack_look
+ MOV r1, #2 ; r1 = adv = 2
+ MVN r2, #0 ; r2 = sub = -1
+ TST r0, #0x100 ; if (bits&0x100)
+ MOVNE r1, #3 ; r1 = adv = 3
+ MOVNE r2, #1 ; r2 = sub = 1
+ TSTNE r0, #0x080 ; if (bits&0x080)
+ MOVNE r1, #4 ; r1 = adv = 4
+ MOVNE r2, #7 ; r2 = sub = 7
+ TSTNE r0, #0x040 ; if (bits&0x040)
+ MOVNE r1, #6 ; r1 = adv = 6
+ MOVNE r2, #49 ; r2 = sub = 49
+ TSTNE r0, #0x020 ; if (bits&0x020)
+ MOVNE r1, #7 ; r1 = adv = 7
+ MOVNE r2, #109 ; r2 = sub = 109
+ TSTNE r0, #0x010 ; if (bits&0x010)
+ MOVNE r1, #9 ; r1 = adv = 9
+ ADDNE r2, r2, #481-109; r2 = sub = 481
+ RSB r4, r1, #9
+ RSB r4, r2, r0, LSR r4 ; r4 = (bits>>(9-adv))-sub
+ LDR r0, [r13],#4
+ BL oc_pack_adv
+ SUB r0, r4, #1
+
+ LDMFD r13!,{r4,PC}
+
+oc_dec_partial_sb_flags_unpack
+ ; r0 = dec
+ STMFD r13!,{r5-r11,r14}
+
+ LDR r5, =DEC_OPB
+ LDR r9, [r0, #DEC_STATE_NSBS] ; r9 = nsbs
+ LDR r10,[r0, #DEC_STATE_SB_FLAGS] ; r10= sb_flags
+ ADD r11,r0, r5 ; r11= dec->opb
+ MOV r8, #0 ; r8 = npartial = 0
+ MOV r7, #0x1000 ; r7 is >=0x1000 if full_run
+ CMP r9, #0
+ BLE odpsfu_end
+odpsfu_lp
+ CMP r7, #0x1000 ; if (full_run) (i.e. if r7>=0x1000)
+ MOVGE r0, r11
+ BLGE oc_pack_read1 ; r0 = oc_pack_read1
+ MOV r6, r0 ; r6 = flag
+ MOV r0, r11
+ BL oc_sb_run_unpack
+ ; r0 = run_count
+ SUB r7, r0, #0x21 ; r7 is >= 0x1000 if full_run
+ CMP r0, r9 ; if (run_count > nsbs)
+ MOVGT r0, r9 ; run_count = nsbs
+ SUB r9, r9, r0 ; nsbs -= run_count
+ MLA r8, r6, r0, r8 ; r8 = npartial+=run_count*flag
+odpsfu_lp2
+ LDRB r1, [r10]
+ SUBS r0, r0, #1
+ BIC r1, r1, #CODED_PARTIALLY|CODED_FULLY
+ ORR r1, r1, r6, LSL #CODED_PARTIALLY_SHIFT
+ STRB r1, [r10], #1
+ BGT odpsfu_lp2
+
+ ; r0 = flag
+ RSB r0, r6, #1 ; r0 = flag = !flag
+ CMP r9, #0
+ BGT odpsfu_lp
+odpsfu_end
+ MOV r0, r8 ; return npartial
+ LDMFD r13!,{r5-r11,PC}
+
+oc_dec_coded_sb_flags_unpack
+ ; r0 = dec
+ STMFD r13!,{r5-r11,r14}
+
+ LDR r10,[r0, #DEC_STATE_SB_FLAGS] ; r10= sb_flags
+ LDR r9, [r0, #DEC_STATE_NSBS] ; r9 = nsbs
+ LDR r5, =DEC_OPB
+ MOV r7, #0x1000 ; r7 is >=0x1000 if full_run
+ ADD r9, r10, r9 ; r9 = sb_flags_end
+ LDRB r1, [r10], #1
+ ADD r11,r0, r5 ; r11= dec->opb
+odcsfu_lp
+ TST r1, #CODED_PARTIALLY ; while ((sbflags++)->coded_part)
+ LDRNEB r1, [r10], #1
+ BNE odcsfu_lp
+ SUB r10,r10,#1 ; sb_flags--
+odcsfu_lp2
+ CMP r7, #0x1000 ; if (full_run) (i.e. if r7>=0x1000)
+ MOVGE r0, r11
+ BLGE oc_pack_read1 ; r0 = oc_pack_read1
+ MOV r6, r0 ; r6 = flag
+ MOV r0, r11
+ BL oc_sb_run_unpack
+ ; r0 = run_count
+ LDRB r1, [r10]
+ SUB r7, r0, #0x21 ; r7 is >= 0x1000 if full_run
+odcsfu_lp3
+ TST r1, #CODED_PARTIALLY
+ BNE odcsfu_end_lp3
+ SUBS r0, r0, #1
+ BLT odcsfu_break
+ ORR r1, r1, r6, LSL #CODED_FULLY_SHIFT
+odcsfu_end_lp3
+ STRB r1, [r10],#1
+ CMP r10,r9
+ LDRNEB r1, [r10]
+ BNE odcsfu_lp3
+odcsfu_break
+ ; r0 = flag
+ RSB r0, r6, #1 ; r0 = flag = !flag
+ CMP r10,r9
+ BNE odcsfu_lp2
+
+ LDMFD r13!,{r5-r11,PC}
+
+oc_dec_coded_flags_unpack
+ ; r0 = dec
+ STMFD r13!,{r4-r11,r14}
+
+ MOV r4, r0 ; r4 = dec
+ BL oc_dec_partial_sb_flags_unpack ; r0 = npartial=oc_dec_par...
+ LDR r3, [r4, #DEC_STATE_NSBS] ; r3 = nsbs
+ LDR r5, =DEC_OPB
+ MOV r7, r0 ; r7 = npartial
+ CMP r7, r3 ; if (npartial < nsbs)
+ MOVLT r0, r4
+ ADD r5, r4, r5 ; r5 = &dec->opb
+ BLLT oc_dec_coded_sb_flags_unpack ; dec_cdd_sb_flags_unpk(dec)
+ CMP r7, #0 ; if (npartial>0)
+ MOVLE r0, #1
+ MOVGT r0, r5
+ BLGT oc_pack_read1 ; flag=!oc_pack_read1(opb)
+ EOR r9, r0, #1 ; else flag = 0
+ STMFD r13!,{r4,r5}
+
+ LDR r7, [r4, #DEC_STATE_CODED_FRAGIS];r7 = coded_fragis
+ LDR r12,[r4, #DEC_STATE_NFRAGS] ; r12= nfrags
+ LDR r6, [r4, #DEC_STATE_SB_MAPS] ; r6 = sb_maps
+ LDR r2, [r4, #DEC_STATE_SB_FLAGS] ; r2 = sb_flags
+ LDR r10,[r4, #DEC_STATE_FRAGS] ; r10= frags
+ MOV r8, #0 ; r8 = run_count=0
+ ADD r12,r7, r12,LSL #2 ; r12= uncoded_fragis
+ MOV r1, #3 ; r1 = pli=3
+ ADD r14, r4, #DEC_STATE_FPLANES+FPLANE_NSBS ;r14= nsbs_ptr
+odcfu_lp
+ LDR r3, [r14], #FPLANE_SIZE ; r3 = nsbs
+ MOV r11,#0 ; r11= ncoded_fragis
+odcfu_lp8
+ ; r0 = nsbs_ptr r9 = flag
+ ; r1 = r5 = r10= frags
+ ; r2 = sb_flags r6 = sb_maps r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = r8 = run_count r14= nsbs_ptr
+ ; r1 = fragis=sb_maps
+ LDRB r4, [r2], #1 ; r4 = flags =*sb_flags++
+ AND r5, r4, #CODED_FULLY|CODED_PARTIALLY
+ MOV r4, r4, LSL #4
+ ORR r4, r4, #0x8
+ [ CODED_FULLY < CODED_PARTIALLY
+ CMP r5, #CODED_FULLY
+ BGT odcfu_partially_coded
+ |
+ CMP r5, #CODED_PARTIALLY
+ BEQ odcfu_partially_coded
+ ]
+ BLT odcfu_uncoded
+ ; Coded Fully case r9 = flag
+ ; r1 = r5 = r10= frags
+ ; r2 = sb_flags r6 = sb_maps/fragip r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = flags/counter r8 = run_count r14= nsbs_ptr
+odcfu_lp2
+ TST r4, #1<<(4+VALUE_BIT_SHIFT)
+ ADDEQ r6, r6, #16
+ BEQ odcfu_skip
+ SUB r4, r4, #4<<28
+odcfu_lp3
+ LDR r5, [r6], #4 ; r5 = fragi=*fragip++
+ ; Stall (2 on Xscale)
+ CMP r5, #0 ; if(fragi>=0)
+ LDRGE r0, [r10,r5, LSL #2] ;
+ STRGE r5, [r7], #4 ; *coded_fragis++=fragi
+ ADDGE r11,r11,#1 ; ncoded_fragis++;
+ ORRGE r0, r0, #FRAGMENT_CODED
+ STRGE r0, [r10,r5, LSL #2] ; frags[fragi].coded=1
+ ADDS r4, r4, #1<<28
+ BLT odcfu_lp3
+odcfu_skip
+ MOVS r4, r4, LSR #1
+ BCC odcfu_lp2
+
+ B odcfu_common
+odcfu_uncoded
+ ; Uncoded case r9 = flag
+ ; r1 = r5 = r10= frags
+ ; r2 = sb_flags r6 = sb_maps/fragip r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = flags/counter r8 = run_count r14= nsbs_ptr
+odcfu_lp4
+ TST r4, #1<<(4+VALUE_BIT_SHIFT)
+ ADDEQ r6, r6, #16
+ BEQ odcfu_skip2
+ SUB r4, r4, #4<<28
+odcfu_lp5
+ LDR r5, [r6], #4 ; r5 = fragi=*fragip++
+ ; Stall (2 on Xscale)
+ CMP r5, #0 ; if(fragi>=0)
+ LDRGE r0, [r10,r5, LSL #2] ;
+ STRGE r5, [r12,#-4]! ; *--uncoded_fragis=fragi
+ ; Stall (on Xscale)
+ BICGE r0, r0, #FRAGMENT_CODED
+ STRGE r0, [r10,r5, LSL #2] ; frags[fragi].coded=0
+ ADDS r4, r4, #1<<28
+ BLT odcfu_lp5
+odcfu_skip2
+ MOVS r4, r4, LSR #1
+ BCC odcfu_lp4
+
+ B odcfu_common
+odcfu_partially_coded
+ ; Partially coded case r9 = flag
+ ; r1 = r5 = scratch r10= frags
+ ; r2 = sb_flags r6 = sb_maps/fragip r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = flags/counter r8 = run_count r14= nsbs_ptr
+odcfu_lp6
+ TST r4, #1<<(4+VALUE_BIT_SHIFT)
+ ADDEQ r6, r6, #16
+ BEQ odcfu_skip3
+ SUB r4, r4, #4<<28
+odcfu_lp7
+ LDR r5, [r6], #4 ; r5 = fragi=*fragip++
+ ; Stall (2 on Xscale)
+ CMP r5, #0 ; if(fragi>=0)
+ BLT odcfu_skip4
+ SUBS r8, r8, #1 ; if (--run_count < 0)
+ BGE odcfu_skip5
+ LDR r0, [r13,#4] ; r0 = &dec->opb
+ STMFD r13!,{r1-r3,r12,r14}
+ BL oc_block_run_unpack ; run_count=oc_block_run_unpack
+ MOV r8, r0
+ LDMFD r13!,{r1-r3,r12,r14}
+ EOR r9, r9, #1 ; flag=!flag
+odcfu_skip5
+ LDR r0, [r10,r5, LSL #2] ;
+ CMP r9, #0 ; if (flag)
+ STRNE r5, [r7], #4 ; *coded_fragis++=fragi
+ ADDNE r11,r11,#1 ; ncoded_fragis++
+ STREQ r5, [r12,#-4]! ; else *--uncoded_fragis=fragi
+ ORRNE r0, r0, #FRAGMENT_CODED
+ BICEQ r0, r0, #FRAGMENT_CODED
+ STR r0, [r10,r5, LSL #2] ; frags[fragi].coded=flag
+odcfu_skip4
+ ADDS r4, r4, #1<<28
+ BLT odcfu_lp7
+odcfu_skip3
+ MOVS r4, r4, LSR #1
+ BCC odcfu_lp6
+
+odcfu_common
+ ; r0 = r9 = flag
+ ; r1 = r5 = r10= nsbs_ptr
+ ; r2 = sb_flags r6 = sb_maps/fragip r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = flags/counter r8 = run_count r14= frags
+ SUBS r3, r3, #1 ; nsbs--
+ BGT odcfu_lp8
+
+ LDR r4, [r13]
+ ; r0 = r9 = flag
+ ; r1 = pli r5 = r10= nsbs_ptr
+ ; r2 = sb_flags r6 = fragip r11= ncoded_fragis
+ ; r3 = nsbs r7 = coded_fragis r12= uncoded_fragis
+ ; r4 = dec r8 = run_count r14= frags
+ ; _dec->state.ncoded_fragis[pli]=ncoded_fragis
+ SUBS r1, r1, #1
+ ADD r5, r4, #DEC_STATE_NCODED_FRAGIS+2*4
+ STR r11,[r5, -r1, LSL #2]
+ BGT odcfu_lp
+
+ ; dec->state.ntotal_coded_fragis=coded_fragis-dec->state.coded_fragis
+ LDR r0, [r4, #DEC_STATE_CODED_FRAGIS]
+ ADD r13,r13,#8
+ SUB r0, r7, r0
+ MOV r0, r0, LSR #2
+ STR r0, [r4, #DEC_STATE_NTOTAL_CODED_FRAGIS]
+
+ LDMFD r13!,{r4-r11,PC}
+
+ END
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMdecode.s
___________________________________________________________________
Added: svn:executable
+ *
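
The TST/MOVNE chain in oc_sb_run_unpack above is a table-free decode of the
superblock-run Huffman code from the Theora spec: peek 18 bits, classify by
the run of leading ones, then advance by the code length and subtract a
per-class offset. A C sketch of the same mapping (the shipped generic version
lives in lib/decode.c; this is illustrative only):

/* Decode one superblock run length; mirrors the adv/sub pairs above. */
static long sb_run_unpack_c(oc_pack_buf *_opb){
  long bits;
  int  adv;
  long sub;
  bits=oc_pack_look(_opb,18);
  if(!(bits&0x20000)){adv=1;sub=-1;}        /* "0"          -> 1       */
  else if(!(bits&0x10000)){adv=3;sub=2;}    /* "10x"        -> 2..3    */
  else if(!(bits&0x08000)){adv=4;sub=8;}    /* "110x"       -> 4..5    */
  else if(!(bits&0x04000)){adv=6;sub=50;}   /* "1110xx"     -> 6..9    */
  else if(!(bits&0x02000)){adv=8;sub=230;}  /* "11110xxx"   -> 10..17  */
  else if(!(bits&0x01000)){adv=10;sub=974;} /* "111110xxxx" -> 18..33  */
  else{adv=18;sub=258014;}                  /* "111111"+12  -> 34..4129*/
  oc_pack_adv(_opb,adv);
  return (bits>>(18-adv))-sub;
}

oc_block_run_unpack is the same idea with a 9-bit lookahead (run lengths
1..30); note that the assembly version returns the decoded length minus one.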
Added: branches/theorarm-merge-branch/lib/arm/ARMfrag.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMfrag.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMfrag.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,706 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+ AREA |.text|, CODE, READONLY
+
+ GET ARMoptions.s
+
+ EXPORT oc_frag_recon_inter2_arm
+ EXPORT oc_frag_recon_inter_arm
+ EXPORT oc_frag_recon_intra_arm
+ EXPORT oc_frag_copy_list_arm
+
+ [ ARM_HAS_NEON
+ [ 1
+oc_frag_copy_list_arm
+ ; r0 = dst_frame_data
+ ; r1 = src_frame_data
+ ; r2 = ystride
+ ; r3 = nfragis
+ ; <> = fragis
+ ; <> = frag_buf_offs
+ STMFD r13!,{r4-r6,r14}
+
+ SUBS r3, r3, #1
+ LDRGE r12,[r13,#4*4] ; r12= fragis
+ BLT ofcl_end
+ LDR r14,[r13,#4*5] ; r14= frag_buf_offs
+ LDR r6, [r12],#4 ; r6 = fragis[fragii]
+ ; Stall (2 on Xscale)
+ LDR r6, [r14,r6, LSL #2] ; r6 = frag_buf_offs[fragis[fragii]]
+ ; Stall (on XScale)
+ofcl_lp
+ ADD r4, r1, r6
+ VLD1.32 {D0}, [r4], r2
+ VLD1.32 {D1}, [r4], r2
+ VLD1.32 {D2}, [r4], r2
+ VLD1.32 {D3}, [r4], r2
+ ADD r5, r6, r0
+ VLD1.32 {D4}, [r4], r2
+ SUBS r3, r3, #1
+ VLD1.32 {D5}, [r4], r2
+ VLD1.32 {D6}, [r4], r2
+ VLD1.32 {D7}, [r4], r2
+ VST1.32 {D0}, [r5], r2
+ LDRGE r6, [r12],#4 ; r6 = fragis[fragii]
+ VST1.32 {D1}, [r5], r2
+ VST1.32 {D2}, [r5], r2
+ VST1.32 {D3}, [r5], r2
+ VST1.32 {D4}, [r5], r2
+ LDRGE r6, [r14,r6, LSL #2] ; r6 = frag_buf_offs[fragis[fragii]]
+ VST1.32 {D5}, [r5], r2
+ VST1.32 {D6}, [r5], r2
+ VST1.32 {D7}, [r5], r2
+ BGE ofcl_lp
+ofcl_end
+ LDMFD r13!,{r4-r6,PC}
+ |
+oc_frag_copy_list_arm
+ ; r0 = dst_frame_data
+ ; r1 = src_frame_data
+ ; r2 = ystride
+ ; r3 = nfragis
+ ; <> = fragis
+ ; <> = frag_buf_offs
+ STMFD r13!,{r4-r11,r14}
+
+ SUBS r3, r3, #1
+ BLT ofcl_end
+ LDR r12,[r13,#4*9] ; r12= fragis
+ LDR r14,[r13,#4*10] ; r14= frag_buf_offs
+ofcl_lp
+ LDR r5, [r12],#4 ; r5 = fragis[fragii]
+ ; Stall (2 on Xscale)
+ LDR r5, [r14,r5, LSL #2] ; r5 = frag_buf_offs[fragis[fragii]]
+ SUBS r3, r3, #1
+ ; Stall (on XScale)
+ ADD r4, r1, r5
+ VLD1.32 {D0}, [r4], r2
+ VLD1.32 {D1}, [r4], r2
+ VLD1.32 {D2}, [r4], r2
+ VLD1.32 {D3}, [r4], r2
+ VLD1.32 {D4}, [r4], r2
+ VLD1.32 {D5}, [r4], r2
+ VLD1.32 {D6}, [r4], r2
+ VLD1.32 {D7}, [r4], r2
+ ADD r5, r5, r0
+ VST1.32 {D0}, [r5], r2
+ VST1.32 {D1}, [r5], r2
+ VST1.32 {D2}, [r5], r2
+ VST1.32 {D3}, [r5], r2
+ VST1.32 {D4}, [r5], r2
+ VST1.32 {D5}, [r5], r2
+ VST1.32 {D6}, [r5], r2
+ VST1.32 {D7}, [r5], r2
+ BGE ofcl_lp
+ofcl_end
+ LDMFD r13!,{r4-r11,PC}
+ ]
+ |
+ [ ARM_HAS_LDRD
+oc_frag_copy_list_arm
+ ; r0 = dst_frame_data
+ ; r1 = src_frame_data
+ ; r2 = ystride
+ ; r3 = nfragis
+ ; <> = fragis
+ ; <> = frag_buf_offs
+ STMFD r13!,{r4-r11,r14}
+
+ SUBS r3, r3, #1
+ BLT ofcl_end
+ LDR r12,[r13,#4*9] ; r12= fragis
+ LDR r14,[r13,#4*10] ; r14= frag_buf_offs
+ofcl_lp
+ LDR r5, [r12],#4 ; r5 = fragis[fragii]
+ MOV r4, r1
+ ; Stall (on Xscale)
+ LDR r5, [r14,r5, LSL #2] ; r5 = frag_buf_offs[fragis[fragii]]
+ SUBS r3, r3, #1
+ ; Stall (on XScale)
+ LDRD r6, [r4, r5]! ; r4 = src_frame_data+frag_buf_off
+ LDRD r8, [r4, r2]!
+ ; Stall
+ STRD r6, [r5, r0]! ; r5 = dst_frame_data+frag_buf_off
+ STRD r8, [r5, r2]!
+ ; Stall
+ LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive
+ LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse
+ LDRD r10,[r4, r2]! ; than us only doing 2, and having to do
+ ; another pair of LDRD/STRD later on.
+ ; Stall
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ ; Stall
+ LDRD r6, [r4, r2]!
+ LDRD r8, [r4, r2]!
+ LDRD r10,[r4, r2]!
+ ; Stall
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ BGE ofcl_lp
+ofcl_end
+ LDMFD r13!,{r4-r11,PC}
+ |
+oc_frag_copy_list_arm
+ ; r0 = dst_frame_data
+ ; r1 = src_frame_data
+ ; r2 = ystride
+ ; r3 = nfragis
+ ; <> = fragis
+ ; <> = frag_buf_offs
+ STMFD r13!,{r4-r6,r11,r14}
+
+ SUBS r3, r3, #1
+ BLT ofcl_end
+ LDR r12,[r13,#4*5] ; r12 = fragis
+ LDR r14,[r13,#4*6] ; r14 = frag_buf_offs
+ SUB r2, r2, #4
+ofcl_lp
+ LDR r11,[r12],#4 ; r11 = fragis[fragii]
+ ; Stall (2 on Xscale)
+ LDR r11,[r14,r11,LSL #2] ; r11 = frag_buf_offs[fragis[fragii]]
+ SUBS r3, r3, #1
+ ; Stall (on XScale)
+ ADD r4, r1, r11 ; r4 = src_frame_data+frag_buf_off
+
+ LDR r6, [r4], #4
+ ADD r11,r0, r11 ; r11= dst_frame_data+frag_buf_off
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ LDR r4, [r4]
+ STR r5, [r11],r2
+ STR r6, [r11],#4
+ STR r4, [r11]
+ BGE ofcl_lp
+ofcl_end
+ LDMFD r13!,{r4-r6,r11,PC}
+ ]
+ ]
+
+ [ ARM_HAS_NEON
+oc_frag_recon_intra_arm
+ ; r0 = unsigned char *dst
+ ; r1 = int ystride
+ ; r2 = const ogg_int16_t residue[64]
+ MOV r3, #128
+ VLDMIA r2, {D0-D15} ; D0 = 3333222211110000 etc ; 9(8) cycles
+ VDUP.S16 Q8, r3
+ VQADD.S16 Q0, Q0, Q8
+ VQADD.S16 Q1, Q1, Q8
+ VQADD.S16 Q2, Q2, Q8
+ VQADD.S16 Q3, Q3, Q8
+ VQADD.S16 Q4, Q4, Q8
+ VQADD.S16 Q5, Q5, Q8
+ VQADD.S16 Q6, Q6, Q8
+ VQADD.S16 Q7, Q7, Q8
+
+ VQMOVUN.S16 D0, Q0 ; D0 = 7766554433221100 ; 1 cycle
+ VQMOVUN.S16 D1, Q1 ; D1 = FFEEDDCCBBAA9988 ; 1 cycle
+ VQMOVUN.S16 D2, Q2 ; D2 = NNMMLLKKJJIIHHGG ; 1 cycle
+ VQMOVUN.S16 D3, Q3 ; D3 = VVUUTTSSRRQQPPOO ; 1 cycle
+ VQMOVUN.S16 D4, Q4 ; D4 = ddccbbaaZZYYXXWW ; 1 cycle
+ VQMOVUN.S16 D5, Q5 ; D5 = llkkjjiihhggffee ; 1 cycle
+ VQMOVUN.S16 D6, Q6 ; D6 = ttssrrqqppoonnmm ; 1 cycle
+ VQMOVUN.S16 D7, Q7 ; D7 = !!@@zzyyxxwwvvuu ; 1 cycle
+
+ VST1.64 {D0}, [r0], r1
+ VST1.64 {D1}, [r0], r1
+ VST1.64 {D2}, [r0], r1
+ VST1.64 {D3}, [r0], r1
+ VST1.64 {D4}, [r0], r1
+ VST1.64 {D5}, [r0], r1
+ VST1.64 {D6}, [r0], r1
+ VST1.64 {D7}, [r0], r1
+
+ MOV PC,R14
+oc_frag_recon_inter_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src
+ ; r2 = int ystride
+ ; r3 = const ogg_int16_t residue[64]
+ VLD1.64 {D24}, [r1], r2
+ VLD1.64 {D25}, [r1], r2
+ VLD1.64 {D26}, [r1], r2
+ VLD1.64 {D27}, [r1], r2
+ VLD1.64 {D28}, [r1], r2
+ VLD1.64 {D29}, [r1], r2
+ VLD1.64 {D30}, [r1], r2
+ VLD1.64 {D31}, [r1], r2
+ VLDMIA r3, {D0-D15} ; D0 = 3333222211110000 etc ; 9(8) cycles
+ VMOVL.U8 Q8, D24 ; Q8 = __77__66__55__44__33__22__11__00
+ VMOVL.U8 Q9, D25 ; etc
+ VMOVL.U8 Q10,D26
+ VMOVL.U8 Q11,D27
+ VMOVL.U8 Q12,D28
+ VMOVL.U8 Q13,D29
+ VMOVL.U8 Q14,D30
+ VMOVL.U8 Q15,D31
+ VQADD.S16 Q0, Q0, Q8
+ VQADD.S16 Q1, Q1, Q9
+ VQADD.S16 Q2, Q2, Q10
+ VQADD.S16 Q3, Q3, Q11
+ VQADD.S16 Q4, Q4, Q12
+ VQADD.S16 Q5, Q5, Q13
+ VQADD.S16 Q6, Q6, Q14
+ VQADD.S16 Q7, Q7, Q15
+
+ VQMOVUN.S16 D0, Q0
+ VQMOVUN.S16 D1, Q1
+ VQMOVUN.S16 D2, Q2
+ VQMOVUN.S16 D3, Q3
+ VQMOVUN.S16 D4, Q4
+ VQMOVUN.S16 D5, Q5
+ VQMOVUN.S16 D6, Q6
+ VQMOVUN.S16 D7, Q7
+
+ VST1.64 {D0}, [r0], r2
+ VST1.64 {D1}, [r0], r2
+ VST1.64 {D2}, [r0], r2
+ VST1.64 {D3}, [r0], r2
+ VST1.64 {D4}, [r0], r2
+ VST1.64 {D5}, [r0], r2
+ VST1.64 {D6}, [r0], r2
+ VST1.64 {D7}, [r0], r2
+
+ MOV PC,R14
+oc_frag_recon_inter2_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src1
+ ; r2 = const unsigned char *src2
+ ; r3 = int ystride
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t residue[64]
+ VLD1.64 {D16}, [r1], r3
+ VLD1.64 {D17}, [r1], r3
+ VLD1.64 {D18}, [r1], r3
+ VLD1.64 {D19}, [r1], r3
+ VLD1.64 {D20}, [r1], r3
+ VLD1.64 {D21}, [r1], r3
+ VLD1.64 {D22}, [r1], r3
+ VLD1.64 {D23}, [r1], r3
+ VLD1.64 {D24}, [r2], r3
+ VLD1.64 {D25}, [r2], r3
+ VLD1.64 {D26}, [r2], r3
+ VLD1.64 {D27}, [r2], r3
+ VLD1.64 {D28}, [r2], r3
+ VLD1.64 {D29}, [r2], r3
+ VLD1.64 {D30}, [r2], r3
+ VLD1.64 {D31}, [r2], r3
+ VLDMIA r12,{D0-D15}
+ VHADD.U8 Q12,Q8, Q12 ; Q12= FFEEDDCCBBAA99887766554433221100
+ VHADD.U8 Q13,Q9, Q13
+ VHADD.U8 Q14,Q10,Q14
+ VHADD.U8 Q15,Q11,Q15
+ VMOVL.U8 Q8, D24 ; Q8 = __77__66__55__44__33__22__11__00
+ VMOVL.U8 Q9, D25 ; etc
+ VMOVL.U8 Q10,D26
+ VMOVL.U8 Q11,D27
+ VMOVL.U8 Q12,D28
+ VMOVL.U8 Q13,D29
+ VMOVL.U8 Q14,D30
+ VMOVL.U8 Q15,D31
+
+ VQADD.S16 Q0, Q0, Q8
+ VQADD.S16 Q1, Q1, Q9
+ VQADD.S16 Q2, Q2, Q10
+ VQADD.S16 Q3, Q3, Q11
+ VQADD.S16 Q4, Q4, Q12
+ VQADD.S16 Q5, Q5, Q13
+ VQADD.S16 Q6, Q6, Q14
+ VQADD.S16 Q7, Q7, Q15
+
+ VQMOVUN.S16 D0, Q0
+ VQMOVUN.S16 D1, Q1
+ VQMOVUN.S16 D2, Q2
+ VQMOVUN.S16 D3, Q3
+ VQMOVUN.S16 D4, Q4
+ VQMOVUN.S16 D5, Q5
+ VQMOVUN.S16 D6, Q6
+ VQMOVUN.S16 D7, Q7
+
+ VST1.64 {D0}, [r0], r3
+ VST1.64 {D1}, [r0], r3
+ VST1.64 {D2}, [r0], r3
+ VST1.64 {D3}, [r0], r3
+ VST1.64 {D4}, [r0], r3
+ VST1.64 {D5}, [r0], r3
+ VST1.64 {D6}, [r0], r3
+ VST1.64 {D7}, [r0], r3
+
+ MOV PC,R14
+ |
+ [ (ARMV6 && ARM_HAS_LDRD)
+oc_frag_recon_intra_arm
+ ; r0 = unsigned char *dst
+ ; r1 = int ystride
+ ; r2 = const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r6,r14}
+
+ MOV r14,#8
+ MOV r12,r2
+ LDR r6, =0x00800080
+ofrintra_lp
+ LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222
+ LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666
+ SUBS r14,r14,#1
+ QADD16 r2, r2, r6
+ QADD16 r3, r3, r6
+ QADD16 r4, r4, r6
+ QADD16 r5, r5, r6
+ USAT16 r2, #8, r2 ; r2 = __11__00
+ USAT16 r3, #8, r3 ; r3 = __33__22
+ USAT16 r4, #8, r4 ; r4 = __55__44
+ USAT16 r5, #8, r5 ; r5 = __77__66
+ ADD r2, r2, r2, LSR #8 ; r2 = __111100
+ ADD r3, r3, r3, LSR #8 ; r3 = __333322
+ BIC r2, r2, #0x00FF0000 ; r2 = ____1100
+ ORR r2, r2, r3, LSL #16 ; r2 = 33221100
+ ADD r4, r4, r4, LSR #8 ; r4 = __555544
+ ADD r5, r5, r5, LSR #8 ; r5 = __777766
+ BIC r4, r4, #0x00FF0000 ; r4 = ____5544
+ ORR r3, r4, r5, LSL #16 ; r3 = 77665544
+ STRD r2, [r0], r1
+ BGT ofrintra_lp
+
+ LDMFD r13!,{r4-r6,PC}
+
+oc_frag_recon_inter_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src
+ ; r2 = int ystride
+ ; r3 = const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r11,r14}
+
+ MOV r14,#8
+ LDR r12,=0x00FF00FF
+ofrinter_lp
+ [ ARM_CAN_UNALIGN_LDRD
+ LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+ LDR r5, [r1, #4]
+ LDR r4, [r1], r2
+ ]
+ LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222
+ SUBS r14,r14,#1
+ PKHBT r10,r4, r4, LSL #8 ; r10= 22111100
+ PKHTB r4, r4, r4, ASR #8 ; r4 = 33222211
+ LDRD r8, [r3], #8 ; r8 = 55554444 r9 = 77776666
+ PKHBT r11,r5, r5, LSL #8 ; r11= 66555544
+ PKHTB r5, r5, r5, ASR #8 ; r5 = 77666655
+ AND r10,r12,r10 ; r10= __11__00
+ AND r4, r12,r4, LSR #8 ; r4 = __33__22
+ AND r11,r12,r11 ; r11= __55__44
+ AND r5, r12,r5, LSR #8 ; r5 = __77__66
+ QADD16 r6, r6, r10 ; r6 = xx11xx00
+ QADD16 r7, r7, r4 ; r7 = xx33xx22
+ QADD16 r8, r8, r11 ; r8 = xx55xx44
+ QADD16 r9, r9, r5 ; r9 = xx77xx66
+ USAT16 r6, #8, r6 ; r6 = __11__00
+ USAT16 r7, #8, r7 ; r7 = __33__22
+ USAT16 r8, #8, r8 ; r8 = __55__44
+ USAT16 r9, #8, r9 ; r9 = __77__66
+ ADD r6, r6, r6, LSR #8 ; r6 = __111100
+ ADD r7, r7, r7, LSR #8 ; r7 = __333322
+ BIC r6, r6, #0x00FF0000 ; r6 = ____1100
+ ORR r6, r6, r7, LSL #16 ; r6 = 33221100
+ ADD r8, r8, r8, LSR #8 ; r8 = __555544
+ ADD r9, r9, r9, LSR #8 ; r9 = __777766
+ BIC r8, r8, #0x00FF0000 ; r8 = ____5544
+ ORR r7, r8, r9, LSL #16 ; r7 = 77665544
+ STRD r6, [r0], r2
+ BGT ofrinter_lp
+
+ LDMFD r13!,{r4-r11,PC}
+
+oc_frag_recon_inter2_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src1
+ ; r2 = const unsigned char *src2
+ ; r3 = int ystride
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r11,r14}
+
+ MOV r14,#8
+ LDR r7, =0x00FF00FF
+ofrinter2_lp
+ LDR r5, [r1, #4] ; Unaligned ; r5 = src1[1] = 77665544
+ LDR r6, [r2, #4] ; Unaligned ; r6 = src2[1] = 77665544
+ SUBS r14,r14,#1
+ LDRD r8, [r12,#8] ; r8 = 55554444 r9 = 77776666
+ UHADD8 r5, r5, r6 ; r5 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+ PKHBT r6, r5, r5, LSL #8 ; r6 = 66555544
+ PKHTB r5, r5, r5, ASR #8 ; r5 = 77666655
+ AND r6, r7, r6 ; r6 = __55__44
+ AND r5, r7, r5, LSR #8 ; r5 = __77__66
+ QADD16 r8, r8, r6 ; r8 = xx55xx44
+ QADD16 r9, r9, r5 ; r9 = xx77xx66
+ LDR r5, [r1], r3 ; Unaligned ; r5 = src1[0] = 33221100
+ LDR r6, [r2], r3 ; Unaligned ; r6 = src2[0] = 33221100
+ USAT16 r8, #8, r8 ; r8 = __55__44
+ USAT16 r9, #8, r9 ; r9 = __77__66
+ ADD r8, r8, r8, LSR #8 ; r8 = __555544
+ ADD r9, r9, r9, LSR #8 ; r9 = __777766
+ LDRD r10,[r12],#16 ; r10= 11110000 r11= 33332222
+ BIC r8, r8, #0x00FF0000 ; r8 = ____5544
+
+ UHADD8 r5, r5, r6 ; r5 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+ ORR r9, r8, r9, LSL #16 ; r9 = 77665544
+ PKHBT r6, r5, r5, LSL #8 ; r6 = 22111100
+ PKHTB r5, r5, r5, ASR #8 ; r5 = 33222211
+ AND r6, r7, r6 ; r6 = __11__00
+ AND r5, r7, r5, LSR #8 ; r5 = __33__22
+ QADD16 r10,r10,r6 ; r10= xx11xx00
+ QADD16 r11,r11,r5 ; r11= xx33xx22
+ USAT16 r10,#8, r10 ; r10= __11__00
+ USAT16 r11,#8, r11 ; r11= __33__22
+ ADD r10,r10,r10,LSR #8 ; r10= __111100
+ ADD r11,r11,r11,LSR #8 ; r11= __333322
+ BIC r10,r10,#0x00FF0000 ; r10= ____1100
+ ORR r8, r10,r11,LSL #16 ; r8 = 33221100
+ STRD r8, [r0], r3
+
+ BGT ofrinter2_lp
+
+ LDMFD r13!,{r4-r11,PC}
+ |
+ ; Vanilla ARM v4 version
+oc_frag_recon_intra_arm
+ ; r0 = unsigned char *dst
+ ; r1 = int ystride
+ ; r2 = const ogg_int16_t residue[64]
+ STMFD r13!,{r4,r5,r14}
+
+ MOV r14,#8
+ MOV r5, #255
+ SUB r1, r1, #7
+ofrintra_lp
+ LDRSH r3, [r2], #2
+ LDRSH r4, [r2], #2
+ LDRSH r12,[r2], #2
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ LDRSH r12,[r2], #2
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ STRB r4, [r0], r1
+ SUBS r14,r14,#1
+ BGT ofrintra_lp
+
+ LDMFD r13!,{r4,r5,PC}
+
+oc_frag_recon_inter_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src
+ ; r2 = int ystride
+ ; r3 = const ogg_int16_t residue[64]
+ STMFD r13!,{r5,r9-r11,r14}
+
+ MOV r9, #8
+ MOV r5, #255
+ SUB r2, r2, #7
+ofrinter_lp
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], r2
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], r2
+ SUBS r9, r9, #1
+ BGT ofrinter_lp
+
+ LDMFD r13!,{r5,r9-r11,PC}
+
+oc_frag_recon_inter2_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src1
+ ; r2 = const unsigned char *src2
+ ; r3 = int ystride
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r8,r14}
+
+ MOV r14,#8
+ MOV r8, #255
+ SUB r3, r3, #7
+ofrinter2_lp
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ LDRB r7, [r1], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], r3
+ LDRB r6, [r2], r3
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], r3
+
+ SUBS r14,r14,#1
+ BGT ofrinter2_lp
+
+ LDMFD r13!,{r4-r8,PC}
+ ]
+ ]
+ END
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMfrag.s
___________________________________________________________________
Added: svn:executable
+ *
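
All three code paths in ARMfrag.s (NEON, ARMv6 LDRD/media, vanilla ARMv4)
must produce results identical to the generic C fragment routines. For the
intra case that reference behaviour is just bias-and-clamp; a sketch matching
the semantics, not any particular shipped file:

#include <ogg/ogg.h> /* ogg_int16_t */

/* Reference for oc_frag_recon_intra_arm: residue + 128, saturated to
   [0,255], written out one 8-byte row per ystride step. */
static void frag_recon_intra_ref(unsigned char *_dst,int _ystride,
 const ogg_int16_t _residue[64]){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}

The inter variants are identical except that the bias of 128 is replaced by
the prediction: *src for inter, and (*src1+*src2)>>1 for inter2 (which is
what the UHADD8/VHADD.U8 instructions compute eight pixels at a time).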
Added: branches/theorarm-merge-branch/lib/arm/ARMidct.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMidct.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMidct.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,841 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+ AREA |.text|, CODE, READONLY
+
+ EXPORT oc_idct8x8_arm
+
+oc_idct8x8_arm
+ ; r0 = ogg_int16_t *y
+ ; r1 = int last_zzi
+ CMP r1, #3
+ BLT oc_idct8x8_3
+ CMP r1, #6
+ BLT oc_idct8x8_6
+ CMP r1, #10
+ BLT oc_idct8x8_10
+oc_idct8x8_slow
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2
+
+ MOV r1, r0 ; read from r1
+ MOV r0, r13 ; write to temp storage
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+ ADD r1, r1, #16
+ BL idct8core
+
+ SUB r0, r1, #7*16 ; Now src becomes dst
+ MOV r1, r13 ; and dst becomes src
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+ ADD r1, r1, #16
+ BL idct8core_down
+
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+oc_idct8x8_10
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2
+
+ MOV r1, r0 ; read from r1
+ MOV r0, r13 ; write to temp storage
+ BL idct4core
+ BL idct3core
+ BL idct2core
+ BL idct1core
+
+ SUB r0, r1, #4*16 ; Now src becomes dst
+ MOV r1, r13 ; and dst becomes src
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+ BL idct4core_down
+
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+oc_idct8x8_6
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2
+
+ MOV r1, r0 ; read from r1
+ MOV r0, r13 ; write to temp storage
+ BL idct3core
+ BL idct2core
+ BL idct1core
+
+ SUB r0, r1, #3*16 ; Now src becomes dst
+ MOV r1, r13 ; and dst becomes src
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+ BL idct3core_down
+
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+oc_idct8x8_3
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2
+
+ MOV r1, r0 ; read from r1
+ MOV r0, r13 ; write to temp storage
+ BL idct2core
+ BL idct1core
+
+ SUB r0, r1, #2*16 ; Now src becomes dst
+ MOV r1, r13 ; and dst becomes src
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+ BL idct2core_down
+
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+ [ 0 = 1
+ EXPORT idct8_1
+ EXPORT idct8_2
+ EXPORT idct8_3
+ EXPORT idct8_4
+ EXPORT idct8
+ EXPORT oc_idct8x8_slow
+ EXPORT oc_idct8x8_10
+ EXPORT oc_idct8x8_3
+idct8_2
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r4-r11,r14}
+
+ BL idct2core
+
+ LDMFD r13!,{r4-r11,PC}
+idct8_3
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r4-r11,r14}
+
+ BL idct3core
+
+ LDMFD r13!,{r4-r11,PC}
+idct8_4
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r4-r11,r14}
+
+ BL idct4core
+
+ LDMFD r13!,{r4-r11,PC}
+idct8_1
+ ]
+idct1core
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ LDRSH r3, [r1], #16
+ MOV r12,#0x05
+ ORR r12,r12,#0xB500
+ MUL r3, r12, r3
+ ; Stall ?
+ MOV r3, r3, ASR #16
+ STRH r3, [r0], #2
+ STRH r3, [r0, #14]
+ STRH r3, [r0, #30]
+ STRH r3, [r0, #46]
+ STRH r3, [r0, #62]
+ STRH r3, [r0, #78]
+ STRH r3, [r0, #94]
+ STRH r3, [r0, #110]
+ MOV PC,R14
+
+idct2core
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r2,r12,ASR #16 ; r12= t[0]+t[6]
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+ ADD r3, r3, r2 ; r3 = t[0]+t[4]
+ ADD r11,r11,r2 ; r11= t[0]+t[7]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
+ STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
+ STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
+ RSB r3, r3, r2, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+ RSB r10,r10,r2, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+ RSB r12,r12,r2, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+ RSB r11,r11,r2, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+
+ MOV PC,r14
+idct2core_down
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ ADD r2, r2, #8 ; r2 = t[0]+8
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r2,r12,ASR #16 ; r12= t[0]+t[6]+8
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+ ADD r3, r3, r2 ; r3 = t[0]+t[4]+8
+ ADD r11,r11,r2 ; r11= t[0]+t[7]+8
+ MOV r4, r11,ASR #4
+ MOV r5, r12,ASR #4
+ MOV r6, r10,ASR #4
+ MOV r7, r3, ASR #4
+ RSB r3, r3, r2, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+ RSB r10,r10,r2, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+ RSB r12,r12,r2, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+ RSB r11,r11,r2, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+ MOV r3, r3, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r12,r12,ASR #4
+ MOV r11,r11,ASR #4
+ STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
+ STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
+ STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+
+ MOV PC,r14
+idct3core
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r3, r2, r3, ASR #16 ; r3 = t[0]+t[3]
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+
+ ADD r10,r2, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
+ RSB r6, r10,r2, LSL #1 ; r6 = t[2] = t[0]-t[2]
+ ; r3 = t2[0] = t[0]+t[3]
+ RSB r2, r3, r2, LSL #1 ; r2 = t2[3] = t[0]-t[3]
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+
+ ADD r11,r3, r11 ; r11= t2[0]+t[7]
+ ADD r5, r10,r5 ; r5 = t[1]+t2[6]
+ ADD r12,r6, r12 ; r12= t[2]+t2[5]
+ ADD r4, r2, r4 ; r4 = t2[3]+t[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+
+ RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
+ RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
+ RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
+ RSB r4, r4, r2, LSL #1 ; r4 = t2[3] - t[4]
+ STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+
+ MOV PC,R14
+idct3core_down
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ ADD r2, r2, #8 ; r2 = t[0]+8
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r3, r2, r3, ASR #16 ; r3 = t[0]+t[3]+8
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+
+ ADD r10,r2, r10,ASR #16 ; r10= t[1] = t[0]+t[2]+8
+ RSB r6, r10,r2, LSL #1 ; r6 = t[2] = t[0]-t[2]+8
+ ; r3 = t2[0] = t[0]+t[3]+8
+ RSB r2, r3, r2, LSL #1 ; r2 = t2[3] = t[0]-t[3]+8
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+
+ ADD r11,r3, r11 ; r11= t2[0]+t[7]
+ ADD r5, r10,r5 ; r5 = t[1] +t2[6]
+ ADD r12,r6, r12 ; r12= t[2] +t2[5]
+ ADD r4, r2, r4 ; r4 = t2[3]+t[4]
+ RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7]
+ RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6]
+ RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5]
+ RSB r2, r4, r2, LSL #1 ; r2 = t2[3] - t[4]
+ MOV r11,r11,ASR #4
+ MOV r5, r5, ASR #4
+ MOV r12,r12,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r2, r2, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r3, r3, ASR #4
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r2, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
+
+ MOV PC,R14
+
+idct4core
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r2, r10,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+
+ ADD r4, r2, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
+ RSB r10,r4, r2, LSL #1 ; r10= t[2] = t[0] - t[2]
+
+ ADD r5, r2, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
+ RSB r2, r5, r2, LSL #1 ; r2 = t[3] = t[0] - t[3]
+
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+
+ ADD r11,r5, r11 ; r11= t[0]+t2[7]
+ ADD r6, r4, r6 ; r6 = t[1]+t3[6]
+ ADD r3, r10,r3 ; r3 = t[2]+t3[5]
+ ADD r7, r2, r7 ; r7 = t[3]+t2[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r7, [r0, #46] ; y[3] = t2[3]+t[4]
+
+ RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
+ RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
+ RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
+ RSB r7, r7, r2, LSL #1 ; r7 = t[3]-t2[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+
+ MOV PC,r14
+idct4core_down
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ LDRSH r2, [r1], #16 ; r2 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r2, r10,r2 ; r2 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r2, r2, ASR #16 ; r2 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+
+ ADD r4, r2, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
+ RSB r10,r4, r2, LSL #1 ; r10= t[2] = t[0] - t[2]
+
+ ADD r5, r2, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
+ RSB r2, r5, r2, LSL #1 ; r2 = t[3] = t[0] - t[3]
+
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+
+ ADD r5, r5, r11 ; r5 = t[0]+t2[7]
+ ADD r4, r4, r6 ; r4 = t[1]+t3[6]
+ ADD r10,r10,r3 ; r10= t[2]+t3[5]
+ ADD r2, r2, r7 ; r2 = t[3]+t2[4]
+ ADD r2, r2, #8
+ ADD r10,r10,#8
+ ADD r4, r4, #8
+ ADD r5, r5, #8
+ SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]
+ SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]
+ SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]
+ SUB r7, r2, r7, LSL #1 ; r7 = t[3]-t2[4]
+ MOV r11,r11,ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r7, r7, ASR #4
+ MOV r2, r2, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r5, r5, ASR #4
+ STRH r5,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r4, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r10,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r2, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+
+ MOV PC,r14
+
+OC_C1S7
+ DCD 64277 ; FB15
+OC_C2S6
+ DCD 60547 ; EC83
+OC_C4S4
+ DCD 46341 ; B505
+OC_C6S2
+ DCD 25080 ; 61F8
+OC_C7S1
+ DCD 12785 ; 31F1
+OC_C3S5
+ DCD 54491 ; D4DB
+OC_C5S3
+ DCD 36410 ; 8E3A
+
+ ALIGN
+idct8
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r4-r11,r14}
+
+ LDRSH r2, [r1] ; r2 = x[0]
+ LDRSH r6, [r1, #8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #4] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #12] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #2] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #14] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #10] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #6] ; r1 = x[3]
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+
+ ; Stage 2
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+
+ ; Stage 3
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
+
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
+
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7]
+ ADD r6, r6, r10 ; r6 = t[1] + t[6]
+ ADD r3, r3, r14 ; r3 = t[2] + t[5]
+ ADD r4, r4, r9 ; r4 = t[3] + t[4]
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+
+ LDMFD r13!,{r4-r11,PC}
+idct8core
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r1,r14}
+
+ LDRSH r2, [r1] ; r2 = x[0]
+ LDRSH r6, [r1, #8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #4] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #12] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #2] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #14] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #10] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #6] ; r1 = x[3]
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+
+ ; Stage 2
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+
+ ; Stage 3
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
+
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
+
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7]
+ ADD r6, r6, r10 ; r6 = t[1] + t[6]
+ ADD r3, r3, r14 ; r3 = t[2] + t[5]
+ ADD r4, r4, r9 ; r4 = t[3] + t[4]
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+
+ LDMFD r13!,{r1,PC}
+idct8core_down
+ ; r0 = ogg_int16_t *y (destination)
+ ; r1 = const ogg_int16_t *x (source)
+ STMFD r13!,{r1,r14}
+
+ LDRSH r2, [r1] ; r2 = x[0]
+ LDRSH r6, [r1, #8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #4] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #12] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #2] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #14] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #10] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #6] ; r1 = x[3]
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+
+ ; Stage 2
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+
+ ; Stage 3
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
+
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
+
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7]
+ ADD r6, r6, r10 ; r6 = t[1] + t[6]
+ ADD r3, r3, r14 ; r3 = t[2] + t[5]
+ ADD r4, r4, r9 ; r4 = t[3] + t[4]
+ ADD r2, r2, #8
+ ADD r6, r6, #8
+ ADD r3, r3, #8
+ ADD r4, r4, #8
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
+ MOV r2, r2, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r4, r4, ASR #4
+ MOV r8, r8, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r14,r14,ASR #4
+ MOV r9, r9, ASR #4
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+
+ LDMFD r13!,{r1,PC}
+
+ END
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMidct.s
___________________________________________________________________
Added: svn:executable
+ *
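
For reference, the final stage of idct8core_down above corresponds to the
following C. The +8 bias and the ASR #4 together form the rounding
down-shift, and the STRH offsets (0, then 14, 30, 46, ... after the initial
post-increment of 2) walk one column of a row-major ogg_int16_t[8][8], i.e.
entries 16 bytes apart. This is an illustrative sketch, not the library's C
path; the variable names are invented here.

    /* Reference for the stage-4 butterflies and stores above; t[] holds
       the stage-3 outputs and y points at the top of one column of a
       row-major ogg_int16_t[8][8] (ogg_int16_t from <ogg/os_types.h>). */
    static void idct8_stage4_down(ogg_int16_t *y,const int t[8]){
      int i;
      for(i=0;i<4;i++){
        int a=t[i]+t[7-i]+8;             /* ADD ...,#8 rounding bias */
        int b=a-2*t[7-i];                /* SUB rX,rA,rB,LSL #1 */
        y[i<<3]=(ogg_int16_t)(a>>4);     /* y[i]   = t[i]+t[7-i]+8>>4 */
        y[(7-i)<<3]=(ogg_int16_t)(b>>4); /* y[7-i] = t[i]-t[7-i]+8>>4 */
      }
    }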
Added: branches/theorarm-merge-branch/lib/arm/ARMint.h
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMint.h (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMint.h 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,42 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_arm_ARMint_H)
+# define _arm_ARMint_H (1)
+# include "../internal.h"
+
+void oc_state_vtable_init_arm(oc_theora_state *_state);
+
+void oc_frag_copy_arm(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_arm(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_arm(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_arm(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_arm(void);
+
+#endif
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMint.h
___________________________________________________________________
Added: svn:executable
+ *
Added: branches/theorarm-merge-branch/lib/arm/ARMoffsets.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMoffsets.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMoffsets.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,37 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+FPLANE_NSBS * 0x1C
+FPLANE_SIZE * 0x20
+
+STATE_FPLANES * 0x48
+STATE_FRAGS * 0xA8
+STATE_NFRAGS * 0xB4
+STATE_SB_MAPS * 0xB8
+STATE_SB_FLAGS * 0xBC
+STATE_NSBS * 0xC0
+STATE_CODED_FRAGIS * 0xD8
+STATE_NCODED_FRAGIS * 0xDC
+STATE_NTOTAL_CODED_FRAGIS * 0xE8
+
+DEC_STATE_FPLANES * 0+STATE_FPLANES
+DEC_STATE_FRAGS * 0+STATE_FRAGS
+DEC_STATE_NFRAGS * 0+STATE_NFRAGS
+DEC_STATE_SB_MAPS * 0+STATE_SB_MAPS
+DEC_STATE_SB_FLAGS * 0+STATE_SB_FLAGS
+DEC_STATE_NSBS * 0+STATE_NSBS
+DEC_STATE_CODED_FRAGIS * 0+STATE_CODED_FRAGIS
+DEC_STATE_NCODED_FRAGIS * 0+STATE_NCODED_FRAGIS
+DEC_STATE_NTOTAL_CODED_FRAGIS * 0+STATE_NTOTAL_CODED_FRAGIS
+DEC_OPB * 0xC944
+
+CODED_FULLY_SHIFT * 0
+CODED_PARTIALLY_SHIFT * 1
+VALUE_BIT_SHIFT * 2
+CODED_FULLY * (1<<CODED_FULLY_SHIFT)
+CODED_PARTIALLY * (1<<CODED_PARTIALLY_SHIFT)
+
+FRAGMENT_CODED * 1
+
+ END
+
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMoffsets.s
___________________________________________________________________
Added: svn:executable
+ *
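
These constants (armasm's `*' is a synonym for EQU) are hand-computed byte
offsets into oc_theora_state and oc_fragment_plane, so they silently break
whenever internal.h changes layout or a different compiler packs the structs
differently. A minimal build-time guard could look like the sketch below; it
assumes the field names used elsewhere in this commit (fplanes, frags,
nfrags, sb_maps, sb_flags, nsbs, coded_fragis) and is hypothetical, not part
of the branch.

    /* Compile (never link) to catch drift between ARMoffsets.s and the
       real layout in internal.h; a negative array size is a hard error. */
    #include <stddef.h>
    #include "internal.h"
    #define OC_CHECK_OFFSET(_t,_f,_v) \
      typedef char oc_off_##_f[(offsetof(_t,_f)==(_v))?1:-1]
    OC_CHECK_OFFSET(oc_theora_state,fplanes,0x48);
    OC_CHECK_OFFSET(oc_theora_state,frags,0xA8);
    OC_CHECK_OFFSET(oc_theora_state,nfrags,0xB4);
    OC_CHECK_OFFSET(oc_theora_state,sb_maps,0xB8);
    OC_CHECK_OFFSET(oc_theora_state,sb_flags,0xBC);
    OC_CHECK_OFFSET(oc_theora_state,nsbs,0xC0);
    OC_CHECK_OFFSET(oc_theora_state,coded_fragis,0xD8);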
Added: branches/theorarm-merge-branch/lib/arm/ARMoptions.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMoptions.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMoptions.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,22 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+; Set the following to 1 if we have LDRD/STRD
+ARM_HAS_LDRD * 1
+
+; Set the following to 1 if we have ARMV6 or higher
+ARMV6 * 1
+
+; Set the following to 1 if we have NEON
+ARM_HAS_NEON * 1
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+ARM_CAN_UNALIGN * 0
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+ARM_CAN_UNALIGN_LDRD * 0
+
+QEMU * 0
+
+ END
+
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMoptions.s
___________________________________________________________________
Added: svn:executable
+ *
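
These flags are consumed with armasm's conditional-assembly directives,
which the files below use as `[' (IF), `|' (ELSE) and `]' (ENDIF). Note that
ARMpp.s opens its NEON section with `[ 0;ARM_HAS_NEON': the `;' starts a
comment, so the condition is the literal 0 and the unfinished NEON path
stays disabled regardless of the flag above. In C-preprocessor terms the
structure is roughly:

    #if 0 /* ARM_HAS_NEON */    /* armasm: [ 0;ARM_HAS_NEON */
      /* unfinished NEON path */
    #else                       /* armasm: | */
    # if ARMV6                  /* armasm: [ ARMV6 = 1 */
      /* ARMv6 SIMD path */
    # else                      /* armasm: | */
      /* generic ARMv4/v5 path */
    # endif                     /* armasm: ] */
    #endif                      /* armasm: ] */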
Added: branches/theorarm-merge-branch/lib/arm/ARMpp.s
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMpp.s (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMpp.s 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,878 @@
+; Theorarm library
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+ AREA |.text|, CODE, READONLY
+
+ GET ARMoptions.s
+
+ EXPORT oc_filter_hedge
+ EXPORT oc_filter_vedge
+ EXPORT oc_dering_block
+
+ [ 0;ARM_HAS_NEON
+ ; Unfinished
+oc_filter_hedge
+ ; r0 = uint8_t *rdst
+ ; r1 = int dst_ystride
+ ; r2 = const uint8_t *rsrc
+ ; r3 = int src_ystride
+ ; <> = int qstep
+ ; <> = int flimit
+ ; <> = int *variance0
+ ; <> = int *variance1
+ STMFD r13!,{r4-r11,r14}
+
+ ; variance0sum is 8*255 at most.
+
+ ; r14 will hold variance0sum in the bottom 12 bits, variance1sum
+ ; in the next 12 bits, and loop count in the top 8 bits.
+ MOV r14,#4<<28 ; r14= variancesums
+ofhs_lp
+ VLD1.64 {D0, D1 }, [r2], r3 ; Q0 = s[0]
+ VLD1.64 {D2, D3 }, [r2], r3 ; Q1 = s[1]
+ VLD1.64 {D4, D5 }, [r2], r3 ; Q2 = s[2]
+ VLD1.64 {D6, D7 }, [r2], r3 ; Q3 = s[3]
+ VLD1.64 {D8, D9 }, [r2], r3 ; Q4 = s[4]
+ VLD1.64 {D10,D11}, [r2], r3 ; Q5 = s[5]
+ VLD1.64 {D12,D13}, [r2], r3 ; Q6 = s[6]
+ VLD1.64 {D14,D15}, [r2], r3 ; Q7 = s[7]
+ VLD1.64 {D16,D17}, [r2], r3 ; Q8 = s[8]
+ VLD1.64 {D18,D19}, [r2], r3 ; Q9 = s[9]
+ VABDL.U8 Q10,D2, D0 ; Q10= abs(s[1]-s[0]) (bottoms)
+ VABDL.U8 Q11,D3, D1 ; Q11= abs(s[1]-s[0]) (tops)
+ VABDL.U8 Q14,D12,D10 ; Q14= abs(s[6]-s[5]) (bottoms)
+ VABDL.U8 Q15,D13,D11 ; Q15= abs(s[6]-s[5]) (tops)
+ VABAL.U8 Q10,D4, D2 ; Q10+=abs(s[2]-s[1]) (bottoms)
+ VABAL.U8 Q11,D5, D3 ; Q11+=abs(s[2]-s[1]) (tops)
+ VABAL.U8 Q14,D14,D12 ; Q14+=abs(s[7]-s[6]) (bottoms)
+ VABAL.U8 Q15,D15,D13 ; Q15+=abs(s[7]-s[6]) (tops)
+ VABAL.U8 Q10,D6, D4 ; Q10+=abs(s[3]-s[2]) (bottoms)
+ VABAL.U8 Q11,D7, D5 ; Q11+=abs(s[3]-s[2]) (tops)
+ VABAL.U8 Q14,D16,D14 ; Q14+=abs(s[8]-s[7]) (bottoms)
+ VABAL.U8 Q15,D17,D15 ; Q15+=abs(s[8]-s[7]) (tops)
+ VABAL.U8 Q10,D8, D6 ; Q10+=abs(s[4]-s[3]) (bottoms)
+ VABAL.U8 Q11,D9, D7 ; Q11+=abs(s[4]-s[3]) (tops)
+ VABAL.U8 Q14,D18,D16 ; Q14+=abs(s[9]-s[8]) (bottoms)
+ VABAL.U8 Q15,D19,D17 ; Q15+=abs(s[9]-s[8]) (tops)
+ VABDL.U8 Q12,D10,D8 ; Q12= abs(s[5]-s[4]) (bottoms)
+ VABDL.U8 Q13,D11,D9 ; Q13= abs(s[5]-s[4]) (tops)
+
+ ; Q10/11=num0 Q12/13=abs(s[5]-s[4]) Q14/15=sum1
+ MOV r9, #0
+ USADA8 r14,r9,r4,r14 ; r14=variance0sum+=sum of sum0's
+ USAD8 r9, r9,r6 ; r9 =sum of sum1's
+ ADD r14,r14,r9, LSL #12 ; r14=variance1sum+=sum of sum1's
+
+ LDR r7, [r13,#4*9] ; r7 = qstep
+ LDR r10,[r13,#4*10] ; r10= flimit
+ MOV r11,#0
+ ORR r7, r7, r7, LSL #8
+ ORR r7, r7, r7, LSL #16
+ ORR r10,r10,r10,LSL #8
+ ORR r10,r10,r10,LSL #16
+ USUB8 r9, r5, r7 ; Set GE bit if (abs(r[4]-r[5])>=qstep)
+ SEL r9, r11,r9 ; bytes are NE if (abs(r[4]-r[5])<qstep)
+ USUB8 r7, r4, r10 ; Set GE bit if (sum0>=flimit)
+ SEL r9, r11,r9 ; bytes are NE if (sum0<flimit) && above cond
+ USUB8 r7, r6, r10 ; Set GE bit if (sum1>=flimit)
+ SEL r9, r11,r9 ; bytes are NE if (sum1<flimit) && above cond
+
+ SUB r2, r2, r3, LSL #3
+ SUB r0, r0, r1, LSL #3
+
+ |
+ [ ARMV6 = 1
+oc_filter_hedge
+ ; r0 = uint8_t *rdst
+ ; r1 = int dst_ystride
+ ; r2 = const uint8_t *rsrc
+ ; r3 = int src_ystride
+ ; <> = int qstep
+ ; <> = int flimit
+ ; <> = int *variance0
+ ; <> = int *variance1
+ STMFD r13!,{r4-r11,r14}
+
+ ; variance0sum is 8*255 at most.
+
+ ; r14 will hold variance0sum in the bottom 12 bits, variance1sum
+ ; in the next 12 bits, and loop count in the top 8 bits.
+ MOV r14,#4<<28 ; r14= variancesums
+ofhs_lp
+ LDR r4, [r2], r3 ; r4 = s[0]
+ LDR r5, [r2], r3 ; r5 = s[1]
+ LDR r6, [r2], r3 ; r6 = s[2]
+ LDR r7, [r2], r3 ; r7 = s[3]
+ STR r5, [r0], r1 ; store s[1]
+ STR r6, [r0], r1 ; store s[2]
+ STR r7, [r0], r1 ; store s[3]
+ USUB8 r9, r4, r5
+ USUB8 r4, r5, r4
+ SEL r4, r4, r9 ; r4 = sum0 = abs(s[0]-s[1])
+ USUB8 r9, r5, r6
+ USUB8 r5, r6, r5
+ SEL r5, r5, r9 ; r5 = abs(s[2]-s[1]) in 4 bytes
+ UQADD8 r4, r4, r5 ; r4 = sum0 += abs(s[2]-s[1])
+ LDR r5, [r2], r3 ; r5 = s[4]
+ USUB8 r9, r6, r7
+ USUB8 r6, r7, r6
+ SEL r6, r6, r9 ; r6 = abs(s[3]-s[2]) in 4 bytes
+ UQADD8 r4, r4, r6 ; r4 = sum0 += abs(s[3]-s[2])
+ LDR r6, [r2], r3 ; r6 = s[5]
+ STR r5, [r0], r1 ; store s[4]
+ USUB8 r9, r7, r5
+ USUB8 r7, r5, r7
+ SEL r7, r7, r9 ; r7 = abs(s[4]-s[3]) in 4 bytes
+ UQADD8 r4, r4, r7 ; r4 = sum0 += abs(s[4]-s[3])
+ LDR r7, [r2], r3 ; r7 = s[6]
+ STR r6, [r0], r1 ; store s[5]
+ USUB8 r9, r5, r6
+ USUB8 r5, r6, r5
+ SEL r5, r5, r9 ; r5 = abs(s[5]-s[4]) in 4 bytes
+ LDR r10,[r2], r3 ; r10= s[7]
+ STR r7, [r0], r1 ; store s[6]
+ USUB8 r9, r6, r7
+ USUB8 r6, r7, r6
+ SEL r6, r6, r9 ; r6 = sum1 = abs(s[6]-s[5])
+ LDR r11,[r2], r3 ; r11= s[8]
+ STR r10,[r0], r1 ; store s[7]
+ USUB8 r9, r7, r10
+ USUB8 r7, r10,r7
+ SEL r7, r7, r9 ; r7 = abs(s[7]-s[6]) in 4 bytes
+ UQADD8 r6, r6, r7 ; r6 = sum1 += abs(s[7]-s[6])
+ LDR r7, [r2], -r3 ; r7 = s[9]
+ STR r11,[r0], r1 ; store s[8]
+ USUB8 r9, r10,r11
+ USUB8 r10,r11,r10
+ SEL r10,r10,r9 ; r10= abs(s[8]-s[7]) in 4 bytes
+ UQADD8 r6, r6, r10 ; r6 = sum1 += abs(s[8]-s[7])
+ USUB8 r9, r11,r7
+ USUB8 r11,r7, r11
+ SEL r11,r11,r9 ; r11= abs(s[9]-s[8]) in 4 bytes
+ UQADD8 r6, r6, r11 ; r6 = sum1 += abs(s[9]-s[8])
+
+ ; r4=sum0 r5=abs(s[5]-s[4]) r6=sum1
+ MOV r9, #0
+ USADA8 r14,r9, r4, r14 ; r14=variance0sum+=sum of sum0's
+ USAD8 r9, r9, r6 ; r9 =sum of sum1's
+ ADD r14,r14,r9, LSL #12 ; r14=variance1sum+=sum of sum1's
+
+ LDR r7, [r13,#4*9] ; r7 = qstep
+ LDR r10,[r13,#4*10] ; r10= flimit
+ MOV r11,#0
+ ORR r7, r7, r7, LSL #8
+ ORR r7, r7, r7, LSL #16
+ ORR r10,r10,r10,LSL #8
+ ORR r10,r10,r10,LSL #16
+ USUB8 r9, r5, r7 ; Set GE bit if (abs(r[4]-r[5])>=qstep)
+ SEL r9, r11,r9 ; bytes are NE if (abs(r[4]-r[5])<qstep)
+ USUB8 r7, r4, r10 ; Set GE bit if (sum0>=flimit)
+ SEL r9, r11,r9 ; bytes are NE if (sum0<flimit) && above cond
+ USUB8 r7, r6, r10 ; Set GE bit if (sum1>=flimit)
+ SEL r9, r11,r9 ; bytes are NE if (sum1<flimit) && above cond
+
+ SUB r2, r2, r3, LSL #3
+ SUB r0, r0, r1, LSL #3
+
+ STMFD r13!,{r9,r14}
+ TST r9,#0x000000FF
+ BLNE do_hedge
+ ADD r0, r0, #1
+ ADD r2, r2, #1
+ TST r9,#0x0000FF00
+ BLNE do_hedge
+ ADD r0, r0, #1
+ ADD r2, r2, #1
+ TST r9,#0x00FF0000
+ BLNE do_hedge
+ ADD r0, r0, #1
+ ADD r2, r2, #1
+ TST r9,#0xFF000000
+ BLNE do_hedge
+ ADD r0, r0, #1
+ ADD r2, r2, #1
+ LDMFD r13!,{r9,r14}
+
+ SUBS r14,r14,#4<<28
+ BGE ofhs_lp
+
+ LDR r4, [r13,#4*(9+2)] ; r4 = variance0
+ LDR r5, [r13,#4*(9+3)] ; r5 = variance1
+ MOV r12,r14,LSL #20 ; r12= variance0sum<<20
+ LDR r6, [r4] ; r6 = *variance0
+ LDR r7, [r5] ; r7 = *variance1
+ BIC r14,r14,#0xFF000000
+ ADD r6, r6, r12,LSR #20 ; r6 = *variance0 += variance0sum
+ ADD r7, r7, r14,LSR #12 ; r7 = *variance1 += variance1sum
+ STR r6, [r4] ; *variance0 = r6
+ STR r7, [r5] ; *variance1 = r7
+
+ LDMFD r13!,{r4-r11,PC}
+do_hedge
+ ; Do the filter...
+ LDRB r4, [r2], r3 ; r4 = r[0]
+ LDRB r5, [r2], r3 ; r5 = r[1]
+ LDRB r6, [r2], r3 ; r6 = r[2]
+ LDRB r7, [r2], r3 ; r7 = r[3]
+ LDRB r8, [r2], r3 ; r8 = r[4]
+ LDRB r9, [r2], r3 ; r9 = r[5]
+ LDRB r12,[r2], r3 ; r12= r[6]
+ ADD r10,r4, r5
+ ADD r10,r10,r6
+ ADD r10,r10,r7
+ ADD r10,r10,r8 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]
+ ADD r10,r10,#4 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r10,r4,LSL #1;r11= r[0]*3+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r11,r5 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r9 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r10,r4 ; r11= r[0]*2+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r11,r6 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r12 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ ADD r11,r10,r7 ; r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4
+ SUB r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ LDRB r4, [r2], r3 ; r4 = r[7]
+ MOV r11,r11,ASR #3;r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ ADD r11,r10,r8 ; r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4
+ SUB r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ LDRB r5, [r2], r3 ; r5 = r[8]
+ MOV r11,r11,ASR #3;r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ ADD r11,r10,r9 ; r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4
+ SUB r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ LDRB r6, [r2], -r3 ; r6 = r[9]
+ MOV r11,r11,ASR #3;r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r11,r10,r12 ; r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4
+ MOV r11,r11,ASR #3;r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4>>3
+ STRB r11,[r0], r1
+ SUB r10,r10,r7 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r10,r10,r6 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]*2+4
+ ADD r11,r10,r4 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4
+ MOV r11,r11,ASR #3 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3
+ STRB r11,[r0], r1
+ SUB r10,r10,r8
+ ADD r10,r10,r6 ; r10= r[5]+r[6]+r[7]+r[8]+r[9]*3+4
+ ADD r10,r10,r5 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4
+ MOV r10,r10,ASR #3 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3
+ STRB r10,[r0], r1
+ SUB r2, r2, r3, LSL #3
+ SUB r0, r0, r1, LSL #3
+
+ LDR r9,[r13]
+ MOV PC,R14
+ |
+oc_filter_hedge
+ ; r0 = uint8_t *rdst
+ ; r1 = int dst_ystride
+ ; r2 = const uint8_t *rsrc
+ ; r3 = int src_ystride
+ ; <> = int qstep
+ ; <> = int flimit
+ ; <> = int *variance0
+ ; <> = int *variance1
+ STMFD r13!,{r4-r11,r14}
+
+ ; variance0sum is 8*255 at most.
+
+ ; r14 will hold variance0sum in the bottom 12 bits, variance1sum
+ ; in the next 12 bits, and loop count in the top 8 bits.
+ MOV r14,#8<<24 ; r14= variancesums = 0 | (bx<<24)
+ SUB r0, r0, #1
+ SUB r2, r2, #1
+ofh_lp
+ SUBS r14,r14,#1<<24
+ BLT ofh_end
+ofh_lp2
+ ADD r0, r0, #1
+ ADD r2, r2, #1
+ LDRB r4, [r2], r3 ; r4 = r[0]
+ LDRB r5, [r2], r3 ; r5 = r[1]
+ LDRB r6, [r2], r3 ; r6 = r[2]
+ LDRB r7, [r2], r3 ; r7 = r[3]
+ STRB r5, [r0], r1 ; dst[1]
+ STRB r6, [r0], r1 ; dst[2]
+ STRB r7, [r0], r1 ; dst[3]
+ SUBS r4, r5, r4 ; r4 = r[1]-r[0]
+ RSBLT r4, r4, #0 ; r4 = sum0 = abs(r[1]-r[0])
+ SUBS r5, r6, r5 ; r5 = r[2]-r[1]
+ ADDGE r4, r4, r5
+ SUBLT r4, r4, r5 ; r4 = sum0 += abs(r[2]-r[1])
+ LDRB r5, [r2], r3 ; r5 = r[4]
+ SUBS r6, r7, r6 ; r6 = r[3]-r[2]
+ ADDGE r4, r4, r6
+ SUBLT r4, r4, r6 ; r4 = sum0 += abs(r[3]-r[2])
+ LDRB r6, [r2], r3 ; r6 = r[5]
+ STRB r5, [r0], r1 ; dst[4]
+ SUBS r7, r5, r7 ; r7 = r[4]-r[3]
+ ADDGE r4, r4, r7
+ SUBLT r4, r4, r7 ; r4 = sum0 += abs(r[4]-r[3])
+ LDRB r7, [r2], r3 ; r7 = r[6]
+ STRB r6, [r0], r1 ; dst[5]
+ SUBS r5, r6, r5 ; r5 = r[5]-r[4]
+ RSBLT r5, r5, #0 ; r5 = abs(r[5]-r[4])
+ LDRB r8, [r2], r3 ; r8 = r[7]
+ STRB r7, [r0], r1 ; dst[6]
+ SUBS r6, r6, r7 ; r6 = r[5]-r[6]
+ RSBLT r6, r6, #0 ; r6 = sum1 = abs(r[5]-r[6])
+ SUBS r7, r7, r8 ; r7 = r[6]-r[7]
+ LDRB r9, [r2], r3 ; r9 = r[8]
+ STRB r8, [r0], r1 ; dst[7]
+ ADDGE r6, r6, r7
+ SUBLT r6, r6, r7 ; r6 = sum1 += abs(r[6]-r[7])
+ SUBS r8, r8, r9 ; r8 = r[7]-r[8]
+ LDRB r7, [r2], -r3 ; r[9]
+ STRB r9, [r0], r1 ; dst[8]
+ SUB r2, r2, r3, LSL #3
+ SUB r0, r0, r1, LSL #3
+ ADDGE r6, r6, r8
+ SUBLT r6, r6, r8 ; r6 = sum1 += abs(r[7]-r[8])
+ SUBS r9, r9, r7 ; r9 = r[8]-r[9]
+ ADDGE r6, r6, r9
+ SUBLT r6, r6, r9 ; r6 = sum1 += abs(r[8]-r[9])
+
+ CMP r4, #255
+ ADDLT r14,r14,r4
+ ADDGE r14,r14,#255 ; variance0sum += min(255, sum0)
+
+ LDR r9, [r13,#4*9] ; r9 = qstep
+ LDR r10,[r13,#4*10] ; r10= flimit
+
+ CMP r6, #255
+ ADDLT r14,r14,r6, LSL #12
+ ADDGE r14,r14,#255<<12 ; variance1sum += min(255, sum1)
+
+ CMP r4, r10 ; if (sum0<flimit)
+ CMPLT r6, r10 ; &&(sum1<flimit)
+ CMPLT r5, r9 ; &&(abs(r[5]-r[4])<qstep)
+ BGE ofh_lp
+
+ ; Do the filter...
+ LDRB r4, [r2], r3 ; r4 = r[0]
+ LDRB r5, [r2], r3 ; r5 = r[1]
+ LDRB r6, [r2], r3 ; r6 = r[2]
+ LDRB r7, [r2], r3 ; r7 = r[3]
+ LDRB r8, [r2], r3 ; r8 = r[4]
+ LDRB r9, [r2], r3 ; r9 = r[5]
+ LDRB r12,[r2], r3 ; r12= r[6]
+ ADD r10,r4, r5
+ ADD r10,r10,r6
+ ADD r10,r10,r7
+ ADD r10,r10,r8 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]
+ ADD r10,r10,#4 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r10,r4,LSL #1;r11= r[0]*3+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r11,r5 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r9 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r10,r4 ; r11= r[0]*2+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r11,r6 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r12 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ ADD r11,r10,r7 ; r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4
+ SUB r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ LDRB r4, [r2], r3 ; r4 = r[7]
+ MOV r11,r11,ASR #3;r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ ADD r11,r10,r8 ; r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4
+ SUB r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ LDRB r5, [r2], r3 ; r5 = r[8]
+ MOV r11,r11,ASR #3;r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ ADD r11,r10,r9 ; r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4
+ SUB r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ LDRB r6, [r2], -r3 ; r6 = r[9]
+ MOV r11,r11,ASR #3;r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4>>3
+ STRB r11,[r0], r1
+ ADD r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r11,r10,r12 ; r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4
+ MOV r11,r11,ASR #3;r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4>>3
+ STRB r11,[r0], r1
+ SUB r10,r10,r7 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r10,r10,r6 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]*2+4
+ ADD r11,r10,r4 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4
+ MOV r11,r11,ASR #3 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3
+ STRB r11,[r0], r1
+ SUB r10,r10,r8
+ ADD r10,r10,r6 ; r10= r[5]+r[6]+r[7]+r[8]+r[9]*3+4
+ ADD r10,r10,r5 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4
+ MOV r10,r10,ASR #3 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3
+ STRB r10,[r0], r1
+ SUB r2, r2, r3, LSL #3
+ SUB r0, r0, r1, LSL #3
+
+ SUBS r14,r14,#1<<24
+ BGE ofh_lp2
+ofh_end
+ LDR r4, [r13,#4*(9+2)] ; r4 = variance0
+ LDR r5, [r13,#4*(9+3)] ; r5 = variance1
+ MOV r12,r14,LSL #20 ; r12= variance0sum<<20
+ LDR r6, [r4] ; r6 = *variance0
+ LDR r7, [r5] ; r7 = *variance1
+ BIC r14,r14,#0xFF000000
+ ADD r6, r6, r12,LSR #20 ; r6 = *variance0 += variance0sum
+ ADD r7, r7, r14,LSR #12 ; r7 = *variance1 += variance1sum
+ STR r6, [r4] ; *variance0 = r6
+ STR r7, [r5] ; *variance1 = r7
+
+ LDMFD r13!,{r4-r11,PC}
+ ]
+ ]
+oc_filter_vedge
+ ; r0 = uint8_t *rdst
+ ; r1 = int dst_ystride
+ ; r2 = int qstep
+ ; r3 = int flimit
+ ; <> = int *variances
+ STMFD r13!,{r4-r11,r14}
+
+ ; variance0sum is 8*255 at most.
+
+ ; r14 will hold variance0sum in the bottom 12 bits, variance1sum
+ ; in the next 12 bits, and loop count in the top 8 bits.
+ MOV r14,#8<<24 ; r14= variancesums = 0 | (bx<<24)
+ SUB r0, r0, r1
+ofv_lp
+ SUBS r14,r14,#1<<24
+ BLT ofv_end
+ofv_lp2
+ ADD r0, r0, r1
+ LDRB r4, [r0, #-1] ; r4 = r[0]
+ LDRB r5, [r0] ; r5 = r[1]
+ LDRB r6, [r0, #1] ; r6 = r[2]
+ LDRB r7, [r0, #2] ; r7 = r[3]
+ SUBS r4, r5, r4 ; r4 = r[1]-r[0]
+ RSBLT r4, r4, #0 ; r4 = sum0 = abs(r[1]-r[0])
+ SUBS r5, r6, r5 ; r5 = r[2]-r[1]
+ ADDGE r4, r4, r5
+ SUBLT r4, r4, r5 ; r4 = sum0 += abs(r[2]-r[1])
+ LDRB r5, [r0, #3] ; r5 = r[4]
+ SUBS r6, r7, r6 ; r6 = r[3]-r[2]
+ ADDGE r4, r4, r6
+ SUBLT r4, r4, r6 ; r4 = sum0 += abs(r[3]-r[2])
+ LDRB r6, [r0, #4] ; r6 = r[5]
+ SUBS r7, r5, r7 ; r7 = r[4]-r[3]
+ ADDGE r4, r4, r7
+ SUBLT r4, r4, r7 ; r4 = sum0 += abs(r[4]-r[3])
+ LDRB r7, [r0, #5] ; r7 = r[6]
+ SUBS r5, r6, r5 ; r5 = r[5]-r[4]
+ RSBLT r5, r5, #0 ; r5 = abs(r[5]-r[4])
+ LDRB r8, [r0, #6] ; r8 = r[7]
+ SUBS r6, r6, r7 ; r6 = r[5]-r[6]
+ RSBLT r6, r6, #0 ; r6 = sum1 = abs(r[5]-r[6])
+ SUBS r7, r7, r8 ; r7 = r[6]-r[7]
+ LDRB r9, [r0, #7] ; r9 = r[8]
+ ADDGE r6, r6, r7
+ SUBLT r6, r6, r7 ; r6 = sum1 += abs(r[6]-r[7])
+ SUBS r8, r8, r9 ; r8 = r[7]-r[8]
+ LDRB r7, [r0, #8] ; r[9]
+ ADDGE r6, r6, r8
+ SUBLT r6, r6, r8 ; r6 = sum1 += abs(r[7]-r[8])
+ SUBS r9, r9, r7 ; r9 = r[8]-r[9]
+ ADDGE r6, r6, r9
+ SUBLT r6, r6, r9 ; r6 = sum1 += abs(r[8]-r[9])
+
+ CMP r4, #255
+ ADDLT r14,r14,r4
+ ADDGE r14,r14,#255 ; variance0sum += min(255, sum0)
+
+ CMP r6, #255
+ ADDLT r14,r14,r6, LSL #12
+ ADDGE r14,r14,#255<<12 ; variance1sum += min(255, sum1)
+
+ CMP r4, r3 ; if (sum0<flimit)
+ CMPLT r6, r3 ; &&(sum1<flimit)
+ CMPLT r5, r2 ; &&(abs(r[5]-r[4])<qstep)
+ BGE ofv_lp
+
+ ; Do the filter...
+ LDRB r4, [r0, #-1] ; r4 = r[0]
+ LDRB r5, [r0] ; r5 = r[1]
+ LDRB r6, [r0, #1] ; r6 = r[2]
+ LDRB r7, [r0, #2] ; r7 = r[3]
+ LDRB r8, [r0, #3] ; r8 = r[4]
+ LDRB r9, [r0, #4] ; r9 = r[5]
+ LDRB r12,[r0, #5] ; r12= r[6]
+ ADD r10,r4, r5
+ ADD r10,r10,r6
+ ADD r10,r10,r7
+ ADD r10,r10,r8 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]
+ ADD r10,r10,#4 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r10,r4,LSL #1;r11= r[0]*3+r[1]+r[2]+r[3]+r[4]+4
+ ADD r11,r11,r5 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3
+ STRB r11,[r0]
+ ADD r10,r10,r9 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r10,r4 ; r11= r[0]*2+r[1]+r[2]+r[3]+r[4]+r[5]+4
+ ADD r11,r11,r6 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4
+ MOV r11,r11,ASR #3 ; r11= r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3
+ STRB r11,[r0, #1]
+ ADD r10,r10,r12 ; r10= r[0]+r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ ADD r11,r10,r7 ; r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4
+ SUB r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+4
+ LDRB r4, [r0, #6] ; r4 = r[7]
+ MOV r11,r11,ASR #3;r11= r[0]+r[1]+r[2]+r[3]*2+r[4]+r[5]+r[6]+4>>3
+ STRB r11,[r0, #2]
+ ADD r10,r10,r4 ; r10= r[1]+r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ ADD r11,r10,r8 ; r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4
+ SUB r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+4
+ LDRB r5, [r0, #7] ; r5 = r[8]
+ MOV r11,r11,ASR #3;r11= r[1]+r[2]+r[3]+r[4]*2+r[5]+r[6]+r[7]+4>>3
+ STRB r11,[r0, #3]
+ ADD r10,r10,r5 ; r10= r[2]+r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ ADD r11,r10,r9 ; r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4
+ SUB r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+4
+ LDRB r6, [r0, #8] ; r6 = r[9]
+ MOV r11,r11,ASR #3;r11= r[2]+r[3]+r[4]+r[5]*2+r[6]+r[7]+r[8]+4>>3
+ STRB r11,[r0, #4]
+ ADD r10,r10,r6 ; r10= r[3]+r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r11,r10,r12 ; r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4
+ MOV r11,r11,ASR #3;r11= r[3]+r[4]+r[5]+r[6]*3+r[7]+r[8]+r[9]+4>>3
+ STRB r11,[r0, #5]
+ SUB r10,r10,r7 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]+4
+ ADD r10,r10,r6 ; r10= r[4]+r[5]+r[6]+r[7]+r[8]+r[9]*2+4
+ ADD r11,r10,r4 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4
+ MOV r11,r11,ASR #3 ; r11= r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3
+ STRB r11,[r0, #6]
+ SUB r10,r10,r8
+ ADD r10,r10,r6 ; r10= r[5]+r[6]+r[7]+r[8]+r[9]*3+4
+ ADD r10,r10,r5 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4
+ MOV r10,r10,ASR #3 ; r10= r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3
+ STRB r10,[r0, #7]
+
+ SUBS r14,r14,#1<<24
+ BGE ofv_lp2
+ofv_end
+ LDR r4, [r13,#4*9] ; r4 = variances
+ MOV r12,r14,LSL #20 ; r12= variance0sum<<20
+ ; Stall on Xscale
+ LDR r6, [r4] ; r6 = variances[0]
+ LDR r7, [r4, #4] ; r7 = variances[1]
+ BIC r14,r14,#0xFF000000
+ ADD r6, r6, r12,LSR #20 ; r6 = variances[0] += variance0sum
+ ADD r7, r7, r14,LSR #12 ; r7 = variances[1] += variance1sum
+ STR r6, [r4] ; variances[0] = r6
+ STR r7, [r4, #4] ; variances[1] = r7
+
+ LDMFD r13!,{r4-r11,PC}
+
+oc_dering_block
+ ; r0 = unsigned char *dst
+ ; r1 = int ystride
+ ; r2 = int b
+ ; r3 = int dc_scale
+ ; r4 = int sharp_mod
+ ; r5 = int strong
+ STMFD r13!,{r4-r11,r14}
+
+ LDR r4, [r13,#4*9] ; r4 = sharp_mod
+ LDR r5, [r13,#4*10] ; r5 = strong
+
+ SUB r13,r13,#72*2 ; make space for vmod and hmod
+
+ ADD r7, r3, r3, LSL #1 ; r7 = 3*_dc_scale
+ MOV r6, #24
+ ADD r6, r6, r5, LSL #3 ; r6 = MOD_MAX[strong]
+ CMP r7, r6
+ MOVLT r6, r7 ; r6 = mod_hi=MIN(3*_dc_scale,r6)
+ ADD r3, r3, #96 ; r3 = _dc_scale += 96
+ RSB r5, r5, #1 ; r5 = strong=MOD_SHIFT[strong]
+ EOR r2, r2, #15 ; Reverse the sense of the bits
+
+ MOV r7, r0 ; r7 = src = dst
+ MOV r8, r0 ; r8 = psrc = src
+ TST r2, #4
+ SUBNE r8, r8, r1 ; r8 = psrc = src-(ystride&-!(b&4))
+ MOV r9, r13 ; r9 = vmod
+ MOV r14,#8 ; r14= by=8
+odb_lp1
+ MOV r12,#8 ; r12= bx=8
+odb_lp2
+ LDRB r10,[r7], #1 ; r10= *src++
+ LDRB r11,[r8], #1 ; r11= *psrc++
+ ; Stall (2 on Xscale)
+ SUBS r10,r10,r11 ; r10= *src++ - *psrc++
+ RSBLT r10,r10,#0 ; r10= abs(*src++ - *psrc++)
+ SUBS r10,r3, r10,LSL r5 ; r10= mod = dc_scale-(r10<<strong)
+ MOVLT r10,r4 ; if (mod<0) r10= mod = sharp_mod
+ BLT odb_sharp1 ; else ...
+ SUBS r10,r10,#64 ; r10 = mod-64
+ MOVLT r10,#0
+ CMP r10,r6
+ MOVGT r10,r6 ; r10= OC_CLAMPI(0,mod-64,mod_hi)
+odb_sharp1
+ STRB r10,[r9], #1 ; *pvmod++ = r10
+ SUBS r12,r12,#1
+ BGT odb_lp2
+ SUB r8, r7, #8 ; r8 = psrc = src-8
+ MOV r7, r8 ; r7 = src= psrc
+ TST r2, #8 ; if (b&8) (reversed earlier!)
+ TSTEQ r14,#0xFE ; || (by>1)
+ ADDNE r7, r7, r1 ; r7 = src= psrc+ystride&-(...)
+ SUBS r14,r14,#1
+ BGE odb_lp1
+
+ MOV r7, r0 ; r7 = src = dst
+ MOV r8, r0 ; r8 = psrc = src
+ TST r2, #1
+ SUBNE r8, r8, #1 ; r8 = psrc = src-(b&1)
+ ADD r9, r13,#72 ; r9 = hmod
+ MOV r14,#8 ; r14= bx=8
+odb_lp3
+ MOV r12,#8 ; r12= by=8
+odb_lp4
+ LDRB r10,[r7], r1 ; r10= *src src +=ystride
+ LDRB r11,[r8], r1 ; r11= *psrc psrc+=ystride
+ ; Stall (2 on Xscale)
+ SUBS r10,r10,r11 ; r10= *src - *psrc
+ RSBLT r10,r10,#0 ; r10= abs(*src - *psrc)
+ SUBS r10,r3, r10,LSL r5 ; r10= mod = dc_scale-(r10<<strong)
+ MOVLT r10,r4 ; if (mod<0) r10= mod = sharp_mod
+ BLT odb_sharp2 ; else ...
+ SUBS r10,r10,#64 ; r10 = mod-64
+ MOVLT r10,#0
+ CMP r10,r6
+ MOVGT r10,r6 ; r10= OC_CLAMPI(0,mod-64,mod_hi)
+odb_sharp2
+ STRB r10,[r9], #1 ; *phmod++ = r10
+ SUBS r12,r12,#1
+ BGT odb_lp4
+ SUB r8, r7, r1, LSL #3 ; r8 = psrc = src - (ystride<<3)
+ MOV r7, r8 ; r7 = src= psrc
+ TST r2, #2 ; if (b&2) (reversed earlier!)
+ TSTEQ r14,#0xFE ; || (bx>1)
+ ADDNE r7, r7, #1 ; r7 = src= psrc+((b&2)|(bx>1))
+ SUBS r14,r14,#1
+ BGE odb_lp3
+
+ ; r0 = src = dst
+ ; r1 = ystride
+ ; r2 = b
+ ADD r3, r0, r1 ; r3 = nsrc=src+ystride
+ MOV r4, r0 ; r4 = psrc=src
+ TST r2, #4
+ SUBNE r4, r4, r1 ; r4 = psrc=src-(ystride&-!(b&4))
+ MOV r5, r13 ; r5 = pvmod = vmod
+ ADD r6, r13, #72 ; r6 = phmod = hmod
+ MOV r14,#7 ; r14= by=7
+ MOV r8, #255 ; r8 = 255 = magic clipping constant
+odb_lp5
+ LDRSB r10,[r6], #8 ; r10= w0=*phmod phmod+=8
+ AND r9, r2, #1 ; r9 = (b&1)
+ LDRB r9, [r0, -r9] ; r9 = *(src-(b&1))
+ MOV r11,#64 ; r11= d = 64
+ LDRSB r7, [r5], #1 ; r7 = w1=*pvmod++
+ MLA r11,r9, r10,r11 ; r11= d+=w0 * *(src-(b&1))
+ LDRB r9, [r4], #1 ; r9 = *psrc++
+ RSB r12,r10,#128 ; r12= a = 128-w0
+ LDRSB r10,[r5, #7] ; r10= w2=pvmod[7]
+ MLA r11,r9, r7, r11 ; r11= d+=w1 * *psrc++
+ LDRB r9, [r3], #1 ; r9 = *nsrc++
+ SUB r12,r12,r7 ; r12= a-=w1
+ LDRSB r7, [r6], #8 ; r7 = w3=*phmod phmod+=8
+ MLA r11,r9, r10,r11 ; r11= d+=w2 * *nsrc++
+ LDRB r9, [r0, #1]! ; r9 = *++src
+ SUB r12,r12,r10 ; r12= a-=w2
+ LDRB r10,[r0, #-1] ; r10= src[-1]
+ MLA r11,r9, r7, r11 ; r11= d+=w3 * *++src
+ SUB r12,r12,r7 ; r12= a-=w3
+ MLA r11,r10,r12,r11 ; r11= a*src[-1]+d
+ MOVS r11,r11,ASR #7
+ CMPPL r8, r11
+ EORMI r11,r8, r11, ASR #32 ; r11= a=CLAMP(...)
+ STRB r11,[r0, #-1] ; src[-1]=a
+ SUB r14,r14,#6<<4 ; bx=6
+odb_lp6
+ ; r7 = w3
+ ; r11= a
+ MOV r12,#64
+ LDRSB r10,[r5], #1 ; r10= w0= *pvmod++
+ LDRB r9, [r4], #1 ; r9 = *psrc++
+ MLA r11,r7, r11,r12 ; r11= d = 64+w3*a
+ RSB r12,r7, #128 ; r12= a = 128-w3
+ LDRSB r7, [r5, #7] ; r7 = w1= pvmod[7]
+ MLA r11,r9, r10,r11 ; r11= d+= w0 * *psrc++
+ LDRB r9, [r3], #1 ; r9 = *nsrc++
+ SUB r12,r12,r10 ; r12= a -= w0
+ LDRSB r10, [r6], #8 ; r10= w3=*phmod phmod+=8
+ MLA r11,r9, r7, r11 ; r11= d+= w1 * *nsrc++
+ LDRB r9, [r0, #1]! ; r9 = *++src
+ SUB r12,r12,r7 ; r12= a -= w1
+ LDRB r7, [r0, #-1] ; r7 = src[-1]
+ MLA r11,r9, r10,r11 ; r11= d+= w3 * *++src
+ SUB r12,r12,r10 ; r12= a -= w3
+ MLA r11,r7, r12,r11 ; r11= d+= a * src[-1]
+ MOV r7, r10 ; r7 = w3
+ MOVS r11,r11,ASR #7
+ CMPPL r8, r11
+ EORMI r11,r8, r11,ASR #32 ; r11= a=CLAMP(...)
+ STRB r11,[r0, #-1] ; src[-1]=a
+ ADDS r14,r14,#1<<4
+ BLT odb_lp6
+
+ ; r7 = w3
+ ; r11= a
+ MOV r12,#64
+ LDRSB r10,[r5], #1 ; r10= w0= *pvmod++
+ LDRB r9, [r4], #1 ; r9 = *psrc++
+ MLA r11,r7, r11,r12 ; r11= d = 64+w3*a
+ RSB r12,r7, #128 ; r12= a = 128-w3
+ LDRSB r7, [r5, #7] ; r7 = w1= pvmod[7]
+ MLA r11,r9, r10,r11 ; r11= d+= w0 * *psrc++
+ LDRB r9, [r3], #-7 ; r9 = *nsrc nsrc-=7
+ SUB r12,r12,r10 ; r12= a -= w0
+ LDRSB r10, [r6], #-63 ; r10= w3=*phmod phmod-=63
+ MLA r11,r9, r7, r11 ; r11= d+= w1 * *nsrc
+ TST r2, #2 ; if (b&2) (reversed earlier!)
+ LDREQB r9, [r0] ; r9 = *src (clamped at edge)
+ LDRNEB r9, [r0, #1] ; else r9 = src[1]
+ SUB r12,r12,r7 ; r12= a -= w1
+ LDRB r7, [r0] ; r7 = src[0]
+ MLA r11,r9, r10,r11 ; r11= d+= w3 * src[(b&2)>>1]
+ SUB r12,r12,r10 ; r12= a -= w3
+ MLA r11,r7, r12,r11 ; r11= d+= a * src[0]
+ MOVS r11,r11,ASR #7
+ CMPPL r8, r11
+ EORMI r11,r8, r11,ASR #32 ; r11= a=CLAMP(...)
+ STRB r11,[r0] ; src[0]=a
+
+ SUB r4, r0, #7 ; r4 = psrc = src-7
+ MOV r0, r3 ; r0 = src = nsrc
+ TST r2, #8
+ TSTEQ r14,#0xFE
+ ADDNE r3, r3, r1
+
+ SUBS r14,r14,#1
+ BGE odb_lp5
+
+ ADD r13,r13,#72*2
+
+ LDMFD r13!,{r4-r11,PC}
+
+ [ 0 = 1
+
+;Some idle scribblings about doing hedge using SWAR
+ ; r10= 00FF00FF
+ ; r9 = 80008000
+
+ LDR r4, [r2], r3 ; r4 = r[0]
+ LDR r5, [r2], r3 ; r5 = r[1]
+ LDR r6, [r2], r3 ; r6 = r[2]
+ LDR r7, [r2], r3 ; r7 = r[3]
+ STR r5, [r0], r1 ; dst[1]
+ STR r6, [r0], r1 ; dst[2]
+ STR r7, [r0], r1 ; dst[3]
+ AND r4, r4, r10 ; r4 = ..aa..AA
+ AND r5, r5, r10 ; r5 = ..bb..BB
+ AND r6, r6, r10 ; r6 = ..cc..CC
+ AND r7, r7, r10 ; r7 = ..dd..DD
+ ORR r4, r4, r9 ; r4 = ^.aa^.AA
+ SUB r4, r4, r5 ; r4 = r[0]-r[1]
+ AND r12,r4, r9, LSR #1 ; r12= sign bits
+ SUB r4, r4, r12,LSR #14
+ SUB r12,r12,r12,LSR #14
+ EOR r4, r4, r12 ; r4 = abs(r[0]-r[1])
+
+ ORR r5, r5, r9 ; r5 = ^.bb^.BB
+ SUB r5, r5, r6 ; r5 = r[1]-r[2]
+ AND r12,r5, r9, LSR #1 ; r12= sign bits
+ SUB r5, r5, r12,LSR #14
+ SUB r12,r12,r12,LSR #14
+ EOR r5, r5, r12 ; r5 = abs(r[2]-r[1])
+ ADD r4, r4, r5
+
+ LDR r5, [r2], r3 ; r5 = r[4]
+ ORR r6, r6, r9 ; r6 = ^.cc^.CC
+ SUB r6, r6, r7 ; r6 = r[2]-r[3]
+ AND r12,r6, r9, LSR #1 ; r12= sign bits
+ SUB r6, r6, r12,LSR #14
+ SUB r12,r12,r12,LSR #14
+ EOR r6, r6, r12 ; r6 = abs(r[3]-r[2])
+ ADD r4, r4, r6
+
+ AND r5, r5, r10 ; r5 = ..ee..EE
+ ORR r7, r7, r9 ; r7 = ^.dd^.DD
+ SUB r7, r7, r5 ; r7 = r[3]-r[4]
+ AND r12,r7, r9, LSR #1 ; r12= sign bits
+ SUB r7, r7, r12,LSR #14
+ SUB r12,r12,r12,LSR #14
+ EOR r7, r7, r12 ; r7 = abs(r[4]-r[3])
+ ADD r4, r4, r7
+
+; Doesn't look like it'll pay off
+
+ ]
+
+ [ 0 = 1
+
+; Some idle scribblings about using ARMV6 SIMD for hedge
+
+ LDR r4, [r2], r3 ; r4 = s[0]
+ LDR r5, [r2], r3 ; r5 = s[1]
+ LDR r7, [r2], r3 ; r7 = s[2]
+ ADD r0, r0, r1
+ SSUB8 r9, r4, r5
+ STR r5, [r0], r1 ; store s[1]
+ SSUB8 r4, r5, r4
+ SEL r4, r4, r9 ; r4 = abs(s[0]-s[1]) in 4 bytes
+ AND r8, r6, r4, LSR #8 ; r8 = sum0 (for bytes 1 and 3)
+ AND r4, r6, r4 ; r4 = sum0 (for bytes 0 and 2)
+ SSUB8 r9, r5, r7
+ SSUB8 r5, r7, r5
+ SEL r5, r5, r9 ; r5 = abs(s[2]-s[1]) in 4 bytes
+ AND r9, r6, r5
+ ADD r4, r4, r9 ; r4 = sum0 (for bytes 0 and 2)
+ AND r9, r6, r5, LSR #8
+ LDR r5, [r2], r3 ; r5 = s[3]
+ STR r7, [r0], r1 ; store s[2]
+ ADD r8, r8, r9 ; r8 = sum0 (for bytes 1 and 3)
+ SSUB8 r9, r7, r5
+ SSUB8 r7, r5, r7
+ SEL r7, r7, r9 ; r7 = abs(s[3]-s[2]) in 4 bytes
+ AND r9, r6, r7
+ ADD r4, r4, r9 ; r4 = sum0 (for bytes 0 and 2)
+ AND r9, r6, r7, LSR #8
+ LDR r7, [r2], r3 ; r7 = s[4]
+ STR r5, [r0], r1 ; store s[3]
+ ADD r8, r8, r9 ; r8 = sum0 (for bytes 1 and 3)
+ SSUB8 r9, r5, r7
+ SSUB8 r5, r7, r5
+ SEL r5, r5, r9 ; r5 = abs(s[4]-s[3]) in 4 bytes
+ AND r9, r6, r5
+ ADD r4, r4, r9 ; r4 = sum0 (for bytes 0 and 2)
+ AND r9, r6, r5, LSR #8
+ LDR r5, [r2], r3 ; r5 = s[5]
+ STR r7, [r0], r1 ; store s[4]
+ ADD r8, r8, r9 ; r8 = sum0 (for bytes 1 and 3)
+ SSUB8 r9, r7, r5
+ SSUB8 r7, r5, r7
+ LDR r10,[r2], r3 ; r10= s[6]
+ STR r5, [r0], r1 ; store s[5]
+ SEL r7, r7, r9 ; r7 = abs(s[5]-s[4]) in 4 bytes
+ SSUB8 r9, r10,r5
+ SSUB8 r5, r5, r10
+ SEL r5, r5, r9 ; r5 = abs(s[6]-s[5]) in 4 bytes
+ AND r9, r6, r5 ; r9 = sum1 (for bytes 0 and 2)
+ LDR r11,[r2], r3 ; r11= s[7]
+ STR r10,[r0], r1 ; store s[6]
+ AND r5, r6, r5, LSR #8 ; r5 = sum1 (for bytes 1 and 3)
+ SSUB8 r9, r11,r10
+ SSUB8 r10,r10,r11
+ SEL r10,r10,r9 ; r10= abs(s[7]-s[6]) in 4 bytes
+ AND r12,r6, r10
+ ADD r9, r9,r12 ; r9 = sum1 (for bytes 0 and 2)
+ AND r12,r6, r10,LSR #8
+ ADD r5, r5,r12 ; r5 = sum1 (for bytes 1 and 3)
+ ]
+
+ END
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMpp.s
___________________________________________________________________
Added: svn:executable
+ *
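
A recurring trick in oc_filter_hedge/oc_filter_vedge above: instead of
spilling three loop-carried counters, variance0sum lives in bits 0-11 of
r14, variance1sum in bits 12-23, and the loop counter in the top byte, so
the loop can step with a single SUBS/BGE on the packed register. Each
per-column sum is clamped to 255 before accumulation, so the two 12-bit
fields can never carry into their neighbours (nor into the counter, which is
why the SUBS on the top byte still tracks only the count). A C sketch of the
packing and of the epilogue's unpacking shifts, with illustrative names:

    /* Pack two 12-bit sums and a loop counter into one 32-bit word. */
    unsigned vs_init(int loops){return (unsigned)loops<<24;}
    unsigned vs_add(unsigned r14,int sum0,int sum1){
      if(sum0>255)sum0=255;  /* per-iteration clamp, as in the code */
      if(sum1>255)sum1=255;
      return r14+(unsigned)sum0+((unsigned)sum1<<12);
    }
    void vs_read(unsigned r14,int *v0,int *v1){
      *v0=(int)((r14<<20)>>20);          /* MOV r12,r14,LSL #20 / LSR #20 */
      *v1=(int)((r14&~0xFF000000u)>>12); /* BIC #0xFF000000 then LSR #12 */
    }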
Added: branches/theorarm-merge-branch/lib/arm/ARMstate.c
===================================================================
--- branches/theorarm-merge-branch/lib/arm/ARMstate.c (rev 0)
+++ branches/theorarm-merge-branch/lib/arm/ARMstate.c 2010-03-07 17:59:26 UTC (rev 16954)
@@ -0,0 +1,40 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: $
+
+ ********************************************************************/
+
+#include "ARMint.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_state_vtable_init_arm(oc_theora_state *_state){
+ _state->opt_vtable.frag_copy=oc_frag_copy_arm;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+ _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+ _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_arm;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ oc_state_loop_filter_frag_rows_arm;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_arm;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
+}
+
+void oc_restore_fpu_arm(void)
+{
+}
+
+#endif
Property changes on: branches/theorarm-merge-branch/lib/arm/ARMstate.c
___________________________________________________________________
Added: svn:executable
+ *
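
ARMstate.c mirrors x86state.c: it only installs the accelerated entry
points. The matching dispatch presumably lands in lib/internal.c (in this
commit's modified-file list, but not shown in this mail); by analogy with
the existing x86 path it would look something like the sketch below, which
is a guess at the shape, not the committed code:

    void oc_state_vtable_init(oc_theora_state *_state){
    #if defined(OC_ARM_ASM)
      oc_state_vtable_init_arm(_state);
    #elif defined(OC_X86_ASM)
      oc_state_vtable_init_x86(_state);
    #else
      oc_state_vtable_init_c(_state);
    #endif
    }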
Modified: branches/theorarm-merge-branch/lib/decode.c
===================================================================
--- branches/theorarm-merge-branch/lib/decode.c 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/lib/decode.c 2010-03-07 17:59:26 UTC (rev 16954)
@@ -27,6 +27,18 @@
# include <cairo.h>
#endif
+#ifdef OC_ARM_ASM
+extern void oc_memzero_16_64ARM(ogg_uint16_t *);
+extern void oc_memzero_ptrdiff_64ARM(ptrdiff_t *);
+extern void oc_memset_al_mult8ARM(void *buffer, size_t size, int value);
+#define oc_memzero_16_64(B) oc_memzero_16_64ARM(B)
+#define oc_memzero_ptrdiff_64(B) oc_memzero_ptrdiff_64ARM(B)
+#define oc_memset_al_mult8(B,V,S) oc_memset_al_mult8ARM(B,S,V)
+#else
+#define oc_memzero_16_64(B) memset((void*)(B),0,64*sizeof(ogg_uint16_t))
+#define oc_memzero_ptrdiff_64(B) memset((void*)(B),0,64*sizeof(ptrdiff_t))
+#define oc_memset_al_mult8(B,V,S) memset((void*)(B),(V),(S))
+#endif
/*No post-processing.*/
#define OC_PP_LEVEL_DISABLED (0)
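
The point of these wrappers: the memset() calls they replace always clear a
fixed-size, word-aligned 64-entry block, which a hand-written ARM routine
can do with store-multiple bursts; the name oc_memset_al_mult8 suggests an
aligned buffer whose size is a multiple of 8 (a reading of the name, not
something stated here). Note the deliberate argument swap: the ARM routine
takes (buffer,size,value) while the macro keeps memset's (buffer,value,size)
order. With OC_ARM_ASM undefined the fallback expands to plain memset, e.g.:

    ptrdiff_t run_counts[64];
    oc_memzero_ptrdiff_64(run_counts);
    /* expands to: memset((void*)(run_counts),0,64*sizeof(ptrdiff_t)); */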
@@ -295,10 +307,12 @@
};
-
+#ifdef OC_ARM_ASM
+extern int oc_sb_run_unpack(oc_pack_buf *_opb);
+#else
static int oc_sb_run_unpack(oc_pack_buf *_opb){
long bits;
- int ret;
+ int adv, sub;
/*Coding scheme:
Codeword Run Length
0 1
@@ -308,32 +322,52 @@
11110xxx 10-17
111110xxxx 18-33
111111xxxxxxxxxxxx 34-4129*/
- bits=oc_pack_read1(_opb);
- if(bits==0)return 1;
- bits=oc_pack_read(_opb,2);
- if((bits&2)==0)return 2+(int)bits;
- else if((bits&1)==0){
- bits=oc_pack_read1(_opb);
- return 4+(int)bits;
+ bits=oc_pack_look(_opb,18);
+ adv=1;
+ sub=-1;
+ if (bits&0x20000)
+ {
+ adv = 3;
+ sub = 2;
+ if (bits&0x10000)
+ {
+ adv = 4;
+ sub = 8;
+ if (bits&0x08000)
+ {
+ adv = 6;
+ sub = 50;
+ if (bits&0x04000)
+ {
+ adv = 8;
+ sub = 230;
+ if (bits&0x02000)
+ {
+ adv = 10;
+ sub = 974;
+ if (bits&0x01000)
+ {
+ adv = 18;
+ sub = 258014;
+ }
+ }
+ }
+ }
+ }
}
- bits=oc_pack_read(_opb,3);
- if((bits&4)==0)return 6+(int)bits;
- else if((bits&2)==0){
- ret=10+((bits&1)<<2);
- bits=oc_pack_read(_opb,2);
- return ret+(int)bits;
- }
- else if((bits&1)==0){
- bits=oc_pack_read(_opb,4);
- return 18+(int)bits;
- }
- bits=oc_pack_read(_opb,12);
- return 34+(int)bits;
+ oc_pack_adv(_opb,adv);
+ bits = (bits>>(18-adv))-sub;
+ return bits;
}
+#endif
+#ifdef OC_ARM_ASM
+extern int oc_block_run_unpack(oc_pack_buf *_opb);
+#else
static int oc_block_run_unpack(oc_pack_buf *_opb){
long bits;
- long bits2;
+ int adv, sub;
+ /*long bits2;*/
/*Coding scheme:
Codeword Run Length
0x 1-2
@@ -342,26 +376,41 @@
1110xx 7-10
11110xx 11-14
11111xxxx 15-30*/
- bits=oc_pack_read(_opb,2);
- if((bits&2)==0)return 1+(int)bits;
- else if((bits&1)==0){
- bits=oc_pack_read1(_opb);
- return 3+(int)bits;
+ bits=oc_pack_look(_opb,9);
+ adv = 2;
+ sub = -1;
+ if(bits&0x100)
+ {
+ adv = 3;
+ sub = 1;
+ if (bits&0x080)
+ {
+ adv = 4;
+ sub = 7;
+ if (bits&0x040)
+ {
+ adv = 6;
+ sub = 49;
+ if (bits&0x020)
+ {
+ adv = 7;
+ sub = 109;
+ if (bits&0x010)
+ {
+ adv = 9;
+ sub = 481;
+ }
+ }
+ }
+ }
}
- bits=oc_pack_read(_opb,2);
- if((bits&2)==0)return 5+(int)bits;
- else if((bits&1)==0){
- bits=oc_pack_read(_opb,2);
- return 7+(int)bits;
- }
- bits=oc_pack_read(_opb,3);
- if((bits&4)==0)return 11+bits;
- bits2=oc_pack_read(_opb,2);
- return 15+((bits&3)<<2)+bits2;
+ oc_pack_adv(_opb,adv);
+ bits = (bits>>(9-adv))-sub;
+ return bits-1;
}
+#endif
-
static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
const th_setup_info *_setup){
int qti;
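
Both run-length decoders above are rewritten from a chain of sequential bit
reads into one oc_pack_look() plus a single oc_pack_adv(): for every row of
the code table, the codeword length adv and the subtrahend sub are chosen so
that value = (look>>(width-adv))-sub, where width is 18 for super-block runs
and 9 for block runs. (oc_block_run_unpack additionally returns length-1,
folding the caller's decrement for the current block into the decode, which
is why its callers now test --run_count<0.) A small illustrative check of
one row of the super-block table, assuming only the table in the comments
above:

    /* Codewords '111110xxxx' (adv=10, sub=974) must decode to runs 18..33
       when left-justified in an 18-bit lookahead.  Test sketch only. */
    #include <assert.h>
    int main(void){
      int x;
      for(x=0;x<16;x++){
        long bits=((0x3EL<<4|x)<<8); /* '111110xxxx' in the top 10 of 18 */
        assert((bits>>(18-10))-974==18+x);
      }
      return 0;
    }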
@@ -437,26 +486,21 @@
static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
- long val;
/*Check to make sure this is a data packet.*/
- val=oc_pack_read1(&_dec->opb);
- if(val!=0)return TH_EBADPACKET;
+ if(oc_pack_read1(&_dec->opb)!=0)
+ return TH_EBADPACKET;
/*Read in the frame type (I or P).*/
- val=oc_pack_read1(&_dec->opb);
- _dec->state.frame_type=(int)val;
+ _dec->state.frame_type=(int)oc_pack_read1(&_dec->opb);
/*Read in the qi list.*/
- val=oc_pack_read(&_dec->opb,6);
- _dec->state.qis[0]=(unsigned char)val;
- val=oc_pack_read1(&_dec->opb);
- if(!val)_dec->state.nqis=1;
+ _dec->state.qis[0]=(unsigned char)oc_pack_read(&_dec->opb,6);
+ if(!oc_pack_read1(&_dec->opb))
+ _dec->state.nqis=1;
else{
- val=oc_pack_read(&_dec->opb,6);
- _dec->state.qis[1]=(unsigned char)val;
- val=oc_pack_read1(&_dec->opb);
- if(!val)_dec->state.nqis=2;
+ _dec->state.qis[1]=(unsigned char)oc_pack_read(&_dec->opb,6);
+ if(!oc_pack_read1(&_dec->opb))
+ _dec->state.nqis=2;
else{
- val=oc_pack_read(&_dec->opb,6);
- _dec->state.qis[2]=(unsigned char)val;
+ _dec->state.qis[2]=(unsigned char)oc_pack_read(&_dec->opb,6);
_dec->state.nqis=3;
}
}
@@ -466,8 +510,8 @@
I don't know why these remain.*/
/*I wanted to eliminate wasted bits, but not all config wiggle room
--Monty.*/
- val=oc_pack_read(&_dec->opb,3);
- if(val!=0)return TH_EIMPL;
+ if(oc_pack_read(&_dec->opb,3)!=0)
+ return TH_EIMPL;
}
return 0;
}
@@ -482,200 +526,250 @@
const oc_sb_flags *sb_flags;
oc_fragment *frags;
ptrdiff_t *coded_fragis;
- ptrdiff_t ncoded_fragis;
- ptrdiff_t prev_ncoded_fragis;
- unsigned nsbs;
- unsigned sbi;
int pli;
coded_fragis=_dec->state.coded_fragis;
- prev_ncoded_fragis=ncoded_fragis=0;
sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
sb_flags=_dec->state.sb_flags;
frags=_dec->state.frags;
- sbi=nsbs=0;
for(pli=0;pli<3;pli++){
- nsbs+=_dec->state.fplanes[pli].nsbs;
- for(;sbi<nsbs;sbi++){
+ ptrdiff_t ncoded_fragis=0;
+ unsigned nsbs=_dec->state.fplanes[pli].nsbs;
+ for(;nsbs>0;nsbs--){
int quadi;
- for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
- int bi;
- for(bi=0;bi<4;bi++){
- ptrdiff_t fragi;
- fragi=sb_maps[sbi][quadi][bi];
- if(fragi>=0){
- frags[fragi].coded=1;
- frags[fragi].mb_mode=OC_MODE_INTRA;
- coded_fragis[ncoded_fragis++]=fragi;
+ const ptrdiff_t *fragip=&(*sb_maps++)[0][0];
+ int flags=(sb_flags++)->quad_valid;
+ for(quadi=4;quadi>0;quadi--){
+ if(flags&1){
+ int bi;
+ for(bi=4;bi>0;bi--){
+ ptrdiff_t fragi;
+ fragi=*fragip++;
+ if(fragi>=0){
+ frags[fragi].coded=1;
+ frags[fragi].mb_mode=OC_MODE_INTRA;
+ *coded_fragis++=fragi;
+ ncoded_fragis++;
+ }
}
- }
+ }else
+ fragip+=4;
+ flags>>=1;
}
}
- _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
- prev_ncoded_fragis=ncoded_fragis;
+ _dec->state.ncoded_fragis[pli]=ncoded_fragis;
}
- _dec->state.ntotal_coded_fragis=ncoded_fragis;
+ _dec->state.ntotal_coded_fragis=coded_fragis-_dec->state.coded_fragis;
}
/*Decodes the bit flags indicating whether each super block is partially coded
or not.
Return: The number of partially coded super blocks.*/
+#ifndef OC_ARM_ASM
static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
oc_sb_flags *sb_flags;
unsigned nsbs;
unsigned sbi;
unsigned npartial;
unsigned run_count;
- long val;
int flag;
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
+ int full_run;
+
sb_flags=_dec->state.sb_flags;
nsbs=_dec->state.nsbs;
- sbi=npartial=0;
- while(sbi<nsbs){
- int full_run;
+ npartial=0;
+ full_run=1;
+ while(nsbs>0){
+ if(full_run){
+ flag=(int)oc_pack_read1(&_dec->opb);
+ }
+ else
+ flag=!flag;
run_count=oc_sb_run_unpack(&_dec->opb);
full_run=run_count>=4129;
+ run_count=OC_MINI(run_count,nsbs);
+ nsbs-=run_count;
+ if(flag)npartial+=run_count;
do{
- sb_flags[sbi].coded_partially=flag;
- sb_flags[sbi].coded_fully=0;
- npartial+=flag;
- sbi++;
+ sb_flags->coded_partially=flag;
+ (sb_flags++)->coded_fully=0;
}
- while(--run_count>0&&sbi<nsbs);
- if(full_run&&sbi<nsbs){
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
- }
- else flag=!flag;
+ while(--run_count>0);
}
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
return npartial;
}
+#endif
/*Decodes the bit flags for whether or not each non-partially-coded super
block is fully coded or not.
This function should only be called if there is at least one
non-partially-coded super block.
Return: The number of partially coded super blocks.*/
+#ifndef OC_ARM_ASM
static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){
oc_sb_flags *sb_flags;
- unsigned nsbs;
- unsigned sbi;
+ oc_sb_flags *sb_flags_end;
unsigned run_count;
- long val;
int flag;
+ int full_run;
sb_flags=_dec->state.sb_flags;
- nsbs=_dec->state.nsbs;
+ sb_flags_end=sb_flags+_dec->state.nsbs;
/*Skip partially coded super blocks.*/
- for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
+ while((sb_flags++)->coded_partially);
+ sb_flags--;
+ full_run=1;
do{
- int full_run;
+ if (full_run){
+ flag=(int)oc_pack_read1(&_dec->opb);
+ }
+ else
+ flag=!flag;
run_count=oc_sb_run_unpack(&_dec->opb);
full_run=run_count>=4129;
- for(;sbi<nsbs;sbi++){
- if(sb_flags[sbi].coded_partially)continue;
- if(run_count--<=0)break;
- sb_flags[sbi].coded_fully=flag;
+ for(;sb_flags!=sb_flags_end;sb_flags++){
+ if(sb_flags->coded_partially)
+ continue;
+ if(run_count--<=0)
+ break;
+ sb_flags->coded_fully=flag;
}
- if(full_run&&sbi<nsbs){
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
- }
- else flag=!flag;
}
- while(sbi<nsbs);
+ while(sb_flags!=sb_flags_end);
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
}
+#endif
+#ifdef OC_ARM_ASM
+extern void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec);
+#else
static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
const oc_sb_map *sb_maps;
const oc_sb_flags *sb_flags;
oc_fragment *frags;
unsigned nsbs;
- unsigned sbi;
unsigned npartial;
- long val;
int pli;
int flag;
int run_count;
ptrdiff_t *coded_fragis;
ptrdiff_t *uncoded_fragis;
- ptrdiff_t ncoded_fragis;
- ptrdiff_t nuncoded_fragis;
- ptrdiff_t prev_ncoded_fragis;
npartial=oc_dec_partial_sb_flags_unpack(_dec);
if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
if(npartial>0){
- val=oc_pack_read1(&_dec->opb);
- flag=!(int)val;
+ flag=!(int)oc_pack_read1(&_dec->opb);
}
- else flag=0;
+ else
+ flag=0;
sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
sb_flags=_dec->state.sb_flags;
frags=_dec->state.frags;
- sbi=nsbs=run_count=0;
+ run_count=0;
coded_fragis=_dec->state.coded_fragis;
uncoded_fragis=coded_fragis+_dec->state.nfrags;
- prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
for(pli=0;pli<3;pli++){
- nsbs+=_dec->state.fplanes[pli].nsbs;
- for(;sbi<nsbs;sbi++){
+ ptrdiff_t ncoded_fragis=0;
+ nsbs=_dec->state.fplanes[pli].nsbs;
+ for(;nsbs!=0;nsbs--){
+ const ptrdiff_t *fragip=&(*sb_maps++)[0][0];
int quadi;
- for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
- int bi;
- for(bi=0;bi<4;bi++){
- ptrdiff_t fragi;
- fragi=sb_maps[sbi][quadi][bi];
- if(fragi>=0){
- int coded;
- if(sb_flags[sbi].coded_fully)coded=1;
- else if(!sb_flags[sbi].coded_partially)coded=0;
- else{
- if(run_count<=0){
- run_count=oc_block_run_unpack(&_dec->opb);
- flag=!flag;
+ int flags=(sb_flags++)->quad_valid;
+ if(sb_flags[-1].coded_fully){
+ for(quadi=4;quadi>0;quadi--){
+ if(flags&1){
+ int bi;
+ for(bi=4;bi>0;bi--){
+ ptrdiff_t fragi;
+ fragi=*fragip++;
+ if(fragi>=0){
+ *coded_fragis++=fragi;ncoded_fragis++;
+ frags[fragi].coded=1;
}
- run_count--;
- coded=flag;
}
- if(coded)coded_fragis[ncoded_fragis++]=fragi;
- else *(uncoded_fragis-++nuncoded_fragis)=fragi;
- frags[fragi].coded=coded;
}
+ else{
+ fragip+=4;
+ }
+ flags>>=1;
}
}
+ else if(!sb_flags[-1].coded_partially){
+ for(quadi=4;quadi>0;quadi--){
+ if(flags&1){
+ int bi;
+ for(bi=4;bi>0;bi--){
+ ptrdiff_t fragi;
+ fragi=*fragip++;
+ if(fragi>=0){
+ *(--uncoded_fragis)=fragi;
+ frags[fragi].coded=0;
+ }
+ }
+ }
+ else{
+ fragip+=4;
+ }
+ flags>>=1;
+ }
+ }
+ else
+ {
+ for(quadi=4;quadi>0;quadi--){
+ if(flags&1){
+ int bi;
+ for(bi=4;bi>0;bi--){
+ ptrdiff_t fragi;
+ fragi=*fragip++;
+ if(fragi>=0){
+ if(--run_count<0){
+ run_count=oc_block_run_unpack(&_dec->opb);
+ flag=!flag;
+ }
+ if(flag){*coded_fragis++=fragi;ncoded_fragis++;}
+ else *(--uncoded_fragis)=fragi;
+ frags[fragi].coded=flag;
+ }
+ }
+ }
+ else{
+ fragip+=4;
+ }
+ flags>>=1;
+ }
+ }
}
- _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
- prev_ncoded_fragis=ncoded_fragis;
+ _dec->state.ncoded_fragis[pli]=ncoded_fragis;
}
- _dec->state.ntotal_coded_fragis=ncoded_fragis;
+ _dec->state.ntotal_coded_fragis=coded_fragis-_dec->state.coded_fragis;
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
}
+#endif
-
typedef int (*oc_mode_unpack_func)(oc_pack_buf *_opb);
static int oc_vlc_mode_unpack(oc_pack_buf *_opb){
- long val;
- int i;
- for(i=0;i<7;i++){
- val=oc_pack_read1(_opb);
- if(!val)break;
+ int bits=~oc_pack_look(_opb,7);
+ int i=0;
+ if((bits&0x78)==0){
+ i=4;
+ bits<<=4;
}
+ if((bits&0x60)==0){
+ i+=2;
+ bits<<=2;
+ }
+ if((bits&0x40)==0){
+ i+=1;
+ }
+ oc_pack_adv(_opb,((i==7)?7:i+1));
return i;
}
static int oc_clc_mode_unpack(oc_pack_buf *_opb){
- long val;
- val=oc_pack_read(_opb,3);
- return (int)val;
+ return (int)oc_pack_read(_opb,3);
}
/*Unpacks the list of macro block modes for INTER frames.*/
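
The flag-run loops above (partial and fully-coded super-block flags, and the
qii pass later) all follow the same restructuring: full_run is primed to 1
so the first iteration reads an explicit flag, the flag thereafter toggles
unless the previous run hit the maximum length (4129), the run is clamped to
what remains, and per-item bookkeeping (npartial, nqi1) is hoisted out of
the inner loop. Schematically, with emit() and the tally standing in for the
per-item work (a sketch, not library code):

    full_run=1;
    while(remaining>0){
      if(full_run)flag=(int)oc_pack_read1(&_dec->opb);
      else flag=!flag;
      run_count=oc_sb_run_unpack(&_dec->opb);
      full_run=run_count>=4129;
      run_count=OC_MINI(run_count,remaining);
      remaining-=run_count;
      if(flag)tally+=run_count;   /* hoisted per-run bookkeeping */
      do emit(flag); while(--run_count>0);
    }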
@@ -688,10 +782,8 @@
oc_mode_unpack_func mode_unpack;
size_t nmbs;
size_t mbi;
- long val;
int mode_scheme;
- val=oc_pack_read(&_dec->opb,3);
- mode_scheme=(int)val;
+ mode_scheme=(int)oc_pack_read(&_dec->opb,3);
if(mode_scheme==0){
int mi;
/*Just in case, initialize the modes to something.
@@ -699,10 +791,11 @@
corrupt and the rest of the packet is garbage anyway, but this way we
won't crash, and we'll decode SOMETHING.*/
/*LOOP VECTORIZES*/
- for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
- for(mi=0;mi<OC_NMODES;mi++){
- val=oc_pack_read(&_dec->opb,3);
- scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
+ unsigned char *alp = scheme0_alphabet;
+ for(mi=OC_NMODES;mi>0;mi--)*alp++=OC_MODE_INTER_NOMV;
+ alphabet=&OC_MODE_ALPHABETS[6][0];
+ for(mi=OC_NMODES;mi>0;mi--){
+ scheme0_alphabet[oc_pack_read(&_dec->opb,3)]=*alphabet++;
}
alphabet=scheme0_alphabet;
}
@@ -713,16 +806,19 @@
mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
nmbs=_dec->state.nmbs;
frags=_dec->state.frags;
- for(mbi=0;mbi<nmbs;mbi++){
- if(mb_modes[mbi]!=OC_MODE_INVALID){
- int bi;
+ for(;nmbs>0;nmbs--){
+ if(*mb_modes++!=OC_MODE_INVALID){
/*Check for a coded luma block in this macro block.*/
- for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
- /*We found one, decode a mode.*/
- if(bi<4)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)];
+ if (frags[mb_maps[0][0][0]].coded ||
+ frags[mb_maps[0][0][1]].coded ||
+ frags[mb_maps[0][0][2]].coded ||
+ frags[mb_maps[0][0][3]].coded)
+ /*We found one, decode a mode.*/
+ mb_modes[-1]=alphabet[(*mode_unpack)(&_dec->opb)];
/*There were none: INTER_NOMV is forced.*/
- else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+ else mb_modes[-1]=OC_MODE_INTER_NOMV;
}
+ mb_maps++;
}
}
@@ -779,17 +875,15 @@
oc_mv *frag_mvs;
const unsigned char *map_idxs;
int map_nidxs;
- oc_mv last_mv[2];
- oc_mv cbmvs[4];
+ oc_mv2 last_mv;
+ oc_mv4 cbmvs;
size_t nmbs;
size_t mbi;
- long val;
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
- val=oc_pack_read1(&_dec->opb);
- mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
+ mv_comp_unpack=oc_pack_read1(&_dec->opb)?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
- memset(last_mv,0,sizeof(last_mv));
+ ZERO_MV2(last_mv);
frags=_dec->state.frags;
frag_mvs=_dec->state.frag_mvs;
mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
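
The memcpy()/memset() calls on two-byte motion vectors are replaced with
struct assignment, which the compiler can do as a register move. The oc_mv2
and oc_mv4 types and the COPY_MV/ZERO_MV/ZERO_MV2 macros come from this
branch's internal.h (in the modified-file list, but not shown in this mail);
a plausible shape, offered only as a guess consistent with the usage below,
is:

    typedef struct{signed char v[2];}oc_mv; /* was: signed char oc_mv[2] */
    typedef struct{oc_mv v[2];}oc_mv2;
    typedef struct{oc_mv v[4];}oc_mv4;
    #define COPY_MV(_dst,_src) ((_dst)=(_src))
    #define ZERO_MV(_mv)       ((_mv).v[0]=(_mv).v[1]=0)
    #define ZERO_MV2(_mv)      (ZERO_MV((_mv).v[0]),ZERO_MV((_mv).v[1]))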
@@ -797,7 +891,7 @@
nmbs=_dec->state.nmbs;
for(mbi=0;mbi<nmbs;mbi++){
int mb_mode;
- mb_mode=mb_modes[mbi];
+ mb_mode=*mb_modes++;
if(mb_mode!=OC_MODE_INVALID){
oc_mv mbmv;
ptrdiff_t fragi;
@@ -806,72 +900,74 @@
int ncoded;
int mapi;
int mapii;
+ const ptrdiff_t *mb_maps_p=&mb_maps[mbi][0][0];
/*Search for at least one coded fragment.*/
ncoded=mapii=0;
do{
- mapi=map_idxs[mapii];
- fragi=mb_maps[mbi][mapi>>2][mapi&3];
+ mapi=*map_idxs++;
+ fragi=mb_maps_p[mapi];
if(frags[fragi].coded)coded[ncoded++]=mapi;
}
while(++mapii<map_nidxs);
+ map_idxs-=map_nidxs;
if(ncoded<=0)continue;
switch(mb_mode){
case OC_MODE_INTER_MV_FOUR:{
- oc_mv lbmvs[4];
+ oc_mv4 lbmvs;
int bi;
/*Mark the tail of the list, so we don't accidentally go past it.*/
coded[ncoded]=-1;
for(bi=codedi=0;bi<4;bi++){
if(coded[codedi]==bi){
codedi++;
- fragi=mb_maps[mbi][0][bi];
+ fragi=mb_maps_p[bi];
frags[fragi].mb_mode=mb_mode;
- lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- memcpy(frag_mvs[fragi],lbmvs[bi],sizeof(lbmvs[bi]));
+ frag_mvs[fragi].v[0]=lbmvs.v[bi].v[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ frag_mvs[fragi].v[1]=lbmvs.v[bi].v[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
}
- else lbmvs[bi][0]=lbmvs[bi][1]=0;
+ else ZERO_MV(lbmvs.v[bi]);
}
if(codedi>0){
- memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
- memcpy(last_mv[0],lbmvs[coded[codedi-1]],sizeof(last_mv[0]));
+ COPY_MV(last_mv.v[1],last_mv.v[0]);
+ COPY_MV(last_mv.v[0],lbmvs.v[coded[codedi-1]]);
}
if(codedi<ncoded){
- (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+ (*set_chroma_mvs)(&cbmvs,&lbmvs);
for(;codedi<ncoded;codedi++){
mapi=coded[codedi];
- bi=mapi&3;
- fragi=mb_maps[mbi][mapi>>2][bi];
+ fragi=mb_maps_p[mapi];
frags[fragi].mb_mode=mb_mode;
- memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(cbmvs[bi]));
+ COPY_MV(frag_mvs[fragi],cbmvs.v[mapi&3]);
}
}
}break;
case OC_MODE_INTER_MV:{
- memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
- mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ COPY_MV(last_mv.v[1],last_mv.v[0]);
+ mbmv.v[0]=last_mv.v[0].v[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ mbmv.v[1]=last_mv.v[0].v[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
}break;
- case OC_MODE_INTER_MV_LAST:memcpy(mbmv,last_mv[0],sizeof(mbmv));break;
+ case OC_MODE_INTER_MV_LAST:{
+ COPY_MV(mbmv,last_mv.v[0]);
+ }break;
case OC_MODE_INTER_MV_LAST2:{
- memcpy(mbmv,last_mv[1],sizeof(mbmv));
- memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
- memcpy(last_mv[0],mbmv,sizeof(last_mv[0]));
+ COPY_MV(mbmv,last_mv.v[1]);
+ COPY_MV(last_mv.v[1],last_mv.v[0]);
+ COPY_MV(last_mv.v[0],mbmv);
}break;
case OC_MODE_GOLDEN_MV:{
- mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ mbmv.v[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ mbmv.v[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
}break;
- default:memset(mbmv,0,sizeof(mbmv));break;
+ default:ZERO_MV(mbmv);break;
}
/*4MV mode fills in the fragments itself.
For all other modes we can use this common code.*/
if(mb_mode!=OC_MODE_INTER_MV_FOUR){
for(codedi=0;codedi<ncoded;codedi++){
mapi=coded[codedi];
- fragi=mb_maps[mbi][mapi>>2][mapi&3];
+ fragi=mb_maps_p[mapi];
frags[fragi].mb_mode=mb_mode;
- memcpy(frag_mvs[fragi],mbmv,sizeof(mbmv));
+ COPY_MV(frag_mvs[fragi],mbmv);
}
}
}
@@ -881,8 +977,8 @@
static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
oc_fragment *frags;
const ptrdiff_t *coded_fragis;
+ const ptrdiff_t *coded_fragis_end;
ptrdiff_t ncoded_fragis;
- ptrdiff_t fragii;
ptrdiff_t fragi;
ncoded_fragis=_dec->state.ntotal_coded_fragis;
if(ncoded_fragis<=0)return;
@@ -891,15 +987,15 @@
if(_dec->state.nqis==1){
/*If this frame has only a single qi value, then just use it for all coded
fragments.*/
- for(fragii=0;fragii<ncoded_fragis;fragii++){
- frags[coded_fragis[fragii]].qii=0;
+ for(;ncoded_fragis>0;ncoded_fragis--){
+ frags[*coded_fragis++].qii=0;
}
}
else{
- long val;
int flag;
int nqi1;
int run_count;
+ int full_run;
/*Otherwise, we decode a qi index for each fragment, using two passes of
the same binary RLE scheme used for super-block coded bits.
The first pass marks each fragment as having a qii of 0 or greater than
@@ -908,51 +1004,51 @@
At first we just store the qii in the fragment.
After all the qii's are decoded, we make a final pass to replace them
with the corresponding qi's for this frame.*/
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
nqi1=0;
- fragii=0;
- while(fragii<ncoded_fragis){
- int full_run;
+ full_run=1;
+ while(ncoded_fragis>0){
+ if(full_run){
+ flag=(int)oc_pack_read1(&_dec->opb);
+ }
+ else flag=!flag;
run_count=oc_sb_run_unpack(&_dec->opb);
full_run=run_count>=4129;
+ if (run_count>ncoded_fragis)
+ run_count=ncoded_fragis;
+ ncoded_fragis-=run_count;
+ nqi1+=flag;
do{
- frags[coded_fragis[fragii++]].qii=flag;
- nqi1+=flag;
+ frags[*coded_fragis++].qii=flag;
}
- while(--run_count>0&&fragii<ncoded_fragis);
- if(full_run&&fragii<ncoded_fragis){
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
- }
- else flag=!flag;
+ while(--run_count>0);
}
+ ncoded_fragis=_dec->state.ntotal_coded_fragis;
+ coded_fragis-=ncoded_fragis;
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
/*If we have 3 different qi's for this frame, and there was at least one
fragment with a non-zero qi, make the second pass.*/
if(_dec->state.nqis==3&&nqi1>0){
+      coded_fragis_end=coded_fragis+ncoded_fragis;
/*Skip qii==0 fragments.*/
- for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++);
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
+ for(;frags[*coded_fragis++].qii==0;);
+ coded_fragis--;
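+      /*The scan above overshoots the first qii!=0 fragment by one entry;
+         step back to it.*/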
+ full_run=1;
do{
- int full_run;
+        if(full_run){
+          flag=(int)oc_pack_read1(&_dec->opb);
+        }
+        else flag=!flag;
run_count=oc_sb_run_unpack(&_dec->opb);
full_run=run_count>=4129;
- for(;fragii<ncoded_fragis;fragii++){
- fragi=coded_fragis[fragii];
+      for(;coded_fragis<coded_fragis_end;coded_fragis++){
+ fragi=*coded_fragis;
if(frags[fragi].qii==0)continue;
if(run_count--<=0)break;
frags[fragi].qii+=flag;
}
- if(full_run&&fragii<ncoded_fragis){
- val=oc_pack_read1(&_dec->opb);
- flag=(int)val;
- }
- else flag=!flag;
}
- while(fragii<ncoded_fragis);
+ while(coded_fragis<coded_fragis_end);
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
}
@@ -989,16 +1085,16 @@
ptrdiff_t eobi;
int rli;
ncoded_fragis+=_dec->state.ncoded_fragis[pli];
- memset(run_counts,0,sizeof(run_counts));
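+    /*Equivalent to memset(run_counts,0,sizeof(run_counts)), using a
+       helper that zeroes the 64 ptrdiff_t entries at once.*/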
+ oc_memzero_ptrdiff_64(run_counts);
_dec->eob_runs[pli][0]=eobs;
_dec->ti0[pli][0]=ti;
/*Continue any previous EOB run, if there was one.*/
- eobi=eobs;
- if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+ eobi=OC_MINI(eobs,ncoded_fragis);
+ ncoded_fragis-=eobi;
eob_count=eobi;
eobs-=eobi;
- while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
- while(fragii<ncoded_fragis){
+ while(eobi-->0)frags[*coded_fragis++].dc=0;
+ while(ncoded_fragis>0){
int token;
int cw;
int eb;
@@ -1018,10 +1114,11 @@
eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
if(cw==OC_DCT_CW_FINISH)eobs=OC_DCT_EOB_FINISH;
if(eobs){
- eobi=OC_MINI(eobs,ncoded_fragis-fragii);
+ eobi=OC_MINI(eobs,ncoded_fragis);
eob_count+=eobi;
eobs-=eobi;
- while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+ ncoded_fragis-=eobi;
+ while(eobi-->0)frags[*coded_fragis++].dc=0;
}
else{
int coeff;
@@ -1030,16 +1127,18 @@
coeff=cw>>OC_DCT_CW_MAG_SHIFT;
if(skip)coeff=0;
run_counts[skip]++;
- frags[coded_fragis[fragii++]].dc=coeff;
+ frags[*coded_fragis++].dc=coeff;
+ ncoded_fragis--;
}
}
-      /*Add the total EOB count to the longest run length.*/
-      run_counts[63]+=eob_count;
-      /*And convert the run_counts array to a moment table.*/
-      for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
-      /*Finally, subtract off the number of coefficients that have been
-         accounted for by runs started in this coefficient.*/
-      for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli];
+      /*Convert run_counts to a moment table and subtract it off in a
+         single fused loop: eob_count accumulates the run counts as we
+         walk down from the longest run.
+         (_ntoks_left[pli][0] is never read again, so rli==0 is skipped.)*/
+      for(rli=63;rli>0;rli--){
+        eob_count+=run_counts[rli];
+        _ntoks_left[pli][rli]-=eob_count;
+      }
}
_dec->dct_tokens_count=ti;
return eobs;
@@ -1071,16 +1170,13 @@
_dec->eob_runs[pli][_zzi]=_eobs;
_dec->ti0[pli][_zzi]=ti;
ntoks_left=_ntoks_left[pli][_zzi];
- memset(run_counts,0,sizeof(run_counts));
+ oc_memzero_ptrdiff_64(run_counts);
eob_count=0;
- ntoks=0;
- while(ntoks+_eobs<ntoks_left){
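+  /*The EOB-run accounting is folded into the loop condition: each pass
+     consumes _eobs tokens from ntoks_left and credits them to eob_count.*/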
+ while(eob_count+=_eobs,0<(int)(ntoks_left-=_eobs)){
int token;
int cw;
int eb;
int skip;
- ntoks+=_eobs;
- eob_count+=_eobs;
token=oc_huff_token_decode(&_dec->opb,
_dec->huff_tables[_huff_idxs[pli+1>>1]]);
dct_tokens[ti++]=(unsigned char)token;
@@ -1098,20 +1194,39 @@
if(cw==OC_DCT_CW_FINISH)_eobs=OC_DCT_EOB_FINISH;
if(_eobs==0){
run_counts[skip]++;
- ntoks++;
+ ntoks_left--;
}
}
/*Add the portion of the last EOB run actually used by this coefficient.*/
- eob_count+=ntoks_left-ntoks;
+ eob_count+=ntoks_left;
/*And remove it from the remaining EOB count.*/
- _eobs-=ntoks_left-ntoks;
+ _eobs=-ntoks_left;
-  /*Add the total EOB count to the longest run length.*/
- run_counts[63]+=eob_count;
- /*And convert the run_counts array to a moment table.*/
- for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+  /* RJW: The first version below computes the same result and should be
+   * faster with any sane compiler, at least on ARM, but in practice GCC
+   * generates faster code for the second. */
+#if 0
+ {
+ ptrdiff_t *r=&run_counts[63];
+ ptrdiff_t *p=&_ntoks_left[pli][64];
+ for(rli=_zzi-1;rli>0;rli--)
+ eob_count+=*r--;
+ /*Finally, subtract off the number of coefficients that have been
+ accounted for by runs started in this coefficient.*/
+ for(rli=64-_zzi;rli-->0;){
+ eob_count+=*r--;
+      *--p -= eob_count;
+ }
+ }
+#else
+ for(rli=63;rli>64-_zzi;rli--)
+ eob_count+=run_counts[rli];
/*Finally, subtract off the number of coefficients that have been
accounted for by runs started in this coefficient.*/
- for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli];
+ for(rli=64-_zzi;rli-->0;){
+ eob_count+=run_counts[rli];
+ _ntoks_left[pli][_zzi+rli]-=eob_count;
+ }
+#endif
}
_dec->dct_tokens_count=ti;
return _eobs;
@@ -1145,26 +1260,23 @@
ptrdiff_t ntoks_left[3][64];
int huff_idxs[2];
ptrdiff_t eobs;
- long val;
int pli;
int zzi;
int hgi;
for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli];
}
- val=oc_pack_read(&_dec->opb,4);
- huff_idxs[0]=(int)val;
- val=oc_pack_read(&_dec->opb,4);
- huff_idxs[1]=(int)val;
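+  /*Read both 4-bit Huffman table indices with a single 8-bit read.*/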
+ zzi=oc_pack_read(&_dec->opb,8);
+ huff_idxs[0]=(int)zzi>>4;
+  huff_idxs[1]=(int)zzi&15;
_dec->eob_runs[0][0]=0;
eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
#if defined(HAVE_CAIRO)
_dec->telemetry_dc_bytes=oc_pack_bytes_left(&_dec->opb);
#endif
- val=oc_pack_read(&_dec->opb,4);
- huff_idxs[0]=(int)val;
- val=oc_pack_read(&_dec->opb,4);
- huff_idxs[1]=(int)val;
+ zzi=oc_pack_read(&_dec->opb,8);
+ huff_idxs[0]=(int)zzi>>4;
+ huff_idxs[1]=(int)zzi&15;
zzi=1;
for(hgi=1;hgi<5;hgi++){
huff_idxs[0]+=16;
@@ -1303,7 +1415,7 @@
typedef struct{
- int bounding_values[256];
+ signed char bounding_values[257];
ptrdiff_t ti[3][64];
ptrdiff_t eob_runs[3][64];
const ptrdiff_t *coded_fragis[3];
@@ -1386,7 +1498,6 @@
oc_fragment *frags;
int *pred_last;
ptrdiff_t ncoded_fragis;
- ptrdiff_t fragi;
int fragx;
int fragy;
int fragy0;
@@ -1401,89 +1512,103 @@
pred_last=_pipe->pred_last[_pli];
frags=_dec->state.frags;
ncoded_fragis=0;
- fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
- for(fragy=fragy0;fragy<fragy_end;fragy++){
- if(fragy==0){
- /*For the first row, all of the cases reduce to just using the previous
- predictor for the same reference frame.*/
- for(fragx=0;fragx<nhfrags;fragx++,fragi++){
- if(frags[fragi].coded){
- int ref;
- ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
- pred_last[ref]=frags[fragi].dc+=pred_last[ref];
- ncoded_fragis++;
- }
+ frags+=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+ fragy=fragy0;
+ if(fragy0==0){
+ /*For the first row, all of the cases reduce to just using the previous
+ predictor for the same reference frame.*/
+ for(fragx=nhfrags;fragx>0;fragx--){
+ if((*frags++).coded){
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[-1].mb_mode);
+ pred_last[ref]=frags[-1].dc+=pred_last[ref];
+ ncoded_fragis++;
}
}
- else{
- oc_fragment *u_frags;
- int l_ref;
- int ul_ref;
- int u_ref;
- u_frags=frags-nhfrags;
- l_ref=-1;
- ul_ref=-1;
- u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
- for(fragx=0;fragx<nhfrags;fragx++,fragi++){
- int ur_ref;
- if(fragx+1>=nhfrags)ur_ref=-1;
- else{
- ur_ref=u_frags[fragi+1].coded?
- OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+ fragy++;
+ }
+ fragy=fragy_end-fragy;
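+  /*fragy now counts the rows that remain.*/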
+ for(;fragy>0;fragy--){
+ oc_fragment *u_frags;
+ int l_ref;
+ int ul_ref;
+ int u_ref;
+ u_frags=frags-nhfrags;
+ l_ref=-1;
+ ul_ref=-1;
+ u_ref=u_frags->coded?OC_FRAME_FOR_MODE(u_frags->mb_mode):-1;
+ for(fragx=nhfrags-1;fragx>=0;u_frags++,fragx--){
+ int ur_ref;
+ if(fragx<=0)ur_ref=-1;
+ else{
+ ur_ref=u_frags[1].coded?
+ OC_FRAME_FOR_MODE(u_frags[1].mb_mode):-1;
+ }
+ if((*frags++).coded){
+ int pred;
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[-1].mb_mode);
+ /*We break out a separate case based on which of our neighbors use
+ the same reference frames.
+ This is somewhat faster than trying to make a generic case which
+ handles all of them, since it reduces lots of poorly predicted
+ jumps to one switch statement, and also lets a number of the
+ multiplications be optimized out by strength reduction.*/
+ switch((l_ref==ref)|(ul_ref==ref)<<1|
+ (u_ref==ref)<<2|(ur_ref==ref)<<3){
+ default:pred=pred_last[ref];break;
+ case 1:
+ case 3:pred=frags[-2].dc;break;
+ case 2:pred=u_frags[-1].dc;break;
+ case 4:
+ case 6:
+ case 12:pred=u_frags->dc;break;
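+        /*The conditional adds in the cases below emulate the truncating
+           (round-toward-zero) division of the original x/2, x/16, x/32
+           and x/128 expressions for negative sums.*/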
+ case 5:{
+ pred=(frags[-2].dc+u_frags->dc);
+ if(pred<0)pred+=1;
+ pred>>=1;
+ }break;
+ case 8:pred=u_frags[1].dc;break;
+ case 9:
+ case 11:
+ case 13:{
+ pred=(75*frags[-2].dc+53*u_frags[1].dc);
+ if(pred<0)pred+=127;
+ pred>>=7;
+ }break;
+ case 10:{
+ pred=(u_frags[-1].dc+u_frags[1].dc);
+ if(pred<0)pred+=1;
+ pred>>=1;
+ }break;
+ case 14:{
+ pred=(3*(u_frags[-1].dc+u_frags[1].dc)+10*u_frags->dc);
+ if(pred<0)pred+=15;
+ pred>>=4;
+ }break;
+ case 7:
+ case 15:{
+ int p0;
+ int p1;
+ int p2;
+ p0=frags[-2].dc;
+ p1=u_frags[-1].dc;
+ p2=u_frags->dc;
+ pred=(29*(p0+p2)-26*p1);
+ if(pred<0)pred+=31;
+ pred>>=5;
+ if(abs(pred-p2)>128)pred=p2;
+ else if(abs(pred-p0)>128)pred=p0;
+ else if(abs(pred-p1)>128)pred=p1;
+ }break;
}
- if(frags[fragi].coded){
- int pred;
- int ref;
- ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
- /*We break out a separate case based on which of our neighbors use
- the same reference frames.
- This is somewhat faster than trying to make a generic case which
- handles all of them, since it reduces lots of poorly predicted
- jumps to one switch statement, and also lets a number of the
- multiplications be optimized out by strength reduction.*/
- switch((l_ref==ref)|(ul_ref==ref)<<1|
- (u_ref==ref)<<2|(ur_ref==ref)<<3){
- default:pred=pred_last[ref];break;
- case 1:
- case 3:pred=frags[fragi-1].dc;break;
- case 2:pred=u_frags[fragi-1].dc;break;
- case 4:
- case 6:
- case 12:pred=u_frags[fragi].dc;break;
- case 5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
- case 8:pred=u_frags[fragi+1].dc;break;
- case 9:
- case 11:
- case 13:{
- pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
- }break;
- case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
- case 14:{
- pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
- +10*u_frags[fragi].dc)/16;
- }break;
- case 7:
- case 15:{
- int p0;
- int p1;
- int p2;
- p0=frags[fragi-1].dc;
- p1=u_frags[fragi-1].dc;
- p2=u_frags[fragi].dc;
- pred=(29*(p0+p2)-26*p1)/32;
- if(abs(pred-p2)>128)pred=p2;
- else if(abs(pred-p0)>128)pred=p0;
- else if(abs(pred-p1)>128)pred=p1;
- }break;
- }
- pred_last[ref]=frags[fragi].dc+=pred;
- ncoded_fragis++;
- l_ref=ref;
- }
- else l_ref=-1;
- ul_ref=u_ref;
- u_ref=ur_ref;
+ pred_last[ref]=frags[-1].dc+=pred;
+ ncoded_fragis++;
+ l_ref=ref;
}
+ else l_ref=-1;
+ ul_ref=u_ref;
+ u_ref=ur_ref;
}
}
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
@@ -1521,7 +1646,7 @@
ti=_pipe->ti[_pli];
eob_runs=_pipe->eob_runs[_pli];
for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
- for(fragii=0;fragii<ncoded_fragis;fragii++){
+ for(fragii=ncoded_fragis;fragii!=0;fragii--){
/*This array is made one element larger because the zig-zag index array
uses the final element as a dumping ground for out-of-range indices
to protect us from buffer overflow.*/
@@ -1530,13 +1655,13 @@
ptrdiff_t fragi;
int last_zzi;
int zzi;
- fragi=coded_fragis[fragii];
- for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
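+    /*Zero all 64 coefficients at once.*/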
+    oc_memzero_16_64(dct_coeffs);
+ fragi=*coded_fragis++;
qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
/*Decode the AC coefficients.*/
for(zzi=0;zzi<64;){
- int token;
last_zzi=zzi;
if(eob_runs[zzi]){
eob_runs[zzi]--;
@@ -1548,6 +1673,7 @@
int rlen;
int coeff;
int lti;
+ int token;
lti=ti[zzi];
token=dct_tokens[lti++];
cw=OC_DCT_CODE_WORD[token];
@@ -1599,27 +1725,30 @@
}
/*Filter a horizontal block edge.*/
-static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,int _qstep,int _flimit,
+#ifdef OC_ARM_ASM
+extern void oc_filter_hedge(unsigned char *rdst,int _dst_ystride,
+ const unsigned char *rsrc,int _src_ystride,int _qstep,int _flimit,
+ int *_variance0,int *_variance1);
+#else
+static void oc_filter_hedge(unsigned char *rdst,int _dst_ystride,
+ const unsigned char *rsrc,int _src_ystride,int _qstep,int _flimit,
int *_variance0,int *_variance1){
- unsigned char *rdst;
- const unsigned char *rsrc;
- unsigned char *cdst;
- const unsigned char *csrc;
int r[10];
int sum0;
int sum1;
int bx;
int by;
- rdst=_dst;
- rsrc=_src;
- for(bx=0;bx<8;bx++){
- cdst=rdst;
- csrc=rsrc;
+ for(bx=8;bx>0;bx--){
for(by=0;by<10;by++){
- r[by]=*csrc;
- csrc+=_src_ystride;
+ r[by]=*rsrc;
+ rsrc+=_src_ystride;
}
+ rsrc-=_src_ystride*10;
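+      /*Rewind rsrc to the top of the column; the rsrc++ below advances
+         it to the next one.*/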
sum0=sum1=0;
for(by=0;by<4;by++){
sum0+=abs(r[by+1]-r[by]);
@@ -1628,46 +1757,52 @@
*_variance0+=OC_MINI(255,sum0);
*_variance1+=OC_MINI(255,sum1);
if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
- *cdst=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
- cdst+=_dst_ystride;
- *cdst=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
- cdst+=_dst_ystride;
+ *rdst=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+ rdst+=_dst_ystride;
+ *rdst=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+ rdst+=_dst_ystride;
for(by=0;by<4;by++){
- *cdst=(unsigned char)(r[by]+r[by+1]+r[by+2]+r[by+3]*2+
+ *rdst=(unsigned char)(r[by]+r[by+1]+r[by+2]+r[by+3]*2+
r[by+4]+r[by+5]+r[by+6]+4>>3);
- cdst+=_dst_ystride;
+ rdst+=_dst_ystride;
}
- *cdst=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
- cdst+=_dst_ystride;
- *cdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+ *rdst=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+ rdst+=_dst_ystride;
+ *rdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+ rdst-=7*_dst_ystride;
}
else{
for(by=1;by<=8;by++){
- *cdst=(unsigned char)r[by];
- cdst+=_dst_ystride;
+ *rdst=(unsigned char)r[by];
+ rdst+=_dst_ystride;
}
+ rdst-=8*_dst_ystride;
}
rdst++;
rsrc++;
}
}
+#endif
/*Filter a vertical block edge.*/
-static void oc_filter_vedge(unsigned char *_dst,int _dst_ystride,
+#ifdef OC_ARM_ASM
+extern void oc_filter_vedge(unsigned char *cdst,int _dst_ystride,
+ int _qstep,int _flimit,int *_variances);
+#else
+static void oc_filter_vedge(unsigned char *cdst,int _dst_ystride,
int _qstep,int _flimit,int *_variances){
- unsigned char *rdst;
- const unsigned char *rsrc;
- unsigned char *cdst;
int r[10];
int sum0;
int sum1;
int bx;
int by;
- cdst=_dst;
for(by=0;by<8;by++){
- rsrc=cdst-1;
- rdst=cdst;
- for(bx=0;bx<10;bx++)r[bx]=*rsrc++;
+ cdst--;
+ for(bx=0;bx<10;bx++)r[bx]=*cdst++;
+ cdst-=9;
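+    /*cdst now points back at the first pixel of this row.*/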
sum0=sum1=0;
for(bx=0;bx<4;bx++){
sum0+=abs(r[bx+1]-r[bx]);
@@ -1676,18 +1811,20 @@
_variances[0]+=OC_MINI(255,sum0);
_variances[1]+=OC_MINI(255,sum1);
if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
- *rdst++=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
- *rdst++=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+ *cdst++=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+ *cdst++=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
for(bx=0;bx<4;bx++){
- *rdst++=(unsigned char)(r[bx]+r[bx+1]+r[bx+2]+r[bx+3]*2+
+ *cdst++=(unsigned char)(r[bx]+r[bx+1]+r[bx+2]+r[bx+3]*2+
r[bx+4]+r[bx+5]+r[bx+6]+4>>3);
}
- *rdst++=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
- *rdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+ *cdst++=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+ *cdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+ cdst-=7;
}
cdst+=_dst_ystride;
}
}
+#endif
static void oc_dec_deblock_frag_rows(oc_dec_ctx *_dec,
th_img_plane *_dst,th_img_plane *_src,int _pli,int _fragy0,
@@ -1775,107 +1912,113 @@
}
}
-static void oc_dering_block(unsigned char *_idata,int _ystride,int _b,
+#ifdef OC_ARM_ASM
+extern void oc_dering_block(unsigned char *dst,int _ystride,int _b,
+ int _dc_scale,int _sharp_mod,int _strong);
+#else
+static void oc_dering_block(unsigned char *dst,int _ystride,int _b,
int _dc_scale,int _sharp_mod,int _strong){
- static const unsigned char OC_MOD_MAX[2]={24,32};
- static const unsigned char OC_MOD_SHIFT[2]={1,0};
- const unsigned char *psrc;
- const unsigned char *src;
- const unsigned char *nsrc;
- unsigned char *dst;
- int vmod[72];
- int hmod[72];
+ static const unsigned char MOD_MAX[2]={24,32};
+ static const unsigned char MOD_SHIFT[2]={1,0};
+ unsigned char *psrc;
+ unsigned char *src;
+ unsigned char *nsrc;
+ signed char *pvmod;
+ signed char *phmod;
+ signed char vmod[72];
+ signed char hmod[72];
int mod_hi;
int by;
int bx;
- mod_hi=OC_MINI(3*_dc_scale,OC_MOD_MAX[_strong]);
- dst=_idata;
+ mod_hi=OC_MINI(3*_dc_scale,MOD_MAX[_strong]);
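+  /*Bias _dc_scale by 32+64 so that the threshold tests in the loops
+     below reduce to comparisons against zero.*/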
+ _dc_scale+=32+64;
+ _strong=MOD_SHIFT[_strong];
src=dst;
psrc=src-(_ystride&-!(_b&4));
- for(by=0;by<9;by++){
- for(bx=0;bx<8;bx++){
+ pvmod=vmod;
+ for(by=8;by>=0;by--){
+ for(bx=8;bx>0;bx--){
int mod;
- mod=32+_dc_scale-(abs(src[bx]-psrc[bx])<<OC_MOD_SHIFT[_strong]);
- vmod[(by<<3)+bx]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+ mod=_dc_scale-(abs(*src++-*psrc++)<<_strong);
+ *pvmod++=mod<0?_sharp_mod:OC_CLAMPI(0,mod-64,mod_hi);
}
- psrc=src;
- src+=_ystride&-(!(_b&8)|by<7);
+ psrc=src-8;
+ src =psrc+(_ystride&-((!(_b&8))|by>1));
}
- nsrc=dst;
+ src=dst;
psrc=dst-!(_b&1);
- for(bx=0;bx<9;bx++){
- src=nsrc;
- for(by=0;by<8;by++){
+ phmod=hmod;
+ for(bx=8;bx>=0;bx--){
+ for(by=8;by>0;by--){
int mod;
- mod=32+_dc_scale-(abs(*src-*psrc)<<OC_MOD_SHIFT[_strong]);
- hmod[(bx<<3)+by]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+ mod=_dc_scale-(abs(*src-*psrc)<<_strong);
+ *phmod++=mod<0?_sharp_mod:OC_CLAMPI(0,mod-64,mod_hi);
psrc+=_ystride;
src+=_ystride;
}
- psrc=nsrc;
- nsrc+=!(_b&2)|bx<7;
+ psrc=src - (_ystride<<3);
+ src =psrc+(!(_b&2)|(bx>1));
}
src=dst;
psrc=src-(_ystride&-!(_b&4));
nsrc=src+_ystride;
- for(by=0;by<8;by++){
+ phmod=hmod;
+ pvmod=vmod;
+ for(by=8;by>0;by--){
int a;
- int b;
+ int d;
int w;
a=128;
- b=64;
- w=hmod[by];
+ d=64;
+ w=*phmod; phmod+=8;
a-=w;
- b+=w**(src-!(_b&1));
- w=vmod[by<<3];
+ d+=w**(src-!(_b&1));
+ w=*pvmod++;
a-=w;
- b+=w*psrc[0];
- w=vmod[by+1<<3];
+ d+=w* *psrc++;
+ w=pvmod[7];
a-=w;
- b+=w*nsrc[0];
- w=hmod[(1<<3)+by];
+ d+=w* *nsrc++;
+ w=*phmod; phmod+=8;
a-=w;
- b+=w*src[1];
- dst[0]=OC_CLAMP255(a*src[0]+b>>7);
- for(bx=1;bx<7;bx++){
+ d+=w* *++src;
+ src[-1]=a=OC_CLAMP255(a*src[-1]+d>>7);
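+      /*a holds the just-filtered pixel and w the weight between it and
+         the next pixel, so the next iteration can seed its accumulator
+         without reloading them.*/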
+ for(bx=6;bx>0;bx--){
+ d=64;
+ d+=w*a;
a=128;
- b=64;
- w=hmod[(bx<<3)+by];
a-=w;
- b+=w*src[bx-1];
- w=vmod[(by<<3)+bx];
+ w=*pvmod++;
a-=w;
- b+=w*psrc[bx];
- w=vmod[(by+1<<3)+bx];
+ d+=w* *psrc++;
+ w=pvmod[7];
a-=w;
- b+=w*nsrc[bx];
- w=hmod[(bx+1<<3)+by];
+ d+=w* *nsrc++;
+ w=*phmod; phmod+=8;
a-=w;
- b+=w*src[bx+1];
- dst[bx]=OC_CLAMP255(a*src[bx]+b>>7);
+ d+=w* *++src;
+ src[-1]=a=OC_CLAMP255(a*src[-1]+d>>7);
}
+ d=64;
+ d+=w*a;
a=128;
- b=64;
- w=hmod[(7<<3)+by];
a-=w;
- b+=w*src[6];
- w=vmod[(by<<3)+7];
+ w=*pvmod++;
a-=w;
- b+=w*psrc[7];
- w=vmod[(by+1<<3)+7];
+ d+=w* *psrc++;
+ w=pvmod[7];
a-=w;
- b+=w*nsrc[7];
- w=hmod[(8<<3)+by];
+ d+=w* *nsrc; nsrc-=7;
+ w=*phmod; phmod+=1-8*8;
a-=w;
- b+=w*src[7+!(_b&2)];
- dst[7]=OC_CLAMP255(a*src[7]+b>>7);
- dst+=_ystride;
- psrc=src;
+ d+=w*src[!(_b&2)];
+ src[0]=OC_CLAMP255(a*src[0]+d>>7);
+ psrc=src-7;
src=nsrc;
- nsrc+=_ystride&-(!(_b&8)|by<6);
+ nsrc+=_ystride&-(!(_b&8)|by>2);
}
}
-
+#endif
#define OC_DERING_THRESH1 (384)
#define OC_DERING_THRESH2 (4*OC_DERING_THRESH1)
#define OC_DERING_THRESH3 (5*OC_DERING_THRESH1)
@@ -2043,30 +2186,6 @@
}
}
-/*We're decoding an INTER frame, but have no initialized reference
- buffers (i.e., decoding did not start on a key frame).
- We initialize them to a solid gray here.*/
-static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
- th_info *info;
- size_t yplane_sz;
- size_t cplane_sz;
- int yhstride;
- int yheight;
- int chstride;
- int cheight;
- _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
- _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
- _dec->state.ref_frame_idx[OC_FRAME_SELF]=1;
- info=&_dec->state.info;
- yhstride=info->frame_width+2*OC_UMV_PADDING;
- yheight=info->frame_height+2*OC_UMV_PADDING;
- chstride=yhstride>>!(info->pixel_fmt&1);
- cheight=yheight>>!(info->pixel_fmt&2);
- yplane_sz=yhstride*(size_t)yheight;
- cplane_sz=chstride*(size_t)cheight;
- memset(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz);
-}
-
int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
ogg_int64_t *_granpos){
int ret;
@@ -2082,7 +2201,7 @@
int pli;
int notstart;
int notdone;
- oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+ oc_pack_readinit(&_dec->opb,_op->packet);
#if defined(HAVE_CAIRO)
_dec->telemetry_frame_bytes=_op->bytes;
#endif
@@ -2093,9 +2212,27 @@
if(_dec->state.frame_type!=OC_INTRA_FRAME&&
(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
_dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
- /*No reference frames yet!*/
- oc_dec_init_dummy_frame(_dec);
- refi=_dec->state.ref_frame_idx[OC_FRAME_SELF];
+ th_info *info;
+ size_t yplane_sz;
+ size_t cplane_sz;
+ int yhstride;
+ int yheight;
+ int chstride;
+ int cheight;
+ /*We're decoding an INTER frame, but have no initialized reference
+ buffers (i.e., decoding did not start on a key frame).
+ We initialize them to a solid gray here.*/
+ _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
+ _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
+ _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi=1;
+ info=&_dec->state.info;
+ yhstride=info->frame_width+2*OC_UMV_PADDING;
+ yheight=info->frame_height+2*OC_UMV_PADDING;
+ chstride=yhstride>>!(info->pixel_fmt&1);
+ cheight=yheight>>!(info->pixel_fmt&2);
+ yplane_sz=yhstride*(size_t)yheight;
+ cplane_sz=chstride*(size_t)cheight;
+ oc_memset_al_mult8(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz);
}
else{
for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
@@ -2268,16 +2405,6 @@
return 0;
}
else{
- if(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
- _dec->state.ref_frame_idx[OC_FRAME_PREV]<0){
- int refi;
- /*No reference frames yet!*/
- oc_dec_init_dummy_frame(_dec);
- refi=_dec->state.ref_frame_idx[OC_FRAME_PREV];
- _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
- memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[refi],
- sizeof(_dec->pp_frame_buf[0])*3);
- }
/*Just update the granule position and return.*/
_dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
_dec->state.info.keyframe_granule_shift)
Modified: branches/theorarm-merge-branch/lib/internal.c
===================================================================
--- branches/theorarm-merge-branch/lib/internal.c 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/lib/internal.c 2010-03-07 17:59:26 UTC (rev 16954)
@@ -103,13 +103,13 @@
(4:2:0).
_cbmvs: The chroma block-level motion vectors to fill in.
_lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+static void oc_set_chroma_mvs00(oc_mv4 *_cbmvs,const oc_mv4 *_lbmvs){
int dx;
int dy;
- dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
- dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
- _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
- _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
+ dx=_lbmvs->v[0].v[0]+_lbmvs->v[1].v[0]+_lbmvs->v[2].v[0]+_lbmvs->v[3].v[0];
+ dy=_lbmvs->v[0].v[1]+_lbmvs->v[1].v[1]+_lbmvs->v[2].v[1]+_lbmvs->v[3].v[1];
+ _cbmvs->v[0].v[0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
+ _cbmvs->v[0].v[1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
}
/*The function used to fill in the chroma plane motion vectors for a macro
@@ -117,17 +117,17 @@
This version is for use with chroma decimated in the Y direction.
_cbmvs: The chroma block-level motion vectors to fill in.
_lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+static void oc_set_chroma_mvs01(oc_mv4 *_cbmvs,const oc_mv4 *_lbmvs){
int dx;
int dy;
- dx=_lbmvs[0][0]+_lbmvs[2][0];
- dy=_lbmvs[0][1]+_lbmvs[2][1];
- _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
- _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
- dx=_lbmvs[1][0]+_lbmvs[3][0];
- dy=_lbmvs[1][1]+_lbmvs[3][1];
- _cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
- _cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+ dx=_lbmvs->v[0].v[0]+_lbmvs->v[2].v[0];
+ dy=_lbmvs->v[0].v[1]+_lbmvs->v[2].v[1];
+ _cbmvs->v[0].v[0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+ _cbmvs->v[0].v[1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+ dx=_lbmvs->v[1].v[0]+_lbmvs->v[3].v[0];
+ dy=_lbmvs->v[1].v[1]+_lbmvs->v[3].v[1];
+ _cbmvs->v[1].v[0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+ _cbmvs->v[1].v[1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
}
/*The function used to fill in the chroma plane motion vectors for a macro
@@ -135,17 +135,17 @@
This version is for use with chroma decimated in the X direction (4:2:2).
_cbmvs: The chroma block-level motion vectors to fill in.
_lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+static void oc_set_chroma_mvs10(oc_mv4 *_cbmvs,const oc_mv4 *_lbmvs){
int dx;
int dy;
- dx=_lbmvs[0][0]+_lbmvs[1][0];
- dy=_lbmvs[0][1]+_lbmvs[1][1];
- _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
- _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
- dx=_lbmvs[2][0]+_lbmvs[3][0];
- dy=_lbmvs[2][1]+_lbmvs[3][1];
- _cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
- _cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+ dx=_lbmvs->v[0].v[0]+_lbmvs->v[1].v[0];
+ dy=_lbmvs->v[0].v[1]+_lbmvs->v[1].v[1];
+ _cbmvs->v[0].v[0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+ _cbmvs->v[0].v[1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+ dx=_lbmvs->v[2].v[0]+_lbmvs->v[3].v[0];
+ dy=_lbmvs->v[2].v[1]+_lbmvs->v[3].v[1];
+ _cbmvs->v[2].v[0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+ _cbmvs->v[2].v[1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
}
/*The function used to fill in the chroma plane motion vectors for a macro
@@ -155,8 +155,11 @@
_lmbmv: The luma macro-block level motion vector to fill in for use in
prediction.
_lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
- memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
+static void oc_set_chroma_mvs11(oc_mv4 *_cbmvs,const oc_mv4 *_lbmvs){
+ COPY_MV(_cbmvs->v[0],_lbmvs->v[0]);
+ COPY_MV(_cbmvs->v[1],_lbmvs->v[1]);
+ COPY_MV(_cbmvs->v[2],_lbmvs->v[2]);
+ COPY_MV(_cbmvs->v[3],_lbmvs->v[3]);
}
/*A table of functions used to fill in the chroma plane motion vectors for a
@@ -250,7 +253,11 @@
Note that this correctly interprets a 0-byte packet as a video data packet.
Return: 1 for a header packet, 0 for a data packet.*/
int th_packet_isheader(ogg_packet *_op){
- return _op->bytes>0?_op->packet[0]>>7:0;
+#ifdef WORK_WITH_TREMOLO
+ return _op->bytes>0?_op->packet->buffer->data[0]>>7:0;
+#else
+ return _op->bytes>0?_op->packet[0]>>7:0;
+#endif
}
/*Determines the frame type of a video data packet.
@@ -258,5 +265,9 @@
Return: 1 for a key frame, 0 for a delta frame, and -1 for a header
packet.*/
int th_packet_iskeyframe(ogg_packet *_op){
+#ifdef WORK_WITH_TREMOLO
+  return _op->bytes<=0?0:
+   _op->packet->buffer->data[0]&0x80?-1:!(_op->packet->buffer->data[0]&0x40);
+#else
return _op->bytes<=0?0:_op->packet[0]&0x80?-1:!(_op->packet[0]&0x40);
+#endif
}
Modified: branches/theorarm-merge-branch/lib/internal.h
===================================================================
--- branches/theorarm-merge-branch/lib/internal.h 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/lib/internal.h 2010-03-07 17:59:26 UTC (rev 16954)
@@ -190,9 +190,65 @@
typedef ptrdiff_t oc_mb_map_plane[4];
/*A map from a macro block to fragment numbers.*/
typedef oc_mb_map_plane oc_mb_map[3];
+
+
+/* In order to allow efficient manipulation (copying, zeroing) of motion
+ * vectors on certain platforms (notably ARM), we need to ensure that they
+ * are kept aligned in memory. To force them to be aligned, we keep them in
+ * unions with types of the required size.
+ *
+ * Because of C's somewhat baroque type sizing rules we need to detect
+ * when this is appropriate. We can't use sizeof() in a preprocessor
+ * condition, so we are forced to check the values defined in <limits.h>.
+ */
+#if SCHAR_MAX==127 && SHRT_MAX==32767
+/* Sane 8bit char/16bit short system */
+#define ALIGNED_MVS
+#endif
+
/*A motion vector.*/
-typedef signed char oc_mv[2];
+typedef union{
+  signed char v[2];
+#ifdef ALIGNED_MVS
+  short s;
+#endif
+}oc_mv;
+/*A pair of motion vectors.*/
+typedef union{
+  oc_mv v[2];
+#ifdef ALIGNED_MVS
+  int i;
+#endif
+}oc_mv2;
+/*Four motion vectors.*/
+typedef union{
+  oc_mv v[4];
+#ifdef ALIGNED_MVS
+  int i[2];
+#endif
+}oc_mv4;
+#ifdef ALIGNED_MVS
+/* If ALIGNED_MVS is defined, then we can use the fast copy mechanisms. */
+#define COPY_MV(D,S) (((D).s)=((S).s))
+#define COPY_MV4(D,S) ((((D).i[0])=((S).i[0])),(((D).i[1])=((S).i[1])))
+#define ZERO_MV(D) (((D).s)=0)
+#define ZERO_MV2(D) (((D).i)=0)
+#else
+/* If ALIGNED_MVS is not defined, then we need to do it another way.
+ * To produce code equivalent to the existing trunk code, use:
+ * #define COPY_MV(D,S) memcpy(&D,&S,sizeof(oc_mv))
+ * #define COPY_MV4(D,S) memcpy(&D,&S,sizeof(oc_mv4))
+ * Instead however, we can do structure copies, which should be no worse and
+ * doesn't rely on C compilers 'magically' inlining memcpy. */
+#define COPY_MV(D,S) (*(&D)=*(&S))
+#define COPY_MV4(D,S) (*(&D)=*(&S))
+#define ZERO_MV(D) memset(&D,0,sizeof(oc_mv))
+#define ZERO_MV2(D) memset(&D,0,sizeof(oc_mv2))
+#endif
/*Super block information.*/
@@ -390,7 +446,7 @@
_lmbmv: The luma macro-block level motion vector to fill in for use in
prediction.
_lbmvs: The luma block-level motion vectors.*/
-typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
+typedef void (*oc_set_chroma_mvs_func)(oc_mv4 *_cbmvs,const oc_mv4 *_lbmvs);
@@ -437,7 +493,7 @@
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
int _pli,int _dx,int _dy);
-int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
+int oc_state_loop_filter_init(oc_theora_state *_state,signed char *_bv);
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
#if defined(OC_DUMP_IMAGES)
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
@@ -461,8 +517,9 @@
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
int _dst_frame,int _src_frame,int _pli);
void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu(const oc_theora_state *_state);
+void oc_idct8x8(ogg_int16_t _y[64],int _last_zzi);
/*Default pure-C implementations.*/
void oc_frag_copy_c(unsigned char *_dst,
Modified: branches/theorarm-merge-branch/lib/state.c
===================================================================
--- branches/theorarm-merge-branch/lib/state.c 2010-03-07 03:06:29 UTC (rev 16953)
+++ branches/theorarm-merge-branch/lib/state.c 2010-03-07 17:59:26 UTC (rev 16954)
@@ -24,6 +24,8 @@
#else
# include "x86/x86int.h"
#endif
+#elif defined(OC_ARM_ASM)
+# include "arm/ARMint.h"
#endif
#if defined(OC_DUMP_IMAGES)
# include <stdio.h>
@@ -589,7 +591,6 @@
_ogg_free(_state->ref_frame_data[0]);
}
-
void oc_state_vtable_init_c(oc_theora_state *_state){
_state->opt_vtable.frag_copy=oc_frag_copy_c;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
@@ -608,6 +609,8 @@
void oc_state_vtable_init(oc_theora_state *_state){
#if defined(OC_X86_ASM)
oc_state_vtable_init_x86(_state);
+#elif defined (OC_ARM_ASM)
+ oc_state_vtable_init_arm(_state);
#else
oc_state_vtable_init_c(_state);
#endif