[Flac-dev] flac-1.1.1 completely broken on linux/ppc and on macosx if built with the standard toolchain (not xcode)
Luca Barbato
lu_zero@gentoo.org
Wed Oct 6 10:07:38 PDT 2004
Josh Coalson wrote:
> thanks for the feedback, but it would really help if you supply
> a patch (diff -c), I didn't understand all the changes you
> described.
I hope it helps.
lu
--
Luca Barbato
Developer
Gentoo Linux http://www.gentoo.org/~lu_zero
-------------- next part --------------
*** /tmp/lpc_asm.s Wed Oct 6 14:06:11 2004
--- src/libFLAC/ppc/lpc_asm.s Tue Jul 27 21:32:05 2004
***************
*** 1,93 ****
! # libFLAC - Free Lossless Audio Codec library
! # Copyright (C) 2004 Josh Coalson
! #
! # Redistribution and use in source and binary forms, with or without
! # modification, are permitted provided that the following conditions
! # are met:
! #
! # - Redistributions of source code must retain the above copyright
! # notice, this list of conditions and the following disclaimer.
! #
! # - Redistributions in binary form must reproduce the above copyright
! # notice, this list of conditions and the following disclaimer in the
! # documentation and/or other materials provided with the distribution.
! #
! # - Neither the name of the Xiph.org Foundation nor the names of its
! # contributors may be used to endorse or promote products derived from
! # this software without specific prior written permission.
! #
! # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
! # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
! # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
! # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
! # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
! # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.text
.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
- .type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function
-
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
- .type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function
_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
! # r3: residual[]
! # r4: data_len
! # r5: qlp_coeff[]
! # r6: order
! # r7: lp_quantization
! # r8: data[]
!
! # see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
! # this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
! # bps<=15 for mid-side coding, since that uses an extra bit)
!
! # these should be fast; the inner loop is unrolled (it takes no more than
! # 3*(order%4) instructions, all of which are arithmetic), and all of the
! # coefficients and all relevant history stay in registers, so the outer loop
! # has only one load from memory (the residual)
! # I have not yet run this through simg4, so there may be some avoidable stalls,
! # and there may be a somewhat more clever way to do the outer loop
! # the branch mechanism may prevent dynamic loading; I still need to examine
! # this issue, and there may be a more elegant method
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
! andc r9,r9,r31 # for quadword-aligned stack data
! slwi r6,r6,2 # adjust for word size
slwi r4,r4,2
! add r4,r4,r8 # r4 = data+data_len
! mfspr r0,256 # cache old vrsave
! addis r31,0,0xffff
! ori r31,r31,0xfc00
! mtspr 256,r31 # declare VRs in vrsave
! cmplw cr0,r8,r4 # i<data_len
bc 4,0,L1400
! # load coefficients into v0-v7 and initial history into v8-v15
li r31,0xf
! and r31,r8,r31 # r31: data%4
li r11,16
! subf r31,r31,r11 # r31: 4-(data%4)
! slwi r31,r31,3 # convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v18,-1
! vsro v18,v18,v0 # v18: mask vector
li r31,0x8
lvsl v0,0,r31
--- 1,90 ----
! ; libFLAC - Free Lossless Audio Codec library
! ; Copyright (C) 2004 Josh Coalson
! ;
! ; Redistribution and use in source and binary forms, with or without
! ; modification, are permitted provided that the following conditions
! ; are met:
! ;
! ; - Redistributions of source code must retain the above copyright
! ; notice, this list of conditions and the following disclaimer.
! ;
! ; - Redistributions in binary form must reproduce the above copyright
! ; notice, this list of conditions and the following disclaimer in the
! ; documentation and/or other materials provided with the distribution.
! ;
! ; - Neither the name of the Xiph.org Foundation nor the names of its
! ; contributors may be used to endorse or promote products derived from
! ; this software without specific prior written permission.
! ;
! ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
! ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
! ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
! ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
! ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
! ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.text
.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
! ; r3: residual[]
! ; r4: data_len
! ; r5: qlp_coeff[]
! ; r6: order
! ; r7: lp_quantization
! ; r8: data[]
!
! ; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
! ; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
! ; bps<=15 for mid-side coding, since that uses an extra bit)
!
! ; these should be fast; the inner loop is unrolled (it takes no more than
! ; 3*(order%4) instructions, all of which are arithmetic), and all of the
! ; coefficients and all relevant history stay in registers, so the outer loop
! ; has only one load from memory (the residual)
! ; I have not yet run this through simg4, so there may be some avoidable stalls,
! ; and there may be a somewhat more clever way to do the outer loop
! ; the branch mechanism may prevent dynamic loading; I still need to examine
! ; this issue, and there may be a more elegant method
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
! andc r9,r9,r31 ; for quadword-aligned stack data
! slwi r6,r6,2 ; adjust for word size
slwi r4,r4,2
! add r4,r4,r8 ; r4 = data+data_len
! mfspr r0,256 ; cache old vrsave
! addis r31,0,hi16(0xfffffc00)
! ori r31,r31,lo16(0xfffffc00)
! mtspr 256,r31 ; declare VRs in vrsave
! cmplw cr0,r8,r4 ; i<data_len
bc 4,0,L1400
! ; load coefficients into v0-v7 and initial history into v8-v15
li r31,0xf
! and r31,r8,r31 ; r31: data%4
li r11,16
! subf r31,r31,r11 ; r31: 4-(data%4)
! slwi r31,r31,3 ; convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v18,-1
! vsro v18,v18,v0 ; v18: mask vector
li r31,0x8
lvsl v0,0,r31
***************
*** 97,110 ****
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
! vsel v0,v1,v0,v2 # v0: reversal permutation vector
add r10,r5,r6
! lvsl v17,0,r5 # v17: coefficient alignment permutation vector
! vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector
mr r11,r8
! lvsl v16,0,r11 # v16: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
--- 94,107 ----
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
! vsel v0,v1,v0,v2 ; v0: reversal permutation vector
add r10,r5,r6
! lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
! vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
mr r11,r8
! lvsl v16,0,r11 ; v16: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
***************
*** 117,124 ****
cmplw cr0,r5,r10
bc 12,0,L1101
vand v0,v0,v18
! addis r31,0,L1307@ha
! ori r31,r31,L1307@l
b L1199
L1101:
--- 114,121 ----
cmplw cr0,r5,r10
bc 12,0,L1101
vand v0,v0,v18
! addis r31,0,hi16(L1307)
! ori r31,r31,lo16(L1307)
b L1199
L1101:
***************
*** 131,138 ****
cmplw cr0,r5,r10
bc 12,0,L1102
vand v1,v1,v18
! addis r31,0,L1306@ha
! ori r31,r31,L1306@l
b L1199
L1102:
--- 128,135 ----
cmplw cr0,r5,r10
bc 12,0,L1102
vand v1,v1,v18
! addis r31,0,hi16(L1306)
! ori r31,r31,lo16(L1306)
b L1199
L1102:
***************
*** 145,152 ****
cmplw cr0,r5,r10
bc 12,0,L1103
vand v2,v2,v18
! lis r31,L1305@ha
! la r31,L1305@l(r31)
b L1199
L1103:
--- 142,149 ----
cmplw cr0,r5,r10
bc 12,0,L1103
vand v2,v2,v18
! addis r31,0,hi16(L1305)
! ori r31,r31,lo16(L1305)
b L1199
L1103:
***************
*** 159,166 ****
cmplw cr0,r5,r10
bc 12,0,L1104
vand v3,v3,v18
! lis r31,L1304@ha
! la r31,L1304@l(r31)
b L1199
L1104:
--- 156,163 ----
cmplw cr0,r5,r10
bc 12,0,L1104
vand v3,v3,v18
! addis r31,0,hi16(L1304)
! ori r31,r31,lo16(L1304)
b L1199
L1104:
***************
*** 173,180 ****
cmplw cr0,r5,r10
bc 12,0,L1105
vand v4,v4,v18
! lis r31,L1303@ha
! la r31,L1303@l(r31)
b L1199
L1105:
--- 170,177 ----
cmplw cr0,r5,r10
bc 12,0,L1105
vand v4,v4,v18
! addis r31,0,hi16(L1303)
! ori r31,r31,lo16(L1303)
b L1199
L1105:
***************
*** 187,194 ****
cmplw cr0,r5,r10
bc 12,0,L1106
vand v5,v5,v18
! lis r31,L1302@ha
! la r31,L1302@l(r31)
b L1199
L1106:
--- 184,191 ----
cmplw cr0,r5,r10
bc 12,0,L1106
vand v5,v5,v18
! addis r31,0,hi16(L1302)
! ori r31,r31,lo16(L1302)
b L1199
L1106:
***************
*** 201,208 ****
cmplw cr0,r5,r10
bc 12,0,L1107
vand v6,v6,v18
! lis r31,L1301@ha
! la r31,L1301@l(r31)
b L1199
L1107:
--- 198,205 ----
cmplw cr0,r5,r10
bc 12,0,L1107
vand v6,v6,v18
! addis r31,0,hi16(L1301)
! ori r31,r31,lo16(L1301)
b L1199
L1107:
***************
*** 213,242 ****
lvx v19,0,r11
vperm v15,v19,v15,v16
vand v7,v7,v18
! lis r31,L1300@ha
! la r31,L1300@l(r31)
L1199:
mtctr r31
! # set up invariant vectors
! vspltish v16,0 # v16: zero vector
li r10,-12
! lvsr v17,r10,r8 # v17: result shift vector
! lvsl v18,r10,r3 # v18: residual shift back vector
li r10,-4
stw r7,-4(r9)
! lvewx v19,r10,r9 # v19: lp_quantization vector
L1200:
! vmulosh v20,v0,v8 # v20: sum vector
bcctr 20,0
L1300:
vmulosh v21,v7,v15
! vsldoi v15,v15,v14,4 # increment history
vaddsws v20,v20,v21
L1301:
--- 210,239 ----
lvx v19,0,r11
vperm v15,v19,v15,v16
vand v7,v7,v18
! addis r31,0,hi16(L1300)
! ori r31,r31,lo16(L1300)
L1199:
mtctr r31
! ; set up invariant vectors
! vspltish v16,0 ; v16: zero vector
li r10,-12
! lvsr v17,r10,r8 ; v17: result shift vector
! lvsl v18,r10,r3 ; v18: residual shift back vector
li r10,-4
stw r7,-4(r9)
! lvewx v19,r10,r9 ; v19: lp_quantization vector
L1200:
! vmulosh v20,v0,v8 ; v20: sum vector
bcctr 20,0
L1300:
vmulosh v21,v7,v15
! vsldoi v15,v15,v14,4 ; increment history
vaddsws v20,v20,v21
L1301:
***************
*** 270,342 ****
vaddsws v20,v20,v21
L1307:
! vsumsws v20,v20,v16 # v20[3]: sum
! vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization
! lvewx v21,0,r3 # v21[n]: *residual
! vperm v21,v21,v21,v18 # v21[3]: *residual
! vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
! vsldoi v18,v18,v18,4 # increment shift vector
! vperm v21,v20,v20,v17 # v21[n]: shift for storage
! vsldoi v17,v17,v17,12 # increment shift vector
stvewx v21,0,r8
vsldoi v20,v20,v20,12
! vsldoi v8,v8,v20,4 # insert value onto history
addi r3,r3,4
addi r8,r8,4
! cmplw cr0,r8,r4 # i<data_len
bc 12,0,L1200
L1400:
! mtspr 256,r0 # restore old vrsave
lmw r31,-4(r1)
blr
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
! # r3: residual[]
! # r4: data_len
! # r5: qlp_coeff[]
! # r6: order
! # r7: lp_quantization
! # r8: data[]
!
! # see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
! # this version assumes order<=8; it uses fewer vector registers, which should
! # save time in context switches, and has less code, which may improve
! # instruction caching
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
! andc r9,r9,r31 # for quadword-aligned stack data
! slwi r6,r6,2 # adjust for word size
slwi r4,r4,2
! add r4,r4,r8 # r4 = data+data_len
! mfspr r0,256 # cache old vrsave
! addis r31,0,0xffc0
! ori r31,r31,0x0000
! mtspr 256,r31 # declare VRs in vrsave
! cmplw cr0,r8,r4 # i<data_len
bc 4,0,L2400
! # load coefficients into v0-v1 and initial history into v2-v3
li r31,0xf
! and r31,r8,r31 # r31: data%4
li r11,16
! subf r31,r31,r11 # r31: 4-(data%4)
! slwi r31,r31,3 # convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v6,-1
! vsro v6,v6,v0 # v6: mask vector
li r31,0x8
lvsl v0,0,r31
--- 267,339 ----
vaddsws v20,v20,v21
L1307:
! vsumsws v20,v20,v16 ; v20[3]: sum
! vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
! lvewx v21,0,r3 ; v21[n]: *residual
! vperm v21,v21,v21,v18 ; v21[3]: *residual
! vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
! vsldoi v18,v18,v18,4 ; increment shift vector
! vperm v21,v20,v20,v17 ; v21[n]: shift for storage
! vsldoi v17,v17,v17,12 ; increment shift vector
stvewx v21,0,r8
vsldoi v20,v20,v20,12
! vsldoi v8,v8,v20,4 ; insert value onto history
addi r3,r3,4
addi r8,r8,4
! cmplw cr0,r8,r4 ; i<data_len
bc 12,0,L1200
L1400:
! mtspr 256,r0 ; restore old vrsave
lmw r31,-4(r1)
blr
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
! ; r3: residual[]
! ; r4: data_len
! ; r5: qlp_coeff[]
! ; r6: order
! ; r7: lp_quantization
! ; r8: data[]
!
! ; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
! ; this version assumes order<=8; it uses fewer vector registers, which should
! ; save time in context switches, and has less code, which may improve
! ; instruction caching
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
! andc r9,r9,r31 ; for quadword-aligned stack data
! slwi r6,r6,2 ; adjust for word size
slwi r4,r4,2
! add r4,r4,r8 ; r4 = data+data_len
! mfspr r0,256 ; cache old vrsave
! addis r31,0,hi16(0xffc00000)
! ori r31,r31,lo16(0xffc00000)
! mtspr 256,r31 ; declare VRs in vrsave
! cmplw cr0,r8,r4 ; i<data_len
bc 4,0,L2400
! ; load coefficients into v0-v1 and initial history into v2-v3
li r31,0xf
! and r31,r8,r31 ; r31: data%4
li r11,16
! subf r31,r31,r11 ; r31: 4-(data%4)
! slwi r31,r31,3 ; convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v6,-1
! vsro v6,v6,v0 ; v6: mask vector
li r31,0x8
lvsl v0,0,r31
***************
*** 346,359 ****
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
! vsel v0,v1,v0,v2 # v0: reversal permutation vector
add r10,r5,r6
! lvsl v5,0,r5 # v5: coefficient alignment permutation vector
! vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector
mr r11,r8
! lvsl v4,0,r11 # v4: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
--- 343,356 ----
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
! vsel v0,v1,v0,v2 ; v0: reversal permutation vector
add r10,r5,r6
! lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
! vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
mr r11,r8
! lvsl v4,0,r11 ; v4: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
***************
*** 366,373 ****
cmplw cr0,r5,r10
bc 12,0,L2101
vand v0,v0,v6
! lis r31,L2301@ha
! la r31,L2301@l(r31)
b L2199
L2101:
--- 363,370 ----
cmplw cr0,r5,r10
bc 12,0,L2101
vand v0,v0,v6
! addis r31,0,hi16(L2301)
! ori r31,r31,lo16(L2301)
b L2199
L2101:
***************
*** 378,402 ****
lvx v7,0,r11
vperm v3,v7,v3,v4
vand v1,v1,v6
! lis r31,L2300@ha
! la r31,L2300@l(r31)
L2199:
mtctr r31
! # set up invariant vectors
! vspltish v4,0 # v4: zero vector
li r10,-12
! lvsr v5,r10,r8 # v5: result shift vector
! lvsl v6,r10,r3 # v6: residual shift back vector
li r10,-4
stw r7,-4(r9)
! lvewx v7,r10,r9 # v7: lp_quantization vector
L2200:
! vmulosh v8,v0,v2 # v8: sum vector
bcctr 20,0
L2300:
--- 375,399 ----
lvx v7,0,r11
vperm v3,v7,v3,v4
vand v1,v1,v6
! addis r31,0,hi16(L2300)
! ori r31,r31,lo16(L2300)
L2199:
mtctr r31
! ; set up invariant vectors
! vspltish v4,0 ; v4: zero vector
li r10,-12
! lvsr v5,r10,r8 ; v5: result shift vector
! lvsl v6,r10,r3 ; v6: residual shift back vector
li r10,-4
stw r7,-4(r9)
! lvewx v7,r10,r9 ; v7: lp_quantization vector
L2200:
! vmulosh v8,v0,v2 ; v8: sum vector
bcctr 20,0
L2300:
***************
*** 405,431 ****
vaddsws v8,v8,v9
L2301:
! vsumsws v8,v8,v4 # v8[3]: sum
! vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization
! lvewx v9,0,r3 # v9[n]: *residual
! vperm v9,v9,v9,v6 # v9[3]: *residual
! vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
! vsldoi v6,v6,v6,4 # increment shift vector
! vperm v9,v8,v8,v5 # v9[n]: shift for storage
! vsldoi v5,v5,v5,12 # increment shift vector
stvewx v9,0,r8
vsldoi v8,v8,v8,12
! vsldoi v2,v2,v8,4 # insert value onto history
addi r3,r3,4
addi r8,r8,4
! cmplw cr0,r8,r4 # i<data_len
bc 12,0,L2200
L2400:
! mtspr 256,r0 # restore old vrsave
lmw r31,-4(r1)
blr
--- 402,428 ----
vaddsws v8,v8,v9
L2301:
! vsumsws v8,v8,v4 ; v8[3]: sum
! vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
! lvewx v9,0,r3 ; v9[n]: *residual
! vperm v9,v9,v9,v6 ; v9[3]: *residual
! vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
! vsldoi v6,v6,v6,4 ; increment shift vector
! vperm v9,v8,v8,v5 ; v9[n]: shift for storage
! vsldoi v5,v5,v5,12 ; increment shift vector
stvewx v9,0,r8
vsldoi v8,v8,v8,12
! vsldoi v2,v2,v8,4 ; insert value onto history
addi r3,r3,4
addi r8,r8,4
! cmplw cr0,r8,r4 ; i<data_len
bc 12,0,L2200
L2400:
! mtspr 256,r0 ; restore old vrsave
lmw r31,-4(r1)
blr
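
The substantive change in the hunks above is assembler dialect, not logic: GNU as on linux/ppc takes # comments and @ha/@l relocation operators, while Apple's assembler takes ; comments and hi16()/lo16(), and the ELF-only .type directives have no Mach-O equivalent. As a rough guide to what the relocation operators compute for a 32-bit address x (standard PowerPC semantics; this C is an illustration, not code from either toolchain):

#include <stdint.h>

/* Apple as: hi16(x)/lo16(x) are the raw halves; lo16 pairs with ori,
 * which zero-extends its immediate.                                  */
static uint16_t hi16(uint32_t x) { return (uint16_t)(x >> 16); }
static uint16_t lo16(uint32_t x) { return (uint16_t)(x & 0xffff); }

/* GNU as: x@l is the same low half, but x@ha is "high adjusted":
 * it adds 1 when bit 15 of x is set, compensating for the sign
 * extension a following la/addi performs on x@l.                     */
static uint16_t ha(uint32_t x)   { return (uint16_t)((x + 0x8000) >> 16); }

That is why the Darwin side of the diff pairs addis with the zero-extending ori (hi16/lo16), while the GNU side in the later hunks can use lis with the sign-extending la (@ha/@l).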
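
For anyone reading the AltiVec code itself, the header comments point at src/libFLAC/lpc.c:FLAC__lpc_restore_signal() as the reference. A minimal C sketch of that scalar loop, following the register map documented in the asm (int32_t standing in for FLAC__int32, and data[] assumed to be preceded by at least order warm-up samples, as in libFLAC):

#include <stdint.h>

/* Scalar sketch of what the AltiVec routines compute.
 * Parameters mirror the register assignments in the asm comments:
 *   r3: residual[]  r4: data_len  r5: qlp_coeff[]
 *   r6: order       r7: lp_quantization  r8: data[]
 * data[] must be preceded by `order` already-decoded warm-up samples.
 * A 32-bit accumulator suffices only for bps<=16, which is exactly
 * the restriction the asm comments state.                            */
static void lpc_restore_signal(const int32_t residual[], unsigned data_len,
                               const int32_t qlp_coeff[], unsigned order,
                               int lp_quantization, int32_t data[])
{
    for (unsigned i = 0; i < data_len; i++) {
        int32_t sum = 0;
        const int32_t *history = data + i;      /* just past sample i-1 */
        for (unsigned j = 0; j < order; j++)
            sum += qlp_coeff[j] * *(--history); /* newest sample first */
        /* undo the coefficient quantization, then add the residual */
        data[i] = residual[i] + (sum >> lp_quantization);
    }
}

The vector versions keep qlp_coeff and the history window entirely in vector registers, so the inner loop above collapses to a short unrolled run of vmulosh/vaddsws per output sample, as the header comment describes.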