[Flac-dev] Altivec, automake

Brady Patterson brady at spaceship.com
Sun Jul 25 16:51:02 PDT 2004


Here's what I listed in that email. Merging doesn't appear to be necessary. If
you have any build problems, let me know.

Note that my detection code is Darwin-specific. It uses a BSD call (sysctl()),
so changing the platform-detection macros should enable it to work on other
BSDs. However, I don't know what that change would be, I couldn't determine any
safe way to do the check on Linux, and I have no way to test anything other
than OS X. The AltiVec code itself should work on any platform with AltiVec.

Regarding performance, command-line decoding will see a modest improvement, but
it's really bottlenecked by the MD5 checking (and I don't have a clue as to how
to optimize that). Real-time decoding should be improved substantially -- I've
been meaning to test that and will get back to you with the results.

--
Brady Patterson (brady at spaceship.com)
RLRR LRLL RLLR LRRL RRLR LLRL

On Sun, 25 Jul 2004, Josh Coalson wrote:
> PS, Brady, do you still have these patches?  you don't have to
> make them current, just send them as-is, I can merge them.
-------------- next part --------------
Index: configure.in
===================================================================
RCS file: /cvsroot/flac/flac/configure.in,v
retrieving revision 1.82
diff -c -r1.82 configure.in
*** configure.in	19 May 2003 23:59:49 -0000	1.82
--- configure.in	25 Jul 2004 23:13:35 -0000
***************
*** 208,213 ****
--- 208,225 ----
  AC_DEFINE(FLAC__USE_3DNOW)
  fi
  
+ AC_ARG_ENABLE(altivec,
+ [  --disable-altivec              Disable Altivec optimizations],
+ [case "${enableval}" in
+ 	yes) use_altivec=true ;;
+ 	no)  use_altivec=false ;;
+ 	*) AC_MSG_ERROR(bad value ${enableval} for --enable-altivec) ;;
+ esac],[use_altivec=true])
+ AM_CONDITIONAL(FLaC__USE_ALTIVEC, test x$use_altivec = xtrue)
+ if test x$use_altivec = xtrue ; then
+ AC_DEFINE(FLAC__USE_ALTIVEC)
+ fi
+ 
  AC_ARG_ENABLE(local-xmms-plugin,
  [  --enable-local-xmms-plugin     Install XMMS plugin to ~/.xmms/Plugins instead of system location],
  [case "${enableval}" in
***************
*** 380,385 ****
--- 392,398 ----
  AH_TEMPLATE(FLAC__NO_ASM,  [define to disable use of assembly code])
  AH_TEMPLATE(FLAC__SSE_OS,  [define if your operating system supports SSE instructions])
  AH_TEMPLATE(FLAC__USE_3DNOW,  [define to enable use of 3Dnow! instructions])
+ AH_TEMPLATE(FLAC__USE_ALTIVEC,  [define to enable use of Altivec instructions])
  AH_TEMPLATE(ID3LIB_MAJOR,  [define to major version number of id3lib])
  AH_TEMPLATE(ID3LIB_MINOR,  [define to minor version number of id3lib])
  AH_TEMPLATE(ID3LIB_PATCH,  [define to patch level of id3lib])
***************
*** 389,394 ****
--- 402,408 ----
  	src/Makefile \
  	src/libFLAC/Makefile \
  	src/libFLAC/ia32/Makefile \
+ 	src/libFLAC/ppc/Makefile \
  	src/libFLAC/include/Makefile \
  	src/libFLAC/include/private/Makefile \
  	src/libFLAC/include/protected/Makefile \
-------------- next part --------------
Index: cpu.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/cpu.c,v
retrieving revision 1.14
diff -c -r1.14 cpu.c
*** cpu.c	31 Jan 2003 23:34:57 -0000	1.14
--- cpu.c	25 Jul 2004 23:16:52 -0000
***************
*** 37,42 ****
--- 37,50 ----
  #include <config.h>
  #endif
  
+ #if defined FLAC__CPU_PPC
+ #if !defined FLAC__NO_ASM
+ #if defined __APPLE__ && defined __MACH__
+ #include <sys/sysctl.h>
+ #endif /* __APPLE__ && __MACH__ */
+ #endif /* FLAC__NO_ASM */
+ #endif /* FLAC__CPU_PPC */
+ 
  const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV = 0x00008000;
  const unsigned FLAC__CPUINFO_IA32_CPUID_MMX = 0x00800000;
  const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
***************
*** 78,83 ****
--- 86,115 ----
  #else
  	info->use_asm = false;
  #endif
+ #elif defined FLAC__CPU_PPC
+ 	info->type = FLAC__CPUINFO_TYPE_PPC;
+ #if !defined FLAC__NO_ASM
+ 	info->use_asm = true;
+ #ifdef FLAC__USE_ALTIVEC
+ #if defined __APPLE__ && defined __MACH__
+ 	{
+ 		int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+ 		int result = 0;
+ 		size_t length = sizeof(result);
+ 		int error = sysctl(selectors, 2, &result, &length, 0, 0);
+ 
+ 		info->data.ppc.altivec = error==0 ? result!=0 : 0;
+ 	}
+ #else /* __APPLE__ && __MACH__ */
+ 	/* don't know of any other thread-safe way to check */
+ 	info->data.ppc.altivec = 0;
+ #endif /* __APPLE__ && __MACH__ */
+ #else /* FLAC__USE_ALTIVEC */
+ 	info->data.ppc.altivec = 0;
+ #endif /* FLAC__USE_ALTIVEC */
+ #else /* FLAC__NO_ASM */
+ 	info->use_asm = false;
+ #endif /* FLAC__NO_ASM */
  #else
  	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
  	info->use_asm = false;
-------------- next part --------------
Index: stream_decoder.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/stream_decoder.c,v
retrieving revision 1.87
diff -c -r1.87 stream_decoder.c
*** stream_decoder.c	20 May 2003 00:01:50 -0000	1.87
--- stream_decoder.c	25 Jul 2004 23:17:39 -0000
***************
*** 101,110 ****
  	void (*local_lpc_restore_signal)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void *client_data;
  	FLAC__BitBuffer *input;
  	FLAC__int32 *output[FLAC__MAX_CHANNELS];
! 	FLAC__int32 *residual[FLAC__MAX_CHANNELS];
  	FLAC__EntropyCodingMethod_PartitionedRiceContents partitioned_rice_contents[FLAC__MAX_CHANNELS];
  	unsigned output_capacity, output_channels;
  	FLAC__uint32 last_frame_number;
--- 101,111 ----
  	void (*local_lpc_restore_signal)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+ 	void (*local_lpc_restore_signal_16bit_order8)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
  	void *client_data;
  	FLAC__BitBuffer *input;
  	FLAC__int32 *output[FLAC__MAX_CHANNELS];
! 	FLAC__int32 *residual[FLAC__MAX_CHANNELS]; /* must add 15 and mask low 4 bits before using */
  	FLAC__EntropyCodingMethod_PartitionedRiceContents partitioned_rice_contents[FLAC__MAX_CHANNELS];
  	unsigned output_capacity, output_channels;
  	FLAC__uint32 last_frame_number;
***************
*** 281,286 ****
--- 282,288 ----
  	decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
  	decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
  	decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
+ 	decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal;
  	/* now override with asm where appropriate */
  #ifndef FLAC__NO_ASM
  	if(decoder->private_->cpuinfo.use_asm) {
***************
*** 290,301 ****
--- 292,311 ----
  		if(decoder->private_->cpuinfo.data.ia32.mmx) {
  			decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
  			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
+ 			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32_mmx;
  		}
  		else {
  			decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
  			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
+ 			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32;
  		}
  #endif
+ #elif defined FLAC__CPU_PPC
+ 		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
+ 		if(decoder->private_->cpuinfo.data.ppc.altivec) {
+ 			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ppc_altivec_16;
+ 			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8;
+ 		}
  #endif
  	}
  #endif
***************
*** 748,754 ****
  		memset(tmp, 0, sizeof(FLAC__int32)*4);
  		decoder->private_->output[i] = tmp + 4;
  
! 		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*size);
  		if(tmp == 0) {
  			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
  			return false;
--- 758,766 ----
  		memset(tmp, 0, sizeof(FLAC__int32)*4);
  		decoder->private_->output[i] = tmp + 4;
  
! 		/* need quadword alignment for vector optimizations: */
!     /* allocate extra 15 bytes; then must add 15 and mask low 4 bits before using */
! 		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*size+15U);
  		if(tmp == 0) {
  			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
  			return false;
***************
*** 1809,1818 ****
  	FLAC__int32 i32;
  	FLAC__uint32 u32;
  	unsigned u;
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_FIXED;
  
! 	subframe->residual = decoder->private_->residual[channel];
  	subframe->order = order;
  
  	/* read warm-up samples */
--- 1821,1831 ----
  	FLAC__int32 i32;
  	FLAC__uint32 u32;
  	unsigned u;
+ 	FLAC__int32 *residual = (FLAC__int32 *)((long)decoder->private_->residual[channel]+15U & ~0xf);
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_FIXED;
  
! 	subframe->residual = residual;
  	subframe->order = order;
  
  	/* read warm-up samples */
***************
*** 1841,1847 ****
  	/* read residual */
  	switch(subframe->entropy_coding_method.type) {
  		case FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE:
! 			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], decoder->private_->residual[channel]))
  				return false;
  			break;
  		default:
--- 1854,1860 ----
  	/* read residual */
  	switch(subframe->entropy_coding_method.type) {
  		case FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE:
! 			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], residual))
  				return false;
  			break;
  		default:
***************
*** 1850,1856 ****
  
  	/* decode the subframe */
  	memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
! 	FLAC__fixed_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, order, decoder->private_->output[channel]+order);
  
  	return true;
  }
--- 1863,1869 ----
  
  	/* decode the subframe */
  	memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
! 	FLAC__fixed_restore_signal(residual, decoder->private_->frame.header.blocksize-order, order, decoder->private_->output[channel]+order);
  
  	return true;
  }
***************
*** 1861,1870 ****
  	FLAC__int32 i32;
  	FLAC__uint32 u32;
  	unsigned u;
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_LPC;
  
! 	subframe->residual = decoder->private_->residual[channel];
  	subframe->order = order;
  
  	/* read warm-up samples */
--- 1874,1884 ----
  	FLAC__int32 i32;
  	FLAC__uint32 u32;
  	unsigned u;
+ 	FLAC__int32 *residual = (FLAC__int32 *)((long)decoder->private_->residual[channel]+15U & ~0xf);
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_LPC;
  
! 	subframe->residual = residual;
  	subframe->order = order;
  
  	/* read warm-up samples */
***************
*** 1915,1921 ****
  	/* read residual */
  	switch(subframe->entropy_coding_method.type) {
  		case FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE:
! 			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], decoder->private_->residual[channel]))
  				return false;
  			break;
  		default:
--- 1929,1935 ----
  	/* read residual */
  	switch(subframe->entropy_coding_method.type) {
  		case FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE:
! 			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], residual))
  				return false;
  			break;
  		default:
***************
*** 1925,1936 ****
  	/* decode the subframe */
  	memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
  	if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
! 		if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
! 			decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
  		else
! 			decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
  	else
! 		decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
  
  	return true;
  }
--- 1939,1954 ----
  	/* decode the subframe */
  	memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
  	if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
! 		if(bps <= 16 && subframe->qlp_coeff_precision <= 16) {
! 			if(order <= 8)
! 				decoder->private_->local_lpc_restore_signal_16bit_order8(residual, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
! 			else
! 				decoder->private_->local_lpc_restore_signal_16bit(residual, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
! 		}
  		else
! 			decoder->private_->local_lpc_restore_signal(residual, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
  	else
! 		decoder->private_->local_lpc_restore_signal_64bit(residual, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
  
  	return true;
  }
***************
*** 1938,1945 ****
  FLAC__bool read_subframe_verbatim_(FLAC__StreamDecoder *decoder, unsigned channel, unsigned bps)
  {
  	FLAC__Subframe_Verbatim *subframe = &decoder->private_->frame.subframes[channel].data.verbatim;
! 	FLAC__int32 x, *residual = decoder->private_->residual[channel];
  	unsigned i;
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_VERBATIM;
  
--- 1956,1966 ----
  FLAC__bool read_subframe_verbatim_(FLAC__StreamDecoder *decoder, unsigned channel, unsigned bps)
  {
  	FLAC__Subframe_Verbatim *subframe = &decoder->private_->frame.subframes[channel].data.verbatim;
! 	FLAC__int32 x;
  	unsigned i;
+ 	FLAC__int32 *residual = (FLAC__int32 *)((long)decoder->private_->residual[channel]+15U & ~0xf);
+ 
+ 	FLAC__ASSERT((((long)residual) & 0xf) == 0);
  
  	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_VERBATIM;
  
***************
*** 1965,1970 ****
--- 1986,1993 ----
  	const unsigned partitions = 1u << partition_order;
  	const unsigned partition_samples = partition_order > 0? decoder->private_->frame.header.blocksize >> partition_order : decoder->private_->frame.header.blocksize - predictor_order;
  
+ 	FLAC__ASSERT((((long)residual) & 0xf) == 0);
+ 
  	if(!FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(partitioned_rice_contents, max(6, partition_order))) {
  		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
  		return false;
-------------- next part --------------
Index: cpu.h
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/include/private/cpu.h,v
retrieving revision 1.11
diff -c -r1.11 cpu.h
*** cpu.h	31 Jan 2003 23:34:58 -0000	1.11
--- cpu.h	25 Jul 2004 23:15:40 -0000
***************
*** 40,45 ****
--- 40,46 ----
  
  typedef enum {
  	FLAC__CPUINFO_TYPE_IA32,
+ 	FLAC__CPUINFO_TYPE_PPC,
  	FLAC__CPUINFO_TYPE_UNKNOWN
  } FLAC__CPUInfo_Type;
  
***************
*** 54,59 ****
--- 55,64 ----
  	FLAC__bool extmmx;
  } FLAC__CPUInfo_IA32;
  
+ typedef struct {
+ 	FLAC__bool altivec;
+ } FLAC__CPUInfo_PPC;
+ 
  extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
  extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
  extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
***************
*** 69,74 ****
--- 74,80 ----
  	FLAC__CPUInfo_Type type;
  	union {
  		FLAC__CPUInfo_IA32 ia32;
+ 		FLAC__CPUInfo_PPC ppc;
  	} data;
  } FLAC__CPUInfo;
  


More information about the Flac-dev mailing list