diff options
Diffstat (limited to 'arith.h')
-rwxr-xr-x | arith.h | 252 |
1 files changed, 250 insertions, 2 deletions
@@ -1,3 +1,5 @@ +#ifndef _ZAPTEL_ARITH_H +#define _ZAPTEL_ARITH_H /* * Handy add/subtract functions to operate on chunks of shorts. * Feel free to add customizations for additional architectures @@ -5,7 +7,7 @@ */ #ifdef CONFIG_ZAPTEL_MMX - +#ifdef ZT_CHUNKSIZE static inline void __ACSS(volatile short *dst, const short *src) { __asm__ __volatile__ ( @@ -46,6 +48,7 @@ static inline void __SCSS(volatile short *dst, const short *src) ); } + #if (ZT_CHUNKSIZE == 8) #define ACSS(a,b) __ACSS(a,b) #define SCSS(a,b) __SCSS(a,b) @@ -65,9 +68,197 @@ static inline void SCSS(volatile short *dst, const short *src) #else #error No MMX for ZT_CHUNKSIZE < 8 #endif +#endif +static inline int CONVOLVE(const int *coeffs, const short *hist, int len) +{ + int sum; + /* Divide length by 16 */ + len >>= 4; + + /* Clear our accumulator, mm4 */ + + /* + + For every set of eight... + + Load 16 coefficients into four registers... + Shift each word right 16 to make them shorts... + Pack the resulting shorts into two registers... + With the coefficients now in mm0 and mm2, load the + history into mm1 and mm3... + Multiply/add mm1 into mm0, and mm3 into mm2... + Add mm2 into mm0 (without saturation, alas). Now we have two half-results. + Accumulate in mm4 (again, without saturation, alas) + */ + __asm__ ( + "pxor %%mm4, %%mm4;\n" + "mov %1, %%edi;\n" + "mov %2, %%esi;\n" + "mov %3, %%ecx;\n" + "1:" + "movq 0(%%edi), %%mm0;\n" + "movq 8(%%edi), %%mm1;\n" + "movq 16(%%edi), %%mm2;\n" + "movq 24(%%edi), %%mm3;\n" + /* can't use 4/5 since 4 is the accumulator for us */ + "movq 32(%%edi), %%mm6;\n" + "movq 40(%%edi), %%mm7;\n" + "psrad $16, %%mm0;\n" + "psrad $16, %%mm1;\n" + "psrad $16, %%mm2;\n" + "psrad $16, %%mm3;\n" + "psrad $16, %%mm6;\n" + "psrad $16, %%mm7;\n" + "packssdw %%mm1, %%mm0;\n" + "packssdw %%mm3, %%mm2;\n" + "packssdw %%mm7, %%mm6;\n" + "movq 0(%%esi), %%mm1;\n" + "movq 8(%%esi), %%mm3;\n" + "movq 16(%%esi), %%mm7;\n" + "pmaddwd %%mm1, %%mm0;\n" + "pmaddwd %%mm3, %%mm2;\n" + "pmaddwd %%mm7, %%mm6;\n" + "paddd %%mm6, %%mm4;\n" + "paddd %%mm2, %%mm4;\n" + "paddd %%mm0, %%mm4;\n" + /* Come back and do for the last few bytes */ + "movq 48(%%edi), %%mm6;\n" + "movq 56(%%edi), %%mm7;\n" + "psrad $16, %%mm6;\n" + "psrad $16, %%mm7;\n" + "packssdw %%mm7, %%mm6;\n" + "movq 24(%%esi), %%mm7;\n" + "pmaddwd %%mm7, %%mm6;\n" + "paddd %%mm6, %%mm4;\n" + "add $64, %%edi;\n" + "add $32, %%esi;\n" + "dec %%ecx;\n" + "jnz 1b;\n" + "movq %%mm4, %%mm0;\n" + "psrlq $32, %%mm0;\n" + "paddd %%mm0, %%mm4;\n" + "movd %%mm4, %0;\n" + : "=r" (sum) + : "r" (coeffs), "r" (hist), "r" (len) + : "%ecx", "%edi", "%esi" + ); + + return sum; +} + +static inline void UPDATE(volatile int *taps, const short *history, const int nsuppr, const int ntaps) +{ + int i; + int correction; + for (i=0;i<ntaps;i++) { + correction = history[i] * nsuppr; + taps[i] += correction; + } +} + +static inline void UPDATE2(volatile int *taps, volatile short *taps_short, const short *history, const int nsuppr, const int ntaps) +{ + int i; + int correction; +#if 0 + ntaps >>= 4; + /* First, load up taps, */ + __asm__ ( + "pxor %%mm4, %%mm4;\n" + "mov %0, %%edi;\n" + "mov %1, %%esi;\n" + "mov %3, %%ecx;\n" + "1:" + "jnz 1b;\n" + "movq %%mm4, %%mm0;\n" + "psrlq $32, %%mm0;\n" + "paddd %%mm0, %%mm4;\n" + "movd %%mm4, %0;\n" + : "=r" (taps), "=r" (taps_short) + : "r" (history), "r" (nsuppr), "r" (ntaps), "0" (taps) + : "%ecx", "%edi", "%esi" + ); +#endif +#if 1 + for (i=0;i<ntaps;i++) { + correction = history[i] * nsuppr; + taps[i] += correction; + taps_short[i] = taps[i] >> 16; + } +#endif +} + +static inline int CONVOLVE2(const short *coeffs, const short *hist, int len) +{ + int sum; + /* Divide length by 16 */ + len >>= 4; + + /* Clear our accumulator, mm4 */ + + /* + + For every set of eight... + Load in eight coefficients and eight historic samples, multliply add and + accumulate the result + */ + __asm__ ( + "pxor %%mm4, %%mm4;\n" + "mov %1, %%edi;\n" + "mov %2, %%esi;\n" + "mov %3, %%ecx;\n" + "1:" + "movq 0(%%edi), %%mm0;\n" + "movq 8(%%edi), %%mm2;\n" + "movq 0(%%esi), %%mm1;\n" + "movq 8(%%esi), %%mm3;\n" + "pmaddwd %%mm1, %%mm0;\n" + "pmaddwd %%mm3, %%mm2;\n" + "paddd %%mm2, %%mm4;\n" + "paddd %%mm0, %%mm4;\n" + "movq 16(%%edi), %%mm0;\n" + "movq 24(%%edi), %%mm2;\n" + "movq 16(%%esi), %%mm1;\n" + "movq 24(%%esi), %%mm3;\n" + "pmaddwd %%mm1, %%mm0;\n" + "pmaddwd %%mm3, %%mm2;\n" + "paddd %%mm2, %%mm4;\n" + "paddd %%mm0, %%mm4;\n" + "add $32, %%edi;\n" + "add $32, %%esi;\n" + "dec %%ecx;\n" + "jnz 1b;\n" + "movq %%mm4, %%mm0;\n" + "psrlq $32, %%mm0;\n" + "paddd %%mm0, %%mm4;\n" + "movd %%mm4, %0;\n" + : "=r" (sum) + : "r" (coeffs), "r" (hist), "r" (len) + : "%ecx", "%edi", "%esi" + ); + + return sum; +} +static inline short MAX16(const short *y, int len, int *pos) +{ + int k; + short max = 0; + int bestpos = 0; + for (k=0;k<len;k++) { + if (max < y[k]) { + bestpos = k; + max = y[k]; + } + } + *pos = (len - 1 - bestpos); + return max; +} + + #else +#ifdef ZT_CHUNKSIZE static inline void ACSS(short *dst, short *src) { int x,sum; @@ -96,4 +287,61 @@ static inline void SCSS(short *dst, short *src) } } -#endif +#endif /* ZT_CHUNKSIZE */ + +static inline int CONVOLVE(const int *coeffs, const short *hist, int len) +{ + int x; + int sum = 0; + for (x=0;x<len;x++) + sum += (coeffs[x] >> 16) * hist[x]; + return sum; +} + +static inline int CONVOLVE2(const short *coeffs, const short *hist, int len) +{ + int x; + int sum = 0; + for (x=0;x<len;x++) + sum += coeffs[x] * hist[x]; + return sum; +} + +static inline void UPDATE(int *taps, const short *history, const int nsuppr, const int ntaps) +{ + int i; + int correction; + for (i=0;i<ntaps;i++) { + correction = history[i] * nsuppr; + taps[i] += correction; + } +} + +static inline void UPDATE2(int *taps, short *taps_short, const short *history, const int nsuppr, const int ntaps) +{ + int i; + int correction; + for (i=0;i<ntaps;i++) { + correction = history[i] * nsuppr; + taps[i] += correction; + taps_short[i] = taps[i] >> 16; + } +} + +static inline short MAX16(const short *y, int len, int *pos) +{ + int k; + short max = 0; + int bestpos = 0; + for (k=0;k<len;k++) { + if (max < y[k]) { + bestpos = k; + max = y[k]; + } + } + *pos = (len - 1 - bestpos); + return max; +} + +#endif /* MMX */ +#endif /* _ZAPTEL_ARITH_H */ |