1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
/*
 * Handy saturating add/subtract functions that operate on chunks of shorts.
 * Feel free to add customizations for additional architectures.
 */
#ifdef CONFIG_ZAPTEL_MMX
/*
 * __ACSS - MMX saturating add of 8 shorts: dst[0..7] = dst[0..7] + src[0..7].
 * @dst: destination samples; 16 bytes are read and then overwritten
 * @src: source samples; 16 bytes are read
 *
 * The "0" input constraint places dst in the same register as output
 * operand %0, and the "memory" clobber tells the compiler that memory is
 * modified by the asm.  The MMX registers are listed as clobbered only
 * when CLOBBERMMX is non-zero.
 *
 * NOTE(review): no emms is issued here; presumably the caller is
 * responsible for MMX/FPU state -- confirm against the call sites.
 */
static inline void __ACSS(volatile short *dst, const short *src)
{
__asm__ __volatile__ (
/* Load two 8-byte halves from dst (%0) and from src (%1). */
"movq 0(%0), %%mm0;\n"
"movq 0(%1), %%mm1;\n"
"movq 8(%0), %%mm2;\n"
"movq 8(%1), %%mm3;\n"
/* Packed signed 16-bit add with saturation: mm0 += mm1, mm2 += mm3. */
"paddsw %%mm1, %%mm0;\n"
"paddsw %%mm3, %%mm2;\n"
/* Store both result halves back to dst. */
"movq %%mm0, 0(%0);\n"
"movq %%mm2, 8(%0);\n"
: "=r" (dst)
: "r" (src), "0" (dst)
: "memory"
#if CLOBBERMMX
, "%mm0", "%mm1", "%mm2", "%mm3"
#endif
);
}
/*
 * __SCSS - MMX saturating subtract of 8 shorts: dst[0..7] = dst[0..7] - src[0..7].
 * @dst: destination samples; 16 bytes are read and then overwritten
 * @src: samples to subtract; 16 bytes are read
 *
 * The "0" input constraint places dst in the same register as output
 * operand %0, and the "memory" clobber tells the compiler that memory is
 * modified by the asm.  The MMX registers are listed as clobbered only
 * when CLOBBERMMX is non-zero.
 *
 * NOTE(review): no emms is issued here; presumably the caller is
 * responsible for MMX/FPU state -- confirm against the call sites.
 */
static inline void __SCSS(volatile short *dst, const short *src)
{
__asm__ __volatile__ (
/* Load two 8-byte halves from dst (%0) and from src (%1). */
"movq 0(%0), %%mm0;\n"
"movq 0(%1), %%mm1;\n"
"movq 8(%0), %%mm2;\n"
"movq 8(%1), %%mm3;\n"
/* Packed signed 16-bit subtract with saturation: mm0 -= mm1, mm2 -= mm3. */
"psubsw %%mm1, %%mm0;\n"
"psubsw %%mm3, %%mm2;\n"
/* Store both result halves back to dst. */
"movq %%mm0, 0(%0);\n"
"movq %%mm2, 8(%0);\n"
: "=r" (dst)
: "r" (src), "0" (dst)
: "memory"
#if CLOBBERMMX
, "%mm0", "%mm1", "%mm2", "%mm3"
#endif
);
}
/*
 * Chunk-sized ACSS/SCSS built on the 8-sample MMX primitives.
 *
 * - ZT_CHUNKSIZE == 8: a chunk is exactly one primitive call, so ACSS and
 *   SCSS are plain aliases for __ACSS and __SCSS.
 * - ZT_CHUNKSIZE > 8: the chunk is walked in steps of 8 samples.  Each
 *   primitive call always touches 8 samples, so the chunk size must be a
 *   multiple of 8; otherwise the last call would run past the end of the
 *   chunk.  This is rejected at compile time.
 * - ZT_CHUNKSIZE < 8: not supported with MMX.
 */
#if (ZT_CHUNKSIZE == 8)
#define ACSS(a,b) __ACSS(a,b)
#define SCSS(a,b) __SCSS(a,b)
#elif (ZT_CHUNKSIZE > 8)
#if (ZT_CHUNKSIZE % 8) != 0
#error MMX requires ZT_CHUNKSIZE to be a multiple of 8
#endif
/* Saturating add of one full chunk: dst[i] += src[i] for ZT_CHUNKSIZE samples. */
static inline void ACSS(volatile short *dst, const short *src)
{
	int x;

	for (x = 0; x < ZT_CHUNKSIZE; x += 8)
		__ACSS(dst + x, src + x);
}
/* Saturating subtract of one full chunk: dst[i] -= src[i] for ZT_CHUNKSIZE samples. */
static inline void SCSS(volatile short *dst, const short *src)
{
	int x;

	for (x = 0; x < ZT_CHUNKSIZE; x += 8)
		__SCSS(dst + x, src + x);
}
#else
#error No MMX for ZT_CHUNKSIZE < 8
#endif
#else
/*
 * ACSS - portable saturating add of one chunk: dst[i] = dst[i] + src[i].
 * @dst: ZT_CHUNKSIZE samples, updated in place
 * @src: ZT_CHUNKSIZE samples to add; only read, hence const (this also
 *       matches the MMX variant's signature)
 *
 * Each sum is formed in an int and clamped to [-32768, 32767] before it
 * is stored back, so the result saturates instead of wrapping.
 */
static inline void ACSS(short *dst, const short *src)
{
	int x, sum;

	for (x = 0; x < ZT_CHUNKSIZE; x++) {
		sum = dst[x] + src[x];
		if (sum > 32767)
			sum = 32767;
		else if (sum < -32768)
			sum = -32768;
		/* sum is now within the clamp range; narrow explicitly. */
		dst[x] = (short)sum;
	}
}
/*
 * SCSS - portable saturating subtract of one chunk: dst[i] = dst[i] - src[i].
 * @dst: ZT_CHUNKSIZE samples, updated in place
 * @src: ZT_CHUNKSIZE samples to subtract; only read, hence const (this
 *       also matches the MMX variant's signature)
 *
 * Each difference is formed in an int and clamped to [-32768, 32767]
 * before it is stored back, so the result saturates instead of wrapping.
 */
static inline void SCSS(short *dst, const short *src)
{
	int x, diff;

	/* Subtract src from dst with saturation, storing in dst */
	for (x = 0; x < ZT_CHUNKSIZE; x++) {
		diff = dst[x] - src[x];
		if (diff > 32767)
			diff = 32767;
		else if (diff < -32768)
			diff = -32768;
		/* diff is now within the clamp range; narrow explicitly. */
		dst[x] = (short)diff;
	}
}
#endif
|