summaryrefslogtreecommitdiff
path: root/third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S')
-rw-r--r--third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S218
1 files changed, 218 insertions, 0 deletions
diff --git a/third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S b/third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
new file mode 100644
index 00000000..76c8eee7
--- /dev/null
+++ b/third_party/webrtc/src/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
@@ -0,0 +1,218 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
+@ ARMv7 platform. The description header can be found in
+@ signal_processing_library.h
+@
+@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
+@ the reference C code at end of this file.
+
+@ Assumptions:
+@ (1) data_length > 0
+@ (2) coefficients_length > 1
+
+@ Register usage:
+@
+@ r0: &data_in[i]
+@ r1: &data_out[i], for result ouput
+@ r2: &coefficients[0]
+@ r3: coefficients_length
+@ r4: Iteration counter for the outer loop.
+@ r5: data_out[j] as multiplication inputs
+@ r6: Calculated value for output data_out[]; interation counter for inner loop
+@ r7: Partial sum of a filtering multiplication results
+@ r8: Partial sum of a filtering multiplication results
+@ r9: &data_out[], for filtering input; data_in[i]
+@ r10: coefficients[j]
+@ r11: Scratch
+@ r12: &coefficients[j]
+
+#include "webrtc/system_wrappers/interface/asm_defines.h"
+
+GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
+.align 2
+DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
+ push {r4-r11}
+
+ ldrsh r12, [sp, #32] @ data_length
+ subs r4, r12, #1
+ beq ODD_LENGTH @ jump if data_length == 1
+
+LOOP_LENGTH:
+ add r12, r2, r3, lsl #1
+ sub r12, #4 @ &coefficients[coefficients_length - 2]
+ sub r9, r1, r3, lsl #1
+ add r9, #2 @ &data_out[i - coefficients_length + 1]
+ ldr r5, [r9], #4 @ data_out[i - coefficients_length + {1,2}]
+
+ mov r7, #0 @ sum1
+ mov r8, #0 @ sum2
+ subs r6, r3, #3 @ Iteration counter for inner loop.
+ beq ODD_A_LENGTH @ branch if coefficients_length == 3
+ blt POST_LOOP_A_LENGTH @ branch if coefficients_length == 2
+
+LOOP_A_LENGTH:
+ ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j]
+ subs r6, #2
+ smlatt r8, r10, r5, r8 @ sum2 += coefficients[j] * data_out[i - j + 1];
+ smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j];
+ smlabt r7, r10, r5, r7 @ coefficients[j - 1] * data_out[i - j + 1];
+ ldr r5, [r9], #4 @ data_out[i - j + 2], data_out[i - j + 3]
+ smlabb r8, r10, r5, r8 @ coefficients[j - 1] * data_out[i - j + 2];
+ bgt LOOP_A_LENGTH
+ blt POST_LOOP_A_LENGTH
+
+ODD_A_LENGTH:
+ ldrsh r10, [r12, #2] @ Filter coefficients coefficients[2]
+ sub r12, #2 @ &coefficients[0]
+ smlabb r7, r10, r5, r7 @ sum1 += coefficients[2] * data_out[i - 2];
+ smlabt r8, r10, r5, r8 @ sum2 += coefficients[2] * data_out[i - 1];
+ ldr r5, [r9, #-2] @ data_out[i - 1], data_out[i]
+
+POST_LOOP_A_LENGTH:
+ ldr r10, [r12] @ coefficients[0], coefficients[1]
+ smlatb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1];
+
+ ldr r9, [r0], #4 @ data_in[i], data_in[i + 1]
+ smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i];
+ sub r6, r7 @ output1 -= sum1;
+
+ sbfx r11, r6, #12, #16
+ ssat r7, #16, r6, asr #12
+ cmp r7, r11
+ addeq r6, r6, #2048
+ ssat r6, #16, r6, asr #12
+ strh r6, [r1], #2 @ Store data_out[i]
+
+ smlatb r8, r10, r6, r8 @ sum2 += coefficients[1] * data_out[i];
+ smulbt r6, r10, r9 @ output2 = coefficients[0] * data_in[i + 1];
+ sub r6, r8 @ output1 -= sum1;
+
+ sbfx r11, r6, #12, #16
+ ssat r7, #16, r6, asr #12
+ cmp r7, r11
+ addeq r6, r6, #2048
+ ssat r6, #16, r6, asr #12
+ strh r6, [r1], #2 @ Store data_out[i + 1]
+
+ subs r4, #2
+ bgt LOOP_LENGTH
+ blt END @ For even data_length, it's done. Jump to END.
+
+@ Process i = data_length -1, for the case of an odd length.
+ODD_LENGTH:
+ add r12, r2, r3, lsl #1
+ sub r12, #4 @ &coefficients[coefficients_length - 2]
+ sub r9, r1, r3, lsl #1
+ add r9, #2 @ &data_out[i - coefficients_length + 1]
+ mov r7, #0 @ sum1
+ mov r8, #0 @ sum1
+ subs r6, r3, #2 @ inner loop counter
+ beq EVEN_A_LENGTH @ branch if coefficients_length == 2
+
+LOOP2_A_LENGTH:
+ ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j]
+ ldr r5, [r9], #4 @ data_out[i - j], data_out[i - j + 1]
+ subs r6, #2
+ smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j];
+ smlabt r8, r10, r5, r8 @ coefficients[j - 1] * data_out[i - j + 1];
+ bgt LOOP2_A_LENGTH
+ addlt r12, #2
+ blt POST_LOOP2_A_LENGTH
+
+EVEN_A_LENGTH:
+ ldrsh r10, [r12, #2] @ Filter coefficients coefficients[1]
+ ldrsh r5, [r9] @ data_out[i - 1]
+ smlabb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1];
+
+POST_LOOP2_A_LENGTH:
+ ldrsh r10, [r12] @ Filter coefficients coefficients[0]
+ ldrsh r9, [r0] @ data_in[i]
+ smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i];
+ sub r6, r7 @ output1 -= sum1;
+ sub r6, r8 @ output1 -= sum1;
+ sbfx r8, r6, #12, #16
+ ssat r7, #16, r6, asr #12
+ cmp r7, r8
+ addeq r6, r6, #2048
+ ssat r6, #16, r6, asr #12
+ strh r6, [r1] @ Store the data_out[i]
+
+END:
+ pop {r4-r11}
+ bx lr
+
+@Reference C code:
+@
+@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+@ int16_t* data_out,
+@ int16_t* __restrict coefficients,
+@ size_t coefficients_length,
+@ size_t data_length) {
+@ size_t i = 0;
+@ size_t j = 0;
+@
+@ assert(data_length > 0);
+@ assert(coefficients_length > 1);
+@
+@ for (i = 0; i < data_length - 1; i += 2) {
+@ int32_t output1 = 0;
+@ int32_t sum1 = 0;
+@ int32_t output2 = 0;
+@ int32_t sum2 = 0;
+@
+@ for (j = coefficients_length - 1; j > 2; j -= 2) {
+@ sum1 += coefficients[j] * data_out[i - j];
+@ sum1 += coefficients[j - 1] * data_out[i - j + 1];
+@ sum2 += coefficients[j] * data_out[i - j + 1];
+@ sum2 += coefficients[j - 1] * data_out[i - j + 2];
+@ }
+@
+@ if (j == 2) {
+@ sum1 += coefficients[2] * data_out[i - 2];
+@ sum2 += coefficients[2] * data_out[i - 1];
+@ }
+@
+@ sum1 += coefficients[1] * data_out[i - 1];
+@ output1 = coefficients[0] * data_in[i];
+@ output1 -= sum1;
+@ // Saturate and store the output.
+@ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
+@ data_out[i] = (int16_t)((output1 + 2048) >> 12);
+@
+@ sum2 += coefficients[1] * data_out[i];
+@ output2 = coefficients[0] * data_in[i + 1];
+@ output2 -= sum2;
+@ // Saturate and store the output.
+@ output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
+@ data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
+@ }
+@
+@ if (i == data_length - 1) {
+@ int32_t output1 = 0;
+@ int32_t sum1 = 0;
+@
+@ for (j = coefficients_length - 1; j > 1; j -= 2) {
+@ sum1 += coefficients[j] * data_out[i - j];
+@ sum1 += coefficients[j - 1] * data_out[i - j + 1];
+@ }
+@
+@ if (j == 1) {
+@ sum1 += coefficients[1] * data_out[i - 1];
+@ }
+@
+@ output1 = coefficients[0] * data_in[i];
+@ output1 -= sum1;
+@ // Saturate and store the output.
+@ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
+@ data_out[i] = (int16_t)((output1 + 2048) >> 12);
+@ }
+@}