summaryrefslogtreecommitdiff
path: root/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c
diff options
context:
space:
mode:
Diffstat (limited to 'pjmedia/src/pjmedia-codec/speex/preprocess_spx.c')
-rw-r--r--pjmedia/src/pjmedia-codec/speex/preprocess_spx.c1258
1 files changed, 688 insertions, 570 deletions
diff --git a/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c b/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c
index ba2e6076..d82230cf 100644
--- a/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c
+++ b/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c
@@ -1,6 +1,6 @@
-/* Copyright (C) 2003 Epic Games
- Written by Jean-Marc Valin
-
+/* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
+ Copyright (C) 2004-2006 Epic Games
+
File: preprocess.c
Preprocessor with denoising based on the algorithm by Ephraim and Malah
@@ -31,96 +31,356 @@
POSSIBILITY OF SUCH DAMAGE.
*/
+
+/*
+ Recommended papers:
+
+ Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
+ short-time spectral amplitude estimator". IEEE Transactions on Acoustics,
+ Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
+
+ Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
+ log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and
+ Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
+
+ I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
+ Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001.
+
+ Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic
+ approach to combined acoustic echo cancellation and noise reduction". IEEE
+ Transactions on Speech and Audio Processing, 2002.
+
+ J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
+ of simultaneous non-stationary sources". In Proceedings IEEE International
+ Conference on Acoustics, Speech, and Signal Processing, 2004.
+*/
+
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <math.h>
#include "speex/speex_preprocess.h"
+#include "speex/speex_echo.h"
#include "misc.h"
-#include "smallft.h"
-
-#define max(a,b) ((a) > (b) ? (a) : (b))
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#include "fftwrap.h"
+#include "filterbank.h"
+#include "math_approx.h"
#ifndef M_PI
#define M_PI 3.14159263
#endif
-#define SQRT_M_PI_2 0.88623
#define LOUDNESS_EXP 2.5
-#define NB_BANDS 8
+#define NB_BANDS 24
+
+#define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15)
+#define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15)
+#define NOISE_SUPPRESS_DEFAULT -25
+#define ECHO_SUPPRESS_DEFAULT -45
+#define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define SQR(x) ((x)*(x))
+#define SQR16(x) (MULT16_16((x),(x)))
+#define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
+
+#ifdef FIXED_POINT
+static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b)
+{
+ if (SHR32(a,7) >= b)
+ {
+ return 32767;
+ } else {
+ if (b>=QCONST32(1,23))
+ {
+ a = SHR32(a,8);
+ b = SHR32(b,8);
+ }
+ if (b>=QCONST32(1,19))
+ {
+ a = SHR32(a,4);
+ b = SHR32(b,4);
+ }
+ if (b>=QCONST32(1,15))
+ {
+ a = SHR32(a,4);
+ b = SHR32(b,4);
+ }
+ a = SHL32(a,8);
+ return PDIV32_16(a,b);
+ }
+
+}
+static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b)
+{
+ if (SHR32(a,15) >= b)
+ {
+ return 32767;
+ } else {
+ if (b>=QCONST32(1,23))
+ {
+ a = SHR32(a,8);
+ b = SHR32(b,8);
+ }
+ if (b>=QCONST32(1,19))
+ {
+ a = SHR32(a,4);
+ b = SHR32(b,4);
+ }
+ if (b>=QCONST32(1,15))
+ {
+ a = SHR32(a,4);
+ b = SHR32(b,4);
+ }
+ a = SHL32(a,15)-a;
+ return DIV32_16(a,b);
+ }
+}
+#define SNR_SCALING 256.f
+#define SNR_SCALING_1 0.0039062f
+#define SNR_SHIFT 8
+
+#define FRAC_SCALING 32767.f
+#define FRAC_SCALING_1 3.0518e-05
+#define FRAC_SHIFT 1
-#define SPEEX_PROB_START_DEFAULT 0.35f
-#define SPEEX_PROB_CONTINUE_DEFAULT 0.20f
+#define EXPIN_SCALING 2048.f
+#define EXPIN_SCALING_1 0.00048828f
+#define EXPIN_SHIFT 11
+#define EXPOUT_SCALING_1 1.5259e-05
-#define ZMIN .1
-#define ZMAX .316
-#define ZMIN_1 10
-#define LOG_MIN_MAX_1 0.86859
+#define NOISE_SHIFT 7
-static void conj_window(float *w, int len)
+#else
+
+#define DIV32_16_Q8(a,b) ((a)/(b))
+#define DIV32_16_Q15(a,b) ((a)/(b))
+#define SNR_SCALING 1.f
+#define SNR_SCALING_1 1.f
+#define SNR_SHIFT 0
+#define FRAC_SCALING 1.f
+#define FRAC_SCALING_1 1.f
+#define FRAC_SHIFT 0
+#define NOISE_SHIFT 0
+
+#define EXPIN_SCALING 1.f
+#define EXPIN_SCALING_1 1.f
+#define EXPOUT_SCALING_1 1.f
+
+#endif
+
+/** Speex pre-processor state. */
+struct SpeexPreprocessState_ {
+ /* Basic info */
+ int frame_size; /**< Number of samples processed each time */
+ int ps_size; /**< Number of points in the power spectrum */
+ int sampling_rate; /**< Sampling rate of the input/output */
+ int nbands;
+ FilterBank *bank;
+
+ /* Parameters */
+ int denoise_enabled;
+ int agc_enabled;
+ float agc_level;
+ int vad_enabled;
+ int dereverb_enabled;
+ spx_word16_t reverb_decay;
+ spx_word16_t reverb_level;
+ spx_word16_t speech_prob_start;
+ spx_word16_t speech_prob_continue;
+ int noise_suppress;
+ int echo_suppress;
+ int echo_suppress_active;
+ SpeexEchoState *echo_state;
+
+ /* DSP-related arrays */
+ spx_word16_t *frame; /**< Processing frame (2*ps_size) */
+ spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */
+ spx_word32_t *ps; /**< Current power spectrum */
+ spx_word16_t *gain2; /**< Adjusted gains */
+ spx_word16_t *gain_floor; /**< Minimum gain allowed */
+ spx_word16_t *window; /**< Analysis/Synthesis window */
+ spx_word32_t *noise; /**< Noise estimate */
+ spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
+ spx_word32_t *old_ps; /**< Power spectrum for last frame */
+ spx_word16_t *gain; /**< Ephraim Malah gain */
+ spx_word16_t *prior; /**< A-priori SNR */
+ spx_word16_t *post; /**< A-posteriori SNR */
+
+ spx_word32_t *S; /**< Smoothed power spectrum */
+ spx_word32_t *Smin; /**< See Cohen paper */
+ spx_word32_t *Stmp; /**< See Cohen paper */
+ int *update_prob; /**< Propability of speech presence for noise update */
+
+ spx_word16_t *zeta; /**< Smoothed a priori SNR */
+ spx_word32_t *echo_noise;
+ spx_word32_t *residual_echo;
+
+ /* Misc */
+ spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */
+ spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */
+
+#ifndef FIXED_POINT
+ float *loudness_weight; /**< Perceptual loudness curve */
+ float loudness; /**< loudness estimate */
+ float loudness2; /**< loudness estimate */
+ int nb_loudness_adapt; /**< Number of frames used for loudness adaptation so far */
+#endif
+ int nb_adapt; /**< Number of frames used for adaptation so far */
+ int was_speech;
+ int min_count; /**< Number of frames processed so far */
+ void *fft_lookup; /**< Lookup table for the FFT */
+#ifdef FIXED_POINT
+ int frame_shift;
+#endif
+};
+
+
+static void conj_window(spx_word16_t *w, int len)
{
int i;
for (i=0;i<len;i++)
{
- float x=4*((float)i)/len;
+ spx_word16_t tmp;
+ spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
int inv=0;
- if (x<1)
+ if (x<QCONST16(1.f,13))
{
- } else if (x<2)
+ } else if (x<QCONST16(2.f,13))
{
- x=2-x;
+ x=QCONST16(2.f,13)-x;
inv=1;
- } else if (x<3)
+ } else if (x<QCONST16(3.f,13))
{
- x=x-2;
+ x=x-QCONST16(2.f,13);
inv=1;
} else {
- x=4-x;
+ x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
}
- x*=1.9979;
- w[i]=(.5-.5*cos(x))*(.5-.5*cos(x));
+ x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
+ tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(QCONST32(x,2))));
if (inv)
- w[i]=1-w[i];
- w[i]=sqrt(w[i]);
+ tmp=SUB16(Q15_ONE,tmp);
+ w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
+ }
+}
+
+
+#ifdef FIXED_POINT
+/* This function approximates the gain function
+ y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
+ which multiplied by xi/(1+xi) is the optimal gain
+ in the loudness domain ( sqrt[amplitude] )
+ Input in Q11 format, output in Q15
+*/
+static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
+{
+ int ind;
+ spx_word16_t frac;
+ /* Q13 table */
+ static const spx_word16_t table[21] = {
+ 6730, 8357, 9868, 11267, 12563, 13770, 14898,
+ 15959, 16961, 17911, 18816, 19682, 20512, 21311,
+ 22082, 22827, 23549, 24250, 24931, 25594, 26241};
+ ind = SHR32(xx,10);
+ if (ind<0)
+ return Q15_ONE;
+ if (ind>19)
+ return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
+ frac = SHL32(xx-SHL32(ind,10),5);
+ return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7);
+}
+
+static inline spx_word16_t qcurve(spx_word16_t x)
+{
+ x = MAX16(x, 1);
+ return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x))));
+}
+
+/* Compute the gain floor based on different floors for the background noise and residual echo */
+static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
+{
+ int i;
+
+ if (noise_suppress > effective_echo_suppress)
+ {
+ spx_word16_t noise_gain, gain_ratio;
+ noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1)));
+ gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1)));
+
+ /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
+ for (i=0;i<len;i++)
+ gain_floor[i] = MULT16_16_Q15(noise_gain,
+ spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
+ (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
+ } else {
+ spx_word16_t echo_gain, gain_ratio;
+ echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1)));
+ gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1)));
+
+ /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
+ for (i=0;i<len;i++)
+ gain_floor[i] = MULT16_16_Q15(echo_gain,
+ spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
+ (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
}
}
+#else
/* This function approximates the gain function
y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
which multiplied by xi/(1+xi) is the optimal gain
in the loudness domain ( sqrt[amplitude] )
*/
-PJ_INLINE(float) hypergeom_gain(float x)
+static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
{
int ind;
float integer, frac;
+ float x;
static const float table[21] = {
0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
-
- if (x>9.5)
- return 1+.1296/x;
-
- integer = floor(2*x);
- frac = 2*x-integer;
- ind = (int)integer;
-
- return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
+ x = EXPIN_SCALING_1*xx;
+ integer = floor(2*x);
+ ind = (int)integer;
+ if (ind<0)
+ return FRAC_SCALING;
+ if (ind>19)
+ return FRAC_SCALING*(1+.1296/x);
+ frac = 2*x-integer;
+ return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
}
-static inline float qcurve(float x)
+static inline spx_word16_t qcurve(spx_word16_t x)
{
- return 1.f/(1.f+.1f/(x*x));
+ return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
}
+static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
+{
+ int i;
+ float echo_floor;
+ float noise_floor;
+
+ noise_floor = exp(.2302585f*noise_suppress);
+ echo_floor = exp(.2302585f*effective_echo_suppress);
+
+ /* Compute the gain floor based on different floors for the background noise and residual echo */
+ for (i=0;i<len;i++)
+ gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
+}
+
+#endif
SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
{
int i;
- int N, N3, N4;
+ int N, N3, N4, M;
SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
st->frame_size = frame_size;
@@ -157,45 +417,49 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r
st->agc_level = 8000;
st->vad_enabled = 0;
st->dereverb_enabled = 0;
- st->reverb_decay = .5;
- st->reverb_level = .2;
-
- st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
- st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
-
- st->frame = (float*)speex_alloc(2*N*sizeof(float));
- st->ps = (float*)speex_alloc(N*sizeof(float));
- st->gain2 = (float*)speex_alloc(N*sizeof(float));
- st->window = (float*)speex_alloc(2*N*sizeof(float));
- st->noise = (float*)speex_alloc(N*sizeof(float));
- st->reverb_estimate = (float*)speex_alloc(N*sizeof(float));
- st->old_ps = (float*)speex_alloc(N*sizeof(float));
- st->gain = (float*)speex_alloc(N*sizeof(float));
- st->prior = (float*)speex_alloc(N*sizeof(float));
- st->post = (float*)speex_alloc(N*sizeof(float));
- st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
- st->inbuf = (float*)speex_alloc(N3*sizeof(float));
- st->outbuf = (float*)speex_alloc(N3*sizeof(float));
- st->echo_noise = (float*)speex_alloc(N*sizeof(float));
-
- st->S = (float*)speex_alloc(N*sizeof(float));
- st->Smin = (float*)speex_alloc(N*sizeof(float));
- st->Stmp = (float*)speex_alloc(N*sizeof(float));
- st->update_prob = (float*)speex_alloc(N*sizeof(float));
+ st->reverb_decay = 0;
+ st->reverb_level = 0;
+ st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
+ st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
+ st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
- st->zeta = (float*)speex_alloc(N*sizeof(float));
- st->Zpeak = 0;
- st->Zlast = 0;
+ st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
+ st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
- st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
- st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
- st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
- st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
- st->noise_bandsN = st->speech_bandsN = 1;
+ st->echo_state = NULL;
+
+ st->nbands = NB_BANDS;
+ M = st->nbands;
+ st->bank = filterbank_new(M, sampling_rate, N, 1);
+
+ st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+ st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+ st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+
+ st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+ st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+ st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+ st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+ st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+ st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+ st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+
+ st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+ st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+ st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+ st->update_prob = (int*)speex_alloc(N*sizeof(int));
+
+ st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
+ st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
conj_window(st->window, 2*N3);
for (i=2*N3;i<2*st->ps_size;i++)
- st->window[i]=1;
+ st->window[i]=Q15_ONE;
if (N4>0)
{
@@ -205,22 +469,25 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r
st->window[i+N3]=1;
}
}
- for (i=0;i<N;i++)
+ for (i=0;i<N+M;i++)
{
- st->noise[i]=1e4;
- st->reverb_estimate[i]=0.;
- st->old_ps[i]=1e4;
- st->gain[i]=1;
- st->post[i]=1;
- st->prior[i]=1;
+ st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
+ st->reverb_estimate[i]=0;
+ st->old_ps[i]=1;
+ st->gain[i]=Q15_ONE;
+ st->post[i]=SHL16(1, SNR_SHIFT);
+ st->prior[i]=SHL16(1, SNR_SHIFT);
}
+ for (i=0;i<N;i++)
+ st->update_prob[i] = 1;
for (i=0;i<N3;i++)
{
st->inbuf[i]=0;
st->outbuf[i]=0;
}
-
+#ifndef FIXED_POINT
+ st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
for (i=0;i<N;i++)
{
float ff=((float)i)*.5*sampling_rate/((float)N);
@@ -229,27 +496,26 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r
st->loudness_weight[i]=.01f;
st->loudness_weight[i] *= st->loudness_weight[i];
}
-
- st->speech_prob = 0;
- st->last_speech = 1000;
st->loudness = pow(6000,LOUDNESS_EXP);
st->loudness2 = 6000;
st->nb_loudness_adapt = 0;
+#endif
+ st->was_speech = 0;
- st->fft_lookup = (struct drft_lookup*)speex_alloc(sizeof(struct drft_lookup));
- spx_drft_init(st->fft_lookup,2*N);
+ st->fft_lookup = spx_fft_init(2*N);
st->nb_adapt=0;
- st->consec_noise=0;
- st->nb_preprocess=0;
+ st->min_count=0;
return st;
}
void speex_preprocess_state_destroy(SpeexPreprocessState *st)
{
speex_free(st->frame);
+ speex_free(st->ft);
speex_free(st->ps);
speex_free(st->gain2);
+ speex_free(st->gain_floor);
speex_free(st->window);
speex_free(st->noise);
speex_free(st->reverb_estimate);
@@ -257,8 +523,11 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st)
speex_free(st->gain);
speex_free(st->prior);
speex_free(st->post);
+#ifndef FIXED_POINT
speex_free(st->loudness_weight);
+#endif
speex_free(st->echo_noise);
+ speex_free(st->residual_echo);
speex_free(st->S);
speex_free(st->Smin);
@@ -266,241 +535,17 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st)
speex_free(st->update_prob);
speex_free(st->zeta);
- speex_free(st->noise_bands);
- speex_free(st->noise_bands2);
- speex_free(st->speech_bands);
- speex_free(st->speech_bands2);
-
speex_free(st->inbuf);
speex_free(st->outbuf);
- spx_drft_clear(st->fft_lookup);
- speex_free(st->fft_lookup);
-
+ spx_fft_destroy(st->fft_lookup);
+ filterbank_destroy(st->bank);
speex_free(st);
}
-static void update_noise(SpeexPreprocessState *st, float *ps, spx_int32_t *echo)
-{
- int i;
- float beta;
- st->nb_adapt++;
- beta=1.0f/st->nb_adapt;
- if (beta < .05f)
- beta=.05f;
-
- if (!echo)
- {
- for (i=0;i<st->ps_size;i++)
- st->noise[i] = (1.f-beta)*st->noise[i] + beta*ps[i];
- } else {
- for (i=0;i<st->ps_size;i++)
- st->noise[i] = (1.f-beta)*st->noise[i] + beta*max(1.f,ps[i]-st->frame_size*st->frame_size*4.0*echo[i]);
-#if 0
- for (i=0;i<st->ps_size;i++)
- st->noise[i] = 0;
-#endif
- }
-}
-
-static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post)
-{
- int i, is_speech=0;
- int N = st->ps_size;
- float scale=.5f/N;
-
- /* FIXME: Clean this up a bit */
- {
- float bands[NB_BANDS];
- int j;
- float p0, p1;
- float tot_loudness=0;
- float x = sqrt(mean_post);
-
- for (i=5;i<N-10;i++)
- {
- tot_loudness += scale*st->ps[i] * st->loudness_weight[i];
- }
-
- for (i=0;i<NB_BANDS;i++)
- {
- bands[i]=1e4f;
- for (j=i*N/NB_BANDS;j<(i+1)*N/NB_BANDS;j++)
- {
- bands[i] += ps[j];
- }
- bands[i]=log(bands[i]);
- }
-
- /*p1 = .0005+.6*exp(-.5*(x-.4)*(x-.4)*11)+.1*exp(-1.2*x);
- if (x<1.5)
- p0=.1*exp(2*(x-1.5));
- else
- p0=.02+.1*exp(-.2*(x-1.5));
- */
-
- p0=1.f/(1.f+exp(3.f*(1.5f-x)));
- p1=1.f-p0;
-
- /*fprintf (stderr, "%f %f ", p0, p1);*/
- /*p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
- p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
-
- st->speech_prob = p0/(p1+p0);
- */
-
- if (st->noise_bandsN < 50 || st->speech_bandsN < 50)
- {
- if (mean_post > 5.f)
- {
- float adapt = 1./st->speech_bandsN++;
- if (adapt<.005f)
- adapt = .005f;
- for (i=0;i<NB_BANDS;i++)
- {
- st->speech_bands[i] = (1.f-adapt)*st->speech_bands[i] + adapt*bands[i];
- /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
- st->speech_bands2[i] = (1.f-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
- }
- } else {
- float adapt = 1./st->noise_bandsN++;
- if (adapt<.005f)
- adapt = .005f;
- for (i=0;i<NB_BANDS;i++)
- {
- st->noise_bands[i] = (1.f-adapt)*st->noise_bands[i] + adapt*bands[i];
- /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
- st->noise_bands2[i] = (1.f-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
- }
- }
- }
- p0=p1=1;
- for (i=0;i<NB_BANDS;i++)
- {
- float noise_var, speech_var;
- float noise_mean, speech_mean;
- float tmp1, tmp2, pr;
-
- /*noise_var = 1.01*st->noise_bands2[i] - st->noise_bands[i]*st->noise_bands[i];
- speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/
- noise_var = st->noise_bands2[i];
- speech_var = st->speech_bands2[i];
- if (noise_var < .1f)
- noise_var = .1f;
- if (speech_var < .1f)
- speech_var = .1f;
-
- /*speech_var = sqrt(speech_var*noise_var);
- noise_var = speech_var;*/
- if (speech_var < .05f*speech_var)
- noise_var = .05f*speech_var;
- if (speech_var < .05f*noise_var)
- speech_var = .05f*noise_var;
-
- if (bands[i] < st->noise_bands[i])
- speech_var = noise_var;
- if (bands[i] > st->speech_bands[i])
- noise_var = speech_var;
-
- speech_mean = st->speech_bands[i];
- noise_mean = st->noise_bands[i];
- if (noise_mean < speech_mean - 5.f)
- noise_mean = speech_mean - 5.f;
-
- tmp1 = exp(-.5f*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2.f*M_PI*speech_var);
- tmp2 = exp(-.5f*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2.f*M_PI*noise_var);
- /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/
- /*fprintf (stderr, "%f ", (float)(bands[i]));*/
- pr = tmp1/(1e-25+tmp1+tmp2);
- /*if (bands[i] < st->noise_bands[i])
- pr=.01;
- if (bands[i] > st->speech_bands[i] && pr < .995)
- pr=.995;*/
- if (pr>.999f)
- pr=.999f;
- if (pr<.001f)
- pr=.001f;
- /*fprintf (stderr, "%f ", pr);*/
- p0 *= pr;
- p1 *= (1-pr);
- }
-
- p0 = pow(p0,.2);
- p1 = pow(p1,.2);
-
-#if 1
- p0 *= 2.f;
- p0=p0/(p1+p0);
- if (st->last_speech>20)
- {
- float tmp = sqrt(tot_loudness)/st->loudness2;
- tmp = 1.f-exp(-10.f*tmp);
- if (p0>tmp)
- p0=tmp;
- }
- p1=1-p0;
-#else
- if (sqrt(tot_loudness) < .6f*st->loudness2 && p0>15.f*p1)
- p0=15.f*p1;
- if (sqrt(tot_loudness) < .45f*st->loudness2 && p0>7.f*p1)
- p0=7.f*p1;
- if (sqrt(tot_loudness) < .3f*st->loudness2 && p0>3.f*p1)
- p0=3.f*p1;
- if (sqrt(tot_loudness) < .15f*st->loudness2 && p0>p1)
- p0=p1;
- /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/
-#endif
-
- p0 *= .99f*st->speech_prob + .01f*(1-st->speech_prob);
- p1 *= .01f*st->speech_prob + .99f*(1-st->speech_prob);
-
- st->speech_prob = p0/(1e-25f+p1+p0);
- /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/
-
- if (st->speech_prob > st->speech_prob_start
- || (st->last_speech < 20 && st->speech_prob > st->speech_prob_continue))
- {
- is_speech = 1;
- st->last_speech = 0;
- } else {
- st->last_speech++;
- if (st->last_speech<20)
- is_speech = 1;
- }
-
- if (st->noise_bandsN > 50 && st->speech_bandsN > 50)
- {
- if (mean_post > 5)
- {
- float adapt = 1./st->speech_bandsN++;
- if (adapt<.005f)
- adapt = .005f;
- for (i=0;i<NB_BANDS;i++)
- {
- st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
- /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
- st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
- }
- } else {
- float adapt = 1./st->noise_bandsN++;
- if (adapt<.005f)
- adapt = .005f;
- for (i=0;i<NB_BANDS;i++)
- {
- st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
- /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
- st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
- }
- }
- }
-
-
- }
-
- return is_speech;
-}
-
-static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
+/* FIXME: The AGC doesn't work yet with fixed-point*/
+#ifndef FIXED_POINT
+static void speex_compute_agc(SpeexPreprocessState *st)
{
int i;
int N = st->ps_size;
@@ -535,7 +580,7 @@ static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
for (i=2;i<N;i++)
{
- loudness += scale*st->ps[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
+ loudness += scale*st->ps[i] * FRAC_SCALING_1*FRAC_SCALING_1*st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
}
loudness=sqrt(loudness);
/*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
@@ -558,6 +603,7 @@ static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
st->gain2[i] *= agc_gain;
}
+#endif
static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
{
@@ -565,7 +611,7 @@ static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
int N = st->ps_size;
int N3 = 2*N - st->frame_size;
int N4 = st->frame_size - N3;
- float *ps=st->ps;
+ spx_word32_t *ps=st->ps;
/* 'Build' input frame */
for (i=0;i<N3;i++)
@@ -579,295 +625,333 @@ static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
/* Windowing */
for (i=0;i<2*N;i++)
- st->frame[i] *= st->window[i];
+ st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
+#ifdef FIXED_POINT
+ {
+ spx_word16_t max_val=0;
+ for (i=0;i<2*N;i++)
+ max_val = MAX16(max_val, ABS16(st->frame[i]));
+ st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
+ for (i=0;i<2*N;i++)
+ st->frame[i] = SHL16(st->frame[i], st->frame_shift);
+ }
+#endif
+
/* Perform FFT */
- spx_drft_forward(st->fft_lookup, st->frame);
-
+ spx_fft(st->fft_lookup, st->frame, st->ft);
+
/* Power spectrum */
- ps[0]=1;
+ ps[0]=MULT16_16(st->ft[0],st->ft[0]);
for (i=1;i<N;i++)
- ps[i]=1+st->frame[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i];
+ ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
+ for (i=0;i<N;i++)
+ st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
+ filterbank_compute_bank32(st->bank, ps, ps+N);
}
static void update_noise_prob(SpeexPreprocessState *st)
{
int i;
+ int min_range;
int N = st->ps_size;
for (i=1;i<N-1;i++)
- st->S[i] = 100.f+ .8f*st->S[i] + .05f*st->ps[i-1]+.1f*st->ps[i]+.05f*st->ps[i+1];
+ st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1])
+ + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]);
+ st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]);
+ st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]);
- if (st->nb_preprocess<1)
+ if (st->nb_adapt==1)
{
- for (i=1;i<N-1;i++)
- st->Smin[i] = st->Stmp[i] = st->S[i]+100.f;
+ for (i=0;i<N;i++)
+ st->Smin[i] = st->Stmp[i] = 0;
}
- if (st->nb_preprocess%200==0)
+ if (st->nb_adapt < 100)
+ min_range = 15;
+ else if (st->nb_adapt < 1000)
+ min_range = 50;
+ else if (st->nb_adapt < 10000)
+ min_range = 150;
+ else
+ min_range = 300;
+ if (st->min_count > min_range)
{
- for (i=1;i<N-1;i++)
+ st->min_count = 0;
+ for (i=0;i<N;i++)
{
- st->Smin[i] = min(st->Stmp[i], st->S[i]);
+ st->Smin[i] = MIN32(st->Stmp[i], st->S[i]);
st->Stmp[i] = st->S[i];
}
} else {
- for (i=1;i<N-1;i++)
+ for (i=0;i<N;i++)
{
- st->Smin[i] = min(st->Smin[i], st->S[i]);
- st->Stmp[i] = min(st->Stmp[i], st->S[i]);
+ st->Smin[i] = MIN32(st->Smin[i], st->S[i]);
+ st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);
}
}
- for (i=1;i<N-1;i++)
+ for (i=0;i<N;i++)
{
- st->update_prob[i] *= .2f;
- if (st->S[i] > 2.5*st->Smin[i])
- st->update_prob[i] += .8f;
+ if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > ADD32(st->Smin[i],EXTEND32(20)))
+ st->update_prob[i] = 1;
+ else
+ st->update_prob[i] = 0;
/*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
/*fprintf (stderr, "%f ", st->update_prob[i]);*/
}
}
-#define NOISE_OVERCOMPENS 1.4
+#define NOISE_OVERCOMPENS 1.
+
+void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
{
+ return speex_preprocess_run(st, x);
+}
+
+int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
+{
int i;
- int is_speech=1;
- float mean_post=0;
- float mean_prior=0;
+ int M;
int N = st->ps_size;
int N3 = 2*N - st->frame_size;
int N4 = st->frame_size - N3;
- float scale=.5f/N;
- float *ps=st->ps;
- float Zframe=0, Pframe;
-
+ spx_word32_t *ps=st->ps;
+ spx_word32_t Zframe;
+ spx_word16_t Pframe;
+ spx_word16_t beta, beta_1;
+ spx_word16_t effective_echo_suppress;
+
+ st->nb_adapt++;
+ st->min_count++;
+
+ beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
+ beta_1 = Q15_ONE-beta;
+ M = st->nbands;
+ /* Deal with residual echo if provided */
+ if (st->echo_state)
+ {
+ speex_echo_get_residual(st->echo_state, st->residual_echo, N);
+#ifndef FIXED_POINT
+ /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
+ if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
+ {
+ for (i=0;i<N;i++)
+ st->residual_echo[i] = 0;
+ }
+#endif
+ for (i=0;i<N;i++)
+ st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
+ filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
+ } else {
+ for (i=0;i<N+M;i++)
+ st->echo_noise[i] = 0;
+ }
preprocess_analysis(st, x);
update_noise_prob(st);
- st->nb_preprocess++;
-
- /* Noise estimation always updated for the 20 first times */
- if (st->nb_adapt<10)
+ /* Noise estimation always updated for the 10 first frames */
+ /*if (st->nb_adapt<10)
{
- update_noise(st, ps, echo);
+ for (i=1;i<N-1;i++)
+ st->update_prob[i] = 0;
}
-
- /* Deal with residual echo if provided */
- if (echo)
- for (i=1;i<N;i++)
- st->echo_noise[i] = (.3f*st->echo_noise[i] + st->frame_size*st->frame_size*4.0*echo[i]);
-
- /* Compute a posteriori SNR */
- for (i=1;i<N;i++)
+ */
+
+ /* Update the noise estimate for the frequencies where it can be */
+ for (i=0;i<N;i++)
{
- float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
- st->post[i] = ps[i]/tot_noise - 1.f;
- if (st->post[i]>100.f)
- st->post[i]=100.f;
- /*if (st->post[i]<0)
- st->post[i]=0;*/
- mean_post+=st->post[i];
+ if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
+ st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
}
- mean_post /= N;
- if (mean_post<0.f)
- mean_post=0.f;
+ filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
/* Special case for first frame */
if (st->nb_adapt==1)
- for (i=1;i<N;i++)
+ for (i=0;i<N+M;i++)
st->old_ps[i] = ps[i];
- /* Compute a priori SNR */
- {
- /* A priori update rate */
- for (i=1;i<N;i++)
- {
- float gamma = .1+.9*st->prior[i]*st->prior[i]/((1+st->prior[i])*(1+st->prior[i]));
- float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
- /* A priori SNR update */
- st->prior[i] = gamma*max(0.0f,st->post[i]) +
- (1.f-gamma)* (.8*st->gain[i]*st->gain[i]*st->old_ps[i]/tot_noise + .2*st->prior[i]);
-
- if (st->prior[i]>100.f)
- st->prior[i]=100.f;
-
- mean_prior+=st->prior[i];
- }
- }
- mean_prior /= N;
-
-#if 0
- for (i=0;i<N;i++)
- {
- fprintf (stderr, "%f ", st->prior[i]);
- }
- fprintf (stderr, "\n");
-#endif
- /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/
-
- if (st->nb_preprocess>=20)
+ /* Compute a posteriori SNR */
+ for (i=0;i<N+M;i++)
{
- int do_update = 0;
- float noise_ener=0, sig_ener=0;
- /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
- /*if (mean_prior<.23 && mean_post < .5)*/
- if (mean_prior<.23f && mean_post < .5f)
- do_update = 1;
- for (i=1;i<N;i++)
- {
- noise_ener += st->noise[i];
- sig_ener += ps[i];
- }
- if (noise_ener > 3.f*sig_ener)
- do_update = 1;
- /*do_update = 0;*/
- if (do_update)
- {
- st->consec_noise++;
- } else {
- st->consec_noise=0;
- }
+ spx_word16_t gamma;
+
+ /* Total noise estimate including residual echo and reverberation */
+ spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
+
+ /* A posteriori SNR = ps/noise - 1*/
+ st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
+ st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
+
+ /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
+ gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
+
+ /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
+ st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
+ st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
}
- if (st->vad_enabled)
- is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);
-
+ /*print_vec(st->post, N+M, "");*/
- if (st->consec_noise>=3)
+ /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
+ st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
+ for (i=1;i<N-1;i++)
+ st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
+ MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
+ for (i=N-1;i<N+M;i++)
+ st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
+
+ /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
+ Zframe = 0;
+ for (i=N;i<N+M;i++)
+ Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
+ Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
+
+ effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
+
+ compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
+
+ /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
+ Technically this is actually wrong because the EM gaim assumes a slightly different probability
+ distribution */
+ for (i=N;i<N+M;i++)
{
- update_noise(st, st->old_ps, echo);
- } else {
- for (i=1;i<N-1;i++)
- {
- if (st->update_prob[i]<.5f/* || st->ps[i] < st->noise[i]*/)
- {
- if (echo)
- st->noise[i] = .95f*st->noise[i] + .05f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*4.0*echo[i]);
- else
- st->noise[i] = .95f*st->noise[i] + .05f*st->ps[i];
- }
- }
- }
+ /* See EM and Cohen papers*/
+ spx_word32_t theta;
+ /* Gain from hypergeometric function */
+ spx_word32_t MM;
+ /* Weiner filter gain */
+ spx_word16_t prior_ratio;
+ /* a priority probability of speech presence based on Bark sub-band alone */
+ spx_word16_t P1;
+ /* Speech absence a priori probability (considering sub-band and frame) */
+ spx_word16_t q;
+#ifdef FIXED_POINT
+ spx_word16_t tmp;
+#endif
+
+ prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
+ theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
- for (i=1;i<N;i++)
- {
- st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
+ MM = hypergeom_gain(theta);
+ /* Gain with bound */
+ st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+ /* Save old Bark power spectrum */
+ st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
+
+ P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
+ q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
+#ifdef FIXED_POINT
+ theta = MIN32(theta, EXTEND32(32767));
+/*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
+ tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
+/*Q8*/tmp = PSHR(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8);
+ st->gain2[i]=DIV32_16(SHL(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
+#else
+ st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
+#endif
}
-
+ /* Convert the EM gains and speech prob to linear frequency */
+ filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
+ filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
+
+ /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
+ if (1)
{
- int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
- int freq_end = (int)(2000.0f*2.f*N/st->sampling_rate);
- for (i=freq_start;i<freq_end;i++)
+ filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
+
+ /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
+ for (i=0;i<N;i++)
{
- Zframe += st->zeta[i];
- }
- Zframe /= (freq_end-freq_start);
- }
- st->Zlast = Zframe;
-
- Pframe = qcurve(Zframe);
+ spx_word32_t MM;
+ spx_word32_t theta;
+ spx_word16_t prior_ratio;
+ spx_word16_t tmp;
+ spx_word16_t p;
+ spx_word16_t g;
+
+ /* Wiener filter gain */
+ prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
+ theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
+
+ /* Optimal estimator for loudness domain */
+ MM = hypergeom_gain(theta);
+ /* EM gain with bound */
+ g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+ /* Interpolated speech probability of presence */
+ p = st->gain2[i];
+
+ /* Constrain the gain to be close to the Bark scale gain */
+ if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
+ g = MULT16_16(3,st->gain[i]);
+ st->gain[i] = g;
+
+ /* Save old power spectrum */
+ st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
+
+ /* Apply gain floor */
+ if (st->gain[i] < st->gain_floor[i])
+ st->gain[i] = st->gain_floor[i];
- /*fprintf (stderr, "%f\n", Pframe);*/
- /* Compute gain according to the Ephraim-Malah algorithm */
- for (i=1;i<N;i++)
- {
- float MM;
- float theta;
- float prior_ratio;
- float p, q;
- float zeta1;
- float P1;
-
- prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
- theta = (1.f+st->post[i])*prior_ratio;
-
- if (i==1 || i==N-1)
- zeta1 = st->zeta[i];
- else
- zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
- P1 = qcurve (zeta1);
-
- /* FIXME: add global prob (P2) */
- q = 1-Pframe*P1;
- q = 1-P1;
- if (q>.95f)
- q=.95f;
- p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
- /*p=1;*/
-
- /* Optimal estimator for loudness domain */
- MM = hypergeom_gain(theta);
+ /* Exponential decay model for reverberation (unused) */
+ /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
+
+ /* Take into account speech probability of presence (loudness domain MMSE estimator) */
+ /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
+ tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
+ st->gain2[i]=SQR16_Q15(tmp);
- st->gain[i] = prior_ratio * MM;
- /*Put some (very arbitraty) limit on the gain*/
- if (st->gain[i]>2.f)
- {
- st->gain[i]=2.f;
+ /* Use this if you want a log-domain MMSE estimator instead */
+ /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
}
-
- st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
- if (st->denoise_enabled)
+ } else {
+ for (i=N;i<N+M;i++)
{
- st->gain2[i] = p*p*st->gain[i];
- /*st->gain2[i]=(p*sqrt(st->gain[i])+.05*(1-p))*(p*sqrt(st->gain[i])+.05*(1-p));*/
- /*st->gain2[i] = pow(st->gain[i], p) * pow(.2f,1.f-p);*/
- } else {
- st->gain2[i]=1.f;
+ spx_word16_t tmp;
+ spx_word16_t p = st->gain2[i];
+ st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);
+ tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
+ st->gain2[i]=SQR16_Q15(tmp);
}
+ filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
}
- st->gain2[0]=st->gain[0]=0.f;
- st->gain2[N-1]=st->gain[N-1]=0.f;
- /*
- for (i=30;i<N-2;i++)
+ /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
+ if (!st->denoise_enabled)
{
- st->gain[i] = st->gain2[i]*st->gain2[i] + (1-st->gain2[i])*.333*(.6*st->gain2[i-1]+st->gain2[i]+.6*st->gain2[i+1]+.4*st->gain2[i-2]+.4*st->gain2[i+2]);
+ for (i=0;i<N+M;i++)
+ st->gain2[i]=Q15_ONE;
}
- for (i=30;i<N-2;i++)
- st->gain2[i] = st->gain[i];
- */
+
+ /*FIXME: This *will* not work for fixed-point */
+#ifndef FIXED_POINT
if (st->agc_enabled)
- speex_compute_agc(st, mean_prior);
-
-#if 0
- if (!is_speech)
- {
- for (i=0;i<N;i++)
- st->gain2[i] = 0;
- }
-#if 0
- else {
- for (i=0;i<N;i++)
- st->gain2[i] = 1;
- }
-#endif
+ speex_compute_agc(st);
#endif
-
+
/* Apply computed gain */
for (i=1;i<N;i++)
{
- st->frame[2*i-1] *= st->gain2[i];
- st->frame[2*i] *= st->gain2[i];
+ st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
+ st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
}
-
- /* Get rid of the DC and very low frequencies */
- st->frame[0]=0;
- st->frame[1]=0;
- st->frame[2]=0;
- /* Nyquist frequency is mostly useless too */
- st->frame[2*N-1]=0;
-
+ st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
+ st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
+
/* Inverse FFT with 1/N scaling */
- spx_drft_backward(st->fft_lookup, st->frame);
-
+ spx_ifft(st->fft_lookup, st->ft, st->frame);
+ /* Scale back to original (lower) amplitude */
for (i=0;i<2*N;i++)
- st->frame[i] *= scale;
+ st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
+ /*FIXME: This *will* not work for fixed-point */
+#ifndef FIXED_POINT
+ if (st->agc_enabled)
{
float max_sample=0;
for (i=0;i<2*N;i++)
@@ -880,9 +964,11 @@ int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo
st->frame[i] *= damp;
}
}
-
+#endif
+
+ /* Synthesis window (for WOLA) */
for (i=0;i<2*N;i++)
- st->frame[i] *= st->window[i];
+ st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
/* Perform overlap and add */
for (i=0;i<N3;i++)
@@ -894,47 +980,55 @@ int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo
for (i=0;i<N3;i++)
st->outbuf[i] = st->frame[st->frame_size+i];
- /* Save old power spectrum */
- for (i=1;i<N;i++)
- st->old_ps[i] = ps[i];
-
- return is_speech;
+ /* FIXME: This VAD is a kludge */
+ if (st->vad_enabled)
+ {
+ if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue))
+ {
+ st->was_speech=1;
+ return 1;
+ } else
+ {
+ st->was_speech=0;
+ return 0;
+ }
+ } else {
+ return 1;
+ }
}
-void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
+void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
{
int i;
int N = st->ps_size;
int N3 = 2*N - st->frame_size;
+ int M;
+ spx_word32_t *ps=st->ps;
- float *ps=st->ps;
-
+ M = st->nbands;
+ st->min_count++;
+
preprocess_analysis(st, x);
update_noise_prob(st);
-
- st->nb_preprocess++;
for (i=1;i<N-1;i++)
{
- if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
+ if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT))
{
- if (echo)
- st->noise[i] = .95f*st->noise[i] + .1f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*4.0*echo[i]);
- else
- st->noise[i] = .95f*st->noise[i] + .1f*st->ps[i];
+ st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT));
}
}
for (i=0;i<N3;i++)
- st->outbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i];
+ st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]);
/* Save old power spectrum */
- for (i=1;i<N;i++)
+ for (i=0;i<N+M;i++)
st->old_ps[i] = ps[i];
- for (i=1;i<N;i++)
- st->reverb_estimate[i] *= st->reverb_decay;
+ for (i=0;i<N;i++)
+ st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]);
}
@@ -946,17 +1040,17 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
switch(request)
{
case SPEEX_PREPROCESS_SET_DENOISE:
- st->denoise_enabled = (*(int*)ptr);
+ st->denoise_enabled = (*(spx_int32_t*)ptr);
break;
case SPEEX_PREPROCESS_GET_DENOISE:
- (*(int*)ptr) = st->denoise_enabled;
+ (*(spx_int32_t*)ptr) = st->denoise_enabled;
break;
-
+#ifndef FIXED_POINT
case SPEEX_PREPROCESS_SET_AGC:
- st->agc_enabled = (*(int*)ptr);
+ st->agc_enabled = (*(spx_int32_t*)ptr);
break;
case SPEEX_PREPROCESS_GET_AGC:
- (*(int*)ptr) = st->agc_enabled;
+ (*(spx_int32_t*)ptr) = st->agc_enabled;
break;
case SPEEX_PREPROCESS_SET_AGC_LEVEL:
@@ -969,21 +1063,22 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
case SPEEX_PREPROCESS_GET_AGC_LEVEL:
(*(float*)ptr) = st->agc_level;
break;
-
+#endif
case SPEEX_PREPROCESS_SET_VAD:
- st->vad_enabled = (*(int*)ptr);
+ speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
+ st->vad_enabled = (*(spx_int32_t*)ptr);
break;
case SPEEX_PREPROCESS_GET_VAD:
- (*(int*)ptr) = st->vad_enabled;
+ (*(spx_int32_t*)ptr) = st->vad_enabled;
break;
case SPEEX_PREPROCESS_SET_DEREVERB:
- st->dereverb_enabled = (*(int*)ptr);
+ st->dereverb_enabled = (*(spx_int32_t*)ptr);
for (i=0;i<st->ps_size;i++)
st->reverb_estimate[i]=0;
break;
case SPEEX_PREPROCESS_GET_DEREVERB:
- (*(int*)ptr) = st->dereverb_enabled;
+ (*(spx_int32_t*)ptr) = st->dereverb_enabled;
break;
case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
@@ -1001,24 +1096,47 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
break;
case SPEEX_PREPROCESS_SET_PROB_START:
- st->speech_prob_start = (*(int*)ptr) / 100.0;
- if ( st->speech_prob_start > 1 || st->speech_prob_start < 0 )
- st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
+ *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr));
+ st->speech_prob_start = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100);
break;
case SPEEX_PREPROCESS_GET_PROB_START:
- (*(int*)ptr) = st->speech_prob_start * 100;
+ (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
break;
case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
- st->speech_prob_continue = (*(int*)ptr) / 100.0;
- if ( st->speech_prob_continue > 1 || st->speech_prob_continue < 0 )
- st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
+ *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr));
+ st->speech_prob_continue = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100);
break;
case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
- (*(int*)ptr) = st->speech_prob_continue * 100;
+ (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
+ break;
+
+ case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
+ st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
+ break;
+ case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
+ (*(spx_int32_t*)ptr) = st->noise_suppress;
+ break;
+ case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
+ st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
+ break;
+ case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
+ (*(spx_int32_t*)ptr) = st->echo_suppress;
+ break;
+ case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
+ st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
+ break;
+ case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
+ (*(spx_int32_t*)ptr) = st->echo_suppress_active;
+ break;
+ case SPEEX_PREPROCESS_SET_ECHO_STATE:
+ st->echo_state = (SpeexEchoState*)ptr;
+ break;
+ case SPEEX_PREPROCESS_GET_ECHO_STATE:
+ ptr = (void*)st->echo_state;
break;
- default:
+ default:
speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
return -1;
}