From 67d4e56dc8a365871c3dca4f04fcf8b9c9f47ee6 Mon Sep 17 00:00:00 2001 From: Benny Prijono Date: Thu, 23 Nov 2006 10:19:46 +0000 Subject: Updated Speex to their latest SVN (1.2-beta). AEC seems to work much better now and take less CPU, so I increased default tail length in PJSUA to 800ms. git-svn-id: http://svn.pjsip.org/repos/pjproject/trunk@823 74dad513-b988-da41-8d7b-12977e46ad98 --- pjmedia/src/pjmedia-codec/speex/preprocess_spx.c | 1258 ++++++++++++---------- 1 file changed, 688 insertions(+), 570 deletions(-) (limited to 'pjmedia/src/pjmedia-codec/speex/preprocess_spx.c') diff --git a/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c b/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c index ba2e6076..d82230cf 100644 --- a/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c +++ b/pjmedia/src/pjmedia-codec/speex/preprocess_spx.c @@ -1,6 +1,6 @@ -/* Copyright (C) 2003 Epic Games - Written by Jean-Marc Valin - +/* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin) + Copyright (C) 2004-2006 Epic Games + File: preprocess.c Preprocessor with denoising based on the algorithm by Ephraim and Malah @@ -31,96 +31,356 @@ POSSIBILITY OF SUCH DAMAGE. */ + +/* + Recommended papers: + + Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error + short-time spectral amplitude estimator". IEEE Transactions on Acoustics, + Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984. + + Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error + log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and + Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985. + + I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments". + Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001. + + Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic + approach to combined acoustic echo cancellation and noise reduction". IEEE + Transactions on Speech and Audio Processing, 2002. + + J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation + of simultaneous non-stationary sources". In Proceedings IEEE International + Conference on Acoustics, Speech, and Signal Processing, 2004. +*/ + #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "speex/speex_preprocess.h" +#include "speex/speex_echo.h" #include "misc.h" -#include "smallft.h" - -#define max(a,b) ((a) > (b) ? (a) : (b)) -#define min(a,b) ((a) < (b) ? (a) : (b)) +#include "fftwrap.h" +#include "filterbank.h" +#include "math_approx.h" #ifndef M_PI #define M_PI 3.14159263 #endif -#define SQRT_M_PI_2 0.88623 #define LOUDNESS_EXP 2.5 -#define NB_BANDS 8 +#define NB_BANDS 24 + +#define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15) +#define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15) +#define NOISE_SUPPRESS_DEFAULT -25 +#define ECHO_SUPPRESS_DEFAULT -45 +#define ECHO_SUPPRESS_ACTIVE_DEFAULT -15 + +#ifndef NULL +#define NULL 0 +#endif + +#define SQR(x) ((x)*(x)) +#define SQR16(x) (MULT16_16((x),(x))) +#define SQR16_Q15(x) (MULT16_16_Q15((x),(x))) + +#ifdef FIXED_POINT +static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b) +{ + if (SHR32(a,7) >= b) + { + return 32767; + } else { + if (b>=QCONST32(1,23)) + { + a = SHR32(a,8); + b = SHR32(b,8); + } + if (b>=QCONST32(1,19)) + { + a = SHR32(a,4); + b = SHR32(b,4); + } + if (b>=QCONST32(1,15)) + { + a = SHR32(a,4); + b = SHR32(b,4); + } + a = SHL32(a,8); + return PDIV32_16(a,b); + } + +} +static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b) +{ + if (SHR32(a,15) >= b) + { + return 32767; + } else { + if (b>=QCONST32(1,23)) + { + a = SHR32(a,8); + b = SHR32(b,8); + } + if (b>=QCONST32(1,19)) + { + a = SHR32(a,4); + b = SHR32(b,4); + } + if (b>=QCONST32(1,15)) + { + a = SHR32(a,4); + b = SHR32(b,4); + } + a = SHL32(a,15)-a; + return DIV32_16(a,b); + } +} +#define SNR_SCALING 256.f +#define SNR_SCALING_1 0.0039062f +#define SNR_SHIFT 8 + +#define FRAC_SCALING 32767.f +#define FRAC_SCALING_1 3.0518e-05 +#define FRAC_SHIFT 1 -#define SPEEX_PROB_START_DEFAULT 0.35f -#define SPEEX_PROB_CONTINUE_DEFAULT 0.20f +#define EXPIN_SCALING 2048.f +#define EXPIN_SCALING_1 0.00048828f +#define EXPIN_SHIFT 11 +#define EXPOUT_SCALING_1 1.5259e-05 -#define ZMIN .1 -#define ZMAX .316 -#define ZMIN_1 10 -#define LOG_MIN_MAX_1 0.86859 +#define NOISE_SHIFT 7 -static void conj_window(float *w, int len) +#else + +#define DIV32_16_Q8(a,b) ((a)/(b)) +#define DIV32_16_Q15(a,b) ((a)/(b)) +#define SNR_SCALING 1.f +#define SNR_SCALING_1 1.f +#define SNR_SHIFT 0 +#define FRAC_SCALING 1.f +#define FRAC_SCALING_1 1.f +#define FRAC_SHIFT 0 +#define NOISE_SHIFT 0 + +#define EXPIN_SCALING 1.f +#define EXPIN_SCALING_1 1.f +#define EXPOUT_SCALING_1 1.f + +#endif + +/** Speex pre-processor state. */ +struct SpeexPreprocessState_ { + /* Basic info */ + int frame_size; /**< Number of samples processed each time */ + int ps_size; /**< Number of points in the power spectrum */ + int sampling_rate; /**< Sampling rate of the input/output */ + int nbands; + FilterBank *bank; + + /* Parameters */ + int denoise_enabled; + int agc_enabled; + float agc_level; + int vad_enabled; + int dereverb_enabled; + spx_word16_t reverb_decay; + spx_word16_t reverb_level; + spx_word16_t speech_prob_start; + spx_word16_t speech_prob_continue; + int noise_suppress; + int echo_suppress; + int echo_suppress_active; + SpeexEchoState *echo_state; + + /* DSP-related arrays */ + spx_word16_t *frame; /**< Processing frame (2*ps_size) */ + spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */ + spx_word32_t *ps; /**< Current power spectrum */ + spx_word16_t *gain2; /**< Adjusted gains */ + spx_word16_t *gain_floor; /**< Minimum gain allowed */ + spx_word16_t *window; /**< Analysis/Synthesis window */ + spx_word32_t *noise; /**< Noise estimate */ + spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */ + spx_word32_t *old_ps; /**< Power spectrum for last frame */ + spx_word16_t *gain; /**< Ephraim Malah gain */ + spx_word16_t *prior; /**< A-priori SNR */ + spx_word16_t *post; /**< A-posteriori SNR */ + + spx_word32_t *S; /**< Smoothed power spectrum */ + spx_word32_t *Smin; /**< See Cohen paper */ + spx_word32_t *Stmp; /**< See Cohen paper */ + int *update_prob; /**< Propability of speech presence for noise update */ + + spx_word16_t *zeta; /**< Smoothed a priori SNR */ + spx_word32_t *echo_noise; + spx_word32_t *residual_echo; + + /* Misc */ + spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */ + spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */ + +#ifndef FIXED_POINT + float *loudness_weight; /**< Perceptual loudness curve */ + float loudness; /**< loudness estimate */ + float loudness2; /**< loudness estimate */ + int nb_loudness_adapt; /**< Number of frames used for loudness adaptation so far */ +#endif + int nb_adapt; /**< Number of frames used for adaptation so far */ + int was_speech; + int min_count; /**< Number of frames processed so far */ + void *fft_lookup; /**< Lookup table for the FFT */ +#ifdef FIXED_POINT + int frame_shift; +#endif +}; + + +static void conj_window(spx_word16_t *w, int len) { int i; for (i=0;i19) + return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT)))); + frac = SHL32(xx-SHL32(ind,10),5); + return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7); +} + +static inline spx_word16_t qcurve(spx_word16_t x) +{ + x = MAX16(x, 1); + return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x)))); +} + +/* Compute the gain floor based on different floors for the background noise and residual echo */ +static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len) +{ + int i; + + if (noise_suppress > effective_echo_suppress) + { + spx_word16_t noise_gain, gain_ratio; + noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1))); + gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1))); + + /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */ + for (i=0;i9.5) - return 1+.1296/x; - - integer = floor(2*x); - frac = 2*x-integer; - ind = (int)integer; - - return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f); + x = EXPIN_SCALING_1*xx; + integer = floor(2*x); + ind = (int)integer; + if (ind<0) + return FRAC_SCALING; + if (ind>19) + return FRAC_SCALING*(1+.1296/x); + frac = 2*x-integer; + return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f); } -static inline float qcurve(float x) +static inline spx_word16_t qcurve(spx_word16_t x) { - return 1.f/(1.f+.1f/(x*x)); + return 1.f/(1.f+.15f/(SNR_SCALING_1*x)); } +static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len) +{ + int i; + float echo_floor; + float noise_floor; + + noise_floor = exp(.2302585f*noise_suppress); + echo_floor = exp(.2302585f*effective_echo_suppress); + + /* Compute the gain floor based on different floors for the background noise and residual echo */ + for (i=0;iframe_size = frame_size; @@ -157,45 +417,49 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r st->agc_level = 8000; st->vad_enabled = 0; st->dereverb_enabled = 0; - st->reverb_decay = .5; - st->reverb_level = .2; - - st->speech_prob_start = SPEEX_PROB_START_DEFAULT; - st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT; - - st->frame = (float*)speex_alloc(2*N*sizeof(float)); - st->ps = (float*)speex_alloc(N*sizeof(float)); - st->gain2 = (float*)speex_alloc(N*sizeof(float)); - st->window = (float*)speex_alloc(2*N*sizeof(float)); - st->noise = (float*)speex_alloc(N*sizeof(float)); - st->reverb_estimate = (float*)speex_alloc(N*sizeof(float)); - st->old_ps = (float*)speex_alloc(N*sizeof(float)); - st->gain = (float*)speex_alloc(N*sizeof(float)); - st->prior = (float*)speex_alloc(N*sizeof(float)); - st->post = (float*)speex_alloc(N*sizeof(float)); - st->loudness_weight = (float*)speex_alloc(N*sizeof(float)); - st->inbuf = (float*)speex_alloc(N3*sizeof(float)); - st->outbuf = (float*)speex_alloc(N3*sizeof(float)); - st->echo_noise = (float*)speex_alloc(N*sizeof(float)); - - st->S = (float*)speex_alloc(N*sizeof(float)); - st->Smin = (float*)speex_alloc(N*sizeof(float)); - st->Stmp = (float*)speex_alloc(N*sizeof(float)); - st->update_prob = (float*)speex_alloc(N*sizeof(float)); + st->reverb_decay = 0; + st->reverb_level = 0; + st->noise_suppress = NOISE_SUPPRESS_DEFAULT; + st->echo_suppress = ECHO_SUPPRESS_DEFAULT; + st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT; - st->zeta = (float*)speex_alloc(N*sizeof(float)); - st->Zpeak = 0; - st->Zlast = 0; + st->speech_prob_start = SPEECH_PROB_START_DEFAULT; + st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT; - st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float)); - st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float)); - st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float)); - st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float)); - st->noise_bandsN = st->speech_bandsN = 1; + st->echo_state = NULL; + + st->nbands = NB_BANDS; + M = st->nbands; + st->bank = filterbank_new(M, sampling_rate, N, 1); + + st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); + st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); + st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); + + st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); + st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); + + st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); + st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); + st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); + st->update_prob = (int*)speex_alloc(N*sizeof(int)); + + st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t)); + st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t)); conj_window(st->window, 2*N3); for (i=2*N3;i<2*st->ps_size;i++) - st->window[i]=1; + st->window[i]=Q15_ONE; if (N4>0) { @@ -205,22 +469,25 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r st->window[i+N3]=1; } } - for (i=0;inoise[i]=1e4; - st->reverb_estimate[i]=0.; - st->old_ps[i]=1e4; - st->gain[i]=1; - st->post[i]=1; - st->prior[i]=1; + st->noise[i]=QCONST32(1.f,NOISE_SHIFT); + st->reverb_estimate[i]=0; + st->old_ps[i]=1; + st->gain[i]=Q15_ONE; + st->post[i]=SHL16(1, SNR_SHIFT); + st->prior[i]=SHL16(1, SNR_SHIFT); } + for (i=0;iupdate_prob[i] = 1; for (i=0;iinbuf[i]=0; st->outbuf[i]=0; } - +#ifndef FIXED_POINT + st->loudness_weight = (float*)speex_alloc(N*sizeof(float)); for (i=0;iloudness_weight[i]=.01f; st->loudness_weight[i] *= st->loudness_weight[i]; } - - st->speech_prob = 0; - st->last_speech = 1000; st->loudness = pow(6000,LOUDNESS_EXP); st->loudness2 = 6000; st->nb_loudness_adapt = 0; +#endif + st->was_speech = 0; - st->fft_lookup = (struct drft_lookup*)speex_alloc(sizeof(struct drft_lookup)); - spx_drft_init(st->fft_lookup,2*N); + st->fft_lookup = spx_fft_init(2*N); st->nb_adapt=0; - st->consec_noise=0; - st->nb_preprocess=0; + st->min_count=0; return st; } void speex_preprocess_state_destroy(SpeexPreprocessState *st) { speex_free(st->frame); + speex_free(st->ft); speex_free(st->ps); speex_free(st->gain2); + speex_free(st->gain_floor); speex_free(st->window); speex_free(st->noise); speex_free(st->reverb_estimate); @@ -257,8 +523,11 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st) speex_free(st->gain); speex_free(st->prior); speex_free(st->post); +#ifndef FIXED_POINT speex_free(st->loudness_weight); +#endif speex_free(st->echo_noise); + speex_free(st->residual_echo); speex_free(st->S); speex_free(st->Smin); @@ -266,241 +535,17 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st) speex_free(st->update_prob); speex_free(st->zeta); - speex_free(st->noise_bands); - speex_free(st->noise_bands2); - speex_free(st->speech_bands); - speex_free(st->speech_bands2); - speex_free(st->inbuf); speex_free(st->outbuf); - spx_drft_clear(st->fft_lookup); - speex_free(st->fft_lookup); - + spx_fft_destroy(st->fft_lookup); + filterbank_destroy(st->bank); speex_free(st); } -static void update_noise(SpeexPreprocessState *st, float *ps, spx_int32_t *echo) -{ - int i; - float beta; - st->nb_adapt++; - beta=1.0f/st->nb_adapt; - if (beta < .05f) - beta=.05f; - - if (!echo) - { - for (i=0;ips_size;i++) - st->noise[i] = (1.f-beta)*st->noise[i] + beta*ps[i]; - } else { - for (i=0;ips_size;i++) - st->noise[i] = (1.f-beta)*st->noise[i] + beta*max(1.f,ps[i]-st->frame_size*st->frame_size*4.0*echo[i]); -#if 0 - for (i=0;ips_size;i++) - st->noise[i] = 0; -#endif - } -} - -static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post) -{ - int i, is_speech=0; - int N = st->ps_size; - float scale=.5f/N; - - /* FIXME: Clean this up a bit */ - { - float bands[NB_BANDS]; - int j; - float p0, p1; - float tot_loudness=0; - float x = sqrt(mean_post); - - for (i=5;ips[i] * st->loudness_weight[i]; - } - - for (i=0;ispeech_prob + .01*(1-st->speech_prob); - p1 *= .01*st->speech_prob + .99*(1-st->speech_prob); - - st->speech_prob = p0/(p1+p0); - */ - - if (st->noise_bandsN < 50 || st->speech_bandsN < 50) - { - if (mean_post > 5.f) - { - float adapt = 1./st->speech_bandsN++; - if (adapt<.005f) - adapt = .005f; - for (i=0;ispeech_bands[i] = (1.f-adapt)*st->speech_bands[i] + adapt*bands[i]; - /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/ - st->speech_bands2[i] = (1.f-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]); - } - } else { - float adapt = 1./st->noise_bandsN++; - if (adapt<.005f) - adapt = .005f; - for (i=0;inoise_bands[i] = (1.f-adapt)*st->noise_bands[i] + adapt*bands[i]; - /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/ - st->noise_bands2[i] = (1.f-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]); - } - } - } - p0=p1=1; - for (i=0;inoise_bands2[i] - st->noise_bands[i]*st->noise_bands[i]; - speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/ - noise_var = st->noise_bands2[i]; - speech_var = st->speech_bands2[i]; - if (noise_var < .1f) - noise_var = .1f; - if (speech_var < .1f) - speech_var = .1f; - - /*speech_var = sqrt(speech_var*noise_var); - noise_var = speech_var;*/ - if (speech_var < .05f*speech_var) - noise_var = .05f*speech_var; - if (speech_var < .05f*noise_var) - speech_var = .05f*noise_var; - - if (bands[i] < st->noise_bands[i]) - speech_var = noise_var; - if (bands[i] > st->speech_bands[i]) - noise_var = speech_var; - - speech_mean = st->speech_bands[i]; - noise_mean = st->noise_bands[i]; - if (noise_mean < speech_mean - 5.f) - noise_mean = speech_mean - 5.f; - - tmp1 = exp(-.5f*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2.f*M_PI*speech_var); - tmp2 = exp(-.5f*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2.f*M_PI*noise_var); - /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/ - /*fprintf (stderr, "%f ", (float)(bands[i]));*/ - pr = tmp1/(1e-25+tmp1+tmp2); - /*if (bands[i] < st->noise_bands[i]) - pr=.01; - if (bands[i] > st->speech_bands[i] && pr < .995) - pr=.995;*/ - if (pr>.999f) - pr=.999f; - if (pr<.001f) - pr=.001f; - /*fprintf (stderr, "%f ", pr);*/ - p0 *= pr; - p1 *= (1-pr); - } - - p0 = pow(p0,.2); - p1 = pow(p1,.2); - -#if 1 - p0 *= 2.f; - p0=p0/(p1+p0); - if (st->last_speech>20) - { - float tmp = sqrt(tot_loudness)/st->loudness2; - tmp = 1.f-exp(-10.f*tmp); - if (p0>tmp) - p0=tmp; - } - p1=1-p0; -#else - if (sqrt(tot_loudness) < .6f*st->loudness2 && p0>15.f*p1) - p0=15.f*p1; - if (sqrt(tot_loudness) < .45f*st->loudness2 && p0>7.f*p1) - p0=7.f*p1; - if (sqrt(tot_loudness) < .3f*st->loudness2 && p0>3.f*p1) - p0=3.f*p1; - if (sqrt(tot_loudness) < .15f*st->loudness2 && p0>p1) - p0=p1; - /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/ -#endif - - p0 *= .99f*st->speech_prob + .01f*(1-st->speech_prob); - p1 *= .01f*st->speech_prob + .99f*(1-st->speech_prob); - - st->speech_prob = p0/(1e-25f+p1+p0); - /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/ - - if (st->speech_prob > st->speech_prob_start - || (st->last_speech < 20 && st->speech_prob > st->speech_prob_continue)) - { - is_speech = 1; - st->last_speech = 0; - } else { - st->last_speech++; - if (st->last_speech<20) - is_speech = 1; - } - - if (st->noise_bandsN > 50 && st->speech_bandsN > 50) - { - if (mean_post > 5) - { - float adapt = 1./st->speech_bandsN++; - if (adapt<.005f) - adapt = .005f; - for (i=0;ispeech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i]; - /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/ - st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]); - } - } else { - float adapt = 1./st->noise_bandsN++; - if (adapt<.005f) - adapt = .005f; - for (i=0;inoise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i]; - /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/ - st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]); - } - } - } - - - } - - return is_speech; -} - -static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior) +/* FIXME: The AGC doesn't work yet with fixed-point*/ +#ifndef FIXED_POINT +static void speex_compute_agc(SpeexPreprocessState *st) { int i; int N = st->ps_size; @@ -535,7 +580,7 @@ static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior) for (i=2;ips[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i]; + loudness += scale*st->ps[i] * FRAC_SCALING_1*FRAC_SCALING_1*st->gain2[i] * st->gain2[i] * st->loudness_weight[i]; } loudness=sqrt(loudness); /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) && @@ -558,6 +603,7 @@ static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior) st->gain2[i] *= agc_gain; } +#endif static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x) { @@ -565,7 +611,7 @@ static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x) int N = st->ps_size; int N3 = 2*N - st->frame_size; int N4 = st->frame_size - N3; - float *ps=st->ps; + spx_word32_t *ps=st->ps; /* 'Build' input frame */ for (i=0;iframe[i] *= st->window[i]; + st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]); +#ifdef FIXED_POINT + { + spx_word16_t max_val=0; + for (i=0;i<2*N;i++) + max_val = MAX16(max_val, ABS16(st->frame[i])); + st->frame_shift = 14-spx_ilog2(EXTEND32(max_val)); + for (i=0;i<2*N;i++) + st->frame[i] = SHL16(st->frame[i], st->frame_shift); + } +#endif + /* Perform FFT */ - spx_drft_forward(st->fft_lookup, st->frame); - + spx_fft(st->fft_lookup, st->frame, st->ft); + /* Power spectrum */ - ps[0]=1; + ps[0]=MULT16_16(st->ft[0],st->ft[0]); for (i=1;iframe[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i]; + ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]); + for (i=0;ips[i] = PSHR32(st->ps[i], 2*st->frame_shift); + filterbank_compute_bank32(st->bank, ps, ps+N); } static void update_noise_prob(SpeexPreprocessState *st) { int i; + int min_range; int N = st->ps_size; for (i=1;iS[i] = 100.f+ .8f*st->S[i] + .05f*st->ps[i-1]+.1f*st->ps[i]+.05f*st->ps[i+1]; + st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1]) + + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]); + st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]); + st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]); - if (st->nb_preprocess<1) + if (st->nb_adapt==1) { - for (i=1;iSmin[i] = st->Stmp[i] = st->S[i]+100.f; + for (i=0;iSmin[i] = st->Stmp[i] = 0; } - if (st->nb_preprocess%200==0) + if (st->nb_adapt < 100) + min_range = 15; + else if (st->nb_adapt < 1000) + min_range = 50; + else if (st->nb_adapt < 10000) + min_range = 150; + else + min_range = 300; + if (st->min_count > min_range) { - for (i=1;imin_count = 0; + for (i=0;iSmin[i] = min(st->Stmp[i], st->S[i]); + st->Smin[i] = MIN32(st->Stmp[i], st->S[i]); st->Stmp[i] = st->S[i]; } } else { - for (i=1;iSmin[i] = min(st->Smin[i], st->S[i]); - st->Stmp[i] = min(st->Stmp[i], st->S[i]); + st->Smin[i] = MIN32(st->Smin[i], st->S[i]); + st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]); } } - for (i=1;iupdate_prob[i] *= .2f; - if (st->S[i] > 2.5*st->Smin[i]) - st->update_prob[i] += .8f; + if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > ADD32(st->Smin[i],EXTEND32(20))) + st->update_prob[i] = 1; + else + st->update_prob[i] = 0; /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/ /*fprintf (stderr, "%f ", st->update_prob[i]);*/ } } -#define NOISE_OVERCOMPENS 1.4 +#define NOISE_OVERCOMPENS 1. + +void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len); int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo) +{ + return speex_preprocess_run(st, x); +} + +int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x) { int i; - int is_speech=1; - float mean_post=0; - float mean_prior=0; + int M; int N = st->ps_size; int N3 = 2*N - st->frame_size; int N4 = st->frame_size - N3; - float scale=.5f/N; - float *ps=st->ps; - float Zframe=0, Pframe; - + spx_word32_t *ps=st->ps; + spx_word32_t Zframe; + spx_word16_t Pframe; + spx_word16_t beta, beta_1; + spx_word16_t effective_echo_suppress; + + st->nb_adapt++; + st->min_count++; + + beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt)); + beta_1 = Q15_ONE-beta; + M = st->nbands; + /* Deal with residual echo if provided */ + if (st->echo_state) + { + speex_echo_get_residual(st->echo_state, st->residual_echo, N); +#ifndef FIXED_POINT + /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */ + if (!(st->residual_echo[0] >=0 && st->residual_echo[0]residual_echo[i] = 0; + } +#endif + for (i=0;iecho_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]); + filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N); + } else { + for (i=0;iecho_noise[i] = 0; + } preprocess_analysis(st, x); update_noise_prob(st); - st->nb_preprocess++; - - /* Noise estimation always updated for the 20 first times */ - if (st->nb_adapt<10) + /* Noise estimation always updated for the 10 first frames */ + /*if (st->nb_adapt<10) { - update_noise(st, ps, echo); + for (i=1;iupdate_prob[i] = 0; } - - /* Deal with residual echo if provided */ - if (echo) - for (i=1;iecho_noise[i] = (.3f*st->echo_noise[i] + st->frame_size*st->frame_size*4.0*echo[i]); - - /* Compute a posteriori SNR */ - for (i=1;inoise[i] + st->echo_noise[i] + st->reverb_estimate[i]; - st->post[i] = ps[i]/tot_noise - 1.f; - if (st->post[i]>100.f) - st->post[i]=100.f; - /*if (st->post[i]<0) - st->post[i]=0;*/ - mean_post+=st->post[i]; + if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT)) + st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT))); } - mean_post /= N; - if (mean_post<0.f) - mean_post=0.f; + filterbank_compute_bank32(st->bank, st->noise, st->noise+N); /* Special case for first frame */ if (st->nb_adapt==1) - for (i=1;iold_ps[i] = ps[i]; - /* Compute a priori SNR */ - { - /* A priori update rate */ - for (i=1;iprior[i]*st->prior[i]/((1+st->prior[i])*(1+st->prior[i])); - float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i]; - /* A priori SNR update */ - st->prior[i] = gamma*max(0.0f,st->post[i]) + - (1.f-gamma)* (.8*st->gain[i]*st->gain[i]*st->old_ps[i]/tot_noise + .2*st->prior[i]); - - if (st->prior[i]>100.f) - st->prior[i]=100.f; - - mean_prior+=st->prior[i]; - } - } - mean_prior /= N; - -#if 0 - for (i=0;iprior[i]); - } - fprintf (stderr, "\n"); -#endif - /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/ - - if (st->nb_preprocess>=20) + /* Compute a posteriori SNR */ + for (i=0;inoise[i]; - sig_ener += ps[i]; - } - if (noise_ener > 3.f*sig_ener) - do_update = 1; - /*do_update = 0;*/ - if (do_update) - { - st->consec_noise++; - } else { - st->consec_noise=0; - } + spx_word16_t gamma; + + /* Total noise estimate including residual echo and reverberation */ + spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]); + + /* A posteriori SNR = ps/noise - 1*/ + st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT)); + st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT)); + + /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */ + gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise)))); + + /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */ + st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15)); + st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT)); } - if (st->vad_enabled) - is_speech = speex_compute_vad(st, ps, mean_prior, mean_post); - + /*print_vec(st->post, N+M, "");*/ - if (st->consec_noise>=3) + /* Recursive average of the a priori SNR. A bit smoothed for the psd components */ + st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15); + for (i=1;izeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])), + MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15); + for (i=N-1;izeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15); + + /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */ + Zframe = 0; + for (i=N;izeta[i])); + Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands))); + + effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15)); + + compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M); + + /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale) + Technically this is actually wrong because the EM gaim assumes a slightly different probability + distribution */ + for (i=N;iold_ps, echo); - } else { - for (i=1;iupdate_prob[i]<.5f/* || st->ps[i] < st->noise[i]*/) - { - if (echo) - st->noise[i] = .95f*st->noise[i] + .05f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*4.0*echo[i]); - else - st->noise[i] = .95f*st->noise[i] + .05f*st->ps[i]; - } - } - } + /* See EM and Cohen papers*/ + spx_word32_t theta; + /* Gain from hypergeometric function */ + spx_word32_t MM; + /* Weiner filter gain */ + spx_word16_t prior_ratio; + /* a priority probability of speech presence based on Bark sub-band alone */ + spx_word16_t P1; + /* Speech absence a priori probability (considering sub-band and frame) */ + spx_word16_t q; +#ifdef FIXED_POINT + spx_word16_t tmp; +#endif + + prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT))); + theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT)); - for (i=1;izeta[i] = .7f*st->zeta[i] + .3f*st->prior[i]; + MM = hypergeom_gain(theta); + /* Gain with bound */ + st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM))); + /* Save old Bark power spectrum */ + st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]); + + P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i])); + q = Q15_ONE-MULT16_16_Q15(Pframe,P1); +#ifdef FIXED_POINT + theta = MIN32(theta, EXTEND32(32767)); +/*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1)))); + tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/ +/*Q8*/tmp = PSHR(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8); + st->gain2[i]=DIV32_16(SHL(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp)); +#else + st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta)); +#endif } - + /* Convert the EM gains and speech prob to linear frequency */ + filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2); + filterbank_compute_psd16(st->bank,st->gain+N, st->gain); + + /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */ + if (1) { - int freq_start = (int)(300.0f*2.f*N/st->sampling_rate); - int freq_end = (int)(2000.0f*2.f*N/st->sampling_rate); - for (i=freq_start;ibank,st->gain_floor+N, st->gain_floor); + + /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */ + for (i=0;izeta[i]; - } - Zframe /= (freq_end-freq_start); - } - st->Zlast = Zframe; - - Pframe = qcurve(Zframe); + spx_word32_t MM; + spx_word32_t theta; + spx_word16_t prior_ratio; + spx_word16_t tmp; + spx_word16_t p; + spx_word16_t g; + + /* Wiener filter gain */ + prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT))); + theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT)); + + /* Optimal estimator for loudness domain */ + MM = hypergeom_gain(theta); + /* EM gain with bound */ + g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM))); + /* Interpolated speech probability of presence */ + p = st->gain2[i]; + + /* Constrain the gain to be close to the Bark scale gain */ + if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i]) + g = MULT16_16(3,st->gain[i]); + st->gain[i] = g; + + /* Save old power spectrum */ + st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]); + + /* Apply gain floor */ + if (st->gain[i] < st->gain_floor[i]) + st->gain[i] = st->gain_floor[i]; - /*fprintf (stderr, "%f\n", Pframe);*/ - /* Compute gain according to the Ephraim-Malah algorithm */ - for (i=1;iprior[i]/(1.0001f+st->prior[i]); - theta = (1.f+st->post[i])*prior_ratio; - - if (i==1 || i==N-1) - zeta1 = st->zeta[i]; - else - zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1]; - P1 = qcurve (zeta1); - - /* FIXME: add global prob (P2) */ - q = 1-Pframe*P1; - q = 1-P1; - if (q>.95f) - q=.95f; - p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta)); - /*p=1;*/ - - /* Optimal estimator for loudness domain */ - MM = hypergeom_gain(theta); + /* Exponential decay model for reverberation (unused) */ + /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/ + + /* Take into account speech probability of presence (loudness domain MMSE estimator) */ + /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */ + tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15))); + st->gain2[i]=SQR16_Q15(tmp); - st->gain[i] = prior_ratio * MM; - /*Put some (very arbitraty) limit on the gain*/ - if (st->gain[i]>2.f) - { - st->gain[i]=2.f; + /* Use this if you want a log-domain MMSE estimator instead */ + /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/ } - - st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i]; - if (st->denoise_enabled) + } else { + for (i=N;igain2[i] = p*p*st->gain[i]; - /*st->gain2[i]=(p*sqrt(st->gain[i])+.05*(1-p))*(p*sqrt(st->gain[i])+.05*(1-p));*/ - /*st->gain2[i] = pow(st->gain[i], p) * pow(.2f,1.f-p);*/ - } else { - st->gain2[i]=1.f; + spx_word16_t tmp; + spx_word16_t p = st->gain2[i]; + st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]); + tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15))); + st->gain2[i]=SQR16_Q15(tmp); } + filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2); } - st->gain2[0]=st->gain[0]=0.f; - st->gain2[N-1]=st->gain[N-1]=0.f; - /* - for (i=30;idenoise_enabled) { - st->gain[i] = st->gain2[i]*st->gain2[i] + (1-st->gain2[i])*.333*(.6*st->gain2[i-1]+st->gain2[i]+.6*st->gain2[i+1]+.4*st->gain2[i-2]+.4*st->gain2[i+2]); + for (i=0;igain2[i]=Q15_ONE; } - for (i=30;igain2[i] = st->gain[i]; - */ + + /*FIXME: This *will* not work for fixed-point */ +#ifndef FIXED_POINT if (st->agc_enabled) - speex_compute_agc(st, mean_prior); - -#if 0 - if (!is_speech) - { - for (i=0;igain2[i] = 0; - } -#if 0 - else { - for (i=0;igain2[i] = 1; - } -#endif + speex_compute_agc(st); #endif - + /* Apply computed gain */ for (i=1;iframe[2*i-1] *= st->gain2[i]; - st->frame[2*i] *= st->gain2[i]; + st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]); + st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]); } - - /* Get rid of the DC and very low frequencies */ - st->frame[0]=0; - st->frame[1]=0; - st->frame[2]=0; - /* Nyquist frequency is mostly useless too */ - st->frame[2*N-1]=0; - + st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]); + st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]); + /* Inverse FFT with 1/N scaling */ - spx_drft_backward(st->fft_lookup, st->frame); - + spx_ifft(st->fft_lookup, st->ft, st->frame); + /* Scale back to original (lower) amplitude */ for (i=0;i<2*N;i++) - st->frame[i] *= scale; + st->frame[i] = PSHR16(st->frame[i], st->frame_shift); + /*FIXME: This *will* not work for fixed-point */ +#ifndef FIXED_POINT + if (st->agc_enabled) { float max_sample=0; for (i=0;i<2*N;i++) @@ -880,9 +964,11 @@ int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo st->frame[i] *= damp; } } - +#endif + + /* Synthesis window (for WOLA) */ for (i=0;i<2*N;i++) - st->frame[i] *= st->window[i]; + st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]); /* Perform overlap and add */ for (i=0;ioutbuf[i] = st->frame[st->frame_size+i]; - /* Save old power spectrum */ - for (i=1;iold_ps[i] = ps[i]; - - return is_speech; + /* FIXME: This VAD is a kludge */ + if (st->vad_enabled) + { + if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue)) + { + st->was_speech=1; + return 1; + } else + { + st->was_speech=0; + return 0; + } + } else { + return 1; + } } -void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo) +void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x) { int i; int N = st->ps_size; int N3 = 2*N - st->frame_size; + int M; + spx_word32_t *ps=st->ps; - float *ps=st->ps; - + M = st->nbands; + st->min_count++; + preprocess_analysis(st, x); update_noise_prob(st); - - st->nb_preprocess++; for (i=1;iupdate_prob[i]<.5f || st->ps[i] < st->noise[i]) + if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT)) { - if (echo) - st->noise[i] = .95f*st->noise[i] + .1f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*4.0*echo[i]); - else - st->noise[i] = .95f*st->noise[i] + .1f*st->ps[i]; + st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT)); } } for (i=0;ioutbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i]; + st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]); /* Save old power spectrum */ - for (i=1;iold_ps[i] = ps[i]; - for (i=1;ireverb_estimate[i] *= st->reverb_decay; + for (i=0;ireverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]); } @@ -946,17 +1040,17 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr) switch(request) { case SPEEX_PREPROCESS_SET_DENOISE: - st->denoise_enabled = (*(int*)ptr); + st->denoise_enabled = (*(spx_int32_t*)ptr); break; case SPEEX_PREPROCESS_GET_DENOISE: - (*(int*)ptr) = st->denoise_enabled; + (*(spx_int32_t*)ptr) = st->denoise_enabled; break; - +#ifndef FIXED_POINT case SPEEX_PREPROCESS_SET_AGC: - st->agc_enabled = (*(int*)ptr); + st->agc_enabled = (*(spx_int32_t*)ptr); break; case SPEEX_PREPROCESS_GET_AGC: - (*(int*)ptr) = st->agc_enabled; + (*(spx_int32_t*)ptr) = st->agc_enabled; break; case SPEEX_PREPROCESS_SET_AGC_LEVEL: @@ -969,21 +1063,22 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr) case SPEEX_PREPROCESS_GET_AGC_LEVEL: (*(float*)ptr) = st->agc_level; break; - +#endif case SPEEX_PREPROCESS_SET_VAD: - st->vad_enabled = (*(int*)ptr); + speex_warning("The VAD has been replaced by a hack pending a complete rewrite"); + st->vad_enabled = (*(spx_int32_t*)ptr); break; case SPEEX_PREPROCESS_GET_VAD: - (*(int*)ptr) = st->vad_enabled; + (*(spx_int32_t*)ptr) = st->vad_enabled; break; case SPEEX_PREPROCESS_SET_DEREVERB: - st->dereverb_enabled = (*(int*)ptr); + st->dereverb_enabled = (*(spx_int32_t*)ptr); for (i=0;ips_size;i++) st->reverb_estimate[i]=0; break; case SPEEX_PREPROCESS_GET_DEREVERB: - (*(int*)ptr) = st->dereverb_enabled; + (*(spx_int32_t*)ptr) = st->dereverb_enabled; break; case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL: @@ -1001,24 +1096,47 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr) break; case SPEEX_PREPROCESS_SET_PROB_START: - st->speech_prob_start = (*(int*)ptr) / 100.0; - if ( st->speech_prob_start > 1 || st->speech_prob_start < 0 ) - st->speech_prob_start = SPEEX_PROB_START_DEFAULT; + *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr)); + st->speech_prob_start = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100); break; case SPEEX_PREPROCESS_GET_PROB_START: - (*(int*)ptr) = st->speech_prob_start * 100; + (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100); break; case SPEEX_PREPROCESS_SET_PROB_CONTINUE: - st->speech_prob_continue = (*(int*)ptr) / 100.0; - if ( st->speech_prob_continue > 1 || st->speech_prob_continue < 0 ) - st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT; + *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr)); + st->speech_prob_continue = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100); break; case SPEEX_PREPROCESS_GET_PROB_CONTINUE: - (*(int*)ptr) = st->speech_prob_continue * 100; + (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100); + break; + + case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS: + st->noise_suppress = -ABS(*(spx_int32_t*)ptr); + break; + case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS: + (*(spx_int32_t*)ptr) = st->noise_suppress; + break; + case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS: + st->echo_suppress = -ABS(*(spx_int32_t*)ptr); + break; + case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS: + (*(spx_int32_t*)ptr) = st->echo_suppress; + break; + case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE: + st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr); + break; + case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE: + (*(spx_int32_t*)ptr) = st->echo_suppress_active; + break; + case SPEEX_PREPROCESS_SET_ECHO_STATE: + st->echo_state = (SpeexEchoState*)ptr; + break; + case SPEEX_PREPROCESS_GET_ECHO_STATE: + ptr = (void*)st->echo_state; break; - default: + default: speex_warning_int("Unknown speex_preprocess_ctl request: ", request); return -1; } -- cgit v1.2.3