From 5c4dfab39616ef259653a2a45b4034db13a81854 Mon Sep 17 00:00:00 2001 From: Nanang Izzuddin Date: Tue, 19 Aug 2008 11:04:32 +0000 Subject: Ticket #490: Updated VAD with new algorithm. git-svn-id: http://svn.pjsip.org/repos/pjproject/trunk@2222 74dad513-b988-da41-8d7b-12977e46ad98 --- pjmedia/include/pjmedia/silencedet.h | 29 ++-- pjmedia/src/pjmedia/silencedet.c | 282 +++++++++++++++++++---------------- 2 files changed, 165 insertions(+), 146 deletions(-) diff --git a/pjmedia/include/pjmedia/silencedet.h b/pjmedia/include/pjmedia/silencedet.h index af6f0e28..5f6a2491 100644 --- a/pjmedia/include/pjmedia/silencedet.h +++ b/pjmedia/include/pjmedia/silencedet.h @@ -1,6 +1,6 @@ /* $Id$ */ /* - * Copyright (C) 2003-2008 Benny Prijono + * Copyright (C) 2003-2007 Benny Prijono * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -108,26 +108,27 @@ PJ_DECL(pj_status_t) pjmedia_silence_det_set_adaptive(pjmedia_silence_det *sd, * Set other silence detector parameters. * * @param sd The silence detector - * @param min_silence Minimum duration of silence (in msec) before + * @param before_silence Minimum duration of silence (in msec) before * silence is reported. If -1 is specified, then * the default value will be used. The default is * 400 msec. - * @param min_signal Minimum duration of signal (in msec) before - * signal is reported. If -1 is specified, then - * the default value will be used. The default is - * equal to one frame. - * @param recalc_time The interval to recalculate signal and silence - * proportion and to readjust the silence threshold - * when adaptive silence detection is set. If -1 - * is specified, then the default value will be used. - * The default value is 5000 (msec). + * @param recalc_time1 The interval (in msec) to recalculate threshold + * in non-silence condition when adaptive silence + * detection is set. If -1 is specified, then the + * default value will be used. The default is 4000 + * (msec). + * @param recalc_time2 The interval (in msec) to recalculate threshold + * in silence condition when adaptive silence detection + * is set. If -1 is specified, then the default value + * will be used. The default value is 2000 (msec). * * @return PJ_SUCCESS on success. */ PJ_DECL(pj_status_t) pjmedia_silence_det_set_params( pjmedia_silence_det *sd, - int min_silence, - int min_signal, - int recalc_time); + int before_silence, + int recalc_time1, + int recalc_time2); + /** * Disable the silence detector. diff --git a/pjmedia/src/pjmedia/silencedet.c b/pjmedia/src/pjmedia/silencedet.c index f5d29e6c..59738f1e 100644 --- a/pjmedia/src/pjmedia/silencedet.c +++ b/pjmedia/src/pjmedia/silencedet.c @@ -1,6 +1,6 @@ /* $Id$ */ /* - * Copyright (C) 2003-2008 Benny Prijono + * Copyright (C) 2003-2007 Benny Prijono * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,16 +23,42 @@ #include #include - #define THIS_FILE "silencedet.c" +#if 0 +# define TRACE_(x) PJ_LOG(3,x) +#else +# define TRACE_(x) +#endif + +/** + * This enumeration specifies operation mode of silence detector + */ typedef enum pjmedia_silence_det_mode { VAD_MODE_NONE, VAD_MODE_FIXED, VAD_MODE_ADAPTIVE } pjmedia_silence_det_mode; +/** + * Default settings + */ +#define DEF_RECALC_ON_VOICED 4000 /* Time to recalculate threshold + in voiced condition, in ms */ +#define DEF_RECALC_ON_SILENCE 2000 /* Time to recalculate threshold + in silence condition, in ms. */ +#define DEF_BEFORE_SILENCE 400 /* Silence time before really changing + state into SILENCE, in ms. */ +#define DEF_THRESHOLD 1000 /* Default threshold. */ +/** + * This enumeration specifies the states of the silence detector. + */ +enum pjmedia_silence_det_state { + STATE_SILENCE, + STATE_START_SILENCE, + STATE_VOICED +}; /** * This structure holds the silence detector state. @@ -41,20 +67,23 @@ struct pjmedia_silence_det { char objname[PJ_MAX_OBJ_NAME]; /**< VAD name. */ - int mode; /**< VAD mode. */ - unsigned ptime; /**< Frame time, in msec. */ - - unsigned min_signal_cnt; /**< # of signal frames.before talk burst */ - unsigned min_silence_cnt; /**< # of silence frames before silence. */ - unsigned recalc_cnt; /**< # of frames before adaptive recalc. */ + int mode; /**< VAD mode. */ + unsigned ptime; /**< Frame time, in msec. */ - pj_bool_t in_talk; /**< In talk burst? */ - unsigned cur_cnt; /**< # of frames in current mode. */ - unsigned signal_cnt; /**< # of signal frames received. */ - unsigned silence_cnt; /**< # of silence frames received */ - unsigned cur_threshold; /**< Current silence threshold. */ - unsigned weakest_signal; /**< Weakest signal detected. */ - unsigned loudest_silence; /**< Loudest silence detected. */ + unsigned threshold; /**< Current threshold level. */ + unsigned sum_level; /**< Total sum of recent level. */ + unsigned sum_cnt; /**< Number of level summed. */ + unsigned silence_timer; /**< Silence condition timer. */ + unsigned voiced_timer; /**< Voiced condition timer. */ + + enum pjmedia_silence_det_state state;/**< Silence detector state. */ + unsigned recalc_on_voiced; /**< Setting of time to recalc + threshold in voiced condition. */ + unsigned recalc_on_silence; /**< Setting of time to recalc + threshold in silence condition.*/ + unsigned before_silence; /**< Setting of silence time before + really changing state into SILENCE, + in ms. */ }; @@ -70,20 +99,15 @@ PJ_DEF(pj_status_t) pjmedia_silence_det_create( pj_pool_t *pool, sd = PJ_POOL_ZALLOC_T(pool, pjmedia_silence_det); - pj_ansi_strncpy(sd->objname, THIS_FILE, PJ_MAX_OBJ_NAME); + pj_ansi_snprintf(sd->objname, PJ_MAX_OBJ_NAME, THIS_FILE, sd); sd->objname[PJ_MAX_OBJ_NAME-1] = '\0'; sd->ptime = samples_per_frame * 1000 / clock_rate; - sd->signal_cnt = 0; - sd->silence_cnt = 0; - sd->weakest_signal = 0xFFFFFFFFUL; - sd->loudest_silence = 0; /* Default settings */ pjmedia_silence_det_set_params(sd, -1, -1, -1); - /* Restart in fixed, silent mode */ - sd->in_talk = PJ_FALSE; + /* Restart in adaptive, silent mode */ pjmedia_silence_det_set_adaptive( sd, -1 ); *p_sd = sd; @@ -101,17 +125,16 @@ PJ_DEF(pj_status_t) pjmedia_silence_det_set_name( pjmedia_silence_det *sd, return PJ_SUCCESS; } - PJ_DEF(pj_status_t) pjmedia_silence_det_set_adaptive(pjmedia_silence_det *sd, int threshold) { PJ_ASSERT_RETURN(sd, PJ_EINVAL); if (threshold < 0) - threshold = PJMEDIA_SILENCE_DET_THRESHOLD; + threshold = DEF_THRESHOLD; sd->mode = VAD_MODE_ADAPTIVE; - sd->cur_threshold = threshold; + sd->threshold = threshold; return PJ_SUCCESS; } @@ -122,31 +145,31 @@ PJ_DEF(pj_status_t) pjmedia_silence_det_set_fixed( pjmedia_silence_det *sd, PJ_ASSERT_RETURN(sd, PJ_EINVAL); if (threshold < 0) - threshold = PJMEDIA_SILENCE_DET_THRESHOLD; + threshold = DEF_THRESHOLD; sd->mode = VAD_MODE_FIXED; - sd->cur_threshold = threshold; + sd->threshold = threshold; return PJ_SUCCESS; } PJ_DEF(pj_status_t) pjmedia_silence_det_set_params( pjmedia_silence_det *sd, - int min_silence, - int min_signal, - int recalc_time) + int before_silence, + int recalc_time1, + int recalc_time2) { PJ_ASSERT_RETURN(sd, PJ_EINVAL); - if (min_silence == -1) - min_silence = 500; - if (min_signal < 0) - min_signal = sd->ptime; - if (recalc_time < 0) - recalc_time = 2000; + if (recalc_time1 < 0) + recalc_time1 = DEF_RECALC_ON_VOICED; + if (recalc_time2 < 0) + recalc_time2 = DEF_RECALC_ON_SILENCE; + if (before_silence < 0) + before_silence = DEF_BEFORE_SILENCE; - sd->min_signal_cnt = min_signal / sd->ptime; - sd->min_silence_cnt = min_silence / sd->ptime; - sd->recalc_cnt = recalc_time / sd->ptime; + sd->recalc_on_voiced = recalc_time1; + sd->recalc_on_silence = recalc_time2; + sd->before_silence = before_silence; return PJ_SUCCESS; } @@ -186,109 +209,104 @@ PJ_DEF(pj_int32_t) pjmedia_calc_avg_signal( const pj_int16_t samples[], PJ_DEF(pj_bool_t) pjmedia_silence_det_apply( pjmedia_silence_det *sd, pj_uint32_t level) { - pj_bool_t have_signal; + int avg_recent_level; - /* Always return false if VAD is disabled */ if (sd->mode == VAD_MODE_NONE) return PJ_FALSE; - /* Convert PCM level to ulaw */ - level = pjmedia_linear2ulaw(level) ^ 0xff; - - /* Do we have signal? */ - have_signal = level > sd->cur_threshold; - - /* We we're in transition between silence and signel, increment the - * current frame counter. We will only switch mode when we have enough - * frames. - */ - if (sd->in_talk != have_signal) { - unsigned limit; - - sd->cur_cnt++; - - limit = (sd->in_talk ? sd->min_silence_cnt : - sd->min_signal_cnt); - - if (sd->cur_cnt > limit) { - - /* Swap mode */ - sd->in_talk = !sd->in_talk; - - /* Restart adaptive cur_threshold measurements */ - sd->weakest_signal = 0xFFFFFFFFUL; - sd->loudest_silence = 0; - sd->signal_cnt = 0; - sd->silence_cnt = 0; - sd->cur_cnt = 0; - } + if (sd->mode == VAD_MODE_FIXED) + return (level < sd->threshold); - } else { - /* Reset frame count */ - sd->cur_cnt = 0; - } - + /* Calculating recent level */ + sd->sum_level += level; + ++sd->sum_cnt; + avg_recent_level = (sd->sum_level / sd->sum_cnt); - /* Count the number of silent and signal frames and calculate min/max */ - if (have_signal) { - if (level < sd->weakest_signal) - sd->weakest_signal = level; - sd->signal_cnt++; - } - else { - if (level > sd->loudest_silence) - sd->loudest_silence = level; - sd->silence_cnt++; - } + if (level > sd->threshold) { + sd->silence_timer = 0; + sd->voiced_timer += sd->ptime; - /* See if we have had enough frames to look at proportions of - * silence/signal frames. - */ - if ((sd->signal_cnt + sd->silence_cnt) > sd->recalc_cnt) { - - if (sd->mode == VAD_MODE_ADAPTIVE) { - pj_bool_t updated = PJ_TRUE; - unsigned pct_signal, new_threshold = sd->cur_threshold; - - /* Get percentage of signal */ - pct_signal = sd->signal_cnt * 100 / - (sd->signal_cnt + sd->silence_cnt); - - /* Adjust according to signal/silence proportions. */ - if (pct_signal > 95) { - new_threshold += (sd->weakest_signal+1 - sd->cur_threshold)/2; - } else if (pct_signal < 5) { - new_threshold = (sd->cur_threshold+sd->loudest_silence)/2+1; - } else if (pct_signal > 80) { - new_threshold++; - } else if (pct_signal < 10) { - new_threshold--; - } else { - updated = PJ_FALSE; - } - - if (new_threshold > PJMEDIA_SILENCE_DET_MAX_THRESHOLD) - new_threshold = PJMEDIA_SILENCE_DET_MAX_THRESHOLD; - - if (updated && sd->cur_threshold != new_threshold) { - PJ_LOG(5,(sd->objname, - "Vad cur_threshold updated %d-->%d. " - "Signal lo=%d", - sd->cur_threshold, new_threshold, - sd->weakest_signal)); - sd->cur_threshold = new_threshold; - } - } + switch(sd->state) { + case STATE_VOICED: + if (sd->voiced_timer > sd->recalc_on_voiced) { + /* Voiced for long time (>recalc_on_voiced), current + * threshold seems to be too low. + */ + sd->threshold = (avg_recent_level + sd->threshold) >> 1; + TRACE_((THIS_FILE,"Re-adjust threshold (in talk burst)" + "to %d", sd->threshold)); + + sd->voiced_timer = 0; + + /* Reset sig_level */ + sd->sum_level = avg_recent_level; + sd->sum_cnt = 1; + } + break; + + case STATE_SILENCE: + TRACE_((THIS_FILE,"Starting talk burst (level=%d threshold=%d)", + level, sd->threshold)); - /* Reset. */ - sd->weakest_signal = 0xFFFFFFFFUL; - sd->loudest_silence = 0; - sd->signal_cnt = 0; - sd->silence_cnt = 0; + case STATE_START_SILENCE: + sd->state = STATE_VOICED; + + /* Reset sig_level */ + sd->sum_level = level; + sd->sum_cnt = 1; + + break; + + default: + pj_assert(0); + break; + } + } else { + sd->voiced_timer = 0; + sd->silence_timer += sd->ptime; + + switch(sd->state) { + case STATE_SILENCE: + if (sd->silence_timer >= sd->recalc_on_silence) { + sd->threshold = avg_recent_level << 1; + TRACE_((THIS_FILE,"Re-adjust threshold (in silence)" + "to %d", sd->threshold)); + + sd->silence_timer = 0; + + /* Reset sig_level */ + sd->sum_level = avg_recent_level; + sd->sum_cnt = 1; + } + break; + + case STATE_VOICED: + sd->state = STATE_START_SILENCE; + + /* Reset sig_level */ + sd->sum_level = level; + sd->sum_cnt = 1; + + case STATE_START_SILENCE: + if (sd->silence_timer >= sd->before_silence) { + sd->state = STATE_SILENCE; + sd->threshold = avg_recent_level << 1; + TRACE_((THIS_FILE,"Starting silence (level=%d " + "threshold=%d)", level, sd->threshold)); + + /* Reset sig_level */ + sd->sum_level = avg_recent_level; + sd->sum_cnt = 1; + } + break; + + default: + pj_assert(0); + break; + } } - - return !sd->in_talk; + return (sd->state == STATE_SILENCE); } -- cgit v1.2.3