summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBenny Prijono <bennylp@teluu.com>2008-08-10 16:15:14 +0000
committerBenny Prijono <bennylp@teluu.com>2008-08-10 16:15:14 +0000
commit4d391149bdafa66e0a16dc7c96f628fff745d2ac (patch)
treecfa7bfff8edfb9c7ab2e9558e99611e737e9bece
parentc0970767b422b18bb22e71efac3d6353bba37006 (diff)
Ticket #590: new echo suppressor which should work much better than the old one
git-svn-id: http://svn.pjsip.org/repos/pjproject/trunk@2199 74dad513-b988-da41-8d7b-12977e46ad98
-rw-r--r--pjmedia/src/pjmedia/echo_suppress.c649
-rw-r--r--pjsip-apps/src/samples/aectest.c54
2 files changed, 657 insertions, 46 deletions
diff --git a/pjmedia/src/pjmedia/echo_suppress.c b/pjmedia/src/pjmedia/echo_suppress.c
index a86a058d..b0f32e0f 100644
--- a/pjmedia/src/pjmedia/echo_suppress.c
+++ b/pjmedia/src/pjmedia/echo_suppress.c
@@ -17,8 +17,10 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <pjmedia/types.h>
+#include <pjmedia/alaw_ulaw.h>
#include <pjmedia/errno.h>
#include <pjmedia/silencedet.h>
+#include <pj/array.h>
#include <pj/assert.h>
#include <pj/lock.h>
#include <pj/log.h>
@@ -29,15 +31,225 @@
#define THIS_FILE "echo_suppress.c"
+/* Maximum float constant */
+#define MAX_FLOAT (float)1.701411e38
+
+/* The effective learn duration (in seconds) before we declare that learning
+ * is complete. The actual learning duration itself may be longer depending
+ * on the conversation pattern (e.g. we can't detect echo if speaker is only
+ * playing silence).
+ */
+#define MAX_CALC_DURATION_SEC 3
+
+/* The internal audio segment length, in milliseconds. 10ms shold be good
+ * and no need to change it.
+ */
+#define SEGMENT_PTIME 10
+
+/* The length of the template signal in milliseconds. The longer the template,
+ * the better correlation will be found, at the expense of more processing
+ * and longer learning time.
+ */
+#define TEMPLATE_PTIME 200
+
+/* How long to look back in the past to see if either mic or speaker is
+ * active.
+ */
+#define SIGNAL_LOOKUP_MSEC 200
+
+/* The minimum level value to be considered as talking, in uLaw complement
+ * (0-255).
+ */
+#define MIN_SIGNAL_ULAW 35
+
+/* The period (in seconds) on which the ES will analize it's effectiveness,
+ * and it may trigger soft-reset to force recalculation.
+ */
+#define CHECK_PERIOD 30
+
+/* Maximum signal level of average echo residue (in uLaw complement). When
+ * the residue value exceeds this value, we force the ES to re-learn.
+ */
+#define MAX_RESIDUE 2.5
+
+
+#if 0
+# define TRACE_(expr) PJ_LOG(5,expr)
+#else
+# define TRACE_(expr)
+#endif
+
+PJ_INLINE(float) fabs(float val)
+{
+ if (val < 0)
+ return -val;
+ else
+ return val;
+}
+
+
+#if defined(PJ_HAS_FLOATING_POINT) && PJ_HAS_FLOATING_POINT!=0
+ typedef float pj_ufloat_t;
+# define pj_ufloat_from_float(f) (f)
+# define pj_ufloat_mul_u(val1, f) ((val1) * (f))
+# define pj_ufloat_mul_i(val1, f) ((val1) * (f))
+#else
+ typedef pj_uint32_t pj_ufloat_t;
+
+ pj_ufloat_t pj_ufloat_from_float(float f)
+ {
+ return (pj_ufloat_t)(f * 65536);
+ }
+
+ unsigned pj_ufloat_mul_u(unsigned val1, pj_ufloat_t val2)
+ {
+ return (val1 * val2) >> 16;
+ }
+
+ int pj_ufloat_mul_i(int val1, pj_ufloat_t val2)
+ {
+ return (val1 * (pj_int32_t)val2) >> 16;
+ }
+#endif
+
+
+/* Conversation state */
+typedef enum talk_state
+{
+ ST_NULL,
+ ST_LOCAL_TALK,
+ ST_REM_SILENT,
+ ST_DOUBLETALK,
+ ST_REM_TALK
+} talk_state;
+
+const char *state_names[] =
+{
+ "Null",
+ "local talking",
+ "remote silent",
+ "doubletalk",
+ "remote talking"
+};
+
+
+/* Description:
+
+ The echo suppressor tries to find the position of echoed signal by looking
+ at the correlation between signal played to the speaker (played signal)
+ and the signal captured from the microphone (recorded signal).
+
+ To do this, it first divides the frames (from mic and speaker) into
+ segments, calculate the audio level of the segment, and save the level
+ information in the playback and record history (play_hist and rec_hist
+ respectively).
+
+ In the history, the newest element (depicted as "t0" in the diagram belo)
+ is put in the last position of the array.
+
+ The record history size is as large as the template size (tmpl_cnt), since
+ we will use the record history as the template to find the best matching
+ position in the playback history.
+
+ Here is the record history buffer:
+
+ <--templ_cnt-->
+ +-------------+
+ | rec_hist |
+ +-------------+
+ t-templ_cnt......t0
+
+ As you can see, the newest frame ("t0") is put as the last element in the
+ array.
+
+ The playback history size is larger than record history, since we need to
+ find the matching pattern in the past. The playback history size is
+ "templ_cnt + tail_cnt", where "tail_cnt" is the number of segments equal
+ to the maximum tail length. The maximum tail length is set when the ES
+ is created.
+
+ Here is the playback history buffer:
+
+ <-----tail_cnt-----> <--templ_cnt-->
+ +-------------------+--------------+
+ | play_hist |
+ +-------------------+--------------+
+ t-play_hist_cnt...t-templ_cnt.......t0
+
+
+
+ Learning:
+
+ During the processing, the ES calculates the following values:
+ - the correlation value, that is how similar the playback signal compared
+ to the mic signal. The lower the correlation value the better (i.e. more
+ similar) the signal is. The correlation value is done over the template
+ duration.
+ - the gain scaling factor, that is the ratio between mic signal and
+ speaker signal. The ES calculates both the minimum and average ratios.
+
+ The ES calculates both the values above for every tail position in the
+ playback history. The values are saved in arrays below:
+
+ <-----tail_cnt----->
+ +-------------------+
+ | corr_sum |
+ +-------------------+
+ | min_factor |
+ +-------------------+
+ | avg_factor |
+ +-------------------+
+
+ At the end of processing, the ES iterates through the correlation array and
+ picks the tail index with the lowest corr_sum value. This is the position
+ where echo is most likely to be found.
+
+
+ Processing:
+
+ Once learning is done, the ES will change the level of the mic signal
+ depending on the state of the conversation and according to the ratio that
+ has been found in the learning phase above.
+
+ */
/*
- * Simple echo suppresor
+ * The simple echo suppresor state
*/
typedef struct echo_supp
{
- pjmedia_silence_det *sd;
- unsigned samples_per_frame;
- unsigned tail_ms;
+ unsigned clock_rate; /* Clock rate. */
+ pj_uint16_t samples_per_frame; /* Frame length in samples */
+ pj_uint16_t samples_per_segment;/* Segment length in samples */
+ pj_uint16_t tail_ms; /* Tail length in milliseconds */
+ pj_uint16_t tail_samples; /* Tail length in samples. */
+
+ pj_bool_t learning; /* Are we still learning yet? */
+ talk_state talk_state; /* Current talking state */
+ int tail_index; /* Echo location, -1 if not found */
+
+ unsigned max_calc; /* # of calc before learning complete.
+ (see MAX_CALC_DURATION_SEC) */
+ unsigned calc_cnt; /* Number of calculations so far */
+
+ unsigned update_cnt; /* # of updates */
+ unsigned templ_cnt; /* Template length, in # of segments */
+ unsigned tail_cnt; /* Tail length, in # of segments */
+ unsigned play_hist_cnt; /* # of segments in play_hist */
+ pj_uint16_t *play_hist; /* Array of playback levels */
+ pj_uint16_t *rec_hist; /* Array of rec levels */
+
+ float *corr_sum; /* Array of corr for each tail pos. */
+ float *tmp_corr; /* Temporary corr array calculation */
+ float best_corr; /* Best correlation so far. */
+
+ float *min_factor; /* Array of minimum scaling factor */
+ float *avg_factor; /* Array of average scaling factor */
+ float *tmp_factor; /* Array to store provisional result */
+
+ unsigned running_cnt; /* Running duration in # of frames */
+ float residue; /* Accummulated echo residue. */
+ float last_factor; /* Last factor applied to mic signal */
} echo_supp;
@@ -54,24 +266,52 @@ PJ_DEF(pj_status_t) echo_supp_create( pj_pool_t *pool,
void **p_state )
{
echo_supp *ec;
- pj_status_t status;
- PJ_UNUSED_ARG(clock_rate);
PJ_UNUSED_ARG(channel_count);
PJ_UNUSED_ARG(options);
+ PJ_ASSERT_RETURN(samples_per_frame >= SEGMENT_PTIME * clock_rate / 1000,
+ PJ_ENOTSUP);
+
ec = PJ_POOL_ZALLOC_T(pool, struct echo_supp);
- ec->samples_per_frame = samples_per_frame;
- ec->tail_ms = tail_ms;
+ ec->clock_rate = clock_rate;
+ ec->samples_per_frame = (pj_uint16_t)samples_per_frame;
+ ec->samples_per_segment = (pj_uint16_t)(SEGMENT_PTIME * clock_rate / 1000);
+ ec->tail_ms = (pj_uint16_t)tail_ms;
+ ec->tail_samples = (pj_uint16_t)(tail_ms * clock_rate / 1000);
+
+ ec->templ_cnt = TEMPLATE_PTIME / SEGMENT_PTIME;
+ ec->tail_cnt = (pj_uint16_t)(tail_ms / SEGMENT_PTIME);
+ ec->play_hist_cnt = (pj_uint16_t)(ec->tail_cnt+ec->templ_cnt);
+
+ ec->max_calc = (pj_uint16_t)(MAX_CALC_DURATION_SEC * clock_rate /
+ ec->samples_per_segment);
- status = pjmedia_silence_det_create(pool, clock_rate, samples_per_frame,
- &ec->sd);
- if (status != PJ_SUCCESS)
- return status;
+ ec->rec_hist = (pj_uint16_t*)
+ pj_pool_alloc(pool, ec->templ_cnt *
+ sizeof(ec->rec_hist[0]));
- pjmedia_silence_det_set_name(ec->sd, "ecsu%p");
- pjmedia_silence_det_set_adaptive(ec->sd, PJMEDIA_ECHO_SUPPRESS_THRESHOLD);
- pjmedia_silence_det_set_params(ec->sd, 100, 500, 3000);
+ /* Note: play history has twice number of elements */
+ ec->play_hist = (pj_uint16_t*)
+ pj_pool_alloc(pool, ec->play_hist_cnt *
+ sizeof(ec->play_hist[0]));
+
+ ec->corr_sum = (float*)
+ pj_pool_alloc(pool, ec->tail_cnt *
+ sizeof(ec->corr_sum[0]));
+ ec->tmp_corr = (float*)
+ pj_pool_alloc(pool, ec->tail_cnt *
+ sizeof(ec->tmp_corr[0]));
+ ec->min_factor = (float*)
+ pj_pool_alloc(pool, ec->tail_cnt *
+ sizeof(ec->min_factor[0]));
+ ec->avg_factor = (float*)
+ pj_pool_alloc(pool, ec->tail_cnt *
+ sizeof(ec->avg_factor[0]));
+ ec->tmp_factor = (float*)
+ pj_pool_alloc(pool, ec->tail_cnt *
+ sizeof(ec->tmp_factor[0]));
+ echo_supp_reset(ec);
*p_state = ec;
return PJ_SUCCESS;
@@ -89,15 +329,257 @@ PJ_DEF(pj_status_t) echo_supp_destroy(void *state)
/*
- * Reset
+ * Hard reset
*/
PJ_DEF(void) echo_supp_reset(void *state)
{
- PJ_UNUSED_ARG(state);
- return;
+ unsigned i;
+ echo_supp *ec = (echo_supp*) state;
+
+ pj_bzero(ec->rec_hist, ec->templ_cnt * sizeof(ec->rec_hist[0]));
+ pj_bzero(ec->play_hist, ec->play_hist_cnt * sizeof(ec->play_hist[0]));
+
+ for (i=0; i<ec->tail_cnt; ++i) {
+ ec->corr_sum[i] = ec->avg_factor[i] = 0;
+ ec->min_factor[i] = MAX_FLOAT;
+ }
+
+ ec->update_cnt = 0;
+ ec->calc_cnt = 0;
+ ec->learning = PJ_TRUE;
+ ec->tail_index = -1;
+ ec->best_corr = MAX_FLOAT;
+ ec->talk_state = ST_NULL;
+ ec->last_factor = 1.0;
+ ec->residue = 0;
+ ec->running_cnt = 0;
+}
+
+/*
+ * Soft reset to force the EC to re-learn without having to discard all
+ * rec and playback history.
+ */
+PJ_DEF(void) echo_supp_soft_reset(void *state)
+{
+ unsigned i;
+
+ echo_supp *ec = (echo_supp*) state;
+
+ for (i=0; i<ec->tail_cnt; ++i) {
+ ec->corr_sum[i] = 0;
+ }
+
+ ec->update_cnt = 0;
+ ec->calc_cnt = 0;
+ ec->learning = PJ_TRUE;
+ ec->best_corr = MAX_FLOAT;
+ ec->residue = 0;
+ ec->running_cnt = 0;
+
+ PJ_LOG(4,(THIS_FILE, "Echo suppressor soft reset. Re-learning.."));
+}
+
+
+/* Set state */
+static void echo_supp_set_state(echo_supp *ec, enum talk_state state)
+{
+ if (state != ec->talk_state) {
+ TRACE_((THIS_FILE, "[%03d.%03d] %s --> %s",
+ (ec->update_cnt * SEGMENT_PTIME / 1000),
+ ((ec->update_cnt * SEGMENT_PTIME) % 1000),
+ state_names[ec->talk_state],
+ state_names[state]));
+ ec->talk_state = state;
+ }
}
/*
+ * Update EC state
+ */
+static void echo_supp_update(echo_supp *ec, pj_int16_t *rec_frm,
+ const pj_int16_t *play_frm)
+{
+ int prev_index;
+ unsigned i, frm_level, sum_rec_level;
+ float rec_corr;
+
+ ++ec->update_cnt;
+ if (ec->update_cnt > 0x7FFFFFFF)
+ ec->update_cnt = 0x7FFFFFFF; /* Detect overflow */
+
+ /* Calculate current play frame level */
+ frm_level = pjmedia_calc_avg_signal(play_frm, ec->samples_per_segment);
+ ++frm_level; /* to avoid division by zero */
+
+ /* Push current frame level to the back of the play history */
+ pj_array_erase(ec->play_hist, sizeof(pj_uint16_t), ec->play_hist_cnt, 0);
+ ec->play_hist[ec->play_hist_cnt-1] = (pj_uint16_t) frm_level;
+
+ /* Calculate level of current mic frame */
+ frm_level = pjmedia_calc_avg_signal(rec_frm, ec->samples_per_segment);
+ ++frm_level; /* to avoid division by zero */
+
+ /* Push to the back of the rec history */
+ pj_array_erase(ec->rec_hist, sizeof(pj_uint16_t), ec->templ_cnt, 0);
+ ec->rec_hist[ec->templ_cnt-1] = (pj_uint16_t) frm_level;
+
+
+ /* Can't do the calc until the play history is full. */
+ if (ec->update_cnt < ec->play_hist_cnt)
+ return;
+
+ /* Skip if learning is done */
+ if (!ec->learning)
+ return;
+
+
+ /* Calculate rec signal pattern */
+ rec_corr = 0;
+ sum_rec_level = 0;
+ for (i=0; i < ec->templ_cnt-1; ++i) {
+ float corr;
+ corr = (float)ec->rec_hist[i+1] / ec->rec_hist[i];
+ rec_corr += corr;
+ sum_rec_level += ec->rec_hist[i];
+ }
+ sum_rec_level += ec->rec_hist[i];
+
+ /* Iterate through the play history and calculate the signal correlation
+ * for every tail position in the play_hist. Save the result in temporary
+ * array since we may bail out early if the conversation state is not good
+ * to detect echo.
+ */
+ for (i=0; i < ec->tail_cnt; ++i) {
+ unsigned j, end, sum_play_level, ulaw;
+ float play_corr = 0, corr_diff;
+
+ sum_play_level = 0;
+ for (j=i, end=i+ec->templ_cnt-1; j<end; ++j) {
+ float corr;
+ corr = (float)ec->play_hist[j+1] / ec->play_hist[j];
+ play_corr += corr;
+ sum_play_level += ec->play_hist[j];
+ }
+ sum_play_level += ec->play_hist[j];
+
+ /* Bail out if remote isn't talking */
+ ulaw = pjmedia_linear2ulaw(sum_play_level/ec->templ_cnt) ^ 0xFF;
+ if (ulaw < MIN_SIGNAL_ULAW) {
+ echo_supp_set_state(ec, ST_REM_SILENT);
+ return;
+ }
+
+ /* Bail out if local user is talking */
+ if (sum_rec_level >= sum_play_level) {
+ echo_supp_set_state(ec, ST_LOCAL_TALK);
+ return;
+ }
+
+ /* Also bail out if we suspect there's a doubletalk */
+ ulaw = pjmedia_linear2ulaw(sum_rec_level/ec->templ_cnt) ^ 0xFF;
+ if (ulaw > MIN_SIGNAL_ULAW) {
+ echo_supp_set_state(ec, ST_DOUBLETALK);
+ return;
+ }
+
+ /* Calculate correlation and save to temporary array */
+ corr_diff = fabs(play_corr - rec_corr);
+ ec->tmp_corr[i] = corr_diff;
+
+ /* Also calculate the gain factor between mic and speaker level */
+ ec->tmp_factor[i] = (float)sum_rec_level / sum_play_level;
+ pj_assert(ec->tmp_factor[i] < 1);
+ }
+
+ /* We seem to have good signal, we can update the EC state */
+ echo_supp_set_state(ec, ST_REM_TALK);
+
+ /* Accummulate the correlation value to the history and at the same
+ * time find the tail index of the best correlation.
+ */
+ prev_index = ec->tail_index;
+ for (i=1; i<ec->tail_cnt-1; ++i) {
+ float *p = &ec->corr_sum[i], sum;
+
+ /* Accummulate correlation value for this tail position */
+ ec->corr_sum[i] += ec->tmp_corr[i];
+
+ /* Update the min and avg gain factor for this tail position */
+ if (ec->tmp_factor[i] < ec->min_factor[i])
+ ec->min_factor[i] = ec->tmp_factor[i];
+ ec->avg_factor[i] = ((ec->avg_factor[i] * ec->tail_cnt) +
+ ec->tmp_factor[i]) /
+ (ec->tail_cnt + 1);
+
+ /* To get the best correlation, also include the correlation
+ * value of the neighbouring tail locations.
+ */
+ sum = *(p-1) + (*p)*2 + *(p+1);
+ //sum = *p;
+
+ /* See if we have better correlation value */
+ if (sum < ec->best_corr) {
+ ec->tail_index = i;
+ ec->best_corr = sum;
+ }
+ }
+
+ if (ec->tail_index != prev_index) {
+ unsigned duration;
+ int imin, iavg;
+
+ duration = ec->update_cnt * SEGMENT_PTIME;
+ imin = (int)(ec->min_factor[ec->tail_index] * 1000);
+ iavg = (int)(ec->avg_factor[ec->tail_index] * 1000);
+
+ PJ_LOG(4,(THIS_FILE,
+ "Echo suppressor updated at t=%03d.%03ds, echo tail=%d msec"
+ ", factor min/avg=%d.%03d/%d.%03d",
+ (duration/1000), (duration%1000),
+ (ec->tail_cnt-ec->tail_index) * SEGMENT_PTIME,
+ imin/1000, imin%1000,
+ iavg/1000, iavg%1000));
+
+ }
+
+ ++ec->calc_cnt;
+
+ if (ec->calc_cnt > ec->max_calc) {
+ unsigned duration;
+ int imin, iavg;
+
+
+ ec->learning = PJ_FALSE;
+ ec->running_cnt = 0;
+
+ duration = ec->update_cnt * SEGMENT_PTIME;
+ imin = (int)(ec->min_factor[ec->tail_index] * 1000);
+ iavg = (int)(ec->avg_factor[ec->tail_index] * 1000);
+
+ PJ_LOG(4,(THIS_FILE,
+ "Echo suppressor learning done at t=%03d.%03ds, tail=%d ms"
+ ", factor min/avg=%d.%03d/%d.%03d",
+ (duration/1000), (duration%1000),
+ (ec->tail_cnt-ec->tail_index) * SEGMENT_PTIME,
+ imin/1000, imin%1000,
+ iavg/1000, iavg%1000));
+ }
+
+}
+
+
+/* Amplify frame */
+static void amplify_frame(pj_int16_t *frm, unsigned length,
+ pj_ufloat_t factor)
+{
+ unsigned i;
+
+ for (i=0; i<length; ++i) {
+ frm[i] = (pj_int16_t)pj_ufloat_mul_i(frm[i], factor);
+ }
+}
+
+/*
* Perform echo cancellation.
*/
PJ_DEF(pj_status_t) echo_supp_cancel_echo( void *state,
@@ -106,25 +588,132 @@ PJ_DEF(pj_status_t) echo_supp_cancel_echo( void *state,
unsigned options,
void *reserved )
{
+ unsigned i, N;
echo_supp *ec = (echo_supp*) state;
- pj_bool_t silence;
PJ_UNUSED_ARG(options);
PJ_UNUSED_ARG(reserved);
- silence = pjmedia_silence_det_detect(ec->sd, play_frm,
- ec->samples_per_frame, NULL);
+ /* Calculate number of segments. This should be okay even if
+ * samples_per_frame is not a multiply of samples_per_segment, since
+ * we only calculate level.
+ */
+ N = ec->samples_per_frame / ec->samples_per_segment;
+ pj_assert(N>0);
+ for (i=0; i<N; ++i) {
+ unsigned pos = i * ec->samples_per_segment;
+ echo_supp_update(ec, rec_frm+pos, play_frm+pos);
+ }
+
+ if (ec->tail_index < 0) {
+ /* Not ready */
+ } else {
+ unsigned lookup_cnt, rec_level=0, play_level=0;
+ unsigned tail_cnt;
+ float factor;
+
+ /* How many previous segments to lookup */
+ lookup_cnt = SIGNAL_LOOKUP_MSEC / SEGMENT_PTIME;
+ if (lookup_cnt > ec->templ_cnt)
+ lookup_cnt = ec->templ_cnt;
- if (!silence) {
-#if defined(PJMEDIA_ECHO_SUPPRESS_FACTOR) && PJMEDIA_ECHO_SUPPRESS_FACTOR!=0
- unsigned i;
- for (i=0; i<ec->samples_per_frame; ++i) {
- rec_frm[i] = (pj_int16_t)(rec_frm[i] >>
- PJMEDIA_ECHO_SUPPRESS_FACTOR);
+ /* Lookup in recording history to get maximum mic level, to see
+ * if local user is currently talking
+ */
+ for (i=ec->templ_cnt - lookup_cnt; i < ec->templ_cnt; ++i) {
+ if (ec->rec_hist[i] > rec_level)
+ rec_level = ec->rec_hist[i];
+ }
+ rec_level = pjmedia_linear2ulaw(rec_level) ^ 0xFF;
+
+ /* Calculate the detected tail length, in # of segments */
+ tail_cnt = (ec->tail_cnt - ec->tail_index);
+
+ /* Lookup in playback history to get max speaker level, to see
+ * if remote user is currently talking
+ */
+ for (i=ec->play_hist_cnt -lookup_cnt -tail_cnt;
+ i<ec->play_hist_cnt-tail_cnt; ++i)
+ {
+ if (ec->play_hist[i] > play_level)
+ play_level = ec->play_hist[i];
+ }
+ play_level = pjmedia_linear2ulaw(play_level) ^ 0xFF;
+
+ if (rec_level >= MIN_SIGNAL_ULAW) {
+ if (play_level < MIN_SIGNAL_ULAW) {
+ /* Mic is talking, speaker is idle. Let mic signal pass as is.
+ */
+ factor = 1.0;
+ echo_supp_set_state(ec, ST_LOCAL_TALK);
+ } else {
+ /* Seems that both are talking. Scale the mic signal
+ * down a little bit to reduce echo, while allowing both
+ * parties to talk at the same time.
+ */
+ factor = (float)(ec->avg_factor[ec->tail_index] * 2);
+ echo_supp_set_state(ec, ST_DOUBLETALK);
+ }
+ } else {
+ if (play_level < MIN_SIGNAL_ULAW) {
+ /* Both mic and speaker seems to be idle. Also scale the
+ * mic signal down with average factor to reduce low power
+ * echo.
+ */
+ factor = ec->avg_factor[ec->tail_index] * 3 / 2;
+ echo_supp_set_state(ec, ST_REM_SILENT);
+ } else {
+ /* Mic is idle, but there's something playing in speaker.
+ * Scale the mic down to minimum
+ */
+ factor = ec->min_factor[ec->tail_index] / 2;
+ echo_supp_set_state(ec, ST_REM_TALK);
+ }
+ }
+
+ /* Smoothen the transition */
+ if (factor > ec->last_factor)
+ factor = (factor + ec->last_factor) / 2;
+ else
+ factor = (factor + ec->last_factor*9) / 10;
+
+ /* Amplify frame */
+ amplify_frame(rec_frm, ec->samples_per_frame,
+ pj_ufloat_from_float(factor));
+ ec->last_factor = factor;
+
+ if (ec->talk_state == ST_REM_TALK) {
+ unsigned level, recalc_cnt;
+
+ /* Get the adjusted frame signal level */
+ level = pjmedia_calc_avg_signal(rec_frm, ec->samples_per_frame);
+ level = pjmedia_linear2ulaw(level) ^ 0xFF;
+
+ /* Accumulate average echo residue to see the ES effectiveness */
+ ec->residue = ((ec->residue * ec->running_cnt) + level) /
+ (ec->running_cnt + 1);
+
+ ++ec->running_cnt;
+
+ /* Check if we need to re-learn */
+ recalc_cnt = CHECK_PERIOD * ec->clock_rate / ec->samples_per_frame;
+ if (ec->running_cnt > recalc_cnt) {
+ int iresidue;
+
+ iresidue = (int)(ec->residue*1000);
+
+ PJ_LOG(5,(THIS_FILE, "Echo suppressor residue = %d.%03d",
+ iresidue/1000, iresidue%1000));
+
+ if (ec->residue > MAX_RESIDUE && !ec->learning) {
+ echo_supp_soft_reset(ec);
+ ec->residue = 0;
+ } else {
+ ec->running_cnt = 0;
+ ec->residue = 0;
+ }
+ }
}
-#else
- pjmedia_zero_samples(rec_frm, ec->samples_per_frame);
-#endif
}
return PJ_SUCCESS;
diff --git a/pjsip-apps/src/samples/aectest.c b/pjsip-apps/src/samples/aectest.c
index 14017749..773f7c94 100644
--- a/pjsip-apps/src/samples/aectest.c
+++ b/pjsip-apps/src/samples/aectest.c
@@ -56,6 +56,7 @@ static const char *desc =
" options:\n"
" -d The delay between playback and capture in ms. Default is zero.\n"
" -l Set the echo tail length in ms. Default is 200 ms \n"
+" -r Set repeat count (default=1) \n"
" -a Algorithm: 0=default, 1=speex, 3=echo suppress \n";
/*
@@ -91,10 +92,10 @@ int main(int argc, char *argv[])
unsigned latency_ms = 0;
unsigned tail_ms = TAIL_LENGTH;
pj_timestamp t0, t1;
- int c;
+ int i, repeat=1, c;
pj_optind = 0;
- while ((c=pj_getopt(argc, argv, "d:l:a:")) !=-1) {
+ while ((c=pj_getopt(argc, argv, "d:l:a:r:")) !=-1) {
switch (c) {
case 'd':
latency_ms = atoi(pj_optarg);
@@ -121,6 +122,14 @@ int main(int argc, char *argv[])
}
}
break;
+ case 'r':
+ repeat = atoi(pj_optarg);
+ if (repeat < 1) {
+ puts("Invalid algorithm");
+ puts(desc);
+ return 1;
+ }
+ break;
}
}
@@ -210,25 +219,30 @@ int main(int argc, char *argv[])
play_frame.buf = pj_pool_alloc(pool, wav_play->info.samples_per_frame<<1);
rec_frame.buf = pj_pool_alloc(pool, wav_play->info.samples_per_frame<<1);
pj_get_timestamp(&t0);
- for (;;) {
- play_frame.size = wav_play->info.samples_per_frame << 1;
- status = pjmedia_port_get_frame(wav_play, &play_frame);
- if (status != PJ_SUCCESS)
- break;
+ for (i=0; i < repeat; ++i) {
+ for (;;) {
+ play_frame.size = wav_play->info.samples_per_frame << 1;
+ status = pjmedia_port_get_frame(wav_play, &play_frame);
+ if (status != PJ_SUCCESS)
+ break;
- status = pjmedia_echo_playback(ec, (short*)play_frame.buf);
+ status = pjmedia_echo_playback(ec, (short*)play_frame.buf);
- rec_frame.size = wav_play->info.samples_per_frame << 1;
- status = pjmedia_port_get_frame(wav_rec, &rec_frame);
- if (status != PJ_SUCCESS)
- break;
+ rec_frame.size = wav_play->info.samples_per_frame << 1;
+ status = pjmedia_port_get_frame(wav_rec, &rec_frame);
+ if (status != PJ_SUCCESS)
+ break;
- status = pjmedia_echo_capture(ec, (short*)rec_frame.buf, 0);
+ status = pjmedia_echo_capture(ec, (short*)rec_frame.buf, 0);
- //status = pjmedia_echo_cancel(ec, (short*)rec_frame.buf,
- // (short*)play_frame.buf, 0, NULL);
+ //status = pjmedia_echo_cancel(ec, (short*)rec_frame.buf,
+ // (short*)play_frame.buf, 0, NULL);
- pjmedia_port_put_frame(wav_out, &rec_frame);
+ pjmedia_port_put_frame(wav_out, &rec_frame);
+ }
+
+ pjmedia_wav_player_port_set_pos(wav_play, 0);
+ pjmedia_wav_player_port_set_pos(wav_rec, 0);
}
pj_get_timestamp(&t1);
@@ -257,6 +271,14 @@ int main(int argc, char *argv[])
/* Shutdown PJLIB */
pj_shutdown();
+#if 0
+ {
+ char s[10];
+ puts("ENTER to quit");
+ fgets(s, sizeof(s), stdin);
+ }
+#endif
+
/* Done. */
return 0;
}