1 files changed, 276 insertions, 348 deletions
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp.c b/pjmedia/src/pjmedia-codec/speex/ltp.c
index 94189c34..9a5a295e 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp.c
+++ b/pjmedia/src/pjmedia-codec/speex/ltp.c
@@ -55,7 +55,7 @@
 #endif
 
 #ifndef OVERRIDE_INNER_PROD
-static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum=0;
    len >>= 2;
@@ -75,7 +75,7 @@ static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int
 
 #ifndef OVERRIDE_PITCH_XCORR
 #if 0 /* HINT: Enable this for machines with enough registers (i.e. not x86) */
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i,j;
    for (i=0;i<nb_pitch;i+=4)
@@ -138,7 +138,7 @@ static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word
 
 }
 #else
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i;
    for (i=0;i<nb_pitch;i++)
@@ -152,128 +152,122 @@ static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word
 #endif
 
 #ifndef OVERRIDE_COMPUTE_PITCH_ERROR
-static inline spx_word32_t compute_pitch_error(spx_word32_t *C, spx_word16_t *g, spx_word16_t pitch_control)
+static inline spx_word32_t compute_pitch_error(spx_word16_t *C, spx_word16_t *g, spx_word16_t pitch_control)
 {
    spx_word32_t sum = 0;
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[0],pitch_control),C[0]));
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[1],pitch_control),C[1]));
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[2],pitch_control),C[2]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[0],g[1]),C[3]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[1]),C[4]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[0]),C[5]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[0],g[0]),C[6]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[1],g[1]),C[7]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[2]),C[8]));
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[0],pitch_control),C[0]));
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[1],pitch_control),C[1]));
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[2],pitch_control),C[2]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[0],g[1]),C[3]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[1]),C[4]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[0]),C[5]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[0],g[0]),C[6]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[1],g[1]),C[7]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[2]),C[8]));
    return sum;
 }
 #endif
 
-void open_loop_nbest_pitch(spx_sig_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack)
+#ifndef OVERRIDE_OPEN_LOOP_NBEST_PITCH
+void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack)
 {
    int i,j,k;
    VARDECL(spx_word32_t *best_score);
+   VARDECL(spx_word32_t *best_ener);
    spx_word32_t e0;
    VARDECL(spx_word32_t *corr);
    VARDECL(spx_word32_t *energy);
-   VARDECL(spx_word32_t *score);
-   VARDECL(spx_word16_t *swn2);
-   spx_word16_t *swn;
 
    ALLOC(best_score, N, spx_word32_t);
+   ALLOC(best_ener, N, spx_word32_t);
    ALLOC(corr, end-start+1, spx_word32_t);
    ALLOC(energy, end-start+2, spx_word32_t);
-   ALLOC(score, end-start+1, spx_word32_t);
-
-#ifdef FIXED_POINT
-   ALLOC(swn2, end+len, spx_word16_t);
-   normalize16(sw-end, swn2, 16384, end+len);
-   swn = swn2 + end;
-#else
-   swn = sw;
-#endif
 
    for (i=0;i<N;i++)
    {
         best_score[i]=-1;
+        best_ener[i]=0;
         pitch[i]=start;
    }
 
-
-   energy[0]=inner_prod(swn-start, swn-start, len);
-   e0=inner_prod(swn, swn, len);
-   for (i=start;i<=end;i++)
+   energy[0]=inner_prod(sw-start, sw-start, len);
+   e0=inner_prod(sw, sw, len);
+   for (i=start;i<end;i++)
    {
       /* Update energy for next pitch*/
-      energy[i-start+1] = SUB32(ADD32(energy[i-start],SHR32(MULT16_16(swn[-i-1],swn[-i-1]),6)), SHR32(MULT16_16(swn[-i+len-1],swn[-i+len-1]),6));
+      energy[i-start+1] = SUB32(ADD32(energy[i-start],SHR32(MULT16_16(sw[-i-1],sw[-i-1]),6)), SHR32(MULT16_16(sw[-i+len-1],sw[-i+len-1]),6));
       if (energy[i-start+1] < 0)
          energy[i-start+1] = 0;
    }
 
-   pitch_xcorr(swn, swn-end, corr, len, end-start+1, stack);
+   pitch_xcorr(sw, sw-end, corr, len, end-start+1, stack);
 
+   /* FIXME: Fixed-point and floating-point code should be merged */
 #ifdef FIXED_POINT
    {
       VARDECL(spx_word16_t *corr16);
       VARDECL(spx_word16_t *ener16);
       ALLOC(corr16, end-start+1, spx_word16_t);
       ALLOC(ener16, end-start+1, spx_word16_t);
-      normalize16(corr, corr16, 16384, end-start+1);
-      normalize16(energy, ener16, 16384, end-start+1);
+      /* Normalize to 180 so that the squared value (180*180 = 32400) still fits in 16 bits */
+      normalize16(corr, corr16, 180, end-start+1);
+      normalize16(energy, ener16, 180, end-start+1);
 
       for (i=start;i<=end;i++)
       {
-         spx_word16_t g;
-         spx_word32_t tmp;
-         tmp = corr16[i-start];
-         if (tmp>0)
-         {
-            if (SHR16(corr16[i-start],4)>ener16[i-start])
-               tmp = SHL32(EXTEND32(ener16[i-start]),14);
-            else if (-SHR16(corr16[i-start],4)>ener16[i-start])
-               tmp = -SHL32(EXTEND32(ener16[i-start]),14);
-            else
-               tmp = SHL32(tmp,10);
-            g = DIV32_16(tmp, 8+ener16[i-start]);
-            score[i-start] = MULT16_16(corr16[i-start],g);
-         } else
+         spx_word16_t tmp = MULT16_16_16(corr16[i-start],corr16[i-start]);
+         /* Instead of dividing tmp by the energy, cross-multiply both sides of the comparison */
+         if (MULT16_16(tmp,best_ener[N-1])>MULT16_16(best_score[N-1],ADD16(1,ener16[i-start])))
          {
-            score[i-start] = 1;
+            /* We can safely put it last and then check */
+            best_score[N-1]=tmp;
+            best_ener[N-1]=ener16[i-start]+1;
+            pitch[N-1]=i;
+            /* Check if it comes in front of others */
+            for (j=0;j<N-1;j++)
+            {
+               if (MULT16_16(tmp,best_ener[j])>MULT16_16(best_score[j],ADD16(1,ener16[i-start])))
+               {
+                  for (k=N-1;k>j;k--)
+                  {
+                     best_score[k]=best_score[k-1];
+                     best_ener[k]=best_ener[k-1];
+                     pitch[k]=pitch[k-1];
+                  }
+                  best_score[j]=tmp;
+                  best_ener[j]=ener16[i-start]+1;
+                  pitch[j]=i;
+                  break;
+               }
+            }
          }
       }
    }
 #else
    for (i=start;i<=end;i++)
    {
-      float g = corr[i-start]/(1+energy[i-start]);
-      if (g>16)
-         g = 16;
-      else if (g<-16)
-         g = -16;
-      score[i-start] = g*corr[i-start];
-   }
-#endif
-
-   /* Extract best scores */
-   for (i=start;i<=end;i++)
-   {
-      if (score[i-start]>best_score[N-1])
+      float tmp = corr[i-start]*corr[i-start];
+      if (tmp*best_ener[N-1]>best_score[N-1]*(1+energy[i-start]))
       {
          for (j=0;j<N;j++)
          {
-            if (score[i-start] > best_score[j])
+            if (tmp*best_ener[j]>best_score[j]*(1+energy[i-start]))
             {
                for (k=N-1;k>j;k--)
                {
                   best_score[k]=best_score[k-1];
+                  best_ener[k]=best_ener[k-1];
                   pitch[k]=pitch[k-1];
                }
-               best_score[j]=score[i-start];
+               best_score[j]=tmp;
+               best_ener[j]=energy[i-start]+1;
                pitch[j]=i;
                break;
             }
          }
       }
    }
+#endif
 
    /* Compute open-loop gain */
    if (gain)
@@ -290,164 +284,131 @@ void open_loop_nbest_pitch(spx_sig_t *sw, int start, int end, int len, int *pitc
        }
    }
 }
+#endif
+
+#ifndef OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+static int pitch_gain_search_3tap_vq(
+  const signed char *gain_cdbk,
+  int                gain_cdbk_size,
+  spx_word16_t      *C16,
+  spx_word16_t       max_gain
+)
+{
+  const signed char *ptr=gain_cdbk;
+  int                best_cdbk=0;
+  spx_word32_t       best_sum=-VERY_LARGE32;
+  spx_word32_t       sum=0;
+  spx_word16_t       g[3];
+  spx_word16_t       pitch_control=64;
+  spx_word16_t       gain_sum;
+  int                i;
+
+  for (i=0;i<gain_cdbk_size;i++) {
+         
+    ptr = gain_cdbk+4*i;
+    g[0]=ADD16((spx_word16_t)ptr[0],32);
+    g[1]=ADD16((spx_word16_t)ptr[1],32);
+    g[2]=ADD16((spx_word16_t)ptr[2],32);
+    gain_sum = (spx_word16_t)ptr[3];
+         
+    sum = compute_pitch_error(C16, g, pitch_control);
+         
+    if (sum>best_sum && gain_sum<=max_gain) {
+      best_sum=sum;
+      best_cdbk=i;
+    }
+  }
 
+  return best_cdbk;
+}
+#endif
 
 /** Finds the best quantized 3-tap pitch predictor by analysis by synthesis */
-static spx_word64_t pitch_gain_search_3tap(
-const spx_sig_t target[],       /* Target vector */
+static spx_word32_t pitch_gain_search_3tap(
+const spx_word16_t target[],       /* Target vector */
 const spx_coef_t ak[],          /* LPCs for this subframe */
 const spx_coef_t awk1[],        /* Weighted LPCs #1 for this subframe */
 const spx_coef_t awk2[],        /* Weighted LPCs #2 for this subframe */
 spx_sig_t exc[],                /* Excitation */
-const void *par,
+const signed char *gain_cdbk,
+int gain_cdbk_size,
 int   pitch,                    /* Pitch value */
 int   p,                        /* Number of LPC coeffs */
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-const spx_sig_t *exc2,
+const spx_word16_t *exc2,
 const spx_word16_t *r,
-spx_sig_t *new_target,
+spx_word16_t *new_target,
 int  *cdbk_index,
-int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t cumul_gain
 )
 {
    int i,j;
-   VARDECL(spx_sig_t *tmp1);
-   VARDECL(spx_sig_t *tmp2);
-   spx_sig_t *x[3];
-   spx_sig_t *e[3];
+   VARDECL(spx_word16_t *tmp1);
+   VARDECL(spx_word16_t *e);
+   spx_word16_t *x[3];
    spx_word32_t corr[3];
    spx_word32_t A[3][3];
-   int   gain_cdbk_size;
-   const signed char *gain_cdbk;
    spx_word16_t gain[3];
-   spx_word64_t err;
+   spx_word32_t err;
+   spx_word16_t max_gain=128;
+   int          best_cdbk=0;
 
-   const ltp_params *params;
-   params = (const ltp_params*) par;
-   gain_cdbk_size = 1<<params->gain_bits;
-   gain_cdbk = params->gain_cdbk + 3*gain_cdbk_size*cdbk_offset;
-   ALLOC(tmp1, 3*nsf, spx_sig_t);
-   ALLOC(tmp2, 3*nsf, spx_sig_t);
+   ALLOC(tmp1, 3*nsf, spx_word16_t);
+   ALLOC(e, nsf, spx_word16_t);
 
+   if (cumul_gain > 262144)
+      max_gain = 31;
+   
    x[0]=tmp1;
    x[1]=tmp1+nsf;
    x[2]=tmp1+2*nsf;
    
-   e[0]=tmp2;
-   e[1]=tmp2+nsf;
-   e[2]=tmp2+2*nsf;
-   for (i=2;i>=0;i--)
    {
-      int pp=pitch+1-i;
+      VARDECL(spx_mem_t *mm);
+      int pp=pitch-1;
+      ALLOC(mm, p, spx_mem_t);
       for (j=0;j<nsf;j++)
       {
          if (j-pp<0)
-            e[i][j]=exc2[j-pp];
+            e[j]=exc2[j-pp];
          else if (j-pp-pitch<0)
-            e[i][j]=exc2[j-pp-pitch];
+            e[j]=exc2[j-pp-pitch];
          else
-            e[i][j]=0;
+            e[j]=0;
       }
-
-      if (i==2)
-         syn_percep_zero(e[i], ak, awk1, awk2, x[i], nsf, p, stack);
-      else {
-         for (j=0;j<nsf-1;j++)
-            x[i][j+1]=x[i+1][j];
-         x[i][0]=0;
-         for (j=0;j<nsf;j++)
-         {
-            x[i][j]=ADD32(x[i][j],SHL32(MULT16_32_Q15(r[j], e[i][0]),1));
-         }
-      }
-   }
-
-#ifdef FIXED_POINT
-   {
-      /* If using fixed-point, we need to normalize the signals first */
-      spx_word16_t *y[3];
-      VARDECL(spx_word16_t *ytmp);
-      VARDECL(spx_word16_t *t);
-
-      spx_sig_t max_val=1;
-      int sig_shift;
-      
-      ALLOC(ytmp, 3*nsf, spx_word16_t);
-#if 0
-      ALLOC(y[0], nsf, spx_word16_t);
-      ALLOC(y[1], nsf, spx_word16_t);
-      ALLOC(y[2], nsf, spx_word16_t);
-#else
-      y[0] = ytmp;
-      y[1] = ytmp+nsf;
-      y[2] = ytmp+2*nsf;
-#endif
-      ALLOC(t, nsf, spx_word16_t);
-      for (j=0;j<3;j++)
-      {
-         for (i=0;i<nsf;i++)
-         {
-            spx_sig_t tmp = x[j][i];
-            if (tmp<0)
-               tmp = -tmp;
-            if (tmp > max_val)
-               max_val = tmp;
-         }
-      }
-      for (i=0;i<nsf;i++)
-      {
-         spx_sig_t tmp = target[i];
-         if (tmp<0)
-            tmp = -tmp;
-         if (tmp > max_val)
-            max_val = tmp;
-      }
-
-      sig_shift=0;
-      while (max_val>16384)
-      {
-         sig_shift++;
-         max_val >>= 1;
-      }
-
-      for (j=0;j<3;j++)
-      {
-         for (i=0;i<nsf;i++)
-         {
-            y[j][i] = EXTRACT16(SHR32(x[j][i],sig_shift));
-         }
-      }
-      for (i=0;i<nsf;i++)
-      {
-         t[i] = EXTRACT16(SHR32(target[i],sig_shift));
-      }
-
-      for (i=0;i<3;i++)
-         corr[i]=inner_prod(y[i],t,nsf);
-      
-      for (i=0;i<3;i++)
-         for (j=0;j<=i;j++)
-            A[i][j]=A[j][i]=inner_prod(y[i],y[j],nsf);
+      for (j=0;j<p;j++)
+         mm[j] = 0;
+      iir_mem16(e, ak, e, nsf, p, mm, stack);
+      for (j=0;j<p;j++)
+         mm[j] = 0;
+      filter_mem16(e, awk1, awk2, e, nsf, p, mm, stack);
+      for (j=0;j<nsf;j++)
+         x[2][j] = e[j];
    }
-#else
+   for (i=1;i>=0;i--)
    {
-      for (i=0;i<3;i++)
-         corr[i]=inner_prod(x[i],target,nsf);
-      
-      for (i=0;i<3;i++)
-         for (j=0;j<=i;j++)
-            A[i][j]=A[j][i]=inner_prod(x[i],x[j],nsf);
+      spx_word16_t e0=exc2[-pitch-1+i];
+      x[i][0]=MULT16_16_Q14(r[0], e0);
+      for (j=0;j<nsf-1;j++)
+         x[i][j+1]=ADD32(x[i+1][j],MULT16_16_P14(r[j+1], e0));
    }
-#endif
+
+   for (i=0;i<3;i++)
+      corr[i]=inner_prod(x[i],target,nsf);
+   for (i=0;i<3;i++)
+      for (j=0;j<=i;j++)
+         A[i][j]=A[j][i]=inner_prod(x[i],x[j],nsf);
 
    {
       spx_word32_t C[9];
-      const signed char *ptr=gain_cdbk;
-      int best_cdbk=0;
-      spx_word32_t best_sum=0;
+#ifdef FIXED_POINT
+      spx_word16_t C16[9];
+#else
+      spx_word16_t *C16=C;
+#endif      
       C[0]=corr[2];
       C[1]=corr[1];
       C[2]=corr[0];
@@ -461,111 +422,73 @@ int plc_tuning
       /*plc_tuning *= 2;*/
       if (plc_tuning<2)
          plc_tuning=2;
+      if (plc_tuning>30)
+         plc_tuning=30;
 #ifdef FIXED_POINT
-      C[0] = MAC16_32_Q15(C[0],MULT16_16_16(plc_tuning,-327),C[0]);
-      C[1] = MAC16_32_Q15(C[1],MULT16_16_16(plc_tuning,-327),C[1]);
-      C[2] = MAC16_32_Q15(C[2],MULT16_16_16(plc_tuning,-327),C[2]);
       C[0] = SHL32(C[0],1);
       C[1] = SHL32(C[1],1);
       C[2] = SHL32(C[2],1);
       C[3] = SHL32(C[3],1);
       C[4] = SHL32(C[4],1);
       C[5] = SHL32(C[5],1);
+      C[6] = MAC16_32_Q15(C[6],MULT16_16_16(plc_tuning,655),C[6]);
+      C[7] = MAC16_32_Q15(C[7],MULT16_16_16(plc_tuning,655),C[7]);
+      C[8] = MAC16_32_Q15(C[8],MULT16_16_16(plc_tuning,655),C[8]);
+      normalize16(C, C16, 32767, 9);
 #else
-      C[0]*=1-.01*plc_tuning;
-      C[1]*=1-.01*plc_tuning;
-      C[2]*=1-.01*plc_tuning;
-      C[6]*=.5*(1+.01*plc_tuning);
-      C[7]*=.5*(1+.01*plc_tuning);
-      C[8]*=.5*(1+.01*plc_tuning);
+      C[6]*=.5*(1+.02*plc_tuning);
+      C[7]*=.5*(1+.02*plc_tuning);
+      C[8]*=.5*(1+.02*plc_tuning);
 #endif
-      for (i=0;i<gain_cdbk_size;i++)
-      {
-         spx_word32_t sum=0;
-         spx_word16_t g[3];
-         spx_word16_t pitch_control=64;
-         spx_word16_t gain_sum;
-         
-         ptr = gain_cdbk+3*i;
-         g[0]=ADD16((spx_word16_t)ptr[0],32);
-         g[1]=ADD16((spx_word16_t)ptr[1],32);
-         g[2]=ADD16((spx_word16_t)ptr[2],32);
-
-         /* We favor "safe" pitch values to handle packet loss better */
-         gain_sum = ADD16(ADD16(g[1],MAX16(g[0], 0)),MAX16(g[2], 0));
-         if (gain_sum > 64)
-         {
-            gain_sum = SUB16(gain_sum, 64);
-            if (gain_sum > 127)
-               gain_sum = 127;
-#ifdef FIXED_POINT
-            pitch_control =  SUB16(64,EXTRACT16(PSHR32(MULT16_16(64,MULT16_16_16(plc_tuning, gain_sum)),10)));
-#else
-            pitch_control = 64*(1.-.001*plc_tuning*gain_sum);
-#endif
-            if (pitch_control < 0)
-               pitch_control = 0;
-         }
-         
-         sum = compute_pitch_error(C, g, pitch_control);
-         
-         if (sum>best_sum || i==0)
-         {
-            best_sum=sum;
-            best_cdbk=i;
-         }
-      }
+
+      best_cdbk = pitch_gain_search_3tap_vq(gain_cdbk, gain_cdbk_size, C16, max_gain);
+
 #ifdef FIXED_POINT
-      gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3]);
-      gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3+1]);
-      gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3+2]);
+      gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4]);
+      gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4+1]);
+      gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4+2]);
       /*printf ("%d %d %d %d\n",gain[0],gain[1],gain[2], best_cdbk);*/
 #else
-      gain[0] = 0.015625*gain_cdbk[best_cdbk*3]  + .5;
-      gain[1] = 0.015625*gain_cdbk[best_cdbk*3+1]+ .5;
-      gain[2] = 0.015625*gain_cdbk[best_cdbk*3+2]+ .5;
+      gain[0] = 0.015625*gain_cdbk[best_cdbk*4]  + .5;
+      gain[1] = 0.015625*gain_cdbk[best_cdbk*4+1]+ .5;
+      gain[2] = 0.015625*gain_cdbk[best_cdbk*4+2]+ .5;
 #endif
       *cdbk_index=best_cdbk;
    }
 
-#ifdef FIXED_POINT
-   for (i=0;i<nsf;i++)
-     exc[i]=SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),e[2][i]), MULT16_32_Q15(SHL16(gain[1],7),e[1][i])),
-                        MULT16_32_Q15(SHL16(gain[2],7),e[0][i])), 2);
-   
-   err=0;
    for (i=0;i<nsf;i++)
+      exc[i]=0;
+   for (i=0;i<3;i++)
    {
-      spx_word16_t perr2;
-      spx_sig_t tmp = SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),x[2][i]),MULT16_32_Q15(SHL16(gain[1],7),x[1][i])),
-                                  MULT16_32_Q15(SHL16(gain[2],7),x[0][i])),2);
-      spx_sig_t perr=SUB32(target[i],tmp);
-      new_target[i] = SUB32(target[i], tmp);
-      perr2 = EXTRACT16(PSHR32(perr,15));
-      err = ADD64(err,MULT16_16(perr2,perr2));
-      
+      int j;
+      int tmp1, tmp3;
+      int pp=pitch+1-i;
+      tmp1=nsf;
+      if (tmp1>pp)
+         tmp1=pp;
+      for (j=0;j<tmp1;j++)
+         exc[j]=MAC16_16(exc[j],SHL16(gain[2-i],7),exc2[j-pp]);
+      tmp3=nsf;
+      if (tmp3>pp+pitch)
+         tmp3=pp+pitch;
+      for (j=tmp1;j<tmp3;j++)
+         exc[j]=MAC16_16(exc[j],SHL16(gain[2-i],7),exc2[j-pp-pitch]);
    }
-#else
-   for (i=0;i<nsf;i++)
-      exc[i]=gain[0]*e[2][i]+gain[1]*e[1][i]+gain[2]*e[0][i];
-   
-   err=0;
    for (i=0;i<nsf;i++)
    {
-      spx_sig_t tmp = gain[2]*x[0][i]+gain[1]*x[1][i]+gain[0]*x[2][i];
-      new_target[i] = target[i] - tmp;
-      err+=new_target[i]*new_target[i];
+      spx_word32_t tmp = ADD32(ADD32(MULT16_16(gain[0],x[2][i]),MULT16_16(gain[1],x[1][i])),
+                            MULT16_16(gain[2],x[0][i]));
+      new_target[i] = SUB16(target[i], EXTRACT16(PSHR32(tmp,6)));
    }
-#endif
+   err = inner_prod(new_target, new_target, nsf);
 
    return err;
 }
 
-
 /** Finds the best quantized 3-tap pitch predictor by analysis by synthesis */
 int pitch_search_3tap(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -578,24 +501,32 @@ int   p,                        /* Number of LPC coeffs */
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int complexity,
 int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 )
 {
    int i,j;
    int cdbk_index, pitch=0, best_gain_index=0;
    VARDECL(spx_sig_t *best_exc);
-   VARDECL(spx_sig_t *new_target);
-   VARDECL(spx_sig_t *best_target);
+   VARDECL(spx_word16_t *new_target);
+   VARDECL(spx_word16_t *best_target);
    int best_pitch=0;
-   spx_word64_t err, best_err=-1;
+   spx_word32_t err, best_err=-1;
    int N;
    const ltp_params *params;
+   const signed char *gain_cdbk;
+   int   gain_cdbk_size;
+   
    VARDECL(int *nbest);
-
+   
+   params = (const ltp_params*) par;
+   gain_cdbk_size = 1<<params->gain_bits;
+   gain_cdbk = params->gain_cdbk + 4*gain_cdbk_size*cdbk_offset;
+   
    N=complexity;
    if (N>10)
       N=10;
@@ -614,23 +545,24 @@ int plc_tuning
       return start;
    }
    
-   ALLOC(best_exc, nsf, spx_sig_t);
-   ALLOC(new_target, nsf, spx_sig_t);
-   ALLOC(best_target, nsf, spx_sig_t);
-   
    if (N>end-start+1)
       N=end-start+1;
    if (end != start)
       open_loop_nbest_pitch(sw, start, end, nsf, nbest, NULL, N, stack);
    else
       nbest[0] = start;
+   
+   ALLOC(best_exc, nsf, spx_sig_t);
+   ALLOC(new_target, nsf, spx_word16_t);
+   ALLOC(best_target, nsf, spx_word16_t);
+   
    for (i=0;i<N;i++)
    {
       pitch=nbest[i];
       for (j=0;j<nsf;j++)
          exc[j]=0;
-      err=pitch_gain_search_3tap(target, ak, awk1, awk2, exc, par, pitch, p, nsf,
-                                 bits, stack, exc2, r, new_target, &cdbk_index, cdbk_offset, plc_tuning);
+      err=pitch_gain_search_3tap(target, ak, awk1, awk2, exc, gain_cdbk, gain_cdbk_size, pitch, p, nsf,
+                                 bits, stack, exc2, r, new_target, &cdbk_index, plc_tuning, *cumul_gain);
       if (err<best_err || best_err<0)
       {
          for (j=0;j<nsf;j++)
@@ -642,10 +574,15 @@ int plc_tuning
          best_gain_index=cdbk_index;
       }
    }
-   
    /*printf ("pitch: %d %d\n", best_pitch, best_gain_index);*/
    speex_bits_pack(bits, best_pitch-start, params->pitch_bits);
    speex_bits_pack(bits, best_gain_index, params->gain_bits);
+#ifdef FIXED_POINT
+   *cumul_gain = MULT16_32_Q13(SHL16(params->gain_cdbk[4*best_gain_index+3],8), MAX32(1024,*cumul_gain));
+#else
+   *cumul_gain = 0.03125*MAX32(1024,*cumul_gain)*params->gain_cdbk[4*best_gain_index+3];
+#endif
+   /*printf ("%f\n", cumul_gain);*/
    /*printf ("encode pitch: %d %d\n", best_pitch, best_gain_index);*/
    for (i=0;i<nsf;i++)
       exc[i]=best_exc[i];
@@ -656,10 +593,11 @@ int plc_tuning
 }
 
 void pitch_unquant_3tap(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
@@ -682,20 +620,20 @@ int cdbk_offset
 
    params = (const ltp_params*) par;
    gain_cdbk_size = 1<<params->gain_bits;
-   gain_cdbk = params->gain_cdbk + 3*gain_cdbk_size*cdbk_offset;
+   gain_cdbk = params->gain_cdbk + 4*gain_cdbk_size*cdbk_offset;
 
    pitch = speex_bits_unpack_unsigned(bits, params->pitch_bits);
    pitch += start;
    gain_index = speex_bits_unpack_unsigned(bits, params->gain_bits);
    /*printf ("decode pitch: %d %d\n", pitch, gain_index);*/
 #ifdef FIXED_POINT
-   gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3]);
-   gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3+1]);
-   gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3+2]);
+   gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4]);
+   gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4+1]);
+   gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4+2]);
 #else
-   gain[0] = 0.015625*gain_cdbk[gain_index*3]+.5;
-   gain[1] = 0.015625*gain_cdbk[gain_index*3+1]+.5;
-   gain[2] = 0.015625*gain_cdbk[gain_index*3+2]+.5;
+   gain[0] = 0.015625*gain_cdbk[gain_index*4]+.5;
+   gain[1] = 0.015625*gain_cdbk[gain_index*4+1]+.5;
+   gain[2] = 0.015625*gain_cdbk[gain_index*4+2]+.5;
 #endif
 
    if (count_lost && pitch > subframe_offset)
@@ -728,66 +666,36 @@ int cdbk_offset
    gain_val[0]=gain[0];
    gain_val[1]=gain[1];
    gain_val[2]=gain[2];
-
+   gain[0] = SHL16(gain[0],7);
+   gain[1] = SHL16(gain[1],7);
+   gain[2] = SHL16(gain[2],7);
+   for (i=0;i<nsf;i++)
+      exc_out[i]=0;
+   for (i=0;i<3;i++)
    {
-      spx_sig_t *e[3];
-      VARDECL(spx_sig_t *tmp2);
-      ALLOC(tmp2, 3*nsf, spx_sig_t);
-      e[0]=tmp2;
-      e[1]=tmp2+nsf;
-      e[2]=tmp2+2*nsf;
-      
-      for (i=0;i<3;i++)
-      {
-         int j;
-         int pp=pitch+1-i;
-#if 0
-         for (j=0;j<nsf;j++)
-         {
-            if (j-pp<0)
-               e[i][j]=exc[j-pp];
-            else if (j-pp-pitch<0)
-               e[i][j]=exc[j-pp-pitch];
-            else
-               e[i][j]=0;
-         }
-#else
-         {
-            int tmp1, tmp3;
-            tmp1=nsf;
-            if (tmp1>pp)
-               tmp1=pp;
-            for (j=0;j<tmp1;j++)
-               e[i][j]=exc[j-pp];
-            tmp3=nsf;
-            if (tmp3>pp+pitch)
-               tmp3=pp+pitch;
-            for (j=tmp1;j<tmp3;j++)
-               e[i][j]=exc[j-pp-pitch];
-            for (j=tmp3;j<nsf;j++)
-               e[i][j]=0;
-         }
-#endif
-      }
-
-#ifdef FIXED_POINT
-      {
-         for (i=0;i<nsf;i++)
-            exc[i]=SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),e[2][i]), MULT16_32_Q15(SHL16(gain[1],7),e[1][i])),
-                               MULT16_32_Q15(SHL16(gain[2],7),e[0][i])), 2);
-      }
-#else
-      for (i=0;i<nsf;i++)
-         exc[i]=VERY_SMALL+gain[0]*e[2][i]+gain[1]*e[1][i]+gain[2]*e[0][i];
-#endif
+      int j;
+      int tmp1, tmp3;
+      int pp=pitch+1-i;
+      tmp1=nsf;
+      if (tmp1>pp)
+         tmp1=pp;
+      for (j=0;j<tmp1;j++)
+         exc_out[j]=MAC16_16(exc_out[j],gain[2-i],exc[j-pp]);
+      tmp3=nsf;
+      if (tmp3>pp+pitch)
+         tmp3=pp+pitch;
+      for (j=tmp1;j<tmp3;j++)
+         exc_out[j]=MAC16_16(exc_out[j],gain[2-i],exc[j-pp-pitch]);
    }
+   /*for (i=0;i<nsf;i++)
+   exc[i]=PSHR32(exc32[i],13);*/
 }
 
 
 /** Forced pitch delay and gain */
 int forced_pitch_quant(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -800,30 +708,45 @@ int   p,                        /* Number of LPC coeffs */
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int complexity,
 int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 )
 {
    int i;
-   float coef = GAIN_SCALING_1*pitch_coef;
-   if (coef>.99)
-      coef=.99;
-   for (i=0;i<nsf;i++)
+   VARDECL(spx_sig_t *res);
+   ALLOC(res, nsf, spx_sig_t);
+#ifdef FIXED_POINT
+   if (pitch_coef>63)
+      pitch_coef=63;
+#else
+   if (pitch_coef>.99)
+      pitch_coef=.99;
+#endif
+   for (i=0;i<nsf&&i<start;i++)
+   {
+      exc[i]=MULT16_16(SHL16(pitch_coef, 7),exc2[i-start]);
+   }
+   for (;i<nsf;i++)
    {
-      exc[i]=exc[i-start]*coef;
+      exc[i]=MULT16_32_Q15(SHL16(pitch_coef, 9),exc[i-start]);
    }
+   syn_percep_zero(exc, ak, awk1, awk2, res, nsf, p, stack);
+   for (i=0;i<nsf;i++)
+      target[i]=EXTRACT16(SATURATE(SUB32(EXTEND32(target[i]),PSHR32(res[i],SIG_SHIFT-1)),32700));
    return start;
 }
 
 /** Unquantize forced pitch delay and gain */
 void forced_pitch_unquant(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
@@ -837,12 +760,17 @@ int cdbk_offset
 )
 {
    int i;
-   float coef = GAIN_SCALING_1*pitch_coef;
-   if (coef>.99)
-      coef=.99;
+#ifdef FIXED_POINT
+   if (pitch_coef>63)
+      pitch_coef=63;
+#else
+   if (pitch_coef>.99)
+      pitch_coef=.99;
+#endif
    for (i=0;i<nsf;i++)
    {
-      exc[i]=exc[i-start]*coef;
+      exc_out[i]=MULT16_16(exc[i-start],SHL16(pitch_coef,7));
+      exc[i] = PSHR(exc_out[i],13);
    }
    *pitch_val = start;
    gain_val[0]=gain_val[2]=0;