Skip to content

Commit

Permalink
Batchnorm1d optimized and parameterized point cnn layer in phoneme det
Browse files Browse the repository at this point in the history
  • Loading branch information
Anirudh0707 committed Oct 26, 2020
1 parent 190d3b4 commit 214495c
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 49 deletions.
16 changes: 10 additions & 6 deletions c_reference/include/conv1d.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,18 +158,22 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
* @param[in] in_time number of time steps in the input
* @param[in] in_channels number of input channels. The output will have the same number of channels
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
* @param[in] affine whether the affine operations are applied
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Provide Null/0 if affine is False(non-zero)
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Provide Null/0 if affine is False(non-zero)
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. if affine_config = 2, then pass a NULL/0
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. if affine_config = 2, then pass a NULL/0
* @param[in] affine_config configuration of the affine operations to apply (one of the cases below)
* if affine_config = 0, then only mean and var are used
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Provide NULL/0 if affine_config is 0
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Provide NULL/0 if affine_config is 0
* @param[in] in_place in-place computation of the batchnorm i.e. the output is stored in-place of the input signal. Storage efficient
* @param[in] eps a very small +ve value to avoid division by 0. For the default value, assign = 0.00001
*/
// Batch normalization over a (in_time x in_channels) signal; see the Doxygen
// block above for the affine_config cases. Declaration only — defined in conv1d.c.
// NOTE(review): the scraped diff left both the old ("unsigned affine") and new
// ("unsigned affine_config") parameter lines in place; only the updated line is kept here.
int batchnorm1d(float* output_signal, float* input_signal,
  unsigned in_time, unsigned in_channels,
  const float* const mean, const float* const var,
  unsigned affine_config, const float* const gamma, const float* const beta,
  unsigned in_place, float eps);

#endif
39 changes: 26 additions & 13 deletions c_reference/include/dscnn.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,27 @@
#ifndef __DSCNN_H__
#define __DSCNN_H__

// Function pointer for the Conv layer to be passed as a parameter. (conv1d or conv1d_lr only)
// Argument order matches the conv1d/conv1d_lr call sites in this commit:
// (output_signal, out_time, out_channels, input_signal,
//  in_time, in_channels, padding, kernel_size,
//  params, stride, activation).
// Returns int — presumably a status code; callers here ignore it (TODO confirm).
typedef int (*conv_layer)(float*, unsigned, unsigned, const float*,
unsigned, unsigned, unsigned, unsigned,
const void*, unsigned, unsigned);

/**
* @brief Model definition for the 1D Convolution block applied before the RNN
* @brief sub-layers : batchnorm1d -> conv1d_lr
* @param[out] output_signal pointer to the final output signal, minimum size = out_time * in_channels. out_time has to be calculated based on the reduction from all the conv and pool layers
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
* @param[in] in_time number of time steps in the input_signal
* @param[in] in_channels number of input channels
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
* @param[in] affine whether the affine operations are applied
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
* @param[in] affine_config configuration of the affine operations to apply (one of the cases below)
* if affine_config = 0, then only mean and var are used
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
* @param[in] in_place in-place computation check for the batchnorm. Storage efficient
* @param[in] cnn_hidden hidden state/out_channels dimensions for the low-rank CNN. The final channel size of this block
* @param[in] cnn_padding padding for the low-rank CNN layer. Note: applied to both sides of the input
Expand All @@ -31,7 +40,7 @@
int phon_pred_lr_cnn(float* output_signal, float* input_signal,
unsigned in_time, unsigned in_channels,
const float* const mean, const float* const var,
unsigned affine, float* gamma, float* beta, unsigned in_place,
unsigned affine_config, float* gamma, float* beta, unsigned in_place,
unsigned cnn_hidden, unsigned cnn_padding, unsigned cnn_kernel_size,
const void* cnn_params, unsigned cnn_stride, unsigned cnn_activation);

Expand All @@ -42,11 +51,15 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
* @param[in] in_time number of time steps in the input
* @param[in] in_channels number of input channels
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
* @param[in] affine whether the affine operations are applied
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
* @param[in] affine_config configuration of the affine operations to apply (one of the cases below)
* if affine_config = 0, then only mean and var are used
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
* @param[in] in_place in-place computation of the batchnorm. Storage efficient
* @param[in] depth_cnn_padding padding for the depth CNN layer. Note: applied to both sides of the input to the depth CNN
* @param[in] depth_cnn_kernel_size kernel size of the depth CNN
Expand Down Expand Up @@ -77,9 +90,9 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
* 3: relu
*/
int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
unsigned in_time, unsigned in_channels,
conv_layer point_cnn, unsigned in_time, unsigned in_channels,
const float* const mean, const float* const var,
unsigned affine, const float* const gamma, const float* const beta, unsigned in_place,
unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
unsigned depth_cnn_padding, unsigned depth_cnn_kernel_size,
const void* depth_cnn_params, unsigned depth_cnn_stride, unsigned depth_cnn_activation,
unsigned point_cnn_hidden, unsigned point_cnn_padding, unsigned point_cnn_kernel_size,
Expand Down
34 changes: 28 additions & 6 deletions c_reference/src/conv1d.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
unsigned padding, unsigned kernel_size, unsigned stride, unsigned activation) {

// Iterate over the time steps and average them. Similar to Conv1D_Dept with a filter kernel of ones
float scale = 1.0/(float)kernel_size;
for (unsigned t_in = 0, t_out = 0; t_out < out_time; t_out++, t_in += stride) {
for (unsigned ci = 0; ci < in_channels; ci++) {
float sum = 0;
Expand All @@ -183,16 +184,16 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
}
}
if (activation == 1) {
output_signal[t_out * in_channels + ci] = sigmoid(sum / (float)kernel_size);
output_signal[t_out * in_channels + ci] = sigmoid(sum * scale);
}
else if (activation == 2) {
output_signal[t_out * in_channels + ci] = tanh(sum / (float)kernel_size);
output_signal[t_out * in_channels + ci] = tanh(sum * scale);
}
else if (activation == 3) {
output_signal[t_out * in_channels + ci] = relu(sum / (float)kernel_size);
output_signal[t_out * in_channels + ci] = relu(sum * scale);
}
else {
output_signal[t_out * in_channels + ci] = sum / (float)kernel_size;
output_signal[t_out * in_channels + ci] = sum * scale;
}
}
}
Expand All @@ -202,10 +203,10 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
int batchnorm1d(float* output_signal, float* input_signal,
unsigned in_time, unsigned in_channels,
const float* const mean, const float* const var,
unsigned affine, const float* const gamma , const float* const beta,
unsigned affine_config, const float* const gamma , const float* const beta,
unsigned in_place, float eps) {
// Check if affine values was learnt
if (affine) {
if (affine_config == 1) {
// Check for in-place computation
if (in_place) {
for (unsigned t = 0; t < in_time; t++) {
Expand All @@ -228,6 +229,27 @@ int batchnorm1d(float* output_signal, float* input_signal,
}
}
}
else if (affine_config == 2) {
// Check for in-place computation
if (in_place) {
for (unsigned t = 0; t < in_time; t++) {
for (unsigned d = 0; d < in_channels; d++) {
input_signal[t * in_channels + d] = (gamma[d]
* input_signal[t * in_channels + d])
+ beta[d];
}
}
}
else {
for (unsigned t = 0; t < in_time; t++) {
for (unsigned d = 0; d < in_channels; d++) {
output_signal[t * in_channels + d] = (gamma[d]
* input_signal[t * in_channels + d])
+ beta[d];
}
}
}
}
else {
// Check for in-place computation
if (in_place) {
Expand Down
16 changes: 8 additions & 8 deletions c_reference/src/dscnn.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
int phon_pred_lr_cnn(float* output_signal, float* input_signal,
unsigned in_time, unsigned in_channels,
const float* const mean, const float* const var,
unsigned affine, float* gamma, float* beta, unsigned in_place,
unsigned affine_config, float* gamma, float* beta, unsigned in_place,
unsigned cnn_hidden, unsigned cnn_padding, unsigned cnn_kernel_size,
const void* cnn_params, unsigned cnn_stride, unsigned cnn_activation) {

Expand All @@ -20,7 +20,7 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
// BatchNorm
batchnorm1d(0, input_signal,
in_time, in_channels,
mean, var, affine, gamma, beta,
mean, var, affine_config, gamma, beta,
in_place, 0.00001);
// CNN
conv1d_lr(output_signal, out_time, cnn_hidden, input_signal,
Expand All @@ -32,7 +32,7 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
float* norm_out = (float*)malloc(in_time * in_channels * sizeof(float));
batchnorm1d(norm_out, input_signal,
in_time, in_channels,
mean, var, affine, gamma, beta,
mean, var, affine_config, gamma, beta,
in_place, 0.00001);
// CNN
conv1d_lr(output_signal, out_time, cnn_hidden, norm_out,
Expand All @@ -44,9 +44,9 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
}

int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
unsigned in_time, unsigned in_channels,
conv_layer point_cnn, unsigned in_time, unsigned in_channels,
const float* const mean, const float* const var,
unsigned affine, const float* const gamma, const float* const beta, unsigned in_place,
unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
unsigned depth_cnn_padding, unsigned depth_cnn_kernel_size,
const void* depth_cnn_params, unsigned depth_cnn_stride, unsigned depth_cnn_activation,
unsigned point_cnn_hidden, unsigned point_cnn_padding, unsigned point_cnn_kernel_size,
Expand All @@ -66,7 +66,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
batchnorm1d(0, act_out,
in_time, in_channels,
mean, var,
affine, gamma, beta,
affine_config, gamma, beta,
in_place, 0.00001);
// Depth CNN
depth_out = (float*)malloc(out_time * in_channels * sizeof(float));
Expand All @@ -81,7 +81,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
batchnorm1d(norm_out, act_out,
in_time, in_channels,
mean, var,
affine, gamma, beta,
affine_config, gamma, beta,
in_place, 0.00001);
free(act_out);
// Depth CNN
Expand All @@ -96,7 +96,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
in_time = out_time;
out_time = in_time - point_cnn_kernel_size + 2 * point_cnn_padding + 1;
float* point_out = (float*)malloc(out_time * point_cnn_hidden * sizeof(float));
conv1d_lr(point_out, out_time, point_cnn_hidden, depth_out,
point_cnn(point_out, out_time, point_cnn_hidden, depth_out,
in_time, in_channels, point_cnn_padding, point_cnn_kernel_size,
point_cnn_params, point_cnn_stride, point_cnn_activation);
free(depth_out);
Expand Down
4 changes: 2 additions & 2 deletions c_reference/tests/kws/postcnn_params.h
Git LFS file not shown
4 changes: 2 additions & 2 deletions c_reference/tests/kws/precnn_params.h
Git LFS file not shown
4 changes: 2 additions & 2 deletions c_reference/tests/kws/rnn_params.h
Git LFS file not shown
20 changes: 10 additions & 10 deletions c_reference/tests/kws/test_phoneme_det_cnn_rnn.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,12 +165,12 @@ void phoneme_prediction(float* mem_buf) {
// Use the in-place computation only if the input can be discarded/altered. Else avoid in-place computation for this layer
phon_pred_lr_cnn(cnn1_out, mem_buf,
in_time, PRE_CNN_IN_FEATURES,
BNORM_CNN1_MEAN, BNORM_CNN1_VAR, 0, 0, 0, PRE_CNN_BNORM_INPLACE,
0, 0, PRE_CNN_BNORM_AFFINE, CNN1_SCALE, CNN1_OFFSET, PRE_CNN_BNORM_INPLACE,
PRE_CNN_OUT_FEATURES, PRE_CNN_FILT_PAD, PRE_CNN_FILT,
&conv_params, PRE_CNN_STRIDE, PRE_CNN_FILT_ACT); // regular tanh activation

batchnorm1d(0, cnn1_out, in_time, RNN_IN_FEATURES,
BNORM_RNN_MEAN, BNORM_RNN_VAR, 0, 0, 0, 1, 0.00001); // Currently in-place only and no affine values
0, 0, RNN_BNORM_AFFINE, RNN_SCALE, RNN_OFFSET, 1, 0.00001);

/* Bricked Bi-FastGRNN Block */
out_time = in_time/RNN_HOP + 1;
Expand All @@ -194,8 +194,8 @@ void phoneme_prediction(float* mem_buf) {
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
float* cnn2_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
phon_pred_depth_point_lr_cnn(cnn2_out, rnn_out,
in_time, POST_CNN_INTER_FEATURES,
CNN2_BNORM_MEAN, CNN2_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
0, 0, POST_CNN_BNORM_AFFINE, CNN2_SCALE, CNN2_OFFSET, POST_CNN_BNORM_INPLACE,
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
&depth_param_2, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
Expand All @@ -209,8 +209,8 @@ void phoneme_prediction(float* mem_buf) {
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
float* cnn3_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
phon_pred_depth_point_lr_cnn(cnn3_out, cnn2_out,
in_time, POST_CNN_INTER_FEATURES,
CNN3_BNORM_MEAN, CNN3_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
0, 0, POST_CNN_BNORM_AFFINE, CNN3_SCALE, CNN3_OFFSET, POST_CNN_BNORM_INPLACE,
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
&depth_param_3, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
Expand All @@ -224,8 +224,8 @@ void phoneme_prediction(float* mem_buf) {
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
float* cnn4_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
phon_pred_depth_point_lr_cnn(cnn4_out, cnn3_out,
in_time, POST_CNN_INTER_FEATURES,
CNN4_BNORM_MEAN, CNN4_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
0, 0, POST_CNN_BNORM_AFFINE, CNN4_SCALE, CNN4_OFFSET, POST_CNN_BNORM_INPLACE,
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
&depth_param_4, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
Expand All @@ -239,8 +239,8 @@ void phoneme_prediction(float* mem_buf) {
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
float* pred = (float*)malloc(out_time * POST_CNN_OUT_FEATURES * sizeof(float));
phon_pred_depth_point_lr_cnn(pred, cnn4_out,
in_time, POST_CNN_INTER_FEATURES,
CNN5_BNORM_MEAN, CNN5_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
0, 0, POST_CNN_BNORM_AFFINE, CNN5_SCALE, CNN5_OFFSET, POST_CNN_BNORM_INPLACE,
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
&depth_param_5, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
POST_CNN_OUT_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
Expand Down

0 comments on commit 214495c

Please sign in to comment.