diff --git a/include/tkDNN/Layer.h b/include/tkDNN/Layer.h index 9154e1b5..9bd84322 100644 --- a/include/tkDNN/Layer.h +++ b/include/tkDNN/Layer.h @@ -50,7 +50,7 @@ class Layer { } void setFinal() { this->final = true; } dataDim_t input_dim, output_dim; - dnnType *dstData; //where results will be putted + dnnType *dstData = nullptr; //where results will be put int id = 0; bool final; //if the layer is the final one @@ -122,7 +122,7 @@ class LayerWgs : public Layer { __half *data16_d = nullptr, *bias16_d = nullptr; __half *bias216_h = nullptr, *bias216_d = nullptr; - __half *power16_h = nullptr; + __half *power16_h = nullptr, *power16_d = nullptr; __half *scales16_h = nullptr, *scales16_d = nullptr; __half *mean16_h = nullptr, *mean16_d = nullptr; __half *variance16_h = nullptr, *variance16_d = nullptr; @@ -164,6 +164,7 @@ class LayerWgs : public Layer { if( scales16_d != nullptr) { cudaFree( scales16_d); scales16_d = nullptr; } if( mean16_d != nullptr) { cudaFree( mean16_d); mean16_d = nullptr; } if(variance16_d != nullptr) { cudaFree(variance16_d); variance16_d = nullptr; } + if( power16_d != nullptr) { cudaFree( power16_d); power16_d = nullptr; } } } }; diff --git a/include/tkDNN/utils.h b/include/tkDNN/utils.h index f9f6ae77..aa73e9ed 100644 --- a/include/tkDNN/utils.h +++ b/include/tkDNN/utils.h @@ -116,5 +116,6 @@ void matrixMulAdd( cublasHandle_t handle, dnnType* srcData, dnnType* dstData, dnnType* add_vector, int dim, dnnType mul); void getMemUsage(double& vm_usage_kb, double& resident_set_kb); +void printCudaMemUsage(); void removePathAndExtension(const std::string &full_string, std::string &name); #endif //UTILS_H diff --git a/src/Layer.cpp b/src/Layer.cpp index 9f04ca54..a355b905 100644 --- a/src/Layer.cpp +++ b/src/Layer.cpp @@ -24,6 +24,11 @@ Layer::~Layer() { checkCUDNN( cudnnDestroyTensorDescriptor(srcTensorDesc) ); checkCUDNN( cudnnDestroyTensorDescriptor(dstTensorDesc) ); + + if(dstData != nullptr) { + cudaFree(dstData); + dstData = nullptr; + } 
} }} \ No newline at end of file diff --git a/src/LayerWgs.cpp b/src/LayerWgs.cpp index 21edf79b..4afb7cce 100644 --- a/src/LayerWgs.cpp +++ b/src/LayerWgs.cpp @@ -80,7 +80,7 @@ LayerWgs::LayerWgs(Network *net, int inputs, int outputs, variance16_h = new __half[b_size]; scales16_h = new __half[b_size]; - //cudaMalloc(&power16_d, b_size*sizeof(__half)); + cudaMalloc(&power16_d, b_size*sizeof(__half)); cudaMalloc(&mean16_d, b_size*sizeof(__half)); cudaMalloc(&variance16_d, b_size*sizeof(__half)); cudaMalloc(&scales16_d, b_size*sizeof(__half)); @@ -91,8 +91,8 @@ LayerWgs::LayerWgs(Network *net, int inputs, int outputs, //init power array of ones cudaMemcpy(tmp_d, power_h, b_size*sizeof(float), cudaMemcpyHostToDevice); - //float2half(tmp_d, power16_d, b_size); - //cudaMemcpy(power16_h, power16_d, b_size*sizeof(__half), cudaMemcpyDeviceToHost); + float2half(tmp_d, power16_d, b_size); + cudaMemcpy(power16_h, power16_d, b_size*sizeof(__half), cudaMemcpyDeviceToHost); //mean array cudaMemcpy(tmp_d, mean_h, b_size*sizeof(float), cudaMemcpyHostToDevice); diff --git a/src/Network.cpp b/src/Network.cpp index 9adcc48c..7fa291fd 100644 --- a/src/Network.cpp +++ b/src/Network.cpp @@ -128,6 +128,7 @@ void Network::print() { } printCenteredTitle("", '=', 60); std::cout<<"\n"; + printCudaMemUsage(); } const char *Network::getNetworkRTName(const char *network_name){ networkName = network_name; diff --git a/src/NetworkRT.cpp b/src/NetworkRT.cpp index 0824eba0..9f53b06c 100644 --- a/src/NetworkRT.cpp +++ b/src/NetworkRT.cpp @@ -134,6 +134,7 @@ NetworkRT::NetworkRT(Network *net, const char *name) { networkRT->markOutput(*input); std::cout<<"Selected maxBatchSize: "<getMaxBatchSize()<<"\n"; + printCudaMemUsage(); std::cout<<"Building tensorRT cuda engine...\n"; #if NV_TENSORRT_MAJOR >= 6 engineRT = builderRT->buildEngineWithConfig(*networkRT, *configRT); diff --git a/src/utils.cpp b/src/utils.cpp index 3bd91194..65030f06 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -197,6 +197,12 @@ 
void getMemUsage(double& vm_usage_kb, double& resident_set_kb){ resident_set_kb = rss * page_size_kb; } +void printCudaMemUsage() { + size_t free, total; + checkCuda( cudaMemGetInfo(&free, &total) ); + std::cout<<"GPU free memory: "<