@@ -65,8 +65,9 @@ Transformer <- torch::nn_module(
headNorm = torch::nn_layer_norm,
attNorm = torch::nn_layer_norm,
dimHidden){
- self$embedding <- Embedding(catFeatures, dimToken)
+ self$embedding <- Embedding(catFeatures + 1, dimToken) # + 1 for padding idx
dimToken <- dimToken + numFeatures # because I concatenate numerical features to embedding
+ self$classToken <- ClassToken(dimToken)

self$layers <- torch::nn_module_list(lapply(1:numBlocks,
function(x) {
@@ -93,24 +94,33 @@ Transformer <- torch::nn_module(
},
forward = function(x_num, x_cat){
x_cat <- self$embedding(x_cat)
+ if (!is.null(x_num)) {
x <- torch::torch_cat(list(x_cat, x_num), dim = 2L)
+ } else {
+ x <- x_cat
+ }
+ x <- self$classToken(x)
for (i in 1:length(self$layers)) {
layer <- self$layers[[i]]
xResidual <- self$startResidual(layer, 'attention', x)

if (i == length(self$layers)) {
- xResidual <- layer$attention(xResidual[,-1], xResidual) # in final layer take only attention on CLS token
- x <- x[,-1]
+ dims <- xResidual$shape
+ # in final layer take only attention on CLS token
+ xResidual <- layer$attention(xResidual[,-1]$view(c(dims[1], 1, dims[3])),
+ xResidual, xResidual)
+ xResidual <- xResidual[[1]]
+ x <- x[,-1]$view(c(dims[1], 1, dims[3]))
} else {
xResidual <- layer$attention(xResidual, xResidual)
}
x <- self$endResidual(layer, 'attention', x, xResidual)

- xResidual <- self$startResidual(layer, 'ffn', x, xResidual)
+ xResidual <- self$startResidual(layer, 'ffn', x)
xResidual <- layer$ffn(xResidual)
x <- self$endResidual(layer, 'ffn', x, xResidual)
}
- x <- self$head(x)
+ x <- self$head(x)[, 1] # remove singleton dimension
return(x)
},
startResidual = function(layer, stage, x) {
@@ -123,7 +133,7 @@ Transformer <- torch::nn_module(
},
endResidual = function(layer, stage, x, xResidual) {
dropoutKey <- paste0(stage, 'ResDropout')
- xResidual <- layer$dropoutKey(xResidual)
+ xResidual <- layer[[dropoutKey]](xResidual)
x <- x + xResidual
return(x)
}
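The switch to layer[[dropoutKey]] matters because R's $ looks up a member literally named "dropoutKey", while [[ evaluates the variable and indexes with its value ('attentionResDropout' or 'ffnResDropout'). A minimal sketch of the difference, using a plain list with hypothetical stand-in functions rather than the actual module:

layer <- list(
  attentionResDropout = function(x) x * 0.9,  # stand-in, not torch::nn_dropout
  ffnResDropout = function(x) x * 0.8
)
dropoutKey <- paste0("attention", "ResDropout")
layer[[dropoutKey]](1)  # works: calls the element named "attentionResDropout"
# layer$dropoutKey(1)   # fails: no element literally named "dropoutKey"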
@@ -167,11 +177,28 @@ Head <- torch::nn_module(
Embedding <- torch::nn_module(
name = 'Embedding',
initialize = function(numEmbeddings, embeddingDim) {
- self$embedding <- torch::nn_embedding(numEmbeddings, embeddingDim)
- categoryOffsets <- torch::torch_arange(1, numEmbeddings)
+ self$embedding <- torch::nn_embedding(numEmbeddings, embeddingDim, padding_idx = 1)
+ categoryOffsets <- torch::torch_arange(1, numEmbeddings, dtype = torch::torch_long())
self$register_buffer('categoryOffsets', categoryOffsets, persistent = FALSE)
},
forward = function(x_cat) {
x <- self$embedding(x_cat * self$categoryOffsets + 1L)
}
)
+
+ # adds a class token embedding to embeddings
+ ClassToken <- torch::nn_module(
+ name = 'ClassToken',
+ initialize = function(dimToken) {
+ self$weight <- torch::nn_parameter(torch::torch_empty(dimToken, 1))
+ torch::nn_init_kaiming_uniform_(self$weight, a = sqrt(5))
+ },
+ expand = function(dims) {
+ newDims <- vector("integer", length(dims) - 1) + 1
+ return(self$weight$view(c(newDims, -1))$expand(c(dims, -1)))
+
+ },
+ forward = function(x) {
+ return(torch::torch_cat(c(x, self$expand(c(dim(x)[[1]], 1))), dim = 2))
+ }
+ )
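For orientation, a minimal usage sketch of the new ClassToken module (assumes the torch R package is installed; the shapes below are illustrative, not from the patch). It appends one learned token along the token dimension, which is why the final layer above selects it with x[,-1] and then restores the singleton dimension with $view():

library(torch)
dimToken <- 8
classToken <- ClassToken(dimToken)
x <- torch_randn(4, 10, dimToken)  # (batch, tokens, dimToken)
out <- classToken(x)
out$shape  # 4 11 8: one learned CLS token appended at the end
out[,-1]   # (batch, dimToken): the CLS slice, reshaped back to (batch, 1, dimToken) in forward()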