Style code (GHA)

lpantano · lpantano · commit b98f1bbc42a6 · 2025-04-17T13:43:00.000Z
diff --git a/03_differential_expression/scRNA_pseudobulk.Rmd b/03_differential_expression/scRNA_pseudobulk.Rmd
@@ -64,7 +64,7 @@ library(viridis)
 library(caTools)
 library(shiny)
 library(bcbioR)
-#library(future)
+# library(future)
 
 ggplot2::theme_set(theme_prism(base_size = 12))
 # https://grafify-vignettes.netlify.app/colour_palettes.html
@@ -143,14 +143,15 @@ if (isUrl(seurat_obj)) {
 }
 
 DefaultAssay(seurat) <- "RNA"
-
 ```
 
 After filtering, each sample contributed the following number of cells to the analysis:
 
 ```{r meta pre doub}
-seurat@meta.data %>% group_by(orig.ident) %>% summarize(n_bins = n()) %>% sanitize_datatable
-
+seurat@meta.data %>%
+  group_by(orig.ident) %>%
+  summarize(n_bins = n()) %>%
+  sanitize_datatable()
 ```
 
 # Aggregate counts
@@ -175,21 +176,21 @@ To aggregate the counts, we will use the AggregateExpression() function from Seu
 seurat$sample <- seurat$orig.ident
 
 bulk <- AggregateExpression(
-            seurat,
-            return.seurat = T,
-            assays = "RNA",
-            group.by = c("sample",column )
+  seurat,
+  return.seurat = TRUE, # TRUE is a reserved word; T is a variable that can be reassigned
+  assays = "RNA",
+  group.by = c("sample", column) # one pseudobulk profile per sample x `column` level
 )
 ```
 
 ## Add number of cells per sample per cluster to the metadata
 
 ```{r}
 # Number of cells by sample and celltype
-n_cells <- seurat@meta.data %>% 
-              dplyr::count(sample, .data[[column]]) %>% 
-              dplyr::rename("n_cells"="n")
-#n_cells$sample <- str_replace(n_cells$sample, "_", "-")
+n_cells <- seurat@meta.data %>%
+  dplyr::count(sample, .data[[column]]) %>%
+  dplyr::rename("n_cells" = "n")
+# n_cells$sample <- str_replace(n_cells$sample, "_", "-")
 
 ## extra check if aggregated sample names start with numbers
 n_cells$sample <- ifelse(grepl("^\\d", n_cells$sample), paste0("g", n_cells$sample), n_cells$sample)
@@ -198,7 +199,7 @@ n_cells$sample <- ifelse(grepl("^\\d", n_cells$sample), paste0("g", n_cells$samp
 meta_bulk <- bulk@meta.data
 meta_bulk$sample <- str_replace_all(meta_bulk$sample, "-", "_")
 
-meta_bulk <- meta_bulk %>% left_join(n_cells, by=c("sample", column))
+meta_bulk <- meta_bulk %>% left_join(n_cells, by = c("sample", column))
 
 rownames(meta_bulk) <- meta_bulk$orig.ident
 bulk@meta.data <- meta_bulk
@@ -209,7 +210,9 @@ bulk@meta.data <- meta_bulk
 
 stopifnot(all(Cells(bulk) == row.names(bulk@meta.data)))
 
-bulk@meta.data %>% head() %>% sanitize_datatable()
+bulk@meta.data %>%
+  head() %>%
+  sanitize_datatable()
 ```
 
 # DE analysis with DESeq2
@@ -225,12 +228,12 @@ Idents(object = bulk) <- column
 Before moving on to a pseudobulk DGE analysis, it is important to identify how many cells we aggregated for each sample. We need to make sure that we have enough cells per sample after subsetting to one celltype. We recommend 50 cells per sample to move forward with confidence.
 
 ```{r}
-ggplot(bulk@meta.data, aes(x=orig.ident, y=n_cells)) +
-    geom_bar(stat="identity", color="black", aes(fill=.data[[column]])) +
-    theme_classic() +
-    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
-    labs(x="Sample name", y="Number of cells") +
-    geom_text(aes(label=n_cells), vjust=-0.5)
+ggplot(bulk@meta.data, aes(x = orig.ident, y = n_cells)) +
+  geom_bar(stat = "identity", color = "black", aes(fill = .data[[column]])) +
+  theme_classic() +
+  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+  labs(x = "Sample name", y = "Number of cells") +
+  geom_text(aes(label = n_cells), vjust = -0.5)
 ```
 
 ## Run DE analysis
@@ -242,7 +245,7 @@ Before fitting the model, we often look at a metric called dispersion, which is
 We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.
 
 ```{r}
-cluster_counts <- t(FetchData(bulk, layer="counts", vars=rownames(bulk)))
+cluster_counts <- t(FetchData(bulk, layer = "counts", vars = rownames(bulk)))
 formula <- as.formula(paste0("~ ", " + ", column))
 
 
@@ -351,21 +354,22 @@ for (contrast in names(de_list)) {
 ### Run pheatmap using the metadata data frame for the annotation
 for (contrast in names(de_list)) {
   cat("### ", contrast, "\n\n")
-  res_sig = de_list[[contrast]][["sig"]]
-  ma=norm_matrix[res_sig$gene_id,]
-  if (length(res_sig$gene_id) > 2){
-    p1=pheatmap(ma, 
-             color = inferno(10), 
-             cluster_rows = T, 
-             show_rownames = F,
-             border_color = NA, 
-             fontsize = 10, 
-             scale = "row", 
-             fontsize_row = 10, 
-             height = 20)
+  res_sig <- de_list[[contrast]][["sig"]]
+  ma <- norm_matrix[res_sig$gene_id, ]
+  if (length(res_sig$gene_id) > 2) {
+    p1 <- pheatmap(ma,
+      color = inferno(10),
+      cluster_rows = TRUE, # TRUE/FALSE are reserved words; T/F can be reassigned
+      show_rownames = FALSE,
+      border_color = NA,
+      fontsize = 10,
+      scale = "row", # rows of `ma` are the significant genes selected above
+      fontsize_row = 10,
+      height = 20
+    )
     print(p1)
   } else {
-      print('Need >2 DEGs to make heatmap')
+    print("Need >2 DEGs to make heatmap")
   }
   cat("\n\n")
 }
@@ -374,12 +378,14 @@ for (contrast in names(de_list)) {
 ## Differentially Expressed Genes {.tabset}
 
 ```{r sig_genes_table, results='asis'}
-dt_list=list()
-for (contrast in names(de_list)){
-  res_sig=de_list[[contrast]][["sig"]]
-  dt_list=c(dt_list, 
-            list(h3(contrast)), 
-            list(DT::datatable(res_sig)))
+dt_list <- list()
+for (contrast in names(de_list)) {
+  res_sig <- de_list[[contrast]][["sig"]]
+  dt_list <- c(
+    dt_list,
+    list(h3(contrast)),
+    list(DT::datatable(res_sig))
+  )
 }
 tagList(dt_list)
 ```
@@ -391,28 +397,30 @@ n <- 16
 
 for (contrast in names(de_list)) {
   cat("### ", contrast, "\n\n")
-  res_sig = de_list[[contrast]][["sig"]]
-  
-  if(nrow(res_sig) > 0){
-    top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
+  res_sig <- de_list[[contrast]][["sig"]]
+
+  if (nrow(res_sig) > 0) {
+    top_n <- res_sig %>%
+      slice_min(order_by = padj, n = n, with_ties = FALSE) %>% # FALSE (reserved), not reassignable F
       dplyr::select(gene_id)
-    top_n_exp <- vsd_matrix %>% as.data.frame() %>% 
-      rownames_to_column('gene_id') %>%
-      # dplyr::select(-group, -group_name) %>% 
-      pivot_longer(!gene_id, names_to = 'sample', values_to = 'normalized_counts') %>%
+    top_n_exp <- vsd_matrix %>%
+      as.data.frame() %>%
+      rownames_to_column("gene_id") %>%
+      # dplyr::select(-group, -group_name) %>%
+      pivot_longer(!gene_id, names_to = "sample", values_to = "normalized_counts") %>%
       right_join(top_n, relationship = "many-to-many") %>%
-      left_join(meta_bulk, by = c("sample"="orig.ident")) #%>%
-      # filter(.data[[column]] %in% contrasts[[contrast]][2:3]) # can uncomment this line if you want to include only the groups being compared
-    
-    p1=ggplot(top_n_exp, aes(x = .data[[column]], y = normalized_counts)) +
-      geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + 
+      left_join(meta_bulk, by = c("sample" = "orig.ident")) # %>%
+    # filter(.data[[column]] %in% contrasts[[contrast]][2:3]) # can uncomment this line if you want to include only the groups being compared
+
+    p1 <- ggplot(top_n_exp, aes(x = .data[[column]], y = normalized_counts)) +
+      geom_boxplot(outlier.shape = NA, linewidth = 0.5, color = "grey") +
       geom_point() +
-      facet_wrap(~gene_id, scales = 'free_y') + 
-      ggtitle(str_interp('Expression of Top ${n} DEGs - Pseudobulk')) +
-      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+      facet_wrap(~gene_id, scales = "free_y") +
+      ggtitle(str_interp("Expression of Top ${n} DEGs - Pseudobulk")) +
+      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
     print(p1)
   } else {
-      print('Need at least 1 DEG to make plot')
+    print("Need at least 1 DEG to make plot")
   }
   cat("\n\n")
 }
@@ -424,31 +432,35 @@ for (contrast in names(de_list)) {
 n <- 16
 
 # subset seurat object to include only sig DE genes, helps with speed
-top_n_all <- lapply(names(de_list), function(contrast){
-  res_sig = de_list[[contrast]][["sig"]]
-  res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
-    dplyr::select(gene_id) %>% mutate(contrast = contrast)
+top_n_all <- lapply(names(de_list), function(contrast) {
+  res_sig <- de_list[[contrast]][["sig"]]
+  res_sig %>%
+    slice_min(order_by = padj, n = n, with_ties = FALSE) %>% # FALSE (reserved), not reassignable F
+    dplyr::select(gene_id) %>%
+    mutate(contrast = contrast) # tag each gene with the contrast it came from
 }) %>% bind_rows()
 
 seurat_topn_all <- subset(seurat, features = unique(top_n_all$gene_id))
 Idents(seurat_topn_all) <- column
 
-for (contrast in names(de_list)){
+for (contrast in names(de_list)) {
   cat("### ", contrast, "{.tabset} \n\n")
-  res_sig = de_list[[contrast]][["sig"]]
-  if(nrow(res_sig) > 0){
-    top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
+  res_sig <- de_list[[contrast]][["sig"]]
+  if (nrow(res_sig) > 0) {
+    top_n <- res_sig %>%
+      slice_min(order_by = padj, n = n, with_ties = FALSE) %>% # FALSE (reserved), not reassignable F
       dplyr::select(gene_id)
-    
-    p <- DotPlot(seurat_topn_all, 
-                 # idents = contrasts[[contrast]][2:3], # can uncomment this line if you want to include only the groups being compared
-                 features = top_n$gene_id) &
-      scale_color_cb_friendly(discrete = F, palette = 'heatmap') &
+
+    p <- DotPlot(seurat_topn_all,
+      # idents = contrasts[[contrast]][2:3], # can uncomment this line if you want to include only the groups being compared
+      features = top_n$gene_id
+    ) &
+      scale_color_cb_friendly(discrete = FALSE, palette = "heatmap") & # FALSE (reserved), not reassignable F
       scale_x_discrete(guide = guide_axis(angle = 45))
-    
+
     print(p)
   } else {
-    print('Need at least 1 DEG to make plot')
+    print("Need at least 1 DEG to make plot")
   }
 
   cat("\n\n")