|
| 1 | +--- |
| 2 | +title: "Visium QC" |
| 3 | +author: "Harvard Chan Bioinformatics Core" |
| 4 | +date: "`r Sys.Date()`" |
| 5 | +output: |
| 6 | + html_document: |
| 7 | + code_folding: hide |
| 8 | + df_print: paged |
| 9 | + highlights: pygments |
| 10 | + number_sections: false |
| 11 | + self_contained: true |
| 12 | + theme: default |
| 13 | + toc: true |
| 14 | + toc_float: |
| 15 | + collapsed: true |
| 16 | + smooth_scroll: true |
| 17 | +params: |
| 18 | + project_file: ./reports/information.R |
| 19 | + results_dir: ./results |
| 20 | + path_data: ./data |
| 21 | +--- |
| 22 | + |
| 23 | +```{r, cache = FALSE, message = FALSE, warning=FALSE} |
| 24 | +# This set up the working directory to this file so all files can be found |
| 25 | +library(rstudioapi) |
| 26 | +setwd(fs::path_dir(getSourceEditorContext()$path)) |
| 27 | +# NOTE: This code will check version, this is our recommendation, it may work |
| 28 | +#. other versions |
| 29 | +stopifnot(R.version$major>= 4) # requires R4 |
| 30 | +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") |
| 31 | +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) |
| 32 | +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) |
| 33 | +``` |
| 34 | + |
| 35 | +This code is in this  revision. |
| 36 | + |
| 37 | + |
| 38 | +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} |
| 39 | +# Libraries |
| 40 | +library(bcbioR) |
| 41 | +library(Seurat) |
| 42 | +library(tidyverse) |
| 43 | +library(glue) |
| 44 | +library(gridExtra) |
| 45 | +library(scales) |
| 46 | +
|
| 47 | +
|
| 48 | +# Plotting aesthetics |
| 49 | +colors=cb_friendly_cols(1:15) |
| 50 | +ggplot2::theme_set(theme_prism(base_size = 14)) |
| 51 | +opts_chunk[["set"]]( |
| 52 | + cache = F, |
| 53 | + cache.lazy = FALSE, |
| 54 | + dev = c("png", "pdf"), |
| 55 | + error = TRUE, |
| 56 | + highlight = TRUE, |
| 57 | + message = FALSE, |
| 58 | + prompt = FALSE, |
| 59 | + tidy = FALSE, |
| 60 | + warning = FALSE, |
| 61 | + echo = T, |
| 62 | + fig.height = 4) |
| 63 | +
|
| 64 | +# set seed for reproducibility |
| 65 | +set.seed(1234567890L) |
| 66 | +``` |
| 67 | + |
| 68 | +# Project details |
| 69 | + |
| 70 | +- Project: `r project` |
| 71 | +- PI: `r PI` |
| 72 | +- Analyst: `r analyst` |
| 73 | +- Experiment: `r experiment` |
| 74 | +- Aim: `r aim` |
| 75 | + |
| 76 | +The following samples were found in the data folder for this project: |
| 77 | + |
| 78 | +```{r samples} |
| 79 | +samples <- list.files(path_data) |
| 80 | +samples |
| 81 | +``` |
| 82 | + |
| 83 | +## Visium Description |
| 84 | + |
| 85 | +This report is adapted from: https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/seurat/seurat_07_spatial.html |
| 86 | + |
| 87 | +Spatial transcriptomic data with the Visium platform is in many ways similar to scRNAseq data. The data is represented per spot on the slide, as we have spatial barcode bust no cellular barcodes. Each spot contains UMI counts for 5-20 cells instead of single cells, but is still quite sparse in the same way as scRNAseq data is, but with the additional information about spatial location in the tissue. |
| 88 | + |
| 89 | + |
| 90 | +# 10X Sequencing Metrics {.tabset} |
| 91 | + |
| 92 | +Here, we take a quick look at metrics reported from spaceranger for each sample. |
| 93 | + |
| 94 | +```{r summary-metrics} |
| 95 | +# Create a for loop to create one dataframe with 10X metrics |
| 96 | +metrics <- data.frame() |
| 97 | +for (sample in samples) { |
| 98 | + m <- read.csv(glue("{path_data}/{sample}/outs/metrics_summary.csv")) |
| 99 | + metrics <- rbind(metrics, m) |
| 100 | +} |
| 101 | +``` |
| 102 | + |
| 103 | +## Number of spots per slide |
| 104 | + |
| 105 | +```{r nSpots} |
| 106 | +metrics %>% |
| 107 | + ggplot(aes(x=Sample.ID, y=(Number.of.Spots.Under.Tissue), fill=Sample.ID)) + |
| 108 | + geom_col() + |
| 109 | + theme_classic() + |
| 110 | + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), |
| 111 | + axis.title = element_text(size=rel(1.5)), |
| 112 | + plot.title = element_text(hjust = 0.5, face="bold")) + |
| 113 | + ggtitle("Number of Spots") + |
| 114 | + xlab("") + ylab("Number of spots") |
| 115 | +``` |
| 116 | + |
| 117 | +## Number of sequencing reads |
| 118 | + |
| 119 | + |
| 120 | +```{r nReads} |
| 121 | +metrics %>% |
| 122 | + ggplot(aes(x=Sample.ID, y=(Number.of.Reads/ 1000000), fill=Sample.ID)) + |
| 123 | + geom_col() + |
| 124 | + theme_classic() + |
| 125 | + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), |
| 126 | + axis.title = element_text(size=rel(1.5)), |
| 127 | + plot.title = element_text(hjust = 0.5, face="bold")) + |
| 128 | + ggtitle("Number of sequencing reads") + |
| 129 | + xlab("") + ylab("Number of reads (in millions)") |
| 130 | +``` |
| 131 | + |
| 132 | + |
| 133 | +## Mean reads per spot |
| 134 | + |
| 135 | +**The recommended value here is 50,000.** |
| 136 | + |
| 137 | +```{r reads_per_spot} |
| 138 | +metrics %>% |
| 139 | + ggplot(aes(x=Sample.ID, y=Mean.Reads.per.Spot, fill=Sample.ID)) + |
| 140 | + geom_col() + |
| 141 | + theme_classic() + |
| 142 | + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), |
| 143 | + axis.title = element_text(size=rel(1.5)), |
| 144 | + plot.title = element_text(hjust = 0.5, face="bold")) + |
| 145 | + ggtitle("Mean reads per spot") + |
| 146 | + geom_hline(yintercept=50000, linetype=2) |
| 147 | +``` |
| 148 | + |
| 149 | + |
| 150 | +## Fraction of reads in spots under tissue |
| 151 | + |
| 152 | +**The recommended value here is > 50%.** |
| 153 | + |
| 154 | +Low fraction reads in spots under the tissue indicate that many of the reads were not assigned to tissue covered spots. This could be caused by: |
| 155 | + |
| 156 | +* High levels of ambient RNA resulting from inefficient permeabilization |
| 157 | +* The incorrect image or orientation was used |
| 158 | +* Poor tissue detection |
| 159 | + |
| 160 | +```{r frac_reads} |
| 161 | +metrics %>% |
| 162 | + ggplot(aes(x=Sample.ID, y= (Fraction.Reads.in.Spots.Under.Tissue * 100), fill=Sample.ID)) + |
| 163 | + geom_col() + |
| 164 | + theme_classic() + |
| 165 | + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), |
| 166 | + axis.title = element_text(size=rel(1.5)), |
| 167 | + plot.title = element_text(hjust = 0.5, face="bold")) + |
| 168 | + xlab("") + ylab("Fraction of reads in spots") + |
| 169 | + ggtitle("Fraction of reads") + |
| 170 | + geom_hline(yintercept = 50, linetype =2) + |
| 171 | + ylim(c(0, 100)) |
| 172 | +``` |
| 173 | + |
| 174 | + |
| 175 | +# Setup for analysis |
| 176 | + |
| 177 | +Next, we will run quality control in a similar manner to scRNAseq data. Start by loading in each sample and then merged into one Seurat object. |
| 178 | + |
| 179 | +```{r create_seurat} |
| 180 | +# Create seurat list |
| 181 | +seurat_list <- list() |
| 182 | +for (sample in samples) { |
| 183 | + |
| 184 | + # Load data to create a Seurat object |
| 185 | + x <- Load10X_Spatial(glue("{path_data}/{sample}/outs/")) |
| 186 | + x$orig.ident <- sample |
| 187 | +
|
| 188 | + # Add individual seurat objects to list |
| 189 | + seurat_list[[sample]] <- x |
| 190 | +} |
| 191 | +
|
| 192 | +# Merge together every seurat object in list |
| 193 | +merged <- merge( |
| 194 | + x = seurat_list[[samples[1]]], |
| 195 | + y = seurat_list[samples[2: length(samples)]], |
| 196 | + project = project, |
| 197 | + add.cell.ids = samples |
| 198 | +) |
| 199 | +
|
| 200 | +merged |
| 201 | +``` |
| 202 | + |
| 203 | +Next we make sure that all the relevant metadata for each cell is included in the seurat object by looking at the first few rows of the dataframe. Note that `nCounts` refers to the total number of UMIs for a spot while `nFeatures` tallies the number of genes that are expressed in the spot. |
| 204 | + |
| 205 | +```{r view_metadata} |
| 206 | +# This is great spot to add any additional metadata you may have for the samples |
| 207 | +
|
| 208 | +# Calculate novelty score |
| 209 | +merged$log10GenesPerUMI <- log10(merged$nFeature_Spatial) / log10(merged$nCount_Spatial) |
| 210 | +
|
| 211 | + |
| 212 | +``` |
| 213 | + |
| 214 | +# Quality control per spot {.tabset} |
| 215 | + |
| 216 | +Similar to scRNAseq we use statistics on number of counts, number of features and percent mitochondria for quality control. However, the expected distributions for high-quality spots are different (compared to high-quality cells in scRNA-seq), since spots may contain zero, one, or multiple cells. |
| 217 | + |
| 218 | +## Number of UMIs detected per spot |
| 219 | + |
| 220 | +This number is really dependent on tissue type, RNA quality, and sequencing depth. For example, with one of the public Visium datasets using the mouse brain hippocampus region, we saw an average of 4.5k genes and 20k UMIs per spot. |
| 221 | + |
| 222 | + |
| 223 | +```{r nUMI} |
| 224 | + |
| 225 | + ggplot(aes(x=orig.ident, y = nCount_Spatial)) + |
| 226 | + geom_violin(aes(fill=orig.ident), alpha = 0.5) + |
| 227 | + geom_boxplot(width=0.1) + |
| 228 | + geom_hline(yintercept = 10000, linetype = "dashed") + |
| 229 | + scale_y_log10() + |
| 230 | + theme_classic() + |
| 231 | + ylab("density") + |
| 232 | + theme(plot.title = element_text(hjust=0.5, face="bold")) + |
| 233 | + ggtitle("Num UMIs per spot") |
| 234 | +``` |
| 235 | + |
| 236 | +## Number of genes detected per spot |
| 237 | + |
| 238 | +The number of genes per spot will vary by sample, however in this dataset values are generally on the high end. _The dashed line is drawn at 4000 genes, which is a fairly high threshold._ |
| 239 | + |
| 240 | + |
| 241 | +```{r nGenes, fig.width=9} |
| 242 | + |
| 243 | + ggplot(aes(x=orig.ident, y = nFeature_Spatial)) + |
| 244 | + geom_violin(aes(fill=orig.ident), alpha = 0.5) + |
| 245 | + geom_boxplot(width=0.1) + |
| 246 | + geom_hline(yintercept = 4000, linetype = "dashed") + |
| 247 | + scale_y_log10() + |
| 248 | + theme_classic() + |
| 249 | + ylab("density") + |
| 250 | + theme(plot.title = element_text(hjust=0.5, face="bold")) + |
| 251 | + ggtitle("Num Genes per Spot") |
| 252 | +``` |
| 253 | + |
| 254 | +## Overall complexity of transcriptional profile per spot |
| 255 | + |
| 256 | +We can evaluate each spot in terms of how complex the RNA species are by using a measure called the novelty score. The novelty score is computed by taking the ratio of nGenes over nUMI. If there are many captured transcripts (high nUMI) and a low number of genes detected in a cell, this likely means that you only captured a low number of genes and simply sequenced transcripts from those lower number of genes over and over again. |
| 257 | + |
| 258 | +With scRNA-seq this is more easily interpreted for a single cell, but for spatial data this would give us complexity of the spot, which is across multiple cells. |
| 259 | + |
| 260 | + |
| 261 | +```{r complexity} |
| 262 | + |
| 263 | + ggplot(aes(x=log10GenesPerUMI, color=orig.ident, fill=orig.ident)) + |
| 264 | + geom_density(alpha = 0.2) + |
| 265 | + theme_classic() + |
| 266 | + geom_vline(xintercept = 0.8) + |
| 267 | + ggtitle("Novelty score") + |
| 268 | + theme(plot.title = element_text(hjust=0.5, face="bold")) |
| 269 | +``` |
| 270 | + |
| 271 | + |
| 272 | +# QC metrics visualized on slides {.tabset} |
| 273 | + |
| 274 | +Here, we can look at the distribution of UMI and gene counts on the individual tissue slide. |
| 275 | + |
| 276 | +```{r spatial-plot, results='asis', fig.width=7, fig.height=4} |
| 277 | +# Iterate through each sample to make visualizations for each slide |
| 278 | +for (sample in samples) { |
| 279 | + cat("\n\n") |
| 280 | +
|
| 281 | + # Print plots for nUMI and nGenes for each spot on the slide |
| 282 | + cat("## Sample", sample, "\n") |
| 283 | + p <- SpatialFeaturePlot(seurat_list[[sample]], |
| 284 | + features = c("nCount_Spatial", "nFeature_Spatial"), |
| 285 | + max.cutoff = 'q80') |
| 286 | + print(p) |
| 287 | + cat("\n") |
| 288 | + } |
| 289 | +``` |
| 290 | + |
| 291 | + |
| 292 | +# Normalization and Clustering |
| 293 | + |
| 294 | +Seurat recommends the use of SCTransform for spatial data. SCTransform normalizes the data, detects high-variance features, and stores the data in the SCT assay. We then perform clustering (at a default resolution of 0.8 with 30 PCs) and create UMAP coordinates. |
| 295 | + |
| 296 | +Let's look at the UMAP and clustering for each group to see if further integration is necessary. |
| 297 | + |
| 298 | +```{r sc_workflow} |
| 299 | +# Making sure that the rownames of metadata are the same as what is stored in Cells() |
| 300 | +rownames([email protected]) <- Cells(merged) |
| 301 | +
|
| 302 | +# Standard processing of counts matrix |
| 303 | +# SCTransform, PCA, clusters, and run UMAP |
| 304 | +merged <- SCTransform(merged, assay = "Spatial", verbose = TRUE) |
| 305 | +merged <- RunPCA(merged , assay = "SCT", verbose = FALSE) |
| 306 | +merged <- FindNeighbors(merged, reduction = "pca", dims = 1:30) |
| 307 | +merged <- FindClusters(merged, verbose = FALSE, |
| 308 | + resolution = c(0.2, 0.4, 0.6, 0.8, 1)) |
| 309 | +merged <- RunUMAP(merged, reduction = "pca", dims = 1:30) |
| 310 | +``` |
| 311 | + |
| 312 | +## UMAP distribution {.tabset} |
| 313 | + |
| 314 | +### Samples |
| 315 | + |
| 316 | +```{r umap_sample} |
| 317 | +Idents(merged) <- "orig.ident" |
| 318 | +DimPlot(merged, reduction = "umap") |
| 319 | +``` |
| 320 | + |
| 321 | +### Samples split |
| 322 | + |
| 323 | +```{r umap_sample_split} |
| 324 | +# Get UMAP coordinates |
| 325 | +df <- FetchData(merged, c("umap_1", "umap_2", "orig.ident")) |
| 326 | +
|
| 327 | +# Colors for each sample |
| 328 | +palette_fill <- hue_pal()(length(samples)) |
| 329 | +names(palette_fill) <- samples |
| 330 | +
|
| 331 | +# Create plot list to arrange later |
| 332 | +plot_list <- list() |
| 333 | +for (sample in samples) { |
| 334 | + |
| 335 | + # Get cells of one sample |
| 336 | + df_sample <- df[df$orig.ident == sample, ] |
| 337 | +
|
| 338 | + # Plot color dots (for each sample) overlaid over grey |
| 339 | + p <- ggplot() + |
| 340 | + geom_point(data=df, |
| 341 | + aes(x=umap_1, y=umap_2), color="lightgray", |
| 342 | + alpha = 0.5, size=1, show.legend=FALSE) + |
| 343 | + geom_point(data=df_sample, |
| 344 | + aes(x=umap_1, y=umap_2), |
| 345 | + color=palette_fill[sample]) + |
| 346 | + theme_classic() + |
| 347 | + ggtitle(sample) |
| 348 | + |
| 349 | + plot_list[[sample]] <- p |
| 350 | +} |
| 351 | +
|
| 352 | +grid.arrange(grobs = plot_list) |
| 353 | +``` |
| 354 | + |
| 355 | + |
| 356 | +### Clusters at resolution 0.8 |
| 357 | + |
| 358 | +```{r umap_clusters} |
| 359 | +Idents(merged) <- "SCT_snn_res.0.8" |
| 360 | +p <- DimPlot(merged, reduction = "umap") |
| 361 | +LabelClusters(p, id ="ident", fontface = "bold", size = 6, |
| 362 | + bg.colour = "white", bg.r = .2, force = 0) |
| 363 | +``` |
| 364 | + |
| 365 | +# Clusters ontop slides |
| 366 | + |
| 367 | +```{r cluster_slides fig.height=15} |
| 368 | +SpatialDimPlot(merged, label = TRUE, ncol=1) |
| 369 | +``` |
| 370 | + |
| 371 | +# Save seurat object |
| 372 | + |
| 373 | +```{r save_seurat} |
| 374 | +saveRDS(merged, "results/01_qc.RDS") |
| 375 | +``` |
| 376 | + |
| 377 | + |
| 378 | +# Session Information |
| 379 | + |
| 380 | +```{r} |
| 381 | +sessionInfo() |
| 382 | +``` |
0 commit comments