Skip to content

Commit 2342ac6

Browse files
committed
Merge branch 'will_changes_docs' of https://github.com/bcbio/rnaseq-reports into will_changes_docs
2 parents 17ea4cd + 720fbf0 commit 2342ac6

File tree

2 files changed

+69
-52
lines changed

2 files changed

+69
-52
lines changed

01_quality_assessment/QC.Rmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ sanitize_datatable <- function(df, ...) {
123123
# This code will load from bcbio or nf-core folder
124124
# NOTE make sure to set numerator and denominator
125125
coldata <- load_coldata(coldata_fn)
126-
# Change this line to change the levels to the desired order.
126+
# Change this line to change the levels to the desired order.
127127
# It will affect downstream colors in plots.
128128
coldata[[factor_of_interest]] <- as.factor(coldata[[factor_of_interest]])
129129
coldata$sample <- row.names(coldata)

02_differential_expression/DEG.Rmd

Lines changed: 68 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ de_list <- lapply(contrasts, function(contrast) {
621621
drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes)
622622
623623
# Extract the significant genes
624-
res_sig <- res %>%
624+
res_sig <- res %>%
625625
filter(padj < 0.05) %>% # Retain the values that have an adjusted p-value less than 0.05
626626
arrange(padj) %>% # Arrange the genes by the lowest adjusted p-value
627627
mutate(gene_name = ifelse(test = gene_name == "" | is.na(gene_name), # If the gene_name value for a gene is NA or blank then
@@ -640,7 +640,7 @@ de_list <- lapply(contrasts, function(contrast) {
640640
})
641641
642642
# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:
643-
# de_list <- c(de_list, new_comparison=list(lfc=resLFC,
643+
# de_list <- c(de_list, new_comparison=list(lfc=resLFC,
644644
# lfcs=resLFCS,
645645
# all=res, sig=res_sig))
646646
```
@@ -762,15 +762,15 @@ for (contrast in names(de_list)) {
762762
# Retrieve the significantly differential expressed genes data frame for the contrast
763763
res_sig <- de_list[[contrast]][["sig"]]
764764
# Populate the dt_list list
765-
dt_list <- c(
765+
dt_list <- c(
766766
dt_list, # With the contents of dt_list from the previous loop
767767
list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
768768
list(DT::datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significant genes
769769
)
770770
}
771771
772772
# Render the dt_list into HTML
773-
tagList(dt_list)
773+
tagList(dt_list)
774774
```
775775

776776
## Plot top 16 genes {.tabset}
@@ -787,44 +787,55 @@ for (contrast in names(de_list)) {
787787
# Retrieve the significantly differential expressed genes data frame for the contrast
788788
res_sig <- de_list[[contrast]][["sig"]]
789789
# Extract the names of the top n (defined at the start of the code block) most significant genes
790-
top_n <- res_sig %>% # Use the significantly differentially expressed genes
791-
slice_min( # Retain the genes with the smallest values
790+
top_n <- res_sig %>% # Use the significantly differentially expressed genes
791+
slice_min( # Retain the genes with the smallest values
792792
order_by = padj, # As ordered by their adjusted p-value
793793
n = n, # The number of genes to retain
794-
with_ties = FALSE) %>% # Do not retain ties if they return the same adjusted p-value
794+
with_ties = FALSE
795+
) %>% # Do not retain ties if they return the same adjusted p-value
795796
dplyr::select(gene_name, gene_id) # Only retain the gene_name and gene_id columns
796-
# Extract the variance stabilized transformed expression of the top n (defined at the start of the code block) most significant genes
797+
# Extract the variance stabilized transformed expression of the top n (defined at the start of the code block) most significant genes
797798
top_n_exp <- norm_matrix %>% # Use the variance stabilized transformed expression counts matrix
798799
as.data.frame() %>% # Convert the matrix to a data frame
799800
rownames_to_column("gene_id") %>% # Make the row names (the gene_ids) into a column named "gene_id"
800801
# dplyr::select(-group, -group_name) %>%
801802
pivot_longer( # Pivot the data across the rows and give each value its own new row
802803
cols = !gene_id, # Pivot all columns except the gene_id column
803804
names_to = "sample", # Name the samples column to be "sample"
804-
values_to = "log2_expression") %>% # Name the variance stabilized transformed counts column to "log2_expression"
805+
values_to = "log2_expression"
806+
) %>% # Name the variance stabilized transformed counts column to "log2_expression"
805807
right_join( # Subset the expression table and add a new column to the table
806808
y = top_n, # Subset by the values matching by gene_id from top_n and add the gene_name
807-
relationship = "many-to-many") %>% # Multiple rows in the norm_matrix can rows in top_n
808-
left_join( # Add data
809+
relationship = "many-to-many"
810+
) %>% # Multiple rows in the norm_matrix can rows in top_n
811+
left_join( # Add data
809812
y = coldata, # Add the metadata
810-
by = "sample") # Match it by the value in sample
813+
by = "sample"
814+
) # Match it by the value in sample
811815
812816
# Create a facet wrapped plot for the expression of the top n (defined at the start of the code block) most significant genes
813-
p1 <- ggplot(top_n_exp, # Use the data from top_n_exp to create this figure
814-
aes_string(x = column, # In each panel the x-axis will have the variable of interest
815-
y = "log2_expression")) + # In each panel the y-axis will be the variance stabilized transformed counts
817+
p1 <- ggplot(
818+
top_n_exp, # Use the data from top_n_exp to create this figure
819+
aes_string(
820+
x = column, # In each panel the x-axis will have the variable of interest
821+
y = "log2_expression"
822+
)
823+
) + # In each panel the y-axis will be the variance stabilized transformed counts
816824
geom_boxplot( # Each panel will display boxplots
817825
outlier.shape = NA, # Don't display outlier points
818826
linewidth = 0.5, # Set the line width of the box in the boxplot
819-
color = "grey") + # Color of the box outline in each panel as "grey"
827+
color = "grey"
828+
) + # Color of the box outline in each panel as "grey"
820829
geom_point() + # Overlap the data points on the boxplot
821830
facet_wrap(~gene_name) + # Make a different panel for each gene
822831
# facet_wrap(~gene_name, scales = "free") + # If you want the y-axis to be able to be different in each plot use this facet-wrap instead of the above line of code
823832
ggtitle(str_interp("Expression of Top ${n} DEGs")) + # Add a title
824-
theme(axis.text.x = element_text(angle = 90, # Rotate the text on the x-axis 90 degrees
825-
vjust = 0.5, # Center the text relative to the tick mark on the x-axis
826-
hjust = 1)) # Right align the text against the x-axis
827-
# Print the plot
833+
theme(axis.text.x = element_text(
834+
angle = 90, # Rotate the text on the x-axis 90 degrees
835+
vjust = 0.5, # Center the text relative to the tick mark on the x-axis
836+
hjust = 1
837+
)) # Right align the text against the x-axis
838+
# Print the plot
828839
print(p1)
829840
# Print two new lines
830841
cat("\n\n")
@@ -872,28 +883,31 @@ fa_gsea_list <- lapply(de_list, function(contrast) {
872883
x = ann.org, # Annotation database to search
873884
keys = gsea_input$gene_id, # The gene_ids to search for
874885
keytype = "ENSEMBL", # The gene_ids (keys) are ENSEMBL annotations
875-
columns = c("ENTREZID", "SYMBOL")) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol
876-
# Combine gene annotation with the lfc information
886+
columns = c("ENTREZID", "SYMBOL")
887+
) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol
888+
# Combine gene annotation with the lfc information
877889
input_entrezid <- inner_join(
878-
x = gsea_input, # Combine the ENSEMBL ID and lfc input with
879-
y = input_entrezid, # The ENTREZ ID and gene symbol information
880-
by = c("gene_id" = "ENSEMBL")) %>% # Join these table together by common ENSEMBL IDs
890+
x = gsea_input, # Combine the ENSEMBL ID and lfc input with
891+
y = input_entrezid, # The ENTREZ ID and gene symbol information
892+
by = c("gene_id" = "ENSEMBL")
893+
) %>% # Join these table together by common ENSEMBL IDs
881894
filter(!is.na(ENTREZID)) %>% # Remove the genes without an ENTREZ ID
882895
distinct(ENTREZID, .keep_all = TRUE) %>% # Retain the unique ENTREZ IDs and all columns of data
883896
arrange(desc(lfc)) # Arrange the data frame by descending values of lfc
884-
897+
885898
# Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb
886899
tb <- run_fgsea_v2(
887900
input = input_entrezid, # Query for the GSEA analysis
888-
all_in_life = all_in_life) # Annotation databases to query against
901+
all_in_life = all_in_life
902+
) # Annotation databases to query against
889903
tb %>% filter(padj < 0.05) # Filter the GSEA output for results with an adjust p-value less than 0.05 and return them
890904
})
891905
```
892906

893907
```{r print-gsea, results='asis', eval=run_FA}
894908
# NOTE: DT::datatables doesn't work with tabset and for loops
895909
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
896-
# Create an empty list to assign contrast values and DT data tables to
910+
# Create an empty list to assign contrast values and DT data tables to
897911
dt_list <- list()
898912
899913
# For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML table
@@ -910,7 +924,7 @@ for (contrast in names(de_list)) {
910924
}
911925
912926
# Render the dt_list into HTML
913-
tagList(dt_list)
927+
tagList(dt_list)
914928
```
915929

916930
# Pathway Analysis- Over-representation
@@ -922,11 +936,11 @@ Over-Representation Analysis (ORA) is a statistical method used to determine whe
922936
- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.
923937

924938
```{r run-ora, warning=F, message=F, eval=run_FA}
925-
# Go through each contrast in the de_list list object and
939+
# Go through each contrast in the de_list list object and
926940
fa_list <- lapply(de_list, function(contrast) {
927941
# For a given contrast pull out the full differential expression results
928942
res <- contrast[["all"]]
929-
943+
930944
# Extract the universe of genes to the ORA on and assign it to the object universe
931945
universe <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
932946
filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object
@@ -944,14 +958,15 @@ fa_list <- lapply(de_list, function(contrast) {
944958
# NOTE: Change to the correct species if not working in human or mouse
945959
input_entrezid <- rdata %>% # Start with the full annotations
946960
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (ajusted p-value less than 0.01, lfc greater than the absolute value of 0.3) and have an ENTREZ ID
947-
961+
948962
# AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
949-
963+
950964
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
951965
all <- run_fora_v2(
952966
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
953967
uni = universe_mapping, # Universe of gene to look for enrichment within
954-
all_in_life = all_in_life) # Annotation databases to query against
968+
all_in_life = all_in_life
969+
) # Annotation databases to query against
955970
956971
# Create the input vector of genes for the ORA with a positive lfc
957972
ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
@@ -961,12 +976,13 @@ fa_list <- lapply(de_list, function(contrast) {
961976
# NOTE: Change to the correct species if not working in human or mouse
962977
input_entrezid <- rdata %>% # Start with the full annotations
963978
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID
964-
979+
965980
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
966981
up <- run_fora_v2(
967982
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
968983
uni = universe_mapping, # Universe of gene to look for enrichment within
969-
all_in_life = all_in_life) # Annotation databases to query against
984+
all_in_life = all_in_life
985+
) # Annotation databases to query against
970986
971987
# Create the input vector of genes for the ORA with a negative lfc
972988
ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
@@ -976,19 +992,20 @@ fa_list <- lapply(de_list, function(contrast) {
976992
# NOTE: Change to the correct species if not working in human or mouse
977993
input_entrezid <- rdata %>% # Start with the full annotations
978994
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID
979-
995+
980996
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
981997
down <- run_fora_v2(
982998
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
983999
uni = universe_mapping, # Universe of gene to look for enrichment within
984-
all_in_life = all_in_life) # Annotation databases to query against
1000+
all_in_life = all_in_life
1001+
) # Annotation databases to query against
9851002
9861003
# Create a list to hold the ORA results for a given contrast
9871004
list(
9881005
all = all, # Assign the positive and negative lfc combined ORA results to "all"
9891006
up = up, # Assign the positive lfc combined ORA results to "up"
9901007
down = down # Assign the negative lfc combined ORA results to "down"
991-
)
1008+
)
9921009
})
9931010
```
9941011

@@ -998,14 +1015,14 @@ fa_list <- lapply(de_list, function(contrast) {
9981015
# NOTE DT::datatables doesn't work with tabset and for loops
9991016
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
10001017
1001-
# Create an empty list to assign contrast values and DT data tables to
1018+
# Create an empty list to assign contrast values and DT data tables to
10021019
dt_list <- list()
10031020
# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table
10041021
for (contrast in names(de_list)) {
10051022
# Create a dataframe to hold the significantly over-represented terms for genes with a positive and negative lfc
10061023
res_sig <- fa_list[[contrast]][["all"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative or positive lfc
10071024
filter(padj < 0.05) # Retain only the results with adjusted p-values of 0.05
1008-
1025+
10091026
# Populate the dt_list list
10101027
dt_list <- c(
10111028
dt_list, # With the contents of dt_list from the previous loop
@@ -1021,14 +1038,14 @@ tagList(dt_list)
10211038
## Down-regulated genes {.tabset}
10221039

10231040
```{r print-ora-down, results='asis', eval=run_FA}
1024-
# Create an empty list to assign contrast values and DT data tables to
1041+
# Create an empty list to assign contrast values and DT data tables to
10251042
dt_list <- list()
10261043
# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table
10271044
for (contrast in names(de_list)) {
10281045
# Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc
1029-
res_sig <- fa_list[[contrast]][["down"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative lfc
1046+
res_sig <- fa_list[[contrast]][["down"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative lfc
10301047
filter(padj < 0.05) # Retain only the results with adjusted p-values of 0.05
1031-
1048+
10321049
# Populate the dt_list list
10331050
dt_list <- c(
10341051
dt_list, # With the contents of dt_list from the previous loop
@@ -1044,14 +1061,14 @@ tagList(dt_list)
10441061
## Up-regulated genes {.tabset}
10451062

10461063
```{r print-ora-up, results='asis', eval=run_FA}
1047-
# Create an empty list to assign contrast values and DT data tables to
1064+
# Create an empty list to assign contrast values and DT data tables to
10481065
dt_list <- list()
10491066
# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table
10501067
for (contrast in names(de_list)) {
10511068
# Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc
10521069
res_sig <- fa_list[[contrast]][["up"]] %>% # Start with the complete list of terms used in the ORA evaluated for positive lfc
10531070
filter(padj < 0.05) # Retain only the results with adjusted p-values of 0.05
1054-
1071+
10551072
# Populate the dt_list list
10561073
dt_list <- c(
10571074
dt_list, # With the contents of dt_list from the previous loop
@@ -1081,7 +1098,7 @@ filename <- paste0(filenames)
10811098
10821099
# Assign the file path to save the file path for the expression data CSV to the object name_expression_fn
10831100
name_expression_fn <- file.path( # Construct a file path
1084-
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
1101+
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
10851102
str_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv"
10861103
)
10871104
@@ -1097,20 +1114,20 @@ write_csv(counts_norm, name_expression_fn)
10971114
for (contrast in names(contrasts)) {
10981115
# Create a file name for the differential expression analysis
10991116
name_deg_fn <- file.path( # Construct a file path
1100-
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
1117+
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
11011118
str_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv"
11021119
)
1103-
1120+
11041121
# Create a file name for the ORA analysis
11051122
name_pathways_fn <- file.path( # Construct a file path
1106-
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
1123+
basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
11071124
str_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv"
11081125
)
11091126
11101127
# Assign the full differential expression analysis for a given contrast to the object called res_for_writing
11111128
res_for_writing <- de_list[[contrast]][["all"]] %>% # Start with the full differential expression analysis dataframe for the given contrast
11121129
mutate(comparison = contrast) # Add a column to the table with the given contrast
1113-
1130+
11141131
# Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing
11151132
# NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code
11161133
pathways_for_writing <- fa_list[[contrast]][["all"]] %>% # Start with the full ORA dataframe evaluating negative and positive lfc for the given contrast

0 commit comments

Comments
 (0)