-
Notifications
You must be signed in to change notification settings - Fork 6
License
GeneralElectric/CellDistinguisher
Folders and files
Name | Name | Last commit message | Last commit date | |
---|---|---|---|---|
Repository files navigation
# README # CellDistinguisher # Tested/run on Linux # # Generic input # exprLinear: a matrix of linear expression values with probes sets (or genes) as rows and samples as columns. # genesymb: a vector of gene names associated with the probe sets or NULL if not available. # # To run, get the software and your data as described below. Start R at the command line, then copy-paste the commands. ############################# # Run CellDistinguisher using a local text file/matrix with linear gene expression data # Gene expression data from pooled normal tissues was obtained from the RNA-Seq Atlas (http://medicalgenomics.org/rna_seq_atlas). # Mixed samples were generated by combining the expression values of known genes from # five tissues: adipose, colon, heart, hypothalamus, kidney. # # Input provided to run CellDistinguisher: expression_values.mixed_samples.tsv # Two additional files are provided for comparing them to the results of the deconvolution # based on distinguishers: # 1. Known composition of the mixed samples: known_fractions.tsv # 2. Known expression signatures of the pure tissues: expression_signatures.pure_tissues.tsv ############################# R ## Get the software. You will need the gtools and Matrix packages. ## You may also want the CellMix and GEOquery packages. Note that if ## the following approach using remote access does not work for ## installing the CellDistinguisher package, you can download the ## package directly from ## https://github.com/GeneralElectric/CellDistinguisher/archive/master.tar.gz ## and then install it with any approach that uses local files. { if (!suppressWarnings(require("gtools", quietly=TRUE))) { install.packages("gtools") library("gtools") } if (!suppressWarnings(require("Matrix", quietly=TRUE))) { install.packages("Matrix") library("Matrix") } if (!suppressWarnings(require("CellDistinguisher", quietly=TRUE))) { if (!suppressWarnings(require("devtools", quietly=TRUE))) { install.packages("devtools") library("devtools") } install_github("GeneralElectric/CellDistinguisher") library("CellDistinguisher") } } # Read the data that contains gene symbols instead of probe ID-s in first columns inDataFile <- system.file("extdata", "expression_values.mixed_samples.tsv", package = "CellDistinguisher") inData <- as.matrix(read.table(inDataFile, sep="\t", header = TRUE, row.names = 1)) # Remove lines with all 0 elements # Expression data is often stored as log values. In such cases, make sure it is transformed back to linear. exprLinear <- inData[ rowSums(inData==0) != ncol(inData), ] # Compute the distinguishers. With the below call we are asking for 1-100 distinguishers for each of 5 cell classes. # We are filtering out probe sets that have expression values above 99.9 percentile. # We are filtering out probe sets for which the second largest expression value among the heterogeneous samples # is less than or equal to 0.333 of the first largest. distinguishers <- gecd_CellDistinguisher(exprLinear, genesymb=NULL, numCellClasses=5, minDistinguisherAlternatives=1, maxDistinguisherAlternatives=100, minAlternativesLengthsNormalized=0.5, expressionQuantileForFilter=0.999, expressionConcentrationRatio=0.333, verbose=0) # Run deconvolution using these distinguishers deconvolution <- gecd_DeconvolutionByDistinguishers(exprLinear, distinguishers$bestDistinguishers, nonNegativeOnly = TRUE, convexSolution = TRUE, verbose = 0) # Alternatively, one of the deconvolution algorithms of CellMix (ssKL or ssFrobenius) can be used. Note that you must install the CellMix package before running these commands. library(CellMix) deconvolutionSSKL <- gecd_DeconvolutionCellMix(exprLinear, distinguishers$bestDistinguishers, method="ssKL", maxIter=5) # Get the sample composition: samples as columns, cell types as rows # For a large number of samples, it is easier to use the transposed form of the sample composition matrix sampleCompositions <- t(deconvolution$sampleCompositions) # Expression signatures of pure cell types/subtypes/activities celltypeSignatures <- deconvolution$cellSubclassSignatures # Output #1 - list of distinguisher genes for each cell class/type write.table(distinguishers$bestDistinguishers, file="distinguishers.tsv",sep="\t", col.names = F, row.names = F, quote=FALSE) # Output #2 - sample composition: the fraction of each of the 5 cell types in every sample write.table(sampleCompositions, file="sample_composition.tsv",sep="\t", col.names = T, row.names = T, quote=FALSE) # Output #3 - computed expression signatures of pure cell types write.table(round(celltypeSignatures,4),file="celltype_signatures.tsv",sep="\t", col.names = NA, row.names = T, quote=FALSE) ############################################################################### # Run CellDistinguisher using a GEO accession number to download the study data ############################################################################### ###################################################################### ### GSE19830 ### http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE19830 ### ### Shen-Orr SS, Tibshirani R, Khatri P, Bodian DL, Staedtler F, Perry ### NM, Hastie T, Sarwal MM, Davis MM, Butte AJ. Cell type-specific ### gene expression differences in complex tissues. Nat Methods. 2010 ### Apr;7(4):287-9. doi: 10.1038/nmeth.1439. Epub 2010 Mar 7. PubMed ### PMID: 20208531; PubMed Central PMCID: PMC3699332. ### ### Title: Expression data from pure/mixed brain, liver and lung to ### test feasability and sensitivity of statistical deconvolution ### ### Organism: Rattus norvegicus ### Experiment type: Expression profiling by array ### ### Tissue samples from the brain, liver and lung of a single rat were ### analyzed using expression arrays (Affymetrix) in triplicate. ### Homogenates of those three tissues were then mixed ### together at the cRNA level and the gene expression in each ### mixed sample was measured the same way. R ## Note that you must install the GEOquery package before running these commands. library(GEOquery) options(download.file.method='wget') options(download.file.method.GEOquery='wget') GSE19830GEO <- getGEO("GSE19830", GSEMatrix=FALSE) GSE19830Data <- gecd_DataLoader$GSEGeneric(GSE19830GEO, retrievedAs="log2", genesymbColname="Gene Symbol") ## Keep only first of several alternative gene symbols that are separated by "//" GSE19830Data[[1]]$genesymb <- sub("(.*?) //.*", "\\1", GSE19830Data[[1]]$genesymb) GSE19830ExprLinear <- GSE19830Data[[1]]$exprLinear GSE19830genesymb <- GSE19830Data[[1]]$genesymb GSE19830SampleInfo <- GSE19830Data[[1]]$sampleInfo GSE19830ProbesetInfo <- GSE19830Data[[1]]$probesetInfo save(GSE19830ExprLinear, GSE19830genesymb, GSE19830SampleInfo, GSE19830ProbesetInfo, file="GSE19830.RData") # Next time just load the RData file load("GSE19830.RData") exprLinear = GSE19830ExprLinear exprLinearClasses = NULL sampleInfo = GSE19830SampleInfo probesetInfo = GSE19830ProbesetInfo genesymb = GSE19830genesymb MyDistinguishers <- gecd_CellDistinguisher(exprLinear[,-(1:9)], genesymb=genesymb, numCellClasses=3, probesWithGenesOnly=TRUE, minDistinguisherAlternatives=1, maxDistinguisherAlternatives=100, minAlternativesLengthsNormalized=0.5, expressionQuantileForFilter=0.999, expressionConcentrationRatio=0.333, verbose=0) head(MyDistinguishers$bestDistinguishersGeneNames) # Output #1 - list of distinguisher genes for each cell class/type write.table(MyDistinguishers$bestDistinguishersGeneNames, file="distinguishers.tsv",sep="\t", col.names = F, row.names = F, quote=FALSE) ## Perform deconvolution with default method MyDeconvolution <- gecd_DeconvolutionByDistinguishers(exprLinear, MyDistinguishers$bestDistinguishers, nonNegativeOnly=TRUE, convexSolution=TRUE) sampleCompositions <- MyDeconvolution$sampleCompositions ## Transpose the matrix to have the cell types as columns, samples as rows sampleCompositions <- t(sampleCompositions) ## Cell/tissue types are labeled with the top distinguisher gene colnames(sampleCompositions) <- MyDistinguishers$bestDistinguishersGeneNames[1,] # Output #2: Sample composition saved into a tsv file write.table(t(round(sampleCompositions, 3)), col.names= T, file="rat_tissue_sample_compositions_top50dist.txt", sep="\t", quote = F)
About
No description, website, or topics provided.
Resources
License
Stars
Watchers
Forks
Releases
No releases published
Packages 0
No packages published