To see the analysis used to create the datasets, open the notebook in the notebooks/ directory.
The dataset can be found on S3 here:
NOTE: It is highly recommended that you use these pre-built versions to save processing time and compute costs.
Apache Spark 2.4+
sbt clean assembly
Below are examples of running the pipeline on AWS EMR. These steps assume that the bootstrap action scripts in the scripts/ directory were run when the EMR cluster was started.
##### Create a parquet with both the wildtype and alternate sequences
spark-submit --class org.nationwidechildrens.igm.vienna.AlternatesGenerator ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human.all.rna.fna.gz s3://nch-igm-rna-stability/human_all_results_2019-02-22_0.parquet NM 5000
##### Create a parquet with just the wildtype sequences
[hadoop@ip-10-130-25-226 ~]$ spark-shell
scala> val df = spark.read.parquet("s3://nch-igm-rna-stability/human_all_results_2019-02-22_0.parquet")
df: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 5 more fields]
scala> df.count
res0: Long = 635443744
scala> val wildDF = df.where("isWildType = true")
wildDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [nm_id: string, transcript_position: int ... 5 more fields]
scala> wildDF.count
res1: Long = 158860936
scala> wildDF.write.parquet("s3://nch-igm-rna-stability/human_all_results_wildType_Only_2019-02-22_1.parquet")
##### Generate the wildtype fold data
spark-submit --class org.nationwidechildrens.igm.vienna.RNAFoldRunner ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results_wildType_Only_2019-02-22_1.parquet s3://nch-igm-rna-stability/human_all_results_wildType_Only_rnafold_2019-02-22_1.parquet /home/hadoop/vienna 20000
##### Scrubbing the alternate data set
##### Dropping empty score columns
##### Adding isWildType column
[hadoop@ip-10-130-25-226 ~]$ spark-shell
scala> val df = spark.read.parquet("s3://nch-igm-rna-stability/human_all_results_rnafold_2_4_11.parquet")
df: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 26 more fields]
scala> df.printSchema
|-- nm_id: string (nullable = true)
|-- transcript_position: integer (nullable = true)
|-- ref: string (nullable = true)
|-- alt: string (nullable = true)
|-- sequence: string (nullable = true)
|-- wildSequence: string (nullable = true)
|-- mfeValue: double (nullable = true)
|-- efeValue: double (nullable = true)
|-- meafeValue: double (nullable = true)
|-- meaValue: double (nullable = true)
|-- cfeValue: double (nullable = true)
|-- cdValue: double (nullable = true)
|-- freqMfeEnsemble: double (nullable = true)
|-- endValue: double (nullable = true)
|-- deltaMFE: double (nullable = true)
|-- deltaEFE: double (nullable = true)
|-- deltaMEAFE: double (nullable = true)
|-- deltaCFE: double (nullable = true)
|-- deltaEND: double (nullable = true)
|-- deltaCD: double (nullable = true)
|-- mfeed: integer (nullable = true)
|-- meaed: integer (nullable = true)
|-- efeed: double (nullable = true)
|-- cfeed: integer (nullable = true)
|-- mfeStructure: string (nullable = true)
|-- efeStructure: string (nullable = true)
|-- meafeStructure: string (nullable = true)
|-- cfeStructure: string (nullable = true)
scala> val df2 = df.drop("deltaMFE","deltaMEAFE","deltaEFE","deltaCFE","deltaCD","deltaEND","mfeed","meaed",
| "efeed","cfeed")
df2: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 16 more fields]
scala> df2.printSchema
|-- nm_id: string (nullable = true)
|-- transcript_position: integer (nullable = true)
|-- ref: string (nullable = true)
|-- alt: string (nullable = true)
|-- sequence: string (nullable = true)
|-- wildSequence: string (nullable = true)
|-- mfeValue: double (nullable = true)
|-- efeValue: double (nullable = true)
|-- meafeValue: double (nullable = true)
|-- meaValue: double (nullable = true)
|-- cfeValue: double (nullable = true)
|-- cdValue: double (nullable = true)
|-- freqMfeEnsemble: double (nullable = true)
|-- endValue: double (nullable = true)
|-- mfeStructure: string (nullable = true)
|-- efeStructure: string (nullable = true)
|-- meafeStructure: string (nullable = true)
|-- cfeStructure: string (nullable = true)
scala> val df3 = df2.withColumn("isWildType", lit(false))
df3: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 17 more fields]
scala> df3.printSchema
|-- nm_id: string (nullable = true)
|-- transcript_position: integer (nullable = true)
|-- ref: string (nullable = true)
|-- alt: string (nullable = true)
|-- sequence: string (nullable = true)
|-- wildSequence: string (nullable = true)
|-- mfeValue: double (nullable = true)
|-- efeValue: double (nullable = true)
|-- meafeValue: double (nullable = true)
|-- meaValue: double (nullable = true)
|-- cfeValue: double (nullable = true)
|-- cdValue: double (nullable = true)
|-- freqMfeEnsemble: double (nullable = true)
|-- endValue: double (nullable = true)
|-- mfeStructure: string (nullable = true)
|-- efeStructure: string (nullable = true)
|-- meafeStructure: string (nullable = true)
|-- cfeStructure: string (nullable = true)
|-- isWildType: boolean (nullable = false)
scala> df3.write.parquet("s3://nch-igm-rna-stability/human_all_results_rnafold_2019-02-22.parquet")
##### Combining the alternate and wildtype RNAfold data sets
[hadoop@ip-10-130-26-129 ~]$ spark-shell
scala> :paste
// Entering paste mode (ctrl-D to finish)
val altDF = spark.read.parquet("s3://nch-igm-rna-stability/human_all_results_rnafold_2019-02-22.parquet")
val wtDF = spark.read.parquet("s3://nch-igm-rna-stability/human_all_results_wildType_Only_rnafold_2019-02-22_1.parquet")
val wtRenamedDF = wtDF.withColumnRenamed("mfeValue", "wt_mfeValue")
.withColumnRenamed("efeValue", "wt_efeValue")
.withColumnRenamed("meafeValue", "wt_meafeValue")
.withColumnRenamed("meaValue", "wt_meaValue")
.withColumnRenamed("cfeValue", "wt_cfeValue")
.withColumnRenamed("cdValue", "wt_cdValue")
.withColumnRenamed("freqMfeEnsemble", "wt_freqMfeEnsemble")
.withColumnRenamed("endValue", "wt_endValue")
.withColumnRenamed("mfeStructure", "wt_mfeStructure")
.withColumnRenamed("efeStructure", "wt_efeStructure")
.withColumnRenamed("meafeStructure", "wt_meafeStructure")
.withColumnRenamed("cfeStructure", "wt_cfeStructure")
val joinedDF = altDF.join(wtRenamedDF, Seq("nm_id","transcript_position"))
// Exiting paste mode, now interpreting.
altDF: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 17 more fields]
wtDF: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 17 more fields]
wtRenamedDF: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 12 more fields]
joinedDF: org.apache.spark.sql.DataFrame = [nm_id: string, transcript_position: int ... 29 more fields]
scala> joinedDF.write.parquet("s3://nch-igm-rna-stability/human_all_results_rnafold_2019-02-24_1.parquet")
# Run RNADistance
spark-submit --conf spark.driver.maxResultSize=3g --class org.nationwidechildrens.igm.vienna.RNADistanceRunner ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results_rnafold_2019-02-24_1.parquet s3://nch-igm-rna-stability/human_all_results_rnadistance_2019-02-24_2.parquet /home/hadoop/vienna
# Run RNApdist
spark-submit --class org.nationwidechildrens.igm.vienna.RNAPDistRunner ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results.parquet s3://nch-igm-rna-stability/human_all_results_rnapdist_2019_02_25_1.parquet /home/hadoop/vienna 50000
# Join RNADistance parquet to RNAPdist data frame
spark-submit --class org.nationwidechildrens.igm.vienna.DataframeJoiner ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results_rnadistance_2019-02-24_2.parquet s3://nch-igm-rna-stability/human_all_results_rnapdist_2019_02_25_1.parquet s3://nch-igm-rna-stability/human_all_results_vienna_2019_02_26_0.parquet
# Map all transcripts to genomic positions
spark-submit --class org.nationwidechildrens.igm.vienna.GenomicPositionMapper ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results_vienna_2019_02_26_0.parquet s3://nch-igm-rna-stability/human_all_results_genomic_positioned_2019_02_26_0.parquet s3://nch-igm-rna-stability/GCF_000001405.33_knownrefseq_alignments.gff3 NM
# Liftover to HG19
spark-submit --class org.nationwidechildrens.igm.vienna.Liftover ./rna-stability-assembly-2.0.0-DEV.jar s3://nch-igm-rna-stability/human_all_results_genomic_positioned_2019_02_26_0.parquet s3://nch-igm-rna-stability/human_all_results_genomic_positioned_liftover_2019_02_26_0.parquet /home/hadoop/hg38ToHg19.over.chain /home/hadoop/hg19.fa