Skip to content

Commit

Permalink
SKCM High Weight Fix (#146)
Browse files Browse the repository at this point in the history
* fix skcm high weight genes

* rerun notebook and update high weight gene files

* rerun notebook and add pdfs
  • Loading branch information
gwaybio authored Jan 31, 2019
1 parent 85be2c9 commit 644f34a
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 148 deletions.
69 changes: 22 additions & 47 deletions extract_tybalt_weights.ipynb

Large diffs are not rendered by default.

Binary file modified figures/sex_node_gene_scatter.pdf
Binary file not shown.
Binary file modified figures/skcm_metastasis_node_gene_scatter.pdf
Binary file not shown.
76 changes: 17 additions & 59 deletions results/high_weight_genes_node53_skcm.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@ CMTM5 0.0643278881907 positive
MAG 0.0604131445289 positive
FAM181A 0.060397323221 positive
STK32A 0.058834053576 positive
SOX10 0.0579484254122 positive
ATP13A4 0.0573384203017 positive
KCNJ16 0.056596595794 positive
KCNJ10 0.0563892126083 positive
P2RY12 0.0562625303864 positive
WDR49 0.0561990886927 positive
LHFPL3 0.0559598356485 positive
OLIG1 0.0555346608162 positive
CNGA3 0.0547106526792 positive
ATP10B 0.054338093847 positive
TSHR 0.0542526952922 positive
RFX4 0.054226025939 positive
CNDP1 0.0533419512212 positive
NKX2-8 0.0517432540655 positive
HPCAL4 0.0515612959862 positive
BTBD17 0.0512298606336 positive
LPAR5 0.0503086894751 positive
FRMD1 -0.106690756977 negative
SMPX -0.10114300251 negative
HSD17B2 -0.0930502191186 negative
Expand Down Expand Up @@ -87,62 +104,3 @@ SLC6A19 -0.0670078843832 negative
ADH6 -0.0667067393661 negative
TH -0.06642293185 negative
EPN3 -0.0661694258451 negative
SCGB2A2 -0.0656955465674 negative
WIF1 -0.065556615591 negative
KIF12 -0.0652892589569 negative
VSNL1 -0.0648402050138 negative
ADH1A -0.0647200495005 negative
TCF21 -0.0642112269998 negative
RNF186 -0.0640726536512 negative
SIX3 -0.0639745742083 negative
NTF3 -0.0637269169092 negative
NPHS1 -0.063662096858 negative
UGT1A6 -0.0635573044419 negative
IL20 -0.063524864614 negative
SCGB1A1 -0.0631539821625 negative
APCS -0.0630542561412 negative
SERPINA4 -0.0629084184766 negative
ANKRD1 -0.0628286525607 negative
IGFL1 -0.0628137290478 negative
FGF19 -0.0628041774035 negative
SLCO1B3 -0.0625266209245 negative
HABP2 -0.062512204051 negative
C2CD4A -0.0624180994928 negative
PHGR1 -0.0622870624065 negative
TM4SF5 -0.0622657500207 negative
KIAA1239 -0.0622381865978 negative
CREB3L3 -0.0621040016413 negative
SLC17A2 -0.0620975159109 negative
CYP2B6 -0.0618478655815 negative
PITX1 -0.0616861470044 negative
TINAG -0.0614473000169 negative
UGT1A1 -0.0611726231873 negative
TMEM40 -0.0609937980771 negative
BMP3 -0.0607865080237 negative
PVALB -0.0606927722692 negative
CYP2C9 -0.0606540739536 negative
MS4A8B -0.0606052726507 negative
AKR1B10 -0.060557808727 negative
ADH1C -0.0603707879782 negative
CCBE1 -0.06029156968 negative
C6orf141 -0.0601504072547 negative
SERPINA10 -0.0600171349943 negative
C6orf222 -0.0600081756711 negative
SLC5A8 -0.0599093176425 negative
FABP6 -0.0597450882196 negative
C1orf230 -0.0595846250653 negative
MAL2 -0.0595811977983 negative
CDHR2 -0.0592592917383 negative
SH3RF2 -0.059135235846 negative
UGT3A2 -0.0589714162052 negative
NGB -0.0589020252228 negative
SERPINB5 -0.0588819533587 negative
MS4A15 -0.0588764734566 negative
KRT31 -0.0588437430561 negative
TYRP1 -0.0586223378778 negative
SLC18A2 -0.0585580952466 negative
C2orf54 -0.0584515072405 negative
KLK8 -0.0582778267562 negative
C11orf53 -0.0581982061267 negative
PRAP1 -0.0581519156694 negative
UGT1A3 -0.0581151507795 negative
55 changes: 17 additions & 38 deletions results/high_weight_genes_node66_skcm.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,23 @@ KCNC2 0.0608262717724 positive
NBLA00301 0.0608116649091 positive
C14orf180 0.0608115382493 positive
HAND2 0.0595848038793 positive
KLB 0.0592151097953 positive
TYR 0.0590780451894 positive
MAGEC2 0.058785457164 positive
ITIH2 0.0587661415339 positive
SMYD1 0.0582903809845 positive
SLC13A2 0.0577883422375 positive
ABCG5 0.0576085075736 positive
PGLYRP2 0.0573373138905 positive
C1orf173 0.0568920262158 positive
APOB 0.0568893961608 positive
AGXT2L1 0.0567338243127 positive
APOA1 0.0565741434693 positive
S100B 0.0560903698206 positive
SLC6A2 0.0559865087271 positive
TTPA 0.0553827807307 positive
PAX3 0.0553316250443 positive
GYG2 0.0552380494773 positive
SALL3 -0.11357973516 negative
VWC2 -0.10353513062 negative
FXYD4 -0.101971246302 negative
Expand Down Expand Up @@ -111,41 +128,3 @@ SPINK2 -0.064710393548 negative
SFTA2 -0.0644171386957 negative
RFX4 -0.0642841458321 negative
C12orf56 -0.0642215907574 negative
LOC554202 -0.0639718025923 negative
GGT8P -0.0639604181051 negative
KHDRBS2 -0.0638420805335 negative
PGC -0.0638191178441 negative
PRAC -0.0637577623129 negative
KCNH6 -0.0636623576283 negative
DPCR1 -0.0634637326002 negative
CDH10 -0.0633975788951 negative
CPNE6 -0.0633297264576 negative
PRSS1 -0.0632374659181 negative
CLEC18C -0.0632153898478 negative
SLC9A4 -0.0627805814147 negative
ILDR2 -0.0626278221607 negative
C6orf222 -0.062568962574 negative
C9orf71 -0.0625510141253 negative
FA2H -0.0625374689698 negative
CNTNAP5 -0.062008086592 negative
RPTN -0.0618103928864 negative
NKX2-2 -0.061545047909 negative
C12orf36 -0.0614939145744 negative
CPA6 -0.0614471361041 negative
PLA2G4F -0.0613784827292 negative
CDH16 -0.0613649114966 negative
PAX7 -0.0612799413502 negative
POU6F2 -0.0612063407898 negative
SLIT1 -0.0608963407576 negative
LINGO3 -0.0608698837459 negative
HMX1 -0.0608213655651 negative
BEST3 -0.0606641843915 negative
ATCAY -0.0606600120664 negative
CACNG7 -0.0605688877404 negative
C20orf85 -0.0604568943381 negative
IFNE -0.060180157423 negative
SSTR5 -0.0601705051959 negative
GALNT9 -0.060115210712 negative
ALPP -0.060073569417 negative
SOX21 -0.0599375888705 negative
PADI1 -0.059573918581 negative
25 changes: 21 additions & 4 deletions scripts/nbconverted/extract_tybalt_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

# In[1]:


import os
import pandas as pd
from keras.models import load_model
Expand All @@ -22,27 +23,31 @@

# In[2]:


sns.set(style='white', color_codes=True)
sns.set_context('paper', rc={'font.size':8, 'axes.titlesize':10, 'axes.labelsize':15})


# In[3]:

get_ipython().magic('matplotlib inline')

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-notebook')


# Because of the complex architecture involved in encoding the data, we will use the `decoded` weights to describe feature encoding specific activation patterns

# In[4]:


# Load the decoder model
decoder_model_file = os.path.join('models', 'decoder_onehidden_vae.hdf5')
decoder = load_model(decoder_model_file)


# In[5]:


# Load RNAseq file
rnaseq_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')
rnaseq_df = pd.read_table(rnaseq_file, index_col=0)
Expand All @@ -51,6 +56,7 @@

# In[6]:


# For a future pathway analysis, the background genes are important
# Also needed to set column names on weights
background_file = os.path.join('data', 'background_genes.txt')
Expand All @@ -62,6 +68,7 @@

# In[7]:


# Extract the weights from the decoder model
weights = []
for layer in decoder.layers:
Expand All @@ -74,6 +81,7 @@

# In[8]:


# Write the genes to file
weight_file = os.path.join('results', 'tybalt_gene_weights.tsv')
weight_layer_df.to_csv(weight_file, sep='\t')
Expand All @@ -87,6 +95,7 @@

# In[9]:


# We previously identified node 82 as robustly separating sex in the data set:
# Visualize the distribution of gene weights here
sex_node_plot = weight_layer_df.loc[[82, 85], :].T
Expand All @@ -108,6 +117,7 @@

# In[10]:


# There are 17 genes with high activation in node 82
# All genes are located on sex chromosomes
sex_node_plot.head(17)
Expand All @@ -119,6 +129,7 @@

# In[11]:


# We previously observed metastasis samples being robustly separated by two features
# Visualize the feature scores here
met_node_plot = weight_layer_df.loc[[53, 66], :].T
Expand All @@ -142,6 +153,7 @@

# In[12]:


def output_high_weight_genes(weight_df, encoding, filename, thresh=2.5):
"""
Function to process and output high weight genes given specific feature encodings
Expand All @@ -154,11 +166,14 @@ def output_high_weight_genes(weight_df, encoding, filename, thresh=2.5):
.sort_values(ascending=False).index)[encoding]
)

hw_pos_df = pd.DataFrame(encoding_df[encoding_df > encoding_df.std() * thresh])
hw_pos_cutoff = encoding_df.mean() + (encoding_df.std() * thresh)
hw_pos_df = pd.DataFrame(encoding_df[encoding_df > hw_pos_cutoff])
hw_pos_df = hw_pos_df.assign(direction='positive')
hw_neg_df = pd.DataFrame(encoding_df[encoding_df < -encoding_df.std() * thresh])
hw_neg_df = hw_neg_df.assign(direction='negative')

hw_neg_cutoff = encoding_df.mean() - (encoding_df.std() * thresh)
hw_neg_df = pd.DataFrame(encoding_df[encoding_df < hw_neg_cutoff])
hw_neg_df = hw_neg_df.assign(direction='negative')

hw_df = pd.concat([hw_pos_df, hw_neg_df])
hw_df.index.name = 'genes'
hw_df.to_csv(filename, sep='\t')
Expand All @@ -167,6 +182,7 @@ def output_high_weight_genes(weight_df, encoding, filename, thresh=2.5):

# In[13]:


# Encoding 66
hw_node66_file = os.path.join('results', 'high_weight_genes_node66_skcm.tsv')
node66_df = output_high_weight_genes(met_node_plot, 'encoding 66', hw_node66_file)
Expand All @@ -175,6 +191,7 @@ def output_high_weight_genes(weight_df, encoding, filename, thresh=2.5):

# In[14]:


# Encoding 53
hw_node53_file = os.path.join('results', 'high_weight_genes_node53_skcm.tsv')
node53_df = output_high_weight_genes(met_node_plot, 'encoding 53', hw_node53_file)
Expand Down

0 comments on commit 644f34a

Please sign in to comment.