-
Notifications
You must be signed in to change notification settings - Fork 0
/
depmap_expr.py
93 lines (79 loc) · 2.79 KB
/
depmap_expr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import xarray as xa
import numpy as np
from .common.defs import lazy_property
from .depmap.depmap import public_21q1 as release
from .common.dir import cached_property, Dir
from .helpers import config
from pathlib import Path
import zarr
import numcodecs as nc
import dask.array as daa
from .ncbi.sql import ncbi
# Import-time side effect: run the project configuration hook before the
# class below reads `config.cache`.
# NOTE(review): what `config.exec()` actually does is defined in `.helpers`,
# not in this file -- confirm before reordering or removing this call.
config.exec()
class Expr:
    """Lazy accessors for the expression table of the imported DepMap release.

    The raw table (``release.expr``) is treated as a frame whose first column
    holds the row identifiers and whose remaining column labels look like
    ``"<symbol> (<entrez id>)"``; that layout is what ``rows`` and ``cols``
    parse. The numeric part is persisted once as a compressed zarr array and
    then exposed as dask-backed xarray datasets (``mat2``, ``mat3``).

    NOTE(review): ``lazy_property`` and ``cached_property(type=Dir.pickle)``
    are project helpers defined outside this file; presumably the former
    memoises per instance and the latter pickles the result under
    ``self.storage`` -- confirm against ``.common``.
    """

    @property
    def release(self):
        """Return the DepMap release object this class reads from."""
        return release

    @lazy_property
    def storage(self):
        """Return the ``depmap/expr`` child of the configured cache directory."""
        return Dir(config.cache).child('depmap').child('expr')

    @lazy_property
    def _expr(self):
        """Return the raw expression table of the release."""
        # Go through the property (as `row_annot` does) so that every accessor
        # agrees on which release is used.
        return self.release.expr

    @lazy_property
    @cached_property(type=Dir.pickle)
    def rows(self):
        """Return the first column of the raw table as a Series named ``rows``."""
        return self._expr.iloc[:, 0].rename('rows')

    @lazy_property
    @cached_property(type=Dir.pickle)
    def cols(self):
        """Return a frame describing every data column of the raw table.

        Columns of the result:
            cols:   the original column label.
            symbol: the label with everything from the first space removed.
            entrez: the integer left after stripping everything up to the
                    last ``(`` and the trailing ``)``.

        Raises:
            ValueError: if a label does not reduce to an integer (raised by
                ``astype(int)``).
        """
        labels = pd.Series(self._expr.columns[1:])
        cols = labels.to_frame('cols')
        cols['symbol'] = cols['cols'].str.replace(r' .*$', '', regex=True)
        # Raw string: the pattern contains backslash-escaped parentheses, which
        # are invalid escape sequences in a normal string literal.
        cols['entrez'] = (
            cols['cols'].str.replace(r'^.*\(|\)$', '', regex=True).astype(int)
        )
        return cols

    @lazy_property
    @cached_property(type=Dir.pickle)
    def col_map_location(self):
        """Return the distinct ``(cols, map_location)`` pairs for known genes.

        The ``map_location`` query result for ``homo_sapiens`` is inner-joined
        to ``self.cols`` on ``entrez``, so columns without a match are dropped
        and a column with several matches yields several rows.
        """
        locations = ncbi.query(ncbi.sql['map_location'], 'homo_sapiens')
        joined = self.cols.set_index('entrez').join(
            locations.set_index('entrez'), how='inner'
        )
        pairs = joined.reset_index(drop=True)[['cols', 'map_location']]
        return pairs.drop_duplicates()

    @property
    def row_annot(self):
        """Return the release sample table with ``DepMap_ID`` renamed to ``rows``."""
        return self.release.samples.rename(columns={'DepMap_ID': 'rows'})

    @lazy_property
    def mat1(self):
        """Return the expression values as a read-only zarr array.

        On first use the data columns of the raw table are converted to
        ``float16`` and written to ``<storage>/mat.zarr`` in 1000 x 1000 chunks
        compressed with Blosc/zstd (level 3). Later calls only reopen the
        store.
        """
        path = Path(self.storage.path) / 'mat.zarr'
        # NOTE(review): the store is written in place, so an interrupted first
        # run leaves a partial `mat.zarr` that later runs will reuse as-is.
        if not path.exists():
            mat = np.array(self._expr.iloc[:, 1:]).astype('float16')
            z = zarr.open(
                str(path), mode='w',
                shape=mat.shape, dtype=mat.dtype,
                chunks=(1000, 1000),
                compressor=nc.Blosc(cname='zstd', clevel=3)
            )
            z[:, :] = mat
        return zarr.open(str(path), mode='r')

    @lazy_property
    def mat2(self):
        """Return an xarray Dataset wrapping ``mat1`` with labelled axes.

        The dataset has a ``rows`` coordinate taken from ``self.rows``, the
        ``cols``-indexed variables from ``self.cols`` (``symbol``, ``entrez``)
        and a ``data`` variable of dims ``(rows, cols)`` backed by a dask array
        created from the zarr store.
        """
        data = xa.Dataset()
        data['rows'] = ('rows', self.rows)
        data = data.merge(self.cols.set_index('cols').to_xarray())
        data['data'] = (('rows', 'cols'), daa.from_zarr(self.mat1))
        return data

    @lazy_property
    def mat3(self):
        """Return ``mat2`` annotated and filtered to complete, expressed columns.

        Steps, in order:
            1. inner-merge the row annotations (``row_annot``) on ``rows``;
            2. inner-merge ``col_map_location`` on ``cols``;
            3. keep only columns without any NaN;
            4. rechunk ``data`` to full-height blocks of 1000 columns;
            5. keep only columns whose mean over rows is greater than 1.5.
        """
        mat = self.mat2.copy()
        mat = mat.merge(self.row_annot.set_index('rows'), join='inner')
        mat = mat.merge(self.col_map_location.set_index('cols'), join='inner')

        nan_per_col = np.isnan(mat['data']).sum(axis=0)
        mat = mat.sel(cols=nan_per_col == 0)

        # The chunk spec must be a single tuple: a second positional argument
        # to dask's `rechunk` is `threshold`, not the chunk size of axis 1.
        # Full-height chunks keep the per-column reduction below chunk-local.
        rechunked = mat['data'].data.rechunk((-1, 1000))
        mat['data'] = (('rows', 'cols'), rechunked)

        mat = mat.sel(cols=mat['data'].mean(axis=0) > 1.5)
        return mat
# Module-level instance for importers of this module. `Expr` defines no
# `__init__`, so constructing it here does no work by itself; data is only
# touched when one of its properties is accessed.
expr = Expr()