diff --git a/.gitignore b/.gitignore
index 77a35eb..9b2db68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@
 __pycache__
 /notebooks/.ipynb_checkpoints
 _tmp*
+
+#dist/
+#build/
+#*.egg-info
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..66b6e30
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include ./sample_data/subsampled_B-L0-0.2.h5ad.rar
+include ./sample_data/hvg_frequencies.csv
+include ./sample_data/lineage_colors.csv
+include README.md
\ No newline at end of file
diff --git a/build/lib/swnn/__init__.py b/build/lib/swnn/__init__.py
deleted file mode 100644
index 2106ad2..0000000
--- a/build/lib/swnn/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-Construction of a developmental tree from single-cell RNA-seq data using
-StagewiseNN
-"""
-
-from .utils import *
-from .multipartite_graph import *
-from .graph2tree import (
-    adaptive_tree,
-    max_connection,
-    make_children_dict,
-    find_children,
-)
-from .builder import Builder
-
-
diff --git a/build/lib/swnn/builder.py b/build/lib/swnn/builder.py
deleted file mode 100644
index af1ca55..0000000
--- a/build/lib/swnn/builder.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-@CreateDate: 2021/07/25
-@Author: Xingyan Liu
-@File: builder.py
-@Project: stagewiseNN
-"""
-import os
-import sys
-from pathlib import Path
-from typing import Sequence, Mapping, Optional, Union, Callable
-import logging
-import pandas as pd
-import numpy as np
-from scipy import sparse
-import scanpy as sc
-
-from .utils import quick_preprocess_raw, make_binary
-from .multipartite_graph import stagewise_knn
-from .graph2tree import max_connection, adaptive_tree
-
-
-class BuilderParams:
-
-    def __init__(self, **kwargs):
-
-        self._dict = {}
-        self.update(**kwargs)
-
-    def update(self, **kwargs):
-        self._dict.update(**kwargs)
-        return self
-
-    @property
-    def keys(self):
-        return self._dict.keys()
-
-    def __getattr__(self, key):
-        return self._dict[key]
-
-
-class Builder(object):
-
-    def __init__(
-            self,
-            stage_order: Sequence,
-            **build_params
-    ):
-        """
-        Parameters
-        ----------
-        stage_order: Sequence
-            the order of stages
-        """
-        self.stage_order = stage_order
-        self._params = BuilderParams(**build_params)
-        self._distmat = None
-        self._connect = None
-        self._stage_lbs = None
-        self._group_lbs = None
-        self._edgedf = None
-        self._refined_group_lbs = None
-
-    @property
-    def stage_lbs(self):
-        return self._stage_lbs
-
-    @property
-    def group_lbs(self):
-        return self._group_lbs
-
-    # @group_lbs.setter
-    # def group_lbs(self, group_lbs):
-    #     pass
-
-    @property
-    def distmat(self):
-        return self._distmat
-
-    @property
-    def connect(self):
-        return self._connect
-
-    @property
-    def connect_bin(self):
-        """ binarized edges """
-        if self._connect is not None:
-            return make_binary(self._connect)
-        return None
-
-    @property
-    def edgedf(self):
-        return self._edgedf
-
-    @property
-    def refined_group_lbs(self):
-        return self._refined_group_lbs
-
-    def build_graph(
-            self,
-            X, stage_lbs,
-            binary_edge: bool = True,
-            ks: Union[Sequence[int], int] = 10,
-            n_pcs: Union[Sequence[int], int] = 10,
-            pca_base_on: Optional[str] = 'stacked',
-            leaf_size: int = 5,
-            **kwargs
-    ):
-        """
-        Build the multipartite KNN-graph stage-by-stage.
-
-        Parameters
-        ----------
-        X: np.ndarray or sparse matrix
-            data matrix, of shape (n_samples, n_features)
-        stage_lbs: Sequence
-            stage labels for each sample (nodes in `build_graph`)
-        binary_edge: bool (default=True)
-            whether to use the binarized edges. Setting it to True may cause
-            some information loss, but gives a more robust result.
-        ks:
-            the number of nearest neighbors to be calculated.
-        n_pcs:
-            the number of principal components after PCA reduction.
-            If `pca_base_on` is None, this will be ignored.
-        pca_base_on: str {'x1', 'x2', 'stacked', None} (default='stacked')
-            if None, perform KNN on the original data space.
-        leaf_size: int (default=5)
-            leaf size passed to BallTree or KDTree, for adjusting the
-            approximation level. Higher values are faster, but less likely
-            to find the exact nearest neighbors. Set it to 1 for
-            brute-force (exact) KNN.
-        kwargs:
-            other parameters for `stagewise_knn`
-
-        Returns
-        -------
-        distmat: sparse.csr_matrix
-            the distance matrix, of shape (n_samples, n_samples)
-        connect: sparse.csr_matrix
-            the connectivities matrix, of shape (n_samples, n_samples)
-        """
-        self._stage_lbs = stage_lbs
-        distmat, connect = stagewise_knn(
-            X, self.stage_lbs,
-            stage_order=self.stage_order,
-            k=ks,
-            leaf_size=leaf_size,  # 1 for brute-force KNN
-            pca_base_on=pca_base_on,
-            n_pcs=n_pcs,
-            binary_edge=False,
-            **kwargs
-        )
-        self._distmat = distmat
-        self._connect = connect
-        if binary_edge:
-            connect = self.connect_bin
-
-        # record parameters
-        self._params.update(
-            binary_edge=binary_edge,
-            ks=ks,
-            n_pcs=n_pcs,
-            pca_base_on=pca_base_on,
-            leaf_size=leaf_size,
-        )
-
-        return distmat, connect
-
-    def build_tree(
-            self,
-            group_lbs: Sequence,
-            stage_lbs: Optional[Sequence] = None,
-            ignore_pa=(),
-            ext_sep: str = '_',
-    ):
-        """
-        Adaptively build the developmental tree from the stagewise-KNN graph.
-
-        Parameters
-        ----------
-        group_lbs: Sequence
-            group labels for each sample (nodes in `build_graph`)
-        stage_lbs: Sequence
-            stage labels for each sample (nodes in `build_graph`)
-        ignore_pa: list or set
-            parent nodes to be ignored; empty tuple by default.
-        ext_sep: str
-            separator string for automatically extracting the stage labels
-            from `group_lbs`
-
-        Returns
-        -------
-        edgedf: pd.DataFrame
-            pd.DataFrame of columns {'node', 'parent', 'prop'},
-            with the same number of rows as the total number of
-            stage-clusters. The column 'prop' is the proportion of nodes
-            that voted for the current parent.
-        refined_group_lbs:
-            refined group labels for each sample (e.g. single-cell)
-        """
-        # a connection matrix NOT computed by StagewiseNN may give
-        # unexpected results with `sparse.triu()`.
-        # TODO: define `take_cross_stage_edges(spmatrix)`
-        conn_upper = sparse.triu(self.connect)
-        adj_max = max_connection(conn_upper)
-
-        self._group_lbs = group_lbs
-        if self.stage_lbs is None:
-            self._stage_lbs = stage_lbs
-
-        edgedf, refined_group_lbs = adaptive_tree(
-            adj_max, self.group_lbs,
-            stage_lbs=self.stage_lbs,
-            stage_ord=self.stage_order,
-            ignore_pa=ignore_pa,
-            ext_sep=ext_sep,
-        )
-
-        self._edgedf = edgedf
-        self._refined_group_lbs = refined_group_lbs
-
-        # record parameters
-        self._params.update(
-            ignore_pa=ignore_pa,
-            ext_sep=ext_sep,
-        )
-
-        return edgedf, refined_group_lbs
-
-
-def __test__():
-    pass
-
-
-if __name__ == '__main__':
-    import time
-
-    logging.basicConfig(
-        level=logging.DEBUG,
-        format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): '
-               '%(levelname)s\n%(message)s'
-    )
-    t = time.time()
-    __test__()
-    print('Done running file: {}\nTime: {}'.format(
-        os.path.abspath(__file__), time.time() - t,
-    ))
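[Editor's note] `Builder` above is the public entry point of the deleted build copy (the same class ships in `swnn/builder.py`). A minimal end-to-end sketch on tiny synthetic data — all sizes, labels and the random data here are made up for illustration; only `Builder`, `build_graph` and `build_tree` come from the code above:

```python
# Hypothetical run of the Builder API on synthetic data.
import numpy as np
import swnn

rng = np.random.default_rng(0)
stage_order = ['s0', 's1', 's2']
stage_labels = np.repeat(stage_order, 100)        # 100 cells per stage
# group labels follow the '<stage>_<cluster>' convention used by build_tree
group_labels = np.array([f'{s}_{i % 2}' for i, s in enumerate(stage_labels)])
data_matrix = rng.normal(size=(300, 50))          # cells x features

builder = swnn.Builder(stage_order=stage_order)
distmat, connect = builder.build_graph(
    X=data_matrix, stage_lbs=stage_labels, ks=5, n_pcs=10,
)
edgedf, refined = builder.build_tree(group_labels, stage_labels)
print(edgedf[['node', 'parent', 'prop']].head())  # one row per stage-cluster
```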
diff --git a/build/lib/swnn/graph2tree.py b/build/lib/swnn/graph2tree.py
deleted file mode 100644
index 6340c81..0000000
--- a/build/lib/swnn/graph2tree.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# -*- coding: UTF-8 -*-
-# @CreateDate: 2020/07/18
-# @Author: Xingyan Liu
-# @File: graph2tree.py
-# @Project: stagewiseNN
-
-import os
-import sys
-from pathlib import Path
-from typing import Sequence, List, Mapping, Optional, Union, Callable, Any
-import logging
-import pandas as pd
-import numpy as np
-from scipy import sparse
-from sklearn.preprocessing import label_binarize
-from .utils import label_binarize_each
-
-
-def adaptive_tree(adj_max: sparse.spmatrix,
-                  group_lbs: Sequence,
-                  stage_lbs: Optional[Sequence] = None,
-                  stage_ord: Optional[Sequence] = None,
-                  ignore_pa: Union[List, set] = (),
-                  ext_sep: str = '_',
-                  ):
-    """
-    Adaptively build the developmental tree from the stagewise-KNN graph.
-
-    Parameters
-    ----------
-    adj_max:
-        sparse.csc_matrix, shape = (n_points, n_points)
-        adjacency matrix of single points (cells)
-    group_lbs:
-        np.array, shape = (n_points,)
-        group labels specifying each cluster in each stage.
-        e.g. 'stage1_1' specifies cluster 1 in stage1.
-    stage_lbs:
-        np.array, shape = (n_points,)
-        stage labels, preferably assigned explicitly. If None, they will be
-        extracted from `group_lbs`, which may cause unexpected results.
-    stage_ord:
-        np.array, shape = (n_stages,)
-        order of stages, preferably provided by the user; if None, it will
-        be decided automatically.
-    ignore_pa: list or set
-        parent nodes to be ignored; empty tuple by default.
-    ext_sep: str
-        separator string for automatically extracting the stage labels from
-        `group_lbs`
-
-    Returns
-    -------
-    edgedf: pd.DataFrame
-        a DataFrame with each row representing an edge, with columns
-        ['node', 'parent', 'prop'], where 'prop' is the proportion of nodes
-        that voted for the current parent.
-    group_lbs:
-        refined group labels for each sample (e.g. single-cell)
-
-    Examples
-    --------
-    >>> edgedf, group_lbs = adaptive_tree(adj_max, group_lbs, stage_ord=stages)
-
-    """
-    group_lbs = np.asarray(group_lbs).copy()
-    if stage_lbs is None:
-        stage_lbs = _extract_field(group_lbs, sep=ext_sep, i=0)
-    if stage_ord is None:
-        print('stage orders are decided automatically:')
-        stage_ord = pd.unique(stage_lbs)
-        print(stage_ord)
-
-    stg_grp_dict = make_stage_group_dict(group_lbs, stage_lbs=stage_lbs)
-
-    adj_max = sparse.csc_matrix(adj_max)
-    edgedfs = []
-    for i, stg0 in enumerate(stage_ord[: -1]):
-
-        groups0 = stg_grp_dict[stg0]
-        if len(groups0) < 2:
-            continue
-
-        stg1 = stage_ord[i + 1]
-        inds0 = np.flatnonzero(stage_lbs == stg0)
-        inds1 = np.flatnonzero(stage_lbs == stg1)
-        adj_sub = adj_max[:, inds1][inds0, :]
-        group_lbs0 = group_lbs[inds0]
-        group_lbs1 = group_lbs[inds1]
-        # groups1 = stg_grp_dict[stg1]
-        print(f'connecting stage {stg0} and {stg1}')
-
-        edgedf_sub, new_group_lbs1, new_groups1 = connect_near_stages(
-            adj_sub,
-            group_lbs0, group_lbs1,
-            groups0=groups0, groups1=stg_grp_dict[stg1],
-            # stg0=stg0, stg1=stg1,
-            df2edges=True,
-            ignore_pa=ignore_pa,
-        )
-        group_lbs[inds1] = new_group_lbs1
-        stg_grp_dict[stg1] = new_groups1
-        edgedfs.append(edgedf_sub)
-        print()
-
-    edgedf = pd.concat(edgedfs, axis=0, ignore_index=True)
-
-    return edgedf, group_lbs  # , stg_grp_dict
-
-
-def connect_near_stages(adj_max,
-                        group_lbs0, group_lbs1,
-                        groups0=None, groups1=None,
-                        # stg0=None, stg1=None,
-                        ignore_pa: Sequence = (),
-                        df2edges: bool = True,
-                        sep: str = '_',
-                        ):
-    """
-    Parameters
-    ----------
-    adj_max:
-        csc_sparse matrix, adjacency matrix of samples (cells),
-        shape (n_samples, n_samples)
-    group_lbs0:
-        group labels of the parent stage
-    group_lbs1:
-        group labels of the descendant stage
-
-    Returns
-    -------
-    edgedf:
-        a DataFrame with each row representing an edge, with columns
-        ['node', 'parent', 'prop'], where 'prop' is the proportion of nodes
-        that voted for the current parent.
-    new_group_lbs1:
-        np.array, refined group labels
-    new_groups1:
-        unique group names from `new_group_lbs1`
-
-    """
-    adj_max = sparse.csc_matrix(adj_max)
-    groups0 = pd.unique(group_lbs0) if groups0 is None else groups0
-    groups1 = pd.unique(group_lbs1) if groups1 is None else groups1
-    # each column normalized to unit sum
-    # group_conn = agg_group_edges(adj_max, group_lbs0, group_lbs1,
-    #                              groups0=groups0, groups1=groups1,)
-    # print(group_conn)
-    voting_props = agg_group_edge_props(adj_max, group_lbs0, group_lbs1,
-                                        groups0=groups0, groups1=groups1,
-                                        axis=0)
-
-    winner_pas = voting_props.idxmax(axis=0)  # a series
-    single_pas = [x for x in groups0 if x not in winner_pas.values]
-    print('parent nodes that had no descendant:', single_pas)
-    for p in ignore_pa:
-        if p in single_pas:
-            print(f'ignore single parent node {p}')
-            single_pas.remove(p)
-
-    # modify the group labels in group1
-    is_strlbs = isinstance(group_lbs1[0], str)
-    if is_strlbs:
-        stg1 = groups1[0].split(sep)[0]
-        new_group_lbs1 = _extract_field(group_lbs1, sep=sep, i=-1).astype(int)
-    else:  # integer labels
-        new_group_lbs1 = group_lbs1.copy()
-    max_num = new_group_lbs1.max()
-
-    print('Taking descendant-points from other nodes (groups)')
-    # adj_max = max_connection(adj_max, axis=0)
-    for i, pa in enumerate(single_pas):
-        parent_ids = group_lbs0 == pa
-        sub_adj = adj_max[parent_ids, :]  # sns.heatmap(sub_adj.toarray())
-        rows, cols = sub_adj.nonzero()  # `cols` is the indices of child-nodes
-        new_name = max_num + 1 + i
-        new_group_lbs1[cols] = new_name
-
-    new_groups1 = np.unique(new_group_lbs1)
-    if is_strlbs:
-        print('pasting stage labels')
-        new_groups1 = [f'{stg1}{sep}{x}' for x in new_groups1]
-        new_group_lbs1 = np.array(
-            list(map(lambda x: f'{stg1}{sep}{x}', new_group_lbs1)))
-
-    # ========= new voting proportions ============
-    voting_props = agg_group_edge_props(adj_max, group_lbs0, new_group_lbs1,
-                                        groups0=groups0, groups1=new_groups1,
-                                        axis=0, verbose=True)
-
-    edgedf = max_connection(voting_props, axis=0, df2edges=df2edges)
-    return edgedf, new_group_lbs1, new_groups1
-
-
-def agg_group_edge_props(adj, group_lbs0, group_lbs1=None,
-                         groups0=None, groups1=None,
-                         # asdf = True,
-                         axis=0, verbose=True):
-    group_conn = agg_group_edges(adj, group_lbs0, group_lbs1=group_lbs1,
-                                 groups0=groups0, groups1=groups1, asdf=True,
-                                 verbose=verbose)
-    # normalize each column to unit sum
-    voting_props = group_conn.apply(lambda x: x / x.sum(), axis=axis)
-    return voting_props
-
-
-def agg_group_edges(adj, group_lbs0, group_lbs1=None,
-                    groups0=None, groups1=None, asdf=True, verbose=True):
-    """
-    Parameters
-    ----------
-    adj:
-        adjacency matrix of shape (N0, N1); if `group_lbs1` is None,
-        then N0 = N1.
-    group_lbs0:
-        a list or a np.array of shape (N0,)
-    group_lbs1:
-        a list or a np.array of shape (N1,)
-
-    Returns
-    -------
-    group_conn: summation of connected edges between given groups
-    """
-    # if sparse.issparse(adj):
-    adj = sparse.csc_matrix(adj)
-
-    groups0 = pd.unique(group_lbs0) if groups0 is None else groups0
-    lb1hot0 = label_binarize_each(group_lbs0, classes=groups0, sparse_out=True)
-    if group_lbs1 is None:
-        lb1hot1 = lb1hot0
-        groups1 = groups0
-    else:
-        groups1 = pd.unique(group_lbs1) if groups1 is None else groups1
-        lb1hot1 = label_binarize_each(group_lbs1, classes=groups1,
-                                      sparse_out=True)
-    if verbose:
-        print('---> aggregating edges...')
-        print('unique labels of rows:', groups0)
-        print('unique labels of columns:', groups1)
-        print('grouping elements (edges)')
-        print('shape of the one-hot-labels:', lb1hot0.shape, lb1hot1.shape)
-    group_conn = lb1hot0.T.dot(adj).dot(lb1hot1)
-    # print(group_conn.shape)
-    if asdf:
-        group_conn = pd.DataFrame(group_conn.toarray(),
-                                  index=groups0, columns=groups1)
-
-    return group_conn
-
-
-def max_connection(adj, axis=0, df2edges=False):
-    """
-    keep only the max element (connection) for each column (axis=0), and
-    remove the other elements (connections)
-    """
-    if isinstance(adj, pd.DataFrame):
-        def keep_max(x):
-            cut = x.max()
-            x[x < cut] = 0
-            return x
-
-        adj_max = adj.apply(keep_max, axis=axis)
-        if df2edges:
-            adj_max = _make_edgedf(adj_max, col_key='node', row_key='parent',
-                                   data_key='prop', )
-    else:
-        adj = sparse.csc_matrix(adj)
-        shape = adj.shape
-        vmaxs = adj.max(axis=0).A[0]  # .A[0] for csc matrix
-        idxmax = adj.argmax(axis=axis).A[0]  # .A[0] for csc matrix
-        adj_max = sparse.coo_matrix(
-            (vmaxs,
-             (idxmax, np.arange(shape[1]))),
-            shape=shape)
-    return adj_max
-
-
-def _make_edgedf(df, col_key='node', row_key='parent',
-                 data_key='prop', ):
-    coo_data = sparse.coo_matrix(df.values)
-    edgedf = pd.DataFrame({
-        col_key: df.columns.take(coo_data.col),
-        row_key: df.index.take(coo_data.row),
-        data_key: coo_data.data,
-    })
-
-    return edgedf
-
-
-def make_stage_group_dict(group_lbs, stage_lbs=None):
-    if stage_lbs is None:
-        stage_lbs = _extract_field(group_lbs, sep='_', i=0)
-
-    stages = pd.unique(stage_lbs)
-    group_lbs = np.asarray(group_lbs)
-    dct = {}
-    for stg in stages:
-        dct[stg] = pd.unique(group_lbs[stage_lbs == stg])
-    return dct
-
-
-def find_children(
-        nodes: Sequence,
-        children_dict: Mapping[Any, Sequence],
-        n: int = 100):
-    """
-    Parameters
-    ----------
-    nodes:
-        preferably a list of node(s) to be looked up
-    children_dict: dict
-        parent (key) -> children (value)
-    n:
-        max number of iterations
-    """
-    if isinstance(nodes, (int, str)):
-        nodes = [nodes]
-    has_children = [nd in children_dict.keys() for nd in nodes]
-    has_any_children = any(has_children)
-    if not has_any_children:
-        return []
-    if n < 1:
-        return []
-
-    children = []
-    for i, nd in enumerate(nodes):
-        if has_children[i]:
-            children += children_dict[nd]
-
-    n -= 1
-    children = children + find_children(children, children_dict, n=n)
-    # if None in children:
-    #     children.remove(None)
-    return children
-
-
-def make_children_dict(df_tree, column_pa='parent', column_ch='node'):
-    """ making a dict for looking up descendants
-    """
-    children_dict = df_tree.groupby(column_pa)[column_ch].apply(lambda x: list(x))
-    return children_dict.to_dict()
-
-
-def _extract_field(labels, sep='_', i=0):
-    return np.array(list(map(lambda x: x.split(sep)[i], labels)))
-
-
-def __test__():
-    pass
-
-
-if __name__ == '__main__':
-    import time
-
-    logging.basicConfig(
-        level=logging.DEBUG,
-        format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): '
-               '%(levelname)s\n%(message)s'
-    )
-    t = time.time()
-    __test__()
-    print('Done running file: {}\nTime: {}'.format(
-        os.path.abspath(__file__), time.time() - t,
-    ))
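[Editor's note] A minimal sketch of looking up descendants with `make_children_dict()` and `find_children()` from the module above, using a hand-made `edgedf` in place of real `adaptive_tree()` output (the labels and proportions are invented for illustration):

```python
import pandas as pd
from swnn import make_children_dict, find_children

edgedf = pd.DataFrame({
    'node':   ['s1_0', 's1_1', 's2_0', 's2_1', 's2_2'],
    'parent': ['s0_0', 's0_0', 's1_0', 's1_0', 's1_1'],
    'prop':   [0.9, 0.8, 0.7, 0.6, 0.95],
})
children = make_children_dict(edgedf)   # parent -> list of direct children
print(children['s0_0'])                 # ['s1_0', 's1_1']
print(find_children('s0_0', children))  # all descendants, found recursively
```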
diff --git a/build/lib/swnn/multipartite_graph.py b/build/lib/swnn/multipartite_graph.py
deleted file mode 100644
index 384de67..0000000
--- a/build/lib/swnn/multipartite_graph.py
+++ /dev/null
@@ -1,266 +0,0 @@
-# -*- coding: UTF-8 -*-
-# @CreateDate: 2020/07/18
-# @Author: Xingyan Liu
-# @File: multipartite_graph.py
-# @Project: stagewiseNN
-
-import os
-import sys
-from pathlib import Path
-from typing import Sequence, Mapping, Optional, Union, Callable
-import logging
-import pandas as pd
-import numpy as np
-from scipy import sparse
-from sklearn.preprocessing import normalize
-from sklearn.decomposition import PCA
-from sklearn.neighbors import NearestNeighbors, KDTree
-
-
-def stagewise_knn(X: np.ndarray,
-                  stage_lbs: Sequence,
-                  stage_order: Sequence,
-                  leaf_size: Optional[int] = 5,
-                  n_pcs: Union[Sequence[int], int] = 30,
-                  k: Union[Sequence[int], int] = 30,
-                  pca_base_on: Optional[str] = 'stack',
-                  binary_edge: bool = True,
-                  norm_dists=False,
-                  **kwargs
-                  ):
-    """
-    Build the multipartite KNN-graph stage-by-stage.
-
-    Parameters
-    ----------
-    X: np.ndarray or sparse matrix
-        data matrix, of shape (n_samples, n_features)
-    stage_lbs: Sequence
-        stage labels for each sample (nodes in `build_graph`)
-    stage_order: Sequence
-        stage order
-    binary_edge: bool (default=True)
-        whether to use the binarized edges. Setting it to True may cause
-        some information loss, but gives a more robust result.
-    k:
-        the number of nearest neighbors to be calculated.
-    n_pcs:
-        the number of principal components after PCA reduction.
-        If `pca_base_on` is None, this will be ignored.
-    pca_base_on: str {'x1', 'x2', 'stacked', None} (default='stacked')
-        if None, perform KNN on the original data space.
-    leaf_size: int (default=5)
-        leaf size passed to BallTree or KDTree, for adjusting the
-        approximation level. Higher values are faster, but less likely
-        to find the exact nearest neighbors. Set it to 1 for
-        brute-force (exact) KNN.
-    norm_dists: bool
-        whether to normalize the distances for each pair of adjacent stages.
-
-    Returns
-    -------
-    distmat: sparse.csr_matrix
-        the distance matrix, of shape (n_samples, n_samples)
-    connect: sparse.csr_matrix
-        the connectivities matrix, of shape (n_samples, n_samples)
-
-    """
-    # setting parameters
-    n_pcs = [n_pcs] * len(stage_order) if isinstance(n_pcs, int) else n_pcs
-    k = [k] * len(stage_order) if isinstance(k, int) else k
-
-    stage_lbs = np.asarray(stage_lbs)
-    N = X.shape[0]
-    Inds = np.arange(N)
-
-    iis = []
-    dds = []
-    jjs = []
-    conns = []
-    for i, stg1 in enumerate(stage_order):
-        if i == 0:
-            # stg0 = stg1
-            inds_earlier = inds_later = stage_lbs == stg1
-            X_earlier = X[inds_earlier, :]
-            X_later = None
-        else:
-            stg0 = stage_order[i - 1]
-            inds_earlier = stage_lbs == stg0
-            inds_later = stage_lbs == stg1
-            X_earlier = X[inds_earlier, :]
-            X_later = X[inds_later, :]
-
-        if pca_base_on is not None:
-            X_earlier, X_later = pca_transform(
-                X_earlier, X_later,
-                n_pcs=n_pcs[i], base_on=pca_base_on
-            )
-
-        dist, inds0 = approx_knn(X_earlier, X_later, k=k[i],
-                                 leaf_size=leaf_size, **kwargs)
-        inds = np.take(Inds[inds_earlier], inds0)
-        conn = _dist_to_connection(dist, norm=norm_dists, binarize=binary_edge)
-
-        iis.append(np.repeat(Inds[inds_later], k[i]))
-        jjs.append(inds.flatten())
-        dds.append(dist.flatten())
-        conns.append(conn.flatten())
-
-    logging.info('Done with KNN search; constructing the resulting distances '
-                 'and connections')
-    iis, jjs, dds, conns = tuple(map(
-        lambda x: np.concatenate(x), (iis, jjs, dds, conns)
-    ))
-
-    distmat = sparse.coo_matrix((dds, (iis, jjs)), shape=(N, N))
-    distmat += distmat.T
-    connect = sparse.coo_matrix((conns, (iis, jjs)), shape=(N, N))
-    connect += connect.T
-    logging.info(f'distance matrix of shape {distmat.shape} and '
-                 f'{distmat.nnz} non-zeros')
-    logging.info(f'connection matrix of shape {connect.shape} and '
-                 f'{connect.nnz} non-zeros')
-    return distmat, connect
-
-
-def _dist_to_connection(
-        dist: np.ndarray,
-        norm: bool = True,
-        binarize: bool = False,
-        sigma: float = 1.,
-):
-    """
-    dist: shape=(n_samples, n_neighbors)
-    """
-    if binarize:
-        return np.ones_like(dist, dtype=float)
-    else:
-        if norm:
-            conn = normalize(
-                _connect_heat_kernel(_normalize_dists(dist), sigma),
-                'l1', axis=1,
-            ) * np.sqrt(dist.shape[1])
-        else:
-            conn = normalize(
-                _connect_heat_kernel(dist, sigma), 'l1', axis=1
-            ) * np.sqrt(dist.shape[1])
-        return conn
-
-
-def _normalize_dists(d):
-    """
-    d: 2-D np.array, normalization will be performed on each row
-    """
-    return np.array(list(map(_normalize_dists_single, d)))
-
-
-def _normalize_dists_single(d):
-    """
-    d: 1-D np.array
-    """
-    d1 = d[d.nonzero()]
-    vmin = d1.min() * 0.99
-    # vmax = d1.max() * 0.99
-    med = np.median(d1)
-    d2 = (d - vmin) / (med - vmin)
-    d2[d2 < 0] = 0
-    return d2
-
-
-def _connect_heat_kernel(d, sigma):
-    return np.exp(-np.square(d / sigma))
-
-
-def pca_transform(
-        X1: np.ndarray,
-        X2: Optional[np.ndarray] = None,
-        n_pcs: int = 50,
-        base_on: str = 'stacked',
-        **kwargs
-):
-    """
-    base_on: {'x1', 'x2', 'stacked'}
-    """
-    pca = PCA(n_components=n_pcs, **kwargs)
-    if X2 is None:
-        logging.debug(f'base_on=X1, and X2=None')
-        return pca.fit_transform(X1), None
-    if base_on.lower() == 'x1':
-        logging.debug(f'PCA base_on: {base_on}')
-        X1 = pca.fit_transform(X1)
-        X2 = pca.transform(X2)
-    elif base_on.lower() == 'x2':
-        logging.debug(f'PCA base_on: {base_on}')
-        X2 = pca.fit_transform(X2)
-        X1 = pca.transform(X1)
-    else:
-        logging.debug(f'PCA on the stacked data matrix (base_on={base_on})')
-        n1 = X1.shape[0]
-        X_pca = pca.fit_transform(np.vstack([X1, X2]))
-        X1 = X_pca[: n1, :]
-        X2 = X_pca[n1:, :]
-    return X1, X2
-
-
-def approx_knn(
-        X_ref: np.ndarray,
-        X_que: Optional[np.ndarray] = None,
-        metric='cosine',
-        k: int = 5,
-        precis: float = 0.1,
-        leaf_size: Optional[int] = 5,
-        leaf_size_max: int = 30,
-        algorithm='kd_tree',
-        metric_params=None,
-        **kwargs
-):
-    """
-    Parameters
-    ----------
-    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}
-        default='auto'
-    leaf_size : int, default=30
-        Leaf size passed to BallTree or KDTree.
-    """
-    if leaf_size is None:
-        n = X_ref.shape[0]
-        leaf_size = min([int(np.sqrt(n * n) * precis), leaf_size_max])
-        leaf_size = max([leaf_size, 1])
-    elif leaf_size <= 1:
-        algorithm = 'brute'
-
-    if metric == 'cosine':  # mimic the cosine metric via row normalization
-        X_ref = normalize(X_ref, axis=1)
-        if X_que is not None:
-            X_que = normalize(X_que, axis=1)
-        metric = 'minkowski'
-    indexer = NearestNeighbors(
-        n_neighbors=k,
-        algorithm=algorithm,
-        leaf_size=leaf_size,
-        metric=metric, metric_params=metric_params,
-        **kwargs
-    ).fit(X_ref)
-    dist, inds = indexer.kneighbors(X_que, return_distance=True)
-    # indexer = KDTree(X_ref, leaf_size=leaf_size, metric=metric)
-    # dist, inds = indexer.query(X_que, k=k)
-    return dist, inds
-
-
-def __test__():
-    pass
-
-
-if __name__ == '__main__':
-    import time
-
-    logging.basicConfig(
-        level=logging.DEBUG,
-        format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): '
-               '%(levelname)s\n%(message)s'
-    )
-    t = time.time()
-    __test__()
-    print('Done running file: {}\nTime: {}'.format(
-        os.path.abspath(__file__), time.time() - t,
-    ))
diff --git a/build/lib/swnn/utils/__init__.py b/build/lib/swnn/utils/__init__.py
deleted file mode 100644
index b876a4f..0000000
--- a/build/lib/swnn/utils/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-@CreateDate: 2021/07/18
-@Author: Xingyan Liu
-@File: __init__.py
-@Project: stagewiseNN
-"""
-
-from .plot import *
-from .process import *
-
-
-
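[Editor's note] A self-contained sketch of the distance-to-connectivity transform implemented by `_dist_to_connection()` above (the `norm=False` branch): a heat kernel exp(-(d/sigma)^2), l1-normalized per row, then rescaled by sqrt(k). Written out in plain numpy so it can be run without the package:

```python
import numpy as np

def dist_to_conn(dist, sigma=1.0):
    conn = np.exp(-np.square(dist / sigma))   # heat kernel on distances
    conn /= conn.sum(axis=1, keepdims=True)   # l1-normalize each row
    return conn * np.sqrt(dist.shape[1])      # rescale by sqrt(k)

d = np.array([[0.1, 0.5, 0.9]])               # distances to k=3 neighbors
print(dist_to_conn(d))  # closer neighbors receive larger connection weights
```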
- """ - isdf = False - if isinstance(X, pd.DataFrame): - isdf = True - index, columns, X = X.index, X.columns, X.values - X = X.astype(np.float).copy() - labels = np.asarray(labels) - unique_labels = np.unique(labels) - for lb in unique_labels: - ind = labels == lb - if sum(ind) == 1: - logging.warning(f'ignoring class {lb} with only one sample.') - continue - X[ind, :] = zscore(X[ind, :], with_mean=with_mean, scale=scale) - - if max_value is not None: - X[X > max_value] = max_value - logging.info('... clipping at max_value', max_value) - - if isdf: - X = pd.DataFrame(X, index=index, columns=columns) - return X - - -def group_zscore_adata(adt, key='counts', groupby='batch', key_new=None, - max_value=None, - with_mean=True, - cover=True, **kwds): - """ - adt: AnnData - key: str in ['X_pca', 'count'] - can be a key from adt.obsm, e.g. `key='X_pca'` - If key == 'counts', then do scaling on `adt.X` - and cover the old count matrix, ignoring the `cover` parameter - groupby: str; - A key from adt.obs, from which the labels are take - cover: bool - whether to cover the old X with the scored X - """ - labels = adt.obs[groupby] - if key == 'counts': - logging.info('doing z-score scaling on count matrix, ' - 'transformed into a dense array') - if sparse.issparse(adt.X): - X = adt.X.toarray() - else: - X = adt.X - if not cover: - adt = adt.copy() - X_scaled = group_zscore( - X, labels, with_mean=with_mean, - max_value=max_value, **kwds) - # logging.debug('X_scaled = %s', X_scaled) - adt.X = X_scaled - else: - if cover: - key_new = key - else: - key_new = key + '_new' if key_new is None else key_new - adt.obsm[key_new] = group_zscore( - adt.obsm[key], labels, with_mean=with_mean, max_value=None, **kwds) - - return adt - - -def wrapper_scale(adata, zero_center=True, max_value=None, - groupby=None, copy=False, **kwds): - """ - Wrapper function for centering and scaling data matrix `X` in sc.AnnData, - extended for within-batch cprocessing. 
- - Example - ======= - wrapper_scale(adata, groupby='batch') - """ - if groupby is not None: - logging.info(f'doing within-group scaling, group by [ {groupby} ]') - return group_zscore_adata(adata, key='counts', - max_value=max_value, - groupby=groupby, - with_mean=zero_center, - cover=not copy, - **kwds) - else: - logging.info('using the build-in function `sc.pp.scale(..)`') - return sc.pp.scale(adata, zero_center=zero_center, - max_value=max_value, copy=copy) - - -def __test__(): - pass - - -if __name__ == '__main__': - import time - - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n%(message)s' - ) - t = time.time() - __test__() - print('Done running file: {}\nTime: {}'.format( - os.path.abspath(__file__), time.time() - t, - )) diff --git a/build/lib/swnn/utils/plot.py b/build/lib/swnn/utils/plot.py deleted file mode 100644 index f2a4b31..0000000 --- a/build/lib/swnn/utils/plot.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: UTF-8 -*- -""" -@CreateDate: 2020/07/18 -@Author: Xingyan Liu -@File: plot.py -@Project: stagewiseNN -""" -import os -import sys -from pathlib import Path -from typing import Sequence, Mapping, Optional, Union, Callable -import logging -import pandas as pd -import numpy as np - - -def __test__(): - pass - - -if __name__ == '__main__': - import time - - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n%(message)s' - ) - t = time.time() - __test__() - print('Done running file: {}\nTime: {}'.format( - os.path.abspath(__file__), time.time() - t, - )) diff --git a/build/lib/swnn/utils/process.py b/build/lib/swnn/utils/process.py deleted file mode 100644 index 4cbf1da..0000000 --- a/build/lib/swnn/utils/process.py +++ /dev/null @@ -1,355 +0,0 @@ -# -*- coding: UTF-8 -*- -""" -@CreateDate: 2020/07/18 -@Author: Xingyan Liu -@File: process.py -@Project: stagewiseNN -""" -import os -import sys -from pathlib import Path -from typing import Sequence, Mapping, Optional, Union, Callable -import logging -import pandas as pd -import numpy as np -import scanpy as sc -from scipy import sparse -from sklearn.preprocessing import label_binarize -from ._scale import wrapper_scale - - -def check_dirs(path): - if os.path.exists(path): - print('already exists:\n\t%s' % path) - else: - os.makedirs(path) - print('a new directory made:\n\t%s' % path) - - -def reverse_dict(d: dict, ): - """ - the values of the dict must be list-like type - """ - d_rev = {} - for k in d.keys(): - vals = d[k] - _d = dict.fromkeys(vals, k) - d_rev.update(_d) - return d_rev - - -def describe_dataframe(df: pd.DataFrame, **kwargs): - for c in df.columns: - print(c.center(40, '-'), **kwargs) - print(describe_series(df[c], asstr=True), **kwargs) - - -def describe_series( - srs: Sequence, max_cats: int = 100, - asstr: bool = False, -): - """ inspect data-structure """ - srs = pd.Series(srs) - if len(srs.unique()) <= max_cats: - desc_type = 'Value counts' - result = srs.value_counts(dropna=False) - elif isinstance(srs[0], (int, float)): - desc_type = 'Numerical summary' - result = srs.describe() - else: - desc_type = 'Header lines' - result = srs.head() - if asstr: - return f'{desc_type}:\n{result}' - else: - return result, desc_type - - -def make_binary(mat): - mat_bin = mat.copy() - mat_bin[mat_bin > 0] = 1. 
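[Editor's note] A small sketch of the within-group z-scoring implemented by `group_zscore()` in `_scale.py` above: each batch is centered and scaled independently, removing batch-specific location and scale differences. Plain numpy equivalent (synthetic data; `ddof=1` matches the R-convention adjustment in `zscore()`):

```python
import numpy as np

X = np.random.default_rng(0).normal(size=(6, 3))
labels = np.array(['b1', 'b1', 'b1', 'b2', 'b2', 'b2'])
for lb in np.unique(labels):
    ind = labels == lb
    mu = X[ind].mean(axis=0)
    sd = X[ind].std(axis=0, ddof=1)   # unbiased estimator (R convention)
    X[ind] = (X[ind] - mu) / sd
print(X[labels == 'b1'].mean(axis=0))  # ~0 within each batch
```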
diff --git a/build/lib/swnn/utils/process.py b/build/lib/swnn/utils/process.py
deleted file mode 100644
index 4cbf1da..0000000
--- a/build/lib/swnn/utils/process.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-@CreateDate: 2020/07/18
-@Author: Xingyan Liu
-@File: process.py
-@Project: stagewiseNN
-"""
-import os
-import sys
-from pathlib import Path
-from typing import Sequence, Mapping, Optional, Union, Callable
-import logging
-import pandas as pd
-import numpy as np
-import scanpy as sc
-from scipy import sparse
-from sklearn.preprocessing import label_binarize
-from ._scale import wrapper_scale
-
-
-def check_dirs(path):
-    if os.path.exists(path):
-        print('already exists:\n\t%s' % path)
-    else:
-        os.makedirs(path)
-        print('a new directory made:\n\t%s' % path)
-
-
-def reverse_dict(d: dict, ):
-    """
-    the values of the dict must be of a list-like type
-    """
-    d_rev = {}
-    for k in d.keys():
-        vals = d[k]
-        _d = dict.fromkeys(vals, k)
-        d_rev.update(_d)
-    return d_rev
-
-
-def describe_dataframe(df: pd.DataFrame, **kwargs):
-    for c in df.columns:
-        print(c.center(40, '-'), **kwargs)
-        print(describe_series(df[c], asstr=True), **kwargs)
-
-
-def describe_series(
-        srs: Sequence, max_cats: int = 100,
-        asstr: bool = False,
-):
-    """ inspect data-structure """
-    srs = pd.Series(srs)
-    if len(srs.unique()) <= max_cats:
-        desc_type = 'Value counts'
-        result = srs.value_counts(dropna=False)
-    elif isinstance(srs[0], (int, float)):
-        desc_type = 'Numerical summary'
-        result = srs.describe()
-    else:
-        desc_type = 'Header lines'
-        result = srs.head()
-    if asstr:
-        return f'{desc_type}:\n{result}'
-    else:
-        return result, desc_type
-
-
-def make_binary(mat):
-    mat_bin = mat.copy()
-    mat_bin[mat_bin > 0] = 1.
-    return mat_bin
-
-
-def set_adata_hvgs(
-        adata: sc.AnnData,
-        gene_list: Optional[Sequence] = None,
-        indicator: Optional[Sequence[bool]] = None,
-        slim: bool = True,
-        copy: bool = False,
-):
-    """
-    Set the given (possibly pre-computed) set of genes as highly variable.
-    If `copy` is False, changes will be made to the input adata.
-    If `slim` is True and `adata.raw` is None, the raw data will be backed up.
-    """
-    if copy:
-        adata = adata.copy()
-    logging.info(
-        'Setting the given set of %d genes as highly variable' % len(gene_list))
-    if (indicator is None) and (gene_list is not None):
-        indicator = [g in gene_list for g in adata.var_names]
-    adata.var['highly_variable'] = indicator
-    if slim:
-        if adata.raw is None:
-            adata.raw = adata
-        logging.info('slimming adata to contain only HVGs')
-        adata = adata[:, adata.var['highly_variable']].copy()
-    return adata
-
-
-def change_names(
-        seq: Sequence,
-        mapping: Optional[Mapping] = None,
-        **kwmaps
-) -> list:
-    mapping = {} if mapping is None else mapping
-    mapping.update(kwmaps)
-    func = lambda x: mapping.get(x, x)
-    return list(map(func, seq))
-
-
-def normalize_default(
-        adata, target_sum=None,
-        copy=False, log_only=False,
-):
-    if copy:
-        adata = adata.copy()
-        logging.info('A copy of AnnData made!')
-    else:
-        logging.info('No copy was made, the input AnnData will be changed!')
-    logging.info('normalizing datasets with default settings.')
-    if not log_only:
-        logging.info(
-            f'performing total-sum normalization, target_sum={target_sum}...')
-        sc.pp.normalize_total(adata, target_sum=target_sum)
-    else:
-        logging.info('skipping total-sum normalization')
-    sc.pp.log1p(adata)
-    return adata
-
-
-def normalize_log_then_total(
-        adata, target_sum=None,
-        copy=False,
-):
-    """ For SplitSeq data, performing log(x+1) BEFORE total-sum normalization
-    will result in a better UMAP visualization (e.g. clusters would be less
-    confounded by different total counts).
-    """
-    if copy:
-        adata = adata.copy()
-        logging.info('A copy of AnnData made!')
-    sc.pp.log1p(adata, )
-    sc.pp.normalize_total(adata, target_sum=target_sum, )
-    return adata
-
-
-def set_precomputed_neighbors(
-        adata,
-        distances,
-        connectivities=None,
-        n_neighbors=15,
-        metric='cosine',  # placeholder parameter
-        method='umap',  # placeholder parameter
-        metric_kwds=None,  # placeholder parameter
-        use_rep=None,  # placeholder parameter
-        n_pcs=None,  # placeholder parameter
-        key_added=None,
-):
-    if key_added is None:
-        key_added = 'neighbors'
-        conns_key = 'connectivities'
-        dists_key = 'distances'
-    else:
-        conns_key = key_added + '_connectivities'
-        dists_key = key_added + '_distances'
-
-    if connectivities is None:
-        connectivities = distances.copy().tocsr()
-        connectivities[connectivities > 0] = 1
-
-    adata.obsp[dists_key] = distances
-    adata.obsp[conns_key] = connectivities
-
-    adata.uns[key_added] = {}
-    neighbors_dict = adata.uns[key_added]
-    neighbors_dict['connectivities_key'] = conns_key
-    neighbors_dict['distances_key'] = dists_key
-
-    neighbors_dict['params'] = {'n_neighbors': n_neighbors, 'method': method}
-    neighbors_dict['params']['metric'] = metric
-    if metric_kwds is not None:
-        neighbors_dict['params']['metric_kwds'] = metric_kwds
-    if use_rep is not None:
-        neighbors_dict['params']['use_rep'] = use_rep
-    if n_pcs is not None:
-        neighbors_dict['params']['n_pcs'] = n_pcs
-
-    return adata
-
-
-def quick_preprocess_raw(
-        adata: sc.AnnData,
-        target_sum: Optional[int] = None,
-        hvgs: Optional[Sequence] = None,
-        batch_key=None,
-        copy=True,
-        log_first: bool = False,
-        **hvg_kwds
-) -> sc.AnnData:
-    if copy:
-        _adata = adata.copy()
-        logging.info('A copy of AnnData made!')
-    else:
-        _adata = adata
-        logging.info('No copy was made, the input AnnData will be changed!')
-    # 1: normalization
-    if log_first:
-        normalize_log_then_total(_adata, target_sum=target_sum)
-    else:
-        normalize_default(_adata, target_sum=target_sum)
-    # 2: HVG selection (skipped if `hvgs` is given)
-    if hvgs is None:
-        sc.pp.highly_variable_genes(
-            _adata, batch_key=batch_key, **hvg_kwds)
-        indicator = _adata.var['highly_variable']
-        # _adata = _adata[:, _adata.var['highly_variable']]
-    else:
-        indicator = None
-    _adata = set_adata_hvgs(_adata, gene_list=hvgs, indicator=indicator, )
-    # 3: z-score
-    wrapper_scale(_adata, groupby=batch_key)
-    return _adata
-
-
-def label_binarize_each(labels, classes, sparse_out=True):
-    lb1hot = label_binarize(labels, classes=classes, sparse_output=sparse_out)
-    if len(classes) == 2:
-        lb1hot = lb1hot.toarray()
-        lb1hot = np.c_[1 - lb1hot, lb1hot]
-        if sparse_out:
-            lb1hot = sparse.csc_matrix(lb1hot)
-    return lb1hot
-
-
-def group_mean(X, labels,
-               binary=False, classes=None, features=None,
-               print_groups=True):
-    """
-    This function may be more efficient than `df.groupby().mean()`
-    when handling sparse matrices.
-
-    Parameters
-    ----------
-    X: shape (n_samples, n_features)
-    labels: shape (n_samples, )
-    classes: optional
-        names of groups
-    features: optional
-        names of features
-    print_groups: bool
-        whether to inspect the groups
-    """
-    classes = np.unique(labels, ) if classes is None else classes
-    if binary:
-        X = (X > 0)  # .astype('float')
-        print('Binarized... the results will be the expression proportions.')
-
-    if len(classes) == 1:
-        grp_mean = X.mean(axis=0).T
-    else:
-        lb1hot = label_binarize_each(labels, classes=classes, sparse_out=True)
-        print(f'Calculating feature averages for {len(classes)} groups')
-        if print_groups:
-            print(classes)
-        grp_mean = X.T.dot(lb1hot) / lb1hot.sum(axis=0)
-    grp_mean = pd.DataFrame(grp_mean, columns=classes, index=features)
-    return grp_mean
-
-
-def group_mean_dense(
-        X, labels, binary=False,
-        index_name='group',
-        classes=None,
-):
-    classes = np.unique(labels, ) if classes is None else classes
-    if binary:
-        X = (X > 0)  # .astype('float')
-        logging.info('Binarized... the results will be the expression '
-                     'proportions.')
-    tmp = pd.DataFrame(X)
-    tmp[index_name] = list(labels)
-    avgs = tmp.groupby(index_name).mean()
-    # print(avgs.shape)
-    return avgs.T  # each column as a group
-
-
-def group_median_dense(
-        X, labels, binary=False,
-        index_name='group',
-        classes=None,
-):
-    classes = np.unique(labels, ) if classes is None else classes
-    if binary:
-        X = (X > 0)  # .astype('float')
-        print('Binarized... the results will be the expression proportions.')
-    tmp = pd.DataFrame(X)
-    tmp[index_name] = list(labels)
-    avgs = tmp.groupby(index_name).median()
-    print(avgs.shape)
-    return avgs.T  # each column as a group
-
-
-def group_mean_adata(adata: sc.AnnData,
-                     groupby: str,
-                     features=None, binary=False, use_raw=False):
-    """
-    Parameters
-    ----------
-    groupby: a column name in adata.obs
-    features: a subset of names in adata.var_names (or adata.raw.var_names)
-
-    Returns
-    -------
-    a pd.DataFrame with features as index and groups as columns
-    """
-    labels = adata.obs[groupby]
-    logging.debug(f'Computing averages grouped by {groupby}')
-    if use_raw and adata.raw is not None:
-        if features is not None:
-            features = [f for f in features if f in adata.raw.var_names]
-            X = adata.raw[:, features].X
-        else:
-            features = adata.raw.var_names
-            X = adata.raw.X
-    else:
-        if features is not None:
-            features = [f for f in features if f in adata.var_names]
-            X = adata[:, features].X
-        else:
-            features = adata.var_names
-            X = adata.X
-    if sparse.issparse(X):
-        return group_mean(X, labels, binary=binary, features=features)
-    else:
-        return group_mean_dense(X, labels, binary=binary, )
-
-
-def __test__():
-    pass
-
-
-if __name__ == '__main__':
-    import time
-
-    logging.basicConfig(
-        level=logging.DEBUG,
-        format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): '
-               '%(levelname)s\n%(message)s'
-    )
-    t = time.time()
-    __test__()
-    print('Done running file: {}\nTime: {}'.format(
-        os.path.abspath(__file__), time.time() - t,
-    ))
diff --git a/dist/swnn-0.1.0-py3-none-any.whl b/dist/swnn-0.1.0-py3-none-any.whl
deleted file mode 100644
index 65bdc34..0000000
Binary files a/dist/swnn-0.1.0-py3-none-any.whl and /dev/null differ
diff --git a/dist/swnn-0.1.0-py3.8.egg b/dist/swnn-0.1.0-py3.8.egg
deleted file mode 100644
index f7be45a..0000000
Binary files a/dist/swnn-0.1.0-py3.8.egg and /dev/null differ
diff --git a/dist/swnn-0.1.0.tar.gz b/dist/swnn-0.1.0.tar.gz
deleted file mode 100644
index e94d8eb..0000000
Binary files a/dist/swnn-0.1.0.tar.gz and /dev/null differ
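[Editor's note] A self-contained sketch of the sparse group-mean trick used by `group_mean()` in `process.py` above: a one-hot label matrix dotted with X sums each group's rows without densifying the matrix. Synthetic data for illustration:

```python
import numpy as np
from scipy import sparse
from sklearn.preprocessing import label_binarize

X = sparse.random(6, 4, density=0.5, format='csc', random_state=0)
labels = ['a', 'a', 'b', 'b', 'b', 'c']
classes = ['a', 'b', 'c']
onehot = sparse.csc_matrix(label_binarize(labels, classes=classes))
group_sums = (X.T @ onehot).toarray()            # (n_features, n_groups)
group_means = group_sums / onehot.sum(axis=0).A1  # divide by group sizes
print(group_means)  # one column of feature averages per group
```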
diff --git a/swnn.egg-info/PKG-INFO b/swnn.egg-info/PKG-INFO
deleted file mode 100644
index a547e61..0000000
--- a/swnn.egg-info/PKG-INFO
+++ /dev/null
@@ -1,87 +0,0 @@
-Metadata-Version: 2.1
-Name: swnn
-Version: 0.1.0
-Summary: A tool for building developmental tree from scRNA-seq
-Home-page: https://github.com/XingyanLiu/stagewiseNN
-Author: Xingyan Liu
-Author-email: 544568643@qq.com
-License: MIT
-Platform: UNKNOWN
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: Implementation :: CPython
-Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=3.6.0
-Description-Content-Type: text/markdown
-Provides-Extra: downstream analysis
-License-File: LICENSE
-
-
-stagewiseNN
-===========
-
-To be completed!
-
-**stagewiseNN** is a computational tool for constructing
-a developmental tree from multi-staged single-cell RNA-seq data.
-
-(see [documentation](https://xingyanliu.github.io/stagewiseNN/index.html) for detailed guides)
-
-It is easy to use:
-
-```python
-import swnn
-
-# ====== Inputs ======
-# data_matrix = ..
-# stage_labels = ..
-# group_labels = ..
-# stage_order = [f'stage_{i}' for i in range(5)]
-
-builder = swnn.Builder(stage_order=stage_order)
-# step 1:
-# build the (stage-wise) single-cell graph
-distmat, connect = builder.build_graph(
-    X=data_matrix, stage_lbs=stage_labels,
-)
-# step 2:
-# build the developmental tree from the single-cell graph
-builder.build_tree(group_labels, stage_labels,)
-```
-
-
-Installation
-------------
-
-Install stagewiseNN by running (in the command line):
-
-```shell
-pip install swnn
-```
-
-or install from the source code:
-
-```shell
-git clone https://github.com/XingyanLiu/stagewiseNN.git
-cd stagewiseNN
-python setup.py install
-```
-
-Contribute
-----------
-
-- Issue Tracker: https://github.com/XingyanLiu/stagewiseNN/issues
-- Source Code: https://github.com/XingyanLiu/stagewiseNN
-
-Support
--------
-
-If you are having issues, please let us know.
-We have a mailing list located at:
-
-* xingyan@amss.ac.cn
-* 544568643@qq.com
-
-
diff --git a/swnn.egg-info/SOURCES.txt b/swnn.egg-info/SOURCES.txt
deleted file mode 100644
index 4a045af..0000000
--- a/swnn.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-LICENSE
-README.md
-setup.py
-swnn/__init__.py
-swnn/builder.py
-swnn/graph2tree.py
-swnn/multipartite_graph.py
-swnn.egg-info/PKG-INFO
-swnn.egg-info/SOURCES.txt
-swnn.egg-info/dependency_links.txt
-swnn.egg-info/requires.txt
-swnn.egg-info/top_level.txt
-swnn/utils/__init__.py
-swnn/utils/_scale.py
-swnn/utils/plot.py
-swnn/utils/process.py
\ No newline at end of file
diff --git a/swnn.egg-info/dependency_links.txt b/swnn.egg-info/dependency_links.txt
deleted file mode 100644
index 8b13789..0000000
--- a/swnn.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/swnn.egg-info/requires.txt b/swnn.egg-info/requires.txt
deleted file mode 100644
index 3759197..0000000
--- a/swnn.egg-info/requires.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-scanpy
-scikit_learn
-scipy
-
-[downstream analysis]
-scanpy
diff --git a/swnn.egg-info/top_level.txt b/swnn.egg-info/top_level.txt
deleted file mode 100644
index cf92c39..0000000
--- a/swnn.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-swnn
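[Editor's note] Following the README usage shown in the deleted PKG-INFO, a hedged sketch of a downstream step: handing the stagewise-KNN graph to scanpy via the `set_precomputed_neighbors()` helper from `process.py` above. This assumes the helper is re-exported at package level (as in the deleted build copy), and that `adata`, `distmat` and `connect` already exist from a previous `build_graph` run; the 'stage' column is likewise an assumed annotation:

```python
import scanpy as sc
import swnn

# register the precomputed graph under adata.obsp / adata.uns['neighbors']
swnn.set_precomputed_neighbors(adata, distances=distmat, connectivities=connect)
sc.tl.umap(adata)                    # UMAP on the stagewise-KNN graph
sc.pl.umap(adata, color='stage')     # assumes adata.obs['stage'] exists
```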