Merge branch 'dev'

STOmics · Jun 30, 2022 · 138fa3e · 138fa3e
2 parents 9f72b0d + 46408f4
commit 138fa3e
Show file tree

Hide file tree

Showing 36 changed files with 539 additions and 5,947 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -43,7 +43,7 @@ flatbuffers==2.0
 fonttools==4.28.5
 fsspec==2022.1.0
 gast==0.4.0
-gefpy>=0.5.4.11
+gefpy>=0.5.5
 glog==0.3.1
 google-auth==2.3.3
 google-auth-oauthlib==0.4.6
@@ -128,7 +128,7 @@ PIMS==0.5
 pkginfo==1.8.2
 prometheus-client==0.12.0
 prompt-toolkit==3.0.24
-protobuf==4.0.0rc2
+protobuf==3.19.4
 psutil==5.9.0
 ptyprocess==0.7.0
 pyarrow==6.0.1

diff --git a/docs/source/Tutorials/Examples.rst b/docs/source/Tutorials/Examples.rst
@@ -19,6 +19,7 @@ Additional functionalities
 .. nbgallery::
 
     hotspot
+    gaussian_smooth
 
 Interactive Visualisation
 --------------------------------------------

diff --git a/docs/source/Tutorials/cell_correction.rst b/docs/source/Tutorials/cell_correction.rst
@@ -44,7 +44,7 @@ The type of model only can be specified to deep-learning or deep-cell, more deai
 
 You can also predict on gpu, specify gpu id by parameter gpu, if -1, predict on cpu.
 
-In the out_dir directory, there is a new directory named deep-learning or deep-cell, it contains the generated mask whoes name is ends with mask.tif.
+In the out_dir directory, there is a new directory named deep-learning or deep-cell, it contains the generated mask whoes name ends with mask.tif.
 
 .. code:: python
 

diff --git a/docs/source/Tutorials/gaussian_smooth.rst b/docs/source/Tutorials/gaussian_smooth.rst
@@ -0,0 +1,74 @@
+Gaussian smooth
+----------------
+This example show how to run the function of gaussian smooth on Stereopy.
+
+Gaussian smooth can make expression matrix closer, the detail of algorithm refer to https://www.biorxiv.org/content/10.1101/2022.05.26.493527v1.abstract.
+
+Generally, you should do some preprocessing such as filtering cells, filtering genes, normalization, pca before running gaussian smooth.
+
+Especially, you must to save raw expression matrix by running raw_checkpoint before all of the operations such as normalization those will change the values of expression matrix.
+
+Once ran the raw_checkpoint, you can not run the operations those will change the 1-dimension of expression matrix before running gaussian smooth.
+
+Also, you need to run pca before running gaussian smooth.
+
+.. code:: python
+
+    import stereo as st
+
+    input_file = "/jdfssz2/ST_BIOINTEL/P20Z10200N0039/06.user/liulin4/demo/jiace/raw/SS200000141TL_B5_raw.h5ad"
+    data = st.io.read_ann_h5ad(input_file, spatial_key='spatial')
+    data.tl.cal_qc()
+    data.tl.filter_cells(min_gene=300, pct_counts_mt=10)
+    data.tl.filter_genes(min_cell=10)
+    data.tl.raw_checkpoint()
+    data.tl.normalize_total(target_sum=10000)
+    data.tl.log1p()
+    data.tl.pca(use_highly_genes=False, n_pcs=50, svd_solver='arpack')
+    data.tl.gaussian_smooth(n_neighbors=10, smooth_threshold=90)
+    data.tl.scale(max_value=10) #only for gaussian_smooth_scatter_by_gene
+    data.plt.gaussian_smooth_scatter_by_gene(gene_name='C1ql2')
+    data.plt.gaussian_smooth_scatter_by_gene(gene_name='Irx2')
+    data.plt.gaussian_smooth_scatter_by_gene(gene_name='Calb1')
+
++--------------------------------------------+--------------------------------+
+|.. image:: ../_static/gaussian_smooth_1.png |.. image:: ../_static/C1ql2.jpg |
++--------------------------------------------+--------------------------------+
+|.. image:: ../_static/gaussian_smooth_2.png |.. image:: ../_static/Inx2.jpg  |
++--------------------------------------------+--------------------------------+
+|.. image:: ../_static/gaussian_smooth_3.png |.. image:: ../_static/cabl1.jpg |
++--------------------------------------------+--------------------------------+
+
+After, if you want to do other operations such as clustering, you need to do the same preprocessing you did before.
+
+Because of the preprocessing you did before just only for searching the nearest points, the result still base on the raw expression matrix saved by running raw_checkpoint.
+
+.. code:: python
+
+    import os
+    import stereo as st
+
+    input_file = "/jdfssz2/ST_BIOINTEL/P20Z10200N0039/06.user/liulin4/demo/jiace/raw/SS200000141TL_B5_raw.h5ad"
+    data = st.io.read_ann_h5ad(input_file, spatial_key='spatial')
+    data.tl.cal_qc()
+    data.tl.filter_cells(min_gene=300, pct_counts_mt=10)
+    data.tl.filter_genes(min_cell=10)
+    data.tl.raw_checkpoint()
+    data.tl.normalize_total(target_sum=10000)
+    data.tl.log1p()
+    data.tl.pca(use_highly_genes=False, n_pcs=50, svd_solver='arpack')
+    data.tl.gaussian_smooth(n_neighbors=10, smooth_threshold=90)
+    data.tl.normalize_total(target_sum=10000)
+    data.tl.log1p()
+    data.tl.pca(use_highly_genes=False, n_pcs=50, svd_solver='arpack')
+    data.tl.neighbors(pca_res_key='pca', n_pcs=30, res_key='neighbors')
+    data.tl.leiden(neighbors_res_key='neighbors', res_key='leiden')
+    data.plt.cluster_scatter(res_key='leiden')
+
+Gaussian smooth can make clustering result to more subtypes.
+
++---------------------------------------------------+---------------------------------------------------+
+|Before                                             |After                                              |
++===================================================+===================================================+
+|.. image:: ../_static/clustering_before_smooth.png |.. image:: ../_static/clustering_after_smooth.png  |
++---------------------------------------------------+---------------------------------------------------+
diff --git a/docs/source/Tutorials/quick_start.ipynb b/docs/source/Tutorials/quick_start.ipynb
diff --git a/docs/source/Tutorials/read_and_write.rst b/docs/source/Tutorials/read_and_write.rst
@@ -7,8 +7,8 @@ File Format
 Here is the list of input file that Stereopy could read:
 
 :GEM: GEM file contains GeneID, x,y,count. x, y represent spatial positions of the gene in the tissue section, and count means the number of gene expression.
-:BGEF: One type of GEF file. The suffix of BGEF file includes: SN.raw.gef, SN.gef, SN.tissue.gef（`BGEF details <https://www.processon.com/view/link/610cc49c7d9c087bbd1ab7ab#map>`_)
-:CGEF: One type of GEF file. The suffix of CGEF file is cellbin.gef (`CGEF details <https://www.processon.com/view/link/6274de9c0791290711fa418d#map>`_)
+:GEF(Square Bin): Square Bin type of GEF file. The suffix of GEF file includes: SN.raw.gef, SN.gef, SN.tissue.gef（`Square Bin GEF details <https://www.processon.com/view/link/610cc49c7d9c087bbd1ab7ab#map>`_)
+:GEF(Cell bin): Cell bin type of GEF file. The suffix of GEF file is cellbin.gef (`Cell bin GEF details <https://www.processon.com/view/link/6274de9c0791290711fa418d#map>`_)
 :stereo h5ad: One type of h5ad file containing StereoExpData object which is generated by Stereopy.
 :anndata h5ad: One type of h5ad file containing Anndata object which is generated by scanpy.
 
@@ -48,7 +48,7 @@ GEF file
 You could get the info from input GEF file and use the info to set the parameter of :mod:`stereo.io.read_gef`
 
 
-BGEF file
+GEF file (Square Bin)
 ****************
 .. code:: python
 
@@ -59,11 +59,10 @@ BGEF file
     # read the GEF file
     mouse_data_path = './DP8400013846TR_F5.SN.tissue.gef'
     data = st.io.read_gef(
-            file_path= mouse_data_path, sep='\t',
-            bin_size=100, is_sparse=True,
-            gene_list= None, region= None,)
+            file_path= mouse_data_path,
+            bin_size=100, is_sparse=True,)
 
-CGEF file
+GEF file (Cell Bin)
 ****************
 .. code:: python
 
@@ -72,12 +71,11 @@ CGEF file
     import stereo as st
 
     # read the GEF file
-    mouse_data_path = './DP8400013846TR_F5.gef'
+    mouse_data_path = './DP8400013846TR_F5.cellbin.gef'
     data = st.io.read_gef(
-            file_path= mouse_data_path, sep='\t',
+            file_path= mouse_data_path,
             is_sparse=True,
-            bin_type='cell_bins',
-            gene_list= None, region= None,)
+            bin_type='cell_bins',)
 
 `parameters <https://stereopy.readthedocs.io/en/latest/api/stereo.io.read_gef.html#stereo.io.read_gef>`_
 
@@ -200,7 +198,7 @@ stereo h5ad file
     # you could create a dictionary which is similar to data.tl.key_record:
     outkey_record = {'cluster':['leiden','louvain'],}
 
-    st.io.write_h5ad(data, use_raw=True, use_result=True, key_record=outkey_record)
+    st.io.write_h5ad(data, use_raw=True, use_result=True, key_record=outkey_record, output='./DP8400013846TR_F5.h5ad')
 
 
 anndata h5ad file

diff --git a/docs/source/_static/C1ql2.jpg b/docs/source/_static/C1ql2.jpg
diff --git a/docs/source/_static/Inx2.jpg b/docs/source/_static/Inx2.jpg
diff --git a/docs/source/_static/cabl1.jpg b/docs/source/_static/cabl1.jpg
diff --git a/docs/source/_static/clustering_after_smooth.png b/docs/source/_static/clustering_after_smooth.png
diff --git a/docs/source/_static/clustering_before_smooth.png b/docs/source/_static/clustering_before_smooth.png
diff --git a/docs/source/_static/gaussian_smooth_1.png b/docs/source/_static/gaussian_smooth_1.png
diff --git a/docs/source/_static/gaussian_smooth_2.png b/docs/source/_static/gaussian_smooth_2.png
diff --git a/docs/source/_static/gaussian_smooth_3.png b/docs/source/_static/gaussian_smooth_3.png
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -78,5 +78,6 @@
 nbsphinx_thumbnails = {
     "Tutorials/cell_correction": "_static/cell_bin_correction.png",
     "Tutorials/tissue_cut": "_static/tissue_segmentation.png",
+    "Tutorials/gaussian_smooth": "_static/gaussian_smooth_1.png",
 }
 
diff --git a/docs/source/release_note.rst b/docs/source/release_note.rst
@@ -3,6 +3,15 @@ Release Notes
 
 .. role:: small
 
+Version 0.3.1
+------------------
+0.3.1 :2022-06-30
+~~~~~~~~~~~~~~~~~~~~~
+1. Add gaussian smooth function.
+2. The pca function increases the svd_solver parameter
+3. The write_h5ad function increases the output parameter
+4. Update requirements.txt.
+
 Version 0.3.0
 ------------------
 0.3.0 :2022-06-10
@@ -58,7 +67,7 @@ Version 0.2.0
 Stereopy provides the analysis process based on spatial omics, including reading, preprocessing, clustering,
 differential expression testing and visualization, etc. There are the updates we made in this version.
 
-1. We propose StereoExpData, which is  a data format specially adapted to spatial omics analysis.
+1. We propose StereoExpData, which is a data format specially adapted to spatial omics analysis.
 2. Support reading the gef file, which is faster than reading gem file.
 3. Support the conversion between StereoExpData and AnnData.
 4. Add the interactive visualization function for selecting data, you can dynamically select the area of interest, and then perform the next step of analysis.

diff --git a/requirements.txt b/requirements.txt
@@ -7,7 +7,7 @@ pandas==1.3.4
 scipy==1.7.3
 seaborn~=0.11.1
 h5py>=3.2.1,<=3.6.0
-gefpy>=0.5.4.11
+gefpy>=0.5.5
 setuptools>=41.0.0,<=61.2.0
 numba~=0.53.1
 statsmodels==0.12.1
@@ -52,6 +52,6 @@ torch==1.10.0
 opencv-python>=4.5.4.58,<=4.5.5.64
 tensorflow==2.7.0
 tensorflow-io-gcs-filesystem==0.24.0
-protobuf==4.0.0rc2
+protobuf==3.19.4
 hotspotsc==0.9.1
 distributed==2022.2.0
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 
 setup(
     name='stereopy',
-    version='0.3.0',
+    version='0.3.1',
     setup_requires=['setuptools_scm', 'numpy', 'panel', 'pytest', 'quilt3', 'scipy', 'phenograph'],
     description='Spatial transcriptomic analysis in python.',
     long_description=Path('README.md').read_text('utf-8'),

diff --git a/stereo/algorithm/cell_correction.py b/stereo/algorithm/cell_correction.py
@@ -218,12 +218,12 @@ def gmm_correction(self, cell_data, bg_adjust_label):
         bg_data = []
         for tmp in bg_adjust_label:
             bg_data.append(tmp[tmp.label != 0])
-        adjust_data = pd.concat(bg_data)
+        adjust_data = pd.concat(bg_data).sort_values('score')
         adjust_data = adjust_data.drop_duplicates(subset=['geneID', 'x', 'y', 'UMICount'], keep='first').rename(columns={'score':'tag'})
         adjust_data['tag'] = 'adjust'
         cell_data['tag'] = 'raw'
-        adjust_data['label'] = adjust_data['label'].astype('int64')
-        cell_data['label'] = cell_data['label'].astype('int64')
+        adjust_data['label'] = adjust_data['label'].astype('uint32')
+        cell_data['label'] = cell_data['label'].astype('uint32')
         result = pd.concat([adjust_data, cell_data])
         return result
 

diff --git a/stereo/algorithm/dim_reduce.py b/stereo/algorithm/dim_reduce.py
@@ -40,16 +40,33 @@ def factor_analysis(x, n_pcs):
     return tran_x
 
 
-def pca(x, n_pcs, random_state=0):
+def pca(x, n_pcs, svd_solver='auto', random_state=0):
     """
     Principal component analysis.
 
     :param x: 2D array, shape (M, N)
     :param n_pcs: the number of features for a return array after reducing.
+    :param svd_solver: {'auto', 'full', 'arpack', 'randomized'}, default to 'auto'
+                If auto :
+                    The solver is selected by a default policy based on `X.shape` and
+                    `n_pcs`: if the input data is larger than 500x500 and the
+                    number of components to extract is lower than 80% of the smallest
+                    dimension of the data, then the more efficient 'randomized'
+                    method is enabled. Otherwise the exact full SVD is computed and
+                    optionally truncated afterwards.
+                If full :
+                    run exact full SVD calling the standard LAPACK solver via
+                    `scipy.linalg.svd` and select the components by postprocessing
+                If arpack :
+                    run SVD truncated to n_pcs calling ARPACK solver via
+                    `scipy.sparse.linalg.svds`. It requires strictly
+                    0 < n_pcs < min(x.shape)
+                If randomized :
+                    run randomized SVD by the method of Halko et al.
     :param random_state : int, RandomState instance
-    :return:  ndarray of shape (n_samples, n_components) Embedding of the training data in low-dimensional space.
+    :return:  ndarray of shape (n_samples, n_pcs) Embedding of the training data in low-dimensional space.
     """
-    pca_obj = PCA(n_components=n_pcs, random_state=random_state)
+    pca_obj = PCA(n_components=n_pcs, svd_solver=svd_solver, random_state=random_state)
     x_pca = pca_obj.fit_transform(x)
     variance = pca_obj.explained_variance_
     variance_ratio = pca_obj.explained_variance_ratio_

diff --git a/stereo/algorithm/gaussian_smooth.py b/stereo/algorithm/gaussian_smooth.py
@@ -0,0 +1,54 @@
+import numpy as np
+import scipy.sparse as sp
+from scipy.spatial import distance
+from sklearn.neighbors import NearestNeighbors
+from ..log_manager import logger
+from ..utils.time_consume import TimeConsume
+
+def _gaussan_c(dis, gs=0.95, a=1, b=0):
+    return np.sqrt(-(dis - b)**2 / 2 / np.log(gs / a))
+
+def _gaussan_weight(dis, a=1, b=0, c=12500):
+    gs = a * np.exp(-(dis - b)**2 / 2 / (c**2))
+    return gs
+
+
+def _sp_graph_weight(arr, a=1, b=0, c=12500):
+    out = arr.copy()
+    row, col = np.nonzero(arr)
+    for ro, co in zip(row, col):
+        out[ro, co] = _gaussan_weight(arr[ro, co], a=a, b=b, c=c)
+    return out
+
+def gaussian_smooth(pca_exp_matrix: np.ndarray,
+                    raw_exp_matrix: np.ndarray,
+                    cells_position: np.ndarray,
+                    n_neighbors: int = 10,
+                    smooth_threshold: float = 90,
+                    a: float = 1,
+                    b: float = 0):
+    if sp.issparse(pca_exp_matrix):
+        pca_exp_matrix = pca_exp_matrix.toarray()
+    if sp.issparse(raw_exp_matrix):
+        raw_exp_matrix = raw_exp_matrix.toarray()
+    tc = TimeConsume()
+    tk = tc.start()
+    Euc_distance = distance.cdist(cells_position, cells_position).astype(np.float32)
+    # logger.info(f'distance.cdist: {tc.get_time_consumed(tk)}')
+
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(pca_exp_matrix)
+    # logger.info(f'NearestNeighbors.fit: {tc.get_time_consumed(tk)}')
+
+    adjecent_matrice = nbrs.kneighbors_graph(pca_exp_matrix).astype(np.float32).toarray()
+    # logger.info(f'kneighbors_graph: {tc.get_time_consumed(tk, restart=False)}')
+    aa = np.multiply(adjecent_matrice, Euc_distance)  ## 自己和自己的距离为0
+    aa_nonzero = aa[np.nonzero(aa)]
+    # aa = aa.tocsr()
+    dist_threshold = np.percentile(aa_nonzero, smooth_threshold)
+    c = _gaussan_c(dist_threshold)
+    ##### smoothing
+    gauss_weight = _sp_graph_weight(aa, a, b, c)
+    temp_nor_para = np.squeeze(np.asarray(np.sum(gauss_weight, axis=1)))
+
+    new_adata = np.asarray(((gauss_weight.dot(raw_exp_matrix)).T / temp_nor_para).T)
+    return new_adata
diff --git a/stereo/config.py b/stereo/config.py
@@ -26,7 +26,7 @@ def __init__(
             n_jobs=1,
             log_file: Union[str, Path, None] = None,
             log_level: str = "info",
-            log_format: str = "%(asctime)s %(name)s %(process)d %(thread)d %(module)s %(lineno)d %(levelname)s: %(message)s",
+            log_format: str = "[%(asctime)s][%(name)s][%(process)d][%(thread)d][%(module)s][%(lineno)d][%(levelname)s]: %(message)s",
             output: str = "./output",
             data_dir: str = None
     ):
@@ -52,20 +52,20 @@ def colormaps(self):
 
     @property
     def linear_colormaps(self):
-        colormaps = {n: palette[n] for n in ['rainbow', 'fire', 'bgy', 'bgyw', 'bmy', 'gray', 'kbc', 'CET_D4']}
+        colormaps = {n: palette[n] for n in ['rainbow', 'fire', 'bgy', 'bgyw', 'bmy', 'gray', 'kbc', 'CET_D4', 'blues', 'CET_L4']}
         stmap_colors = ['#0c3383', '#0a88ba', '#f2d338', '#f28f38', '#d91e1e']
         nodes = [0.0, 0.25, 0.50, 0.75, 1.0]
         mycmap = mpl_colors.LinearSegmentedColormap.from_list("mycmap", list(zip(nodes, stmap_colors)))
         color_list = [mpl_colors.rgb2hex(mycmap(i)) for i in range(mycmap.N)]
         colormaps['stereo'] = color_list
         return colormaps
 
-    def linear_colors(self, colors):
+    def linear_colors(self, colors, reverse=False):
         if isinstance(colors, str):
             if colors not in self.linear_colormaps:
                 raise ValueError(f'{colors} not in colormaps, color value range in {self.linear_colormaps.keys()}')
             else:
-                return self.linear_colormaps[colors]
+                return self.linear_colormaps[colors][::-1] if reverse else self.linear_colormaps[colors]
         elif isinstance(colors, list):
             return colors
         else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,6 +19,7 @@ Additional functionalities @@
     .. nbgallery::
         hotspot
+        gaussian_smooth
     Interactive Visualisation
     --------------------------------------------
@@ Expand Down @@