do some optimizations and fix some bugs.

STOmics · Sep 4, 2024 · 2063880 · 2063880
1 parent d7d82df
commit 2063880
Show file tree

Hide file tree

Showing 24 changed files with 1,160 additions and 810 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -98,6 +98,8 @@ ipython==8.10.0
 ipython-genutils==0.2.0
 ipywidgets==8.0.4
 
+mudata>=0.2.3
+
 Sphinx>=7.1.2
 nbsphinx>=0.9.3
 sphinx-autodoc-typehints>=1.24.0

diff --git a/docs/source/Tutorials(Multi-sample)/Comparative_Analysis.ipynb b/docs/source/Tutorials(Multi-sample)/Comparative_Analysis.ipynb
diff --git a/docs/source/Tutorials(Multi-sample)/SpaSEG.ipynb b/docs/source/Tutorials(Multi-sample)/SpaSEG.ipynb
@@ -95,7 +95,7 @@
     "\n",
     "Download our [example data](http://116.6.21.110:8090/share/c5d9e7f3-7d66-4154-87e1-a740f4bb4dbc), the demo data used here is 16 samples of 3D drosophila stereo-seq data.\n",
     "\n",
-    "`bin_type` is important for constructing the training data, we have to input it correctly, available `bin_type` is **'bins'** and **'cell_bins'**.\n",
+    "`bin_type` is important for constructing the training data and must be specified correctly; the available values are **'bins'** and **'cell_bins'**.\n",
     "\n",
     "\n",
     "<div class=\"alert alert-info\">\n",

diff --git a/docs/source/content/06_Release_notes.rst b/docs/source/content/06_Release_notes.rst
@@ -3,6 +3,38 @@ Release Notes
 
 .. role:: small
 
+Version 1.4.0
+------------------
+1.4.0 : 2024-09-06
+~~~~~~~~~~~~~~~~~~~
+
+.. _SpaSEG: ../Tutorials(Multi-sample)/SpaSEG.html
+.. |SpaSEG| replace:: **SpaSEG**
+
+.. _st.plt.cells_plotting: stereo.plots.PlotCollection.cells_plotting.html
+.. |st.plt.cells_plotting| replace:: `st.plt.cells_plotting`
+
+.. _st.io.write_h5mu: stereo.io.write_h5mu.html
+.. |st.io.write_h5mu| replace:: `st.io.write_h5mu`
+
+.. _st.io.mudata_to_msdata: stereo.io.mudata_to_msdata.html
+.. |st.io.mudata_to_msdata| replace:: `st.io.mudata_to_msdata`
+
+Features:
+
+1. Addition of new algorithm |SpaSEG|_ for multiple **SRT** analysis.
+2. Addition of **colorbar** or **legend** for `st.plt.cells_plotting`.
+3. |st.plt.cells_plotting|_ supports exporting plots as **PNG**, **SVG** or **PDF**.
+4. Addition of methods |st.io.write_h5mu|_ and |st.io.mudata_to_msdata|_ for conversion between **MSData** and **MuData**.
+
+BUG Fixes:
+
+1. Fixed the problem that **CellCorrection** is incompatible with small-size images (less than 2000px in any dimension) when using the method **EDM**.
+2. Fixed the problem that `MSData.to_integrate` is incompatible when the number of cells in the integrated sample is less than the total number of cells in all single samples.
+3. Fixed the problem that `st.plt.time_series_tree_plot` cannot capture the result of **PAGA**, leading to an incorrect plot.
+4. Fixed other bugs.
+
+
 Version 1.3.1
 ------------------
 1.3.1 : 2024-06-28

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -70,6 +70,36 @@ Workflow
 Latest Additions
 ------------------
 
+Version 1.4.0
+~~~~~~~~~~~~~~~~~~~
+1.4.0 : 2024-09-06
+
+.. _SpaSEG: Tutorials(Multi-sample)/SpaSEG.html
+.. |SpaSEG| replace:: **SpaSEG**
+
+.. _st.plt.cells_plotting: content/stereo.plots.PlotCollection.cells_plotting.html
+.. |st.plt.cells_plotting| replace:: `st.plt.cells_plotting`
+
+.. _st.io.write_h5mu: content/stereo.io.write_h5mu.html
+.. |st.io.write_h5mu| replace:: `st.io.write_h5mu`
+
+.. _st.io.mudata_to_msdata: content/stereo.io.mudata_to_msdata.html
+.. |st.io.mudata_to_msdata| replace:: `st.io.mudata_to_msdata`
+
+Features:
+
+1. Addition of new algorithm |SpaSEG|_ for multiple **SRT** analysis.
+2. Addition of **colorbar** or **legend** for `st.plt.cells_plotting`.
+3. |st.plt.cells_plotting|_ supports exporting plots as **PNG**, **SVG** or **PDF**.
+4. Addition of methods |st.io.write_h5mu|_ and |st.io.mudata_to_msdata|_ for conversion between **MSData** and **MuData**.
+
+BUG Fixes:
+
+1. Fixed the problem that **CellCorrection** is incompatible with small-size images (less than 2000px in any dimension) when using the method **EDM**.
+2. Fixed the problem that `MSData.to_integrate` is incompatible when the number of cells in the integrated sample is less than the total number of cells in all single samples.
+3. Fixed the problem that `st.plt.time_series_tree_plot` cannot capture the result of **PAGA**, leading to an incorrect plot.
+4. Fixed other bugs.
+
 Version 1.3.1
 ~~~~~~~~~~~~~~~~~~~
 1.3.1 : 2024-06-28
@@ -105,24 +135,6 @@ BUG Fixes:
 2. Fixed the bug of losing part of metadata when writing **StereoExpData** / **MSData** into **Stereo-h5ad** or **h5ms** file.
 3. Fixed the incompatibility problem with **AnnData** when performing `st.tl.sctransform`.
 
-Version 1.2.0
-~~~~~~~~~~~~~~~~~~~
-1.2.0 : 2024-03-30
-
-Features:
-
-1. `st.io.read_gem` and `st.io.read_gef` support expression matrix files with geneID information.
-2. Analysis results of `find_marker_genes`  will be saved into the output AnnData h5ad.
-3. Upgraded tissue segmentation algorithm.
-4. Addition of `st.tl.adjusted_rand_score` to calculate the adjusted Rand coefficient between two clusters.
-5. Addition of `st.tl.silhouette_score` to calculate the average silhouette coefficient of a cluster.
-6. `h5ad2rds.R` is compatible with AnnData version > 0.7.5, to convert from h5ad to rds files.
-7. Addition of the clustering category labels to the graph of `st.plt.paga_compare`.
-
-BUG Fixes:
-
-1. Fixed the error of high memory consumption when converting `X.raw` into AnnData.
-
 
 .. toctree::
     :titlesonly:

diff --git a/stereo/algorithm/dim_reduce.py b/stereo/algorithm/dim_reduce.py
@@ -47,7 +47,7 @@ def factor_analysis(x, n_pcs):
     return tran_x
 
 
-def pca(x, n_pcs, svd_solver='auto', random_state=0):
+def pca(x, n_pcs, svd_solver='auto', random_state=0, dtype='float32'):
     """
     Principal component analysis.
 
@@ -81,16 +81,10 @@ def pca(x, n_pcs, svd_solver='auto', random_state=0):
             )
             svd_solver = 'arpack'
         if x.dtype.char not in "fFdD":
-            x = x.astype(np.float32)
-            logger.info('exp_matrix dType is changed to float32')
+            x = x.astype(dtype)
+            logger.info(f'exp_matrix dType is not float, it is changed to {dtype}')
         output = _pca_with_sparse(x, n_pcs, solver=svd_solver, random_state=random_state)
-        # this is just a wrapper for the results
-        # pca_ = PCA(n_components=n_pcs, svd_solver=svd_solver)
-        # pca_.components_ = output['components']
-        # pca_.explained_variance_ = output['variance']
-        # pca_.explained_variance_ratio_ = output['variance_ratio']
-        # return dict([('x_pca', output['X_pca']), ('variance', output['variance']), ('variance_ratio', output['variance_ratio']), ('pcs', pca_.components_.T)]) # noqa
-        return dict(
+        result = dict(
             [('x_pca', output['X_pca']), ('variance', output['variance']), ('variance_ratio', output['variance_ratio']),
              ('pcs', output['components'].T)])
     else:
@@ -99,7 +93,13 @@ def pca(x, n_pcs, svd_solver='auto', random_state=0):
         variance = pca_obj.explained_variance_
         variance_ratio = pca_obj.explained_variance_ratio_
         pcs = pca_obj.components_.T
-        return dict([('x_pca', x_pca), ('variance', variance), ('variance_ratio', variance_ratio), ('pcs', pcs)])
+        result = dict([('x_pca', x_pca), ('variance', variance), ('variance_ratio', variance_ratio), ('pcs', pcs)])
+
+    if result['x_pca'].dtype.descr != np.dtype(dtype).descr:
+        logger.info(f'x_pca dType is changed from {result["x_pca"].dtype} to {dtype}')
+        result['x_pca'] = result['x_pca'].astype(dtype)
+
+    return result
 
 
 def _pca_with_sparse(X, n_pcs, solver='arpack', mu=None, random_state=None):

diff --git a/stereo/algorithm/dpt/struct.py b/stereo/algorithm/dpt/struct.py
@@ -10,6 +10,7 @@
 )
 
 import numpy as np
+import pandas as pd
 import scipy
 from scipy.sparse import (
     issparse,
@@ -211,10 +212,13 @@ def _backwards_compat_get_full_eval(stereo_exp_data):
 
 
 def _backwards_compat_get_full_X_diffmap(stereo_exp_data) -> np.ndarray:
+    X_diffmap = stereo_exp_data.tl.result['X_diffmap']
+    if isinstance(X_diffmap, pd.DataFrame):
+        X_diffmap = X_diffmap.to_numpy()
     if 'X_diffmap0' in stereo_exp_data.tl.result:
-        return np.c_[stereo_exp_data.tl.result['X_diffmap0'].values[:, None], stereo_exp_data.tl.result['X_diffmap']]
+        return np.c_[stereo_exp_data.tl.result['X_diffmap0'].values[:, None], X_diffmap]
     else:
-        return stereo_exp_data.tl.result['X_diffmap']
+        return X_diffmap
 
 
 def _get_indices_distances_from_dense_matrix(D, n_neighbors: int):

diff --git a/stereo/common.py b/stereo/common.py
@@ -8,4 +8,4 @@
 """
 
 # version
-version = '1.3.1'
+version = '1.4.0'
diff --git a/stereo/core/cell.py b/stereo/core/cell.py
@@ -69,6 +69,14 @@ def __getitem__(self, key):
     def __len__(self):
         return self.size
 
+    @property
+    def matrix(self):
+        return self._matrix
+
+    @property
+    def pairwise(self):
+        return self._pairwise
+
     @property
     def size(self):
         return self._obs.index.size
@@ -185,15 +193,6 @@ def sub_set(self, index):
 
         if self.cell_border is not None:
             self.cell_border = self.cell_border[index]
-        # if isinstance(index, list) or isinstance(index, slice):
-        #     self._obs = self._obs.iloc[index].copy()
-        # elif isinstance(index, np.ndarray):
-        #     if index.dtype == bool:
-        #         self._obs = self._obs[index].copy()
-        #     else:
-        #         self._obs = self._obs.iloc[index].copy()
-        # else:
-        #     self._obs = self._obs.iloc[index].copy()
         if isinstance(index, pd.Series):
             index = index.to_numpy()
         self._obs = self._obs.iloc[index].copy()
@@ -304,6 +303,14 @@ def __contains__(self, item):
     def _obs(self):
         return self.__based_ann_data.obs
 
+    @property
+    def matrix(self):
+        return self.__based_ann_data.obsm
+
+    @property
+    def pairwise(self):
+        return self.__based_ann_data.obsp
+
     # @property
     # def loc(self):
     #     return self.__based_ann_data.obs.loc

diff --git a/stereo/core/gene.py b/stereo/core/gene.py
@@ -13,7 +13,9 @@
 import numpy as np
 import pandas as pd
 from anndata import AnnData
+from scipy.sparse import spmatrix
 
+from stereo.log_manager import logger
 
 class Gene(object):
     def __init__(
@@ -54,6 +56,14 @@ def __getitem__(self, key):
     def __len__(self):
         return self.size
 
+    @property
+    def matrix(self):
+        return self._matrix
+
+    @property
+    def pairwise(self):
+        return self._pairwise
+
     @property
     def size(self):
         return self.gene_name.size
@@ -148,21 +158,44 @@ def sub_set(self, index):
         :param index: a numpy array of index info.
         :return: the subset of Gene object.
         """
-        # if isinstance(index, list) or isinstance(index, slice):
-        #     self._var = self._var.iloc[index].copy()
-        # elif isinstance(index, np.ndarray):
-        #     if index.dtype == bool:
-        #         self._var = self._var[index].copy()
-        #     else:
-        #         self._var = self._var.iloc[index].copy()
-        # else:
-        #     self._var = self._var.iloc[index].copy()
         if isinstance(index, pd.Series):
             index = index.to_numpy()
         self._var = self._var.iloc[index].copy()
         for col in self._var.columns:
             if self._var[col].dtype.name == 'category':
                 self._var[col] = self._var[col].cat.remove_unused_categories()
+
+        for key, value in self._matrix.items():
+            if isinstance(value, pd.DataFrame):
+                self._matrix[key] = value.iloc[index].copy()
+                self._matrix[key].reset_index(drop=True, inplace=True)
+            elif isinstance(value, (np.ndarray, spmatrix)):
+                self._matrix[key] = value[index]
+            else:
+                logger.warning(f'Subsetting from {key} of type {type(value)} in gene.matrix is not supported.')
+
+        for key, value in self._pairwise.items():
+            if isinstance(value, pd.DataFrame):
+                columns = value.columns[index]
+                self._pairwise[key] = value.iloc[index][columns].copy()
+                self._pairwise[key].reset_index(drop=True, inplace=True)
+            elif isinstance(value, (np.ndarray, spmatrix)):
+                if len(value.shape) != 2:
+                    logger.warning(f'Subsetting from {key} of shape {value.shape} in gene.pairwise is not supported.')
+                    continue
+                self._pairwise[key] = value[index][:, index]
+            elif isinstance(value, dict):
+                for k, v in value.items():
+                    if isinstance(v, pd.DataFrame):
+                        columns = v.columns[index]
+                        self._pairwise[key][k] = v.iloc[index][columns].copy()
+                        self._pairwise[key][k].reset_index(drop=True, inplace=True)
+                    elif isinstance(v, (np.ndarray, spmatrix)):
+                        self._pairwise[key][k] = v[index][:, index]
+                    else:
+                        logger.warning(f'Subsetting from {key}.{k} of type {type(v)} in gene.pairwise is not supported.')
+            else:
+                logger.warning(f'Subsetting from {key} of type {type(value)} in gene.pairwise is not supported.')
         return self
 
     def to_df(self, copy=False):
@@ -210,6 +243,14 @@ def __contains__(self, item):
     def _var(self):
         return self.__based_ann_data.var
 
+    @property
+    def matrix(self):
+        return self.__based_ann_data.varm
+
+    @property
+    def pairwise(self):
+        return self.__based_ann_data.varp
+
     # @property
     # def loc(self):
     #     return self.__based_ann_data.var.loc