Merge pull request #2 from earmingol/dev

Update to v0.3.0
earmingol · Oct 21, 2024 · 922ee26 · 922ee26
2 parents 7ab1902 + 728952f
commit 922ee26
Show file tree

Hide file tree

Showing 44 changed files with 4,111 additions and 3,155 deletions.
diff --git a/README.rst b/README.rst
@@ -13,7 +13,6 @@
    :target: https://pepy.tech/project/sccellfie
 
 
-
 Metabolic activity from single-cell and spatial transcriptomics with scCellFie
 -----------------------------------------------------------------------------------------
 
@@ -54,6 +53,53 @@ Features
 
 - **Organisms:** Metabolic database and analysis available for human and mouse.
 
+Quick start
+-----------
+A quick example of how to use scCellFie with a single-cell dataset and generate results::
+
+        import sccellfie
+        import scanpy as sc
+
+        # Load the dataset
+        adata = sc.read(filename='BALF-COVID19.h5ad',
+                        backup_url='https://zenodo.org/record/7535867/files/BALF-COVID19-Liao_et_al-NatMed-2020.h5ad')
+
+        # Run one-command scCellFie pipeline
+        results = sccellfie.run_sccellfie_pipeline(adata,
+                                                   organism='human',
+                                                   sccellfie_data_folder=None,
+                                                   n_counts_col='n_counts',
+                                                   process_by_group=False,
+                                                   groupby=None,
+                                                   neighbors_key='neighbors',
+                                                   n_neighbors=10,
+                                                   batch_key='sample',
+                                                   threshold_key='sccellfie_threshold',
+                                                   smooth_cells=True,
+                                                   alpha=0.33,
+                                                   chunk_size=5000,
+                                                   disable_pbar=False,
+                                                   save_folder=None,
+                                                   save_filename=None
+                                                  )
+
+To access metabolic activities, we need to inspect ``results['adata']``:
+
+- The processed single-cell data is located in the AnnData object ``results['adata']``.
+- The reaction activities for each cell are located in the AnnData object ``results['adata'].reactions``.
+- The metabolic task activities for each cell are located in the AnnData object ``results['adata'].metabolic_tasks``.
+
+In particular:
+
+- ``results['adata']``: contains gene expression in ``.X``.
+- ``results['adata'].layers['gene_scores']``: contains gene scores as in the original CellFie paper.
+- ``results['adata'].uns['Rxn-Max-Genes']``: contains determinant genes for each reaction per cell.
+- ``results['adata'].reactions``: contains reaction scores in ``.X`` so every scanpy function can be used on this object to visualize or compare values.
+- ``results['adata'].metabolic_tasks``: contains metabolic task scores in ``.X`` so every scanpy function can be used on this object to visualize or compare values.
+
+Other keys in the ``results`` dictionary are associated with the scCellFie database and are already filtered for the elements present
+in the dataset (``'gpr_rules'``, ``'task_by_gene'``, ``'rxn_by_gene'``, ``'task_by_rxn'``, ``'rxn_info'``, ``'task_info'``, ``'thresholds'``, ``'organism'``).
+
 How to cite
 -----------
 

diff --git a/sccellfie/__init__.py b/sccellfie/__init__.py
@@ -9,5 +9,6 @@
 from . import spatial
 from . import stats as tl
 from .expression import aggregation, smoothing, thresholds
+from .sccellfie_pipeline import run_sccellfie_pipeline
 
-__version__ = "0.2.3"
+__version__ = "0.3.0"
diff --git a/sccellfie/datasets/__init__.py b/sccellfie/datasets/__init__.py
@@ -1 +1,2 @@
-from .gene_info import (retrieve_ensembl2symbol_data)
+from .gene_info import (retrieve_ensembl2symbol_data)
+from .database import (load_sccellfie_database)
diff --git a/sccellfie/datasets/database.py b/sccellfie/datasets/database.py
@@ -0,0 +1,118 @@
+import pandas as pd
+import os
+
+
+def load_sccellfie_database(organism='human', task_folder=None, rxn_info_filename=None, task_info_filename=None,
+                            task_by_rxn_filename=None, task_by_gene_filename=None, rxn_by_gene_filename=None,
+                            thresholds_filename=None):
+    """
+    Loads files of the metabolic task database from either a local folder, individual file paths, or predefined URLs.
+
+    Parameters
+    ----------
+    organism : str, optional (default: 'human')
+        The organism to retrieve data for. Choose 'human' or 'mouse' (case-insensitive). It selects the
+        default file names and, when no local folder is provided, the default URLs.
+
+    task_folder : str, optional (default: None)
+        The local folder path containing CellFie data files. If provided, this takes priority and the
+        individual file paths below are ignored.
+
+    rxn_info_filename : str, optional (default: None)
+        Full path for reaction information JSON file.
+
+    task_info_filename : str, optional (default: None)
+        Full path for task information CSV file.
+
+    task_by_rxn_filename : str, optional (default: None)
+        Full path for task by reaction CSV file.
+
+    task_by_gene_filename : str, optional (default: None)
+        Full path for task by gene CSV file.
+
+    rxn_by_gene_filename : str, optional (default: None)
+        Full path for reaction by gene CSV file.
+
+    thresholds_filename : str, optional (default: None)
+        Full path for thresholds CSV file.
+
+    Returns
+    -------
+    data : dict
+        A dictionary containing the loaded data frames and information.
+        Keys are 'rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene',
+        'thresholds', and 'organism' (stored in lowercase). A file that fails to load is
+        reported with a printed message and stored as None.
+        Examples of dataframes can be found at https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/
+
+    Raises
+    ------
+    ValueError
+        If `organism` is neither 'human' nor 'mouse'.
+    """
+    # Define default URLs for human and mouse data
+    default_urls = {
+        'human': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/',
+        'mouse': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/mus_musculus/'
+    }
+
+    # Define default file names; only the reaction information file differs between organisms
+    shared_file_names = {
+        'task_info': 'Task-Info.csv',
+        'task_by_rxn': 'Task_by_Rxn.csv',
+        'task_by_gene': 'Task_by_Gene.csv',
+        'rxn_by_gene': 'Rxn_by_Gene.csv',
+        'thresholds': 'Thresholds.csv'
+    }
+    default_file_names = {
+        'human': {'rxn_info': 'Rxn-Info-Recon2-2.json', **shared_file_names},
+        'mouse': {'rxn_info': 'Rxn-Info-iMM1415.json', **shared_file_names}
+    }
+
+    # Validate the organism once (case-insensitive) so the folder and URL branches fail the same way
+    organism_key = str(organism).lower()
+    if organism_key not in default_file_names:
+        raise ValueError(f"Invalid organism: {organism!r}. Choose 'human' or 'mouse'.")
+    file_names = default_file_names[organism_key]
+
+    # Determine the location of each file
+    if task_folder:
+        # A local folder takes priority over individual file paths
+        file_paths = {key: os.path.join(task_folder, name) for key, name in file_names.items()}
+    else:
+        # Default URLs end with '/', so strip it to avoid a double slash when joining
+        base_url = default_urls[organism_key].rstrip('/')
+        custom_paths = {
+            'rxn_info': rxn_info_filename,
+            'task_info': task_info_filename,
+            'task_by_rxn': task_by_rxn_filename,
+            'task_by_gene': task_by_gene_filename,
+            'rxn_by_gene': rxn_by_gene_filename,
+            'thresholds': thresholds_filename
+        }
+        file_paths = {key: custom_paths[key] or f"{base_url}/{name}" for key, name in file_names.items()}
+
+    # Function to load a file
+    def load_file(file_key, index_col=None):
+        full_path = file_paths[file_key]
+        try:
+            # Accept str or os.PathLike and match the extension regardless of case
+            lowered_path = os.fspath(full_path).lower()
+            if lowered_path.endswith('.json'):
+                return pd.read_json(full_path)
+            elif lowered_path.endswith('.csv'):
+                return pd.read_csv(full_path, index_col=index_col)
+            else:
+                raise ValueError(f"Unsupported file format: {full_path}")
+        except Exception as e:
+            # Deliberate best effort: report the failure and keep loading the remaining files
+            print(f"Error loading {full_path}: {str(e)}")
+            return None
+
+    # Load all files, indexing each table by its identifier column when it has one
+    index_cols = {'rxn_info': None, 'task_info': None, 'task_by_rxn': 'Task', 'task_by_gene': 'Task',
+                  'rxn_by_gene': 'Reaction', 'thresholds': 'symbol'}
+    data = {key: load_file(key, index_col=index_col) for key, index_col in index_cols.items()}
+    # Report the organism under its normalized (lowercase) name
+    data['organism'] = organism_key
+    return data
diff --git a/sccellfie/datasets/gene_info.py b/sccellfie/datasets/gene_info.py
@@ -7,7 +7,7 @@ def retrieve_ensembl2symbol_data(filename=None, organism='human'):
 
     Parameters
     ----------
-    filename : str, optional
+    filename : str, optional (default: None)
         The file path to a custom CSV file containing Ensembl IDs and gene symbols.
 
     organism : str, optional (default: 'human')

diff --git a/sccellfie/datasets/tests/test_database.py b/sccellfie/datasets/tests/test_database.py
@@ -0,0 +1,116 @@
+import pytest
+import os
+import tempfile
+import pandas as pd
+
+from unittest.mock import patch
+
+from sccellfie.datasets.database import load_sccellfie_database
+
+# Mock data for testing
+mock_json_data = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
+mock_csv_data = pd.DataFrame({'col1': [4, 5, 6], 'col2': ['d', 'e', 'f']})
+
+
+@pytest.fixture
+def mock_read_json(monkeypatch):
+    # Swap pandas.read_json for a stub that always returns the mock JSON table
+    def fake_read_json(path):
+        return mock_json_data
+    monkeypatch.setattr(pd, 'read_json', fake_read_json)
+
+
+@pytest.fixture
+def mock_read_csv(monkeypatch):
+    # Swap pandas.read_csv for a stub that always returns the mock CSV table
+    def fake_read_csv(path, index_col=None):
+        return mock_csv_data
+    monkeypatch.setattr(pd, 'read_csv', fake_read_csv)
+
+
+def test_load_sccellfie_database_default_urls(mock_read_json, mock_read_csv):
+    # Both pandas readers are replaced by the fixtures, so the mock tables are returned
+    expected_keys = ['rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds']
+
+    data = load_sccellfie_database(organism='human')
+    assert isinstance(data, dict)
+    for key in expected_keys:
+        assert key in data
+    assert data['organism'] == 'human'
+    # The JSON file goes through read_json and the CSV files through read_csv
+    assert data['rxn_info'].equals(mock_json_data)
+    assert data['task_info'].equals(mock_csv_data)
+
+
+def test_load_sccellfie_database_local_folder():
+    # Each CSV carries the column the loader uses as index; empty tables lack those columns, so the
+    # loader would return None for them and the test would pass without checking that anything loaded.
+    index_cols = {'Task-Info.csv': 'Task', 'Task_by_Rxn.csv': 'Task', 'Task_by_Gene.csv': 'Task',
+                  'Rxn_by_Gene.csv': 'Reaction', 'Thresholds.csv': 'symbol'}
+    keys = ['rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds']
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Create mock files
+        pd.DataFrame({'col1': [1, 2]}).to_json(os.path.join(tmpdirname, 'Rxn-Info-Recon2-2.json'))
+        for fname, col in index_cols.items():
+            pd.DataFrame({col: ['a', 'b'], 'value': [1, 2]}).to_csv(os.path.join(tmpdirname, fname), index=False)
+
+        data = load_sccellfie_database(organism='human', task_folder=tmpdirname)
+        assert isinstance(data, dict)
+        for key in keys:
+            assert isinstance(data[key], pd.DataFrame), f"{key} was not loaded"
+        assert list(data['task_by_rxn'].index) == ['a', 'b']
+        assert list(data['thresholds'].index) == ['a', 'b']
+        assert data['organism'] == 'human'
+
+
+def test_load_sccellfie_database_individual_files():
+    # Tables include the loader's index columns so a real load is distinguishable from a None fallback
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Create mock files with unique names
+        rxn_info_path = os.path.join(tmpdirname, 'custom_rxn_info.json')
+        task_info_path = os.path.join(tmpdirname, 'custom_task_info.csv')
+        task_by_rxn_path = os.path.join(tmpdirname, 'custom_task_by_rxn.csv')
+        task_by_gene_path = os.path.join(tmpdirname, 'custom_task_by_gene.csv')
+        rxn_by_gene_path = os.path.join(tmpdirname, 'custom_rxn_by_gene.csv')
+        thresholds_path = os.path.join(tmpdirname, 'custom_thresholds.csv')
+
+        pd.DataFrame({'col1': [1, 2]}).to_json(rxn_info_path)
+        pd.DataFrame({'Task': ['t1'], 'value': [1]}).to_csv(task_info_path, index=False)
+        pd.DataFrame({'Task': ['t1'], 'value': [1]}).to_csv(task_by_rxn_path, index=False)
+        pd.DataFrame({'Task': ['t1'], 'value': [1]}).to_csv(task_by_gene_path, index=False)
+        pd.DataFrame({'Reaction': ['r1'], 'value': [1]}).to_csv(rxn_by_gene_path, index=False)
+        pd.DataFrame({'symbol': ['g1'], 'value': [1]}).to_csv(thresholds_path, index=False)
+
+        data = load_sccellfie_database(
+            organism='human',
+            rxn_info_filename=rxn_info_path,
+            task_info_filename=task_info_path,
+            task_by_rxn_filename=task_by_rxn_path,
+            task_by_gene_filename=task_by_gene_path,
+            rxn_by_gene_filename=rxn_by_gene_path,
+            thresholds_filename=thresholds_path
+        )
+        assert isinstance(data, dict)
+        for key in ['rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds']:
+            assert isinstance(data[key], pd.DataFrame), f"{key} was not loaded"
+        assert list(data['task_by_rxn'].index) == ['t1']
+        assert list(data['rxn_by_gene'].index) == ['r1']
+        assert list(data['thresholds'].index) == ['g1']
+        assert data['organism'] == 'human'
+
+
+def test_load_sccellfie_database_invalid_organism():
+    # An unknown organism with no local folder is expected to raise ValueError
+    pytest.raises(ValueError, load_sccellfie_database, organism='invalid')
+
+
+@patch('pandas.read_json')
+@patch('pandas.read_csv')
+def test_load_sccellfie_database_file_error(mock_read_csv, mock_read_json):
+    # Every read fails; the loader is expected to store None for each table instead of raising
+    mock_read_csv.side_effect = Exception("Mock CSV read error")
+    mock_read_json.side_effect = Exception("Mock JSON read error")
+    data = load_sccellfie_database(organism='human')
+    assert isinstance(data, dict)
+    assert not [key for key, value in data.items() if key != 'organism' and value is not None]
+    assert data['organism'] == 'human'
diff --git a/sccellfie/tests/toy_inputs.py → sccellfie/datasets/toy_inputs.py b/sccellfie/tests/toy_inputs.py → sccellfie/datasets/toy_inputs.py
diff --git a/sccellfie/expression/smoothing.py b/sccellfie/expression/smoothing.py
@@ -6,7 +6,7 @@
 
 def get_smoothing_matrix(adata, mode, neighbors_key='neighbors'):
     """
-    Calculate the smoothing matrix S based on the nearest neighbor graph in adata.obsp.
+    Calculates the smoothing matrix S based on the nearest neighbor graph in adata.obsp.
 
     Parameters
     ----------
@@ -58,7 +58,7 @@ def get_smoothing_matrix(adata, mode, neighbors_key='neighbors'):
 def smooth_expression_knn(adata, key_added='smoothed_X', neighbors_key='neighbors', mode='connectivity', alpha=0.33,
                           n_chunks=None, chunk_size=None, use_raw=False, disable_pbar=False):
     """
-    Smooth expression values based on KNNs of single cells using Scanpy.
+    Smooths expression values based on KNNs of single cells using Scanpy.
 
     Parameters
     ----------
@@ -134,7 +134,7 @@ def smooth_expression_knn(adata, key_added='smoothed_X', neighbors_key='neighbor
     smoothed_matrix = np.zeros(X.shape)
 
     # Iterate over chunks of cells
-    for i in tqdm(range(n_chunks), disable=disable_pbar):
+    for i in tqdm(range(n_chunks), disable=disable_pbar, desc='Smoothing Expression'):
         start_idx = i * chunk_size
         end_idx = min((i + 1) * chunk_size, n_cells)
 

diff --git a/sccellfie/expression/tests/test_aggregation.py b/sccellfie/expression/tests/test_aggregation.py
@@ -4,7 +4,7 @@
 import scipy.sparse as sparse
 
 from sccellfie.expression.aggregation import agg_expression_cells, top_mean
-from sccellfie.tests.toy_inputs import create_random_adata, create_controlled_adata
+from sccellfie.datasets.toy_inputs import create_random_adata, create_controlled_adata
 
 
 @pytest.mark.parametrize("use_raw", [False, True])

diff --git a/sccellfie/expression/tests/test_smoothing.py b/sccellfie/expression/tests/test_smoothing.py
@@ -3,7 +3,7 @@
 
 from scipy.sparse import csr_matrix
 from sccellfie.expression.smoothing import get_smoothing_matrix, smooth_expression_knn
-from sccellfie.tests.toy_inputs import create_controlled_adata
+from sccellfie.datasets.toy_inputs import create_controlled_adata
 
 def test_get_smoothing_matrix():
     # Create a controlled adata object with known connectivities

diff --git a/sccellfie/expression/tests/test_threshold.py b/sccellfie/expression/tests/test_threshold.py
@@ -3,7 +3,7 @@
 
 from pandas.testing import assert_frame_equal
 from sccellfie.expression.thresholds import get_local_mean_threshold, get_global_mean_threshold, get_local_percentile_threshold, get_global_percentile_threshold, get_local_trimean_threshold, get_global_trimean_threshold, set_manual_threshold
-from sccellfie.tests.toy_inputs import create_controlled_adata
+from sccellfie.datasets.toy_inputs import create_controlled_adata
 
 
 @pytest.mark.parametrize("use_raw, lower_bound, upper_bound, exclude_zeros",