Skip to content

Commit

Permalink
Merge pull request #2 from earmingol/dev
Browse files Browse the repository at this point in the history
Update to v0.3.0
  • Loading branch information
earmingol authored Oct 21, 2024
2 parents 7ab1902 + 728952f commit 922ee26
Show file tree
Hide file tree
Showing 44 changed files with 4,111 additions and 3,155 deletions.
48 changes: 47 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
:target: https://pepy.tech/project/sccellfie



Metabolic activity from single-cell and spatial transcriptomics with scCellFie
-----------------------------------------------------------------------------------------

Expand Down Expand Up @@ -54,6 +53,53 @@ Features

- **Organisms:** Metabolic database and analysis available for human and mouse.

Quick start
-----------
A quick example of how to use scCellFie with a single-cell dataset and generate results::

import sccellfie
import scanpy as sc

# Load the dataset
adata = sc.read(filename='BALF-COVID19.h5ad',
backup_url='https://zenodo.org/record/7535867/files/BALF-COVID19-Liao_et_al-NatMed-2020.h5ad')

# Run one-command scCellFie pipeline
results = sccellfie.run_sccellfie_pipeline(adata,
organism='human',
sccellfie_data_folder=None,
n_counts_col='n_counts',
process_by_group=False,
groupby=None,
neighbors_key='neighbors',
n_neighbors=10,
batch_key='sample',
threshold_key='sccellfie_threshold',
smooth_cells=True,
alpha=0.33,
chunk_size=5000,
disable_pbar=False,
save_folder=None,
save_filename=None
)

To access metabolic activities, we need to inspect `results['adata']`:

- The processed single-cell data is located in the AnnData object `results['adata']`.
- The reaction activities for each cell are located in the AnnData object `results['adata'].reactions`.
- The metabolic task activities for each cell are located in the AnnData object `results['adata'].metabolic_tasks`.

In particular:

- `results['adata']`: contains gene expression in `.X`.
- `results['adata'].layers['gene_scores']`: contains gene scores as in the original CellFie paper.
- `results['adata'].uns['Rxn-Max-Genes']`: contains determinant genes for each reaction per cell.
- `results['adata'].reactions`: contains reaction scores in `.X` so every scanpy function can be used on this object to visualize or compare values.
- `results['adata'].metabolic_tasks`: contains metabolic task scores in `.X` so every scanpy function can be used on this object to visualize or compare values.

Other keys in the `results` dictionary are associated with the scCellFie database and are already filtered for the elements present
in the dataset (`'gpr_rules'`, `'task_by_gene'`, `'rxn_by_gene'`, `'task_by_rxn'`, `'rxn_info'`, `'task_info'`, `'thresholds'`, `'organism'`).

How to cite
-----------

Expand Down
3 changes: 2 additions & 1 deletion sccellfie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@
from . import spatial
from . import stats as tl
from .expression import aggregation, smoothing, thresholds
from .sccellfie_pipeline import run_sccellfie_pipeline

__version__ = "0.2.3"
__version__ = "0.3.0"
3 changes: 2 additions & 1 deletion sccellfie/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .gene_info import (retrieve_ensembl2symbol_data)
from .gene_info import (retrieve_ensembl2symbol_data)
from .database import (load_sccellfie_database)
118 changes: 118 additions & 0 deletions sccellfie/datasets/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pandas as pd
import os


def load_sccellfie_database(organism='human', task_folder=None, rxn_info_filename=None, task_info_filename=None,
                            task_by_rxn_filename=None, task_by_gene_filename=None, rxn_by_gene_filename=None,
                            thresholds_filename=None):
    """
    Loads files of the metabolic task database from either a local folder, individual file paths, or predefined URLs.

    Parameters
    ----------
    organism : str, optional (default: 'human')
        The organism to retrieve data for. Choose 'human' or 'mouse' (case-insensitive).
        It selects the default file names and, when no folder is given, the default URL.

    task_folder : str, optional (default: None)
        The local folder path containing CellFie data files. If provided, this takes priority
        and the individual ``*_filename`` arguments are ignored.

    rxn_info_filename : str, optional (default: None)
        Full path for reaction information JSON file.

    task_info_filename : str, optional (default: None)
        Full path for task information CSV file.

    task_by_rxn_filename : str, optional (default: None)
        Full path for task by reaction CSV file.

    task_by_gene_filename : str, optional (default: None)
        Full path for task by gene CSV file.

    rxn_by_gene_filename : str, optional (default: None)
        Full path for reaction by gene CSV file.

    thresholds_filename : str, optional (default: None)
        Full path for thresholds CSV file.

    Returns
    -------
    data : dict
        A dictionary containing the loaded data frames and information.
        Keys are 'rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene',
        'thresholds', and 'organism' (stored in lower case).
        A file that cannot be loaded is reported with a printed message and stored as None.
        Examples of dataframes can be found at https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/

    Raises
    ------
    ValueError
        If `organism` is not 'human' or 'mouse'.
    """
    # Define default URLs for human and mouse data
    default_urls = {
        'human': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/',
        'mouse': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/mus_musculus/'
    }

    # Define default file names
    default_file_names = {
        'human': {
            'rxn_info': 'Rxn-Info-Recon2-2.json',
            'task_info': 'Task-Info.csv',
            'task_by_rxn': 'Task_by_Rxn.csv',
            'task_by_gene': 'Task_by_Gene.csv',
            'rxn_by_gene': 'Rxn_by_Gene.csv',
            'thresholds': 'Thresholds.csv'
        },
        'mouse': {
            'rxn_info': 'Rxn-Info-iMM1415.json',
            'task_info': 'Task-Info.csv',
            'task_by_rxn': 'Task_by_Rxn.csv',
            'task_by_gene': 'Task_by_Gene.csv',
            'rxn_by_gene': 'Rxn_by_Gene.csv',
            'thresholds': 'Thresholds.csv'
        }
    }

    # Normalize once so that every lookup below agrees on the key; previously only the
    # URL lookup was lower-cased and 'Human' failed later with a KeyError.
    organism_key = organism.lower()
    if organism_key not in default_file_names:
        raise ValueError("Invalid organism. Choose 'human' or 'mouse'.")
    file_names = default_file_names[organism_key]

    # Determine where each file is read from
    if task_folder:
        # A local folder takes priority over individual file paths and URLs
        file_paths = {key: os.path.join(task_folder, name) for key, name in file_names.items()}
    else:
        # The default URLs already end with '/'; strip it to avoid a '//' in the final URL
        base_url = default_urls[organism_key].rstrip('/')
        custom_paths = {
            'rxn_info': rxn_info_filename,
            'task_info': task_info_filename,
            'task_by_rxn': task_by_rxn_filename,
            'task_by_gene': task_by_gene_filename,
            'rxn_by_gene': rxn_by_gene_filename,
            'thresholds': thresholds_filename
        }
        file_paths = {key: custom_paths[key] or f"{base_url}/{name}" for key, name in file_names.items()}

    # Function to load a file; the reader is chosen from the file extension
    def load_file(file_key, index_col=None):
        full_path = file_paths[file_key]
        try:
            # str() lets path-like objects go through the same extension check
            path_text = str(full_path)
            if path_text.endswith('.json'):
                return pd.read_json(full_path)
            elif path_text.endswith('.csv'):
                return pd.read_csv(full_path, index_col=index_col)
            else:
                raise ValueError(f"Unsupported file format: {full_path}")
        except Exception as e:
            # Deliberate best-effort: report the problem and keep loading the other files
            print(f"Error loading {full_path}: {str(e)}")
            return None

    # Load all files
    data = {}
    data['rxn_info'] = load_file('rxn_info')
    data['task_info'] = load_file('task_info')
    data['task_by_rxn'] = load_file('task_by_rxn', index_col='Task')
    data['task_by_gene'] = load_file('task_by_gene', index_col='Task')
    data['rxn_by_gene'] = load_file('rxn_by_gene', index_col='Reaction')
    data['thresholds'] = load_file('thresholds', index_col='symbol')
    data['organism'] = organism_key
    return data
2 changes: 1 addition & 1 deletion sccellfie/datasets/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def retrieve_ensembl2symbol_data(filename=None, organism='human'):
Parameters
----------
filename : str, optional
filename : str, optional (default: None)
The file path to a custom CSV file containing Ensembl IDs and gene symbols.
organism : str, optional (default: 'human')
Expand Down
116 changes: 116 additions & 0 deletions sccellfie/datasets/tests/test_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import pytest
import os
import tempfile
import pandas as pd

from unittest.mock import patch

from sccellfie.datasets.database import load_sccellfie_database

# Canned frames handed back by the patched pandas readers in the tests below:
# one stands in for the JSON reader's result, the other for the CSV reader's.
mock_json_data = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': list('abc'),
})
mock_csv_data = pd.DataFrame({
    'col1': [4, 5, 6],
    'col2': list('def'),
})


@pytest.fixture
def mock_read_json(monkeypatch):
    """Patch ``pd.read_json`` so that it returns ``mock_json_data`` for any path."""
    monkeypatch.setattr(pd, 'read_json', lambda path: mock_json_data)


@pytest.fixture
def mock_read_csv(monkeypatch):
    """Patch ``pd.read_csv`` so that it returns ``mock_csv_data`` for any path and index column."""
    monkeypatch.setattr(pd, 'read_csv', lambda path, index_col=None: mock_csv_data)


def test_load_sccellfie_database_default_urls(mock_read_json, mock_read_csv):
    """With no folder or file paths, the loader reads through the (patched) pandas readers."""
    loaded = load_sccellfie_database(organism='human')

    assert isinstance(loaded, dict)
    expected_keys = ('rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds')
    for expected_key in expected_keys:
        assert expected_key in loaded
    assert loaded['organism'] == 'human'

    # The JSON entry comes from the patched read_json, the CSV entries from the patched read_csv
    assert loaded['rxn_info'].equals(mock_json_data)
    assert loaded['task_info'].equals(mock_csv_data)


def test_load_sccellfie_database_local_folder():
    """Files placed in ``task_folder`` under the default human file names are all loaded."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Create mock files. Each CSV carries the column the loader passes as ``index_col``;
        # an empty CSV would make the load fail and be silently stored as None.
        # (Task-Info.csv is read without an index column, so any non-empty content works.)
        pd.DataFrame({'Reaction': ['rxn1']}).to_json(os.path.join(tmpdirname, 'Rxn-Info-Recon2-2.json'))
        csv_index_columns = {
            'Task-Info.csv': 'Task',
            'Task_by_Rxn.csv': 'Task',
            'Task_by_Gene.csv': 'Task',
            'Rxn_by_Gene.csv': 'Reaction',
            'Thresholds.csv': 'symbol',
        }
        for file_name, column in csv_index_columns.items():
            pd.DataFrame({column: ['a'], 'value': [1]}).to_csv(os.path.join(tmpdirname, file_name), index=False)

        data = load_sccellfie_database(organism='human', task_folder=tmpdirname)
        assert isinstance(data, dict)
        for key in ('rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds'):
            assert key in data
            # None means the loader swallowed a read error for this file
            assert data[key] is not None
        assert data['organism'] == 'human'


def test_load_sccellfie_database_individual_files():
    """Custom per-file paths are used instead of the default URLs and are all loaded."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Create mock files with unique names
        rxn_info_path = os.path.join(tmpdirname, 'custom_rxn_info.json')
        pd.DataFrame({'Reaction': ['rxn1']}).to_json(rxn_info_path)

        # Loader keyword -> (custom file name, column the loader passes as ``index_col``).
        # An empty CSV would make the load fail and be silently stored as None.
        # (The task info file is read without an index column, so any non-empty content works.)
        csv_specs = {
            'task_info_filename': ('custom_task_info.csv', 'Task'),
            'task_by_rxn_filename': ('custom_task_by_rxn.csv', 'Task'),
            'task_by_gene_filename': ('custom_task_by_gene.csv', 'Task'),
            'rxn_by_gene_filename': ('custom_rxn_by_gene.csv', 'Reaction'),
            'thresholds_filename': ('custom_thresholds.csv', 'symbol'),
        }
        csv_paths = {}
        for keyword, (file_name, column) in csv_specs.items():
            csv_paths[keyword] = os.path.join(tmpdirname, file_name)
            pd.DataFrame({column: ['a'], 'value': [1]}).to_csv(csv_paths[keyword], index=False)

        data = load_sccellfie_database(
            organism='human',
            rxn_info_filename=rxn_info_path,
            **csv_paths
        )
        assert isinstance(data, dict)
        for key in ('rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene', 'rxn_by_gene', 'thresholds'):
            assert key in data
            # None means the loader swallowed a read error for this file
            assert data[key] is not None
        assert data['organism'] == 'human'


def test_load_sccellfie_database_invalid_organism():
    """An organism with no default URL is rejected with ValueError."""
    unsupported_organism = 'invalid'
    with pytest.raises(ValueError):
        load_sccellfie_database(organism=unsupported_organism)


@patch('pandas.read_json')
@patch('pandas.read_csv')
def test_load_sccellfie_database_file_error(mock_read_csv, mock_read_json):
    """When every pandas reader raises, each data entry is None and the organism is still returned."""
    failing_readers = (
        (mock_read_json, "Mock JSON read error"),
        (mock_read_csv, "Mock CSV read error"),
    )
    for reader, message in failing_readers:
        reader.side_effect = Exception(message)

    result = load_sccellfie_database(organism='human')

    assert isinstance(result, dict)
    loaded_entries = [entry for name, entry in result.items() if name != 'organism']
    assert all(entry is None for entry in loaded_entries)
    assert result['organism'] == 'human'
File renamed without changes.
6 changes: 3 additions & 3 deletions sccellfie/expression/smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def get_smoothing_matrix(adata, mode, neighbors_key='neighbors'):
"""
Calculate the smoothing matrix S based on the nearest neighbor graph in adata.obsp.
Calculates the smoothing matrix S based on the nearest neighbor graph in adata.obsp.
Parameters
----------
Expand Down Expand Up @@ -58,7 +58,7 @@ def get_smoothing_matrix(adata, mode, neighbors_key='neighbors'):
def smooth_expression_knn(adata, key_added='smoothed_X', neighbors_key='neighbors', mode='connectivity', alpha=0.33,
n_chunks=None, chunk_size=None, use_raw=False, disable_pbar=False):
"""
Smooth expression values based on KNNs of single cells using Scanpy.
Smooths expression values based on KNNs of single cells using Scanpy.
Parameters
----------
Expand Down Expand Up @@ -134,7 +134,7 @@ def smooth_expression_knn(adata, key_added='smoothed_X', neighbors_key='neighbor
smoothed_matrix = np.zeros(X.shape)

# Iterate over chunks of cells
for i in tqdm(range(n_chunks), disable=disable_pbar):
for i in tqdm(range(n_chunks), disable=disable_pbar, desc='Smoothing Expression'):
start_idx = i * chunk_size
end_idx = min((i + 1) * chunk_size, n_cells)

Expand Down
2 changes: 1 addition & 1 deletion sccellfie/expression/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import scipy.sparse as sparse

from sccellfie.expression.aggregation import agg_expression_cells, top_mean
from sccellfie.tests.toy_inputs import create_random_adata, create_controlled_adata
from sccellfie.datasets.toy_inputs import create_random_adata, create_controlled_adata


@pytest.mark.parametrize("use_raw", [False, True])
Expand Down
2 changes: 1 addition & 1 deletion sccellfie/expression/tests/test_smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from scipy.sparse import csr_matrix
from sccellfie.expression.smoothing import get_smoothing_matrix, smooth_expression_knn
from sccellfie.tests.toy_inputs import create_controlled_adata
from sccellfie.datasets.toy_inputs import create_controlled_adata

def test_get_smoothing_matrix():
# Create a controlled adata object with known connectivities
Expand Down
2 changes: 1 addition & 1 deletion sccellfie/expression/tests/test_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pandas.testing import assert_frame_equal
from sccellfie.expression.thresholds import get_local_mean_threshold, get_global_mean_threshold, get_local_percentile_threshold, get_global_percentile_threshold, get_local_trimean_threshold, get_global_trimean_threshold, set_manual_threshold
from sccellfie.tests.toy_inputs import create_controlled_adata
from sccellfie.datasets.toy_inputs import create_controlled_adata


@pytest.mark.parametrize("use_raw, lower_bound, upper_bound, exclude_zeros",
Expand Down
Loading

0 comments on commit 922ee26

Please sign in to comment.