Skip to content

Commit

Permalink
feat: select columns for automatic analysis
Browse files: browse the repository at this point in the history
  • Loading branch information
neindochoh committed Aug 7, 2023
1 parent 0a2132b commit a074381
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 21 deletions.
6 changes: 4 additions & 2 deletions renumics/spotlight/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
importlib.import_module(analyzers_namespace.__name__ + "." + module_info.name)


def find_issues(data_source: DataSource, dtypes: ColumnTypeMapping) -> List[DataIssue]:
def find_issues(
data_source: DataSource, columns: List[str], dtypes: ColumnTypeMapping
) -> List[DataIssue]:
"""
Find dataset issues in the data source
"""
Expand All @@ -28,7 +30,7 @@ def find_issues(data_source: DataSource, dtypes: ColumnTypeMapping) -> List[Data

issues: List[DataIssue] = []
for analyze in registered_analyzers:
issues.extend(analyze(data_source, dtypes))
issues.extend(analyze(data_source, columns, dtypes))

logger.info("Analysis done.")

Expand Down
6 changes: 3 additions & 3 deletions renumics/spotlight/analysis/analyzers/cleanlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

import inspect
from typing import Iterable
from typing import Iterable, List

import numpy as np
import cleanlab.outlier
Expand All @@ -19,13 +19,13 @@

@data_analyzer
def analyze_with_cleanlab(
data_source: DataSource, dtypes: ColumnTypeMapping
data_source: DataSource, columns: List[str], dtypes: ColumnTypeMapping
) -> Iterable[DataIssue]:
"""
Find (embedding) outliers with cleanlab
"""

embedding_columns = (col for col, dtype in dtypes.items() if dtype == Embedding)
embedding_columns = (col for col in columns if dtypes.get(col) == Embedding)
for column_name in embedding_columns:
col_values = data_source.get_column(column_name, dtypes[column_name]).values
embeddings = np.array(col_values, dtype=object)
Expand Down
4 changes: 2 additions & 2 deletions renumics/spotlight/analysis/analyzers/cleanvision.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def _get_cell_data_safe(

@data_analyzer
def analyze_with_cleanvision(
data_source: DataSource, dtypes: ColumnTypeMapping
data_source: DataSource, columns: List[str], dtypes: ColumnTypeMapping
) -> Iterable[DataIssue]:
"""
find image issues using cleanvision
"""

image_columns = [col for col, dtype in dtypes.items() if dtype == Image]
image_columns = [col for col in columns if dtypes.get(col) == Image]

for column_name in image_columns:
# load image data from data source
Expand Down
2 changes: 1 addition & 1 deletion renumics/spotlight/analysis/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ class DataIssue(BaseModel):
description: str = ""


DataAnalyzer = Callable[[DataSource, ColumnTypeMapping], Iterable[DataIssue]]
DataAnalyzer = Callable[[DataSource, List[str], ColumnTypeMapping], Iterable[DataIssue]]
17 changes: 12 additions & 5 deletions renumics/spotlight/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class SpotlightApp(FastAPI):
# data issues
issues: Optional[List[DataIssue]] = []
_custom_issues: List[DataIssue] = []
analyze_issues: bool = True
analyze_columns: Union[List[str], bool] = False

def __init__(self) -> None:
super().__init__()
Expand All @@ -112,7 +112,7 @@ def __init__(self) -> None:
self.vite_url = None
self.username = ""
self.filebrowsing_allowed = False
self.analyze_issues = False
self.analyze_columns = False
self.issues = None
self._custom_issues = []

Expand Down Expand Up @@ -303,7 +303,7 @@ def update(self, config: AppConfig) -> None:
if config.dtypes is not None:
self._user_dtypes = config.dtypes
if config.analyze is not None:
self.analyze_issues = config.analyze
self.analyze_columns = config.analyze
if config.custom_issues is not None:
self.custom_issues = config.custom_issues
if config.dataset is not None:
Expand Down Expand Up @@ -407,7 +407,8 @@ def _update_issues(self) -> None:
Update issues and notify client about.
"""

if not self.analyze_issues:
# early out for [] and False
if not self.analyze_columns:
self.issues = []
self._broadcast(IssuesUpdatedMessage())
return
Expand All @@ -417,8 +418,14 @@ def _update_issues(self) -> None:
self._broadcast(IssuesUpdatedMessage())
if table is None:
return

if self.analyze_columns is True:
columns = table.column_names
else:
columns = self.analyze_columns

task = self.task_manager.create_task(
find_issues, (table, self._dtypes), name="update_issues"
find_issues, (table, columns, self._dtypes), name="update_issues"
)

def _on_issues_ready(future: Future) -> None:
Expand Down
2 changes: 1 addition & 1 deletion renumics/spotlight/app_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class AppConfig:
project_root: Optional[Path] = None

# data analysis
analyze: Optional[bool] = None
analyze: Optional[Union[bool, List[str]]] = None
custom_issues: Optional[List[DataIssue]] = None

# frontend
Expand Down
16 changes: 11 additions & 5 deletions renumics/spotlight/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import platform
import signal
import sys
from typing import Optional, Tuple, Union
from typing import Optional, Tuple, Union, List
from pathlib import Path

import click
Expand Down Expand Up @@ -95,10 +95,15 @@ def cli_dtype_callback(
help="Whether to allow users to browse and open datasets.",
)
@click.option(
"--analyze",
"--analyze-all",
is_flag=True,
default=False,
help="Automatically analyze common dataset errors.",
help="Automatically analyze issues for all columns.",
)
@click.option(
"--analyze",
default=[],
help="Automatically analyze issues for all columns.",
)
@click.option("-v", "--verbose", is_flag=True)
@click.version_option(spotlight.__version__)
Expand All @@ -110,7 +115,8 @@ def main(
dtype: Optional[ColumnTypeMapping],
no_browser: bool,
filebrowsing: bool,
analyze: bool,
analyze: List[str],
analyze_all: bool,
verbose: bool,
) -> None:
"""
Expand All @@ -134,5 +140,5 @@ def main(
no_browser=no_browser,
allow_filebrowsing=filebrowsing,
wait="forever",
analyze=analyze,
analyze=True if analyze_all else analyze,
)
4 changes: 2 additions & 2 deletions renumics/spotlight/viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def show(
allow_filebrowsing: Union[bool, Literal["auto"]] = "auto",
wait: Union[bool, Literal["auto", "forever"]] = "auto",
dtype: Optional[ColumnTypeMapping] = None,
analyze: Optional[bool] = None,
analyze: Optional[Union[bool, List[str]]] = None,
issues: Optional[Collection[DataIssue]] = None,
) -> None:
"""
Expand Down Expand Up @@ -320,7 +320,7 @@ def show(
allow_filebrowsing: Union[bool, Literal["auto"]] = "auto",
wait: Union[bool, Literal["auto", "forever"]] = "auto",
dtype: Optional[ColumnTypeMapping] = None,
analyze: Optional[bool] = None,
analyze: Optional[Union[bool, List[str]]] = None,
issues: Optional[Collection[DataIssue]] = None,
) -> Viewer:
"""
Expand Down

0 comments on commit a074381

Please sign in to comment.