
Commit 9305a6f

2-3× Speedup for BaseDataset.set_task() (#378)
* Enhance Patient class with optimized filtering methods
  - Add fast time range filtering via binary search on sorted timestamps
  - Add efficient event type filtering using pre-built index lookups
  - Reduce timestamp precision from microseconds to milliseconds
  - Set default num_workers=1 in set_task for better memory control
  - Remove unused dev flag from child MIMIC4 dataset classes
* Refactor InHospitalMortalityMIMIC4 and Readmission30DaysMIMIC4 for improved performance and clarity
* Fix bug in get_events to ensure event_type is only asserted when filters are provided
1 parent 73a2172 · commit 9305a6f

File tree: 5 files changed, +151 −96 lines changed

pyhealth/data/data.py

Lines changed: 61 additions & 17 deletions
```diff
@@ -1,7 +1,10 @@
+import operator
 from dataclasses import dataclass, field
 from datetime import datetime
+from functools import reduce
 from typing import Dict, List, Mapping, Optional, Union
 
+import numpy as np
 import polars as pl
 
 
@@ -91,7 +94,8 @@ class Patient:
 
     Attributes:
         patient_id (str): Unique patient identifier.
-        data_source (pl.DataFrame): DataFrame containing all events.
+        data_source (pl.DataFrame): DataFrame containing all events, sorted by timestamp.
+        event_type_partitions (Dict[str, pl.DataFrame]): Dictionary mapping event types to their respective DataFrame partitions.
     """
 
     def __init__(self, patient_id: str, data_source: pl.DataFrame) -> None:
@@ -104,6 +108,42 @@ def __init__(self, patient_id: str, data_source: pl.DataFrame) -> None:
         """
         self.patient_id = patient_id
         self.data_source = data_source.sort("timestamp")
+        self.event_type_partitions = self.data_source.partition_by("event_type", maintain_order=True, as_dict=True)
+
+    def _filter_by_time_range_regular(self, df: pl.DataFrame, start: Optional[datetime], end: Optional[datetime]) -> pl.DataFrame:
+        """Regular filtering by time. Time complexity: O(n)."""
+        if start is not None:
+            df = df.filter(pl.col("timestamp") >= start)
+        if end is not None:
+            df = df.filter(pl.col("timestamp") <= end)
+        return df
+
+    def _filter_by_time_range_fast(self, df: pl.DataFrame, start: Optional[datetime], end: Optional[datetime]) -> pl.DataFrame:
+        """Fast filtering by time using binary search on sorted timestamps. Time complexity: O(log n)."""
+        if start is None and end is None:
+            return df
+        df = df.filter(pl.col("timestamp").is_not_null())
+        ts_col = df["timestamp"].to_numpy()
+        start_idx = 0
+        end_idx = len(ts_col)
+        if start is not None:
+            start_idx = np.searchsorted(ts_col, start, side="left")
+        if end is not None:
+            end_idx = np.searchsorted(ts_col, end, side="right")
+        return df.slice(start_idx, end_idx - start_idx)
+
+    def _filter_by_event_type_regular(self, df: pl.DataFrame, event_type: Optional[str]) -> pl.DataFrame:
+        """Regular filtering by event type. Time complexity: O(n)."""
+        if event_type:
+            df = df.filter(pl.col("event_type") == event_type)
+        return df
+
+    def _filter_by_event_type_fast(self, df: pl.DataFrame, event_type: Optional[str]) -> pl.DataFrame:
+        """Fast filtering by event type using pre-built event type index. Time complexity: O(1)."""
+        if event_type:
+            return self.event_type_partitions.get((event_type,), df[:0])
+        else:
+            return df
 
     def get_events(
         self,
```
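The two fast paths boil down to a one-time `partition_by` for O(1) event-type lookup and `numpy.searchsorted` on the already-sorted timestamp column for O(log n) time slicing. Here is a minimal standalone sketch of the same idea on toy data (the frame and values are illustrative, not PyHealth's actual schema):

```python
from datetime import datetime

import numpy as np
import polars as pl

# Toy event frame, sorted once by timestamp (as Patient.__init__ does).
df = pl.DataFrame({
    "event_type": ["admissions", "labevents", "labevents", "labevents"],
    "timestamp": [
        datetime(2020, 1, 1),
        datetime(2020, 1, 2),
        datetime(2020, 1, 3),
        datetime(2020, 1, 4),
    ],
}).sort("timestamp")

# O(1) event-type lookup: partition once, then fetch by key.
# partition_by(..., as_dict=True) keys are tuples, hence ("labevents",).
partitions = df.partition_by("event_type", maintain_order=True, as_dict=True)
labs = partitions.get(("labevents",), df[:0])  # empty frame when the type is absent

# O(log n) time-range slice: binary search on the sorted timestamps
# replaces a full-scan df.filter(...).
ts = labs["timestamp"].to_numpy()
lo = np.searchsorted(ts, datetime(2020, 1, 2), side="left")
hi = np.searchsorted(ts, datetime(2020, 1, 3), side="right")
print(labs.slice(lo, hi - lo))  # rows with 2020-01-02 <= timestamp <= 2020-01-03
```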
```diff
@@ -129,37 +169,41 @@ def get_events(
             Union[pl.DataFrame, List[Event]]: Filtered events as a DataFrame
                 or a list of Event objects.
         """
-        df = self.data_source
-        if event_type:
-            df = df.filter(pl.col("event_type") == event_type)
-        if start:
-            df = df.filter(pl.col("timestamp") >= start)
-        if end:
-            df = df.filter(pl.col("timestamp") <= end)
+        # faster filtering (by default)
+        df = self._filter_by_event_type_fast(self.data_source, event_type)
+        df = self._filter_by_time_range_fast(df, start, end)
 
-        filters = filters or []
-        for filt in filters:
+        # regular filtering (commented out by default)
+        # df = self._filter_by_event_type_regular(self.data_source, event_type)
+        # df = self._filter_by_time_range_regular(df, start, end)
+
+        if filters:
             assert event_type is not None, "event_type must be provided if filters are provided"
+        else:
+            filters = []
+        exprs = []
+        for filt in filters:
             if not (isinstance(filt, tuple) and len(filt) == 3):
                 raise ValueError(f"Invalid filter format: {filt} (must be tuple of (attr, op, value))")
             attr, op, val = filt
             col_expr = pl.col(f"{event_type}/{attr}")
             # Build operator expression
             if op == "==":
-                expr = col_expr == val
+                exprs.append(col_expr == val)
             elif op == "!=":
-                expr = col_expr != val
+                exprs.append(col_expr != val)
             elif op == "<":
-                expr = col_expr < val
+                exprs.append(col_expr < val)
             elif op == "<=":
-                expr = col_expr <= val
+                exprs.append(col_expr <= val)
             elif op == ">":
-                expr = col_expr > val
+                exprs.append(col_expr > val)
             elif op == ">=":
-                expr = col_expr >= val
+                exprs.append(col_expr >= val)
             else:
                 raise ValueError(f"Unsupported operator: {op} in filter {filt}")
-            df = df.filter(expr)
+        if exprs:
+            df = df.filter(reduce(operator.and_, exprs))
         if return_df:
             return df
         return [Event.from_dict(d) for d in df.to_dicts()]
```
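This hunk also fixes the assertion so that `event_type` is only required when filters are actually passed. Collecting the expressions and AND-ing them with `functools.reduce` lets Polars apply one combined predicate instead of one `filter` pass per condition. A condensed sketch of the same pattern; the `OPS` dict here replaces the commit's if/elif chain and is illustrative only:

```python
import operator
from functools import reduce

import polars as pl

df = pl.DataFrame({
    "labevents/itemid": ["50912", "50971", "50912"],
    "labevents/valuenum": [1.2, 4.5, 0.9],
})

# Map operator strings to callables that build Polars expressions.
OPS = {"==": operator.eq, "!=": operator.ne, "<": operator.lt,
       "<=": operator.le, ">": operator.gt, ">=": operator.ge}

filters = [("itemid", "==", "50912"), ("valuenum", "<", 1.0)]
exprs = [OPS[op](pl.col(f"labevents/{attr}"), val) for attr, op, val in filters]

# One combined filter pass (expr1 & expr2 & ...) instead of one per expression.
print(df.filter(reduce(operator.and_, exprs)))
```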

pyhealth/datasets/base_dataset.py

Lines changed: 30 additions & 13 deletions
```diff
@@ -97,7 +97,7 @@ def __init__(
         tables: List[str],
         dataset_name: Optional[str] = None,
         config_path: Optional[str] = None,
-        dev: bool = False,  # Added dev parameter
+        dev: bool = False,
     ):
         """Initializes the BaseDataset.
 
@@ -115,7 +115,7 @@ def __init__(
         self.tables = tables
         self.dataset_name = dataset_name or self.__class__.__name__
         self.config = load_yaml_config(config_path)
-        self.dev = dev  # Store dev mode flag
+        self.dev = dev
 
         logger.info(
             f"Initializing {self.dataset_name} dataset from {self.root} (dev mode: {self.dev})"
@@ -147,6 +147,21 @@ def collected_global_event_df(self) -> pl.DataFrame:
             df = df.join(limited_patients, on="patient_id", how="inner")
 
         self._collected_global_event_df = df.collect()
+
+        # Profile the Polars collect() operation (commented out by default)
+        # self._collected_global_event_df, profile = df.profile()
+        # profile = profile.with_columns([
+        #     (pl.col("end") - pl.col("start")).alias("duration"),
+        # ])
+        # profile = profile.with_columns([
+        #     (pl.col("duration") / profile["duration"].sum() * 100).alias("percentage")
+        # ])
+        # profile = profile.sort("duration", descending=True)
+        # with pl.Config() as cfg:
+        #     cfg.set_tbl_rows(-1)
+        #     cfg.set_fmt_str_lengths(200)
+        #     print(profile)
+
         logger.info(
             f"Collected dataframe with shape: {self._collected_global_event_df.shape}"
         )
```
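For reference, the commented-out block relies on Polars' `LazyFrame.profile()`, which executes the query and returns the materialized frame together with per-node timings. A self-contained sketch of the same pattern on toy data:

```python
import polars as pl

lf = (
    pl.LazyFrame({"a": range(1_000_000)})
    .with_columns((pl.col("a") * 2).alias("b"))
    .filter(pl.col("b") % 3 == 0)
)

# profile() runs the query and returns (result, timings); the timings frame
# has node/start/end columns with times in microseconds.
result, prof = lf.profile()
prof = prof.with_columns(
    (pl.col("end") - pl.col("start")).alias("duration")
).sort("duration", descending=True)
print(prof)
```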
```diff
@@ -247,7 +262,8 @@ def load_table(self, table_name: str) -> pl.LazyFrame:
         base_columns = [
             patient_id_expr.alias("patient_id"),
             pl.lit(table_name).cast(pl.Utf8).alias("event_type"),
-            timestamp_expr.cast(pl.Datetime).alias("timestamp"),
+            # ms should be sufficient for most cases
+            timestamp_expr.cast(pl.Datetime(time_unit="ms")).alias("timestamp"),
         ]
 
         # Flatten attribute columns with event_type prefix
```
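The precision change is a one-line cast. The commit comment only says ms "should be sufficient"; note that Polars stores `Datetime` as a 64-bit integer regardless of unit, so the gain is a uniform, coarser unit across tables rather than smaller storage. A sketch (toy column, assumed parseable string format):

```python
import polars as pl

df = pl.DataFrame({"timestamp": ["2020-01-01 12:34:56.789123"]})

# Sub-millisecond detail is dropped; all tables then share one time unit.
df = df.with_columns(
    pl.col("timestamp").str.to_datetime().cast(pl.Datetime(time_unit="ms"))
)
print(df.schema)  # {'timestamp': Datetime(time_unit='ms', time_zone=None)}
```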
```diff
@@ -326,14 +342,15 @@ def default_task(self) -> Optional[BaseTask]:
         return None
 
     def set_task(
-        self, task: Optional[BaseTask] = None, num_workers: Optional[int] = None
+        self, task: Optional[BaseTask] = None, num_workers: int = 1
     ) -> SampleDataset:
         """Processes the base dataset to generate the task-specific sample dataset.
 
         Args:
             task (Optional[BaseTask]): The task to set. Uses default task if None.
-            num_workers (Optional[int]): Number of workers for parallel processing.
-                Use None to use all available cores (max 32). Use 1 for single-threaded.
+            num_workers (int): Number of workers for multi-threading. Default is 1.
+                This is because the task function is usually CPU-bound. And using
+                multi-threading may not speed up the task function.
 
         Returns:
             SampleDataset: The generated sample dataset.
@@ -351,26 +368,26 @@ def set_task(
 
         filtered_global_event_df = task.pre_filter(self.collected_global_event_df)
 
-        # Determine number of workers
-        if num_workers is None:
-            num_workers = min(8, os.cpu_count())
-
         logger.info(f"Generating samples with {num_workers} worker(s)...")
 
         samples = []
 
         if num_workers == 1:
+            # single-threading (by default)
             for patient in tqdm(
                 self.iter_patients(filtered_global_event_df),
-                desc=f"Generating samples for {task.task_name}",
+                total=filtered_global_event_df["patient_id"].n_unique(),
+                desc=f"Generating samples for {task.task_name} with 1 worker",
+                smoothing=0,
             ):
                 samples.extend(task(patient))
         else:
-            logger.info(f"Generating samples for {task.task_name}")
+            # multi-threading (not recommended)
+            logger.info(f"Generating samples for {task.task_name} with {num_workers} workers")
             patients = list(self.iter_patients(filtered_global_event_df))
             with ThreadPoolExecutor(max_workers=num_workers) as executor:
                 futures = [executor.submit(task, patient) for patient in patients]
-                for future in as_completed(futures):
+                for future in tqdm(as_completed(futures), total=len(futures), desc=f"Collecting samples for {task.task_name} from {num_workers} workers"):
                     samples.extend(future.result())
 
         sample_dataset = SampleDataset(
```
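In practice the new default means `set_task` runs single-threaded unless a caller asks otherwise. A hypothetical usage sketch; the dataset path and table list are placeholders, and the constructor keywords follow the names visible in the MIMIC4Dataset diff below:

```python
from pyhealth.datasets import MIMIC4Dataset
from pyhealth.tasks import InHospitalMortalityMIMIC4

# Placeholder root and tables, assumed keyword names.
dataset = MIMIC4Dataset(
    ehr_root="/path/to/mimic-iv",
    ehr_tables=["patients", "admissions", "labevents"],
)

# num_workers now defaults to 1: task callables are CPU-bound, so Python
# threads mostly contend for the GIL, and one worker also bounds memory.
sample_dataset = dataset.set_task(InHospitalMortalityMIMIC4())

# Threads can still be requested explicitly (not recommended per the diff):
# sample_dataset = dataset.set_task(InHospitalMortalityMIMIC4(), num_workers=4)
```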

pyhealth/datasets/mimic4.py

Lines changed: 5 additions & 6 deletions
```diff
@@ -210,7 +210,7 @@ def __init__(
         note_config_path: Optional[str] = None,
         cxr_config_path: Optional[str] = None,
         dataset_name: str = "mimic4",
-        dev: bool = False,  # Added dev parameter
+        dev: bool = False,
     ):
         log_memory_usage("Starting MIMIC4Dataset init")
 
@@ -220,8 +220,10 @@ def __init__(
         self.root = None
         self.tables = None
         self.config = None
-        self.dev = dev  # Store dev mode flag
-
+        # Dev flag is only used in the MIMIC4Dataset class
+        # to ensure the same set of patients are used for all sub-datasets.
+        self.dev = dev
+
         # We need at least one root directory
         if not any([ehr_root, note_root, cxr_root]):
             raise ValueError("At least one root directory must be provided")
@@ -238,7 +240,6 @@ def __init__(
                 root=ehr_root,
                 tables=ehr_tables,
                 config_path=ehr_config_path,
-                dev=dev  # Pass dev mode flag
             )
             log_memory_usage("After EHR dataset initialization")
 
@@ -249,7 +250,6 @@ def __init__(
                 root=note_root,
                 tables=note_tables,
                 config_path=note_config_path,
-                dev=dev  # Pass dev mode flag
             )
             log_memory_usage("After Note dataset initialization")
 
@@ -260,7 +260,6 @@
                 root=cxr_root,
                 tables=cxr_tables,
                 config_path=cxr_config_path,
-                dev=dev  # Pass dev mode flag
             )
             log_memory_usage("After CXR dataset initialization")
```

pyhealth/tasks/in_hospital_mortality_mimic4.py

Lines changed: 18 additions & 20 deletions
```diff
@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta
-from typing import Any, Dict, List, ClassVar
+from typing import Any, ClassVar, Dict, List
 
 import polars as pl
 
@@ -8,11 +8,16 @@
 
 class InHospitalMortalityMIMIC4(BaseTask):
     """Task for predicting in-hospital mortality using MIMIC-IV dataset.
-
+
+    This task leverages lab results to predict the likelihood of in-hospital
+    mortality.
+
     Attributes:
         task_name (str): The name of the task.
-        input_schema (Dict[str, str]): The input schema for the task.
-        output_schema (Dict[str, str]): The output schema for the task.
+        input_schema (Dict[str, str]): The schema for input data, which includes:
+            - labs: A timeseries of lab results.
+        output_schema (Dict[str, str]): The schema for output data, which includes:
+            - mortality: A binary indicator of mortality.
     """
     task_name: str = "InHospitalMortalityMIMIC4"
     input_schema: Dict[str, str] = {"labs": "timeseries"}
@@ -33,7 +38,7 @@ class InHospitalMortalityMIMIC4(BaseTask):
             "Phosphate": ["50970"],
         },
     }
-
+
     # Create flat list of all lab items for use in the function
     LABITEMS: ClassVar[List[str]] = [
         item for category in LAB_CATEGORIES.values()
@@ -42,25 +47,16 @@ class InHospitalMortalityMIMIC4(BaseTask):
     ]
 
     def __call__(self, patient: Any) -> List[Dict[str, Any]]:
-        """Processes a single patient for the in-hospital mortality prediction task.
-
-        Args:
-            patient (Any): A Patient object containing patient data.
-
-        Returns:
-            List[Dict[str, Any]]: A list of samples, each sample is a dict with patient_id,
-                admission_id, labs, and mortality as keys.
-        """
         input_window_hours = 48
         samples = []
-
+
         demographics = patient.get_events(event_type="patients")
         assert len(demographics) == 1
         demographics = demographics[0]
        anchor_age = int(demographics.anchor_age)
         if anchor_age < 18:
             return []
-
+
         admissions = patient.get_events(event_type="admissions")
         for admission in admissions:
             admission_dischtime = datetime.strptime(admission.dischtime, "%Y-%m-%d %H:%M:%S")
@@ -95,7 +91,9 @@ def __call__(self, patient: Any) -> List[Dict[str, Any]]:
             labevents_df = labevents_df.pivot(
                 index="timestamp",
                 columns="labevents/itemid",
-                values="labevents/valuenum"
+                values="labevents/valuenum",
+                # in case of multiple values for the same timestamp
+                aggregate_function="first",
             )
             labevents_df = labevents_df.sort("timestamp")
 
```
```diff
@@ -104,13 +102,13 @@ def __call__(self, patient: Any) -> List[Dict[str, Any]]:
             missing_cols = [item for item in self.LABITEMS if item not in existing_cols]
             for col in missing_cols:
                 labevents_df = labevents_df.with_columns(pl.lit(None).alias(col))
-
+
             # Reorder columns by LABITEMS
             labevents_df = labevents_df.select(
                 "timestamp",
                 *self.LABITEMS
             )
-
+
             timestamps = labevents_df["timestamp"].to_list()
             lab_values = labevents_df.drop("timestamp").to_numpy()
 
@@ -124,4 +122,4 @@ def __call__(self, patient: Any) -> List[Dict[str, Any]]:
                 }
             )
 
-        return samples
\ No newline at end of file
+        return samples
```
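For orientation, the tail of `__call__` converts the pivoted frame into the (timestamps, values) pair stored under the `labs` key. A toy illustration of that final shape; the column names are example itemids:

```python
from datetime import datetime

import polars as pl

wide = pl.DataFrame({
    "timestamp": [datetime(2020, 1, 1, 8), datetime(2020, 1, 1, 9)],
    "50912": [1.1, None],
    "50971": [None, 4.0],
})

# Each sample's labs become a list of timestamps plus a dense float matrix:
# one row per lab draw, one column per itemid, NaN where a lab is missing.
timestamps = wide["timestamp"].to_list()
lab_values = wide.drop("timestamp").to_numpy()
print(timestamps)
print(lab_values)  # shape (2, 2); nulls surface as NaN
```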
