Implements experiment to detect the hot functions of a project (#884)

* Implements experiment to detect the hot functions of a project * Make xray a local import * Adds missing docs * Update varats-core/varats/report/hot_functions_report.py Co-authored-by: Sebastian Böhm <[email protected]> * Rewords docs * Fixes type errors --------- Co-authored-by: Sebastian Böhm <[email protected]>
se-sic · Aug 21, 2024 · ef50d4f · ef50d4f
1 parent 08aa984
commit ef50d4f
Show file tree

Hide file tree

Showing 4 changed files with 382 additions and 0 deletions.
diff --git a/varats-core/varats/report/hot_functions_report.py b/varats-core/varats/report/hot_functions_report.py
@@ -0,0 +1,104 @@
+import typing as tp
+from dataclasses import dataclass
+from pathlib import Path
+
+from pandas import read_csv
+
+from varats.experiment.workload_util import WorkloadSpecificReportAggregate
+from varats.report.report import BaseReport, ReportAggregate
+
+
+@dataclass
+class XRayFunctionWrapper:
+    """Measurement entry for a single function, built from one row of a hot
+    function report."""
+    # Value of the report's "function" column.
+    name: str
+    # Value of the report's "count" column.
+    count: int
+    # Value of the report's "sum" column.
+    sum_time: float
+
+
+class HotFunctionReport(BaseReport, shorthand="HFR", file_type=".csv"):
+    """Report that loads per-function measurement data from a CSV file and
+    offers queries to determine the hottest functions."""
+
+    MAX_TRACK_FUNCTIONS = 50
+
+    def __init__(self, path: Path) -> None:
+        super().__init__(path)
+        self.__function_data = read_csv(path)
+
+    def __sort_by_time_spent(self) -> None:
+        """Reorders the loaded data in place, largest `sum` value first."""
+        self.__function_data.sort_values(
+            by='sum', ascending=False, inplace=True
+        )
+
+    @staticmethod
+    def __wrap_row(row: tp.Any) -> XRayFunctionWrapper:
+        """Converts one row of the loaded data into a wrapper object."""
+        return XRayFunctionWrapper(
+            name=row["function"], count=row['count'], sum_time=row["sum"]
+        )
+
+    def top_n_functions(self, limit: int = 10) -> tp.List[XRayFunctionWrapper]:
+        """
+        Determines the `n` hottest functions in which the most time was spent.
+
+        Args:
+            limit: maximal number of functions to return
+
+        Returns:
+            the functions with the largest `sum` values, hottest first
+        """
+        self.__sort_by_time_spent()
+        hottest_rows = self.__function_data.head(limit)
+
+        return [self.__wrap_row(row) for _, row in hottest_rows.iterrows()]
+
+    def hot_functions(self, threshold: int = 2) -> tp.List[XRayFunctionWrapper]:
+        """
+        Determines all functions whose time exceeds a percentage of the largest
+        `sum` value found in the report.
+
+        Args:
+            threshold: min percentage a function needs as total
+                        time to count as hot
+
+        Returns:
+            all functions whose `sum` value is strictly larger than the
+            cutoff, hottest first
+
+        Raises:
+            ValueError: if the threshold is not within [0, 100]
+        """
+        if threshold < 0 or threshold > 100:
+            raise ValueError(
+                "Threshold value needs to be in the range [0,...,100] "
+                f"but was {threshold}"
+            )
+
+        self.__sort_by_time_spent()
+
+        # The reference time is the largest `sum` entry of the report; only
+        # the top n (MAX_TRACK_FUNCTIONS) functions are part of the data, so
+        # time spent in other functions is not considered.
+        total_time_tracked = self.__function_data["sum"].max()
+
+        sum_time_cutoff = 0 if threshold == 0 else (
+            total_time_tracked * threshold
+        ) / 100
+
+        return [
+            self.__wrap_row(row)
+            for _, row in self.__function_data.iterrows()
+            if row["sum"] > sum_time_cutoff
+        ]
+
+    def print_full_dump(self) -> None:
+        """Prints the complete loaded function data to stdout."""
+        print(f"{self.__function_data}")
+
+
+class WLHotFunctionAggregate(
+    WorkloadSpecificReportAggregate[HotFunctionReport],
+    shorthand="WL" + HotFunctionReport.SHORTHAND + ReportAggregate.SHORTHAND,
+    file_type=ReportAggregate.FILE_TYPE
+):
+    """Aggregate that groups `HotFunctionReport`s by workload."""
+
+    def __init__(self, path: Path) -> None:
+        super().__init__(path, HotFunctionReport)
+
+    def dump_all_reports(self) -> None:
+        """Dumps the contents of all loaded hot functions reports."""
+        for workload in self.workload_names():
+            for hf_report in self.reports(workload):
+                hf_report.print_full_dump()
+
+    def hot_functions_per_workload(
+        self, threshold: int = 2
+    ) -> tp.Dict[str, tp.List[XRayFunctionWrapper]]:
+        """
+        Collects the hot functions for every workload of this aggregate.
+
+        Args:
+            threshold: min percentage a function needs as
+                        total time to count as hot
+
+        Returns:
+            mapping from workload name to the hot functions of that workload
+        """
+        hot_funcs_by_workload: tp.Dict[str, tp.List[XRayFunctionWrapper]] = {}
+
+        for workload in self.workload_names():
+            # TODO: repetition handling
+            # Every report of a workload overwrites the entry of the previous
+            # one, i.e., only the last report of a workload is kept.
+            for hf_report in self.reports(workload):
+                hot_funcs_by_workload[workload] = hf_report.hot_functions(
+                    threshold=threshold
+                )
+
+        return hot_funcs_by_workload
diff --git a/varats/varats/experiments/vara/hot_function_experiment.py b/varats/varats/experiments/vara/hot_function_experiment.py
@@ -0,0 +1,181 @@
+"""Experiment that detects the hot functions of a project."""
+import typing as tp
+from pathlib import Path
+
+from benchbuild.command import ProjectCommand, cleanup
+from benchbuild.extensions import compiler, run, time
+from benchbuild.utils import actions
+from plumbum import local
+
+from varats.experiment.experiment_util import (
+    ZippedReportFolder,
+    create_new_success_result_filepath,
+    get_default_compile_error_wrapped,
+    ExperimentHandle,
+)
+from varats.experiment.workload_util import WorkloadCategory, workload_commands
+from varats.experiments.vara.feature_experiment import FeatureExperiment
+from varats.experiments.vara.feature_perf_precision import (
+    select_project_binaries,
+)
+from varats.project.project_util import BinaryType, ProjectBinaryWrapper
+from varats.project.varats_project import VProject
+from varats.report.hot_functions_report import (
+    HotFunctionReport,
+    WLHotFunctionAggregate,
+)
+from varats.report.report import ReportSpecification
+from varats.utils.config import get_current_config_id
+
+
+def perf_prec_workload_commands(
+    project: VProject, binary: ProjectBinaryWrapper
+) -> tp.List[ProjectCommand]:
+    """
+    Uniformly select the workloads that should be processed.
+
+    Args:
+        project: project to select the workload commands for
+        binary: binary of the project the commands should run
+
+    Returns:
+        the commands of the selected categories, in the order example
+        (if not skipped), small, medium
+    """
+    skip_example_workloads = (
+        project.name.startswith("SynthIP") or
+        project.name == "SynthSAFieldSensitivity"
+    )
+
+    selected_categories = []
+    if not skip_example_workloads:
+        # Example commands from the skipped case studies are too fast
+        selected_categories.append(WorkloadCategory.EXAMPLE)
+    selected_categories += [WorkloadCategory.SMALL, WorkloadCategory.MEDIUM]
+
+    wl_commands: tp.List[ProjectCommand] = []
+    for category in selected_categories:
+        wl_commands += workload_commands(project, binary, [category])
+
+    return wl_commands
+
+
+class RunXRayProfiler(actions.ProjectStep):  # type: ignore
+    """Profiling step that runs a XRay instrumented binary to extract function-
+    level measurement data."""
+
+    NAME = "RunInstrumentedXRayBinaries"
+    # Implicit string concatenation keeps the source indentation out of the
+    # description text (a backslash continuation inside the literal would
+    # embed it).
+    DESCRIPTION = (
+        "Profile a project that was instrumented "
+        "with xray instrumentations."
+    )
+
+    project: VProject
+
+    def __init__(
+        self, project: VProject, experiment_handle: ExperimentHandle
+    ) -> None:
+        super().__init__(project=project)
+        self.__experiment_handle = experiment_handle
+
+    def __call__(self) -> actions.StepResult:
+        return self.run_instrumented_code()
+
+    def __str__(self, indent: int = 0) -> str:
+        return str(
+            actions.textwrap.indent(
+                f"* {self.project.name}: Run VaRA "
+                "measurements together with XRay", indent * " "
+            )
+        )
+
+    @staticmethod
+    def _find_xray_log(tracefile_tag: str) -> Path:
+        """
+        Looks up the XRay trace file for the given tag in the current working
+        directory.
+
+        Args:
+            tracefile_tag: file name prefix that was passed to XRay as
+                           `xray_logfile_base`
+
+        Returns:
+            absolute path to the first file whose name starts with the tag
+
+        Raises:
+            FileNotFoundError: if no file with the given prefix exists
+        """
+        for candidate in Path(".").iterdir():
+            if candidate.name.startswith(tracefile_tag):
+                return candidate.absolute()
+
+        # Fail explicitly instead of continuing with an unbound variable or
+        # with the trace file of a previously processed workload.
+        raise FileNotFoundError(
+            "Could not find a XRay trace file with prefix "
+            f"'{tracefile_tag}' in '{Path('.').absolute()}'."
+        )
+
+    def run_instrumented_code(self) -> actions.StepResult:
+        """
+        Run the instrumented code to detect hot functions.
+
+        For every executable binary, each selected workload command is run with
+        XRay enabled and the produced trace is converted with
+        `llvm-xray account` into a CSV file inside the zipped result folder.
+
+        Returns:
+            `StepResult.OK` after all workloads were processed
+
+        Raises:
+            FileNotFoundError: if a workload run did not produce a trace file
+        """
+        # pylint: disable=import-outside-toplevel
+        from plumbum.cmd import llvm_xray
+
+        for binary in self.project.binaries:
+            if binary.type != BinaryType.EXECUTABLE:
+                # Skip libraries as we cannot run them
+                continue
+
+            with local.cwd(local.path(self.project.builddir)):
+
+                result_filepath = create_new_success_result_filepath(
+                    exp_handle=self.__experiment_handle,
+                    report_type=self.__experiment_handle.report_spec().
+                    main_report,
+                    project=self.project,
+                    binary=binary,
+                    config_id=get_current_config_id(self.project)
+                )
+                with ZippedReportFolder(
+                    result_filepath.full_path()
+                ) as reps_tmp_dir:
+                    for rep in range(0, 1):
+                        for prj_command in perf_prec_workload_commands(
+                            project=self.project, binary=binary
+                        ):
+                            hot_function_report_file = Path(reps_tmp_dir) / (
+                                "hot-func-trace_"
+                                f"{prj_command.command.label}_{rep}"
+                                ".csv"
+                            )
+
+                            # Prefix that identifies the trace file of this
+                            # workload/repetition combination.
+                            unique_tracefile_tag = \
+                                f"xray_{prj_command.command.label}_{rep}."
+                            with local.env(
+                                XRAY_OPTIONS=" ".join([
+                                    "patch_premain=true",
+                                    "xray_mode=xray-basic",
+                                    f"xray_logfile_base={unique_tracefile_tag}"
+                                ])
+                            ):
+                                with cleanup(prj_command):
+                                    pb_cmd = prj_command.command.as_plumbum(
+                                        project=self.project
+                                    )
+                                    pb_cmd(retcode=binary.valid_exit_codes)
+
+                            xray_log_path = self._find_xray_log(
+                                unique_tracefile_tag
+                            )
+
+                            instr_map_path = local.path(
+                                self.project.primary_source
+                            ) / binary.path
+
+                            llvm_xray(
+                                "account", f"{xray_log_path}",
+                                "--deduce-sibling-calls",
+                                f"--instr_map={instr_map_path}",
+                                f"--output={hot_function_report_file}",
+                                "--format=csv",
+                                f"--top={HotFunctionReport.MAX_TRACK_FUNCTIONS}"
+                            )
+
+        return actions.StepResult.OK
+
+
+class XRayFindHotFunctions(FeatureExperiment, shorthand="HF"):
+    """Experiment for finding hot functions in code."""
+
+    NAME = "DetermineHotFunctions"
+    REPORT_SPEC = ReportSpecification(WLHotFunctionAggregate)
+
+    def actions_for_project(
+        self, project: VProject
+    ) -> tp.MutableSequence[actions.Step]:
+        """
+        Configures the project for a XRay instrumented build and returns the
+        steps of this experiment.
+
+        Args:
+            project: to configure and analyze
+
+        Returns:
+            the steps to compile the project, profile it with
+            `RunXRayProfiler`, and clean up afterwards
+
+        Raises:
+            AssertionError: if the first selected binary is not an executable
+        """
+        # Build with XRay instrumentation; the threshold of 1 is presumably
+        # meant to instrument even very small functions -- see the Clang XRay
+        # documentation for the exact semantics.
+        project.cflags += [
+            "-fxray-instrument",
+            "-fxray-instruction-threshold=1",
+        ]
+
+        project.runtime_extension = run.RuntimeExtension(project, self) \
+            << time.RunWithTime()
+
+        project.compiler_extension = compiler.RunCompiler(project, self)
+
+        project.compile = get_default_compile_error_wrapped(
+            self.get_handle(), project,
+            self.get_handle().report_spec().main_report
+        )
+
+        # Only the first selected binary is checked here; the profiling step
+        # itself iterates over all project binaries and skips non-executables.
+        binary = select_project_binaries(project)[0]
+        if binary.type != BinaryType.EXECUTABLE:
+            raise AssertionError("Experiment only works with executables.")
+
+        return [
+            actions.Compile(project),
+            RunXRayProfiler(project, self.get_handle()),
+            actions.Clean(project),
+        ]
diff --git a/varats/varats/tables/hot_functions.py b/varats/varats/tables/hot_functions.py
@@ -0,0 +1,96 @@
+"""Module for the HotFunctionsTable."""
+import typing as tp
+
+import pandas as pd
+
+from varats.experiments.vara.hot_function_experiment import XRayFindHotFunctions
+from varats.paper.paper_config import get_loaded_paper_config
+from varats.paper_mgmt.case_study import get_case_study_file_name_filter
+from varats.report.hot_functions_report import WLHotFunctionAggregate
+from varats.revision.revisions import get_processed_revisions_files
+from varats.table.table import Table
+from varats.table.table_utils import dataframe_to_table
+from varats.table.tables import TableFormat, TableGenerator
+
+
+class HotFunctionsTable(Table, table_name="hot_functions"):
+    """A concise table that provides a quick overview of all the detected hot
+    functions."""
+
+    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
+        """
+        Builds the table with one row per hot function, workload, and report
+        file over all case studies of the loaded paper config.
+
+        Args:
+            table_format: format the table should be rendered in
+            wrap_table: forwarded to `dataframe_to_table`
+
+        Returns:
+            the rendered table
+        """
+        case_studies = get_loaded_paper_config().get_all_case_studies()
+
+        column_names = [
+            "Project", "Binary", "Revision", "Workload", "FunctionName",
+            "TimeSpent", "Reps"
+        ]
+        # Rows are collected over ALL case studies and report files so that
+        # no report overwrites the data of a previously processed one.
+        rows: tp.List[tp.Dict[str, tp.Any]] = []
+
+        for case_study in case_studies:
+            project_name = case_study.project_name
+
+            report_files = get_processed_revisions_files(
+                project_name, XRayFindHotFunctions, WLHotFunctionAggregate,
+                get_case_study_file_name_filter(case_study)
+            )
+
+            for report_filepath in report_files:
+                agg_hot_functions_report = WLHotFunctionAggregate(
+                    report_filepath.full_path()
+                )
+                report_file = agg_hot_functions_report.filename
+
+                hot_funcs = agg_hot_functions_report.hot_functions_per_workload(
+                    threshold=2
+                )
+
+                for workload_name in agg_hot_functions_report.workload_names():
+                    num_reps = len(
+                        agg_hot_functions_report.reports(workload_name)
+                    )
+                    for hf in hot_funcs.get(workload_name, []):
+                        rows.append({
+                            "Project": project_name,
+                            "Binary": report_file.binary_name,
+                            "Revision": str(report_file.commit_hash),
+                            "Workload": workload_name,
+                            "FunctionName": hf.name,
+                            "TimeSpent": hf.sum_time,
+                            "Reps": num_reps,
+                        })
+
+        # Explicit columns keep the sort/index calls below valid even when no
+        # hot function was found at all.
+        df = pd.DataFrame(rows, columns=column_names)
+
+        df.sort_values(["Project", "Binary"], inplace=True)
+        df.set_index(
+            ["Project", "Binary"],
+            inplace=True,
+        )
+
+        return dataframe_to_table(
+            df, table_format, wrap_table=wrap_table, wrap_landscape=True
+        )
+
+
+class HotFunctionsTableGenerator(
+    TableGenerator, generator_name="hot-functions", options=[]
+):
+    """Generator for `HotFunctionsTable`."""
+
+    def generate(self) -> tp.List[Table]:
+        """Creates a single `HotFunctionsTable` from the generator's table
+        config and table kwargs."""
+        hot_functions_table = HotFunctionsTable(
+            self.table_config, **self.table_kwargs
+        )
+        return [hot_functions_table]
diff --git a/varats/varats/tools/bb_config.py b/varats/varats/tools/bb_config.py
@@ -123,6 +123,7 @@ def update_experiments(bb_cfg: s.Configuration) -> None:
         'varats.experiments.vara.marker_tester',
         'varats.experiments.vara.phasar_fta',
         'varats.experiments.vara.feature_region_verifier_experiment',
+        'varats.experiments.vara.hot_function_experiment',
     ]