From 1d50c06ded0e9148dcb8a200a009a7d9e74d3bd5 Mon Sep 17 00:00:00 2001 From: antsh3k Date: Tue, 5 Mar 2024 20:06:06 +0000 Subject: [PATCH] Flag typing bug issue --- medcat/evaluate_mct_export/mct_analysis.py | 121 ++++++++++----------- 1 file changed, 59 insertions(+), 62 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index 17cf1e7..0f7d87c 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -321,67 +321,7 @@ def _eval(self, metacat_model, mct_export) -> dict: result = self._eval_model(metacat_model.model, data, config=metacat_model.config, tokenizer=metacat_model.tokenizer) return {'predictions': result, 'meta_values': _} - - def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: Optional[List] = None): - """ - :param path: Outfile path - :param meta_ann: Include Meta_annotation evaluation in the summary as well - :param concept_filter: Filter the report to only display select concepts of interest. List of cuis. - :return: A full excel report for MedCATtrainer annotation work done. - """ - if not self.cat: - raise ValueError("No model pack specified") - if concept_filter: - with pd.ExcelWriter(path) as writer: - print('Generating report...') - # array-like is allowed by documentation but not by typing - df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False) # type: ignore - df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}'] # type: ignore - df = pd.concat([df, pd.DataFrame([['MCT Custom filter', concept_filter]], columns=df.columns)], - ignore_index = True) - df.to_excel(writer, index=False, sheet_name='medcat_model_card') - self.user_stats().to_excel(writer, index=False, sheet_name='user_stats') - print('Evaluating annotations...') - if meta_ann: - ann_df = self.full_annotation_df() - ann_df = ann_df[ann_df['cui'].isin(concept_filter)].reset_index(drop=True) - ann_df['timestamp'] = ann_df['timestamp'].dt.tz_localize(None) # Remove timezone information - ann_df.to_excel(writer, index=False, sheet_name='annotations') - else: - ann_df = self.annotation_df() - ann_df = ann_df[ann_df['cui'].isin(concept_filter)].reset_index(drop=True) - ann_df['timestamp'] = ann_df['timestamp'].dt.tz_localize(None) # Remove timezone information - ann_df.to_excel(writer, index=False, sheet_name='annotations') - performance_summary_df = self.concept_summary() - performance_summary_df = performance_summary_df[performance_summary_df['cui'].isin(concept_filter)]\ - .reset_index(drop=True) - performance_summary_df.to_excel(writer, index=False, sheet_name='concept_summary') - if meta_ann: - print('Evaluating meta_annotations...') - meta_anns_df = self.meta_anns_concept_summary() - meta_anns_df = meta_anns_df[meta_anns_df['cui'].isin(concept_filter)].reset_index(drop=True) - meta_anns_df.to_excel(writer, index=True, sheet_name='meta_annotations_summary') - else: - with pd.ExcelWriter(path) as writer: - print('Generating report...') - df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False) # type: ignore - df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}'] # type: ignore - df.to_excel(writer, index=False, sheet_name='medcat_model_card') - self.user_stats().to_excel(writer, index=False, sheet_name='user_stats') - print('Evaluating annotations...') - if meta_ann: - self.full_annotation_df().to_excel(writer, index=False, sheet_name='annotations') - else: - self.annotation_df().to_excel(writer, index=False, sheet_name='annotations') - self.concept_summary().to_excel(writer, index=False, sheet_name='concept_summary') - if meta_ann: - print('Evaluating meta_annotations...') - self.meta_anns_concept_summary().to_excel(writer, index=True, sheet_name='meta_annotations_summary') - return print(f"MCT report saved to: {path}") - - -''' TODO: clean uo the insert method with the meta_annotations def full_annotation_df(self) -> pd.DataFrame: """ DataFrame of all annotations created including meta_annotation predictions. @@ -410,7 +350,7 @@ def full_annotation_df(self) -> pd.DataFrame: else: pred_meta_values.append(_meta_values.get(meta_results['predictions'][counter], np.nan)) counter += 1 - meta_df.insert(int(meta_df.columns.get_loc(meta_model)) + 1, f'predict_{meta_model}', pred_meta_values) + meta_df.insert(int(meta_df.columns.get_loc(meta_model)) + 1, f'predict_{meta_model}', pred_meta_values) # TODO fix this line return meta_df @@ -465,4 +405,61 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: meta_anns_df = meta_anns_df.rename_axis('cui').reset_index(drop=False) meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.cui2preferred_name)) return meta_anns_df -''' + + def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: Optional[List] = None): + """ + :param path: Outfile path + :param meta_ann: Include Meta_annotation evaluation in the summary as well + :param concept_filter: Filter the report to only display select concepts of interest. List of cuis. + :return: A full excel report for MedCATtrainer annotation work done. + """ + if not self.cat: + raise ValueError("No model pack specified") + if concept_filter: + with pd.ExcelWriter(path) as writer: + print('Generating report...') + # array-like is allowed by documentation but not by typing + df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False) # type: ignore + df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}'] # type: ignore + df = pd.concat([df, pd.DataFrame([['MCT Custom filter', concept_filter]], columns=df.columns)], + ignore_index = True) + df.to_excel(writer, index=False, sheet_name='medcat_model_card') + self.user_stats().to_excel(writer, index=False, sheet_name='user_stats') + print('Evaluating annotations...') + if meta_ann: + ann_df = self.full_annotation_df() + ann_df = ann_df[ann_df['cui'].isin(concept_filter)].reset_index(drop=True) + ann_df['timestamp'] = ann_df['timestamp'].dt.tz_localize(None) # Remove timezone information + ann_df.to_excel(writer, index=False, sheet_name='annotations') + else: + ann_df = self.annotation_df() + ann_df = ann_df[ann_df['cui'].isin(concept_filter)].reset_index(drop=True) + ann_df['timestamp'] = ann_df['timestamp'].dt.tz_localize(None) # Remove timezone information + ann_df.to_excel(writer, index=False, sheet_name='annotations') + performance_summary_df = self.concept_summary() + performance_summary_df = performance_summary_df[performance_summary_df['cui'].isin(concept_filter)]\ + .reset_index(drop=True) + performance_summary_df.to_excel(writer, index=False, sheet_name='concept_summary') + if meta_ann: + print('Evaluating meta_annotations...') + meta_anns_df = self.meta_anns_concept_summary() + meta_anns_df = meta_anns_df[meta_anns_df['cui'].isin(concept_filter)].reset_index(drop=True) + meta_anns_df.to_excel(writer, index=True, sheet_name='meta_annotations_summary') + else: + with pd.ExcelWriter(path) as writer: + print('Generating report...') + df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False) # type: ignore + df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}'] # type: ignore + df.to_excel(writer, index=False, sheet_name='medcat_model_card') + self.user_stats().to_excel(writer, index=False, sheet_name='user_stats') + print('Evaluating annotations...') + if meta_ann: + self.full_annotation_df().to_excel(writer, index=False, sheet_name='annotations') + else: + self.annotation_df().to_excel(writer, index=False, sheet_name='annotations') + self.concept_summary().to_excel(writer, index=False, sheet_name='concept_summary') + if meta_ann: + print('Evaluating meta_annotations...') + self.meta_anns_concept_summary().to_excel(writer, index=True, sheet_name='meta_annotations_summary') + + return print(f"MCT report saved to: {path}")