Skip to content

Commit

Permalink
Move statistics function, adapt and update csv
Browse files Browse the repository at this point in the history
  • Loading branch information
dobraczka committed Apr 8, 2024
1 parent 16e703c commit d5547d4
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 61 deletions.
7 changes: 5 additions & 2 deletions dataset_statistics.csv
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,11 @@ MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,imdb,5129,17507,20800,3,13,608
MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1483,22663,25583
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tmdb,6061,27903,23761,4,30,9991,1920,64,26138
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1920,22663,26138
MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,8885,0,5619
MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,8885,0,5619
MovieGraphBenchmark,moviegraphbenchmark_multi,imdb,5129,17507,20800,3,13,6082,3598,1,31230
MovieGraphBenchmark,moviegraphbenchmark_multi,tmdb,6061,27903,23761,4,30,9991,3598,64,31230
MovieGraphBenchmark,moviegraphbenchmark_multi,tvdb,7814,15455,20902,3,9,7683,3598,22663,31230
MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,9162,0,9162
MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,9162,0,9162
OAEI,oaei_marvelcinematicuniverse_marvel,marvelcinematicuniverse,216033,1094598,130517,130,110,56566,1654,0,1654
OAEI,oaei_marvelcinematicuniverse_marvel,marvel,1472619,5152898,1580468,63,127,749980,1654,0,1654
OAEI,oaei_memoryalpha_memorybeta,memoryalpha,254537,2096198,430730,180,287,226110,9296,0,9296
Expand Down
56 changes: 0 additions & 56 deletions sylloge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
Callable,
Dict,
Generic,
Iterable,
List,
Literal,
Mapping,
Expand Down Expand Up @@ -1085,58 +1084,3 @@ class BinaryZipEADatasetWithPreSplitFolds(

def __repr__(self) -> str:
    """Return the parent representation passed through ``_binary_repr_adjustment``."""
    adjust = self._binary_repr_adjustment
    return adjust(super().__repr__())


def create_statistics_df(
    datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True
) -> pd.DataFrame:
    """Collect the statistics of several datasets in a single data frame.

    One row is produced per side (single knowledge graph) of every dataset.

    Args:
        datasets: Datasets to summarise. Each one must provide
            ``statistics()`` (returning the per-side statistics and the
            number of clusters), ``ent_links``, ``dataset_names`` and
            ``canonical_name``.
        seperate_attribute_relations: If true, report relation and attribute
            triples in two columns, otherwise report a single "Triples"
            column.

    Returns:
        Data frame indexed by "Dataset family", "Task Name" and
        "Dataset Name". It is empty when ``datasets`` is empty.
    """
    triples_col = (
        ["Relation Triples", "Attribute Triples"]
        if seperate_attribute_relations
        else ["Triples"]
    )
    index_cols = ["Dataset family", "Task Name", "Dataset Name"]
    columns = [
        *index_cols,
        "Entities",
        *triples_col,
        "Relations",
        "Properties",
        "Literals",
        "Clusters",
        "Intra-dataset Matches",
        "All Matches",
    ]
    rows = []
    for ds in datasets:
        ds_family = str(ds.__class__.__name__).split(".")[-1]
        ds_stats, num_clusters = ds.statistics()
        links = ds.ent_links
        # Default: no matches inside a single side of the dataset.
        intra_dataset_matches = (0,) * len(ds.dataset_names)
        try:
            all_matches = links.number_of_links
        except AttributeError:
            # Link containers without ``number_of_links`` (anything that is
            # not a cluster helper) used to crash here; count their entries.
            all_matches = len(links)
        else:
            # Only prefixed cluster helpers expose per-side link counts.
            if isinstance(links, PrefixedClusterHelper):
                intra_dataset_matches = links.number_of_intra_links
        for i, (ds_side, ds_side_name) in enumerate(zip(ds_stats, ds.dataset_names)):
            if seperate_attribute_relations:
                triples = [ds_side.rel_triples, ds_side.attr_triples]
            else:
                triples = [ds_side.triples]
            rows.append(
                [
                    ds_family,
                    ds.canonical_name,
                    ds_side_name,
                    ds_side.entities,
                    *triples,
                    ds_side.relations,
                    ds_side.properties,
                    ds_side.literals,
                    num_clusters,
                    intra_dataset_matches[i],
                    all_matches,
                ]
            )
    statistics_df = pd.DataFrame(
        rows,
        columns=columns,
    )
    return statistics_df.set_index(index_cols)
65 changes: 62 additions & 3 deletions sylloge/create_statistic.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,68 @@
from typing import Dict, Iterable, Tuple

import pandas as pd
from eche import ClusterHelper, PrefixedClusterHelper

from sylloge import MED_BBK, OAEI, MovieGraphBenchmark, MultiSourceEADataset, OpenEA
from sylloge.base import create_statistics_df


def create_statistics_df(
    datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True
):
    """Build a data frame with one statistics row per side of every dataset.

    Args:
        datasets: Datasets whose statistics should be gathered.
        seperate_attribute_relations: When true, relation and attribute
            triples get their own columns; otherwise a single "Triples"
            column is used.

    Returns:
        Data frame indexed by "Dataset family", "Task Name" and
        "Dataset Name".
    """
    key_columns = ["Dataset family", "Task Name", "Dataset Name"]
    if seperate_attribute_relations:
        triple_headers = ["Relation Triples", "Attribute Triples"]
    else:
        triple_headers = ["Triples"]
    header = (
        key_columns
        + ["Entities"]
        + triple_headers
        + [
            "Relations",
            "Properties",
            "Literals",
            "Clusters",
            "Intra-dataset Matches",
            "All Matches",
        ]
    )

    records = []
    for dataset in datasets:
        family = str(dataset.__class__.__name__).split(".")[-1]
        side_stats, cluster_count = dataset.statistics()
        # Sides have no internal matches unless the links say otherwise.
        per_side_intra = (0,) * len(dataset.dataset_names)
        links = dataset.ent_links
        if not isinstance(links, ClusterHelper):
            # Not a cluster helper: every entry counts as one match.
            total_links = len(links)
        else:
            total_links = links.number_of_links
            if isinstance(links, PrefixedClusterHelper):
                per_side_intra = links.number_of_intra_links

        for position, (side, side_name) in enumerate(
            zip(side_stats, dataset.dataset_names)
        ):
            triple_counts = (
                [side.rel_triples, side.attr_triples]
                if seperate_attribute_relations
                else [side.triples]
            )
            record = [family, dataset.canonical_name, side_name, side.entities]
            record.extend(triple_counts)
            record.extend(
                [
                    side.relations,
                    side.properties,
                    side.literals,
                    cluster_count,
                    per_side_intra[position],
                    total_links,
                ]
            )
            records.append(record)

    return pd.DataFrame(records, columns=header).set_index(key_columns)


all_classes_with_args: Tuple[Tuple[type[MultiSourceEADataset], Dict[str, str]], ...] = (
(OpenEA, {"graph_pair": "D_W", "size": "15K", "version": "V1"}),
Expand Down Expand Up @@ -35,7 +94,7 @@
)


def create_statistic(
def create_and_write_statistic(
classes_with_args: Iterable[
Tuple[type[MultiSourceEADataset], Dict[str, str]]
] = all_classes_with_args,
Expand All @@ -50,4 +109,4 @@ def create_statistic(


if __name__ == "__main__":
create_statistic()
create_and_write_statistic()

0 comments on commit d5547d4

Please sign in to comment.