Skip to content

ea_dataset

KlinkerDataset dataclass

Helper class to hold info of benchmark datasets.

Source code in klinker/data/ea_dataset.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
@dataclass
class KlinkerDataset:
    """Container bundling the pieces of a benchmark dataset.

    Holds the attribute frames of both sides, the gold standard links and,
    optionally, the relation triples of both sides.
    """

    # attribute information of the left / right side
    left: KlinkerFrame
    right: KlinkerFrame
    # gold standard entity links
    gold: pd.DataFrame
    # relation triples of the left / right side (may be absent)
    left_rel: Optional[pd.DataFrame] = None
    right_rel: Optional[pd.DataFrame] = None

    @classmethod
    def from_sylloge(cls, dataset: EADataset, clean: bool = False) -> "KlinkerDataset":
        """Build a klinker dataset out of a sylloge dataset.

        The attribute triples of both sides are wrapped in triple frames that
        match ``dataset.backend`` ("pandas" or "dask"), with ``"head"`` as id
        column. Relation triples and entity links are handed over as they are.

        Args:
          dataset: Sylloge dataset to convert.
          clean: If True, cut every ``tail`` value at the first ``"^^"`` of
            its string form, which removes the datatype suffix.

        Returns:
            klinker dataset

        Raises:
            ValueError: If ``dataset.backend`` is neither "pandas" nor "dask".

        Examples:

            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

        """
        left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
        right: Union[KlinkerDaskFrame, KlinkerPandasFrame]

        # Pick the frame constructor that fits the backend once, then use it
        # for both sides.
        backend = dataset.backend
        if backend == "pandas":
            make_frame = KlinkerTriplePandasFrame.from_df
        elif backend == "dask":
            make_frame = KlinkerTripleDaskFrame.from_dask_dataframe
        else:
            raise ValueError(f"Unknown dataset backend {dataset.backend}")

        left = make_frame(dataset.attr_triples_left, table_name="left", id_col="head")
        right = make_frame(
            dataset.attr_triples_right, table_name="right", id_col="head"
        )

        if clean:

            def _drop_datatype(value):
                # keep only what precedes the first "^^" (the datatype marker)
                return str(value).split("^^")[0]

            left["tail"] = left["tail"].map(_drop_datatype)
            right["tail"] = right["tail"].map(_drop_datatype)

        return cls(
            left=left,
            right=right,
            left_rel=dataset.rel_triples_left,
            right_rel=dataset.rel_triples_right,
            gold=dataset.ent_links,
        )

    def _sample_side(
        self, sample: pd.DataFrame, side: Side
    ) -> Tuple[KlinkerFrame, Optional[pd.DataFrame]]:
        """Restrict one side of the dataset to the entities found in ``sample``.

        Args:
          sample: Entity links; the first column is used for the left side,
            the second column for every other value of ``side``.
          side: Which side to restrict.

        Returns:
            The attribute frame rows whose id column value occurs in the
            sample, and the relation triples whose head or tail occurs in the
            sample (``None`` if this side has no relation triples).
        """
        is_left = side == "left"
        attr_frame = self.left if is_left else self.right
        rel_frame = self.left_rel if is_left else self.right_rel
        entity_ids = sample[sample.columns[0 if is_left else 1]]

        id_mask = attr_frame[attr_frame.id_col].isin(entity_ids)
        attr_subset = attr_frame[id_mask]
        if rel_frame is None:
            return attr_subset, None

        # a triple is kept as soon as one of its two ends is a sampled entity
        head_hit = rel_frame["head"].isin(entity_ids)
        tail_hit = rel_frame["tail"].isin(entity_ids)
        return attr_subset, rel_frame[head_hit | tail_hit]

    def sample(self, size: int) -> "KlinkerDataset":
        """Return a smaller dataset built from the start of the gold standard.

        Note:
            No random sampling happens yet: the first ``size`` rows of the
            gold standard are taken and both sides are restricted to them.

        Args:
          size: Number of gold standard rows to keep.

        Returns:
            sampled klinker dataset

        Examples:

            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
            >>> sampled = ds.sample(10)

        """
        # TODO actually sample
        gold_subset = self.gold.iloc[:size]
        left_attr, left_rel = self._sample_side(gold_subset, "left")
        right_attr, right_rel = self._sample_side(gold_subset, "right")
        return KlinkerDataset(
            left=left_attr,
            right=right_attr,
            gold=gold_subset,
            left_rel=left_rel,
            right_rel=right_rel,
        )

from_sylloge(dataset, clean=False) classmethod

Create a klinker dataset from sylloge dataset.

Parameters:

Name Type Description Default
dataset EADataset

Sylloge dataset to convert.

required
clean bool

If True, clean attribute information by removing the datatype suffix (everything from the first "^^") from attribute values.

False

Returns:

Type Description
KlinkerDataset

klinker dataset

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
Source code in klinker/data/ea_dataset.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@classmethod
def from_sylloge(cls, dataset: EADataset, clean: bool = False) -> "KlinkerDataset":
    """Build a klinker dataset out of a sylloge dataset.

    The attribute triples of both sides are wrapped in triple frames that
    match ``dataset.backend`` ("pandas" or "dask"), with ``"head"`` as id
    column. Relation triples and entity links are handed over as they are.

    Args:
      dataset: Sylloge dataset to convert.
      clean: If True, cut every ``tail`` value at the first ``"^^"`` of
        its string form, which removes the datatype suffix.

    Returns:
        klinker dataset

    Raises:
        ValueError: If ``dataset.backend`` is neither "pandas" nor "dask".

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

    """
    left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
    right: Union[KlinkerDaskFrame, KlinkerPandasFrame]

    # Pick the frame constructor that fits the backend once, then use it
    # for both sides.
    backend = dataset.backend
    if backend == "pandas":
        make_frame = KlinkerTriplePandasFrame.from_df
    elif backend == "dask":
        make_frame = KlinkerTripleDaskFrame.from_dask_dataframe
    else:
        raise ValueError(f"Unknown dataset backend {dataset.backend}")

    left = make_frame(dataset.attr_triples_left, table_name="left", id_col="head")
    right = make_frame(dataset.attr_triples_right, table_name="right", id_col="head")

    if clean:

        def _drop_datatype(value):
            # keep only what precedes the first "^^" (the datatype marker)
            return str(value).split("^^")[0]

        left["tail"] = left["tail"].map(_drop_datatype)
        right["tail"] = right["tail"].map(_drop_datatype)

    return cls(
        left=left,
        right=right,
        left_rel=dataset.rel_triples_left,
        right_rel=dataset.rel_triples_right,
        gold=dataset.ent_links,
    )

sample(size)

Get a sample of the dataset.

Note

Currently this only takes the first n entities of the gold standard.

Parameters:

Name Type Description Default
size int

Size of the sample, i.e. the number of gold standard rows to keep.

required

Returns:

Type Description
KlinkerDataset

sampled klinker dataset

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
>>> sampled = ds.sample(10)
Source code in klinker/data/ea_dataset.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def sample(self, size: int) -> "KlinkerDataset":
    """Return a smaller dataset built from the start of the gold standard.

    Note:
        No random sampling happens yet: the first ``size`` rows of the
        gold standard are taken and both sides are restricted to them.

    Args:
      size: Number of gold standard rows to keep.

    Returns:
        sampled klinker dataset

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
        >>> sampled = ds.sample(10)

    """
    # TODO actually sample
    gold_subset = self.gold.iloc[:size]
    left_attr, left_rel = self._sample_side(gold_subset, "left")
    right_attr, right_rel = self._sample_side(gold_subset, "right")
    return KlinkerDataset(
        left=left_attr,
        right=right_attr,
        gold=gold_subset,
        left_rel=left_rel,
        right_rel=right_rel,
    )