Skip to content

ea_dataset

KlinkerDataset dataclass

Helper class to hold info of benchmark datasets.

Source code in klinker/data/ea_dataset.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
@dataclass
class KlinkerDataset:
    """Helper class to hold info of benchmark datasets.

    Attributes:
    ----------
      left: Attribute triples of the first data source.
      right: Attribute triples of the second data source.
      gold: Known entity links; the first column refers to entities of
        ``left`` and the second column to entities of ``right``.
      left_rel: Optional relation triples of the first data source, with
        ``head`` and ``tail`` columns.
      right_rel: Optional relation triples of the second data source, with
        ``head`` and ``tail`` columns.
    """

    left: KlinkerFrame
    right: KlinkerFrame
    gold: pd.DataFrame
    left_rel: Optional[pd.DataFrame] = None
    right_rel: Optional[pd.DataFrame] = None

    @classmethod
    def from_sylloge(
        cls,
        dataset: MultiSourceEADataset,
        clean: bool = False,
        partition_size: Optional[str] = None,
    ) -> "KlinkerDataset":
        """Create a klinker dataset from sylloge dataset.

        Args:
        ----
          dataset: MultiSourceEADataset: Sylloge dataset.
          clean: bool: Clean attribute information, i.e. drop everything
            after the first ``^^`` (the datatype suffix) of attribute values.
          partition_size: Optional[str]: If given and the dataset uses the
            dask backend, attribute and relation triples are repartitioned
            to this partition size. Not used for the pandas backend.

        Returns:
        -------
            klinker dataset

        Raises:
        ------
            ValueError: If the dataset backend is neither "pandas" nor "dask".

        Examples:
        --------
            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

        """
        left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
        right: Union[KlinkerDaskFrame, KlinkerPandasFrame]
        ds_names = dataset.dataset_names

        attr_left = dataset.attr_triples[0]
        attr_right = dataset.attr_triples[1]
        left_rel = dataset.rel_triples[0]
        right_rel = dataset.rel_triples[1]
        if dataset.backend == "pandas":
            left = KlinkerTriplePandasFrame.from_df(
                attr_left, table_name=ds_names[0], id_col="head"
            )
            right = KlinkerTriplePandasFrame.from_df(
                attr_right, table_name=ds_names[1], id_col="head"
            )
        elif dataset.backend == "dask":
            if partition_size:
                # Repartition all four frames the same way before wrapping.
                attr_left, attr_right, left_rel, right_rel = [
                    frame.repartition(partition_size=partition_size)
                    for frame in (attr_left, attr_right, left_rel, right_rel)
                ]
            left = KlinkerTripleDaskFrame.from_dask_dataframe(
                attr_left, table_name=ds_names[0], id_col="head"
            )
            right = KlinkerTripleDaskFrame.from_dask_dataframe(
                attr_right, table_name=ds_names[1], id_col="head"
            )
        else:
            raise ValueError(f"Unknown dataset backend {dataset.backend}")

        if clean:
            # remove datatype: keep only the part before the first "^^"
            left["tail"] = left["tail"].map(lambda x: str(x).split("^^")[0])
            right["tail"] = right["tail"].map(lambda x: str(x).split("^^")[0])

        if isinstance(dataset.ent_links, PrefixedClusterHelper):
            # Cluster-based links are expanded into cross-dataset pairs.
            ent_links = pd.DataFrame(
                dataset.ent_links.all_pairs_no_intra(), columns=dataset.dataset_names
            )
        else:
            # Name the link columns after the datasets they refer to.
            ent_links = dataset.ent_links.rename(
                columns={
                    "left": dataset.dataset_names[0],
                    "right": dataset.dataset_names[1],
                }
            )
        return cls(
            left=left,
            right=right,
            left_rel=left_rel,
            right_rel=right_rel,
            gold=ent_links,
        )

    def _sample_side(
        self, sample: pd.DataFrame, side: Side
    ) -> Tuple[KlinkerFrame, Optional[pd.DataFrame]]:
        """Restrict one side of the dataset to the entities of sampled links.

        Args:
        ----
          sample: Sampled entity links; column 0 holds the left entities and
            column 1 the right entities.
          side: "left" selects the left side, any other value the right side.

        Returns:
        -------
            The attribute triples whose id is among the sampled entities, and
            the relation triples whose head or tail is among them (``None``
            if this side has no relation triples).
        """
        if side == "left":
            rel_df, attr_df, sample_col = self.left_rel, self.left, sample.columns[0]
        else:
            rel_df, attr_df, sample_col = self.right_rel, self.right, sample.columns[1]
        sampled_ids = sample[sample_col]
        sampled_attr_df = attr_df[attr_df[attr_df.id_col].isin(sampled_ids)]
        if rel_df is None:
            return sampled_attr_df, None
        # Keep a relation triple as soon as one of its endpoints was sampled.
        in_sample = rel_df["head"].isin(sampled_ids) | rel_df["tail"].isin(sampled_ids)
        return sampled_attr_df, rel_df[in_sample]

    def sample(
        self, frac: float, random_state: Optional[int] = None
    ) -> "KlinkerDataset":
        """Get a sample of the dataset.

        Note:
        ----
            A random fraction of the gold standard links is drawn. Attribute
            triples are then restricted to the entities of the sampled links
            and relation triples to those touching these entities.

        Args:
        ----
          frac: fraction of the gold standard links to keep (e.g. 0.2)
          random_state: optional seed passed on to the sampling of the gold
            standard, for reproducible samples

        Returns:
        -------
            sampled klinker dataset

        Examples:
        --------
            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
            >>> sampled = ds.sample(0.2)

        """
        sample_ent_links = self.gold.sample(frac=frac, random_state=random_state)
        sample_left, sample_left_rel = self._sample_side(sample_ent_links, "left")
        sample_right, sample_right_rel = self._sample_side(sample_ent_links, "right")
        return KlinkerDataset(
            left=sample_left,
            right=sample_right,
            left_rel=sample_left_rel,
            right_rel=sample_right_rel,
            gold=sample_ent_links,
        )

from_sylloge(dataset, clean=False, partition_size=None) classmethod

Create a klinker dataset from sylloge dataset.


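dataset: MultiSourceEADataset: Sylloge dataset.
clean: bool: Clean attribute information (strip the datatype suffix after "^^" from attribute values).
partition_size: Optional[str]: Partition size used to repartition the triples when the dataset uses the dask backend.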


klinker dataset

Examples:


>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
Source code in klinker/data/ea_dataset.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
@classmethod
def from_sylloge(
    cls,
    dataset: MultiSourceEADataset,
    clean: bool = False,
    partition_size: Optional[str] = None,
) -> "KlinkerDataset":
    """Build a klinker dataset out of a sylloge dataset.

    Args:
    ----
      dataset: MultiSourceEADataset: Sylloge dataset.
      clean: bool: Clean attribute information, i.e. keep only the part of
        each attribute value before the first ``^^``.
      partition_size: Optional[str]: Partition size the triples are
        repartitioned to; only applied for the dask backend.

    Returns:
    -------
        klinker dataset

    Raises:
    ------
        ValueError: If the dataset backend is neither "pandas" nor "dask".

    Examples:
    --------
        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

    """

    def _strip_datatype(value):
        # Everything from the first "^^" on is the datatype annotation.
        return str(value).split("^^")[0]

    left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
    right: Union[KlinkerDaskFrame, KlinkerPandasFrame]
    names = dataset.dataset_names

    attr_left = dataset.attr_triples[0]
    attr_right = dataset.attr_triples[1]
    left_rel = dataset.rel_triples[0]
    right_rel = dataset.rel_triples[1]

    backend = dataset.backend
    if backend not in ("pandas", "dask"):
        raise ValueError(f"Unknown dataset backend {dataset.backend}")

    if backend == "pandas":
        left = KlinkerTriplePandasFrame.from_df(
            attr_left, table_name=names[0], id_col="head"
        )
        right = KlinkerTriplePandasFrame.from_df(
            attr_right, table_name=names[1], id_col="head"
        )
    else:
        if partition_size:
            # All four frames get the same partition size before wrapping.
            attr_left = dataset.attr_triples[0].repartition(
                partition_size=partition_size
            )
            attr_right = dataset.attr_triples[1].repartition(
                partition_size=partition_size
            )
            left_rel = left_rel.repartition(partition_size=partition_size)
            right_rel = right_rel.repartition(partition_size=partition_size)
        left = KlinkerTripleDaskFrame.from_dask_dataframe(
            attr_left, table_name=names[0], id_col="head"
        )
        right = KlinkerTripleDaskFrame.from_dask_dataframe(
            attr_right, table_name=names[1], id_col="head"
        )

    if clean:
        left["tail"] = left["tail"].map(_strip_datatype)
        right["tail"] = right["tail"].map(_strip_datatype)

    links = dataset.ent_links
    if not isinstance(links, PrefixedClusterHelper):
        # Plain link table: name its columns after the two datasets.
        column_names = {
            "left": dataset.dataset_names[0],
            "right": dataset.dataset_names[1],
        }
        gold = links.rename(columns=column_names)
    else:
        # Cluster helper: expand into pairs across datasets.
        gold = pd.DataFrame(
            links.all_pairs_no_intra(), columns=dataset.dataset_names
        )

    return cls(
        left=left,
        right=right,
        left_rel=left_rel,
        right_rel=right_rel,
        gold=gold,
    )

sample(frac)

Get a sample of the dataset.

Note:
A random fraction of the gold standard links is drawn; attribute and relation triples are then restricted to the entities of the sampled links.

frac: fraction of the gold standard links to sample (e.g. 0.2 for 20%)

Returns:


sampled klinker dataset

Examples:


>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
>>> sampled = ds.sample(0.2)
Source code in klinker/data/ea_dataset.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def sample(
    self, frac: float, random_state: Optional[int] = None
) -> "KlinkerDataset":
    """Get a sample of the dataset.

    Note:
    ----
        A random fraction of the gold standard links is drawn. Both sides
        are then restricted via ``_sample_side`` using the sampled links.

    Args:
    ----
      frac: fraction of the gold standard links to keep (e.g. 0.2)
      random_state: optional seed passed on to the sampling of the gold
        standard, for reproducible samples

    Returns:
    -------
        sampled klinker dataset

    Examples:
    --------
        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
        >>> sampled = ds.sample(0.2)

    """
    sample_ent_links = self.gold.sample(frac=frac, random_state=random_state)
    sample_left, sample_left_rel = self._sample_side(sample_ent_links, "left")
    sample_right, sample_right_rel = self._sample_side(sample_ent_links, "right")
    return KlinkerDataset(
        left=sample_left,
        right=sample_right,
        left_rel=sample_left_rel,
        right_rel=sample_right_rel,
        gold=sample_ent_links,
    )