Skip to content

blockbuilder

ClusteringEmbeddingBlockBuilder

Bases: EmbeddingBlockBuilder

Use clustering of embeddings for blockbuilding.

Source code in klinker/blockers/embedding/blockbuilder.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
class ClusteringEmbeddingBlockBuilder(EmbeddingBlockBuilder):
    """Use clustering of embeddings for blockbuilding.

    Subclasses provide the clustering via ``_cluster``; entities of the left
    and right dataset that share a cluster label are put into the same block.
    """

    def _cluster(
        self,
        left: GeneralVector,
        right: GeneralVector,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Cluster embeddings.

        Args:
          left: GeneralVector: left embeddings.
          right: GeneralVector: right embeddings.

        Returns:
            cluster labels of left/right

        Raises:
            NotImplementedError: Always, subclasses have to override this.
        """
        raise NotImplementedError

    @staticmethod
    def blocks_side(
        cluster_labels: np.ndarray, names: List[str], data_name: str
    ) -> pd.DataFrame:
        """Create blocks from cluster labels for one side.

        Args:
          cluster_labels: np.ndarray: Cluster labels.
          names: List[str]: Entity names.
          data_name: str: Name of dataset.

        Returns:
            Blocks for one side as pandas DataFrame, indexed by cluster label
            with a single column ``data_name`` holding the set of entity names.
        """
        # Two rows (names, labels) transposed into two columns 0 and 1;
        # grouping by column 1 (the label) collects the names per cluster.
        # NOTE(review): every distinct label becomes a block, including any
        # special "noise" label a clusterer may emit -- confirm this is wanted.
        blocked = pd.DataFrame([names, cluster_labels]).transpose().groupby(1).agg(set)
        blocked.columns = [data_name]
        blocked.index.name = "cluster"
        return blocked

    def build_blocks(
        self,
        left: NamedVector,
        right: NamedVector,
        left_name: str,
        right_name: str,
    ) -> KlinkerBlockManager:
        """Build blocks from given embeddings.

        Args:
          left: NamedVector: Left embeddings.
          right: NamedVector: Right embeddings.
          left_name: str: Name of left dataset.
          right_name: str: Name of right dataset.

        Returns:
            Blocks
        """
        left_cluster_labels, right_cluster_labels = self._cluster(
            left.vectors, right.vectors
        )
        # Look up blocks_side via self so that subclass overrides are used.
        left_blocks = self.blocks_side(left_cluster_labels, left.names, left_name)
        right_blocks = self.blocks_side(right_cluster_labels, right.names, right_name)
        # Inner join on the cluster index: only clusters that contain entities
        # of both datasets are kept as blocks.
        return KlinkerBlockManager.from_pandas(
            left_blocks.join(right_blocks, how="inner")
        )

blocks_side(cluster_labels, names, data_name) staticmethod

Create blocks from cluster labels for one side.

Parameters:

Name Type Description Default
cluster_labels ndarray

np.ndarray: Cluster labels.

required
names List[str]

List[str]: Entity names.

required
data_name str

str: Name of dataset.

required

Returns:

Type Description
DataFrame

Blocks for one side as pandas DataFrame

Source code in klinker/blockers/embedding/blockbuilder.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@staticmethod
def blocks_side(
    cluster_labels: np.ndarray, names: List[str], data_name: str
) -> pd.DataFrame:
    """Create blocks form cluster labels for one side.

    Args:
      cluster_labels: np.ndarray: Cluster labels.
      names: List[str]: Entity names.
      data_name: str: Name of dataset.

    Returns:
        Blocks for one side as pandas DataFrame
    """
    blocked = pd.DataFrame([names, cluster_labels]).transpose().groupby(1).agg(set)
    blocked.columns = [data_name]
    blocked.index.name = "cluster"
    return blocked

build_blocks(left, right, left_name, right_name)

Build blocks from given embeddings.

Parameters:

Name Type Description Default
left NamedVector

NamedVector: Left embeddings.

required
right NamedVector

NamedVector: Right embeddings.

required
left_name str

str: Name of left dataset.

required
right_name str

str: Name of right dataset.

required

Returns:

Type Description
DataFrame

Blocks

Source code in klinker/blockers/embedding/blockbuilder.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def build_blocks(
    self,
    left: NamedVector,
    right: NamedVector,
    left_name: str,
    right_name: str,
) -> KlinkerBlockManager:
    """Build blocks from given embeddings.

    Args:
      left: NamedVector: Left embeddings.
      right: NamedVector: Right embeddings.
      left_name: str: Name of left dataset.
      right_name: str: Name of right dataset.

    Returns:
        Blocks
    """
    left_cluster_labels, right_cluster_labels = self._cluster(
        left.vectors, right.vectors
    )
    # Look up blocks_side via self so that subclass overrides are used.
    left_blocks = self.blocks_side(left_cluster_labels, left.names, left_name)
    right_blocks = self.blocks_side(right_cluster_labels, right.names, right_name)
    # Inner join on the cluster index: only clusters that contain entities
    # of both datasets are kept as blocks.
    return KlinkerBlockManager.from_pandas(
        left_blocks.join(right_blocks, how="inner")
    )

EmbeddingBlockBuilder

Base class for building blocks from embeddings.

Source code in klinker/blockers/embedding/blockbuilder.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class EmbeddingBlockBuilder:
    """Abstract interface for turning embeddings of two datasets into blocks."""

    def build_blocks(
        self,
        left: NamedVector,
        right: NamedVector,
        left_name: str,
        right_name: str,
    ) -> KlinkerBlockManager:
        """Create blocks out of the embeddings of both datasets.

        Args:
          left: NamedVector: Left embeddings.
          right: NamedVector: Right embeddings.
          left_name: str: Name of left dataset.
          right_name: str: Name of right dataset.

        Returns:
            Blocks

        Raises:
            NotImplementedError: Always, concrete builders override this method.
        """
        raise NotImplementedError()

build_blocks(left, right, left_name, right_name)

Build blocks from given embeddings.

Parameters:

Name Type Description Default
left NamedVector

NamedVector: Left embeddings.

required
right NamedVector

NamedVector: Right embeddings.

required
left_name str

str: Name of left dataset.

required
right_name str

str: Name of right dataset.

required

Returns:

Type Description
KlinkerBlockManager

Blocks

Source code in klinker/blockers/embedding/blockbuilder.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def build_blocks(
    self,
    left: NamedVector,
    right: NamedVector,
    left_name: str,
    right_name: str,
) -> KlinkerBlockManager:
    """Create blocks out of the embeddings of both datasets.

    Args:
      left: NamedVector: Left embeddings.
      right: NamedVector: Right embeddings.
      left_name: str: Name of left dataset.
      right_name: str: Name of right dataset.

    Returns:
        Blocks

    Raises:
        NotImplementedError: Always, concrete builders override this method.
    """
    raise NotImplementedError()

HDBSCANEmbeddingBlockBuilder

Bases: ClusteringEmbeddingBlockBuilder

Use HDBSCAN clustering for block building.

For information about parameter selection visit https://hdbscan.readthedocs.io/en/latest/parameter_selection.html.

Parameters:

Name Type Description Default
min_cluster_size int

int: The minimum size of clusters.

5
min_samples Optional[int]

Optional[int]: The number of samples in a neighbourhood for a point to be considered a core point.

None
cluster_selection_epsilon float

float: A distance threshold. Clusters below this value will be merged.

0.0
metric str

str: Distance metric to use.

'euclidean'
alpha float

float: A distance scaling parameter as used in robust single linkage.

1.0
p Optional[float]

Optional[float]: p value to use if using the minkowski metric.

None
cluster_selection_method str

str: The method used to select clusters from the condensed tree.

'eom'
kwargs

Arguments passed to the distance metric

{}

Examples:

>>> import numpy as np
>>> from klinker.data import NamedVector
>>> from klinker.blockers.embedding.blockbuilder import HDBSCANEmbeddingBlockBuilder
>>> left = np.random.rand(50,2)
>>> right = np.random.rand(50,2)
>>> left_names = [f"left_{i}" for i in range(len(left))]
>>> right_names = [f"right_{i}" for i in range(len(right))]
>>> left_v = NamedVector(left_names, left)
>>> right_v = NamedVector(right_names, right)
>>> emb_bb = HDBSCANEmbeddingBlockBuilder()
>>> blocks = emb_bb.build_blocks(left_v, right_v, "left", "right")
>>> blocks[0].compute() #doctest: +SKIP
                              left                right
cluster
0        {left_22, left_3, left_7}  {right_6, right_27}
Source code in klinker/blockers/embedding/blockbuilder.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
class HDBSCANEmbeddingBlockBuilder(ClusteringEmbeddingBlockBuilder):
    """Use HDBSCAN clustering for block building.

    For information about parameter selection visit <https://hdbscan.readthedocs.io/en/latest/parameter_selection.html>.

    Args:
        min_cluster_size: int: The minimum size of clusters.
        min_samples: Optional[int]: The number of samples in a neighbourhood for a point to be considered a core point.
        cluster_selection_epsilon: float: A distance threshold. Clusters below this value will be merged.
        metric: str: Distance metric to use.
        alpha: float: A distance scaling parameter as used in robust single linkage.
        p: Optional[float]: p value to use if using the minkowski metric.
        cluster_selection_method: str: The method used to select clusters from the condensed tree.
        kwargs: Arguments passed to the distance metric

    Examples:

        >>> import numpy as np
        >>> from klinker.data import NamedVector
        >>> from klinker.blockers.embedding.blockbuilder import HDBSCANEmbeddingBlockBuilder
        >>> left = np.random.rand(50,2)
        >>> right = np.random.rand(50,2)
        >>> left_names = [f"left_{i}" for i in range(len(left))]
        >>> right_names = [f"right_{i}" for i in range(len(right))]
        >>> left_v = NamedVector(left_names, left)
        >>> right_v = NamedVector(right_names, right)
        >>> emb_bb = HDBSCANEmbeddingBlockBuilder()
        >>> blocks = emb_bb.build_blocks(left_v, right_v, "left", "right")
        >>> blocks[0].compute() #doctest: +SKIP
                                      left                right
        cluster
        0        {left_22, left_3, left_7}  {right_6, right_27}

    """

    def __init__(
        self,
        min_cluster_size: int = 5,
        min_samples: Optional[int] = None,
        cluster_selection_epsilon: float = 0.0,
        metric: str = "euclidean",
        alpha: float = 1.0,
        p: Optional[float] = None,
        cluster_selection_method: str = "eom",
        **kwargs
    ):
        # Every documented argument is forwarded to the clusterer;
        # cluster_selection_method used to be accepted but silently dropped.
        self.clusterer = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon,
            metric=metric,
            alpha=alpha,
            p=p,
            cluster_selection_method=cluster_selection_method,
            **kwargs
        )

    def _cluster(
        self,
        left: GeneralVector,
        right: GeneralVector,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Cluster embeddings.

        Both sides are clustered together in a single run, so the returned
        labels of left and right refer to the same clusters.

        Args:
          left: GeneralVector: left embeddings.
          right: GeneralVector: right embeddings.

        Returns:
            cluster labels of left/right
        """
        cluster_labels = self.clusterer.fit_predict(np.concatenate([left, right]))
        # The first len(left) labels belong to left, the remainder to right.
        return cluster_labels[: len(left)], cluster_labels[len(left) :]

KiezEmbeddingBlockBuilder

Bases: NearestNeighborEmbeddingBlockBuilder

Use kiez for nearest neighbor calculation.

Parameters:

Name Type Description Default
n_neighbors int

number k nearest neighbors.

5
algorithm Optional[Union[str, NNAlgorithm, Type[NNAlgorithm]]]

nearest neighbor algorithm.

None
algorithm_kwargs Optional[Dict[str, Any]]

keyword arguments for initialising nearest neighbor algorithm.

None
hubness Optional[Union[str, HubnessReduction, Type[HubnessReduction]]]

hubness reduction method if wanted.

None
hubness_kwargs Optional[Dict[str, Any]]

keyword arguments for initialising hubness reduction.

None

Examples:

>>> import numpy as np
>>> from klinker.data import NamedVector
>>> from klinker.blockers.embedding import KiezEmbeddingBlockBuilder
>>> left = np.random.rand(50,2)
>>> right = np.random.rand(50,2)
>>> left_names = [f"left_{i}" for i in range(10)]
>>> left_names = [f"left_{i}" for i in range(len(left))]
>>> right_names = [f"right_{i}" for i in range(len(right))]
>>> left_v = NamedVector(left_names, left)
>>> right_v = NamedVector(right_names, right)
>>> emb_bb = KiezEmbeddingBlockBuilder()
>>> blocks = emb_bb.build_blocks(left_v, right_v, "left", "right") # doctest: +SKIP
>>> blocks[0].compute() # doctest: +SKIP
               left                                              right
0  [left_0]  [right_3, right_24, right_11, right_46, right_37]
Source code in klinker/blockers/embedding/blockbuilder.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class KiezEmbeddingBlockBuilder(NearestNeighborEmbeddingBlockBuilder):
    """Use kiez for nearest neighbor calculation.

    Args:
        n_neighbors: number k nearest neighbors.
        algorithm: nearest neighbor algorithm.
        algorithm_kwargs: keyword arguments for initialising nearest neighbor algorithm.
        hubness: hubness reduction method if wanted.
        hubness_kwargs: keyword arguments for initialising hubness reduction.

    Examples:

        >>> import numpy as np
        >>> from klinker.data import NamedVector
        >>> from klinker.blockers.embedding import KiezEmbeddingBlockBuilder
        >>> left = np.random.rand(50,2)
        >>> right = np.random.rand(50,2)
        >>> left_names = [f"left_{i}" for i in range(len(left))]
        >>> right_names = [f"right_{i}" for i in range(len(right))]
        >>> left_v = NamedVector(left_names, left)
        >>> right_v = NamedVector(right_names, right)
        >>> emb_bb = KiezEmbeddingBlockBuilder()
        >>> blocks = emb_bb.build_blocks(left_v, right_v, "left", "right") # doctest: +SKIP
        >>> blocks[0].compute() # doctest: +SKIP
                       left                                              right
        0  [left_0]  [right_3, right_24, right_11, right_46, right_37]

    """

    def __init__(
        self,
        n_neighbors: int = 5,
        algorithm: Optional[Union[str, NNAlgorithm, Type[NNAlgorithm]]] = None,
        algorithm_kwargs: Optional[Dict[str, Any]] = None,
        hubness: Optional[Union[str, HubnessReduction, Type[HubnessReduction]]] = None,
        hubness_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.kiez = Kiez(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            algorithm_kwargs=algorithm_kwargs,
            hubness=hubness,
            hubness_kwargs=hubness_kwargs,
        )

    def _get_neighbors(
        self,
        left: GeneralVector,
        right: GeneralVector,
    ) -> np.ndarray:
        """Get nearest neighbors of left entities in right embeddings.

        Args:
          left: GeneralVector: Left embeddings.
          right: GeneralVector: Right embeddings.

        Returns:
            nearest neighbors
        """
        # Convert each side on its own so that a mix of tensor and array
        # input is handed to kiez uniformly as numpy arrays.
        if isinstance(left, torch.Tensor):
            left = left.detach().cpu().numpy()
        if isinstance(right, torch.Tensor):
            right = right.detach().cpu().numpy()
        self.kiez.fit(left, right)
        neighs = self.kiez.kneighbors(return_distance=False)
        assert isinstance(neighs, np.ndarray)  # for mypy
        return neighs

NearestNeighborEmbeddingBlockBuilder

Bases: EmbeddingBlockBuilder

Build blocks from embeddings by using n-nearest neighbors as blocks.

Source code in klinker/blockers/embedding/blockbuilder.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class NearestNeighborEmbeddingBlockBuilder(EmbeddingBlockBuilder):
    """Build blocks from embeddings by using n-nearest neighbors as blocks."""

    def _get_neighbors(
        self,
        left: GeneralVector,
        right: GeneralVector,
    ) -> np.ndarray:
        """Get nearest neighbors of left entities in right embeddings.

        Args:
          left: GeneralVector: Left embeddings.
          right: GeneralVector: Right embeddings.

        Returns:
            nearest neighbors

        Raises:
            NotImplementedError: Always, subclasses have to override this.
        """
        raise NotImplementedError

    def build_blocks(
        self,
        left: NamedVector,
        right: NamedVector,
        left_name: str,
        right_name: str,
    ) -> KlinkerBlockManager:
        """Build blocks from given embeddings.

        Every left entity forms one block together with the right entities
        returned as its neighbors.

        Args:
          left: NamedVector: Left embeddings.
          right: NamedVector: Right embeddings.
          left_name: str: Name of left dataset.
          right_name: str: Name of right dataset.

        Returns:
            Blocks
        """
        neighbors = self._get_neighbors(left=left.vectors, right=right.vectors)
        df = pd.DataFrame(neighbors)
        # Translate neighbor indices to entity names with a single array
        # lookup instead of the deprecated, per-cell DataFrame.applymap.
        # dtype=object keeps the original name objects untouched.
        right_names = np.asarray(right.names, dtype=object)
        df[right_name] = right_names[df.to_numpy()].tolist()
        # One single-element list per row: the left entity of that block.
        df[left_name] = [[name] for name in left.names]
        return KlinkerBlockManager.from_pandas(df[[left_name, right_name]])

build_blocks(left, right, left_name, right_name)

Build blocks from given embeddings.

Parameters:

Name Type Description Default
left NamedVector

NamedVector: Left embeddings.

required
right NamedVector

NamedVector: Right embeddings.

required
left_name str

str: Name of left dataset.

required
right_name str

str: Name of right dataset.

required

Returns:

Type Description
KlinkerBlockManager

Blocks

Source code in klinker/blockers/embedding/blockbuilder.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def build_blocks(
    self,
    left: NamedVector,
    right: NamedVector,
    left_name: str,
    right_name: str,
) -> KlinkerBlockManager:
    """Build blocks from given embeddings.

    Every left entity forms one block together with the right entities
    returned as its neighbors.

    Args:
      left: NamedVector: Left embeddings.
      right: NamedVector: Right embeddings.
      left_name: str: Name of left dataset.
      right_name: str: Name of right dataset.

    Returns:
        Blocks
    """
    neighbors = self._get_neighbors(left=left.vectors, right=right.vectors)
    df = pd.DataFrame(neighbors)
    # Translate neighbor indices to entity names with a single array
    # lookup instead of the deprecated, per-cell DataFrame.applymap.
    # dtype=object keeps the original name objects untouched.
    right_names = np.asarray(right.names, dtype=object)
    df[right_name] = right_names[df.to_numpy()].tolist()
    # One single-element list per row: the left entity of that block.
    df[left_name] = [[name] for name in left.names]
    return KlinkerBlockManager.from_pandas(df[[left_name, right_name]])