Skip to content

standard

StandardBlocker

Bases: Blocker

Block on same values of a specific column.

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> from klinker.data import KlinkerDataset
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
>>> from klinker.blockers import StandardBlocker
>>> blocker = StandardBlocker(blocking_key="tail")
>>> blocks = blocker.assign(left=ds.left, right=ds.right)
Reference

Fellegi, Ivan P. and Alan B. Sunter. 'A Theory for Record Linkage.' Journal of the American Statistical Association 64 (1969): 1183-1210.

Source code in klinker/blockers/standard.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
class StandardBlocker(Blocker):
    """Block on same values of a specific column.

    Entities of each dataset are grouped by their value in ``blocking_key``;
    the per-dataset groups are then inner-joined on that value, so only
    values occurring in both datasets yield a block.

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> from klinker.data import KlinkerDataset
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
        >>> from klinker.blockers import StandardBlocker
        >>> blocker = StandardBlocker(blocking_key="tail")
        >>> blocks = blocker.assign(left=ds.left, right=ds.right)

    Quote: Reference
        Fellegi, Ivan P. and Alan B. Sunter. 'A Theory for Record Linkage.' Journal of the American Statistical Association 64 (1969): 1183-1210.
    """

    def __init__(self, blocking_key: str):
        """Store the name of the column whose values define the blocks.

        Args:
          blocking_key: str: Column to group entities on.
        """
        self.blocking_key = blocking_key

    def _inner_assign(self, kf: KlinkerFrame) -> pd.DataFrame:
        """Group the ids of a single frame by their blocking key value.

        Args:
          kf: KlinkerFrame: Frame providing ``id_col``, ``table_name`` and the blocking key column.

        Returns:
            Frame indexed by blocking key value with a single column, named
            after the frame's ``table_name``, holding the list of distinct
            ids per value.

        Raises:
            ValueError: If the frame has no (or an empty) ``table_name``.
        """
        id_col = kf.id_col
        table_name = kf.table_name
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not table_name:
            raise ValueError("KlinkerFrame needs a table_name to be blocked")

        # Extra keyword arguments are forwarded by ``apply`` to the lambda.
        apply_kwargs = {"id_col": id_col}
        if isinstance(kf, KlinkerDaskFrame):
            # The Dask branch additionally passes an explicit (empty) result
            # prototype; this is the only difference to the pandas branch.
            apply_kwargs["meta"] = pd.Series(
                [], dtype=object, index=pd.Index([], name=self.blocking_key)
            )
        series = (
            kf[[id_col, self.blocking_key]]
            .groupby(self.blocking_key)
            .apply(
                # Deduplicate ids sharing the same blocking key value.
                lambda x, id_col: list(set(x[id_col])),
                **apply_kwargs,
            )
        )
        blocked = kf.__class__._upgrade_from_series(
            series,
            columns=[table_name],
            table_name=table_name,
            id_col=id_col,
            reset_index=False,
        )
        return blocked

    def assign(
        self,
        left: KlinkerFrame,
        right: KlinkerFrame,
        left_rel: Optional[KlinkerFrame] = None,
        right_rel: Optional[KlinkerFrame] = None,
    ) -> KlinkerBlockManager:
        """Assign entity ids to blocks.

        Args:
          left: KlinkerFrame: Contains entity attribute information of left dataset.
          right: KlinkerFrame: Contains entity attribute information of right dataset.
          left_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of left dataset. Not used by this blocker.
          right_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of right dataset. Not used by this blocker.

        Returns:
            KlinkerBlockManager: instance holding the resulting blocks.
        """
        left_assign = self._inner_assign(left)
        right_assign = self._inner_assign(right)
        # Inner join on the index keeps only blocking key values that occur
        # in both datasets.
        pd_blocks = left_assign.join(right_assign, how="inner")
        if isinstance(left_assign, dd.DataFrame):
            return KlinkerBlockManager(pd_blocks)
        return KlinkerBlockManager.from_pandas(pd_blocks)

assign(left, right, left_rel=None, right_rel=None)

Assign entity ids to blocks.

Parameters:

Name Type Description Default
left KlinkerFrame

KlinkerFrame: Contains entity attribute information of left dataset.

required
right KlinkerFrame

KlinkerFrame: Contains entity attribute information of right dataset.

required
left_rel Optional[KlinkerFrame]

Optional[KlinkerFrame]: (Default value = None) Contains relational information of left dataset.

None
right_rel Optional[KlinkerFrame]

Optional[KlinkerFrame]: (Default value = None) Contains relational information of right dataset.

None

Returns:

Name Type Description
KlinkerBlockManager KlinkerBlockManager

instance holding the resulting blocks.

Source code in klinker/blockers/standard.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def assign(
    self,
    left: KlinkerFrame,
    right: KlinkerFrame,
    left_rel: Optional[KlinkerFrame] = None,
    right_rel: Optional[KlinkerFrame] = None,
) -> KlinkerBlockManager:
    """Build blocks from the entities of both datasets.

    Each dataset is grouped on its own via ``_inner_assign``; the two
    results are then inner-joined on their index.

    Args:
      left: KlinkerFrame: Entity attribute information of the left dataset.
      right: KlinkerFrame: Entity attribute information of the right dataset.
      left_rel: Optional[KlinkerFrame]:  (Default value = None) Relational information of the left dataset; not read here.
      right_rel: Optional[KlinkerFrame]:  (Default value = None) Relational information of the right dataset; not read here.

    Returns:
        KlinkerBlockManager: instance holding the resulting blocks.
    """
    left_blocks, right_blocks = (
        self._inner_assign(frame) for frame in (left, right)
    )
    joined = left_blocks.join(right_blocks, how="inner")
    # Anything that is not a Dask frame goes through the pandas constructor.
    if not isinstance(left_blocks, dd.DataFrame):
        return KlinkerBlockManager.from_pandas(joined)
    return KlinkerBlockManager(joined)