Skip to content

qgrams

QgramsBlocker

Bases: StandardBlocker

Blocker relying on qgram procedure

Parameters:

Name Type Description Default
blocking_key str

str: On which attribute the blocking should be done

required
q int

int: how big the qgrams should be.

3

Attributes:

Name Type Description
blocking_key

str: On which attribute the blocking should be done

q

int: how big the qgrams should be.

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> from klinker.data import KlinkerDataset
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
>>> from klinker.blockers import QgramsBlocker
>>> blocker = QgramsBlocker(blocking_key="tail")
>>> blocks = blocker.assign(left=ds.left, right=ds.right)
Source code in klinker/blockers/qgrams.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class QgramsBlocker(StandardBlocker):
    """Blocker relying on qgram procedure

    The value stored under ``blocking_key`` is tokenized into q-grams and
    every q-gram becomes its own row before the blocking of the parent class
    is applied to the tokenized tables.

    Args:
        blocking_key: str: On which attribute the blocking should be done
        q: int: how big the qgrams should be.

    Attributes:
        blocking_key: str: On which attribute the blocking should be done
        q: int: how big the qgrams should be.

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> from klinker.data import KlinkerDataset
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
        >>> from klinker.blockers import QgramsBlocker
        >>> blocker = QgramsBlocker(blocking_key="tail")
        >>> blocks = blocker.assign(left=ds.left, right=ds.right)
    """

    def __init__(self, blocking_key: str, q: int = 3):
        super().__init__(blocking_key=blocking_key)
        self.q = q

    def qgram_tokenize(self, x: str) -> Optional[List[str]]:
        """Tokenize into qgrams

        Args:
          x: str: input string; ``None`` is passed through unchanged.

        Returns:
            list of qgrams of size ``self.q``, or ``None`` if ``x`` is ``None``
        """
        # Only ``None`` is treated as missing here.
        # NOTE(review): other missing markers (e.g. float NaN) are forwarded
        # to ``ngrams`` as-is -- confirm upstream cleaning guarantees ``None``.
        if x is None:
            return None
        return ["".join(tok) for tok in ngrams(x, self.q)]

    def assign(
        self,
        left: KlinkerFrame,
        right: KlinkerFrame,
        left_rel: Optional[KlinkerFrame] = None,
        right_rel: Optional[KlinkerFrame] = None,
    ) -> KlinkerBlockManager:
        """Assign entity ids to blocks.

        Args:
          left: KlinkerFrame: Contains entity attribute information of left dataset.
          right: KlinkerFrame: Contains entity attribute information of right dataset.
          left_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of left dataset. Not read by this method.
          right_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of right dataset. Not read by this method.

        Returns:
            KlinkerBlockManager: instance holding the resulting blocks.
        """
        assert isinstance(self.blocking_key, str)
        qgramed = []
        for tab in [left, right]:
            # Index by entity id so the id survives tokenizing and exploding.
            reduced = tab.set_index(tab.id_col)[self.blocking_key]
            # Decide per table (not only by ``left``) whether the dask code
            # path is needed, so mixed pandas/dask inputs are handled.
            if isinstance(tab, dd.DataFrame):
                # ``meta`` is passed on the dask path to declare the output
                # name and dtype of the applied function.
                series = reduced.apply(
                    self.qgram_tokenize, meta=(self.blocking_key, "object")
                )
            else:
                series = reduced.apply(self.qgram_tokenize)
            # One row per (entity id, q-gram) instead of one list per entity.
            series = series.explode()

            kf = type(tab)._upgrade_from_series(
                series,
                table_name=tab.table_name,
                id_col=tab.id_col,
                columns=[tab.id_col, self.blocking_key],
            )
            qgramed.append(kf)
        return super().assign(left=qgramed[0], right=qgramed[1])

assign(left, right, left_rel=None, right_rel=None)

Assign entity ids to blocks.

Parameters:

Name Type Description Default
left KlinkerFrame

KlinkerFrame: Contains entity attribute information of left dataset.

required
right KlinkerFrame

KlinkerFrame: Contains entity attribute information of right dataset.

required
left_rel Optional[KlinkerFrame]

Optional[KlinkerFrame]: (Default value = None) Contains relational information of left dataset.

None
right_rel Optional[KlinkerFrame]

Optional[KlinkerFrame]: (Default value = None) Contains relational information of right dataset.

None

Returns:

Name Type Description
KlinkerBlockManager KlinkerBlockManager

instance holding the resulting blocks.

Source code in klinker/blockers/qgrams.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def assign(
    self,
    left: KlinkerFrame,
    right: KlinkerFrame,
    left_rel: Optional[KlinkerFrame] = None,
    right_rel: Optional[KlinkerFrame] = None,
) -> KlinkerBlockManager:
    """Assign entity ids to blocks.

    Each table is reduced to its blocking-key column, tokenized into q-grams
    and exploded to one row per q-gram before the parent class' ``assign``
    is called on the tokenized tables.

    Args:
      left: KlinkerFrame: Contains entity attribute information of left dataset.
      right: KlinkerFrame: Contains entity attribute information of right dataset.
      left_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of left dataset. Not read by this method.
      right_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of right dataset. Not read by this method.

    Returns:
        KlinkerBlockManager: instance holding the resulting blocks.
    """
    assert isinstance(self.blocking_key, str)
    qgramed = []
    for tab in [left, right]:
        # Index by entity id so the id survives tokenizing and exploding.
        reduced = tab.set_index(tab.id_col)[self.blocking_key]
        # Check the table being processed (not always ``left``) so that a
        # pandas/dask mix of inputs takes the matching ``apply`` path.
        if isinstance(tab, dd.DataFrame):
            # ``meta`` is passed on the dask path to declare the output
            # name and dtype of the applied function.
            series = reduced.apply(
                self.qgram_tokenize, meta=(self.blocking_key, "object")
            )
        else:
            series = reduced.apply(self.qgram_tokenize)
        # One row per (entity id, q-gram) instead of one list per entity.
        series = series.explode()

        kf = type(tab)._upgrade_from_series(
            series,
            table_name=tab.table_name,
            id_col=tab.id_col,
            columns=[tab.id_col, self.blocking_key],
        )
        qgramed.append(kf)
    return super().assign(left=qgramed[0], right=qgramed[1])

qgram_tokenize(x)

Tokenize into qgrams

Parameters:

Name Type Description Default
x str

str: input string

required

Returns:

Type Description
Optional[List[str]]

list of qgrams

Source code in klinker/blockers/qgrams.py
36
37
38
39
40
41
42
43
44
45
46
47
48
def qgram_tokenize(self, x: str) -> Optional[List[str]]:
    """Break a string into q-grams of size ``self.q``.

    Args:
      x: str: value to tokenize; ``None`` is passed through unchanged.

    Returns:
        ``None`` if ``x`` is ``None``, otherwise a list holding one joined
        string for every item produced by ``ngrams(x, self.q)``.
    """
    # Guard clause: a missing value stays missing.
    if x is None:
        return None

    qgrams = []
    for gram in ngrams(x, self.q):
        # Each gram is an iterable of string pieces; glue them together.
        qgrams.append("".join(gram))
    return qgrams