Bases: StandardBlocker
Blocker relying on qgram procedure.
Args:
blocking_key: str: On which attribute the blocking should be done
q: int: how big the qgrams should be.
Attributes:
blocking_key: str: On which attribute the blocking should be done
q: int: how big the qgrams should be.
Examples:
>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> from klinker.data import KlinkerDataset
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
>>> from klinker.blockers import QgramsBlocker
>>> blocker = QgramsBlocker(blocking_key="tail")
>>> blocks = blocker.assign(left=ds.left, right=ds.right)
Source code in klinker/blockers/qgrams.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class QgramsBlocker(StandardBlocker):
    """Blocker relying on qgram procedure.

    Each value of the blocking attribute is split into character q-grams and
    the tables are exploded to one row per q-gram before being handed to
    ``StandardBlocker.assign``.

    Args:
    ----
        blocking_key: str: On which attribute the blocking should be done
        q: int: how big the qgrams should be.

    Attributes:
    ----------
        blocking_key: str: On which attribute the blocking should be done
        q: int: how big the qgrams should be.

    Examples:
    --------
        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> from klinker.data import KlinkerDataset
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
        >>> from klinker.blockers import QgramsBlocker
        >>> blocker = QgramsBlocker(blocking_key="tail")
        >>> blocks = blocker.assign(left=ds.left, right=ds.right)

    """

    def __init__(self, blocking_key: str, q: int = 3):
        super().__init__(blocking_key=blocking_key)
        self.q = q

    def qgram_tokenize(self, x: Optional[str]) -> Optional[List[str]]:
        """Tokenize into qgrams.

        Args:
        ----
            x: Optional[str]: input string; ``None`` or float NaN mark a
                missing value.

        Returns:
        -------
            list of qgrams, or ``None`` if the value is missing.
        """
        # Missing values usually arrive from pandas/dask as float NaN rather
        # than None; ``x != x`` is only true for NaN. Without this guard
        # ``ngrams`` would raise TypeError on the float.
        if x is None or (isinstance(x, float) and x != x):
            return None
        return ["".join(tok) for tok in ngrams(x, self.q)]

    def assign(
        self,
        left: KlinkerFrame,
        right: KlinkerFrame,
        left_rel: Optional[KlinkerFrame] = None,
        right_rel: Optional[KlinkerFrame] = None,
    ) -> KlinkerBlockManager:
        """Assign entity ids to blocks.

        Args:
        ----
            left: KlinkerFrame: Contains entity attribute information of left dataset.
            right: KlinkerFrame: Contains entity attribute information of right dataset.
            left_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of left dataset.
                Not used by this blocker.
            right_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of right dataset.
                Not used by this blocker.

        Returns:
        -------
            KlinkerBlockManager: instance holding the resulting blocks.
        """
        # NOTE(review): presumably narrows the attribute type for static
        # checkers; kept as in the original.
        assert isinstance(self.blocking_key, str)
        qgramed = []
        for tab in [left, right]:
            # Series of blocking-attribute values indexed by entity id.
            reduced = tab.set_index(tab.id_col)[self.blocking_key]
            # Decide per table (not only by ``left``) whether the dask code
            # path is needed: pandas' ``apply`` would forward ``meta`` to
            # ``qgram_tokenize`` as an unexpected keyword argument.
            if isinstance(tab, dd.DataFrame):
                series = reduced.apply(
                    self.qgram_tokenize, meta=(self.blocking_key, "object")
                )
            else:
                series = reduced.apply(self.qgram_tokenize)
            # One row per q-gram; the entity id in the index is repeated.
            series = series.explode()
            kf = tab.__class__._upgrade_from_series(
                series,
                table_name=tab.table_name,
                id_col=tab.id_col,
                columns=[tab.id_col, self.blocking_key],
            )
            qgramed.append(kf)
        return super().assign(left=qgramed[0], right=qgramed[1])
|
assign(left, right, left_rel=None, right_rel=None)
Assign entity ids to blocks.
left: KlinkerFrame: Contains entity attribute information of left dataset.
right: KlinkerFrame: Contains entity attribute information of right dataset.
left_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of left dataset.
right_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of right dataset.
KlinkerBlockManager: instance holding the resulting blocks.
Source code in klinker/blockers/qgrams.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def assign(
    self,
    left: KlinkerFrame,
    right: KlinkerFrame,
    left_rel: Optional[KlinkerFrame] = None,
    right_rel: Optional[KlinkerFrame] = None,
) -> KlinkerBlockManager:
    """Assign entity ids to blocks.

    Both tables are reduced to the blocking attribute, tokenized into q-grams
    with ``qgram_tokenize`` and exploded to one row per q-gram; the resulting
    frames are then passed to the parent class' ``assign``.

    Args:
    ----
        left: KlinkerFrame: Contains entity attribute information of left dataset.
        right: KlinkerFrame: Contains entity attribute information of right dataset.
        left_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of left dataset.
            Not used by this blocker.
        right_rel: Optional[KlinkerFrame]: (Default value = None) Contains relational information of right dataset.
            Not used by this blocker.

    Returns:
    -------
        KlinkerBlockManager: instance holding the resulting blocks.
    """
    # NOTE(review): presumably narrows the attribute type for static
    # checkers; kept as in the original.
    assert isinstance(self.blocking_key, str)
    qgramed = []
    for tab in [left, right]:
        # Series of blocking-attribute values indexed by entity id.
        reduced = tab.set_index(tab.id_col)[self.blocking_key]
        # Decide per table (not only by ``left``) whether the dask code path
        # is needed: pandas' ``apply`` would forward ``meta`` to
        # ``qgram_tokenize`` as an unexpected keyword argument.
        if isinstance(tab, dd.DataFrame):
            series = reduced.apply(
                self.qgram_tokenize, meta=(self.blocking_key, "object")
            )
        else:
            series = reduced.apply(self.qgram_tokenize)
        # One row per q-gram; the entity id in the index is repeated.
        series = series.explode()
        kf = tab.__class__._upgrade_from_series(
            series,
            table_name=tab.table_name,
            id_col=tab.id_col,
            columns=[tab.id_col, self.blocking_key],
        )
        qgramed.append(kf)
    return super().assign(left=qgramed[0], right=qgramed[1])
|
qgram_tokenize(x)
Tokenize into qgrams.
x: str: input string
Source code in klinker/blockers/qgrams.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def qgram_tokenize(self, x: Optional[str]) -> Optional[List[str]]:
    """Tokenize into qgrams.

    Args:
    ----
        x: Optional[str]: input string; ``None`` or float NaN mark a missing
            value.

    Returns:
    -------
        list of qgrams of length ``self.q``, or ``None`` if the value is
        missing.
    """
    # Missing values usually arrive from pandas/dask as float NaN rather than
    # None; ``x != x`` is only true for NaN. Without this guard ``ngrams``
    # would raise TypeError on the float.
    if x is None or (isinstance(x, float) and x != x):
        return None
    return ["".join(tok) for tok in ngrams(x, self.q)]
|