lsh

`MinHashLSHBlocker`

Bases: SchemaAgnosticBlocker

Blocker relying on MinHashLSH procedure.

tokenize_fn: Callable: Function that tokenizes entity attribute values.
threshold: float: Jaccard threshold to use in underlying lsh procedure.
num_perm: int: number of permutations used in minhash algorithm.
weights: Tuple[float,float]: false positive/false negative weighting (must add up to one)

tokenize_fn: Callable: Function that tokenizes entity attribute values.
threshold: float: Jaccard threshold to use in underlying lsh procedure.
num_perm: int: number of permutations used in minhash algorithm.
weights: Tuple[float,float]: false positive/false negative weighting (must add up to one)

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> from klinker.data import KlinkerDataset
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
>>> from klinker.blockers import MinHashLSHBlocker
>>> blocker = MinHashLSHBlocker(threshold=0.8, weights=(0.7,0.3))
>>> blocks = blocker.assign(left=ds.left, right=ds.right)

Source code in klinker/blockers/lsh.py

class MinHashLSHBlocker(SchemaAgnosticBlocker):
    """Blocker relying on MinHashLSH procedure.

    Args:
    ----
        tokenize_fn: Callable: Function that tokenizes entity attribute values.
        stop_words: Optional[List[str]]: (Default value = None) Forwarded to the FilteredTokenizer.
        min_token_length: int: (Default value = 3) Forwarded to the FilteredTokenizer.
        threshold: float: Jaccard threshold to use in underlying lsh procedure.
        num_perm: int: number of permutations used in minhash algorithm.
        weights: Tuple[float,float]: false positive/false negative weighting (must add up to one)

    Attributes:
    ----------
        tokenizer: FilteredTokenizer: Tokenizer built from tokenize_fn, min_token_length and stop_words.
        threshold: float: Jaccard threshold to use in underlying lsh procedure.
        num_perm: int: number of permutations used in minhash algorithm.
        weights: Tuple[float,float]: false positive/false negative weighting (must add up to one)

    Examples:
    --------
        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> from klinker.data import KlinkerDataset
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark(),clean=True)
        >>> from klinker.blockers import MinHashLSHBlocker
        >>> blocker = MinHashLSHBlocker(threshold=0.8, weights=(0.7,0.3))
        >>> blocks = blocker.assign(left=ds.left, right=ds.right)

    """

    def __init__(
        self,
        tokenize_fn: Callable = word_tokenize,
        stop_words: Optional[List[str]] = None,
        min_token_length: int = 3,
        threshold: float = 0.5,
        num_perm: int = 128,
        weights: Tuple[float, float] = (0.5, 0.5),
    ):
        # LSH settings are only stored here; the MinHashLSH instance itself
        # is created on every call to _assign.
        self.threshold = threshold
        self.num_perm = num_perm
        self.weights = weights
        # The raw tokenize function is wrapped together with the filter settings.
        self.tokenizer = FilteredTokenizer(
            tokenize_fn=tokenize_fn,
            min_token_length=min_token_length,
            stop_words=stop_words,
        )

    def _inner_encode(self, val: str):
        """Tokenize a value and encode each token as UTF-8 bytes.

        Args:
        ----
          val: str: input value; it is converted with ``str()`` before tokenizing.

        Returns:
        -------
            list of bytes, one entry per token produced by the tokenizer.
        """
        tokens = self.tokenizer.tokenize(str(val))
        return [token.encode("utf-8") for token in tokens]

    def _assign(
        self,
        left: SeriesType,
        right: SeriesType,
        left_rel: Optional[KlinkerFrame] = None,
        right_rel: Optional[KlinkerFrame] = None,
    ) -> KlinkerBlockManager:
        """Assign entity ids to blocks.

        Uses minhash algorithm to encode entities via tokenized attributes.
        Fills a lsh instance with the left hashes.
        Queries using the right hashes.

        Args:
        ----
          left: SeriesType: concatenated entity attribute values of left dataset as series.
          right: SeriesType: concatenated entity attribute values of right dataset as series.
          left_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of left dataset. Not used by this blocker.
          right_rel: Optional[KlinkerFrame]:  (Default value = None) Contains relational information of right dataset. Not used by this blocker.

        Returns:
        -------
            KlinkerBlockManager: instance holding the resulting blocks.
        """
        index = MinHashLSH(
            threshold=self.threshold,
            num_perm=self.num_perm,
            weights=self.weights,
        )
        # Keyword arguments shared by the insert and the query step.
        shared = {"lsh": index, "encode_fn": self._inner_encode}

        if not isinstance(left, dd.Series):
            # Eager path: fill the index from the left side, then query with the right.
            _insert(left, **shared)
            pairs = _query(
                right,
                left_name=left.name,
                right_name=right.name,
                **shared,
            )
            return KlinkerBlockManager.from_pandas(pairs)

        # Dask path: the insert is computed right away so the index is filled
        # before the query partitions are evaluated.
        left.map_partitions(
            _insert,
            meta=left._meta.index,
            **shared,
        ).compute()
        empty_pairs = pd.DataFrame([], columns=[left.name, right.name], dtype="O")
        # The query is not computed here; the resulting collection is handed
        # to the block manager as is.
        lazy_pairs = right.map_partitions(
            _query,
            left_name=left.name,
            right_name=right.name,
            meta=empty_pairs,
            **shared,
        )
        return KlinkerBlockManager(lazy_pairs)