klinker

`KlinkerBlockManager`

Class for handling of blocks.

Parameters:

Name	Type	Description	Default
`blocks`	`DataFrame`	dataframe with blocks.	required

Examples:

>>> from klinker import KlinkerBlockManager
>>> kbm = KlinkerBlockManager.from_dict({ "block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))
>>> kbm.blocks.compute()
                A          B
block1  [1, 3, 4]  [3, 4, 5]
block2  [3, 4, 5]     [5, 6]
>>> kbm["block1"].compute()
                A          B
block1  [1, 3, 4]  [3, 4, 5]
>>> len(kbm)
2
>>> set(kbm.all_pairs())
{(4, 4), (5, 5), (3, 4), (1, 5), (4, 3), (4, 6), (1, 4), (4, 5), (3, 3), (5, 6), (3, 6), (1, 3), (3, 5)}
>>> kbm.block_sizes
block1    6
block2    5
Name: block_sizes, dtype: int64
>>> kbm.mean_block_size
5.5
>>> kbm.to_dict()
{'block1': ([1, 3, 4], [3, 4, 5]), 'block2': ([3, 4, 5], [5, 6])}


Source code in klinker/data/blocks.py

class KlinkerBlockManager:
    """Class for handling of blocks.

    The wrapped dataframe has one row per block (indexed by block name) and
    one column per dataset; each cell holds the entity ids of that dataset
    which fall into the block.

    Args:
        blocks: dataframe with blocks.

    Examples:

        >>> from klinker import KlinkerBlockManager
        >>> kbm = KlinkerBlockManager.from_dict({ "block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))
        >>> kbm.blocks.compute()
                        A          B
        block1  [1, 3, 4]  [3, 4, 5]
        block2  [3, 4, 5]     [5, 6]
        >>> kbm["block1"].compute()
                        A          B
        block1  [1, 3, 4]  [3, 4, 5]
        >>> len(kbm)
        2
        >>> set(kbm.all_pairs())
        {(4, 4), (5, 5), (3, 4), (1, 5), (4, 3), (4, 6), (1, 4), (4, 5), (3, 3), (5, 6), (3, 6), (1, 3), (3, 5)}
        >>> kbm.block_sizes
        block1    6
        block2    5
        Name: block_sizes, dtype: int64
        >>> kbm.mean_block_size
        5.5
        >>> kbm.to_dict()
        {'block1': ([1, 3, 4], [3, 4, 5]), 'block2': ([3, 4, 5], [5, 6])}

    """

    def __init__(self, blocks: dd.DataFrame):
        self.blocks = blocks
        # For every dataset column, explode the id lists to one row per
        # (block, entity id) and group by entity id, so that `find_blocks`
        # can look up the blocks of a single entity.
        grouped = []
        for column_name in self.blocks.columns:
            cur_ex = self.blocks[column_name].explode()
            grouped.append(cur_ex.to_frame().groupby(by=column_name))
        self._grouped = tuple(grouped)

    def __getitem__(self, key):
        """Select block(s) by block name via label-based indexing."""
        return self.blocks.loc[key]

    def __len__(self) -> int:
        """Number of blocks (rows of the wrapped dataframe)."""
        return len(self.blocks)

    def __repr__(self) -> str:
        return f"KlinkerBlockManager(blocks=\n{self.blocks.__repr__()})"

    def to_dict(
        self,
    ) -> Dict[Union[str, int], Tuple[List[Union[str, int]], List[Union[str, int]]]]:
        """Return blocks as dict.

        Returns:
          The dict has block names as keys and, per block, a tuple holding the
          entity ids of each dataset (see the class example).
        """
        return (
            self.blocks.apply(tuple, axis=1, meta=pd.Series([], dtype=object))
            .compute()
            .to_dict()
        )

    def find_blocks(self, entity_id: Union[str, int], column_id: int) -> np.ndarray:
        """Find blocks where entity id belongs to.

        Args:
          entity_id: Union[str, int]: Entity id.
          column_id: int: Whether entity belongs to left (0) or right (1) dataset.

        Returns:
            Blocks where entity id belongs to.
        """
        return self._grouped[column_id].get_group(entity_id).index.values.compute()

    def entity_pairs(
        self, entity_id: Union[str, int], column_id: int
    ) -> Generator[Tuple[Union[int, str], ...], None, None]:
        """Get all pairs where this entity shows up.

        The given entity id is always the first element of each yielded pair,
        independent of ``column_id``.

        Args:
          entity_id: Union[str, int]: Entity id.
          column_id: int: Whether entity belongs to left (0) or right (1) dataset.

        Returns:
            Generator for these pairs.
        """
        cur_blocks = self.find_blocks(entity_id, column_id)
        other_column = 0 if column_id == 1 else 1
        other_column_name = self.blocks.columns[other_column]
        return (
            pair
            for blk_name in cur_blocks
            for _, blk in self.blocks.loc[blk_name][other_column_name].compute().items()
            for pair in itertools.product({entity_id}, blk)
        )

    def all_pairs(self) -> Generator[Tuple[Union[int, str], ...], None, None]:
        """Get all pairs

        Returns:
            Generator that creates all pairs, from blocks (including duplicates).
        """
        for block_tuple in self.blocks.itertuples(index=False, name=None):
            # Cartesian product of the id lists of one block.
            yield from itertools.product(*block_tuple)

    @property
    def block_sizes(self) -> pd.Series:
        """Sizes of blocks (total number of entity ids per block)."""
        meta = pd.Series([], dtype="int64", name="block_sizes")
        return self.blocks.apply(
            lambda x: sum(len(v) for v in x), axis=1, meta=meta
        ).compute()

    @property
    def mean_block_size(self) -> float:
        """Mean size of all blocks."""
        return self.block_sizes.mean()

    @classmethod
    def combine(
        cls, this: "KlinkerBlockManager", other: "KlinkerBlockManager"
    ) -> "KlinkerBlockManager":
        """Combine blocks.

        Blocks that exist in only one manager are taken over unchanged; for
        blocks present in both, the entity ids are united per dataset.

        Args:
          this: one block manager to combine
          other: other block manager to combine

        Returns:
          Combined KlinkerBlockManager

        Raises:
          ValueError: If the two managers do not have the same columns.

        Examples:

            >>> from klinker import KlinkerBlockManager
            >>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))
            >>> kbm2 = KlinkerBlockManager.from_dict({"block3": [[7,4],[12,8]]}, dataset_names=("A","B"))
            >>> kbm_merged = KlinkerBlockManager.combine(kbm, kbm2)
            >>> kbm_merged.blocks.compute()
                            A          B
            block1  [1, 3, 4]  [3, 4, 5]
            block2  [3, 4, 5]     [5, 6]
            block3     [7, 4]    [12, 8]

        """

        def _merge_blocks(
            row: pd.Series, output_names: Sequence[str], left_right_names: Sequence[str]
        ):
            # Row of the outer join: cells of the side that lacks the block
            # are null.
            nonnull = row[~row.isnull()]
            if len(nonnull) == 2:  # no block overlap
                nonnull.index = output_names
                return nonnull
            else:
                # left_right_names is ordered
                # [first+left, second+left, first+right, second+right].
                A_left = set(nonnull[left_right_names[0]])
                A_right = set(nonnull[left_right_names[2]])
                B_left = set(nonnull[left_right_names[1]])
                B_right = set(nonnull[left_right_names[3]])
                A = list(A_left.union(A_right))
                B = list(B_left.union(B_right))
                return pd.Series([A, B], index=output_names, name=nonnull.name)

        if list(this.blocks.columns) != list(other.blocks.columns):
            raise ValueError("Cannot combine blocks from different datasets!")

        output_names = this.blocks.columns
        left_suffix = "left"
        right_suffix = "right"
        left_right_names = [
            col + suffix
            for col_names, suffix in zip(
                [this.blocks.columns, other.blocks.columns], [left_suffix, right_suffix]
            )
            for col in col_names
        ]
        # Use the same suffixes as for left_right_names so the lookups in
        # _merge_blocks cannot drift from the joined column names.
        joined = this.blocks.join(
            other.blocks, how="outer", lsuffix=left_suffix, rsuffix=right_suffix
        )

        meta = pd.DataFrame([], columns=output_names)
        return cls(
            joined.apply(
                _merge_blocks,
                output_names=output_names,
                left_right_names=left_right_names,
                axis=1,
                meta=meta,
            )
        )

    def to_parquet(self, path: Union[str, pathlib.Path], **kwargs):
        """Write blocks as parquet file(s).

        Args:
          path: Union[str, pathlib.Path]: Where to write.
          **kwargs: passed to the parquet function. If ``schema`` is not
            given, the first two columns are declared as lists of strings.
        """
        if "schema" in kwargs:
            # `pop` must be called; subscripting it raised a TypeError before.
            schema = kwargs.pop("schema")
        else:
            left, right = self.blocks.columns[:2]
            block_type = pa.list_(pa.string())
            schema = {
                left: block_type,
                right: block_type,
            }
        try:
            self.blocks.to_parquet(path, schema=schema, **kwargs)
        except ValueError:
            # If index is incorrectly assumed by dask to be string
            # and it turns out to be int64 an error would be thrown
            # This is kind of a dirty hack
            # Retry with a copy so a caller-supplied schema is not modified.
            retry_schema = {**schema, "__null_dask_index__": pa.int64()}
            self.blocks.to_parquet(path, schema=retry_schema, **kwargs)

    @classmethod
    def read_parquet(
        cls,
        path: Union[str, pathlib.Path],
        calculate_divisions: bool = True,
        **kwargs,
    ) -> "KlinkerBlockManager":
        """Read blocks from parquet.

        Args:
          path: Union[str, pathlib.Path]: Path where blocks are stored.
          calculate_divisions: bool: Calculate index divisions.
          **kwargs: Passed to `dd.read_parquet` function.

        Returns:
            Blocks as KlinkerBlockManager
        """
        return cls(
            dd.read_parquet(
                path=path,
                calculate_divisions=calculate_divisions,
                **kwargs,
            )
        )

    @classmethod
    def from_pandas(
        cls, df: pd.DataFrame, npartitions: int = 1, **kwargs
    ) -> "KlinkerBlockManager":
        """Create from pandas.

        Args:
          df: pd.DataFrame: DataFrame
          npartitions: int:  Partitions for dask
          **kwargs: Passed to `dd.from_pandas`

        Returns:
            Blocks as KlinkerBlockManager

        Examples:

            >>> import pandas as pd
            >>> from klinker import KlinkerBlockManager
            >>> pd_blocks = pd.DataFrame({'A': {'block1': [1, 3, 4], 'block2': [3, 4, 5]}, 'B': {'block1': [3, 4, 5], 'block2': [5, 6]}})
            >>> kbm = KlinkerBlockManager.from_pandas(pd_blocks)

        """
        return cls(dd.from_pandas(df, npartitions=npartitions, **kwargs))

    @classmethod
    def from_dict(
        cls,
        block_dict: Dict[
            BlockIdTypeVar, Tuple[List[EntityIdTypeVar], List[EntityIdTypeVar]]
        ],
        dataset_names: Tuple[str, str] = ("left", "right"),
        npartitions: int = 1,
        **kwargs,
    ) -> "KlinkerBlockManager":
        """Create from a dictionary of blocks.

        Args:
          block_dict: Dictionary with block information.
          dataset_names: Tuple[str, str]: Tuple of dataset names.
          npartitions: int: Partitions used for dask.
          **kwargs: Passed to `dd.from_dict`.

        Returns:
            Blocks as KlinkerBlockManager

        Examples:

            >>> from klinker import KlinkerBlockManager
            >>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))

        """
        return cls(
            dd.from_dict(
                block_dict,
                orient="index",
                columns=dataset_names,
                npartitions=npartitions,
                **kwargs,
            )
        )

    @classmethod
    @deprecated(reason="Please use parquet files")
    def read_pickle(cls, path) -> "KlinkerBlockManager":
        """Read blocks from a pickle file (deprecated, prefer parquet).

        The unpickled object may be a dict, a pandas DataFrame, or an object
        whose ``blocks`` attribute is a dict.

        Args:
          path: Path of the pickle file.

        Returns:
            Blocks as KlinkerBlockManager

        Raises:
          ValueError: If the unpickled object has none of the known shapes.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # from trusted sources.
        with open(path, "rb") as in_file:
            res = pickle.load(in_file)
            if isinstance(res, dict):
                return cls.from_dict(res)
            elif isinstance(res, pd.DataFrame):
                return cls.from_pandas(res)
            elif hasattr(res, "blocks") and isinstance(res.blocks, dict):
                return cls.from_dict(
                    {
                        bk: (list(left_v), list(right_v))
                        for bk, (left_v, right_v) in res.blocks.items()
                    }
                )  # type: ignore
            else:
                raise ValueError(f"Unknown pickled object of type {type(res)}")

`block_sizes: pd.DataFrame` `property`

Sizes of blocks

`mean_block_size: float` `property`

Mean size of all blocks.

`all_pairs()`

Get all pairs

Returns:

Type	Description
`Generator[Tuple[Union[int, str], ...], None, None]`	Generator that creates all pairs, from blocks (including duplicates).

Source code in klinker/data/blocks.py

def all_pairs(self) -> Generator[Tuple[Union[int, str], ...], None, None]:
    """Yield every candidate pair contained in the blocks.

    Returns:
        Generator over the cartesian product of the id lists of each block;
        a pair occurring in several blocks is yielded once per block.
    """
    rows = self.blocks.itertuples(index=False, name=None)
    for entity_lists in rows:
        yield from itertools.product(*entity_lists)

`combine(this, other)` `classmethod`

Combine blocks.

Parameters:

Name	Type	Description	Default
`this`	`KlinkerBlockManager`	one block manager to combine	required
`other`	`KlinkerBlockManager`	other block manager to combine	required

Returns:

Type	Description
`KlinkerBlockManager`	Combined KlinkerBlockManager

Examples:

>>> from klinker import KlinkerBlockManager
>>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))
>>> kbm2 = KlinkerBlockManager.from_dict({"block3": [[7,4],[12,8]]}, dataset_names=("A","B"))
>>> kbm_merged = KlinkerBlockManager.combine(kbm, kbm2)
>>> kbm_merged.blocks.compute()
                A          B
block1  [1, 3, 4]  [3, 4, 5]
block2  [3, 4, 5]     [5, 6]
block3     [7, 4]    [12, 8]

Source code in klinker/data/blocks.py

@classmethod
def combine(
    cls, this: "KlinkerBlockManager", other: "KlinkerBlockManager"
) -> "KlinkerBlockManager":
    """Merge the blocks of two managers into a new manager.

    The block frames are outer-joined on the block name. A block found in
    only one manager is kept as is; a block found in both gets, per dataset,
    the union of the entity ids of both sides.

    Args:
      this: one block manager to combine
      other: other block manager to combine

    Returns:
      Combined KlinkerBlockManager

    Raises:
      ValueError: If the two managers do not have the same columns.

    Examples:

        >>> from klinker import KlinkerBlockManager
        >>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))
        >>> kbm2 = KlinkerBlockManager.from_dict({"block3": [[7,4],[12,8]]}, dataset_names=("A","B"))
        >>> kbm_merged = KlinkerBlockManager.combine(kbm, kbm2)
        >>> kbm_merged.blocks.compute()
                        A          B
        block1  [1, 3, 4]  [3, 4, 5]
        block2  [3, 4, 5]     [5, 6]
        block3     [7, 4]    [12, 8]

    """

    def _merge_blocks(
        row: pd.Series, output_names: Sequence[str], left_right_names: Sequence[str]
    ):
        # Cells belonging to the side that lacks this block are null.
        present = row[~row.isnull()]
        if len(present) != 2:
            # Block exists on both sides: unite the ids per dataset.
            # left_right_names = [first+left, second+left, first+right, second+right]
            first_left = set(present[left_right_names[0]])
            first_right = set(present[left_right_names[2]])
            second_left = set(present[left_right_names[1]])
            second_right = set(present[left_right_names[3]])
            merged_first = list(first_left | first_right)
            merged_second = list(second_left | second_right)
            return pd.Series(
                [merged_first, merged_second], index=output_names, name=present.name
            )
        # Block exists on one side only: just restore the plain column names.
        present.index = output_names
        return present

    if list(this.blocks.columns) != list(other.blocks.columns):
        raise ValueError("Cannot combine blocks from different datasets!")

    output_names = this.blocks.columns
    left_suffix, right_suffix = "left", "right"
    left_right_names = [col + left_suffix for col in this.blocks.columns] + [
        col + right_suffix for col in other.blocks.columns
    ]
    joined = this.blocks.join(
        other.blocks, how="outer", lsuffix=left_suffix, rsuffix=right_suffix
    )

    merged_frame = joined.apply(
        _merge_blocks,
        output_names=output_names,
        left_right_names=left_right_names,
        axis=1,
        meta=pd.DataFrame([], columns=output_names),
    )
    return cls(merged_frame)

`entity_pairs(entity_id, column_id)`

Get all pairs where this entity shows up.

Parameters:

Name	Type	Description	Default
`entity_id`	`Union[str, int]`	Union[str, int]: Entity id.	required
`column_id`	`int`	int: Whether entity belongs to left (0) or right (1) dataset.	required

Returns:

Type	Description
`Generator[Tuple[Union[int, str], ...], None, None]`	Generator for these pairs.

Source code in klinker/data/blocks.py

def entity_pairs(
    self, entity_id: Union[str, int], column_id: int
) -> Generator[Tuple[Union[int, str], ...], None, None]:
    """Generate the pairs this entity forms with the opposite dataset.

    Every block containing the entity is looked up, and the entity is paired
    with each id stored in the other column of that block. The given entity
    id is always the first element of a pair, whichever column it is from.

    Args:
      entity_id: Union[str, int]: Entity id.
      column_id: int: Whether entity belongs to left (0) or right (1) dataset.

    Returns:
        Generator for these pairs.
    """
    containing_blocks = self.find_blocks(entity_id, column_id)
    if column_id == 1:
        partner_position = 0
    else:
        partner_position = 1
    partner_column = self.blocks.columns[partner_position]

    def _pairs():
        for block_name in containing_blocks:
            partner_cells = self.blocks.loc[block_name][partner_column].compute()
            for _, partner_ids in partner_cells.items():
                yield from itertools.product({entity_id}, partner_ids)

    return _pairs()

`find_blocks(entity_id, column_id)`

Find blocks where entity id belongs to.

Parameters:

Name	Type	Description	Default
`entity_id`	`Union[str, int]`	Union[str, int]: Entity id.	required
`column_id`	`int`	int: Whether entity belongs to left (0) or right (1) dataset.	required

Returns:

Type	Description
`ndarray`	Blocks where entity id belongs to.

Source code in klinker/data/blocks.py

def find_blocks(self, entity_id: Union[str, int], column_id: int) -> np.ndarray:
    """Look up the names of all blocks that contain an entity.

    Args:
      entity_id: Union[str, int]: Entity id.
      column_id: int: Whether entity belongs to left (0) or right (1) dataset.

    Returns:
        Blocks where entity id belongs to.
    """
    grouped_column = self._grouped[column_id]
    entity_rows = grouped_column.get_group(entity_id)
    # The index of the selected rows carries the block names.
    lazy_block_names = entity_rows.index.values
    return lazy_block_names.compute()

`from_dict(block_dict, dataset_names=('left', 'right'), npartitions=1, **kwargs)` `classmethod`

Parameters:

Name	Type	Description	Default
`block_dict`	`Dict[BlockIdTypeVar, Tuple[List[EntityIdTypeVar], List[EntityIdTypeVar]]]`	Dictionary with block information.	required
`dataset_names`	`Tuple[str, str]`	Tuple[str, str]: Tuple of dataset names.	`('left', 'right')`
`npartitions`	`int`	int: Partitions used for dask.	`1`
`**kwargs`		Passed to `dd.from_dict`.	`{}`

Returns:

Type	Description
`KlinkerBlockManager`	Blocks as KlinkerBlockManager

Examples:

>>> from klinker import KlinkerBlockManager
>>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))

Source code in klinker/data/blocks.py

@classmethod
def from_dict(
    cls,
    block_dict: Dict[
        BlockIdTypeVar, Tuple[List[EntityIdTypeVar], List[EntityIdTypeVar]]
    ],
    dataset_names: Tuple[str, str] = ("left", "right"),
    npartitions: int = 1,
    **kwargs,
) -> "KlinkerBlockManager":
    """Build a block manager from a dictionary of blocks.

    The dictionary keys become the block names (index) and ``dataset_names``
    the column names of the resulting dask dataframe.

    Args:
      block_dict: Dictionary with block information.
      dataset_names: Tuple[str, str]: Tuple of dataset names.
      npartitions: int: Partitions used for dask.
      **kwargs: Passed to `dd.from_dict`.

    Returns:
        Blocks as KlinkerBlockManager

    Examples:

        >>> from klinker import KlinkerBlockManager
        >>> kbm = KlinkerBlockManager.from_dict({"block1": [[1,3,4],[3,4,5]], "block2": [[3,4,5],[5,6]]}, dataset_names=("A","B"))

    """
    block_frame = dd.from_dict(
        block_dict,
        orient="index",
        columns=dataset_names,
        npartitions=npartitions,
        **kwargs,
    )
    return cls(block_frame)

`from_pandas(df, npartitions=1, **kwargs)` `classmethod`

Create from pandas.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	pd.DataFrame: DataFrame	required
`npartitions`	`int`	int: Partitions for dask	`1`
`**kwargs`		Passed to `dd.from_pandas`	`{}`

Returns:

Type	Description
`KlinkerBlockManager`	Blocks as KlinkerBlockManager

Examples:

>>> import pandas as pd
>>> from klinker import KlinkerBlockManager
>>> pd_blocks = pd.DataFrame({'A': {'block1': [1, 3, 4], 'block2': [3, 4, 5]}, 'B': {'block1': [3, 4, 5], 'block2': [5, 6]}})
>>> kbm = KlinkerBlockManager.from_pandas(pd_blocks)

Source code in klinker/data/blocks.py

@classmethod
def from_pandas(
    cls, df: pd.DataFrame, npartitions: int = 1, **kwargs
) -> "KlinkerBlockManager":
    """Build a block manager from a pandas dataframe of blocks.

    Args:
      df: pd.DataFrame: DataFrame
      npartitions: int:  Partitions for dask
      **kwargs: Passed to `dd.from_pandas`

    Returns:
        Blocks as KlinkerBlockManager

    Examples:

        >>> import pandas as pd
        >>> from klinker import KlinkerBlockManager
        >>> pd_blocks = pd.DataFrame({'A': {'block1': [1, 3, 4], 'block2': [3, 4, 5]}, 'B': {'block1': [3, 4, 5], 'block2': [5, 6]}})
        >>> kbm = KlinkerBlockManager.from_pandas(pd_blocks)

    """
    block_frame = dd.from_pandas(df, npartitions=npartitions, **kwargs)
    return cls(block_frame)

`read_parquet(path, calculate_divisions=True, **kwargs)` `classmethod`

Read blocks from parquet.

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Path]`	Union[str, pathlib.Path]: Path where blocks are stored.	required
`calculate_divisions`	`bool`	bool: Calculate index divisions.	`True`
`**kwargs`		Passed to `dd.read_parquet` function.	`{}`

Returns:

Type	Description
`KlinkerBlockManager`	Blocks as KlinkerBlockManager

Source code in klinker/data/blocks.py

@classmethod
def read_parquet(
    cls,
    path: Union[str, pathlib.Path],
    calculate_divisions: bool = True,
    **kwargs,
) -> "KlinkerBlockManager":
    """Load blocks that were stored as parquet.

    Args:
      path: Union[str, pathlib.Path]: Path where blocks are stored.
      calculate_divisions: bool: Calculate index divisions.
      **kwargs: Passed to `dd.read_parquet` function.

    Returns:
        Blocks as KlinkerBlockManager
    """
    block_frame = dd.read_parquet(
        path=path,
        calculate_divisions=calculate_divisions,
        **kwargs,
    )
    return cls(block_frame)

`to_dict()`

Return blocks as dict.

Returns:

Type	Description
`Dict[Union[str, int], Tuple[Union[str, int], Union[str, int]]]`	The dict has block names as keys and, per block, a tuple holding the entity ids of each dataset.

Source code in klinker/data/blocks.py

def to_dict(self) -> Dict[Union[str, int], Tuple[Union[str, int], Union[str, int]]]:
    """Materialize the blocks as a plain dictionary.

    Returns:
      The dict has block names as keys; each value is a tuple built from the
      cells of that block's row (the entity ids of each dataset).
    """
    row_meta = pd.Series([], dtype=object)
    # One tuple per row, i.e. per block.
    lazy_rows = self.blocks.apply(tuple, axis=1, meta=row_meta)
    materialized = lazy_rows.compute()
    return materialized.to_dict()

`to_parquet(path, **kwargs)`

Write blocks as parquet file(s).

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Path]`	Union[str, pathlib.Path]: Where to write.	required
`**kwargs`		passed to the parquet function	`{}`

Source code in klinker/data/blocks.py

def to_parquet(self, path: Union[str, pathlib.Path], **kwargs):
    """Write blocks as parquet file(s).

    Args:
      path: Union[str, pathlib.Path]: Where to write.
      **kwargs: passed to the parquet function. If ``schema`` is not given,
        the first two columns are declared as lists of strings.
    """
    if "schema" in kwargs:
        # `pop` must be called; subscripting it (`kwargs.pop["schema"]`)
        # raised a TypeError whenever a schema was supplied.
        schema = kwargs.pop("schema")
    else:
        left, right = self.blocks.columns[:2]
        block_type = pa.list_(pa.string())
        schema = {
            left: block_type,
            right: block_type,
        }
    try:
        self.blocks.to_parquet(path, schema=schema, **kwargs)
    except ValueError:
        # If index is incorrectly assumed by dask to be string
        # and it turns out to be int64 an error would be thrown
        # This is kind of a dirty hack
        # Retry with a copy so a caller-supplied schema is not modified.
        retry_schema = {**schema, "__null_dask_index__": pa.int64()}
        self.blocks.to_parquet(path, schema=retry_schema, **kwargs)

`KlinkerDaskFrame`

Bases: DataFrame, AbstractKlinkerFrame

Parallel KlinkerFrame.

Please don't use the __init__ method but rather from_dask_dataframe for initialisation!

Parameters:

Name	Description	Default
`dsk`	The dask graph to compute this KlinkerFrame	required
`name`	The key prefix that specifies which keys in the dask comprise this particular KlinkerFrame	required
`meta`	An empty klinkerframe object with names, dtypes, and indices matching the expected output.	required
`divisions`	Values along which we partition our blocks on the index	required

Returns:

Type	Description
	KlinkerDaskFrame

Examples:

>>> import pandas as pd
>>> from klinker.data import KlinkerDaskFrame
>>> import dask.dataframe as dd
>>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
>>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
>>> kdf
Dask KlinkerDaskFrame Structure:
                   id first name surname
npartitions=1
0              object     object  object
1                 ...        ...     ...
Dask Name: KlinkerPandasFrame, 2 graph layers
Table Name: A, id_col: id

Source code in klinker/data/enhanced_df.py

class KlinkerDaskFrame(dd.core.DataFrame, AbstractKlinkerFrame):
    """Dask-backed (parallel) variant of a KlinkerFrame.

    Instances should be created through `from_dask_dataframe`; calling
    `__init__` directly is discouraged.

    Args:
      dsk: The dask graph to compute this KlinkerFrame
      name: The key prefix that specifies which keys in the dask comprise this particular KlinkerFrame
      meta: An empty klinkerframe object with names, dtypes, and indices matching the expected output.
      divisions: Values along which we partition our blocks on the index

    Returns:
        KlinkerDaskFrame

    Examples:

        >>> import pandas as pd
        >>> from klinker.data import KlinkerDaskFrame
        >>> import dask.dataframe as dd
        >>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
        >>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
        >>> kdf
        Dask KlinkerDaskFrame Structure:
                           id first name surname
        npartitions=1
        0              object     object  object
        1                 ...        ...     ...
        Dask Name: KlinkerPandasFrame, 2 graph layers
        Table Name: A, id_col: id

    """

    _partition_type = KlinkerPandasFrame

    def __init__(
        self,
        dsk,
        name,
        meta,
        divisions,
        table_name: Optional[str] = None,
        id_col: str = "id",
    ):
        super().__init__(dsk, name, meta, divisions)
        if table_name is not None:
            self._table_name = table_name
            self._id_col = id_col
        else:
            # Without an explicit table name both attributes come from meta.
            self._table_name = meta.table_name
            self._id_col = meta.id_col

    @staticmethod
    def _static_propagate_klinker_attributes(
        new_object: "KlinkerDaskFrame", table_name: str, id_col: str
    ) -> "KlinkerDaskFrame":
        """Set ``table_name`` and ``id_col`` on ``new_object`` and return it."""
        new_object.table_name = table_name
        new_object.id_col = id_col
        return new_object

    @property
    def non_id_columns(self) -> List[str]:
        """Every column except `id_col`, as reported by the meta frame."""
        return self._meta.non_id_columns

    @classmethod
    def _upgrade_from_series(
        cls,
        series,
        columns: List[str],
        table_name: Optional[str],
        id_col: str,
        reset_index: bool = True,
        meta=no_default,
    ) -> "KlinkerFrame":
        """Turn a series into a frame by upgrading each partition.

        `KlinkerPandasFrame._upgrade_from_series` is mapped over the
        partitions; afterwards the table name and id column are attached to
        the result.
        """
        assert table_name
        upgraded = series.map_partitions(
            KlinkerPandasFrame._upgrade_from_series,
            columns=columns,
            table_name=table_name,
            id_col=id_col,
            reset_index=reset_index,
            meta=meta,
        )
        return KlinkerDaskFrame._static_propagate_klinker_attributes(
            upgraded, table_name, id_col
        )

    def concat_values(
        self,
    ) -> dd.Series:
        """Join the attribute values of every row into one value.

        Missing values are replaced by empty strings first.

        Returns:
            dd.Series with concatenated values and id_col as index.

        Examples:

            >>> import pandas as pd
            >>> from klinker.data import KlinkerDaskFrame
            >>> import dask.dataframe as dd
            >>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
            >>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
            >>> kdf.concat_values().compute()
            id
            1    John Doe
            2    Jane Doe
            Name: A, dtype: object

        """
        filled = self.fillna("")
        assert filled.table_name
        # Empty series describing the result: named after the table and
        # indexed by the id column.
        result_meta = pd.Series([], name=filled.table_name, dtype="str")
        result_meta.index.name = filled.id_col
        return filled.map_partitions(
            M.concat_values,
            meta=result_meta,
        )

    @classmethod
    def from_dask_dataframe(
        cls,
        df: dd.DataFrame,
        table_name: str,
        id_col: str,
        meta=no_default,
        construction_class: Type[KlinkerPandasFrame] = KlinkerPandasFrame,
    ) -> "KlinkerDaskFrame":
        """Create KlinkerDaskFrame from dask dataframe.

        Each partition of ``df`` is passed through ``construction_class``.

        Args:
          df: dd.DataFrame: Dask dataframe.
          table_name: str: Name of dataset.
          id_col: str: Column where entity_ids are stored
          meta: meta for dask
          construction_class: Either :class:`KlinkerPandasFrame` or :class:`KlinkerTriplePandasFrame`

        Returns:
            KlinkerDaskFrame

        Examples:

            >>> import pandas as pd
            >>> from klinker.data import KlinkerDaskFrame
            >>> import dask.dataframe as dd
            >>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
            >>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
            >>> kdf
            Dask KlinkerDaskFrame Structure:
                               id first name surname
            npartitions=1
            0              object     object  object
            1                 ...        ...     ...
            Dask Name: KlinkerPandasFrame, 2 graph layers
            Table Name: A, id_col: id

        """
        converted = df.map_partitions(
            construction_class,
            table_name=table_name,
            id_col=id_col,
            meta=meta,
        )
        if meta is no_default:
            # No meta supplied: use the one of the converted frame.
            meta = converted._meta
        return cls(
            dsk=converted.dask,
            name=converted._name,
            meta=meta,
            divisions=converted.divisions,
            table_name=table_name,
            id_col=id_col,
        )

    def __repr__(self) -> str:
        base_repr = super().__repr__()
        klinker_info = f"\nTable Name: {self.table_name}, id_col: {self.id_col}"
        return base_repr + klinker_info

`non_id_columns: List[str]` `property`

All columns which are not id_col

`concat_values()`

Concatenate attribute values.

Returns:

Type	Description
`Series`	dd.Series with concatenated values and id_col as index.

Examples:

>>> import pandas as pd
>>> from klinker.data import KlinkerDaskFrame
>>> import dask.dataframe as dd
>>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
>>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
>>> kdf.concat_values().compute()
id
1    John Doe
2    Jane Doe
Name: A, dtype: object

Source code in klinker/data/enhanced_df.py

def concat_values(self) -> dd.Series:
    """Concatenate the attribute values of each entity.

    Missing values are replaced by empty strings first; afterwards
    ``M.concat_values`` is mapped over the partitions.

    Returns:
        dd.Series with concatenated values and id_col as index.

    Examples:

        >>> import pandas as pd
        >>> from klinker.data import KlinkerDaskFrame
        >>> import dask.dataframe as dd
        >>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
        >>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
        >>> kdf.concat_values().compute()
        id
        1    John Doe
        2    Jane Doe
        Name: A, dtype: object

    """
    filled = self.fillna("")
    assert filled.table_name
    # Empty series describing the result: named after the table, indexed by id_col.
    result_meta = pd.Series([], name=filled.table_name, dtype="str")
    result_meta.index.name = filled.id_col
    return filled.map_partitions(M.concat_values, meta=result_meta)

`from_dask_dataframe(df, table_name, id_col, meta=no_default, construction_class=KlinkerPandasFrame)` `classmethod`

Create KlinkerDaskFrame from dask dataframe.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dd.DataFrame: Dask dataframe.	required
`table_name`	`str`	str: Name of dataset.	required
`id_col`	`str`	str: Column where entity_ids are stored	required
`meta`		meta for dask	`no_default`
`construction_class`	`Type[KlinkerPandasFrame]`	Either `KlinkerPandasFrame` or `KlinkerTriplePandasFrame`	`KlinkerPandasFrame`

Returns:

Type	Description
`KlinkerDaskFrame`	KlinkerDaskFrame

Examples:

>>> import pandas as pd
>>> from klinker.data import KlinkerDaskFrame
>>> import dask.dataframe as dd
>>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
>>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
>>> kdf
Dask KlinkerDaskFrame Structure:
                   id first name surname
npartitions=1
0              object     object  object
1                 ...        ...     ...
Dask Name: KlinkerPandasFrame, 2 graph layers
Table Name: A, id_col: id

Source code in klinker/data/enhanced_df.py

@classmethod
def from_dask_dataframe(
    cls,
    df: dd.DataFrame,
    table_name: str,
    id_col: str,
    meta=no_default,
    construction_class: Type[KlinkerPandasFrame] = KlinkerPandasFrame,
) -> "KlinkerDaskFrame":
    """Create KlinkerDaskFrame from dask dataframe.

    Every partition of ``df`` is passed to ``construction_class`` together
    with ``table_name`` and ``id_col``; the resulting graph is then used to
    build an instance of ``cls``.

    Args:
      df: Dask dataframe.
      table_name: Name of dataset.
      id_col: Column where entity_ids are stored
      meta: meta for dask; when left at ``no_default`` the meta of the
        mapped dataframe is used.
      construction_class: Either `KlinkerPandasFrame` or `KlinkerTriplePandasFrame`

    Returns:
        KlinkerDaskFrame

    Examples:

        >>> import pandas as pd
        >>> from klinker.data import KlinkerDaskFrame
        >>> import dask.dataframe as dd
        >>> df = dd.from_pandas(pd.DataFrame([("1","John", "Doe"),("2","Jane","Doe")],columns=["id","first name", "surname"]),npartitions=1)
        >>> kdf = KlinkerDaskFrame.from_dask_dataframe(df, table_name="A", id_col="id")
        >>> kdf
        Dask KlinkerDaskFrame Structure:
                           id first name surname
        npartitions=1
        0              object     object  object
        1                 ...        ...     ...
        Dask Name: KlinkerPandasFrame, 2 graph layers
        Table Name: A, id_col: id

    """
    wrapped = df.map_partitions(
        construction_class, table_name=table_name, id_col=id_col, meta=meta
    )
    if meta is no_default:
        # No explicit meta supplied: fall back to the one of the mapped frame.
        meta = wrapped._meta
    return cls(
        dsk=wrapped.dask,
        name=wrapped._name,
        meta=meta,
        divisions=wrapped.divisions,
        table_name=table_name,
        id_col=id_col,
    )

`KlinkerDataset` `dataclass`

Helper class to hold info of benchmark datasets.

Source code in klinker/data/ea_dataset.py

@dataclass
class KlinkerDataset:
    """Container bundling the parts of a benchmark dataset."""

    # attribute information of the left and right data source
    left: KlinkerFrame
    right: KlinkerFrame
    # gold standard entity links
    gold: pd.DataFrame
    # optional relation triples of each side
    left_rel: Optional[pd.DataFrame] = None
    right_rel: Optional[pd.DataFrame] = None

    @classmethod
    def from_sylloge(cls, dataset: EADataset, clean: bool = False) -> "KlinkerDataset":
        """Build a klinker dataset out of a sylloge dataset.

        The attribute triples of both sides are wrapped in triple frames
        (pandas or dask, depending on ``dataset.backend``) named "left" and
        "right" with "head" as id column.

        Args:
          dataset: Sylloge dataset.
          clean: If set, everything from the first "^^" onwards is cut off
            the "tail" values (removes the datatype).

        Returns:
            klinker dataset

        Raises:
            ValueError: If the backend is neither "pandas" nor "dask".

        Examples:

            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

        """
        left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
        right: Union[KlinkerDaskFrame, KlinkerPandasFrame]
        backend = dataset.backend
        if backend == "pandas":
            build_frame = KlinkerTriplePandasFrame.from_df
        elif backend == "dask":
            build_frame = KlinkerTripleDaskFrame.from_dask_dataframe
        else:
            raise ValueError(f"Unknown dataset backend {backend}")
        left = build_frame(dataset.attr_triples_left, table_name="left", id_col="head")
        right = build_frame(
            dataset.attr_triples_right, table_name="right", id_col="head"
        )

        if clean:

            def without_datatype(value):
                # remove datatype: keep only the part before the first "^^"
                return str(value).partition("^^")[0]

            for frame in (left, right):
                frame["tail"] = frame["tail"].map(without_datatype)

        return cls(
            left=left,
            right=right,
            gold=dataset.ent_links,
            left_rel=dataset.rel_triples_left,
            right_rel=dataset.rel_triples_right,
        )

    def _sample_side(
        self, sample: pd.DataFrame, side: Side
    ) -> Tuple[KlinkerFrame, Optional[pd.DataFrame]]:
        """Restrict one side of the dataset to the entities of ``sample``.

        Args:
          sample: Entity links; the first column is used for the left side,
            the second column for every other ``side`` value.
          side: "left" selects the left side, anything else the right side.

        Returns:
            Filtered attribute frame and filtered relation triples
            (``None`` if this side has no relation triples).
        """
        use_left = side == "left"
        attr_df = self.left if use_left else self.right
        rel_df = self.left_rel if use_left else self.right_rel
        wanted = sample[sample.columns[0 if use_left else 1]]

        sampled_attr_df = attr_df[attr_df[attr_df.id_col].isin(wanted)]
        if rel_df is None:
            return sampled_attr_df, None
        # keep relation triples touching a sampled entity at head or tail
        touches_sample = rel_df["head"].isin(wanted) | rel_df["tail"].isin(wanted)
        return sampled_attr_df, rel_df[touches_sample]

    def sample(self, size: int) -> "KlinkerDataset":
        """Return a reduced version of the dataset.

        Note:
            Currently this only takes the first n entities of the gold standard.

        Args:
          size: size of the sample

        Returns:
            sampled klinker dataset

        Examples:

            >>> # doctest: +SKIP
            >>> from sylloge import MovieGraphBenchmark
            >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
            >>> sampled = ds.sample(10)

        """
        # TODO actually sample
        gold_subset = self.gold.iloc[:size]
        left_attr, left_rel = self._sample_side(gold_subset, "left")
        right_attr, right_rel = self._sample_side(gold_subset, "right")
        return KlinkerDataset(
            left=left_attr,
            right=right_attr,
            gold=gold_subset,
            left_rel=left_rel,
            right_rel=right_rel,
        )

`from_sylloge(dataset, clean=False)` `classmethod`

Create a klinker dataset from sylloge dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`EADataset`	EADataset: Sylloge dataset.	required
`clean`	`bool`	bool: Clean attribute information.	`False`

Returns:

Type	Description
`KlinkerDataset`	klinker dataset

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

Source code in klinker/data/ea_dataset.py

@classmethod
def from_sylloge(cls, dataset: EADataset, clean: bool = False) -> "KlinkerDataset":
    """Build a klinker dataset out of a sylloge dataset.

    The attribute triples of both sides are wrapped in triple frames
    (pandas or dask, depending on ``dataset.backend``) named "left" and
    "right" with "head" as id column.

    Args:
      dataset: Sylloge dataset.
      clean: If set, everything from the first "^^" onwards is cut off
        the "tail" values (removes the datatype).

    Returns:
        klinker dataset

    Raises:
        ValueError: If the backend is neither "pandas" nor "dask".

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())

    """
    left: Union[KlinkerDaskFrame, KlinkerPandasFrame]
    right: Union[KlinkerDaskFrame, KlinkerPandasFrame]
    backend = dataset.backend
    if backend == "pandas":
        build_frame = KlinkerTriplePandasFrame.from_df
    elif backend == "dask":
        build_frame = KlinkerTripleDaskFrame.from_dask_dataframe
    else:
        raise ValueError(f"Unknown dataset backend {backend}")
    left = build_frame(dataset.attr_triples_left, table_name="left", id_col="head")
    right = build_frame(
        dataset.attr_triples_right, table_name="right", id_col="head"
    )

    if clean:

        def without_datatype(value):
            # remove datatype: keep only the part before the first "^^"
            return str(value).partition("^^")[0]

        for frame in (left, right):
            frame["tail"] = frame["tail"].map(without_datatype)

    return cls(
        left=left,
        right=right,
        gold=dataset.ent_links,
        left_rel=dataset.rel_triples_left,
        right_rel=dataset.rel_triples_right,
    )

`sample(size)`

Get a sample of the dataset.

Note

Currently this only takes the first n entities of the gold standard.

Parameters:

Name	Type	Description	Default
`size`	`int`	int: size of the sample	required

Returns:

Type	Description
`KlinkerDataset`	sampled klinker dataset

Examples:

>>> # doctest: +SKIP
>>> from sylloge import MovieGraphBenchmark
>>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
>>> sampled = ds.sample(10)

Source code in klinker/data/ea_dataset.py

def sample(self, size: int) -> "KlinkerDataset":
    """Return a reduced version of the dataset.

    Note:
        Currently this only takes the first n entities of the gold standard.

    Args:
      size: size of the sample

    Returns:
        sampled klinker dataset

    Examples:

        >>> # doctest: +SKIP
        >>> from sylloge import MovieGraphBenchmark
        >>> ds = KlinkerDataset.from_sylloge(MovieGraphBenchmark())
        >>> sampled = ds.sample(10)

    """
    # TODO actually sample
    gold_subset = self.gold.iloc[:size]
    left_attr, left_rel = self._sample_side(gold_subset, "left")
    right_attr, right_rel = self._sample_side(gold_subset, "right")
    return KlinkerDataset(
        left=left_attr,
        right=right_attr,
        gold=gold_subset,
        left_rel=left_rel,
        right_rel=right_rel,
    )

klinker

KlinkerBlockManager

block_sizes: pd.DataFrame property

mean_block_size: float property

all_pairs()

combine(this, other) classmethod

entity_pairs(entity_id, column_id)

find_blocks(entity_id, column_id)

from_dict(block_dict, dataset_names=('left', 'right'), npartitions=1, **kwargs) classmethod

from_pandas(df, npartitions=1, **kwargs) classmethod

read_parquet(path, calculate_divisions=True, **kwargs) classmethod

to_dict()

to_parquet(path, **kwargs)

KlinkerDaskFrame

non_id_columns: List[str] property

concat_values()

from_dask_dataframe(df, table_name, id_col, meta=no_default, construction_class=KlinkerPandasFrame) classmethod

KlinkerDataset dataclass

from_sylloge(dataset, clean=False) classmethod

sample(size)

`KlinkerBlockManager`

`block_sizes: pd.DataFrame` `property`

`mean_block_size: float` `property`

`all_pairs()`

`combine(this, other)` `classmethod`

`entity_pairs(entity_id, column_id)`

`find_blocks(entity_id, column_id)`

`from_dict(block_dict, dataset_names=('left', 'right'), npartitions=1, **kwargs)` `classmethod`

`from_pandas(df, npartitions=1, **kwargs)` `classmethod`

`read_parquet(path, calculate_divisions=True, **kwargs)` `classmethod`

`to_dict()`

`to_parquet(path, **kwargs)`

`KlinkerDaskFrame`

`non_id_columns: List[str]` `property`

`concat_values()`

`from_dask_dataframe(df, table_name, id_col, meta=no_default, construction_class=KlinkerPandasFrame)` `classmethod`

`KlinkerDataset` `dataclass`

`from_sylloge(dataset, clean=False)` `classmethod`

`sample(size)`