# Source code for gismo.common

#!/usr/bin/env python
# coding: utf-8
#
# GISMO: a Generic Information Search with a Mind of its Own

import zstandard as zstd
import errno
import os
import dill as pickle
import numpy as np

from pathlib import Path
from tempfile import NamedTemporaryFile
from contextlib import contextmanager



@contextmanager
def safe_write(path):
    """
    Context manager to write a file in two steps: first write to a temporary
    file, then rename it to ``path`` if everything went well.

    The temporary file is created in the same directory as ``path``, so the
    final step is a rename within one directory. If anything goes wrong
    (including interruptions such as ``KeyboardInterrupt``), the temporary file
    is deleted, ``path`` is left untouched, and the error is propagated.

    Parameters
    ----------
    path: :py:class:`str` or :py:class:`~pathlib.Path`
        Final destination of the file.

    Yields
    ------
    file object
        The temporary file, opened in binary write mode.
    """
    path = Path(path)
    tmpfile = NamedTemporaryFile(mode="wb", dir=path.parent, delete=False)
    tmp_path = Path(tmpfile.name)
    try:
        # Leaving this `with` closes the temp file on success AND on error,
        # so it is never renamed or unlinked while still open
        # (removing an open file fails on Windows).
        with tmpfile:
            yield tmpfile
        # Atomic renaming
        tmp_path.replace(path)
    except BaseException:
        # BaseException (not just Exception) so that an interruption does not
        # leave a stray temp file behind.
        tmp_path.unlink(missing_ok=True)
        raise  # Propagate error




class MixInIO:
    """
    Provide basic save/load capacities to other classes.

    Instances are pickled to ``<name>.pkl`` or, with Zstd compression,
    to ``<name>.pkl.zst``.
    """

    def dump(
        self, filename: str, path=".", overwrite=False, compress=True, stemize=True
    ):
        """
        Save instance to file.

        If the destination already exists and ``overwrite`` is False, a message
        is printed and nothing is written.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.
        overwrite: bool, default=False
            Should existing file be erased if it exists?
        compress: bool, default=True
            Should Zstd compression be used?
        stemize: bool, default=True
            Trim any extension (e.g. .xxx)

        Examples
        ----------

        >>> import tempfile
        >>> v1 = ToyClass(42)
        >>> v2 = ToyClass()
        >>> v2.value
        0
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        ...     v2 = ToyClass.load(filename='myfile', path=Path(tmpdirname))
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl.zst already exists! Use overwrite option to overwrite.
        >>> dir_content
        ['myfile.pkl.zst']
        >>> v2.value
        42

        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname)
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl already exists! Use overwrite option to overwrite.

        >>> v1.value = 51
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', path=tmpdirname, compress=False)
        ...     v1.dump(filename='myfile', path=tmpdirname, overwrite=True, compress=False)
        ...     v2 = ToyClass.load(filename='myfile', path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        >>> dir_content
        ['myfile.pkl']
        >>> v2.value
        51

        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...    v2 = ToyClass.load(filename='thisfilenamedoesnotexist') # doctest: +ELLIPSIS
        Traceback (most recent call last):
         ...
        FileNotFoundError: [Errno 2] No such file or directory: ...
        """
        path = Path(path)
        fn = Path(filename)
        if stemize:
            fn = Path(fn.stem)
        # Only the extension depends on the compression choice.
        suffix = ".pkl.zst" if compress else ".pkl"
        destination = path / (fn.name + suffix)
        if destination.exists() and not overwrite:
            print(
                f"File {destination} already exists! Use overwrite option to overwrite."
            )
            return
        # safe_write goes through a temp file, so a failed dump never leaves
        # a truncated destination behind.
        with safe_write(destination) as f:
            if compress:
                cctx = zstd.ZstdCompressor(level=3)
                with cctx.stream_writer(f) as z:
                    pickle.dump(self, z, protocol=5)
            else:
                pickle.dump(self, f)

    @classmethod
    def load(cls, filename: str, path="."):
        """
        Load instance from file.

        The uncompressed file (``.pkl``) is tried first, then the Zstd-compressed
        one (``.pkl.zst``). Any extension already present in ``filename`` is
        replaced.

        .. warning::
            Unpickling can execute arbitrary code: only load files you trust.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.

        Returns
        -------
        object
            The unpickled content of the file.

        Raises
        ------
        FileNotFoundError
            If neither the ``.pkl`` nor the ``.pkl.zst`` file exists.
        """
        plain = Path(path) / Path(filename).with_suffix(".pkl")
        if plain.exists():
            with open(plain, "rb") as f:
                return pickle.load(f)
        # Replaces the trailing ".pkl" by ".pkl.zst"
        compressed = plain.with_suffix(".pkl.zst")
        if compressed.exists():
            dctx = zstd.ZstdDecompressor()
            # Load compressed data
            with open(compressed, "rb") as f, dctx.stream_reader(f) as z:
                return pickle.load(z)
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), compressed)





class ToyClass(MixInIO):
    """
    Minimal :class:`MixInIO` subclass holding a single value.

    It is used by the doctests of :meth:`MixInIO.dump`.

    Parameters
    ----------
    value: object, default=0
        Content stored in the ``value`` attribute.
    """

    def __init__(self, value=0):
        self.value = value




def auto_k(data, order=None, max_k=100, target=1.0):
    """
    Proposes a threshold k of significant values according to a relevance vector.

    The threshold is ``target`` times the average of the (at most ``max_k``)
    top values; ``k`` is the number of those values that reach the threshold.

    Parameters
    ----------
    data: :class:`~numpy.ndarray` or array-like
        Vector with positive relevance values.
    order: list of int, optional
        Ordered indices of ``data`` (by decreasing relevance). Computed with
        :func:`numpy.argsort` if not provided.
    max_k: int
        Maximal number of entries to return; also number of entries used to determine threshold.
    target: float
        Threshold modulation. Higher target means less result.
        A target set to 1.0 corresponds to using the average of the max_k top values as threshold.

    Returns
    -------
    k: int
        Recommended number of values (always at least 1).

    Example
    --------
    >>> data = np.array([30, 1, 2, .3, 4, 50, 80])
    >>> auto_k(data)
    3
    """
    # Accept plain sequences as well as arrays.
    data = np.asarray(data)
    if order is None:
        order = np.argsort(-data)
    ordered_data = data[order[:max_k]]
    # Average over the entries actually selected: this equals
    # min(max_k, len(data)) for a full ordering and stays correct
    # when ``order`` only lists some of the indices.
    count = len(ordered_data)
    if count == 0:
        # Nothing to rank: avoid a 0/0 threshold and keep the minimal answer.
        return 1
    threshold = np.sum(ordered_data) * target / count
    k = int(np.sum(ordered_data >= threshold))
    return max(1, k)



# Five short sentences used as a toy corpus.
toy_source_text = [
    "Gizmo is a Mogwaï.",
    "This is a sentence about Blade.",
    "This is another sentence about Shadoks.",
    (
        "This very long sentence, with a lot of stuff about Star Wars inside, "
        "makes at some point a side reference to the Gremlins movie "
        "by comparing Gizmo and Yoda."
    ),
    "In chinese folklore, a Mogwaï is a demon.",
]
"""A minimal source example where items are :py:obj:`str`."""

# Same five sentences as a list of titled documents.
toy_source_dict = [
    {"title": f"{ordinal} Document", "content": sentence}
    for ordinal, sentence in (
        ("First", "Gizmo is a Mogwaï."),
        ("Second", "This is a sentence about Blade."),
        ("Third", "This is another sentence about Shadoks."),
        (
            "Fourth",
            "This very long sentence, with a lot of stuff about Star Wars inside, "
            "makes at some point a side reference to the Gremlins movie "
            "by comparing Gizmo and Yoda.",
        ),
        ("Fifth", "In chinese folklore, a Mogwaï is a demon."),
    )
]
"""A minimal source example where items are :py:obj:`dict` with keys `title` and `content`."""