Source code for gismo.common

#!/usr/bin/env python
# coding: utf-8
#
# GISMO: a Generic Information Search with a Mind of its Own

import zstandard as zstd
import errno
import os
import dill as pickle
import numpy as np

from pathlib import Path
from tempfile import NamedTemporaryFile
from contextlib import contextmanager


@contextmanager
def safe_write(path):
    """
    Context manager to write a file in two steps.

    The content is first written to a temporary file created in the same
    directory as the target. If the managed block finishes without error,
    the temporary file is closed and renamed onto ``path``. If anything is
    raised (including ``KeyboardInterrupt``), the temporary file is deleted,
    ``path`` is left untouched and the error is propagated.

    Parameters
    ----------
    path: :py:class:`str` or :py:class:`~pathlib.Path`
        Final location of the file.

    Yields
    ------
    file object
        The temporary file, opened in binary write mode (``"wb"``).
    """
    path = Path(path)
    tmpfile = NamedTemporaryFile(mode="wb", dir=path.parent, delete=False)
    tmp_path = Path(tmpfile.name)
    try:
        # The inner ``with`` guarantees the temp file is closed before it is
        # either renamed or deleted (some platforms, e.g. Windows, refuse to
        # delete or replace a file that is still open).
        with tmpfile:
            yield tmpfile
        # Atomic renaming
        tmp_path.replace(path)
    except BaseException:
        # BaseException rather than Exception: an interrupted write must not
        # leave a stray temp file next to the target either.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise  # Propagate error
class MixInIO:
    """
    Mixin giving other classes pickle-based save/load methods,
    with optional Zstandard compression.
    """

    def dump(
        self, filename: str, path=".", overwrite=False, compress=True, stemize=True
    ):
        """
        Save instance to file.

        The file is named ``<filename>.pkl.zst`` when ``compress`` is set and
        ``<filename>.pkl`` otherwise. If the destination already exists and
        ``overwrite`` is not set, a message is printed and nothing is written.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.
        overwrite: bool, default=False
            Should existing file be erased if it exists?
        compress: bool, default=True
            Should Zstd compression be used?
        stemize: bool, default=True
            Trim any extension (e.g. .xxx)

        Examples
        ----------
        >>> import tempfile
        >>> v1 = ToyClass(42)
        >>> v2 = ToyClass()
        >>> v2.value
        0
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        ...     v2 = ToyClass.load(filename='myfile', path=Path(tmpdirname))
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl.zst already exists! Use overwrite option to overwrite.
        >>> dir_content
        ['myfile.pkl.zst']
        >>> v2.value
        42

        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname)
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl already exists! Use overwrite option to overwrite.

        >>> v1.value = 51
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', path=tmpdirname, compress=False)
        ...     v1.dump(filename='myfile', path=tmpdirname, overwrite=True, compress=False)
        ...     v2 = ToyClass.load(filename='myfile', path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        >>> dir_content
        ['myfile.pkl']
        >>> v2.value
        51

        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v2 = ToyClass.load(filename='thisfilenamedoesnotexist') # doctest: +ELLIPSIS
        Traceback (most recent call last):
         ...
        FileNotFoundError: [Errno 2] No such file or directory: ...
        """
        folder = Path(path)
        stem = Path(filename)
        if stemize:
            stem = Path(stem.stem)
        extension = ".pkl.zst" if compress else ".pkl"
        destination = folder / (stem.name + extension)

        # Guard clause: never clobber an existing file unless explicitly asked to.
        if destination.exists() and not overwrite:
            print(
                f"File {destination} already exists! Use overwrite option to overwrite."
            )
            return

        with safe_write(destination) as f:
            if compress:
                cctx = zstd.ZstdCompressor(level=3)
                with cctx.stream_writer(f) as z:
                    pickle.dump(self, z, protocol=5)
            else:
                # NOTE: the uncompressed branch uses the pickler's default protocol.
                pickle.dump(self, f)

    @classmethod
    def load(cls, filename: str, path="."):
        """
        Load instance from file.

        The uncompressed file (``.pkl``) is looked up first; if it does not
        exist, the compressed one (``.pkl.zst``) is tried. Any extension
        present in ``filename`` is replaced before the lookup.

        Unpickling can execute arbitrary code: only load files you trust.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.

        Returns
        -------
        object
            The unpickled content of the file.

        Raises
        ------
        FileNotFoundError
            If neither the ``.pkl`` nor the ``.pkl.zst`` file exists.
        """
        plain = Path(path) / Path(filename).with_suffix(".pkl")
        if plain.exists():
            with open(plain, "rb") as f:
                return pickle.load(f)

        compressed = plain.with_suffix(".pkl.zst")
        if not compressed.exists():
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), compressed)

        dctx = zstd.ZstdDecompressor()
        # Load compressed data
        with open(compressed, "rb") as f, dctx.stream_reader(f) as z:
            return pickle.load(z)
class ToyClass(MixInIO):
    """
    Minimal :class:`MixInIO` subclass that stores a single value.

    Parameters
    ----------
    value: object, default=0
        Stored as the ``value`` attribute of the instance.
    """

    def __init__(self, value=0):
        self.value = value
def auto_k(data, order=None, max_k=100, target=1.0):
    """
    Proposes a threshold k of significant values according to a relevance vector.

    The (at most) ``max_k`` first entries of ``data`` in the given order are
    averaged; the average, scaled by ``target``, is the threshold. The result
    is the number of those entries that reach the threshold, and at least 1.

    Parameters
    ----------
    data: :class:`~numpy.ndarray` or array-like
        Vector with positive relevance values.
    order: list of int, optional
        Ordered indices of ``data``. Defaults to the indices sorting ``data``
        by decreasing value.
    max_k: int
        Maximal number of entries to return; also number of entries used to determine threshold.
    target: float
        Threshold modulation. Higher target means less result.
        A target set to 1.0 corresponds to using the average of the max_k top values as threshold.

    Returns
    -------
    k: int
        Recommended number of values (always at least 1).

    Example
    --------
    >>> data = np.array([30, 1, 2, .3, 4, 50, 80])
    >>> auto_k(data)
    3
    """
    # Accept plain sequences as well as arrays (``-data`` needs an ndarray).
    data = np.asarray(data)
    if order is None:
        order = np.argsort(-data)
    top_values = data[order[:max_k]]
    # Average over the values actually selected, so the threshold stays the
    # mean of the summed entries even when ``order`` is shorter than ``data``.
    n_top = len(top_values)
    if n_top == 0:
        # Nothing to rank: skip the 0/0 division and keep the minimum answer.
        return 1
    threshold = np.sum(top_values) * target / n_top
    k = int(np.sum(top_values >= threshold))
    return max(1, k)
toy_source_text = [
    "Gizmo is a Mogwaï.",
    "This is a sentence about Blade.",
    "This is another sentence about Shadoks.",
    (
        "This very long sentence, with a lot of stuff about Star Wars inside, "
        "makes at some point a side reference to the Gremlins movie by "
        "comparing Gizmo and Yoda."
    ),
    "In chinese folklore, a Mogwaï is a demon.",
]
"""A minimal source example where items are :py:obj:`str`."""

toy_source_dict = [
    dict(title="First Document", content="Gizmo is a Mogwaï."),
    dict(title="Second Document", content="This is a sentence about Blade."),
    dict(title="Third Document", content="This is another sentence about Shadoks."),
    dict(
        title="Fourth Document",
        content=(
            "This very long sentence, with a lot of stuff about Star Wars inside, "
            "makes at some point a side reference to the Gremlins movie by "
            "comparing Gizmo and Yoda."
        ),
    ),
    dict(title="Fifth Document", content="In chinese folklore, a Mogwaï is a demon."),
]
"""A minimal source example where items are :py:obj:`dict` with keys `title` and `content`."""