# Source code for gismo.common

#!/usr/bin/env python
# coding: utf-8
#
# GISMO: a Generic Information Search with a Mind of its Own

import gzip
import errno
import os
import dill as pickle
import numpy as np

from pathlib import Path


class MixInIO:
    """
    Provide basic save/load capacities to other classes.

    Instances are serialized through the ``pickle`` name in scope in this module,
    either as a plain ``.pkl`` file or as a gzip-compressed ``.pkl.gz`` file.
    """

    def dump(self, filename: str, path='.', overwrite=False, compress=True):
        """
        Save instance to file.

        The file is written to ``path`` under the name ``<stem>.pkl.gz`` (compressed) or
        ``<stem>.pkl`` (uncompressed), where ``<stem>`` is ``filename`` without its directory
        part and without its last extension. If the file already exists and ``overwrite``
        is false, nothing is written and a message is printed.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.
        overwrite: bool
            Should existing file be erased if it exists?
        compress: bool
            Should gzip compression be used?

        Examples
        ----------
        >>> import tempfile
        >>> v1 = ToyClass(42)
        >>> v2 = ToyClass()
        >>> v2.value
        0
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        ...     v2 = ToyClass.load(filename='myfile', path=Path(tmpdirname))
        ...     v1.dump(filename='myfile', compress=True, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl.gz already exists! Use overwrite option to overwrite.
        >>> dir_content
        ['myfile.pkl.gz']
        >>> v2.value
        42
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname)
        ...     v1.dump(filename='myfile', compress=False, path=tmpdirname) # doctest: +ELLIPSIS
        File ...myfile.pkl already exists! Use overwrite option to overwrite.
        >>> v1.value = 51
        >>> with tempfile.TemporaryDirectory() as tmpdirname:
        ...     v1.dump(filename='myfile', path=tmpdirname, compress=False)
        ...     v1.dump(filename='myfile', path=tmpdirname, overwrite=True, compress=False)
        ...     v2 = ToyClass.load(filename='myfile', path=tmpdirname)
        ...     dir_content = [file.name for file in Path(tmpdirname).glob('*')]
        >>> dir_content
        ['myfile.pkl']
        >>> v2.value
        51
        >>> with tempfile.TemporaryDirectory() as tmpdirname: # doctest: +ELLIPSIS
        ...     v2 = ToyClass.load(filename='thisfilenamedoesnotexist')
        Traceback (most recent call last):
         ...
        FileNotFoundError: [Errno 2] No such file or directory: ...
        """
        # The extension is appended to the stem rather than applied with ``with_suffix``:
        # ``with_suffix`` would strip a remaining dotted part of the stem (``run.v1`` -> ``run``),
        # so distinct names would collide on disk and would not match the name ``load`` looks up.
        suffix = ".pkl.gz" if compress else ".pkl"
        destination = Path(path) / f"{Path(filename).stem}{suffix}"
        if destination.exists() and not overwrite:
            print(f"File {destination} already exists! Use overwrite option to overwrite.")
            return
        # Both openers accept the same (path, mode) call, so only the opener differs.
        opener = gzip.open if compress else open
        with opener(destination, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filename: str, path='.'):
        """
        Load instance from file.

        The uncompressed file ``<filename>.pkl`` is tried first, then the gzip-compressed
        ``<filename>.pkl.gz``.

        Parameters
        ----------
        filename: str
            The stem of the filename.
        path: :py:class:`str` or :py:class:`~pathlib.Path`, optional
            The location path.

        Returns
        -------
        object
            The unpickled object.

        Raises
        ------
        FileNotFoundError
            If neither the ``.pkl`` nor the ``.pkl.gz`` file exists.
        """
        plain = Path(path) / Path(filename).with_suffix(".pkl")
        compressed = plain.with_suffix(".pkl.gz")
        # SECURITY: unpickling can execute arbitrary code; only load files from trusted sources.
        if plain.exists():
            with open(plain, "rb") as f:
                return pickle.load(f)
        if compressed.exists():
            with gzip.open(compressed) as f:
                return pickle.load(f)
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), compressed)
class ToyClass(MixInIO):
    """
    Minimal :class:`MixInIO` subclass that stores a single value.

    Parameters
    ----------
    value: object, optional
        Content of the ``value`` attribute. Defaults to 0.
    """

    def __init__(self, value=0):
        self.value = value
def auto_k(data, order=None, max_k=100, target=1.0):
    """
    Proposes a threshold k of significant values according to a relevance vector.

    The (at most) ``max_k`` first entries of ``data`` in the given order are selected; the
    threshold is their average multiplied by ``target``; the result is the number of selected
    entries that reach the threshold, with a minimum of 1.

    Parameters
    ----------
    data: :class:`~numpy.ndarray` or sequence of numbers
        Vector with positive relevance values.
    order: list of int, optional
        Ordered indices of ``data``. If omitted, indices sorted by decreasing value are used.
    max_k: int
        Maximal number of entries to return; also number of entries used to determine threshold.
    target: float
        Threshold modulation. Higher target means less result.
        A target set to 1.0 corresponds to using the average of the max_k top values as threshold.

    Returns
    -------
    k: int
        Recommended number of values. Always at least 1, even when no entry is selected.

    Example
    --------
    >>> data = np.array([30, 1, 2, .3, 4, 50, 80])
    >>> auto_k(data)
    3
    >>> auto_k([30, 1, 2, .3, 4, 50, 80], target=0.1)
    4
    """
    # Accept plain sequences too: negation and fancy indexing below need an array.
    data = np.asarray(data)
    if order is None:
        order = np.argsort(-data)
    ordered_data = data[order[:max_k]]
    # Average over the entries actually selected, so that a short ``order`` or a small
    # ``data`` cannot make the divisor disagree with the sum.
    n_selected = len(ordered_data)
    if n_selected == 0:
        # Nothing to average: skip the 0/0 division and return the documented minimum.
        return 1
    threshold = np.sum(ordered_data) * target / n_selected
    k = int(np.sum(ordered_data >= threshold))
    return max(1, k)
# Toy corpora shared by the examples: the same five sentences, first as bare strings,
# then wrapped in dictionaries together with a title.
toy_source_text = [
    'Gizmo is a Mogwaï.',
    'This is a sentence about Blade.',
    'This is another sentence about Shadoks.',
    ('This very long sentence, with a lot of stuff about Star Wars inside, '
     'makes at some point a side reference to the Gremlins movie '
     'by comparing Gizmo and Yoda.'),
    'In chinese folklore, a Mogwaï is a demon.',
]
"""A minimal source example where items are :py:obj:`str`."""

toy_source_dict = [
    {
        'title': 'First Document',
        'content': 'Gizmo is a Mogwaï.',
    },
    {
        'title': 'Second Document',
        'content': 'This is a sentence about Blade.',
    },
    {
        'title': 'Third Document',
        'content': 'This is another sentence about Shadoks.',
    },
    {
        'title': 'Fourth Document',
        'content': ('This very long sentence, with a lot of stuff about Star Wars inside, '
                    'makes at some point a side reference to the Gremlins movie '
                    'by comparing Gizmo and Yoda.'),
    },
    {
        'title': 'Fifth Document',
        'content': 'In chinese folklore, a Mogwaï is a demon.',
    },
]
"""A minimal source example where items are :py:obj:`dict` with keys `title` and `content`."""