Source code for gismo.corpus

#!/usr/bin/env python
# coding: utf-8
#
# GISMO: a Generic Information Search with a Mind of its Own

from gismo.common import MixInIO, toy_source_text, toy_source_dict

import numpy as np
from itertools import chain



[docs]
class Corpus(MixInIO):
    """
    The Corpus class is the starting point of any Gismo workflow. It abstracts dataset pre-processing.
    It is just a list of items (called documents in Gismo) augmented with a method that describes
    how to convert a document to a string object. It is used to build an :py:class:`~gismo.embedding.Embedding`.

    Parameters
    ----------
    source: list
        The list of items that constitutes the dataset to analyze. Actually, any iterable object with :func:`__len__`
        and :func:`__getitem__` methods can potentially be used as a source
        (see :py:class:`~gismo.filesource.FileSource` for an example).
    to_text: function, optional
        The function that transforms an item from the source into plain text (:py:class:`str`). If not set, it will
        default to the identity function ``lambda x: x``.

    Examples
    --------

    The following code uses the :py:obj:`~gismo.common.toy_source_text` list as source and specifies that the text
    extraction method should be: take the 15 first characters and add `...`.

    When we iterate with the :py:meth:`~gismo.corpus.Corpus.iterate` method, observe that the extraction is **not**
    applied.

    >>> corpus = Corpus(toy_source_text, to_text=lambda x: f"{x[:15]}...")
    >>> for c in corpus.iterate():
    ...    print(c)
    Gizmo is a Mogwaï.
    This is a sentence about Blade.
    This is another sentence about Shadoks.
    This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.
    In chinese folklore, a Mogwaï is a demon.

    When we iterate with the :py:meth:`~gismo.corpus.Corpus.iterate_text` method, observe that the extraction **is**
    applied.


    >>> for c in corpus.iterate_text():
    ...    print(c)
    Gizmo is a Mogw...
    This is a sente...
    This is another...
    This very long ...
    In chinese folk...

    A corpus object can be saved/loaded with the :py:meth:`~gismo.common.MixInIO.dump` and
    :py:meth:`~gismo.common.MixInIO.load` methods inherited from the MixIn :py:class:`~gismo.common.MixInIO` class.
    The :py:meth:`~gismo.common.MixInIO.load` method is a class method to be used instead of the constructor.

    >>> import tempfile
    >>> corpus1 = Corpus(toy_source_text)
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...    corpus1.dump(filename="myfile", path=tmpdirname)
    ...    corpus2 = Corpus.load(filename="myfile", path=tmpdirname)
    >>> corpus2[0]
    'Gizmo is a Mogwaï.'
    """

    def __init__(self, source=None, to_text=None):
        self.source = source
        self.i = 0
        self.n = 0 if source is None or not hasattr(source, "__len__") else len(source)
        self.iter = None
        if to_text is None:
            self.to_text = lambda x: x
        else:
            self.to_text = to_text

    def iterate_text(self, to_text=None):
        if to_text is None:
            to_text = self.to_text
        return (to_text(entry) for entry in self.source)

    def iterate(self):
        return (entry for entry in self.source)

    def __getitem__(self, i):
        return self.source[i]

    def __len__(self):
        return self.n


[docs]
    def merge_new_source(self, new_source, doc2key=None):
        """
        Incorporate new entries while avoiding the creation of duplicates. This method is typically used when you have
        a dynamic source like a RSS feed and you want to periodically update your corpus.

        Parameters
        ----------
        new_source: list
                 Source compatible (e.g. similar item type) with the current source.
        doc2key: function
                 Callback that provides items with unique hashable keys, used to avoid duplicates.

        Examples
        --------

        The following code uses the :py:obj:`~gismo.common.toy_source_dict` list as source and add two new items,
        including a redundant one.

        >>> corpus = Corpus(toy_source_dict.copy(), to_text=lambda x: x['content'][:14])
        >>> len(corpus)
        5
        >>> new_corpus = [{"title": "Another document", "content": "I don't know what to say!"},
        ...     {'title': 'Fifth Document', 'content': 'In chinese folklore, a Mogwaï is a demon.'}]
        >>> corpus.merge_new_source(new_corpus, doc2key=lambda e: e['title'])
        >>> len(corpus)
        6
        >>> for c in corpus.iterate_text():
        ...    print(c)
        Gizmo is a Mog
        This is a sent
        This is anothe
        This very long
        In chinese fol
        I don't know w
        """
        if doc2key is None:
            print("Incremental corpus requires to provide a doc2key function")
            return self
        if self.source is None:
            self.source = []
        new_keys = {doc2key(d) for d in new_source} - {doc2key(d) for d in self.source}
        self.source += [d for d in new_source if doc2key(d) in new_keys]
        self.n = len(self.source)





[docs]
class CorpusList(MixInIO):
    """
    This class makes a list of corpi behave like one single virtual corpus. This is useful to glue together corpi with
    distinct shapes and :py:meth:`to_text` methods.

    Parameters
    ----------
    corpus_list: list of :py:class:`.Corpus`
        The list of corpi to glue.


    Example
    -------
    >>> multi_corp = CorpusList([Corpus(toy_source_text, lambda x: x[:15]+"..."),
    ...                          Corpus(toy_source_dict, lambda e: e['title'])])
    >>> len(multi_corp)
    10
    >>> multi_corp[7]
    {'title': 'Third Document', 'content': 'This is another sentence about Shadoks.'}
    >>> for c in multi_corp.iterate_text():
    ...    print(c)
    Gizmo is a Mogw...
    This is a sente...
    This is another...
    This very long ...
    In chinese folk...
    First Document
    Second Document
    Third Document
    Fourth Document
    Fifth Document
    """

    def __init__(self, corpus_list=None, filename=None, path="."):
        if corpus_list is None or len(corpus_list) == 0:
            print("Please provide a non-empty list of corpi!")
        else:
            self.corpus_list = corpus_list
            self.cum_n = np.cumsum([len(corpus) for corpus in self.corpus_list])
            self.n = self.cum_n[-1]

    def iterate(self):
        return chain.from_iterable([corpus.iterate() for corpus in self.corpus_list])

    def iterate_text(self):
        return chain.from_iterable(
            [corpus.iterate_text() for corpus in self.corpus_list]
        )

    def __getitem__(self, i):
        corpus_indice = np.searchsorted(self.cum_n, i, side="right")
        local_i = i if corpus_indice == 0 else (i - self.cum_n[corpus_indice - 1])
        return self.corpus_list[corpus_indice][local_i]

    def __len__(self):
        return self.n