#!/usr/bin/env python
# coding: utf-8
#
# GISMO: a Generic Information Search with a Mind of its Own
from gismo.common import MixInIO, toy_source_text, toy_source_dict
import numpy as np
from itertools import chain
class Corpus(MixInIO):
    """
    The Corpus class is the starting point of any Gismo workflow. It abstracts dataset pre-processing.
    It is just a list of items (called documents in Gismo) augmented with a method that describes
    how to convert a document to a string object. It is used to build an :py:class:`~gismo.embedding.Embedding`.

    Parameters
    ----------
    source: list, optional
        The list of items that constitutes the dataset to analyze. Actually, any iterable object with :func:`__len__`
        and :func:`__getitem__` methods can potentially be used as a source
        (see :py:class:`~gismo.filesource.FileSource` for an example).
        If not set, the corpus is empty.
    to_text: function, optional
        The function that transforms an item from the source into plain text (:py:class:`str`). If not set, it will
        default to the identity function ``lambda x: x``.

    Examples
    --------
    The following code uses the :py:obj:`~gismo.common.toy_source_text` list as source and specifies that the text
    extraction method should be: take the 15 first characters and add `...`.

    When we iterate with the :py:meth:`~gismo.corpus.Corpus.iterate` method, observe that the extraction is **not**
    applied.

    >>> corpus = Corpus(toy_source_text, to_text=lambda x: f"{x[:15]}...")
    >>> for c in corpus.iterate():
    ...    print(c)
    Gizmo is a Mogwaï.
    This is a sentence about Blade.
    This is another sentence about Shadoks.
    This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.
    In chinese folklore, a Mogwaï is a demon.

    When we iterate with the :py:meth:`~gismo.corpus.Corpus.iterate_text` method, observe that the extraction **is**
    applied.

    >>> for c in corpus.iterate_text():
    ...    print(c)
    Gizmo is a Mogw...
    This is a sente...
    This is another...
    This very long ...
    In chinese folk...

    A corpus object can be saved/loaded with the :py:meth:`~gismo.common.MixInIO.dump` and
    :py:meth:`~gismo.common.MixInIO.load` methods inherited from the MixIn :py:class:`~gismo.common.MixInIO` class.
    The :py:meth:`~gismo.common.MixInIO.load` method is a class method to be used instead of the constructor.

    >>> import tempfile
    >>> corpus1 = Corpus(toy_source_text)
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...    corpus1.dump(filename="myfile", path=tmpdirname)
    ...    corpus2 = Corpus.load(filename="myfile", path=tmpdirname)
    >>> corpus2[0]
    'Gizmo is a Mogwaï.'
    """

    def __init__(self, source=None, to_text=None):
        self.source = source
        # ``i`` and ``iter`` are not read by any method of this class; they are kept so that
        # the set of instance attributes is unchanged.
        self.i = 0
        # The length is cached: a missing source, or one without ``__len__``, counts as empty.
        self.n = 0 if source is None or not hasattr(source, '__len__') else len(source)
        self.iter = None
        if to_text is None:
            self.to_text = lambda x: x
        else:
            self.to_text = to_text

    def iterate_text(self, to_text=None):
        """
        Iterate over the text version of the documents.

        Parameters
        ----------
        to_text: function, optional
            Conversion function to use for this iteration only. If not set, the ``to_text`` function
            of the corpus is used.

        Returns
        -------
        generator
            Yields ``to_text(document)`` for each document of the source, in source order.
            Yields nothing if the corpus has no source.
        """
        if to_text is None:
            to_text = self.to_text
        # A corpus built without source has length 0, so it iterates as empty instead of failing.
        entries = () if self.source is None else self.source
        return (to_text(entry) for entry in entries)

    def iterate(self):
        """
        Iterate over the raw documents (no text conversion applied).

        Returns
        -------
        generator
            Yields the documents of the source, in source order. Yields nothing if the corpus has no source.
        """
        # A corpus built without source has length 0, so it iterates as empty instead of failing.
        entries = () if self.source is None else self.source
        return (entry for entry in entries)

    def __getitem__(self, i):
        """Return ``self.source[i]``, i.e. the raw document (no text conversion applied)."""
        return self.source[i]

    def __len__(self):
        """Return the cached number of documents (set at construction and after each merge)."""
        return self.n

    def merge_new_source(self, new_source, doc2key=None):
        """
        Incorporate new entries while avoiding the creation of duplicates. This method is typically used when you have
        a dynamic source like an RSS feed and you want to periodically update your corpus.

        Parameters
        ----------
        new_source: list
            Source compatible (e.g. similar item type) with the current source. Any iterable works;
            it is traversed exactly once.
        doc2key: function
            Callback that provides items with unique hashable keys, used to avoid duplicates.

        Returns
        -------
        None or Corpus
            ``None`` after a merge. If `doc2key` is not provided, a message is printed, nothing is merged,
            and the corpus itself is returned.

        Examples
        --------
        The following code uses the :py:obj:`~gismo.common.toy_source_dict` list as source and adds two new items,
        including a redundant one.

        >>> corpus = Corpus(toy_source_dict.copy(), to_text=lambda x: x['content'][:14])
        >>> len(corpus)
        5
        >>> new_corpus = [{"title": "Another document", "content": "I don't know what to say!"},
        ...     {'title': 'Fifth Document', 'content': 'In chinese folklore, a Mogwaï is a demon.'}]
        >>> corpus.merge_new_source(new_corpus, doc2key=lambda e: e['title'])
        >>> len(corpus)
        6
        >>> for c in corpus.iterate_text():
        ...    print(c)
        Gizmo is a Mog
        This is a sent
        This is anothe
        This very long
        In chinese fol
        I don't know w

        Items that share a key *inside* the new source are merged only once (the first one is kept).

        >>> twins = [{"title": "Twin", "content": "First twin."}, {"title": "Twin", "content": "Second twin."}]
        >>> corpus.merge_new_source(twins, doc2key=lambda e: e['title'])
        >>> len(corpus)
        7
        """
        if doc2key is None:
            print("Incremental corpus requires to provide a doc2key function")
            return self
        if self.source is None:
            self.source = []
        # Keys already present; it grows as items are accepted so that duplicates
        # inside new_source itself are rejected as well.
        known_keys = {doc2key(d) for d in self.source}
        additions = []
        for doc in new_source:
            key = doc2key(doc)
            if key not in known_keys:
                known_keys.add(key)
                additions.append(doc)
        # In-place extension: a list passed as source at construction is modified.
        self.source += additions
        self.n = len(self.source)
class CorpusList(MixInIO):
    """
    This class makes a list of corpora behave like one single virtual corpus. This is useful to glue together
    corpora with distinct shapes and :py:meth:`to_text` methods.

    Parameters
    ----------
    corpus_list: list of :py:class:`.Corpus`
        The list of corpora to glue. If missing or empty, a message is printed and the result is an empty
        virtual corpus.
    filename: str, optional
        Accepted for compatibility; not used by this constructor.
    path: str, optional
        Accepted for compatibility; not used by this constructor.

    Example
    -------
    >>> multi_corp = CorpusList([Corpus(toy_source_text, lambda x: x[:15]+"..."),
    ...                          Corpus(toy_source_dict, lambda e: e['title'])])
    >>> len(multi_corp)
    10
    >>> multi_corp[7]
    {'title': 'Third Document', 'content': 'This is another sentence about Shadoks.'}
    >>> for c in multi_corp.iterate_text():
    ...    print(c)
    Gizmo is a Mogw...
    This is a sente...
    This is another...
    This very long ...
    In chinese folk...
    First Document
    Second Document
    Third Document
    Fourth Document
    Fifth Document

    Negative indices count from the end of the virtual corpus.

    >>> multi_corp[-1]
    {'title': 'Fifth Document', 'content': 'In chinese folklore, a Mogwaï is a demon.'}
    """

    def __init__(self, corpus_list=None, filename=None, path='.'):
        if corpus_list is None:
            corpus_list = []
        if len(corpus_list) == 0:
            # Best-effort behavior: warn, but still leave the object in a usable (empty) state.
            print("Please provide a non-empty list of corpi!")
        self.corpus_list = corpus_list
        # cum_n[k] is the number of documents in corpora 0..k (inclusive).
        # The explicit dtype keeps an integer array even when the list is empty.
        self.cum_n = np.cumsum([len(corpus) for corpus in self.corpus_list], dtype=int)
        # Stored as a plain int so that len() hands back a built-in integer.
        self.n = int(self.cum_n[-1]) if len(self.cum_n) > 0 else 0

    def iterate(self):
        """
        Iterate over the raw documents of all corpora, one corpus after the other.

        Returns
        -------
        iterator
            Chains the :py:meth:`~gismo.corpus.Corpus.iterate` results of each corpus, in list order.
        """
        return chain.from_iterable(corpus.iterate() for corpus in self.corpus_list)

    def iterate_text(self):
        """
        Iterate over the text version of the documents of all corpora, one corpus after the other.
        Each corpus applies its own ``to_text`` function.

        Returns
        -------
        iterator
            Chains the :py:meth:`~gismo.corpus.Corpus.iterate_text` results of each corpus, in list order.
        """
        return chain.from_iterable(corpus.iterate_text() for corpus in self.corpus_list)

    def __getitem__(self, i):
        """
        Return the document at global position `i` of the virtual corpus.

        Parameters
        ----------
        i: int
            Global index. Negative values count from the end.

        Raises
        ------
        IndexError
            If `i` is outside the virtual corpus.
        """
        total = len(self)
        if i < 0:
            i += total
        if not 0 <= i < total:
            raise IndexError("CorpusList index out of range")
        # side='right' so that an index equal to a cumulative size falls in the *next* corpus
        # (and corpora of length 0 are skipped).
        corpus_index = int(np.searchsorted(self.cum_n, i, side='right'))
        offset = int(self.cum_n[corpus_index - 1]) if corpus_index > 0 else 0
        return self.corpus_list[corpus_index][i - offset]

    def __len__(self):
        """Return the total number of documents over all corpora (computed at construction)."""
        return self.n