"""Main module."""
import numpy as np
from scipy.sparse import vstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from functools import partial
from gismo.common import MixInIO, toy_source_dict, auto_k
from gismo.datasets.dblp import url2source
from gismo.corpus import Corpus
from gismo.embedding import Embedding
from gismo.diteration import DIteration
from gismo.parameters import Parameters
from gismo.clustering import subspace_clusterize, covering_order, subspace_distortion
from gismo.post_processing import post_documents_item_raw, post_documents_item_content, post_documents_cluster_json, \
post_features_item_raw, post_features_cluster_json, post_documents_cluster_print, post_features_cluster_print
[docs]class Gismo(MixInIO):
"""
Gismo mixes a corpus and its embedding to provide search and structure methods.
Parameters
----------
corpus: Corpus
Defines the documents of the gismo.
embedding: Embedding
Defines the embedding of the gismo.
kwargs: dict
Custom default runtime parameters.
You just need to specify the parameters that differ from :obj:`~gismo.parameters.DEFAULT_PARAMETERS`.
Example
-------
The Corpus class defines how documents of a source should be converted to plain text.
>>> corpus = Corpus(toy_source_dict, lambda x: x['content'])
The Embedding class extracts features (e.g. words) and computes weights between documents and features.
>>> vectorizer = CountVectorizer(dtype=float)
>>> embedding = Embedding(vectorizer=vectorizer)
>>> embedding.fit_transform(corpus)
>>> embedding.m # number of features
36
The Gismo class combines them for performing queries.
After a query is performed, one can ask for the best items.
The number of items to return can be specified with parameter ``k`` or automatically adjusted.
>>> gismo = Gismo(corpus, embedding)
>>> success = gismo.rank("Gizmo")
>>> gismo.parameters.target_k = .2 # The toy dataset is very small, so we lower the auto_k parameter.
>>> gismo.get_documents_by_rank()
[{'title': 'First Document', 'content': 'Gizmo is a Mogwaï.'}, {'title': 'Fourth Document', 'content': 'This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.'}, {'title': 'Fifth Document', 'content': 'In chinese folklore, a Mogwaï is a demon.'}]
Post processing functions can be used to tweak the returned object (the underlying ranking is unchanged)
>>> gismo.post_documents_item = partial(post_documents_item_content, max_size=44)
>>> gismo.get_documents_by_rank()
['Gizmo is a Mogwaï.', 'This very long sentence, with a lot of stuff', 'In chinese folklore, a Mogwaï is a demon.']
Ranking also works on features.
>>> gismo.get_features_by_rank()
['mogwaï', 'gizmo', 'is', 'in', 'demon', 'chinese', 'folklore']
Clustering organizes results can provide additional hints on their relationships.
>>> gismo.post_documents_cluster = post_documents_cluster_print
>>> gismo.get_documents_by_cluster(resolution=.9) # doctest: +NORMALIZE_WHITESPACE
F: 0.60. R: 0.65. S: 0.98.
- F: 0.71. R: 0.57. S: 0.98.
-- Gizmo is a Mogwaï. (R: 0.54; S: 0.99)
-- In chinese folklore, a Mogwaï is a demon. (R: 0.04; S: 0.71)
- This very long sentence, with a lot of stuff (R: 0.08; S: 0.69)
>>> gismo.post_features_cluster = post_features_cluster_print
>>> gismo.get_features_by_cluster() # doctest: +NORMALIZE_WHITESPACE
F: 0.03. R: 0.29. S: 0.98.
- F: 1.00. R: 0.27. S: 0.99.
-- mogwaï (R: 0.12; S: 0.99)
-- gizmo (R: 0.12; S: 0.99)
-- is (R: 0.03; S: 0.99)
- F: 1.00. R: 0.02. S: 0.07.
-- in (R: 0.00; S: 0.07)
-- demon (R: 0.00; S: 0.07)
-- chinese (R: 0.00; S: 0.07)
-- folklore (R: 0.00; S: 0.07)
As an alternative to a textual query, the :meth:`~gismo.gismo.Gismo.rank` method can directly use a vector
`z` as input.
>>> z, s = gismo.embedding.query_projection("gizmo chinese folklore")
>>> z # doctest: +NORMALIZE_WHITESPACE
<1x36 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in Compressed Sparse Row format>
>>> s = gismo.rank(z=z)
>>> s
True
>>> gismo.get_documents_by_rank(k=2)
['In chinese folklore, a Mogwaï is a demon.', 'Gizmo is a Mogwaï.']
>>> gismo.get_features_by_rank()
['mogwaï', 'in', 'chinese', 'folklore', 'demon', 'gizmo', 'is']
The class also offers :meth:`~gismo.gismo.Gismo.get_documents_by_coverage` and
:meth:`~gismo.gismo.Gismo.get_features_by_coverage` that yield
a list of results obtained from a Covering-like traversal of the ranked cluster.
To demonstrate it, we first add an outsider document to the corpus and rebuild Gismo.
>>> new_entry = {'title': 'Minority Report', 'content': 'Totally unrelated stuff.'}
>>> corpus = Corpus(toy_source_dict+[new_entry], lambda x: x['content'])
>>> vectorizer = CountVectorizer(dtype=float)
>>> embedding = Embedding(vectorizer=vectorizer)
>>> embedding.fit_transform(corpus)
>>> gismo = Gismo(corpus, embedding)
>>> gismo.post_documents_item = post_documents_item_content
>>> success = gismo.rank("Gizmo")
>>> gismo.parameters.target_k = .3
Remind the classical rank-based result.
>>> gismo.get_documents_by_rank()
['Gizmo is a Mogwaï.', 'This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.', 'In chinese folklore, a Mogwaï is a demon.']
Gismo can use the cluster to propose alternate results that try to cover more subjects.
>>> gismo.get_documents_by_coverage()
['Gizmo is a Mogwaï.', 'Totally unrelated stuff.', 'This is a sentence about Blade.']
Note how the new entry, which has nothing to do with the rest, is pushed into the results.
By setting the ``wide`` option to False, we get an alternative that focuses on mainstream results.
>>> gismo.get_documents_by_coverage(wide=False)
['Gizmo is a Mogwaï.', 'This is a sentence about Blade.', 'This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.']
The same principle applies for features.
>>> gismo.get_features_by_rank()
['mogwaï', 'gizmo', 'is', 'in', 'chinese', 'folklore', 'demon']
>>> gismo.get_features_by_coverage()
['mogwaï', 'this', 'in', 'by', 'gizmo', 'is', 'chinese']
"""
def __init__(self, corpus=None, embedding=None, **kwargs):
self.corpus = corpus
self.embedding = embedding
self.diteration = DIteration(n=embedding.n, m=embedding.m)
self.parameters = Parameters(**kwargs)
self.post_documents_item = post_documents_item_raw
self.post_features_item = post_features_item_raw
self.post_documents_cluster = post_documents_cluster_json
self.post_features_cluster = post_features_cluster_json
# Ranking Part
[docs] def rank(self, query="", z=None, **kwargs):
"""
Runs the Diteration using query as starting point
Parameters
----------
query: str
Text that starts DIteration
z: :class:`~scipy.sparse.csr_matrix`, optional
Query vector to use in place of the textual query
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
success: bool
success of the query projection. If projection fails, a ranking on uniform distribution is performed.
"""
p = self.parameters(**kwargs)
if z is None:
z, success = self.embedding.query_projection(query)
else:
success = True
self.diteration(self.embedding.x, self.embedding.y, z,
alpha=p['alpha'], n_iter=p['n_iter'],
offset=p['offset'], memory=p['memory'])
return success
[docs] def get_documents_by_rank(self, k=None, **kwargs):
"""
Returns a list of top documents according to the current ranking.
By default, the documents are post_processed through the post_documents_item method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
list
"""
p = self.parameters(**kwargs)
if k is None:
k = auto_k(data=self.diteration.x_relevance,
order=self.diteration.x_order,
max_k=p['max_k'],
target=p['target_k'])
if p['post']:
return [self.post_documents_item(self, i) for i in self.diteration.x_order[:k]]
else:
return self.diteration.x_order[:k]
[docs] def get_features_by_rank(self, k=None, **kwargs):
"""
Returns a list of top features according to the current ranking.
By default, the features are post_processed through the post_features_item method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
list
"""
p = self.parameters(**kwargs)
if k is None:
k = auto_k(data=self.diteration.y_relevance,
order=self.diteration.y_order,
max_k=p['max_k'],
target=p['target_k'])
if p['post']:
return [self.post_features_item(self, i) for i in self.diteration.y_order[:k]]
else:
return self.diteration.y_order[:k]
# Cluster part
[docs] def get_documents_by_cluster_from_indices(self, indices, **kwargs):
"""
Returns a cluster of documents.
The cluster is by default post_processed through the post_documents_cluster method.
Parameters
----------
indices: list of int
The indices of documents to be processed. It is assumed that the documents
are sorted by importance.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
object
"""
p = self.parameters(**kwargs)
subspace = vstack([self.embedding.x[i, :] for i in indices])
if p['distortion']>0:
subspace_distortion(indices=subspace.indices, data=subspace.data,
relevance=self.diteration.y_relevance, distortion=p['distortion'])
cluster = subspace_clusterize(subspace, p['resolution'], indices)
if p['post']:
return self.post_documents_cluster(self, cluster)
return cluster
[docs] def get_documents_by_cluster(self, k=None, **kwargs):
"""
Returns a cluster of the best ranked documents.
The cluster is by default post_processed through the post_documents_cluster method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
object
"""
p = self.parameters(**kwargs)
if k is None:
k = auto_k(data=self.diteration.x_relevance,
order=self.diteration.x_order,
max_k=p['max_k'],
target=p['target_k'])
return self.get_documents_by_cluster_from_indices(self.diteration.x_order[:k], **kwargs)
[docs] def get_features_by_cluster_from_indices(self, indices, **kwargs):
"""
Returns a cluster of features.
The cluster is by default post_processed through the post_features_cluster method.
Parameters
----------
indices: list of int
The indices of features to be processed. It is assumed that the features
are sorted by importance.
kwargs: dict, optional
Custom runtime parameters
Returns
-------
object
"""
p = self.parameters(**kwargs)
subspace = vstack([self.embedding.y[i, :] for i in indices])
if p['distortion']>0:
subspace_distortion(indices=subspace.indices, data=subspace.data,
relevance=self.diteration.x_relevance, distortion=p['distortion'])
cluster = subspace_clusterize(subspace, p['resolution'], indices)
if p['post']:
return self.post_features_cluster(self, cluster)
return cluster
[docs] def get_features_by_cluster(self, k=None, **kwargs):
"""
Returns a cluster of the best ranked features.
The cluster is by default post_processed through the post_features_cluster method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
object
"""
p = self.parameters(**kwargs)
if k is None:
k = auto_k(data=self.diteration.y_relevance,
order=self.diteration.y_order,
max_k=p['max_k'],
target=p['target_k'])
return self.get_features_by_cluster_from_indices(self.diteration.y_order[:k], **kwargs)
# Covering part
[docs] def get_documents_by_coverage(self, k=None, **kwargs):
"""
Returns a list of top covering documents.
By default, the documents are post_processed through the post_documents_item method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
list
"""
p = self.parameters(**kwargs)
post = p['post']
if k is None:
k = auto_k(data=self.diteration.x_relevance,
order=self.diteration.x_order,
max_k=p['max_k'],
target=p['target_k'])
p['post'] = False
cluster = self.get_documents_by_cluster(k=int(k * p['stretch']),
**p)
indices = covering_order(cluster, wide=p['wide'])[:k]
if post:
return [self.post_documents_item(self, i) for i in indices]
else:
return indices
[docs] def get_features_by_coverage(self, k=None, **kwargs):
"""
Returns a list of top covering features.
By default, the features are post_processed through the post_features_item method.
Parameters
----------
k: int, optional
Number of documents to output. If not set, k is automatically computed
using the max_k and target_k runtime parameters.
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
list
"""
p = self.parameters(**kwargs)
post = p['post']
if k is None:
k = auto_k(data=self.diteration.y_relevance,
order=self.diteration.y_order,
max_k=p['max_k'],
target=p['target_k'])
p['post'] = False
cluster = self.get_features_by_cluster(k=int(k * p['stretch']),
**p)
indices = covering_order(cluster, wide=p['wide'])[:k]
if post:
return [self.post_features_item(self, i) for i in indices]
else:
return indices
[docs]class XGismo(Gismo):
"""
Given two distinct embeddings base on the same set of documents, builds a new gismo.
The features of ``x_embedding`` are the corpus of this new gismo.
The features of ``y_embedding`` are the features of this new gismo.
The dual embedding of the new gismo is obtained by crossing the two input dual embeddings.
xgismo behaves essentially as a gismo object. The main difference is an additional parameter ``y`` for the
rank method, to control if the query projection should be performed on the ``y_embedding`` or on the
``x_embedding``.
Parameters
----------
x_embedding: Embedding
The *left* embedding, which defines the documents of the xgismo.
y_embedding: Embedding
The *right* embedding, which defines the features of the xgismo.
filename: str, optional
If set, will load xgismo from file.
path: str or Path, optional
Directory where the xgismo is to be loaded from.
kwargs: dict
Custom default runtime parameters.
You just need to specify the parameters that differ from :class:`~gismo.parameters.DEFAULT_PARAMETERS`.
Examples
---------
One the main use case for XGismo consists in transforming a list of articles into a Gismo that relates authors
and the words they use. Let's start by retrieving a few articles.
>>> toy_url = "https://dblp.org/pers/xx/m/Mathieu:Fabien.xml"
>>> source = [a for a in url2source(toy_url) if int(a['year'])<2023]
Then we build the embedding of words.
>>> corpus = Corpus(source, to_text=lambda x: x['title'])
>>> w_count = CountVectorizer(dtype=float, stop_words='english')
>>> w_embedding = Embedding(w_count)
>>> w_embedding.fit_transform(corpus)
And the embedding of authors.
>>> to_authors_text = lambda dic: " ".join([a.replace(' ', '_') for a in dic['authors']])
>>> corpus.to_text = to_authors_text
>>> a_count = CountVectorizer(dtype=float, preprocessor=lambda x:x, tokenizer=lambda x: x.split(' '))
>>> a_embedding = Embedding(a_count)
>>> a_embedding.fit_transform(corpus)
We can now combine the two embeddings in one xgismo.
>>> xgismo = XGismo(a_embedding, w_embedding)
>>> xgismo.post_documents_item = lambda g, i: g.corpus[i].replace('_', ' ')
We can use xgismo to query keyword(s).
>>> success = xgismo.rank("Pagerank")
>>> xgismo.get_documents_by_rank()
['Mohamed Bouklit', 'Dohy Hong', 'The Dang Huynh']
We can use it to query researcher(s).
>>> success = xgismo.rank("Anne_Bouillard", y=False)
>>> xgismo.get_documents_by_rank()
['Anne Bouillard', 'Elie de Panafieu', 'Céline Comte', 'Thomas Deiß', 'Philippe Sehier', 'Dmitry Lebedev']
"""
def __init__(self, x_embedding=None, y_embedding=None, filename=None, path=".", **kwargs):
if filename is not None:
self.load(filename=filename, path=path)
else:
embedding = Embedding()
embedding.n = x_embedding.m
embedding.m = y_embedding.m
embedding.features = y_embedding.features
embedding.x = np.dot(x_embedding.y, y_embedding.x)
embedding.x_norm = np.ones(embedding.n)
embedding.y = np.dot(y_embedding.y, x_embedding.x)
embedding.y_norm = np.ones(embedding.m)
embedding.idf = y_embedding.idf
super().__init__(corpus=Corpus(x_embedding.features, to_text=lambda x: x), embedding=embedding, **kwargs)
self.x_projection = x_embedding.query_projection
self.y_projection = y_embedding.query_projection
[docs] def rank(self, query="", y=True, **kwargs):
"""
Runs the DIteration using query as starting point.
``query`` can be evaluated on features (``y=True``) or documents (``y=False``).
Parameters
----------
query: str
Text that starts DIteration
y: bool
Determines if query should be evaluated on features (``True``) or documents (``False``).
kwargs: dict, optional
Custom runtime parameters.
Returns
-------
success: bool
success of the query projection. If projection fails, a ranking on uniform distribution is performed.
"""
p = self.parameters(**kwargs)
if y:
z, found = self.y_projection(query)
offset = 1.0
else:
z, found = self.x_projection(query)
z = np.dot(z, self.embedding.x)
offset = 0.0
self.embedding._result_found = found
self.diteration(self.embedding.x, self.embedding.y, z,
alpha=p['alpha'], n_iter=p['n_iter'],
offset=offset, memory=p['memory'])
return found