Source code for gismo.landmarks

import numpy as np
import logging
from scipy.sparse import csr_matrix, vstack, hstack

from gismo.common import auto_k, toy_source_text
from gismo.parameters import Parameters, DEFAULT_LANDMARKS_PARAMETERS
from gismo.corpus import Corpus
from gismo.embedding import Embedding
from gismo.gismo import Gismo
from gismo.clustering import subspace_clusterize, Cluster, subspace_distortion
from gismo.post_processing import post_landmarks_item_raw, post_landmarks_cluster_json

logging.basicConfig()
log = logging.getLogger("Gismo")


[docs]def get_direction(reference, balance): """ Converts a reference object into a `n+m` direction (dense or sparse depending on reference type). Parameters ---------- reference: Gismo or Landmarks or Cluster or np.ndarray or csr_matrix. The object from which a direction will be extracted. balance: float in range [0.0, 1.0] The trade-off between documents and features. Set to 0.0, only the feature space will be used. Set to 1.0, only the document space will be used. Returns ------- np.ndarray or csr_matrix A `n+m` direction. """ if isinstance(reference, Gismo): return np.hstack([balance * reference.diteration.x_relevance, (1 - balance) * reference.diteration.y_relevance]) if isinstance(reference, Landmarks): return np.hstack([balance * reference.x_direction, (1 - balance) * reference.y_direction]) if isinstance(reference, Cluster): return reference.vector return reference
[docs]class Landmarks(Corpus): """ The `Landmarks` class is a subclass :py:class:`~gismo.corpus.Corpus`. It offers the capability to batch-rank all its entries against a :py:class:`~gismo.gismo.Gismo` instance. After it has been processed, a `Landmarks` can be used to analyze/classify :py:class:`~gismo.gismo.Gismo` queries, :py:class:`~gismo.clustering.Cluster`, or :py:class:`~gismo.landmarks.Landmarks`. Landmarks also offers the possibility to reduce a source or a gismo to its neighborhood. This can be useful if the source is huge and one wants something smaller for performance. Parameters ------------ source: list The list of items that form Landmarks. to_text: function The function that transforms an item into text kwargs: dict Custom default runtime parameters. You just need to specify the parameters that differ from :obj:`~gismo.parameters.DEFAULT_LANDMARKS_PARAMETERS`. Examples -------- Landmarks lean on a Gismo. We can use a toy Gismo to start with. >>> corpus = Corpus(toy_source_text) >>> embedding = Embedding() >>> embedding.fit_transform(corpus) >>> gismo = Gismo(corpus, embedding) >>> print(toy_source_text) # doctest: +NORMALIZE_WHITESPACE ['Gizmo is a Mogwaï.', 'This is a sentence about Blade.', 'This is another sentence about Shadoks.', 'This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.', 'In chinese folklore, a Mogwaï is a demon.'] Landmarks are constructed exactly like a Gismo object, with a source and a `to_text` function. >>> landmarks_source = [{'name': 'Movies', 'content': 'Star Wars, Gremlins, and Blade are movies.'}, ... {'name': 'Gremlins', 'content': 'The Gremlins movie features a Mogwai.'}, ... {'name': 'Star Wars', 'content': 'The Star Wars movies feature Yoda.'}, ... {'name': 'Shadoks', 'content': 'Shadoks is a French sarcastic show.'},] >>> landmarks = Landmarks(landmarks_source, to_text=lambda e: e['content']) The :func:`~gismo.landmarks.Landmarks.fit` method compute gismo queries for all landmarks and retain the results. >>> landmarks.fit(gismo) We run the request *Yoda* and look at the key landmarks. Note that *Gremlins* comes before *Star Wars*. This is actually *correct* in this small dataset: the word `Yoda` only exists in one sentence, which contains the words `Gremlins` and `Gizmo`. >>> success = gismo.rank('yoda') >>> landmarks.get_landmarks_by_rank(gismo) # doctest: +NORMALIZE_WHITESPACE [{'name': 'Gremlins', 'content': 'The Gremlins movie features a Mogwai.'}, {'name': 'Star Wars', 'content': 'The Star Wars movies feature Yoda.'}, {'name': 'Movies', 'content': 'Star Wars, Gremlins, and Blade are movies.'}] For better readibility, we set the item post_processing to return the `name` of a landmark item. >>> landmarks.post_item = lambda lmk, i: lmk[i]['name'] >>> landmarks.get_landmarks_by_rank(gismo) ['Gremlins', 'Star Wars', 'Movies'] The balance adjusts between documents and features spaces. A balance set to 1.0 focuses only on documents. >>> success = gismo.rank('blade') >>> landmarks.get_landmarks_by_rank(gismo, balance=1) ['Movies'] A balance set to 0.0 focuses only on features. For *blade*, this triggers *Shadoks* as a secondary result, because of the shared word *sentence*. >>> landmarks.get_landmarks_by_rank(gismo, balance=0) ['Movies', 'Shadoks'] Landmarks can be used to analyze landmarks. >>> landmarks.get_landmarks_by_rank(landmarks) ['Gremlins', 'Star Wars'] See again how balance can change things. Here a balance set to 0.0 (using only features) fully changes the results. >>> landmarks.get_landmarks_by_rank(landmarks, balance=0) ['Shadoks'] Like for :py:class:`~gismo.gismo.Gismo`, landmarks can provide clusters. >>> success = gismo.rank('gizmo') >>> landmarks.get_landmarks_by_cluster(gismo) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS {'landmark': {'name': 'Gremlins', 'content': 'The Gremlins movie features a Mogwai.'}, 'focus': 0.999998..., 'children': [{'landmark': {'name': 'Gremlins', 'content': 'The Gremlins movie features a Mogwai.'}, 'focus': 1.0, 'children': []}, {'landmark': {'name': 'Star Wars', 'content': 'The Star Wars movies feature Yoda.'}, 'focus': 1.0, 'children': []}, {'landmark': {'name': 'Movies', 'content': 'Star Wars, Gremlins, and Blade are movies.'}, 'focus': 1.0, 'children': []}]} We can set the `post_cluster` attribute to customize the output. Gismo provides a simple display. >>> from gismo.post_processing import post_landmarks_cluster_print >>> landmarks.post_cluster = post_landmarks_cluster_print >>> landmarks.get_landmarks_by_cluster(gismo) # doctest: +NORMALIZE_WHITESPACE F: 1.00. - Gremlins - Star Wars - Movies Like for :py:class:`~gismo.gismo.Gismo`, parameters like `k`, `distortion`, or `resolution` can be used. >>> landmarks.get_landmarks_by_cluster(gismo, k=4, distortion=False, resolution=.9) # doctest: +NORMALIZE_WHITESPACE F: 0.03. - F: 0.93. -- F: 1.00. --- Gremlins --- Star Wars -- Movies - Shadoks Note that a :py:class:`~gismo.clustering.Cluster` can also be used as reference for the :meth:`~gismo.landmarks.Landmarks.get_landmarks_by_rank` and :meth:`~gismo.landmarks.Landmarks.get_landmarks_by_cluster` methods. >>> cluster = landmarks.get_landmarks_by_cluster(gismo, post=False) >>> landmarks.get_landmarks_by_rank(cluster) ['Gremlins', 'Star Wars', 'Movies'] Yet, you cannot use anything as reference. For example, you cannot use a string as such. >>> landmarks.get_landmarks_by_rank("Landmarks do not use external queries (pass them to a gismo") # doctest.ELLIPSIS Traceback (most recent call last): ... TypeError: bad operand type for unary -: 'NoneType' Last but not least, landmarks can be used to reduce the size of a source or a :py:class:`~gismo.gismo.Gismo`. The reduction is controlled by the `x_density` attribute that tells the number of documents each landmark will allow to keep. >>> landmarks.parameters.x_density = 1 >>> reduced_gismo = landmarks.get_reduced_gismo(gismo) >>> reduced_gismo.corpus.source # doctest: +NORMALIZE_WHITESPACE ['This is another sentence about Shadoks.', 'This very long sentence, with a lot of stuff about Star Wars inside, makes at some point a side reference to the Gremlins movie by comparing Gizmo and Yoda.'] Side remark #1: in the constructor, `to_text` indicates how to convert an item to `str`, while `ranking_function` specifies how to run a query on a :py:class:`~gismo.gismo.Gismo`. Yet, it is possible to have the text conversion handled by the `ranking_function`. >>> landmarks = Landmarks(landmarks_source, rank=lambda g, q: g.rank(q['content'])) >>> landmarks.fit(gismo) >>> success = gismo.rank('yoda') >>> landmarks.post_item = lambda lmk, i: lmk[i]['name'] >>> landmarks.get_landmarks_by_rank(gismo) ['Star Wars', 'Movies', 'Gremlins'] However, this is bad practice. When you only need to customize the way an item is converted to text, you should stick to `to_text`. `ranking_function` is for more elaborated filters that require to change the default way gismo does queries. Side remark #2: if a landmark item query fails (its text does not intersect the gismo features), the default uniform projection will be used and a warning will be issued. This may yield to undesired results. >>> landmarks_source.append({'name': 'unrelated', 'content': 'unrelated.'}) >>> landmarks = Landmarks(landmarks_source, to_text=lambda e: e['content']) >>> landmarks.fit(gismo) >>> success = gismo.rank('gizmo') >>> landmarks.post_item = lambda lmk, i: lmk[i]['name'] >>> landmarks.get_landmarks_by_rank(gismo) ['Shadoks', 'unrelated'] """ def __init__(self, source=None, to_text=None, **kwargs): self.parameters = Parameters(parameter_list=DEFAULT_LANDMARKS_PARAMETERS, **kwargs) self.x_vectors = None self.y_vectors = None self.x_direction = None self.y_direction = None self.post_item = post_landmarks_item_raw self.post_cluster = post_landmarks_cluster_json super().__init__(source=source, to_text=to_text) def embed_entry(self, gismo, entry, **kwargs): p = self.parameters(**kwargs) log.debug(f"Processing {entry}.") success = p['rank'](gismo, entry) if not success: log.warning(f"Query {entry} didn't match any feature.") indptr = [0, min(p['y_density'], gismo.embedding.m)] indices = gismo.diteration.y_order[:p['y_density']] data = gismo.diteration.y_relevance[indices] y = csr_matrix((data, indices, indptr), shape=(1, gismo.embedding.m)) indptr = [0, min(p['x_density'], gismo.embedding.n)] indices = gismo.diteration.x_order[:p['x_density']] data = gismo.diteration.x_relevance[indices] x = csr_matrix((data, indices, indptr), shape=(1, gismo.embedding.n)) log.debug(f"Landmarks of {entry} computed.") return x / np.sum(x), y / np.sum(y)
[docs] def fit(self, gismo, **kwargs): """ Runs gismo queries on all landmarks. The relevance results are used to build two set of vectors: `x_vectors` is the vectors on the document space; `y_vectors` is the vectors on the document space. On each space, vectors are summed to build a direction, which is a sort of vector summary of the landmarks. Parameters ---------- gismo: Gismo The Gismo on which vectors will be computed. kwargs: dict Custom Landmarks runtime parameters. Returns ------- None """ log.info(f"Start computation of {len(self)} landmarks.") xy = [self.embed_entry(gismo, entry, **kwargs) for entry in self.iterate_text()] self.x_vectors = vstack([v[0] for v in xy]) self.x_direction = np.squeeze(np.asarray(np.sum(self.x_vectors, axis=0))) self.y_vectors = vstack([v[1] for v in xy]) self.y_direction = np.squeeze(np.asarray(np.sum(self.y_vectors, axis=0))) log.info(f"All landmarks are built.")
def get_base(self, balance): return csr_matrix(hstack([balance * self.x_vectors, (1 - balance) * self.y_vectors])) def get_landmarks_by_rank(self, reference, k=None, base=None, **kwargs): p = self.parameters(**kwargs) if base is None: base = self.get_base(p['balance']) direction = get_direction(reference, p['balance']) if isinstance(direction, np.ndarray): similarities = base.dot(direction) elif isinstance(direction, csr_matrix): similarities = np.squeeze(base.dot(direction.T).toarray()) else: log.error("Direction type not supported. Direction must be gismo.gismo.Gismo, gismo.clustering.Cluster, " "gismo.landmarks.Landmarks, numpy.ndarray or scipy.sparse.csr_matrix.") similarities = None order = np.argsort(-similarities) if k is None: k = auto_k(data=similarities, order=order, max_k=p['max_k'], target=p['target_k']) if p['post']: return [self.post_item(self, i) for i in order[:k]] else: return order[:k] def get_landmarks_by_cluster(self, reference, k=None, **kwargs): p = self.parameters(**kwargs) base = self.get_base(p['balance']) direction = get_direction(reference, p['balance']) order = self.get_landmarks_by_rank(reference=direction, k=k, base=base, target_k=p['target_k'], max_k=p['max_k'], balance=p['balance'], post=False) subspace = vstack([base[i, :] for i in order]) if p['distortion']>0: subspace_distortion(indices=subspace.indices, data=subspace.data, relevance=direction, distortion=p['distortion']) cluster = subspace_clusterize(subspace, resolution=p['resolution'], indices=order) if p['post']: return self.post_cluster(self, cluster) else: return cluster def get_reduced_source(self, gismo, rebuild=True): if rebuild: self.fit(gismo) reduced_indices = {i for i in self.x_vectors.indices} return [a for i, a in enumerate(gismo.corpus.source) if i in reduced_indices] def get_reduced_gismo(self, gismo, rebuild=True): reduced_corpus = Corpus(self.get_reduced_source(gismo, rebuild=rebuild), to_text=gismo.corpus.to_text) reduced_embedding = Embedding(vectorizer=gismo.embedding.vectorizer) reduced_embedding.fit_transform(reduced_corpus) reduced_gismo = Gismo(reduced_corpus, reduced_embedding) reduced_gismo.parameters = gismo.parameters return reduced_gismo