Source code for gismo.diteration

from numba import njit
import numpy as np

from gismo.parameters import ALPHA, N_ITER, OFFSET, MEMORY


# diffusion: starting point is on the feature (words) space
# provides ranking on documents (X) and features (Y)
@njit
def jit_diffusion(x_pointers, x_indices, x_data,
                  y_pointers, y_indices, y_data,
                  z_indices, z_data,
                  x_relevance, y_relevance,
                  alpha, n_iter, offset: float,
                  x_fluid, y_fluid):
    """
    Core diffusion engine written to be compatible with `Numba <https://numba.pydata.org/>`_.
    This is where the `DIteration <https://arxiv.org/pdf/1501.06350.pdf>`_ algorithm is applied inline.

    The function returns nothing: ``x_relevance`` and ``y_relevance`` are updated in place
    (the diffusion is *added* to whatever they already contain), while ``x_fluid`` and
    ``y_fluid`` are reset to zero on entry and used as scratch space.

    Parameters
    ----------
    x_pointers: :class:`~numpy.ndarray`
        Pointers of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    x_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    x_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    y_pointers: :class:`~numpy.ndarray`
        Pointers of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    y_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    y_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    z_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of the query projection.
    z_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of the query projection.
    x_relevance: :class:`~numpy.ndarray`
        Placeholder for relevance of documents.
    y_relevance: :class:`~numpy.ndarray`
        Placeholder for relevance of features.
    alpha: float in range [0.0, 1.0]
        Damping factor. Controls the trade-off between closeness and centrality.
    n_iter: int
        Number of round-trip diffusions to perform. Higher value means better precision
        but longer execution time.
    offset: float in range [0.0, 1.0]
        Controls how much of the initial fluid should be deducted from the relevance.
    x_fluid: :class:`~numpy.ndarray`
        Placeholder for fluid on the side of documents.
    y_fluid: :class:`~numpy.ndarray`
        Placeholder for fluid on the side of features.
    """
    n = len(x_pointers) - 1
    m = len(y_pointers) - 1

    # Reset fluids
    x_fluid[:] = 0
    y_fluid[:] = 0

    # Inject the query on the feature side.
    for ind, data in zip(z_indices, z_data):
        y_relevance[ind] -= data * offset  # First round penalty
        y_fluid[ind] = data

    # Core diffusion
    for _ in range(n_iter):
        # Features -> documents
        for j in range(m):
            f = y_fluid[j]
            y_fluid[j] = 0.0
            if f > 0:
                y_relevance[j] += f
                damped = f * alpha
                # Explicit scatter: unlike a fancy-indexed ``+=``, this accumulates
                # every stored entry even when an index is repeated, and it does not
                # allocate temporary arrays.
                for p in range(y_pointers[j], y_pointers[j + 1]):
                    x_fluid[y_indices[p]] += damped * y_data[p]
        # Documents -> features
        for i in range(n):
            f = x_fluid[i]
            x_fluid[i] = 0.0
            if f > 0:
                x_relevance[i] += f
                damped = f * alpha
                for p in range(x_pointers[i], x_pointers[i + 1]):
                    y_fluid[x_indices[p]] += damped * x_data[p]

    # Don't waste the last drop of fluid, it's free!
    for j in range(m):
        y_relevance[j] += y_fluid[j]
class DIteration:
    """
    Stateful wrapper around the `DIteration <https://arxiv.org/pdf/1501.06350.pdf>`_ algorithm.

    An instance owns the relevance vectors and the scratch fluid buffers, so that
    successive calls can reuse (and partially remember) previous results.

    Parameters
    ----------
    n: int
        Number of documents.
    m: int
        Number of features.

    Attributes
    ----------
    x_relevance: :class:`~numpy.ndarray`
        Relevance of documents.
    y_relevance: :class:`~numpy.ndarray`
        Relevance of features.
    x_order: :class:`~numpy.ndarray`
        Indices of documents sorted by relevance.
    y_order: :class:`~numpy.ndarray`
        Indices of features sorted by relevance.
    """

    def __init__(self, n, m):
        self.x_relevance, self.y_relevance = np.zeros(n), np.zeros(m)
        # Orders are only available once the instance has been called.
        self.x_order = self.y_order = None
        # Scratch buffers handed to the diffusion engine.
        self._x_fluid, self._y_fluid = np.zeros(n), np.zeros(m)

    def __call__(self, x, y, z, alpha=ALPHA, n_iter=N_ITER, offset: float = OFFSET, memory=MEMORY):
        """
        Run one diffusion and refresh the relevance / order vectors.

        Parameters
        ----------
        x: :class:`~scipy.sparse.csr_matrix`
            Embedding of documents in feature space.
        y: :class:`~scipy.sparse.csr_matrix`
            Embedding of features in document space.
        z: :class:`~scipy.sparse.csr_matrix`
            Embedding of query in feature space.
        alpha: float in range [0.0, 1.0]
            Damping factor. Controls the trade-off between closeness and centrality.
        n_iter: int
            Number of round-trip diffusions to perform. Higher value means better precision
            but longer execution time.
        offset: float in range [0.0, 1.0]
            Controls how much of the initial fluid should be deducted from the relevance.
        memory: float in range [0.0, 1.0]
            Controls how much of previous computation is kept when performing a new diffusion.
        """
        # Fade out what previous calls left behind before adding the new diffusion.
        for relevance in (self.x_relevance, self.y_relevance):
            relevance *= memory

        jit_diffusion(
            x_pointers=x.indptr, x_indices=x.indices, x_data=x.data,
            y_pointers=y.indptr, y_indices=y.indices, y_data=y.data,
            z_indices=z.indices, z_data=z.data,
            x_relevance=self.x_relevance, y_relevance=self.y_relevance,
            alpha=alpha, n_iter=n_iter, offset=offset,
            x_fluid=self._x_fluid, y_fluid=self._y_fluid,
        )

        # Most relevant first: sort on the negated scores.
        self.x_order = (-self.x_relevance).argsort()
        self.y_order = (-self.y_relevance).argsort()