# Source code for gismo.diteration

from numba import njit
import numpy as np

from gismo.parameters import ALPHA, N_ITER, OFFSET, MEMORY


# Diffusion: the starting point is in the feature (words) space.
# Provides a ranking of documents (X) and features (Y).

[docs]
@njit
def jit_diffusion(
    x_pointers,
    x_indices,
    x_data,
    y_pointers,
    y_indices,
    y_data,
    z_indices,
    z_data,
    x_relevance,
    y_relevance,
    alpha,
    n_iter,
    offset: float,
    x_fluid,
    y_fluid,
):
    """
    Core diffusion engine written to be compatible with `Numba <https://numba.pydata.org/>`_.
    This is where the `DIteration <https://arxiv.org/pdf/1501.06350.pdf>`_
    algorithm is applied inline.

    The function returns nothing: ``x_relevance`` and ``y_relevance`` are
    accumulated in place (they are *not* reset here), while ``x_fluid`` and
    ``y_fluid`` are zeroed on entry and used as scratch space.

    Fluid is pushed entry by entry, so repeated indices inside a row (or inside
    the query) accumulate instead of overwriting each other.

    Parameters
    ----------
    x_pointers: :class:`~numpy.ndarray`
        Pointers of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    x_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    x_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of documents.
    y_pointers: :class:`~numpy.ndarray`
        Pointers of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    y_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    y_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of features.
    z_indices: :class:`~numpy.ndarray`
        Indices of the :class:`~scipy.sparse.csr_matrix` embedding of the query projection.
    z_data: :class:`~numpy.ndarray`
        Data of the :class:`~scipy.sparse.csr_matrix` embedding of the query projection.
    x_relevance: :class:`~numpy.ndarray`
        Placeholder for relevance of documents. Updated in place.
    y_relevance: :class:`~numpy.ndarray`
        Placeholder for relevance of features. Updated in place.
    alpha: float in range [0.0, 1.0]
        Damping factor. Controls the trade-off between closeness and centrality.
    n_iter: int
        Number of round-trip diffusions to perform. Higher value means better precision
        but longer execution time.
    offset: float in range [0.0, 1.0]
        Controls how much of the initial fluid should be deducted from the relevance.
    x_fluid: :class:`~numpy.ndarray`
        Placeholder for fluid on the side of documents. Overwritten.
    y_fluid: :class:`~numpy.ndarray`
        Placeholder for fluid on the side of features. Overwritten.
    """
    n = len(x_pointers) - 1
    m = len(y_pointers) - 1

    # Reset fluids
    x_fluid[:] = 0
    y_fluid[:] = 0
    for ind, data in zip(z_indices, z_data):
        y_relevance[ind] -= data * offset  # First round penalty
        # Accumulate (instead of assign) so that a query with repeated indices
        # seeds the same total fluid that the penalty above accounts for.
        y_fluid[ind] += data

    # Core diffusion: each round trip goes features -> documents -> features.
    for _ in range(n_iter):
        for j in range(m):
            f = y_fluid[j]
            y_fluid[j] = 0.0
            # Only strictly positive fluid is absorbed and forwarded.
            if f > 0:
                y_relevance[j] += f
                damped = f * alpha
                # Scalar loop rather than fancy-indexed `+=`: no temporary
                # arrays, and repeated indices in a row are summed correctly.
                for k in range(y_pointers[j], y_pointers[j + 1]):
                    x_fluid[y_indices[k]] += damped * y_data[k]
        for i in range(n):
            f = x_fluid[i]
            x_fluid[i] = 0.0
            if f > 0:
                x_relevance[i] += f
                damped = f * alpha
                for k in range(x_pointers[i], x_pointers[i + 1]):
                    y_fluid[x_indices[k]] += damped * x_data[k]

    # Don't waste the last drop of fluid, it's free!
    for j in range(m):
        y_relevance[j] += y_fluid[j]




[docs]
class DIteration:
    """
    Runs the `DIteration <https://arxiv.org/pdf/1501.06350.pdf>`_
    algorithm and keeps its results between calls.

    Parameters
    ----------
    n: int
        Number of documents.
    m: int
        Number of features.

    Attributes
    ----------
    x_relevance: :class:`~numpy.ndarray`
        Relevance of documents.
    y_relevance: :class:`~numpy.ndarray`
        Relevance of features.
    x_order: :class:`~numpy.ndarray`
        Indices of documents sorted by relevance.
    y_order: :class:`~numpy.ndarray`
        Indices of features sorted by relevance.
    """

    def __init__(self, n, m):
        """
        Allocate zeroed relevance and fluid buffers for ``n`` documents and
        ``m`` features. Orders stay ``None`` until the first diffusion.
        """
        # Relevance vectors are public results.
        self.x_relevance = np.zeros(n)
        self.y_relevance = np.zeros(m)
        # Orderings are only known once a diffusion has been performed.
        self.x_order = None
        self.y_order = None
        # Private scratch buffers handed to the diffusion engine.
        self._x_fluid = np.zeros(n)
        self._y_fluid = np.zeros(m)

    def __call__(
        self, x, y, z, alpha=ALPHA, n_iter=N_ITER, offset: float = OFFSET, memory=MEMORY
    ):
        """
        Run one diffusion and refresh the relevance / order vectors in place.
        Nothing is returned; results are read from the instance attributes.

        Parameters
        ----------
        x: :class:`~scipy.sparse.csr_matrix`
            Embedding of documents in feature space.
        y: :class:`~scipy.sparse.csr_matrix`
            Embedding of features in document space.
        z: :class:`~scipy.sparse.csr_matrix`
            Embedding of query in feature space.
        alpha: float in range [0.0, 1.0]
            Damping factor. Controls the trade-off between closeness and centrality.
        n_iter: int
            Number of round-trip diffusions to perform. Higher value means better precision
            but longer execution time.
        offset: float in range [0.0, 1.0]
            Controls how much of the initial fluid should be deducted from the relevance.
        memory: float in range [0.0, 1.0]
            Controls how much of previous computation is kept
            when performing a new diffusion.
        """
        doc_scores = self.x_relevance
        feature_scores = self.y_relevance

        # Fade out what previous diffusions left behind (in place, same arrays).
        doc_scores *= memory
        feature_scores *= memory

        jit_diffusion(
            x_pointers=x.indptr,
            x_indices=x.indices,
            x_data=x.data,
            y_pointers=y.indptr,
            y_indices=y.indices,
            y_data=y.data,
            z_indices=z.indices,
            z_data=z.data,
            x_relevance=doc_scores,
            y_relevance=feature_scores,
            alpha=alpha,
            n_iter=n_iter,
            offset=offset,
            x_fluid=self._x_fluid,
            y_fluid=self._y_fluid,
        )

        # Negate before sorting so that the most relevant entries come first.
        self.x_order = (-doc_scores).argsort()
        self.y_order = (-feature_scores).argsort()