Source code for minsketch.least_squares_sketch

# -*- coding: utf-8 -*-
"""A least-squares estimator for count-min sketches, as introduced by Lee, Lui,
Yoon, & Zhang:
https://www.usenix.org/legacy/event/imc05/tech/full_papers/lee/lee.pdf
"""
from itertools import izip

import numpy

import count_min_sketch


[docs]class LeastSquaresTopNSketch(count_min_sketch.TopNCountMinSketch):
    """The least-squares top-N sketch only overrides the most common item
    finding heuristic:
    """

[docs]    def most_common(self, k=None):
        r"""Generate a better estimate for the top k most-common entries using
        the least squares estimator. We always solve the solution for the
        entire top n, and return the first k, to allow for more accuracy if
        only a subset is wanted (as the rest end up being used as noise
        variables).

        Formally, we construct :math:`\vec{b}` as a concatenation of the tables
        rows, implemented by table.to_vector(). The size of :math:`\vec{b}` is
        thus (table.depth * table.width) x 1.

        We construct :math:`A` as having the same number or rows as
        :math:`\vec{b}`, and having :math:`k + 1` columns, using the following
        values:

        for :math:`i \in \{0, 1, ..., \text{table.width} - 1\}`
        and :math:`j \in \{0, 1, ..., \text{table.depth} - 1 \}`,

        we set \ :math:`A_{\text{table.depth} * i + j + 1, l} = 1` if either
        the item :math:`x_l` is hashed into :math:`T[i][j]`, or if
        :math:`l = \text{table.width} + 1`, and otherwise we leave it as 0.

        We then solve using least-squares estimation (using the pseudoinverse)
        :math:`A\vec{x}=\vec{b}`, sort the results in a descending fashion
        (discarding the noise), and report the top k entries.

        :param k: The number of top results to return - <= to n
        :return: The least-squares estimate for how often these top k appeared
        """
        if k is None or k > self.n:
            k = self.n

        b_vector = self.table.to_vector()
        a_matrix = numpy.zeros(
            (self.table.depth * self.table.width, self.n + 1))

        largest = [item for _, item in self.heap.n_largest(self.n)]
        largest.reverse()

        for l in range(len(self.top_n)):
            item = largest[l]
            for table_index, hash_index in izip(
                    range(self.table.depth), self.hash(item)):
                a_matrix[table_index * self.table.width + hash_index, l] = 1

        # Set the last column to be entirely ones
        a_matrix[:, self.n] = 1

        # Solve the equation, removing the error term
        x_vector = numpy.linalg.lstsq(a_matrix, b_vector)[0][:-1]

        new_values = zip(largest, [int(x) for x in x_vector])
        new_values.sort(key=lambda pair: pair[1], reverse=True)
        return new_values[:k]