Source code for minsketch.update_strategy

# -*- coding: utf-8 -*-
"""An implementation of naive and conservative update strategies.
Each update strategy also implements an error-estimate, used by count-min-mean.
"""
from itertools import izip


[docs]class NaiveUpdateStrategy(object):
    """ This could clearly be a function, rather than a class. Why make it a
    class, then? Two reasons:
    1) Compatibility with other strategies, all of whom are classes
    2) In the event I'll want to implement stateful updating strategies in the
    future
    3) I already ended up using its class-ness, to couple error estimation
    strategies here.
    """

    def __call__(self, table, hashes, count):
        """The most naive possible updating strategy - always update by
        everything
        :param table: The table being updated
        :param hashes: The hashes of the current item, each per row in the
        table
        :param count: The count to increment by
        :return: The new minimal count for this item being updated
        """
        return min(
            [table.increment(i, hashes[i], count) for i in range(table.depth)])

[docs]    def count_mean_error_estimates(self, table, values):
        """Provide an error estimate, using the total insertions into the table
        as a baseline
        :param table: The table being updated
        :param values: Values of the item we return error estimates for
        :return: Error estimates for each of these values
        """
        baselines = [table.total] * table.depth
        return self._error_estimates(table, values, baselines)

    def _error_estimates(self, table, values, baselines):
        return [(baseline - value) / (table.width - 1)
                for value, baseline in izip(values, baselines)]


[docs]class ConservativeUpdateStrategy(NaiveUpdateStrategy):
    """A conservative update strategy, as introduced by Cormode (2009) and
    Goyal (2010):
    http://www.cs.utah.edu/~amitg/papers/goyal10GEMS.pdf
    At every point, only update the minimal number of items.

    As for why this is a class, see the class-comment for NaiveUpdateStrategy
    """

    def __call__(self, table, hashes, count):
        """
        This only supports positive counts - with negative counts, this
        approach is insensible, and might lead to unexpected results.
        :param table: The table being updated
        :param hashes: The hashes of the current item, each per row in the
        table
        :param count: The count to increment by
        :return: The new minimal count for this item being updated
        """
        if 0 > count:
            raise ValueError(
                'Conservative updating does not support negative counts')

        current_values = [table.get(i, hashes[i]) for i in range(table.depth)]
        new_current_min = min(current_values) + count
        [table.set(i, hashes[i], new_current_min)
         for i, value in izip(range(table.depth), current_values)
         if value < new_current_min]

        return new_current_min

[docs]    def count_mean_error_estimates(self, table, values):
        """Error estimates for the count-min-mean, but in this case, we used
        the row-wise sum as an estimator for the entry in that row, rather than
        the entire total.
        :param table: The table being updated
        :param values: Values of the item we return error estimates for
        :return: Error estimates for each of these values
        """
        baselines = [sum(row) for row in table.row_iter()]
        return self._error_estimates(table, values, baselines)