Source code for minsketch.count_mean_sketch

# -*- coding: utf-8 -*-
"""This model implements count-mean-min sketches, as described by Goyal and
Daumé (2011):
http://www.aclweb.org/anthology/D12-1100

The design here is a little iffy - there's a fair amount of code duplication
between the two classes. Perhaps a mixin would fit better, or changing the
abstractions.
"""
from itertools import izip

from numpy import median

import count_min_sketch
import double_hashing


# TODO: Re-consider design here at some point - Mixins? Different abstraction?


[docs]class CountMeanMinSketch(count_min_sketch.TopNCountMinSketch):
    """A count-mean-min sketch, based on the regular TopNCountMinSketch.

    The two main changes include using error-estimates from the updater
    (regular or conservative), and returning min between the minimum of the
    regularly queried values, and the median of the values after accounting
    for the error estimates.
    """

[docs]    def get(self, item):
        hashes = self.hash(item)
        values = [self.table.get(i, hashes[i])
                  for i in range(self.table.depth)]
        error_estimates = self.updater.count_mean_error_estimates(self.table,
                                                                  values)
        return min(
            median(
                sorted([value - error
                        for value, error in izip(values, error_estimates)])),
            min(values))

[docs]    def most_common(self, k=None):
        """Re-query the top-n keys using the count-mean query algorithm, and
        then return the top k of the re-queried results
        :param k: The number of top items to return, at most n, as this
        function was initialized
        :return: The top k items observed by this CM sketch, and how often they
        were seen
        """
        mean_values = [(key, self.get(key)) for _, key in self.top_n]
        mean_values.sort(key=lambda pair: pair[1], reverse=True)

        if k is None or k > self.n:
            k = self.n

        return mean_values[:k]


[docs]class HashPairCountMeanMinSketch(double_hashing.HashPairCMSketch):
    """A count-mean-min sketch, based on the HashPair CM SKETCH.

        The two main changes include using error-estimates from the updater
        (regular or conservative), and returning min between the minimum of
        the regularly queried values, and the median of the values after
        accounting for the error estimates.
        """

[docs]    def get(self, item):
        hashes = self.hash(item)
        values = [self.table.get(i, hashes[i])
                  for i in range(self.table.depth)]
        error_estimates = self.updater.count_mean_error_estimates(self.table,
                                                                  values)
        return min(
            median(
                sorted([value - error
                        for value, error in izip(values, error_estimates)])),
            min(values))

[docs]    def most_common(self, k=None):
        """Re-query the top-n keys using the count-mean query algorithm,
        and then return the top k of the re-queried results
        :param k: The number of top items to return, at most n, as this
        function was initialized
        :return: The top k items observed by this CM sketch, and how often they
        were seen
        """
        mean_values = [(key, self.get(key)) for _, key in self.top_n]
        mean_values.sort(key=lambda pair: pair[1], reverse=True)

        if k is None or k > self.n:
            k = self.n

        return mean_values[:k]