Source code for minsketch.double_hashing

# -*- coding: utf-8 -*-
"""Two implementations of ideas from "Less Hashing, Same Performance: Building
a Better Bloom Filter," by Kirsch and Mitzenmacher (2008):
https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf
"""
import math
from gmpy2 import next_prime

import count_min_sketch
import hash_strategy
import lossy_strategy
import sketch_tables
import update_strategy


[docs]class HashPairCMSketch(count_min_sketch.TopNCountMinSketch): r"""A single hash-pair count-min sketch, based on Kirsch & Mitzenmacher (2008). This essentially ignores the epsilon parameter, and provides a robust implementation of a count-min sketch using only two hash functions. """ def __init__(self, delta, epsilon, hash_gen=None, n=count_min_sketch.DEFAULT_N, table_class=sketch_tables.ListBackedSketchTable, update_strategy=update_strategy.NaiveUpdateStrategy, lossy_strategy=lossy_strategy.NoLossyUpdateStrategy): """Most parameters conform to TopNCountMinSketch, except for the delta and hash_gen: :param delta: How often (on expectation) we accept an error. Ignored here, but received to conform to the same initialization signature of the rest of the count-min sketches. :param epsilon: What margin (on expectation) we accept for an error :param hash_gen: We allow receiving a hash generator to be able to share them between different instances, if we want to guarantee their independence. :param n: The top-N items to track. :param table_class: The class of table to use in backing this CM-sketch :param update_strategy: The updating strategy to use (currently naive or convservative) :param lossy_strategy: The lossy window strategy to use (naive, no threshold, threshold of 1, threshold of window count, threshold of the square root of the window count) """ width = next_prime(int(math.ceil(2 * math.e / epsilon))) depth = int( math.ceil(math.log(1.0 / (epsilon - epsilon / (2 * math.e**2))))) print('The effective delta is {delta}'.format(delta=math.exp(-1 * depth))) if hash_gen is None: hash_gen = hash_strategy.UniversalHashFunctionGenerator( hash_strategy.ARBITRARY_LARGE_PRIME_NUMBER) super(HashPairCMSketch, self).__init__( delta, epsilon, depth, width, n, table_class, hash_strategy.DoubleHashingStrategy(depth, width, hash_gen), update_strategy, lossy_strategy)
[docs]class MultiHashPairTopNCMSketch(count_min_sketch.TopNCountMinSketch): r"""From the previously referenced paper: "Given such a result, it is straightforward to obtain a variation that uses :math:`2\frac{\ln \frac{1}{\delta}}{\ln \frac{1}{\epsilon}}` pairwise independent hash functions and achieves the desired failure probability :math:`\delta`: simply build :math:`2\frac{\ln \frac{1}{\delta}}{\ln \frac{1}{\epsilon}}` independent copies of this data structure, and always answer a point query with the minimum estimate given by one of those copies." I believe the second 2 is a mistake - so I only build :math:`\frac{\ln \frac{1}{\delta}}{\ln \frac{1}{\epsilon}}` rounded up sketches. """ def __init__(self, delta, epsilon, n=count_min_sketch.DEFAULT_N, table_class=sketch_tables.ListBackedSketchTable, update_strategy=update_strategy.NaiveUpdateStrategy, lossy_strategy=lossy_strategy.NoLossyUpdateStrategy): self._init_top_n(n) self.count = int( math.ceil(math.log(1.0 / delta) / math.log(1.0 / epsilon))) hash_gen = hash_strategy.UniversalHashFunctionGenerator( hash_strategy.ARBITRARY_LARGE_PRIME_NUMBER) self.sketches = [HashPairCMSketch( delta, epsilon, hash_gen, table_class=table_class, update_strategy=update_strategy, lossy_strategy=lossy_strategy) for _ in range(self.count)]
[docs] def insert(self, item, count=1): new_min_count = min( [sketch.insert(item, count) for sketch in self.sketches]) self._update_top_n(item, new_min_count) return new_min_count
[docs] def get(self, item): return min([sketch.get(item) for sketch in self.sketches])