Source code for minsketch.hash_strategy

# -*- coding: utf-8 -*-
"""Implements two basic hashing strategies - a universal hashing function as
described by Cormen et al., and a hash-pair strategy as discussed by Kirsch and
Mitzenmacher (2008).
"""
import random

ARBITRARY_LARGE_PRIME_NUMBER = 4294967291  # Largest 32 bit prime


[docs]class UniversalHashFunctionGenerator(object): r"""Implementation of a universal hash function family generator, as described in Cormen et al.'s Introduction to Algorithms. The sets are used to guarantee independence between the hash functions in this family. For a given prime number p, and a width of table m, functions are constructed as :math:`f(x)=((ax + b)\mod p)\mod m, \ a \in \{1, 2, ..., p - 1\}, \ b \in \{0, 1, ..., p - 1\}` """ def __init__(self, m): self.a_set = {None, } self.b_set = {None, } self.m = m def __call__(self, *args, **kwargs): a = None b = None while a in self.a_set and b in self.b_set: a = random.randint(1, ARBITRARY_LARGE_PRIME_NUMBER - 1) b = random.randint(0, ARBITRARY_LARGE_PRIME_NUMBER - 1) self.a_set.add(a) self.b_set.add(b) return lambda x: ((a * x + b) % ARBITRARY_LARGE_PRIME_NUMBER) % self.m
[docs]class NaiveHashingStrategy(object): """An implementation of the basic hashing strategy, using a different member of a universal function for each row """ def __init__(self, depth, width, hash_gen=None): self.depth = depth self.width = width if hash_gen is None: hash_gen = UniversalHashFunctionGenerator(width) self.hashes = [hash_gen() for _ in range(depth)] def __call__(self, item): item = hash( item) # This handles strings well while returning itself for ints return [self.hashes[i](item) for i in range(self.depth)]
[docs]class DoubleHashingStrategy(object): r"""A single hash-pair count-min hashing scheme, based on Kirsch & Mitzenmacher (2008): https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf Each item (x) is hashed as: :math:`h_1(x) + j * h_2(x) \ \forall \ j \in \{0, 1, ..., d - 1\}` """ def __init__(self, depth, width, hash_gen=None): self.depth = depth self.width = width if hash_gen is None: hash_gen = UniversalHashFunctionGenerator( ARBITRARY_LARGE_PRIME_NUMBER) self.hash_gen = hash_gen self.first_hash = hash_gen() self.second_hash = hash_gen() def __call__(self, item): item = hash( item) # This handles strings well while returning itself for ints first = self.first_hash(item) second = self.second_hash(item) return [(first + j * second) % self.width for j in range(self.depth)]