Source code for minsketch.counter_sketch_hybrid

# -*- coding: utf-8 -*-
""" This module implements a best-of-both-worlds approach - as python Counter
objects are much faster than all count-min sketches implemented, but take
indefinite amounts of space, we create a hybrid - saving new data into a
Counter as it arrives, and batch-updating the count-min sketch.
"""
from collections import Counter
from itertools import izip

DEFAULT_BATCH_SIZE = 10 ** 4


[docs]class SketchCounterHybrid(object): """The hybrid object, as described in the module documentation. On initial benchmarking, using the default batch size, it provides a performance boost of about 10x over using the sketch alone (on ~1M words, from 88 seconds to 8), at the cost about 2.5x memory (208kb to 500kb). """ def __init__(self, sketch, batch_size=DEFAULT_BATCH_SIZE): """The sketch is initialized externally, but we expect a sketch that supports the most_common operation :param sketch: Expected to conform to the minsketch.TopNCountMinSketch interface :param batch_size: The batches in which to update the sketch """ self.sketch = sketch self.batch_size = batch_size self.counter = Counter() self.current_batch = 0 def _digest_counter(self): self.sketch.update(*zip(*self.counter.iteritems())) self.counter = Counter() self.current_batch = 0 def insert(self, item, count=1): self.counter[item] += count self.current_batch += count if self.current_batch > self.batch_size: self._digest_counter() def update(self, items, counts=None): if counts is None: [self.insert(item) for item in items] else: [self.insert(item, count) for item, count in izip(items, counts)] def get(self, item): counter_out = item in self.counter and self.counter.get(item) or 0 return counter_out + self.sketch.get(item) def most_common(self, k=None): self._digest_counter() return self.sketch.most_common(k)