Skip to content

Commit

Permalink
Fix slow iteration, add string representation
Browse files Browse the repository at this point in the history
  • Loading branch information
Ezibenroc committed Jan 20, 2024
1 parent a1b3a70 commit 176d346
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 79 deletions.
160 changes: 82 additions & 78 deletions pyroaring/abstract_bitmap.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,59 @@ cdef croaring.roaring_bitmap_t *deserialize_ptr(char *buff):
ptr = croaring.roaring_bitmap_portable_deserialize(buff)
return ptr

def _string_rep(bm):
    """
    Build the textual representation shared by the bitmap classes.

    The result looks like ``ClassName([v0, v1, ...])``. When the values do not
    fit on one line they are laid out as a right-aligned table whose numeric
    part is at most 80 characters wide. When the bitmap holds more than 500
    elements, only the first and last 5 rows are rendered, separated by a
    ``...`` line.

    ``bm`` must provide ``len()``, a ``max()`` method raising ``ValueError``
    when empty, and slicing by position (``bm[a:b]``) yielding its values.
    """
    truncate_threshold = 500  # cardinality above which the output is truncated
    table_max_width = 80  # width of the numeric part only, not of the whole output
    num_lines_if_skipping = 5  # rows shown at the beginning and at the end when truncating

    head = bm.__class__.__name__ + '(['
    row_start_buffer = ' ' * len(head)
    tail = '])'

    try:
        maxval = bm.max()
    except ValueError:
        # empty bitmap
        return head + tail

    count = len(bm)
    skip_rows = count > truncate_threshold

    element_max_length = len(str(maxval))
    column_width = element_max_length + 2  # room for the ', ' separator
    # Always keep at least one column, even if a single value is wider than the table.
    num_columns = max(1, table_max_width // column_width)

    # Integer ceiling division: going through float() loses precision for
    # cardinalities above 2**53 and could drop the last row.
    num_rows = (count + num_columns - 1) // num_columns

    rows = []
    row_idx = 0
    skipped = False
    while row_idx < num_rows:
        row_ints = bm[row_idx * num_columns:(row_idx + 1) * num_columns]

        if num_rows == 1:
            # no padding if all numbers fit on a single line
            cells = [str(i) for i in row_ints]
        else:
            cells = [str(i).rjust(element_max_length) for i in row_ints]

        prefix = head if row_idx == 0 else row_start_buffer
        rows.append(prefix + ', '.join(cells) + ',')
        row_idx += 1
        if skip_rows and not skipped and row_idx >= num_lines_if_skipping:
            # Insert the ellipsis once, then jump straight to the trailing rows.
            rows.append((' ' * ((table_max_width + len(head)) // 2)) + '...')
            skipped = True
            row_idx = num_rows - num_lines_if_skipping

    rows[-1] = rows[-1].rstrip(',')  # remove trailing comma from the last line
    return '\n'.join(rows) + tail

cdef class AbstractBitMap:
"""
An efficient and light-weight ordered set of 32 bits integers.
Expand Down Expand Up @@ -233,57 +286,7 @@ cdef class AbstractBitMap:
return str(self)

def __str__(self):
    """
    Return the textual listing of the bitmap, e.g. ``BitMap([1, 2, 3])``.

    The formatting is delegated to the module-level ``_string_rep`` helper,
    so it is not duplicated in each bitmap class.
    """
    return _string_rep(self)

def flip(self, uint64_t start, uint64_t end):
"""
Expand Down Expand Up @@ -645,20 +648,19 @@ cdef class AbstractBitMap:
return elt

cdef _get_slice(self, sl):
"""For a faster computation, four different methods, depending on the slice."""
"""For a faster computation, different methods, depending on the slice."""
start, stop, step = sl.indices(len(self))
sign = 1 if step > 0 else -1
if (sign > 0 and start >= stop) or (sign < 0 and start <= stop):
if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): # empty chunk
return self.__class__()
r = range(start, stop, step)
assert len(r) > 0
if abs(step) == 1:
first_elt = self._get_elt(start)
last_elt = self._get_elt(stop-sign)
values = range(first_elt, last_elt+sign, step)
result = self & self.__class__(values, copy_on_write=self.copy_on_write)
return result
elif len(r) < len(self) / 100: # TODO find a good threshold for performance?
first_elt = self._get_elt(start)
last_elt = self._get_elt(stop-sign)
values = range(first_elt, last_elt+sign, step)
if abs(step) == 1 and len(values) <= len(self) / 100: # contiguous and small chunk of the bitmap
return self & self.__class__(values)
else: # generic case
if step < 0:
start = r[-1]
stop = r[0] + 1
Expand All @@ -667,8 +669,6 @@ cdef class AbstractBitMap:
start = r[0]
stop = r[-1] + 1
return self._generic_get_slice(start, stop, step)
else:
return self.__class__(self.to_array()[sl]) # could be more efficient...

cdef _generic_get_slice(self, uint32_t start, uint32_t stop, uint32_t step):
"""Assume that start, stop and step > 0 and that the result will not be empty."""
Expand Down Expand Up @@ -818,7 +818,7 @@ cdef class AbstractBitMap64:
>>> BitMap64()
BitMap64([])
>>> BitMap64([1, 123456789, 27])
BitMap([1, 27, 123456789])
BitMap64([1, 27, 123456789])
"""

cdef from_ptr(self, croaring.roaring64_bitmap_t *ptr) noexcept:
Expand Down Expand Up @@ -849,7 +849,6 @@ cdef class AbstractBitMap64:
return croaring.roaring64_bitmap_get_cardinality(self._c_bitmap)

def __richcmp__(self, other, int op):
self._check_compatibility(other)
if op == 0: # <
return croaring.roaring64_bitmap_is_strict_subset((<AbstractBitMap64?>self)._c_bitmap, (<AbstractBitMap64?>other)._c_bitmap)
elif op == 1: # <=
Expand Down Expand Up @@ -923,6 +922,12 @@ cdef class AbstractBitMap64:
finally:
croaring.roaring64_iterator_free(iterator)

def __repr__(self):
    """Return the same text as ``str(self)``."""
    representation = str(self)
    return representation

def __str__(self):
    """Return the textual listing of the bitmap, as built by ``_string_rep``."""
    text = _string_rep(self)
    return text

def min(self):
"""
Return the minimum element of the bitmap.
Expand Down Expand Up @@ -997,20 +1002,19 @@ cdef class AbstractBitMap64:
return elt

cdef _get_slice(self, sl):
"""For a faster computation, four different methods, depending on the slice."""
"""For a faster computation, different methods, depending on the slice."""
start, stop, step = sl.indices(len(self))
sign = 1 if step > 0 else -1
if (sign > 0 and start >= stop) or (sign < 0 and start <= stop):
if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): # empty chunk
return self.__class__()
r = range(start, stop, step)
assert len(r) > 0
if abs(step) == 1:
first_elt = self._get_elt(start)
last_elt = self._get_elt(stop-sign)
values = range(first_elt, last_elt+sign, step)
result = self & self.__class__(values)
return result
else:
first_elt = self._get_elt(start)
last_elt = self._get_elt(stop-sign)
values = range(first_elt, last_elt+sign, step)
if abs(step) == 1 and len(values) <= len(self) / 100: # contiguous and small chunk of the bitmap
return self & self.__class__(values)
else: # generic case
if step < 0:
start = r[-1]
stop = r[0] + 1
Expand Down Expand Up @@ -1098,7 +1102,7 @@ cdef class AbstractBitMap64:
>>> BitMap64([1, 2]).isdisjoint(BitMap64([3, 4]))
True
>>> BitMap64([1, 2, 3]).BitMap64(BitMap([3, 4]))
>>> BitMap64([1, 2, 3]).isdisjoint(BitMap64([3, 4]))
False
"""
Expand All @@ -1108,7 +1112,7 @@ cdef class AbstractBitMap64:
"""
Report whether another set contains this set.
>>> BitMap64([1, 2]).BitMap64(BitMap([1, 2, 3, 4]))
>>> BitMap64([1, 2]).issubset(BitMap64([1, 2, 3, 4]))
True
>>> BitMap64([1, 2]).issubset(BitMap64([3, 4]))
Expand All @@ -1121,10 +1125,10 @@ cdef class AbstractBitMap64:
"""
Report whether this set contains another set.
>>> BitMap64([1, 2, 3, 4]).BitMap64(BitMap([1, 2]))
>>> BitMap64([1, 2, 3, 4]).issuperset(BitMap64([1, 2]))
True
>>> BitMap64([1, 2]).BitMap64(BitMap([3, 4]))
>>> BitMap64([1, 2]).issuperset(BitMap64([3, 4]))
False
"""
Expand All @@ -1137,7 +1141,7 @@ cdef class AbstractBitMap64:
(i.e. all elements that are in this set but not the others.)
>>> BitMap64.difference(BitMap64([1, 2, 3]), BitMap64([2, 20]), BitMap64([3, 30]))
BitMap([1])
BitMap64([1])
"""
size = len(bitmaps)
Expand Down
14 changes: 13 additions & 1 deletion pyroaring/bitmap.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -261,4 +261,16 @@ cdef class BitMap(AbstractBitMap):
croaring.roaring_bitmap_remove_range(self._c_bitmap, range_start, range_end)

cdef class BitMap64(AbstractBitMap64):
    """Bitmap of 64-bit unsigned integers that supports in-place insertion."""

    def add(self, uint64_t value):
        """
        Add an element to the bitmap. This has no effect if the element is already present.

        >>> bm = BitMap64()
        >>> bm.add(42)
        >>> bm
        BitMap64([42])
        >>> bm.add(42)
        >>> bm
        BitMap64([42])
        """
        croaring.roaring64_bitmap_add(self._c_bitmap, value)

0 comments on commit 176d346

Please sign in to comment.