From 176d3461be7cd7f11500664c2e9e9fd1e54cfcd8 Mon Sep 17 00:00:00 2001 From: Tom Cornebize Date: Sat, 20 Jan 2024 23:17:35 +0100 Subject: [PATCH] Fix slow iteration, add string representation --- pyroaring/abstract_bitmap.pxi | 160 +++++++++++++++++----------------- pyroaring/bitmap.pxi | 14 ++- 2 files changed, 95 insertions(+), 79 deletions(-) diff --git a/pyroaring/abstract_bitmap.pxi b/pyroaring/abstract_bitmap.pxi index d4536f4..16b86b7 100644 --- a/pyroaring/abstract_bitmap.pxi +++ b/pyroaring/abstract_bitmap.pxi @@ -17,6 +17,59 @@ cdef croaring.roaring_bitmap_t *deserialize_ptr(char *buff): ptr = croaring.roaring_bitmap_portable_deserialize(buff) return ptr +def _string_rep(bm): + skip_rows = len(bm) > 500 #this is the cutoff number for the truncating to kick in. + table_max_width = 80 # this isn't the length of the entire output, it's only for the numeric part + num_lines_if_skipping = 5 # the number of lines to show in the beginning and the end when output is being truncated + + head = bm.__class__.__name__ + '([' + row_start_buffer = ' ' * len(head) + tail = '])' + + try: + maxval = bm.max() + except ValueError: + # empty bitmap + return head + tail + + element_max_length = len(str(maxval)) + column_width = element_max_length + 2 + + num_columns = table_max_width // column_width + + num_rows = len(bm) / float(num_columns) + if not num_rows.is_integer(): + num_rows += 1 + num_rows = int(num_rows) + rows = [] + row_idx = 0 + skipped = False + while row_idx < num_rows: + row_ints = bm[row_idx * num_columns:(row_idx + 1) * num_columns] + + line = [] + for i in row_ints: + s = str(i) + if num_rows == 1: + # no padding if all numbers fit on a single line + line.append(s) + else: + line.append(' ' * (element_max_length - len(s)) + s) + + if row_idx == 0: + prefix = head + else: + prefix = row_start_buffer + rows.append(prefix + ', '.join(line) + ',') + row_idx += 1 + if skip_rows and not skipped and row_idx >= num_lines_if_skipping: + rows.append((' ' * ((table_max_width + len(head)) // 2)) + '...') + skipped = True + row_idx = num_rows - num_lines_if_skipping + + rows[-1] = rows[-1].rstrip(',') # remove trailing comma from the last line + return '\n'.join(rows) + tail + cdef class AbstractBitMap: """ An efficient and light-weight ordered set of 32 bits integers. @@ -233,57 +286,7 @@ cdef class AbstractBitMap: return str(self) def __str__(self): - skip_rows = len(self) > 500 #this is the cutoff number for the truncating to kick in. - table_max_width = 80 # this isn't the length of the entire output, it's only for the numeric part - num_lines_if_skipping = 5 # the number of lines to show in the beginning and the end when output is being truncated - - head = self.__class__.__name__ + '([' - row_start_buffer = ' ' * len(head) - tail = '])' - - try: - maxval = self.max() - except ValueError: - # empty bitmap - return head + tail - - element_max_length = len(str(maxval)) - column_width = element_max_length + 2 - - num_columns = table_max_width // column_width - - num_rows = len(self) / float(num_columns) - if not num_rows.is_integer(): - num_rows += 1 - num_rows = int(num_rows) - rows = [] - row_idx = 0 - skipped = False - while row_idx < num_rows: - row_ints = self[row_idx * num_columns:(row_idx + 1) * num_columns] - - line = [] - for i in row_ints: - s = str(i) - if num_rows == 1: - # no padding if all numbers fit on a single line - line.append(s) - else: - line.append(' ' * (element_max_length - len(s)) + s) - - if row_idx == 0: - prefix = head - else: - prefix = row_start_buffer - rows.append(prefix + ', '.join(line) + ',') - row_idx += 1 - if skip_rows and not skipped and row_idx >= num_lines_if_skipping: - rows.append((' ' * ((table_max_width + len(head)) // 2)) + '...') - skipped = True - row_idx = num_rows - num_lines_if_skipping - - rows[-1] = rows[-1].rstrip(',') # remove trailing comma from the last line - return '\n'.join(rows) + tail + return _string_rep(self) def flip(self, uint64_t start, uint64_t end): """ @@ -645,20 +648,19 @@ cdef class AbstractBitMap: return elt cdef _get_slice(self, sl): - """For a faster computation, four different methods, depending on the slice.""" + """For a faster computation, different methods, depending on the slice.""" start, stop, step = sl.indices(len(self)) sign = 1 if step > 0 else -1 - if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): + if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): # empty chunk return self.__class__() r = range(start, stop, step) assert len(r) > 0 - if abs(step) == 1: - first_elt = self._get_elt(start) - last_elt = self._get_elt(stop-sign) - values = range(first_elt, last_elt+sign, step) - result = self & self.__class__(values, copy_on_write=self.copy_on_write) - return result - elif len(r) < len(self) / 100: # TODO find a good threshold for performance? + first_elt = self._get_elt(start) + last_elt = self._get_elt(stop-sign) + values = range(first_elt, last_elt+sign, step) + if abs(step) == 1 and len(values) <= len(self) / 100: # contiguous and small chunk of the bitmap + return self & self.__class__(values) + else: # generic case if step < 0: start = r[-1] stop = r[0] + 1 @@ -667,8 +669,6 @@ cdef class AbstractBitMap: start = r[0] stop = r[-1] + 1 return self._generic_get_slice(start, stop, step) - else: - return self.__class__(self.to_array()[sl]) # could be more efficient... cdef _generic_get_slice(self, uint32_t start, uint32_t stop, uint32_t step): """Assume that start, stop and step > 0 and that the result will not be empty.""" @@ -818,7 +818,7 @@ cdef class AbstractBitMap64: >>> BitMap64() BitMap64([]) >>> BitMap64([1, 123456789, 27]) - BitMap([1, 27, 123456789]) + BitMap64([1, 27, 123456789]) """ cdef from_ptr(self, croaring.roaring64_bitmap_t *ptr) noexcept: @@ -849,7 +849,6 @@ cdef class AbstractBitMap64: return croaring.roaring64_bitmap_get_cardinality(self._c_bitmap) def __richcmp__(self, other, int op): - self._check_compatibility(other) if op == 0: # < return croaring.roaring64_bitmap_is_strict_subset((self)._c_bitmap, (other)._c_bitmap) elif op == 1: # <= @@ -923,6 +922,12 @@ cdef class AbstractBitMap64: finally: croaring.roaring64_iterator_free(iterator) + def __repr__(self): + return str(self) + + def __str__(self): + return _string_rep(self) + def min(self): """ Return the minimum element of the bitmap. @@ -997,20 +1002,19 @@ cdef class AbstractBitMap64: return elt cdef _get_slice(self, sl): - """For a faster computation, four different methods, depending on the slice.""" + """For a faster computation, different methods, depending on the slice.""" start, stop, step = sl.indices(len(self)) sign = 1 if step > 0 else -1 - if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): + if (sign > 0 and start >= stop) or (sign < 0 and start <= stop): # empty chunk return self.__class__() r = range(start, stop, step) assert len(r) > 0 - if abs(step) == 1: - first_elt = self._get_elt(start) - last_elt = self._get_elt(stop-sign) - values = range(first_elt, last_elt+sign, step) - result = self & self.__class__(values) - return result - else: + first_elt = self._get_elt(start) + last_elt = self._get_elt(stop-sign) + values = range(first_elt, last_elt+sign, step) + if abs(step) == 1 and len(values) <= len(self) / 100: # contiguous and small chunk of the bitmap + return self & self.__class__(values) + else: # generic case if step < 0: start = r[-1] stop = r[0] + 1 @@ -1098,7 +1102,7 @@ cdef class AbstractBitMap64: >>> BitMap64([1, 2]).isdisjoint(BitMap64([3, 4])) True - >>> BitMap64([1, 2, 3]).BitMap64(BitMap([3, 4])) + >>> BitMap64([1, 2, 3]).isdisjoint(BitMap64([3, 4])) False """ @@ -1108,7 +1112,7 @@ cdef class AbstractBitMap64: """ Report whether another set contains this set. - >>> BitMap64([1, 2]).BitMap64(BitMap([1, 2, 3, 4])) + >>> BitMap64([1, 2]).issubset(BitMap64([1, 2, 3, 4])) True >>> BitMap64([1, 2]).issubset(BitMap64([3, 4])) @@ -1121,10 +1125,10 @@ cdef class AbstractBitMap64: """ Report whether this set contains another set. - >>> BitMap64([1, 2, 3, 4]).BitMap64(BitMap([1, 2])) + >>> BitMap64([1, 2, 3, 4]).issuperset(BitMap64([1, 2])) True - >>> BitMap64([1, 2]).BitMap64(BitMap([3, 4])) + >>> BitMap64([1, 2]).issuperset(BitMap64([3, 4])) False """ @@ -1137,7 +1141,7 @@ cdef class AbstractBitMap64: (i.e. all elements that are in this set but not the others.) >>> BitMap64.difference(BitMap64([1, 2, 3]), BitMap64([2, 20]), BitMap64([3, 30])) - BitMap([1]) + BitMap64([1]) """ size = len(bitmaps) diff --git a/pyroaring/bitmap.pxi b/pyroaring/bitmap.pxi index c52a388..ebf0725 100644 --- a/pyroaring/bitmap.pxi +++ b/pyroaring/bitmap.pxi @@ -261,4 +261,16 @@ cdef class BitMap(AbstractBitMap): croaring.roaring_bitmap_remove_range(self._c_bitmap, range_start, range_end) cdef class BitMap64(AbstractBitMap64): - pass + def add(self, uint64_t value): + """ + Add an element to the bitmap. This has no effect if the element is already present. + + >>> bm = BitMap64() + >>> bm.add(42) + >>> bm + BitMap64([42]) + >>> bm.add(42) + >>> bm + BitMap64([42]) + """ + croaring.roaring64_bitmap_add(self._c_bitmap, value) \ No newline at end of file