Skip to content

Commit

Permalink
Split diff by block type (#22830)
Browse files Browse the repository at this point in the history
* Split diff by block type

* Remove unecessary and redundant code

* Fix ordering of cache/stash + increase validity of tests

* Update src/olympia/blocklist/tests/test_mlbf.py

Co-authored-by: William Durand <[email protected]>

* Update src/olympia/blocklist/mlbf.py

Co-authored-by: William Durand <[email protected]>

---------

Co-authored-by: William Durand <[email protected]>
  • Loading branch information
KevinMind and willdurand authored Nov 12, 2024
1 parent c1e5241 commit 5846124
Show file tree
Hide file tree
Showing 3 changed files with 312 additions and 95 deletions.
9 changes: 6 additions & 3 deletions src/olympia/blocklist/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from olympia.zadmin.models import get_config

from .mlbf import MLBF
from .models import Block, BlocklistSubmission
from .models import Block, BlocklistSubmission, BlockType
from .tasks import cleanup_old_files, process_blocklistsubmission, upload_filter
from .utils import datetime_to_ts

Expand Down Expand Up @@ -89,7 +89,9 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
else base_filter
)

changes_count = mlbf.blocks_changed_since_previous(previous_filter)
changes_count = mlbf.blocks_changed_since_previous(
BlockType.BLOCKED, previous_filter
)
statsd.incr(
'blocklist.cron.upload_mlbf_to_remote_settings.blocked_changed', changes_count
)
Expand Down Expand Up @@ -119,7 +121,8 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
force_base
or base_filter is None
or previous_filter is None
or mlbf.blocks_changed_since_previous(base_filter) > BASE_REPLACE_THRESHOLD
or mlbf.blocks_changed_since_previous(BlockType.BLOCKED, base_filter)
> BASE_REPLACE_THRESHOLD
)

if make_base_filter:
Expand Down
72 changes: 47 additions & 25 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import secrets
from enum import Enum
from typing import List, Optional, Set, Tuple
from typing import Dict, List, Optional, Tuple

from django.utils.functional import cached_property

Expand All @@ -19,6 +19,16 @@
log = olympia.core.logger.getLogger('z.amo.blocklist')


def ordered_diff_lists(
previous: List[str], current: List[str]
) -> Tuple[List[str], List[str], int]:
# Use lists instead of sets to maintain order
extras = [x for x in current if x not in previous]
deletes = [x for x in previous if x not in current]
changed_count = len(extras) + len(deletes)
return extras, deletes, changed_count


def generate_mlbf(stats, blocked, not_blocked):
log.info('Starting to generating bloomfilter')

Expand Down Expand Up @@ -123,8 +133,17 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return BlockVersion.objects.filter(version__file__is_signed=True).values_list(
'block__guid', 'version__version', 'version_id', 'block_type', named=True
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)

def _format_blocks(self, block_type: BlockType) -> List[str]:
Expand All @@ -148,16 +167,19 @@ def soft_blocked_items(self) -> List[str]:
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ()).values_list(
'addon__addonguid__guid', 'version'
)
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return list(
set(not_blocked_items) - set(self.blocked_items + self.soft_blocked_items)
)
return [
item
for item in not_blocked_items
if item not in self.blocked_items + self.soft_blocked_items
]


class MLBF:
Expand Down Expand Up @@ -213,33 +235,33 @@ def generate_and_write_filter(self):

def generate_diffs(
self, previous_mlbf: 'MLBF' = None
) -> Tuple[Set[str], Set[str], int]:
previous = set(
[] if previous_mlbf is None else previous_mlbf.data.blocked_items
)
current = set(self.data.blocked_items)
extras = current - previous
deletes = previous - current
changed_count = (
len(extras) + len(deletes) if len(previous) > 0 else len(current)
)
return extras, deletes, changed_count
) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
return {
block_type: ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
for block_type in BlockType
}

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
# compare previous with current blocks
extras, deletes, _ = self.generate_diffs(previous_mlbf)
extras, deletes, _ = self.generate_diffs(previous_mlbf)[BlockType.BLOCKED]
stash_json = {
'blocked': list(extras),
'unblocked': list(deletes),
'blocked': extras,
'unblocked': deletes,
}
# write stash
stash_path = self.stash_path
with self.storage.open(stash_path, 'w') as json_file:
log.info(f'Writing to file {stash_path}')
json.dump(stash_json, json_file)

def blocks_changed_since_previous(self, previous_mlbf: 'MLBF' = None):
return self.generate_diffs(previous_mlbf)[2]
def blocks_changed_since_previous(
self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
):
_, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
return changed_count

@classmethod
def load_from_storage(
Expand Down
Loading

0 comments on commit 5846124

Please sign in to comment.