diff --git a/src/olympia/blocklist/cron.py b/src/olympia/blocklist/cron.py index 643bd414159d..29399e83c68f 100644 --- a/src/olympia/blocklist/cron.py +++ b/src/olympia/blocklist/cron.py @@ -12,7 +12,7 @@ from olympia.zadmin.models import get_config from .mlbf import MLBF -from .models import Block, BlocklistSubmission +from .models import Block, BlocklistSubmission, BlockType from .tasks import cleanup_old_files, process_blocklistsubmission, upload_filter from .utils import datetime_to_ts @@ -89,7 +89,9 @@ def _upload_mlbf_to_remote_settings(*, force_base=False): else base_filter ) - changes_count = mlbf.blocks_changed_since_previous(previous_filter) + changes_count = mlbf.blocks_changed_since_previous( + BlockType.BLOCKED, previous_filter + ) statsd.incr( 'blocklist.cron.upload_mlbf_to_remote_settings.blocked_changed', changes_count ) @@ -119,7 +121,8 @@ def _upload_mlbf_to_remote_settings(*, force_base=False): force_base or base_filter is None or previous_filter is None - or mlbf.blocks_changed_since_previous(base_filter) > BASE_REPLACE_THRESHOLD + or mlbf.blocks_changed_since_previous(BlockType.BLOCKED, base_filter) + > BASE_REPLACE_THRESHOLD ) if make_base_filter: diff --git a/src/olympia/blocklist/mlbf.py b/src/olympia/blocklist/mlbf.py index c236fe45f207..ce3a9f30bda5 100644 --- a/src/olympia/blocklist/mlbf.py +++ b/src/olympia/blocklist/mlbf.py @@ -2,7 +2,7 @@ import os import secrets from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Tuple from django.utils.functional import cached_property @@ -19,6 +19,16 @@ log = olympia.core.logger.getLogger('z.amo.blocklist') +def ordered_diff_lists( + previous: List[str], current: List[str] +) -> Tuple[List[str], List[str], int]: + # Use lists instead of sets to maintain order + extras = [x for x in current if x not in previous] + deletes = [x for x in previous if x not in current] + changed_count = len(extras) + len(deletes) + return extras, deletes, changed_count + + def generate_mlbf(stats, blocked, not_blocked): log.info('Starting to generating bloomfilter') @@ -123,8 +133,17 @@ def __init__(self, *args, **kwargs): @cached_property def _all_blocks(self): - return BlockVersion.objects.filter(version__file__is_signed=True).values_list( - 'block__guid', 'version__version', 'version_id', 'block_type', named=True + return ( + BlockVersion.objects.filter(version__file__is_signed=True) + .distinct() + .order_by('id') + .values_list( + 'block__guid', + 'version__version', + 'version_id', + 'block_type', + named=True, + ) ) def _format_blocks(self, block_type: BlockType) -> List[str]: @@ -148,16 +167,19 @@ def soft_blocked_items(self) -> List[str]: def not_blocked_items(self) -> List[str]: all_blocks_ids = [version.version_id for version in self._all_blocks] not_blocked_items = MLBF.hash_filter_inputs( - Version.unfiltered.exclude(id__in=all_blocks_ids or ()).values_list( - 'addon__addonguid__guid', 'version' - ) + Version.unfiltered.exclude(id__in=all_blocks_ids or ()) + .distinct() + .order_by('id') + .values_list('addon__addonguid__guid', 'version') ) # even though we exclude all the version ids in the query there's an # edge case where the version string occurs twice for an addon so we # ensure not_blocked_items contain no blocked_items or soft_blocked_items. - return list( - set(not_blocked_items) - set(self.blocked_items + self.soft_blocked_items) - ) + return [ + item + for item in not_blocked_items + if item not in self.blocked_items + self.soft_blocked_items + ] class MLBF: @@ -213,24 +235,21 @@ def generate_and_write_filter(self): def generate_diffs( self, previous_mlbf: 'MLBF' = None - ) -> Tuple[Set[str], Set[str], int]: - previous = set( - [] if previous_mlbf is None else previous_mlbf.data.blocked_items - ) - current = set(self.data.blocked_items) - extras = current - previous - deletes = previous - current - changed_count = ( - len(extras) + len(deletes) if len(previous) > 0 else len(current) - ) - return extras, deletes, changed_count + ) -> Dict[BlockType, Tuple[List[str], List[str], int]]: + return { + block_type: ordered_diff_lists( + [] if previous_mlbf is None else previous_mlbf.data[block_type], + self.data[block_type], + ) + for block_type in BlockType + } def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): # compare previous with current blocks - extras, deletes, _ = self.generate_diffs(previous_mlbf) + extras, deletes, _ = self.generate_diffs(previous_mlbf)[BlockType.BLOCKED] stash_json = { - 'blocked': list(extras), - 'unblocked': list(deletes), + 'blocked': extras, + 'unblocked': deletes, } # write stash stash_path = self.stash_path @@ -238,8 +257,11 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): log.info(f'Writing to file {stash_path}') json.dump(stash_json, json_file) - def blocks_changed_since_previous(self, previous_mlbf: 'MLBF' = None): - return self.generate_diffs(previous_mlbf)[2] + def blocks_changed_since_previous( + self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None + ): + _, _, changed_count = self.generate_diffs(previous_mlbf)[block_type] + return changed_count @classmethod def load_from_storage( diff --git a/src/olympia/blocklist/tests/test_mlbf.py b/src/olympia/blocklist/tests/test_mlbf.py index b78d32630abc..d61067ee7fcd 100644 --- a/src/olympia/blocklist/tests/test_mlbf.py +++ b/src/olympia/blocklist/tests/test_mlbf.py @@ -27,9 +27,11 @@ def setUp(self): self.storage = SafeStorage() self.user = user_factory() - def _blocked_addon(self, **kwargs): + def _blocked_addon(self, block_type=BlockType.BLOCKED, **kwargs): addon = addon_factory(**kwargs) - block = block_factory(guid=addon.guid, updated_by=self.user) + block = block_factory( + guid=addon.guid, updated_by=self.user, block_type=block_type + ) return addon, block def _version(self, addon, is_signed=True): @@ -141,16 +143,47 @@ def test_load_returns_expected_data(self): addon, block = self._blocked_addon() notblocked_version = addon.current_version - block_version = self._block_version( - block, self._version(addon), block_type=BlockType.BLOCKED - ) + second_notblocked_version = self._version(addon) + blocked_versions = [] + blocked_hashes = [] + soft_blocked_versions = [] + soft_blocked_hashes = [] + + for _ in range(10): + # Create a blocked version with hashed guid:version + block_version = self._block_version( + block, self._version(addon), block_type=BlockType.BLOCKED + ) + blocked_versions.append(block_version) + blocked_hashes.append( + (block_version.block.guid, block_version.version.version) + ) + + # Create a soft blocked version with hashed guid:version + soft_blocked_version = self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) + soft_blocked_versions.append(soft_blocked_version) + soft_blocked_hashes.append( + (soft_blocked_version.block.guid, soft_blocked_version.version.version) + ) mlbf_data = MLBFDataBaseLoader(self.storage) + assert mlbf_data[MLBFDataType.BLOCKED] == MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] + blocked_hashes + ) + assert mlbf_data[MLBFDataType.SOFT_BLOCKED] == MLBF.hash_filter_inputs( + soft_blocked_hashes ) assert mlbf_data[MLBFDataType.NOT_BLOCKED] == MLBF.hash_filter_inputs( - [(notblocked_version.addon.guid, notblocked_version.version)] + [ + (notblocked_version.addon.guid, notblocked_version.version), + ( + second_notblocked_version.addon.guid, + second_notblocked_version.version, + ), + ] ) def test_blocked_items_caches_excluded_version_ids(self): @@ -159,16 +192,24 @@ def test_blocked_items_caches_excluded_version_ids(self): to exclude in the notblocked_items property. """ addon, block = self._blocked_addon() - block_version = self._block_version( + not_blocked_version = addon.current_version + hard_blocked = self._block_version( block, self._version(addon), block_type=BlockType.BLOCKED ) + soft_block = self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) with self.assertNumQueries(2): mlbf_data = MLBFDataBaseLoader(self.storage) - assert ( - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ) - not in mlbf_data.blocked_items + + assert mlbf_data.blocked_items == MLBF.hash_filter_inputs( + [(hard_blocked.block.guid, hard_blocked.version.version)] + ) + assert mlbf_data.soft_blocked_items == MLBF.hash_filter_inputs( + [(soft_block.block.guid, soft_block.version.version)] + ) + assert mlbf_data.not_blocked_items == MLBF.hash_filter_inputs( + [(not_blocked_version.addon.guid, not_blocked_version.version)] ) def test_hash_filter_inputs_returns_set_of_hashed_strings(self): @@ -193,6 +234,49 @@ def test_get_data_from_db(self): mlbf_data = MLBFDataBaseLoader(mlbf.storage) assert mlbf.data._raw == mlbf_data._raw + def test_cache_json_is_sorted(self): + addon, block = self._blocked_addon() + + notblocked_version = addon.current_version + second_notblocked_version = self._version(addon) + + blocked_versions = [] + blocked_hashes = [] + soft_blocked_versions = [] + soft_blocked_hashes = [] + + for _ in range(10): + blocked_version = self._block_version( + block, self._version(addon), block_type=BlockType.BLOCKED + ) + blocked_versions.append(blocked_version) + blocked_hashes.append( + (blocked_version.block.guid, blocked_version.version.version) + ) + soft_blocked_version = self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) + soft_blocked_versions.append(soft_blocked_version) + soft_blocked_hashes.append( + (soft_blocked_version.block.guid, soft_blocked_version.version.version) + ) + + mlbf = MLBF.generate_from_db('test') + with mlbf.storage.open(mlbf.data._cache_path, 'r') as f: + assert json.load(f) == { + 'blocked': MLBF.hash_filter_inputs(blocked_hashes), + 'soft_blocked': MLBF.hash_filter_inputs(soft_blocked_hashes), + 'not_blocked': MLBF.hash_filter_inputs( + [ + (notblocked_version.addon.guid, notblocked_version.version), + ( + second_notblocked_version.addon.guid, + second_notblocked_version.version, + ), + ] + ), + } + def test_load_from_storage_returns_data_from_storage(self): self._blocked_addon() mlbf = MLBF.generate_from_db('test') @@ -214,30 +298,28 @@ def test_diff_returns_stateless_without_previous(self): addon, _ = self._blocked_addon(file_kw={'is_signed': True}) base_mlbf = MLBF.generate_from_db('base') - assert base_mlbf.generate_diffs() == ( - set( + stateless_diff = { + BlockType.BLOCKED: ( MLBF.hash_filter_inputs( [(addon.block.guid, addon.current_version.version)] - ) + ), + [], + 1, ), - set(), - 1, - ) + BlockType.SOFT_BLOCKED: ([], [], 0), + } + + assert base_mlbf.generate_diffs() == stateless_diff next_mlbf = MLBF.generate_from_db('next') # If we don't include the base_mlbf, unblocked version will still be in the diff - assert next_mlbf.generate_diffs() == ( - set( - MLBF.hash_filter_inputs( - [(addon.block.guid, addon.current_version.version)] - ) - ), - set(), - 1, - ) + assert next_mlbf.generate_diffs() == stateless_diff # Providing a previous mlbf with the unblocked version already included # removes it from the diff - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == (set(), set(), 0) + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ([], [], 0), + BlockType.SOFT_BLOCKED: ([], [], 0), + } def test_diff_no_changes(self): addon, block = self._blocked_addon() @@ -245,7 +327,10 @@ def test_diff_no_changes(self): base_mlbf = MLBF.generate_from_db('test') next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == (set(), set(), 0) + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ([], [], 0), + BlockType.SOFT_BLOCKED: ([], [], 0), + } def test_diff_block_added(self): addon, block = self._blocked_addon() @@ -257,15 +342,16 @@ def test_diff_block_added(self): next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( - set( + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ( MLBF.hash_filter_inputs( [(new_block.block.guid, new_block.version.version)] - ) + ), + [], + 1, ), - set(), - 1, - ) + BlockType.SOFT_BLOCKED: ([], [], 0), + } def test_diff_block_removed(self): addon, block = self._blocked_addon() @@ -276,15 +362,16 @@ def test_diff_block_removed(self): block_version.delete() next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( - set(), - set( + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ( + [], MLBF.hash_filter_inputs( [(block_version.block.guid, block_version.version.version)] - ) + ), + 1, ), - 1, - ) + BlockType.SOFT_BLOCKED: ([], [], 0), + } def test_diff_block_added_and_removed(self): addon, block = self._blocked_addon() @@ -300,45 +387,101 @@ def test_diff_block_added_and_removed(self): next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( - set( + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ( MLBF.hash_filter_inputs( [(new_block.block.guid, new_block.version.version)] - ) - ), - set( + ), MLBF.hash_filter_inputs( [(block_version.block.guid, block_version.version.version)] - ) + ), + 2, ), - 2, - ) + BlockType.SOFT_BLOCKED: ([], [], 0), + } - def test_generate_stash_returns_expected_stash(self): + def test_diff_block_hard_to_soft(self): addon, block = self._blocked_addon() block_version = self._block_version( block, self._version(addon), block_type=BlockType.BLOCKED ) + base_mlbf = MLBF.generate_from_db('test') + block_version.update(block_type=BlockType.SOFT_BLOCKED) + next_mlbf = MLBF.generate_from_db('test_two') + + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ( + [], + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + 1, + ), + BlockType.SOFT_BLOCKED: ( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + [], + 1, + ), + } + + def test_diff_block_soft_to_hard(self): + addon, block = self._blocked_addon() + block_version = self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) + base_mlbf = MLBF.generate_from_db('test') + block_version.update(block_type=BlockType.BLOCKED) + next_mlbf = MLBF.generate_from_db('test_two') + + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { + BlockType.BLOCKED: ( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + [], + 1, + ), + BlockType.SOFT_BLOCKED: ( + [], + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + 1, + ), + } + + def test_generate_stash_returns_expected_stash(self): + addon, block = self._blocked_addon() + block_versions = [ + self._block_version(block, self._version(addon)) for _ in range(10) + ] mlbf = MLBF.generate_from_db('test') mlbf.generate_and_write_stash() + expected_blocked = [ + (block_version.block.guid, block_version.version.version) + for block_version in block_versions + ] + with mlbf.storage.open(mlbf.stash_path, 'r') as f: assert json.load(f) == { - 'blocked': MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), + 'blocked': MLBF.hash_filter_inputs(expected_blocked), 'unblocked': [], } - block_version.delete() + + # Remove the last block version + block_versions[-1].delete() + expected_unblocked = expected_blocked[-1:] next_mlbf = MLBF.generate_from_db('test_two') next_mlbf.generate_and_write_stash(previous_mlbf=mlbf) + with next_mlbf.storage.open(next_mlbf.stash_path, 'r') as f: assert json.load(f) == { 'blocked': [], - 'unblocked': MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), + 'unblocked': MLBF.hash_filter_inputs(expected_unblocked), } def test_changed_count_returns_expected_count(self): @@ -346,16 +489,53 @@ def test_changed_count_returns_expected_count(self): self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED) first_mlbf = MLBF.generate_from_db('first') # Include the new blocked version - assert first_mlbf.blocks_changed_since_previous() == 1 + assert first_mlbf.blocks_changed_since_previous(BlockType.BLOCKED) == 1 + assert first_mlbf.blocks_changed_since_previous(BlockType.SOFT_BLOCKED) == 0 self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED) # The count should not change because the data is already calculated - assert first_mlbf.blocks_changed_since_previous() == 1 + assert first_mlbf.blocks_changed_since_previous(BlockType.BLOCKED) == 1 + assert first_mlbf.blocks_changed_since_previous(BlockType.SOFT_BLOCKED) == 0 + self._block_version( + block, self._version(addon), block_type=BlockType.SOFT_BLOCKED + ) next_mlbf = MLBF.generate_from_db('next') - # The count should include both blocked versions since we are not comparing - assert next_mlbf.blocks_changed_since_previous() == 2 + # The count should include both blocked versions and the soft blocked version + # since we are not comparing to a previous mlbf + assert next_mlbf.blocks_changed_since_previous(BlockType.BLOCKED) == 2 + assert next_mlbf.blocks_changed_since_previous(BlockType.SOFT_BLOCKED) == 1 # When comparing to the first mlbf, # the count should only include the second block - assert next_mlbf.blocks_changed_since_previous(previous_mlbf=first_mlbf) == 1 + assert ( + next_mlbf.blocks_changed_since_previous( + previous_mlbf=first_mlbf, block_type=BlockType.BLOCKED + ) + == 1 + ) + # The count should still include the soft blocked version since it was + # created after the first_mlbf + assert ( + next_mlbf.blocks_changed_since_previous( + previous_mlbf=first_mlbf, block_type=BlockType.SOFT_BLOCKED + ) + == 1 + ) + final_mlbf = MLBF.generate_from_db('final') + # The soft blocked version is no longer a change comparing to the previous mlbf + # but is still a change to the original mlbf from before it was created + assert ( + final_mlbf.blocks_changed_since_previous( + previous_mlbf=next_mlbf, + block_type=BlockType.SOFT_BLOCKED, + ) + == 0 + ) + assert ( + final_mlbf.blocks_changed_since_previous( + previous_mlbf=first_mlbf, + block_type=BlockType.BLOCKED, + ) + == 1 + ) def test_generate_filter_not_raises_if_all_versions_unblocked(self): """ @@ -398,15 +578,27 @@ def test_duplicate_guid_is_blocked(self): reused_addon.update(guid=GUID_REUSE_FORMAT.format(addon.id)) reused_addon.addonguid.update(guid=addon.guid) - self._block_version( + (reused_addon_hash,) = MLBF.hash_filter_inputs( + [(reused_addon.addonguid.guid, version)] + ) + + soft_blocked_version = self._block_version( block, self._version(addon), block_type=BlockType.SOFT_BLOCKED ) mlbf = MLBF.generate_from_db('test') - (block_version,) = MLBF.hash_filter_inputs([(addon.guid, version)]) - assert block_version in mlbf.data.blocked_items - assert block_version not in mlbf.data.not_blocked_items + (block_version_hash,) = MLBF.hash_filter_inputs([(addon.guid, version)]) + (soft_blocked_version_hash,) = MLBF.hash_filter_inputs( + [(soft_blocked_version.block.guid, soft_blocked_version.version.version)] + ) + + # There is a duplicate hash but we will exclude it from the not_blocked versions + assert block_version_hash == reused_addon_hash + + assert mlbf.data.blocked_items == [block_version_hash] + assert mlbf.data.soft_blocked_items == [soft_blocked_version_hash] + assert mlbf.data.not_blocked_items == [] def test_no_soft_blocked_versions_empty_array(self): addon, block = self._blocked_addon()