diff --git a/wp1/credentials.py.example b/wp1/credentials.py.example index 8acc9a17..9a3b922c 100644 --- a/wp1/credentials.py.example +++ b/wp1/credentials.py.example @@ -178,6 +178,11 @@ CREDENTIALS = { 'password': 'farmpass', 'hook_token': 'hook-token-abc', } + + 'FILE_PATH': { + # Path where pageviews.bz2 file (~3GB) will be downloaded. + 'pageviews': '/tmp/pageviews', + } }, # EDIT: Remove the next line after you've provided actual production credentials. diff --git a/wp1/scores.py b/wp1/scores.py index ffc1c397..5bd0b3a7 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -85,7 +85,7 @@ def download_pageviews(): with requests.get(get_pageview_url(), stream=True) as r: r.raise_for_status() - with open(PAGEVIEW_FILE_NAME, 'wb') as f: + with open(cur_filepath, 'wb') as f: # Read data in 8 KB chunks for chunk in r.iter_content(chunk_size=8 * 1024): f.write(chunk) @@ -140,7 +140,7 @@ def pageview_components(): try: views = int(parts[4]) except ValueError: - log.warning('Views field wasn\'t int in pageview dump: %r', line) + logger.warning('Views field wasn\'t int in pageview dump: %r', line) continue if (tally is not None and tally.lang == lang and tally.name == name and @@ -172,7 +172,7 @@ def update_db_pageviews(wp10db, lang, article, page_id, views): }) -def update_pageviews(filter_lang=None): +def update_pageviews(filter_lang=None, commit_after=50000): download_pageviews() # Convert filter lang to bytes if necessary @@ -191,7 +191,7 @@ def update_pageviews(filter_lang=None): update_db_pageviews(wp10db, lang, article, page_id, views) n += 1 - if n >= 50000: + if n >= commit_after: logger.debug('Committing') wp10db.commit() n = 0 diff --git a/wp1/scores_test.py b/wp1/scores_test.py index 19ea58f9..00c21363 100644 --- a/wp1/scores_test.py +++ b/wp1/scores_test.py @@ -1,5 +1,6 @@ import bz2 from datetime import datetime +import os.path import unittest from unittest.mock import patch, MagicMock, mock_open @@ -35,7 +36,33 @@ af.wikipedia 1712 753 mobile-web 2 O2 af.wikipedia 1712 753 desktop 20 E12J7U1''' +pageview_error_text = b'''af.wikipedia 1701 1402 desktop 4 F1 +af.wikipedia 1701 1402 mobile-web 3 O2T1 +af.wikipedia 1702 1404 mobile-web 3 L1O2 +af.wikipedia 1702 1404 desktop 1 P1 +af.wikipedia - 1405 mobile-web 3 C1O2 +af.wikipedia - 1405 desktop 1 ^1 +af.wikipedia 1704 1406 mobile-web 4 A1O2T1 +af.wikipedia 1704 1406 desktop 2 F1 +af.wikipedia 1705 1407 mobile-web 3 O3 +af.wikipedia 1705 1407 desktop 1 F1 +af.wikipedia 1706 desktop 8 H8 +af.wikipedia 1706 mobile-web 4 C1O2Y1 +af.wikipedia 1707 1409 mobile-web 2 O2 +af.wikipedia 1707 1409 desktop 3 H1J1 +af.wikipedia 1708 1410 desktop X V1]1 +af.wikipedia 1708 1410 mobile-web Z O1 +af.wikipedia 1709 1411 desktop 2 F1 +af.wikipedia 1709 1411 mobile-web 2 O2 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 1 A1 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 2 F2 +af.wikipedia 1711 752 mobile-web 4 C1O2U1 +af.wikipedia 1711 752 desktop 1 K1 +af.wikipedia 1712 753 mobile-web 2 O2 +af.wikipedia 1712 753 desktop 20 E12J7U1''' + pageview_bz2 = bz2.compress(pageview_text) +pageview_error_bz2 = bz2.compress(pageview_error_text) class ScoresTest(BaseWpOneDbTest): @@ -105,6 +132,61 @@ def test_get_pageview_file_path(self): actual = scores.get_pageview_file_path('pageviews-202404-user.bz2') self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews(self, mock_get_response, mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_cur_file_path() + if os.path.exists(file_path): + os.remove(file_path) + + scores.download_pageviews() + + mock_get_response.assert_called_once() + self.assertTrue(os.path.exists(file_path)) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews_remove_prev(self, mock_get_response, + mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_prev_file_path() + # Create empty file + open(file_path, 'a').close() + + scores.download_pageviews() + + self.assertFalse(os.path.exists(file_path)) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews_skip_existing(self, mock_get_response, + mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_cur_file_path() + # Create empty file + open(file_path, 'a').close() + + scores.download_pageviews() + + mock_get_response.assert_not_called() + self.assertTrue(os.path.exists(file_path)) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) def test_raw_pageviews(self, mock_file_open, mock_datetime): @@ -140,6 +222,24 @@ def test_pageview_components(self, mock_file_open): self.assertEqual(expected, actual) + @patch("builtins.open", new_callable=mock_open, read_data=pageview_error_bz2) + def test_pageview_components_errors(self, mock_file_open): + expected = [ + (b'af', b'1701', b'1402', 7), + (b'af', b'1702', b'1404', 4), + (b'af', b'1704', b'1406', 6), + (b'af', b'1705', b'1407', 4), + (b'af', b'1707', b'1409', 5), + (b'af', b'1709', b'1411', 4), + (b'af', b'\xc3\xa9\xc3\xa1\xc3\xb8', b'3774', 3), + (b'af', b'1711', b'752', 5), + (b'af', b'1712', b'753', 22), + ] + + actual = list(scores.pageview_components()) + + self.assertEqual(expected, actual) + def test_update_db_pageviews(self): scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234, 100) @@ -170,3 +270,37 @@ def test_update_db_pageviews_existing(self): self.assertEqual(result['ps_article'], b'Statue_of_Liberty') self.assertEqual(result['ps_page_id'], 1234) self.assertEqual(result['ps_views'], 200) + + @patch('wp1.scores.download_pageviews') + @patch('wp1.scores.pageview_components') + def test_update_pageviews(self, mock_components, mock_download): + mock_components.return_value = ( + (b'en', b'Statue_of_Liberty', 100, 100), + (b'en', b'Eiffel_Tower', 200, 200), + (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300), + ) + + scores.update_pageviews(commit_after=2) + + mock_download.assert_called_once() + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT COUNT(*) as cnt FROM page_scores') + n = cursor.fetchone()['cnt'] + self.assertEqual(3, n) + + @patch('wp1.scores.download_pageviews') + @patch('wp1.scores.pageview_components') + def test_update_pageviews_filter(self, mock_components, mock_download): + mock_components.return_value = ( + (b'en', b'Statue_of_Liberty', 100, 100), + (b'en', b'Eiffel_Tower', 200, 200), + (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300), + ) + + scores.update_pageviews(filter_lang='fr') + + mock_download.assert_called_once() + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT COUNT(*) as cnt FROM page_scores') + n = cursor.fetchone()['cnt'] + self.assertEqual(1, n)