From 2e0b43df482cc29b702d08254edcaa4974eb9cd3 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Mon, 29 Apr 2024 16:30:30 +0200 Subject: [PATCH 01/13] Migration for adding scores table --- .../20240429_01_qtSms-add-scores-table.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 db/migrations/20240429_01_qtSms-add-scores-table.py diff --git a/db/migrations/20240429_01_qtSms-add-scores-table.py b/db/migrations/20240429_01_qtSms-add-scores-table.py new file mode 100644 index 00000000..3beb48c8 --- /dev/null +++ b/db/migrations/20240429_01_qtSms-add-scores-table.py @@ -0,0 +1,22 @@ +""" +Add scores table +""" + +from yoyo import step + +__depends__ = {'20230528_02_pL6ka-add-b-selection-zim-version-to-builders'} + +steps = [ + step('''CREATE TABLE page_scores ( + ps_project VARBINARY(255), + ps_page_id INTEGER NOT NULL, + ps_article VARBINARY(255), + ps_views INTEGER DEFAULT 0, + ps_links INTEGER DEFAULT 0, + ps_lang_links INTEGER DEFAULT 0, + ps_score INTEGER DEFAULT 0, + KEY `page_title` (`ps_project`, `ps_article`), + KEY `page_id` (`ps_page_id`) + )''', + "DROP TABLE page_scores") +] From 690d65619ed21fc36a3abe9db457568fbb72b321 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Tue, 30 Apr 2024 10:06:48 +0200 Subject: [PATCH 02/13] Retrieve wiki languages --- wp1/exceptions.py | 3 +++ wp1/scores.py | 20 ++++++++++++++++++++ wp1/scores_test.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 wp1/scores.py create mode 100644 wp1/scores_test.py diff --git a/wp1/exceptions.py b/wp1/exceptions.py index 88214646..31f2ea7e 100644 --- a/wp1/exceptions.py +++ b/wp1/exceptions.py @@ -24,3 +24,6 @@ class ObjectNotFoundError(Wp1Error): class UserNotAuthorizedError(Wp1Error): pass + +class Wp1ScoreProcessingError(Wp1Error): + pass \ No newline at end of file diff --git a/wp1/scores.py b/wp1/scores.py new file mode 100644 index 00000000..244b2e55 --- /dev/null +++ b/wp1/scores.py @@ -0,0 +1,20 
@@ +import csv +import requests + +from wp1.exceptions import Wp1ScoreProcessingError + + +def wiki_languages(): + r = requests.get( + 'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv' + ) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + raise Wp1ScoreProcessingError('Could not retrieve wiki list') from e + + reader = csv.reader(r.text.splitlines()) + # Skip the header row + next(reader, None) + for row in reader: + yield row[2] diff --git a/wp1/scores_test.py b/wp1/scores_test.py new file mode 100644 index 00000000..9eb91459 --- /dev/null +++ b/wp1/scores_test.py @@ -0,0 +1,47 @@ +import unittest +from unittest.mock import patch, MagicMock + +import requests + +from wp1.exceptions import Wp1ScoreProcessingError +from wp1.scores import wiki_languages + + +class ScoresTest(unittest.TestCase): + + @patch('wp1.scores.requests') + def test_wiki_languages(self, mock_requests): + mock_response = MagicMock() + mock_response.text = ( + 'id,lang,prefix,total,good,views,edits,users,admins,ts,loclang,images,' + 'loclanglink,activeusers,version,si_mainpage,si_base,si_sitename,si_generator,' + 'si_phpversion,si_phpsapi,si_dbtype,si_dbversion,si_rev,si_case,si_rights,' + 'si_lang,si_fallback8bitEncoding,si_writeapi,si_timezone,si_timeoffset,' + 'si_articlepath,si_scriptpath,si_script,si_variantarticlepath,si_server,' + 'si_wikiid,si_time,method,http,status,ratio\n' + '2,English,en,60556624,6818615,63208806,1216695232,47328206,861,"2024-04-30 00:06:16",' + 'English,916605,English_language,122676,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,' + ',,,,,,,,,,,,,,,8,200,a,0.1126\n' + '153,Cebuano,ceb,11228672,6118766,0,35037562,115188,5,"2024-04-30 00:01:34"' + ',"Sinugboanong Binisaya",1,Sinugboanon,149,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,' + ',,,,,,,,,,,,,,,,,,8,200,a,0.5449\n' + '10,German,de,8007675,2905495,8543798,242922549,4359000,174,"2024-04-30 00:08:06",' + 
'Deutsch,129233,Deutsch,17684,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,,,,,,,,,,,,,' + ',,,8,200,a,0.3628\n' + '1,French,fr,13037937,2608568,2234272,214217598,4914029,146,"2024-04-30 00:08:24",' + 'Français,71651,Français,16929,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,,,' + ',,,,,,,,,,,,,8,200,a,0.2001\n') + mock_requests.get.return_value = mock_response + + actual = list(wiki_languages()) + self.assertEqual(['en', 'ceb', 'de', 'fr'], actual) + + @patch('wp1.scores.requests') + def test_wiki_languages_raises_on_http_error(self, mock_requests): + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError + mock_requests.exceptions.HTTPError = requests.exceptions.HTTPError + mock_requests.get.return_value = mock_response + + with self.assertRaises(Wp1ScoreProcessingError): + list(wiki_languages()) From 5ec13195faa8cbb45a07edcd0b7b618108964fcb Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Tue, 30 Apr 2024 14:41:59 +0200 Subject: [PATCH 03/13] Functions for streaming page view data --- wp1/scores.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/wp1/scores.py b/wp1/scores.py index 244b2e55..66287947 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -1,3 +1,5 @@ +from bz2 import BZ2Decompressor + import csv import requests @@ -18,3 +20,37 @@ def wiki_languages(): next(reader, None) for row in reader: yield row[2] + + +def raw_pageviews(decode=True): + + def as_bytes(): + with requests.get( + 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-03/pageviews-202403-automated.bz2', + stream=True) as r: + + decompressor = BZ2Decompressor() + trailing = b'' + # Read data in 1 MB chunks + for http_chunk in r.iter_content(chunk_size=1024 * 1024): + data = decompressor.decompress(http_chunk) + lines = [line for line in data.split(b'\n') if line] + if not lines: + continue + + yield trailing + lines[0] + yield from lines[1:-1] + trailing = lines[-1] + + if 
decode: + for line in as_bytes(): + yield line.decode('utf-8') + else: + yield from as_bytes() + + +def pageviews_for_lang(lang): + needle = f'{lang}.wikipedia' + for line in raw_pageviews(): + if needle in line: + yield line From 0d237d4a602e352cdf75601fbfd0bfa9ac567c43 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Wed, 1 May 2024 14:19:54 +0200 Subject: [PATCH 04/13] Re-work page_scores table --- .../20240429_01_qtSms-add-scores-table.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/db/migrations/20240429_01_qtSms-add-scores-table.py b/db/migrations/20240429_01_qtSms-add-scores-table.py index 3beb48c8..a4a54a8c 100644 --- a/db/migrations/20240429_01_qtSms-add-scores-table.py +++ b/db/migrations/20240429_01_qtSms-add-scores-table.py @@ -7,16 +7,16 @@ __depends__ = {'20230528_02_pL6ka-add-b-selection-zim-version-to-builders'} steps = [ - step('''CREATE TABLE page_scores ( - ps_project VARBINARY(255), + step( + '''CREATE TABLE page_scores ( + ps_lang VARBINARY(255), ps_page_id INTEGER NOT NULL, - ps_article VARBINARY(255), + ps_article VARBINARY(1024), ps_views INTEGER DEFAULT 0, ps_links INTEGER DEFAULT 0, ps_lang_links INTEGER DEFAULT 0, ps_score INTEGER DEFAULT 0, - KEY `page_title` (`ps_project`, `ps_article`), - KEY `page_id` (`ps_page_id`) - )''', - "DROP TABLE page_scores") + PRIMARY KEY (`ps_lang`, `ps_page_id`), + KEY `lang_article` (`ps_lang`, `ps_article`) + )''', 'DROP TABLE page_scores') ] From 39b0cfa4cb2664551ce2baf6df1297bed2dcce82 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Wed, 1 May 2024 14:20:26 +0200 Subject: [PATCH 05/13] Add methods for saving pageviews to db --- wp1/scores.py | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/wp1/scores.py b/wp1/scores.py index 66287947..e56e2077 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -4,6 +4,7 @@ import requests from wp1.exceptions import Wp1ScoreProcessingError +from wp1.wp10_db import connect 
as wp10_connect def wiki_languages(): @@ -22,7 +23,7 @@ def wiki_languages(): yield row[2] -def raw_pageviews(decode=True): +def raw_pageviews(decode=False): def as_bytes(): with requests.get( @@ -31,8 +32,8 @@ def as_bytes(): decompressor = BZ2Decompressor() trailing = b'' - # Read data in 1 MB chunks - for http_chunk in r.iter_content(chunk_size=1024 * 1024): + # Read data in 32 MB chunks + for http_chunk in r.iter_content(chunk_size=32 * 1024 * 1024): data = decompressor.decompress(http_chunk) lines = [line for line in data.split(b'\n') if line] if not lines: @@ -49,8 +50,33 @@ def as_bytes(): yield from as_bytes() -def pageviews_for_lang(lang): - needle = f'{lang}.wikipedia' +def pageview_components(): for line in raw_pageviews(): - if needle in line: - yield line + parts = line.split(b' ') + if len(parts) != 6 or parts[2] == b'null': + # Skip pages that don't have a pageid + continue + + # Language code, article name, article page id, views + yield parts[0].split(b'.')[0], parts[1], parts[2], parts[4] + + +def update_pageviews(wp10db, lang, article, page_id, views): + with wp10db.cursor() as cursor: + cursor.execute( + '''INSERT INTO page_scores (ps_lang, ps_page_id, ps_article, ps_views) + VALUES (%(lang)s, %(page_id)s, %(article)s, %(views)s) + ON DUPLICATE KEY UPDATE ps_views = %(views)s + ''', { + 'lang': lang, + 'page_id': page_id, + 'article': article, + 'views': views + }) + wp10db.commit() + + +def update_all_pageviews(): + wp10db = wp10_connect() + for lang, article, page_id, views in pageview_components(): + update_pageviews(wp10db, lang, article, page_id, views) From 185d7ad86fb48e0b1c430ff38de2013fbf88848d Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 25 May 2024 11:23:57 -0700 Subject: [PATCH 06/13] Multiple small changes: - Add WP1_USER_AGENT to http calls - Get pageviews for previous month instead of hardcoded time - Throw error on non-successful HTTP status - Allow a filter_lang when updating DB from pageviews - Only commit db after
every 10000 rows --- wp1/scores.py | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/wp1/scores.py b/wp1/scores.py index e56e2077..6f7b8303 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -1,16 +1,18 @@ from bz2 import BZ2Decompressor import csv +from datetime import datetime, timedelta import requests +from wp1.constants import WP1_USER_AGENT from wp1.exceptions import Wp1ScoreProcessingError from wp1.wp10_db import connect as wp10_connect def wiki_languages(): r = requests.get( - 'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv' - ) + 'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv', + headers={'User-Agent': WP1_USER_AGENT}) try: r.raise_for_status() except requests.exceptions.HTTPError as e: @@ -25,10 +27,21 @@ def wiki_languages(): def raw_pageviews(decode=False): + def get_pageview_url(): + now = datetime.now() + dt = datetime(now.year, now.month, 1) - timedelta(weeks=4) + return dt.strftime( + 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' + '%Y/%Y-%m/pageviews-%Y%m-automated.bz2') + def as_bytes(): - with requests.get( - 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-03/pageviews-202403-automated.bz2', - stream=True) as r: + url = get_pageview_url() + with requests.get(url, stream=True, + headers={'User-Agent': WP1_USER_AGENT}) as r: + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + raise Wp1ScoreProcessingError('Could not retrieve pageview data') from e decompressor = BZ2Decompressor() trailing = b'' @@ -57,6 +70,10 @@ def pageview_components(): # Skip pages that don't have a pageid continue + if parts[1] == b'' or parts[1] == b'-': + # Skip pages that don't have a title + continue + # Language code, article name, article page id, views yield parts[0].split(b'.')[0], parts[1], parts[2], parts[4] @@ -73,10 +90,21 @@ def update_pageviews(wp10db, lang, article, page_id, 
views): 'article': article, 'views': views }) - wp10db.commit() -def update_all_pageviews(): +def update_all_pageviews(filter_lang=None): + # Convert filter lang to bytes if necessary + if filter_lang is not None and isinstance(filter_lang, str): + filter_lang = filter_lang.encode('utf-8') + wp10db = wp10_connect() + n = 0 for lang, article, page_id, views in pageview_components(): - update_pageviews(wp10db, lang, article, page_id, views) + if filter_lang is None or lang == filter_lang: + update_pageviews(wp10db, lang, article, page_id, views) + + n += 1 + if n >= 10000: + wp10db.commit() + n = 0 + wp10db.commit() From e989b5cb172a272ddc6e2bdaec5c0617782efe76 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 25 May 2024 18:10:37 -0700 Subject: [PATCH 07/13] Add page_scores table to test schema --- wp10_test.down.sql | 1 + wp10_test.up.sql | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/wp10_test.down.sql b/wp10_test.down.sql index dff53a40..245a20b2 100644 --- a/wp10_test.down.sql +++ b/wp10_test.down.sql @@ -12,3 +12,4 @@ DROP TABLE IF EXISTS `builders`; DROP TABLE IF EXISTS `selections`; DROP TABLE IF EXISTS `custom`; DROP TABLE IF EXISTS `zim_files`; +DROP TABLE IF EXISTS `page_scores`; \ No newline at end of file diff --git a/wp10_test.up.sql b/wp10_test.up.sql index 4bc821b2..59f63fb6 100644 --- a/wp10_test.up.sql +++ b/wp10_test.up.sql @@ -138,6 +138,18 @@ CREATE TABLE zim_files ( z_description tinyblob ); +CREATE TABLE `page_scores` ( + `ps_lang` varbinary(255) NOT NULL, + `ps_page_id` int(11) NOT NULL, + `ps_article` varbinary(1024) DEFAULT NULL, + `ps_views` int(11) DEFAULT 0, + `ps_links` int(11) DEFAULT 0, + `ps_lang_links` int(11) DEFAULT 0, + `ps_score` int(11) DEFAULT 0, + PRIMARY KEY (`ps_lang`,`ps_page_id`), + KEY `lang_article` (`ps_lang`,`ps_article`) +); + INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) VALUES ('importance', 'Unknown-Class', 0); INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) 
VALUES ('importance', 'NA-Class', 50); INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) VALUES ('importance', 'Low-Class', 100); From f86586538b227138eb24dba677c7c08cd44bf371 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 25 May 2024 18:32:48 -0700 Subject: [PATCH 08/13] Refactor scores.py to make it more testable and add some tests --- wp1/scores.py | 79 ++++++++++++++++------ wp1/scores_test.py | 164 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 218 insertions(+), 25 deletions(-) diff --git a/wp1/scores.py b/wp1/scores.py index 6f7b8303..09cd1d08 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -1,4 +1,6 @@ from bz2 import BZ2Decompressor +from collections import namedtuple +from contextlib import contextmanager import csv from datetime import datetime, timedelta @@ -6,8 +8,12 @@ from wp1.constants import WP1_USER_AGENT from wp1.exceptions import Wp1ScoreProcessingError +from wp1.time import get_current_datetime from wp1.wp10_db import connect as wp10_connect +PageviewRecord = namedtuple('PageviewRecord', + ['lang', 'name', 'page_id', 'views']) + def wiki_languages(): r = requests.get( @@ -25,24 +31,31 @@ def wiki_languages(): yield row[2] -def raw_pageviews(decode=False): +def get_pageview_url(): + now = get_current_datetime() + dt = datetime(now.year, now.month, 1) - timedelta(weeks=4) + return dt.strftime( + 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' + '%Y/%Y-%m/pageviews-%Y%m-user.bz2') - def get_pageview_url(): - now = datetime.now() - dt = datetime(now.year, now.month, 1) - timedelta(weeks=4) - return dt.strftime( - 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' - '%Y/%Y-%m/pageviews-%Y%m-automated.bz2') - def as_bytes(): - url = get_pageview_url() - with requests.get(url, stream=True, - headers={'User-Agent': WP1_USER_AGENT}) as r: - try: - r.raise_for_status() - except requests.exceptions.HTTPError as e: - raise Wp1ScoreProcessingError('Could not retrieve pageview data') from e
+@contextmanager +def get_pageview_response(): + url = get_pageview_url() + with requests.get(url, stream=True, + headers={'User-Agent': WP1_USER_AGENT}) as r: + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + raise Wp1ScoreProcessingError('Could not retrieve pageview data') from e + + yield r + + +def raw_pageviews(decode=False): + def as_bytes(): + with get_pageview_response() as r: decompressor = BZ2Decompressor() trailing = b'' # Read data in 32 MB chunks @@ -56,6 +69,8 @@ def as_bytes(): yield from lines[1:-1] trailing = lines[-1] + yield trailing + if decode: for line in as_bytes(): yield line.decode('utf-8') @@ -64,6 +79,7 @@ def as_bytes(): def pageview_components(): + tally = None for line in raw_pageviews(): parts = line.split(b' ') if len(parts) != 6 or parts[2] == b'null': @@ -74,17 +90,38 @@ def pageview_components(): # Skip pages that don't have a title continue - # Language code, article name, article page id, views - yield parts[0].split(b'.')[0], parts[1], parts[2], parts[4] + lang = parts[0].split(b'.')[0] + name = parts[1] + page_id = parts[2] + try: + views = int(parts[4]) + except ValueError: + # Views field wasn't int + log.warning('Views field wasn\'t int in pageview dump: %r', line) + continue + + if (tally is not None and tally.lang == lang and tally.name == name and + tally.page_id == page_id): + # This is a view on the same page from a different interface (mobile v + # desktop etc) + new_dict = {**tally._asdict(), 'views': tally.views + views} + tally = PageviewRecord(**new_dict) + else: + # Language code, article name, article page id, views + if tally is not None: + yield tally.lang, tally.name, tally.page_id, tally.views + tally = PageviewRecord(lang, name, page_id, views) + + yield tally.lang, tally.name, tally.page_id, tally.views -def update_pageviews(wp10db, lang, article, page_id, views): +def update_db_pageviews(wp10db, lang, article, page_id, views): with wp10db.cursor() as cursor: cursor.execute( 
'''INSERT INTO page_scores (ps_lang, ps_page_id, ps_article, ps_views) VALUES (%(lang)s, %(page_id)s, %(article)s, %(views)s) ON DUPLICATE KEY UPDATE ps_views = %(views)s - ''', { + ''', { 'lang': lang, 'page_id': page_id, 'article': article, @@ -92,7 +129,7 @@ def update_pageviews(wp10db, lang, article, page_id, views): }) -def update_all_pageviews(filter_lang=None): +def update_pageviews(filter_lang=None): # Convert filter lang to bytes if necessary if filter_lang is not None and isinstance(filter_lang, str): filter_lang = filter_lang.encode('utf-8') @@ -101,7 +138,7 @@ def update_all_pageviews(filter_lang=None): n = 0 for lang, article, page_id, views in pageview_components(): if filter_lang is None or lang == filter_lang: - update_pageviews(wp10db, lang, article, page_id, views) + update_db_pageviews(wp10db, lang, article, page_id, views) n += 1 if n >= 10000: diff --git a/wp1/scores_test.py b/wp1/scores_test.py index 9eb91459..984abf7e 100644 --- a/wp1/scores_test.py +++ b/wp1/scores_test.py @@ -1,13 +1,45 @@ +import bz2 +from datetime import datetime import unittest from unittest.mock import patch, MagicMock import requests +from wp1.base_db_test import BaseWpOneDbTest +from wp1.constants import WP1_USER_AGENT from wp1.exceptions import Wp1ScoreProcessingError -from wp1.scores import wiki_languages +from wp1 import scores -class ScoresTest(unittest.TestCase): +class ScoresTest(BaseWpOneDbTest): + pageview_text = b'''af.wikipedia 1701 1402 desktop 4 F1 +af.wikipedia 1701 1402 mobile-web 3 O2T1 +af.wikipedia 1702 1404 mobile-web 3 L1O2 +af.wikipedia 1702 1404 desktop 1 P1 +af.wikipedia 1703 1405 mobile-web 3 C1O2 +af.wikipedia 1703 1405 desktop 1 ^1 +af.wikipedia 1704 1406 mobile-web 4 A1O2T1 +af.wikipedia 1704 1406 desktop 2 F1 +af.wikipedia 1705 1407 mobile-web 3 O3 +af.wikipedia 1705 1407 desktop 1 F1 +af.wikipedia 1706 1408 desktop 8 H8 +af.wikipedia 1706 1408 mobile-web 4 C1O2Y1 +af.wikipedia 1707 1409 mobile-web 2 O2 +af.wikipedia 1707 1409 desktop 3 H1J1 
+af.wikipedia 1708 1410 desktop 4 V1]1 +af.wikipedia 1708 1410 mobile-web 1 O1 +af.wikipedia 1709 1411 desktop 2 F1 +af.wikipedia 1709 1411 mobile-web 2 O2 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 1 A1 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 2 F2 +af.wikipedia 1711 752 mobile-web 4 C1O2U1 +af.wikipedia 1711 752 desktop 1 K1 +af.wikipedia 1712 753 mobile-web 2 O2 +af.wikipedia 1712 753 desktop 20 E12J7U1''' + + @property + def pageview_bz2(self): + return bz2.compress(self.pageview_text) @patch('wp1.scores.requests') def test_wiki_languages(self, mock_requests): @@ -33,7 +65,7 @@ def test_wiki_languages(self, mock_requests): ',,,,,,,,,,,,,8,200,a,0.2001\n') mock_requests.get.return_value = mock_response - actual = list(wiki_languages()) + actual = list(scores.wiki_languages()) self.assertEqual(['en', 'ceb', 'de', 'fr'], actual) @patch('wp1.scores.requests') @@ -44,4 +76,128 @@ def test_wiki_languages_raises_on_http_error(self, mock_requests): mock_requests.get.return_value = mock_response with self.assertRaises(Wp1ScoreProcessingError): - list(wiki_languages()) + list(scores.wiki_languages()) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + def test_get_pageview_url(self, mock_datetime): + actual = scores.get_pageview_url() + self.assertEqual( + 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' + '2024/2024-04/pageviews-202404-user.bz2', actual) + + @patch('wp1.scores.requests.get') + def test_get_pageview_response(self, mock_get): + context = MagicMock() + expected = MagicMock() + context.__enter__.return_value = expected + mock_get.return_value = context + with scores.get_pageview_response() as actual: + self.assertEqual(expected, actual) + + @patch('wp1.scores.requests.get') + def test_get_pageview_response_non_success(self, mock_get): + context = MagicMock() + resp = MagicMock() + resp.raise_for_status.side_effect = requests.exceptions.HTTPError + context.__enter__.return_value = resp + 
mock_get.return_value = context + with self.assertRaises( + Wp1ScoreProcessingError), scores.get_pageview_response() as actual: + pass + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.get_pageview_response') + def test_raw_pageviews(self, mock_get_response, mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (self.pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + actual = b'\n'.join(scores.raw_pageviews()) + + self.assertEqual(self.pageview_text, actual) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.get_pageview_response') + def test_raw_pageviews(self, mock_get_response, mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (self.pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + actual = b'\n'.join(scores.raw_pageviews()) + + self.assertEqual(self.pageview_text, actual) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.get_pageview_response') + def test_raw_pageviews_decode(self, mock_get_response, mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (self.pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + actual = '\n'.join(scores.raw_pageviews(decode=True)) + + self.assertEqual(self.pageview_text.decode('utf-8'), actual) + + @patch('wp1.scores.get_pageview_response') + def test_pageview_components(self, mock_get_response): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (self.pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + expected = [ + (b'af', b'1701', b'1402', 7), + (b'af', b'1702', b'1404', 4), + (b'af', b'1703', b'1405', 4), + 
(b'af', b'1704', b'1406', 6), + (b'af', b'1705', b'1407', 4), + (b'af', b'1706', b'1408', 12), + (b'af', b'1707', b'1409', 5), + (b'af', b'1708', b'1410', 5), + (b'af', b'1709', b'1411', 4), + (b'af', b'\xc3\xa9\xc3\xa1\xc3\xb8', b'3774', 3), + (b'af', b'1711', b'752', 5), + (b'af', b'1712', b'753', 22), + ] + + actual = list(scores.pageview_components()) + + self.assertEqual(expected, actual) + + def test_update_db_pageviews(self): + scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234, + 100) + + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT * FROM page_scores WHERE ps_page_id = 1234') + result = cursor.fetchone() + self.assertIsNotNone(result) + self.assertEqual(result['ps_lang'], b'en') + self.assertEqual(result['ps_article'], b'Statue_of_Liberty') + self.assertEqual(result['ps_page_id'], 1234) + self.assertEqual(result['ps_views'], 100) + + def test_update_db_pageviews_existing(self): + with self.wp10db.cursor() as cursor: + cursor.execute( + 'INSERT INTO page_scores VALUES ("en", "Statue_of_Liberty", 1234, 100' + ) + + scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234, + 200) + + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT * FROM page_scores WHERE ps_page_id = 1234') + result = cursor.fetchone() + self.assertIsNotNone(result) + self.assertEqual(result['ps_lang'], b'en') + self.assertEqual(result['ps_article'], b'Statue_of_Liberty') + self.assertEqual(result['ps_page_id'], 1234) + self.assertEqual(result['ps_views'], 200) From 8c65d13a96e9f3abf6157786f37acc8b6546ebd0 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 17 Aug 2024 08:10:05 -0700 Subject: [PATCH 09/13] Download full pageviews file, HTTP streaming was not working --- docker-compose.yml | 1 + docker/dev-db/README.md | 2 +- wp1/credentials.py.dev.e2e | 4 ++ wp1/credentials.py.example | 10 ++++ wp1/scores.py | 100 +++++++++++++++++++++++++++++-------- wp1/scores_test.py | 95 ++++++++++++----------------------- 6 files 
changed, 127 insertions(+), 85 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 91b3c548..6de643bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,7 @@ services: - /data/wp1bot/credentials.py:/usr/src/app/wp1/credentials.py - /data/wp1bot/db/yoyo.ini:/usr/src/app/db/production/yoyo.ini - /srv/log/wp1bot/:/var/log/wp1bot/ + - /srv/data/wp1bot/:/var/data/wp1bot/ links: - redis logging: diff --git a/docker/dev-db/README.md b/docker/dev-db/README.md index 2d27255c..8a50cb80 100644 --- a/docker/dev-db/README.md +++ b/docker/dev-db/README.md @@ -14,7 +14,7 @@ The dev database will need to be migrated in the following circumstances: To migrate, cd to the `db/dev` directory and run the following command: ```bash -PYTHONPATH=$PYTHONPATH:../.. yoyo apply +PYTHONPATH=$PYTHONPATH:../.. pipenv run yoyo apply ``` The `PYTHONPATH` environment variable is necessary because some of the migrations diff --git a/wp1/credentials.py.dev.e2e b/wp1/credentials.py.dev.e2e index 9855819b..f0e02680 100644 --- a/wp1/credentials.py.dev.e2e +++ b/wp1/credentials.py.dev.e2e @@ -39,6 +39,10 @@ CREDENTIALS = { 'secret': '', 'bucket': 'org-kiwix-dev-wp1', }, + 'FILE_PATH': { + # Path where pageviews.bz2 file (~3GB) will be downloaded. + 'pageviews': '/tmp/pageviews', + } }, Environment.TEST: {}, Environment.PRODUCTION: {} diff --git a/wp1/credentials.py.example b/wp1/credentials.py.example index 548fb89a..8acc9a17 100644 --- a/wp1/credentials.py.example +++ b/wp1/credentials.py.example @@ -122,6 +122,11 @@ CREDENTIALS = { # server, to ensure requests to the webhook endpoint are valid. 'hook_token': '', # EDIT this line }, + + 'FILE_PATH': { + # Path where pageviews.bz2 file (~3GB) will be downloaded. + 'pageviews': '/tmp/pageviews', + } }, # Environment for python nosetests. In this environment, only the MySQL database @@ -253,4 +258,9 @@ CREDENTIALS = { # # server, to ensure requests to the webhook endpoint are valid. 
# 'hook_token': '', # EDIT this line # }, + + # 'FILE_PATH': { + # # Path where pageviews.bz2 file (~3GB) will be downloaded. + # 'pageviews': '/var/data/wp1bot/pageviews', + # } } diff --git a/wp1/scores.py b/wp1/scores.py index 09cd1d08..ffc1c397 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -1,6 +1,8 @@ from bz2 import BZ2Decompressor from collections import namedtuple from contextlib import contextmanager +import logging +import os.path import csv from datetime import datetime, timedelta @@ -14,11 +16,22 @@ PageviewRecord = namedtuple('PageviewRecord', ['lang', 'name', 'page_id', 'views']) +logger = logging.getLogger(__name__) + +try: + from wp1.credentials import ENV, CREDENTIALS +except ImportError: + logger.exception('The file credentials.py must be populated manually in ' + 'order to download pageviews') + CREDENTIALS = None + ENV = None + def wiki_languages(): r = requests.get( 'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv', - headers={'User-Agent': WP1_USER_AGENT}) + headers={'User-Agent': WP1_USER_AGENT}, + ) try: r.raise_for_status() except requests.exceptions.HTTPError as e: @@ -31,44 +44,75 @@ def wiki_languages(): yield row[2] -def get_pageview_url(): +def get_pageview_url(prev=False): + weeks = 4 + if prev: + weeks = 8 + now = get_current_datetime() - dt = datetime(now.year, now.month, 1) - timedelta(weeks=4) + dt = datetime(now.year, now.month, 1) - timedelta(weeks=weeks) return dt.strftime( 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' '%Y/%Y-%m/pageviews-%Y%m-user.bz2') -@contextmanager -def get_pageview_response(): - url = get_pageview_url() - with requests.get(url, stream=True, - headers={'User-Agent': WP1_USER_AGENT}) as r: - try: - r.raise_for_status() - except requests.exceptions.HTTPError as e: - raise Wp1ScoreProcessingError('Could not retrieve pageview data') from e +def get_pageview_file_path(filename): + path = CREDENTIALS[ENV]['FILE_PATH']['pageviews'] + os.makedirs(path, 
exist_ok=True) + return os.path.join(path, filename) + + +def get_prev_file_path(): + prev_filename = get_pageview_url(prev=True).split('/')[-1] + return get_pageview_file_path(prev_filename) + + +def get_cur_file_path(): + cur_filename = get_pageview_url().split('/')[-1] + return get_pageview_file_path(cur_filename) + - yield r +def download_pageviews(): + # Clean up file from last month + prev_filepath = get_prev_file_path() + if os.path.exists(prev_filepath): + os.remove(prev_filepath) + + cur_filepath = get_cur_file_path() + if os.path.exists(cur_filepath): + # File already downloaded + return + + with requests.get(get_pageview_url(), stream=True) as r: + r.raise_for_status() + with open(PAGEVIEW_FILE_NAME, 'wb') as f: + # Read data in 8 KB chunks + for chunk in r.iter_content(chunk_size=8 * 1024): + f.write(chunk) def raw_pageviews(decode=False): def as_bytes(): - with get_pageview_response() as r: - decompressor = BZ2Decompressor() - trailing = b'' - # Read data in 32 MB chunks - for http_chunk in r.iter_content(chunk_size=32 * 1024 * 1024): - data = decompressor.decompress(http_chunk) + decompressor = BZ2Decompressor() + trailing = b'' + with open(get_cur_file_path(), 'rb') as f: + while True: + # Read data in 1 MB chunks + chunk = f.read(1024 * 1024) + if not chunk: + break + data = decompressor.decompress(chunk) lines = [line for line in data.split(b'\n') if line] if not lines: continue + # Reunite incomplete lines yield trailing + lines[0] yield from lines[1:-1] trailing = lines[-1] + # Nothing left, yield the last line yield trailing if decode: @@ -96,7 +140,6 @@ def pageview_components(): try: views = int(parts[4]) except ValueError: - # Views field wasn't int log.warning('Views field wasn\'t int in pageview dump: %r', line) continue @@ -130,10 +173,17 @@ def update_db_pageviews(wp10db, lang, article, page_id, views): def update_pageviews(filter_lang=None): + download_pageviews() + # Convert filter lang to bytes if necessary if filter_lang is not None 
and isinstance(filter_lang, str): filter_lang = filter_lang.encode('utf-8') + if filter_lang is None: + logger.info('Updating all pageviews') + else: + logger.info('Updating pageviews for %s', filter_lang.decode('utf-8')) + wp10db = wp10_connect() n = 0 for lang, article, page_id, views in pageview_components(): @@ -141,7 +191,15 @@ def update_pageviews(filter_lang=None): update_db_pageviews(wp10db, lang, article, page_id, views) n += 1 - if n >= 10000: + if n >= 50000: + logger.debug('Committing') wp10db.commit() n = 0 wp10db.commit() + logger.info('Done') + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, + format='%(levelname)s %(asctime)s: %(message)s') + update_pageviews() diff --git a/wp1/scores_test.py b/wp1/scores_test.py index 984abf7e..19ea58f9 100644 --- a/wp1/scores_test.py +++ b/wp1/scores_test.py @@ -1,7 +1,7 @@ import bz2 from datetime import datetime import unittest -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, mock_open import requests @@ -10,9 +10,7 @@ from wp1.exceptions import Wp1ScoreProcessingError from wp1 import scores - -class ScoresTest(BaseWpOneDbTest): - pageview_text = b'''af.wikipedia 1701 1402 desktop 4 F1 +pageview_text = b'''af.wikipedia 1701 1402 desktop 4 F1 af.wikipedia 1701 1402 mobile-web 3 O2T1 af.wikipedia 1702 1404 mobile-web 3 L1O2 af.wikipedia 1702 1404 desktop 1 P1 @@ -37,9 +35,10 @@ class ScoresTest(BaseWpOneDbTest): af.wikipedia 1712 753 mobile-web 2 O2 af.wikipedia 1712 753 desktop 20 E12J7U1''' - @property - def pageview_bz2(self): - return bz2.compress(self.pageview_text) +pageview_bz2 = bz2.compress(pageview_text) + + +class ScoresTest(BaseWpOneDbTest): @patch('wp1.scores.requests') def test_wiki_languages(self, mock_requests): @@ -85,73 +84,43 @@ def test_get_pageview_url(self, mock_datetime): 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' '2024/2024-04/pageviews-202404-user.bz2', actual) - @patch('wp1.scores.requests.get') - def 
test_get_pageview_response(self, mock_get): - context = MagicMock() - expected = MagicMock() - context.__enter__.return_value = expected - mock_get.return_value = context - with scores.get_pageview_response() as actual: - self.assertEqual(expected, actual) - - @patch('wp1.scores.requests.get') - def test_get_pageview_response_non_success(self, mock_get): - context = MagicMock() - resp = MagicMock() - resp.raise_for_status.side_effect = requests.exceptions.HTTPError - context.__enter__.return_value = resp - mock_get.return_value = context - with self.assertRaises( - Wp1ScoreProcessingError), scores.get_pageview_response() as actual: - pass + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + def test_get_pageview_url_prev(self, mock_datetime): + actual = scores.get_pageview_url(prev=True) + self.assertEqual( + 'https://dumps.wikimedia.org/other/pageview_complete/monthly/' + '2024/2024-03/pageviews-202403-user.bz2', actual) @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) - @patch('wp1.scores.get_pageview_response') - def test_raw_pageviews(self, mock_get_response, mock_datetime): - context = MagicMock() - resp = MagicMock() - resp.iter_content.return_value = (self.pageview_bz2,) - context.__enter__.return_value = resp - mock_get_response.return_value = context + def test_get_prev_file_path(self, mock_datetime): + actual = scores.get_prev_file_path() + self.assertEqual('/tmp/pageviews/pageviews-202403-user.bz2', actual) - actual = b'\n'.join(scores.raw_pageviews()) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + def test_get_cur_file_path(self, mock_datetime): + actual = scores.get_cur_file_path() + self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual) - self.assertEqual(self.pageview_text, actual) + def test_get_pageview_file_path(self): + actual = scores.get_pageview_file_path('pageviews-202404-user.bz2') + 
self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual) @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) - @patch('wp1.scores.get_pageview_response') - def test_raw_pageviews(self, mock_get_response, mock_datetime): - context = MagicMock() - resp = MagicMock() - resp.iter_content.return_value = (self.pageview_bz2,) - context.__enter__.return_value = resp - mock_get_response.return_value = context - + @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) + def test_raw_pageviews(self, mock_file_open, mock_datetime): actual = b'\n'.join(scores.raw_pageviews()) - self.assertEqual(self.pageview_text, actual) + self.assertEqual(pageview_text, actual) @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) - @patch('wp1.scores.get_pageview_response') - def test_raw_pageviews_decode(self, mock_get_response, mock_datetime): - context = MagicMock() - resp = MagicMock() - resp.iter_content.return_value = (self.pageview_bz2,) - context.__enter__.return_value = resp - mock_get_response.return_value = context - + @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) + def test_raw_pageviews_decode(self, mock_file_open, mock_datetime): actual = '\n'.join(scores.raw_pageviews(decode=True)) - self.assertEqual(self.pageview_text.decode('utf-8'), actual) - - @patch('wp1.scores.get_pageview_response') - def test_pageview_components(self, mock_get_response): - context = MagicMock() - resp = MagicMock() - resp.iter_content.return_value = (self.pageview_bz2,) - context.__enter__.return_value = resp - mock_get_response.return_value = context + self.assertEqual(pageview_text.decode('utf-8'), actual) + @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) + def test_pageview_components(self, mock_file_open): expected = [ (b'af', b'1701', b'1402', 7), (b'af', b'1702', b'1404', 4), @@ -187,8 +156,8 @@ def test_update_db_pageviews(self): def 
test_update_db_pageviews_existing(self): with self.wp10db.cursor() as cursor: cursor.execute( - 'INSERT INTO page_scores VALUES ("en", "Statue_of_Liberty", 1234, 100' - ) + 'INSERT INTO page_scores (ps_lang, ps_article, ps_page_id, ps_views) ' + 'VALUES ("en", "Statue_of_Liberty", 1234, 100)') scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234, 200) From a0d3fbdead1e697dce2146e566f920cb56be631d Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 17 Aug 2024 08:31:39 -0700 Subject: [PATCH 10/13] Update e2e credentials --- wp1/credentials.py.e2e | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wp1/credentials.py.e2e b/wp1/credentials.py.e2e index 25708211..9240e37e 100644 --- a/wp1/credentials.py.e2e +++ b/wp1/credentials.py.e2e @@ -71,6 +71,10 @@ CREDENTIALS = { 'user': 'farmuser', 'password': 'farmpass', 'hook_token': 'hook-token-abc', + }, + 'FILE_PATH': { + # Path where pageviews.bz2 file (~3GB) will be downloaded. + 'pageviews': '/tmp/pageviews', } }, Environment.PRODUCTION: {}, From a62ca072828a8f08e81557c44f284b6079c7fdd9 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 17 Aug 2024 09:05:24 -0700 Subject: [PATCH 11/13] Add more tests --- wp1/credentials.py.example | 5 ++ wp1/scores.py | 8 +-- wp1/scores_test.py | 134 +++++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 4 deletions(-) diff --git a/wp1/credentials.py.example b/wp1/credentials.py.example index 8acc9a17..9a3b922c 100644 --- a/wp1/credentials.py.example +++ b/wp1/credentials.py.example @@ -178,6 +178,11 @@ CREDENTIALS = { 'password': 'farmpass', 'hook_token': 'hook-token-abc', } + + 'FILE_PATH': { + # Path where pageviews.bz2 file (~3GB) will be downloaded. + 'pageviews': '/tmp/pageviews', + } }, # EDIT: Remove the next line after you've provided actual production credentials. 
diff --git a/wp1/scores.py b/wp1/scores.py index ffc1c397..5bd0b3a7 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -85,7 +85,7 @@ def download_pageviews(): with requests.get(get_pageview_url(), stream=True) as r: r.raise_for_status() - with open(PAGEVIEW_FILE_NAME, 'wb') as f: + with open(cur_filepath, 'wb') as f: # Read data in 8 KB chunks for chunk in r.iter_content(chunk_size=8 * 1024): f.write(chunk) @@ -140,7 +140,7 @@ def pageview_components(): try: views = int(parts[4]) except ValueError: - log.warning('Views field wasn\'t int in pageview dump: %r', line) + logger.warning('Views field wasn\'t int in pageview dump: %r', line) continue if (tally is not None and tally.lang == lang and tally.name == name and @@ -172,7 +172,7 @@ def update_db_pageviews(wp10db, lang, article, page_id, views): }) -def update_pageviews(filter_lang=None): +def update_pageviews(filter_lang=None, commit_after=50000): download_pageviews() # Convert filter lang to bytes if necessary @@ -191,7 +191,7 @@ def update_pageviews(filter_lang=None): update_db_pageviews(wp10db, lang, article, page_id, views) n += 1 - if n >= 50000: + if n >= commit_after: logger.debug('Committing') wp10db.commit() n = 0 diff --git a/wp1/scores_test.py b/wp1/scores_test.py index 19ea58f9..00c21363 100644 --- a/wp1/scores_test.py +++ b/wp1/scores_test.py @@ -1,5 +1,6 @@ import bz2 from datetime import datetime +import os.path import unittest from unittest.mock import patch, MagicMock, mock_open @@ -35,7 +36,33 @@ af.wikipedia 1712 753 mobile-web 2 O2 af.wikipedia 1712 753 desktop 20 E12J7U1''' +pageview_error_text = b'''af.wikipedia 1701 1402 desktop 4 F1 +af.wikipedia 1701 1402 mobile-web 3 O2T1 +af.wikipedia 1702 1404 mobile-web 3 L1O2 +af.wikipedia 1702 1404 desktop 1 P1 +af.wikipedia - 1405 mobile-web 3 C1O2 +af.wikipedia - 1405 desktop 1 ^1 +af.wikipedia 1704 1406 mobile-web 4 A1O2T1 +af.wikipedia 1704 1406 desktop 2 F1 +af.wikipedia 1705 1407 mobile-web 3 O3 +af.wikipedia 1705 1407 desktop 1 F1 
+af.wikipedia 1706 desktop 8 H8 +af.wikipedia 1706 mobile-web 4 C1O2Y1 +af.wikipedia 1707 1409 mobile-web 2 O2 +af.wikipedia 1707 1409 desktop 3 H1J1 +af.wikipedia 1708 1410 desktop X V1]1 +af.wikipedia 1708 1410 mobile-web Z O1 +af.wikipedia 1709 1411 desktop 2 F1 +af.wikipedia 1709 1411 mobile-web 2 O2 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 1 A1 +af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 2 F2 +af.wikipedia 1711 752 mobile-web 4 C1O2U1 +af.wikipedia 1711 752 desktop 1 K1 +af.wikipedia 1712 753 mobile-web 2 O2 +af.wikipedia 1712 753 desktop 20 E12J7U1''' + pageview_bz2 = bz2.compress(pageview_text) +pageview_error_bz2 = bz2.compress(pageview_error_text) class ScoresTest(BaseWpOneDbTest): @@ -105,6 +132,61 @@ def test_get_pageview_file_path(self): actual = scores.get_pageview_file_path('pageviews-202404-user.bz2') self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews(self, mock_get_response, mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_cur_file_path() + if os.path.exists(file_path): + os.remove(file_path) + + scores.download_pageviews() + + mock_get_response.assert_called_once() + self.assertTrue(os.path.exists(file_path)) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews_remove_prev(self, mock_get_response, + mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_prev_file_path() + # Create empty file + open(file_path, 'a').close() + + 
scores.download_pageviews() + + self.assertFalse(os.path.exists(file_path)) + + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) + @patch('wp1.scores.requests.get') + def test_download_pageviews_skip_existing(self, mock_get_response, + mock_datetime): + context = MagicMock() + resp = MagicMock() + resp.iter_content.return_value = (pageview_bz2,) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + file_path = scores.get_cur_file_path() + # Create empty file + open(file_path, 'a').close() + + scores.download_pageviews() + + mock_get_response.assert_not_called() + self.assertTrue(os.path.exists(file_path)) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) def test_raw_pageviews(self, mock_file_open, mock_datetime): @@ -140,6 +222,24 @@ def test_pageview_components(self, mock_file_open): self.assertEqual(expected, actual) + @patch("builtins.open", new_callable=mock_open, read_data=pageview_error_bz2) + def test_pageview_components_errors(self, mock_file_open): + expected = [ + (b'af', b'1701', b'1402', 7), + (b'af', b'1702', b'1404', 4), + (b'af', b'1704', b'1406', 6), + (b'af', b'1705', b'1407', 4), + (b'af', b'1707', b'1409', 5), + (b'af', b'1709', b'1411', 4), + (b'af', b'\xc3\xa9\xc3\xa1\xc3\xb8', b'3774', 3), + (b'af', b'1711', b'752', 5), + (b'af', b'1712', b'753', 22), + ] + + actual = list(scores.pageview_components()) + + self.assertEqual(expected, actual) + def test_update_db_pageviews(self): scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234, 100) @@ -170,3 +270,37 @@ def test_update_db_pageviews_existing(self): self.assertEqual(result['ps_article'], b'Statue_of_Liberty') self.assertEqual(result['ps_page_id'], 1234) self.assertEqual(result['ps_views'], 200) + + @patch('wp1.scores.download_pageviews') + @patch('wp1.scores.pageview_components') + def 
test_update_pageviews(self, mock_components, mock_download): + mock_components.return_value = ( + (b'en', b'Statue_of_Liberty', 100, 100), + (b'en', b'Eiffel_Tower', 200, 200), + (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300), + ) + + scores.update_pageviews(commit_after=2) + + mock_download.assert_called_once() + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT COUNT(*) as cnt FROM page_scores') + n = cursor.fetchone()['cnt'] + self.assertEqual(3, n) + + @patch('wp1.scores.download_pageviews') + @patch('wp1.scores.pageview_components') + def test_update_pageviews_filter(self, mock_components, mock_download): + mock_components.return_value = ( + (b'en', b'Statue_of_Liberty', 100, 100), + (b'en', b'Eiffel_Tower', 200, 200), + (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300), + ) + + scores.update_pageviews(filter_lang='fr') + + mock_download.assert_called_once() + with self.wp10db.cursor() as cursor: + cursor.execute('SELECT COUNT(*) as cnt FROM page_scores') + n = cursor.fetchone()['cnt'] + self.assertEqual(1, n) From a844c5222f97a12b64536c00182b69ac4cef16cf Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Mon, 19 Aug 2024 12:33:06 -0700 Subject: [PATCH 12/13] Code review fixes, with test --- wp1/scores.py | 16 +++++++++++----- wp1/scores_test.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/wp1/scores.py b/wp1/scores.py index 5bd0b3a7..2d1da7cc 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -31,6 +31,7 @@ def wiki_languages(): r = requests.get( 'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv', headers={'User-Agent': WP1_USER_AGENT}, + timeout=60, ) try: r.raise_for_status() @@ -83,12 +84,17 @@ def download_pageviews(): # File already downloaded return - with requests.get(get_pageview_url(), stream=True) as r: + with requests.get(get_pageview_url(), stream=True, timeout=60) as r: r.raise_for_status() - with open(cur_filepath, 'wb') as f: - # Read data 
in 8 KB chunks - for chunk in r.iter_content(chunk_size=8 * 1024): - f.write(chunk) + try: + with open(cur_filepath, 'wb') as f: + # Read data in 8 KB chunks + for chunk in r.iter_content(chunk_size=8 * 1024): + f.write(chunk) + except Exception as e: + logger.exception('Error downloading pageviews') + os.remove(cur_filepath) + raise Wp1ScoreProcessingError('Error downloading pageviews') from e def raw_pageviews(decode=False): diff --git a/wp1/scores_test.py b/wp1/scores_test.py index 00c21363..b8ca010a 100644 --- a/wp1/scores_test.py +++ b/wp1/scores_test.py @@ -187,6 +187,22 @@ def test_download_pageviews_skip_existing(self, mock_get_response, mock_get_response.assert_not_called() self.assertTrue(os.path.exists(file_path)) + @patch('wp1.scores.requests.get') + def test_download_pageviews_handle_error(self, mock_get_response): + context = MagicMock() + resp = MagicMock() + # Return partial data and then raise an exception + resp.iter_content.side_effect = (pageview_bz2[:100], + requests.exceptions.HTTPError) + context.__enter__.return_value = resp + mock_get_response.return_value = context + + with self.assertRaises(Wp1ScoreProcessingError): + scores.download_pageviews() + + file_path = scores.get_cur_file_path() + self.assertFalse(os.path.exists(file_path)) + @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25)) @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2) def test_raw_pageviews(self, mock_file_open, mock_datetime): From c44be718fb3e048906fcbd5bfc2dd5c2d19801ce Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Mon, 19 Aug 2024 19:20:44 -0700 Subject: [PATCH 13/13] Download in 8 MB chunks --- wp1/scores.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wp1/scores.py b/wp1/scores.py index 2d1da7cc..8688053f 100644 --- a/wp1/scores.py +++ b/wp1/scores.py @@ -88,8 +88,8 @@ def download_pageviews(): r.raise_for_status() try: with open(cur_filepath, 'wb') as f: - # Read data in 8 KB chunks - for 
chunk in r.iter_content(chunk_size=8 * 1024): + # Read data in 8 MB chunks + for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): f.write(chunk) except Exception as e: logger.exception('Error downloading pageviews')