diff --git a/db/migrations/20240429_01_qtSms-add-scores-table.py b/db/migrations/20240429_01_qtSms-add-scores-table.py
new file mode 100644
index 00000000..a4a54a8c
--- /dev/null
+++ b/db/migrations/20240429_01_qtSms-add-scores-table.py
@@ -0,0 +1,22 @@
+"""
+Add scores table
+"""
+
+from yoyo import step
+
+__depends__ = {'20230528_02_pL6ka-add-b-selection-zim-version-to-builders'}
+
+steps = [
+    step(
+        '''CREATE TABLE page_scores (
+             ps_lang VARBINARY(255),
+             ps_page_id INTEGER NOT NULL,
+             ps_article VARBINARY(1024),
+             ps_views INTEGER DEFAULT 0,
+             ps_links INTEGER DEFAULT 0,
+             ps_lang_links INTEGER DEFAULT 0,
+             ps_score INTEGER DEFAULT 0,
+             PRIMARY KEY (`ps_lang`, `ps_page_id`),
+             KEY `lang_article` (`ps_lang`, `ps_article`)
+           )''', 'DROP TABLE page_scores')
+]
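This migration only creates the table: `ps_views` is populated by the new `wp1/scores.py` below, while nothing in this patch writes `ps_links`, `ps_lang_links`, or `ps_score` yet. As a minimal sketch of how the populated data could be read back — `top_viewed` is a hypothetical helper, not part of this patch, assuming the dict-style cursors used elsewhere in this repo:

```python
def top_viewed(wp10db, lang=b'en', limit=10):
  # page_scores is keyed on (ps_lang, ps_page_id), and the lang_article
  # index covers lookups by (ps_lang, ps_article).
  with wp10db.cursor() as cursor:
    cursor.execute(
        '''SELECT ps_article, ps_views FROM page_scores
           WHERE ps_lang = %(lang)s
           ORDER BY ps_views DESC LIMIT %(limit)s''', {
               'lang': lang,
               'limit': limit
           })
    return cursor.fetchall()
```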
diff --git a/docker-compose.yml b/docker-compose.yml
index 91b3c548..6de643bc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,6 +14,7 @@ services:
       - /data/wp1bot/credentials.py:/usr/src/app/wp1/credentials.py
       - /data/wp1bot/db/yoyo.ini:/usr/src/app/db/production/yoyo.ini
       - /srv/log/wp1bot/:/var/log/wp1bot/
+      - /srv/data/wp1bot/:/var/data/wp1bot/
     links:
       - redis
     logging:
diff --git a/docker/dev-db/README.md b/docker/dev-db/README.md
index 2d27255c..8a50cb80 100644
--- a/docker/dev-db/README.md
+++ b/docker/dev-db/README.md
@@ -14,7 +14,7 @@ The dev database will need to be migrated in the following circumstances:
 To migrate, cd to the `db/dev` directory and run the following command:
 
 ```bash
-PYTHONPATH=$PYTHONPATH:../.. yoyo apply
+PYTHONPATH=$PYTHONPATH:../.. pipenv run yoyo apply
 ```
 
 The `PYTHONPATH` environment variable is necessary because some of the migrations
diff --git a/wp1/credentials.py.dev.e2e b/wp1/credentials.py.dev.e2e
index 9855819b..f0e02680 100644
--- a/wp1/credentials.py.dev.e2e
+++ b/wp1/credentials.py.dev.e2e
@@ -39,6 +39,10 @@ CREDENTIALS = {
       'secret': '',
       'bucket': 'org-kiwix-dev-wp1',
     },
+    'FILE_PATH': {
+      # Directory where the monthly pageviews dump (~3GB) will be downloaded.
+      'pageviews': '/tmp/pageviews',
+    }
   },
   Environment.TEST: {},
   Environment.PRODUCTION: {}
diff --git a/wp1/credentials.py.e2e b/wp1/credentials.py.e2e
index 25708211..9240e37e 100644
--- a/wp1/credentials.py.e2e
+++ b/wp1/credentials.py.e2e
@@ -71,6 +71,10 @@ CREDENTIALS = {
       'user': 'farmuser',
       'password': 'farmpass',
       'hook_token': 'hook-token-abc',
+    },
+    'FILE_PATH': {
+      # Directory where the monthly pageviews dump (~3GB) will be downloaded.
+      'pageviews': '/tmp/pageviews',
     }
   },
   Environment.PRODUCTION: {},
diff --git a/wp1/credentials.py.example b/wp1/credentials.py.example
index 548fb89a..9a3b922c 100644
--- a/wp1/credentials.py.example
+++ b/wp1/credentials.py.example
@@ -122,6 +122,11 @@ CREDENTIALS = {
       # server, to ensure requests to the webhook endpoint are valid.
       'hook_token': '',  # EDIT this line
     },
+
+    'FILE_PATH': {
+      # Directory where the monthly pageviews dump (~3GB) will be downloaded.
+      'pageviews': '/tmp/pageviews',
+    }
   },
 
   # Environment for python nosetests. In this environment, only the MySQL database
@@ -173,6 +178,11 @@ CREDENTIALS = {
     'password': 'farmpass',
     'hook_token': 'hook-token-abc',
-  }
+  },
+
+  'FILE_PATH': {
+    # Directory where the monthly pageviews dump (~3GB) will be downloaded.
+    'pageviews': '/tmp/pageviews',
+  }
 },
 
 # EDIT: Remove the next line after you've provided actual production credentials.
@@ -253,4 +263,9 @@ CREDENTIALS = {
 #   # server, to ensure requests to the webhook endpoint are valid.
 #   'hook_token': '',  # EDIT this line
 # },
+
+# 'FILE_PATH': {
+#   # Directory where the monthly pageviews dump (~3GB) will be downloaded.
+#   'pageviews': '/var/data/wp1bot/pageviews',
+# }
 }
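The same `FILE_PATH` block is added to each environment above; the lookup that consumes it is the one `wp1/scores.py` performs later in this patch. A minimal sketch of the contract these entries satisfy:

```python
from wp1.credentials import ENV, CREDENTIALS

# '/tmp/pageviews' in the dev and e2e configs above; in the commented-out
# production example, '/var/data/wp1bot/pageviews', which the new
# docker-compose.yml volume maps to /srv/data/wp1bot/ on the host.
pageviews_dir = CREDENTIALS[ENV]['FILE_PATH']['pageviews']
```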
diff --git a/wp1/exceptions.py b/wp1/exceptions.py
index 88214646..31f2ea7e 100644
--- a/wp1/exceptions.py
+++ b/wp1/exceptions.py
@@ -24,3 +24,7 @@ class ObjectNotFoundError(Wp1Error):
 
 class UserNotAuthorizedError(Wp1Error):
   pass
+
+
+class Wp1ScoreProcessingError(Wp1Error):
+  pass
diff --git a/wp1/scores.py b/wp1/scores.py
new file mode 100644
index 00000000..8688053f
--- /dev/null
+++ b/wp1/scores.py
@@ -0,0 +1,211 @@
+from bz2 import BZ2Decompressor
+from collections import namedtuple
+import csv
+from datetime import datetime, timedelta
+import logging
+import os.path
+
+import requests
+
+from wp1.constants import WP1_USER_AGENT
+from wp1.exceptions import Wp1ScoreProcessingError
+from wp1.time import get_current_datetime
+from wp1.wp10_db import connect as wp10_connect
+
+PageviewRecord = namedtuple('PageviewRecord',
+                            ['lang', 'name', 'page_id', 'views'])
+
+logger = logging.getLogger(__name__)
+
+try:
+  from wp1.credentials import ENV, CREDENTIALS
+except ImportError:
+  logger.exception('The file credentials.py must be populated manually in '
+                   'order to download pageviews')
+  CREDENTIALS = None
+  ENV = None
+
+
+def wiki_languages():
+  # Yields the language prefix of each wiki (the third CSV column), eg
+  # 'en', 'ceb', 'de', ...
+  r = requests.get(
+      'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv',
+      headers={'User-Agent': WP1_USER_AGENT},
+      timeout=60,
+  )
+  try:
+    r.raise_for_status()
+  except requests.exceptions.HTTPError as e:
+    raise Wp1ScoreProcessingError('Could not retrieve wiki list') from e
+
+  reader = csv.reader(r.text.splitlines())
+  # Skip the header row
+  next(reader, None)
+  for row in reader:
+    yield row[2]
+
+
+def get_pageview_url(prev=False):
+  # Four weeks back from the first of the current month always lands in the
+  # previous month, and eight weeks back in the month before that, because
+  # every month is at least 28 days long.
+  weeks = 4
+  if prev:
+    weeks = 8
+
+  now = get_current_datetime()
+  dt = datetime(now.year, now.month, 1) - timedelta(weeks=weeks)
+  return dt.strftime(
+      'https://dumps.wikimedia.org/other/pageview_complete/monthly/'
+      '%Y/%Y-%m/pageviews-%Y%m-user.bz2')
+
+
+def get_pageview_file_path(filename):
+  path = CREDENTIALS[ENV]['FILE_PATH']['pageviews']
+  os.makedirs(path, exist_ok=True)
+  return os.path.join(path, filename)
+
+
+def get_prev_file_path():
+  prev_filename = get_pageview_url(prev=True).split('/')[-1]
+  return get_pageview_file_path(prev_filename)
+
+
+def get_cur_file_path():
+  cur_filename = get_pageview_url().split('/')[-1]
+  return get_pageview_file_path(cur_filename)
+
+
+def download_pageviews():
+  # Clean up the file from last month
+  prev_filepath = get_prev_file_path()
+  if os.path.exists(prev_filepath):
+    os.remove(prev_filepath)
+
+  cur_filepath = get_cur_file_path()
+  if os.path.exists(cur_filepath):
+    # File already downloaded
+    return
+
+  with requests.get(get_pageview_url(), stream=True, timeout=60) as r:
+    r.raise_for_status()
+    try:
+      with open(cur_filepath, 'wb') as f:
+        # Read data in 8 MB chunks
+        for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
+          f.write(chunk)
+    except Exception as e:
+      logger.exception('Error downloading pageviews')
+      # Don't leave a partially written dump on disk.
+      os.remove(cur_filepath)
+      raise Wp1ScoreProcessingError('Error downloading pageviews') from e
+
+
+def raw_pageviews(decode=False):
+
+  def as_bytes():
+    decompressor = BZ2Decompressor()
+    trailing = b''
+    with open(get_cur_file_path(), 'rb') as f:
+      while True:
+        # Read data in 1 MB chunks
+        chunk = f.read(1024 * 1024)
+        if not chunk:
+          break
+        data = trailing + decompressor.decompress(chunk)
+        lines = data.split(b'\n')
+
+        # The last element is either empty (the data ended on a newline) or
+        # an incomplete line. Hold it back until the next chunk completes it.
+        trailing = lines.pop()
+        yield from (line for line in lines if line)
+
+      # Nothing left, yield the last line
+      if trailing:
+        yield trailing
+
+  if decode:
+    for line in as_bytes():
+      yield line.decode('utf-8')
+  else:
+    yield from as_bytes()
+
+
+def pageview_components():
+  # Yields (lang, article name, page id, views) tuples, merging the counts
+  # that the dump reports separately for each access site (desktop,
+  # mobile-web, etc) of the same page.
+  tally = None
+  for line in raw_pageviews():
+    parts = line.split(b' ')
+    if len(parts) != 6 or parts[2] == b'null':
+      # Skip malformed lines and pages that don't have a page id
+      continue
+
+    if parts[1] == b'' or parts[1] == b'-':
+      # Skip pages that don't have a title
+      continue
+
+    lang = parts[0].split(b'.')[0]
+    name = parts[1]
+    page_id = parts[2]
+    try:
+      views = int(parts[4])
+    except ValueError:
+      logger.warning('Views field wasn\'t int in pageview dump: %r', line)
+      continue
+
+    if (tally is not None and tally.lang == lang and tally.name == name and
+        tally.page_id == page_id):
+      # This is a view on the same page from a different interface (mobile
+      # vs desktop etc)
+      new_dict = {**tally._asdict(), 'views': tally.views + views}
+      tally = PageviewRecord(**new_dict)
+    else:
+      # Language code, article name, article page id, views
+      if tally is not None:
+        yield tally.lang, tally.name, tally.page_id, tally.views
+      tally = PageviewRecord(lang, name, page_id, views)
+
+  # Yield the final tally, guarding against an empty dump
+  if tally is not None:
+    yield tally.lang, tally.name, tally.page_id, tally.views
+
+
+def update_db_pageviews(wp10db, lang, article, page_id, views):
+  with wp10db.cursor() as cursor:
+    cursor.execute(
+        '''INSERT INTO page_scores (ps_lang, ps_page_id, ps_article, ps_views)
+           VALUES (%(lang)s, %(page_id)s, %(article)s, %(views)s)
+           ON DUPLICATE KEY UPDATE ps_views = %(views)s
+        ''', {
+            'lang': lang,
+            'page_id': page_id,
+            'article': article,
+            'views': views
+        })
+
+
+def update_pageviews(filter_lang=None, commit_after=50000):
+  download_pageviews()
+
+  # Convert filter lang to bytes if necessary
+  if filter_lang is not None and isinstance(filter_lang, str):
+    filter_lang = filter_lang.encode('utf-8')
+
+  if filter_lang is None:
+    logger.info('Updating all pageviews')
+  else:
+    logger.info('Updating pageviews for %s', filter_lang.decode('utf-8'))
+
+  wp10db = wp10_connect()
+  n = 0
+  for lang, article, page_id, views in pageview_components():
+    if filter_lang is None or lang == filter_lang:
+      update_db_pageviews(wp10db, lang, article, page_id, views)
+
+      n += 1
+      if n >= commit_after:
+        logger.debug('Committing')
+        wp10db.commit()
+        n = 0
+  wp10db.commit()
+  logger.info('Done')
+
+
+if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format='%(levelname)s %(asctime)s: %(message)s')
+  update_pageviews()
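With the module in place, a typical invocation looks like the sketch below. The call signature is the one defined above; the French-only filter and the smaller commit interval are illustrative values, not defaults:

```python
from wp1 import scores

# Download this month's dump if it isn't already on disk, then upsert view
# counts for French pages only, committing every 10,000 rows.
scores.update_pageviews(filter_lang='fr', commit_after=10000)
```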
diff --git a/wp1/scores_test.py b/wp1/scores_test.py
new file mode 100644
index 00000000..b8ca010a
--- /dev/null
+++ b/wp1/scores_test.py
@@ -0,0 +1,322 @@
+import bz2
+from datetime import datetime
+import os.path
+from unittest.mock import patch, MagicMock, mock_open
+
+import requests
+
+from wp1.base_db_test import BaseWpOneDbTest
+from wp1.exceptions import Wp1ScoreProcessingError
+from wp1 import scores
+
+pageview_text = b'''af.wikipedia 1701 1402 desktop 4 F1
+af.wikipedia 1701 1402 mobile-web 3 O2T1
+af.wikipedia 1702 1404 mobile-web 3 L1O2
+af.wikipedia 1702 1404 desktop 1 P1
+af.wikipedia 1703 1405 mobile-web 3 C1O2
+af.wikipedia 1703 1405 desktop 1 ^1
+af.wikipedia 1704 1406 mobile-web 4 A1O2T1
+af.wikipedia 1704 1406 desktop 2 F1
+af.wikipedia 1705 1407 mobile-web 3 O3
+af.wikipedia 1705 1407 desktop 1 F1
+af.wikipedia 1706 1408 desktop 8 H8
+af.wikipedia 1706 1408 mobile-web 4 C1O2Y1
+af.wikipedia 1707 1409 mobile-web 2 O2
+af.wikipedia 1707 1409 desktop 3 H1J1
+af.wikipedia 1708 1410 desktop 4 V1]1
+af.wikipedia 1708 1410 mobile-web 1 O1
+af.wikipedia 1709 1411 desktop 2 F1
+af.wikipedia 1709 1411 mobile-web 2 O2
+af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 1 A1
+af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 2 F2
+af.wikipedia 1711 752 mobile-web 4 C1O2U1
+af.wikipedia 1711 752 desktop 1 K1
+af.wikipedia 1712 753 mobile-web 2 O2
+af.wikipedia 1712 753 desktop 20 E12J7U1'''
+
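+# Variant of pageview_text with three pages deliberately malformed, one per
+# validation in scores.pageview_components():
+#   - page 1405: the article title is '-', so the row has no usable title
+#   - page 1408: the page id column is missing entirely (only 5 fields)
+#   - page 1410: the views field is 'X'/'Z', which fails int() parsing
+# The _errors test below expects exactly these three pages to be dropped.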
+pageview_error_text = b'''af.wikipedia 1701 1402 desktop 4 F1
+af.wikipedia 1701 1402 mobile-web 3 O2T1
+af.wikipedia 1702 1404 mobile-web 3 L1O2
+af.wikipedia 1702 1404 desktop 1 P1
+af.wikipedia - 1405 mobile-web 3 C1O2
+af.wikipedia - 1405 desktop 1 ^1
+af.wikipedia 1704 1406 mobile-web 4 A1O2T1
+af.wikipedia 1704 1406 desktop 2 F1
+af.wikipedia 1705 1407 mobile-web 3 O3
+af.wikipedia 1705 1407 desktop 1 F1
+af.wikipedia 1706 desktop 8 H8
+af.wikipedia 1706 mobile-web 4 C1O2Y1
+af.wikipedia 1707 1409 mobile-web 2 O2
+af.wikipedia 1707 1409 desktop 3 H1J1
+af.wikipedia 1708 1410 desktop X V1]1
+af.wikipedia 1708 1410 mobile-web Z O1
+af.wikipedia 1709 1411 desktop 2 F1
+af.wikipedia 1709 1411 mobile-web 2 O2
+af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 1 A1
+af.wikipedia \xc3\xa9\xc3\xa1\xc3\xb8 3774 mobile-web 2 F2
+af.wikipedia 1711 752 mobile-web 4 C1O2U1
+af.wikipedia 1711 752 desktop 1 K1
+af.wikipedia 1712 753 mobile-web 2 O2
+af.wikipedia 1712 753 desktop 20 E12J7U1'''
+
+pageview_bz2 = bz2.compress(pageview_text)
+pageview_error_bz2 = bz2.compress(pageview_error_text)
+
+
+class ScoresTest(BaseWpOneDbTest):
+
+  @patch('wp1.scores.requests')
+  def test_wiki_languages(self, mock_requests):
+    mock_response = MagicMock()
+    mock_response.text = (
+        'id,lang,prefix,total,good,views,edits,users,admins,ts,loclang,images,'
+        'loclanglink,activeusers,version,si_mainpage,si_base,si_sitename,si_generator,'
+        'si_phpversion,si_phpsapi,si_dbtype,si_dbversion,si_rev,si_case,si_rights,'
+        'si_lang,si_fallback8bitEncoding,si_writeapi,si_timezone,si_timeoffset,'
+        'si_articlepath,si_scriptpath,si_script,si_variantarticlepath,si_server,'
+        'si_wikiid,si_time,method,http,status,ratio\n'
+        '2,English,en,60556624,6818615,63208806,1216695232,47328206,861,"2024-04-30 00:06:16",'
+        'English,916605,English_language,122676,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,'
+        ',,,,,,,,,,,,,,,8,200,a,0.1126\n'
+        '153,Cebuano,ceb,11228672,6118766,0,35037562,115188,5,"2024-04-30 00:01:34"'
+        ',"Sinugboanong Binisaya",1,Sinugboanon,149,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,'
+        ',,,,,,,,,,,,,,,,,,8,200,a,0.5449\n'
+        '10,German,de,8007675,2905495,8543798,242922549,4359000,174,"2024-04-30 00:08:06",'
+        'Deutsch,129233,Deutsch,17684,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,,,,,,,,,,,,,'
+        ',,,8,200,a,0.3628\n'
+        '1,French,fr,13037937,2608568,2234272,214217598,4914029,146,"2024-04-30 00:08:24",'
+        'Français,71651,Français,16929,1.28.0-wmf.13,,,,"MediaWiki 1.43.0-wmf.2",,,,,,,'
+        ',,,,,,,,,,,,,8,200,a,0.2001\n')
+    mock_requests.get.return_value = mock_response
+
+    actual = list(scores.wiki_languages())
+    self.assertEqual(['en', 'ceb', 'de', 'fr'], actual)
+
+  @patch('wp1.scores.requests')
+  def test_wiki_languages_raises_on_http_error(self, mock_requests):
+    mock_response = MagicMock()
+    mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError
+    mock_requests.exceptions.HTTPError = requests.exceptions.HTTPError
+    mock_requests.get.return_value = mock_response
+
+    with self.assertRaises(Wp1ScoreProcessingError):
+      list(scores.wiki_languages())
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  def test_get_pageview_url(self, mock_datetime):
+    actual = scores.get_pageview_url()
+    self.assertEqual(
+        'https://dumps.wikimedia.org/other/pageview_complete/monthly/'
+        '2024/2024-04/pageviews-202404-user.bz2', actual)
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  def test_get_pageview_url_prev(self, mock_datetime):
+    actual = scores.get_pageview_url(prev=True)
+    self.assertEqual(
+        'https://dumps.wikimedia.org/other/pageview_complete/monthly/'
+        '2024/2024-03/pageviews-202403-user.bz2', actual)
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  def test_get_prev_file_path(self, mock_datetime):
+    actual = scores.get_prev_file_path()
+    self.assertEqual('/tmp/pageviews/pageviews-202403-user.bz2', actual)
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  def test_get_cur_file_path(self, mock_datetime):
+    actual = scores.get_cur_file_path()
+    self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual)
+
+  def test_get_pageview_file_path(self):
+    actual = scores.get_pageview_file_path('pageviews-202404-user.bz2')
+    self.assertEqual('/tmp/pageviews/pageviews-202404-user.bz2', actual)
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  @patch('wp1.scores.requests.get')
+  def test_download_pageviews(self, mock_get_response, mock_datetime):
+    context = MagicMock()
+    resp = MagicMock()
+    resp.iter_content.return_value = (pageview_bz2,)
+    context.__enter__.return_value = resp
+    mock_get_response.return_value = context
+
+    file_path = scores.get_cur_file_path()
+    if os.path.exists(file_path):
+      os.remove(file_path)
+
+    scores.download_pageviews()
+
+    mock_get_response.assert_called_once()
+    self.assertTrue(os.path.exists(file_path))
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  @patch('wp1.scores.requests.get')
+  def test_download_pageviews_remove_prev(self, mock_get_response,
+                                          mock_datetime):
+    context = MagicMock()
+    resp = MagicMock()
+    resp.iter_content.return_value = (pageview_bz2,)
+    context.__enter__.return_value = resp
+    mock_get_response.return_value = context
+
+    file_path = scores.get_prev_file_path()
+    # Create empty file
+    open(file_path, 'a').close()
+
+    scores.download_pageviews()
+
+    self.assertFalse(os.path.exists(file_path))
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  @patch('wp1.scores.requests.get')
+  def test_download_pageviews_skip_existing(self, mock_get_response,
+                                            mock_datetime):
+    context = MagicMock()
+    resp = MagicMock()
+    resp.iter_content.return_value = (pageview_bz2,)
+    context.__enter__.return_value = resp
+    mock_get_response.return_value = context
+
+    file_path = scores.get_cur_file_path()
+    # Create empty file
+    open(file_path, 'a').close()
+
+    scores.download_pageviews()
+
+    mock_get_response.assert_not_called()
+    self.assertTrue(os.path.exists(file_path))
+
+  @patch('wp1.scores.requests.get')
+  def test_download_pageviews_handle_error(self, mock_get_response):
+    context = MagicMock()
+    resp = MagicMock()
+    # Return partial data and then raise an exception
+    resp.iter_content.side_effect = (pageview_bz2[:100],
+                                     requests.exceptions.HTTPError)
+    context.__enter__.return_value = resp
+    mock_get_response.return_value = context
+
+    with self.assertRaises(Wp1ScoreProcessingError):
+      scores.download_pageviews()
+
+    file_path = scores.get_cur_file_path()
+    self.assertFalse(os.path.exists(file_path))
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  @patch('builtins.open', new_callable=mock_open, read_data=pageview_bz2)
+  def test_raw_pageviews(self, mock_file_open, mock_datetime):
+    actual = b'\n'.join(scores.raw_pageviews())
+
+    self.assertEqual(pageview_text, actual)
+
+  @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
+  @patch('builtins.open', new_callable=mock_open, read_data=pageview_bz2)
+  def test_raw_pageviews_decode(self, mock_file_open, mock_datetime):
+    actual = '\n'.join(scores.raw_pageviews(decode=True))
+
+    self.assertEqual(pageview_text.decode('utf-8'), actual)
+
+  @patch('builtins.open', new_callable=mock_open, read_data=pageview_bz2)
+  def test_pageview_components(self, mock_file_open):
+    expected = [
+        (b'af', b'1701', b'1402', 7),
+        (b'af', b'1702', b'1404', 4),
+        (b'af', b'1703', b'1405', 4),
+        (b'af', b'1704', b'1406', 6),
+        (b'af', b'1705', b'1407', 4),
+        (b'af', b'1706', b'1408', 12),
+        (b'af', b'1707', b'1409', 5),
+        (b'af', b'1708', b'1410', 5),
+        (b'af', b'1709', b'1411', 4),
+        (b'af', b'\xc3\xa9\xc3\xa1\xc3\xb8', b'3774', 3),
+        (b'af', b'1711', b'752', 5),
+        (b'af', b'1712', b'753', 22),
+    ]
+
+    actual = list(scores.pageview_components())
+
+    self.assertEqual(expected, actual)
+
+  @patch('builtins.open', new_callable=mock_open, read_data=pageview_error_bz2)
+  def test_pageview_components_errors(self, mock_file_open):
+    expected = [
+        (b'af', b'1701', b'1402', 7),
+        (b'af', b'1702', b'1404', 4),
+        (b'af', b'1704', b'1406', 6),
+        (b'af', b'1705', b'1407', 4),
+        (b'af', b'1707', b'1409', 5),
+        (b'af', b'1709', b'1411', 4),
+        (b'af', b'\xc3\xa9\xc3\xa1\xc3\xb8', b'3774', 3),
+        (b'af', b'1711', b'752', 5),
+        (b'af', b'1712', b'753', 22),
+    ]
+
+    actual = list(scores.pageview_components())
+
+    self.assertEqual(expected, actual)
+
+  def test_update_db_pageviews(self):
+    scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234,
+                               100)
+
+    with self.wp10db.cursor() as cursor:
+      cursor.execute('SELECT * FROM page_scores WHERE ps_page_id = 1234')
+      result = cursor.fetchone()
+    self.assertIsNotNone(result)
+    self.assertEqual(result['ps_lang'], b'en')
+    self.assertEqual(result['ps_article'], b'Statue_of_Liberty')
+    self.assertEqual(result['ps_page_id'], 1234)
+    self.assertEqual(result['ps_views'], 100)
+
+  def test_update_db_pageviews_existing(self):
+    with self.wp10db.cursor() as cursor:
+      cursor.execute(
+          'INSERT INTO page_scores (ps_lang, ps_article, ps_page_id, ps_views) '
+          'VALUES ("en", "Statue_of_Liberty", 1234, 100)')
+
+    scores.update_db_pageviews(self.wp10db, 'en', 'Statue_of_Liberty', 1234,
+                               200)
+
+    with self.wp10db.cursor() as cursor:
+      cursor.execute('SELECT * FROM page_scores WHERE ps_page_id = 1234')
+      result = cursor.fetchone()
+    self.assertIsNotNone(result)
+    self.assertEqual(result['ps_lang'], b'en')
+    self.assertEqual(result['ps_article'], b'Statue_of_Liberty')
+    self.assertEqual(result['ps_page_id'], 1234)
+    self.assertEqual(result['ps_views'], 200)
+
+  @patch('wp1.scores.download_pageviews')
+  @patch('wp1.scores.pageview_components')
+  def test_update_pageviews(self, mock_components, mock_download):
+    mock_components.return_value = (
+        (b'en', b'Statue_of_Liberty', 100, 100),
+        (b'en', b'Eiffel_Tower', 200, 200),
+        (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300),
+    )
+
+    scores.update_pageviews(commit_after=2)
+
+    mock_download.assert_called_once()
+    with self.wp10db.cursor() as cursor:
+      cursor.execute('SELECT COUNT(*) as cnt FROM page_scores')
+      n = cursor.fetchone()['cnt']
+    self.assertEqual(3, n)
+
+  @patch('wp1.scores.download_pageviews')
+  @patch('wp1.scores.pageview_components')
+  def test_update_pageviews_filter(self, mock_components, mock_download):
+    mock_components.return_value = (
+        (b'en', b'Statue_of_Liberty', 100, 100),
+        (b'en', b'Eiffel_Tower', 200, 200),
+        (b'fr', b'George-\xc3\x89tienne_Cartier_Monument', 300, 300),
+    )
+
+    scores.update_pageviews(filter_lang='fr')
+
+    mock_download.assert_called_once()
+    with self.wp10db.cursor() as cursor:
+      cursor.execute('SELECT COUNT(*) as cnt FROM page_scores')
+      n = cursor.fetchone()['cnt']
+    self.assertEqual(1, n)
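As a cross-check of the tallying these tests exercise, here is a self-contained sketch (assuming a populated credentials.py, which `wp1.scores` needs at import time) of how the first expected tuple arises:

```python
import bz2
from unittest.mock import mock_open, patch

from wp1 import scores

# The first three rows of pageview_text above: two access sites for page id
# 1402, then the start of page id 1404, which flushes the 1402 tally.
data = bz2.compress(b'af.wikipedia 1701 1402 desktop 4 F1\n'
                    b'af.wikipedia 1701 1402 mobile-web 3 O2T1\n'
                    b'af.wikipedia 1702 1404 mobile-web 3 L1O2')

with patch('builtins.open', mock_open(read_data=data)):
  first = next(scores.pageview_components())

# 4 desktop views + 3 mobile-web views merge into a single record.
assert first == (b'af', b'1701', b'1402', 7)
```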
diff --git a/wp10_test.down.sql b/wp10_test.down.sql
index dff53a40..245a20b2 100644
--- a/wp10_test.down.sql
+++ b/wp10_test.down.sql
@@ -12,3 +12,4 @@ DROP TABLE IF EXISTS `builders`;
 DROP TABLE IF EXISTS `selections`;
 DROP TABLE IF EXISTS `custom`;
 DROP TABLE IF EXISTS `zim_files`;
+DROP TABLE IF EXISTS `page_scores`;
diff --git a/wp10_test.up.sql b/wp10_test.up.sql
index 4bc821b2..59f63fb6 100644
--- a/wp10_test.up.sql
+++ b/wp10_test.up.sql
@@ -138,6 +138,18 @@ CREATE TABLE zim_files (
   z_description tinyblob
 );
 
+CREATE TABLE `page_scores` (
+  `ps_lang` varbinary(255) NOT NULL,
+  `ps_page_id` int(11) NOT NULL,
+  `ps_article` varbinary(1024) DEFAULT NULL,
+  `ps_views` int(11) DEFAULT 0,
+  `ps_links` int(11) DEFAULT 0,
+  `ps_lang_links` int(11) DEFAULT 0,
+  `ps_score` int(11) DEFAULT 0,
+  PRIMARY KEY (`ps_lang`,`ps_page_id`),
+  KEY `lang_article` (`ps_lang`,`ps_article`)
+);
+
 INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) VALUES ('importance', 'Unknown-Class', 0);
 INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) VALUES ('importance', 'NA-Class', 50);
 INSERT INTO `global_rankings` (gr_type, gr_rating, gr_ranking) VALUES ('importance', 'Low-Class', 100);
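One last wiring note: this test schema mirrors the yoyo migration at the top of the diff, and the two must stay in sync. Once the migration is applied and credentials are configured, a full refresh is the module's `__main__` behavior (`python -m wp1.scores`); the sketch below is the programmatic equivalent. How the refresh gets scheduled (cron or otherwise) is not specified in this patch:

```python
import logging

from wp1 import scores

logging.basicConfig(level=logging.INFO,
                    format='%(levelname)s %(asctime)s: %(message)s')

# Download the latest monthly dump if necessary, then upsert view counts
# for every language into page_scores.
scores.update_pageviews()
```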