diff --git a/wp1/scores.py b/wp1/scores.py
index 5bd0b3a7..2d1da7cc 100644
--- a/wp1/scores.py
+++ b/wp1/scores.py
@@ -31,6 +31,7 @@ def wiki_languages():
   r = requests.get(
       'https://wikistats.wmcloud.org/api.php?action=dump&table=wikipedias&format=csv',
       headers={'User-Agent': WP1_USER_AGENT},
+      timeout=60,
   )
   try:
     r.raise_for_status()
@@ -83,12 +84,21 @@ def download_pageviews():
     # File already downloaded
     return
 
-  with requests.get(get_pageview_url(), stream=True) as r:
+  with requests.get(get_pageview_url(), stream=True, timeout=60) as r:
     r.raise_for_status()
-    with open(cur_filepath, 'wb') as f:
-      # Read data in 8 KB chunks
-      for chunk in r.iter_content(chunk_size=8 * 1024):
-        f.write(chunk)
+    try:
+      with open(cur_filepath, 'wb') as f:
+        # Read data in 8 KB chunks
+        for chunk in r.iter_content(chunk_size=8 * 1024):
+          f.write(chunk)
+    except Exception as e:
+      logger.exception('Error downloading pageviews')
+      # Remove the partial file; it is absent if open() itself failed.
+      try:
+        os.remove(cur_filepath)
+      except FileNotFoundError:
+        pass
+      raise Wp1ScoreProcessingError('Error downloading pageviews') from e
 
 
 def raw_pageviews(decode=False):
diff --git a/wp1/scores_test.py b/wp1/scores_test.py
index 00c21363..b8ca010a 100644
--- a/wp1/scores_test.py
+++ b/wp1/scores_test.py
@@ -187,6 +187,27 @@ def test_download_pageviews_skip_existing(self, mock_get_response,
     mock_get_response.assert_not_called()
     self.assertTrue(os.path.exists(file_path))
 
+  @patch('wp1.scores.requests.get')
+  def test_download_pageviews_handle_error(self, mock_get_response):
+    # Yield partial data and then raise mid-stream. This must be a generator:
+    # a tuple side_effect makes the first call return the bytes object itself
+    # as the iterable, and the HTTPError is never reached.
+    def partial_then_fail(*args, **kwargs):
+      yield pageview_bz2[:100]
+      raise requests.exceptions.HTTPError
+
+    context = MagicMock()
+    resp = MagicMock()
+    resp.iter_content.side_effect = partial_then_fail
+    context.__enter__.return_value = resp
+    mock_get_response.return_value = context
+
+    with self.assertRaises(Wp1ScoreProcessingError):
+      scores.download_pageviews()
+
+    file_path = scores.get_cur_file_path()
+    self.assertFalse(os.path.exists(file_path))
+
   @patch('wp1.scores.get_current_datetime', return_value=datetime(2024, 5, 25))
   @patch("builtins.open", new_callable=mock_open, read_data=pageview_bz2)
   def test_raw_pageviews(self, mock_file_open, mock_datetime):