-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels.py
207 lines (187 loc) · 7.49 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import argparse
import os
import filetype
import requests
from bs4 import BeautifulSoup
from envyaml import EnvYAML
from requests.exceptions import HTTPError
from exceptions import TazDownloadFormatException, TazConfigurationError, TazDownloadError
# Absolute directory that contains this file (symlinks resolved by realpath).
# os.path.split(...)[0] is the directory part of the path, i.e. what os.path.dirname returns.
dir_path = os.path.split(os.path.realpath(__file__))[0]
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.

    On initialization it tries to load every setting listed in ``CONFIGURATIONS`` from the command line arguments
    and from the ``config.yaml`` file inside ``dir_path``. A command line argument takes precedence over the
    corresponding ``config.yaml`` entry.
    """

    # List of tuples that each define a single setting that can be set either in the config.yaml or by passing it
    # as a command line argument.
    # tuple[0]: configuration name (must match the argparse destination name)
    # tuple[1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', False),
        ('nextcloud_webdav_url', False),
        ('nextcloud_webdav_password', False),
        ('limit_requests', False),
        ('log_level', False),
    ]

    def __init__(self, argv=None):
        """
        Load the configuration.

        :param argv: optional list of command line argument strings; ``None`` (the default) makes argparse read
            the arguments of the running process
        :raises TazConfigurationError: if a required setting is given neither as an argument nor in config.yaml
        :raises Exception: if config.yaml cannot be read
        """
        self._config = {}
        # Errors are deliberately not handled here; they propagate unchanged to the caller.
        self._load_config(argv)

    def _load_config(self, argv=None):
        """
        Fill ``self._config`` from the command line arguments and config.yaml.

        :param argv: optional list of command line argument strings, forwarded to ``_parse_arguments``
        :raises TazConfigurationError: if a required setting is missing in both sources
        :raises Exception: if config.yaml cannot be read
        """
        # Try to load config.yaml (with the .env file that lives in the same folder)
        yaml_path = os.path.join(dir_path, 'config.yaml')
        env_path = os.path.join(dir_path, '.env')
        try:
            conf_yaml = EnvYAML(yaml_path, env_path)
        except Exception as e:
            # Keep the original exception as the cause so that the traceback is not lost
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments
        console_args = self._parse_arguments(argv)

        # Set configurations by preferring console arguments over settings in config.yaml
        for conf, required in self.CONFIGURATIONS:
            console_value = getattr(console_args, conf, None)
            if console_value is not None:
                self._config[conf] = console_value
                continue

            yaml_value = conf_yaml.get(conf, None)
            if yaml_value is not None:
                self._config[conf] = yaml_value
            elif required:
                raise TazConfigurationError(conf)

    def _parse_arguments(self, argv=None):
        """
        Parse command line arguments.

        :param argv: optional list of argument strings to parse; ``None`` makes argparse use the arguments of the
            running process
        :return: the argparse namespace; options that were not passed are ``None``
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper',
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='Your taz-ID',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='Your password',
        )
        argparser.add_argument(
            '-f',
            '--download-format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='The e-paper format',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='The path to a folder where the e-paper should be stored',
        )
        argparser.add_argument(
            '--nextcloud_webdav_url',
            action='store',
            type=str,
            help='The url of a Nextcloud webdav',
        )
        argparser.add_argument(
            '--nextcloud_webdav_password',
            action='store',
            type=str,
            help='The webdav password',
        )
        argparser.add_argument(
            '-l',
            '--limit-requests',
            action='store_true',
            # None instead of False so that an absent flag does not override the value from config.yaml
            default=None,
            help='Only query website for available newspaper if tomorrow\'s newspaper has not already been downloaded',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='Set the log level',
        )
        return argparser.parse_args(argv)

    def get_config(self) -> dict:
        """
        :return: the dict of loaded settings (the internal dict itself, not a copy)
        """
        return self._config
class TazDownloader:
    """
    Scrapes the list of available newspaper editions from https://dl.taz.de/ and downloads single editions.
    """

    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}
    # Seconds to wait for the server. Without a timeout a request can block forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: the taz-ID used to log in
        :param password: the password belonging to the taz-ID
        :param download_format: one of ``download_formats``
        :raises TazDownloadFormatException: if ``download_format`` is not one of ``download_formats``
        """
        if download_format not in self.download_formats:
            raise TazDownloadFormatException(download_format)
        self.taz_id = taz_id
        self.password = password
        # Remembered so that a finished download can be validated against the requested format
        self.download_format = download_format
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/
        :return: a list of file names (str)
        :raises TazDownloadError: if the page cannot be fetched or contains no ``<select>`` element
        """
        try:
            page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
            # requests only raises HTTPError for 4xx/5xx responses when asked to
            page.raise_for_status()
        except requests.exceptions.RequestException as request_e:
            raise TazDownloadError(f"Could not scrape available newspaper editions:\n{request_e}") from request_e

        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError("Could not scrape available newspaper editions:\nno <select> element on page")
        return [option['value'] for option in select.find_all("option") if option.has_attr('value')]

    def download_newspaper(self, taz: str, download_folder: str = os.path.join(dir_path, 'tmp')):
        """
        Downloads a newspaper from dl.taz.de and stores it in ``download_folder`` (default: the tmp folder).

        :param taz: file name of the edition, as returned by ``scrape_newspaper``
        :param download_folder: folder the file is written to; it is created if it does not exist
        :return: True if the download succeeded
        :raises TazDownloadError: if the folder cannot be created, the request fails or the response is not the
            requested newspaper; in the last two cases the downloaded file is removed again
        """
        # Make sure the folder exists
        try:
            os.makedirs(download_folder, exist_ok=True)
        except Exception as e:
            raise TazDownloadError(f"Could not find or create \"{download_folder}\":\n{e}") from e

        file_path = os.path.join(download_folder, taz)

        # download taz
        try:
            with requests.get(
                    self.download_url,
                    stream=True,
                    headers=self.HEADERS,
                    params={
                        'username': self.taz_id,
                        'password': self.password,
                        'id': taz,
                        'Laden': '+Laden+',
                    },
                    timeout=self.REQUEST_TIMEOUT,
            ) as r:
                r.raise_for_status()
                # write response to file
                with open(file_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.exceptions.RequestException as request_e:
            # Do not leave a partially written file behind
            self._remove_file(file_path)
            raise TazDownloadError(request_e) from request_e

        # Unfortunately, the taz website does not respond with a http error code if the credentials are wrong.
        # So we have to check if the response is the newspaper or the html page with an error message.
        problem = self._find_download_problem(file_path)
        if problem is not None:
            self._remove_file(file_path)
            raise TazDownloadError(problem)
        return True

    def _find_download_problem(self, file_path: str):
        """
        Check whether the downloaded file looks like the requested newspaper.

        :return: None if the file is accepted, otherwise a message describing the problem
        """
        kind = filetype.guess(file_path)
        if kind is not None:
            # A recognized binary type. Only the pdf format has an expected mime type here.
            # NOTE(review): the other formats are accepted as they are - confirm what dl.taz.de returns for them.
            if self.download_format == 'pdf' and kind.mime != 'application/pdf':
                return f"Expected a pdf file but received '{kind.mime}'"
            return None

        # Unrecognized type, e.g. text/html: try to get the error message from the page to put it in the log
        page_error = self._read_error_from_page(file_path)
        if page_error:
            return page_error
        if self.download_format == 'pdf':
            return "The downloaded file is not a pdf file"
        return None

    @staticmethod
    def _read_error_from_page(file_path: str):
        """
        Return the text of the first ``<p class="error">`` element in the file, or None if there is none.
        """
        # Read bytes and let BeautifulSoup pick the encoding, so odd bytes cannot raise a decode error
        with open(file_path, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        error_tag = soup.find('p', class_='error')
        if error_tag is None:
            return None
        return error_tag.text or None

    @staticmethod
    def _remove_file(file_path: str):
        """
        Delete ``file_path``; a file that does not exist is not an error.
        """
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass