-
Notifications
You must be signed in to change notification settings - Fork 1
/
stonks_extract.py
executable file
·363 lines (296 loc) · 13.6 KB
/
stonks_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Title: stonks_extract - Extraction module with functions to pull
data from multiple sources (online and local) and output as
pandas dataframe for further processing and analysis.
Version 0.1
- initial decoupling of extraction code from stonks_utils
- support for dividendhistory.org data extraction for US and Canadian markets
- support for reading in dividendhistory.org datasets from local csv
"""
import pandas as pd
import datetime
from urllib.error import HTTPError
from datetime import date
import numpy as np
import yfinance as yf
import stonks_utils as su
# ------------------------ Remote Extractors DividendHistory-------------------------
# get summary of tickers from dividendhistory.org
# in_market is one of tsx, nyse, nasdaq
# adds exchange and epoch time stamp for next ex-div
# returns a dataframe with the following structure
# Symbol, Company, Yield Percentage, Next Ex-div Date, Exchange, ex-div epoch
def get_ticker_summary_divhistory(in_market: str):
if not (in_market in su.MARKETS):
print('in_market must be one of tsx, nyse, nasdaq')
return pd.DataFrame()
try:
dfs = pd.read_html(f'https://dividendhistory.org/{in_market}', keep_default_na=False, header=0, index_col=0)
except HTTPError as err:
print(f"Could not get TSX ticks: HTTP Code: {err.code} Path Tried: {err.url}")
return pd.DataFrame()
df = dfs[0]
df['Exchange'] = "TSX"
df['ex-div epoch'] = df.apply (lambda x: int(((datetime.datetime.strptime(x['Next Ex-div Date'], "%Y-%m-%d")).timestamp())*1000), axis=1)
return df
# returns ticker symbols in a list extracted from divhistory remotely
def get_tickers_divhistory(in_market: str):
if not (in_market in su.MARKETS):
print('in_market must be one of tsx, nyse, nasdaq')
return su.FAILURE
try:
dfs = pd.read_html(f'https://dividendhistory.org/{in_market}', keep_default_na=False)
except HTTPError as err:
print(f"Could not get {in_market} ticks: HTTP Code: {err.code} Path Tried: {err.url}")
return su.FAILURE
df = dfs[0]
sym_list = df['Symbol'].to_list()
return (sym_list)
# get dividend history from dividendhistory.org for given ticker symbol
# returns DataFrame
def get_div_history_DH(in_sym, in_market):
# extract module for pulling dividend history given ticker
if not is_in_exchange_DH(in_sym, in_market):
print(f"{in_sym} is not in {in_market} on dividendhistory")
return pd.DataFrame()
in_sym = in_sym.replace('.', '_')
if in_market in ('nyse', 'nasdaq'):
url_str = f'https://dividendhistory.org/payout/{in_sym}/'
else:
url_str = f'https://dividendhistory.org/payout/{in_market}/{in_sym}/'
try:
dfs = pd.read_html(url_str)
except:
print(f"Could not get {in_market} Div History for {in_sym}")
return pd.DataFrame()
# small decision block to deal with optional anoucements header on some pages
if len(dfs)==3:
df = dfs[0]
elif len(dfs)==4:
df = dfs[1]
else:
print('something else going on here')
# drop the last column (i can calculate div increase percentages myself)
# clean up dataframe before returning
df['Cash Amount'] = df['Cash Amount'].str.replace('$', '')
df['Cash Amount'] = df['Cash Amount'].str.replace('\*\*', '')
df = df.iloc[:, :-1]
df['Exchange'] = in_market
df['Symbol'] = in_sym.replace('_', '.')
# fill any empty Payout Date fields with todays date
# this is a work around for an annoying gap in dataset
curr_date = {"Payout Date": date.today().strftime("%Y-%m-%d")}
df = df.fillna(value=curr_date)
df['ex-div date epoch'] = df.apply (lambda x: int(((datetime.datetime.strptime(x['Ex-Dividend Date'], "%Y-%m-%d")).timestamp())*1000), axis=1)
df['payout date epoch'] = df.apply (lambda x: int(((datetime.datetime.strptime(x['Payout Date'], "%Y-%m-%d")).timestamp())*1000), axis=1)
return df
# simple function to check if a symbol belongs to an exhange
# based on dividendhistory
def is_in_exchange_DH(in_sym, in_market):
sym_list = get_tickers_divhistory(in_market)
return (in_sym in sym_list)
# get and parse weekly stock report from dividendhistory.org
# note: previous reports are not publicly linked on site but are
# accessible with URL tweak.
# returns DF with results and adds epoch timstamps
def get_DH_weekly_report(in_market, in_date:date):
# check if date is a friday as reports are only published on Fridays
if not su.is_friday(in_date):
print("DH reports are only published on Fridays - check your date")
return pd.DataFrame()
# validate market
if in_market in ('nyse', 'nasdaq'):
market = 'USA'
elif in_market == 'tsx':
market = 'CAN'
elif in_market in ('CAN', 'USA'):
market = in_market
else:
print(f'Market {in_market} not valid')
return pd.DataFrame()
#data_file_path = "{1}weekly_divhistory_reports/{0}/".format(market, in_file_path)
web_path = 'https://dividendhistory.org/reports/{3}/{2}-{1}-{0}-report.htm'.format(in_date.strftime("%d"), in_date.strftime("%m"), in_date.year, market)
#data_file_path = data_file_path + "div_history_report-{3}-{2}-{1}-{0}.csv".format(in_date.strftime("%d"), in_date.strftime("%m"), in_date.year, market)
try:
dfs = pd.read_html(web_path, keep_default_na=False)
#write_file = True
#i+=1
#printProgressBar(i + 1, len(fridays), prefix = '{0} Weekly Report Progress:'.format(market), suffix = 'Complete', length = 50)
except HTTPError as err:
print(f"Path not found: {err.url}: Code: {err.code}")
return pd.DataFrame()
#write_file = False
# take first DataFrame in list
df = dfs[0]
# cut off extra null columns
df = df.iloc[:, :16]
# clean up - fix column names
colnames = []
for col in df.columns:
colnames.append(col[1])
df.columns = colnames
# remove sector category rows from data
# this function assumes that all cells are populated with same string
# when its a header row
for row_num, row in df.iterrows():
if row['Price'] == row['Name']:
df = df.drop(row_num)
date_str = "{0}-{1}-{2}".format(in_date.year, in_date.strftime("%m"), in_date.strftime("%d"))
df['report_date_epoch'] = int(((datetime.datetime.strptime(date_str, "%Y-%m-%d")).timestamp())*1000)
# strip % from yld and PayRto
df['Yld'] = df['Yld'].map(lambda x: x.lstrip('').rstrip('%'))
df['PayRto'] = df['PayRto'].map(lambda x: x.lstrip('').rstrip('%'))
# split div frequency from ex-div date
# had to run in a try statement because of at least one report
# that had no dates, just frequency for ex-div date .. if this happens,
# whole div-freq column is set to -
try:
df['div_freq'] = (df['Ex-Div'].str.split(pat=r'\d\d\d\d-\d\d-\d\d', expand=True))[1]
df['Ex-Div'] = df['Ex-Div'].map(lambda x: x.lstrip('').rstrip('AQSMU'))
except:
df['div_freq'] = "-"
pass
# need to reinsert nans for whitespace cells as nans are removed on import
# once nans are back in, fillnan with -
# all of this is because of one ticker symbol (NA) which is interpreted as a NaN on import
# unless keep_default_na=false
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.fillna(value='-')
df['market'] = in_market
try:
df['ex_div_epoch'] = df.apply (lambda x: "-" if x['Ex-Div'] == "-" else int(((datetime.datetime.strptime(x['Ex-Div'], "%Y-%m-%d")).timestamp())*1000), axis=1)
except:
df['ex_div_epoch'] = "-"
return df
# --------------------- REMOTE EXTRACTIONS Yahoo ----------------------------
# get and parse price history from yahoo_finance for given ticker
# and given date range
# Either in_period = max (full history) or last_day
# OR
# in_period = spec AND requires at least in_start str 'YYYY-MM-DD'
# no in_end specified will default to today
# returns DataFrame
def get_ticker_price_history_yahoo(in_sym: str, in_market:str, in_period:str, *xargs, **kwargs):
if in_market not in su.MARKETS:
print('market must be one of TSX, NYSE, or NASDAQ')
return pd.DataFrame()
if not is_in_exchange_DH(in_sym, in_market):
print(f'ticker not in exchange {in_sym} is not in {in_market}')
return pd.DataFrame()
# convert standard ticker to yahoo ticker notation
# replace . with _
# append .(excchange digraph) TSX the only one supported right now
if in_period == 'spec':
start = kwargs.get('in_start')
end = kwargs.get('in_end')
if start!=None:
if end==None:
end = date.today() # use yesterday as enddate by default. enddate in yfinance in non-inclusive
else:
print('You need to provide a start date with spec')
return pd.DataFrame()
# PREP
in_sym = in_sym.replace('.', '-')
if in_market == 'tsx':
in_sym = in_sym + '.TO'
curr_tick = yf.Ticker(in_sym)
# auto_adjust set to false to get prices adjusted for splits only (not dividends)
if in_period == 'max' or in_period =='last_day':
df = curr_tick.history(period=in_period, auto_adjust=False)
else:
df = curr_tick.history(start=start, end=end, auto_adjust=False)
if df.empty is False:
in_sym = in_sym.replace('-', '.')
in_sym = in_sym.replace('.TO', '')
df['symbol'] = in_sym
df['market'] = in_market
df = df.reset_index()
df['date_epoch'] = df.apply (lambda x: int(x['Date'].timestamp())*1000, axis=1)
else:
pass
return df
# --------------------------- LOCAL EXTRACTIONS ----------------------------------
# grab divdidend history for ticker from local csv
# original source dividendhistory.org
# returns DataFrame
# EXTRACT
def get_DH_div_history_local(in_data_path:str, in_ticker:str, in_market:str):
if in_market not in su.MARKETS:
print("Market must be one of tsx, nyse. nasdaq")
return pd.DataFrame()
try:
data_file_path = in_data_path
#temp_tick = in_ticker.replace('.', '_')
#print(in_ticker)
dfs = pd.read_csv(f'{data_file_path}{in_market}/DH_div_history_{in_ticker}.csv', keep_default_na=False)
#div_path = data_file_path + "yield_history/"
#make_dir(div_path)
except:
print(f"Could not find {data_file_path}{in_market}/DH_div_history_{in_ticker}.csv")
return pd.DataFrame()
return dfs
# grab weekly DH report from local data store csv
# returns full data frame of report on given day
# in_data_path full root directory for DH reports
# original source dividendhistory.org
# EXTRACT
def get_DH_weekly_report_local(in_data_path:str, in_market:str, in_date:date):
if in_market not in su.MARKETS:
print("Market must be one of tsx, nyse. nasdaq")
return pd.DataFrame()
if not su.is_friday(in_date):
print("Date must be a Friday in the past")
print(in_date)
return pd.DataFrame()
try:
if in_market == 'tsx':
dfs = pd.read_csv(f'{in_data_path}CAN/div_history_report-CAN-{in_date.year}-{in_date.month}-{in_date.day}.csv', keep_default_na=False)
else:
dfs = pd.read_csv(f'{in_data_path}USA/div_history_report-USA-{in_date.year}-{in_date.month}-{in_date.day}.csv', keep_default_na=False)
#dfs.drop(columns=["Name", "Price", "Yld", "Ex-Div", "PayRto", "PE", "PB", "Beta", "Mkt Cap", "WK%", "MO%", "2MO%", "3MO%", "6MO%", "1YR%", "report_date_epoch", "ex_div_epoch"], inplace=True)
except:
print(f'Could not find weekly report for {in_market}')
return pd.DataFrame()
return dfs
# grab price history for ticker
# in_data_path is full path to directory containing price recrords from yahoo
# returns DataFrame
# EXTRACT
def get_ticker_price_history_yahoo_local(in_data_path:str, in_market:str, in_ticker:str):
if in_market not in su.MARKETS:
print("Market must be one of tsx, nyse. nasdaq")
return pd.DataFrame()
try:
#if in_market == 'tsx':
## in_ticker_can = in_ticker.replace('.', '-')
# in_ticker_can = in_ticker_can + '.TO'
dfs = pd.read_csv(f'{in_data_path}yahoo_price_history_{in_ticker}_{in_market}.csv', keep_default_na=False)
# else:
#in_ticker_us = in_ticker.replace('.', '-')
# dfs2 = pd.read_csv(f'{data_file_path}price_history/yahoo_price_history_{in_ticker}.csv')
#print(dfs2)
except:
print(f"{in_data_path}yahoo_price_history_{in_ticker}_{in_market}.csv - Failed")
return pd.DataFrame()
#print(dfs2)
return dfs
# get list of tickers sourced from dividendhistory.org from local csv
#
# returns list of tickers symbols only populated from local copy to divhistory ticker list
# will extract a column named Symbol from provided dataframe and return as a list
# FUTURE : could generalize as extact named column as list
def get_tickers_divhistory_local(in_path, in_file_name):
try:
dfs = pd.read_csv(f'{in_path}{in_file_name}', keep_default_na=False)
#print(dfs)
except:
print(f"Could not find file {in_path}{in_file_name}")
return su.FAILURE
try:
sym_list = dfs['Symbol'].to_list()
except KeyError:
print("DF does not contain key Symbol")
return su.FAILURE
return (sym_list)