-
Notifications
You must be signed in to change notification settings - Fork 19
/
merge_data_files.py
72 lines (57 loc) · 2.59 KB
/
merge_data_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
from datetime import datetime, timedelta

import pandas as pd
def list_diff(list1, list2):
    """
    Find the difference between two lists in both directions.

    Membership is tested with ``in`` against the other list, so elements only
    need to support equality (they do not have to be hashable). Order and
    duplicates of the surviving elements are preserved.

    :param list1: first list
    :param list2: second list
    :return: tuple ``(l1_l2, l2_l1)`` where ``l1_l2`` holds the elements of
        list1 that are not in list2 (list1 - list2) and ``l2_l1`` holds the
        elements of list2 that are not in list1 (list2 - list1)
    """
    l1_l2 = [ele for ele in list1 if ele not in list2]
    l2_l1 = [ele for ele in list2 if ele not in list1]
    return l1_l2, l2_l1
def merge_crypto_gnews_sentiment(crypto_data_filename, google_news_data_filename, output_data_filename):
    """
    Merge hourly crypto data with daily Google News sentiment data.

    News data is per day, and crypto data is per hour, so every news row is
    replicated 24 times (once for each hour of its day) so that it can be
    concatenated column-wise with the crypto data.

    Two CSV files are written: the hourly-expanded news data (the news input
    path with its extension replaced by ``_with_timestamp.csv``) and the
    merged result.

    :param crypto_data_filename: CSV whose first column holds timestamps
        (parsed with ``pandas.to_datetime``)
    :param google_news_data_filename: CSV whose first column holds dates
        formatted as ``YYYY-MM-DD``
    :param output_data_filename: path the merged CSV is written to
    :return: True once both files have been written
    :raises ValueError: if a news date does not match ``%Y-%m-%d``
    """
    hours_per_day = 24

    crypto_df = pd.read_csv(crypto_data_filename, index_col=0)
    crypto_df.index = pd.to_datetime(crypto_df.index)
    news_df = pd.read_csv(google_news_data_filename, index_col=0)

    # One midnight datetime per news row, then one timestamp per hour of it.
    days = [datetime.strptime(str(day), '%Y-%m-%d') for day in news_df.index]
    hourly_timestamps = [
        day + timedelta(hours=hr) for day in days for hr in range(hours_per_day)
    ]

    # Replicate rows by position in a single step instead of growing a frame
    # one .loc assignment at a time; positional selection also avoids
    # label lookups on the news index.
    row_positions = [
        pos for pos in range(len(news_df)) for _ in range(hours_per_day)
    ]
    news_df_comb = news_df.iloc[row_positions].copy()

    # Use real datetimes for the index so the column-wise concat below aligns
    # with the crypto frame's DatetimeIndex rather than comparing timestamps
    # against strings.
    news_df_comb.index = pd.DatetimeIndex(hourly_timestamps, name='timestamp')

    # splitext drops whatever extension is present, not a fixed 4 characters.
    news_root, _ = os.path.splitext(google_news_data_filename)
    news_df_comb.to_csv(news_root + '_with_timestamp.csv')

    result = pd.concat([crypto_df, news_df_comb], axis=1)
    result.to_csv(output_data_filename)
    return True
def merge_crypto_gnews_reddit_sentiment(crypto_gnews_filename, reddit_data_filename, crypto_gnews_reddit_filename):
    """
    Concatenate the crypto+news CSV and the reddit CSV column-wise.

    Both inputs are read with their first column as the index; rows are
    aligned on that index and the combined frame is written out as CSV.

    :param crypto_gnews_filename: CSV holding the merged crypto and news data
    :param reddit_data_filename: CSV holding the reddit data
    :param crypto_gnews_reddit_filename: path the combined CSV is written to
    :return: None
    """
    input_paths = (crypto_gnews_filename, reddit_data_filename)
    frames = [pd.read_csv(path, index_col=0) for path in input_paths]
    pd.concat(frames, axis=1).to_csv(crypto_gnews_reddit_filename)
if __name__ == '__main__':
    # Stage 1: hourly crypto data + daily Google News sentiment.
    crypto_data_filename = 'crypto_data_master_cleaned.csv'
    gnews_data_filename = 'google_news_final_sentiment.csv'
    crypto_gnews_filename = 'crypto_data_news_final.csv'

    # Stage 2: output of stage 1 + reddit sentiment.
    reddit_data_filename = 'reddit_data_sentiment_bucketized.csv'
    crypto_gnews_reddit_filename = 'crypto_data_news_reddit_final.csv'

    merge_crypto_gnews_sentiment(
        crypto_data_filename=crypto_data_filename,
        google_news_data_filename=gnews_data_filename,
        output_data_filename=crypto_gnews_filename,
    )
    merge_crypto_gnews_reddit_sentiment(
        crypto_gnews_filename=crypto_gnews_filename,
        reddit_data_filename=reddit_data_filename,
        crypto_gnews_reddit_filename=crypto_gnews_reddit_filename,
    )