-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_data_load.py
211 lines (140 loc) · 6.33 KB
/
twitter_data_load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# Twitter Data Load
# Loads Twitter data into a Pandas Dataframe
# Import Modules
# Standard library imports
# Third party imports
import pandas as pd
# Local imports
import twitter_request_handler as treq
import twitter_print as tprint
# Define Functions
def load_data(search_url, query_params, max_iterations):
    """
    Load data retrieved from the Twitter API into a Pandas Dataframe.

    Makes an initial request and then, while the meta data of the latest
    response contains a 'next_token' key, keeps requesting more data until
    the query is exhausted, the iteration limit is reached, or a request
    returns a status code other than 200.

    Args:
        search_url: Passed unchanged to ``treq.twitter_request`` and
            ``treq.get_more_data``.
        query_params: Query parameters passed to the same two helpers and
            printed for every iteration.
        max_iterations: Maximum number of requests to make. The initial
            request is always made, so values below 1 behave like 1.

    Returns:
        A ``pandas.DataFrame`` built from all collected data when every
        request returned status code 200. ``None`` when any request
        returned another status code; in that case the status code and
        the response's 'message' are printed and data collected from
        earlier requests is discarded.
    """
    ok_status = 200

    # ------------------------------------
    # Initial Request to Twitter API
    # ------------------------------------
    tprint.print_start_params(search_url, query_params, max_iterations)

    iteration_counter = 1
    tprint.print_iteration(iteration_counter)
    tprint.print_iteration_params(query_params)

    # The first request also validates the credentials and the initial
    # query parameters.
    twitter_return = treq.twitter_request(search_url, query_params)
    final_status_code = twitter_return['status_code']
    tprint.print_status_code(final_status_code)

    if final_status_code != ok_status:
        tprint.print_error_message(final_status_code,
                                   twitter_return['message'])
        return None

    # -------------------------------------------
    # Extract Data from Initial Request Response
    # -------------------------------------------
    final_twitter_data_list = []
    twitter_data_list = treq.add_data_to_list(twitter_return)
    twitter_meta_dict = treq.extract_meta_data(twitter_return)
    tprint.print_meta_data(twitter_meta_dict)
    final_twitter_data_list.extend(twitter_data_list)

    # -------------------------------------------
    # Make Additional Requests to Twitter API
    # -------------------------------------------
    # A 'next_token' key in the meta data means there is more data to
    # retrieve for the query.
    while 'next_token' in twitter_meta_dict:
        # Check the limit BEFORE requesting: a request made at the limit
        # would have its response discarded, and a failure of that
        # request would throw away all data already collected.
        # NOTE(review): if treq.get_more_data updates query_params in
        # place, skipping this request also skips that final update -
        # confirm no caller relies on it.
        if iteration_counter + 1 > max_iterations:
            break

        twitter_return = treq.get_more_data(twitter_meta_dict,
                                            search_url, query_params)
        final_status_code = twitter_return['status_code']

        if final_status_code != ok_status:
            tprint.print_error_message(final_status_code,
                                       twitter_return['message'])
            return None

        iteration_counter += 1
        tprint.print_iteration(iteration_counter)
        tprint.print_iteration_params(query_params)
        tprint.print_status_code(final_status_code)

        twitter_data_list = treq.add_data_to_list(twitter_return)
        twitter_meta_dict = treq.extract_meta_data(twitter_return)
        tprint.print_meta_data(twitter_meta_dict)
        final_twitter_data_list.extend(twitter_data_list)

    # ------------------------------------------------
    # Load All Extracted Data into a Pandas Dataframe
    # ------------------------------------------------
    # Per the original notes, some dictionaries may carry an "extra"
    # 'withheld' key that has to be removed first; that clean-up is
    # delegated to treq.check_final_data_keys.
    updated_twitter_data_list = treq.check_final_data_keys(
        final_twitter_data_list)
    df_twitter_data = pd.DataFrame(updated_twitter_data_list)

    # -------------------------------------------
    # Prepare Summary of Query Results
    # -------------------------------------------
    tprint.print_summary_title()
    tprint.print_final_status_code(final_status_code)

    # A remaining 'next_token' means the loop stopped at the iteration
    # limit rather than because the query ran out of data.
    if 'next_token' in twitter_meta_dict:
        tprint.print_query_incomplete(iteration_counter, max_iterations)
    else:
        tprint.print_query_completed(iteration_counter, max_iterations)

    tprint.print_dataframe(df_twitter_data)
    tprint.print_end()

    return df_twitter_data