-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
130 lines (100 loc) · 4.12 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
# ------PREPARATION OF DATA-------
# Load the raw budget/revenue table. The path is relative to the current
# working directory, so the script must be run next to the CSV file.
data = pd.read_csv('cost_revenue_dirty.csv')

# Strip currency formatting from the money columns so they parse as numbers.
chars_to_remove = ["$", ","]
columns_to_convert = ["USD_Production_Budget",
                      "USD_Worldwide_Gross",
                      "USD_Domestic_Gross"]
for col in columns_to_convert:
    for char in chars_to_remove:
        # regex=False forces a literal replacement: "$" is the end-of-string
        # anchor in a regular expression, and the default value of `regex`
        # differs between pandas versions.
        data[col] = data[col].astype(str).str.replace(char, "", regex=False)
    # Convert once per column, after every unwanted character is gone.
    data[col] = pd.to_numeric(data[col])

# Converting the Release_Date column to Pandas Datetime type
data["Release_Date"] = pd.to_datetime(data["Release_Date"])

# International releases: rows with zero domestic gross but a non-zero
# worldwide gross.
international_releases = data.loc[(data.USD_Domestic_Gross == 0) &
                                  (data.USD_Worldwide_Gross != 0)]

# Rows dated after the scrape date are treated as unreleased films and are
# excluded from the cleaned dataset.
scrape_date = pd.Timestamp('2018-5-1')
unreleased_films = data.loc[data.Release_Date > scrape_date]
data_clean = data.drop(unreleased_films.index)

# Decade column: the release year rounded down to a multiple of 10
# (e.g. 1987 -> 1980).
data_clean['Decade'] = data_clean.Release_Date.dt.year // 10 * 10
# -----CHARTS for data_clean------
# Revenue vs Budget bubble chart for data_clean.
# Plots USD_Worldwide_Gross against USD_Production_Budget, with both the
# colour (hue) and the marker size driven by USD_Worldwide_Gross.
# NOTE: the chart code is wrapped in a triple-quoted string, so it is only a
# bare string expression and is never executed. Remove the surrounding quotes
# to render the plot.
"""
plt.figure(figsize=(8, 4), dpi=200)
ax = sns.scatterplot(data=data_clean,
x='USD_Production_Budget',
y='USD_Worldwide_Gross',
hue='USD_Worldwide_Gross',
size='USD_Worldwide_Gross')
ax.set(ylim=(0, 3000000000),
xlim=(0, 450000000),
ylabel='Revenue in $ billions',
xlabel='Budget in $100 millions')
plt.show()
"""
# Budget and Revenue vs Year bubble chart for data_clean.
# Plots USD_Production_Budget against Release_Date, with both the colour
# (hue) and the marker size driven by USD_Worldwide_Gross, using the seaborn
# 'darkgrid' axes style.
# NOTE: disabled - the code lives inside a string literal and does not run.
"""
plt.figure(figsize=(8, 4), dpi=200)
with sns.axes_style('darkgrid'):
ax = sns.scatterplot(data=data_clean,
x='Release_Date',
y='USD_Production_Budget',
hue='USD_Worldwide_Gross',
size='USD_Worldwide_Gross')
ax.set(ylabel='Budget in $100 millions',
xlabel='Year')
plt.show()
"""
# Split the cleaned films at the 1980 boundary: decades before 1980 count as
# "old", 1980 and later count as "new".
old_films = data_clean[data_clean["Decade"].lt(1980)]
new_films = data_clean[data_clean["Decade"].ge(1980)]
# ----- NEW FILMS REVENUE vs BUDGET CHART (linear regression included) -----
# seaborn regplot of USD_Worldwide_Gross against USD_Production_Budget for
# new_films: semi-transparent scatter points (alpha 0.4) with a black
# regression line, using the same axis limits as the data_clean bubble chart.
# NOTE: disabled - the code lives inside a string literal and does not run.
"""
plt.figure(figsize=(8,4), dpi=200)
with sns.axes_style('darkgrid'):
ax = sns.regplot(data=new_films,
x='USD_Production_Budget',
y='USD_Worldwide_Gross',
scatter_kws = {'alpha': 0.4},
line_kws = {'color': 'black'})
ax.set(ylim=(0, 3000000000),
xlim=(0, 450000000),
ylabel='Revenue in $ billions',
xlabel='Budget in $100 millions')
plt.show()
"""
# ----- OLD FILMS REVENUE vs BUDGET CHART (linear regression included) -----
# seaborn regplot of USD_Worldwide_Gross against USD_Production_Budget for
# old_films: semi-transparent scatter points (alpha 0.4) with a black
# regression line. Only the x-axis limit is set here; no ylim is given.
# NOTE: disabled - the code lives inside a string literal and does not run.
# NOTE(review): xlim tops out at 45,000,000 while the axis labels still read
# '$100 millions' / '$ billions' - confirm the labels match the intended scale
# before re-enabling.
"""
plt.figure(figsize=(8, 4), dpi=200)
with sns.axes_style('darkgrid'):
ax = sns.regplot(data=old_films,
x='USD_Production_Budget',
y='USD_Worldwide_Gross',
scatter_kws={'alpha': 0.4},
line_kws={'color': 'black'})
ax.set(xlim=(0, 45000000),
ylabel='Revenue in $ billions',
xlabel='Budget in $100 millions')
plt.show()
"""
# ---- RUNNING OUR OWN LINEAR REGRESSION FOR NEW FILMS -----
# Fits a scikit-learn LinearRegression with USD_Production_Budget as the
# single feature (X) and USD_Worldwide_Gross as the target (y), both taken
# from new_films, then prints the model's score on the same data followed by
# the fitted intercept and coefficient.
# NOTE: disabled - the code lives inside a string literal and does not run.
"""
regression = LinearRegression()
X = pd.DataFrame(new_films, columns=['USD_Production_Budget'])
y = pd.DataFrame(new_films, columns=['USD_Worldwide_Gross'])
regression.fit(X, y)
print(regression.score(X, y))# Our model explains about the percentage of the output of this value
# (~57% in this case)
print(regression.intercept_)
print(regression.coef_)
# intercept and coefficient values are used to estimate revenue for a given budget.
# Revenue = Intercept + coefficient * budget
"""