-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
65 lines (55 loc) · 1.92 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import zipfile
from collections import Counter
import pandas as pd
def unpacking_zip_file(path: str) -> pd.DataFrame:
    """
    Read every file stored in a zip archive as CSV into a single DataFrame.

    Members are read in the order the archive lists them and stacked row-wise;
    the index of the result is renumbered from zero.

    :param path: string with the path to the zip archive holding csv files.
    :return: DataFrame with the rows of all csv files concatenated.
    :raises ValueError: if the archive holds no files to read.
    """
    frames = []
    with zipfile.ZipFile(path) as archive:
        for member in archive.infolist():
            # Directory entries carry no data and cannot be parsed as CSV.
            if member.is_dir():
                continue
            # Close each member stream as soon as it has been parsed.
            with archive.open(member) as handle:
                frames.append(pd.read_csv(handle, delimiter=","))
    if not frames:
        raise ValueError(f"No files found in zip archive: {path!r}")
    return pd.concat(frames, ignore_index=True)
def data_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the data with valid coordinates only.

    The ``Latitude`` and ``Longitude`` columns are converted to text and the
    leading decimal number (for example ``12.5`` or ``-3.75``) is extracted as
    a float; text that does not start with such a number becomes NaN. Rows are
    kept only when the latitude lies in [-90, 90], the longitude lies in
    [-180, 180] and no column holds a missing value. The input DataFrame is
    left unmodified.

    :param df: dataframe with noisy data; must contain ``Latitude`` and
        ``Longitude`` columns.
    :return: DataFrame with the clean data and an index renumbered from zero.
    """
    # Work on a copy so the caller's frame keeps its original column values.
    cleaned = df.copy()
    for column in ("Latitude", "Longitude"):
        cleaned[column] = (
            cleaned[column]
            .astype("str")
            .str.extract(r"^(-?\d+\.\d+)", expand=False)
            .astype(float)
        )

    # A boolean mask filters row by row, so it stays correct even when the
    # index contains repeated labels (label-based drop would remove them all).
    # NaN coordinates compare as False here and are therefore excluded.
    in_range = cleaned["Latitude"].between(-90, 90) & cleaned["Longitude"].between(
        -180, 180
    )
    return cleaned[in_range].dropna(axis=0, how="any").reset_index(drop=True)
def get_most_common(series: pd.Series) -> object:
    """
    Find the most common value in series.

    When several values share the highest count, the one that appears first
    in the series is returned.

    :param series: series of hashable values.
    :return: the single value that occurs most often (a scalar, not a series).
    :raises IndexError: if the series is empty.
    """
    # Counter consumes the series directly; no intermediate list is needed.
    counts = Counter(series)
    # most_common(1) yields [(value, count)]; only the value is of interest.
    return counts.most_common(1)[0][0]
def get_dataframe_with_top_cities(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the rows that belong to the most popular city of each country.

    The most popular city of a country is the value that occurs most often in
    the ``City`` column among that country's rows. A row is kept only when its
    city is the top city of its own country, so a same-named city located in
    another country is not pulled in by accident.

    :param df: dataframe with all the cities; must contain ``Country`` and
        ``City`` columns.
    :return: dataframe with only most popular cities, sorted by ``City`` and
        with an index renumbered from zero.
    """
    # Aggregate just the City column instead of every column of the frame.
    top_city_by_country = df.groupby("Country")["City"].agg(get_most_common)
    # Look up each row's own country so the match is on (Country, City).
    is_top_city = df["City"] == df["Country"].map(top_city_by_country)
    return df[is_top_city].sort_values(["City"]).reset_index(drop=True)