-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_creation.py
106 lines (82 loc) · 2.95 KB
/
data_creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os

import numpy as np
import pandas as pd
from faker import Faker
# This script creates three synthetic tables: users, datasets and downloads.

# --- users table ---
# One row per user: a string id, a fake full name, and a random occupation.
fake = Faker()
types_of_occupation = ['student', 'industrialist', 'professor', 'government_employee']
NUM_USERS = 1000

users_df = pd.DataFrame(
    {
        "user_id": [str(user_index) for user_index in range(NUM_USERS)],
        "name": [fake.name() for _ in range(NUM_USERS)],
        # Draw from the list itself rather than from a hard-coded index range,
        # so adding or removing an occupation cannot go out of sync.
        "occupation": np.random.choice(types_of_occupation, size=NUM_USERS),
    }
)
# --- datasets table ---
# One row per dataset: a string id, a random category, a random Indian
# state / union territory, a random intended use, and a title built from
# those three values.
types_of_datasets = ['agriculture', 'water', 'transport', 'forestry', 'tourism', 'urban', 'rural']
types_of_usefullness = ['teaching', 'research', 'industry', 'survey']
indian_states = [
    "Andhra Pradesh",
    "Arunachal Pradesh",  # trailing space removed so it no longer leaks into titles
    "Assam",
    "Bihar",
    "Chhattisgarh",
    "Goa",
    "Gujarat",
    "Haryana",
    "Himachal Pradesh",
    "Jammu and Kashmir",
    "Jharkhand",
    "Karnataka",
    "Kerala",
    "Madhya Pradesh",
    "Maharashtra",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Punjab",
    "Rajasthan",
    "Sikkim",
    "Tamil Nadu",
    "Telangana",
    "Tripura",
    "Uttar Pradesh",
    "Uttarakhand",
    "West Bengal",
    "Andaman and Nicobar Islands",
    "Chandigarh",
    "Dadra and Nagar Haveli",
    "Daman and Diu",
    "Lakshadweep",
    "National Capital Territory of Delhi",
    "Puducherry",
]
NUM_DATASETS = 500

datasets_df = pd.DataFrame(
    {
        "dataset_id": [str(dataset_index) for dataset_index in range(NUM_DATASETS)],
        "type_of_dataset": np.random.choice(types_of_datasets, size=NUM_DATASETS),
        "state": np.random.choice(indian_states, size=NUM_DATASETS),
        "useful_for": np.random.choice(types_of_usefullness, size=NUM_DATASETS),
    }
)

# Title format: "<type> dataset of <state> for <use>", built column-wise
# instead of looking up each row with .iloc.
datasets_df['title'] = (
    datasets_df['type_of_dataset']
    + " dataset of "
    + datasets_df['state']
    + " for "
    + datasets_df['useful_for']
)
# --- downloads table ---
# One row per download event: a string id, a randomly chosen user, a randomly
# chosen dataset, and that dataset's title copied over for convenience.
NUM_DOWNLOADS = 100_000

# Draw all random row positions in two vectorised calls instead of 200,000
# single-element calls.
download_user_positions = np.random.randint(low=0, high=len(users_df), size=NUM_DOWNLOADS)
download_dataset_positions = np.random.randint(low=0, high=len(datasets_df), size=NUM_DOWNLOADS)

downloads_df = pd.DataFrame(
    {
        "download_id": [str(download_index) for download_index in range(NUM_DOWNLOADS)],
        # Ids are stored as strings, matching users_df / datasets_df.
        "user_id": download_user_positions.astype(str),
        "dataset_id": download_dataset_positions.astype(str),
    }
)

# Look the title up by dataset_id (a key join) rather than by row position,
# so the result does not depend on datasets_df's index happening to equal
# the id.
title_by_dataset_id = datasets_df.set_index('dataset_id')['title']
downloads_df['title'] = downloads_df['dataset_id'].map(title_by_dataset_id)
# --- write outputs ---
# Persist the three tables as CSV files under ./data (relative to the current
# working directory). The directory is created first because to_csv does not
# create missing parent directories and would otherwise raise.
os.makedirs('./data', exist_ok=True)
users_df.to_csv('./data/users_db.csv')
datasets_df.to_csv('./data/datasets_db.csv')
downloads_df.to_csv('./data/downloads_db.csv')