-
Notifications
You must be signed in to change notification settings - Fork 6
/
count_users.py
executable file
·196 lines (166 loc) · 6.24 KB
/
count_users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#! /usr/bin/env python
"""
Create a list of all usernames and which study areas they're found in.
Based on combine_stats.py
There are three files to combine for each study area:
userstats_{placename}.csv
a modified output of mvexel's userstats scripts, with monthly totals for users
...and overall totals by month for the whole study area
output_userstatsbydate_{placename}_raster_1000m.tsv
my blankspot counts by date and user, not summed for months. Does not include totals.
NOTE: this assumes 1000m rasters. Would need to be extended for other scales.
"""
#import MapGardening
import csv
import datetime
import optparse

# Every input and output file lives under this directory.
folder = "userstats/"
# [prefix, suffix] pairs: the place name is spliced between them to build
# the per-place input filenames.
userstats_filename_template = [folder + "userstats_", ".csv"]
blankspots_filename_template = [folder + "output_userstatsbydate_", "_raster_1000m.tsv"]
output_filename = folder + "usercount_raster_1000m.tsv"

# Command line: a single --place/-p option; the default "all" means
# every hard-coded study area.
usage = "usage: %prog [options]"
p = optparse.OptionParser(usage)
p.add_option('--place', '-p', default="all")
options, arguments = p.parse_args()
# Resolve which study areas to process: either the single place named on
# the command line, or the full hard-coded list when --place is "all".
if options.place != "all":
    #place = MapGardening.get_place(placename)
    #places = {placename: place}
    placename = options.place
    places = [placename]
else:
    #places = MapGardening.get_all_places()
    places = [
        "amsterdam", "auckland", "barcelona", "bayarea", "berlin",
        "boston", "buenosaires", "cairo", "chicago", "crimea",
        "cyprus", "douala", "haiti", "istanbul", "jakarta",
        "jerusalem", "kathmandu", "lasvegas", "london", "losangeles",
        "manchester", "mexicocity", "miami", "minsk", "montevideo",
        "montreal", "moscow", "mumbai", "nairobi", "newyork",
        "quebec", "paris", "rio", "santiago", "seattle",
        "seoul", "sydney", "tirana", "tokyo", "toronto",
        "vancouver", "yaounde"
    ]

#MapGardening.init_logging()

# username -> {'uid': ..., 'places': [...]}, accumulated across all places.
usercounts = {}
# For each study area: merge the mvexel-style userstats file with the
# blankspot counts file into one per-place `data` dict, then record which
# places each username appears in.
for placename in places:
    #for placename in places.keys():
    #print "combining output stats for", placename
    userstats_filename = userstats_filename_template[0] + placename + userstats_filename_template[1]
    blankspots_filename = blankspots_filename_template[0] + placename + blankspots_filename_template[1]
    #print userstats_filename
    #print blankspots_filename

    # Per-place accumulator: username -> {'uid': ..., 'edits': {date: {field: value}}}
    data = {}

    # First, parse the mvexel-style output.
    # We are ignoring many other fields in that table
    # ...for example anything about ways and relations
    fields_of_interest = ["nodes", "nodes_created", "cur nodes"]
    # Use a context manager so the handle is closed (the original leaked it).
    with open(userstats_filename) as userstats_file:
        doc = csv.reader(userstats_file, dialect='excel', delimiter='\t')
        head = None
        for row in doc:
            if not head:
                head = row
                # Hoist the column lookups out of the per-row loop.
                username_col = head.index('username')
                uid_col = head.index('uid')
                year_col = head.index('year')
                field_cols = [(field, head.index(field)) for field in fields_of_interest]
            else:
                username = row[username_col]
                # My files print anonymous users differently
                if username == "":
                    username = "NULL"
                # NOTE(review): the 'year' column appears to hold a full
                # month key matching the blankspot dates — confirm upstream.
                date = row[year_col]
                if username not in data:
                    data[username] = {}
                data[username]['uid'] = row[uid_col]
                edits = data[username].setdefault('edits', {})
                monthly = edits.setdefault(date, {})
                for field, col in field_cols:
                    monthly[field] = row[col]

    # Now parse the blankspot data
    # This is all borrowed from the combine_stats.py script
    # Make sure a 'total' pseudo-user exists to accumulate area-wide sums.
    total = data.setdefault("total", {})
    total.setdefault('uid', "")  # if we loaded the user from mvexel style, this will be set already
    total.setdefault('edits', {})

    fields_of_interest = ["count", "v1count", "blankcount"]
    with open(blankspots_filename) as blankspots_file:
        doc = csv.reader(blankspots_file, dialect='excel', delimiter='\t')
        head = None
        for row in doc:
            if not head:
                head = row
                user_col = head.index('user_name')
                date_col = head.index('date')
                field_cols = [(field, head.index(field)) for field in fields_of_interest]
            else:
                username = row[user_col]
                # Round each per-day date down to the first of its month so it
                # lines up with the monthly userstats keys.
                date = datetime.datetime.strptime(row[date_col], "%Y-%m-%d").date()
                date = date.replace(day=1)
                date = date.strftime("%Y-%m-%d")
                user = data.setdefault(username, {})
                user.setdefault('uid', "")  # if we loaded the user from mvexel style, this will be set already
                user_edits = user.setdefault('edits', {}).setdefault(date, {})
                total_edits = data['total']['edits'].setdefault(date, {})
                for field, col in field_cols:
                    value = int(row[col])
                    user_edits[field] = user_edits.get(field, 0) + value
                    total_edits[field] = total_edits.get(field, 0) + value

    # For each user in this place, add to the usercounts list
    # Note that we're not actually counting anything...
    # ...just listing the places where this user can be found
    for username in sorted(data.keys()):
        if username not in usercounts:
            # uid is taken from the first place the user is seen in.
            usercounts[username] = {'uid': data[username]['uid'],
                                    'places': [placename]}
        else:
            usercounts[username]['places'].append(placename)
# Finally print the usercounts into a new file, stepping through a sorted list
# of usernames. Print the number of places for each user, as well as a
# comma-separated list of placenames (note that this is a TSV file, so they
# will all be in the same column).
# NOTE(review): 'wb' is the Python 2 csv idiom; under Python 3 this must
# become open(output_filename, 'w', newline='') — confirm target interpreter.
with open(output_filename, 'wb') as outfile:
    doc = csv.writer(outfile, dialect='excel', delimiter='\t')
    doc.writerow(['username','uid','count','places'])
    for username in sorted(usercounts.keys()):
        info = usercounts[username]
        doc.writerow([username,
                      info['uid'],
                      len(info['places']),
                      ','.join(info['places'])])