LogCheck.py
import sys
import os.path
import csv

# Script generates a list of URLs from an Apache web access log that have the fewest unique IP addresses or unique user-agents
# Written for Python 3

# Target percentage of URLs to display
# May display more when several URLs share the same count
urlpercentage = 0.05

# Holds the filename of the Apache web access log
weblogfileName = None

# Apache log fields
apachelogsfields = ['ip', 'identd', 'frank', 'time_part0', 'time_part1', 'request', 'status', 'size', 'referer', 'user_agent']
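
# A sketch of the expected input, assuming the Apache combined log format; the sample
# line below is the hypothetical example from the Apache documentation:
#   127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08"
# Splitting on single spaces leaves the bracketed timestamp in two pieces, which is why the
# field list above carries both 'time_part0' and 'time_part1'; the quoted request, referer,
# and user-agent strings each stay in one field because the csv reader handles the quotes.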

# Function prints the URLs with the lowest counts of unique IP addresses and unique user-agents
def analyze_weblog(filename):
    uniqueurlcount = 0  # count of unique URLs in the web log
    urls = []  # list of unique URLs; its indices also index the per-URL lists of IP addresses and user-agents
    uniqueipcount = []  # list of unique IP address counts per URL
    uniqueuseragentscount = []  # list of unique user-agent counts per URL
    iplist = []  # list of lists of IP addresses per unique URL, used to track unique IP addresses
    useragentlist = []  # list of lists of user-agents per unique URL, used to track unique user-agents
print("The weblog file to analyze is %s" % filename)
with open(filename, mode='r') as csv_file: # read in web log as csv file
csv_reader = csv.reader(csv_file, delimiter=' ')
for row in csv_reader:
# handles simple case where file has comments start with #
if (row[0][0] != '#'):
# extract only fields of interest from the web log
ipaddress = row[apachelogsfields.index('ip')] # ip address
request = row[apachelogsfields.index('request')] # request (URL part of request)
status = row[apachelogsfields.index('status')] # user-agent
user_agent = row[apachelogsfields.index('user_agent')]
# print('ipaddress: %s request: %s status: %s user_agent: %s' % (ipaddress, request, status, user_agent))
url = (request.partition(' ')[2]).partition(' ')[0] # extract URL from request field
# print ('url %s' % url)
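                # For example, a hypothetical request 'GET /index.html HTTP/1.1' partitions to
                # ('GET', ' ', '/index.html HTTP/1.1') and then to the url '/index.html'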
                if '200' <= status <= '299':  # only requests with a 2xx status (string comparison works because status codes are three digits)
                    if url not in urls:  # determine whether the URL has already been seen
                        uniqueurlcount += 1  # if not, increment the unique URL count
                        urls.append(url)  # append the new URL to the unique URL list
                        uniqueipcount.append(0)  # append a zero element to the unique IP address count list
                        uniqueuseragentscount.append(0)  # append a zero element to the unique user-agents count list
                        newiplist = []  # new empty list for IP address tracking for this URL
                        iplist.append(newiplist)  # append the empty list to the list of lists of IP addresses per URL
                        newuseragentlist = []  # new empty list for user-agent tracking for this URL
                        useragentlist.append(newuseragentlist)  # append the empty list to the list of lists of user-agents per URL
                    if user_agent not in useragentlist[urls.index(url)]:  # determine whether this user-agent has been seen for this URL
                        useragentlist[urls.index(url)].append(user_agent)  # if not, append it to this URL's user-agent list
                        uniqueuseragentscount[urls.index(url)] += 1  # and increment the unique user-agent count for that URL
                    if ipaddress not in iplist[urls.index(url)]:  # determine whether this IP address has been seen for this URL
                        iplist[urls.index(url)].append(ipaddress)  # if not, append it to this URL's IP address list
                        uniqueipcount[urls.index(url)] += 1  # and increment the unique IP address count for that URL
    # print(urls)
    # print('uniqueurlcount: %s' % uniqueurlcount)
    # print(uniqueuseragentscount)
    # print(uniqueipcount)
    # print('amount of useragentlist: %s' % len(uniqueuseragentscount))
    # print('amount in the iplist: %s' % len(uniqueipcount))
    numberofurltodisplay = urlpercentage * uniqueurlcount  # determine how many URLs correspond to the desired percentage
    intnumberofurltodisplay = int(numberofurltodisplay)
    if numberofurltodisplay > intnumberofurltodisplay:  # round up
        intnumberofurltodisplay += 1
    tempuniqueuseragentscount = uniqueuseragentscount.copy()  # create a temporary copy of the unique user-agent counts to sort
    tempuniqueuseragentscount.sort()
    # lists are 0-indexed, so subtract 1 from the index
    useragentcounttodisplay = tempuniqueuseragentscount[intnumberofurltodisplay - 1]  # the unique user-agent count threshold to display
    tempuniqueipcount = uniqueipcount.copy()  # create a temporary copy of the unique IP address counts to sort
    tempuniqueipcount.sort()
    # lists are 0-indexed, so subtract 1 from the index
    ipcounttodisplay = tempuniqueipcount[intnumberofurltodisplay - 1]  # the unique IP address count threshold to display
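    # A worked example with hypothetical numbers: with 50 unique URLs and urlpercentage = 0.05,
    # numberofurltodisplay = 2.5, which rounds up to intnumberofurltodisplay = 3; the 3rd-smallest
    # value in each sorted count list then becomes the display threshold, and every URL whose
    # count is at or below that threshold is printed below.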
    print('URLs with the fewest user agents')
    print('--------------------------------')
    for count in range(0, useragentcounttodisplay + 1):  # step through counts up to the user-agent threshold so URLs are printed in order of count
        index = 0
        for elementuseragentcount in uniqueuseragentscount:  # step through the unique user-agent count list
            if elementuseragentcount == count:  # list the URLs whose unique user-agent count equals the current count
                print(urls[index])
            index += 1
    print('URLs with the fewest IP addresses')
    print('---------------------------------')
    for count in range(0, ipcounttodisplay + 1):  # step through counts up to the IP address threshold so URLs are printed in order of count
        index = 0
        for elementipcount in uniqueipcount:  # step through the unique IP address count list
            if elementipcount == count:  # list the URLs whose unique IP address count equals the current count
                print(urls[index])
            index += 1


if __name__ == '__main__':
    try:
        if len(sys.argv) == 2:  # simple check that an argument was passed (assumed to be the web log file)
            weblogfileName = sys.argv[1]
            print("Web log file to read is %s" % weblogfileName)
            if os.path.isfile(weblogfileName):
                analyze_weblog(weblogfileName)
        else:
            print('Usage: python3 %s <weblogfile>' % sys.argv[0])  # print usage statement
    except Exception as e:
        print("You must provide a valid filename (path) of a web logfile")
        raise
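
# A minimal usage sketch; the log path below is hypothetical:
#   python3 LogCheck.py /var/log/apache2/access.log
# The script prints two lists: first the URLs with the fewest unique user-agents,
# then the URLs with the fewest unique client IP addresses, each ordered from the
# smallest count upward.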