wwwordlist.py
#!/usr/bin/env python3
# Scrape a Page and Spit Out Unique Word List for Enumeration
# I whipped this up in a pinch when CeWL was giving me problems.
# 2022 - @RackunSec
import requests
import sys
import re
import urllib3 ## Used to weed out the SSL/TLS warnings
urllib3.disable_warnings() ## Disable SSL/TLS certificate warnings
class Wwwordlist():
    def __init__(self):
        ## Update this for the user agent:
        self.ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0'
        self.unique_words = [] ## Store all unique words - will be printed at the end.
        self.unique_attribs_list = [] ## Store all attributes of HTML tags
    def usage(self): ## Simple Usage
        print("Usage: python3 wwwordlist.py (URI) | tee file.txt")
        sys.exit()
    def word_check(self,word): ## Check word to make sure it's a word:
        if "/" in word or ":" in word:
            self.delim_line_splitter(word)
            return
        if re.match(r'^[^A-Za-z0-9]+$',word) or re.match(r'^$',word) or len(word)<=1 or re.match(r'^\s*https://[^ ]+$',word) or re.search('[^\x00-\x7F]+',word):
            ## Things like "..." or "&..;", blank lines, single characters, URLs, non-ASCII text, etc
            return False
        else:
            return True
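    ## For illustration (hypothetical inputs): "admin" passes, "..." and
    ## non-ASCII words are rejected, and "a/b" is re-split above instead.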
    def validate_uri(self,uri): ## Was a Valid URI/URL given to me?
        if re.match(r"^http(s)?://.*",uri):
            return True
        else:
            self.print_error(f"Not a Valid URI: {uri}")
    def print_error(self,msg): ## Print Errors to the screen and exit.
        print(f"[!] {msg}")
        sys.exit()
    def get_dom(self,uri): ## Get the DOM from the provided URI/URL
        headers = {"User-Agent": self.ua}
        try:
            response_data = requests.get(uri,headers=headers,verify=False) ## Make the HTTP request and get the data
            self.generate_wordlist(response_data) ## generate the word list from this.
        except Exception as e:
            print(f"[!] Could not access {uri}: {e}")
            sys.exit()
    def scrub_dom_line(self,line): ## Scrub the DOM line of crap that shouldn't be in a wordlist
        line_scrubbed = re.sub(r"<[^>]+(>|$)","",line) # Delete all HTML tags
        line_scrubbed = re.sub(r"(^\s+|\s+$)","",line_scrubbed) # Remove leading and trailing whitespace
        line_scrubbed = re.sub(r"&[a-z]+;"," ",line_scrubbed) # Replace HTML entities like &amp; with a space
        line_scrubbed = re.sub(r"[^A-Za-z0-9._/ -]","",line_scrubbed) # Remove everything that is not a word character ("._/-" are okay)
        return line_scrubbed
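    ## For illustration (hypothetical input): a line like
    ##   <p>Read&nbsp;the docs!</p>
    ## scrubs down to "Read the docs" before it is split into words.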
    def delim_line_splitter(self,line):
        ## Split on "/" and ":" the same way. This could include URIs,
        ## but self.word_check() will remove them.
        for delim in ("/",":"):
            if delim in line:
                for line_item in line.split(delim):
                    if " " in line_item: # the chunk had the delimiter AND a space:
                        for word2 in line_item.split():
                            if word2 not in self.unique_words: # skip duplicates
                                if self.word_check(word2):
                                    self.unique_words.append(word2)
                    else:
                        if len(line_item)!=1 and line_item not in self.unique_words: # skip single characters and duplicates
                            if self.word_check(line_item):
                                self.unique_words.append(line_item)
        if re.search(r'\[[^]]+\]',line): ## Pull words out of square brackets:
            data = re.sub(r'[^[]*\[([^\]]+)\].*',r'\1',line)
            if self.word_check(data):
                self.unique_words.append(data)
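    ## For illustration (hypothetical input): "admin/login.php" splits into
    ## "admin" and "login.php", each kept once if it passes self.word_check().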
    def generate_wordlist(self,response_data): ## generate and print the actual wordlist
        for line in response_data.iter_lines():
            try:
                decoded = line.decode("utf-8")
            except UnicodeDecodeError: ## Non-UTF-8 bytes found.
                continue
            if re.search(r'<[^=>]+="',decoded): ## We have attributes we can pull out:
                attribs = decoded.split("=") ## Split the line up
                for attrib in attribs: ## Look at each side of the equation:
                    if re.search(r'"[^"]+"',attrib):
                        attrib_clean = re.sub(r'^[^"]*"([^"]+)".*',r'\1',attrib)
                        if attrib_clean not in self.unique_attribs_list:
                            if self.word_check(attrib_clean):
                                self.unique_attribs_list.append(attrib_clean)
                continue ## We can continue now that we pulled out the attributes
            if decoded == "":
                continue
            line_scrubbed = self.scrub_dom_line(decoded)
            if line_scrubbed == "": ## skip lines that scrubbed down to nothing
                continue
            for line_item in line_scrubbed.split(): # split by whitespace
                if line_item not in self.unique_words: # skip duplicates
                    if self.word_check(line_item):
                        self.unique_words.append(line_item)
        self.unique_words = self.unique_words + self.unique_attribs_list ## Combine the lists
        self.unique_words.sort() ## Sort the list
        for word in self.unique_words:
            print(word)
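    ## For illustration (hypothetical input): a line like
    ##   <input name="username" id="user">
    ## yields the attribute values "username" and "user" via the split-on-"="
    ## pass above, while plain-text lines go through scrub_dom_line() instead.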
def main():
    wwwordlist = Wwwordlist() ## instantiate the object from the class above
    if len(sys.argv)!=2:
        wwwordlist.usage()
    else:
        wwwordlist.validate_uri(sys.argv[1]) ## This will fail if bad URI
        wwwordlist.get_dom(sys.argv[1]) ## do your stuff, wwwordlist!
if __name__ == "__main__":
    main()
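## A minimal usage sketch (example.com is a placeholder target):
##   python3 wwwordlist.py https://example.com | tee wordlist.txt
## The sorted unique words print to stdout, one per line, ready to feed
## into an enumeration tool.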