-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathget-email-by-tieba.py
272 lines (256 loc) · 10.1 KB
/
get-email-by-tieba.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#@author 昌维 [[email protected]]
#@version 2016-07-28 21:05:47
import urllib2
import urllib
import hashlib
import json
import re
import sys
import getopt
import time
# class UnicodeStreamFilter:
# def __init__(self, target):
# self.target = target
# self.encoding = 'utf-8'
# self.errors = 'replace'
# self.encode_to = self.target.encoding
# def write(self, s):
# if type(s) == str:
# s = s.decode("utf-8")
# s = s.encode(self.encode_to, self.errors).decode(self.encode_to)
# self.target.write(s)
# if sys.stdout.encoding == 'cp936':
# sys.stdout = UnicodeStreamFilter(sys.stdout)
def main():
print """
*****************************************
* Get Tieba's Email&Telephone *
* Code By changwei *
*****************************************
author : changwei
date : 2016-07-28 21:05:47
email : [email protected]
website : www.changwei.me
-----------------------------------------
usage : <-t thread_id | -f forum_name -s start_page -e end_page> <-o out_file_path> [-i split_str]
example1 : -t 2314539885 -o C:\info.txt
example2 : -f bug -o C:\info.txt
example3 : -f bug -o C:\info.txt -i ------
output_file3 : [email protected]
-----------------------------------------
"""
# print sys.argv
opts, args = getopt.getopt(sys.argv[1:], "t:f:s:e:o:i:")
thread_id = ''
forum_name = ''
start_page = ''
end_page = ''
out_file_path = ''
split_str = ''
for op, value in opts:
if op == "-t":
thread_id = value
elif op == "-f":
forum_name = value
elif op == "-s":
start_page = int(value)
elif op == "-e":
end_page = int(value)
elif op == "-o":
out_file_path = value
elif op == "-i":
split_str = value
info = [] # 用于存储fetch结果
if thread_id == '' and forum_name == '':
print '1.fetch by forum_name\n2.fetch by thread_id'
fetch_type = raw_input("please press the fetch type:")
# print '-----------------------------------------\nplease input the parameter'
if fetch_type=='1': # fetch by forum_name
forum_name = raw_input("please input the forum_name:")
start_page = raw_input("please input the start_page:")
end_page = raw_input("please input the end_page:")
out_file_path = raw_input("please input the out_file_path:")
info = info + fetchByForumName(forum_name,start_page,end_page,split_str)
elif fetch_type=='2': # fetch by thread_id
thread_id = raw_input("please input the thread_id:")
out_file_path = raw_input("please input the out_file_path:")
info = info + fetchByThreadId(thread_id,split_str)
else:
print 'error input!!!'
elif thread_id != '' and out_file_path != '':
info = info + fetchByThreadId(thread_id,split_str)
elif forum_name != '' and start_page != '' and end_page != '' and out_file_path != '':
info = info + fetchByForumName(forum_name,start_page,end_page,split_str)
else:
print 'error input!!!'
return
write_to_file(out_file_path,info)
print 'Please open the file. Path:[' + out_file_path + ']'
# print thread
# thread['post_list']['0']['content']['0']['text'] content同级[author][name]为id
# thread['thread']['valid_post_num']
# print getInfoByInput('[email protected],[email protected],[email protected],18970459733,12345678901')
#
def initInfo():
ISOTIMEFORMAT='%Y-%m-%d %X'
print '' + time.strftime( ISOTIMEFORMAT, time.localtime() ) + ' start!!!\n'
def write_to_file(out_file_path,content):
'写入content集合到out_file_path里面,此处content为抓取得到的数据,数据类型为list,用户可以手动修改保存方式,比如说保存在数据库或者json里面等等'
file = open(out_file_path,'a')
file.writelines(content)
print content
def fetchByForumName(forum_name,start_page,end_page,split_str=''):
'从指定贴吧中抓取邮箱和手机号数据'
initInfo()
info = []
tids = getTidsByKw(forum_name,start_page,end_page)
for tid in tids:
if tid == '':continue
thread_json = getThreadByTid(tid=tid,pn='0')
thread = json.loads(thread_json)
pn_total = int(thread['thread']['valid_post_num']) / 30 + 1
for x in xrange(0,pn_total):
thread_json = getThreadByTid(tid=tid,pn=str(x))
thread = json.loads(thread_json)
for thread_text in thread['post_list']:
# print thread_text['content']
if len(thread_text['content'])>0:
if thread_text['content'][0].has_key('text'):
if len(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')))!=0:
if split_str == '':
info.append(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')) + '\n')
else:
info.append(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')) + split_str + thread_text['author']['name'].encode('utf8') + '\n')
return info
def fetchByThreadId(tid,split_str=''):
'从指定帖子中抓取邮箱和手机号数据'
initInfo()
info = []
thread_json = getThreadByTid(tid=tid,pn='0')
thread = json.loads(thread_json)
pn_total = int(thread['thread']['valid_post_num']) / 30 + 1
for x in xrange(0,pn_total):
thread_json = getThreadByTid(tid=tid,pn=str(x))
thread = json.loads(thread_json)
for thread_text in thread['post_list']:
# print thread_text['content'][0]['text'].encode('utf8')
if len(thread_text['content'])>0:
if thread_text['content'][0].has_key('text'):
if len(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')))!=0:
if split_str == '':
info.append(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')) + '\n')
else:
info.append(getInfoByInput(thread_text['content'][0]['text'].encode('utf8')) + split_str + thread_text['author']['name'].encode('utf8') + '\n')
return info
def getThreadByTid(tid,pn='0'):
'通过tid获取帖子内容'
data = ['_client_id=wappc_1396611108603_817',
'_client_type=2',
'_client_version=5.7.0',
'_phone_imei=642b43b58d21b7a5814e1fd41b08e2a6',
'from=tieba',
'kz=' + tid,
'pn=' + pn,
'q_type=2',
'rn=30',
'with_floor=1']
# print getSignByPostData(data)
# print data
data.append("sign="+getSignByPostData(data))
#定义post的地址
url = 'http://c.tieba.baidu.com/c/f/pb/page'
# post_data = urllib.urlencode(data)
post_data = "&".join(data)
# 设置头部
headers = { 'Content-Type':'application/x-www-form-urlencoded',
'Referer':'http://tieba.baidu.com/',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
'Connection':'keep-alive' }
# req.add_header('Content-Type','application/x-www-form-urlencoded');
# req.add_header('Referer','http://tieba.baidu.com/');
# req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0');
# req.add_header('Connection','keep-alive');
# print post_data
#提交,发送数据
req = urllib2.Request(url, post_data, headers)
response = urllib2.urlopen(req)
#获取提交后返回的信息
content = response.read()
return content
def getTidsByKw(kw='',start_page='0',end_page=''):
'通过贴吧名获取帖子号列表'
tids = []
for pn in xrange(int(start_page),int(end_page)):
data = [
'_client_id=wappc_1396611108603_817',
'_client_type=2',
'_client_version=5.7.0',
'_phone_imei=642b43b58d21b7a5814e1fd41b08e2a6',
'from=tieba',
'kw=' + kw,
'pn=' + str(pn),
'q_type=2',
'rn=30',
'with_group=1']
# print getSignByPostData(data)
# print data
data.append("sign="+getSignByPostData(data))
#定义post的地址
url = 'http://c.tieba.baidu.com/c/f/frs/page'
# post_data = urllib.urlencode(data)
post_data = "&".join(data)
# 设置头部
headers = { 'Content-Type':'application/x-www-form-urlencoded',
'Referer':'http://tieba.baidu.com/',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
'Connection':'keep-alive' }
# req.add_header('Content-Type','application/x-www-form-urlencoded');
# req.add_header('Referer','http://tieba.baidu.com/');
# req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0');
# req.add_header('Connection','keep-alive');
# print post_data
#提交,发送数据
req = urllib2.Request(url, post_data, headers)
response = urllib2.urlopen(req)
#获取提交后返回的信息
content = response.read()
forum_json = json.loads(content)
tids = tids + forum_json['forum']['tids'].split(',')
return tids
def getSignByPostData(post_data):
'通过post数据获得sign校验码'
sign = hashlib.md5()
# print "".join(post_data)
sign.update("".join(post_data)+"tiebaclient!!!")
return sign.hexdigest()
def getInfoByInput(inputmail):
"""
说明:
[^\._-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+$匹配邮箱。
综合目前国内常用的邮箱,大概通用的规则包括:
1、[^\._],不能以下划线和句点开头。
2、[\w\.]+,包括字母、数字。而对句点及下划线各提供商有差别,对此有效性不做更严格的判断。
3、@是必须的。
4、(?:[A-Za-z0-9]+\.)+[A-Za-z]+$,@后以xxx.xxx结尾,考虑到多级域名,会有这种情况xxx.xxx.xxx如[email protected]
^0\d{2,3}\d{7,8}$|^1[358]\d{9}$|^147\d{8}$匹配电话号码。
只考虑国内的情况,大概通用的规则包括:
1、^0\d{2,3},固定电话区号3-4位数字,以0开头。
2、d{7,8}$,固定电话一般7-8位数字。
3、国内目前的手机号码都是11位数字,三大运营商的号码段基本都在上面列出来了,我们这里除了147的号码的段,其他的都只考虑前两位,
第三位就不考虑了,否则,工作量也很大。前两位包括13*、15*、18*。
"""
# p=re.compile('[^\._-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+$|^0\d{2,3}\d{7,8}$|^1[358]\d{9}$|^147\d{8}')
# match = p.match(inputmail)
# return match.group()
#
regex = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}|1[3|4|5|7|8]\d{9}\b", re.IGNORECASE)
result = str(re.findall(regex, inputmail)).replace("']",'').replace("['",'').replace("[]",'').replace("', '",'')
return result
if __name__ == '__main__':
start_time = time.time()
main()
end_time = time.time()
print '\nFinished in ' + str(end_time - start_time) + 's'
print "Get Tieba's Email&Telephone. Code By changwei[[email protected]]"