-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspiderNews.py
64 lines (56 loc) · 1.73 KB
/
spiderNews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from requests_html import HTMLSession
import requests
import pymysql
import os
# Base URL of the site being scraped; site-relative paths are appended to it.
host = 'http://www.zhishabang.cn/'
# Shared requests_html session used to fetch the article pages.
session = HTMLSession()
def download(url):
    """Fetch ``host + url`` and save the response body to the local path ``url``.

    Args:
        url: Site-relative path of the resource (no leading slash). It is
            appended to the module-level ``host`` to build the request URL
            and is also used verbatim as the local file path, so its parent
            directory must already exist.

    A non-success HTTP response is reported and skipped so that an error
    page is never written to disk in place of the requested file. Network
    errors raised by ``requests`` (including timeouts) propagate to the
    caller.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(host + url, timeout=30)
    if not r.ok:
        # Message reads "download failed"; kept in Chinese like the script's
        # other console output.
        print('下载失败 ' + host + url + ' (HTTP ' + str(r.status_code) + ')')
        return
    with open(url, 'wb') as f:
        f.write(r.content)
def getUrls(imgs):
    """Download the file behind each ``<img>`` element into ``uploads/allimg/``.

    Args:
        imgs: Iterable of elements exposing an ``attrs`` mapping (the caller
            passes the result of a ``requests_html`` ``find`` call).

    For every element the ``src`` attribute is split on ``/``; component 3 is
    used as the sub-folder and component 4 as the file name, so the expected
    shape is presumably ``/uploads/allimg/<folder>/<file>`` (NOTE(review):
    confirm against the site's markup). Elements without a usable ``src`` are
    reported and skipped instead of raising, so one odd image does not abort
    the article being processed.
    """
    for img in imgs:
        src = img.attrs.get('src')
        parts = src.split('/') if src else []
        folder = parts[3] if len(parts) >= 5 else ''
        filename = parts[4] if len(parts) >= 5 else ''
        # Reject missing components and '.'/'..' so that remote HTML cannot
        # steer the local write outside uploads/allimg/<folder>/.
        if folder in ('', '.', '..') or filename in ('', '.', '..'):
            # Message reads "skipping image".
            print('跳过图片 ' + repr(src))
            continue
        file_dir = 'uploads/allimg/' + folder
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(file_dir, exist_ok=True)
        download(file_dir + '/' + filename)
# ---- Database setup ----
# Open the database connection. The arguments are passed by keyword because
# PyMySQL 1.0+ made connect() keyword-only; the former positional order was
# host, user, password, database.
# NOTE(review): the host and credentials are hard-coded in source; consider
# loading them from the environment or a config file kept out of version
# control.
db = pymysql.connect(
    host="39.105.191.239",
    user="fastadmin",
    password="fastadmin",
    database="fastadmin",
    charset='utf8',
)
# Obtain a cursor for executing statements.
cursor = db.cursor()

# ---- News scraping ----
# The original note here read "415 items" -- presumably the number of
# articles already collected before this run (TODO confirm).
newList = []  # not referenced anywhere else in the visible script
T = []  # one [title, content_html] row per successfully scraped article
# range(415, 445) plus the +1 below visits article ids 416..445.
for i in range(415, 445):
    article_id = i + 1
    try:
        # host already ends with '/', so strip it to avoid a '//article/' URL.
        url = host.rstrip('/') + '/article/' + str(article_id) + '.html'
        # A timeout keeps one stalled request from hanging the whole run.
        page = session.get(url, timeout=30)
        # find(..., first=True) yields None when the selector has no match;
        # the resulting AttributeError is reported as a failure below.
        title = page.html.find('.news_detail h1', first=True).text
        # The publication time is available under
        # '.news_detail .news_time time' but is not stored.
        paragraphs = page.html.find('.news_sub p')
        imgs = page.html.find('.news_sub img')
        getUrls(imgs)
        # The first <p> is deliberately left out, as in the original loop
        # (NOTE(review): presumably a header/meta paragraph -- confirm).
        contentHtml = ''.join(p.html for p in paragraphs[1:])
        T.append([title, contentHtml])
        print('第'+str(article_id)+'成功√')
    except Exception as exc:
        # Catch Exception rather than using a bare except so Ctrl-C and
        # SystemExit still stop the script; include the cause in the output.
        print('第'+str(article_id)+'条失败===================', repr(exc))
# Parameterised INSERT statement; the driver substitutes the %s placeholders.
sql = "INSERT INTO fa_article(title,content) VALUES (%s,%s)"
try:
    # executemany takes a sequence of parameter sequences (tuples or lists),
    # here one [title, content] pair per article.
    cursor.executemany(sql, T)
    # Any statement that modifies table contents must be committed,
    # otherwise it has no effect.
    db.commit()
finally:
    # Close the database connection even if the insert or commit raised.
    db.close()