大致分析了一下京东的评论:相同手机型号的产品共用同一批评论,所以每个型号只需爬取一个产品即可。
每一类评论(全部/好评/中评/差评)最多只能爬 100 页,每页 10 条;把全部评价与好、中、差评合并后,大概能得到 2000 多条不重复的评论。
下面 comment_url 中的占位符:{productId} 是对应产品的 productId;{page} 是页码(从 0 开始);
{score} 表示评价类型:0 为全部评价,1 为差评,2 为中评,3 为好评。
# Comment-list endpoint template. Placeholders filled in per request:
#   {productId} - product id taken from the product page URL
#   {score}     - review filter: 0 = all, 1 = negative, 2 = neutral, 3 = positive
#   {page}      - page index (10 comments per page)
comment_url = 'https://sclub.jd.com/comment/productPageComments.action?productId={productId}&score={score}&sortType=5&page={page}&pageSize=10&isShadowSku=0&rid=0&fold=1'

# Fetch the landing page once and index every product link found inside the
# <div class="erji"> menus. A timeout keeps a stalled connection from hanging
# the import forever.
response = requests.get(start_urls[0], timeout=10)
soup = BeautifulSoup(response.content, 'lxml')
content = soup.find_all('div', class_='erji')

# Maps absolute product URL -> [productId, link text (the model name)].
dic = {}
model = ''

# Compiled once, outside the loops; raw string avoids the invalid-escape warning.
product_id_re = re.compile(r'\d+')
for a in content:
    for link in a.find_all('a'):
        href = link.get('href')
        # Skip anchors without an href or without a numeric id instead of
        # crashing on None / an empty match list.
        match = product_id_re.search(href) if href else None
        if match is None:
            continue
        # hrefs are expected to be protocol-relative ("//item.jd.com/...").
        dic['https:' + href] = [match.group(), link.get_text()]
# print('------->',dic)
{'https://item.jd.com/5544068.html': ['5544068', 'HUAWEI Mate 10'],
'https://item.jd.com/5826236.html': ['5826236', 'HUAWEI Mate 10 Pro'],
'https://item.jd.com/3888284.html': ['3888284', 'HUAWEI Mate 9'],
'https://item.jd.com/3749093.html': ['3749093', 'HUAWEI Mate 9 Pro'],
'https://item.jd.com/6946605.html': ['6946605', 'HUAWEI P20'],
'https://item.jd.com/6946625.html': ['6946625', 'HUAWEI P20 Pro'],
'https://item.jd.com/5148387.html': ['5148387', '麦芒6 极光蓝'],
'https://item.jd.com/8026730.html': ['8026730', 'HUAWEI nova 3'], ...}
def start_requests(self):
    """Yield one comment-page request per (product, score filter, page).

    Iterates every entry of ``self.dic`` (values are ``[productId, model]``),
    the four score filters 0..3, and page indices 0..99, formatting
    ``self.comment_url`` for each combination and routing the response to
    ``self.parse``.

    The product's model name is attached to each request as
    ``meta={'model': ...}`` so the callback can tell which product a
    response belongs to. The module-level ``model`` global is still updated
    for callbacks that read it, but it only reflects the product currently
    being iterated here, which may not be the product a later-arriving
    response belongs to.
    """
    global model
    for product_id, product_model in self.dic.values():
        model = product_model
        for score in range(4):
            # 100 pages (indices 0..99) is the per-filter cap; the previous
            # ``page < 101`` bound issued one extra request past it.
            for page in range(100):
                yield Request(
                    self.comment_url.format(productId=product_id, score=score, page=page),
                    self.parse,
                    dont_filter=True,
                    meta={'model': product_model},
                )
                # NOTE(review): time.sleep blocks the calling thread. If this
                # generator is driven by Scrapy's reactor it also stalls
                # in-flight downloads - consider the DOWNLOAD_DELAY setting
                # instead. Kept to preserve the existing throttling.
                time.sleep(1)
def parse(self, response):
    """Turn one comment-page JSON response into ``JdParseItem`` objects.

    The response body is decoded as JSON and its ``'comments'`` list is
    walked; nothing is yielded when that list is empty or null. For each
    comment, every field declared on the item that is also present in the
    comment is copied over. ``productSales`` is reduced to the
    ``saleValue`` of its first entry (``None`` when the list is empty or
    null). Each item's ``'model'`` is taken from ``response.meta['model']``
    when the request carried one, otherwise from the module-level ``model``
    global.
    """
    comments = json.loads(response.text)['comments']
    if not comments:
        return

    # Prefer the model attached to the originating request: the global is
    # shared by all in-flight requests and may already name another product.
    request_model = response.meta.get('model')
    item_model = model if request_model is None else request_model

    for data in comments:
        item = JdParseItem()
        for field in item.fields:
            if field not in data:
                continue
            if field == 'productSales':
                sales = data[field]
                # Guard against an empty/null list instead of raising.
                item[field] = sales[0]['saleValue'] if sales else None
            else:
                item[field] = data[field]
        # Set once per item, after the copy loop, so it cannot be clobbered.
        item['model'] = item_model
        yield item