import requests
from lxml import html
from openpyxl import Workbook
def _first(values, default=''):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def spider(sn, book_list=None):
    """Scrape JD.com search results for *sn* and save them to ``./book_jd.xlsx``.

    Each result item found under ``div#J_goodsList`` is appended to
    *book_list* as a dict with the keys ``title``, ``price``, ``link`` and
    ``store``, and written as one row of the spreadsheet.

    Args:
        sn: Search keyword inserted into the JD search URL (the script
            entry point passes an ISBN-like string).
        book_list: Optional list to append the scraped records to. A new
            list is created for every call when omitted, so results never
            leak from one call into the next.

    Returns:
        The list holding the scraped records (the same object as
        *book_list* when one was passed in).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not answer within the timeout.
    """
    if book_list is None:
        book_list = []

    url = 'https://search.jd.com/Search?keyword={0}'.format(sn)
    print(url)

    wb = Workbook()
    ws = wb.active
    ws.title = '京东图书爬取'
    ws.append(['标题', '价格', '店铺', '购买链接'])

    # Fetch the HTML document. A timeout keeps a stalled connection from
    # blocking the script forever.
    resp = requests.get(
        url,
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400'
        },
        timeout=10,
    )
    print(resp.encoding)
    # Force UTF-8 decoding regardless of the encoding detected above.
    resp.encoding = 'utf-8'
    html_doc = resp.text

    # Parse the document so it can be queried with XPath.
    selector = html.fromstring(html_doc)

    # Locate the list of result items.
    ul_list = selector.xpath('//div[@id="J_goodsList"]/ul/li')
    print(len(ul_list))

    # Extract title, link, price and store from every item.
    for li in ul_list:
        title = _first(li.xpath('div/div[@class="p-name"]/a/@title'))
        if not title:
            # An item without a title cannot be recorded meaningfully;
            # skip it instead of aborting the whole run with an IndexError.
            continue
        print(title)

        # Purchase link
        link = _first(li.xpath('div/div[@class="p-name"]/a/@href'))
        print(link)

        # Price
        price = _first(li.xpath('div/div[@class="p-price"]/strong/i/text()'))
        print(price)

        # Store; falls back to an empty string when the item has no shop link.
        store = _first(li.xpath('div//a[@class="curr-shop"]/@title'))
        print(store)

        book_list.append({
            'title': title,
            'price': price,
            'link': link,
            'store': store,
        })

        # Add the new row, in the same column order as the header. The values
        # are already text, so they are written as-is without encoding.
        ws.append([title, price, store, link])

    wb.save('./book_jd.xlsx')
    return book_list
if __name__ == "__main__":
    # Script entry point: run a single search for one fixed keyword.
    spider(sn="9787115428028")