# 淘宝双11做了较大调整,不再按接口返回数据,改用正则表达式爬取,代码仅供参考
import requests
import re
import json
def _find_page_config(html):
    """Return the raw JSON text assigned to ``g_page_config`` in *html*, or None."""
    # The search page embeds its data as a JavaScript assignment; capture the
    # object literal between "g_page_config = " and the terminating ";".
    match = re.search(r'g_page_config = (\{.+\});\s*', html, re.M)
    return match.group(1) if match else None


def _auctions_from_config(config):
    """Return the list at config['mods']['itemlist']['data']['auctions'], or []."""
    node = config
    for key in ('mods', 'itemlist', 'data', 'auctions'):
        # Any missing level is treated as "no listings" instead of a KeyError.
        if not isinstance(node, dict) or key not in node:
            return []
        node = node[key]
    return node if isinstance(node, list) else []


def spider_tb(sn, book_list=None):
    """Search Taobao for *sn* and collect the listings found on the result page.

    The result page is fetched from ``https://s.taobao.com/search`` and the
    JSON object assigned to ``g_page_config`` is extracted with a regular
    expression. Each listing is printed and appended to *book_list*.

    Args:
        sn: Search keyword sent as the ``q`` query parameter.
        book_list: Optional list to append results to. A fresh list is
            created when omitted, so results never leak between calls.

    Returns:
        The list that received the results. Each entry is a dict with the
        keys ``title``, ``price``, ``link`` and ``store``. The list is
        returned unchanged when the page contains no ``g_page_config``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the captured text is not valid JSON.
        KeyError: If a listing lacks one of the expected fields.
    """
    if book_list is None:
        book_list = []

    # Let requests build the query string so the keyword is URL-encoded, and
    # bound the wait so a stalled connection cannot hang the caller forever.
    response = requests.get(
        'https://s.taobao.com/search', params={'q': sn}, timeout=10
    )

    raw_config = _find_page_config(response.text)
    if raw_config is None:
        return book_list

    print(raw_config)
    auctions = _auctions_from_config(json.loads(raw_config))
    print(len(auctions))

    for item in auctions:
        # Title
        title = item["raw_title"]
        print(title)
        # Price
        price = item["view_price"]
        print(price)
        # Purchase link
        link = item["detail_url"]
        print(link)
        # Seller
        store = item["nick"]
        print(store)
        book_list.append({'title': title, 'price': price, 'link': link, 'store': store})
        print('{title}:{price}:{link}:{store}'.format(
            title=title, price=price, link=link, store=store))

    return book_list
# Script entry point: run one sample search when executed directly.
if __name__ == '__main__':
    sample_sn = '9787115428028'
    spider_tb(sample_sn)