# -*- coding:utf-8 -*-
from fake_useragent import UserAgent
from lxml import etree
import pandas as pd
import requests
import time
import os
import xlrd
import chardet
import traceback
def getColumnIndex(table, columnName):
    # Return the index of the column whose header (row 0) matches columnName
    columnIndex = None
    for i in range(table.ncols):
        if table.cell_value(0, i) == columnName:
            columnIndex = i
            break
    return columnIndex
def readExcelDataByName(fileName, sheetName):
    # Open a workbook and return (sheet, errorMsg); sheet is None on failure
    table = None
    errorMsg = ""
    try:
        data = xlrd.open_workbook(fileName)
        table = data.sheet_by_name(sheetName)
    except Exception as msg:
        errorMsg = msg
    return table, errorMsg
def readExcelDataByIndex(fileName, sheetIndex):
    # Same as readExcelDataByName, but looks the sheet up by index
    table = None
    errorMsg = ""
    try:
        data = xlrd.open_workbook(fileName)
        table = data.sheet_by_index(sheetIndex)
    except Exception as msg:
        errorMsg = msg
    return table, errorMsg
#def get_all_page(url, page, headers, proxies):
def get_all_page(url, headers, proxies):
    # Scrapes one listing page; the XPath expressions and equip_list are
    # module-level globals read from the Excel sheet in __main__.
    # Print the current page number
    # print('====================================================================================================')
    # print('========================================page:[', page ,']==================================================')
    # print('====================================================================================================')
    html_code = requests.get(url, headers=headers, proxies=proxies).text
    print(RootPath)
    # Product list on the current page
    equip_items = etree.HTML(html_code).xpath(RootPath)
    print(equip_items)
    print(len(equip_items))
    for item in equip_items:
        # Detailed product information
        try:
            title = item.xpath(TitlePath)[0]
            print(title)
            price = item.xpath(PricePath)[0] if item.xpath(PricePath) else 0
            print(price)
            link = 'https://www.amazon.cn/' + item.xpath(LinkPath)[0]
            print(link)
            stars = float(item.xpath(StarsPath)[0][:3].replace(',', '')) if item.xpath(StarsPath) else 0
            #print(stars)
            follows = int(item.xpath(FollowPath)[0].replace(',', '')) if item.xpath(FollowPath) else 0
            #print(follows)
            equip_list.append([title, price, stars, follows, link])
            print(f'Title: {title}  Price: {price}  Stars: {stars}  Follows: {follows}  Link: {link}')
        except Exception:
            continue
    # Get the link to the next page
    # next_page = etree.HTML(html_code).xpath(PagePath)
    # next_page_link = 'https://www.amazon.cn/' + next_page[0] if next_page else ''
    #
    # # If a next page exists, keep scraping
    # if next_page_link:
    #     if page < 11:
    #         page += 1
    #         get_all_page(next_page_link, page, headers, proxies)
if __name__ == '__main__':
    xlsfile = 'C:/Users/nikki/Desktop/excel/example.xlsx'
    table = readExcelDataByName(xlsfile, 'Sheet1')[0]
    line = 2
    # Read the XPath configuration from row `line` of the sheet
    url = table.cell_value(line, getColumnIndex(table, 'url'))
    RootPath = table.cell_value(line, getColumnIndex(table, 'RootPath'))
    TitlePath = table.cell_value(line, getColumnIndex(table, 'TitlePath'))
    PricePath = table.cell_value(line, getColumnIndex(table, 'PricePath'))
    LinkPath = table.cell_value(line, getColumnIndex(table, 'LinkPath'))
    StarsPath = table.cell_value(line, getColumnIndex(table, 'StarsPath'))
    FollowPath = table.cell_value(line, getColumnIndex(table, 'FollowPath'))
    PagePath = table.cell_value(line, getColumnIndex(table, 'PagePath'))
    print(url)
    # Request configuration
    #url = 'https://www.amazon.cn/s?rh=n%3A42459071&brr=1&rd=1&ref=sa_menu_softwa_l2_b42459071'
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random
    }
    proxies = {
        # requests expects lowercase scheme keys in the proxies dict
        'https': 'http://182.99.154.21:4235'
    }
    equip_list = []
    # Entry point of the scraper
    #get_all_page(url, page=1, headers=headers, proxies=proxies)
    get_all_page(url, headers=headers, proxies=proxies)
    # Collect the results into a DataFrame
    equips_df = pd.DataFrame(equip_list, columns=['title', 'price', 'stars', 'follows', 'link'])
    # Create the output folder if it does not exist
    if not os.path.isdir('output'):
        os.mkdir('output')
    # Write the DataFrame to csv
    # equips_df.sort_values('follows', ascending=False).to_csv(f'output/equip_follows_rank.csv', sep=',', na_rep='NA', index=False)

The error is raised at line 93 of the script. The exception returned is:

url = table.cell_value(line, getColumnIndex(table, 'url'))
AttributeError: 'NoneType' object has no attribute 'cell_value'
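
That traceback means readExcelDataByName returned table = None, i.e. xlrd.open_workbook or sheet_by_name raised an exception that the helper currently swallows into errorMsg. A minimal sketch for surfacing that swallowed error, assuming the same xlsfile path and helper functions defined above:

xlsfile = 'C:/Users/nikki/Desktop/excel/example.xlsx'
table, errorMsg = readExcelDataByName(xlsfile, 'Sheet1')
if table is None:
    # errorMsg holds the exception that left table as None
    # (e.g. a bad path, a wrong sheet name, or an xlrd version that no longer opens .xlsx)
    print('Could not open the sheet:', errorMsg)
else:
    line = 2
    url = table.cell_value(line, getColumnIndex(table, 'url'))
    print(url)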