The URLs I crawl return JSON, and each one carries the ten records I need. To keep paginating I only change one value in the start URL, much like paging through Zhihu answers. But when I just run the spider I get a large amount of duplicate data, and only adding a unique constraint to the database avoids it. Is this because my code doesn't filter URLs?
spider
import json

import scrapy

from ..items import baiduJobItem            # project item class (import path assumed)
from ..utils import transCookie, getMd5     # project helpers (import path assumed)


class baiduJobSpider(scrapy.Spider):
    name = 'baiduJob'
    allowed_domains = ['zhaopin.baidu.com']
    # URL template: {0} fills the rn parameter, while pn is fixed at 0
    surl = 'https://zhaopin.baidu.com/api/qzasync?query=%E5%B7%A5%E4%BD%9C%E4%BF%A1%E6%81%AF%E7%BD%91%E7%AB%99&zp_fr=aladdin-5463-zp_exact_new&city_sug=%25E8%25A5%25BF%25E5%25AE%2589&city=%25E8%25A5%25BF%25E5%25AE%2589&is_adq=1&pcmod=1&token=%3D%3DAnSvq1X%2Bt0EyVZbaWZpR5kXqpZadIZlpmmk9WaqtGa&pn=0&rn={0}'
    page = 0  # counter substituted into the template
    start_urls = [surl.format(page)]

    def start_requests(self):
        # attach cookies to the first request
        cookie = "BAIDUID=06C53A82E42CF1C6DA9656C111250C3F:FG=1; BIDUPSID=06C53A82E42CF1C6DA9656C111250C3F; PSTM=1555035049; pgv_pvi=9556887552; delPer=0; PSINO=1; H_PS_PSSID=1452_21123_29237_28518_29099_28834_29221_29458; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZP_FR=aladdin-5463-zp_exact_new"
        trans = transCookie(cookie)
        cookie_dict = trans.stringToDict()
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

    def parse(self, response):
        # each response is JSON; the job records sit under data.disp_data
        job_json = json.loads(response.text)
        for job in job_json["data"]["disp_data"]:
            job_item = baiduJobItem()
            job_item['title'] = job['title']
            job_item['pc_url'] = job['pc_url']
            job_item['url_id'] = getMd5(job['pc_url'])  # MD5 of the posting URL as a unique id
            job_item['lastmod'] = job['lastmod']
            job_item['source'] = job['source']
            job_item['company'] = job['company']
            job_item['city'] = job['city']
            job_item['jobtype'] = job['type']
            job_item['salary'] = job['salary']
            job_item['requirements'] = job['requirements']
            job_item['info'] = "公司:%s 城市:%s 工作类型:%s 薪资:%s %s" % (
                job['company'], job['city'], job['type'], job['salary'], job['requirements'])
            yield job_item
        # build the next URL: only rn changes, pn stays 0
        self.page += 1
        next_url = self.surl.format(self.page)
        yield scrapy.Request(url=next_url)  # schedule the next page (goes through the dupefilter)
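For reference, the question doesn't include transCookie, getMd5, or the baiduJobItem definition. Here is a minimal sketch of what they presumably look like, so the spider above reads self-contained (these are reconstructions, not the asker's actual code):

from hashlib import md5

import scrapy


class transCookie:
    """Parse a raw Cookie header string into the dict form scrapy.Request expects."""

    def __init__(self, cookie):
        self.cookie = cookie

    def stringToDict(self):
        # "k1=v1; k2=v2" -> {"k1": "v1", "k2": "v2"}; split on the first '=' only,
        # since cookie values may themselves contain '='
        return dict(pair.strip().split('=', 1) for pair in self.cookie.split(';'))


def getMd5(url):
    """Hex MD5 digest of a URL, used as a stable unique id for a posting."""
    return md5(url.encode('utf-8')).hexdigest()


class baiduJobItem(scrapy.Item):
    title = scrapy.Field()
    pc_url = scrapy.Field()
    url_id = scrapy.Field()
    lastmod = scrapy.Field()
    source = scrapy.Field()
    company = scrapy.Field()
    city = scrapy.Field()
    jobtype = scrapy.Field()
    salary = scrapy.Field()
    requirements = scrapy.Field()
    info = scrapy.Field()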
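As for avoiding the database constraint: duplicates can also be dropped inside the crawl with an item pipeline keyed on url_id. A minimal sketch, with a pipeline class name of my own choosing (not part of the original project):

from scrapy.exceptions import DropItem


class DedupPipeline:
    """Drop any item whose url_id has already been emitted during this crawl."""

    def __init__(self):
        self.seen_ids = set()  # url_id values seen so far

    def process_item(self, item, spider):
        if item['url_id'] in self.seen_ids:
            raise DropItem("duplicate url_id: %s" % item['url_id'])
        self.seen_ids.add(item['url_id'])
        return item

It would be enabled in settings.py with something like ITEM_PIPELINES = {'myproject.pipelines.DedupPipeline': 100} (the module path here is hypothetical). Note this only dedupes items within one run; it doesn't explain why the same records keep arriving in the first place.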