# -*- coding: utf-8 -*-
import scrapy
import re
import json
import requests
from bs.items import leetcodeItem


class leetcodeSpider(scrapy.Spider):
    name = 'leetcode'
    allowed_domains = ['leetcode.com']
    start_urls = ['https://leetcode.com/api/problems/all/']
    def parse(self, response):
        # Parse the problem-list API response and schedule a detail request for each problem.
        res = json.loads(response.text)
        problemSet = res["stat_status_pairs"]
        for i in range(len(problemSet)):  # loop over every problem in the list
            Item = leetcodeItem()
            title = problemSet[i]["stat"]["question__title_slug"]
            Item["title"] = problemSet[i]["stat"]["question__title"]
            Item["levels"] = problemSet[i]["difficulty"]["level"]
            url = "https://leetcode.com/problems/" + title + "/description/"
            # Request each problem's detail page in turn; the Item is passed along in meta.
            yield scrapy.Request(url, callback=self.parse_detail, meta={'Item': Item, 'title': title})
    def parse_detail(self, response):
        url_link = "https://leetcode.com/graphql"
        setCookie = response.headers["Set-Cookie"]
        Item = response.meta["Item"]
        try:
            # Extract the csrftoken from the Set-Cookie header so it can be echoed back
            # in the GraphQL request below.
            pattern = re.compile("csrftoken=(.*?);.*?", re.S)
            csrftoken = re.search(pattern, setCookie.decode("utf-8"))
            data = {
                "operationName": "questionData",
                "variables": {"titleSlug": response.meta["title"]},
                "query": """
                    query questionData($titleSlug: String!) {
                      question(titleSlug: $titleSlug) {
                        questionId
                        questionFrontendId
                        boundTopicId
                        title
                        titleSlug
                        content
                        translatedTitle
                        translatedContent
                        isPaidOnly
                        difficulty
                        likes
                        dislikes
                        isLiked
                        similarQuestions
                        contributors {
                          username
                          profileUrl
                          avatarUrl
                          __typename
                        }
                        langToValidPlayground
                        topicTags {
                          name
                          slug
                          translatedName
                          __typename
                        }
                        companyTagStats
                        codeSnippets {
                          lang
                          langSlug
                          code
                          __typename
                        }
                        stats
                        hints
                        solution {
                          id
                          canSeeDetail
                          __typename
                        }
                        status
                        sampleTestCase
                        metaData
                        judgerAvailable
                        judgeType
                        mysqlSchemas
                        enableRunCode
                        enableTestMode
                        envInfo
                        libraryUrl
                        __typename
                      }
                    }
                """,
            }
            headers = {
                'x-csrftoken': csrftoken.group(1),
                'referer': url_link,
                'content-type': 'application/json',
                'origin': 'https://leetcode.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
            }
            cookies = {
                '__cfduid': 'd9ce37537c705e759f6bea15fffc9c58b1525271602',
                '_ga': 'GA1.2.5783653.1525271604',
                '_gid': 'GA1.2.344320119.1533189808',
                'csrftoken': csrftoken.group(1),
                '_gat': '1'
            }
            dumpJsonData = json.dumps(data)
            # Note: requests.post() is a blocking call made outside Scrapy's downloader,
            # so settings such as DOWNLOAD_DELAY do not apply to this request.
            graphql_response = requests.post(url_link, data=dumpJsonData, headers=headers, cookies=cookies)
            dictInfo = json.loads(graphql_response.text)
            content = dictInfo["data"]["question"]["content"]
            Item["dislike"] = dictInfo["data"]["question"]["dislikes"]
            Item["likes"] = dictInfo["data"]["question"]["likes"]
            Item["questionId"] = dictInfo["data"]["question"]["questionFrontendId"]
            Item["stats"] = dictInfo["data"]["question"]["stats"]
            Item["topicTags"] = dictInfo["data"]["question"]["topicTags"][0]["name"]
            yield Item
        except Exception as e:
            print(e)
            print("Error: " + url_link)
Teacher, is this approach not really workable? I fetch all of the URLs in one go and then request them in a for loop, but I've found that the download-delay setting has no effect, and the crawl can't be paused either.
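For reference, one likely reason DOWNLOAD_DELAY appears to have no effect is that parse_detail fetches the GraphQL data with a blocking requests.post() call, which goes around Scrapy's downloader and scheduler entirely. Below is a minimal sketch of yielding that POST back to Scrapy instead, assuming the same data, headers and Item objects built above; parse_graphql is a hypothetical new callback name, not part of the original spider.

            # Replace the blocking requests.post(...) at the end of parse_detail with a
            # scheduled Scrapy request, so DOWNLOAD_DELAY, concurrency limits and
            # pause/resume apply to the GraphQL calls as well. Cookies set by earlier
            # responses (including csrftoken) are re-sent by Scrapy's cookie middleware.
            yield scrapy.Request(
                url="https://leetcode.com/graphql",
                method="POST",
                body=json.dumps(data),        # same payload dict as above
                headers=headers,              # same headers dict as above
                meta={"Item": Item},
                callback=self.parse_graphql,
            )

    def parse_graphql(self, response):
        # Hypothetical callback: parse the GraphQL reply inside Scrapy instead of requests.
        Item = response.meta["Item"]
        question = json.loads(response.text)["data"]["question"]
        Item["dislike"] = question["dislikes"]
        Item["likes"] = question["likes"]
        Item["questionId"] = question["questionFrontendId"]
        Item["stats"] = question["stats"]
        Item["topicTags"] = question["topicTags"][0]["name"]
        yield Item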