先贴源码。熊猫TV凉了,所以我改爬斗鱼;不过我的问题本身和直播平台没有关系。
from urllib import request
import gzip
import re
class Spider():
    """Scrape one Douyu category page and print its streamers by popularity.

    ``go()`` downloads the page at ``url``, extracts a ``{'name', 'number'}``
    dict for every room card matched by the class-level regex patterns, sorts
    the dicts by viewer count (highest first) and prints them.
    """

    url = 'https://www.douyu.com/g_LOL'

    # All patterns are raw strings so that ``\s`` / ``\S`` reach the regex
    # engine untouched instead of being treated as (invalid) string escapes.
    root_pattern = r'<div class="DyListCover-info">[\s\S]*?</div>'
    root_detailNum = r'<span class="DyListCover-hot is-template">[\s\S]*?</span>'
    root_detailName = r'<h2 class="DyListCover-user is-template">[\s\S]*?</h2>'
    real_num = r'</svg>([\s\S]*?)</span>'
    real_name = r'</svg>([\s\S]*?)</h2>'
    # Kept for backward compatibility with code that reads this attribute;
    # sorting now uses ``_number_pattern`` below.
    real_num_seed = r'([\s\S]*?)[\u4e00-\u9fa5]'

    # A decimal number optionally followed by a single CJK unit character.
    _number_pattern = r'(\d+(?:\.\d+)?)\s*([\u4e00-\u9fa5]?)'
    # Multipliers for the unit suffixes we understand; anything else counts as 1.
    _unit_multipliers = {'万': 10000, '亿': 100000000}

    def __fetch_content(self):
        """Download ``Spider.url`` and return the page as a ``str``.

        The body is gunzipped only when it starts with the gzip magic bytes,
        so both compressed and plain responses are handled.
        """
        # The context manager closes the connection even if read() raises.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        return raw.decode('utf-8')

    def go(self):
        """Run the whole pipeline: fetch, parse, sort, print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.show(anchors)

    def show(self, anchors):
        """Print one ``name---------------number`` line per anchor dict."""
        for anchor in anchors:
            print(anchor['name'] + '---------------' + anchor['number'])

    def __sort(self, anchors):
        """Return a new list of anchors ordered by viewer count, largest first.

        ``key`` receives the bound method itself (no parentheses): ``sorted``
        calls it once per element, passing that single anchor dict.
        """
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert ``anchor['number']`` into a float usable as a sort key.

        Accepts plain counts (``'9876'``) as well as counts with a unit
        suffix (``'1.5万'``). Text that contains no number yields ``0.0`` so
        that a single odd entry cannot abort the whole sort.
        """
        found = re.search(Spider._number_pattern, anchor['number'])
        if found is None:
            return 0.0
        value, unit = found.groups()
        return float(value) * Spider._unit_multipliers.get(unit, 1)

    def __analysis(self, htmls):
        """Return the anchor dicts for every room card found in ``htmls``."""
        candidates = (
            self.__distribute(card)
            for card in re.findall(Spider.root_pattern, htmls)
        )
        # Cards that could not be parsed come back as None and are dropped.
        return [anchor for anchor in candidates if anchor]

    def __distribute(self, root_html):
        """Extract ``{'name': ..., 'number': ...}`` from one room card.

        Returns ``None`` when any of the expected fragments is missing.
        """
        num_blocks = re.findall(Spider.root_detailNum, root_html)
        name_blocks = re.findall(Spider.root_detailName, root_html)
        if not num_blocks or not name_blocks:
            return None
        numbers = re.findall(Spider.real_num, num_blocks[0])
        names = re.findall(Spider.real_name, name_blocks[0])
        if not numbers or not names:
            return None
        # Strip the layout whitespace that may surround the text in the markup.
        return {'name': names[0].strip(), 'number': numbers[0].strip()}
# Run the scraper only when this file is executed as a script, so importing
# the module (e.g. to reuse Spider) does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
问题有两个,
第一个问题是 __sort 方法中调用的 sorted 函数的参数:key=self.__sort_seed, 为什么这个函数后面没有括号以及括号内的参数?__sort_seed方法是自定义的含一个参数(第二个问题与这个参数有关)的方法,返回的是浮点型的number人数
第二个问题是:我们的 anchors 数据是一个 list,它的结构是这样的(我希望说得形象一些):
anchors = [anchor1, anchor2, anchor3, ……],而这里的 anchor1、anchor2、anchor3 每一个都是一个 dict,比如说 anchor1 = {'name': name, 'number': number}。__sort_seed 方法传入的参数应该是一个 anchor,因为我们可以通过 anchor['number'] 来访问键 'number' 对应的值。我们在 __sort 方法中传入的参数是 anchors;我们可以对 anchors[0] 使用 __sort_seed 方法,但不能对 anchors 本身使用。那么,sorted 函数内部是如何区分的?这里的 self 到底是什么?