【笔记】Scrapy分布式与ScrapySplash组合使用

前言

本文记录如何将 Scrapy 分布式（scrapy_redis）与 scrapy_splash 组合使用：通过重写去重类，使两者的请求指纹去重能够协同工作。

重写去重类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from __future__ import absolute_import
from copy import deepcopy
from scrapy.utils.request import request_fingerprint
from scrapy.utils.url import canonicalize_url
from scrapy_splash.utils import dict_hash
from scrapy_redis.dupefilter import RFPDupeFilter


# Combines scrapy_redis's fingerprint de-duplication with Splash-aware
# request fingerprints from scrapy_splash.
class SplashAwareDupeFilter(RFPDupeFilter):
    """scrapy_redis dupefilter whose fingerprints take Splash options into account.

    Inherits everything from ``RFPDupeFilter`` and only overrides how a
    request's fingerprint is computed, so requests that carry a ``splash``
    entry in ``request.meta`` are fingerprinted from those options as well.
    """

    @staticmethod
    def splash_request_fingerprint(request, include_headers=None):
        """Return the fingerprint of ``request``, including its Splash options.

        Declared as a static method: it takes no ``self``, yet it is invoked
        through the instance in ``request_fingerprint``. Without the decorator
        the instance would be bound to ``request`` and the real request to
        ``include_headers``.

        Args:
            request: the request to fingerprint; its ``meta`` mapping is read.
            include_headers: forwarded unchanged to Scrapy's
                ``request_fingerprint``.

        Returns:
            The plain Scrapy fingerprint when ``request.meta`` has no
            ``'splash'`` key; otherwise the result of
            ``dict_hash(splash_options, fp)``.
        """
        # Base fingerprint computed by Scrapy itself.
        fp = request_fingerprint(request, include_headers=include_headers)
        # Non-Splash requests keep the plain Scrapy fingerprint.
        if 'splash' not in request.meta:
            return fp
        # Work on a deep copy so request.meta['splash'] is never mutated by
        # the setdefault / URL rewrite below.
        splash_options = deepcopy(request.meta['splash'])
        args = splash_options.setdefault('args', {})
        if 'url' in args:
            # Canonicalize the target URL (keeping its fragment) before hashing.
            args['url'] = canonicalize_url(args['url'], keep_fragments=True)
        return dict_hash(splash_options, fp)

    def request_fingerprint(self, request):
        """Return the Splash-aware fingerprint for ``request``."""
        return self.splash_request_fingerprint(request)

修改配置文件

  • 使用重写后的去重类
1
# Make Scrapy use the rewritten SplashAwareDupeFilter class; replace
# <package_name>.<file_name> with the module path of the file that defines it.
DUPEFILTER_CLASS = '<package_name>.<file_name>.SplashAwareDupeFilter'

完成

参考文献

哔哩哔哩——莉莉的茉莉花
CSDN——adamyoungjack