1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
| import scrapy from demo.items import DemoItem from scrapy_splash import SplashRequest
class TestSpider(scrapy.Spider):
    """Spider that fetches its start URLs through Splash and scrapes articles.

    Each start URL is requested via ``SplashRequest`` (``render.html``
    endpoint), and every ``<article>`` element in the response is turned
    into a ``DemoItem`` carrying a ``title`` and a ``content`` field.
    """

    name = "test"
    allowed_domains = ["loli.fj.cn"]
    start_urls = ["https://loli.fj.cn"]

    # Value sent to Splash as the ``wait`` argument for every request.
    # NOTE(review): per scrapy-splash docs this is a delay in seconds applied
    # after the page loads -- confirm 10 is still needed for this site.
    splash_wait = 10

    def start_requests(self):
        """Yield one ``SplashRequest`` per entry in ``start_urls``.

        Iterating (rather than indexing ``start_urls[0]``) means additional
        start URLs are not silently ignored and an empty list simply yields
        no requests instead of raising ``IndexError``.
        """
        for url in self.start_urls:
            yield SplashRequest(
                url,
                callback=self.parse,
                args={'wait': self.splash_wait},
                endpoint='render.html',
            )

    def parse(self, response, *args, **kwargs):
        """Extract one ``DemoItem`` per ``<article>`` node in ``response``.

        Args:
            response: The response object handed to the callback; it must
                support ``.xpath()``.
            *args: Unused; accepted for callback-signature compatibility.
            **kwargs: Unused; accepted for callback-signature compatibility.

        Yields:
            DemoItem: An item whose ``title`` is the text of the first
            ``header/h2/a`` link and whose ``content`` is the text of the
            first ``<p>`` inside the ``articleBody`` div. Either field is
            ``None`` when the corresponding node is absent.
        """
        # Select every article node on the page via XPath.
        for node in response.xpath('//article'):
            # Build a fresh item for each article.
            item = DemoItem()
            item['title'] = node.xpath('./header/h2/a/text()').extract_first()
            item['content'] = node.xpath(
                './div[@itemprop="articleBody"]/p/text()'
            ).extract_first()
            # Hand the populated item back to the caller.
            yield item
|