Abstract: LinkExtractor is not limited to CrawlSpider-type spiders; it can also be used in other places, for example inside a regular scrapy.Spider, as shown below.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']
    link_extractor = LinkExtractor(allow=r'/page/\d+/', restrict_css='li.next')  # declare a LinkExtractor object for the "next page" link
#   def start_requests(self):
#       url = "http://quotes.toscrape.com/"
#       yield scrapy.Request(url, callback = self.parse)

    def parse(self, response):
        quote_selector_list = response.css('body > div > div:nth-child(2) > div.col-md-8 div.quote')

        for quote_selector in quote_selector_list:
            quote = quote_selector.css('span.text::text').extract_first()
            author = quote_selector.css('span small.author::text').extract_first()
            tags = quote_selector.css('div.tags a.tag::text').extract()

            yield {'quote': quote, 'author': author, 'tags': tags}
        links = self.link_extractor.extract_links(response)  # use the LinkExtractor to pull the matching links out of the response
        
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse)  # follow the next-page link and parse it with the same callback

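As a side note, extract_links() returns a deduplicated list of scrapy.link.Link objects, each carrying attributes such as url and text. The sketch below shows the same extractor working on its own, outside any spider; the inline HTML is a made-up stand-in for the pager section of a quotes.toscrape.com listing page, used here only for illustration.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Made-up HTML standing in for the pager section of a listing page (illustration only)
body = b'''
<html><body>
  <ul class="pager">
    <li class="next"><a href="/page/2/">Next</a></li>
  </ul>
</body></html>
'''

response = HtmlResponse(url='http://quotes.toscrape.com/', body=body, encoding='utf-8')
link_extractor = LinkExtractor(allow=r'/page/\d+/', restrict_css='li.next')

for link in link_extractor.extract_links(response):
    # each Link object exposes the absolute URL and the anchor text
    print(link.url, link.text)
# should print something like: http://quotes.toscrape.com/page/2/ Next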
Reference: 链接提取LinkExtractor与全站爬取利器CrawlSpider - 简书 (Jianshu)