scrapy中的LinkExtractor的使用(二)
摘要:LinkExtractor 并不是只能在CrawlSpider类型的spider中使用,还可以在其他地方使用。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
    """Scrape quotes from quotes.toscrape.com.

    Demonstrates that a LinkExtractor can be used outside of CrawlSpider:
    pagination links are extracted manually inside parse() and followed
    recursively via a new Request with the same callback.
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    # Declare one LinkExtractor at class level so it is built only once.
    # restrict_css limits extraction to the "Next" pagination button;
    # allow further constrains matches to /page/<n>/ URLs.
    link_extractor = LinkExtractor(allow=r'/page/\d+/', restrict_css='li.next')

    def parse(self, response):
        """Yield one dict per quote on the page, then follow the next-page link."""
        quote_selector_list = response.css(
            'body > div > div:nth-child(2) > div.col-md-8 div.quote')
        for quote_selector in quote_selector_list:
            quote = quote_selector.css('span.text::text').extract_first()
            author = quote_selector.css('span small.author::text').extract_first()
            tags = quote_selector.css('div.tags a.tag::text').extract()
            yield {'quote': quote, 'author': author, 'tags': tags}
        # Extract pagination links from the response; if the "Next" button
        # exists, follow its (single) link back into parse().
        links = self.link_extractor.extract_links(response)
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse)
本作品采用 知识共享署名-相同方式共享 4.0 国际许可协议 进行许可。