简书URL地址分析
可以指定爬虫抓取的规则,支持正则表达式。目前简书文章页的 URL 形式如下:
- https://www.jianshu.com/p/df7cad4eb8d8
- https://www.jianshu.com/p/07b0456cbadb?utm_source=...(文章地址后可能附带查询参数)
- https://www.jianshu.com/p/.*
rules = ( Rule(LinkExtractor(allow=r'https://www.jianshu.com/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True), )
获取简书文章数据
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from items import ArticleItem
from lxml import etree


class JsSpider(CrawlSpider):
    """Crawl jianshu.com, following article links of the form /p/<12 hex chars>."""

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']
    rules = (
        # allow= takes a regular expression; the trailing ".*" makes any
        # suffix (e.g. a query string) after the 12-character article id
        # optional.
        # callback= names the method that parses each matched response.
        # follow=True keeps extracting matching links from every parsed page
        # (note: the homepage's "read more" section is loaded via AJAX, so
        # only links present in the initial HTML are discovered).
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_item', follow=True),
    )

    # Interactive testing:
    #   1) scrapy shell https://www.jianshu.com/p/00b7130b2fad
    #   2) response.xpath("//h1[@class='title']/text()").get()
    def parse_item(self, response):
        """Print the article title taken from the page's <title> tag.

        Fixed: the original indexed xpath(...)[0] unconditionally, which
        raises IndexError on any page without a <title> element.
        """
        html = etree.HTML(response.text)
        titles = html.xpath("//title/text()")
        if titles:  # guard against pages missing a <title>
            print(titles[0].split('-')[0])
        return None
根据业务需求创建模型(items.py)
import scrapy


class JianshuItem(scrapy.Item):
    """Container for the fields scraped from one Jianshu article page."""

    # define the fields for your item here like:
    title = scrapy.Field()       # article title (text before '-' in <title>)
    name = scrapy.Field()        # author name
    collection = scrapy.Field()  # '|'-joined collection names; only set when present
    url = scrapy.Field()         # article URL with any query string stripped
测试parse_item解析函数
# Interactive testing:
#   1) scrapy shell https://www.jianshu.com/p/00b7130b2fad
#   2) response.xpath("//h1[@class='title']/text()").get()
def parse_item(self, response):
    """Parse one Jianshu article page into a JianshuItem.

    Yields:
        JianshuItem with 'title', 'name' (author) and 'url' (query string
        stripped); 'collection' is set only when the page lists collections.
    """
    from lxml import etree
    html = etree.HTML(response.text)
    # Fixed: the original called time.sleep(1) here, but `time` was never
    # imported (NameError at runtime), and a blocking sleep stalls Scrapy's
    # reactor anyway — throttle with the DOWNLOAD_DELAY setting instead.
    titles = html.xpath("//title/text()")
    names = html.xpath("//span[@class='name']/a/text()")
    if not titles or not names:
        return  # page lacks the expected structure; skip it instead of crashing
    print(titles[0].split('-')[0])
    item = JianshuItem()
    item['title'] = titles[0].split('-')[0]
    item['name'] = names[0]
    item['url'] = response.url.split('?')[0]
    collection = html.xpath(
        "//div[@class='include-collection']/a/div[@class='name']/text()")
    if collection:
        item['collection'] = '|'.join(collection)
    yield item