Analyzing Jianshu URLs

With CrawlSpider you can specify rules for which links the crawler should follow, and the rules support regular expressions. Jianshu article URLs currently look like the examples below; the third line is the generalized pattern:

  1. https://www.jianshu.com/p/df7cad4eb8d8
  2. https://www.jianshu.com/p/07b0456cbadb?*****
  3. https://www.jianshu.com/p/.*
rules = (
    # Match article URLs like the examples above; the trailing .* allows a query string.
    Rule(LinkExtractor(allow=r'https://www\.jianshu\.com/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True),
)
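Before running the crawler, you can sanity-check the pattern against sample URLs with Python's re module. A quick sketch (the query string and the user-page URL here are made up for illustration):

import re

# Same pattern as in the Rule above, with the literal dots escaped.
ARTICLE_RE = re.compile(r'https://www\.jianshu\.com/p/[0-9a-z]{12}.*')

samples = [
    'https://www.jianshu.com/p/df7cad4eb8d8',          # plain article URL -> matches
    'https://www.jianshu.com/p/df7cad4eb8d8?foo=bar',  # hypothetical query string -> matches
    'https://www.jianshu.com/u/df7cad4eb8d8',          # user page, not /p/ -> no match
]
for url in samples:
    print(url, '->', bool(ARTICLE_RE.match(url)))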

Fetching Jianshu article data

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lxml import etree

from ..items import JianshuItem  # JianshuItem is defined in items.py (see below)

class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # Regular expressions are supported; the trailing .* makes the query string optional.
        # callback names the method that parses each matched response.
        # follow=True keeps following matching links found on crawled pages
        # (if a page contains further matching URLs, they are crawled as well).
        # The "Read more" feed at the bottom of the Jianshu homepage is loaded via AJAX.
        # allow=r'.*/p/[0-9a-z].*'
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True),
    )

    # The XPath can also be tested from the command line:
    #    1. Start a shell: scrapy shell https://www.jianshu.com/p/00b7130b2fad
    #    2. In the interactive shell, run: response.xpath("//h1[@class='title']/text()").get()
    def parse_item(self, response):
        # Smoke test: print the article title extracted from the <title> tag.
        html = etree.HTML(response.text)
        print(html.xpath("//title/text()")[0].split('-')[0])
        return None
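Besides scrapy crawl js, the spider can also be run programmatically. A minimal sketch using Scrapy's CrawlerProcess (the settings values are assumptions; run it from inside the project package so the relative items import resolves):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'DOWNLOAD_DELAY': 1,   # be polite to jianshu.com
    'LOG_LEVEL': 'INFO',
})
process.crawl(JsSpider)    # the CrawlSpider defined above
process.start()            # blocks until the crawl finishes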

Creating the item model for the business requirements (items.py)

import scrapy

class JianshuItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    name = scrapy.Field()
    collection = scrapy.Field()
    url = scrapy.Field()
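A yielded item is only persisted if a pipeline handles it. A minimal sketch of a pipelines.py that appends each item as one line of JSON (the output filename and the pipeline's module path are assumptions about the project layout):

import json

class JianshuPipeline:
    def open_spider(self, spider):
        self.fp = open('articles.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

Enable it in settings.py, e.g. ITEM_PIPELINES = {'jianshu.pipelines.JianshuPipeline': 300} (module path assumed).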

Testing the parse_item callback

# Test interactively with scrapy shell first (see the comments in the spider above).
def parse_item(self, response):
    # Needs "import time" in addition to the spider imports above.
    html = etree.HTML(response.text)
    # The <title> text is "article title - 简书"; keep the part before the dash.
    title = html.xpath("//title/text()")[0].split('-')[0]
    print(title)
    time.sleep(1)  # crude throttle; blocks Scrapy's async engine (see the settings sketch below)
    item = JianshuItem()
    item['title'] = title
    item['name'] = html.xpath("//span[@class='name']/a/text()")[0]
    item['url'] = response.url.split('?')[0]  # drop the query string
    collection = html.xpath("//div[@class='include-collection']/a/div[@class='name']/text()")
    if collection:  # not every article belongs to a collection
        item['collection'] = '|'.join(collection)
    yield item
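Since time.sleep() blocks Scrapy's event loop, throttling normally goes into settings.py instead; a sketch (the values are assumptions):

# settings.py (excerpt)
ROBOTSTXT_OBEY = True          # respect robots.txt
DOWNLOAD_DELAY = 1             # seconds between requests to the same domain
AUTOTHROTTLE_ENABLED = True    # adapt the delay to observed latency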