{"id":173,"date":"2020-07-23T09:32:46","date_gmt":"2020-07-23T01:32:46","guid":{"rendered":"http:\/\/www.gaoxigang.com\/?p=173"},"modified":"2020-07-23T09:32:46","modified_gmt":"2020-07-23T01:32:46","slug":"scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e8%8e%b7%e5%8f%96%e7%ae%80%e4%b9%a6%e6%96%87%e7%ab%a0%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/www.gaoxigang.com\/index.php\/2020\/07\/23\/scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e8%8e%b7%e5%8f%96%e7%ae%80%e4%b9%a6%e6%96%87%e7%ab%a0%e6%95%b0%e6%8d%ae\/","title":{"rendered":"Scrapy \u722c\u866b\u6846\u67b6-\u83b7\u53d6\u7b80\u4e66\u6587\u7ae0\u6570\u636e"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u7b80\u4e66URL\u5730\u5740\u5206\u6790<\/h2>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>\u53ef\u4ee5\u6307\u5b9a\u722c\u866b\u6293\u53d6\u7684\u89c4\u5219\uff0c\u652f\u6301\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u76ee\u524d\u7b80\u4e66<\/p><\/blockquote>\n\n\n\n<ol class=\"wp-block-list\"><li>https:\/\/www.jianshu.com\/p\/df7cad4eb8d8<\/li><li>https:\/\/www.jianshu.com\/p\/07b0456cbadb?*****<\/li><li>https:\/\/www.jianshu.com\/p\/.*<\/li><\/ol>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">rules = (\n        Rule(LinkExtractor(allow=r'https:\/\/www.jianshu.com\/p\/[0-9a-z]{12}.*'), callback='parse_item', follow=True),\n)\n<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u83b7\u53d6\u7b80\u4e66\u6587\u7ae0\u6570\u636e<\/h2>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import scrapy\nfrom scrapy.linkextractors import LinkExtractor\nfrom scrapy.spiders import CrawlSpider, Rule\nfrom items import ArticleItem\nfrom lxml import etree\n\nclass JsSpider(CrawlSpider):\n    name = 'js'\n    allowed_domains = ['jianshu.com']\n    start_urls = ['https:\/\/www.jianshu.com\/']\n\n    rules = (\n        # \u652f\u6301\u6b63\u5219\u8868\u8fbe\u5f0f   .* \u4ee3\u8868\u540e\u9762\u53ef\u6709\u53ef\u65e0\n        # callback\u662f\u6307\u5b9a\u8981\u89e3\u6790\u7684\u65b9\u6cd5\n        # follow\u53ef\u4ee5\u7406\u89e3\u4e3a\u56de\u8c03\u81ea\u5df1\u7684\u56de\u8c03\u51fd\u6570 (\u5982\u679c\u5f53\u524d\u6293\u53d6\u7684\u9875\u9762\u91cc\u9762\u8fd8\u6709\u7b26\u5408\u6761\u4ef6\u7684\u5730\u5740\uff0c\u5219\u7ee7\u7eed\u8ddf\u8fdb\u89e3\u6790)\n        # \u7b80\u4e66\u9996\u9875\u5e95\u90e8\u7684\"\u9605\u8bfb\u66f4\u591a\"\u540e\u7eed\u901a\u8fc7AJAX\u52a0\u8f7d\u5b9e\u73b0\n        # allow=r'.*\/p\/[0-9a-z].*'\n        Rule(LinkExtractor(allow=r'.*\/p\/[0-9a-z]{12}.*'), callback='parse_item', follow=True),\n    )\n\n    # \u4e5f\u53ef\u4ee5\u901a\u8fc7\u547d\u4ee4\u884c\u65b9\u5f0f\u6d4b\u8bd5\uff1a\n    #    1: \u8f93\u5165\u8981\u6d4b\u8bd5\u7684\u547d\u4ee4\uff1ascrapy shell https:\/\/www.jianshu.com\/p\/00b7130b2fad\n    #    2\uff1a\u4ea4\u4e92\u5f0f\u547d\u4ee4\u884c\u4e2d\u6267\u884c\uff1aresponse.xpath(\"\/\/h1[@class='title']\/text()\").get()\n    def parse_item(self, response):\n        html = etree.HTML(response.text)\n        print(html.xpath(\"\/\/title\/text()\")[0].split('-')[0])\n        return None\n<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u6839\u636e\u4e1a\u52a1\u9700\u6c42\u521b\u5efa\u6a21\u578b(items.py)<\/h2>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import scrapy\n\nclass JianshuItem(scrapy.Item):\n    # define the fields for your item here like:\n    title = scrapy.Field()\n    name = scrapy.Field()\n    collection = scrapy.Field()\n    url = scrapy.Field()\n<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u6d4b\u8bd5parse_item\u89e3\u6790\u51fd\u6570<\/h2>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\"># \u4e5f\u53ef\u4ee5\u901a\u8fc7\u547d\u4ee4\u884c\u65b9\u5f0f\u6d4b\u8bd5\uff1a\n#    1: \u8f93\u5165\u8981\u6d4b\u8bd5\u7684\u547d\u4ee4\uff1ascrapy shell https:\/\/www.jianshu.com\/p\/00b7130b2fad\n#    2\uff1a\u4ea4\u4e92\u5f0f\u547d\u4ee4\u884c\u4e2d\u6267\u884c\uff1aresponse.xpath(\"\/\/h1[@class='title']\/text()\").get()\ndef parse_item(self, response):\n    from lxml import etree\n    html = etree.HTML(response.text)\n    print(html.xpath(\"\/\/title\/text()\")[0].split('-')[0])\n    time.sleep(1)\n    item = JianshuItem()\n    item['title'] = html.xpath(\"\/\/title\/text()\")[0].split('-')[0]\n    item['name'] = html.xpath(\"\/\/span[@class='name']\/a\/text()\")[0]\n    item['url'] = response.url.split('?')[0]\n    collection = html.xpath(\"\/\/div[@class='include-collection']\/a\/div[@class='name']\/text()\")\n    if collection:\n        item['collection'] = '|'.join(collection)\n    yield item\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u7b80\u4e66URL\u5730\u5740\u5206\u6790 \u53ef\u4ee5\u6307\u5b9a\u722c\u866b\u6293\u53d6\u7684\u89c4\u5219\uff0c\u652f\u6301\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u76ee\u524d\u7b80\u4e66 https:\/\/www.jianshu. [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-173","post","type-post","status-publish","format-standard","hentry","category-biji"],"_links":{"self":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/173","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/comments?post=173"}],"version-history":[{"count":0,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/173\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/media?parent=173"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/categories?post=173"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/tags?post=173"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}