{"id":156,"date":"2020-07-22T09:39:22","date_gmt":"2020-07-22T01:39:22","guid":{"rendered":"http:\/\/www.gaoxigang.com\/?p=156"},"modified":"2020-07-22T09:39:22","modified_gmt":"2020-07-22T01:39:22","slug":"scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e5%9f%ba%e4%ba%8e%e9%aa%8c%e8%af%81%e7%a0%81%e7%99%bb%e5%bd%95","status":"publish","type":"post","link":"https:\/\/www.gaoxigang.com\/index.php\/2020\/07\/22\/scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e5%9f%ba%e4%ba%8e%e9%aa%8c%e8%af%81%e7%a0%81%e7%99%bb%e5%bd%95\/","title":{"rendered":"Scrapy \u722c\u866b\u6846\u67b6-\u57fa\u4e8e\u9a8c\u8bc1\u7801\u767b\u5f55"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">PIL\u5e93\u57fa\u672c\u4ecb\u7ecd<\/h2>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>PIL\uff1aPython Imaging Library\uff0c\u5df2\u7ecf\u662fPython\u5e73\u53f0\u4e8b\u5b9e\u4e0a\u7684\u56fe\u50cf\u5904\u7406\u6807\u51c6\u5e93\u4e86\u3002PIL\u529f\u80fd\u975e\u5e38\u5f3a\u5927\uff0c\u4f46API\u5374\u975e\u5e38\u7b80\u5355\u6613\u7528<\/p><\/blockquote>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\"># \u52a0\u8f7d\u56fe\u7247\ndata = Image.open(\"..\/data\/getcode.do.jpg\")\ndata.show()\n# \u628a\u56fe\u7247\u8f6c\u5316\u4e3a\u77e9\u9635\u7c7b\u578b\ndata = np.array(data)\nprint(data,data.shape)<\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">\u5b8c\u6210\u767b\u5f55\u9a8c\u8bc1\u7801\u8bc6\u522b\u64cd\u4f5c<\/h2>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\"># -*- coding: utf-8 -*-\nimport scrapy\nfrom scrapy import cmdline\nfrom urllib import request\nfrom PIL import Image\n\n\nclass DoubanSpiderSpider(scrapy.Spider):\n    name = 'douban_spider'\n    allowed_domains = ['movie.douban.com']\n    # \u542f\u52a8url\u5730\u5740\n    start_urls = ['http:\/\/www.renren.com\/PLogin.do']\n\n    def parse(self, response):\n        formdata = {\n            'email': '970138074@qq.com',\n            'password': 'pythonspider'\n        }\n        # \u83b7\u53d6\u5f53\u524d\u7684\u9a8c\u8bc1\u7801\u5730\u5740\n        img_url = response.xpath(\"\/\/*[@id='verifyPic_login']\/@src\")\n        print(img_url)\n        if img_url:\n            # \u6839\u636e\u56fe\u7247\u5730\u5740\u83b7\u53d6\u56fe\u7247\u4fe1\u606f\n            check_code = self.parse_image(img_url)\n            formdata['icode'] = check_code\n        yield scrapy.FormRequest(url='http:\/\/www.renren.com\/PLogin.do', formdata=formdata, callback=self.after_login)\n\n    # next(parse)\n    def parse_image(self, image_url):\n        # from urllib import request\n        request.urlretrieve(image_url, 'check_code.png')\n        image = Image.open('check_code.png')\n        image.show()\n        check_code = input(\"\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801\")\n        return check_code\n\n    def after_login(self, response):\n        print('---->' , response.url)\n\n\nif __name__ == \"__main__\":\n    # cmdline.execute(\"scrapy crawl -s LOG_FILE=all.log douban_spider\".split(' '))\n    cmdline.execute(\"scrapy crawl douban_spider\".split(' '))\n    # cmdline.execute(\"scrapy crawl -o douban.csv douban_spider\".split(' '))\n<\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u7248\u672c\u4e8c<\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import scrapy\nfrom urllib import request\nfrom PIL import Image\nfrom lxml import etree\n\n\nclass RenrenSpiderSpider(scrapy.Spider):\n    name = 'renren_spider'\n    allowed_domains = ['renren.com']\n    start_urls = ['http:\/\/www.renren.com\/SysHome.do']\n\n    def parse(self, response):\n        loginForm = {\n            'email':'1147040@qq.com',\n            'password':'scrapy_demo1'\n        }\n        #img_url = etree.HTML(response.text).xpath(\"\/\/*[@id='verifyPic_login']\/@src\")\n        img_url = 'http:\/\/icode.renren.com\/getcode.do?t=web_login&amp;rnd=Math.random()'\n        print(img_url)\n        if img_url:\n            check_code = self.__parse_image(img_url)\n            loginForm['icode'] = check_code\n        return scrapy.FormRequest(url='http:\/\/www.renren.com\/PLogin.do',formdata=loginForm,callback=self.after_login)\n\n    def __parse_image(self,img_url):\n        request.urlretrieve(img_url,\"check_code.png\")\n        image = Image.open('check_code.png')\n        image.show()\n        check_code = input('\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801')\n        return check_code\n\n    def after_login(self,reponse):\n        with open('renren.hmtl','w+',encoding='utf-8') as f:\n            f.write(reponse.text)\n<\/pre>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014<\/p><p>\u7248\u6743\u58f0\u660e\uff1a\u672c\u6587\u4e3aCSDN\u535a\u4e3b\u300clsqzedu\u300d\u7684\u539f\u521b\u6587\u7ae0\uff0c\u9075\u5faaCC 4.0 BY-SA\u7248\u6743\u534f\u8bae\uff0c\u8f6c\u8f7d\u8bf7\u9644\u4e0a\u539f\u6587\u51fa\u5904\u94fe\u63a5\u53ca\u672c\u58f0\u660e\u3002<\/p><p>\u539f\u6587\u94fe\u63a5\uff1ahttps:\/\/blog.csdn.net\/lsqzedu\/article\/details\/99707709<\/p><\/blockquote>\n","protected":false},"excerpt":{"rendered":"<p>PIL\u5e93\u57fa\u672c\u4ecb\u7ecd PIL\uff1aPython Imaging Library\uff0c\u5df2\u7ecf\u662fPython\u5e73\u53f0\u4e8b\u5b9e\u4e0a\u7684\u56fe\u50cf\u5904\u7406 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-156","post","type-post","status-publish","format-standard","hentry","category-biji"],"_links":{"self":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/156","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/comments?post=156"}],"version-history":[{"count":0,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/156\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/media?parent=156"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/categories?post=156"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/tags?post=156"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}