{"id":147,"date":"2020-07-16T09:25:50","date_gmt":"2020-07-16T01:25:50","guid":{"rendered":"http:\/\/www.gaoxigang.com\/?p=147"},"modified":"2020-07-16T09:25:50","modified_gmt":"2020-07-16T01:25:50","slug":"scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e7%88%ac%e8%99%ab%e4%bc%aa%e8%a3%85%e4%b8%8e%e8%87%aa%e5%8a%a8%e7%99%bb%e5%bd%95","status":"publish","type":"post","link":"https:\/\/www.gaoxigang.com\/index.php\/2020\/07\/16\/scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-%e7%88%ac%e8%99%ab%e4%bc%aa%e8%a3%85%e4%b8%8e%e8%87%aa%e5%8a%a8%e7%99%bb%e5%bd%95\/","title":{"rendered":"Scrapy \u722c\u866b\u6846\u67b6-\u722c\u866b\u4f2a\u88c5\u4e0e\u81ea\u52a8\u767b\u5f55"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u53cd\u722c\u866b\u6280\u672f<\/h2>\n\n\n\n<ol class=\"wp-block-list\"><li>\u5224\u65adUser-Agent\uff0c\u662f\u5426\u4e3a\u6d4f\u89c8\u5668<\/li><li>\u5224\u65ad\u77ed\u65f6\u95f4\u5185\u4e00\u4e2aIP\u7684\u8bbf\u95ee\u6b21\u6570<\/li><li>\u6709\u4e9b\u8d44\u6e90\u5fc5\u987b\u7528\u6237\u767b\u5f55\u540e\u624d\u80fd\u8bbf\u95ee<\/li><li>\u77ed\u65f6\u95f4\u540c\u4e00\u4e2a\u7528\u6237\u4f7f\u7528\u4e0d\u540cIP\u8bbf\u95ee\u8d44\u6e90<\/li><li>\u5f02\u5e38\u767b\u5f55 \u9a8c\u8bc1\u7801\uff0c\u6ed1\u52a8\u5355\u51fb\u9a8c\u8bc1 \u6570\u636e\u52a0\u5bc6\u5904\u7406<\/li><\/ol>\n\n\n\n<h2 class=\"wp-block-heading\">\u722c\u866b\u6280\u672f<\/h2>\n\n\n\n<p>User-Agent\uff1a\u53d1\u8d77\u8bf7\u6c42\u65f6\u6dfb\u52a0\u5934\u4fe1\u606f<br>\u4f2a\u88c5\u6d4f\u89c8\u5668 \u77ed\u65f6\u95f4\u5185\u8bbf\u95ee\u6b21\u6570\u9650\u5236<br>\u53ef\u4ee5\u4f7f\u7528\u4ee3\u7406\u6216\u8005\u8bbe\u7f6e\u5ef6\u8fdf\u722c\u53d6<br>\u767b\u5f55\u540e\u8bbf\u95ee\uff0c\u6a21\u62df\u767b\u5f55\u4fdd\u5b58cookie,\u8bf7\u6c42\u65f6\u6dfb\u52a0cookie\u4fe1\u606f<br>\u5f02\u5e38\u767b\u5f55\uff1a\u51c6\u5907\u5927\u91cf\u8d26\u53f7\uff0c\u7ed1\u5b9a\u4e0d\u540c\u4ee3\u7406\u8fdb\u884c\u722c\u53d6<br>\u9a8c\u8bc1\u7801\uff1a\u4f7f\u7528OCR,\u673a\u5668\u5b66\u4e60\u8fdb\u884c\u5904\u7406\uff0c\u6216\u8005\u7b2c\u4e09\u65b9API \u4f7f\u7528selenium\u8fdb\u884c\u6293\u53d6\u64cd\u4f5c<br><\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u968f\u673aUser-Agent\u8bbe\u7f6e<\/h2>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>\u767e\u5ea6\u8f93\u5165\uff1ascrapy user agent \u83b7\u53d6\u968f\u673a\u7684agent\u5217\u8868<\/p><p>\u6d4b\u8bd5user-agent\u7f51\u7ad9\uff1ahttp:\/\/www.httpbin.org\/get<\/p><\/blockquote>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>\u4fee\u6539middlewares\u6587\u4ef6\u4fee\u6539\u6216\u6dfb\u52a0UserAgentMiddleware\u7c7b<\/p><\/blockquote>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import random\n\nclass UserAgentMiddleware(object):\n    def process_request(self,request,spider):\n        MY_USER_AGENT = [\n            \"Mozilla\/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\",\n            \"Mozilla\/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)\",\n            \"Mozilla\/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident\/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)\",\n            \"Mozilla\/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident\/4.0; WOW64; Trident\/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)\",\n            \"Mozilla\/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit\/523.15 (KHTML, like Gecko, Safari\/419.3) Arora\/0.3 (Change: 287 c9dfb30)\",\n            \"Mozilla\/5.0 (X11; U; Linux; en-US) AppleWebKit\/527+ (KHTML, like Gecko, Safari\/419.3) Arora\/0.6\",\n            \"Mozilla\/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko\/20070215 K-Ninja\/2.1.1\",\n            \"Mozilla\/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko\/20080705 Firefox\/3.0 Kapiko\/3.0\",\n            \"Mozilla\/5.0 (X11; Linux i686; U;) Gecko\/20070322 Kazehakase\/0.4.5\",\n            \"Mozilla\/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora\/1.9.0.8-1.fc10 Kazehakase\/0.5.6\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/535.11 (KHTML, like Gecko) Chrome\/17.0.963.56 Safari\/535.11\",\n            \"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit\/535.20 (KHTML, like Gecko) Chrome\/19.0.1036.7 Safari\/535.20\",\n            \"Opera\/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto\/2.9.168 Version\/11.52\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/536.11 (KHTML, like Gecko) Chrome\/20.0.1132.11 TaoBrowser\/2.0 Safari\/536.11\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.1 (KHTML, like Gecko) Chrome\/21.0.1180.71 Safari\/537.1 LBBROWSER\",\n            \"Mozilla\/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident\/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)\",\n            \"Mozilla\/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/535.11 (KHTML, like Gecko) Chrome\/17.0.963.84 Safari\/535.11 LBBROWSER\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident\/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)\",\n            \"Mozilla\/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident\/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser\/7.0.3698.400)\",\n            \"Mozilla\/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident\/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)\",\n            \"Mozilla\/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)\",\n            \"Mozilla\/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident\/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)\",\n            \"Mozilla\/5.0 (Windows NT 5.1) AppleWebKit\/537.1 (KHTML, like Gecko) Chrome\/21.0.1180.89 Safari\/537.1\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.1 (KHTML, like Gecko) Chrome\/21.0.1180.89 Safari\/537.1\",\n            \"Mozilla\/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit\/533.17.9 (KHTML, like Gecko) Version\/5.0.2 Mobile\/8C148 Safari\/6533.18.5\",\n            \"Mozilla\/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko\/20110307 Firefox\/4.0b13pre\",\n            \"Mozilla\/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko\/20100101 Firefox\/16.0\",\n            \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.11 (KHTML, like Gecko) Chrome\/23.0.1271.64 Safari\/537.11\",\n            \"Mozilla\/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko\/20100922 Ubuntu\/10.10 (maverick) Firefox\/3.6.10\",\n            \"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110 Safari\/537.36\",\n        ]\n        user_agent = random.choice(MY_USER_AGENT)\n        request.headers['User-Agent'] = user_agent\n        return None<\/pre>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\"><p>\u7136\u540e\u5728settings\u6587\u4ef6\u4e2d\u4fee\u6539DOWNLOADER_MIDDLEWARES\u4e3a\u65b0\u6dfb\u52a0\u7684\u7c7b<\/p><\/blockquote>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\"># Enable or disable downloader middlewares\n# See https:\/\/docs.scrapy.org\/en\/latest\/topics\/downloader-middleware.html\nDOWNLOADER_MIDDLEWARES = {\n    'douban.middlewares.UserAgentMiddleware': 543,\n}<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u53cd\u722c\u866b\u6280\u672f \u5224\u65adUser-Agent\uff0c\u662f\u5426\u4e3a\u6d4f\u89c8\u5668 \u5224\u65ad\u77ed\u65f6\u95f4\u5185\u4e00\u4e2aIP\u7684\u8bbf\u95ee\u6b21\u6570 \u6709\u4e9b\u8d44\u6e90\u5fc5\u987b\u7528\u6237\u767b\u5f55\u540e\u624d\u80fd [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-147","post","type-post","status-publish","format-standard","hentry","category-biji"],"_links":{"self":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/147","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/comments?post=147"}],"version-history":[{"count":0,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/147\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/media?parent=147"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/categories?post=147"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/tags?post=147"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}