{"id":126,"date":"2020-07-08T09:11:59","date_gmt":"2020-07-08T01:11:59","guid":{"rendered":"http:\/\/www.gaoxigang.com\/?p=126"},"modified":"2020-07-08T09:11:59","modified_gmt":"2020-07-08T01:11:59","slug":"scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-xpath%e8%af%ad%e6%b3%95%e4%bb%8b%e7%bb%8d","status":"publish","type":"post","link":"https:\/\/www.gaoxigang.com\/index.php\/2020\/07\/08\/scrapy-%e7%88%ac%e8%99%ab%e6%a1%86%e6%9e%b6-xpath%e8%af%ad%e6%b3%95%e4%bb%8b%e7%bb%8d\/","title":{"rendered":"Scrapy \u722c\u866b\u6846\u67b6-xpath\u8bed\u6cd5\u4ecb\u7ecd"},"content":{"rendered":"<p><strong>Xpath\u7b80\u4ecb<\/strong><br \/>\nXPath \u662f\u4e00\u95e8\u5728 XML \u6587\u6863\u4e2d\u67e5\u627e\u4fe1\u606f\u7684\u8bed\u8a00\u3002XPath \u7528\u4e8e\u5728 XML \u6587\u6863\u4e2d\u901a\u8fc7\u5143\u7d20\u548c\u5c5e\u6027\u8fdb\u884c\u5bfc\u822a\uff0c\u5176\u5b9eHTML\u5c31\u662f\u4e00\u79cd\u7279\u6b8a\u7684XML\uff0c\u56e0\u6b64\u5927\u5bb6\u5728\u5b66\u4e60XPath\u65f6\u9700\u8981\u4e86\u89e3\u57fa\u672c\u7684HTML\u548cXML<\/p>\n<blockquote><p>XPath \u4f7f\u7528\u8def\u5f84\u8868\u8fbe\u5f0f\u5728 XML \u6587\u6863\u4e2d\u8fdb\u884c\u5bfc\u822a<br \/>\nXPath \u5305\u542b\u4e00\u4e2a\u6807\u51c6\u51fd\u6570\u5e93<br \/>\nXPath \u662f XSLT \u4e2d\u7684\u4e3b\u8981\u5143\u7d20<br \/>\nXPath\u662f\u4e00\u4e2a W3C 
\u6807\u51c6<\/p><\/blockquote>\n<p><strong>Xpath\u8bed\u6cd5<\/strong><\/p>\n<table>\n<thead>\n<tr>\n<th>\u8868\u8fbe\u5f0f<\/th>\n<th>\u63cf\u8ff0<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>nodename<\/td>\n<td>\u9009\u53d6\u6b64\u8282\u70b9\u7684\u6240\u6709\u5b50\u8282\u70b9<\/td>\n<\/tr>\n<tr>\n<td>\/<\/td>\n<td>\u4ece\u6839\u8282\u70b9\u9009\u53d6<\/td>\n<\/tr>\n<tr>\n<td>\/\/<\/td>\n<td>\u4ece\u5339\u914d\u9009\u62e9\u7684\u5f53\u524d\u8282\u70b9\u9009\u62e9\u6587\u6863\u4e2d\u7684\u8282\u70b9\uff0c\u800c\u4e0d\u8003\u8651\u5b83\u4eec\u7684\u4f4d\u7f6e<\/td>\n<\/tr>\n<tr>\n<td>.<\/td>\n<td>\u9009\u53d6\u5f53\u524d\u8282\u70b9<\/td>\n<\/tr>\n<tr>\n<td>..<\/td>\n<td>\u9009\u53d6\u5f53\u524d\u8282\u70b9\u7684\u7236\u8282\u70b9<\/td>\n<\/tr>\n<tr>\n<td>@<\/td>\n<td>\u9009\u53d6\u5c5e\u6027<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p><strong>Xpath\u5b9e\u8df5<\/strong><\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">data = \"\"\"\n&lt;div&gt;\n    &lt;ul&gt;\n         &lt;li class=\"item-0\"&gt;&lt;a&gt;first item&lt;\/a&gt;&lt;\/li&gt;\n         &lt;li class=\"item-1\"&gt;&lt;a href=\"link2.html\"&gt;second item&lt;\/a&gt;&lt;\/li&gt;\n         &lt;li class=\"item-inactive\"&gt;&lt;a href=\"link3.html\"&gt;third item&lt;\/a&gt;&lt;\/li&gt;\n         &lt;li class=\"item-1\"&gt;&lt;a href=\"link4.html\"&gt;fourth item&lt;\/a&gt;&lt;\/li&gt;\n         &lt;li class=\"item-0\"&gt;&lt;a href=\"link5.html\" id='kw'&gt;fifth item&lt;\/a&gt;\n     &lt;\/ul&gt;\n&lt;\/div&gt;\n\"\"\"\n\nfrom lxml import etree\n# \u8f93\u51fa\u4e00\u4e2a\u6807\u51c6\u7684HTML\u683c\u5f0f\nhtml = etree.HTML(data)\nprint(etree.tostring(html))\n# xpath\u8bed\u6cd5: \u6807\u7b7e,\u6807\u7b7e\u7684\u503c,\u6807\u7b7e\u7684\u5c5e\u6027\n# \u4ece\u6839\u8282\u70b9\u9009\u53d6\n# li_list = html.xpath(\"\/html\/body\/div\/ul\/li\")\nli_list = 
html.xpath(\"\/\/li\")\nprint(\"\u4ece\u5339\u914d\u9009\u62e9\u7684\u5f53\u524d\u8282\u70b9,\u800c\u4e0d\u8003\u8651\u5b83\u4eec\u7684\u4f4d\u7f6e\")\nfor li in li_list:\n    print(li)\nprint(\"\u83b7\u53d6\u6807\u7b7e\u7684\u503c\u91c7\u7528text()\u51fd\u6570\")\ntext_list = html.xpath(\"\/\/li\/a\/text()\")\nfor text in text_list:\n    print(text)\nprint(\"\u83b7\u53d6\u6807\u7b7e\u7684\u503c(\u6839\u636e\u5c5e\u6027\u7b5b\u9009)\")\ntext_list = html.xpath(\"\/\/li\/a[@href='link5.html']\/text()\")\nfor text in text_list:\n    print(text)\nprint(\"\u83b7\u53d6\u6807\u7b7e\u7684\u5c5e\u6027,@\u4ee3\u8868\u83b7\u53d6\u5c5e\u6027\")\nhref_list = html.xpath(\"\/\/li\/a\/@href\")\nfor href in href_list:\n    print(href)\n<\/pre>\n<p>\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014\u2014<br \/>\n\u7248\u6743\u58f0\u660e\uff1a\u672c\u6587\u4e3aCSDN\u535a\u4e3b\u300clsqzedu\u300d\u7684\u539f\u521b\u6587\u7ae0\uff0c\u9075\u5faaCC 4.0 BY-SA\u7248\u6743\u534f\u8bae\uff0c\u8f6c\u8f7d\u8bf7\u9644\u4e0a\u539f\u6587\u51fa\u5904\u94fe\u63a5\u53ca\u672c\u58f0\u660e\u3002<br \/>\n\u539f\u6587\u94fe\u63a5\uff1ahttps:\/\/blog.csdn.net\/lsqzedu\/article\/details\/99697735<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Xpath\u7b80\u4ecb XPath \u662f\u4e00\u95e8\u5728 XML \u6587\u6863\u4e2d\u67e5\u627e\u4fe1\u606f\u7684\u8bed\u8a00\u3002XPath \u7528\u4e8e\u5728 XML \u6587\u6863\u4e2d\u901a\u8fc7\u5143\u7d20 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-126","post","type-post","status-publish","format-standard","hentry","category-biji"],"_links":{"self":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/126","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/comments?post=126"}],"version-history":[{"count":0,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/posts\/126\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/media?parent=126"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/categories?post=126"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.gaoxigang.com\/index.php\/wp-json\/wp\/v2\/tags?post=126"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}