iT邦幫忙

2021 iThome 鐵人賽

DAY 26
1
Modern Web

陪聊_伃時不候 Line Bot 聊天機器人系列 第 26 篇

使用 Python 實作網路爬蟲

requests

接下來將透過 Python 所提供的一些模組來擷取網頁上的資料,那麼就從「如何取得網頁」開始吧!

我們可以使用 Python 的模組 requests 來下載網頁:透過 requests 建立適當的 HTTP 請求,就能從網頁伺服器上取得需要的資料。使用這個模組前必須先安裝,只要開啟終端機,輸入以下指令就可以完成安裝了!

pip install requests

安裝完成後就可以來實際下載網頁看看,這邊我們使用 Google 首頁來做示範:

import requests

url = 'https://www.google.com/'
response = requests.get(url)

print(response.encoding)
print(response.status_code)
print(response.headers)
print(response.text)

執行後可以看到以下結果:

ISO-8859-1
200
{'Date': 'Sat, 09 Oct 2021 13:16:04 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2021-10-09-13; expires=Mon, 08-Nov-2021 13:16:04 GMT; path=/; domain=.google.com; Secure, NID=511=poM_9_NWaALIoKPt57zR9DODZS3IDRwuAXb4zhChjO0P5GiNfS2579nxfo2r6YPnmEQxVINoBE17g8_atyHpxMInX7NMm301YqLCGObRvGhKl8PDNqcwE8fCop-p5GcG4Vchx39C-HHN2cqjOK_xCpbJ5Qe7hyGB3d9pMnVhcPU; expires=Sun, 10-Apr-2022 13:16:04 GMT; path=/; domain=.google.com; HttpOnly', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"', 'Transfer-Encoding': 'chunked'}
<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="zh-TW"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){window.google={kEI:'FJZhYde-G76Mr7wPhZii-AI',kEXPI:'0,1302536,56873,6058,207,2414,2390,926,1390,383,246,5,1354,5251,1122515,1197767,634,328866,51224,16114,28684,893,16679,4859,1361,9290,3028,17581,4998,13228,2676,1171,4192,6430,7432,11613,2777,919,5081,887,706,1279,2212,530,149,561,542,840,1983,4314,108,3406,606,2023,1777,522,14668,3227,2845,7,12354,5096,16320,908,2,941,2614,3784,9358,3,576,6459,149,13975,4,1528,2304,1236,5226,5260,2015,18375,2658,6701,655,31,5616,5797,2215,2305,638,1494,16786,651,1871,3290,2545,4094,3138,8,906,3,3541,1,11943,2767,1814,283,38,874,5992,16728,1715,2,14022,1931,3909,1680,743,2351,3502,1576,3,8289,595,1160,1286,5414,2380,2719,4013,2,530,3,5473,4635,3640,2,6,132,2,817,6768,4568,2577,10,3122,278,271,3020,3,965,3,23,2,1,3,2612,85,182,2934,665,2,4437,919,569,1372,286,314,59,3,169,269,379,3105,2122,3617,229,66,795,320,474,40,414,46,43,1120,505,304,579,722,228,712,245,731,1651,239,114,644,1566,1352,3,950,189,616,2,23,1002,12,261,133,274,138,2,533,76,51,3,8,7,216,1441,686,437,31,20,1148,96,37,3,2,340,147,396,1011,4,39,38,144,223,2,705,957,532,285,1550,373,883,5559467,446,160,1802825,4193364,519,212,2800485,882,444,1,2,80,1,1796,1,9,2553,1,748,141,795,563,1,4265,1,1,2,1331,4142,2609,155,17,13,72,139,4,2,20,2,169,13,19,46,5,39,96,548,29,2,2,1,2,1,2,2,7,4,1,2,2,2,2,2,2,353,513,186,1,1,158,3,2,2,2,2,2,4,2,3,3,269,551,12,5,1,8,11,4,86,4,3,75,49,1,7,18,11,5,2,6,2,12,2,2,31,23953814,4041352,338,3,2414,1491,9,1434,1516,289,2425,229,281,341,552,1030,142,325',kBL:'RsSZ'};google.sn='webhp';google.kHL='zh-TW';})();(function(){
var f=this||self;var h,k=[];function l(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||h}function m(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b}
function n(a,b,c,d,g){var e="";c||-1!==b.search("&ei=")||(e="&ei="+l(d),-1===b.search("&lei=")&&(d=m(d))&&(e+="&lei="+d));d="";!c&&f._cshid&&-1===b.search("&cshid=")&&"slh"!==a&&(d="&cshid="+f._cshid);c=c||"/"+(g||"gen_204")+"?atyp=i&ct="+a+"&cad="+b+e+"&zx="+Date.now()+d;/^http:/i.test(c)&&"https:"===window.location.protocol&&(google.ml&&google.ml(Error("a"),!1,{src:c,glmm:1}),c="");return c};h=google.kEI;google.getEI=l;google.getLEI=m;google.ml=function(){return null};google.log=function(a,b,c,d,g){if(c=n(a,b,c,d,g)){a=new Image;var e=k.length;k[e]=a;a.onerror=a.onload=a.onabort=function(){delete k[e]};a.src=c}};google.logUrl=n;}).call(this);(function(){
google.y={};google.sy=[];google.x=function(a,b){if(a)var c=a.id;else{do c=Math.random();while(google.y[c])}google.y[c]=[a,b];return!1};google.sx=function(a){google.sy.push(a)};google.lm=[];google.plm=function(a){google.lm.push.apply(google.lm,a)};google.lq=[];google.load=function(a,b,c){google.lq.push([[a],b,c])};google.loadAll=function(a,b){google.lq.push([a,b])};google.bx=!1;google.lx=function(){};}).call(this);google.f={};(function(){
document.documentElement.addEventListener("submit",function(b){var a;if(a=b.target){var c=a.getAttribute("data-submitfalse");a="1"===c||"q"===c&&!a.elements.q.value?!0:!1}else a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var a;a:{for(a=b.target;a&&a!==document.documentElement;a=a.parentElement)if("A"===a.tagName){a="1"===a.getAttribute("data-nohref");break a}a=!1}a&&b.preventDefault()},!0);}).call(this);</script><style>#gbar,#guser{font-size:13px;padding-top:1px !important;}#gbar{height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}@media all{.gb1{height:22px;margin-right:.5em;vertical-align:top}#gbar{float:left}}a.gb1,a.gb4{text-decoration:underline !important}a.gb1,a.gb4{color:#00c !important}.gbi .gb4{color:#dd8e27 !important}.gbf .gb4{color:#900 !important}
</style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;overflow-y:scroll}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}.h{color:#1558d6}em{color:#c5221f;font-style:normal;font-weight:normal}a em{text-decoration:underline}.lst{height:25px;width:496px}.gsfi,.lst{font:18px arial,sans-serif}.gsfs{font:17px arial,sans-serif}.ds{display:inline-box;display:inline-block;margin:3px 0 4px;margin-left:4px}input{font-family:inherit}body{background:#fff;color:#000}a{color:#4b11a8;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#1558d6}a:visited{color:#4b11a8}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px}.lsbb{background:#f8f9fa;border:solid 1px;border-color:#dadce0 #70757a #70757a #dadce0;height:30px}.lsbb{display:block}#WqQANb a{display:inline-block;margin:0 12px}.lsb{background:url(/images/nav_logo229.png) 0 -261px repeat-x;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;font:15px arial,sans-serif;vertical-align:top}.lsb:active{background:#dadce0}.lst:focus{outline:none}</style><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){window.google.erd={sp:'hp',jsr:0,bv:1455};
var f=this||self;var g,h,k=null!==(g=f.mei)&&void 0!==g?g:1,l=null!==(h=f.sdo)&&void 0!==h?h:!0,n=0,p,q=google.erd,t=q.jsr;google.ml=function(a,b,e,m,d){d=void 0===d?2:d;b&&(p=a&&a.message);if(google.dl)return google.dl(a,d,e),null;if(0>t){window.console&&console.error(a,e);if(-2===t)throw a;b=!1}else b=!a||!a.message||"Error loading script"===a.message||n>=k&&!m?!1:!0;if(!b)return null;n++;e=e||{};b=encodeURIComponent;var c="/gen_204?atyp=i&ei="+b(google.kEI);google.kEXPI&&(c+="&jexpid="+b(google.kEXPI));c+="&srcpg="+b(q.sp)+"&jsr="+b(q.jsr)+"&bver="+b(q.bv)+("&jsel="+d);c+="&sn="+b(google.sn);for(var r in e)c+="&",c+=b(r),c+="=",c+=b(e[r]);c=c+"&emsg="+b(a.name+": "+a.message);c=c+"&jsst="+b(a.stack||"N/A");12288<=c.length&&(c=c.substr(0,12288));a=c;m||google.log(0,"",a);return a};window.onerror=function(a,b,e,m,d){p!==a&&google.ml(d instanceof Error?d:Error(a),!1,void 0,!1,!d||d instanceof SyntaxError?2:0);p=null;l&&n>=k&&(window.onerror=null)};})();</script></head><body bgcolor="#fff"><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){var src='/images/nav_logo229.png';var iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><div id="mngb"><div id=gbar><nobr><b class=gb1>搜尋</b> <a class=gb1 href="https://www.google.com.tw/imghp?hl=zh-TW&tab=wi">圖片</a> <a class=gb1 href="https://maps.google.com.tw/maps?hl=zh-TW&tab=wl">地圖</a> <a class=gb1 href="https://play.google.com/?hl=zh-TW&tab=w8">Play</a> <a class=gb1 href="https://www.youtube.com/?gl=TW&tab=w1">YouTube</a> <a class=gb1 href="https://news.google.com/?tab=wn">新聞</a> <a class=gb1 href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class=gb1 href="https://drive.google.com/?tab=wo">雲端硬碟</a> <a class=gb1 style="text-decoration:none" href="https://www.google.com.tw/intl/zh-TW/about/products?tab=wh"><u>更多</u> »</a></nobr></div><div id=guser width=100%><nobr><span id=gbn class=gbi></span><span id=gbf class=gbf></span><span id=gbe></span><a href="http://www.google.com.tw/history/optout?hl=zh-TW" class=gb4>網頁記錄</a> | <a  href="/preferences?hl=zh-TW" class=gb4>設定</a> | <a target=_top id=gb_70 href="https://accounts.google.com/ServiceLogin?hl=zh-TW&passive=true&continue=https://www.google.com/&ec=GAZAAQ" class=gb4>登入</a></nobr></div><div class=gbh style=left:0></div><div class=gbh style=right:0></div></div><center><br clear="all" id="lgpd"><div id="lga"><img alt="Google" height="92" src="/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png" style="padding:28px 0 14px" width="272" id="hplogo"><br><br></div><form action="/search" name="f"><table cellpadding="0" cellspacing="0"><tr valign="top"><td width="25%"> </td><td align="center" nowrap=""><input name="ie" value="ISO-8859-1" type="hidden"><input value="zh-TW" name="hl" type="hidden"><input name="source" type="hidden" value="hp"><input name="biw" type="hidden"><input name="bih" type="hidden"><div class="ds" style="height:32px;margin:4px 0"><input class="lst" style="margin:0;padding:5px 8px 0 6px;vertical-align:top;color:#000" autocomplete="off" value="" title="Google 搜尋" maxlength="2048" name="q" size="57"></div><br style="line-height:0"><span 
class="ds"><span class="lsbb"><input class="lsb" value="Google 搜尋" name="btnG" type="submit"></span></span><span class="ds"><span class="lsbb"><input class="lsb" id="tsuid1" value="好手氣" name="btnI" type="submit"><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){var id='tsuid1';document.getElementById(id).onclick = function(){if (this.form.q.value){this.checked = 1;if (this.form.iflsig)this.form.iflsig.disabled = false;}
else top.location='/doodles/';};})();</script><input value="ALs-wAMAAAAAYWGkJIFOgcK31AgVaKU1ZxmPx79BI2TX" name="iflsig" type="hidden"></span></span></td><td class="fl sblc" align="left" nowrap="" width="25%"><a href="/advanced_search?hl=zh-TW&authuser=0">進階搜尋</a></td></tr></table><input id="gbv" name="gbv" type="hidden" value="1"><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){
var a,b="1";if(document&&document.getElementById)if("undefined"!=typeof XMLHttpRequest)b="2";else if("undefined"!=typeof ActiveXObject){var c,d,e=["MSXML2.XMLHTTP.6.0","MSXML2.XMLHTTP.3.0","MSXML2.XMLHTTP","Microsoft.XMLHTTP"];for(c=0;d=e[c++];)try{new ActiveXObject(d),b="2"}catch(h){}}a=b;if("2"==a&&-1==location.search.indexOf("&gbv=2")){var f=google.gbvu,g=document.getElementById("gbv");g&&(g.value=a);f&&window.setTimeout(function(){location.href=f},0)};}).call(this);</script></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br></div><span id="footer"><div style="font-size:10pt"><div style="margin:19px auto;text-align:center" id="WqQANb"><a href="/intl/zh-TW/ads/">廣告服務</a><a href="http://www.google.com.tw/intl/zh-TW/services/">商業解決方案</a><a href="/intl/zh-TW/about.html">關於 Google</a><a href="https://www.google.com/setprefdomain?prefdom=TW&prev=https://www.google.com.tw/&sig=K_ZTLdssy1W_XCJpK6iNTwP9ipPnA%3D">Google.com.tw</a></div></div><p style="font-size:8pt;color:#70757a">© 2021 - <a href="/intl/zh-TW/policies/privacy/">隱私權</a> - <a href="/intl/zh-TW/policies/terms/">服務條款</a></p></span></center><script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){window.google.cdo={height:757,width:1440};(function(){
var a=window.innerWidth,b=window.innerHeight;if(!a||!b){var c=window.document,d="CSS1Compat"==c.compatMode?c.documentElement:c.body;a=d.clientWidth;b=d.clientHeight}a&&b&&(a!=google.cdo.width||b!=google.cdo.height)&&google.log("","","/client_204?&atyp=i&biw="+a+"&bih="+b+"&ei="+google.kEI);}).call(this);})();</script> <script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){google.xjs={ck:'',cs:'',excm:[]};})();</script>  <script nonce="v9E3JTENyOsTwGMsoihZxw==">(function(){var u='/xjs/_/js/k\x3dxjs.hp.en.cnW1_qixMgc.O/am\x3dAPgEWA/d\x3d1/ed\x3d1/rs\x3dACT90oHKPvY_M3V2AcVreCv9Lwp6QoNieQ/m\x3dsb_he,d';
var e=this||self,f=function(a){return a};
var g;var l=function(a,b){this.g=b===h?a:""};l.prototype.toString=function(){return this.g+""};var h={};function m(){var a=u;google.lx=function(){n(a);google.lx=function(){}};google.bx||google.lx()}
function n(a){google.timers&&google.timers.load&&google.tick&&google.tick("load","xjsls");var b=document;var c="SCRIPT";"application/xhtml+xml"===b.contentType&&(c=c.toLowerCase());c=b.createElement(c);if(void 0===g){b=null;var k=e.trustedTypes;if(k&&k.createPolicy){try{b=k.createPolicy("goog#html",{createHTML:f,createScript:f,createScriptURL:f})}catch(p){e.console&&e.console.error(p.message)}g=b}else g=b}a=(b=g)?b.createScriptURL(a):a;a=new l(a,h);c.src=a instanceof l&&a.constructor===l?a.g:"type_error:TrustedResourceUrl";var d;a=(c.ownerDocument&&c.ownerDocument.defaultView||window).document;(d=(b=null===(d=a.querySelector)||void 0===d?void 0:d.call(a,"script[nonce]"))?b.nonce||b.getAttribute("nonce")||"":"")&&c.setAttribute("nonce",d);document.body.appendChild(c);google.psa=!0};setTimeout(function(){m()},0);})();(function(){window.google.xjsu='/xjs/_/js/k\x3dxjs.hp.en.cnW1_qixMgc.O/am\x3dAPgEWA/d\x3d1/ed\x3d1/rs\x3dACT90oHKPvY_M3V2AcVreCv9Lwp6QoNieQ/m\x3dsb_he,d';})();function _DumpException(e){throw e;}
function _F_installCss(c){}
(function(){google.jl={attn:false,blt:'none',chnk:0,dw:false,dwu:true,emtn:0,end:0,ine:false,lls:'default',pdt:0,rep:0,snet:true,strt:0,ubm:false,uwp:true};})();(function(){var pmc='{\x22d\x22:{},\x22sb_he\x22:{\x22agen\x22:false,\x22cgen\x22:false,\x22client\x22:\x22heirloom-hp\x22,\x22dh\x22:true,\x22dhqt\x22:true,\x22ds\x22:\x22\x22,\x22ffql\x22:\x22zh-TW\x22,\x22fl\x22:true,\x22host\x22:\x22google.com\x22,\x22isbh\x22:28,\x22jsonp\x22:true,\x22msgs\x22:{\x22cibl\x22:\x22清除搜尋\x22,\x22dym\x22:\x22您是不是要查:\x22,\x22lcky\x22:\x22好手氣\x22,\x22lml\x22:\x22瞭解詳情\x22,\x22oskt\x22:\x22輸入工具\x22,\x22psrc\x22:\x22已從您的「\\u003Ca href\x3d\\\x22/history\\\x22\\u003E網頁記錄\\u003C/a\\u003E」中移除這筆搜尋記錄\x22,\x22psrl\x22:\x22移除\x22,\x22sbit\x22:\x22以圖搜尋\x22,\x22srch\x22:\x22Google 搜尋\x22},\x22ovr\x22:{},\x22pq\x22:\x22\x22,\x22refpd\x22:true,\x22refspre\x22:true,\x22rfs\x22:[],\x22sbas\x22:\x220 3px 8px 0 rgba(0,0,0,0.2),0 0 0 1px rgba(0,0,0,0.08)\x22,\x22sbpl\x22:16,\x22sbpr\x22:16,\x22scd\x22:10,\x22stok\x22:\x22xjvp6Yc8bFwU5ffJNLZkjqrhqo8\x22,\x22uhde\x22:false}}';google.pmc=JSON.parse(pmc);})();</script>        </body></html>

接下來針對程式進行進一步的介紹:line 6 的 encoding 是 requests 用來解碼回應內容(response.text)的編碼方式,line 7 的 status_code 是伺服器回傳的 HTTP 狀態碼,line 8 與 line 9 的 headers 跟 text 則分別是回應的表頭(response headers)與解碼後的網頁內容。

而伺服器回傳的常見狀態碼如下:

  • 2xx:成功獲取資料,例如「200」代表「OK」
  • 4xx:用戶端錯誤,例如「404」代表「找不到」
  • 5xx:伺服器故障,例如「502」代表「閘道故障」

上一篇
淺談網路爬蟲
下一篇
使用 Python 實作網路爬蟲 part 2
系列文
陪聊_伃時不候 Line Bot 聊天機器人30

尚未有邦友留言

立即登入留言