個人認為在網頁技術中,爬蟲的實用性排在前幾名。舉凡股票價格抓取,104人力銀行職缺,後台自動登入,591租屋資訊查詢,使用爬蟲都可以讓人事半功倍。
Ruby 可以做爬蟲的 Gem 不少,例如:
今天要用的是另一款 Gem:Selenium,並以Chrome為基底驅動
首先要先安裝 chromedriver,安裝過程網路上有,這裡就不詳述
Gemfile
gem 'selenium-webdriver'
記得要
bundle install
require 'selenium-webdriver'
# Target URL for the crawler.
login_url = 'https://www.example.com/admin'

# Chrome command-line switches, added to the driver options in this order.
chrome_switches = [
  # Run without a graphical interface. Leave this one out early in
  # development so the browser window stays visible.
  '--headless',
  # Fix the browser resolution.
  '--window-size=1440,900',
  # Docker's shared memory at /dev/shm is 64MB by default, which can crash
  # Chrome, so have it write to /tmp instead.
  '--disable-dev-shm-usage',
  # Turn off Chrome's sandbox (the original notes describe this as running
  # with the highest privileges).
  '--no-sandbox',
  # Google's documentation mentions adding this switch to work around a bug.
  '--disable-gpu'
]

# The driver is configured through an options object.
options = Selenium::WebDriver::Chrome::Options.new
chrome_switches.each { |switch| options.add_argument(switch) }

# Use Chrome as the underlying browser.
@driver = Selenium::WebDriver.for(:chrome, options: options)
# Talk to the driver's bridge directly so that files can be downloaded while
# running headless. `bridge` is not a public accessor, hence `send`.
bridge = @driver.send(:bridge)

# Endpoint for sending a command to Chromium within the current session.
path = "/session/#{bridge.session_id}/chromium/send_command"

bridge.http.call(
  :post,
  path,
  cmd: 'Page.setDownloadBehavior',
  params: {
    behavior: 'allow',
    # Save downloads into the tmp folder under the current working directory.
    downloadPath: "#{Dir.pwd}/tmp/"
  }
)

# Go to the target page.
@driver.navigate.to(login_url)
find_element -- 找到第一個符合的 element
find_elements -- 找到所有符合的 elements
# Locate by id.
element = driver.find_element(id: 'q')

# Locate by class name.
element = driver.find_element(class: 'highlight')
# or, using the longer locator key
element = driver.find_element(class_name: 'highlight')

# Locate by tag name, e.g. for
# <div class="highlight" style="display: none; ">...</div>
element = driver.find_element(tag_name: 'div')
ps. display: none 的元素也抓得到
# Locate by the name attribute, e.g. for
# <input id="q" name='search' type='text'>…</input>
element = driver.find_element(name: 'search')

# Locate a link by its text, e.g. for
# <a href="http://www.google.com/search?q=cheese">cheese</a>
element = driver.find_element(link: 'cheese')
# or, using the longer locator key
element = driver.find_element(link_text: 'cheese')
用在對方的html結構可能改變,但是文字不變
# Locate a link by part of its text, e.g. for
# <a href="http://www.google.com/search?q=cheese">search for cheese</a>
element = driver.find_element(:partial_link_text, 'cheese')

# Locate by XPath, e.g. the logout link in
# <ul class="dropdown-menu">
# <li><a href="/login/form">Login</a></li>
# <li><a href="/logout">Logout</a></li>
# </ul>
# The XPath expression contains single quotes, so the Ruby string literal is
# double-quoted; wrapping it in single quotes ends the string early and does
# not parse.
element = driver.find_element(:xpath, "//a[@href='/logout']")

# Locate by CSS selector, e.g. the spans with class "dairy" inside
# <div id="food">
# <span class="dairy">milk</span>
# <span class="dairy aged">cheese</span>
# </div>
# find_element returns only the first match.
element = driver.find_element(:css, '#food span.dairy')
ps. 跟 document.querySelector一樣
# Click a button (the closing quote on the id was missing before, which left
# the string literal unterminated).
driver.find_element(:id, 'BUTTON_ID').click

# input some text
driver.find_element(:id, 'TextArea').send_keys 'InputText'

# Whether the element is currently displayed.
driver.find_element(:id, 'Element').displayed?
# The element's text.
driver.find_element(:id, 'Element').text
# The value of one of the element's attributes.
driver.find_element(:id, 'Element').attribute('class')

checkbox = driver.find_element(:id, 'CheckBox')
# check if it is selected
checkbox.selected?
# select the element: clicking toggles the box, so only click when it is
# not selected yet
checkbox.click unless checkbox.selected?
# deselect the element: `clear` is meant for editable text-like controls and
# does not untick a checkbox, so toggle it off with a click instead
checkbox.click if checkbox.selected?
# Click the <option> whose value matches the year of start_day.
year_option = @driver.find_element(xpath: "//option[@value=#{start_day.year}]")
year_option.click

# Take the first <select> on the page, then collect all of its options.
select_box = driver.find_element(tag_name: 'select')
all_options = select_box.find_elements(tag_name: 'option')

# Print each option's value and click it.
all_options.each do |option|
  puts "Value is: " + option.attribute("value")
  option.click
end
# Run JavaScript in the page; the script's return value (here the current
# path) comes back as the result of the call.
driver.execute_script('return window.location.pathname')

# Scroll to a given position, in this case the bottom of the document body.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# Explicit wait: build a waiter with a 10 second timeout ...
wait = Selenium::WebDriver::Wait.new(timeout: 10)
# ... and block for up to 10 seconds until the element shows up.
wait.until { driver.find_element(id: 'foo') }

# Implicit wait: set the implicit-wait timeout to 10 seconds.
driver.manage.timeouts.implicit_wait = 10
# Get the handles of all open windows.
windows = driver.window_handles

# Switch to the last handle in the list (the original notes treat it as the
# most recently opened window). `switch_to` takes no arguments; the handle is
# passed to `window` on the object it returns.
driver.switch_to.window(windows.last)
# The same switch without the intermediate variable.
driver.switch_to.window(driver.window_handles.last)

# Close the current window.
driver.close
# Quit the browser and end the session.
driver.quit
# Switch into a frame, identified by its name or id ...
driver.switch_to.frame('some-frame')
# ... or by handing over the frame element itself.
frame_element = driver.find_element(id: 'some-frame')
driver.switch_to.frame(frame_element)

# Switch back to the main document. Without this step the driver stays
# stuck inside the iframe.
driver.switch_to.default_content
# Grab the alert that is currently open.
alert = @driver.switch_to.alert

# Consent text the alert is expected to contain; only then is it accepted.
expected_notice = '同意請點選「確定」完成登入;不同意請點選「取消」停止登入'
alert.accept if alert.text.include?(expected_notice)
# Where the screenshot is stored; evaluates to
# tmp/reconcile_task_files/screenshot.png, relative to the working directory.
screenshot_path = File.join('tmp', 'reconcile_task_files', 'screenshot.png')

# Capture the current page into that file.
driver.save_screenshot(screenshot_path)