最近想把 Google Drive 資料夾內的檔案名稱全部爬下來,但發現用一般滾輪下拉(捲動)的方式都沒辦法成功。後來改用鍵盤操作送出向下鍵,雖然沒有出現錯誤,卻也沒有作用,爬出來的內容還是沒有捲動過的樣子。還是說我的 XPath 找錯地方了?
(還是有辦法找出控制捲動的 JS 資訊,或是用滑鼠拖拉的方式來成功捲動?)
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time
from datetime import datetime
from selenium.webdriver.common.keys import Keys
# Open a Google Drive folder in Chrome, send one Down-arrow key press to the
# list container, then print the text of every element matching `.Q5txwe`
# in the resulting page source.

# Locator constants for find_element(); imported here so this snippet stays
# self-contained.
from selenium.webdriver.common.by import By

# Set the notifications content setting to 2
# (presumably "block" -- confirm against Chrome's preference documentation).
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
# `options=` is the supported keyword; `chrome_options=` was removed in
# Selenium 4.
driver = webdriver.Chrome(options=chrome_options)

# Placeholder: replace this string with the address of the Google Drive folder.
url = "google driver的網址"
driver.get(url)

# find_element(By.XPATH, ...) replaces find_element_by_xpath(), which no
# longer exists in current Selenium 4 releases.
div = driver.find_element(By.XPATH, '//*[@class="PolqHc sd-ph"]/div')
div.send_keys(Keys.DOWN)

# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (and BeautifulSoup does not warn about guessing).
soup = BeautifulSoup(driver.page_source, "html.parser")
for title in soup.select('.Q5txwe'):
    print(title.text)
# Alternative attempt: run JavaScript in the page that assigns 100000 to the
# scrollTop of the root element (document.documentElement).
# NOTE(review): this presumably has no visible effect when the file list
# lives in its own scrollable container rather than the document -- confirm
# against the actual page structure.
js="var action=document.documentElement.scrollTop=100000"
driver.execute_script(js)
這個做法在其他網站都能捲動,但在 Google Drive 上就不行了。
試試
# Scroll the window to a hard-coded vertical offset and read back the page
# height. The offset is passed to the script through `arguments[0]` instead of
# being spliced into the JavaScript text, and the script returns a value so
# that `lenOfPage` is not always None.
scroll_target_px = 100000  # hard-coded height; adjust for the page
lenOfPage = driver.execute_script(
    'window.scrollTo(0, arguments[0]); return document.body.scrollHeight;',
    scroll_target_px,
)
Google Drive 有提供 API,直接用 API 抓就好啦...
要先去啟用 API。
理論上正常使用是不會產生費用的。
from __future__ import print_function
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
# OAuth scopes requested when authorizing; main() passes this list both when
# loading cached credentials from token.json and when running the login flow.
# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/drive.metadata.readonly']
def _load_credentials():
    """Return user credentials, refreshing or re-authorizing when needed.

    Cached credentials are read from ``token.json`` if that file exists.
    When they are missing or not valid they are either refreshed (expired
    with a refresh token) or obtained through the local-server OAuth flow
    using ``credentials.json``; the result is written back to ``token.json``.
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if creds and creds.valid:
        return creds

    # No (valid) credentials available: refresh them or let the user log in.
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run.
    with open('token.json', 'w') as token:
        token.write(creds.to_json())
    return creds


def _list_all_folders(service):
    """Return the ``id``/``name`` entries of every folder, across all pages."""
    folders = []
    page_token = None
    while True:
        response = service.files().list(
            q="mimeType='application/vnd.google-apps.folder'",
            fields='nextPageToken, files(id, name)',
            pageToken=page_token,
        ).execute()
        folders.extend(response.get('files', []))
        # A response without nextPageToken is the last page.
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return folders


def main():
    """Print the name and id of every folder returned by the Drive v3 API.

    Authorizes with the scopes in ``SCOPES``, queries ``files().list`` for
    items whose MIME type is the Drive folder type, follows ``nextPageToken``
    until the listing is exhausted, and prints one ``name (id)`` line per
    folder, or ``No files found.`` when the listing is empty.
    """
    creds = _load_credentials()
    service = build('drive', 'v3', credentials=creds)

    items = _list_all_folders(service)
    if not items:
        print('No files found.')
        return
    print('Files:')
    for item in items:
        print(u'{0} ({1})'.format(item['name'], item['id']))
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()