新手作
程式碼:
# selenium為操作瀏覽器用的package
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from requests_html import HTML
import time
import pandas as pd
import os
import pymysql
basedir = os.path.abspath(os.path.dirname(__file__))
chrome_options=Options()
driver = webdriver.Chrome(executable_path=r'D:\\pytxt\\chromedriver.exe', options=chrome_options)
# 第一頁url
urls = [
'https://www.104.com.tw/jobs/search/?ro=0&isnew=30&kwop=7&keyword=%E8%AA%BF%E9%85%92&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&area=6001008000&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc',
]
# 創建要爬的資料的字典
data = {
'job_name':[],
'company_name':[],
'industry':[],
'area':[],
'working_years':[],
'degree':[],
}
for url in urls:
driver.get(url)
time.sleep(2)
html = HTML(html = driver.page_source)
jobs = html.find('article.b-block--top-bord.job-list-item.b-clearfix.js-job-item')
for job in jobs:
data['job_name'].append(job.attrs['data-job-name'])
data['company_name'].append(job.attrs['data-cust-name'])
data['industry'].append(job.attrs['data-indcat-desc'])
job_contents = job.find('ul.b-list-inline.b-clearfix.job-list-intro.b-content')
for job_content in job_contents:
data['area'].append(job_content.find('li')[0].text)
data['working_years'].append(job_content.find('li')[1].text)
data['degree'].append(job_content.find('li')[2].text)
pd.DataFrame(data).to_csv("101.csv", encoding='utf-8-sig', index=False)
# 關閉瀏覽器
driver.quit()
db=pymysql.connect(host="localhost", user="root", password="", database="job", charset="utf8")
cursor = db.cursor() # 建立cursor物件
# 建立SQL指令INSERT字串
sql = """INSERT INTO job-drink (job_name,company_name,industry,area,working_years,degree)
VALUES ('{0}','{1}','{2}',{3},'{4}','{5}')"""
sql = sql.format(data['job_name'],data['company_name'],data['industry'],data['area'],data['working_years'],data['degree'])
print(sql)
try:
cursor.execute(sql) # 執行SQL指令
db.commit() # 確認交易
print("新增一筆記錄...")
except:
db.rollback() # 回復交易
print("新增記錄失敗...")
db.close() # 關閉資料庫連接
放入SQL結果:
排列存入SQL會整個亂掉
各位大大這是要如何解決?
你要存入的不是一筆,
所以應該是要用迴圈去跑,
另外你在接資料的時候,
可以考慮用物件(class),
裡面包含五個欄位,
然後資料取回來是class的陣列.
如果要用迴圈跑,
SQL語法結尾要加;
不過更好的方式是用,區隔,
譬如
INSERT INTO `table` (`col1`, `col2`, `col3`, `col4`, `col5`) VALUES ('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '10');
看起來你是一包df資料,可以試試以下code
from sqlalchemy import create_engine
import pymysql
engine = create_engine("mysql+pymysql://{}:{}@{}/{}?charset={}".format('user', 'password', 'ip:port', '資料庫','utf8'))
con = engine.connect()#建立連線
bar_chart.to_sql(name='資料表', con=con, if_exists='append', index=False)