python爬蟲爬學校教師資訊

python 網路爬蟲資料搜集

breakgod 2025-01-07 10:54:42 ‧ 645 瀏覽

分享至

各位好

最近小弟要練習python
想說拿學校的教師資訊來作題目

先訂一個目標，先針對一個系所，有明確網址的來作爬蟲
https://www.im.tku.edu.tw/%e5%b0%88%e4%bb%bb%e6%95%99%e5%b8%ab/

藉由爬蟲解析出網頁中的教師資訊：
姓名、職稱、專長、電子信箱

然而分析出來的結果，只有抓到部分的資訊
有一些沒有解析出來，就先作略過
想請教一下是有什麼問題導致無法解析的呢？
(初學python，菜味重…)
要如何調整、優化，使之可以爬完整個系所的教師資訊
還請版上的大大指點一下，謝謝

後續還會想說擴大到整個學校
(還沒有方向要如何取得所有科系的師資網址…)
最後是找全台各大專院校的~
(感覺很作夢XD)

以下是小弟的程式碼，還請指點，謝謝

import requests
from bs4 import BeautifulSoup
import csv

# Target URL (the percent-encoded path decodes to "專任教師", i.e. the
# full-time faculty page).
url = "https://www.im.tku.edu.tw/%e5%b0%88%e4%bb%bb%e6%95%99%e5%b8%ab/"

# Send the GET request. A timeout is passed explicitly because requests
# does not time out on its own and would otherwise wait indefinitely on an
# unresponsive server.
response = requests.get(url, timeout=10)

# Only parse the page when the request succeeded.
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Candidate containers that may hold one teacher's details.
    faculty_sections = soup.select(".elementor-column")

    # Collected rows: one dict per successfully parsed teacher.
    faculty_data = []

    for section in faculty_sections:
        try:
            # Debug output: dump the raw HTML of the section being parsed.
            print(section)
            # Extract name, title, expertise and e-mail address.
            print("=====s======")
            name = section.select_one(".elementor-heading-title span").text.strip()
            print(name)
            title = section.select_one(".elementor-widget-text-editor b").text.strip()
            print(title)
            # The second <b> element in the section is taken as the expertise.
            expertise = section.find_all("b")[1].text.strip()
            print(expertise)
            email = section.select_one("a[href^='mailto']").text.strip()
            print(email)
            print("=====e======")

            # Keep the row only when every field was found.
            faculty_data.append({
                "姓名": name,
                "職稱": title,
                "專長": expertise,
                "電子信箱": email
            })
        except (AttributeError, IndexError) as e:
            # AttributeError: select_one() matched nothing and returned None.
            # IndexError: the section contains fewer than two <b> elements.
            # Either way the section does not have the expected layout, so it
            # is reported and skipped.
            print(f"解析失敗: {e}")

    # Write the collected rows to a CSV file.
    with open('faculty_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["姓名", "職稱", "專長", "電子信箱"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Header row first, then every collected record.
        writer.writeheader()
        writer.writerows(faculty_data)

    print("資料已成功寫入 faculty_data.csv")
else:
    print(f"無法存取網站: {url}")

登入發表討論

直播研討會

{{ item.channelVendor }} {{ item.webinarstarted }} |

直播中

2 個回答

haward79

iT邦研究生 1 級 ‧ 2025-01-08 12:17:39

最佳解答

試試看這個吧！
元素篩選是關鍵

import requests
from bs4 import BeautifulSoup
import csv

# Target URL (the percent-encoded path decodes to "專任教師", i.e. the
# full-time faculty page).
url = "https://www.im.tku.edu.tw/%e5%b0%88%e4%bb%bb%e6%95%99%e5%b8%ab/"

# Send the GET request. A timeout is passed explicitly because requests
# does not time out on its own and would otherwise wait indefinitely on an
# unresponsive server.
response = requests.get(url, timeout=10)

# Only parse the page when the request succeeded.
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Select the heading widgets; each one is treated as a teacher's name
    # (chosen because it is the more distinctive element).
    faculty_headings = soup.select(".elementor-widget-heading")

    # Collected rows: one dict per successfully parsed teacher.
    faculty_data = []

    for heading in faculty_headings:
        # Teacher name.
        name = heading.get_text(strip=True)

        # The remaining fields are looked up under the heading's parent
        # element, as <span> elements that are direct children of a <p>.
        items = heading.parent.select('p>span')

        try:
            # The first three spans are read as title, expertise and e-mail.
            title = items[0].get_text(strip=True)
            expertise = items[1].get_text(strip=True)
            email = items[2].get_text(strip=True)

            # Keep the row only when all three fields were present.
            faculty_data.append({
                "姓名": name,
                "職稱": title,
                "專長": expertise,
                "電子信箱": email
            })
        except IndexError as e:
            # Fewer than three matching spans: this heading does not have the
            # expected layout, so it is reported and skipped.
            print(f"解析失敗: {e}")

    print(faculty_data)

    # Writing the data to a CSV file is not implemented in this snippet.

else:
    print(f"無法存取網站: {url}")