DAY2 - 題庫建立 - iT 邦幫忙::一起幫忙解決難題，拯救 IT 人的一天

2025 iThome 鐵人賽

DAY 2

AI & Data

打造你的數位圖書館：從雜亂檔案到個人化知識庫系列第 2 篇

DAY2 - 題庫建立

17th鐵人賽

shaix0

2025-09-16 20:46:57

143 瀏覽

分享至

建立題庫前，需要先處理題目的原檔，把需要的部分都提取出來。除了基本的題目、選項和答案之外，還要把題目中標示來源書籍的符號（*、**）替換成對應的書名，再和頁次等額外資訊一併拆出。因為這次只會處理選擇題，所以提取後可以直接以 JSON 儲存，方便前端抓取。

// 原題目
社團法人台灣醫學資訊學會 
醫學資訊管理師檢定考試試題 （2025年4月） 
*表示出自「醫學資訊管理學」之頁次 
**表示出自「常用醫護術語」之頁次 

選擇題50題，每題2分，共100分，請選擇一個最正確的答案。 
(B) 1. 國內政治活動頻繁，很多民眾有焦慮、憂鬱，甚至會有憤怒情緒的狀況。焦慮的英文是？ (**173) 
  (A) Melancholy (B) Anxiety (C) Irritable (D) Exhausted

// 處理完要長這樣
"題目": "國內政治活動頻繁，很多民眾有焦慮、憂鬱，甚至會有憤怒情緒的狀況。焦慮的英文是？",
"選項": ["Melancholy", "Anxiety", "Irritable", "Exhausted"],
"來源書籍": "常用醫護術語",
"頁次": "173",
"答案": "Anxiety",
"考試時間": "2025年4月",
"來源檔案": "2025年4月.pdf"

▍程式碼

讀入pdf檔，準備用函式處理

import fitz  # PyMuPDF
import json
import re
import os

# Collect every PDF in the source folder. The extension check is
# case-insensitive so "X.PDF" is not silently skipped, and the listing is
# sorted because os.listdir() returns entries in arbitrary order, which would
# make the order of the exported questions differ between runs.
pdf_folder_path = os.path.join( "pdf檔的資料夾路徑" )

try:
    pdf_files = sorted(
        os.path.join(pdf_folder_path, f)
        for f in os.listdir(pdf_folder_path)
        if f.lower().endswith('.pdf')
    )
except (FileNotFoundError, NotADirectoryError):
    # A missing folder (or a path that is not a folder) is reported and
    # treated as "no input files" rather than aborting the script.
    print(f"錯誤: 找不到指定的資料夾 '{pdf_folder_path}'")
    pdf_files = []

# Accumulates the parsed questions of all PDF files.
all_extracted_questions = []

# Process each PDF file in turn.
for pdf_path in pdf_files:
    filename = os.path.basename(pdf_path)
    # The source files are named after the exam date, so the file stem is
    # used directly as the exam date.
    exam_date = os.path.splitext(filename)[0]

    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            # Join the text of all pages in one pass instead of repeated +=.
            text = "".join(page.get_text() for page in doc)
        print(f"成功從 {filename} 提取文本。")

        # Turn the raw text into structured question records.
        questions_from_file = parse_questions_from_text(text, exam_date, filename)
        all_extracted_questions.extend(questions_from_file)

    except Exception as e:
        # Best-effort batch run: report the failing file and move on.
        print(f"從 {filename} 提取文本或解析時發生錯誤: {e}")
        continue

從pdf中提取出需要的部分

def parse_questions_from_text(text: str, exam_date: str, source_filename: str) -> list:
    """Parse four-option multiple-choice questions out of raw exam text.

    Each question is expected to look like
    ``(B) 1. <question> (**173) (A) .. (B) .. (C) .. (D) ..``: the answer
    letter, the question number, the question text, an optional source marker
    and four options.

    Args:
        text: Raw text extracted from one exam PDF.
        exam_date: Exam date label, copied into every record.
        source_filename: Name of the source file, copied into every record.

    Returns:
        A list of dicts with the keys "題目", "選項", "來源書籍", "頁次",
        "答案", "考試時間" and "來源檔案". Questions that do not yield exactly
        four options are skipped with a printed warning.
    """
    questions = []

    # Remove ALL whitespace (spaces and line breaks) so that line wraps in the
    # PDF cannot split a question. Every pattern below therefore runs against
    # whitespace-free text.
    processed_text = re.sub(r'\s+', '', text)

    # Pattern for one whole question.
    # NOTE: the source alternatives must not contain literal spaces. The text
    # has no whitespace left, so an alternative such as " \(時事\)" could never
    # match and "(時事)" would end up inside the question text.
    pattern = re.compile(
        r'(?P<answer_letters>(?:\([A-D]\)))\s*'       # answer letter, e.g. "(B)"
        r'(?P<question_number>\d+)\.\s*'              # question number and dot
        r'(?P<question_text>.*?)\s*'                  # question text, non-greedy
        r'(?P<source_info>\(\*{1,2}.*?\)|\(時事.*?\))?'  # optional source marker
        r'\s*(?P<options_text>(?:\(.\).*?){4})?'      # the four options
        r'(?=\s*\(\w\)\s*\d|\s*\Z)',                  # stop at next question / end
    )

    # Splits the options text into the text following each "(A)".."(D)" label.
    option_pattern = re.compile(r'\([A-D]\)(.*?)(?=\([A-D]\)|$)')

    for match in pattern.finditer(processed_text):
        try:
            answer_letters_str = match.group('answer_letters').strip()
            answer_letters = re.findall(r'\(([A-D])\)', answer_letters_str)
            question_number = match.group('question_number').strip()
            question_text = match.group('question_text').strip()
            source_info_str = match.group('source_info').strip().strip('()') if match.group('source_info') else ""
            options_text = match.group('options_text').strip() if match.group('options_text') else ""

            # Extract the list of option texts.
            options_list = [opt.strip() for opt in option_pattern.findall(options_text)]

            # Every question is single-choice with four options; anything else
            # is treated as a parse failure and skipped.
            if len(options_list) != 4:
                print(f"警告：在 {exam_date} 的第 {question_number} 題選項不完整，可能為解析錯誤。已跳過該題。")
                continue

            # Map the answer letter to the option text. The letters are
            # limited to A-D by the regex and there are exactly four options,
            # so the index is always in range.
            answer_text = "無答案"
            for letter in answer_letters:
                answer_text = options_list[ord(letter) - ord('A')]

            # Translate the source marker into book title and page reference:
            # "**" marks one book, a single "*" the other.
            book_source = "無"
            page_number = "無"

            if '**' in source_info_str:
                book_source = "常用醫護術語"
                page_match = re.search(r'\*\*(.*)', source_info_str)
                if page_match:
                    page_number = page_match.group(1).strip()
            elif '*' in source_info_str:
                book_source = "醫學資訊管理學"
                page_match = re.search(r'\*(.*)', source_info_str)
                if page_match:
                    page_number = page_match.group(1).strip()

            # Some questions are (also) sourced from current affairs ("時事").
            if '時事' in source_info_str:
                if book_source != "無":
                    book_source += "、時事"
                else:
                    book_source = "時事"

            questions.append({
                "題目": question_text,
                "選項": options_list,
                "來源書籍": book_source,
                "頁次": page_number,
                "答案": answer_text,
                "考試時間": exam_date,
                "來源檔案": source_filename
            })
        except Exception as e:
            # Best-effort parsing: report the offending match and continue
            # with the remaining questions.
            print(f"解析題目時發生錯誤: {e}. 原始匹配文字: {match.group(0)}")
            continue

    return questions

最後將處理後題目儲存到指定位置

# Write all collected questions to a single JSON file.
output_folder = os.path.join("要輸出的資料夾路徑")  # destination folder
output_filename = os.path.join(output_folder, "all_questions.json")

# Create the destination folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# ensure_ascii=False keeps the Chinese text readable in the output file.
with open(output_filename, mode='w', encoding='utf-8') as f:
    f.write(json.dumps(all_extracted_questions, ensure_ascii=False, indent=4))

print(f"\n所有 {len(all_extracted_questions)} 個題目已成功整理並儲存至 {output_filename}")