# Install Matplotlib first (run this in a shell, not inside Python):
#   pip install matplotlib
import ast

import matplotlib.pyplot as plt
import pandas as pd
# Load the scraped articles from the CSV file into a pandas DataFrame.
df = pd.read_csv('ptt_articles.csv')


def _parse_list_cell(cell):
    """Convert one CSV cell holding a stringified Python list back to a list.

    Args:
        cell: The raw cell value from the DataFrame. This is a string such as
            "['a', 'b']" for populated cells, or NaN for cells that were
            empty in the CSV.

    Returns:
        The parsed Python object (expected to be a list), or an empty list
        when the cell is missing, not a string, or blank.

    Raises:
        ValueError, SyntaxError: If the cell text is not a valid Python
            literal.
    """
    # pandas reads empty cells as NaN, which is a *truthy* float, so a plain
    # truthiness test would let it through to the parser and crash.
    if not isinstance(cell, str) or not cell.strip():
        return []
    # NOTE: this used to be eval(); the CSV is external data, and eval() would
    # execute any expression stored in it. literal_eval only accepts Python
    # literals (lists, strings, numbers, ...), which is all a stringified list
    # of URLs / table rows should contain.
    return ast.literal_eval(cell)


# Convert the image-URL lists and table data from strings back into lists.
df['images'] = df['images'].apply(_parse_list_cell)
df['tables'] = df['tables'].apply(_parse_list_cell)
# Count the images and tables in each article.
df['image_count'] = df['images'].apply(len)
df['table_count'] = df['tables'].apply(len)
# Basic data visualisation
# 1. Histogram of the number of images per article.
image_fig, image_ax = plt.subplots(figsize=(10, 6))
# Integer bin edges 0, 1, ..., max + 1 give one unit-wide bin per count.
image_bin_edges = range(0, df['image_count'].max() + 2)
image_ax.hist(
    df['image_count'],
    bins=image_bin_edges,
    color='skyblue',
    edgecolor='black',
)
image_ax.set_title('Distribution of Image Counts in Articles')
image_ax.set_xlabel('Number of Images')
image_ax.set_ylabel('Number of Articles')
image_ax.grid(True)
plt.show()
# 2. Histogram of the number of tables per article.
table_fig, table_ax = plt.subplots(figsize=(10, 6))
# Integer bin edges 0, 1, ..., max + 1 give one unit-wide bin per count.
table_bin_edges = range(0, df['table_count'].max() + 2)
table_ax.hist(
    df['table_count'],
    bins=table_bin_edges,
    color='lightgreen',
    edgecolor='black',
)
table_ax.set_title('Distribution of Table Counts in Articles')
table_ax.set_xlabel('Number of Tables')
table_ax.set_ylabel('Number of Articles')
table_ax.grid(True)
plt.show()
# 3. Scatter plot of image count against table count, one point per article.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
images_per_article = df['image_count']
tables_per_article = df['table_count']
scatter_ax.scatter(images_per_article, tables_per_article, color='coral')
scatter_ax.set_title('Relationship between Image Counts and Table Counts')
scatter_ax.set_xlabel('Number of Images')
scatter_ax.set_ylabel('Number of Tables')
scatter_ax.grid(True)
plt.show()