Day26 Stepstone Posting 達石職缺

第 11 屆 iThome 鐵人賽

DAY 26

AI & Data

Hands on Data Cleaning and Scraping 資料清理與爬蟲實作系列第 26 篇

11th鐵人賽 pandas data visualization python data cleaning

kyt

2019-09-27 07:07:37

2509 瀏覽

分享至

# 載入所需套件 import the packages we need
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py 

import warnings  # imported only to silence warning output below
# Ignore every warning for the rest of the run so the printed output stays
# clean. NOTE(review): this also hides deprecation notices raised by the
# libraries imported above -- narrow the filter if those should stay visible.
warnings.filterwarnings("ignore")

# Load the postings table that was written to disk in the previous step.
df = pd.read_csv(filepath_or_buffer='df.csv')
# Print the column names, dtypes and non-null counts.
df.info()
# Peek at the first three rows.
df.head(n=3)

# Plot how the postings are spread across locations, to see which places
# besides Berlin also have openings.

# Literal substrings rewritten, in this order, before tokenising.  They are
# applied with regex=False so that '.' and '/' always mean themselves,
# independent of the pandas version's default for `regex`.
location_rewrites = [
    ('Munich', 'München'),  # unify the English and German spelling
    (' a. M.', ' '),        # spellings of the "am Main" suffix
    ('a. M.', ' '),
    (' am Main', ' '),
    (' a.M.', ' '),
    ('/M.', ' '),
    # NOTE(review): ' in ' is removed without leaving a space, so "A in B"
    # becomes "AB" -- kept exactly as before; confirm this is intended.
    (' in ', ''),
    (' in', ''),
    (' bei', ''),
]
locations = df.Location
for old, new in location_rewrites:
    locations = locations.str.replace(old, new, regex=False)

# Drop the filler words only when they stand alone, so that place names
# which merely contain them (e.g. "Dortmund") are left intact.
locations = locations.str.replace(r'\b(?:Alle|oder|und)\b', ' ', regex=True)

# The remaining separators become whitespace ...
for separator in ('/', ','):
    locations = locations.str.replace(separator, ' ', regex=False)

# ... and str.split() without arguments splits on any run of whitespace and
# ignores leading/trailing whitespace, so re-joining with ',' yields a clean
# comma-separated list without empty entries.  Missing values stay missing.
df['Location'] = locations.str.split().str.join(',')
df.Location.head(3)

# Flatten all rows into one array of place names; rows without a location
# are skipped and a row that cleaned down to '' contributes nothing.
all_item_ls = np.array([
    place
    for entry in df.Location.dropna()
    for place in entry.split(',')
    if place
])
# Keep the 20 most frequent places for the chart.
items = pd.Series(all_item_ls).value_counts().head(20)
plt.figure(figsize=(18, 6))
items.plot(kind='bar')
plt.xticks(rotation=45, fontsize=12)

# Count the words used in the job-posting descriptions.

# One translation pass: the punctuation characters ( ) , . * & are deleted
# and '-' becomes a space.
description_table = str.maketrans({**dict.fromkeys('(),.*&'), '-': ' '})
# str.split() without arguments splits on any run of whitespace (including
# tabs and newlines) and ignores leading/trailing whitespace, so re-joining
# with ',' gives a comma-separated word list without empty entries.
# Missing descriptions stay missing.
df['Description'] = (
    df.Description
    .str.translate(description_table)
    .str.split()
    .str.join(',')
)
df.Description.head(3)

# Flatten all rows into one array of words; rows without a description are
# skipped and a row that cleaned down to '' contributes nothing.
all_words = np.array([
    word
    for text in df.Description.dropna()
    for word in text.split(',')
    if word
])
words = pd.Series(all_words).value_counts()

# Most frequent words.
words.head(15)

# Least frequent words.
words.tail(10)

# Count the words used in the job titles.

# One translation pass: the characters ( ) . / * & are deleted, while the
# comma, the hyphen and the en dash become spaces.
jobs_table = str.maketrans({
    **dict.fromkeys('()./*&'),
    **dict.fromkeys(',-–', ' '),
})
# str.split() without arguments splits on any run of whitespace and ignores
# leading/trailing whitespace, so re-joining with ',' gives a comma-separated
# word list without empty entries.  Missing titles stay missing.
df['Jobs'] = (
    df.Jobs
    .str.translate(jobs_table)
    .str.split()
    .str.join(',')
)
df.Jobs.head(3)

# Flatten all rows into one array of words; rows without a title are skipped
# and a row that cleaned down to '' contributes nothing.
all_jobs = np.array([
    word
    for title in df.Jobs.dropna()
    for word in title.split(',')
    if word
])
jobs = pd.Series(all_jobs).value_counts()