今天我們要來模擬基於柴比雪夫不等式的異常值檢測。首先,我們用 NumPy 產生一組隨機整數數列:
import pandas as pd
import numpy as np
if __name__ == "__main__":
    # Generate 25 random values in [0, 25) and truncate them to integers.
    data = np.random.rand(25) * 25
    data = data.astype("int32")
    print(data)
    # Example output (the generator is unseeded, so values differ per run):
    # [23 19 13 11 6 9 11 13 14 23 10 21 24 2 11 20 6 16 12 22 12 1 3 6 6]
接著用 Matplotlib 把這組數列畫成折線圖來觀察:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
if __name__ == "__main__":
    # Generate 25 random values in [0, 25) and truncate them to integers.
    data = np.random.rand(25) * 25
    data = data.astype("int32")
    print(data)
    # Example output (the generator is unseeded, so values differ per run):
    # [23 19 13 11 6 9 11 13 14 23 10 21 24 2 11 20 6 16 12 22 12 1 3 6 6]
    # Draw the sequence as a line chart (x axis is the element position).
    plt.plot(data)
    plt.show()
接著定義基於柴比雪夫不等式的異常值檢測副程式。它有兩個參數:一個是 Pandas 的 Series,另一個是標準差的倍數 n;凡是偏離數列平均值超過 n 倍標準差(高於或低於皆算)的資料都會被設為 NaN。
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
def chebyshev(data, n):
    """Mask values lying more than ``n`` standard deviations from the mean.

    Args:
        data: pandas Series of numeric values. It is left unmodified.
        n: Multiplier applied to the standard deviation; it sets the
            half-width of the accepted band around the mean.

    Returns:
        A new Series in which every value strictly below
        ``mean - n * std`` or strictly above ``mean + n * std`` is replaced
        by NaN. Values lying exactly on a bound are kept.
    """
    avg = data.mean()
    spread = n * data.std()
    # Both bounds are derived from the untouched input, so one combined mask
    # is equivalent to masking the low side first and the high side second.
    outliers = (data < avg - spread) | (data > avg + spread)
    # Series.mask builds a new Series (upcasting integer data to float where
    # NaN is needed) instead of writing NaN into the caller's object in place.
    return data.mask(outliers)
if __name__ == "__main__":
    # Generate 25 random values in [0, 25) and truncate them to integers.
    data = np.random.rand(25) * 25
    data = data.astype("int32")
    print(data)
    # Example output (the generator is unseeded, so values differ per run):
    # [23 19 13 11 6 9 11 13 14 23 10 21 24 2 11 20 6 16 12 22 12 1 3 6 6]

    # Wrap the array in a Series.
    data = pd.Series(data)
    # Replace everything more than 1 standard deviation above or below the
    # mean with NaN.
    data = chebyshev(data, 1)
    print(data)
    # Example output (captured from a different run than the array above):
    # 0      NaN
    # 1     13.0
    # 2      7.0
    # 3     11.0
    # 4     12.0
    # 5     11.0
    # 6     17.0
    # 7      5.0
    # 8     14.0
    # 9     15.0
    # 10     NaN
    # 11     NaN
    # 12     NaN
    # 13    12.0
    # 14     NaN
    # 15    15.0
    # 16    15.0
    # 17     NaN
    # 18     8.0
    # 19    17.0
    # 20     NaN
    # 21     NaN
    # 22    17.0
    # 23    16.0
    # 24     NaN
    # dtype: float64