[Collect Data 演變史]
最傳統的方式 : 用自己的電腦,本機定時下載資料
缺點 : 電腦要一直開著,記憶體久而久之 就會被占滿
Firebase 來做儲存,Heroku 設定定時執行 從ptx下載資料
缺點 : Heroku free dyno 有用完的時候,還要等下個月重啟
Firebase 來做儲存,GCP 定時 運行下載ptx資料的程式
缺點 : 當要把資料拿來分析、訓練時,讀取次數會超過 Firebase 免費方案的讀取額度上限
最後我採取的方式 : 在 GCP 上運行收集程式,並直接在 GCP 上以 JSON 檔案儲存資料
但是,如果以 GCP 排程的最小單位(1 分鐘)來執行,處理 Data 時會來不及寫入,所以後來改成每五分鐘收集一次
由於各個階段都有收集資料,這些資料就可以當作我們的 testing data。
而訓練資料是不斷增加的,所以我起初覺得應該準確率會上升,但... 請看 Day29 會公布我觀察到的結果。
MyCode - 從Firebase 把所有資料抓下來
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json
# Authenticate against Firebase with the local service-account key file.
cred = credentials.Certificate('./serviceAccount.json')
firebase_admin.initialize_app(cred)
db = firestore.client()

# Fetch every document of the "bus" collection.
path = "bus"
collection_ref = db.collection(path)
docs = collection_ref.get()

# Convert each document snapshot into a plain dict.
save = [doc.to_dict() for doc in docs]

# indent=4 keeps the dump readable instead of one long line.
with open('AllData.json','w') as f:
    json.dump(save, f, indent=4)
處理Data 三步驟 Run on GCP
收集資料
每五分鐘 下載一次資料寫入JSON - (1)
from hashlib import sha1
import hmac
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import base64
from requests import request
from pprint import pprint
import json
import smtplib
from email.mime.text import MIMEText
# PTX API credentials; left blank here and passed to Auth(app_id, app_key) below.
app_id = ''
app_key = ''
class Auth():
    """Build the HMAC-SHA1 authentication headers sent with a PTX request."""

    def __init__(self, app_id, app_key):
        """Store the application id and key used to sign requests."""
        self.app_id = app_id
        self.app_key = app_key

    def get_auth_header(self):
        """Return the request headers for one signed call.

        The signature is an HMAC-SHA1 over the text ``x-date: <date>``, keyed
        with ``app_key`` and base64 encoded.

        Returns:
            dict with the keys ``Authorization``, ``x-date`` and
            ``Accept-Encoding``.
        """
        # Format the timestamp once: the header must carry exactly the date
        # that was signed, so it is not recomputed for the returned dict.
        xdate = format_date_time(mktime(datetime.now().timetuple()))
        hashed = hmac.new(
            self.app_key.encode('utf8'),
            ('x-date: ' + xdate).encode('utf8'),
            sha1,
        )
        signature = base64.b64encode(hashed.digest()).decode()
        authorization = (
            'hmac username="' + self.app_id + '", '
            'algorithm="hmac-sha1", '
            'headers="x-date", '
            'signature="' + signature + '"'
        )
        return {
            'Authorization': authorization,
            'x-date': xdate,
            # Header name written without spaces so it is a valid field name.
            'Accept-Encoding': 'gzip',
        }
if __name__ == '__main__':
    import traceback

    # Download the current real-time positions for route 9018, direction 0.
    a = Auth(app_id, app_key)
    response = request('get', "https://ptx.transportdata.tw/MOTC/v2/Bus/RealTimeByFrequency/InterCity/9018?$filter=Direction%20eq%20'0'&$top=150&$format=JSON", headers= a.get_auth_header())
    # Bound to `payload` (not `str`) so the builtin is not shadowed.
    payload = str(response.content, 'utf-8')
    jsonValue = json.loads(payload)
    try:
        # Read the records accumulated so far: plate number -> list of samples.
        with open('/home/turningpoint1125/record2.json' , 'r') as reader:
            jf = json.load(reader)

        for item in jsonValue:
            mydict = {
                'GPSTime': item['GPSTime'],
                'Lat': item['BusPosition']['PositionLat'],
                'Lon': item['BusPosition']['PositionLon'],
                'Speed': item['Speed'],
            }
            print(mydict)
            # Append to the bus's list, creating it on first sight of the plate.
            jf.setdefault(item['PlateNumb'], []).append(mydict)

        # Write the merged records back to the same file.
        with open("/home/turningpoint1125/record2.json","w+") as dump_f:
            json.dump(jf, dump_f, indent=4)
    except Exception:
        # Keep the failure cause visible in the job output before notifying.
        traceback.print_exc()
        gmail_user = 'turningpoint1125@gmail.com'
        gmail_password = 'XXX' # your gmail password
        msg = MIMEText('Programmer No Life')
        msg['Subject'] = 'GCP運行情況'
        msg['From'] = 'turningpoint1125@gmail.com'
        msg['To'] = 'turningpoint1125@gmail.com'
        # The context manager closes the connection even if login/send fails.
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.ehlo()
            server.login(gmail_user, gmail_password)
            server.send_message(msg)
        print('Email sent!')
處理資料
計算 其他feature&label 並 儲存訓練資料 - (2)
#載入收集來的資料
import json
# Load the collected records: a dict whose values are lists of samples.
with open('/home/turningpoint1125/record2.json','r') as load_f:
    load_dict = json.load(load_f)

# Flatten every complete sample into four parallel columns.
GPSTime, Lat, Lon, Speed = [], [], [], []
for samples in load_dict.values():
    for sample in samples:
        # Only samples with exactly four fields are used.
        if len(sample) != 4:
            continue
        GPSTime.append(sample['GPSTime'])
        Lat.append(sample['Lat'])
        Lon.append(sample['Lon'])
        Speed.append(sample['Speed'])

# Number of samples kept (all four columns have this length).
train_number = len(GPSTime)
#計算 label
##前備工具(1):計算點點距離
import math
def getDistance(latA, lonA, latB, lonB):
    """Return the distance in meters between two latitude/longitude points.

    The computation corrects the spherical arc length with an ellipsoid
    flattening term derived from the equatorial and polar radii.

    Args:
        latA, lonA: latitude and longitude of the first point, in degrees.
        latB, lonB: latitude and longitude of the second point, in degrees.

    Returns:
        The distance in meters; 0.0 when both points coincide.
    """
    # Identical points would make the correction term divide by zero.
    if latA == latB and lonA == lonB:
        return 0.0

    ra = 6378140  # radius of equator: meter
    rb = 6356755  # radius of polar: meter
    flatten = (ra - rb) / ra  # Partial rate of the earth
    # change angle to radians
    radLatA = math.radians(latA)
    radLonA = math.radians(lonA)
    radLatB = math.radians(latB)
    radLonB = math.radians(lonB)

    pA = math.atan(rb / ra * math.tan(radLatA))
    pB = math.atan(rb / ra * math.tan(radLatB))
    cos_x = (math.sin(pA) * math.sin(pB)
             + math.cos(pA) * math.cos(pB) * math.cos(radLonA - radLonB))
    # Rounding can push the value slightly outside acos's domain; clamp it.
    if cos_x > 1.0:
        cos_x = 1.0
    elif cos_x < -1.0:
        cos_x = -1.0
    x = math.acos(cos_x)
    # A zero angle means the points coincide after rounding.
    if x == 0:
        return 0.0

    c1 = (math.sin(x) - x) * (math.sin(pA) + math.sin(pB))**2 / math.cos(x / 2)**2
    c2 = (math.sin(x) + x) * (math.sin(pA) - math.sin(pB))**2 / math.sin(x / 2)**2
    dr = flatten / 8 * (c1 - c2)
    distance = ra * (x + dr)
    return distance
##前備工具(2):載入路線經緯度資料
# Route points; each entry is indexed as [0] latitude, [1] longitude.
with open('/home/turningpoint1125/save.json' , 'r') as reader:
    route = json.load(reader)

# Distance lookup table, indexed in parallel with `route`.
with open('/home/turningpoint1125/disTable.json' , 'r') as reader:
    distoNTCU = json.load(reader)
#計算離target的大略距離(先)
# For every sample, find the nearest of the first 13 route points and record
# that point's table distance; -1 marks samples that are not near the route.
add_dis = []
for i in range(train_number):
    lat = float(Lat[i])
    lon = float(Lon[i])
    # Only route points closer than this threshold count (same unit as
    # getDistance's result). Named `nearest` so builtin `min` stays usable.
    nearest = 3000
    rd = -1  # not included in the training data unless a point is close enough
    for j in range(0, 13):
        # Distance is zero only when BOTH coordinates match the route point;
        # getDistance is skipped for that case.
        if lat == route[j][0] and lon == route[j][1]:
            dis = 0
        else:
            dis = getDistance(lat, lon, route[j][0], route[j][1])
        if dis < nearest:
            rd = distoNTCU[j]
            nearest = dis
    add_dis.append(rd)
#print(add_dis)
#print(len(add_dis))
#計算距離TARGET 所花時間(label)(後)
from datetime import datetime
#from dateutil.parser import parse
# add_dis value that marks a sample as "arrived"; presumably the disTable
# entry of the target stop -- TODO confirm against disTable.json.
TARGET_DIS = "26.136332967166602"


def _stamp_fields(stamp):
    """Return (year, month, day, hour, minute) as ints sliced from a GPSTime string."""
    return (
        int(stamp[0:4]),
        int(stamp[5:7]),
        int(stamp[8:10]),
        int(stamp[11:13]),
        int(stamp[14:16]),
    )


# ctime maps a sample index to its label: minutes left until the arrival
# sample. The arrival sample itself is labelled "0".
ctime = {}
for i in range(train_number):
    if str(add_dis[i]) != TARGET_DIS:
        continue
    ctime[i] = "0"

    # Walk backwards from the arrival sample while the samples form an
    # unbroken chain. `j >= 0` stops at the start of the data instead of
    # wrapping around to the end of the lists through a negative index.
    j = i - 1
    while j >= 0 and str(add_dis[j]) != TARGET_DIS:
        # "-1" marks a sample that is not near the route: the chain ends.
        if str(add_dis[j]) == "-1":
            break
        cur = _stamp_fields(GPSTime[j])
        nxt = _stamp_fields(GPSTime[j + 1])
        # Consecutive samples must share year/month/day/hour and be exactly
        # five minutes apart; otherwise the chain is broken.
        if cur[:4] != nxt[:4] or nxt[4] - cur[4] != 5:
            break
        # Label: arrival minute minus this sample's minute.
        ctime[j] = int(GPSTime[i][14:16]) - cur[4]
        j = j - 1
#print(ctime)
import time
import csv

# One CSV per day, named after today's date.
today = time.strftime("%Y-%m-%d",time.localtime())
filename = '/home/turningpoint1125/'+today+'.csv'

# Save the training data as a table; only samples that received a label in
# ctime are written.
with open(filename,'w',newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['GPSTime','Lat','Lon','disTO','Speed','Time'])
    writer.writerows(
        [GPSTime[i], Lat[i], Lon[i], add_dis[i], Speed[i], ctime[i]]
        for i in range(train_number)
        if i in ctime
    )
print('today data saved')
紀錄參數,準確率,並寄信通知 11:30 pm - (3)
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing # 標準化1
from sklearn.preprocessing import MinMaxScaler # 標準化2
# Scaler for the optional min-max normalisation; the fit below uses raw features.
scaler = MinMaxScaler()
import time

# Load today's training table.
today = time.strftime("%Y-%m-%d",time.localtime())
filename = '/home/turningpoint1125/'+today+'.csv'
df = pd.read_csv(filename)
print('資料數量:',len(df))

# Features are every column except the label and the raw timestamp.
features = df.drop(['Time','GPSTime'],axis='columns')
label = df.Time

# Fit a linear regression and report its score on the same data it was fit on.
reg = linear_model.LinearRegression()
reg.fit(features, label)
print('R^2:',reg.score(features, label))
print('weight:',reg.coef_ )
print('bias',reg.intercept_ )
import csv
# Append today's model parameters to the running log: the four feature
# weights, the intercept, then the R^2 score.
with open('/home/turningpoint1125/daily_log.csv','a',encoding='utf8',newline='') as fd :
    writer = csv.writer(fd)
    r2 = reg.score(df.drop(['Time','GPSTime'],axis='columns'),df.Time)
    # Convert element by element: float() on a one-element array slice is
    # deprecated by NumPy.
    weights = [float(w) for w in reg.coef_[:4]]
    writer.writerow(weights + [float(reg.intercept_), float(r2)])
import smtplib
from email.mime.text import MIMEText
gmail_user = '@gmail.com'
gmail_password = '' # your gmail password

# Each weight is formatted from a one-element slice, so it appears in the
# mail body wrapped in square brackets.
LatW = str(reg.coef_[0:1])
LonW = str(reg.coef_[1:2])
DisW = str(reg.coef_[2:3])
SpeW = str(reg.coef_[3:4])
context = 'Lat: '+ LatW + '\n' + 'Lon: '+LonW+'\n'+'Dis: '+DisW+'\n'+'Speed: '+SpeW+'\n'+'Bias: '+str(reg.intercept_)+'\n'+'R^2: '+str(reg.score(df.drop(['Time','GPSTime'],axis='columns'),df.Time))+'\n'

msg = MIMEText(context)
msg['Subject'] = 'Good Night!'
msg['From'] = '@gmail.com'
msg['To'] = '@gmail.com'

# The context manager closes the connection even when login or sending fails.
with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
    server.ehlo()
    server.login(gmail_user, gmail_password)
    server.send_message(msg)
print('Email sent!')