iT邦幫忙

第 11 屆 iThome 鐵人賽

DAY 28
0
AI & Data

Predicting Inter Bus Arrival Times 系列 第 28

Day 28 Talk about Data

  • 分享至 

  • xImage
  •  

[Collect Data 演變史]

最傳統的方式 : 用自己的電腦,本機定時下載資料
缺點 : 電腦要一直開著,記憶體久而久之 就會被占滿

Firebase 來做儲存,Heroku 設定定時執行 從ptx下載資料
缺點 : Heroku free dyno 有用完的時候,還要等下個月重啟

Firebase 來做儲存,GCP 定時 運行下載ptx資料的程式
缺點 : 當要拿資料來分析、訓練時,讀取次數會超過 Firebase 的免費額度

最後我採取的方式 : 程式在 GCP 上運行,資料也直接在 GCP 上以 JSON 檔案儲存
但是如果以 GCP 排程的最小單位(1 分鐘)來執行,處理 Data 時會來不及寫入,所以後來改成每五分鐘收集一次

那各個階段 都有收集資料,那就可以當作我們的 testing data。
而訓練資料是不斷增加的,所以我起初覺得應該準確率會上升,但... 請看 Day29 會公布我觀察到的結果。


MyCode - 從Firebase 把所有資料抓下來

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json

# Authenticate with the local service-account key and open a Firestore client.
cred = credentials.Certificate('./serviceAccount.json')
firebase_admin.initialize_app(cred)
db = firestore.client()

# Pull every document of the "bus" collection and keep each one as a plain dict.
path = "bus"
docs = db.collection(path).get()
save = [snapshot.to_dict() for snapshot in docs]

import json

# indent=4 pretty-prints the dump; without it everything ends up on one line.
serialized = json.dumps(save, indent=4)
with open('AllData.json', 'w') as out:
    out.write(serialized)

處理Data 三步驟 Run on GCP

收集資料 每五分鐘 下載一次資料寫入JSON - (1)

from hashlib import sha1
import hmac
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import base64
from requests import request
from pprint import pprint
import json

import smtplib
from email.mime.text import MIMEText

# API credentials handed to Auth(app_id, app_key) in the __main__ section below.
# They are empty placeholders here and must be filled in before running.
app_id = ''
app_key = ''

class Auth():
    """Build HMAC-SHA1 signed request headers from an app id / app key pair."""

    def __init__(self, app_id, app_key):
        """Store the credentials used to sign each request.

        Args:
            app_id: Identifier sent as the ``username`` of the signature.
            app_key: Secret key used for the HMAC-SHA1 digest.
        """
        self.app_id = app_id
        self.app_key = app_key

    def get_auth_header(self):
        """Return the headers for one signed request.

        The signature covers the string ``"x-date: <timestamp>"``, so the
        ``x-date`` header must carry exactly the timestamp that was signed.

        Returns:
            dict with ``Authorization``, ``x-date`` and ``Accept-Encoding``.
        """
        # Compute the timestamp once and reuse it: formatting "now" a second
        # time for the header can yield a different second than the signed one.
        xdate = format_date_time(mktime(datetime.now().timetuple()))
        hashed = hmac.new(
            self.app_key.encode('utf8'),
            ('x-date: ' + xdate).encode('utf8'),
            sha1,
        )
        signature = base64.b64encode(hashed.digest()).decode()

        authorization = (
            'hmac username="' + self.app_id + '", '
            'algorithm="hmac-sha1", '
            'headers="x-date", '
            'signature="' + signature + '"'
        )
        return {
            'Authorization': authorization,
            'x-date': xdate,
            # Header field names may not contain spaces; the previous
            # 'Accept - Encoding' key was not a valid Accept-Encoding header.
            'Accept-Encoding': 'gzip',
        }


if __name__ == '__main__':
    # Collector entry point: fetch the current bus positions, append them to
    # the per-plate history stored in record2.json, and e-mail a notification
    # if any step fails.
    record_path = '/home/turningpoint1125/record2.json'
    try:
        # The fetch is inside the try block so that network/API failures are
        # reported by e-mail as well, not only file-handling failures.
        a = Auth(app_id, app_key)
        response = request('get', "https://ptx.transportdata.tw/MOTC/v2/Bus/RealTimeByFrequency/InterCity/9018?$filter=Direction%20eq%20'0'&$top=150&$format=JSON", headers=a.get_auth_header())

        # Decode without rebinding the builtin name `str`.
        payload = response.content.decode('utf-8')
        jsonValue = json.loads(payload)

        # Load the records collected so far (a dict keyed by plate number).
        with open(record_path, 'r') as reader:
            jf = json.load(reader)

        for item in jsonValue:
            mydict = {
                'GPSTime': item['GPSTime'],
                'Lat': item['BusPosition']['PositionLat'],
                'Lon': item['BusPosition']['PositionLon'],
                'Speed': item['Speed'],
            }
            print(mydict)
            # Create the per-plate list on first sight, then append.
            jf.setdefault(item['PlateNumb'], []).append(mydict)

        with open(record_path, 'w') as dump_f:
            json.dump(jf, dump_f, indent=4)
    except Exception as exc:
        # `except Exception` (not a bare except) lets SystemExit and
        # KeyboardInterrupt propagate; the error is printed so the run log
        # shows what went wrong.
        print('Collection run failed:', repr(exc))

        gmail_user = 'turningpoint1125@gmail.com'
        # Placeholder value; do not commit a real password in source.
        gmail_password = 'XXX' # your gmail password

        msg = MIMEText('Programmer No Life')
        msg['Subject'] = 'GCP運行情況'
        msg['From'] = 'turningpoint1125@gmail.com'
        msg['To'] = 'turningpoint1125@gmail.com'

        # The context manager closes the SMTP session even if login/send fails.
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.ehlo()
            server.login(gmail_user, gmail_password)
            server.send_message(msg)

        print('Email sent!')

處理資料 計算 其他feature&label 並 儲存訓練資料 - (2)

# Load the collected records: a dict mapping each key to a list of samples.
import json

with open('/home/turningpoint1125/record2.json', 'r') as load_f:
    load_dict = json.load(load_f)

# Flatten the samples into parallel column lists. These columns are the inputs
# for the derived features (distance, time) computed further down.
GPSTime = []
Lat = []
Lon = []
Speed = []

for samples in load_dict.values():
    for sample in samples:
        # Only samples with exactly four fields are kept.
        if len(sample) != 4:
            continue
        GPSTime.append(sample['GPSTime'])
        Lat.append(sample['Lat'])
        Lon.append(sample['Lon'])
        Speed.append(sample['Speed'])

# Number of samples that made it into the column lists.
train_number = len(GPSTime)
        

# Compute the label
## Prerequisite (1): distance between two points
import math
def getDistance(latA, lonA, latB, lonB):
    """Return the distance in metres between two latitude/longitude points.

    Uses an ellipsoidal correction on top of the great-circle angle, with the
    equatorial and polar radii given in metres.

    Args:
        latA, lonA: Latitude and longitude of the first point, in degrees.
        latB, lonB: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in metres; 0.0 when both points are identical.
    """
    # Identical points would otherwise divide by sin(0)**2 below.
    if latA == latB and lonA == lonB:
        return 0.0

    ra = 6378140  # radius of equator: meter
    rb = 6356755  # radius of polar: meter
    flatten = (ra - rb) / ra  # Partial rate of the earth
    # change angle to radians
    radLatA = math.radians(latA)
    radLonA = math.radians(lonA)
    radLatB = math.radians(latB)
    radLonB = math.radians(lonB)
    pA = math.atan(rb / ra * math.tan(radLatA))
    pB = math.atan(rb / ra * math.tan(radLatB))
    cos_x = math.sin(pA) * math.sin(pB) + math.cos(pA) * math.cos(pB) * math.cos(radLonA - radLonB)
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make math.acos raise a domain error for near-identical points.
    cos_x = max(-1.0, min(1.0, cos_x))
    x = math.acos(cos_x)
    if x == 0:
        # Points that differ by less than floating-point resolution.
        return 0.0
    c1 = (math.sin(x) - x) * (math.sin(pA) + math.sin(pB))**2 / math.cos(x / 2)**2
    c2 = (math.sin(x) + x) * (math.sin(pA) - math.sin(pB))**2 / math.sin(x / 2)**2
    dr = flatten / 8 * (c1 - c2)
    distance = ra * (x + dr)
    return distance
## Prerequisite (2): load the route points (each entry is indexed as [lat, lon]).
with open('/home/turningpoint1125/save.json', 'r') as reader:
    route = json.loads(reader.read())
## Prerequisite (3): load the distance lookup table, indexed like `route`.
with open('/home/turningpoint1125/disTable.json', 'r') as reader:
    distoNTCU = json.loads(reader.read())

# First pass: for each sample, find the nearest of the first 13 route points
# and record that point's table distance as the sample's approximate distance
# to the target.
add_dis = []
for i in range(train_number):
    sample_lat = float(Lat[i])
    sample_lon = float(Lon[i])
    # Samples at least 3000 (getDistance units, i.e. metres) away from every
    # route point keep rd = -1 and are excluded from the training data later.
    nearest = 3000
    rd = -1
    for j in range(0, 13):
        # Skip getDistance only when BOTH coordinates match the route point.
        # The previous `and` test treated a sample that shared just its
        # latitude or just its longitude with a route point as distance 0.
        if sample_lat != route[j][0] or sample_lon != route[j][1]:
            dis = getDistance(sample_lat, sample_lon, route[j][0], route[j][1])
        else:
            dis = 0
        if dis < nearest:
            rd = distoNTCU[j]
            nearest = dis
    add_dis.append(rd)
# Second pass: compute the label, i.e. the time taken to reach the target.
#
# For every sample i whose add_dis value equals the magic distance
# 26.136332967166602 (presumably the distance-table entry of the target stop;
# verify against disTable.json), the label is "0". The loop then walks
# backwards (j = i-1, i-2, ...) over the preceding samples and labels each one
# with the number of minutes between it and sample i, as long as the samples
# form an unbroken chain of 5-minute steps within the same hour.
from datetime import datetime
#from dateutil.parser import parse
ctime = {}  # sample index -> label (minutes until arrival); only labelled samples appear
for i in range(train_number):
    
    flag = 0
    # Distances are compared as strings against the magic value.
    if  str(add_dis[i])=="26.136332967166602":
        ctime[i] = "0"    
        flag = 1
        j = i - 1        
    # `j` is only evaluated when flag == 1 (short-circuit), i.e. after it has
    # been set for this arrival sample.
    while flag == 1 and str(add_dis[j])!="26.136332967166602":
        # Stop at samples that were excluded (-1) in the distance pass.
        # NOTE(review): `j==1` stops before index 1 is labelled, and when
        # i == 0 the walk starts at j == -1, which indexes from the end of the
        # lists -- confirm whether `j < 0` was intended.
        if j==1 or str(add_dis[j])=="-1":
            break

        # Split the timestamp of sample j by fixed character positions
        # (assumes a "YYYY-MM-DDTHH:MM..." layout -- TODO confirm).
        year = GPSTime[j][0:4]
        year = int(year)
        month = GPSTime[j][5:7]
        month = int(month)
        day  = GPSTime[j][8:10]
        day = int(day)
        hour = GPSTime[j][11:13]
        hour = int(hour)
        minu = GPSTime[j][14:16]
        minu = int(minu)
        
        # Same split for the following sample j+1.
        year2 = GPSTime[j+1][0:4]
        year2 = int(year2)
        month2 = GPSTime[j+1][5:7]
        month2 = int(month2)
        day2  = GPSTime[j+1][8:10]
        day2  = int(day2)
        hour2 = GPSTime[j+1][11:13]
        hour2 = int(hour2)
        min2 = GPSTime[j+1][14:16]
        min2 = int(min2)        
        # Samples j and j+1 must share year/month/day/hour and be exactly five
        # minutes apart; otherwise the chain is broken and the walk stops.
        # NOTE(review): because the hour must match, a chain never crosses an
        # hour boundary (e.g. :55 -> :00) -- confirm this is acceptable.
        if (year2-year)==0 and (month2-month)==0 and (day2-day)==0 and (hour2-hour)==0 and (min2-minu)==5:
            # Minute field of the arrival sample i.
            X=GPSTime[i][14:16]
            X=int(X)
            
            # Label = arrival minute minus this sample's minute. Every step of
            # the chain so far stayed within the same hour, so the plain
            # minute difference is the elapsed time.
            ctime[j] = X - minu
        else:
            break
        j = j - 1            
#print(ctime)

import time
import csv

# The output file is named after today's local date.
today = time.strftime("%Y-%m-%d",time.localtime())
filename = '/home/turningpoint1125/'+today+'.csv'

# Write the training table: a header row, then one row per labelled sample
# (samples without an entry in ctime are left out).
with open(filename,'w',newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['GPSTime','Lat','Lon','disTO','Speed','Time'])
    writer.writerows(
        [GPSTime[idx], Lat[idx], Lon[idx], add_dis[idx], Speed[idx], ctime[idx]]
        for idx in range(train_number)
        if idx in ctime
    )
print('today data saved')

紀錄參數,準確率,並寄信通知 11:30 pm - (3)

import pandas as pd
from sklearn import linear_model

# Imported for the optional feature-scaling experiments; the active code path
# below fits on the raw features and does not use them.
from sklearn import preprocessing                # scaling option 1
from sklearn.preprocessing import MinMaxScaler   # scaling option 2
scaler = MinMaxScaler()                          # scaling option 2

import time
today = time.strftime("%Y-%m-%d",time.localtime())
filename = '/home/turningpoint1125/'+today+'.csv'
df = pd.read_csv(filename)
print('資料數量:',len(df))

# Build the feature frame and target once instead of re-dropping the columns
# for every fit/score call.
features = df.drop(['Time','GPSTime'],axis='columns')
target = df.Time

reg = linear_model.LinearRegression()
reg.fit(features, target)
# Score once and reuse the value for the console, the log and the e-mail.
r2 = reg.score(features, target)

print('R^2:', r2)
print('weight:',reg.coef_ )
print('bias',reg.intercept_ )

# Append today's four coefficients, intercept and R^2 to the running log.
import csv
with open('/home/turningpoint1125/daily_log.csv','a',encoding='utf8',newline='') as fd :
    writer = csv.writer(fd)
    # Index single elements: calling float() on a 1-element array slice such
    # as coef_[0:1] is deprecated in recent NumPy releases.
    writer.writerow([
        float(reg.coef_[0]),
        float(reg.coef_[1]),
        float(reg.coef_[2]),
        float(reg.coef_[3]),
        float(reg.intercept_),
        float(r2),
    ])

import smtplib
from email.mime.text import MIMEText
gmail_user = '@gmail.com'
gmail_password = '' # your gmail password

# The e-mail body keeps the bracketed array formatting of the slices.
LatW =  str(reg.coef_[0:1])
LonW =  str(reg.coef_[1:2])
DisW =  str(reg.coef_[2:3])
SpeW =  str(reg.coef_[3:4])
context = 'Lat: '+ LatW + '\n' + 'Lon: '+LonW+'\n'+'Dis: '+DisW+'\n'+'Speed: '+SpeW+'\n'+'Bias: '+str(reg.intercept_)+'\n'+'R^2: '+str(r2)+'\n'

msg = MIMEText(context)
msg['Subject'] = 'Good Night!'
msg['From'] = '@gmail.com'
msg['To'] = '@gmail.com'

# The context manager closes the SMTP session even if login or send fails.
with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
    server.ehlo()
    server.login(gmail_user, gmail_password)
    server.send_message(msg)

print('Email sent!')

上一篇
Day 27 sklearn
下一篇
Day 29 數據/圖表 分析
系列文
Predicting Inter Bus Arrival Times 30
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言