If you want to predict the view count of your own article, go to https://ithome-predict-browse-count.herokuapp.com/index/<你的文章的網頁連結>, for example https://ithome-predict-browse-count.herokuapp.com/index/https://ithelp.ithome.com.tw/articles/10195825/.
Although the 30 days are already written, I promised earlier to predict the view counts of ithome Ironman articles, so here is one more post for everyone to play with.
import os
import pandas as pd
import jieba
jieba.set_dictionary('dict.txt.big')
with open('stops.txt', 'r', encoding='utf8') as f:
    stops = f.read().split('\n')
    
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from datetime import datetime
starttime = datetime.now()
I have crawled all of the posts for 15 consecutive days and saved them as CSV files.
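For reference, here is a minimal sketch of the schema each daily CSV is assumed to follow; the column names are inferred from the preprocessing and crawler code below, so treat this as an assumption rather than the crawler's exact output.
# columns assumed to be present in every crawled CSV (inferred from the code that follows)
expected_columns = ['group', 'corpus_title', 'corpus_day', 'article_title', 'browse_count',
                    'like_count', 'writer_name', 'writer_url', 'publish_datetime',
                    'crawled_date', 'text_content', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']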
files = [os.path.join('articles', i) for i in os.listdir("articles")]
df = pd.DataFrame()
for f in files:
    df_part = pd.read_csv(f)
    df = pd.concat([df, df_part], ignore_index=True)
df["publish_datetime"] = df["publish_datetime"].apply(pd.to_datetime)
df["crawled_date"] = df["crawled_date"].apply(pd.to_datetime)
for h in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
    df.loc[pd.notnull(df[h]), h] = df.loc[pd.notnull(df[h]), h].apply(eval)
print(len(df))
print(datetime.now()-starttime)
LabelEncoder converts categorical data into integer codes. When a sample carries multiple labels, MultiLabelBinarizer one-hot encodes them automatically; here every token is treated as a label. Because the vocabulary is huge, one-hot encoding could run out of memory, so I stick to index encoding instead, which is why the getidxs function exists. I also convert the time from publishing until the article was crawled into hours.
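As a minimal illustration with toy data (not the actual corpus), this is the difference between the two encoders, and why a full one-hot matrix would be too large here:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
le = LabelEncoder()
print(le.fit_transform(['Software Development', 'AI', 'AI']))  # [1 0 0]: each category becomes one integer code
mlb = MultiLabelBinarizer()
# each sample is a list of tokens; the output is an (n_samples, vocab_size) one-hot matrix
print(mlb.fit_transform([['deep', 'learning'], ['learning']]))  # [[1 1] [0 1]]
# with tens of thousands of distinct tokens that matrix would exhaust memory,
# so below every token is instead mapped to its index in mlb.classes_ (the getidxs function)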
le_group = LabelEncoder()
le_group.fit(df['group'])
le_corpus_day = LabelEncoder()
max_corpus_day = df['corpus_day'].max()
le_corpus_day.fit(df['corpus_day'])
mlb = MultiLabelBinarizer()
term_idx_mapping = {}
def preprocess(df, train=True):
    df.fillna('None', inplace=True)
    df.loc[df['corpus_day'] > max_corpus_day, 'corpus_day'] = max_corpus_day  ## clip unseen corpus_day values (for prediction on new data)
    def preprocess_applyfun(row):
        # combine all headers
        header = ""
        for h in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if row[h] != "None":
                header += " ".join(row[h]) + "\n"
        # Tokenize
        row['article_title'] = [w for w in jieba.cut(row['article_title'], cut_all=True)]
        row['corpus_title'] = [w for w in jieba.cut(row['corpus_title'], cut_all=True)]
        row['header'] = [w for w in jieba.cut(header, cut_all=True)]
        row['text_content'] = [w for w in jieba.cut(row['text_content'], cut_all=True) if w not in stops]
        # cal_publish_hours
        timedelta = row['crawled_date'] - row['publish_datetime']
        row['publish_hours'] = timedelta.days * 24 + timedelta.seconds // 3600
        # categorize group and corpus_day
        row['group'] = le_group.transform([row['group']])[0]
        row['corpus_day'] = le_corpus_day.transform([row['corpus_day']])[0]
        return row
    df = df.apply(preprocess_applyfun, axis=1)
    if train:
        mlb.fit(np.hstack([df['article_title'], df['corpus_title'], df['header'], df['text_content']]))
        for idx, term in enumerate(mlb.classes_):
            term_idx_mapping[term] = idx
            
    # Serialize tokens
    def getidxs(terms):
        idxs = []
        for term in terms:
            if term in term_idx_mapping.keys():
                idx = term_idx_mapping.get(term)
                idxs.append(idx)
        return idxs
    df['article_title'] = df['article_title'].apply(getidxs)
    df['corpus_title'] = df['corpus_title'].apply(getidxs)
    df['header'] = df['header'].apply(getidxs)
    df['text_content'] = df['text_content'].apply(getidxs)
    return df
df = preprocess(df)
print(datetime.now()-starttime)
df[['group', 'corpus_title', 'corpus_day', 'article_title', 'browse_count',
     'text_content', 'header', 'publish_hours']]

#EXTRACT DEVELOPMENT TEST SET
from sklearn.model_selection import train_test_split
dtrain, dvalid = train_test_split(df, random_state=233, train_size=0.90)
print(dtrain.shape)
print(dvalid.shape)
# (38325, 20)
# (4259, 20)
The four columns article_title, corpus_title, header, and text_content have already been converted into index values, but their lengths differ, so they are zero-padded to equal length; pad_sequences below handles this. MAX_TEXT, MAX_GROUP, and MAX_CORPUS_DAY tell the Keras Input layers how many categories there are in total.
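A quick look at what pad_sequences does, using toy index lists rather than the real data:
from keras.preprocessing.sequence import pad_sequences
print(pad_sequences([[5, 2], [7, 1, 9, 3]], maxlen=3))
# [[0 5 2]
#  [1 9 3]]  -- shorter sequences are left-padded with 0, longer ones truncated from the front
# padding uses index 0, which is presumably why MAX_TEXT below is len(term_idx_mapping) + 1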
#EMBEDDINGS MAX VALUE
# print(df['article_title'].apply(len).max())
# print(df['corpus_title'].apply(len).max())
# print(df['header'].apply(len).max())
# print(df['text_content'].apply(len).max())
MAX_ARTICLE_TITLE_SEQ = 60 #60
MAX_CORPUS_TITLE_SEQ = 20 #22
MAX_HEADER_SEQ = 250 #260
MAX_TEXT_CONTENT_SEQ = 500 #1195
MAX_TEXT = len(term_idx_mapping) +1
MAX_GROUP = len(le_group.classes_) 
MAX_CORPUS_DAY = len(le_corpus_day.classes_)
print(MAX_ARTICLE_TITLE_SEQ)  # 60
print(MAX_CORPUS_TITLE_SEQ)  # 20
print(MAX_HEADER_SEQ)  # 250
print(MAX_TEXT_CONTENT_SEQ)  # 500
print(MAX_TEXT)  # 67647
print(MAX_GROUP)  # 7
print(MAX_CORPUS_DAY)  # 34
print(datetime.now() - starttime)  # 0:16:17.879698
#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences
def get_keras_data(dataset):
    X = {
        "seq_article_title":pad_sequences(dataset['article_title'], maxlen=MAX_ARTICLE_TITLE_SEQ),
        "seq_corpus_title":pad_sequences(dataset['corpus_title'], maxlen=MAX_CORPUS_TITLE_SEQ),
        "seq_header":pad_sequences(dataset['header'], maxlen=MAX_HEADER_SEQ),
        "seq_text_content":pad_sequences(dataset['text_content'], maxlen=MAX_TEXT_CONTENT_SEQ),
        'group': np.array(dataset['group']),
        'corpus_day': np.array(dataset['corpus_day']),
        'publish_hours': np.array(dataset['publish_hours']),
    }
    return X
X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)
print(datetime.now() - starttime)
This model uses Keras's multi-input functionality, and every text-related input goes through an RNN.
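One detail worth noting: because every Input is given a name, model.fit can take a dict whose keys match those names, which is exactly how the output of get_keras_data is consumed. A minimal two-input sketch (toy shapes, unrelated to the real model):
from keras.layers import Input, Dense, concatenate
from keras.models import Model
import numpy as np
a = Input(shape=[3], name="a")
b = Input(shape=[1], name="b")
toy = Model([a, b], Dense(1)(concatenate([a, b])))
toy.compile(loss="mse", optimizer="adam")
# the dict keys "a" and "b" are matched against the Input names
toy.fit({"a": np.zeros((8, 3)), "b": np.ones((8, 1))}, np.zeros((8, 1)), epochs=1, verbose=0)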
#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, \
    Activation, concatenate, GRU, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers
# RMSLE metric, defined as an alternative to MSE/MAE (not wired into model.compile below)
def rmsle_cust(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))
def get_model():
    #params
    dr = 0.20
    
    #Inputs
    seq_corpus_title = Input(shape=[X_train["seq_corpus_title"].shape[1]], name="seq_corpus_title")
    seq_article_title = Input(shape=[X_train["seq_article_title"].shape[1]], name="seq_article_title")
    seq_header = Input(shape=[X_train["seq_header"].shape[1]], name="seq_header")
    seq_text_content = Input(shape=[X_train["seq_text_content"].shape[1]], name="seq_text_content")
    group = Input(shape=[1], name="group")
    corpus_day = Input(shape=[1], name="corpus_day")
    publish_hours = Input(shape=[1], name="publish_hours")
    
    #Embeddings layers
    emb_corpus_title = Embedding(MAX_TEXT, 10)(seq_corpus_title)
    emb_article_title = Embedding(MAX_TEXT, 10)(seq_article_title)
    emb_header = Embedding(MAX_TEXT, 10)(seq_header)
    emb_text_content = Embedding(MAX_TEXT, 100)(seq_text_content)
    
    emb_group = Embedding(MAX_GROUP, 5)(group)
    emb_corpus_day = Embedding(MAX_CORPUS_DAY, 10)(corpus_day)
    
    rnn_layer1 = GRU(8) (emb_corpus_title)
    rnn_layer2 = GRU(8) (emb_article_title)
    rnn_layer3 = GRU(8) (emb_header)
    rnn_layer4 = GRU(16) (emb_text_content)
    
    #main layer
    main_l = concatenate([
        rnn_layer1,
        rnn_layer2,
        rnn_layer3,
        rnn_layer4,
        Flatten() (emb_group),
        Flatten() (emb_corpus_day),
        publish_hours
    ])
    main_l = Dropout(dr)(Dense(512,activation='relu') (main_l))
    main_l = Dropout(dr)(Dense(64,activation='relu') (main_l))
    main_l = Dropout(dr)(Dense(32,activation='relu') (main_l))
    
    #output
    output = Dense(1, activation="linear") (main_l)
    
    #model
    model = Model([ seq_corpus_title, seq_article_title, seq_header, 
                   seq_text_content, group, corpus_day, publish_hours], output)
    
    #optimizer = optimizers.RMSprop()
    optimizer = optimizers.Adam()
    model.compile(loss="mse", optimizer=optimizer, metrics=["mae"])
    return model
model = get_model()
model.summary()
_______________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                  
===============================================================================================
seq_corpus_title (InputLayer)   (None, 20)           0                                         
_______________________________________________________________________________________________
seq_article_title (InputLayer)  (None, 60)           0                                         
_______________________________________________________________________________________________
seq_header (InputLayer)         (None, 250)          0                                         
_______________________________________________________________________________________________
seq_text_content (InputLayer)   (None, 500)          0                                         
_______________________________________________________________________________________________
group (InputLayer)              (None, 1)            0                                         
_______________________________________________________________________________________________
corpus_day (InputLayer)         (None, 1)            0                                         
_______________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 10)       676470      seq_corpus_title[0][0]        
_______________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 10)       676470      seq_article_title[0][0]       
_______________________________________________________________________________________________
embedding_3 (Embedding)         (None, 250, 10)      676470      seq_header[0][0]              
_______________________________________________________________________________________________
embedding_4 (Embedding)         (None, 500, 100)     6764700     seq_text_content[0][0]        
_______________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 5)         35          group[0][0]                   
_______________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 10)        340         corpus_day[0][0]              
_______________________________________________________________________________________________
gru_1 (GRU)                     (None, 8)            456         embedding_1[0][0]             
_______________________________________________________________________________________________
gru_2 (GRU)                     (None, 8)            456         embedding_2[0][0]             
_______________________________________________________________________________________________
gru_3 (GRU)                     (None, 8)            456         embedding_3[0][0]             
_______________________________________________________________________________________________
gru_4 (GRU)                     (None, 16)           5616        embedding_4[0][0]             
_______________________________________________________________________________________________
flatten_1 (Flatten)             (None, 5)            0           embedding_5[0][0]             
_______________________________________________________________________________________________
flatten_2 (Flatten)             (None, 10)           0           embedding_6[0][0]             
_______________________________________________________________________________________________
publish_hours (InputLayer)      (None, 1)            0                                         
_______________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 56)           0           gru_1[0][0]                   
                                                                 gru_2[0][0]                   
                                                                 gru_3[0][0]                   
                                                                 gru_4[0][0]                   
                                                                 flatten_1[0][0]               
                                                                 flatten_2[0][0]               
                                                                 publish_hours[0][0]           
_______________________________________________________________________________________________
dense_1 (Dense)                 (None, 512)          29184       concatenate_1[0][0]           
_______________________________________________________________________________________________
dropout_1 (Dropout)             (None, 512)          0           dense_1[0][0]                 
_______________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           32832       dropout_1[0][0]               
_______________________________________________________________________________________________
dropout_2 (Dropout)             (None, 64)           0           dense_2[0][0]                 
_______________________________________________________________________________________________
dense_3 (Dense)                 (None, 32)           2080        dropout_2[0][0]               
_______________________________________________________________________________________________
dropout_3 (Dropout)             (None, 32)           0           dense_3[0][0]                 
_______________________________________________________________________________________________
dense_4 (Dense)                 (None, 1)            33          dropout_3[0][0]               
===============================================================================================
Total params: 8,865,598
Trainable params: 8,865,598
Non-trainable params: 0
_______________________________________________________________________________________________
gc is used to free up memory, and then the model is fitted.
import gc
gc.collect()
#FITTING THE MODEL
epochs = 5
BATCH_SIZE = 512 * 3
steps = int(len(dtrain) / BATCH_SIZE) * epochs  # total number of weight updates across all epochs
lr_init, lr_fin = 0.017, 0.009
# per-step decay chosen so the learning rate shrinks from lr_init towards lr_fin over `steps` updates
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
lr_decay = exp_decay(lr_init, lr_fin, steps)
model = get_model()
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)
history = model.fit(X_train, dtrain.browse_count
                    , epochs=epochs
                    , batch_size=BATCH_SIZE
#                     , validation_split=0.1
                    , validation_data=(X_valid, dvalid.browse_count)
                    , verbose=1
                    )
print(datetime.now() - starttime)
Train on 38325 samples, validate on 4259 samples
Epoch 1/5
38325/38325 [===========] - 209s 5ms/step - loss: 1026696.9060 - mean_absolute_error: 220.6634 - val_loss: 566747.7857 - val_mean_absolute_error: 162.5117
Epoch 2/5
38325/38325 [===========] - 182s 5ms/step - loss: 790294.7851 - mean_absolute_error: 189.1313 - val_loss: 133049.8886 - val_mean_absolute_error: 159.8171
Epoch 3/5
38325/38325 [===========] - 203s 5ms/step - loss: 549999.5994 - mean_absolute_error: 204.5251 - val_loss: 262894.7622 - val_mean_absolute_error: 143.3210
Epoch 4/5
38325/38325 [===========] - 230s 6ms/step - loss: 431161.4882 - mean_absolute_error: 173.1293 - val_loss: 123154.7756 - val_mean_absolute_error: 150.0321
Epoch 5/5
38325/38325 [===========] - 226s 6ms/step - loss: 234846.9607 - mean_absolute_error: 161.3210 - val_loss: 85303.8106 - val_mean_absolute_error: 121.6386
1:04:58.245270
from sklearn.metrics import mean_squared_error
def getdiff(model, valid=True):
    df = pd.DataFrame(dvalid['browse_count'].values, columns=['browse_count_true'])
    df['browse_count_pred'] = (model.predict(X_valid))
    return df
df_diff = getdiff(model)
mse =  mean_squared_error(df_diff['browse_count_true'].values, df_diff['browse_count_pred'].values)
print(mse)
# Plot outputs
plt.rcParams['figure.figsize'] = 10, 5
df_diff_sorted = df_diff.sort_values('browse_count_true')
plt.scatter(range(len(df_diff_sorted)), df_diff_sorted['browse_count_true'].values, color='black', s=0.5)
plt.scatter(range(len(df_diff_sorted)), df_diff_sorted['browse_count_pred'].values, color='red', s=0.5)
plt.show()
df_diff

The red points are the predicted values and the black points are the actual values. At the low end the predictions are generally not far off, although the scale of the plot helps here (laughs); for articles with high view counts the prediction error grows, but their predicted views are still clearly higher than those of the low-traffic articles.
| idx | browse_count_true | browse_count_pred | 
|---|---|---|
| 0 | 271 | 272.936371 | 
| 1 | 256 | 259.646454 | 
| 2 | 849 | 720.042664 | 
| 3 | 305 | 291.747192 | 
| 4 | 208 | 259.776764 | 
| 5 | 222 | 237.322327 | 
| 6 | 209 | 217.769241 | 
| 7 | 342 | 271.587341 | 
| 8 | 67 | 37.776466 | 
| 9 | 254 | 249.315750 | 
| 10 | 583 | 345.432556 | 
| 11 | 630 | 456.542908 | 
| 12 | 694 | 467.314087 | 
| 13 | 249 | 327.678284 | 
| 14 | 822 | 755.098511 | 
| 15 | 229 | 180.495590 | 
| 16 | 316 | 361.333923 | 
| 17 | 409 | 305.247406 | 
| 18 | 571 | 415.976074 | 
| 19 | 390 | 302.599060 | 
| 20 | 113 | 76.910370 | 
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd
import jieba
jieba.set_dictionary('dict.txt.big')
with open('stops.txt', 'r', encoding='utf8') as f:
    stops = f.read().split('\n')
    
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib
zhfont1 = matplotlib.font_manager.FontProperties(fname='simsun.ttf')  ## Chinese font for matplotlib labels
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from datetime import datetime
def get_article_data(url):
    if not url.startswith("https://ithelp.ithome.com.tw/articles/"):
        raise ValueError("Please provide the URL of an ithome article")
        
    row = {}
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    ## group
    group = soup.select(".qa-header")[0].find_all('a')[0].text.replace(' ', '').replace('\n', '')
    ## like count
    like_count = int(soup.select('.likeGroup__num')[0].text)  ## number of likes on the article
    ## article header
    header = soup.select('.qa-header')[0]
    corpusinfo = header.select('h3')[0].text.replace(' ', '').replace('\n', '')
    corpus_title = corpusinfo.split('第')[0]  ## the title of the article series
    corpus_day = int(re.findall(r'第[\d]+篇', corpusinfo)[0].replace('第', '').replace('篇', ''))  ## which day of the contest this post is
    article_title = header.select('h2')[0].text.replace(' ', '').replace('\n', '')  ## the article title
    writer_name = header.select('.ir-article-info__name')[0].text.replace(' ', '').replace('\n', '')  ## the author's name
    writer_url = header.select('.ir-article-info__name')[0]['href']  ## link to the author's profile page
    publish_date_str = header.select('.qa-header__info-time')[0]['title']  ## publish-date string; the tedious steps below turn it into a Python datetime
    date_items = pd.Series(publish_date_str.split(' ')[0].split('-') + publish_date_str.split(' ')[1].split(':')).astype(int)
    publish_datetime = datetime(date_items[0], date_items[1], date_items[2], date_items[3], date_items[4], date_items[5])
    browse_count = int(re.findall(r'[\d]+', header.select('.ir-article-info__view')[0].text)[0])  ## the view count
    ## markdown_html
    markdown_html = soup.select('.markdown__style')[0]
    text_content = "\n".join([p.text for p in markdown_html.select('p')])  ## all paragraph text; I skipped crawling images and code blocks
    h1 = [h1.text for h1 in markdown_html.select('h1')]  ## headings inside the article body
    h2 = [h2.text for h2 in markdown_html.select('h2')] 
    h3 = [h3.text for h3 in markdown_html.select('h3')]
    h4 = [h4.text for h4 in markdown_html.select('h4')]
    h5 = [h5.text for h5 in markdown_html.select('h5')]
    h6 = [h6.text for h6 in markdown_html.select('h6')]
    row['group'] = group
    row['like_count'] = like_count
    row['corpus_title'] = corpus_title
    row['corpus_day'] = corpus_day
    row['article_title'] = article_title
    row['writer_name'] = writer_name
    row['writer_url'] = writer_url
    row['publish_datetime'] = publish_datetime
    row['browse_count'] = browse_count
    row['text_content'] = text_content
    row['h1'] = h1 if h1 != [] else None
    row['h2'] = h2 if h2 != [] else None
    row['h3'] = h3 if h3 != [] else None
    row['h4'] = h4 if h4 != [] else None
    row['h5'] = h5 if h5 != [] else None
    row['h6'] = h6 if h6 != [] else None
    row['crawled_date'] = datetime.now()
    
    return row
Here publish_hours is entered manually to predict the view counts over the first ten days, so the preprocessing no longer needs to derive that column from the timestamps.
le_group = LabelEncoder()
le_group.classes_ = np.load('le_group.npy')
le_corpus_day = LabelEncoder()
le_corpus_day.classes_ = np.load('le_corpus_day.npy')    
with open('term_idx_mapping.json', 'r', encoding='utf8') as f:
    term_idx_mapping = json.load(f)
with open('max_corpus_day', 'r', encoding='utf8') as f:
    max_corpus_day = int(f.read())
def preprocess(df):
    df.fillna('None', inplace=True)
    df.loc[df['corpus_day'] > max_corpus_day, 'corpus_day'] = max_corpus_day  ## clip unseen corpus_day values (for prediction on new data)
    def preprocess_applyfun(row):
        # combine all headers
        header = ""
        for h in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if row[h] != "None":
                header += " ".join(row[h]) + "\n"
        # Tokenize
        row['article_title'] = [w for w in jieba.cut(row['article_title'], cut_all=True)]
        row['corpus_title'] = [w for w in jieba.cut(row['corpus_title'], cut_all=True)]
        row['header'] = [w for w in jieba.cut(header, cut_all=True)]
        row['text_content'] = [w for w in jieba.cut(row['text_content'], cut_all=True) if w not in stops]
        # cal_publish_hours
#         timedelta = row['crawled_date'] - row['publish_datetime']
#         row['publish_hours'] = timedelta.days * 24 + timedelta.seconds // 3600
        # categorize group and corpus_day
        row['group'] = le_group.transform([row['group']])[0]
        row['corpus_day'] = le_corpus_day.transform([row['corpus_day']])[0]
        return row
    df = df.apply(preprocess_applyfun, axis=1)
            
    # Serialize tokens
    def getidxs(terms):
        idxs = []
        for term in terms:
            if term in term_idx_mapping.keys():
                idx = term_idx_mapping.get(term)
                idxs.append(idx)
        return idxs
    df['article_title'] = df['article_title'].apply(getidxs)
    df['corpus_title'] = df['corpus_title'].apply(getidxs)
    df['header'] = df['header'].apply(getidxs)
    df['text_content'] = df['text_content'].apply(getidxs)
    return df
You can change the url below to predict the view count of whichever article you like.
from keras.models import load_model
model = load_model("20180105 1248.model")
MAX_ARTICLE_TITLE_SEQ = 60 #60
MAX_CORPUS_TITLE_SEQ = 20 #22
MAX_HEADER_SEQ = 250 #260
MAX_TEXT_CONTENT_SEQ = 500 #1195
url = "https://ithelp.ithome.com.tw/articles/10195707"
data = get_article_data(url)
df_test = pd.DataFrame([data] * 10)
for i in range(len(df_test)):
    df_test.loc[i, 'publish_hours'] = 24 * (i+1)
df_test = preprocess(df_test)
df_test[['group', 'corpus_title', 'corpus_day', 'article_title', 'browse_count',
     'text_content', 'header', 'publish_hours']]

#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences
def get_keras_data(dataset):
    X = {
        "seq_article_title":pad_sequences(dataset['article_title'], maxlen=MAX_ARTICLE_TITLE_SEQ),
        "seq_corpus_title":pad_sequences(dataset['corpus_title'], maxlen=MAX_CORPUS_TITLE_SEQ),
        "seq_header":pad_sequences(dataset['header'], maxlen=MAX_HEADER_SEQ),
        "seq_text_content":pad_sequences(dataset['text_content'], maxlen=MAX_TEXT_CONTENT_SEQ),
        'group': np.array(dataset['group']),
        'corpus_day': np.array(dataset['corpus_day']),
        'publish_hours': np.array(dataset['publish_hours']),
    }
    return X
X_test = get_keras_data(df_test)
predict_result = model.predict(X_test)
plt.rcParams['figure.figsize'] = 10, 5
df_pred = pd.DataFrame(predict_result, index=['day'+str(i)+'\n('+ str(i*24) +'hours)' for i in range(1,11)], columns=[data['article_title']])
ax = df_pred.plot(kind='line', legend=False, figsize=(10, 5), grid=True)
plt.show()
df_pred.astype(int).T

