上市
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20180801&stockNo=2330
上櫃
http://www.tpex.org.tw/web/stock/aftertrading/daily_trading_info/st43_result.php?d=107/08&stkno=3105
只要對這兩個網址傳入不同的日期跟股票代碼,就能取得對應股票在該月份的每日成交資訊。
# Fetch one month of daily trading data for a listed (TWSE) stock.
# Fix: the original snippet used ``pd.DataFrame`` without importing pandas.
import pandas as pd
import requests

json_data = requests.get('http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20180801&stockNo=2330').json()
columns = ['日期', '成交股數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌價差', '成交筆數']
# The TWSE JSON response carries the daily rows under the 'data' key.
data = pd.DataFrame(json_data['data'], columns=columns)
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>日期</th>
<th>成交股數</th>
<th>成交金額</th>
<th>開盤價</th>
<th>最高價</th>
<th>最低價</th>
<th>收盤價</th>
<th>漲跌價差</th>
<th>成交筆數</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>107/08/01</td>
<td>29,777,161</td>
<td>7,375,488,342</td>
<td>247.00</td>
<td>248.00</td>
<td>246.50</td>
<td>248.00</td>
<td>+2.00</td>
<td>11,667</td>
</tr>
清理資料
# Clean the raw TWSE rows so every column is numeric.
import re

# Convert ROC-era dates (e.g. 107/08/01) to Gregorian (2018/08/01) by adding
# 1911 to the year.  Raw string fixes the invalid '\d' escape sequences of the
# original non-raw pattern.
data['日期'] = data['日期'].apply(
    lambda x: re.sub(r'(\d+)(/\d+/\d+)',
                     lambda y: str(int(y.group(1)) + 1911) + y.group(2), x))
# Strip thousands separators so the count columns can be parsed as numbers.
data[['成交股數', '成交金額', '成交筆數']] = data[['成交股數', '成交金額', '成交筆數']].applymap(lambda x: x.replace(',', ''))
data.iloc[:, 1:] = data.iloc[:, 1:].applymap(float)
# TWSE reports raw share/value counts; scale to thousands so the units match
# the TPEX (OTC) data pulled below.
data[['成交股數', '成交金額']] = data[['成交股數', '成交金額']] / 1000
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>日期</th>
<th>成交股數</th>
<th>成交金額</th>
<th>開盤價</th>
<th>最高價</th>
<th>最低價</th>
<th>收盤價</th>
<th>漲跌價差</th>
<th>成交筆數</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2018/08/01</td>
<td>29777.2</td>
<td>7.37549e+06</td>
<td>247</td>
<td>248</td>
<td>246.5</td>
<td>248</td>
<td>2</td>
<td>11667</td>
</tr>
# Fetch one month of daily trading data for an OTC (TPEX) stock.
# Imports added so this snippet is runnable on its own.
import pandas as pd
import requests

json_data = requests.get('http://www.tpex.org.tw/web/stock/aftertrading/daily_trading_info/st43_result.php?d=107/08&stkno=3105').json()
columns = ['日期', '成交股數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌價差', '成交筆數']
# TPEX returns the daily rows under 'aaData' (TWSE uses 'data' instead).
data = pd.DataFrame(json_data['aaData'], columns=columns)
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>日期</th>
<th>成交股數</th>
<th>成交金額</th>
<th>開盤價</th>
<th>最高價</th>
<th>最低價</th>
<th>收盤價</th>
<th>漲跌價差</th>
<th>成交筆數</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>107/08/01</td>
<td>7,031</td>
<td>1,090,434</td>
<td>148.50</td>
<td>159.00</td>
<td>148.50</td>
<td>156.00</td>
<td>8.00</td>
<td>4,696</td>
</tr>
清理資料
# Clean the raw TPEX rows so every column is numeric.
import re

# Convert ROC-era dates (e.g. 107/08/01) to Gregorian (2018/08/01) by adding
# 1911 to the year.  Raw string fixes the invalid '\d' escape sequences of the
# original non-raw pattern.
data['日期'] = data['日期'].apply(
    lambda x: re.sub(r'(\d+)(/\d+/\d+)',
                     lambda y: str(int(y.group(1)) + 1911) + y.group(2), x))
# Strip thousands separators so the count columns can be parsed as numbers.
data[['成交股數', '成交金額', '成交筆數']] = data[['成交股數', '成交金額', '成交筆數']].applymap(lambda x: x.replace(',', ''))
# TPEX already reports shares/value in thousands, so no /1000 rescale here.
data.iloc[:, 1:] = data.iloc[:, 1:].applymap(float)
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>日期</th>
<th>成交股數</th>
<th>成交金額</th>
<th>開盤價</th>
<th>最高價</th>
<th>最低價</th>
<th>收盤價</th>
<th>漲跌價差</th>
<th>成交筆數</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2018/08/01</td>
<td>7031</td>
<td>1.09043e+06</td>
<td>148.5</td>
<td>159</td>
<td>148.5</td>
<td>156</td>
<td>8</td>
<td>4696</td>
</tr>
確定可以爬後改用scrapy重寫 把資料寫入到DB並定期排程
大概每天下午3點後可以抓當天資料
# -*- coding: utf-8 -*-
import json
import re
import scrapy
import pandas as pd
from skhome.extensions import MongoDatabase
import time
from datetime import datetime
# Module load timestamp; not referenced in the visible code — presumably used
# elsewhere or left over.  TODO(review): confirm before removing.
today = datetime.today()
# TWSE (listed market) endpoint: Gregorian date YYYYMMDD plus stock code.
TWSE_URL = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date={y}{m:02d}{d:02d}&stockNo={code}'
# TPEX (OTC market) endpoint: ROC year/month (Gregorian year - 1911) plus stock code.
TPEX_URL = 'http://www.tpex.org.tw/web/stock/aftertrading/daily_trading_info/st43_result.php?d={y}/{m:02d}&stkno={code}'
# Field order of each yielded item: id/metadata first, then the nine columns
# as returned by the exchanges (date, volume, value, OHLC, change, tx count).
columns = ['_id', '市場別', '產業別', 'name', 'code', 'date', '成交股數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌價差', '成交筆數']
class StockDaySpider(scrapy.Spider):
    """Crawl daily per-stock trading data from TWSE (listed) and TPEX (OTC).

    Iterates month by month (newest first) over every stock/ETF code stored
    in the ``stock_code`` Mongo collection, normalizes each daily row, and
    yields one dict per trading day for the MongoDB item pipeline
    (deduplicated on the (date, code) unique key).
    """

    name = 'stock_days'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 1,
        'MONGODB_COLLECTION': 'stock_day',
        'MONGODB_ITEM_CACHE': 1,
        'MONGODB_HAS_ID_FIELD': True,
        'MONGODB_UNIQ_KEY': [("date", -1), ("code", 1)],
        'COOKIES_ENABLED': False,
    }

    # ROC date like 107/08/01 -> Gregorian 2018/08/01 (add 1911 to the year).
    # Precompiled once; raw string fixes the original's invalid '\d' escapes.
    _ROC_DATE = re.compile(r'(\d+)(/\d+/\d+)')

    def start_requests(self):
        """Yield one request per (month, stock) pair, newest month first."""
        with MongoDatabase('stock_code') as collection:
            stock_code = collection.export_df({'dtype': {'$in': ['股票', 'ETF']}})
        # Shuffle codes so TWSE and TPEX requests interleave, spreading the
        # load across the two sites (see the author's note below the code).
        stock_code = stock_code.sample(frac=1)
        for date in pd.date_range('20170101', '20180901', freq='MS')[::-1]:
            for s in stock_code.itertuples():
                meta = {'市場別': s.市場別, '產業別': s.產業別,
                        'name': s.name, 'code': s.code}
                if s.市場別 == '上市':
                    url = TWSE_URL.format(y=date.year, m=date.month,
                                          d=date.day, code=s.code)
                    # NOTE(review): deliberate extra throttle for TWSE, but a
                    # blocking sleep stalls the whole reactor; per-slot
                    # download delays would be the idiomatic Scrapy fix.
                    time.sleep(10)
                else:
                    # TPEX expects the ROC (Minguo) year: Gregorian - 1911.
                    url = TPEX_URL.format(y=date.year - 1911, m=date.month,
                                          d=date.day, code=s.code)
                yield scrapy.Request(url, meta=meta)

    def parse(self, response):
        """Parse one month of daily rows and yield one item per trading day."""
        meta = response.meta
        payload = json.loads(response.text)
        if meta['市場別'] == '上市':
            # TWSE omits the 'data' key entirely for months with no trades.
            rows = payload.get('data', [])
        else:
            rows = payload['aaData']
        for row in rows:
            self._clean_row(row, listed=meta['市場別'] == '上市')
            _id = row[0] + '_' + meta['code']
            yield dict(zip(columns, [_id, meta['市場別'], meta['產業別'],
                                     meta['name'], meta['code'], *row]))

    def _clean_row(self, row, listed):
        """Normalize one daily row in place (dates, separators, numerics)."""
        row[0] = self._ROC_DATE.sub(
            lambda m: str(int(m.group(1)) + 1911) + m.group(2), row[0])
        row[1] = int(row[1].replace(',', ''))
        row[2] = int(row[2].replace(',', ''))
        if listed:
            # TWSE reports raw shares/value; scale to thousands to match TPEX.
            row[1] = int(row[1] / 1000)
            row[2] = int(row[2] / 1000)
        # OHLC price columns: '--' marks "no trade"; strip thousands separators.
        for i in range(3, 7):
            row[i] = '' if row[i] == '--' else float(row[i].replace(',', ''))
        # Price change may carry an 'X' prefix (e.g. 'X0.00').  The original
        # only handled the exact string 'X0.00' and would crash on any other
        # X-prefixed value; stripping the prefix keeps the old result for
        # 'X0.00' while surviving e.g. 'X2.50'.
        change = row[7].replace(',', '').lstrip('X')
        row[7] = float(change or 0.0)
        row[8] = int(row[8].replace(',', ''))
經過測試,上市要爬慢點,上櫃可以爬快點,把上市櫃資料交互爬可以讓單一網站負擔較小