download.py
工具抓取訓練用資料
import os
import sys
import argparse
import pandas as pd
from datetime import datetime, timedelta
def get_sp500_tickers() -> list:
    """Fetch the S&P 500 ticker symbols from Wikipedia.

    Reads the first HTML table on the "List of S&P 500 companies" page and
    returns the values of its ``Symbol`` column.

    Returns:
        The ticker symbols as a list, or an empty list when anything goes
        wrong while fetching or parsing (the error is printed, not raised).
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    try:
        constituents = pd.read_html(wiki_url)[0]
        return constituents['Symbol'].tolist()
    except Exception as e:
        # Best-effort: report the problem and let the caller carry on
        # with whatever other tickers were requested.
        print(f"Error fetching S&P 500 tickers: {e}")
    return []
def get_dji30_tickers() -> list:
    """Return the ticker symbols of the DJI 30 constituents.

    The symbols are a hard-coded snapshot; a new list object is built on
    every call, so callers may mutate the result freely.
    NOTE(review): index membership changes over time -- confirm that this
    snapshot matches the period being downloaded.
    """
    first_ten = ('AAPL', 'AMGN', 'AXP', 'BA', 'CAT',
                 'CRM', 'CSCO', 'CVX', 'DIS', 'DOW')
    second_ten = ('GS', 'HD', 'HON', 'IBM', 'INTC',
                  'JNJ', 'JPM', 'KO', 'MCD', 'MMM')
    last_ten = ('MRK', 'MSFT', 'NKE', 'PG', 'TRV',
                'UNH', 'V', 'VZ', 'WBA', 'WMT')
    return [*first_ten, *second_ten, *last_ten]
def _iso_date_arg(value: str) -> str:
    """argparse ``type=`` callback that accepts only YYYY-MM-DD dates.

    The original string is returned unchanged so downstream code receives
    exactly what the user typed.
    """
    try:
        datetime.strptime(value, '%Y-%m-%d')
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}; expected YYYY-MM-DD") from None
    return value


def _unique_in_order(tickers: list) -> list:
    """Drop repeated tickers while keeping the first-seen order."""
    return list(dict.fromkeys(tickers))


def main():
    """Command-line entry point: download OHLCV data for the chosen tickers.

    Tickers are collected from ``--sp500``, ``--dow30``, ``--ticker`` and
    ``--tickers``; duplicates are removed so that no symbol is downloaded
    twice (for example when ``--sp500`` and ``--dow30`` overlap). Each
    remaining ticker is fetched on its own and a preview (first, middle and
    last 10 rows) is printed.

    Exits with status 2 (argparse error) when ``--start`` / ``--end`` are
    not valid YYYY-MM-DD dates or when start is later than end, and with
    status 1 when no ticker was specified. A failure while fetching one
    ticker is printed and does not stop the remaining downloads.
    """
    # Put the directory two levels above this file's directory on the
    # import path -- presumably so that the `PolygonIO` package imported
    # further down can be resolved.
    project_root = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Command-line interface.
    parser = argparse.ArgumentParser(
        description="Polygon.io Data Fetching Command Line Tool")
    parser.add_argument('--sp500',
                        action='store_true',
                        help='將 S&P 500 成分股加入 ticker_list 中')
    parser.add_argument('--dow30',
                        action='store_true',
                        help='將 DJI 30 成分股加入 ticker_list 中')
    parser.add_argument('--ticker', type=str, help='將單個 ticker 加入 ticker_list')
    parser.add_argument('--tickers',
                        type=str,
                        nargs='+',
                        help='將多個 tickers 加入 ticker_list')
    parser.add_argument('--timespan',
                        '-t',
                        type=str,
                        default='hour',
                        help='指定抓取的時間跨度(如 minute, hour, day, week)')
    parser.add_argument('--multiplier',
                        '-m',
                        type=int,
                        default=1,
                        help='指定時間跨度的倍數 (如 15 表示每 15 分鐘, 默認為 1)')
    parser.add_argument('--start',
                        '-s',
                        type=_iso_date_arg,
                        required=True,
                        help='指定開始日期 (YYYY-MM-DD)')
    parser.add_argument('--end',
                        '-e',
                        type=_iso_date_arg,
                        required=True,
                        help='指定結束日期 (YYYY-MM-DD)')
    args = parser.parse_args()

    # Fail fast on a reversed range instead of discovering it only after
    # the (potentially very long) download loop has started.
    if (datetime.strptime(args.start, '%Y-%m-%d') >
            datetime.strptime(args.end, '%Y-%m-%d')):
        parser.error(
            f"--start ({args.start}) must not be later than --end ({args.end})")

    # Collect the requested tickers from every source.
    ticker_list = []
    if args.sp500:
        ticker_list += get_sp500_tickers()
    if args.dow30:
        ticker_list += get_dji30_tickers()
    if args.ticker:
        ticker_list.append(args.ticker)
    if args.tickers:
        ticker_list += args.tickers

    # The sources can overlap; fetch every symbol only once.
    ticker_list = _unique_in_order(ticker_list)

    # At least one ticker is required.
    if not ticker_list:
        print(
            "Error: 至少需要指定一個 ticker (通過 --sp500, --dow30, --ticker 或 --tickers)"
        )
        sys.exit(1)

    start_date_str = args.start
    end_date_str = args.end

    # Imported here rather than at module level because it relies on the
    # sys.path entry added at the top of this function.
    from PolygonIO.PolygonIODownloader import PolygonIODownloader
    polygon_wrapper = PolygonIODownloader(root_dir='./datasets')

    # Download ticker by ticker so that one failure does not abort the run.
    total = len(ticker_list)
    for idx, ticker in enumerate(ticker_list, start=1):
        print(f"Fetching data for {ticker} ({idx}/{total})...")
        try:
            df_ohlcv = polygon_wrapper.fetch_ohlcv(
                ticker_list=[ticker],
                start_date_str=start_date_str,
                end_date_str=end_date_str,
                timespan=args.timespan,
                multiplier=args.multiplier)
            if df_ohlcv.empty:
                print(f"No data fetched for {ticker}.")
            else:
                print(
                    f"Data for {ticker} fetched successfully. {len(df_ohlcv)} rows."
                )
                # Preview: first 10, middle 10 and last 10 rows.
                print("前10筆資料:")
                print(df_ohlcv.head(10))
                mid_point = len(df_ohlcv) // 2
                # Clamp at 0: a negative start would be interpreted as an
                # offset from the end and silently skip leading rows of a
                # short frame.
                mid_start = max(mid_point - 5, 0)
                print("中間10筆資料:")
                print(df_ohlcv.iloc[mid_start:mid_point + 5])
                print("最後10筆資料:")
                print(df_ohlcv.tail(10))
        except Exception as e:
            # Top-level boundary: report and move on to the next ticker.
            print(f"Error fetching data for {ticker}: {e}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
如果是免費帳號,則需要修改下載程式:在每次 list_aggs(...) API call 之間加入等待間隔,而且只能抓取 2 年以內的資料。
# Excerpt of PolygonIOCore.fetch showing where a delay is added for free
# accounts. The `......` lines presumably stand for code omitted from this
# excerpt.
# NOTE(review): indentation was lost in this excerpt, so it cannot be told
# from here whether the sleep runs once per item yielded by list_aggs or
# once after the loop -- confirm against the real source.
class PolygonIOCore:
......
def fetch(self,
ticker: str,
start_date_str: str,
end_date_str: str,
timespan: str,
multiplier: int = 1) -> pd.DataFrame:
......
for a in self.client.list_aggs(ticker=ticker,
multiplier=multiplier,
timespan=timespan,
from_=start_date_str,
to=end_date_str,
limit=5000):
aggs.append(a)
### Sleep between API calls (free account). A 12-second pause works out to
### 5 calls per minute; NOTE(review): the original note said 5 calls per
### second -- confirm the plan's actual rate limit.
time.sleep(12)
開始抓取 S&P500 以及 DOW30 的成分股,一小時框,近五年內的資料。
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
在漫長的下載後,成功下載了所有股票。後來才發現 DOW30 的成分股全都是 S&P500 的成分股,所以實際上只抓了 503 支不同的股票(log 中顯示的 533 是因為 30 支 DOW30 成分股被重複計算)。不過看過下載完的 1 hour 股價後,發現還要解決的問題其實不少。
從下面的log可以看到,這幾支股票,一樣的抓取條件,但抓到的筆數相差卻非常大
Data for MMM fetched successfully. 14954 rows.
Data for AOS fetched successfully. 10458 rows.
Data for ABT fetched successfully. 13442 rows.
Data for ABBV fetched successfully. 15239 rows.
0 2019-09-23 08:00:00+00:00 139.1472 139.1472 139.1472 139.1472 119.600 MMM
0 2019-09-23 13:00:00+00:00 47.000 47.48 46.860 47.425 55140.0 AOS
0 2019-09-23 12:00:00+00:00 83.5800 83.580 83.5800 83.5800 731.0 ABT
0 2019-09-23 11:00:00+00:00 72.5000 73.0500 72.500 73.0500 640.0 ABBV
目前認為原因應該是盤前、盤後的交易造成的,而每支股票的盤前盤後交易情況都不同。這樣的資料如果要用來訓練跟測試,至少需要先把數據對齊才有辦法使用。
以下是部分log
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
Fetching data for MMM (1/533)...
Data saved to ./datasets\PolygonIODownloader\MMM_2019-09-21_2024-09-21_1_hour_raw.csv
Data for MMM fetched successfully. 14954 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 08:00:00+00:00 139.1472 139.1472 139.1472 139.1472 119.600 MMM
1 2019-09-23 12:00:00+00:00 139.7074 139.7074 139.4314 139.4314 241.592 MMM
2 2019-09-23 13:00:00+00:00 138.5033 139.0301 137.5418 138.8880 238668.976 MMM
3 2019-09-23 14:00:00+00:00 138.8462 138.8462 137.8094 138.1773 227292.624 MMM
4 2019-09-23 15:00:00+00:00 138.1886 138.8712 138.1689 138.6622 209298.804 MMM
5 2019-09-23 16:00:00+00:00 138.6455 139.6990 138.3870 139.3311 334236.552 MMM
6 2019-09-23 17:00:00+00:00 139.3729 139.8401 139.1973 139.5987 165931.844 MMM
7 2019-09-23 18:00:00+00:00 139.6237 139.8161 139.5151 139.7324 188353.256 MMM
8 2019-09-23 19:00:00+00:00 139.7324 139.9916 139.4398 139.4482 490236.812 MMM
9 2019-09-23 20:00:00+00:00 139.4314 139.4314 139.4314 139.4314 21472.984 MMM
最後10筆資料:
timestamp open high low close volume ticker
14944 2024-09-20 12:00:00+00:00 133.800 133.8000 133.750 133.750 919.0 MMM
14945 2024-09-20 13:00:00+00:00 133.780 133.7800 132.771 133.215 858654.0 MMM
14946 2024-09-20 14:00:00+00:00 133.205 133.8500 133.020 133.810 414482.0 MMM
14947 2024-09-20 15:00:00+00:00 133.790 134.5142 133.760 134.280 279221.0 MMM
14948 2024-09-20 16:00:00+00:00 134.295 134.7600 134.210 134.445 241361.0 MMM
14949 2024-09-20 17:00:00+00:00 134.460 134.9500 134.290 134.710 442866.0 MMM
14950 2024-09-20 18:00:00+00:00 134.700 134.7800 134.105 134.595 388438.0 MMM
14951 2024-09-20 19:00:00+00:00 134.580 134.8000 134.000 134.760 1187421.0 MMM
14952 2024-09-20 20:00:00+00:00 134.770 135.3300 133.720 134.770 148209.0 MMM
14953 2024-09-20 22:00:00+00:00 134.770 134.7700 134.770 134.770 10357.0 MMM
Fetching data for AOS (2/533)...
Data saved to ./datasets\PolygonIODownloader\AOS_2019-09-21_2024-09-21_1_hour_raw.csv
Data for AOS fetched successfully. 10458 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 13:00:00+00:00 47.000 47.48 46.860 47.425 55140.0 AOS
1 2019-09-23 14:00:00+00:00 47.420 47.42 47.130 47.310 74617.0 AOS
2 2019-09-23 15:00:00+00:00 47.310 47.70 47.240 47.700 104694.0 AOS
3 2019-09-23 16:00:00+00:00 47.710 47.86 47.660 47.770 76278.0 AOS
4 2019-09-23 17:00:00+00:00 47.790 47.87 47.680 47.860 54102.0 AOS
5 2019-09-23 18:00:00+00:00 47.860 47.88 47.710 47.740 59210.0 AOS
6 2019-09-23 19:00:00+00:00 47.735 47.96 47.735 47.740 203440.0 AOS
7 2019-09-23 20:00:00+00:00 47.770 47.77 47.770 47.770 32096.0 AOS
8 2019-09-24 13:00:00+00:00 47.950 48.24 47.650 47.720 72433.0 AOS
9 2019-09-24 14:00:00+00:00 47.715 47.74 47.300 47.680 86794.0 AOS
最後10筆資料:
timestamp open high low close volume ticker
10448 2024-09-19 20:00:00+00:00 84.290 84.2900 84.2900 84.290 7204.0 AOS
10449 2024-09-20 10:00:00+00:00 84.200 84.2400 84.2000 84.240 422.0 AOS
10450 2024-09-20 13:00:00+00:00 84.020 84.1900 82.8200 82.875 127059.0 AOS
10451 2024-09-20 14:00:00+00:00 82.875 83.3800 82.6200 83.130 67117.0 AOS
10452 2024-09-20 15:00:00+00:00 83.070 83.2500 82.8650 83.010 54899.0 AOS
10453 2024-09-20 16:00:00+00:00 83.010 83.1600 82.9026 83.110 40683.0 AOS
10454 2024-09-20 17:00:00+00:00 83.140 83.4150 83.0300 83.110 37185.0 AOS
10455 2024-09-20 18:00:00+00:00 83.070 83.1200 82.7950 83.070 68247.0 AOS
10456 2024-09-20 19:00:00+00:00 83.070 83.2900 82.7200 82.900 282742.0 AOS
10457 2024-09-20 20:00:00+00:00 82.890 82.8916 82.8900 82.890 46638.0 AOS
Fetching data for ABT (3/533)...
Data saved to ./datasets\PolygonIODownloader\ABT_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABT fetched successfully. 13442 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 12:00:00+00:00 83.5800 83.580 83.5800 83.5800 731.0 ABT
1 2019-09-23 13:00:00+00:00 83.6000 83.630 83.1900 83.4200 199670.0 ABT
2 2019-09-23 14:00:00+00:00 83.4300 83.590 83.2700 83.4800 198560.0 ABT
3 2019-09-23 15:00:00+00:00 83.4800 83.720 83.4200 83.6496 210529.0 ABT
4 2019-09-23 16:00:00+00:00 83.6323 83.664 83.4500 83.4800 126953.0 ABT
5 2019-09-23 17:00:00+00:00 83.4900 83.490 83.1300 83.1600 168024.0 ABT
6 2019-09-23 18:00:00+00:00 83.1732 83.310 83.1732 83.2600 156499.0 ABT
7 2019-09-23 19:00:00+00:00 83.2500 83.430 83.1600 83.1700 465495.0 ABT
8 2019-09-23 20:00:00+00:00 83.1600 83.160 83.1600 83.1600 158481.0 ABT
9 2019-09-23 21:00:00+00:00 83.2000 83.200 83.1600 83.1600 714.0 ABT
最後10筆資料:
timestamp open high low close volume ticker
13432 2024-09-20 12:00:00+00:00 114.360 114.3600 114.3600 114.3600 152.0 ABT
13433 2024-09-20 13:00:00+00:00 113.620 114.1300 113.2000 113.8700 1383365.0 ABT
13434 2024-09-20 14:00:00+00:00 113.950 114.1000 113.5550 113.8350 575450.0 ABT
13435 2024-09-20 15:00:00+00:00 113.825 114.1200 113.6950 113.7000 416296.0 ABT
13436 2024-09-20 16:00:00+00:00 113.690 113.8000 113.5500 113.6700 236489.0 ABT
13437 2024-09-20 17:00:00+00:00 113.680 114.1600 113.6750 114.1110 324759.0 ABT
13438 2024-09-20 18:00:00+00:00 114.120 114.1500 113.5650 113.7150 308884.0 ABT
13439 2024-09-20 19:00:00+00:00 113.710 113.9100 113.4420 113.6900 1218445.0 ABT
13440 2024-09-20 20:00:00+00:00 113.700 113.7016 113.6300 113.6300 130750.0 ABT
13441 2024-09-20 22:00:00+00:00 113.700 113.7000 113.3319 113.3319 388.0 ABT
Fetching data for ABBV (4/533)...
Data saved to ./datasets\PolygonIODownloader\ABBV_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABBV fetched successfully. 15239 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 11:00:00+00:00 72.5000 73.0500 72.500 73.0500 640.0 ABBV
1 2019-09-23 12:00:00+00:00 73.3500 73.3500 72.390 72.8600 3817.0 ABBV
2 2019-09-23 13:00:00+00:00 72.9000 72.9611 72.050 72.7817 717311.0 ABBV
3 2019-09-23 14:00:00+00:00 72.7600 73.3000 72.760 73.0850 859381.0 ABBV
4 2019-09-23 15:00:00+00:00 73.0900 73.5400 73.010 73.4100 880159.0 ABBV
5 2019-09-23 16:00:00+00:00 73.4174 73.6800 73.360 73.5800 801864.0 ABBV
6 2019-09-23 17:00:00+00:00 73.5800 73.7800 73.570 73.7500 702725.0 ABBV
7 2019-09-23 18:00:00+00:00 73.7490 73.7600 73.325 73.3300 1091298.0 ABBV
8 2019-09-23 19:00:00+00:00 73.3300 73.3900 72.910 72.9700 1556149.0 ABBV
9 2019-09-23 20:00:00+00:00 72.9700 73.2000 72.910 73.1000 249423.0 ABBV
最後10筆資料:
timestamp open high low close volume ticker
15229 2024-09-20 13:00:00+00:00 193.000 193.5000 192.13 192.9550 1307749.0 ABBV
15230 2024-09-20 14:00:00+00:00 193.130 193.8210 192.89 193.1300 597094.0 ABBV
15231 2024-09-20 15:00:00+00:00 193.100 193.2600 192.58 192.6550 317190.0 ABBV
15232 2024-09-20 16:00:00+00:00 192.640 193.0100 192.41 192.9801 196176.0 ABBV
15233 2024-09-20 17:00:00+00:00 193.005 193.6100 192.99 193.4150 206329.0 ABBV
15234 2024-09-20 18:00:00+00:00 193.410 193.4400 192.56 192.8300 778455.0 ABBV
15235 2024-09-20 19:00:00+00:00 192.830 193.5550 192.60 193.4600 879074.0 ABBV
15236 2024-09-20 20:00:00+00:00 193.470 193.4716 193.00 193.4700 455551.0 ABBV
15237 2024-09-20 21:00:00+00:00 193.280 193.2900 193.28 193.2900 510.0 ABBV
15238 2024-09-20 22:00:00+00:00 193.470 193.4700 193.47 193.4700 1311.0 ABBV
像是 'BRK.B' 從 2019-09-30 以後才開始有資料,不知道為什麼;應該不可能是那時候才上市,可能有其他原因。
timestamp open high low close volume ticker
0 2019-09-30 09:00:00+00:00 207.750 207.75 207.75 207.75 100.0 BRK.B
1 2019-09-30 11:00:00+00:00 207.550 207.69 207.55 207.63 5824.0 BRK.B
2 2019-09-30 12:00:00+00:00 207.600 207.60 207.60 207.60 707.0 BRK.B
3 2019-09-30 13:00:00+00:00 207.800 208.30 207.77 207.95 216762.0 BRK.B
4 2019-09-30 14:00:00+00:00 207.945 208.00 207.55 207.96 221958.0 BRK.B
所以接下來我需要花時間研究一下如何處理、對齊這些資料,並把這些資料切分成 train 和 trade 兩個部分,之後再進行訓練。