使用 download.py 工具抓取訓練用資料
import os
import sys
import argparse
import pandas as pd
from datetime import datetime, timedelta
def get_sp500_tickers() -> list:
    """
    Fetch the list of S&P 500 ticker symbols from Wikipedia.

    Reads the first HTML table on the "List of S&P 500 companies" page and
    returns the values of its ``Symbol`` column.

    Returns:
        A list of ticker symbols, or an empty list if anything goes wrong
        while downloading or parsing the page (the error is printed).
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    try:
        # The constituents table is taken to be the first table on the page.
        constituents = pd.read_html(wiki_url)[0]
        return constituents['Symbol'].tolist()
    except Exception as e:
        # Best effort: report the problem and let the caller see "no tickers".
        print(f"Error fetching S&P 500 tickers: {e}")
        return []
def get_dji30_tickers() -> list:
    """
    Return the list of DJI 30 ticker symbols.

    The symbols are a hard-coded snapshot; a new list object is built on
    every call, so callers may mutate the result freely.

    NOTE(review): index membership changes over time -- verify this snapshot
    against the current constituents before relying on it.
    """
    symbols = (
        "AAPL AMGN AXP BA CAT CRM CSCO CVX DIS DOW "
        "GS HD HON IBM INTC JNJ JPM KO MCD MMM "
        "MRK MSFT NKE PG TRV UNH V VZ WBA WMT"
    )
    return symbols.split()
def _unique_in_order(tickers: list) -> list:
    """Return ``tickers`` without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(tickers))


def _print_preview(df_ohlcv) -> None:
    """Print the first, middle and last ten rows of a fetched frame."""
    print("前10筆資料:")
    print(df_ohlcv.head(10))
    mid_point = len(df_ohlcv) // 2
    # Clamp at 0: for frames shorter than 10 rows ``mid_point - 5`` is
    # negative, and a negative iloc start would count from the end.
    mid_start = max(mid_point - 5, 0)
    print("中間10筆資料:")
    print(df_ohlcv.iloc[mid_start:mid_point + 5])
    print("最後10筆資料:")
    print(df_ohlcv.tail(10))


def main():
    """
    Command-line entry point: fetch OHLCV data for the requested tickers.

    Tickers are collected from ``--sp500``, ``--dow30``, ``--ticker`` and
    ``--tickers`` (in that order), de-duplicated, and then fetched one at a
    time through ``PolygonIODownloader.fetch_ohlcv`` for the date range given
    by ``--start`` / ``--end`` and the bar size given by ``--timespan`` /
    ``--multiplier``. A short preview of every non-empty result is printed.

    Exits with status 1 when no ticker was specified. A failure while
    fetching one ticker is printed and does not stop the remaining tickers.
    """
    # Make the directory three levels above this file importable so the
    # project-local ``PolygonIO`` package can be imported further down.
    sys.path.append(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

    # Set up argparse to handle the command-line arguments.
    parser = argparse.ArgumentParser(
        description="Polygon.io Data Fetching Command Line Tool")
    parser.add_argument('--sp500',
                        action='store_true',
                        help='將 S&P 500 成分股加入 ticker_list 中')
    parser.add_argument('--dow30',
                        action='store_true',
                        help='將 DJI 30 成分股加入 ticker_list 中')
    parser.add_argument('--ticker', type=str, help='將單個 ticker 加入 ticker_list')
    parser.add_argument('--tickers',
                        type=str,
                        nargs='+',
                        help='將多個 tickers 加入 ticker_list')
    parser.add_argument('--timespan',
                        '-t',
                        type=str,
                        default='hour',
                        help='指定抓取的時間跨度(如 minute, hour, day, week)')
    parser.add_argument('--multiplier',
                        '-m',
                        type=int,
                        default=1,
                        help='指定時間跨度的倍數 (如 15 表示每 15 分鐘, 默認為 1)')
    parser.add_argument('--start',
                        '-s',
                        type=str,
                        required=True,
                        help='指定開始日期 (YYYY-MM-DD)')
    parser.add_argument('--end',
                        '-e',
                        type=str,
                        required=True,
                        help='指定結束日期 (YYYY-MM-DD)')
    args = parser.parse_args()

    # Collect tickers from every source the user asked for.
    ticker_list = []
    if args.sp500:
        ticker_list += get_sp500_tickers()
    if args.dow30:
        ticker_list += get_dji30_tickers()
    if args.ticker:
        ticker_list.append(args.ticker)
    if args.tickers:
        ticker_list += args.tickers

    # The sources can overlap (e.g. --sp500 together with --dow30), so drop
    # repeats to avoid fetching the same ticker more than once.
    ticker_list = _unique_in_order(ticker_list)

    # Make sure at least one ticker was specified.
    if not ticker_list:
        print(
            "Error: 至少需要指定一個 ticker (通過 --sp500, --dow30, --ticker 或 --tickers)"
        )
        sys.exit(1)

    # Date range, passed through to the downloader as strings.
    start_date_str = args.start
    end_date_str = args.end

    # Imported here rather than at module top because it relies on the
    # sys.path entry appended at the start of this function.
    from PolygonIO.PolygonIODownloader import PolygonIODownloader
    polygon_wrapper = PolygonIODownloader(root_dir='./datasets')

    # Fetch the data, one ticker per call.
    total = len(ticker_list)
    for idx, ticker in enumerate(ticker_list, start=1):
        print(f"Fetching data for {ticker} ({idx}/{total})...")
        try:
            df_ohlcv = polygon_wrapper.fetch_ohlcv(
                ticker_list=[ticker],
                start_date_str=start_date_str,
                end_date_str=end_date_str,
                timespan=args.timespan,
                multiplier=args.multiplier)
            if df_ohlcv.empty:
                print(f"No data fetched for {ticker}.")
            else:
                print(
                    f"Data for {ticker} fetched successfully. {len(df_ohlcv)} rows."
                )
                # Show the first 10, middle 10 and last 10 rows.
                _print_preview(df_ohlcv)
        except Exception as e:
            # Keep going: one failing ticker should not abort the whole run.
            print(f"Error fetching data for {ticker}: {e}")


if __name__ == "__main__":
    main()
如果是免費帳號,則要修改下載程式,在每次 list_aggs(...) API Call 之間加入間隔,並且只能抓取 2 年以內的資料。
# Excerpt only: the parts marked "......" are elided in the original text.
class PolygonIOCore:
    ......
    # Collects the aggregate bars yielded by self.client.list_aggs for one
    # ticker, date range and bar size (timespan x multiplier).
    def fetch(self,
              ticker: str,
              start_date_str: str,
              end_date_str: str,
              timespan: str,
              multiplier: int = 1) -> pd.DataFrame:
        ......
        for a in self.client.list_aggs(ticker=ticker,
                                       multiplier=multiplier,
                                       timespan=timespan,
                                       from_=start_date_str,
                                       to=end_date_str,
                                       limit=5000):
            aggs.append(a)
        ### Free-account throttle: sleep between API calls.
        ### NOTE(review): the original note said "5 API calls per second",
        ### but a 12 s pause corresponds to 5 calls per minute (60 / 5) --
        ### confirm against the account's actual rate limit.
        ### NOTE(review): the paste lost its indentation; the sleep is
        ### assumed to run once after the loop, not once per bar -- confirm.
        time.sleep(12)
開始抓取 S&P500 以及 DOW30 的成分股,一小時框,近五年內的資料。
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
在漫長的下載後,成功下載了所有股票,結果後來發現 DOW30 的成分股全都是 S&P500 的成分股,所以實際上總共抓了 503 支股票(log 中顯示的 533 是因為 ticker_list 沒有去除重複,503 + 30 = 533)。不過看過下載完的 1 hour 股價後,發現還要解決的問題其實不少。
從下面的log可以看到,這幾支股票,一樣的抓取條件,但抓到的筆數相差卻非常大
Data for MMM fetched successfully. 14954 rows.
Data for AOS fetched successfully. 10458 rows.
Data for ABT fetched successfully. 13442 rows.
Data for ABBV fetched successfully. 15239 rows.
0 2019-09-23 08:00:00+00:00 139.1472 139.1472 139.1472 139.1472 119.600 MMM
0 2019-09-23 13:00:00+00:00 47.000 47.48 46.860 47.425 55140.0 AOS
0 2019-09-23 12:00:00+00:00 83.5800 83.580 83.5800 83.5800 731.0 ABT
0 2019-09-23 11:00:00+00:00 72.5000 73.0500 72.500 73.0500 640.0 ABBV
目前認為原因應該是盤前盤後的交易所造成,而每支股票的盤前盤後交易狀況不同。這樣的資料如果要用來訓練跟測試,至少需要先把數據對齊才有辦法使用。
以下是部分log
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
Fetching data for MMM (1/533)...
Data saved to ./datasets\PolygonIODownloader\MMM_2019-09-21_2024-09-21_1_hour_raw.csv
Data for MMM fetched successfully. 14954 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 08:00:00+00:00 139.1472 139.1472 139.1472 139.1472 119.600 MMM
1 2019-09-23 12:00:00+00:00 139.7074 139.7074 139.4314 139.4314 241.592 MMM
2 2019-09-23 13:00:00+00:00 138.5033 139.0301 137.5418 138.8880 238668.976 MMM
3 2019-09-23 14:00:00+00:00 138.8462 138.8462 137.8094 138.1773 227292.624 MMM
4 2019-09-23 15:00:00+00:00 138.1886 138.8712 138.1689 138.6622 209298.804 MMM
5 2019-09-23 16:00:00+00:00 138.6455 139.6990 138.3870 139.3311 334236.552 MMM
6 2019-09-23 17:00:00+00:00 139.3729 139.8401 139.1973 139.5987 165931.844 MMM
7 2019-09-23 18:00:00+00:00 139.6237 139.8161 139.5151 139.7324 188353.256 MMM
8 2019-09-23 19:00:00+00:00 139.7324 139.9916 139.4398 139.4482 490236.812 MMM
9 2019-09-23 20:00:00+00:00 139.4314 139.4314 139.4314 139.4314 21472.984 MMM
最後10筆資料:
timestamp open high low close volume ticker
14944 2024-09-20 12:00:00+00:00 133.800 133.8000 133.750 133.750 919.0 MMM
14945 2024-09-20 13:00:00+00:00 133.780 133.7800 132.771 133.215 858654.0 MMM
14946 2024-09-20 14:00:00+00:00 133.205 133.8500 133.020 133.810 414482.0 MMM
14947 2024-09-20 15:00:00+00:00 133.790 134.5142 133.760 134.280 279221.0 MMM
14948 2024-09-20 16:00:00+00:00 134.295 134.7600 134.210 134.445 241361.0 MMM
14949 2024-09-20 17:00:00+00:00 134.460 134.9500 134.290 134.710 442866.0 MMM
14950 2024-09-20 18:00:00+00:00 134.700 134.7800 134.105 134.595 388438.0 MMM
14951 2024-09-20 19:00:00+00:00 134.580 134.8000 134.000 134.760 1187421.0 MMM
14952 2024-09-20 20:00:00+00:00 134.770 135.3300 133.720 134.770 148209.0 MMM
14953 2024-09-20 22:00:00+00:00 134.770 134.7700 134.770 134.770 10357.0 MMM
Fetching data for AOS (2/533)...
Data saved to ./datasets\PolygonIODownloader\AOS_2019-09-21_2024-09-21_1_hour_raw.csv
Data for AOS fetched successfully. 10458 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 13:00:00+00:00 47.000 47.48 46.860 47.425 55140.0 AOS
1 2019-09-23 14:00:00+00:00 47.420 47.42 47.130 47.310 74617.0 AOS
2 2019-09-23 15:00:00+00:00 47.310 47.70 47.240 47.700 104694.0 AOS
3 2019-09-23 16:00:00+00:00 47.710 47.86 47.660 47.770 76278.0 AOS
4 2019-09-23 17:00:00+00:00 47.790 47.87 47.680 47.860 54102.0 AOS
5 2019-09-23 18:00:00+00:00 47.860 47.88 47.710 47.740 59210.0 AOS
6 2019-09-23 19:00:00+00:00 47.735 47.96 47.735 47.740 203440.0 AOS
7 2019-09-23 20:00:00+00:00 47.770 47.77 47.770 47.770 32096.0 AOS
8 2019-09-24 13:00:00+00:00 47.950 48.24 47.650 47.720 72433.0 AOS
9 2019-09-24 14:00:00+00:00 47.715 47.74 47.300 47.680 86794.0 AOS
最後10筆資料:
timestamp open high low close volume ticker
10448 2024-09-19 20:00:00+00:00 84.290 84.2900 84.2900 84.290 7204.0 AOS
10449 2024-09-20 10:00:00+00:00 84.200 84.2400 84.2000 84.240 422.0 AOS
10450 2024-09-20 13:00:00+00:00 84.020 84.1900 82.8200 82.875 127059.0 AOS
10451 2024-09-20 14:00:00+00:00 82.875 83.3800 82.6200 83.130 67117.0 AOS
10452 2024-09-20 15:00:00+00:00 83.070 83.2500 82.8650 83.010 54899.0 AOS
10453 2024-09-20 16:00:00+00:00 83.010 83.1600 82.9026 83.110 40683.0 AOS
10454 2024-09-20 17:00:00+00:00 83.140 83.4150 83.0300 83.110 37185.0 AOS
10455 2024-09-20 18:00:00+00:00 83.070 83.1200 82.7950 83.070 68247.0 AOS
10456 2024-09-20 19:00:00+00:00 83.070 83.2900 82.7200 82.900 282742.0 AOS
10457 2024-09-20 20:00:00+00:00 82.890 82.8916 82.8900 82.890 46638.0 AOS
Fetching data for ABT (3/533)...
Data saved to ./datasets\PolygonIODownloader\ABT_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABT fetched successfully. 13442 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 12:00:00+00:00 83.5800 83.580 83.5800 83.5800 731.0 ABT
1 2019-09-23 13:00:00+00:00 83.6000 83.630 83.1900 83.4200 199670.0 ABT
2 2019-09-23 14:00:00+00:00 83.4300 83.590 83.2700 83.4800 198560.0 ABT
3 2019-09-23 15:00:00+00:00 83.4800 83.720 83.4200 83.6496 210529.0 ABT
4 2019-09-23 16:00:00+00:00 83.6323 83.664 83.4500 83.4800 126953.0 ABT
5 2019-09-23 17:00:00+00:00 83.4900 83.490 83.1300 83.1600 168024.0 ABT
6 2019-09-23 18:00:00+00:00 83.1732 83.310 83.1732 83.2600 156499.0 ABT
7 2019-09-23 19:00:00+00:00 83.2500 83.430 83.1600 83.1700 465495.0 ABT
8 2019-09-23 20:00:00+00:00 83.1600 83.160 83.1600 83.1600 158481.0 ABT
9 2019-09-23 21:00:00+00:00 83.2000 83.200 83.1600 83.1600 714.0 ABT
最後10筆資料:
timestamp open high low close volume ticker
13432 2024-09-20 12:00:00+00:00 114.360 114.3600 114.3600 114.3600 152.0 ABT
13433 2024-09-20 13:00:00+00:00 113.620 114.1300 113.2000 113.8700 1383365.0 ABT
13434 2024-09-20 14:00:00+00:00 113.950 114.1000 113.5550 113.8350 575450.0 ABT
13435 2024-09-20 15:00:00+00:00 113.825 114.1200 113.6950 113.7000 416296.0 ABT
13436 2024-09-20 16:00:00+00:00 113.690 113.8000 113.5500 113.6700 236489.0 ABT
13437 2024-09-20 17:00:00+00:00 113.680 114.1600 113.6750 114.1110 324759.0 ABT
13438 2024-09-20 18:00:00+00:00 114.120 114.1500 113.5650 113.7150 308884.0 ABT
13439 2024-09-20 19:00:00+00:00 113.710 113.9100 113.4420 113.6900 1218445.0 ABT
13440 2024-09-20 20:00:00+00:00 113.700 113.7016 113.6300 113.6300 130750.0 ABT
13441 2024-09-20 22:00:00+00:00 113.700 113.7000 113.3319 113.3319 388.0 ABT
Fetching data for ABBV (4/533)...
Data saved to ./datasets\PolygonIODownloader\ABBV_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABBV fetched successfully. 15239 rows.
前10筆資料:
timestamp open high low close volume ticker
0 2019-09-23 11:00:00+00:00 72.5000 73.0500 72.500 73.0500 640.0 ABBV
1 2019-09-23 12:00:00+00:00 73.3500 73.3500 72.390 72.8600 3817.0 ABBV
2 2019-09-23 13:00:00+00:00 72.9000 72.9611 72.050 72.7817 717311.0 ABBV
3 2019-09-23 14:00:00+00:00 72.7600 73.3000 72.760 73.0850 859381.0 ABBV
4 2019-09-23 15:00:00+00:00 73.0900 73.5400 73.010 73.4100 880159.0 ABBV
5 2019-09-23 16:00:00+00:00 73.4174 73.6800 73.360 73.5800 801864.0 ABBV
6 2019-09-23 17:00:00+00:00 73.5800 73.7800 73.570 73.7500 702725.0 ABBV
7 2019-09-23 18:00:00+00:00 73.7490 73.7600 73.325 73.3300 1091298.0 ABBV
8 2019-09-23 19:00:00+00:00 73.3300 73.3900 72.910 72.9700 1556149.0 ABBV
9 2019-09-23 20:00:00+00:00 72.9700 73.2000 72.910 73.1000 249423.0 ABBV
最後10筆資料:
timestamp open high low close volume ticker
15229 2024-09-20 13:00:00+00:00 193.000 193.5000 192.13 192.9550 1307749.0 ABBV
15230 2024-09-20 14:00:00+00:00 193.130 193.8210 192.89 193.1300 597094.0 ABBV
15231 2024-09-20 15:00:00+00:00 193.100 193.2600 192.58 192.6550 317190.0 ABBV
15232 2024-09-20 16:00:00+00:00 192.640 193.0100 192.41 192.9801 196176.0 ABBV
15233 2024-09-20 17:00:00+00:00 193.005 193.6100 192.99 193.4150 206329.0 ABBV
15234 2024-09-20 18:00:00+00:00 193.410 193.4400 192.56 192.8300 778455.0 ABBV
15235 2024-09-20 19:00:00+00:00 192.830 193.5550 192.60 193.4600 879074.0 ABBV
15236 2024-09-20 20:00:00+00:00 193.470 193.4716 193.00 193.4700 455551.0 ABBV
15237 2024-09-20 21:00:00+00:00 193.280 193.2900 193.28 193.2900 510.0 ABBV
15238 2024-09-20 22:00:00+00:00 193.470 193.4700 193.47 193.4700 1311.0 ABBV
像是'BRK.B'從2019-09-30以後才開始有資料,不知道為啥,應該不可能是那時候才上市,可能有其他原因。
timestamp open high low close volume ticker
0 2019-09-30 09:00:00+00:00 207.750 207.75 207.75 207.75 100.0 BRK.B
1 2019-09-30 11:00:00+00:00 207.550 207.69 207.55 207.63 5824.0 BRK.B
2 2019-09-30 12:00:00+00:00 207.600 207.60 207.60 207.60 707.0 BRK.B
3 2019-09-30 13:00:00+00:00 207.800 208.30 207.77 207.95 216762.0 BRK.B
4 2019-09-30 14:00:00+00:00 207.945 208.00 207.55 207.96 221958.0 BRK.B
所以接下來我需要花時間研究一下,如何處理、對齊這些資料,並把這些資料切分成train和trade兩個部分,之後進行訓練。