前一篇文章用到了一個工具 htmltable-df,這裡介紹它的用法:
它使用 pyquery 解析 HTML 的 table,並存成 DataFrame。
pip install pyquery
pip install htmltable-df
<table width="100%" border="5" bordercolor="#FF6600" bgcolor="#FFFFFF">
<tbody>
<tr>
<th class="tt" colspan="2"> </th>
<th class="tt" colspan="5">營業收入</th>
<th class="tt" colspan="3">累計營業收入</th>
<th rowspan="2" class="tt">備註</th>
</tr>
<tr>
<th class="tt">公司<br>代號</th>
<th class="tt">公司名稱</th>
<th class="tt">當月營收</th>
<th class="tt">上月營收</th>
<th class="tt">去年當月營收</th>
<th class="tt">上月比較<br>增減(%)</th>
<th class="tt">去年同月<br>增減(%)</th>
<th class="tt">當月累計營收</th>
<th class="tt">去年累計營收</th>
<th class="tt">前期比較<br>增減(%)</th>
</tr>
<tr align="right">
<td align="center">1101</td>
<td align="left">台泥</td>
<td nowrap=""> 10,757,628</td>
<td nowrap=""> 11,539,982</td>
<td nowrap=""> 7,858,569</td>
<td nowrap=""> -6.77</td>
<td nowrap=""> 36.89</td>
<td nowrap=""> 57,500,244</td>
<td nowrap=""> 45,893,851</td>
<td nowrap=""> 25.28</td>
<td align="center">-</td>
</tr>
<!-- 其餘資料列省略 -->
</tbody>
</table>
from htmltable_df.extractor import Extractor
#html可以是PyQuery物件或str
extractor = Extractor(html)
extractor.df()
輸出結果:
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>公司代號</th>
<th>公司名稱</th>
<th>營業收入_當月營收</th>
<th>營業收入_上月營收</th>
<th>營業收入_去年當月營收</th>
<th>營業收入_上月比較增減(%)</th>
<th>營業收入_去年同月增減(%)</th>
<th>累計營業收入_當月累計營收</th>
<th>累計營業收入_去年累計營收</th>
<th>累計營業收入_前期比較增減(%)</th>
<th>備註</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>1101</td>
<td>台泥</td>
<td>10757628</td>
<td>11539982</td>
<td>7858569</td>
<td>-6.77</td>
<td>36.89</td>
<td>57500244</td>
<td>45893851</td>
<td>25.28</td>
<td>-</td>
</tr>
<tr>
<th>1</th>
<td>1102</td>
<td>亞泥</td>
<td>7549925</td>
<td>7698165</td>
<td>5331442</td>
<td>-1.92</td>
<td>41.61</td>
<td>39010235</td>
<td>28812149</td>
<td>35.39</td>
<td>-</td>
</tr>
</tbody>
</table>
# 也可以用 header=2 指定前 2 列都是 header,結果相同
extractor.df(header=2)