前一篇文章用到了一個工具：
使用 pyquery 解析 HTML 的 table，並存成 DataFrame。
pip install pyquery
pip install htmltable-df
<table width="100%" border="5" bordercolor="#FF6600" bgcolor="#FFFFFF">
    <tbody>
    <tr>
        <th class="tt" colspan="2"> </th>
        <th class="tt" colspan="5">營業收入</th>
        <th class="tt" colspan="3">累計營業收入</th>
        <th rowspan="2" class="tt">備註</th>
    </tr>
    <tr>
        <th class="tt">公司<br>代號</th>
        <th class="tt">公司名稱</th>
        <th class="tt">當月營收</th>
        <th class="tt">上月營收</th>
        <th class="tt">去年當月營收</th>
        <th class="tt">上月比較<br>增減(%)</th>
        <th class="tt">去年同月<br>增減(%)</th>
        <th class="tt">當月累計營收</th>
        <th class="tt">去年累計營收</th>
        <th class="tt">前期比較<br>增減(%)</th>
    </tr>
    <tr align="right">
        <td align="center">1101</td>
        <td align="left">台泥</td>
        <td nowrap=""> 10,757,628</td>
        <td nowrap=""> 11,539,982</td>
        <td nowrap=""> 7,858,569</td>
        <td nowrap=""> -6.77</td>
        <td nowrap=""> 36.89</td>
        <td nowrap=""> 57,500,244</td>
        <td nowrap=""> 45,893,851</td>
        <td nowrap=""> 25.28</td>
        <td align="center">-</td>
    </tr>
    <!-- 其餘資料列省略 -->
    </tbody>
</table>
from htmltable_df.extractor import Extractor
#html可以是PyQuery物件或str
extractor = Extractor(html)
extractor.df()
輸出結果：
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>公司代號</th>
      <th>公司名稱</th>
      <th>營業收入_當月營收</th>
      <th>營業收入_上月營收</th>
      <th>營業收入_去年當月營收</th>
      <th>營業收入_上月比較增減(%)</th>
      <th>營業收入_去年同月增減(%)</th>
      <th>累計營業收入_當月累計營收</th>
      <th>累計營業收入_去年累計營收</th>
      <th>累計營業收入_前期比較增減(%)</th>
      <th>備註</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1101</td>
      <td>台泥</td>
      <td>10757628</td>
      <td>11539982</td>
      <td>7858569</td>
      <td>-6.77</td>
      <td>36.89</td>
      <td>57500244</td>
      <td>45893851</td>
      <td>25.28</td>
      <td>-</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1102</td>
      <td>亞泥</td>
      <td>7549925</td>
      <td>7698165</td>
      <td>5331442</td>
      <td>-1.92</td>
      <td>41.61</td>
      <td>39010235</td>
      <td>28812149</td>
      <td>35.39</td>
      <td>-</td>
    </tr>
  </tbody>
</table>
# 也可以指定到第 2 列為止都是 header，結果一樣
extractor.df(header=2)