1

# 版本資訊

python版本3.7(用Spyder IDE開發)
pandas版本0.23.4

# 問題描述

n的值大約是一萬左右，
dataframe中的每個格子是一個浮點數。

# 測試範例

(由於1000個dataframes有點多，測試只生成100個)

``````
import random
import pandas as pd
import time

def random_str(length=4):
    """Return a random string of ``length`` lowercase ASCII letters.

    Each character is drawn with ``random.randrange(26)``, so the result is
    reproducible after ``random.seed`` has been called with a fixed value.
    """
    return ''.join([chr(ord('a') + random.randrange(26)) for _ in range(length)])

def random_df(col_num):
    """Build a one-row DataFrame of random floats under random column names.

    ``col_num`` names are drawn from ``random_str()``. A repeated name
    overwrites the earlier entry, so the frame can end up with fewer than
    ``col_num`` columns.
    """
    data = {}
    for _ in range(col_num):
        key = random_str()
        # A one-element list per key yields exactly one row in the frame.
        data[key] = [random.uniform(0, 1)]
    return pd.DataFrame.from_dict(data)

random.seed(100)  # Fix the seed so every run generates the same data.
datas = [random_df(10000) for _ in range(100)]

# The concat benchmark starts here.
start = time.time()
res = pd.DataFrame()
for d in datas:
    # Append one frame per iteration; this incremental growth is what the
    # benchmark is measuring.
    res = pd.concat([res, d], ignore_index=True)
end = time.time()
print(f"串接dataframes時間: {end-start:4f}")
``````

# 迷你範例- 只串接5個dataframes

``````
import random
import pandas as pd
import time

def random_str(length=4):
    """Return a random string of ``length`` lowercase ASCII letters.

    Characters come from ``random.randrange(26)``, so a fixed
    ``random.seed`` makes the output repeatable.
    """
    return ''.join([chr(ord('a') + random.randrange(26)) for _ in range(length)])

def random_df(col_num):
    """Build a one-row DataFrame of random floats under random column names.

    Column names come from ``random_str()``; when a name repeats, the later
    value replaces the earlier one, so fewer than ``col_num`` columns may
    result.
    """
    data = {}
    for _ in range(col_num):
        key = random_str()
        # Wrapping the value in a list gives the frame a single row.
        data[key] = [random.uniform(0, 1)]
    return pd.DataFrame.from_dict(data)

random.seed(100)  # Fix the seed so every run generates the same data.
datas = [random_df(10000) for _ in range(5)]

# The concat benchmark starts here.
start = time.time()
res = pd.DataFrame()
for d in datas:
    # Append one frame per iteration; this incremental growth is what the
    # benchmark is measuring.
    res = pd.concat([res, d], ignore_index=True)
end = time.time()
print(f"串接dataframes時間: {end-start:4f}")
``````

hokou iT邦新手 4 級 ‧ 2020-09-09 09:28:51 檢舉

### 4 個回答

6
japhenchen
iT邦大師 1 級 ‧ 2020-09-09 09:53:38

i5-3470/8GB/win10 2004/python3.8.5 debug on VisualCode

``````
import random
import pandas as pd
import time

def random_str(length=4):
    """Return a random string of ``length`` lowercase ASCII letters.

    Every character is picked with ``random.randrange(26)``, so seeding
    ``random`` makes the generated names deterministic.
    """
    return ''.join([chr(ord('a') + random.randrange(26)) for _ in range(length)])

def random_df(col_num):
    """Return a dict mapping random column names to one-element float lists.

    Unlike a DataFrame-returning variant, this hands back the plain dict so
    the caller can build a single DataFrame from many of them. A repeated
    name overwrites the earlier entry, so the dict can hold fewer than
    ``col_num`` keys.
    """
    data = {}
    for _ in range(col_num):
        key = random_str()
        # NOTE(review): the value stays wrapped in a list, so a DataFrame
        # built from a list of these dicts stores list objects in its cells.
        data[key] = [random.uniform(0, 1)]
    return data

start = time.time()
random.seed(100)  # Fix the seed so every run generates the same data.
# Generate all row dicts first, then build the DataFrame in one constructor
# call instead of concatenating frame by frame.
rows = [random_df(10000) for _ in range(100)]
datas = pd.DataFrame(rows)

end = time.time()
print(datas)
print(f"組成dataframes時間: {end-start:4f}")

``````

``````2                     NaN                   NaN  ...                    NaN                   NaN
3                     NaN                   NaN  ...                    NaN                   NaN
4                     NaN                   NaN  ...                    NaN                   NaN
..                    ...                   ...  ...                    ...                   ...
95   [0.7715769496453805]                   NaN  ...                    NaN                   NaN
96                    NaN                   NaN  ...                    NaN                   NaN
97                    NaN                   NaN  ...                    NaN                   NaN
98                    NaN                   NaN  ...                    NaN                   NaN
99   [0.0783298473116465]                   NaN  ...  [0.26089702985762075]  [0.4339433542861454]
``````

2
listennn08
iT邦高手 7 級 ‧ 2020-09-09 09:50:12

``````
start = time.time()
res1 = pd.DataFrame()
res2 = pd.DataFrame()
pivot = 0
bpivot = len(datas) - 1
# Walk inwards from both ends: res1 collects frames from the front in order,
# while res2 is built by prepending frames taken from the back.
while pivot < bpivot:
    res1 = pd.concat([res1, datas[pivot]], ignore_index=True)
    res2 = pd.concat([datas[bpivot], res2], ignore_index=True)
    pivot += 1
    bpivot -= 1
if pivot == bpivot:
    # Odd number of frames: both cursors stop on the middle frame, which
    # belongs at the end of the front half.
    res1 = pd.concat([res1, datas[pivot]], ignore_index=True)
# Join the two halves; res2 is already in original order.
res1 = pd.concat([res1, res2], ignore_index=True)
end = time.time()
print(f"串接dataframes時間: {end-start:4f}")
``````

``````
import math
from queue import Queue

def concatList(arr):
    """Concatenate the frames in ``arr`` row-wise and put the result on ``q``.

    The combined frame is delivered through the module-level queue ``q``
    instead of being returned.
    """
    res = pd.DataFrame()
    for d in arr:
        res = pd.concat([res, d], ignore_index=True)
    # Publish once, after every frame has been appended.
    q.put(res)

from threading import Thread

start = time.time()
q = Queue()

# Split point: the worker thread takes the first half, this thread the rest.
half = math.ceil(len(datas) / 2)
# t1 was started and joined without ever being created; build it here so the
# snippet runs. NOTE(review): the original definition is not shown, so which
# half the worker handled is an assumption.
t1 = Thread(target=concatList, args=(datas[:half],))
t1.start()
res1 = pd.DataFrame()
for d in datas[half:]:
    res1 = pd.concat([res1, d], ignore_index=True)
t1.join()

# The worker's result (first half) arrives through the queue.
res2 = q.get()

# First half goes first so the rows keep the order of ``datas``.
res1 = pd.concat([res2, res1], ignore_index=True)
end = time.time()
print(f"串接dataframes時間: {end-start:4f}")
``````

2
I code so I am
iT邦研究生 2 級 ‧ 2020-09-09 09:58:27

``````
import modin.pandas as pd

# Time a single concat call that stacks five references to the same frame.
s = time.time()
df = pd.concat([df] * 5)
e = time.time()
print("Modin Concat Time = {}".format(e-s))
``````

0
lingoo21
iT邦新手 5 級 ‧ 2020-10-04 16:51:04

1000row的時間

``````def random_sr(col_num):
data = {}
for _ in range(col_num):
key = random_str()
data[key] = [random.uniform(0,1)]
return pd.Series(data)
datas = [random_sr(10000) for _ in range(1000)]
# The concat benchmark starts here.
start = time.time()
# Join every Series as a column in one call, then transpose so that each
# Series becomes a row.
r2 = pd.concat(datas, axis=1, ignore_index=True).T
end = time.time()
print(f"串接dataframes時間: {end-start:4f}")

``````
``````
r2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 456976 entries, oitv to ndly
dtypes: object(456976)
memory usage: 3.4+ GB
``````

intel-i7/16g/mac-os