# modules we'll use
# modules we'll use
import pandas as pd
import numpy as np
# helpful character encoding module (third-party: guesses a byte stream's likely encoding)
import chardet
# set seed for reproducibility
# NOTE(review): np.random is not used anywhere in this excerpt — seed kept for notebook parity
np.random.seed(0)
Character encodings是指從原始的二進位編碼(例如"01101000010100100")對應到人類可以讀取的文本(例如"hi")的一套規則
若是嘗試使用的編碼和原始的編碼不相同,就會得到混亂文本,稱為mojibake
,例如
æ–‡å—化ã??
當特定字節和讀取字節的編碼中沒有關係時,就會打印出以下內容:
����������
現在字符編碼不匹配的情況已經比較少了,但是還是有一個主要的問題,就是有很多不同的字符編碼,但最需要了解的編碼是UTF-8
UTF-8是標準的文本編碼,所有的python代碼都是UTF-8,理想情況下,所有的資料都應該要是UTF-8。當資料不是UTF-8的時候,就可能會出錯
# start with a string (it contains a non-ASCII character, the euro sign)
before = "This is the euro symbol: €"
# check to see what datatype it is — Python 3 text literals are `str`
type(before)
str
我們可以將str轉為bytes
# encode the str to bytes; errors="replace" substitutes any character that
# can't be encoded (UTF-8 can represent everything, so nothing is lost here)
after = before.encode("utf-8", errors="replace")
# check the type — encoding a str always yields a bytes object
type(after)
bytes
若觀察after的資料,會發現前面多了一個b,那是因為bytes物件會以ASCII字元的形式顯示內容。這邊可以看到€這個符號已經被顯示成類似mojibake
的樣子"\xe2\x82\xac"
# take a look at what the bytes look like — the b'' prefix marks a bytes
# literal, and the euro sign appears as its UTF-8 byte sequence \xe2\x82\xac
after
b'This is the euro symbol: \xe2\x82\xac'
但是當我們再將資料解碼成utf-8的時候,資料就又變成正確的了
# Decode the bytes with the matching codec; the original text round-trips intact.
decoded_back = after.decode("utf-8")
print(decoded_back)
This is the euro symbol: €
若我們再將bytes的資料編碼成ascii,就又會出錯
我們可以將編碼看成是錄製聲音時的不同方式。可以在CD跟卡帶上面錄製相同的音樂,雖然音樂聽起來大致上是相同的,但是必須要用適合的設備來播放。正確的解碼器就像是用CD播放器來播放CD,若是用卡帶播放器就不能播放CD了
# try to decode our bytes with the ascii encoding
# (intentionally fails: byte 0xe2 is outside ASCII's 0-127 range, so this
# raises UnicodeDecodeError — see the traceback below)
print(after.decode("ascii"))
UnicodeDecodeErrorTraceback (most recent call last)
<ipython-input-6-50fd8662e3ae> in <module>
1 # try to decode our bytes with the ascii encoding
----> 2 print(after.decode("ascii"))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 25: ordinal not in range(128)
若我們將資料先編碼為ascii,會造成一些字符沒辦法使用,再將ascii解碼時,就會發現無法使用的字符解碼回來之後也不會恢復原本的樣子
# Start again from the same string containing the euro sign.
before = "This is the euro symbol: €"

# Force it into ASCII; errors="replace" swaps every unencodable character
# for "?", throwing the original underlying bytes away for good.
after = before.encode("ascii", errors="replace")

# Decoding succeeds, but the euro sign is gone: the information was lost
# at encode time and cannot be recovered.
print(after.decode("ascii"))
This is the euro symbol: ?
所以要盡可能避免這樣做,在python中,要盡量將資料保持在UTF-8
確認這件事的最佳時機是在讀取文件的時候,下面會說明要怎麼做
許多文件都會使用UTF-8,這也是Python默認情況下所期望的編碼,大多數情況不會遇到問題,但是有時候會出現以下錯誤
# try to read in a file not in UTF-8
# (intentionally fails: pandas assumes UTF-8 by default and this file is in a
# different encoding — see the UnicodeDecodeError traceback below)
kickstarter_2016 = pd.read_csv("./ks-projects-201612.csv")
UnicodeDecodeErrorTraceback (most recent call last)
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas/_libs/parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte
During handling of the above exception, another exception occurred:
UnicodeDecodeErrorTraceback (most recent call last)
<ipython-input-8-a0f34aff1a4b> in <module>
1 # try to read in a file not in UTF-8
----> 2 kickstarter_2016 = pd.read_csv("./ks-projects-201612.csv")
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
700 skip_blank_lines=skip_blank_lines)
701
--> 702 return _read(filepath_or_buffer, kwds)
703
704 parser_f.__name__ = name
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
433
434 try:
--> 435 data = parser.read(nrows)
436 finally:
437 parser.close()
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, nrows)
1137 def read(self, nrows=None):
1138 nrows = _validate_integer('nrows', nrows)
-> 1139 ret = self._engine.read(nrows)
1140
1141 # May alter columns / col_dict
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, nrows)
1993 def read(self, nrows=None):
1994 try:
-> 1995 data = self._reader.read(nrows)
1996 except StopIteration:
1997 if self._first_chunk:
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.read()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas/_libs/parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte
會發生錯誤較有可能的原因為檔案本身並不是UTF-8編碼,卻被當成UTF-8來讀取
可以透過讀取前面的資料來看出編碼,不需要直接就讀取全部的資料
# Sniff only the first ten thousand raw bytes and let chardet guess the
# encoding — no need to load the whole file for a good estimate.
with open("./ks-projects-201612.csv", 'rb') as handle:
    sample = handle.read(10000)
result = chardet.detect(sample)
# Show chardet's guess together with its confidence level.
print(result)
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
根據前面10000個位元組的資料,chardet有73%的信心猜測編碼為Windows-1252
,因此我們來試試看用Windows-1252
來解碼
# read in the file with the encoding detected by chardet
kickstarter_2016 = pd.read_csv("./ks-projects-201612.csv", encoding='Windows-1252')
# look at the first few lines to sanity-check that the text decoded cleanly
kickstarter_2016.head()
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (13,14,15) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
好不容易將資料轉為UTF-8,現在我們要將資料用csv儲存起來
# save our file (will be saved as UTF-8 by default!)
# NOTE(review): to_csv also writes the row index as an extra column by
# default; pass index=False if that is unwanted — kept as-is here
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")