UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
<_io.TextIOWrapper name='test_csv_190114.csv' mode='r' encoding='UTF-8'>
import pandas as pd
import numpy as np
# Build a small random DataFrame. It is written to the same CSV path twice
# below (once per encoding) and read back each time with a plain open().
df = pd.DataFrame({
    'col_a': np.random.random(10),
    'col_b': np.random.random(10),
})

csv_path = "test_csv_190114.csv"
separator = "==" * 20

########################################
# Case 1: the CSV is written as utf-8.
# open() is called without an encoding argument, so the text wrapper uses
# the platform default encoding.
########################################
to_csv_param_dict = dict(path_or_buf=csv_path, sep=',', encoding='utf-8')
df.to_csv(**to_csv_param_dict)
print(separator)
print(f"== df csv file is encoded {to_csv_param_dict['encoding']}")
with open(csv_path) as f:
    print(f"file type is: {f}")
    print(f.readlines())
print(separator)

########################################
# Case 2: the CSV is written as utf-16.
# open() still reports the platform default encoding, not utf-16: it does
# not inspect the file contents. When that default is UTF-8, readlines()
# raises UnicodeDecodeError on the very first byte (0xff).
########################################
to_csv_param_dict = {**to_csv_param_dict, 'encoding': 'utf-16'}
df.to_csv(**to_csv_param_dict)
print(f"== df csv file is encoded {to_csv_param_dict['encoding']}")
with open(csv_path) as f:
    print(f"file type is: {f}")
    print(f.readlines())
========================================
== df csv file is encoded utf-8
file type is: <_io.TextIOWrapper name='test_csv_190114.csv' mode='r' encoding='UTF-8'>
[',col_a,col_b\n', '0,0.013845954945651995,0.9711504967882624\n', '1,0.22623236002583202,0.8325203572048832\n', '2,0.624492205639335,0.08927496566336979\n', '3,0.8551070031229804,0.6534901938649194\n', '4,0.5739514150486329,0.47376986796101206\n', '5,0.8231597457095641,0.7995586032801442\n', '6,0.9593301357288136,0.8635370988998786\n', '7,0.1966328421358593,0.7588163653359546\n', '8,0.1577009039418904,0.7749176617999953\n', '9,0.2998450716697477,0.5567583140954053\n']
========================================
== df csv file is encoded utf-16
file type is: <_io.TextIOWrapper name='test_csv_190114.csv' mode='r' encoding='UTF-8'>
Traceback (most recent call last):
File "pd_csv.py", line 41, in <module>
print(f.readlines())
File "/Users/frhyme/anaconda3/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
import pandas as pd
import numpy as np
import chardet
# Build a small DataFrame from a fixed seed so every run produces the same
# values. It is saved with several encodings and read back below.
np.random.seed(0)
N = 10
df = pd.DataFrame({
    'col_a': np.random.random(N),
    'col_b': np.random.random(N),
    'col_c': ['a'] * N,
})
# Encodings used to write the CSV; chardet then guesses each file's encoding.
encoding_ways = [
    'utf-8',
    'utf-16',
    'utf-32-le',
    'ISO 8859-1',
    'windows-1251',
    'euc-kr',
]
print("=="*30)
for encoding_w in encoding_ways:
    # Write the frame using the current encoding.
    file_name = f"temp_csv_190114_{encoding_w}.csv"
    df.to_csv(file_name, encoding=encoding_w)
    print(f"== encoded as {encoding_w}")
    # Read the file as raw bytes and let chardet.detect guess the encoding
    # from (at most) the first 10000 bytes.
    with open(file_name, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(10000))
    # Show the guess together with the full detection result.
    detected_encoding_way = result['encoding']
    print(f"== detected as {detected_encoding_way} ::: {result}")
    # Read the file back with the guessed encoding and print what was
    # actually read. Printing the in-memory `df` instead would always look
    # correct, even when the guess does not match how the file was written.
    try:
        df_read = pd.read_csv(file_name, encoding=detected_encoding_way)
    except (UnicodeError, pd.errors.ParserError) as err:
        # A wrong guess can make decoding or parsing fail; report it and
        # continue so the remaining encodings are still exercised.
        print(f"== failed to decode as {detected_encoding_way}: {err}")
    else:
        print(f"== decoded as {detected_encoding_way}")
        print(df_read.head(2))
    print("=="*30)
============================================================
== encoded as utf-8
== detected as ascii ::: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
== decoded as ascii
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
== encoded as utf-16
== detected as UTF-16 ::: {'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}
== decoded as UTF-16
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
== encoded as utf-32-le
== detected as ascii ::: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
== decoded as ascii
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
== encoded as ISO 8859-1
== detected as ascii ::: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
== decoded as ascii
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
== encoded as windows-1251
== detected as ascii ::: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
== decoded as ascii
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
== encoded as euc-kr
== detected as ascii ::: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
== decoded as ascii
col_a col_b col_c
0 0.548814 0.791725 a
1 0.715189 0.528895 a
============================================================
댓글남기기