import numpy as np
def make_n_dim_norm_array(dim=2, sample_size=100, mu1=1, mu2=5):
    """Return two unit-variance Gaussian clusters stacked into one matrix.

    Args:
        dim: Number of features (columns) of every sample.
        sample_size: Number of samples drawn for each of the two clusters.
        mu1: Mean of every feature of the first cluster.
        mu2: Mean of every feature of the second cluster.

    Returns:
        Array of shape (2 * sample_size, dim). The first ``sample_size`` rows
        are drawn around ``mu1`` and the remaining rows around ``mu2``.
    """
    # Draw every coordinate independently. Repeating a single 1-D draw `dim`
    # times (``[draw] * dim``) would make all features identical copies.
    xy1 = np.random.normal(mu1, 1, (sample_size, dim))
    xy2 = np.random.normal(mu2, 1, (sample_size, dim))
    # Stack row-wise so each row stays one complete sample. Reshaping a
    # (dim, 2 * sample_size) array into (2 * sample_size, dim) is not a
    # transpose: it would mix values of different samples into one row.
    return np.vstack([xy1, xy2])
import time
# Benchmark: mean fit_transform time of t-SNE vs. PCA (both reducing to two
# components) while the input dimensionality grows and the per-cluster
# sample size stays fixed.
for i in range(0, 5):
    dim = 3 + 3 * i
    sample_size = 200
    xy = make_n_dim_norm_array(
        dim=dim, sample_size=sample_size, mu1=1, mu2=2 + i
    )
    iter_num = 3  # number of timed runs averaged per estimator

    # time.perf_counter() is monotonic and high-resolution. time.time() can
    # report 0.0 elapsed for a very short fit on a coarse clock, which would
    # make the ratio printed below divide by zero.
    TSNE_total = 0
    for j in range(0, iter_num):
        TSNE_time = time.perf_counter()
        TSNE(n_components=2).fit_transform(xy)
        TSNE_time = time.perf_counter() - TSNE_time
        TSNE_total += TSNE_time

    PCA_total = 0
    for j in range(0, iter_num):
        PCA_time = time.perf_counter()
        PCA(n_components=2).fit_transform(xy)
        PCA_time = time.perf_counter() - PCA_time
        PCA_total += PCA_time

    # Turn the accumulated totals into per-run means.
    TSNE_total /= iter_num
    PCA_total /= iter_num
    print("TSNE_mean: {}, PCA_mean: {}".format(round(TSNE_total, 2), round(PCA_total, 2)))
    # The reported ratio is t-SNE time divided by PCA time.
    print("dim: {}, sample_size: {}, how faster? {}".format(dim, sample_size, round(TSNE_total/PCA_total,2)))
TSNE_mean: 9.16, PCA_mean: 0.0
dim: 3, sample_size: 200, how faster? 7494.0
TSNE_mean: 9.49, PCA_mean: 0.0
dim: 6, sample_size: 200, how faster? 18506.24
TSNE_mean: 9.06, PCA_mean: 0.0
dim: 9, sample_size: 200, how faster? 9407.07
TSNE_mean: 8.93, PCA_mean: 0.0
dim: 12, sample_size: 200, how faster? 1894.6
TSNE_mean: 8.45, PCA_mean: 0.01
dim: 15, sample_size: 200, how faster? 1650.62
import time
import matplotlib.pyplot as plt
# Benchmark: mean fit_transform time of t-SNE vs. PCA at a fixed
# dimensionality while the per-cluster sample size grows, then plot the
# t-SNE timings against sample size.
plt_lst = []  # (sample_size, mean t-SNE seconds) pairs, one per configuration
for i in range(0, 5):
    dim = 3
    sample_size = 100 + i * 1000
    xy = make_n_dim_norm_array(
        dim=dim, sample_size=sample_size, mu1=1, mu2=2 + i
    )
    iter_num = 1  # number of timed runs averaged per estimator

    # time.perf_counter() is monotonic and high-resolution. time.time() can
    # report 0.0 elapsed for a very short fit on a coarse clock, which would
    # make the ratio printed below divide by zero.
    TSNE_total = 0
    for j in range(0, iter_num):
        TSNE_time = time.perf_counter()
        TSNE(n_components=2).fit_transform(xy)
        TSNE_time = time.perf_counter() - TSNE_time
        TSNE_total += TSNE_time

    PCA_total = 0
    for j in range(0, iter_num):
        PCA_time = time.perf_counter()
        PCA(n_components=2).fit_transform(xy)
        PCA_time = time.perf_counter() - PCA_time
        PCA_total += PCA_time

    # Turn the accumulated totals into per-run means.
    TSNE_total /= iter_num
    PCA_total /= iter_num
    plt_lst.append((sample_size, TSNE_total))
    print("TSNE_mean: {}, PCA_mean: {}".format(round(TSNE_total, 2), round(PCA_total, 2)))
    # The reported ratio is t-SNE time divided by PCA time.
    print("dim: {}, sample_size: {}, how faster? {}".format(dim, sample_size, round(TSNE_total/PCA_total,2)))

plt.figure(figsize=(12, 4))
x_lst = [point[0] for point in plt_lst]
y_lst = [point[1] for point in plt_lst]
plt.plot(x_lst, y_lst, marker='o')
# Straight line between the first and last measurement, drawn as a visual
# reference for how close the growth is to linear.
plt.plot([x_lst[0], x_lst[-1]], [y_lst[0], y_lst[-1]])
plt.savefig("../../assets/images/markdown_img/tsne_time_plotting_20180517.svg")
plt.show()
TSNE_mean: 3.71, PCA_mean: 0.0
dim: 3, sample_size: 100, how faster? 6250.78
TSNE_mean: 62.17, PCA_mean: 0.01
dim: 3, sample_size: 1100, how faster? 5315.78
TSNE_mean: 131.66, PCA_mean: 0.04
dim: 3, sample_size: 2100, how faster? 3086.22
TSNE_mean: 195.78, PCA_mean: 0.02
dim: 3, sample_size: 3100, how faster? 9979.7
TSNE_mean: 279.01, PCA_mean: 0.02
dim: 3, sample_size: 4100, how faster? 12288.78
import time
# Benchmark: mean PCA fit_transform time (reducing to two components) at a
# fixed dimensionality of 100 while the per-cluster sample size grows.
plt_lst = []  # (sample_size, mean PCA seconds) pairs, one per configuration
for i in range(0, 10):
    dim = 100
    sample_size = 10000 * (i + 1)
    xy = make_n_dim_norm_array(
        dim=dim, sample_size=sample_size, mu1=1, mu2=2 + i
    )
    iter_num = 3  # number of timed runs averaged per configuration

    # Start every configuration from zero. Without this reset the total would
    # begin at whatever value an earlier run left behind, and each averaged
    # result would leak into the next configuration's mean.
    PCA_total = 0
    for j in range(0, iter_num):
        # perf_counter() is the monotonic, high-resolution benchmark clock.
        PCA_time = time.perf_counter()
        PCA(n_components=2).fit_transform(xy)
        PCA_time = time.perf_counter() - PCA_time
        PCA_total += PCA_time

    # Turn the accumulated total into a per-run mean.
    PCA_total /= iter_num
    plt_lst.append((sample_size, PCA_total))
    print("dim: {}, sample_size: {}, PCA_mean? {}".format(dim, sample_size, round(PCA_total,2)))
dim: 100, sample_size: 10000, PCA_mean? 1.59
dim: 100, sample_size: 20000, PCA_mean? 0.98
dim: 100, sample_size: 30000, PCA_mean? 0.89
dim: 100, sample_size: 40000, PCA_mean? 1.04
dim: 100, sample_size: 50000, PCA_mean? 1.24
dim: 100, sample_size: 60000, PCA_mean? 1.48
dim: 100, sample_size: 70000, PCA_mean? 1.96
dim: 100, sample_size: 80000, PCA_mean? 3.08
dim: 100, sample_size: 90000, PCA_mean? 3.08
dim: 100, sample_size: 100000, PCA_mean? 3.67
댓글남기기