import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# First attempt at loading the train/test CSV files, using pandas' default header handling.
# NOTE(review): no `header` argument is given, so the first row of each file is consumed
# as column names and one data row is lost (the shapes printed below are 9999 and 1999
# rows). The files are read again with header=None further down in this notebook.
df_train = pd.read_csv("./input_train.csv")
df_test = pd.read_csv("./input_test.csv")


# Print the (rows, columns) shape of the training frame, then preview its first rows.
print(df_train.shape)
df_train.head()

(9999, 5)


# Print the (rows, columns) shape of the test frame, then preview its first rows.
print(df_test.shape)
df_test.head()

(1999, 5)


# Re-read both CSV files as header-less data so the first row is kept as a sample
# and the columns are labelled with integer positions.
df_train, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("./input_train.csv", "./input_test.csv")
)


# Confirm the training frame's shape after the header-less reload and preview its first rows.
print(df_train.shape)
df_train.head()

(10000, 5)


# Confirm the test frame's shape after the header-less reload and preview its first rows.
print(df_test.shape)
df_test.head()

(2000, 5)


# Pairwise overview of every training column:
# scatter plots above the diagonal, KDE curves on the diagonal and KDE plots below it.
g = sns.PairGrid(df_train)
g.map_upper(sns.scatterplot)
g.map_diag(sns.kdeplot)
g.map_lower(sns.kdeplot)

<seaborn.axisgrid.PairGrid at 0x2312e693da0>


# Standardize column 4 of the training data; the other columns are left as loaded.
# The work is done on a copy so df_train keeps the raw values for later comparison.
df_norm = df_train.copy()
scaler = StandardScaler()
# StandardScaler expects a 2-D array, so the single column is reshaped to (n, 1) for
# fitting and flattened back to 1-D before being written into the frame.
# `scaler` stays fitted on the training data so it can be reused on the test set.
df_norm[4] = scaler.fit_transform(np.array(df_norm[4]).reshape(-1, 1)).reshape(-1)


# Overlay histograms of column 4 before and after standardization, using 0.5-wide bins
# that span each series' own min..max range.
fig = plt.figure(figsize=(16, 5))
for series, tag in ((df_train[4], 'default'), (df_norm[4], 'normalized')):
    bin_edges = np.arange(series.min(), series.max() + 0.5, 0.5)
    plt.hist(series, bins=bin_edges, label=tag, alpha=1 / 2)
plt.legend()

<matplotlib.legend.Legend at 0x231328bab70>


# K-means model configured for two clusters; every other setting is left at the library default.
# NOTE(review): no random_state is set, so cluster labels may differ between runs — confirm
# whether reproducibility matters here.
model = KMeans(n_clusters = 2)


# Fit K-means on the normalized training features and keep the assigned label next to
# each row in a separate frame (df_norm itself is left without the label column).
train_labels = model.fit_predict(df_norm)
df_fit = df_norm.assign(cluster=train_labels)


# Display the fitted cluster centres (one row per cluster, one column per input feature).
model.cluster_centers_

array([[-7.76717045e-03, -6.96423899e-01,  1.09249921e+00,
        -5.26539542e-03, -7.18284061e-04],
       [-2.56455209e-02,  5.90663263e-01, -9.25675071e-01,
        -2.09290748e-03,  5.98460322e-04]])


# Fit a two-component PCA on the five feature columns (the 'cluster' label column is
# left out) and project the training rows onto it for 2-D plotting.
feature_columns = df_train.columns[:5]
pca = PCA(n_components=2)  # number of principal components to keep
principalComponents = pca.fit_transform(df_fit[feature_columns])


# Project the K-means cluster centres into the fitted PCA space so they can be drawn
# on the same axes as the projected samples.
means = pca.transform(model.cluster_centers_)


# Training data in PCA space: faint per-cluster scatter, the projected cluster centres
# on top (these carry the legend entries), and a density contour of all points.
plt.figure(figsize=(10, 10))
cluster_styles = ((0, 'b', '"0"'), (1, 'r', '"1"'))
for cluster_id, color, _ in cluster_styles:
    members = df_fit['cluster'] == cluster_id
    plt.scatter(x=principalComponents[members, 0], y=principalComponents[members, 1], c=color, alpha=1 / 10)
for cluster_id, color, legend_text in cluster_styles:
    plt.scatter(x=means[cluster_id, 0], y=means[cluster_id, 1], c=color, label=legend_text)
sns.kdeplot(x=principalComponents[:, 0], y=principalComponents[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

<matplotlib.legend.Legend at 0x231329495f8>


# Scale column 4 of the test data with the statistics learned from the training data.
# The scaler is deliberately NOT re-fitted here: calling fit_transform on the test set
# would scale it with its own mean/std, so the features would no longer be on the scale
# the K-means model was trained on (and test statistics would leak into preprocessing).
df_norm = df_test.copy()
# StandardScaler expects a 2-D array: reshape to (n, 1), then flatten the result back to 1-D.
df_norm[4] = scaler.transform(np.array(df_norm[4]).reshape(-1, 1)).reshape(-1)


# Assign each normalized test row to one of the already-fitted clusters and keep the
# label next to the row in a separate frame.
test_labels = model.predict(df_norm)
df_fit = df_norm.assign(cluster=test_labels)


# Project the test rows with the PCA fitted earlier (transform only, no re-fit); the
# 'cluster' label column is excluded by selecting the first five columns.
principalComponents = pca.transform(df_fit[df_train.columns[:5]])


# Test data in PCA space: faint per-cluster scatter, the projected cluster centres
# (computed earlier as `means`) on top, and a density contour of all points.
plt.figure(figsize=(10, 10))
cluster_styles = ((0, 'b', '"0"'), (1, 'r', '"1"'))
for cluster_id, color, _ in cluster_styles:
    members = df_fit['cluster'] == cluster_id
    plt.scatter(x=principalComponents[members, 0], y=principalComponents[members, 1], c=color, alpha=1 / 10)
for cluster_id, color, legend_text in cluster_styles:
    plt.scatter(x=means[cluster_id, 0], y=means[cluster_id, 1], c=color, label=legend_text)
sns.kdeplot(x=principalComponents[:, 0], y=principalComponents[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

<matplotlib.legend.Legend at 0x231329f1f98>


# Write the raw (un-normalized) test rows together with their predicted cluster label
# to a CSV file, without the row index.
output = df_test.assign(cluster=df_fit['cluster'])
output.to_csv("output_test.csv", index=False)

	1.078734452	1.684207748	-1.697788431	1.648822418	13.00967663
0	0.054549	0.040486	0.976355	0.557014	-9.760446
1	1.192383	-1.471495	1.522287	-1.294598	-5.511185
2	-0.173229	0.716604	-0.934982	1.043867	17.703281
3	-0.619846	-0.365162	-1.487529	-0.102718	-5.883243
4	1.585655	0.728519	-0.237613	0.944235	13.855260

	-1.067544264	1.959152616	-2.279075869	0.978654322	-10.55106835
0	-1.553500	-1.542898	0.979257	-0.815807	11.912946
1	0.860294	0.916555	0.613720	-0.097540	26.607359
2	-0.015839	1.279196	-1.161477	0.362325	-15.563002
3	-0.063135	-0.337933	0.987439	0.223840	-7.202818
4	0.526858	0.191385	-0.905205	-0.368720	14.430606

	0	1	2	3	4
0	1.078734	1.684208	-1.697788	1.648822	13.009677
1	0.054549	0.040486	0.976355	0.557014	-9.760446
2	1.192383	-1.471495	1.522287	-1.294598	-5.511185
3	-0.173229	0.716604	-0.934982	1.043867	17.703281
4	-0.619846	-0.365162	-1.487529	-0.102718	-5.883243

	0	1	2	3	4
0	-1.067544	1.959153	-2.279076	0.978654	-10.551068
1	-1.553500	-1.542898	0.979257	-0.815807	11.912946
2	0.860294	0.916555	0.613720	-0.097540	26.607359
3	-0.015839	1.279196	-1.161477	0.362325	-15.563002
4	-0.063135	-0.337933	0.987439	0.223840	-7.202818

티스토리

KMeans 군집화 연습 (약간의 데이터 분석을 끼얹은)

KMeans 군집화 연습 (약간의 데이터 분석을 끼얹은)

들어가며¶

라이브러리 불러오기¶

데이터 불러오기 및 확인¶

간단한 데이터 분석¶

데이터 전처리¶

군집화¶

테스트¶