Wednesday, March 2, 2016

Running a k-means Cluster Analysis (Python)

Introduction:

Dataset: "gapminder.csv"

Variables: All variables within the dataset


Results:

Python Output 1:
-------------
According to the graph, I will choose the 2-cluster or 4-cluster solution as the possible solutions for the k-means analysis, based on the elbow method.

Python Output 2:
--------------

The two clusters are separated to an appropriate extent.




Python Output 4:
----------
The 4 clusters are also clearly separated.

The following are the mean values of the training dataset:

Python Output 5:
Clustering variable means by 4-cluster solution
         level_0    index  polityscore  incomeperperson  alcconsumption  \
cluster                                                                  
0          20.00   96.900    -0.491388        -0.804625       -0.376205  
1          22.75  102.875     0.471166         0.594527        0.428931  

         armedforcesrate  breastcancerper100th  co2emissions  \
cluster                                                      
0               0.075222             -0.745935     -0.167833  
1              -0.295073              0.611362     -0.086379  

         femaleemployrate   hivrate  internetuserate  lifeexpectancy  \
cluster                                                              
0               -0.009326  0.245639        -0.843254       -0.763695  
1                0.082395 -0.159996         0.729558        0.740224  

         oilperperson  relectricperperson  suicideper100th  employrate  \
cluster                                                                
0           -0.483513           -0.657739        -0.222527    0.105981  
1            0.186257            0.427704        -0.043312   -0.018832  

         urbanrate
cluster            
0        -0.523812
1         0.420491


Python Code:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Load the Gapminder dataset (one row per country).
data = pd.read_csv("gapminder.csv")

# Coerce every analysis variable to numeric; entries that cannot be parsed
# (e.g. empty strings) become NaN so they can be dropped below.
NUMERIC_VARS = ['polityscore', 'incomeperperson', 'alcconsumption',
                'armedforcesrate', 'breastcancerper100th', 'co2emissions',
                'femaleemployrate', 'hivrate', 'internetuserate',
                'lifeexpectancy', 'oilperperson', 'relectricperperson',
                'suicideper100th', 'employrate', 'urbanrate']
for var in NUMERIC_VARS:
    data[var] = pd.to_numeric(data[var], errors='coerce')

# Listwise deletion: keep only countries with complete data on all variables.
data_clean = data.dropna()

# Subset the clustering variables from the cleaned dataset.
cluster = data_clean[['polityscore', "incomeperperson", "alcconsumption",
                      "armedforcesrate", "breastcancerper100th",
                      "co2emissions", "femaleemployrate", "hivrate",
                      "internetuserate", "lifeexpectancy", "oilperperson",
                      "relectricperperson", "suicideper100th", "employrate",
                      "urbanrate"]]

clustervar = cluster.copy()

# Standardize every clustering variable to mean 0 and sd 1 so that no single
# variable dominates the Euclidean distances used by k-means.
# (preprocessing is already imported at the top of the file.)
for var in clustervar.columns:
    clustervar[var] = preprocessing.scale(clustervar[var].astype('float64'))

# 80/20 train/test split; fixed seed keeps the split reproducible.
clus_train, clus_test = train_test_split(clustervar, test_size=0.2, random_state=23)

# --- Elbow plot: average within-cluster distance vs. number of clusters ---
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    # Average distance from each observation to its nearest cluster center.
    # (The original also computed an unused model.predict(clus_train) here.)
    meandist.append(
        np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'),
               axis=1).mean())

plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')

# --- Solution 1: interpret the 2-cluster k-means solution -----------------
model1 = KMeans(n_clusters=2)
model1.fit(clus_train)
# BUG FIX: the original called model.predict — the leftover 9-cluster model
# from the elbow loop — so the assignments did not come from model1.
clusassign = model1.predict(clus_train)

# Project the standardized variables onto the first two principal components
# so the cluster separation can be visualized in 2-D.
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model1.labels_)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 2 Clusters')
plt.show()



# --- Solution 2: visualize the 4-cluster k-means solution -----------------
model2 = KMeans(n_clusters=4)
model2.fit(clus_train)
clusassign = model2.predict(clus_train)

# Reduce the standardized variables to two principal components so the four
# clusters can be inspected on a flat scatterplot.
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0],
            y=plot_columns[:, 1],
            c=model2.labels_)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 4 Clusters')
plt.show()

# --- Mean clustering-variable values per cluster (4-cluster solution) -----
# Attach each training observation's cluster label, then average the
# standardized variables within each cluster.
clus_train.reset_index(level=0, inplace=True)
cluslist2 = list(clus_train['index'])
labels2 = list(model2.labels_)
# BUG FIX: the original zipped the undefined names `cluslist` and `labels`
# (a NameError at runtime); use the lists built just above.
newlist2 = dict(zip(cluslist2, labels2))
newclus2 = DataFrame.from_dict(newlist2, orient='index')
newclus2.columns = ['cluster']
newclus2.reset_index(level=0, inplace=True)
merged_train2 = pd.merge(clus_train, newclus2, on='index')
merged_train2.head(n=100)
merged_train2.cluster.value_counts()

clustergrp2 = merged_train2.groupby('cluster').mean()
# BUG FIX: the banner said "7-cluster" but model2 was fit with 4 clusters.
print("Clustering variable means by 4-cluster solution")
print(clustergrp2)