Wednesday, March 2, 2016

Running a k-means Cluster Analysis (Python)

Introduction:

Dataset: "gapminder.csv"

Variables: All variables within the dataset


Results:

Python Output 1:
-------------
According to the graph, I will choose the 2-cluster or 4-cluster solution as the possible solutions for the k-means analysis, based on the elbow method.

Python Output 2:
--------------

The two clusters are separated to an appropriate extent.




Python Output 4:
----------
The 4 clusters are also clearly separated.

The following are the mean values of the training dataset:

Python Output 5:
Clustering variable means by 4-cluster solution
         level_0    index  polityscore  incomeperperson  alcconsumption  \
cluster                                                                  
0          20.00   96.900    -0.491388        -0.804625       -0.376205  
1          22.75  102.875     0.471166         0.594527        0.428931  

         armedforcesrate  breastcancerper100th  co2emissions  \
cluster                                                      
0               0.075222             -0.745935     -0.167833  
1              -0.295073              0.611362     -0.086379  

         femaleemployrate   hivrate  internetuserate  lifeexpectancy  \
cluster                                                              
0               -0.009326  0.245639        -0.843254       -0.763695  
1                0.082395 -0.159996         0.729558        0.740224  

         oilperperson  relectricperperson  suicideper100th  employrate  \
cluster                                                                
0           -0.483513           -0.657739        -0.222527    0.105981  
1            0.186257            0.427704        -0.043312   -0.018832  

         urbanrate
cluster            
0        -0.523812
1         0.420491


Python Code:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Load the Gapminder dataset (one row per country).
data = pd.read_csv("gapminder.csv")

# Coerce every analysis variable to numeric; entries that cannot be parsed
# (e.g. empty strings) become NaN so they can be dropped below.
NUMERIC_VARS = ['polityscore', 'incomeperperson', 'alcconsumption',
                'armedforcesrate', 'breastcancerper100th', 'co2emissions',
                'femaleemployrate', 'hivrate', 'internetuserate',
                'lifeexpectancy', 'oilperperson', 'relectricperperson',
                'suicideper100th', 'employrate', 'urbanrate']
for var in NUMERIC_VARS:
    data[var] = pd.to_numeric(data[var], errors='coerce')

# Listwise deletion: keep only countries with complete data on all variables.
data_clean = data.dropna()

# Subset the clustering variables from the cleaned dataset.
cluster = data_clean[['polityscore', "incomeperperson", "alcconsumption",
                      "armedforcesrate", "breastcancerper100th",
                      "co2emissions", "femaleemployrate", "hivrate",
                      "internetuserate", "lifeexpectancy", "oilperperson",
                      "relectricperperson", "suicideper100th", "employrate",
                      "urbanrate"]]

clustervar = cluster.copy()

# Standardize every clustering variable to mean 0 and sd 1 so that no single
# variable dominates the Euclidean distances used by k-means.
# (preprocessing is already imported at the top of the file.)
for var in clustervar.columns:
    clustervar[var] = preprocessing.scale(clustervar[var].astype('float64'))

# 80/20 train/test split; fixed seed keeps the split reproducible.
clus_train, clus_test = train_test_split(clustervar, test_size=0.2, random_state=23)

# --- Elbow plot: average within-cluster distance vs. number of clusters ---
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    # Average distance from each observation to its nearest cluster center.
    # (The original also computed an unused model.predict(clus_train) here.)
    meandist.append(
        np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'),
               axis=1).mean())

plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')

# --- Solution 1: interpret the 2-cluster k-means solution -----------------
model1 = KMeans(n_clusters=2)
model1.fit(clus_train)
# BUG FIX: the original called model.predict — the leftover 9-cluster model
# from the elbow loop — so the assignments did not come from model1.
clusassign = model1.predict(clus_train)

# Project the standardized variables onto the first two principal components
# so the cluster separation can be visualized in 2-D.
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model1.labels_)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 2 Clusters')
plt.show()



# --- Solution 2: visualize the 4-cluster k-means solution -----------------
model2 = KMeans(n_clusters=4)
model2.fit(clus_train)
clusassign = model2.predict(clus_train)

# Reduce the standardized variables to two principal components so the four
# clusters can be inspected on a flat scatterplot.
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0],
            y=plot_columns[:, 1],
            c=model2.labels_)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 4 Clusters')
plt.show()

# --- Mean clustering-variable values per cluster (4-cluster solution) -----
# Attach each training observation's cluster label, then average the
# standardized variables within each cluster.
clus_train.reset_index(level=0, inplace=True)
cluslist2 = list(clus_train['index'])
labels2 = list(model2.labels_)
# BUG FIX: the original zipped the undefined names `cluslist` and `labels`
# (a NameError at runtime); use the lists built just above.
newlist2 = dict(zip(cluslist2, labels2))
newclus2 = DataFrame.from_dict(newlist2, orient='index')
newclus2.columns = ['cluster']
newclus2.reset_index(level=0, inplace=True)
merged_train2 = pd.merge(clus_train, newclus2, on='index')
merged_train2.head(n=100)
merged_train2.cluster.value_counts()

clustergrp2 = merged_train2.groupby('cluster').mean()
# BUG FIX: the banner said "7-cluster" but model2 was fit with 4 clusters.
print("Clustering variable means by 4-cluster solution")
print(clustergrp2)