Saturday, February 27, 2016

Running a Lasso Regression Analysis (Python)

Introduction:

Dataset: "gapminder.csv"

Target Variable: "polityscore"

Predictors: "incomeperperson","alcconsumption","armedforcesrate","breastcancerper100th","co2emissions",
"femaleemployrate","hivrate","internetuserate","lifeexpectancy","oilperperson","relectricperperson",
"suicideper100th","employrate","urbanrate"

Algorithms:

-K-fold cross validation
-Lasso (Least Absolute Shrinkage and Selection Operator) regression, fitted via Least Angle Regression (LARS)

For the target variable, "polityscore", its value ranges from -10 to 10 from the original dataset. Now it is divided into 2 groups:
[-10,0]=0
(0,10]=1

Results:

regression coefficients:

{'alcconsumption': 0.0,
 'armedforcesrate': -0.094224107805211468,
 'breastcancerper100th': 0.0025113191339298656,
 'co2emissions': 0.0,
 'employrate': 0.0,
 'femaleemployrate': 0.0,
 'hivrate': 0.0,
 'incomeperperson': 0.0,
 'internetuserate': 0.01682045898832887,
 'lifeexpectancy': 0.0,
 'oilperperson': 0.0,
 'relectricperperson': 0.0,
 'suicideper100th': 0.0,

 'urbanrate': 0.0}





MSE from training and test data:

training data MSE
0.111268760118
test data MSE
0.112600452979

R-square from training and test data
training data R-square
0.244465249376
test data R-square
-0.0847176970291

Summary:
The chosen predictors are 'armedforcesrate', 'breastcancerper100th' and 'internetuserate' according to the output.

The correlation between "internetuserate" and "polityscore" was studied in my previous exercise. For the other two variables, it is difficult to explain their relationship with "polityscore" in any intuitive sense. In fact, this is a limitation of LASSO regression — and of the "Big Data, Simple Algorithm" approach in general — which seeks to optimize prediction rather than to explain the variables.

Python Code:

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV

#Load the dataset
data = pd.read_csv("gapminder.csv")

# Data Management

# Coerce the target and every predictor column from string to numeric;
# unparseable entries become NaN so they can be dropped below.
# (pd.to_numeric replaces Series.convert_objects, which was deprecated and
# later removed from pandas.)
numeric_columns = ["polityscore", "incomeperperson", "alcconsumption",
                   "armedforcesrate", "breastcancerper100th", "co2emissions",
                   "femaleemployrate", "hivrate", "internetuserate",
                   "lifeexpectancy", "oilperperson", "relectricperperson",
                   "suicideper100th", "employrate", "urbanrate"]
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Keep only complete cases across all columns.
data_clean = data.dropna(axis=0, how='any')

def politysco (row):
   """Recode polityscore into two groups: [-10, 0] -> 0, (0, 10] -> 1."""
   score = row['polityscore']
   if score > 0 and score <= 10:
      return 1
   if score <= 0:
      return 0

# Recode the target to binary using the helper above.  DataFrame.apply passes
# each row directly, so no lambda wrapper is needed.
data_clean['polityscore'] = data_clean.apply(politysco, axis=1)


#select predictor variables and target variable as separate data sets
predvar= data_clean[["incomeperperson","alcconsumption","armedforcesrate",
"breastcancerper100th","co2emissions","femaleemployrate","hivrate",
"internetuserate","lifeexpectancy","oilperperson","relectricperperson",
"suicideper100th","employrate","urbanrate"]]

target = data_clean.polityscore


# standardize predictors to have mean=0 and sd=1
predictors=predvar.copy()
# Safety net only: rows with NaN were already removed by dropna above.
predictors=predictors.dropna()
from sklearn import preprocessing

# Scale every predictor column in one loop instead of fourteen copy-pasted
# calls; behavior per column is identical to the original.
for col in predictors.columns:
    predictors[col] = preprocessing.scale(predictors[col].astype('float64'))


# split data into train and test sets (70/30; fixed seed for reproducibility)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target,
                                                              test_size=.3, random_state=123)

# specify the lasso regression model; LassoLarsCV selects alpha by 10-fold CV
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)

# print variable names and regression coefficients
# (the original built the dict but never displayed it outside an interactive
# session; print it so the results appear when run as a script)
print(dict(zip(predictors.columns, model.coef_)))

# plot coefficient progression
# x-axis is -log10(alpha) so stronger regularization sits on the left
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
# one line per predictor, traced along the LARS regularization path
plt.plot(m_log_alphas, model.coef_path_.T)
# vertical marker at the alpha chosen by cross-validation
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

# plot mean square error for each fold
# NOTE(review): cv_alphas_ and cv_mse_path_ are attributes of older
# scikit-learn releases (consistent with the sklearn.cross_validation import
# used by this script); newer releases expose mse_path_ instead — confirm
# against the installed version.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# one dotted curve per CV fold
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
       

# MSE from training and test data
from sklearn.metrics import mean_squared_error

# Predict once per split, then score the fitted values.
fitted_train = model.predict(pred_train)
fitted_test = model.predict(pred_test)
mse_train = mean_squared_error(tar_train, fitted_train)
mse_test = mean_squared_error(tar_test, fitted_test)
print('training data MSE')
print(mse_train)
print('test data MSE')
print(mse_test)

# R-square from training and test data
r2_train = model.score(pred_train, tar_train)
r2_test = model.score(pred_test, tar_test)
print('training data R-square')
print(r2_train)
print('test data R-square')
print(r2_test)

Sunday, February 7, 2016

Running a Random Forest (Python)

Introduction:

Dataset: gapminder.csv

Predictors: 'internetuserate','urbanrate','employrate','lifeexpectancy','alcconsumption',
'armedforcesrate','breastcancerper100th','co2emissions','femaleemployrate','hivrate'

Targets: 'polityscore'


"polityscore" reflects the democracy level of a country. The score ranges from -10 to 10, where 10 means the country is the most democratic. I divided it into 2 levels: [-10, 0] and (0, 10], which are recoded as 0 and 1 respectively.

Results:

Data Partitioning:

-predictors in training dataset: 10 variables and 76 observations
-predictors in test dataset: 10 variables and 52 observations
-target in training dataset: 1 variable and 76 observations
-target in test dataset: 1 variable and 52 observations

Training-test ratio: 0.6

Confusion matrix for the target_test sample:
[[ 8,  3],
 [ 7, 34]]

Accuracy=0.80769230769230771

Feature-importance score:
[ 0.08430852  0.08336156  0.09066508  0.14997917  0.0512591   0.07579398
  0.11722497  0.07404398  0.15731402  0.11604963]
The score for 'femaleemployrate' is the highest (0.15731402).



Accuracy Scores with different number of trees:





Python Code:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier



data = pd.read_csv("gapminder.csv")

# Coerce the target and all ten predictors to numeric; unparseable entries
# become NaN.  (pd.to_numeric replaces Series.convert_objects, which was
# deprecated and later removed from pandas.)
for col in ['polityscore', 'internetuserate', 'urbanrate', 'employrate',
            'lifeexpectancy', 'alcconsumption', 'armedforcesrate',
            'breastcancerper100th', 'co2emissions', 'femaleemployrate',
            'hivrate']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop incomplete rows, then show structure and summary statistics.
# (Printed explicitly: bare expressions display nothing when run as a script.)
data_clean = data.dropna()
print(data_clean.dtypes)
print(data_clean.describe())

def politysco (row):
   """Binary recode of polityscore: non-positive -> 0, (0, 10] -> 1."""
   value = row['polityscore']
   return 0 if value <= 0 else (1 if value <= 10 else None)
 
# Recode the target to binary via the helper above (no lambda wrapper needed:
# DataFrame.apply passes each row directly).
data_clean['polityscore'] = data_clean.apply(politysco, axis=1)


predictors = data_clean[['internetuserate','urbanrate','employrate','lifeexpectancy','alcconsumption',
'armedforcesrate','breastcancerper100th','co2emissions','femaleemployrate','hivrate']]

targets = data_clean['polityscore']

# 60/40 train/test split (no random_state, so results vary between runs)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

#Build model on training data
from sklearn.ensemble import RandomForestClassifier

classifier=RandomForestClassifier(n_estimators=9)
classifier=classifier.fit(pred_train,tar_train)

predictions=classifier.predict(pred_test)

# Print the evaluation metrics so they appear when this is run as a script
# (the original computed them but discarded the results outside a REPL).
print(sklearn.metrics.confusion_matrix(tar_test,predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))

# Rank predictors with an extra-trees ensemble.
# (Original comment said "Displaying the decision tree", which was misleading:
# this prints feature importances, not a tree.)
model = ExtraTreesClassifier()
model.fit(pred_train,tar_train)
# display the relative importance of each attribute
print(model.feature_importances_)




# Accuracy as a function of the number of trees (n_estimators = 1..9).
tree_counts = range(1, 10)
accuracy = np.zeros(len(tree_counts))

for idx, n_trees in enumerate(tree_counts):
   classifier = RandomForestClassifier(n_estimators=n_trees)
   classifier = classifier.fit(pred_train, tar_train)
   predictions = classifier.predict(pred_test)
   accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

print('Accuracy Scores with different number of trees')
plt.cla()
# BUG FIX: the original plotted accuracy against range(9) (x = 0..8), while
# each point was produced with n_estimators = idx + 1, so the x-axis was off
# by one.  Plot against the actual tree counts instead.
plt.plot(tree_counts, accuracy)








Running a Classification Tree (Python)

Introduction:

Dataset: gapminder.csv

Predictors: 'internetuserate','urbanrate','employrate','lifeexpectancy'

Targets: polityscore

"polityscore" reflects the democracy level of a country. The score ranges from -10 to 10, where 10 means the country is the most democratic. I divided it into 2 levels: [-10, 0] and (0, 10], which are recoded as 0 and 1 respectively.

Results:
Data Partitioning:

-predictors in training dataset: 4 variables and 91 observations
-predictors in test dataset: 4 variables and 61 observations
-target in training dataset: 1 variable and 91 observations
-target in test dataset: 1 variable and 61 observations

Training-test ratio: 0.6

Confusion matrix for the target_test sample:
[[ 6, 18],
 [ 9, 28]]

True Negative=6
True Positive =28
False Negative =9
False Positive=18

Accuracy=0.5901639344262295

Binary Decision  Tree:


Python Code:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics



data = pd.read_csv("gapminder.csv")

# Coerce the target and the four predictors to numeric; unparseable entries
# become NaN.  (pd.to_numeric replaces Series.convert_objects, which was
# deprecated and later removed from pandas.)
for col in ['polityscore', 'internetuserate', 'urbanrate',
            'employrate', 'lifeexpectancy']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop incomplete rows, then show structure and summary statistics.
# (Printed explicitly: bare expressions display nothing when run as a script.)
data_clean = data.dropna()
print(data_clean.dtypes)
print(data_clean.describe())

def politysco (row):
   """Collapse polityscore to binary: [-10, 0] -> 0, (0, 10] -> 1."""
   # Walk the (upper bound, label) pairs in order; the first bound that
   # contains the score decides the label.
   for bound, label in ((0, 0), (10, 1)):
      if row['polityscore'] <= bound:
         return label
 
# Recode the target to binary via the helper above (no lambda wrapper needed:
# DataFrame.apply passes each row directly).
data_clean['polityscore'] = data_clean.apply(politysco, axis=1)


predictors = data_clean[['internetuserate','urbanrate','employrate','lifeexpectancy']]

targets = data_clean['polityscore']

# 60/40 train/test split (no random_state, so results vary between runs)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

#Build model on training data
classifier=DecisionTreeClassifier()
classifier=classifier.fit(pred_train,tar_train)

predictions=classifier.predict(pred_test)

# Print the evaluation metrics so they appear when this is run as a script
# (the original computed them but discarded the results outside a REPL).
print(sklearn.metrics.confusion_matrix(tar_test,predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))

#Displaying the decision tree
from sklearn import tree
#from StringIO import StringIO
from io import StringIO
#from StringIO import StringIO
from IPython.display import Image
# Export the fitted tree to Graphviz DOT text held in an in-memory buffer.
out = StringIO()
tree.export_graphviz(classifier, out_file=out)

# Render the DOT text to a PNG.  Requires pydotplus and a Graphviz install.
# NOTE(review): Image(...) only displays inside IPython/Jupyter — in a plain
# script the rendered image is discarded; confirm the intended environment.
import pydotplus
graph=pydotplus.graph_from_dot_data(out.getvalue())
Image(graph.create_png())