Can You Predict Life Expectancy At Birth For Different Countries?¶

In this project, I aim to understand whether we can accurately predict the life expectancy at birth for different countries based on attributes like social support, life ladder, perceptions of corruption, and more. Understanding the relationships between such factors can help governments and policymakers better understand how the quality of life and health of their citizens are affected by the policies they formulate. The sections that follow apply different machine learning models and techniques to the World Happiness Report dataset and explore the relationships between the variables in the dataset.

Table of contents¶

Click on the links if you want to navigate to a specific section.

  1. Renaming the columns
  2. Exploratory data analysis
  3. Sampling
  4. Detecting outliers
  5. Visualization
  6. Model training
    • Multiple regression
    • Linear regression
    • Analysis of regression models
    • Decision Tree Regressor
    • Gradient Boosting Trees
    • Random Forest
  7. Overall Analysis

Importing Packages¶

Importing relevant packages for analysis.

In [1]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import scipy.stats as stats

#For linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

#Decision tree regressor
from sklearn.tree import DecisionTreeRegressor

#Ensemble techniques
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

Loading the Data Set¶

I will be using the World Happiness Report dataset to conduct my analysis. This dataset can provide valuable insights because it allows us to see which factors impact the quality of life in different countries and how this impact affects the life expectancy at birth.

In [3]:
#Importing the file into the dataframe
filename = os.path.join(os.getcwd(), "data", "WHR2018Chapter2OnlineData.csv")
df = pd.read_csv(filename, header = 0)
In [4]:
# Viewing the first rows of the dataframe
print("Initial rows of the dataframe")
print("-------------------------------------------------------")
df.head(n = 11)
Initial rows of the dataframe
-------------------------------------------------------
Out[4]:
country year Life Ladder Log GDP per capita Social support Healthy life expectancy at birth Freedom to make life choices Generosity Perceptions of corruption Positive affect Negative affect Confidence in national government Democratic Quality Delivery Quality Standard deviation of ladder by country-year Standard deviation/Mean of ladder by country-year GINI index (World Bank estimate) GINI index (World Bank estimate), average 2000-15 gini of household income reported in Gallup, by wp5-year
0 Afghanistan 2008 3.723590 7.168690 0.450662 49.209663 0.718114 0.181819 0.881686 0.517637 0.258195 0.612072 -1.929690 -1.655084 1.774662 0.476600 NaN NaN NaN
1 Afghanistan 2009 4.401778 7.333790 0.552308 49.624432 0.678896 0.203614 0.850035 0.583926 0.237092 0.611545 -2.044093 -1.635025 1.722688 0.391362 NaN NaN 0.441906
2 Afghanistan 2010 4.758381 7.386629 0.539075 50.008961 0.600127 0.137630 0.706766 0.618265 0.275324 0.299357 -1.991810 -1.617176 1.878622 0.394803 NaN NaN 0.327318
3 Afghanistan 2011 3.831719 7.415019 0.521104 50.367298 0.495901 0.175329 0.731109 0.611387 0.267175 0.307386 -1.919018 -1.616221 1.785360 0.465942 NaN NaN 0.336764
4 Afghanistan 2012 3.782938 7.517126 0.520637 50.709263 0.530935 0.247159 0.775620 0.710385 0.267919 0.435440 -1.842996 -1.404078 1.798283 0.475367 NaN NaN 0.344540
5 Afghanistan 2013 3.572100 7.503376 0.483552 51.042980 0.577955 0.074735 0.823204 0.620585 0.273328 0.482847 -1.879709 -1.403036 1.223690 0.342569 NaN NaN 0.304368
6 Afghanistan 2014 3.130896 7.484583 0.525568 51.370525 0.508514 0.118579 0.871242 0.531691 0.374861 0.409048 -1.773257 -1.312503 1.395396 0.445686 NaN NaN 0.413974
7 Afghanistan 2015 3.982855 7.466215 0.528597 51.693527 0.388928 0.094686 0.880638 0.553553 0.339276 0.260557 -1.844364 -1.291594 2.160618 0.542480 NaN NaN 0.596918
8 Afghanistan 2016 4.220169 7.461401 0.559072 52.016529 0.522566 0.057072 0.793246 0.564953 0.348332 0.324990 -1.917693 -1.432548 1.796219 0.425627 NaN NaN 0.418629
9 Afghanistan 2017 2.661718 7.460144 0.490880 52.339527 0.427011 -0.106340 0.954393 0.496349 0.371326 0.261179 NaN NaN 1.454051 0.546283 NaN NaN 0.286599
10 Albania 2007 4.634252 9.077325 0.821372 66.576630 0.528605 -0.016183 0.874700 0.552678 0.246335 0.300681 -0.045108 -0.420024 1.764947 0.380848 NaN 0.30325 NaN
In [5]:
df.shape
Out[5]:
(1562, 19)

I observe that the dataset is essentially a time series for each country listed, so I need to decide whether to structure my approach around a year-by-year, country-specific analysis or analyze the entire dataset at once.

As of now, I think I've chosen a difficult dataset, because if I focus on just a few years, the number of data entries I can analyze will be quite small.

Implementing My Project Plan¶

This notebook implements some elements of the CRISP-DM framework for data modeling. I begin by exploring the data with feature engineering and data visualization. Next, I model the data using different regression models and analyze their accuracy scores. Finally, I tune hyperparameters and analyze which models are the top candidates for this problem.

Please Note:¶

A lot of the text I write here is meant to guide myself through the concepts we've covered so far, so you might find several "Note to myself" style comments; I hope they aren't a distraction.

Renaming the columns¶

In [6]:
columns_renamed = {'Life Ladder': 'life_ladder', 'Log GDP per capita': 'log_GDP_per_capita' ,
       'Social support' : 'social_support', 'Healthy life expectancy at birth': "life_expectancy",
       'Freedom to make life choices': 'freedom_choices', 'Generosity': 'generosity',
       'Perceptions of corruption' : "corruption_perception", 'Positive affect': 'positive_effect', 'Negative affect': 'negative_effect',
       'Confidence in national government' : 'government_confidence', 'Democratic Quality': 'democratic_quality',
       'Delivery Quality': 'delivery_quality', 'Standard deviation of ladder by country-year': 'ladder_standard_deviation',
       'Standard deviation/Mean of ladder by country-year': "mean_of_ladder",
       'GINI index (World Bank estimate)': "gini_WB",
       'GINI index (World Bank estimate), average 2000-15': 'gini_avg',
       'gini of household income reported in Gallup, by wp5-year': 'gini_gallup_5yr'}

df.rename(columns = columns_renamed, inplace = True)
df.head()
Out[6]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr
0 Afghanistan 2008 3.723590 7.168690 0.450662 49.209663 0.718114 0.181819 0.881686 0.517637 0.258195 0.612072 -1.929690 -1.655084 1.774662 0.476600 NaN NaN NaN
1 Afghanistan 2009 4.401778 7.333790 0.552308 49.624432 0.678896 0.203614 0.850035 0.583926 0.237092 0.611545 -2.044093 -1.635025 1.722688 0.391362 NaN NaN 0.441906
2 Afghanistan 2010 4.758381 7.386629 0.539075 50.008961 0.600127 0.137630 0.706766 0.618265 0.275324 0.299357 -1.991810 -1.617176 1.878622 0.394803 NaN NaN 0.327318
3 Afghanistan 2011 3.831719 7.415019 0.521104 50.367298 0.495901 0.175329 0.731109 0.611387 0.267175 0.307386 -1.919018 -1.616221 1.785360 0.465942 NaN NaN 0.336764
4 Afghanistan 2012 3.782938 7.517126 0.520637 50.709263 0.530935 0.247159 0.775620 0.710385 0.267919 0.435440 -1.842996 -1.404078 1.798283 0.475367 NaN NaN 0.344540

Note to myself: Remember to use inplace = True, and remember to pass the mapping through the columns = parameter of rename().

Creating my own label for data analysis and model prediction¶

I realized that the data does not have a categorical label, and I wanted to experiment with predicting a categorical target as well.

In [7]:
# Glancing through the dataset
df.loc[df['life_expectancy'] <= 60]
Out[7]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr
0 Afghanistan 2008 3.723590 7.168690 0.450662 49.209663 0.718114 0.181819 0.881686 0.517637 0.258195 0.612072 -1.929690 -1.655084 1.774662 0.476600 NaN NaN NaN
1 Afghanistan 2009 4.401778 7.333790 0.552308 49.624432 0.678896 0.203614 0.850035 0.583926 0.237092 0.611545 -2.044093 -1.635025 1.722688 0.391362 NaN NaN 0.441906
2 Afghanistan 2010 4.758381 7.386629 0.539075 50.008961 0.600127 0.137630 0.706766 0.618265 0.275324 0.299357 -1.991810 -1.617176 1.878622 0.394803 NaN NaN 0.327318
3 Afghanistan 2011 3.831719 7.415019 0.521104 50.367298 0.495901 0.175329 0.731109 0.611387 0.267175 0.307386 -1.919018 -1.616221 1.785360 0.465942 NaN NaN 0.336764
4 Afghanistan 2012 3.782938 7.517126 0.520637 50.709263 0.530935 0.247159 0.775620 0.710385 0.267919 0.435440 -1.842996 -1.404078 1.798283 0.475367 NaN NaN 0.344540
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1557 Zimbabwe 2013 4.690188 7.565154 0.799274 48.949745 0.575884 -0.076716 0.830937 0.711885 0.182288 0.527755 -1.026085 -1.526321 1.964805 0.418918 NaN 0.432 0.555439
1558 Zimbabwe 2014 4.184451 7.562753 0.765839 50.051235 0.642034 -0.045885 0.820217 0.725214 0.239111 0.566209 -0.985267 -1.484067 2.079248 0.496899 NaN 0.432 0.601080
1559 Zimbabwe 2015 3.703191 7.556052 0.735800 50.925652 0.667193 -0.094585 0.810457 0.715079 0.178861 0.590012 -0.893078 -1.357514 2.198865 0.593776 NaN 0.432 0.655137
1560 Zimbabwe 2016 3.735400 7.538829 0.768425 51.800068 0.732971 -0.065283 0.723612 0.737636 0.208555 0.699344 -0.863044 -1.371214 2.776363 0.743257 NaN 0.432 0.596690
1561 Zimbabwe 2017 3.638300 7.538187 0.754147 52.674484 0.752826 -0.066005 0.751208 0.806428 0.224051 0.682647 NaN NaN 2.656848 0.730244 NaN 0.432 0.581484

473 rows × 19 columns

life_expectancy_label is the label I'm planning to create, based on whether the healthy life expectancy at birth for a country-year is greater than or at most 70.

In [8]:
# Using np.select to create the label from the conditions and values below
conditions = [(df['life_expectancy'] > 70),
              (df['life_expectancy'] <= 70)
             ]
values = ['>70', '<=70']
df['life_expectancy_label'] = np.select(conditions, values)
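For reference, the same label could also be built with pandas' cut(), which bins a numeric column into labeled intervals; a small sketch that is not used below:

# Sketch only (not used below): bin life_expectancy into two labeled intervals
# NaN rows stay NaN here, whereas np.select fills them with its default of 0
alt_label = pd.cut(df['life_expectancy'], bins = [-np.inf, 70, np.inf], labels = ['<=70', '>70'])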
In [9]:
# Checking if my code worked
condition = (df['life_expectancy_label'] == "<=70")
df[condition]
Out[9]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr life_expectancy_label
0 Afghanistan 2008 3.723590 7.168690 0.450662 49.209663 0.718114 0.181819 0.881686 0.517637 0.258195 0.612072 -1.929690 -1.655084 1.774662 0.476600 NaN NaN NaN <=70
1 Afghanistan 2009 4.401778 7.333790 0.552308 49.624432 0.678896 0.203614 0.850035 0.583926 0.237092 0.611545 -2.044093 -1.635025 1.722688 0.391362 NaN NaN 0.441906 <=70
2 Afghanistan 2010 4.758381 7.386629 0.539075 50.008961 0.600127 0.137630 0.706766 0.618265 0.275324 0.299357 -1.991810 -1.617176 1.878622 0.394803 NaN NaN 0.327318 <=70
3 Afghanistan 2011 3.831719 7.415019 0.521104 50.367298 0.495901 0.175329 0.731109 0.611387 0.267175 0.307386 -1.919018 -1.616221 1.785360 0.465942 NaN NaN 0.336764 <=70
4 Afghanistan 2012 3.782938 7.517126 0.520637 50.709263 0.530935 0.247159 0.775620 0.710385 0.267919 0.435440 -1.842996 -1.404078 1.798283 0.475367 NaN NaN 0.344540 <=70
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1557 Zimbabwe 2013 4.690188 7.565154 0.799274 48.949745 0.575884 -0.076716 0.830937 0.711885 0.182288 0.527755 -1.026085 -1.526321 1.964805 0.418918 NaN 0.432 0.555439 <=70
1558 Zimbabwe 2014 4.184451 7.562753 0.765839 50.051235 0.642034 -0.045885 0.820217 0.725214 0.239111 0.566209 -0.985267 -1.484067 2.079248 0.496899 NaN 0.432 0.601080 <=70
1559 Zimbabwe 2015 3.703191 7.556052 0.735800 50.925652 0.667193 -0.094585 0.810457 0.715079 0.178861 0.590012 -0.893078 -1.357514 2.198865 0.593776 NaN 0.432 0.655137 <=70
1560 Zimbabwe 2016 3.735400 7.538829 0.768425 51.800068 0.732971 -0.065283 0.723612 0.737636 0.208555 0.699344 -0.863044 -1.371214 2.776363 0.743257 NaN 0.432 0.596690 <=70
1561 Zimbabwe 2017 3.638300 7.538187 0.754147 52.674484 0.752826 -0.066005 0.751208 0.806428 0.224051 0.682647 NaN NaN 2.656848 0.730244 NaN 0.432 0.581484 <=70

1267 rows × 20 columns

Keeping an initial count of how many entries fall into each life expectancy category (greater than 70 vs. at most 70).

In [10]:
condition1 = (df['life_expectancy_label'] == "<=70")
print("Number of rows with life expectancy less than or equal to 70 = " + str(df[condition1].shape[0]))

condition2 = (df['life_expectancy_label'] == ">70")
print("Number of rows with life expectancy greater than 70 = " + str(df[condition2].shape[0]))
Number of rows with life expectancy less than or equal to 70 = 1267
Number of rows with life expectancy greater than 70 = 286
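A more compact way to get the same counts (a one-line sketch, assuming the label column built above) is value_counts():

# Counts of every label value, including the '0' default np.select assigns when life_expectancy is NaN
df['life_expectancy_label'].value_counts()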

Exploratory Data Analysis¶

Learning more about the spread of the data using describe(). This step allows me to check whether there are any outliers that I should be handling before sending the data into the modeling phase.

In [11]:
df.describe()
Out[11]:
year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr
count 1562.000000 1562.000000 1535.000000 1549.000000 1553.000000 1533.000000 1482.000000 1472.000000 1544.000000 1550.000000 1401.000000 1391.000000 1391.000000 1562.000000 1562.000000 583.000000 1386.000000 1205.000000
mean 2011.820743 5.433676 9.220822 0.810669 62.249887 0.728975 0.000079 0.753622 0.708969 0.263171 0.480207 -0.126617 0.004947 2.003501 0.387271 0.372846 0.386948 0.445204
std 3.419787 1.121017 1.184035 0.119370 7.960671 0.145408 0.164202 0.185538 0.107644 0.084006 0.190724 0.873259 0.981052 0.379684 0.119007 0.086609 0.083694 0.105410
min 2005.000000 2.661718 6.377396 0.290184 37.766476 0.257534 -0.322952 0.035198 0.362498 0.083426 0.068769 -2.448228 -2.144974 0.863034 0.133908 0.241000 0.228833 0.223470
25% 2009.000000 4.606351 8.310665 0.748304 57.299580 0.633754 -0.114313 0.697359 0.621471 0.204116 0.334732 -0.772010 -0.717463 1.737934 0.309722 0.307000 0.321583 0.368531
50% 2012.000000 5.332600 9.398610 0.833047 63.803192 0.748014 -0.022638 0.808115 0.717398 0.251798 0.463137 -0.225939 -0.210142 1.960345 0.369751 0.349000 0.371000 0.425395
75% 2015.000000 6.271025 10.190634 0.904329 68.098228 0.843628 0.094649 0.880089 0.800858 0.311515 0.610723 0.665944 0.717996 2.215920 0.451833 0.433500 0.433104 0.508579
max 2017.000000 8.018934 11.770276 0.987343 76.536362 0.985178 0.677773 0.983276 0.943621 0.704590 0.993604 1.540097 2.184725 3.527820 1.022769 0.648000 0.626000 0.961435

Sampling¶

I'm performing sampling here because it lets me drop or fill NA values with mean values on a subset and compare that subset with my original dataset.

In [12]:
# Performing sampling 

# Taking a percentage 
percentage = 0.4
# Getting the number of rows
rows = df.shape[0]
# Using subset feature of pandas using sample()
df_sample = df.sample(int(percentage*rows))

df_sample.shape
Out[12]:
(624, 20)

Note to myself: You need to wrap the row count in int() when calling DataFrame.sample(), otherwise it raises ValueError: Only integers accepted as n values.
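As a side note, DataFrame.sample() also accepts a frac argument, which avoids the manual int() conversion; a small sketch (with a hypothetical fixed random_state for reproducibility, which the cell above does not set):

# Sketch only: sampling by fraction avoids the manual int() conversion
df_sample_alt = df.sample(frac = 0.4, random_state = 42)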

In [13]:
# Checking how many rows each country contributes to the sample
df_sample['country'].value_counts()
Out[13]:
Tajikistan        10
Kosovo            10
Uganda             8
Sri Lanka          7
Brazil             7
                  ..
Ivory Coast        1
South Sudan        1
Czech Republic     1
Guyana             1
Bhutan             1
Name: country, Length: 161, dtype: int64

Some countries have only one or two entries within the dataset, while others have 8 or more.

Detecting outliers¶

Approach 1 - Drop rows with NA in gini_WB¶

For np.percentile() to work, the column should not contain NA or NaN values, so my first approach is to drop the rows that contain NaN.
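For reference, NumPy also offers np.nanpercentile(), which simply ignores NaN entries; a one-line sketch of that alternative (not used here, since I want to compare the two NA-handling approaches explicitly):

# Sketch only: the 99th percentile computed while ignoring NaN values
np.nanpercentile(df_sample['gini_WB'], 99.0)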

In [14]:
df_dropped = df_sample.dropna(subset = ['gini_WB'])
df_dropped
Out[14]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr life_expectancy_label
413 Egypt 2015 4.762538 9.219856 0.729744 61.254017 0.659261 -0.097960 0.684498 0.609594 0.344332 0.771613 -1.340729 -0.706133 2.184174 0.458616 0.318 0.312200 0.353562 <=70
1467 United Kingdom 2014 6.758148 10.551946 0.910247 71.267242 0.857040 0.344311 0.484118 0.793993 0.251140 0.422672 0.845569 1.770141 1.877457 0.277806 0.341 0.342000 0.484836 >70
819 Luxembourg 2014 6.891127 11.447376 0.875469 72.201309 0.937988 0.090385 0.366287 0.803084 0.170409 0.662738 1.466503 1.816250 1.539558 0.223412 0.312 0.315364 0.345527 >70
126 Belgium 2013 7.103661 10.619573 0.909186 71.521675 0.890711 0.007226 0.573664 0.797417 0.217139 0.554125 1.156564 1.502777 1.501779 0.211409 0.277 0.285182 0.330041 >70
521 Guatemala 2006 5.901429 8.773075 0.830442 60.256836 0.663382 0.167738 0.706096 0.818015 0.287082 0.385179 -0.452318 -0.688423 2.086916 0.353629 0.549 0.526750 NaN <=70
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
394 Ecuador 2008 5.296513 9.136249 0.829395 65.709946 0.640317 -0.099406 0.801257 0.842587 0.283164 0.514704 -0.475608 -0.964573 2.115736 0.399458 0.506 0.505929 NaN <=70
359 Czech Republic 2013 6.697656 10.253431 0.888043 69.155342 0.725946 -0.162773 0.915899 0.719958 0.252653 0.242543 1.034656 0.811469 1.999549 0.298545 0.265 0.264727 0.293450 <=70
1410 Turkey 2011 5.271944 9.886387 0.691902 64.453491 0.445607 -0.250812 0.648596 0.621293 0.379784 0.596202 -0.548424 0.209493 2.521838 0.478351 0.400 0.403077 0.334984 <=70
1466 United Kingdom 2013 6.918055 10.529394 0.936884 71.004280 0.905278 0.336297 0.568043 0.775744 0.252096 0.379232 0.907619 1.667561 2.054752 0.297013 0.332 0.342000 0.480820 >70
637 Ireland 2011 7.006904 10.718406 0.977378 70.777657 0.952034 0.373031 0.589913 0.865225 0.190309 0.529104 1.127180 1.593909 1.888445 0.269512 0.323 0.325545 0.502659 >70

238 rows × 20 columns

In [15]:
gini_WB_outlier = np.percentile(df_dropped['gini_WB'], 99.0)
gini_WB_outlier
Out[15]:
0.55826

I see that dropping the rows has not been very effective, because it brings the subset down to 238 rows from 624. While I will still check the outlier cutoff for this sample, I will not use this approach for winsorization because it does not leave enough rows for data modeling.

Approach 2 - Filling NA values with mean of gini_WB¶

In [16]:
## Find the mean values of a specific column
gini_WB_mean = df_sample['gini_WB'].mean()

df_sample['gini_WB'].fillna(value = gini_WB_mean, inplace = True)
In [17]:
df_sample
Out[17]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect negative_effect government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr life_expectancy_label
377 Djibouti 2009 4.905925 7.859205 0.900565 51.018696 0.649316 0.005091 0.634223 0.662168 0.232133 0.604356 -0.357723 -0.637458 1.384643 0.282239 0.368895 0.430667 NaN <=70
1206 Serbia 2017 5.122031 9.561538 0.883770 65.684875 0.684846 -0.081416 0.851458 0.509721 0.326407 0.511198 NaN NaN 2.394510 0.467492 0.368895 0.305900 0.366130 <=70
2 Afghanistan 2010 4.758381 7.386629 0.539075 50.008961 0.600127 0.137630 0.706766 0.618265 0.275324 0.299357 -1.991810 -1.617176 1.878622 0.394803 0.368895 NaN 0.327318 <=70
132 Belize 2014 5.955647 8.987883 0.756932 58.923908 0.873569 0.003250 0.782105 0.754977 0.281604 0.384267 0.284336 -0.524305 2.455257 0.412257 0.368895 NaN 0.446026 <=70
303 Comoros 2011 3.838486 7.254611 0.721833 53.192173 0.499674 -0.023724 0.731508 0.666620 0.173323 0.441242 -0.463344 -1.228847 1.353432 0.352595 0.368895 0.504500 0.401082 <=70
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1466 United Kingdom 2013 6.918055 10.529394 0.936884 71.004280 0.905278 0.336297 0.568043 0.775744 0.252096 0.379232 0.907619 1.667561 2.054752 0.297013 0.332000 0.342000 0.480820 >70
58 Australia 2011 7.405616 10.640227 0.967029 72.028244 0.944586 0.354615 0.381772 0.815860 0.195324 0.530787 1.194716 1.835444 1.775903 0.239805 0.368895 0.342750 0.431181 >70
637 Ireland 2011 7.006904 10.718406 0.977378 70.777657 0.952034 0.373031 0.589913 0.865225 0.190309 0.529104 1.127180 1.593909 1.888445 0.269512 0.323000 0.325545 0.502659 >70
733 Kosovo 2013 6.125758 9.061113 0.720750 61.483791 0.568463 0.106924 0.935095 0.691511 0.202731 0.361421 NaN NaN 2.366108 0.386255 0.368895 NaN 0.440340 <=70
1250 Somaliland region 2009 4.991400 NaN 0.879567 NaN 0.746304 NaN 0.513372 0.818879 0.112012 0.538495 NaN NaN 1.932600 0.387186 0.368895 NaN 0.556402 0

624 rows × 20 columns

In [18]:
# Finding the 99th percentile (the winsorization itself happens below)
outlier = np.percentile(df_sample['gini_WB'], 99.0)
outlier
Out[18]:
0.55131

We notice that the 99th-percentile cutoff for this approach is slightly lower (by about 0.007) than the cutoff from the drop-NA approach. Since this approach gives us more data points to work with, I will use it for winsorization.

Winsorization of outliers¶

In [19]:
# Storing the winsorized values in a new column so I can compare them against the original gini_WB
df_sample['gini_WB_win'] = stats.mstats.winsorize(df_sample['gini_WB'], limits = [0.01,0.01], inplace = False)
In [20]:
# Checking if winsorization worked by comparing gini_WB column with gini_WB_win column
df_sample.loc[df_sample['gini_WB'] > 0.52]
Out[20]:
country year life_ladder log_GDP_per_capita social_support life_expectancy freedom_choices generosity corruption_perception positive_effect ... government_confidence democratic_quality delivery_quality ladder_standard_deviation mean_of_ladder gini_WB gini_avg gini_gallup_5yr life_expectancy_label gini_WB_win
521 Guatemala 2006 5.901429 8.773075 0.830442 60.256836 0.663382 0.167738 0.706096 0.818015 ... 0.385179 -0.452318 -0.688423 2.086916 0.353629 0.549 0.526750 NaN <=70 0.549
292 Colombia 2009 6.271605 9.268606 0.885927 62.922821 0.757101 -0.061859 0.837143 0.842629 ... 0.440458 -0.990759 -0.198892 2.465255 0.393082 0.559 0.552500 0.474249 <=70 0.552
182 Brazil 2011 7.037817 9.614011 0.916253 64.086929 0.833656 -0.085790 0.662167 0.807467 ... 0.502891 0.190550 0.063088 2.039300 0.289763 0.531 0.550214 0.410126 <=70 0.531
1069 Panama 2007 6.894140 9.542232 0.937078 66.400909 0.640219 0.091073 0.915287 0.819987 ... 0.250142 0.280292 0.015082 2.270610 0.329354 0.530 0.536750 NaN <=70 0.530
146 Bolivia 2007 5.628419 8.511677 0.796136 56.131241 0.779935 0.006721 0.816994 0.771075 ... 0.506286 -0.404038 -0.696602 1.941059 0.344868 0.554 0.531643 NaN <=70 0.552
293 Colombia 2010 6.408113 9.296564 0.892993 63.078953 0.816121 -0.056406 0.814524 0.830626 ... 0.551749 -0.834782 -0.124936 2.564224 0.400153 0.555 0.552500 0.467720 <=70 0.552
1080 Paraguay 2006 4.730082 8.751277 0.895428 61.959164 0.691022 0.080250 0.840989 0.815584 ... 0.305138 -0.515817 -0.942334 1.966313 0.415704 0.536 0.519067 NaN <=70 0.536
177 Brazil 2005 6.636771 9.417304 0.882923 62.258163 0.882186 NaN 0.744994 0.818337 ... 0.340625 0.096596 -0.156601 2.436181 0.367073 0.566 0.550214 NaN <=70 0.552
1081 Paraguay 2007 5.272461 8.790003 0.862656 62.134632 0.698988 0.145930 0.929891 0.866669 ... 0.174151 -0.451516 -0.903102 2.060686 0.390839 0.521 0.519067 NaN <=70 0.521
553 Honduras 2008 5.420331 8.314905 0.828176 62.305752 0.686881 0.230542 0.863222 0.789361 ... 0.288162 -0.412119 -0.698374 2.695869 0.497362 0.557 0.555600 NaN <=70 0.552
178 Brazil 2007 6.320673 9.493340 0.886402 62.884094 0.776645 -0.029040 0.728038 0.858976 ... 0.383460 0.090059 -0.171662 2.245499 0.355263 0.552 0.550214 NaN <=70 0.552
1259 South Africa 2011 4.930511 9.412782 0.857703 49.427708 0.835448 -0.165678 0.819182 0.763071 ... 0.627496 0.307435 0.256577 1.851779 0.375575 0.634 0.622500 0.648069 <=70 0.552
183 Brazil 2012 6.660004 9.623651 0.890314 64.354973 0.848606 NaN 0.622543 0.754625 ... 0.458936 0.261934 -0.035250 2.395508 0.359686 0.527 0.550214 0.517130 <=70 0.527

13 rows × 21 columns

Now that I've worked on a subset and found no glaring problems with the winsorization, I will move on to winsorizing the entire dataset.

Winsorizing the entire dataset¶

In [21]:
# Finding columns with missing values
df.isnull().any()
Out[21]:
country                      False
year                         False
life_ladder                  False
log_GDP_per_capita            True
social_support                True
life_expectancy               True
freedom_choices               True
generosity                    True
corruption_perception         True
positive_effect               True
negative_effect               True
government_confidence         True
democratic_quality            True
delivery_quality              True
ladder_standard_deviation    False
mean_of_ladder               False
gini_WB                       True
gini_avg                      True
gini_gallup_5yr               True
life_expectancy_label        False
dtype: bool
In [22]:
# Finding the mean values to fill in the na values in these columns
column_names = ['log_GDP_per_capita',
       'social_support', 'life_expectancy', 'freedom_choices', 'generosity',
       'corruption_perception', 'positive_effect', 'negative_effect',
       'government_confidence', 'democratic_quality', 'delivery_quality','gini_WB', 'gini_avg',
       'gini_gallup_5yr']

for column in column_names:
    mean = df[column].mean()  
    df[column] = df[column].fillna(mean) ## Filling in the NA values 
    df[column] = stats.mstats.winsorize(df[column], limits = [0.01,0.01]) ## Winsorizing the columns

Redoing the labeling based on the new filled values of life expectancy¶

In [23]:
conditions = [(df['life_expectancy'] > 70),
              (df['life_expectancy'] <= 70)
             ]
values = ['>70', '<=70']
df['life_expectancy_label'] = np.select(conditions, values)
In [24]:
# Countries with life expectancy greater than 70
life70 = df['country'].loc[df['life_expectancy'] > 70].unique()
life70
Out[24]:
array(['Australia', 'Austria', 'Belgium', 'Canada', 'Cyprus',
       'Czech Republic', 'Denmark', 'Finland', 'France', 'Germany',
       'Greece', 'Hong Kong S.A.R. of China', 'Iceland', 'Ireland',
       'Israel', 'Italy', 'Japan', 'Luxembourg', 'Malta', 'Netherlands',
       'New Zealand', 'Norway', 'Portugal', 'Singapore', 'Slovenia',
       'South Korea', 'Spain', 'Sweden', 'Switzerland',
       'Taiwan Province of China', 'United Kingdom'], dtype=object)

Visualizing the relationship between columns¶

Pairplot¶

I've created a pairplot to visualize whether there are relationships between the life expectancy labels and the features I will use for modeling.

In [25]:
df_sub = df[['year','life_expectancy', 'gini_WB', 'social_support', 'corruption_perception', 'freedom_choices', 'generosity','life_expectancy_label']].copy()
sns.pairplot(data = df_sub, hue = 'life_expectancy_label' )
Out[25]:
<seaborn.axisgrid.PairGrid at 0x7fc5fc779cf8>

Regression Plot¶

In [26]:
for column in column_names:
    plt.figure()
    sns.regplot(x = column, y = 'life_expectancy', data = df, ci = 95)

Model Training¶

Multiple regression model¶

In [27]:
df.columns
column_names = ['life_ladder', 'log_GDP_per_capita',
       'social_support', 'freedom_choices', 'generosity',
       'corruption_perception', 'positive_effect', 'negative_effect',
       'government_confidence', 'democratic_quality', 'delivery_quality']
In [28]:
df_normalized = df.copy() # copying the dataframe 

X = df_normalized[column_names]
y = df_normalized['life_expectancy']
In [29]:
print(X)
print(y)
      life_ladder  log_GDP_per_capita  social_support  freedom_choices  \
0        3.723590            7.168690        0.450662         0.718114   
1        4.401778            7.333790        0.552308         0.678896   
2        4.758381            7.386629        0.539075         0.600127   
3        3.831719            7.415019        0.521104         0.495901   
4        3.782938            7.517126        0.520637         0.530935   
...           ...                 ...             ...              ...   
1557     4.690188            7.565154        0.799274         0.575884   
1558     4.184451            7.562753        0.765839         0.642034   
1559     3.703191            7.556052        0.735800         0.667193   
1560     3.735400            7.538829        0.768425         0.732971   
1561     3.638300            7.538187        0.754147         0.752826   

      generosity  corruption_perception  positive_effect  negative_effect  \
0       0.181819               0.881686         0.517637         0.258195   
1       0.203614               0.850035         0.583926         0.237092   
2       0.137630               0.706766         0.618265         0.275324   
3       0.175329               0.731109         0.611387         0.267175   
4       0.247159               0.775620         0.710385         0.267919   
...          ...                    ...              ...              ...   
1557   -0.076716               0.830937         0.711885         0.182288   
1558   -0.045885               0.820217         0.725214         0.239111   
1559   -0.094585               0.810457         0.715079         0.178861   
1560   -0.065283               0.723612         0.737636         0.208555   
1561   -0.066005               0.751208         0.806428         0.224051   

      government_confidence  democratic_quality  delivery_quality  
0                  0.612072           -1.929690         -1.642179  
1                  0.611545           -2.017452         -1.635025  
2                  0.299357           -1.991810         -1.617176  
3                  0.307386           -1.919018         -1.616221  
4                  0.435440           -1.842996         -1.404078  
...                     ...                 ...               ...  
1557               0.527755           -1.026085         -1.526321  
1558               0.566209           -0.985267         -1.484067  
1559               0.590012           -0.893078         -1.357514  
1560               0.699344           -0.863044         -1.371214  
1561               0.682647           -0.126617          0.004947  

[1562 rows x 11 columns]
0       49.209663
1       49.624432
2       50.008961
3       50.367298
4       50.709263
          ...    
1557    48.949745
1558    50.051235
1559    50.925652
1560    51.800068
1561    52.674484
Name: life_expectancy, Length: 1562, dtype: float64
In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 1234)
In [31]:
print(X_train.shape)
print(X_test.shape)
(1093, 11)
(469, 11)
In [32]:
# Model training
model = LinearRegression() #no hyperparameters
model.fit(X_train,y_train)

prediction = model.predict(X_test)
In [33]:
print('Model Summary:\n')

print('Intercept:')
print('alpha = ' , model.intercept_)

# Printing the weights that different parameters add to the model
print('\nWeights:')
i = 0
for w in model.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', column_names[i],']')
    i += 1
Model Summary:

Intercept:
alpha =  21.590961708390594

Weights:
w_ 1 =  1.3836931913078234  [ weight of  life_ladder ]
w_ 2 =  3.7939849897715527  [ weight of  log_GDP_per_capita ]
w_ 3 =  0.5444246602489426  [ weight of  social_support ]
w_ 4 =  1.3402433007513204  [ weight of  freedom_choices ]
w_ 5 =  0.8070354764691233  [ weight of  generosity ]
w_ 6 =  0.04599995213654018  [ weight of  corruption_perception ]
w_ 7 =  -3.3635380659709746  [ weight of  positive_effect ]
w_ 8 =  3.554204486149732  [ weight of  negative_effect ]
w_ 9 =  -3.8656614817340023  [ weight of  government_confidence ]
w_ 10 =  -0.16633849195606265  [ weight of  democratic_quality ]
w_ 11 =  1.376216906298403  [ weight of  delivery_quality ]
In [34]:
# The mean squared error for multiple regression 
mr_rmse = mean_squared_error(y_test, prediction, squared = False)
mr_r2 = r2_score(y_test, prediction)
print('\nModel Performance\n\nRMSE =   %.3f'
      % mr_rmse)
# The coefficient of determination
print(' R^2 =   %.3f'
      % mr_r2)
Model Performance

RMSE =   3.854
 R^2 =   0.762

As seen above, the RMSE of the multiple regression model is about 3.85 years of life expectancy, which is fairly high; an error of that size may make the model unsuitable for business needs. Next, I will explore single-feature linear regression models for the different columns to see whether any of them gives a lower error and better predictive value.
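To put that 3.85-year RMSE in context, one quick sanity check (a sketch, assuming the same train/test split created above) is a baseline that always predicts the training-set mean life expectancy:

from sklearn.dummy import DummyRegressor

# Baseline regressor that always predicts the mean of y_train
baseline = DummyRegressor(strategy = 'mean')
baseline.fit(X_train, y_train)
baseline_rmse = mean_squared_error(y_test, baseline.predict(X_test), squared = False)
print('Baseline RMSE = %.3f' % baseline_rmse)

If the multiple regression RMSE sits well below this baseline, the model is at least capturing real signal.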

OLS Regression (Simple Linear Regression)¶

In [35]:
column_names = ['life_ladder', 'log_GDP_per_capita',
       'social_support', 'freedom_choices', 'generosity',
       'corruption_perception', 'positive_effect', 'negative_effect',
       'government_confidence', 'democratic_quality', 'delivery_quality']

for column in column_names:
    X = df[column].to_frame()
    y = df['life_expectancy']
    
    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=123)
    
    #Training the model
    ols_model = LinearRegression()
    ols_model.fit(X_train,y_train)
    prediction = ols_model.predict(X_test)
    print('\nModel Summary for ',column,'\n\nWeight =  ', ols_model.coef_[0], '[ weight of feature:',column, ' ]')
    print('Alpha = ', ols_model.intercept_, '[ intercept ]')
    
    #visualize
    plt.figure()
    plt.scatter(X, y,  color='black',s=15);
    plt.plot(X_test, prediction, color='blue', linewidth=3);
    plt.xlabel(column);
    plt.ylabel('life expectancy');
Model Summary for  life_ladder 

Weight =   5.00004746337455 [ weight of feature: life_ladder  ]
Alpha =  35.29288521069044 [ intercept ]

Model Summary for  log_GDP_per_capita 

Weight =   5.587345756465542 [ weight of feature: log_GDP_per_capita  ]
Alpha =  10.849347958682337 [ intercept ]

Model Summary for  social_support 

Weight =   40.10020095243395 [ weight of feature: social_support  ]
Alpha =  29.897485465505504 [ intercept ]

Model Summary for  freedom_choices 

Weight =   18.10448672420675 [ weight of feature: freedom_choices  ]
Alpha =  49.347030808324604 [ intercept ]

Model Summary for  generosity 

Weight =   1.5286642444489646 [ weight of feature: generosity  ]
Alpha =  62.63426089744073 [ intercept ]

Model Summary for  corruption_perception 

Weight =   -12.907310124088754 [ weight of feature: corruption_perception  ]
Alpha =  72.33850465680052 [ intercept ]

Model Summary for  positive_effect 

Weight =   19.391134857461708 [ weight of feature: positive_effect  ]
Alpha =  48.793828200930335 [ intercept ]

Model Summary for  negative_effect 

Weight =   -9.324654140824984 [ weight of feature: negative_effect  ]
Alpha =  65.06280311286886 [ intercept ]

Model Summary for  government_confidence 

Weight =   -7.997148561280471 [ weight of feature: government_confidence  ]
Alpha =  66.4651499235437 [ intercept ]

Model Summary for  democratic_quality 

Weight =   5.394792746244312 [ weight of feature: democratic_quality  ]
Alpha =  63.17152194522956 [ intercept ]

Model Summary for  delivery_quality 

Weight =   5.756564544045377 [ weight of feature: delivery_quality  ]
Alpha =  62.42135709329236 [ intercept ]

Normalizing the dataframe¶

Seeing the high weights for certain features makes me realize there is a gap in the data-preparation stage: I should normalize the values, since different columns in the dataset use different scales.

In [36]:
columns = ['life_ladder', 'log_GDP_per_capita',
       'social_support', 'life_expectancy', 'freedom_choices', 'generosity',
       'corruption_perception', 'positive_effect', 'negative_effect',
       'government_confidence', 'democratic_quality', 'delivery_quality',
       'ladder_standard_deviation', 'mean_of_ladder', 'gini_WB', 'gini_avg',
       'gini_gallup_5yr']

for column in columns:
    df_normalized[column] = (df_normalized[column] - df_normalized[column].min()) / (df_normalized[column].max() - df_normalized[column].min())
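The same min-max scaling could also be done with scikit-learn's MinMaxScaler; a short sketch of that equivalent approach (not run here, the manual loop above is what the rest of the notebook uses):

from sklearn.preprocessing import MinMaxScaler

# Scales each listed column to the [0, 1] range, same as the manual loop above
df_normalized[columns] = MinMaxScaler().fit_transform(df_normalized[columns])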
In [37]:
column_names = ['life_ladder', 'log_GDP_per_capita',
       'social_support', 'freedom_choices', 'generosity',
       'corruption_perception', 'positive_effect', 'negative_effect',
       'government_confidence', 'democratic_quality', 'delivery_quality']

for column in column_names:
    X = df_normalized[column].to_frame()
    y = df_normalized['life_expectancy']
    
    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=123)
    
    #Training the model
    ols_model = LinearRegression()
    ols_model.fit(X_train,y_train)
    prediction = ols_model.predict(X_test)
    print('\nModel Summary for ',column,'\n\nWeight =  ', ols_model.coef_[0], '[ weight of feature:',column, ' ]')
    print('Alpha = ', ols_model.intercept_, '[ intercept ]')
    print('\nModel Performance\n\nRMSE =   %.2f'
      % np.sqrt(mean_squared_error(y_test, prediction)))
    print(' R^2 =   %.2f'
      % r2_score(y_test, prediction))
    print("---------------------------------------------------------------------------")
    
    #visualize
    plt.figure()
    plt.scatter(X, y,  color='black',s=15)
    plt.plot(X_test, prediction, color='blue', linewidth=3)
    plt.xlabel(column)
    plt.ylabel('life expectancy')
Model Summary for  life_ladder 

Weight =   0.8303941613266514 [ weight of feature: life_ladder  ]
Alpha =  0.19203365876284234 [ intercept ]

Model Performance

RMSE =   0.17
 R^2 =   0.55
---------------------------------------------------------------------------

Model Summary for  log_GDP_per_capita 

Weight =   0.8242093320851095 [ weight of feature: log_GDP_per_capita  ]
Alpha =  0.17066931070519437 [ intercept ]

Model Performance

RMSE =   0.14
 R^2 =   0.69
---------------------------------------------------------------------------

Model Summary for  social_support 

Weight =   0.6433994738807811 [ weight of feature: social_support  ]
Alpha =  0.1724267515221693 [ intercept ]

Model Performance

RMSE =   0.21
 R^2 =   0.34
---------------------------------------------------------------------------

Model Summary for  freedom_choices 

Weight =   0.33232457108967384 [ weight of feature: freedom_choices  ]
Alpha =  0.4195534198874547 [ intercept ]

Model Performance

RMSE =   0.24
 R^2 =   0.10
---------------------------------------------------------------------------

Model Summary for  generosity 

Weight =   0.03514906418452554 [ weight of feature: generosity  ]
Alpha =  0.6131255577730372 [ intercept ]

Model Performance

RMSE =   0.26
 R^2 =   -0.02
---------------------------------------------------------------------------

Model Summary for  corruption_perception 

Weight =   -0.32252812316210655 [ weight of feature: corruption_perception  ]
Alpha =  0.8631552465251379 [ intercept ]

Model Performance

RMSE =   0.25
 R^2 =   0.08
---------------------------------------------------------------------------

Model Summary for  positive_effect 

Weight =   0.2571616991132741 [ weight of feature: positive_effect  ]
Alpha =  0.47761255061950253 [ intercept ]

Model Performance

RMSE =   0.24
 R^2 =   0.10
---------------------------------------------------------------------------

Model Summary for  negative_effect 

Weight =   -0.11728601294634253 [ weight of feature: negative_effect  ]
Alpha =  0.6693521792020967 [ intercept ]

Model Performance

RMSE =   0.26
 R^2 =   -0.01
---------------------------------------------------------------------------

Model Summary for  government_confidence 

Weight =   -0.19788154228583407 [ weight of feature: government_confidence  ]
Alpha =  0.7145427025199573 [ intercept ]

Model Performance

RMSE =   0.25
 R^2 =   0.01
---------------------------------------------------------------------------

Model Summary for  democratic_quality 

Weight =   0.5845707480257327 [ weight of feature: democratic_quality  ]
Alpha =  0.3063078596074761 [ intercept ]

Model Performance

RMSE =   0.21
 R^2 =   0.32
---------------------------------------------------------------------------

Model Summary for  delivery_quality 

Weight =   0.6564194014514876 [ weight of feature: delivery_quality  ]
Alpha =  0.3273963438330487 [ intercept ]

Model Performance

RMSE =   0.18
 R^2 =   0.48
---------------------------------------------------------------------------

Linear Regression Model Analysis¶

Reiterating that the main goal of this project is to identify which column(s) in the dataset have the highest influence on life expectancy. From that lens, comparing the single-feature OLS models directly with the multiple regression is a bit misleading: the single-feature RMSE values above are computed on the normalized target, so they are not on the same scale as the multiple-regression RMSE (which is in years), and in terms of R² the multiple regression (0.76) still outperforms the best single feature (0.69). For a nuanced analysis, it is also worth noting the interdependencies of different features in predicting life expectancy for different countries.

According to the single-feature OLS regressions, the three features with the highest predictive value for life expectancy are log_GDP_per_capita (R² ≈ 0.69), life_ladder (≈ 0.55), and delivery_quality (≈ 0.48).

While observing the results, it was interesting to learn about a negative R². On reading more about it, I learned that R² can indeed be negative: it occurs when the chosen model fits the data worse than a horizontal line at the mean, i.e. when the model does not follow the trend of the data.
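Concretely, R² = 1 - SS_res / SS_tot, so it turns negative whenever the model's squared error exceeds that of simply predicting the mean of the target; a tiny illustrative sketch:

# Toy example: predictions far worse than the mean of y_true give a negative R^2
# Here SS_tot = 2 and SS_res = 29, so R^2 = 1 - 29/2 = -13.5
r2_score([1, 2, 3], [5, 5, 5])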

Decision Tree Regressor¶

Now that I've used linear regression to train my model, I'm also interested in examining how a decision tree regressor would perform on the same dataset. The next section therefore covers decision tree regressor training and evaluation.

Recreating the training and test sets¶

It is worth noting that I'm using the df_normalized dataset.

In [38]:
y = df_normalized['life_expectancy']
X = df_normalized.drop(columns = ['life_expectancy', 'country', 'life_expectancy_label'])
In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)
Setting a parameter grid¶
In [40]:
max_depth = [4,8]
min_samples_leaf = [25,50]

param_grid = {'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf}
In [41]:
# Running a grid search by first initializing the decision tree regressor model.

dt_regressor = DecisionTreeRegressor()

dt_grid = GridSearchCV(dt_regressor, param_grid, cv = 3)

dt_grid_search = dt_grid.fit(X_train, y_train)
In [42]:
# best_score_ reports the default scorer for a regressor, which is R^2 (not RMSE)
dt_best_score = dt_grid_search.best_score_
print("Best cross-validated R^2 for the DT model is: ", dt_best_score)
Best cross-validated R^2 for the DT model is:  0.7883939845225294
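If I wanted the grid search itself to report RMSE rather than R², scikit-learn accepts a scoring string for that; a short sketch (the 'neg_root_mean_squared_error' scorer is available in recent scikit-learn versions):

# Grid search scored by negative RMSE instead of the regressor's default R^2
dt_grid_rmse = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 3,
                            scoring = 'neg_root_mean_squared_error')
dt_grid_rmse.fit(X_train, y_train)
print("Best cross-validated RMSE: ", -dt_grid_rmse.best_score_)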
In [43]:
# Finding the best parameters
dt_best_params = dt_grid_search.best_params_
dt_best_params
Out[43]:
{'max_depth': 8, 'min_samples_leaf': 25}
In [44]:
# Using these parameters to fit and re-evaluate the model
dt_model = DecisionTreeRegressor(max_depth = 8, min_samples_leaf = 25)
dt_model.fit(X_train, y_train)

prediction = dt_model.predict(X_test)
dt_rmse = mean_squared_error(y_test, prediction, squared = False)
dt_r2 = r2_score(y_test,prediction)
In [45]:
print("RMSE for new DT Regressor model: ", dt_rmse)
print("R2 score: ", dt_r2)
RMSE for new DT Regressor model:  0.11744432432043246
R2 score:  0.7884196422013722

Analysis for Decision Tree Regressor Model¶

As we note above, the RMSE for this decision tree regressor model is quite low, indicating good model performance, and the R² value is correspondingly high. This makes the decision tree regressor a good candidate for predicting the target (life expectancy) in this dataset.

Grid search let us identify the best hyperparameters and use them to further improve model performance.
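As a side note, GridSearchCV (with its default refit=True) already keeps a refitted copy of the best model, so re-creating the regressor by hand as above is optional; a small equivalent sketch:

# The best model, refit on the full training set, is stored on the grid-search object
best_dt = dt_grid_search.best_estimator_
dt_rmse_alt = mean_squared_error(y_test, best_dt.predict(X_test), squared = False)
print("RMSE from best_estimator_: ", dt_rmse_alt)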

Ensemble Modeling¶

I'm interested in learning whether we can achieve better model performance than we did with the Decision Tree Regressor, so I'm trying Gradient Boosting and Random Forest on the normalized dataset.

Gradient Boosting Regressor¶

Creating and fitting the model¶
In [46]:
gbdt_model = GradientBoostingRegressor(max_depth = 3, n_estimators = 100)
gbdt_model.fit(X_train, y_train)
y_gbdt = gbdt_model.predict(X_test)
Calculating RMSE and R2 scores¶
In [47]:
gbdt_rmse = mean_squared_error(y_test, y_gbdt, squared = False)

gbdt_r2 = r2_score(y_test, y_gbdt)
In [48]:
print("RMSE for GBDT model: ", gbdt_rmse)
print("R2: ", gbdt_r2)
RMSE for GBDT model:  0.07888231263680062
R2:  0.904551183229596

Analysis for Gradient Boosting Regressor Model¶

As observed above, the RMSE for the Gradient Boosting regressor model is very low, and the R² is impressive as well.

Another observation was that increasing n_estimators beyond 100 has very little influence on the RMSE and R² scores of the model. For example:
n_estimators = 100, RMSE = 0.07918
n_estimators = 200, RMSE = 0.07244
n_estimators = 300, RMSE = 0.07089

So it is worth noting that with a larger dataset I would choose a lower n_estimators value (around where the RMSE plateaus), as that would be less computationally expensive.
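One way to avoid hand-tuning n_estimators is gradient boosting's built-in early stopping, which keeps adding trees only while a held-out validation split keeps improving; a hedged sketch (validation_fraction and n_iter_no_change are available in recent scikit-learn versions):

# Stops adding trees once 10 consecutive iterations fail to improve on a 10% validation split
gbdt_es = GradientBoostingRegressor(max_depth = 3, n_estimators = 500,
                                    validation_fraction = 0.1, n_iter_no_change = 10)
gbdt_es.fit(X_train, y_train)
print("Trees actually used: ", gbdt_es.n_estimators_)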

Random Forest Model¶

Initializing the model¶

In this code block, I try different values of n_estimators to see which one is best for the model in terms of both performance and computational cost.

In [49]:
n_estimator = [100, 200, 300, 400, 500]

for n in n_estimator:
    rf_model = RandomForestRegressor(max_depth = 32, n_estimators = n) #testing the ideal value for n
    rf_model.fit(X_train, y_train)
    rf_y_pred = rf_model.predict(X_test)
    rf_rmse = mean_squared_error(y_test, rf_y_pred, squared = False)
    rf_r2 = r2_score(y_test, rf_y_pred)
    
    print("RMSE for n_estimators ", n, " = ", rf_rmse)
    print("R2_score for n_estimators ", n, " = ", rf_r2)
    print("--------------------------------------------------------")
RMSE for n_estimators  100  =  0.07476010843395128
R2_score for n_estimators  100  =  0.9142663872155729
--------------------------------------------------------
RMSE for n_estimators  200  =  0.0734342064728498
R2_score for n_estimators  200  =  0.9172804634748956
--------------------------------------------------------
RMSE for n_estimators  300  =  0.0738136486404338
R2_score for n_estimators  300  =  0.9164234140855263
--------------------------------------------------------
RMSE for n_estimators  400  =  0.0748789900908593
R2_score for n_estimators  400  =  0.9139935075258963
--------------------------------------------------------
RMSE for n_estimators  500  =  0.07323691072406122
R2_score for n_estimators  500  =  0.9177243516821939
--------------------------------------------------------
In [50]:
# I chose n_estimators = 200
rf_model = RandomForestRegressor(max_depth = 32, n_estimators = 200)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_y_pred, squared = False)
rf_r2 = r2_score(y_test, rf_y_pred)

print("RMSE for Random Forest Model = ", rf_rmse)
print("R2_score = ", rf_r2)
RMSE for Random Forest Model =  0.07344730733747297
R2_score =  0.9172509460559959

From the above results, I notice diminishing returns when using n_estimators > 200 for this model: the RMSE scores for n_estimators = 200 and 300 are very similar, and as n_estimators increases further the RMSE fluctuates slightly rather than improving steadily. So I would choose an n_estimators value close to 200.
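A related way to gauge a reasonable n_estimators without leaning on the test set is the random forest's out-of-bag estimate; a small sketch (oob_score=True reports an R²-style score computed on the bootstrap samples each tree never saw):

# Out-of-bag R^2 is computed during fitting, no separate validation split needed
rf_oob = RandomForestRegressor(max_depth = 32, n_estimators = 200, oob_score = True)
rf_oob.fit(X_train, y_train)
print("Out-of-bag R^2 = ", rf_oob.oob_score_)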

Overall Analysis¶

Visualizing the model performance based on different modeling techniques used¶
In [55]:
RMSE_values = [mr_rmse, dt_rmse, gbdt_rmse, rf_rmse]
R2_values = [mr_r2, dt_r2, gbdt_r2, rf_r2]
labels = ['MR', 'DT', 'GBDT', 'RF']

rg = np.arange(4)
width = 0.4
plt.bar(rg, RMSE_values, width, label = "RMSE", color = 'blue')
plt.bar(rg+width, R2_values, width, label = "R2", color = 'pink')

# Adding labels to plot
plt.xticks(rg + width/2, labels)
plt.xlabel("Models Used")
plt.ylabel("RMSE and R2")

plt.title("Model performance for World Happiness Report Data")
plt.legend(loc = 'upper left', ncol =2)
plt.show()

From the above plot, we see that Gradient Boosting Decision Trees and Random Forest are the best model candidates for this dataset. It is worth noting that these models are more computationally demanding, so whether to adopt these ensemble methods depends on the specific business need.

Also, the multiple regression has a reasonably high R², but its RMSE bar looks much worse partly because it was trained on the unnormalized target (years of life expectancy), while the tree-based models were trained on the normalized target, so the RMSE bars are not directly comparable. At one stage of model training, I used single-feature linear regression to explore this further; one should be careful about relying on such a model, because it may fail to capture the relationships between the different features that contribute to an accurate prediction of life expectancy.

Another observation to keep in mind is the interplay between n_estimators and the business need. Using a higher number of n_estimators (up to a certain threshold) can yield models with lower error, but it is worth weighing the marginal benefit of that lower error against the marginal cost of computation, given the dataset being trained and the business issue at stake.