# Load required libraries
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.1.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(knitr)
library(rmarkdown)
library(nortest)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.1.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the data set
growth <- read.csv("Data-Alberto_Rodriguez_Zacarias (1).csv")
dim(growth) # prints the dimensions of the data frame, 58 rows and 18 columns
## [1] 58 18
head(growth)
## Country growth med.age infant.mortality under5mortality contraceptive
## 1 Afghanistan 3.27 16.20 61.0 82.8 21
## 2 Armenia 0.22 32.30 14.6 16.3 55
## 3 Azerbaijan 1.33 29.34 31.4 35.9 51
## 4 Bangladesh 1.17 24.70 35.0 43.7 61
## 5 Belarus -0.09 39.15 3.6 4.8 73
## 6 Benin 2.80 18.30 68.6 107.2 13
## tb.coverage no.tb tb.death coverage.HIV hiv.aids.death cell.subscriber
## 1 49 189 44.0 8 2 60
## 2 80 53 6.2 35 17 112
## 3 80 86 4.2 24 11 109
## 4 49 221 52.0 27 1 63
## 5 79 64 8.1 45 11 114
## 6 62 66 10.0 67 31 84
## exp.educ lfp.female income.percapita gdp.growth migration fpi
## 1 2.52 15.99 1940 14.43 448007 122.92
## 2 2.77 51.99 7890 7.20 -30535 136.17
## 3 2.07 61.10 14860 2.20 0 139.55
## 4 2.18 30.94 2640 6.52 -2526483 132.98
## 5 4.96 58.12 16800 1.73 75799 140.03
## 6 4.87 68.15 1710 4.81 -42268 140.98
Fit the full model to the data, utilizing all the independent variables to predict ‘growth’.
model<-lm(growth~., data = growth[,-1])
summary(model)
##
## Call:
## lm(formula = growth ~ ., data = growth[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03588 -0.18965 0.01506 0.21000 0.77805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.840e+00 8.355e-01 6.990 1.70e-08 ***
## med.age -1.661e-01 1.655e-02 -10.034 1.33e-12 ***
## infant.mortality -4.175e-02 1.614e-02 -2.587 0.0133 *
## under5mortality 2.293e-02 1.022e-02 2.244 0.0303 *
## contraceptive -1.569e-02 6.015e-03 -2.608 0.0126 *
## tb.coverage -1.468e-03 5.820e-03 -0.252 0.8022
## no.tb 7.398e-04 9.601e-04 0.771 0.4454
## tb.death -1.621e-03 6.377e-03 -0.254 0.8006
## coverage.HIV -8.144e-03 3.352e-03 -2.430 0.0196 *
## hiv.aids.death -5.980e-04 1.383e-03 -0.433 0.6676
## cell.subscriber 3.010e-03 2.235e-03 1.347 0.1855
## exp.educ 9.452e-02 3.826e-02 2.471 0.0177 *
## lfp.female 3.073e-03 4.305e-03 0.714 0.4793
## income.percapita 6.798e-05 2.389e-05 2.845 0.0069 **
## gdp.growth 1.862e-02 1.604e-02 1.161 0.2523
## migration -8.442e-08 1.419e-07 -0.595 0.5552
## fpi 3.384e-04 3.506e-03 0.097 0.9236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4076 on 41 degrees of freedom
## Multiple R-squared: 0.8988, Adjusted R-squared: 0.8593
## F-statistic: 22.76 on 16 and 41 DF, p-value: 2.035e-15
The full multiple linear regression model has an adjusted R-squared of 85.93. This means that 86.93% of the variation in the population growth rate can be explained by the 16 predictor variables.
Using an alpha = 0.05, the variables coverage.HIV, income.percapita, infant.mortality, med.age, under5mortality, exp.educ and contraceptive were found to be significant.
A. All-Possible-Regression
This procedure performs multiple linear regression across all possible combinations of independent variables.
apr_result <- ols_step_all_possible(model)
paged_table(apr_result, options = list(rows.print = 10, cols.print = 4))
Finding the best combination of variables per class.
class_lst <- unique(apr_result$n)
apr_best <- data.frame()
for (class in class_lst) {
best_class <- data.frame(apr_result %>% filter(n==class) %>% arrange(desc(adjr))%>% slice(1) %>% select(n, predictors, rsquare, adjr, cp))
apr_best <- rbind(apr_best, best_class)
}
kable(apr_best)
n | predictors | rsquare | adjr | cp |
---|---|---|---|---|
1 | med.age | 0.7985956 | 0.7949991 | 27.604902 |
2 | med.age income.percapita | 0.8172425 | 0.8105968 | 22.049559 |
3 | med.age contraceptive income.percapita | 0.8506455 | 0.8423481 | 10.515344 |
4 | med.age contraceptive exp.educ income.percapita | 0.8655808 | 0.8554359 | 6.463897 |
5 | med.age contraceptive coverage.HIV exp.educ income.percapita | 0.8719385 | 0.8596249 | 5.887885 |
6 | med.age contraceptive coverage.HIV cell.subscriber exp.educ income.percapita | 0.8757351 | 0.8611157 | 6.349567 |
7 | med.age infant.mortality under5mortality contraceptive coverage.HIV exp.educ income.percapita | 0.8845927 | 0.8684356 | 4.760671 |
8 | med.age infant.mortality under5mortality contraceptive coverage.HIV cell.subscriber exp.educ income.percapita | 0.8890797 | 0.8709702 | 4.942621 |
9 | med.age infant.mortality under5mortality contraceptive coverage.HIV cell.subscriber exp.educ income.percapita gdp.growth | 0.8925307 | 0.8723802 | 5.544355 |
10 | med.age infant.mortality under5mortality contraceptive no.tb coverage.HIV cell.subscriber exp.educ income.percapita gdp.growth | 0.8957450 | 0.8735631 | 6.241964 |
11 | med.age infant.mortality under5mortality contraceptive no.tb coverage.HIV hiv.aids.death cell.subscriber exp.educ income.percapita gdp.growth | 0.8966306 | 0.8719119 | 7.883128 |
12 | med.age infant.mortality under5mortality contraceptive no.tb coverage.HIV cell.subscriber exp.educ lfp.female income.percapita gdp.growth migration | 0.8980219 | 0.8708277 | 9.319427 |
13 | med.age infant.mortality under5mortality contraceptive no.tb coverage.HIV hiv.aids.death cell.subscriber exp.educ lfp.female income.percapita gdp.growth migration | 0.8985822 | 0.8686179 | 11.092383 |
14 | med.age infant.mortality under5mortality contraceptive no.tb tb.death coverage.HIV hiv.aids.death cell.subscriber exp.educ lfp.female income.percapita gdp.growth migration | 0.8986427 | 0.8656426 | 13.067891 |
15 | med.age infant.mortality under5mortality contraceptive tb.coverage no.tb tb.death coverage.HIV hiv.aids.death cell.subscriber exp.educ lfp.female income.percapita gdp.growth migration | 0.8987872 | 0.8626398 | 15.009316 |
16 | med.age infant.mortality under5mortality contraceptive tb.coverage no.tb tb.death coverage.HIV hiv.aids.death cell.subscriber exp.educ lfp.female income.percapita gdp.growth migration fpi | 0.8988102 | 0.8593216 | 17.000000 |
B. Step-wise Selection Procedure
ols_step_both_p(model,penter = 0.05, prem = 0.05)
##
## Stepwise Selection Summary
## --------------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------------------------
## 1 med.age addition 0.799 0.795 27.6050 86.2981 0.4921
## 2 income.percapita addition 0.817 0.811 22.0500 82.6631 0.4730
## 3 contraceptive addition 0.851 0.842 10.5150 72.9565 0.4315
## 4 exp.educ addition 0.866 0.855 6.4640 68.8457 0.4132
## --------------------------------------------------------------------------------------------
Based on the step-wise procedure, the variables med.age, income.percapita, contraceptive, and exp.educ were selected for the reduced model.
Partial Regression Plots - examining the partial importance of a particular variable
avPlots(model)
The partial regression plots of the variables med.age, income.percapita, contraceptive, and exp.educ show no deviation from linearity, and thus may be added as a linear term.
model.v1<-lm(growth~med.age+income.percapita+contraceptive+exp.educ, data = growth)
summary(model.v1)
##
## Call:
## lm(formula = growth ~ med.age + income.percapita + contraceptive +
## exp.educ, data = growth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.45228 -0.17347 0.04288 0.22982 0.86192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.386e+00 2.855e-01 18.862 < 2e-16 ***
## med.age -1.542e-01 1.517e-02 -10.166 4.72e-14 ***
## income.percapita 7.275e-05 1.814e-05 4.010 0.000192 ***
## contraceptive -1.546e-02 3.749e-03 -4.123 0.000132 ***
## exp.educ 8.207e-02 3.382e-02 2.427 0.018671 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4132 on 53 degrees of freedom
## Multiple R-squared: 0.8656, Adjusted R-squared: 0.8554
## F-statistic: 85.32 on 4 and 53 DF, p-value: < 2.2e-16
Model v1:
growth = 5.386 - (0.1542)med.age + (0.0000727)income.percapita - (0.01546)contraceptive + (0.08207)exp.educ
Diagnostics Checking
1.) The error terms follow a normal distribution.
e1<-residuals(model.v1)
qqnorm(e1, ylim=c(-2,2))
ad.test(e1)
##
## Anderson-Darling normality test
##
## data: e1
## A = 0.79653, p-value = 0.03676
At 0.05 level of significance, there is sufficient evidence to reject the null hypothesis that the residuals follow a normal distribution. This is also evident in the q-q plot which shows that the residuals deviate from a straight diagonal line.
# Implement variable transformations to resolve non-normality.
# a. Square root transform
model.v1a<-lm(growth~sqrt(med.age)+sqrt(income.percapita)+sqrt(contraceptive)+sqrt(exp.educ), data = growth)
e1a<-residuals(model.v1a)
ad.test(e1a)
##
## Anderson-Darling normality test
##
## data: e1a
## A = 0.88163, p-value = 0.02252
# b. log transform
model.v1b<-lm(log10(growth)~log10(med.age)+log10(income.percapita)+log10(contraceptive)+log10(exp.educ), data = growth)
## Warning in eval(predvars, data, env): NaNs produced
e1b<-residuals(model.v1b)
ad.test(e1b)
##
## Anderson-Darling normality test
##
## data: e1b
## A = 2.5523, p-value = 1.573e-06
# c. Reciprocal
model.v1c<-lm(growth~(1/med.age)+(1/income.percapita)+(1/contraceptive)+(1/exp.educ), data = growth)
e1c<-residuals(model.v1c)
ad.test(e1c)
##
## Anderson-Darling normality test
##
## data: e1c
## A = 0.88921, p-value = 0.02156
# d. Cube root transform
model.v1d<-lm((growth^(1/3))~(med.age)+(income.percapita)+(contraceptive)+(exp.educ), data = growth)
e1d<-residuals(model.v1d)
ad.test(e1d)
##
## Anderson-Darling normality test
##
## data: e1d
## A = 2.048, p-value = 2.792e-05
After attempting several transformations, the problem of non-normality still persists. Hence, adding another explanatory variable was considered based on the results of the All-Possible-Regression procedure.
kable(head(apr_best), n = 5)
n | predictors | rsquare | adjr | cp |
---|---|---|---|---|
1 | med.age | 0.7985956 | 0.7949991 | 27.604902 |
2 | med.age income.percapita | 0.8172425 | 0.8105968 | 22.049559 |
3 | med.age contraceptive income.percapita | 0.8506455 | 0.8423481 | 10.515344 |
4 | med.age contraceptive exp.educ income.percapita | 0.8655808 | 0.8554359 | 6.463897 |
5 | med.age contraceptive coverage.HIV exp.educ income.percapita | 0.8719385 | 0.8596249 | 5.887885 |
6 | med.age contraceptive coverage.HIV cell.subscriber exp.educ income.percapita | 0.8757351 | 0.8611157 | 6.349567 |
Based on the results of the APR, the variable coverage.HIV will be added to the model as it returned the highest coefficient of determination (R-squared) among the remaining variables.
model.v2 <-lm(growth~med.age+income.percapita+contraceptive+exp.educ+coverage.HIV, data = growth)
summary(model.v2)
##
## Call:
## lm(formula = growth ~ med.age + income.percapita + contraceptive +
## exp.educ + coverage.HIV, data = growth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23723 -0.19469 0.04473 0.19177 0.77721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.567e+00 3.031e-01 18.365 < 2e-16 ***
## med.age -1.564e-01 1.501e-02 -10.420 2.46e-14 ***
## income.percapita 7.149e-05 1.790e-05 3.995 0.000205 ***
## contraceptive -1.456e-02 3.736e-03 -3.898 0.000279 ***
## exp.educ 9.724e-02 3.464e-02 2.807 0.007015 **
## coverage.HIV -4.580e-03 2.851e-03 -1.607 0.114170
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4072 on 52 degrees of freedom
## Multiple R-squared: 0.8719, Adjusted R-squared: 0.8596
## F-statistic: 70.81 on 5 and 52 DF, p-value: < 2.2e-16
Model v2:
growth = 5.567 - (0.01564)med.age + (0.00007149)income.percapita - (0.0145)contraceptive + (0.09724)exp.educ - (0.00458)coverage.HIV
Diagnostics Checking
1.) The error terms follow a normal distribution.
ad.test(residuals(model.v2))
##
## Anderson-Darling normality test
##
## data: residuals(model.v2)
## A = 0.61485, p-value = 0.1047
At 0.05 level of significance, there is sufficient evidence to conclude that the residuals follow a normal distribution.
2.) The error terms have a constant variance.
bptest(model.v2)
##
## studentized Breusch-Pagan test
##
## data: model.v2
## BP = 10.596, df = 5, p-value = 0.06
At 0.05 level of significance, there is sufficient evidence to conclude that the residuals have a constant variance.
3.) Multi-collinearity
vif(model.v2)
## med.age income.percapita contraceptive exp.educ
## 3.012704 2.597501 2.796162 1.159166
## coverage.HIV
## 1.133345
kappa(model.v2)
## [1] 33573.81
Since the condition number is greater than 1000, severe multicollinearity is present in the model.
In an attempt to solve multicollinearity, the observations were centered.
growth$growth2<-growth$growth-mean(growth$growth)
growth$med.age2<-growth$med.age-mean(growth$med.age)
growth$income.percapita2<-growth$income.percapita-mean(growth$income.percapita)
growth$contraceptive2<-growth$contraceptive-mean(growth$contraceptive)
growth$exp.educ2<-growth$exp.educ-mean(growth$exp.educ)
growth$coverage.HIV2<-growth$coverage.HIV-mean(growth$coverage.HIV)
Fit a linear regression model to the centered data.
model.v2.cent<-lm(growth2~med.age2+income.percapita2+contraceptive2+exp.educ2+coverage.HIV2, data = growth)
kappa(model.v2.cent)
## [1] 4295.377
Since multicollinearity is still present, the variables are examined to remove any variable that may be causing it.
apr_result[apr_result$predictors %in% c('med.age',
'income.percapita',
'contraceptive',
'exp.educ',
'coverage.HIV',
'contraceptive income.percapita',
'exp.educ income.percapita',
'coverage.HIV income.percapita',
'income.percapita med.age',
'contraceptive med.age',
'exp.educ med.age',
'coverage.HIV med.age',
'contraceptive exp.educ',
'contraceptive coverage.HIV',
'coverage.HIV exp.educ'),]
## Index N Predictors R-Square Adj. R-Square
## 1 1 1 med.age 0.7985955958 0.79499909
## 4 2 1 contraceptive 0.5752258292 0.56764058
## 13 5 1 income.percapita 0.3373952458 0.32556302
## 8 15 1 coverage.HIV 0.0054540470 -0.01230570
## 11 16 1 exp.educ 0.0006586039 -0.01718678
## 65 32 2 contraceptive exp.educ 0.6019944412 0.58752151
## 67 39 2 contraceptive income.percapita 0.5798314252 0.56455257
## 62 43 2 contraceptive coverage.HIV 0.5756665318 0.56023622
## 105 76 2 coverage.HIV income.percapita 0.3441810794 0.32033312
## 123 79 2 exp.educ income.percapita 0.3379549426 0.31388058
## 103 136 2 coverage.HIV exp.educ 0.0080755556 -0.02799442
## Mallow's Cp
## 1 27.6049
## 4 118.1097
## 13 214.4738
## 8 348.9695
## 11 350.9125
## 65 109.2636
## 67 118.2436
## 62 119.9312
## 105 213.7243
## 123 216.2470
## 103 349.9073
The adjusted R-squared of income.percapita and contraceptive are 0.0325563 and 0.5676406, respectively. However, upon considering both variables, the adjusted R-squared is 0.5645526. This shows that there is no significant increase in explaining the variation of the population growth rate when income.percapita is added to contraceptive.
model.v3<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV, data = growth)
Diagnostics Checking
1.) The error terms follow a normal distribution.
ad.test(residuals(model.v3))
##
## Anderson-Darling normality test
##
## data: residuals(model.v3)
## A = 1.0666, p-value = 0.007779
Since the p-value < 0.05, there is insufficient evidence to conclude that the residuals of the model follow a normal distribution.
Several transformations were applied to the data however, these resulted in multicollinearity. With this, another socio-economic variable was added to compensate the removal of income.percapita which caused non-normality.
The remaining economic variables were added to the model and the variable ‘cell.subscriber’ yielded the highest adjusted R-squared.
# add cell.subscriber
model.v3a<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+cell.subscriber, data = growth)
model.v3a_res <- summary(model.v3a)
model.v3a_res$adj.r.squared
## [1] 0.8365249
# add migration
model.v3b<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+migration, data = growth)
model.v3b_res <- summary(model.v3b)
model.v3b_res$adj.r.squared
## [1] 0.823055
# add fpi
model.v3c<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+fpi, data = growth)
model.v3c_res <- summary(model.v3c)
model.v3c_res$adj.r.squared
## [1] 0.8169979
# add lfp.female
model.v3d<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+lfp.female, data = growth)
model.v3d_res <- summary(model.v3d)
model.v3d_res$adj.r.squared
## [1] 0.8168654
# add gdp.growth
model.v3e<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+gdp.growth, data = growth)
model.v3e_res <- summary(model.v3e)
model.v3e_res$adj.r.squared
## [1] 0.8168211
##MODEL.V4 (5 variables): med.age contraceptive exp.educ coverage.HIV cell.subscriber
model.v4<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+cell.subscriber, data = growth)
summary(model.v4)
##
## Call:
## lm(formula = growth ~ med.age + contraceptive + exp.educ + coverage.HIV +
## cell.subscriber, data = growth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22129 -0.24273 0.01845 0.26833 0.79595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.188645 0.312376 16.610 < 2e-16 ***
## med.age -0.142274 0.015338 -9.276 1.31e-12 ***
## contraceptive -0.010936 0.003836 -2.851 0.00624 **
## exp.educ 0.075901 0.037176 2.042 0.04627 *
## coverage.HIV -0.005954 0.003093 -1.925 0.05971 .
## cell.subscriber 0.005119 0.002031 2.521 0.01481 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4394 on 52 degrees of freedom
## Multiple R-squared: 0.8509, Adjusted R-squared: 0.8365
## F-statistic: 59.34 on 5 and 52 DF, p-value: < 2.2e-16
Model v4:
growth = 5.188645 - (0.1423)med.age - (0.0109)contraceptive + (0.0759)exp.educ - (0.0060)coverage.HIV + (0.0051)cell.subscriber
The adjusted R-squared is 0.8365, which suggests that the independent variables explain 83.65% of the variation in the population growth rate. At alpha = 0.05, the variables med.age, contraceptive, exp.educ, and cell.subscriber were found to be significant. It should be noted that although not significant, coverage.HIV was retained in order to satisfy the normality assumption.
Model Interpretation:
Holding all other variables constant, ● For every year increase in population median age, there is 14.23% decrease in a country’s population growth rate. ● For every percent increase in contraceptive prevalence rate, there is 1.09% decrease in a country’s population growth rate. ● For every percent increase on the government expenditure on education, there is 7.59% increase in a country’s population growth rate. ● For every percent increase in antiretroviral therapy treatment coverage on HIV, there is 0.60% decrease in a country’s population growth rate. ● For every increase in number of cellular subscribers per 100 population, there is 0.51% increase in a country’s population growth rate.
Diagnostic Checking
1.) The error terms follow a normal distribution.
ad.test(residuals(model.v4))
##
## Anderson-Darling normality test
##
## data: residuals(model.v4)
## A = 0.72569, p-value = 0.05527
Since the p-value is greater than 0.05, we fail to reject the null hypothesis. At 0.05 level of significance, there is sufficient evidence to conclude that the error terms follow a normal distribution.
2.) The error terms have a constant variance.
bptest(model.v4)
##
## studentized Breusch-Pagan test
##
## data: model.v4
## BP = 8.1009, df = 5, p-value = 0.1508
Since the p-value is greater than 0.05, we fail to reject the null hypothesis. At 0.05 level of significance, there is sufficient evidence to conclude that the error terms have a constant variance.
3.) Multicollinearity
vif(model.v4)
## med.age contraceptive exp.educ coverage.HIV cell.subscriber
## 2.701313 2.530712 1.146579 1.145563 1.598860
kappa(model.v4)
## [1] 410.5514
The VIF of each independent variable is less than 10 and the conditioned number of the model is less than 1000, which suggests that there is no severe multicollinearity present.
4.) Autocorrelation - The error terms are independent.
dwtest(model.v4,alternative="two.sided")
##
## Durbin-Watson test
##
## data: model.v4
## DW = 2.1393, p-value = 0.5828
## alternative hypothesis: true autocorrelation is not 0
Since the p-value is greater than 0.05, we fail to reject the null hypothesis. There is sufficient evidence to conclude that the true autocorrelation is 0.
5.) Linearity
avPlots(model.v4)
Non-linearity is not detected among the independent variables.
Detection of Outliers and Influential Observations
Note: Influential observations refer to data points whose deletion causes substantial changes in the fitted model. Meanwhile, outliers are observations that are different from the rest. Ouliers and influential observations are examined as they affect the stability of our estimates.
A. Detection of Outliers
#standardized residuals>2
as.matrix(rstandard(model.v4))
## [,1]
## 1 0.399327901
## 2 -0.823362376
## 3 0.714160074
## 4 -0.393647204
## 5 0.995087860
## 6 -0.103243171
## 7 -1.278152752
## 8 0.364152937
## 9 0.860506865
## 10 0.374815616
## 11 0.244544868
## 12 0.762069448
## 13 0.142599469
## 14 0.948565834
## 15 -0.813816814
## 16 0.695640294
## 17 -2.964364607
## 18 0.329917842
## 19 2.096250921
## 20 0.202070021
## 21 -2.288458871
## 22 -0.683132368
## 23 -0.900951978
## 24 -0.729137396
## 25 0.022021749
## 26 -2.574364958
## 27 0.063919197
## 28 -0.176986282
## 29 0.928721901
## 30 0.755530244
## 31 -1.354456895
## 32 -1.572613440
## 33 0.019874715
## 34 0.452500994
## 35 1.551628439
## 36 -0.656450879
## 37 0.357402116
## 38 0.298995673
## 39 -0.167561255
## 40 -1.733085084
## 41 1.799693622
## 42 -0.077584631
## 43 -0.199047043
## 44 0.740362267
## 45 1.016491370
## 46 -0.810587989
## 47 -0.035989580
## 48 -0.635894065
## 49 -0.367229316
## 50 0.336820793
## 51 -0.017697726
## 52 1.923766088
## 53 -0.065913910
## 54 0.304730457
## 55 1.749543358
## 56 0.004711949
## 57 0.348362123
## 58 -0.338292508
max(rstandard(model.v4))
## [1] 2.096251
#studentized deleted residuals
as.matrix(rstudent(model.v4))
## [,1]
## 1 0.396077340
## 2 -0.820774783
## 3 0.710754000
## 4 -0.390425922
## 5 0.994992266
## 6 -0.102256110
## 7 -1.286168040
## 8 0.361095184
## 9 0.858325719
## 10 0.371696564
## 11 0.242321440
## 12 0.758956298
## 13 0.141249285
## 14 0.947635167
## 15 -0.811135732
## 16 0.692147085
## 17 -3.220414681
## 18 0.327072647
## 19 2.169695211
## 20 0.200196224
## 21 -2.389886201
## 22 -0.679588228
## 23 -0.899293469
## 24 -0.725812263
## 25 0.021809076
## 26 -2.729343232
## 27 0.063304092
## 28 -0.175329045
## 29 0.927472695
## 30 0.752371208
## 31 -1.365676811
## 32 -1.595829744
## 33 0.019682759
## 34 0.449013800
## 35 1.573493817
## 36 -0.652818815
## 37 0.354384409
## 38 0.296361621
## 39 -0.165987085
## 40 -1.768165105
## 41 1.840546973
## 42 -0.076839451
## 43 -0.197198977
## 44 0.737104070
## 45 1.016822936
## 46 -0.807876277
## 47 -0.035642291
## 48 -0.632212914
## 49 -0.364153630
## 50 0.333930876
## 51 -0.017526783
## 52 1.976822727
## 53 -0.065279773
## 54 0.302055956
## 55 1.786001492
## 56 0.004666423
## 57 0.345399504
## 58 -0.335393176
ols_plot_resid_stud_fit(model.v4)
Based on the standardized residuals and studentized deleted residuals criteria, observations 17, 19, 21, and 26 are 2 standard deviations away from 0. (Recall that one of the assumptions in linear regression is that the error terms ~ N(mean = 0, constant variance). Because they are far from the mean, they may be considered potential outliers or influential observations.
The leverage measures the distance between X values for the ith observation and the means of the X values for all n observations. A large leverage value indicates that the observation is distant from the center of the X observations. A leverage is considered large if it is greater than 2p/n.
#leverage>2*(5-1)/58=0.137931
as.matrix(hatvalues(model.v4))
## [,1]
## 1 0.11509060
## 2 0.08045182
## 3 0.07872704
## 4 0.09443724
## 5 0.17321522
## 6 0.07531879
## 7 0.10273828
## 8 0.05113452
## 9 0.10611454
## 10 0.17906921
## 11 0.04052670
## 12 0.08122940
## 13 0.06512559
## 14 0.10295182
## 15 0.05126078
## 16 0.05421222
## 17 0.12086821
## 18 0.10360565
## 19 0.25326009
## 20 0.05771879
## 21 0.27157455
## 22 0.12874287
## 23 0.14834871
## 24 0.09120235
## 25 0.06261441
## 26 0.14156912
## 27 0.03963072
## 28 0.08296581
## 29 0.15423806
## 30 0.06668417
## 31 0.15500309
## 32 0.06064005
## 33 0.05630121
## 34 0.16662023
## 35 0.18969782
## 36 0.08763179
## 37 0.10299113
## 38 0.08207029
## 39 0.10579854
## 40 0.05318779
## 41 0.06531888
## 42 0.08088633
## 43 0.11109206
## 44 0.08369641
## 45 0.14040507
## 46 0.07057929
## 47 0.08755923
## 48 0.07474264
## 49 0.09090339
## 50 0.10281559
## 51 0.03938397
## 52 0.13624933
## 53 0.05765577
## 54 0.07047817
## 55 0.09152555
## 56 0.24799805
## 57 0.08289798
## 58 0.13124308
plot(hatvalues(model.v4),type="h")
Based on the leverage criteria, observations 5, 10, 19, 21, 23, 26, 29, 31, 34, 35, 45, and 56 are considered outlying.
B. Detecting Influential Observations
DFFITS is a scaled measure of the change in the predicted value for the ith observation, when the ith observation is deleted. A large value indicates that the observation is very influential. A size-adjusted cutoff to consider is 2*sqrt(p/n)
#dffits >2*sqrt((5-1)/58)=0.5252257
as.matrix(dffits(model.v4))
## [,1]
## 1 0.142840222
## 2 -0.242775412
## 3 0.207772047
## 4 -0.126081394
## 5 0.455424609
## 6 -0.029184007
## 7 -0.435215429
## 8 0.083825564
## 9 0.295732216
## 10 0.173598406
## 11 0.049801911
## 12 0.225667934
## 13 0.037280820
## 14 0.321033457
## 15 -0.188543998
## 16 0.165710538
## 17 -1.194100417
## 18 0.111195287
## 19 1.263564544
## 20 0.049547781
## 21 -1.459248297
## 22 -0.261236751
## 23 -0.375329258
## 24 -0.229928978
## 25 0.005636575
## 26 -1.108382736
## 27 0.012859635
## 28 -0.052736426
## 29 0.396070813
## 30 0.201107948
## 31 -0.584912038
## 32 -0.405462156
## 33 0.004807598
## 34 0.200771503
## 35 0.761329586
## 36 -0.202319823
## 37 0.120081513
## 38 0.088615609
## 39 -0.057094786
## 40 -0.419079934
## 41 0.486557921
## 42 -0.022794875
## 43 -0.069713644
## 44 0.222772987
## 45 0.410950777
## 46 -0.222626734
## 47 -0.011041148
## 48 -0.179686852
## 49 -0.115151516
## 50 0.113043399
## 51 -0.003548845
## 52 0.785128446
## 53 -0.016147127
## 54 0.083173486
## 55 0.566887159
## 56 0.002679777
## 57 0.103844827
## 58 -0.130359682
plot(dffits(model.v4),type="h")
plot(abs(dffits(model.v4)),type="h")
Based on the DFFITS criteria, observations 17, 19, 21, 26, 31, 35, 52, and 55 are considered highly influential since their absolute DFFITS exceed 0.5252.
DFBETAS measures the change in each parameter estimate when the ith observation is deleted. Large values of DFBETAS indicate observations that are influential in estimating a given parameter. A size-adjusted cut-off for DFBETA is 2/sqrt(n).
#dfbetas 2/sqrt(58)=0.2626129
as.matrix(dfbeta(model.v4))
## (Intercept) med.age contraceptive exp.educ coverage.HIV
## 1 3.680127e-02 -8.438980e-04 6.627513e-05 -6.601060e-04 -3.296739e-04
## 2 2.315348e-02 -2.463525e-03 3.131397e-04 2.255785e-03 1.152960e-04
## 3 5.928650e-03 1.059983e-03 -1.164368e-04 -3.152009e-03 -2.643700e-04
## 4 -2.243806e-02 3.179123e-04 -2.863122e-04 2.010782e-03 1.386867e-04
## 5 -9.751750e-02 5.891055e-03 -5.676484e-04 3.918187e-03 -2.312484e-05
## 6 1.547581e-03 -6.333277e-05 7.295007e-05 -2.291586e-04 -3.909748e-05
## 7 -3.469411e-02 2.758169e-03 -8.091748e-04 -9.873487e-03 7.413997e-04
## 8 4.605445e-03 -5.013260e-06 -1.292452e-04 -8.065039e-05 1.191923e-04
## 9 2.121799e-03 8.162868e-04 -1.635577e-04 5.251182e-03 1.431091e-04
## 10 3.370509e-04 -3.852902e-04 3.272931e-05 -4.570113e-03 3.588122e-04
## 11 9.829472e-03 -1.146088e-04 -2.116889e-05 -8.586405e-04 1.061182e-05
## 12 3.755525e-02 1.976112e-04 -2.872558e-04 -3.030807e-03 2.171762e-05
## 13 1.106921e-03 -1.603304e-04 1.096725e-04 -1.212757e-04 5.598072e-06
## 14 -4.282928e-02 9.243155e-05 4.713910e-04 5.100141e-03 2.956387e-04
## 15 -3.688778e-03 4.166769e-06 4.300662e-04 -1.976973e-03 2.182264e-05
## 16 7.276451e-03 -1.026241e-03 4.723466e-04 1.585521e-05 6.567002e-06
## 17 -1.417575e-01 1.138412e-02 -2.703396e-03 1.300572e-02 4.925958e-04
## 18 2.867167e-03 1.648202e-04 1.209475e-05 1.553112e-03 7.782690e-05
## 19 2.436908e-02 -6.182603e-03 -8.553772e-04 -1.200385e-02 5.043002e-04
## 20 2.927771e-03 -9.754264e-05 -1.004973e-04 7.543987e-05 2.778518e-05
## 21 2.531215e-01 -1.714512e-02 2.564863e-03 1.840504e-02 -2.215708e-03
## 22 2.057411e-02 1.703325e-04 2.478648e-04 -8.362322e-03 8.067174e-05
## 23 -5.681699e-02 4.288361e-03 -6.120207e-04 5.119014e-03 5.587881e-05
## 24 -1.816842e-02 -1.044200e-03 3.920947e-04 4.103203e-03 -2.216668e-04
## 25 8.192302e-04 1.025917e-05 -9.110651e-06 -1.027778e-04 -9.853944e-07
## 26 4.590161e-02 -1.863141e-03 -1.972964e-05 1.629834e-02 -2.894948e-03
## 27 -1.939369e-05 6.220857e-05 1.220954e-05 -4.810028e-05 5.026102e-06
## 28 -5.129169e-03 9.307019e-05 -5.250192e-05 6.641039e-05 1.248977e-04
## 29 5.277589e-02 -1.009613e-03 9.757001e-04 -1.348875e-03 -7.437267e-04
## 30 8.446795e-03 -1.244153e-03 2.887855e-04 1.710289e-03 2.867278e-04
## 31 1.913503e-02 7.145017e-04 2.228430e-04 -1.670211e-02 1.031402e-03
## 32 -6.167223e-02 6.622372e-04 -2.914635e-04 1.144243e-02 -3.650914e-04
## 33 4.976784e-04 1.597226e-05 -1.017442e-05 -4.952120e-05 8.975037e-07
## 34 4.844219e-02 -1.051391e-03 3.343691e-04 -4.393286e-04 -4.653130e-04
## 35 1.171905e-02 -1.439082e-03 -8.816972e-04 9.387229e-04 -9.484448e-04
## 36 -1.417799e-02 5.358068e-04 4.072061e-04 9.046887e-04 -6.789522e-05
## 37 6.442511e-03 1.252632e-04 -2.941466e-04 -7.390733e-04 -6.871994e-05
## 38 -7.487555e-03 6.057085e-04 3.579669e-05 -4.837405e-04 -6.565853e-05
## 39 -3.231534e-04 -1.917975e-04 8.594749e-05 -1.391984e-03 2.444742e-05
## 40 -7.608141e-02 1.513939e-03 -8.355382e-04 5.767004e-05 6.091320e-04
## 41 7.915905e-02 -1.396190e-03 2.313165e-05 3.156778e-03 -7.629888e-05
## 42 -3.611240e-03 -4.763222e-05 1.565759e-05 2.174641e-04 4.445649e-05
## 43 -3.430830e-03 5.741815e-04 -2.095213e-04 1.790318e-04 -6.905761e-05
## 44 1.796191e-02 -1.100009e-03 6.149963e-04 -4.372961e-03 1.780597e-04
## 45 2.141692e-02 -2.035907e-03 7.454564e-04 -1.754011e-03 8.026147e-04
## 46 -1.316454e-02 7.460513e-04 -1.133392e-04 -6.214103e-03 2.357979e-04
## 47 6.750764e-04 -1.896402e-05 2.608144e-05 -2.178562e-04 -6.596150e-06
## 48 -1.875464e-02 -9.288292e-04 3.079356e-04 7.254984e-04 9.281210e-05
## 49 1.682351e-02 1.125003e-04 9.677011e-07 -1.582687e-03 -1.662635e-04
## 50 4.117282e-03 4.881171e-04 1.053947e-04 -2.563585e-03 -3.448038e-05
## 51 -5.637688e-04 9.378166e-06 -3.337386e-07 -2.656369e-05 8.077343e-06
## 52 -1.714699e-01 7.140199e-03 -3.298861e-04 -1.703833e-03 1.228774e-03
## 53 -3.229941e-04 -7.731375e-05 3.340181e-05 -2.360136e-04 1.803556e-06
## 54 -1.679355e-02 5.779320e-04 -7.845418e-05 1.970259e-03 -4.126457e-06
## 55 9.793675e-02 -3.244569e-03 6.556309e-04 -1.113382e-02 7.902527e-04
## 56 -6.172269e-04 3.238793e-05 -4.778610e-06 5.181574e-05 -1.243848e-06
## 57 -8.925731e-03 -2.880205e-04 1.258436e-04 8.986875e-04 -4.698892e-06
## 58 1.019506e-04 9.583455e-04 -2.165191e-04 -2.378358e-03 -1.093180e-04
## cell.subscriber
## 1 2.077339e-05
## 2 -1.286659e-05
## 3 6.897155e-05
## 4 1.072592e-04
## 5 -2.349034e-04
## 6 -1.374623e-05
## 7 -4.315082e-06
## 8 -1.829485e-05
## 9 -4.064202e-04
## 10 1.303720e-04
## 11 -1.620782e-05
## 12 -1.432921e-04
## 13 -9.059133e-06
## 14 -8.235271e-05
## 15 -1.638019e-04
## 16 1.277675e-05
## 17 -1.211533e-03
## 18 -1.685402e-04
## 19 2.270432e-03
## 20 4.087067e-05
## 21 5.167468e-04
## 22 -1.221345e-04
## 23 -5.367200e-04
## 24 1.577387e-04
## 25 -2.935627e-07
## 26 5.966271e-04
## 27 -1.718941e-05
## 28 -2.905428e-05
## 29 -2.527924e-04
## 30 -8.281953e-05
## 31 -4.456272e-04
## 32 2.059698e-04
## 33 -1.331801e-06
## 34 -1.185101e-04
## 35 1.320770e-03
## 36 -2.443092e-04
## 37 1.440847e-04
## 38 -5.533542e-06
## 39 4.799536e-05
## 40 3.864460e-04
## 41 -4.938130e-04
## 42 4.228900e-06
## 43 2.801424e-06
## 44 -4.696752e-05
## 45 -3.478861e-04
## 46 9.455916e-05
## 47 -5.015541e-06
## 48 1.659577e-04
## 49 -8.987901e-05
## 50 -5.860595e-05
## 51 -7.042458e-07
## 52 -1.882954e-04
## 53 1.180414e-05
## 54 1.633400e-05
## 55 -3.527929e-04
## 56 -3.796872e-07
## 57 1.090116e-04
## 58 -7.223393e-06
summary(abs(dfbeta(model.v4)))
## (Intercept) med.age contraceptive
## Min. :1.939e-05 Min. :4.167e-06 Min. :3.337e-07
## 1st Qu.:3.054e-03 1st Qu.:1.130e-04 1st Qu.:3.997e-05
## Median :1.244e-02 Median :5.918e-04 Median :2.130e-04
## Mean :2.909e-02 Mean :1.510e-03 Mean :3.483e-04
## 3rd Qu.:3.627e-02 3rd Qu.:1.208e-03 3rd Qu.:4.244e-04
## Max. :2.531e-01 Max. :1.715e-02 Max. :2.703e-03
## exp.educ coverage.HIV cell.subscriber
## Min. :1.585e-05 Min. :8.975e-07 Min. :2.936e-07
## 1st Qu.:2.309e-04 1st Qu.:2.346e-05 1st Qu.:1.436e-05
## Median :1.643e-03 Median :1.011e-04 Median :9.222e-05
## Mean :3.411e-03 Mean :3.081e-04 Mean :2.132e-04
## 3rd Qu.:4.306e-03 3rd Qu.:3.515e-04 3rd Qu.:2.277e-04
## Max. :1.841e-02 Max. :2.895e-03 Max. :2.270e-03
Since none of the observations have DFBETAS exceeding the cut-off, none of them are influential in the computation of the parameter estimates.
Consolidating the results of the above tests, observations 17, 19, 21, and 26 have been indicated as outliers and influential observations repeatedly.
To determine whether these observations are vital or only occurred by chance and can be removed, the values of these data points are compared to the prediction intervals generated from a regression model fitted excluding the observation.
#####Influencial variables removed#####
m<-lm(growth~med.age+contraceptive+exp.educ+coverage.HIV+cell.subscriber,data = growth[-c(17,19,21,26,31,35,52,55),]) # model excluding influential observations
new17<-data.frame(med.age=growth$med.age[17],contraceptive=growth$contraceptive[17],exp.educ=growth$exp.educ[17],coverage.HIV=growth$coverage.HIV[17],cell.subscriber=growth$cell.subscriber[17])
new19<-data.frame(med.age=growth$med.age[19],contraceptive=growth$contraceptive[19],exp.educ=growth$exp.educ[19],coverage.HIV=growth$coverage.HIV[19],cell.subscriber=growth$cell.subscriber[19])
new21<-data.frame(med.age=growth$med.age[21],contraceptive=growth$contraceptive[21],exp.educ=growth$exp.educ[21],coverage.HIV=growth$coverage.HIV[21],cell.subscriber=growth$cell.subscriber[21])
new26<-data.frame(med.age=growth$med.age[26],contraceptive=growth$contraceptive[26],exp.educ=growth$exp.educ[26],coverage.HIV=growth$coverage.HIV[26],cell.subscriber=growth$cell.subscriber[26])
predict(m, new17, interval="predict")
## fit lwr upr
## 1 1.570651 0.9251948 2.216108
growth$growth[17]
## [1] 0.46
predict(m, new19, interval="predict")
## fit lwr upr
## 1 2.366138 1.649983 3.082293
growth$growth[19]
## [1] 3.46
predict(m, new21, interval="predict")
## fit lwr upr
## 1 -0.1583312 -0.8873084 0.5706461
growth$growth[21]
## [1] -1.3
predict(m, new26, interval="predict")
## fit lwr upr
## 1 1.811616 1.145922 2.47731
growth$growth[26]
## [1] 0.53
Since none of the observed values lies within the prediction intervals, their occurrence cannot be considered by chance and cannot be removed from the sample.