Initial Setup

library(cluster)    
library(factoextra)
library(dplyr)
library(purrr)
library(cluster)
library(dplyr)
library(ggplot2)
library(randomForest)
library(caret)
library(glmnet)
library(psych)
library(nFactors)
library(fpc)
library(NbClust)
library(corrplot)
library(regclass)
library(leaps)
library(bestglm)
library(MASS)

setwd("C:\\Users\\Sanya Garg\\Desktop\\Covid")

Importing base data file

bcg_data <- read.csv("C:\\Users\\Sanya Garg\\Desktop\\Covid\\data_for_bcg_analysis_v3.csv")

Summary statistics

View(bcg_data)
str(bcg_data)
## 'data.frame':    57 obs. of  45 variables:
##  $ Country                           : Factor w/ 57 levels "Afghanistan",..: 2 3 4 6 7 9 10 11 12 13 ...
##  $ Population_2020                   : num  43.85 25.5 9.01 11.59 212.56 ...
##  $ Percent_Pop_Above_65              : num  0.0581 0.1644 0.1944 0.1878 0.0861 ...
##  $ Percent_Pop_Above_80              : num  0.0127 0.0405 0.0526 0.0571 0.0181 ...
##  $ Pop_Density                       : num  17.73 3.25 107.21 377.21 25.06 ...
##  $ Avg_Temp                          : num  18.21 22.45 5.97 6.77 22.76 ...
##  $ GDP_Per_Capita                    : num  4115 57374 51462 47519 8921 ...
##  $ Perc_Deaths_Diabetes              : num  2e-04 2e-04 4e-04 1e-04 3e-04 2e-04 3e-04 2e-04 2e-04 2e-04 ...
##  $ Perc_Deaths_Hypertension          : num  6e-04 1e-04 5e-04 1e-04 2e-04 1e-04 3e-04 2e-04 6e-04 2e-04 ...
##  $ Perc_Deaths_Cerebrovascular       : num  0e+00 5e-04 5e-04 6e-04 5e-04 4e-04 5e-04 3e-04 0e+00 6e-04 ...
##  $ Perc_Deaths_Pneumonia             : num  2e-04 1e-04 1e-04 4e-04 4e-04 2e-04 2e-04 2e-04 2e-04 3e-04 ...
##  $ Perc_Deaths_Lower_Resp_Disease    : num  2e-04 3e-04 3e-04 4e-04 2e-04 3e-04 2e-04 3e-04 2e-04 6e-04 ...
##  $ Perc_Deaths_TB                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Perc_Deaths_HIV                   : num  0e+00 0e+00 0e+00 0e+00 1e-04 0e+00 0e+00 1e-04 0e+00 0e+00 ...
##  $ Perc_Deaths_Obesity               : num  9e-04 8e-04 1e-03 8e-04 7e-04 8e-04 7e-04 5e-04 9e-04 7e-04 ...
##  $ Perc_Deaths_Influenza             : num  1e-05 1e-05 0e+00 3e-05 0e+00 3e-05 1e-05 1e-05 1e-05 1e-05 ...
##  $ Overall_GHSA_Score                : num  23.6 75.5 58.5 61 59.7 75.3 58.3 44.2 52 70.4 ...
##  $ GHSA1_Emergence_Of_Disease        : num  25.7 68.9 57.4 63.5 59.2 70 56.2 37.2 51.1 72.9 ...
##  $ GHSA2_Detection                   : num  12 97.3 73.2 62.5 82.4 96.4 72.7 41.7 50.7 86 ...
##  $ GHSA3_Response                    : num  19.6 65.9 42.3 47.3 67.1 60.7 60.2 43.5 46.6 58.4 ...
##  $ GHSA4_Sufficient_System           : num  13.1 63.5 46.6 60.5 45 67.7 39.3 34.3 37.4 63.8 ...
##  $ GHSA5_Commitments                 : num  29.1 77 52.8 59.7 41.9 74.7 51.5 60.1 58.9 62.6 ...
##  $ GHSA6_Risk                        : num  51.4 79.4 84.6 78.2 56.2 82.7 70.1 51 74 80.3 ...
##  $ Household_Size_2019               : num  6.1 2.5 2.2 2.3 3 ...
##  $ Stringency_Index                  : num  84.5 43.3 84.8 63.9 67.7 ...
##  $ BCG_Last_15_Year_Coverage         : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 1 2 2 1 1 ...
##  $ BCG_Coverage_Percentage           : num  78.3 21.6 46.4 0 57.3 ...
##  $ BCG_Immunization_Ever             : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ BCG_Immunization_Current          : int  1 0 0 0 1 0 1 1 0 0 ...
##  $ BCG_Years_Of_Immunization         : int  51 30 38 0 44 27 73 60 57 40 ...
##  $ BCG_Last_15_Coverage_Yes          : int  1 0 0 0 1 0 1 1 0 0 ...
##  $ BCG_Last_15_Coverage_No           : int  0 1 1 1 0 1 0 0 1 1 ...
##  $ BCG_Last_40_Coverage_Yes          : int  1 0 0 0 1 0 1 1 0 0 ...
##  $ Perc_BCG_Coverage_GT50            : int  1 0 0 0 1 0 1 1 1 0 ...
##  $ Perc_BCG_Coverage_LT50            : int  0 1 1 1 0 1 0 0 0 1 ...
##  $ BCG_Coverage_Buckets              : Factor w/ 3 levels "High","Low","Not Recorded": 1 2 2 2 1 2 1 1 1 2 ...
##  $ BCG_coverage_last_40yrs           : num  95.9 1.5 26 0 88.6 ...
##  $ RCV1_Coverage                     : num  7.54 54.51 34.23 36.15 41.27 ...
##  $ RCV1_Duration                     : int  3 49 48 35 28 51 29 25 38 33 ...
##  $ MCV1_Coverage                     : num  -100 52.8 33.6 43.2 61.9 ...
##  $ MCV1_Duration                     : int  -100 1972 1974 1975 1967 1963 1964 -100 1969 1987 ...
##  $ Polio_Coverage                    : num  61.3 49.4 64.7 77 66.1 ...
##  $ Polio_Duration                    : int  51 54 58 63 59 65 59 -100 62 59 ...
##  $ Deaths_Mn_18_Days_After_100th_Case: num  4.675 0.549 5.441 10.527 0.946 ...
##  $ Deaths_Mn_30_Days_After_100th_Case: num  8.76 2 26.98 124.85 5.75 ...
summary(bcg_data)
##         Country   Population_2020    Percent_Pop_Above_65 Percent_Pop_Above_80
##  Afghanistan: 1   Min.   :   4.938   Min.   :0.0258       Min.   :0.002207    
##  algeria    : 1   1st Qu.:  11.590   1st Qu.:0.0581       1st Qu.:0.009955    
##  australia  : 1   Median :  37.742   Median :0.1155       Median :0.030233    
##  austria    : 1   Mean   :  84.166   Mean   :0.1232       Mean   :0.030740    
##  Bangladesh : 1   3rd Qu.:  83.784   3rd Qu.:0.1898       3rd Qu.:0.047023    
##  belgium    : 1   Max.   :1334.000   Max.   :0.2838       Max.   :0.084122    
##  (Other)    :51                                                               
##   Pop_Density          Avg_Temp      GDP_Per_Capita    Perc_Deaths_Diabetes
##  Min.   :   3.249   Min.   :-5.169   Min.   :  520.9   Min.   :0.0000000   
##  1st Qu.:  44.749   1st Qu.: 6.133   1st Qu.: 5627.8   1st Qu.:0.0002000   
##  Median :  88.531   Median :12.022   Median :12301.2   Median :0.0002000   
##  Mean   : 150.529   Mean   :13.064   Mean   :23669.4   Mean   :0.0002123   
##  3rd Qu.: 147.752   3rd Qu.:20.950   3rd Qu.:41715.0   3rd Qu.:0.0002000   
##  Max.   :1239.579   Max.   :28.733   Max.   :82796.6   Max.   :0.0008000   
##                                                                            
##  Perc_Deaths_Hypertension Perc_Deaths_Cerebrovascular Perc_Deaths_Pneumonia
##  Min.   :0.0000000        Min.   :0.0000000           Min.   :0.0000000    
##  1st Qu.:0.0001000        1st Qu.:0.0000000           1st Qu.:0.0002000    
##  Median :0.0003000        Median :0.0004000           Median :0.0002000    
##  Mean   :0.0003684        Mean   :0.0004579           Mean   :0.0002263    
##  3rd Qu.:0.0006000        3rd Qu.:0.0006000           3rd Qu.:0.0002000    
##  Max.   :0.0015000        Max.   :0.0022000           Max.   :0.0010000    
##                                                                            
##  Perc_Deaths_Lower_Resp_Disease Perc_Deaths_TB      Perc_Deaths_HIV    
##  Min.   :0.0000000              Min.   :0.000e+00   Min.   :0.000e+00  
##  1st Qu.:0.0002000              1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Median :0.0002000              Median :0.000e+00   Median :0.000e+00  
##  Mean   :0.0002404              Mean   :2.105e-05   Mean   :1.404e-05  
##  3rd Qu.:0.0003000              3rd Qu.:0.000e+00   3rd Qu.:0.000e+00  
##  Max.   :0.0008000              Max.   :6.000e-04   Max.   :4.000e-04  
##                                                                        
##  Perc_Deaths_Obesity Perc_Deaths_Influenza Overall_GHSA_Score
##  Min.   :0.0004000   Min.   :0e+00         Min.   :23.60     
##  1st Qu.:0.0007000   1st Qu.:1e-05         1st Qu.:43.70     
##  Median :0.0009000   Median :1e-05         Median :54.00     
##  Mean   :0.0009228   Mean   :1e-05         Mean   :53.68     
##  3rd Qu.:0.0010000   3rd Qu.:1e-05         3rd Qu.:64.60     
##  Max.   :0.0027000   Max.   :3e-05         Max.   :83.50     
##                                                              
##  GHSA1_Emergence_Of_Disease GHSA2_Detection GHSA3_Response 
##  Min.   :22.10              Min.   :12.00   Min.   :19.50  
##  1st Qu.:38.10              1st Qu.:42.80   1st Qu.:39.90  
##  Median :50.90              Median :61.60   Median :50.10  
##  Mean   :50.33              Mean   :60.91   Mean   :50.23  
##  3rd Qu.:59.20              3rd Qu.:78.40   3rd Qu.:60.70  
##  Max.   :83.10              Max.   :98.20   Max.   :91.90  
##                                                            
##  GHSA4_Sufficient_System GHSA5_Commitments   GHSA6_Risk    Household_Size_2019
##  Min.   :11.8            Min.   :28.70     Min.   :23.30   Min.   :1.900      
##  1st Qu.:33.0            1st Qu.:49.80     1st Qu.:53.70   1st Qu.:2.440      
##  Median :42.2            Median :58.60     Median :61.80   Median :3.100      
##  Mean   :42.3            Mean   :56.93     Mean   :63.15   Mean   :3.102      
##  3rd Qu.:57.1            3rd Qu.:63.00     3rd Qu.:77.30   3rd Qu.:3.493      
##  Max.   :73.8            Max.   :85.30     Max.   :87.10   Max.   :6.600      
##                                                                               
##  Stringency_Index BCG_Last_15_Year_Coverage BCG_Coverage_Percentage
##  Min.   : 24.07   No :20                    Min.   :-100.00        
##  1st Qu.: 58.60   Yes:37                    1st Qu.:  31.48        
##  Median : 71.69                             Median :  53.91        
##  Mean   : 70.74                             Mean   :  43.57        
##  3rd Qu.: 85.24                             3rd Qu.:  70.68        
##  Max.   :100.00                             Max.   :  97.00        
##                                                                    
##  BCG_Immunization_Ever BCG_Immunization_Current BCG_Years_Of_Immunization
##  Min.   :0.0000        Min.   :0.0000           Min.   : 0.00            
##  1st Qu.:1.0000        1st Qu.:0.0000           1st Qu.:35.00            
##  Median :1.0000        Median :1.0000           Median :44.00            
##  Mean   :0.9298        Mean   :0.6667           Mean   :45.46            
##  3rd Qu.:1.0000        3rd Qu.:1.0000           3rd Qu.:59.00            
##  Max.   :1.0000        Max.   :1.0000           Max.   :92.00            
##                                                                          
##  BCG_Last_15_Coverage_Yes BCG_Last_15_Coverage_No BCG_Last_40_Coverage_Yes
##  Min.   :0.0000           Min.   :0.0000          Min.   :0.0000          
##  1st Qu.:0.0000           1st Qu.:0.0000          1st Qu.:0.0000          
##  Median :1.0000           Median :0.0000          Median :1.0000          
##  Mean   :0.6491           Mean   :0.3509          Mean   :0.5263          
##  3rd Qu.:1.0000           3rd Qu.:1.0000          3rd Qu.:1.0000          
##  Max.   :1.0000           Max.   :1.0000          Max.   :1.0000          
##                                                                           
##  Perc_BCG_Coverage_GT50 Perc_BCG_Coverage_LT50   BCG_Coverage_Buckets
##  Min.   :0.0000         Min.   :0.000          High        :31       
##  1st Qu.:0.0000         1st Qu.:0.000          Low         :23       
##  Median :1.0000         Median :0.000          Not Recorded: 3       
##  Mean   :0.5614         Mean   :0.386                                
##  3rd Qu.:1.0000         3rd Qu.:1.000                                
##  Max.   :1.0000         Max.   :1.000                                
##                                                                      
##  BCG_coverage_last_40yrs RCV1_Coverage   RCV1_Duration   MCV1_Coverage    
##  Min.   :-100.00         Min.   : 0.00   Min.   : 0.00   Min.   :-100.00  
##  1st Qu.:  14.47         1st Qu.:23.14   1st Qu.:16.00   1st Qu.:  37.39  
##  Median :  74.32         Median :35.22   Median :26.00   Median :  49.64  
##  Mean   :  50.64         Mean   :30.96   Mean   :26.39   Mean   :  26.80  
##  3rd Qu.:  88.62         3rd Qu.:41.53   3rd Qu.:38.00   3rd Qu.:  57.73  
##  Max.   :  99.00         Max.   :58.43   Max.   :51.00   Max.   :  72.87  
##                                                                           
##  MCV1_Duration  Polio_Coverage    Polio_Duration   
##  Min.   :-100   Min.   :-100.00   Min.   :-100.00  
##  1st Qu.:1966   1st Qu.:  52.43   1st Qu.:  44.00  
##  Median :1972   Median :  66.08   Median :  58.00  
##  Mean   :1646   Mean   :  55.49   Mean   :  47.14  
##  3rd Qu.:1978   3rd Qu.:  73.53   3rd Qu.:  62.00  
##  Max.   :1989   Max.   :  84.69   Max.   :  65.00  
##                                                    
##  Deaths_Mn_18_Days_After_100th_Case Deaths_Mn_30_Days_After_100th_Case
##  Min.   : 0.0000                    Min.   :  0.000                   
##  1st Qu.: 0.6215                    1st Qu.:  1.867                   
##  Median : 1.2552                    Median :  6.473                   
##  Mean   : 3.9988                    Mean   : 22.289                   
##  3rd Qu.: 4.9347                    3rd Qu.: 22.410                   
##  Max.   :22.3079                    Max.   :200.771                   
## 

Data Treatment

Outlier Treatment

num_cols <- unlist(lapply(bcg_data, is.numeric)) 
bcg_data_num <- bcg_data[,num_cols]
pcap <- function(x){
  for (i in which(sapply(x, is.numeric))) {
    quantiles <- quantile( x[,i], c(.01, .97 ), na.rm =TRUE)
    x[,i] = ifelse(x[,i] < quantiles[1] , quantiles[1], x[,i])
    x[,i] = ifelse(x[,i] > quantiles[2] , quantiles[2], x[,i])}
  x}

bcg_data_num_cap <- pcap(bcg_data_num)

Merging with original dataset

bcg_data_fact <- bcg_data[,c( "Country"
                              ,"BCG_Coverage_Buckets"
                            ,"BCG_Last_15_Year_Coverage"
                               )]

bcg_data_2 <- cbind(bcg_data_fact,bcg_data_num_cap)

Exploratory Data Analysis

Getting correlation matrix for numeric variables

names(bcg_data_num_cap) <- c('Population_2020'
                             ,'Percent_Pop_Above_65'
                             ,'Percent_Pop_Above_80'
                             ,'Pop_Density'
                             ,'Avg_Temp'
                             ,'GDP_Per_Capita'
                             ,'Deaths_Diabetes'
                             ,'Deaths_Hypertension'
                             ,'Deaths_Cerebrovascular'
                             ,'Deaths_Pneumonia'
                             ,'Deaths_Resp_Disease'
                             ,'Deaths_TB'
                             ,'Deaths_HIV'
                             ,'Deaths_Obesity'
                             ,'Deaths_Influenza'
                             ,'Overall_GHSA_Score'
                             ,'GHSA1_Emergence_Of_Disease'
                             ,'GHSA2_Detection'
                             ,'GHSA3_Response'
                             ,'GHSA4_Sufficient_System'
                             ,'GHSA5_Commitments'
                             ,'GHSA6_Risk'
                             ,'Household_Size_2019'
                             ,'Stringency_Index'
                             ,'BCG_Coverage_Percentage'
                             ,'BCG_Immunization_Ever'
                             ,'BCG_Immunization_Current'
                             ,'BCG_Years_Of_Immunization'
                             ,'BCG_Last_15_Coverage_Yes'
                             ,'BCG_Last_15_Coverage_No'
                             ,'BCG_Last_40_Coverage_Yes'
                             ,'BCG_Coverage_GT50'
                             ,'BCG_Coverage_LT50'
                             ,'BCG_coverage_last_40yrs'
                             ,'RCV1_Coverage'
                             ,'RCV1_Duration'
                             ,'MCV1_Coverage'
                             ,'MCV1_Duration'
                             ,'Polio_Coverage'
                             ,'Polio_Duration'
                             ,'Deaths_Mn_18_Days_After'
                             ,'Deaths_Mn_30_Days_After'
)

jpeg("file2.jpeg", width = 800,height = 600)
corrplot(cor(bcg_data_num_cap[,-c(30,33)],
             method='pearson'), 
         order ="original"
         #hclust.method = "ward.D2"
         , tl.col='black'
         , tl.cex=.75
         ,type = "upper") 
dev.off()
## png 
##   2

We see evidence of collinearity, running exploratory factor analysis for variable reduction. Running KMO and Bartlett’s test measure of sampling adequacy after removing the target variable (deaths per million) and redundant variables

bcg_data_num_cap_2 <- bcg_data_num_cap[,-c(30,33,41,42)]
bcg_data_cor <- as.data.frame(cor(bcg_data_num_cap_2))

KMO(r=bcg_data_cor)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = bcg_data_cor)
## Overall MSA =  0.54
## MSA for each item = 
##            Population_2020       Percent_Pop_Above_65 
##                       0.23                       0.65 
##       Percent_Pop_Above_80                Pop_Density 
##                       0.69                       0.07 
##                   Avg_Temp             GDP_Per_Capita 
##                       0.64                       0.80 
##            Deaths_Diabetes        Deaths_Hypertension 
##                       0.47                       0.49 
##     Deaths_Cerebrovascular           Deaths_Pneumonia 
##                       0.55                       0.17 
##        Deaths_Resp_Disease                  Deaths_TB 
##                       0.62                       0.46 
##                 Deaths_HIV             Deaths_Obesity 
##                       0.23                       0.35 
##           Deaths_Influenza         Overall_GHSA_Score 
##                       0.50                       0.54 
## GHSA1_Emergence_Of_Disease            GHSA2_Detection 
##                       0.55                       0.44 
##             GHSA3_Response    GHSA4_Sufficient_System 
##                       0.45                       0.59 
##          GHSA5_Commitments                 GHSA6_Risk 
##                       0.34                       0.54 
##        Household_Size_2019           Stringency_Index 
##                       0.55                       0.69 
##    BCG_Coverage_Percentage      BCG_Immunization_Ever 
##                       0.68                       0.42 
##   BCG_Immunization_Current  BCG_Years_Of_Immunization 
##                       0.79                       0.42 
##   BCG_Last_15_Coverage_Yes   BCG_Last_40_Coverage_Yes 
##                       0.80                       0.46 
##          BCG_Coverage_GT50    BCG_coverage_last_40yrs 
##                       0.85                       0.61 
##              RCV1_Coverage              RCV1_Duration 
##                       0.72                       0.76 
##              MCV1_Coverage              MCV1_Duration 
##                       0.45                       0.48 
##             Polio_Coverage             Polio_Duration 
##                       0.42                       0.43
print(cortest.bartlett(bcg_data_cor,nrow(bcg_data_num_cap)))
## $chisq
## [1] 2568.499
## 
## $p.value
## [1] 4.101867e-210
## 
## $df
## [1] 703
#Getting Eigen Values
ev <- eigen(cor(bcg_data_num_cap_2))
ev$values
##  [1] 1.259518e+01 3.731039e+00 3.233762e+00 2.625343e+00 2.286394e+00
##  [6] 1.866159e+00 1.641439e+00 1.291262e+00 1.065168e+00 9.939033e-01
## [11] 8.013203e-01 6.739780e-01 6.324148e-01 5.703372e-01 5.357974e-01
## [16] 4.762821e-01 4.353803e-01 3.943215e-01 3.532218e-01 3.267830e-01
## [21] 2.466581e-01 2.370117e-01 1.931735e-01 1.640162e-01 1.440514e-01
## [26] 1.038516e-01 9.249145e-02 7.932896e-02 7.008928e-02 4.314043e-02
## [31] 3.280871e-02 2.395972e-02 1.743160e-02 9.532307e-03 5.964828e-03
## [36] 4.465426e-03 2.226797e-03 3.161892e-04

Running a scree plot to get optimum number of factors

Factor = c(1:15)
Eigen_Values <- ev$values[1:15]
Scree <- data.frame(Factor, Eigen_Values)
plot(Scree, main = "Scree Plot", xlab = "Number of factors",ylab = "Eigen Values",col= "Blue",ylim=c(0,4))
lines(Scree,col='Red')
abline(h = 1, col="Green")

Using the 7 factor solution

factors_data <- fa(r = bcg_data_num_cap_2
                   , nfactors = 7
                   ,rotate = "varimax")

factors_data
## Factor Analysis using method =  minres
## Call: fa(r = bcg_data_num_cap_2, nfactors = 7, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                              MR1   MR2   MR3   MR5   MR4   MR7   MR6   h2
## Population_2020            -0.08  0.05 -0.33  0.01  0.04  0.07  0.65 0.55
## Percent_Pop_Above_65        0.68 -0.16  0.65  0.10  0.11 -0.12 -0.04 0.94
## Percent_Pop_Above_80        0.62 -0.18  0.61  0.12  0.11 -0.11 -0.03 0.82
## Pop_Density                 0.05 -0.14 -0.07  0.14  0.11  0.04  0.27 0.13
## Avg_Temp                   -0.35  0.02 -0.58 -0.16  0.11  0.27  0.23 0.63
## GDP_Per_Capita              0.73 -0.31  0.14  0.16 -0.02 -0.24 -0.25 0.79
## Deaths_Diabetes             0.02 -0.14  0.26 -0.04  0.03  0.06  0.33 0.20
## Deaths_Hypertension        -0.34  0.02  0.27  0.02 -0.11 -0.28  0.47 0.50
## Deaths_Cerebrovascular      0.11  0.07  0.82 -0.11  0.17  0.30 -0.18 0.85
## Deaths_Pneumonia            0.18  0.11  0.00  0.12 -0.04  0.46  0.22 0.32
## Deaths_Resp_Disease         0.20 -0.21  0.58  0.06 -0.17 -0.04  0.04 0.46
## Deaths_TB                  -0.15  0.06  0.00  0.08  0.03  0.68 -0.01 0.50
## Deaths_HIV                 -0.01  0.10 -0.01 -0.18  0.00  0.64 -0.05 0.46
## Deaths_Obesity             -0.23  0.15  0.74 -0.06  0.10 -0.01  0.09 0.64
## Deaths_Influenza            0.17 -0.35  0.04  0.05 -0.14  0.02 -0.13 0.19
## Overall_GHSA_Score          0.97 -0.18  0.05  0.03  0.13  0.10  0.04 1.00
## GHSA1_Emergence_Of_Disease  0.85 -0.19  0.21  0.14  0.07  0.08 -0.09 0.84
## GHSA2_Detection             0.74 -0.29 -0.08  0.01  0.26  0.12  0.09 0.73
## GHSA3_Response              0.86 -0.08 -0.03  0.01  0.12  0.16  0.20 0.82
## GHSA4_Sufficient_System     0.90 -0.19  0.06 -0.10  0.10  0.08  0.02 0.88
## GHSA5_Commitments           0.69 -0.01  0.06  0.06 -0.11  0.02  0.09 0.50
## GHSA6_Risk                  0.82 -0.14  0.19  0.14  0.13 -0.13 -0.25 0.84
## Household_Size_2019        -0.48  0.24 -0.52 -0.26 -0.02  0.12  0.07 0.65
## Stringency_Index           -0.65  0.10 -0.04 -0.21 -0.07  0.17 -0.20 0.55
## BCG_Coverage_Percentage    -0.09  0.59 -0.32  0.31 -0.12  0.16 -0.34 0.71
## BCG_Immunization_Ever      -0.16  0.53  0.09 -0.04 -0.07 -0.06 -0.01 0.33
## BCG_Immunization_Current   -0.55  0.59 -0.12 -0.21 -0.02  0.25  0.19 0.81
## BCG_Years_Of_Immunization  -0.03  0.93  0.09 -0.06 -0.04 -0.06 -0.09 0.90
## BCG_Last_15_Coverage_Yes   -0.52  0.66 -0.18 -0.19 -0.05  0.25  0.18 0.87
## BCG_Last_40_Coverage_Yes   -0.24  0.68  0.00 -0.21 -0.01  0.37  0.07 0.70
## BCG_Coverage_GT50          -0.18  0.67 -0.20  0.07  0.02  0.27 -0.40 0.76
## BCG_coverage_last_40yrs    -0.16  0.68 -0.33  0.23 -0.12  0.21 -0.21 0.75
## RCV1_Coverage               0.67 -0.16 -0.03 -0.04  0.10 -0.18 -0.47 0.74
## RCV1_Duration               0.80 -0.19  0.19  0.02  0.09 -0.22 -0.34 0.88
## MCV1_Coverage               0.22 -0.04  0.04  0.06  0.95  0.00  0.03 0.96
## MCV1_Duration               0.24 -0.05  0.07  0.06  0.96  0.00  0.07 0.99
## Polio_Coverage              0.13 -0.06  0.04  0.95  0.06 -0.02  0.00 0.92
## Polio_Duration              0.17 -0.05  0.05  0.94  0.05 -0.02  0.10 0.93
##                                 u2 com
## Population_2020             0.4526 1.6
## Percent_Pop_Above_65        0.0624 2.3
## Percent_Pop_Above_80        0.1755 2.4
## Pop_Density                 0.8661 2.8
## Avg_Temp                    0.3681 2.8
## GDP_Per_Capita              0.2125 2.1
## Deaths_Diabetes             0.7976 2.4
## Deaths_Hypertension         0.4961 3.4
## Deaths_Cerebrovascular      0.1537 1.6
## Deaths_Pneumonia            0.6794 2.1
## Deaths_Resp_Disease         0.5438 1.8
## Deaths_TB                   0.5022 1.1
## Deaths_HIV                  0.5396 1.2
## Deaths_Obesity              0.3610 1.4
## Deaths_Influenza            0.8072 2.2
## Overall_GHSA_Score         -0.0018 1.1
## GHSA1_Emergence_Of_Disease  0.1577 1.4
## GHSA2_Detection             0.2719 1.7
## GHSA3_Response              0.1798 1.2
## GHSA4_Sufficient_System     0.1206 1.2
## GHSA5_Commitments           0.5008 1.1
## GHSA6_Risk                  0.1591 1.6
## Household_Size_2019         0.3544 3.1
## Stringency_Index            0.4511 1.6
## BCG_Coverage_Percentage     0.2857 3.2
## BCG_Immunization_Ever       0.6736 1.3
## BCG_Immunization_Current    0.1884 3.0
## BCG_Years_Of_Immunization   0.1030 1.1
## BCG_Last_15_Coverage_Yes    0.1250 2.8
## BCG_Last_40_Coverage_Yes    0.2994 2.1
## BCG_Coverage_GT50           0.2426 2.5
## BCG_coverage_last_40yrs     0.2464 2.4
## RCV1_Coverage               0.2580 2.2
## RCV1_Duration               0.1225 1.8
## MCV1_Coverage               0.0401 1.1
## MCV1_Duration               0.0096 1.2
## Polio_Coverage              0.0780 1.1
## Polio_Duration              0.0704 1.1
## 
##                        MR1  MR2  MR3  MR5  MR4  MR7  MR6
## SS loadings           9.32 4.43 3.67 2.41 2.18 2.05 1.98
## Proportion Var        0.25 0.12 0.10 0.06 0.06 0.05 0.05
## Cumulative Var        0.25 0.36 0.46 0.52 0.58 0.63 0.69
## Proportion Explained  0.36 0.17 0.14 0.09 0.08 0.08 0.08
## Cumulative Proportion 0.36 0.53 0.67 0.76 0.85 0.92 1.00
## 
## Mean item complexity =  1.9
## Test of the hypothesis that 7 factors are sufficient.
## 
## The degrees of freedom for the null model are  703  and the objective function was  60.44 with Chi Square of  2568.5
## The degrees of freedom for the model are 458  and the objective function was  23.19 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  57 with the empirical chi square  128.13  with prob <  1 
## The total number of observations was  57  with Likelihood Chi Square =  877.18  with prob <  1.1e-28 
## 
## Tucker Lewis Index of factoring reliability =  0.594
## RMSEA index =  0.125  and the 90 % confidence intervals are  0.115 0.141
## BIC =  -974.53
## Fit based upon off diagonal values = 0.99

Clustering

After factor analysis, choosing distinct variables from each factor

bcg_data_clus <- bcg_data_2[,c("Percent_Pop_Above_65"
                               ,"Population_2020"
                               ,"Pop_Density"
                               ,"Avg_Temp"
                               ,"GDP_Per_Capita"
                               ,"Stringency_Index"
                               )]

#Standardising
bcg_data_clus <- scale(bcg_data_clus)

set.seed(234)

Examining WSS plot to get optimum number of centroids

wss <- function(k) {
  kmeans(bcg_data_clus, k, nstart = 10 )$tot.withinss
}


# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)

plot(k.values, wss_values,
     type="b", pch = 19, frame = FALSE,
     main = "Optimal number of clusters",
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

Running k means with 9 centroids

kmeans_soln <- kmeans(bcg_data_clus, centers = 9, nstart = 20)
str(kmeans_soln)
## List of 9
##  $ cluster     : int [1:57] 9 3 7 8 2 7 5 9 6 7 ...
##  $ centers     : num [1:9, 1:6] -1.044 -0.891 1.125 0.569 -0.623 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:9] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:6] "Percent_Pop_Above_65" "Population_2020" "Pop_Density" "Avg_Temp" ...
##  $ totss       : num 336
##  $ withinss    : num [1:9] 5.43 2.57 11.37 0 11.94 ...
##  $ tot.withinss: num 69.2
##  $ betweenss   : num 267
##  $ size        : int [1:9] 4 3 6 1 7 10 7 4 15
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
clus_soln_9 <- kmeans_soln$cluster

Getting standard clustering separation statistics

km_stats <- cluster.stats(dist(bcg_data_clus), kmeans_soln$cluster)
x1 <- km_stats$dunn
x2 <- km_stats$avg.silwidth
x3 <- km_stats$average.between
x4 <- km_stats$average.within
x5 <- km_stats$within.cluster.ss
rbind(x1,x2,x3,x4,x5)
##          [,1]
## x1  0.3334449
## x2  0.3159702
## x3  3.5214398
## x4  1.6077259
## x5 69.1865532

Running k means with 6 centroids

kmeans_soln <- kmeans(bcg_data_clus, centers = 6, nstart = 20)
str(kmeans_soln)
## List of 9
##  $ cluster     : int [1:57] 3 4 5 6 2 5 3 3 1 5 ...
##  $ centers     : num [1:6, 1:6] 0.807 -0.978 -0.937 1.046 0.905 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:6] "Percent_Pop_Above_65" "Population_2020" "Pop_Density" "Avg_Temp" ...
##  $ totss       : num 336
##  $ withinss    : num [1:6] 19.88 18.08 28.89 20.75 8.54 ...
##  $ tot.withinss: num 99.9
##  $ betweenss   : num 236
##  $ size        : int [1:6] 12 7 20 7 7 4
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
clus_soln_6 <- kmeans_soln$cluster

Getting standard clustering separation statistics

km_stats <- cluster.stats(dist(bcg_data_clus), kmeans_soln$cluster)
x1 <- km_stats$dunn
x2 <- km_stats$avg.silwidth
x3 <- km_stats$average.between
x4 <- km_stats$average.within
x5 <- km_stats$within.cluster.ss
rbind(x1,x2,x3,x4,x5)
##          [,1]
## x1  0.3161203
## x2  0.3262905
## x3  3.6235573
## x4  1.8415076
## x5 99.8651383

Appending both solutions to main data

bcg_data_2 <- cbind(bcg_data_2,clus_soln_6,clus_soln_9)

write.csv(bcg_data_2,"clustering_results.csv")

Linear Regression

Retaining relevant variables and creating a new target variable, log(deaths/mn 30 days post 100 cases). This is done to account for the exponential nature of teh deaths/mn curve.

bcg_data_reg <- bcg_data_2[,c("Percent_Pop_Above_65"
                              ,"Population_2020"
                              ,"Pop_Density"
                              ,"Avg_Temp"
                              ,"GDP_Per_Capita"
                              ,"Stringency_Index"
                              ,"BCG_coverage_last_40yrs"
                              ,"BCG_Last_40_Coverage_Yes"
                              ,"BCG_Last_15_Coverage_Yes"
                              ,"Polio_Coverage"
                              ,"Polio_Duration"
                              ,"MCV1_Coverage"
                              ,"MCV1_Duration"
                              ,"Deaths_Mn_30_Days_After_100th_Case")]

bcg_data_reg$log_deaths_mn <- log(bcg_data_reg$Deaths_Mn_30_Days_After_100th_Case)

Running stepwise regression model

full.model <- lm(log_deaths_mn ~ Percent_Pop_Above_65
                   +Population_2020
                   +Pop_Density
                   +Avg_Temp
                   +GDP_Per_Capita
                   +Stringency_Index
                   +BCG_Last_15_Coverage_Yes
                   +BCG_coverage_last_40yrs
                   +BCG_Last_40_Coverage_Yes
                   +Polio_Coverage
                   +Polio_Duration
                   +MCV1_Coverage
                   +MCV1_Duration, data = bcg_data_reg)

# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = log_deaths_mn ~ Percent_Pop_Above_65 + Population_2020 + 
##     BCG_Last_15_Coverage_Yes, data = bcg_data_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.07034 -0.77491  0.03627  0.86900  2.70218 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               1.809362   0.710161   2.548  0.01378 * 
## Percent_Pop_Above_65      9.265413   3.438486   2.695  0.00942 **
## Population_2020          -0.005156   0.002365  -2.180  0.03374 * 
## BCG_Last_15_Coverage_Yes -1.242971   0.473923  -2.623  0.01136 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.269 on 53 degrees of freedom
## Multiple R-squared:   0.52,  Adjusted R-squared:  0.4929 
## F-statistic: 19.14 on 3 and 53 DF,  p-value: 1.538e-08
confint(step.model)
##                                2.5 %        97.5 %
## (Intercept)               0.38495888  3.2337658678
## Percent_Pop_Above_65      2.36868299 16.1621432903
## Population_2020          -0.00989966 -0.0004116397
## BCG_Last_15_Coverage_Yes -2.19353914 -0.2924020404