--- title: 'Supplemental Materials: Regression Model Fitting Code' author: "Nathan Hahn, Sara Bombaci, George Wittemyer" date: "5/12/21" output: html_notebook: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message=FALSE) library(qualtRics) library(survey) library(dplyr) library(MuMIn) library(ggplot2) library(reshape2) library(tidyr) library(knitr) library(sjPlot) library(ggsci) theme_set( theme_bw() + # set theme with no legend of strip text theme(panel.grid.major = element_blank(), strip.background = element_blank(), panel.border = element_rect(colour = "black"), strip.text = element_text(size = 12), legend.text = element_text(size = 10), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.title=element_text(size=14), axis.text = element_text(size=12)) ) ``` ```{r functions, include=FALSE} # function to make data vertical using reshape2::melt melt.svy <- function(x, string, melt.vars = 'profession_2') { require(reshape2) se <- select(x, matches(string)) t <- select(x, -contains('se.')) melt <- melt(t, id.vars = melt.vars) se.melt <- melt(se, id.vars = melt.vars) melt$se <- se.melt$value return(melt) } # Function to create catagory dataframes for each role. cat_combine1 <- function(df, string){ colName = paste0(categories) t <- select(df, matches(string)) %>% mutate(sum = rowSums(.)) #x <- ifelse(t$sum == 0, 0 ,1) } # Function to summarize catagory dataframes for each role. Columns collapsed into 0/1 data cat_combine2 <- function(df, string){ colName = paste0(categories) t <- select(df, matches(string)) %>% mutate(sum = rowSums(.)) x <- ifelse(t$sum == 0, 0 ,1) } ``` ```{r Q1, results='hide', message=FALSE, warning=FALSE} # prep data t1 <- read_survey('./CTNA_numeric_20210113.csv') %>% #filter(!is.na(Q3.2)) %>% # create grouped profession col mutate(profession_2 = as.factor(case_when( Q3.2 <= 3 ~ 'user', Q3.2 == 6 ~ 'technologist', Q3.2 > 3 | Q3.2 < 6 ~ 'ac_user' ))) %>% # create a user/developer column mutate(profession_3 = if_else(Q3.2 == 6, 'developer', 'practitioner')) %>% # switch to 0/1 data for feature prevention question mutate_at(vars(matches('prevent|Q4.1_|Q4.7_')), function(x) if_else(is.na(x), 0, 1)) %>% # RANK to numeric mutate_at(vars(contains('Q6.2_0')), as.numeric) %>% # remove 'other' text answer columns from analysis select(-contains(c('TEXT', 'Other'))) # re-order factor level for comparative stats t1$profession_2 <- factor(t1$profession_2, levels = c("user", "ac_user", "technologist")) t2 <- filter(t1, !is.na(Q3.2)) sv <- svydesign(id = ~1, strata = ~Q3.2, data = t2, check.strata = TRUE) ``` ## Code for Regression Model Fitting Model code appears in relation to the analysis objectives and the order in which results are presented in the paper and supplementary materials. ## 1. What are the technical barriers to use of technology for conservation and research applications, and are development priorities focused on alleviating them? ### 1a. Ordinal Regression: Frequency of technical issues experienced ```{r issue freq model 1} #### Ordinal regression - Frequency of issues #### # prep/reshape data temp <- dplyr::select(t2, matches('perf|profession_2')) %>% mutate_all(as.factor) %>% filter(profession_2 != 'technologist') melt <- temp %>% mutate(id = seq_len(n())) %>% melt(id.var = c('profession_2','id'), na.rm = T) %>% arrange(profession_2, id, variable) %>% dplyr::select(-id) %>% mutate_all(as.factor) # note: combine often and always categories levels(melt$value) <- c('NotApp', 'Never', 'Rarely', 'Sometimes', 'Often', 'Always') # note: drop NA values - selected when the feature is not applicable to their work melt <- filter(melt, value != 'NotApp') %>% #filter(variable != 'perf_freq_access') %>% # remove data access droplevels() #levels(melt$variable) levels(melt$variable) <- c('durability', 'use', 'real-time', 'cost', 'power', 'data management', 'interoperability') # summarize data as counts #melt %>% group_by(variable, value) %>% tally() colnames(melt) <- c("occupation", "feature", "frequency") levels(melt$occupation) <- c('practitioner', 'ac_rsrch') ## Fit ordinal logistic regression - Which features have frequent issues? library(ordinal) # set reference level melt$feature <- relevel(melt$feature, ref = 'use') melt$occupation <- relevel(melt$occupation, ref = 'ac_rsrch') mod <- clm(frequency ~ feature + occupation, data = melt) ## View model results # table - html table needs to be inserted as image for markdown t <- tab_model(mod, rm.terms = c("Never|Rarely", "Rarely|Sometimes", "Sometimes|Often", "Often|Always"), show.r2 = FALSE, show.loglik = TRUE, show.ci = 0.95, show.p = FALSE, title = 'Frequency of Technical Issues') # #plot the estimates coefficients p1 <- plot_model(mod, vline.color = 'black', colors = c("#377eb8", '#e4211c'), dot.size = 4, line.size = 2, rm.terms = c("Never|Rarely", "Rarely|Sometimes", "Sometimes|Often", "Often|Always")) ggtitle('technical problem frequency') p1 ``` ### 1b. Logstic Regression: Feature issues that prevent use of technologies ```{r} #### Logistic Regression - Prevention #### # prep data vars <- colnames(select(t2, matches("prevent|profession_2"))) temp <- select(t2, matches('prevent|profession_2')) %>% mutate_all(as.factor) %>% filter(profession_2 != 'technologist') # reshape data for model fitting melt <- temp %>% mutate(id = seq_len(n())) %>% melt(id.var = c('profession_2','id'), na.rm = T) %>% arrange(profession_2, id, variable) %>% dplyr::select(-id) %>% mutate_all(as.factor) %>% #filter(variable != 'prevent_access') %>% # remove data access droplevels() #levels(melt$variable) levels(melt$variable) <- c('durability', 'use', 'real-time', 'cost', 'power', 'data-mgmt', 'interoperability') colnames(melt) <- c('profession_2', 'feature', 'prevent') melt$feature <- relevel(melt$feature, ref = 'use') melt$profession_2 <- relevel(melt$profession_2, ref = 'ac_user') # fit logistic regression library(lme4) mod <- glm(prevent ~ feature + profession_2, data = melt, family = binomial) # table - html table t <- tab_model(mod, show.r2 = FALSE, show.loglik = TRUE, show.ci = 0.95, show.p = FALSE, title = 'Feature Issues Preventing Use') # plot model p2 <- plot_model(mod, vline.color = 'black', dot.size = 4, line.size = 2) + ggtitle('feature prevention') p2 ``` ### 1c. Ordinal Regression: Rank of Development Priorities ```{r} #### Ordinal regression - dev rankings #### # prep data var <- colnames(select(t2, matches('Q6.2_0|profession_2')) %>% select(matches('RANK|profession_2'))) temp <- select(t2, var) %>% mutate_all(as.factor) %>% # condense groups mutate_at(var, recode_factor, "1" = "1", "2" = "2", "3" = "3", "4" = "4","5" = "5_10","6" = "5_10","7" = "5_10","8" = "5_10","9" = "5_10","10" = "5_10") # reshape data for model fitting melt <- temp %>% mutate(id = seq_len(n())) %>% melt(id.var = c('profession_2','id'), na.rm = T) %>% arrange(profession_2, id, variable) %>% dplyr::select(-id) %>% mutate_all(as.factor) %>% filter(variable != 'Q6.2_0_9_RANK') %>% filter(profession_2 != 'technologist') %>% droplevels() # remove other - too few responses in this category # rename and relevel features to match names across models #levels(melt$variable) levels(melt$variable) <- c('durability', 'use', 'cost', 'power', 'data-mgmt', 'interoperability', 'real-time') melt$variable <- factor(melt$variable, levels = c('durability', 'use', 'real-time', 'cost', 'power', 'data-mgmt', 'interoperability')) colnames(melt) <- c('profession_2', 'feature', 'rank') #table(melt$feature, melt$rank) # fit ordinal logistic regression library(ordinal) # set reference level melt$feature<- relevel(melt$feature, ref = 'use') melt$profession_2 <- relevel(melt$profession_2, ref = "ac_user") # flip order (1 = highest) melt$rank <- forcats::fct_rev(melt$rank) mod <- clm(rank ~ feature + profession_2, data = melt) # table - html table t <- tab_model(mod, rm.terms = c("5_10|4", "4|3", "3|2", "2|1"), show.r2 = FALSE, show.loglik = TRUE, show.ci = 0.95, show.p = FALSE, title = 'Feature Priorities for Development') p3 <- plot_model(mod, vline.color = 'black', dot.size = 4, line.size = 2, rm.terms = c("5_10|4", "4|3", "3|2", "2|1")) + ggtitle('feature development rank') p3 ``` ## 2. Roles and Collaborations Ordinal Regression: Factors affecting collaboration ratings ```{r} library(MuMIn) #### Ordinal regression - Tech collaboration ratings #### df.dev <- read.csv('./Q4.8/CTNA_dev_20210113.csv') %>% filter(!is.na(profession)) # get collaboration sizes. Size of 0 indicates no collaboration. Will filter out later df.4.8 <- select(df.dev, contains('_')) df.4.8[is.na(df.4.8)] <- 0 #replace all NAs with 0 cat <- list() for(i in 1:15){ cat[[i]] <- cat_combine2(df.4.8, string = paste0('X', i, '_')) # function needed! } df <- as.data.frame(do.call(cbind, cat)) df.dev$collab.size <- rowSums(df, na.rm = T) # create dataframe for plotting and modeling df.mod <- as.data.frame(cbind(df.dev$collab.rating, df.dev$tech.type, df.dev$collab.size, df.barriers)) colnames(df.mod) <- c("collab.rating", "tech.type", "collab.size", "cost", "timeline", "proj_mgmt", "deliverables", "lack_tech_support", "partner_communication", "lack_partners", "none", "other", "profession") df.mod <- df.mod %>% mutate_all(as.factor) %>% mutate_at('collab.size', as.numeric) %>% filter(!is.na(collab.rating)) # filter out responses with no collab # Fit ordinal regression - factors effecting collaboration rating library(ordinal) # fit global model mod.global <- clm(collab.rating ~ tech.type + collab.size + profession + cost + lack_tech_support + timeline + partner_communication + deliverables, data = df.mod, na.action = "na.fail") #summary(mod.global) t <- MuMIn::dredge(mod.global, na.action) mods <- get.models(t, subset = delta < 5) # model selection table using top 5 models + global model mod.global <- clm(collab.rating ~ tech.type + collab.size + cost + lack_tech_support + timeline + partner_communication + deliverables, data = df.mod, na.action = "na.fail") mod.2 <- clm(collab.rating ~ cost + lack_tech_support + partner_communication, data = df.mod, na.action = "na.fail") mod.3 <- clm(collab.rating ~ cost + deliverables + lack_tech_support, data = df.mod, na.action = "na.fail") mod.4 <- clm(collab.rating ~ partner_communication, data = df.mod, na.action = "na.fail") mod.5 <- clm(collab.rating ~ cost + partner_communication, data = df.mod, na.action = "na.fail") mod.6 <- clm(collab.rating ~ partner_communication, data = df.mod, na.action = 'na.fail') t <- as.data.frame(AIC(mod.global, mod.2, mod.3, mod.4, mod.5, mod.6)) t$loglik <- c(logLik(mod.global), logLik(mod.2), logLik(mod.3), logLik(mod.4), logLik(mod.5), logLik(mod.6)) t$mod.formula <- c(mod.global$formula, mod.2$formula, mod.3$formula, mod.4$formula, mod.5$formula, mod.6$formula) # read in formatted table with model statements kable(t) # view top model table - html table t <- tab_model(mods[[1]], rm.terms = c('2|3', '3|4', '4|5'), show.r2 = FALSE, show.loglik = TRUE, show.p = FALSE, title = "Factors Affecting Collaboration Ratings") ```