Note: Set the chunk option `eval` to `TRUE` to run this code

Load packages

library(here)  # makes pathways easier
library(tidyverse)  # for data manipulation
library(glmmTMB)  # for beta regression with random effects
library(buildmer)  # for backward variable elimination

Data prep

# Read the coded voicing data (path resolved with here()) and keep only the
# tokens that have a voicing value; deleted tokens have no voic_hum
sal <- here("data_voicing_coded.csv") %>%
  read_csv() %>%
  filter(!is.na(voic_hum))

Fit 40 beta regression models and save the p-values of the FRC terms (when FRC is retained in the minimal adequate model)

Note: This block takes hours to run

# Grid of 8 x 5 = 40 models: the outer loop varies the minimum number of tokens
# upon which FRC is based (3 to 10); the inner loop varies how much of the
# middle of the FRC range (centered on the median FRC) is removed, in steps of
# a quarter standard deviation on each side of the median (0 to 4 steps).
# Coefficient rows of every retained term involving FRC are gathered in
# `collector`.
collector <- tibble()
for (n in 3:10) {
  cat("threshold = ", n, "\n")
  for (sd_threshold in 0:4) {
    cat("\tSD = ", sd_threshold, "\n")

    # keep tokens whose FRC is based on at least n tokens
    model_data <- sal %>%
      filter(n_ratio_cond >= n)

    # centre and half-width of the excluded band, computed on the
    # threshold-filtered data
    frc_median <- median(model_data$ratio_cond)
    frc_offset <- (sd(model_data$ratio_cond) / 4) * sd_threshold

    model_data <- model_data %>%
      # drop tokens with FRC near the median FRC
      filter(
        (ratio_cond <= frc_median - frc_offset) |
          (ratio_cond >= frc_median + frc_offset)
      ) %>%
      # move voicing values slightly off of 0 and 1
      mutate(
        voic_hum = if_else(
          voic_hum == 0,
          voic_hum + 0.001,
          if_else(voic_hum == 1, voic_hum - 0.001, voic_hum)
        )
      ) %>%
      # z-score speech rate per speaker (one file per speaker)
      group_by(file) %>%
      mutate(post_spk_rate_dip = scale(post_spk_rate_dip)) %>%
      ungroup()

    # stepwise elimination starting from all main effects and two-way
    # interactions, with by-file and by-word random intercepts and FRC slopes
    model1 <- buildmer::buildglmmTMB(
      voic_hum ~
        (ratio_cond +
           fol_phon_bin5 +
           stress +
           log(wd_freq+1) +
           pre_phon_bin2 +
           post_spk_rate_dip +
           wd_class_bin)^2 +
        (1+ratio_cond|file) + (1+ratio_cond|wd_upper),
      family = beta_family(link = "logit"),
      data = model_data
    )

    # conditional-model coefficient table -> tibble with the term names as the
    # first column, restricted to terms that involve FRC
    cond_table <- summary(model1)$coefficients$cond
    vars <- rownames(cond_table)
    coefficients <- as_tibble(cond_table) %>%
      mutate(vars) %>%
      select(vars, everything()) %>%
      rename(p = `Pr(>|z|)`) %>%
      filter(str_detect(vars, "ratio_cond")) %>%
      mutate(n_frc = n, sd_fourth = sd_threshold)

    # only append when FRC survived the elimination
    if (nrow(coefficients) > 0) {
      collector <- bind_rows(collector, coefficients)
    }
  }  # next standard deviation threshold

}  # next threshold of number of tokens upon which FRC is based

# Write the collected FRC coefficients to the hard drive as a CSV file.
# `file` is the current name of this argument; readr deprecated `path`
# in version 1.4.0.
write_csv(collector, file = here("coefficients.csv"))

# Read the data frame back in (only needed when `collector` is no longer in
# working memory, e.g. after restarting R without rerunning the models)
collector <- read_csv(here("coefficients.csv"))

Figure 3

p-values of FRC across 40 models

# Line plot of the p-value of each FRC term against the minimum token
# threshold, one panel per width of the excluded mid-FRC band; saved as PNG.
collector %>% 
  # placeholder row with sd_fourth = 0; presumably added so that the
  # "Complete FRC-range included" panel exists even if no FRC term was
  # retained in those models -- TODO confirm
  bind_rows(tibble(vars = "ratio_cond", Estimate = NA, `Std. Error` = NA, `z value` = NA, p = NA, n_frc = NA, sd_fourth = 0, N = NA)) %>% 
  # explicit levels tie each label to its band width; without them factor()
  # pairs the labels with whichever values happen to be present and fails
  # when one of the five widths is missing from `collector`
  mutate(sd_fourth = factor(sd_fourth, levels = 0:4, labels = c("Complete FRC-range included", "Mid 0.5 SD FRC-range excluded", "Mid 1.0 SD FRC-range excluded", "Mid 1.5 SD FRC-range excluded", "Mid 2.0 SD FRC-range excluded"))) %>%
  mutate(vars = str_replace_all(vars, "log\\(wd_freq \\+ 1\\)", "wd_freq")) %>% 
  # terms not listed in `levels` become NA in Predictors
  mutate(Predictors = factor(vars, levels = c("ratio_cond", "fol_phon_bin5voicedC:ratio_cond", "post_spk_rate_dip:ratio_cond"), labels = c("FRC", "FRC:FollPhon", "FRC:SpchRate"))) %>% 
  # ggplot(aes(n_frc, p, color = Predictors)) +
  ggplot(aes(n_frc, p)) +
  facet_wrap(~sd_fourth) +
  # geom_line() +
  geom_line(aes(linetype = Predictors)) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman")) +
  labs(x = "Minimum threshold of tokens for FRC calculation", y = "p-value")
ggsave(here("plot_p_value_threshold.png"), width = 7, height = 4)

Create temporary data frame with subset of data

FRC score must be based on at least 5 tokens in the reference corpus

Tokens whose FRC lies within the middle half standard deviation of the FRC range (centered on the median FRC) are excluded

# Subset used for the final model and the figures: FRC based on at least 5
# tokens, with tokens inside the band of a quarter SD on each side of the
# median FRC removed.
temp <- sal %>% 
  filter(n_ratio_cond >= 5) %>%  
  # `.` is the threshold-filtered data, so median and SD are computed on it
  filter((ratio_cond <= median(.$ratio_cond) - (sd(.$ratio_cond) / 4) * 1) | (ratio_cond >= median(.$ratio_cond) + (sd(.$ratio_cond) / 4) * 1)) %>% 
  # move voicing values slightly off of 0 and 1
  mutate(voic_hum = if_else(voic_hum == 0, voic_hum + 0.001, if_else(voic_hum == 1, voic_hum - 0.001, voic_hum))) %>%
  # z-score speech rate within each file; scale() returns a one-column matrix,
  # so as.numeric() keeps the column a plain numeric vector
  group_by(file) %>%
  mutate(post_spk_rate_dip = as.numeric(scale(post_spk_rate_dip))) %>%
  ungroup() %>% 
  # readable labels for the following-context factor ("Voiced C" is the first level)
  mutate(fol_phon_bin5 = factor(fol_phon_bin5, levels = c("voicedC", "other"), labels = c("Voiced C", "Other sound or pause"))) %>% 
  # express voicing as a percentage (0-100) for the figures; divide by 100
  # again wherever a proportion is required
  mutate(voic_hum = voic_hum * 100)

Table 1

# Final beta regression (Table 1), reduced by buildmer's stepwise elimination.
# `temp` stores voic_hum as a percentage (0-100), but the beta family only
# accepts responses strictly between 0 and 1, so convert it back to a
# proportion before fitting.
temp_prop <- temp %>% 
  mutate(voic_hum = voic_hum / 100)

model1 <- buildmer::buildglmmTMB(voic_hum ~  # to do stepwise elimination
                     (ratio_cond + 
                     fol_phon_bin5 +
                     stress +
                     log(wd_freq+1) +
                     pre_phon_bin2 +
                     post_spk_rate_dip +
                     wd_class_bin)^2 + 
                     (1+ratio_cond|file) + (1+ratio_cond|wd_upper), 
                   family = beta_family(link = "logit"), data = temp_prop)
summary(model1)

Figure 4

Facetted scatterplots of FRC by following phonological context

# Scatterplots of /s/ voicing (%) against FRC with a linear fit, one panel per
# following phonological context; the y-axis view is limited to 0-100.
temp %>% 
  ggplot(aes(ratio_cond, voic_hum)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = lm, color = "black") +
  facet_wrap(~fol_phon_bin5) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman")) +
  labs(x = "Forms' Ratio of Conditioning", y = "/s/ voicing (%)") +
  coord_cartesian(ylim = c(0, 100))
# resolve the output path with here(), as the other ggsave() calls in this
# script do, so the file location does not depend on the working directory
ggsave(here("plot_frcXfolphon.png"), width = 7, height = 4)

Figure 5

Boxplot of voicing by following sound

# Boxplots of /s/ voicing (%) by following sound; the group mean is overlaid
# as a cross (shape 4).
temp %>% 
  ggplot(aes(fol_phon_bin5, voic_hum)) +
  # FALSE spelled out: `F` is an ordinary variable that can be reassigned
  geom_boxplot(notch = FALSE) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman")) +
  labs(x = "Following Sound", y = "/s/ voicing (%)") +
  stat_summary(fun = mean, geom = "point", shape = 4, size = 3)
ggsave(here("plot_folXvoicing.png"), width = 7, height = 4)

Figure 6

Scatterplot of voicing by post speech rate

# /s/ voicing (%) against the z-scored post speech rate, with a linear trend
# line drawn over the semi-transparent points
ggplot(data = temp, mapping = aes(x = post_spk_rate_dip, y = voic_hum)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = lm, color = "black") +
  labs(
    x = "Post Speech Rate (z-score)",
    y = "/s/ voicing (%)"
  ) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman"))
ggsave(
  filename = here("plot_voicingXspch_rate.png"),
  width = 7,
  height = 4
)

Figure 7

Boxplot of voicing by word class

# Notched boxplots of /s/ voicing (%) by word class; the group mean is
# overlaid as a cross (shape 4).
temp %>% 
  ggplot(aes(wd_class_bin, voic_hum)) +
  # TRUE spelled out: `T` is an ordinary variable that can be reassigned
  geom_boxplot(notch = TRUE) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman")) +
  labs(x = "Word Class", y = "/s/ voicing (%)") +
  stat_summary(fun = mean, geom = "point", shape = 4, size = 3)
ggsave(here("plot_wdClassXvoicing.png"), width = 7, height = 4)

Figure 8

Facetted scatterplots of voicing by lexical frequency by following sound

# /s/ voicing (%) against log lexical frequency, one panel per following
# sound, each with its own linear trend line
ggplot(data = temp, mapping = aes(x = log(wd_freq + 1), y = voic_hum)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = lm, color = "black") +
  facet_wrap(~fol_phon_bin5) +
  labs(
    x = "Lexical frequency (log)",
    y = "/s/ voicing (%)"
  ) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman"))
ggsave(
  filename = here("plot_voicingXfreq.png"),
  width = 7,
  height = 4
)

Figure 9

Facetted scatterplots of voicing by lexical frequency by word class

# /s/ voicing (%) against log lexical frequency, one panel per word class,
# each with its own linear trend line
temp %>%
  # NOTE(review): the labels are assigned to the sorted existing values of
  # wd_class_bin; confirm that the content-word value sorts first
  mutate(
    wd_class_bin = factor(
      wd_class_bin,
      labels = c("Content word", "Function word")
    )
  ) %>%
  ggplot(mapping = aes(x = log(wd_freq + 1), y = voic_hum)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = lm, color = "black") +
  facet_wrap(~wd_class_bin) +
  labs(
    x = "Lexical frequency (log)",
    y = "/s/ voicing (%)"
  ) +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman"))
ggsave(
  filename = here("plot_wdClassXfreq.png"),
  width = 7,
  height = 4
)