--- title: "Data Analysis in NCDS" author: "Dian Bicona" date: "28/06/2025" output: html_document --- ```{r include=FALSE} library(r2glmm) library(tidyverse) # data manipulation and visualization library(gridExtra) library(psych) library(pheatmap) library(sjPlot) library(data.table) library(knitr) library(kableExtra) library(naniar) library(dplyr) ``` ```{r include= FALSE} #Reading in NCDS polygenic scores ncds_data <- read.table(file = "prs.score.DYSLEXIA.all_score", header = TRUE) ncds_head <- read.table(file= "ncds_head.csv", header = TRUE, sep= ",") ncds <- read.table(file="language_and_reading_measures_for_Diana.csv", header= TRUE, sep=",") ncds_ancestry <- read.table(file = "NCDS_ancestry_components.csv", header= TRUE, sep=",") ``` ```{r include= FALSE} #Tidying up the datasets before merging ncds_head <- ncds_head %>% group_by(iid) %>% mutate( IID = sub(".*_", "", iid) ) ncds<- ncds %>% mutate( IID = lucid ) #merging the data together ncds_full<- merge(ncds_data, ncds_head, by="IID") ncds_full<- merge(ncds_full, ncds, by= "IID") ncds_full<-merge(ncds_full,ncds_ancestry, by="iid") ncds_full$Std_PRS <- scale(ncds_full$Pt_1) ``` ```{r} #checking the sample size knitr::kable(ncds_full %>% summarise( N_individuals = n_distinct(IID) )) ``` ```{r echo=FALSE} # creeating data sets for PCA pca7 <- ncds_full[c("N68", "N81", "N92")] pca16 <- ncds_full[c("N2928", "N2245")] ``` ```{r echo= FALSE} #tidying up PCA datasets pca7$N68<- na_if(pca7$N68, -1) pca7$N81<- na_if(pca7$N81, -1) pca7$N92<- na_if(pca7$N92, -1) pca7$N92<- na_if(pca7$N92, 0) ``` ```{r echo=FALSE} #reverse code some variables for age 7 PCA pca7<- na.omit(pca7) pca7$N81 = 8 - pca7$N81 pca7$N68 = 6 - pca7$N68 ``` ```{r echo=FALSE} #getting rid of NAs pca7$N81<- na_if(pca7$N81, 7) pca7<- na.omit(pca7) ``` ```{r echo=FALSE} #Exploring the distribution of scores for each measure hist(pca7$N68) ``` Normally distributed ```{r echo=FALSE} hist(pca7$N81) ``` Non-normal distribution, moderately skewness ```{r echo=FALSE} hist(pca7$N92) ``` Non normal distribution, highly skewed ``` {r echo=FALSE} #scaling the variables prior to PCA pca7 <- pca7 %>% mutate( N68 = scale(N68), N81 = scale(N81), N92 = scale(N92) ) ``` ```{r echo=FALSE} psych::alpha(pca7) ``` ```{r echo=FALSE} #Correlation matrix Rpca <- cor(pca7) pheatmap(Rpca, breaks = seq(-1, 1, length.out = 100)) ``` ```{r echo=FALSE} #Conducting the PCA for Age 7 age7_pca <- principal(pca7, nfactors = ncol(pca7), covar = TRUE, rotate = 'none') age7_pca$loadings ``` ```{r} omega(pca7, nfactors=1) ``` ```{r} fa.parallel(pca7, fa="pc", quant=.95) ``` ```{r echo=FALSE} #Loadings in a table loadings7 <- age7_pca$loadings[, 1:1] knitr::kable(loadings7, caption = "PC 1", digits= 2) %>% kable_styling("striped") ``` ```{r echo=FALSE} #Data manipulation for Age 16 PCA pca16$N2928<- na_if(pca16$N2928, -1) pca16$N2245<- na_if(pca16$N2245, -1) pca16$N2245 <- na_if(pca16$N2245, 6) pca16<- na.omit(pca16) ``` ```{r echo=FALSE} #Data manipulation for Age 16 PCA; changing the sex variable so it's binary in R pca16$N2245<- 6 - pca16$N2245 ncds_full$sex<- ncds_full$sex -1 ``` ```{r echo=FALSE} #Summary stats for Age 16 variables knitr::kable(pca16 %>% summarise(across(everything(), list(M = mean, SD = sd))) %>% pivot_longer(everything()) %>% mutate( value = round(value, 2), name = str_replace(name, '_M', '.M'), name = str_replace(name, '_SD', '.SD') ) %>% separate(name, into = c('Variable', 'summary'), sep = '\\.') %>% pivot_wider(names_from = summary, values_from = value), caption = "Summary") %>% kable_styling("striped") ``` ```{r echo=FALSE} #Scaling the variables for Age 16 PCA pca16 <- pca16 %>% mutate( N2928= scale(N2928), N2245 = scale(N2245) ) ``` ```{r echo=FALSE} psych::alpha(pca16) ``` ```{r} omega(pca16, nfactors=1) ``` ```{r echo=FALSE} Rpca16 <- cor(pca16) pheatmap(Rpca16, breaks = seq(-1, 1, length.out = 100)) ``` ```{r echo=FALSE} age16pca <- principal(pca16, nfactors = ncol(pca16), covar = FALSE, rotate = 'none') age16pca$loadings ``` ```{r} fa.parallel(pca16, fa="pc", quant=.95) ``` ```{r} #Loadings in a table loadings <- age16pca$loadings[, 1:1] knitr::kable(loadings, caption = "PC 1", digits= 2) %>% kable_styling("striped") ``` ```{r} #Recoding SES categories in the full dataset ncds_full$SES <- NA # Initialize with NA ncds_full$SES[ncds_full$N190 == 2] <- 8 ncds_full$SES[ncds_full$N190 == 3] <- 7 ncds_full$SES[ncds_full$N190 == 4] <- 6 ncds_full$SES[ncds_full$N190 == 5] <- 5 ncds_full$SES[ncds_full$N190 == 6] <- 4 ncds_full$SES[ncds_full$N190 == 7] <- 3 ncds_full$SES[ncds_full$N190 == 8] <- 2 ncds_full$SES[ncds_full$N190 == 1] <- 1 ncds_full$SES[is.na(ncds_full$N190) | ncds_full$N190 == -1] <- NA ``` ```{r} #Manipulating the dataset for Age 7 model, getting rid of NAs to match PCA dataset, etc age7 <- ncds_full[c("iid", "IID", "Pt_1", "Pt_5e.08", "Pt_5e.05","Pt_0.001", "sex", "Array", "N68", "N81", "N92", "SES", "Std_PRS")] age7 <- merge(age7, ncds_ancestry, by = "iid") age7$N68<- na_if(age7$N68, -1) age7$N81<- na_if(age7$N81, -1) age7$N81<- na_if(age7$N81, 1) age7$N92<- na_if(age7$N92, -1) age7$N92<- na_if(age7$N92, 0) age7 <- age7[complete.cases(age7[ , c('N68', 'N81', 'N92')]), ] age7$SES<- na_if(age7$SES, 0) age7 <- age7 %>% mutate( reading_composite = age7_pca$scores[,1] ) ``` "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10" ```{r echo=FALSE} #Exploring histogram of the composite score hist(age7$reading_composite) ``` ```{r echo=FALSE} #Adjusting for sex, standardising the score again pca7score = lm(data=age7, reading_composite ~ sex , na.action=na.exclude) age7$pca7score <- rstandard(pca7score) ``` ```{r echo=FALSE} #Checking the histogram again, looks better hist(age7$pca7score) ``` ```{r} # Count the number of values in pca7score that are less than -3 count_below_neg3 <- sum(age7$pca7score < -3) # Display the result count_below_neg3 ``` ```{r} # List all values in pca7score that are less than -3 values_below_neg3 <- age7$pca7score[age7$pca7score < -3] # Display the result values_below_neg3 ``` ```{r echo=FALSE} #Model 1, regressing only on PRS age7_m1 <- lm(pca7score ~ Std_PRS + Array + PC1 + PC2 + PC3 + PC4 +PC5 + PC6 +PC7 +PC8 + PC9 +PC10 , data = age7, na.action =na.exclude) summary(age7_m1) ``` ```{r} library(boot) # Define a function to calculate R-squared using the existing model formula rsq <- function(data, indices) { d <- data[indices, ] # Resample the data fit <- lm(formula(age7_m1), data = d) # Use formula from existing model with resampled data summary(fit)$r.squared } # Run bootstrapping set.seed(123) # For reproducibility results <- boot(data = age7, statistic = rsq, R = 1000) # Use the correct dataset # Calculate and display the original R-squared original_rsq <- summary(age7_m1)$r.squared print(paste("Original R-squared:", round(original_rsq, 2))) # Get 95% confidence interval for the bootstrapped R-squared values ci <- boot.ci(results, type = "perc") print(ci) ``` ```{r} library(parameters) standardized_output <- standardize_parameters(age7_m1) print(standardized_output) ``` ```{r} #Interaction model age7_m2 <- lm(pca7score ~ Std_PRS * scale(SES) + Array + PC1 + PC2 + PC3 + PC4 +PC5 + PC6 +PC7 +PC8 + PC9 +PC10 , data = age7, na.action =na.exclude) summary(age7_m2) ``` ```{r} standardized_output2 <- standardize_parameters(age7_m2) print(standardized_output2) ``` ```{r} library(simr) #Power calculations # Parameters n_sim <- 1000 effect_sizes <- c(0.02, 0.05, 0.075, 0.1) # Different effect sizes n_total <- 5712 # Total sample size results <- data.frame(effect_size = effect_sizes, power = NA) # To store results # Loop through each effect size set.seed(123) # For reproducibility for (j in seq_along(effect_sizes)) { interaction_effect <- effect_sizes[j] # Current effect size p_values <- numeric(n_sim) # To store p-values for this effect size # Simulate data and fit the model repeatedly for (i in 1:n_sim) { # Simulate data data <- data.frame( std_prs = rnorm(n_total, mean = 0, sd = 1), # Polygenic score predictor SES = rnorm(n_total, mean = 0, sd = 1) # SES predictor ) # Simulate the outcome with interaction effect data$pca7score <- interaction_effect * data$std_prs * data$SES + rnorm(n_total, mean = 0, sd = 1) # Residual noise # Fit the linear model model <- lm(pca7score ~ std_prs * SES, data = data, na.action = na.exclude) # Extract p-value for the interaction effect p_values[i] <- summary(model)$coefficients["std_prs:SES", 4] } # Calculate power for this effect size results$power[j] <- mean(p_values < 0.05) } # Display results print(results) ``` ```{r} #Code for getting CIs rsq <- function(data, indices) { d <- data[indices, ] # Resample the data fit <- lm(formula(age7_m2), data = d) # Use formula from existing model with resampled data summary(fit)$r.squared } # Run bootstrapping set.seed(123) # For reproducibility results <- boot(data = age7, statistic = rsq, R = 1000) # Use the correct dataset # Calculate and display the original R-squared original_rsq <- summary(age7_m2)$r.squared print(paste("Original R-squared:", round(original_rsq, 2))) # Get 95% confidence interval for the bootstrapped R-squared values ci <- boot.ci(results, type = "perc") print(ci) ``` ```{r echo=FALSE} #Creating dataset for age 16 model age16 <- ncds_full[c("iid", "IID", "Pt_1", "Pt_5e.08", "sex", "Array", "N2928", "N2199", "N2245", "SES", "Std_PRS" )] age16 <- merge(age16, ncds_ancestry, by = "iid") ``` ```{r echo=FALSE} #Tidying up that dataset to match PCA dataset age16$N2928<- na_if(age16$N2928, -1) age16$N2245<- na_if(age16$N2245, -1) age16$N2245<- na_if(age16$N2245, 6) age16 <- age16[complete.cases(age16[ , c('N2928', 'N2245')]), ] age16$SES<- na_if(age16$SES, 0) age16 <- age16 %>% mutate( reading_composite16 = age16pca$scores[,1], std_prs = scale(Pt_1), std_prs2 = scale(Pt_5e.08) ) ``` ```{r echo=FALSE} #Adjusting and standardising the pca score pca16score_adj = lm(reading_composite16 ~ sex, data= age16, na.action=na.exclude) age16$pca16score <- rstandard(pca16score_adj) age16$SES<- na_if(age16$SES, 0) ``` ```{r echo=FALSE} #Checking the distribution of PCA scores hist(age16$pca16score) ``` ```{r echo-FALSE} #Prettier histogram ggplot(data=age16, aes(x=pca16score))+ geom_histogram(fill='deeppink3', color = 'black')+ labs(title="Histogram of residuals", x="Residuals", y="Frequency") ``` ```{r} age16_m <- lm(pca16score ~ Std_PRS + Array + PC1 + PC2 + PC3 + PC4 +PC5 + PC6 +PC7 +PC8 + PC9 +PC10 ,data = age16, na.action =na.exclude) summary(age16_m) ``` ```{r} standardized_output3 <- standardize_parameters(age16_m) print(standardized_output3) ``` ```{r} #Code for getting CIs for age 16 model rsq <- function(data, indices) { d <- data[indices, ] # Resample the data fit <- lm(formula(age16_m), data = d) # Use formula from existing model with resampled data summary(fit)$r.squared } # Run bootstrapping set.seed(123) # For reproducibility results <- boot(data = age16, statistic = rsq, R = 1000) # Use the correct dataset # Calculate and display the original R-squared original_rsq <- summary(age16_m)$r.squared print(paste("Original R-squared:", round(original_rsq, 2))) # Get 95% confidence interval for the bootstrapped R-squared values ci <- boot.ci(results, type = "perc") print(ci) ``` ```{r echo=FALSE} #Interaction model for age 16 age16_m1 <- lm(pca16score ~ Std_PRS * scale(SES) + Array + PC1 + PC2 + PC3 + PC4 +PC5 + PC6 +PC7 +PC8 + PC9 +PC10 , data = age16, na.action =na.exclude) summary(age16_m1) ``` ```{r} standardized_output4 <- standardize_parameters(age16_m1) print(standardized_output4) ``` ```{r} rsq <- function(data, indices) { d <- data[indices, ] # Resample the data fit <- lm(formula(age16_m1), data = d) # Use formula from existing model with resampled data summary(fit)$r.squared } # Run bootstrapping set.seed(123) # For reproducibility results <- boot(data = age16, statistic = rsq, R = 1000) # Use the correct dataset # Calculate and display the original R-squared original_rsq <- summary(age16_m1)$r.squared print(paste("Original R-squared:", round(original_rsq, 2))) # Get 95% confidence interval for the bootstrapped R-squared values ci <- boot.ci(results, type = "perc") print(ci) ``` ```{r} #Power calculations for age 16 subsample n_sim <- 1000 effect_sizes <- c(0.02, 0.05, 0.075, 0.1) # Different effect sizes n_total <- 4809 # Total sample size results <- data.frame(effect_size = effect_sizes, power = NA) # To store results # Loop through each effect size set.seed(123) # For reproducibility for (j in seq_along(effect_sizes)) { interaction_effect <- effect_sizes[j] # Current effect size p_values <- numeric(n_sim) # To store p-values for this effect size # Simulate data and fit the model repeatedly for (i in 1:n_sim) { # Simulate data data <- data.frame( std_prs = rnorm(n_total, mean = 0, sd = 1), # Polygenic score predictor SES = rnorm(n_total, mean = 0, sd = 1) # SES predictor ) # Simulate the outcome with interaction effect data$pca16score <- interaction_effect * data$std_prs * data$SES + rnorm(n_total, mean = 0, sd = 1) # Residual noise # Fit the linear model model <- lm(pca16score ~ std_prs * SES, data = data, na.action = na.exclude) # Extract p-value for the interaction effect p_values[i] <- summary(model)$coefficients["std_prs:SES", 4] } # Calculate power for this effect size results$power[j] <- mean(p_values < 0.05) } # Display results print(results) ``` ```{r echo=FALSE} # Average Dyslexia PRS, PCA score, standard deviation, and number of observations in each SES group age16 %>% filter(!is.na(std_prs), !is.na(SES)) %>% # Remove rows where std_prs or SES is NA group_by(SES) %>% summarise( AverageDyslexiaPRS = round(mean(std_prs), 2), SD_DyslexiaPRS = round(sd(std_prs), 2), Number_of_observations = n_distinct(IID) ) ``` ```{r echo=FALSE} ```{r} citation("stats") ``` ```{r} citation("psych") ```