--- title: "Brisbane data analysis" author: "Diana Bicona" date: "28/06/2025" output: html_document: default word_document: default --- ```{r include=FALSE} library(r2glmm) library(tidyverse) library(gridExtra) library(psych) library(pheatmap) library(sjPlot) library(data.table) library(knitr) library(kableExtra) library(nlme) library(ggplot2) library(simr) full_data=read.table(file="AnalysisPhenotypes_BATS_phenotypescovariates_withPRS.txt",header=T,sep='\t') school_data = read.csv(file= "School_Data.csv") SES_data = read.csv(file= "FAM_SES.csv") ``` ```{r echo=FALSE} #Creating separate data set for PCA analysis pca_data and a seperate dataset for main analysis consisting of variables of interest pca_variables<- c("READING_IRREGULAR_CORRECT", "READING_REGULAR_CORRECT", "READING_NONWORD_CORRECT", "SPELLING_IRREGULAR_CORRECT", "SPELLING_REGULAR_CORRECT", "SPELLING_NONWORD_CORRECT") myvars<- c("FID", "IID" ,"Std_S7", "Std_S8", "FAM_ID", "ZYGOSITY","READING_IRREGULAR_CORRECT", "READING_REGULAR_CORRECT", "READING_NONWORD_CORRECT", "SPELLING_IRREGULAR_CORRECT", "SPELLING_REGULAR_CORRECT", "SPELLING_NONWORD_CORRECT", "PC1","PC2", "PC3","PC4","PC5", "PC6","PC7","PC8", "PC9","PC10","PC11", "PC12","PC13","PC14", "PC15","PC16","PC17", "PC18","PC19","PC20", "ImputationRun_Hap","ImputationRun_GSA", "SEX01", "RD1.age", "RD2.age") pca_data <- full_data[pca_variables] pca_data[is.na(pca_data)] <- 0 twin_data <- full_data[myvars] ``` ```{r echo=FALSE} alpha(pca_data) ``` ```{r echo=FALSE} #Exploring average age and SD of age knitr::kable(full_data %>% summarise( N_families = n_distinct(FID), N_individuals = n_distinct(IID), Average_Age= round(mean(RD1.age, na.rm = TRUE), 1), SD_Age = round(sd(RD1.age, na.rm = TRUE), 1), )) ``` ```{r} #Explore male/female ratio library( data.table ) setDT(full_data)[ , 100 * .N / nrow( full_data ), by = SEX ] ``` ```{r echo= FALSE} #Summary of reading and spelling scores knitr::kable(pca_data %>% summarise(across(everything(), list(M = mean, SD = sd))) %>% pivot_longer(everything()) %>% mutate( value = round(value, 2), name = str_replace(name, '_M', '.M'), name = str_replace(name, '_SD', '.SD') ) %>% separate(name, into = c('Variable', 'summary'), sep = '\\.') %>% pivot_wider(names_from = summary, values_from = value), caption = "Summary") %>% kable_styling("striped") ``` ```{r echo=FALSE} #Exploring the data summary(pca_data) ``` ```{r echo=FALSE} R <- cor(pca_data) pheatmap(R, breaks = seq(-1, 1, length.out = 100)) ``` ```{r echo=FALSE} #Show that the correlation matrix and the covariance matrix of the standardized variables are identical all.equal(cor(pca_data), cov(scale(pca_data))) ``` ```{r} #As the standard deviations in summary stats were quite different (and so will the variances), I will perform PCA using the correlation matrix. 
language_pca <- principal(pca_data, nfactors = ncol(pca_data), covar = FALSE, rotate = 'none')
language_pca$loadings
```

```{r}
# Running a few checks to confirm a general factor - the parallel analysis and scree plot suggest one component
fa.parallel(pca_data, fa = "pc", quant = .95)
```

```{r tab2, echo=FALSE, out.height="50%"}
# PCA loadings in a table
loadings <- language_pca$loadings[, 1]
knitr::kable(loadings, caption = "Principal Component Analysis Loadings", digits = 2) %>%
  kable_styling("striped")
```

```{r echo=FALSE}
# Adding the first component score and the standardised polygenic score (Std_S8) to the twin data
twin_data <- twin_data %>%
  mutate(
    skills_score1 = language_pca$scores[, 1],
    prs           = scale(Std_S8)
  )
```

```{r echo=FALSE}
# Regressing the PCA component on age, age squared and sex, and keeping the standardised residuals
pca_score_Z <- lm(skills_score1 ~ RD1.age + I(RD1.age^2) + SEX01, data = twin_data, na.action = na.exclude)
twin_data$pca_score_Z <- rstandard(pca_score_Z)
```

```{r echo=FALSE}
ggplot(data = twin_data, aes(x = pca_score_Z)) +
  geom_histogram(fill = 'deeppink3', color = 'black') +
  labs(title = "Histogram of residuals", x = "Residuals", y = "Frequency")
```

```{r}
# Count the number of outliers (standardised residuals below -4)
count_below_4 <- sum(twin_data$pca_score_Z < -4, na.rm = TRUE)
count_below_4
```

```{r echo=FALSE}
# Outliers (residuals lower than -4) replaced with NA
twin_data$pca_score_Z[twin_data$pca_score_Z < -4] <- NA
```

```{r echo=FALSE}
# Let's see how the histogram looks now
ggplot(data = twin_data, aes(x = pca_score_Z)) +
  geom_histogram(fill = 'deeppink3', color = 'black') +
  labs(title = "Histogram of residuals", x = "Residuals", y = "Frequency")
```

```{r echo=FALSE}
# Zygosity recoded into fewer categories: MZs (twin = 1), DZs/other siblings (twin = 2) and the rest (twin = 0);
# gen flags MZ pairs (1) versus everyone else (0)
twin_data$twin[twin_data$ZYGOSITY == 1 | twin_data$ZYGOSITY == 2] <- 1
twin_data$twin[twin_data$ZYGOSITY == 3 | twin_data$ZYGOSITY == 4 | twin_data$ZYGOSITY == 5 | twin_data$ZYGOSITY == 6] <- 2
twin_data$twin[is.na(twin_data$twin)] <- 0

twin_data$gen[twin_data$ZYGOSITY == 1 | twin_data$ZYGOSITY == 2] <- 1
twin_data$gen[twin_data$ZYGOSITY == 3 | twin_data$ZYGOSITY == 4 | twin_data$ZYGOSITY == 5 | twin_data$ZYGOSITY == 6] <- 0
twin_data$gen[is.na(twin_data$gen)] <- 0
```

```{r echo=FALSE}
# Retaining only one MZ twin per family
dataMZs <- subset(twin_data, twin_data$twin == 1)
# Randomise row order before removing duplicates
set.seed(123)  # ensures reproducibility
dataMZs <- dataMZs[sample(nrow(dataMZs)), ]  # random selection
# Keep only one twin per family
dataMZs <- dataMZs[!duplicated(dataMZs$FAM_ID), ]

dataDZs <- subset(twin_data, twin_data$gen == 0)
data1   <- data.frame(rbind(dataMZs, dataDZs))
```

```{r}
# Scaling the ancestry components, as the model using them as covariates otherwise had trouble fitting
data1 <- data1 %>% mutate(across(starts_with("PC"), ~ as.numeric(scale(.))))
```

```{r}
# install.packages("lmerTest")  # uncomment if not already installed
library(lmerTest)  # lmerTest so that summary() reports p-values for the fixed effects
model <- lmer(pca_score_Z ~ prs + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
                PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 +
                ImputationRun_Hap + (1 | FAM_ID),
              data = data1)
summary(model)
```

```{r}
library(parameters)
# Standardised coefficients for the fixed effects
so1 <- standardize_parameters(model)
print(so1)
```

```{r echo=FALSE}
# Looking at the variance explained - using the composite score, the PRS explains 3.7% of the variance.
# In the GWAS Nature Genetics article it was up to 3.6%.
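# (Added sketch:) the semi-partial R^2 for the prs term can also be pulled out directly;
# r2beta() returns one row per fixed effect with an "Effect" column (this assumes the
# "nsj" method used elsewhere in this document).
r2_prs <- r2beta(model, method = "nsj", partial = TRUE)
r2_prs[r2_prs$Effect == "prs", ]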
r2beta(model)
```

```{r}
# Summary of the SES data
knitr::kable(SES_data %>% summarise(
  N_families    = n_distinct(F_ID),
  N_individuals = n_distinct(ID),
  Average_SEI   = round(mean(TM.SEI, na.rm = TRUE), 1),
  SD_SEI        = round(sd(TM.SEI, na.rm = TRUE), 1)
))
```

```{r}
hist(SES_data$TM.SEI)
```

```{r echo=FALSE}
# Reducing the SES data to one average family SES index per family
SES_data_reduced <- SES_data %>%
  group_by(F_ID) %>%
  summarise(
    SEI = mean(TM.SEI, na.rm = TRUE),  # average SEI per family
    .groups = "drop"                   # ensure no residual grouping
  )
```

```{r echo=FALSE}
# Removing any remaining duplicates before merging the data for the SES model
SES_data_reduced <- SES_data_reduced[!duplicated(SES_data_reduced$F_ID), ]
```

```{r}
# Trimming whitespace from the family IDs so the merge keys match
SES_data_reduced$F_ID <- trimws(SES_data_reduced$F_ID)
data1$FAM_ID <- trimws(data1$FAM_ID)
```

```{r}
data2 <- merge(data1, SES_data_reduced, by.x = "FAM_ID", by.y = "F_ID")
```

```{r echo=FALSE}
# SES is given as a score from 0-100; standardising it so the effect is more interpretable
data2 <- data2 %>% mutate(SES_s = scale(SEI))
```

```{r}
ggplot(data = data2, aes(x = SES_s)) +
  geom_histogram(fill = 'deeppink3', color = 'black') +
  labs(title = "Histogram of standardised SES", x = "Standardised SES", y = "Frequency")
```

```{r}
ggplot(data = data2, aes(x = pca_score_Z)) +
  geom_histogram(fill = 'deeppink3', color = 'black') +
  labs(title = "Histogram of residuals", x = "Residuals", y = "Frequency")
```

```{r echo=FALSE}
# From 716 families in the SES data and 767 families in the polygenic score data, 604 families remain after merging
knitr::kable(data2 %>% summarise(
  N_families    = n_distinct(FAM_ID),
  N_individuals = n_distinct(IID)
))
```

```{r}
# Family IDs present in the PRS data but not in the SES data
setdiff(data1$FAM_ID, SES_data_reduced$F_ID)
```

```{r}
# Family IDs present in the SES data but not in the PRS data
setdiff(SES_data_reduced$F_ID, data1$FAM_ID)
```

```{r}
str(SES_data_reduced$F_ID)  # check F_ID type
str(data1$FAM_ID)           # check FAM_ID type
```

```{r}
# Count unique individuals (IID) per family (FAM_ID) in the merged data
family_distribution <- data2 %>%
  group_by(FAM_ID) %>%
  summarise(num_individuals = n_distinct(IID), .groups = "drop") %>%
  count(num_individuals, name = "num_families")
# View the distribution of family sizes
print(family_distribution)
```

```{r}
# The same distribution of family sizes in the full dataset
family_distribution1 <- full_data %>%
  group_by(FID) %>%
  summarise(num_individuals = n_distinct(IID), .groups = "drop") %>%
  count(num_individuals, name = "num_families")
print(family_distribution1)
```

```{r echo=FALSE}
# Grouping SES into Low (0-30), Average (30-70) and High (70-100)
data2$SESgroups <- cut(data2$SEI, breaks = c(0, 30, 70, 100), labels = c('Low', 'Average', 'High'))
```

```{r}
# Mean PRS within each SES group
aggregate(prs ~ SESgroups, data = data2, mean)
```

```{r echo=FALSE}
# Exploring SES and the PRS-by-SES interaction
model2 <- lmer(pca_score_Z ~ prs * SES_s + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
                 PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 +
                 ImputationRun_Hap + (1 | FAM_ID),
               data = data2, na.action = na.exclude)
```

```{r echo=FALSE}
summary(model2)
```

```{r echo=FALSE}
r2beta(model2, method = "nsj", partial = TRUE)
```
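The `simr` package is loaded in the setup chunk but not otherwise used. As a cross-check of the simulation-based power calculation below, the power for the `prs:SES_s` interaction could also be estimated directly from the fitted model with `simr`. The following is a minimal sketch (not part of the original analysis) that assumes `model2` fitted without problems, sets the interaction effect to the same 0.03 used in the manual simulation, and uses a small `nsim` for speed; it is marked `eval=FALSE`, so remove that option to actually run it.

```{r eval=FALSE}
# Sketch only: power for the prs:SES_s interaction using simr on the observed design in data2
model2_power <- model2
fixef(model2_power)["prs:SES_s"] <- 0.03  # assumed interaction effect size (matches the manual simulation below)
powerSim(model2_power, test = fixed("prs:SES_s", "t"), nsim = 100)
```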
```{r}
# Power calculation for the prs-by-SES interaction, by simulating data and refitting the model repeatedly

# Parameters
n_sim <- 1000
interaction_effect <- 0.03   # interaction effect size
n_groups <- 604              # number of groups (families)
n_total  <- 1307             # total sample size
p_values <- numeric(n_sim)

set.seed(123)  # for reproducibility
for (i in 1:n_sim) {
  # Simulate data
  data <- data.frame(
    FAM_ID = rep(1:n_groups, each = ceiling(n_total / n_groups), length.out = n_total),  # family grouping
    prs    = rnorm(n_total, mean = 0, sd = 1),   # predictor 1
    SES_s  = rnorm(n_total, mean = 0, sd = 1)    # predictor 2
  )
  random_intercepts <- rnorm(n_groups, mean = 0, sd = 1)  # random intercepts

  # Simulate the outcome with the interaction effect
  data$pca_score_Z <- random_intercepts[data$FAM_ID] +
    interaction_effect * data$prs * data$SES_s +  # interaction term
    rnorm(n_total, mean = 0, sd = 1)              # residual noise

  # Fit the mixed-effects model (a separate name so the main PRS model above is not overwritten)
  sim_model <- lme(fixed = pca_score_Z ~ prs * SES_s,
                   random = ~ 1 | FAM_ID,
                   data = data,
                   na.action = na.exclude)

  # Extract the p-value for the interaction effect
  p_values[i] <- summary(sim_model)$tTable["prs:SES_s", "p-value"]
}

# Calculate power
power <- mean(p_values < 0.05)
cat("Estimated power:", power)
```

```{r echo=FALSE}
# Merging in the school data
data3 <- merge(data1, school_data, by = "IID")
data4 <- merge(data3, data2, by = "IID")
knitr::kable(data4 %>%
  group_by(school_type) %>%
  summarise(N_individuals = n_distinct(IID)))
```

```{r}
knitr::kable(
  data2 %>%
    group_by(SESgroups) %>%
    summarise(
      Average_PRS = round(mean(prs), 2),
      SD_PRS      = round(sd(prs), 2),
      Group_Size  = n()
    ),
  caption = "Average polygenic scores and standard deviations across SES levels"
)
```

```{r}
# The school-type sample is quite small - looking at how much variance the PRS explains here
prsmodelschool <- lmer(pca_score_Z ~ prs + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
                         PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 +
                         ImputationRun_Hap + (1 | FAM_ID),
                       data = data3, na.action = na.exclude)
summary(prsmodelschool)
```

```{r}
# About half of the variance explained compared with the larger sample
r2beta(prsmodelschool)
```

```{r echo=FALSE}
# School model: PRS-by-school-type interaction
model4 <- lmer(pca_score_Z ~ prs * type + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
                 PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 +
                 ImputationRun_Hap + (1 | FAM_ID),
               data = data3, na.action = na.exclude)
```

```{r echo=FALSE}
summary(model4)
```

```{r}
r2beta(model4)
```

```{r}
# School-type-by-PRS interaction with SES as a covariate (the .x suffixes come from merging data3 and data2)
model5 <- lmer(pca_score_Z.x ~ type * prs.x + SES_s + PC1.x + PC2.x + PC3.x + PC4.x + PC5.x + PC6.x + PC7.x +
                 PC8.x + PC9.x + PC10.x + PC11.x + PC12.x + PC13.x + PC14.x + PC15.x + PC16.x + PC17.x +
                 PC18.x + PC19.x + PC20.x + ImputationRun_Hap.x + (1 | FAM_ID.x),
               data = data4, na.action = na.exclude)
```

```{r}
summary(model5)
```

```{r}
r2beta(model5)
```
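`sjPlot` is loaded in the setup chunk but not used above. As a possible way to present the results, here is a minimal sketch (not part of the original analysis): `plot_model(..., type = "int")` plots predicted values for the interaction terms of `model2`, and `tab_model()` puts the PRS-only and PRS-by-SES models side by side.

```{r}
# Sketch only: visualise the prs x SES_s interaction and tabulate the two main models
plot_model(model2, type = "int")
tab_model(model, model2, show.re.var = TRUE)
```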