This page complements Imputation panel comparison (HRC
vs TOPMed) and PGS:QS. It follows the
manuscript Results section on separating missing SNPs
from genotype error, the PGS-impute approach (GCTB
--convert), supplementary figures, and Glaucoma / Hair
colour genotype checks.
## define predictor
traitfile="/QRISdata/Q6913/GCTB_predictor_list_for_batch_profiling.txt"
trait=$(sed "${i}q;d" $traitfile | awk '{print $1}' )
predictor=$(sed "${i}q;d" $traitfile | awk '{print $3}' )
echo $trait
echo $predictor
outdir=PRS_all_GCTB
mkdir -p $outdir
data1="WGS_NA10861"
genofile1=ceph_combined_SBRC_SNPs.vcf
plink \
--vcf $genofile1 \
--score $predictor 2 5 8 header sum \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--out ${outdir}/${data1}_${trait}_SBayesRC
data2=GSA_HRCr1.1
genofile2=Merged_plink/GSA_HRCr1.1_autosomes
plink \
--bfile $genofile2 \
--score $predictor 2 5 8 header sum \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--out ${outdir}/${data2}_${trait}_SBayesRC
data3=GSA_TopMed
genofile3=Merged_plink/GSA_TOPMedr3_autosomes_7.3M_subset
plink \
--bfile $genofile3 \
--score $predictor 2 5 8 header sum \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--out ${outdir}/${data3}_${trait}_SBayesRC
compare the PGS of NA10861 between the three resources
outdir=PRS_all_GCTB
traitfile="/QRISdata/Q6913/GCTB_predictor_list_for_batch_profiling.txt"
data3=GSA_TopMed
genofile3=Merged_plink/GSA_TOPMedr3_autosomes_7.3M_subset
data2=GSA_HRCr1.1
genofile2=Merged_plink/GSA_HRCr1.1_autosomes
Rscript merge_PGS.R ${genofile2}.fam $traitfile $outdir $data2
## input PGS from TopMed imputed
ceph_consis_PRS_TopMed = read.csv("Data/missingness_check/PRS_all_GCTB_GSA_TopMed_all_139_traits_GCTB_PRS.csv", row.names =1)
ceph_consis_PRS_TopMed= ceph_consis_PRS_TopMed[,match(included_traits, colnames(ceph_consis_PRS_TopMed) )]
ceph_consis_PRS_TopMed$imputation_panel = "TopMedr3"
## input PGS from HRC imputed
ceph_consis_PRS_HRC = read.csv("Data/missingness_check/PRS_all_GCTB_GSA_HRCr1.1_all_139_traits_GCTB_PRS.csv", row.names =1)
ceph_consis_PRS_HRC= ceph_consis_PRS_HRC[,match(included_traits, colnames(ceph_consis_PRS_HRC) )]
ceph_consis_PRS_HRC$imputation_panel = "HRCr1.1"
## merge
ceph_consis_PRS = rbind(ceph_consis_PRS_TopMed, ceph_consis_PRS_HRC)
## get the four samples
pattern <- paste(c("NA06990", "NA07023", "NA07057", "NA10861"), collapse = "|")
ceph_consis_PRS <- ceph_consis_PRS[grepl(pattern, row.names(ceph_consis_PRS)), ]
## input PGS from WGS using only overlap SNPs
ceph_consis_PRS_WGS = read.table("Data/missingness_check/NA10861_WGS_PRS_vGCTB.txt")
ceph_consis_PRS_WGS$V1 = gsub("-", ".",ceph_consis_PRS_WGS$V1 )
ceph_consis_PRS_WGS = (ceph_consis_PRS_WGS[ match(included_traits, ceph_consis_PRS_WGS$V1),])
colnames(ceph_consis_PRS_WGS) = c("variable", "CN1", "CN2", "NA10861")
mean.sd.table = read.csv("Data/LifeLines_MEAN_and_SD_of_traits_for_standardization.csv", row.names = 1)
mean.sd.table = mean.sd.table[match(included_traits, mean.sd.table$trait ),]
ceph_consis_PRS.norm = sweep(sweep ( sapply (ceph_consis_PRS[,included_traits], as.numeric ), 2, mean.sd.table$mean), 2, mean.sd.table$sd, FUN = '/')
ceph_consis_PRS.norm = data.frame(ceph_consis_PRS.norm)
ceph_consis_PRS.norm$imputation_panel = ceph_consis_PRS$imputation_panel
row.names(ceph_consis_PRS.norm) = row.names(ceph_consis_PRS)
ceph_consis_PRS.norm$IID = sapply(strsplit(row.names(ceph_consis_PRS), "_"), `[`, 4)
ceph_consis_PRS.norm$imputation_panel = factor(ceph_consis_PRS.norm$imputation_panel , levels = panel.order)
ceph_consis_PRS.norm = ceph_consis_PRS.norm %>%
arrange(imputation_panel) %>%
group_by(IID) %>%
mutate(y.position = row_number()) %>%
ungroup() # Ungroup to return to normal data frame structure
ceph_consis_PRS.norm$y.position =ceph_consis_PRS.norm$y.position * 5
## melt it
melted_ceph_consis_PRS = reshape2::melt(ceph_consis_PRS.norm,
id.vars =c("IID", "imputation_panel", "y.position") ,
measure.vars =included_traits )
colnames(melted_ceph_consis_PRS)[1] = "Study_ID"
ceph_consis_PRS_WGS$NA10861 = ( ceph_consis_PRS_WGS$NA10861 - mean.sd.table$mean) / mean.sd.table$sd
## add label
ceph_consis_PRS_WGS$Trait = predictor.list[match(ceph_consis_PRS_WGS$variable, predictor.list$Predictor),"Label"]
melted_ceph_consis_PRS$Trait = predictor.list[match(melted_ceph_consis_PRS$variable, predictor.list$Predictor),"Label"]
mean.prs.with.missing.summary = melted_ceph_consis_PRS%>%
group_by(Study_ID, variable, imputation_panel) %>%
summarise(
mean.prs = mean(value)
)%>%
pivot_wider(
names_from = imputation_panel,
values_from = mean.prs
)
mean.prs.with.missing.summary$difference = abs(mean.prs.with.missing.summary$TopMedr3 - mean.prs.with.missing.summary$HRCr1.1)
cross.sample.mean.prs.with.missing.summary= mean.prs.with.missing.summary %>% group_by(variable) %>% summarise(mean_difference = mean(difference)) %>% arrange(mean_difference)
cross.sample.mean = data.frame(cross.sample.mean)
cross.sample.mean.prs.with.missing.summary = data.frame(cross.sample.mean.prs.with.missing.summary)
cross.sample.mean$diff_under_same_snps = cross.sample.mean.prs.with.missing.summary[ match(cross.sample.mean$variable, cross.sample.mean.prs.with.missing.summary$variable), "mean_difference"]
ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_under_same_snps)) + geom_point() + geom_abline(intercept = 0, slope =1)

hist.a = ggplot(data = cross.sample.mean, aes(x = mean_difference)) + geom_histogram(bins =50) + xlab( "|PGS-TopMed - PGS-HRC| (in SD unit)" ) + theme_bw() + xlim(0,2.2) + ylim(0,55)
hist.b = ggplot(data = cross.sample.mean, aes(x = diff_under_same_snps)) + geom_histogram(bins =50) + xlab( "|PGS-TopMed - PGS-HRC| (in SD unit)" ) + theme_bw() + xlim(0,2.2) + ylim(0,55)
hist.supfig6 = ggarrange(hist.a, hist.b, labels = c("A", "B"))
hist.supfig6
ggsave( "Figures/SupFig6_hist_of_difference.jpeg", hist.supfig6, width = 12, height = 5)

example.traits = example.traits[5:8]
example.consist.hist = panel_plot(example.traits =example.traits, facet.col = 4, legend.pos = "bottom" , melted_ceph_consis_PRS , ceph_consis_PRS_WGS, melted.prs )
info.text.c = data.frame(
variable = example.traits,
Trait = predictor.list[match(example.traits, predictor.list$Predictor),"Label"],
variance = format(round((predictor.list[match(example.traits, predictor.list$Predictor),"total.variance"]) * (predictor.list[match(example.traits, predictor.list$Predictor),"QIMR_TopMed"]) ,3 ), nsmall=3) ,
# HRCr1.1 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) , "%"),
# TopMedr3 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) , "%"),
HRCr1.1 = "100%",
TopMedr3 = "100%",
x = -0.5,
y = Inf
)
info.text.c$variable = factor(info.text.c$variable , levels = info.text.c$variable )
info.text.c$Trait = factor(info.text.c$Trait , levels = info.text.c$Trait )
info.text.c$labels = paste0("V=", info.text.c$variance ,"; +:", info.text.c$HRCr1.1, "; \u25B3",":" , info.text.c$TopMedr3)
labeled.example.consist.hist = example.consist.hist +
geom_text(
data = info.text.c,
aes(x = median(x), y = y, label = (labels) ),
inherit.aes = FALSE,
vjust =1, # Push down from top
hjust = 0.5,
size = 4,
family = "Arial Unicode MS"
)
labeled.example.consist.hist

We are using correlation of PGS to show how the PGS got affected by SNP missingness, and how the convert function recover the PGS.
The function is added to GCTB.
SNP weight of the missing SNPs are converted to the existing SNPs.
## define predictor
traitfile="GCTB_predictor_list_with_7.3M.txt"
trait=$(sed "${i}q;d" $traitfile | awk '{print $1}' )
predictor=$(sed "${i}q;d" $traitfile | awk '{print $3}' )
echo $trait
echo $predictor
targetlist=SNP_list_for_nonmissing_test.txt
ldm=/QRISdata/Q6913/Pipeline/ukbEUR_Imputed
gctb="/scratch/project_mnt/S0007/jzeng/repo/GCTB_dev/scr/gctb"
$gctb --convert \
--snp-res $predictor \
--ldm-eigen $ldm \
--extract $targetlist \
--thread 8 \
--out Converted_predictors/${trait}_converted_nonmissing_SBayesRC_predictor
## input PGS from TopMed imputed using converted predictors
ceph_convert_PRS_TopMed = read.csv("Data/missingness_check/QIMR_TopMed_PGS_from_converted_predictors_QIMR_GSA_TopMedr3_imputed_all_123_traits_GCTB_PRS.csv", row.names =1)
included_traits = included_traits[included_traits %in% colnames(ceph_convert_PRS_TopMed)]
ceph_convert_PRS_TopMed= ceph_convert_PRS_TopMed[,match(included_traits, colnames(ceph_convert_PRS_TopMed) )]
ceph_convert_PRS_TopMed$imputation_panel = "TopMedr3"
## input PGS from HRC imputed using original predictor
ceph_PRS_HRC = read.csv("Data/QIMR/PRS_all_GCTB_GSA_HRCr1.1_all_140_traits_GCTB_PRS.csv", row.names =1)
ceph_PRS_HRC= ceph_PRS_HRC[,match(included_traits, colnames(ceph_PRS_HRC) )]
ceph_PRS_HRC$imputation_panel = "HRCr1.1"
## merge
ceph_convert_PRS = rbind(ceph_convert_PRS_TopMed, ceph_PRS_HRC)
## get the four samples
pattern <- paste(c("NA06990", "NA07023", "NA07057", "NA10861"), collapse = "|")
ceph_convert_PRS <- ceph_convert_PRS[grepl(pattern, row.names(ceph_convert_PRS)), ]
## input PGS from WGS using original predictor
ceph_PRS_WGS = read.table("Data/WGS/NA10861_WGS_PRS_vGCTB.txt", header = T)
ceph_PRS_WGS$V1 = gsub("-", ".", ceph_PRS_WGS$V1)
ceph_PRS_WGS = (ceph_PRS_WGS[ match(included_traits, ceph_PRS_WGS$V1),])
colnames(ceph_PRS_WGS) = c("variable", "CN1", "CN2", "NA10861")
mean.sd.table = read.csv("Data/LifeLines_MEAN_and_SD_of_traits_for_standardization.csv", row.names = 1)
mean.sd.table = mean.sd.table[match(included_traits, mean.sd.table$trait ),]
ceph_convert_PRS.norm = sweep(sweep ( sapply (ceph_convert_PRS[,included_traits], as.numeric ), 2, mean.sd.table$mean), 2, mean.sd.table$sd, FUN = '/')
ceph_convert_PRS.norm = data.frame(ceph_convert_PRS.norm)
ceph_convert_PRS.norm$imputation_panel = ceph_convert_PRS$imputation_panel
row.names(ceph_convert_PRS.norm) = row.names(ceph_convert_PRS)
ceph_convert_PRS.norm$Study_ID = sapply(strsplit(row.names(ceph_convert_PRS), "_"), `[`, 4)
ceph_convert_PRS.norm$imputation_panel = factor(ceph_convert_PRS.norm$imputation_panel , levels = panel.order)
ceph_convert_PRS.norm = ceph_convert_PRS.norm %>%
arrange(imputation_panel) %>%
group_by(Study_ID) %>%
mutate(y.position = row_number()) %>%
ungroup() # Ungroup to return to normal data frame structure
ceph_convert_PRS.norm$y.position =ceph_convert_PRS.norm$y.position * 5
## melt it
melted_ceph_convert_PRS = reshape2::melt(ceph_convert_PRS.norm,
id.vars =c("Study_ID", "imputation_panel", "y.position") ,
measure.vars =included_traits )
ceph_PRS_WGS$NA10861 = ( ceph_PRS_WGS$NA10861 - mean.sd.table$mean) / mean.sd.table$sd
## add label
ceph_PRS_WGS$Trait = predictor.list[match(ceph_PRS_WGS$variable, predictor.list$Predictor),"Label"]
melted_ceph_convert_PRS$Trait = predictor.list[match(melted_ceph_convert_PRS$variable, predictor.list$Predictor),"Label"]
mean.prs.with.convert.summary = melted_ceph_convert_PRS%>%
group_by(Study_ID, variable, imputation_panel) %>%
summarise(
mean.prs = mean(value)
)%>%
pivot_wider(
names_from = imputation_panel,
values_from = mean.prs
)
mean.prs.with.convert.summary$difference = abs(mean.prs.with.convert.summary$TopMedr3 - mean.prs.with.convert.summary$HRCr1.1)
cross.sample.mean.prs.with.convert.summary= mean.prs.with.convert.summary %>% group_by(variable) %>% summarise(mean_difference = mean(difference)) %>% arrange(mean_difference)
cross.sample.mean = data.frame(cross.sample.mean)
cross.sample.mean.prs.with.convert.summary = data.frame(cross.sample.mean.prs.with.convert.summary)
cross.sample.mean$diff_from_convert = cross.sample.mean.prs.with.convert.summary[ match(cross.sample.mean$variable, cross.sample.mean.prs.with.convert.summary$variable), "mean_difference"]
example.convert.hist = panel_plot(example.traits =example.traits, facet.col = 4, legend.pos = "bottom" , melted_ceph_convert_PRS , ceph_PRS_WGS, melted.prs )
info.text.d = data.frame(
variable = example.traits,
Trait = predictor.list[match(example.traits, predictor.list$Predictor),"Label"],
variance = format(round((predictor.list[match(example.traits, predictor.list$Predictor),"total.variance"]) ,3 ) , nsmall=3) ,
#HRCr1.1 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) , "%"),
#TopMedr3 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_HRC"], 1 ) , "%"),
HRCr1.1 = "100%",
TopMedr3 = "100%",
x = -0.5,
y = Inf
)
info.text.d$variable = factor(info.text.d$variable , levels = info.text.d$variable )
info.text.d$Trait = factor(info.text.d$Trait , levels = info.text.d$Trait )
info.text.d$labels = paste0("V=", info.text.d$variance ,"; +:", info.text.d$HRCr1.1, "; \u25B3",":" , info.text.d$TopMedr3)
labeled.example.convert.hist = example.convert.hist +
geom_text(
data = info.text.d,
aes(x = median(x), y = y, label = (labels) ),
inherit.aes = FALSE,
vjust =1, # Push down from top
hjust = 0.5,
size = 4,
family = "Arial Unicode MS"
)
labeled.example.convert.hist
# ggsave(labeled.example.convert.hist, filename = "Figures/Fig4_PGS_consistency_when_using_converted_predictors_in_ToPMed.jpeg", width = 12, height = 3)

cross.sample.mean = cross.sample.mean %>%
mutate(across(where(is.numeric), ~ round(.x, 3)))
write.csv(cross.sample.mean, "Tables/SupTable3_difference_between_imputation_panel_and_coverted.csv")
fig.a = ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_from_convert)) +
geom_point() +
geom_abline(intercept = 0, slope =1)+
geom_text_repel(data = cross.sample.mean %>% filter(mean_difference > 0.2 | diff_from_convert > 0.2) ,
aes(label = variable), vjust = -1, color = "black", size = 1.5) +
xlab("difference between panel") +
ylab("difference after converting the predictor") +
theme_bw()
fig.b = ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_under_same_snps)) +
geom_point() +
geom_abline(intercept = 0, slope =1)+
geom_text_repel(data = cross.sample.mean %>% filter(mean_difference > 0.2 | diff_from_convert > 0.2) ,
aes(label = variable), vjust = -1, color = "black", size = 1.5) +
xlab("difference between panel") +
ylab("difference when using the same SNP set")+
theme_bw()
fig.change = ggarrange(fig.b , fig.a, nrow = 1)
fig.change
ggsave(filename = "Figures/SupFig3_imputation_panel_difference_before_and_after_convert.jpeg", plot = fig.change, width = 12, height = 6)

The top 1000 SNPs were picked from Glaucoma SBsyesRC weights. Genotype of these 1000 SNPs were extracted from the three data sets that included sample NA10861: WGS, HRC imputed and TopMed imputed QIMR Malanoma NCI cohort genotyped with GSAv2 chip.
The genotype value is in additive model as the number of risk alleles. We removed all the events that have exactly the same genotype in each data set, which resulted with 1 event from WGS, 1 event from TopMed imputation, and 4 event from HRC imputation. Then we matched these 6 events, and removed all the SNPs that have exactly the same calling among all of them. There are 13 SNPs remained.
Their genotype are merged with the SNP information, and sorted based on the SBayesRC SNP weight. All the genotype callings that are different from the calling in WGS are highlighted with red.
We can see that both the count of risk alleles and the partial PGS calculated from these SNPs are higher in HRC imputed events, which is consistent with the observation in whole PGS.
glaucoma.effect = data.frame(fread("../Large_Data/Glaucoma_01_SBayesRC.snpRes"))
top1000 = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs.txt")
glaucoma.effect = glaucoma.effect[glaucoma.effect$Name %in% top1000$V1,]
glaucoma.effect$Risk_Allele <- ifelse(glaucoma.effect$A1Effect > 0,
glaucoma.effect$A1,
glaucoma.effect$A2)
glaucoma.effect$Bene_Allele <- ifelse(glaucoma.effect$A1Effect < 0,
glaucoma.effect$A1,
glaucoma.effect$A2)
glaucoma.effect$Risk_Allele_effect = abs(glaucoma.effect$A1Effect)
#write.table(glaucoma.effect[,c("Name", "Risk_Allele")], row.names = F, col.names = F, quote = F, sep ="\t", file = "Data/missingness_check/top1000SNP_Risk_Allele_of_Glaucoma.txt")
glaucoma.effect$SNP_Risk_Allele = paste0(glaucoma.effect$Name, "_", glaucoma.effect$Risk_Allele)
glaucoma.effect$SNP_Bene_Allele = paste0(glaucoma.effect$Name, "_", glaucoma.effect$Bene_Allele)
## set reference allele
Merged_plink=/QRISdata/Q5740/QIMR_Berghofer/Merged_plink/
plink \
--bfile ${Merged_plink}/GSA_TOPMedr3_autosomes_7.3M_subset \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_TOPMedr3
plink \
--bfile ${Merged_plink}/GSA_HRCr1.1_autosomes \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_HRCr1.1
plink \
--vcf NA10861_combined_SBRC_SNPs.vcf \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt \
--exclude missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_WGS_NA10861
geno_trans = function(raw){
# Remove non-SNP columns (e.g., FID, IID, PAT, MAT, SEX, PHENOTYPE)
snp_data <- raw[, 7:ncol(raw)] # SNP columns start at col 7 in PLINK .raw
# Get individual IDs to set as colnames
individual_ids <- raw$IID
# Transpose
t_snp <- t(snp_data)
colnames(t_snp) <- individual_ids
rownames(t_snp) <- colnames(snp_data)
# Convert to data frame if needed
t_snp_df <- as.data.frame(t_snp)
return(t_snp_df)
}
# Read PLINK raw file
geno.wgs <- read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_WGS_NA10861.raw", header = TRUE)
geno.wgs = geno_trans(geno.wgs)
geno.wgs$toy2 = 1
#geno.wgs$effect_size = "NA"
geno.gsa.topmed = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_TOPMedr3.raw", header = T)
geno.gsa.topmed = geno_trans(geno.gsa.topmed)
geno.gsa.topmed = geno.gsa.topmed[,grep("NA10861", colnames(geno.gsa.topmed))]
geno.gsa.topmed$toy = 1
geno.gsa.hrc = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_HRCr1.1.raw", header = T)
geno.gsa.hrc = geno_trans(geno.gsa.hrc)
geno.gsa.hrc = geno.gsa.hrc[,grep("NA10861", colnames(geno.gsa.hrc))]
# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.topmed)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.topmed.unique <- geno.gsa.topmed[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.topmed) - ncol(geno.gsa.topmed.unique), "\n")
# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.hrc)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.hrc.unique <- geno.gsa.hrc[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.hrc) - ncol(geno.gsa.hrc.unique), "\n")
merge_glaucoma_geno <- function(glaucoma.effect, geno) {
individuals <- colnames(geno)
output <- matrix(NA, nrow = nrow(glaucoma.effect), ncol = length(individuals))
rownames(output) <- glaucoma.effect$Name
colnames(output) <- individuals
for (i in seq_len(nrow(glaucoma.effect))) {
snp_a1 <- glaucoma.effect$SNP_Risk_Allele[i]
snp_a2 <- glaucoma.effect$SNP_Bene_Allele[i]
if (snp_a1 %in% rownames(geno)) {
output[i, ] <- as.numeric(geno[snp_a1, , drop = FALSE])
} else if (snp_a2 %in% rownames(geno)) {
geno_vals <- as.numeric(geno[snp_a2, , drop = FALSE])
geno_flipped <- ifelse(geno_vals == 2, 0,
ifelse(geno_vals == 0, 2, geno_vals)) # keep 1 as-is
output[i, ] <- geno_flipped
}
}
geno_df <- as.data.frame(output)
final <- cbind(glaucoma.effect, geno_df)
return(final)
}
colnames(geno.gsa.hrc.unique) = paste0(colnames(geno.gsa.hrc.unique), "_HRC")
colnames(geno.gsa.topmed.unique) = paste0(colnames(geno.gsa.topmed.unique), "_TopMed")
merged_hrc <- merge_glaucoma_geno(glaucoma.effect, geno.gsa.hrc.unique)
merged_hrc_top = merge_glaucoma_geno(merged_hrc, geno.gsa.topmed.unique)
merged_hrc_top_wgs = merge_glaucoma_geno(merged_hrc_top, geno.wgs)
merged_hrc_top_wgs=merged_hrc_top_wgs[,grep("toy", colnames(merged_hrc_top_wgs), invert = T)]
merged_hrc_top_wgs = merged_hrc_top_wgs %>% arrange(-Risk_Allele_effect)
merged_hrc_top_wgs$Index = 1:1000
last6_cols <- tail(colnames(merged_hrc_top_wgs), 6)
# Keep rows where the values in the last 6 columns are not all the same
filtered_df <- merged_hrc_top_wgs[
apply(merged_hrc_top_wgs[, last6_cols], 1, function(row) {
length(unique(row)) > 1
}),
]
# write.csv(filtered_df, "Tables/SNPs_with_different_geno_in_top1000_of_Glaucoma.csv", row.names = F)
filtered_df = read.csv("Tables/SNPs_with_different_geno_in_top1000_of_Glaucoma.csv")
colnames(filtered_df) = gsub("MelanomaMDD_NCI_GSA_" , "", colnames(filtered_df))
filtered_df %>%
select(-Pi1, -Pi2, -Pi3, -Pi4, -Pi5, ) %>%
kable() %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
column_spec(17:22, width = "100px", extra_css = "word-wrap: break-word; white-space: normal;")
| Index | Name | Chrom | Position | A1 | A2 | A1Frq | A1Effect | SE | VarExplained | PIP | SNP_A1 | SNP_A2 | Risk_Allele | SNP_Risk_Allele | Risk_Allele_effect | Bene_Allele | SNP_Bene_Allele | NA10861_WGS | NA10861_PC08411_TopMed | NA10861_PC08411_HRC | NA10861_PC08621_HRC | NA10861_PC11092_HRC | NA10861_PC11139_HRC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 44 | rs4842266 | 12 | 79951566 | A | G | 0.696275 | 0.034213 | 0.033722 | 5.44e-05 | 0.5307170 | rs4842266_A | rs4842266_G | A | rs4842266_A | 0.034213 | G | rs4842266_G | 1 | 1 | 2 | 2 | 2 | 2 |
| 117 | rs965652 | 6 | 134368953 | A | G | 0.496475 | 0.018888 | 0.029578 | 3.42e-05 | 0.3260560 | rs965652_A | rs965652_G | A | rs965652_A | 0.018888 | G | rs965652_G | 2 | 2 | 2 | 1 | 2 | 1 |
| 131 | rs11755754 | 6 | 164325492 | G | A | 0.399175 | 0.017493 | 0.025092 | 2.52e-05 | 0.3695080 | rs11755754_G | rs11755754_A | G | rs11755754_G | 0.017493 | A | rs11755754_A | 0 | 0 | 0 | 0 | 1 | 0 |
| 151 | rs11614730 | 12 | 79925802 | A | G | 0.696100 | 0.015212 | 0.027911 | 2.35e-05 | 0.2567350 | rs11614730_A | rs11614730_G | A | rs11614730_A | 0.015212 | G | rs11614730_G | 1 | 1 | 2 | 2 | 2 | 2 |
| 257 | rs2811687 | 6 | 134369822 | A | G | 0.497950 | 0.009840 | 0.023446 | 1.80e-05 | 0.1748420 | rs2811687_A | rs2811687_G | A | rs2811687_A | 0.009840 | G | rs2811687_G | 2 | 2 | 2 | 1 | 2 | 1 |
| 319 | rs2207441 | 6 | 164325217 | A | G | 0.401375 | 0.008405 | 0.018563 | 1.10e-05 | 0.1442670 | rs2207441_A | rs2207441_G | A | rs2207441_A | 0.008405 | G | rs2207441_G | 0 | 0 | 0 | 0 | 1 | 0 |
| 354 | rs34852247 | 6 | 164324711 | A | G | 0.373500 | 0.007586 | 0.020255 | 1.17e-05 | 0.1303810 | rs34852247_A | rs34852247_G | A | rs34852247_A | 0.007586 | G | rs34852247_G | 0 | 0 | 0 | 0 | 1 | 0 |
| 454 | rs3800902 | 7 | 2144378 | T | C | 0.400550 | -0.006281 | 0.019049 | 8.30e-06 | 0.1571720 | rs3800902_T | rs3800902_C | C | rs3800902_C | 0.006281 | T | rs3800902_T | 2 | 0 | 0 | 0 | 0 | 0 |
| 460 | rs34913183 | 12 | 58166590 | T | C | 0.011150 | -0.006232 | 0.031416 | 1.10e-06 | 0.0520962 | rs34913183_T | rs34913183_C | C | rs34913183_C | 0.006232 | T | rs34913183_T | 1 | 1 | 2 | 2 | 2 | 2 |
| 776 | rs8016358 | 14 | 80895880 | A | C | 0.427800 | 0.004200 | 0.012595 | 4.80e-06 | 0.1101090 | rs8016358_A | rs8016358_C | A | rs8016358_A | 0.004200 | C | rs8016358_C | 0 | 0 | 1 | 1 | 1 | 0 |
| 841 | rs4842318 | 12 | 80044167 | T | C | 0.696550 | 0.003971 | 0.015281 | 5.60e-06 | 0.0611977 | rs4842318_T | rs4842318_C | T | rs4842318_T | 0.003971 | C | rs4842318_C | 1 | 1 | 2 | 2 | 2 | 2 |
| 867 | rs8176822 | 12 | 80048478 | T | C | 0.696575 | 0.003860 | 0.015743 | 5.90e-06 | 0.0535097 | rs8176822_T | rs8176822_C | T | rs8176822_T | 0.003860 | C | rs8176822_C | 1 | 1 | 2 | 2 | 2 | 2 |
| 897 | rs2253480 | 7 | 6686780 | C | T | 0.957700 | 0.003743 | 0.019280 | 1.00e-06 | 0.0605963 | rs2253480_C | rs2253480_T | C | rs2253480_C | 0.003743 | T | rs2253480_T | 1 | 1 | 2 | 2 | 2 | 2 |
The top 1000 SNPs were picked from Hair color SBsyesRC weights. Genotype of these 1000 SNPs were extracted from the three data sets that included sample NA10861: WGS, HRC imputed and TopMed imputed QIMR Malanoma NCI cohort genotyped with GSAv2 chip.
The genotype value is in additive model as the number of risk alleles. We removed all the events that have exactly the same genotype in each data set, which resulted with 1 event from WGS, XX event from TopMed imputation, and XX event from HRC imputation. Then we matched these 6 events, and removed all the SNPs that have exactly the same calling among all of them. There are XX SNPs remained.
Their genotype are merged with the SNP information, and sorted based on the SBayesRC SNP weight. All the genotype callings that are different from the calling in WGS are highlighted with red.
We can see that both the count of risk alleles and the partial PGS calculated from these SNPs are higher in HRC imputed events, which is consistent with the observation in whole PGS.
## run in bunya
library(data.table)
library(dplyr)
predictor = data.frame(fread("/QRISdata/Q6913/Uno_Traits/Binary_Traits/HairColor_01/SBayesRC/GCTB_v2.5.2/HairColor_01_SBayesRC.snpRes"))
top1000 <- predictor %>%
arrange(desc(abs(A1Effect))) %>%
slice_head(n = 1000)
top1000$Risk_Allele <- ifelse(top1000$A1Effect > 0,
top1000$A1,
top1000$A2)
top1000$Bene_Allele <- ifelse(top1000$A1Effect < 0,
top1000$A1,
top1000$A2)
top1000$Risk_Allele_effect = abs(top1000$A1Effect)
write.table(top1000[,c("Name", "Risk_Allele")], row.names = F, col.names = F, quote = F, sep ="\t", file = "top1000SNP_Risk_Allele_of_HairColor.txt")
top1000$SNP_Risk_Allele = paste0(top1000$Name, "_", top1000$Risk_Allele)
top1000$SNP_Bene_Allele = paste0(top1000$Name, "_", top1000$Bene_Allele)
write.csv(top1000, "Data/missingness_check/HairColr_top1000.csv")
## set reference allele
plink=Genotype
plink \
--bfile ${plink}/GSA_TOPMedr3_autosomes_7.3M_subset \
--extract top1000SNP_of_HairColor.txt \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_QIMR_GSA_TOPMedr3
plink \
--bfile ${plink}/GSA_HRCr1.1_autosomes \
--extract top1000SNP_of_HairColor.txt \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_QIMR_GSA_HRCr1.1
plink \
--vcf ${plink}/NA10861_combined_SBRC_SNPs.vcf \
--extract top1000SNP_of_HairColor.txt \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_WGS_NA10861
geno_trans = function(raw){
# Remove non-SNP columns (e.g., FID, IID, PAT, MAT, SEX, PHENOTYPE)
snp_data <- raw[, 7:ncol(raw)] # SNP columns start at col 7 in PLINK .raw
# Get individual IDs to set as colnames
individual_ids <- raw$IID
# Transpose
t_snp <- t(snp_data)
colnames(t_snp) <- individual_ids
rownames(t_snp) <- colnames(snp_data)
# Convert to data frame if needed
t_snp_df <- as.data.frame(t_snp)
return(t_snp_df)
}
# Read PLINK raw file
geno.wgs <- read.table("Data/missingness_check/HairColor_top1000_SNPs_in_WGS_NA10861.raw", header = TRUE)
geno.wgs = geno_trans(geno.wgs)
geno.wgs$toy2 = 1
#geno.wgs$effect_size = "NA"
geno.gsa.topmed = read.table("Data/missingness_check/HairColor_top1000_SNPs_in_QIMR_GSA_TOPMedr3.raw", header = T)
geno.gsa.topmed = geno_trans(geno.gsa.topmed)
geno.gsa.topmed = geno.gsa.topmed[,grep("NA10861", colnames(geno.gsa.topmed))]
geno.gsa.topmed$toy = 1
geno.gsa.hrc = read.table("Data/missingness_check/HairColor_top1000_SNPs_in_QIMR_GSA_HRCr1.1.raw", header = T)
geno.gsa.hrc = geno_trans(geno.gsa.hrc)
geno.gsa.hrc = geno.gsa.hrc[,grep("NA10861", colnames(geno.gsa.hrc))]
# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.topmed)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.topmed.unique <- geno.gsa.topmed[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.topmed) - ncol(geno.gsa.topmed.unique), "\n")
# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.hrc)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.hrc.unique <- geno.gsa.hrc[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.hrc) - ncol(geno.gsa.hrc.unique), "\n")
merge_geno <- function(top1000, geno) {
individuals <- colnames(geno)
output <- matrix(NA, nrow = nrow(top1000), ncol = length(individuals))
rownames(output) <- top1000$Name
colnames(output) <- individuals
for (i in seq_len(nrow(top1000))) {
snp_a1 <- top1000$SNP_Risk_Allele[i]
snp_a2 <- top1000$SNP_Bene_Allele[i]
if (snp_a1 %in% rownames(geno)) {
output[i, ] <- as.numeric(geno[snp_a1, , drop = FALSE])
} else if (snp_a2 %in% rownames(geno)) {
geno_vals <- as.numeric(geno[snp_a2, , drop = FALSE])
geno_flipped <- ifelse(geno_vals == 2, 0,
ifelse(geno_vals == 0, 2, geno_vals)) # keep 1 as-is
output[i, ] <- geno_flipped
}
}
geno_df <- as.data.frame(output)
final <- cbind(top1000, geno_df)
return(final)
}
top1000 = read.csv("Data/missingness_check/HairColr_top1000.csv", row.names =1)
colnames(geno.gsa.hrc.unique) = paste0(colnames(geno.gsa.hrc.unique), "_HRC")
colnames(geno.gsa.topmed.unique) = paste0(colnames(geno.gsa.topmed.unique), "_TopMed")
merged_hrc <- merge_geno(top1000, geno.gsa.hrc.unique)
merged_hrc_top = merge_geno(merged_hrc, geno.gsa.topmed.unique)
merged_hrc_top_wgs = merge_geno(merged_hrc_top, geno.wgs)
merged_hrc_top_wgs=merged_hrc_top_wgs[,grep("toy", colnames(merged_hrc_top_wgs), invert = T)]
merged_hrc_top_wgs = merged_hrc_top_wgs %>% arrange(-Risk_Allele_effect)
merged_hrc_top_wgs$Index = 1:1000
last8_cols <- tail(colnames(merged_hrc_top_wgs), 8)
# Keep rows where the values in the last 6 columns are not all the same
filtered_df <- merged_hrc_top_wgs[
apply(merged_hrc_top_wgs[, last8_cols], 1, function(row) {
length(unique(row)) > 1
}),
]
write.csv(filtered_df, "Tables/SNPs_with_different_geno_in_top1000_of_HairColor.csv", row.names = F)
filtered_df = read.csv("Tables/SNPs_with_different_geno_in_top1000_of_HairColor.csv")
colnames(filtered_df) = gsub("MelanomaMDD_NCI_GSA_" , "", colnames(filtered_df))
filtered_df %>%
select(-Pi1, -Pi2, -Pi3, -Pi4, -Pi5, ) %>%
kable() %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
column_spec(17:22, width = "100px", extra_css = "word-wrap: break-word; white-space: normal;")
| Index | Name | Chrom | Position | A1 | A2 | A1Frq | A1Effect | SE | VarExplained | PIP | Risk_Allele | Bene_Allele | Risk_Allele_effect | SNP_Risk_Allele | SNP_Bene_Allele | NA10861_PC08411_HRC | NA10861_PC11071_HRC | NA10861_PC11075_HRC | NA10861_PC11129_HRC | NA10861_PC11139_HRC | NA10861_PC08411_TopMed | NA10861_PC11138_TopMed | NA10861 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 38 | rs10179295 | 2 | 222033115 | A | G | 0.405730 | -0.011539 | 0.001074 | 0.0002685 | 1.0000000 | G | A | 0.011539 | rs10179295_G | rs10179295_A | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 |
| 66 | rs9971729 | 12 | 23979791 | C | A | 0.566359 | -0.004921 | 0.002216 | 0.0000593 | 0.8550120 | A | C | 0.004921 | rs9971729_A | rs9971729_C | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 174 | rs142661815 | 2 | 99811314 | A | G | 0.010089 | 0.001114 | 0.003635 | 0.0000012 | 0.1107170 | A | G | 0.001114 | rs142661815_A | rs142661815_G | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 258 | rs10423391 | 19 | 1252451 | C | T | 0.441128 | 0.000749 | 0.001361 | 0.0000049 | 0.2655930 | C | T | 0.000749 | rs10423391_C | rs10423391_T | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 |
| 306 | rs10416809 | 19 | 1252311 | T | C | 0.422649 | 0.000626 | 0.001366 | 0.0000046 | 0.2211020 | T | C | 0.000626 | rs10416809_T | rs10416809_C | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 |
| 334 | rs12609746 | 19 | 1252827 | C | T | 0.241425 | 0.000567 | 0.001365 | 0.0000033 | 0.1982280 | C | T | 0.000567 | rs12609746_C | rs12609746_T | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
| 356 | rs10771034 | 12 | 23979199 | A | T | 0.550739 | -0.000523 | 0.001755 | 0.0000069 | 0.0884168 | T | A | 0.000523 | rs10771034_T | rs10771034_A | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 448 | rs58837341 | 19 | 1248920 | T | G | 0.428600 | 0.000415 | 0.001054 | 0.0000026 | 0.1678550 | T | G | 0.000415 | rs58837341_T | rs58837341_G | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 |
| 639 | rs112363642 | 19 | 1248348 | A | G | 0.221800 | 0.000301 | 0.000948 | 0.0000014 | 0.1306690 | A | G | 0.000301 | rs112363642_A | rs112363642_G | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
| 756 | rs1888379 | 9 | 27353599 | G | T | 0.405515 | 0.000263 | 0.000973 | 0.0000020 | 0.1171700 | G | T | 0.000263 | rs1888379_G | rs1888379_T | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
| 885 | rs16832679 | 3 | 181511339 | A | T | 0.287000 | -0.000232 | 0.001151 | 0.0000023 | 0.0587273 | T | A | 0.000232 | rs16832679_T | rs16832679_A | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
Also see: Imputation panel comparison ยท PGS:QS