This page complements Imputation panel comparison (HRC vs TOPMed) and PGS:QS. It follows the manuscript Results section on separating missing SNPs from genotype error, the PGS-impute approach (GCTB --convert), supplementary figures, and Glaucoma / Hair colour genotype checks.


1 Test whether difference is from missing or error

1.1 profile PGS without the missing SNPs

## define predictor
traitfile="/QRISdata/Q6913/GCTB_predictor_list_for_batch_profiling.txt"
trait=$(sed "${i}q;d" $traitfile | awk '{print $1}' )
predictor=$(sed  "${i}q;d"  $traitfile  | awk '{print $3}' )
echo $trait
echo $predictor


outdir=PRS_all_GCTB
mkdir -p $outdir

data1="WGS_NA10861"
genofile1=ceph_combined_SBRC_SNPs.vcf

plink \
  --vcf  $genofile1 \
  --score  $predictor  2 5 8  header sum    \
  --exclude missing_in_any_SNPs_for_NA10861.txt  \
  --out    ${outdir}/${data1}_${trait}_SBayesRC

data2=GSA_HRCr1.1
genofile2=Merged_plink/GSA_HRCr1.1_autosomes

plink \
  --bfile  $genofile2 \
  --score  $predictor  2 5 8  header sum    \
  --exclude missing_in_any_SNPs_for_NA10861.txt  \
  --out    ${outdir}/${data2}_${trait}_SBayesRC

data3=GSA_TopMed
genofile3=Merged_plink/GSA_TOPMedr3_autosomes_7.3M_subset

plink \
  --bfile  $genofile3 \
  --score  $predictor  2 5 8  header sum    \
  --exclude missing_in_any_SNPs_for_NA10861.txt  \
  --out    ${outdir}/${data3}_${trait}_SBayesRC

compare the PGS of NA10861 between the three resources

outdir=PRS_all_GCTB
traitfile="/QRISdata/Q6913/GCTB_predictor_list_for_batch_profiling.txt"

data3=GSA_TopMed
genofile3=Merged_plink/GSA_TOPMedr3_autosomes_7.3M_subset

data2=GSA_HRCr1.1
genofile2=Merged_plink/GSA_HRCr1.1_autosomes

Rscript merge_PGS.R ${genofile2}.fam  $traitfile $outdir $data2

1.2 combine the data

## input PGS from TopMed imputed
ceph_consis_PRS_TopMed = read.csv("Data/missingness_check/PRS_all_GCTB_GSA_TopMed_all_139_traits_GCTB_PRS.csv", row.names =1)
ceph_consis_PRS_TopMed= ceph_consis_PRS_TopMed[,match(included_traits, colnames(ceph_consis_PRS_TopMed) )]
ceph_consis_PRS_TopMed$imputation_panel = "TopMedr3"

## input PGS from HRC imputed
ceph_consis_PRS_HRC = read.csv("Data/missingness_check/PRS_all_GCTB_GSA_HRCr1.1_all_139_traits_GCTB_PRS.csv", row.names =1)
ceph_consis_PRS_HRC= ceph_consis_PRS_HRC[,match(included_traits, colnames(ceph_consis_PRS_HRC) )]
ceph_consis_PRS_HRC$imputation_panel = "HRCr1.1"

## merge
ceph_consis_PRS = rbind(ceph_consis_PRS_TopMed, ceph_consis_PRS_HRC)

## get the four samples
pattern <- paste(c("NA06990", "NA07023", "NA07057", "NA10861"), collapse = "|")
ceph_consis_PRS <- ceph_consis_PRS[grepl(pattern, row.names(ceph_consis_PRS)), ]


## input PGS from WGS using only overlap SNPs
ceph_consis_PRS_WGS = read.table("Data/missingness_check/NA10861_WGS_PRS_vGCTB.txt")
ceph_consis_PRS_WGS$V1 = gsub("-", ".",ceph_consis_PRS_WGS$V1  )
ceph_consis_PRS_WGS = (ceph_consis_PRS_WGS[ match(included_traits, ceph_consis_PRS_WGS$V1),])
colnames(ceph_consis_PRS_WGS) = c("variable", "CN1", "CN2", "NA10861")

1.3 standardize and melt

mean.sd.table = read.csv("Data/LifeLines_MEAN_and_SD_of_traits_for_standardization.csv", row.names = 1)
mean.sd.table = mean.sd.table[match(included_traits, mean.sd.table$trait ),]

ceph_consis_PRS.norm = sweep(sweep (  sapply (ceph_consis_PRS[,included_traits],  as.numeric  ), 2, mean.sd.table$mean),  2, mean.sd.table$sd, FUN = '/')
ceph_consis_PRS.norm = data.frame(ceph_consis_PRS.norm)

ceph_consis_PRS.norm$imputation_panel = ceph_consis_PRS$imputation_panel
row.names(ceph_consis_PRS.norm) = row.names(ceph_consis_PRS)


ceph_consis_PRS.norm$IID = sapply(strsplit(row.names(ceph_consis_PRS), "_"), `[`, 4)
ceph_consis_PRS.norm$imputation_panel = factor(ceph_consis_PRS.norm$imputation_panel , levels = panel.order)

ceph_consis_PRS.norm = ceph_consis_PRS.norm %>%  
  arrange(imputation_panel) %>%
  group_by(IID) %>%
  mutate(y.position = row_number()) %>%
  ungroup()  # Ungroup to return to normal data frame structure

  
ceph_consis_PRS.norm$y.position =ceph_consis_PRS.norm$y.position * 5

## melt it
melted_ceph_consis_PRS = reshape2::melt(ceph_consis_PRS.norm, 
                  id.vars =c("IID", "imputation_panel", "y.position") , 
                  measure.vars =included_traits )
colnames(melted_ceph_consis_PRS)[1] = "Study_ID"

ceph_consis_PRS_WGS$NA10861 =  ( ceph_consis_PRS_WGS$NA10861 -  mean.sd.table$mean) / mean.sd.table$sd

## add label
ceph_consis_PRS_WGS$Trait = predictor.list[match(ceph_consis_PRS_WGS$variable, predictor.list$Predictor),"Label"]
melted_ceph_consis_PRS$Trait =  predictor.list[match(melted_ceph_consis_PRS$variable, predictor.list$Predictor),"Label"]

1.4 summary difference

mean.prs.with.missing.summary = melted_ceph_consis_PRS%>% 
  group_by(Study_ID, variable, imputation_panel) %>%
  summarise(
    mean.prs = mean(value)
  )%>%
  pivot_wider(
    names_from = imputation_panel,
    values_from = mean.prs
  )

mean.prs.with.missing.summary$difference = abs(mean.prs.with.missing.summary$TopMedr3 - mean.prs.with.missing.summary$HRCr1.1)

cross.sample.mean.prs.with.missing.summary= mean.prs.with.missing.summary %>% group_by(variable) %>% summarise(mean_difference = mean(difference)) %>% arrange(mean_difference)

cross.sample.mean = data.frame(cross.sample.mean)
cross.sample.mean.prs.with.missing.summary = data.frame(cross.sample.mean.prs.with.missing.summary)
cross.sample.mean$diff_under_same_snps = cross.sample.mean.prs.with.missing.summary[ match(cross.sample.mean$variable, cross.sample.mean.prs.with.missing.summary$variable), "mean_difference"]
ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_under_same_snps)) + geom_point() + geom_abline(intercept = 0, slope =1)

1.5 sup fig 6

hist.a = ggplot(data = cross.sample.mean, aes(x = mean_difference)) + geom_histogram(bins =50) + xlab( "|PGS-TopMed - PGS-HRC| (in SD unit)"   ) + theme_bw()  + xlim(0,2.2) + ylim(0,55)
hist.b = ggplot(data = cross.sample.mean, aes(x = diff_under_same_snps)) + geom_histogram(bins =50) + xlab( "|PGS-TopMed - PGS-HRC| (in SD unit)"   ) + theme_bw()   + xlim(0,2.2) + ylim(0,55)

hist.supfig6 = ggarrange(hist.a, hist.b, labels = c("A", "B"))
hist.supfig6
ggsave( "Figures/SupFig6_hist_of_difference.jpeg", hist.supfig6, width = 12, height = 5)

1.6 plot

example.traits = example.traits[5:8]


example.consist.hist =  panel_plot(example.traits =example.traits, facet.col = 4,  legend.pos = "bottom" , melted_ceph_consis_PRS , ceph_consis_PRS_WGS, melted.prs )
 

info.text.c = data.frame(
 variable = example.traits,
 Trait = predictor.list[match(example.traits, predictor.list$Predictor),"Label"],
 variance = format(round((predictor.list[match(example.traits, predictor.list$Predictor),"total.variance"]) * (predictor.list[match(example.traits, predictor.list$Predictor),"QIMR_TopMed"]) ,3 ), nsmall=3) ,
# HRCr1.1 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) ,  "%"),
# TopMedr3 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) ,  "%"),
HRCr1.1 = "100%",
TopMedr3 = "100%",
 x = -0.5,
 y = Inf
)

info.text.c$variable = factor(info.text.c$variable , levels = info.text.c$variable )
info.text.c$Trait = factor(info.text.c$Trait , levels = info.text.c$Trait )

info.text.c$labels = paste0("V=", info.text.c$variance ,"; +:", info.text.c$HRCr1.1, "; \u25B3",":" , info.text.c$TopMedr3)

labeled.example.consist.hist  = example.consist.hist + 
  geom_text(
    data = info.text.c,
    aes(x = median(x), y = y, label = (labels) ),
    inherit.aes = FALSE,
    vjust =1,        # Push down from top
    hjust = 0.5,
    size = 4,
    family = "Arial Unicode MS"
  ) 

labeled.example.consist.hist

2 Examine convert function in fixing the missingness

We are using correlation of PGS to show how the PGS got affected by SNP missingness, and how the convert function recover the PGS.

2.1 convert SNP weight for missing SNPs

The function is added to GCTB.

SNP weight of the missing SNPs are converted to the existing SNPs.


## define predictor
traitfile="GCTB_predictor_list_with_7.3M.txt"
trait=$(sed "${i}q;d" $traitfile | awk '{print $1}' )
predictor=$(sed  "${i}q;d"  $traitfile  | awk '{print $3}' )
echo $trait
echo $predictor

targetlist=SNP_list_for_nonmissing_test.txt
ldm=/QRISdata/Q6913/Pipeline/ukbEUR_Imputed
gctb="/scratch/project_mnt/S0007/jzeng/repo/GCTB_dev/scr/gctb"

$gctb --convert  \
--snp-res $predictor  \
--ldm-eigen  $ldm \
--extract  $targetlist  \
--thread 8 \
--out Converted_predictors/${trait}_converted_nonmissing_SBayesRC_predictor

2.2 combine the data

## input PGS from TopMed imputed using converted predictors
ceph_convert_PRS_TopMed = read.csv("Data/missingness_check/QIMR_TopMed_PGS_from_converted_predictors_QIMR_GSA_TopMedr3_imputed_all_123_traits_GCTB_PRS.csv", row.names =1)

included_traits = included_traits[included_traits %in% colnames(ceph_convert_PRS_TopMed)]

ceph_convert_PRS_TopMed= ceph_convert_PRS_TopMed[,match(included_traits, colnames(ceph_convert_PRS_TopMed) )]
ceph_convert_PRS_TopMed$imputation_panel = "TopMedr3"

## input PGS from HRC imputed using original predictor
ceph_PRS_HRC = read.csv("Data/QIMR/PRS_all_GCTB_GSA_HRCr1.1_all_140_traits_GCTB_PRS.csv", row.names =1)
ceph_PRS_HRC= ceph_PRS_HRC[,match(included_traits, colnames(ceph_PRS_HRC) )]
ceph_PRS_HRC$imputation_panel = "HRCr1.1"

## merge
ceph_convert_PRS = rbind(ceph_convert_PRS_TopMed, ceph_PRS_HRC)

## get the four samples
pattern <- paste(c("NA06990", "NA07023", "NA07057", "NA10861"), collapse = "|")
ceph_convert_PRS <- ceph_convert_PRS[grepl(pattern, row.names(ceph_convert_PRS)), ]


## input PGS from WGS using original predictor
ceph_PRS_WGS = read.table("Data/WGS/NA10861_WGS_PRS_vGCTB.txt", header = T)
ceph_PRS_WGS$V1 = gsub("-", ".", ceph_PRS_WGS$V1)
ceph_PRS_WGS = (ceph_PRS_WGS[ match(included_traits, ceph_PRS_WGS$V1),])
colnames(ceph_PRS_WGS) = c("variable", "CN1", "CN2", "NA10861")

2.3 standardize and melt

mean.sd.table = read.csv("Data/LifeLines_MEAN_and_SD_of_traits_for_standardization.csv", row.names = 1)
mean.sd.table = mean.sd.table[match(included_traits, mean.sd.table$trait ),]

ceph_convert_PRS.norm = sweep(sweep (  sapply (ceph_convert_PRS[,included_traits],  as.numeric  ), 2, mean.sd.table$mean),  2, mean.sd.table$sd, FUN = '/')
ceph_convert_PRS.norm = data.frame(ceph_convert_PRS.norm)

ceph_convert_PRS.norm$imputation_panel = ceph_convert_PRS$imputation_panel
row.names(ceph_convert_PRS.norm) = row.names(ceph_convert_PRS)


ceph_convert_PRS.norm$Study_ID = sapply(strsplit(row.names(ceph_convert_PRS), "_"), `[`, 4)
ceph_convert_PRS.norm$imputation_panel = factor(ceph_convert_PRS.norm$imputation_panel , levels = panel.order)

ceph_convert_PRS.norm = ceph_convert_PRS.norm %>%  
  arrange(imputation_panel) %>%
  group_by(Study_ID) %>%
  mutate(y.position = row_number()) %>%
  ungroup()  # Ungroup to return to normal data frame structure

  
ceph_convert_PRS.norm$y.position =ceph_convert_PRS.norm$y.position * 5

## melt it
melted_ceph_convert_PRS = reshape2::melt(ceph_convert_PRS.norm, 
                  id.vars =c("Study_ID", "imputation_panel", "y.position") , 
                  measure.vars =included_traits )

ceph_PRS_WGS$NA10861 =  ( ceph_PRS_WGS$NA10861 -  mean.sd.table$mean) / mean.sd.table$sd

## add label

ceph_PRS_WGS$Trait = predictor.list[match(ceph_PRS_WGS$variable, predictor.list$Predictor),"Label"]
melted_ceph_convert_PRS$Trait =  predictor.list[match(melted_ceph_convert_PRS$variable, predictor.list$Predictor),"Label"]

2.4 summarize

mean.prs.with.convert.summary = melted_ceph_convert_PRS%>% 
  group_by(Study_ID, variable, imputation_panel) %>%
  summarise(
    mean.prs = mean(value)
  )%>%
  pivot_wider(
    names_from = imputation_panel,
    values_from = mean.prs
  )

mean.prs.with.convert.summary$difference = abs(mean.prs.with.convert.summary$TopMedr3 - mean.prs.with.convert.summary$HRCr1.1)

cross.sample.mean.prs.with.convert.summary= mean.prs.with.convert.summary %>% group_by(variable) %>% summarise(mean_difference = mean(difference)) %>% arrange(mean_difference)

cross.sample.mean = data.frame(cross.sample.mean)
cross.sample.mean.prs.with.convert.summary = data.frame(cross.sample.mean.prs.with.convert.summary)
cross.sample.mean$diff_from_convert = cross.sample.mean.prs.with.convert.summary[ match(cross.sample.mean$variable, cross.sample.mean.prs.with.convert.summary$variable), "mean_difference"]

2.5 plot

example.convert.hist =  panel_plot(example.traits =example.traits, facet.col = 4,  legend.pos = "bottom" , melted_ceph_convert_PRS , ceph_PRS_WGS, melted.prs )
 

info.text.d = data.frame(
 variable = example.traits,
 Trait = predictor.list[match(example.traits, predictor.list$Predictor),"Label"],
 variance = format(round((predictor.list[match(example.traits, predictor.list$Predictor),"total.variance"]) ,3 ) , nsmall=3) ,
 #HRCr1.1 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_TopMed"], 1 ) ,  "%"),
 #TopMedr3 = paste0( round(100 * predictor.list[match(example.traits[5:8], predictor.list$Predictor),"QIMR_HRC"], 1 ) ,  "%"),
 HRCr1.1 = "100%",
TopMedr3 = "100%",
x = -0.5,
 y = Inf
)

info.text.d$variable = factor(info.text.d$variable , levels = info.text.d$variable )
info.text.d$Trait = factor(info.text.d$Trait , levels = info.text.d$Trait )

info.text.d$labels = paste0("V=", info.text.d$variance ,"; +:", info.text.d$HRCr1.1, "; \u25B3",":" , info.text.d$TopMedr3)

labeled.example.convert.hist  = example.convert.hist + 
  geom_text(
    data = info.text.d,
    aes(x = median(x), y = y, label = (labels) ),
    inherit.aes = FALSE,
    vjust =1,        # Push down from top
    hjust = 0.5,
    size = 4,
    family = "Arial Unicode MS"
  ) 

labeled.example.convert.hist

# ggsave(labeled.example.convert.hist, filename = "Figures/Fig4_PGS_consistency_when_using_converted_predictors_in_ToPMed.jpeg", width = 12, height = 3)

3 summary of changes

cross.sample.mean = cross.sample.mean %>%
    mutate(across(where(is.numeric), ~ round(.x, 3)))

write.csv(cross.sample.mean, "Tables/SupTable3_difference_between_imputation_panel_and_coverted.csv")

fig.a =  ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_from_convert)) + 
  geom_point() + 
  geom_abline(intercept = 0, slope =1)+ 
  geom_text_repel(data = cross.sample.mean %>% filter(mean_difference > 0.2 | diff_from_convert > 0.2) , 
                    aes(label = variable), vjust = -1, color = "black", size = 1.5)  +
  xlab("difference between panel") +
  ylab("difference after converting the predictor") +
  theme_bw()

fig.b =  ggplot(data = cross.sample.mean, aes(x = mean_difference, y = diff_under_same_snps)) + 
  geom_point() + 
  geom_abline(intercept = 0, slope =1)+ 
  geom_text_repel(data = cross.sample.mean %>% filter(mean_difference > 0.2 | diff_from_convert > 0.2) , 
                    aes(label = variable), vjust = -1, color = "black", size = 1.5)  +
  xlab("difference between panel") +
  ylab("difference when using the same SNP set")+
  theme_bw()

fig.change = ggarrange(fig.b , fig.a,  nrow = 1)



fig.change


ggsave(filename = "Figures/SupFig3_imputation_panel_difference_before_and_after_convert.jpeg", plot = fig.change, width = 12, height = 6)

4 check genotype of Glaucoma

The top 1000 SNPs were picked from Glaucoma SBsyesRC weights. Genotype of these 1000 SNPs were extracted from the three data sets that included sample NA10861: WGS, HRC imputed and TopMed imputed QIMR Malanoma NCI cohort genotyped with GSAv2 chip.

The genotype value is in additive model as the number of risk alleles. We removed all the events that have exactly the same genotype in each data set, which resulted with 1 event from WGS, 1 event from TopMed imputation, and 4 event from HRC imputation. Then we matched these 6 events, and removed all the SNPs that have exactly the same calling among all of them. There are 13 SNPs remained.

Their genotype are merged with the SNP information, and sorted based on the SBayesRC SNP weight. All the genotype callings that are different from the calling in WGS are highlighted with red.

We can see that both the count of risk alleles and the partial PGS calculated from these SNPs are higher in HRC imputed events, which is consistent with the observation in whole PGS.

glaucoma.effect = data.frame(fread("../Large_Data/Glaucoma_01_SBayesRC.snpRes"))
top1000 = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs.txt")
glaucoma.effect = glaucoma.effect[glaucoma.effect$Name %in% top1000$V1,]


glaucoma.effect$Risk_Allele <- ifelse(glaucoma.effect$A1Effect > 0,
                                      glaucoma.effect$A1,
                                      glaucoma.effect$A2)


glaucoma.effect$Bene_Allele <- ifelse(glaucoma.effect$A1Effect < 0,
                                      glaucoma.effect$A1,
                                      glaucoma.effect$A2)


glaucoma.effect$Risk_Allele_effect = abs(glaucoma.effect$A1Effect)

#write.table(glaucoma.effect[,c("Name", "Risk_Allele")], row.names = F, col.names = F, quote = F, sep ="\t", file = "Data/missingness_check/top1000SNP_Risk_Allele_of_Glaucoma.txt")

glaucoma.effect$SNP_Risk_Allele = paste0(glaucoma.effect$Name, "_", glaucoma.effect$Risk_Allele)
glaucoma.effect$SNP_Bene_Allele = paste0(glaucoma.effect$Name, "_", glaucoma.effect$Bene_Allele)

## set reference allele
Merged_plink=/QRISdata/Q5740/QIMR_Berghofer/Merged_plink/

plink \
--bfile ${Merged_plink}/GSA_TOPMedr3_autosomes_7.3M_subset \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt  \
--exclude missing_in_any_SNPs_for_NA10861.txt  \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_TOPMedr3 


plink \
--bfile ${Merged_plink}/GSA_HRCr1.1_autosomes \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt  \
--exclude missing_in_any_SNPs_for_NA10861.txt  \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_HRCr1.1 

plink \
--vcf NA10861_combined_SBRC_SNPs.vcf \
--extract Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs.txt  \
--exclude missing_in_any_SNPs_for_NA10861.txt   \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_Glaucoma.txt 1 2 \
--out Glaucoma_top1000_genotype/Glaucoma_01_SBayesRC_top1000_SNPs_in_WGS_NA10861 
geno_trans = function(raw){
# Remove non-SNP columns (e.g., FID, IID, PAT, MAT, SEX, PHENOTYPE)
snp_data <- raw[, 7:ncol(raw)]  # SNP columns start at col 7 in PLINK .raw

# Get individual IDs to set as colnames
individual_ids <- raw$IID

# Transpose
t_snp <- t(snp_data)
colnames(t_snp) <- individual_ids
rownames(t_snp) <- colnames(snp_data)

# Convert to data frame if needed
t_snp_df <- as.data.frame(t_snp)
return(t_snp_df)
}


# Read PLINK raw file
geno.wgs <- read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_WGS_NA10861.raw", header = TRUE)
geno.wgs = geno_trans(geno.wgs)
geno.wgs$toy2 = 1
#geno.wgs$effect_size = "NA"


geno.gsa.topmed = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_TOPMedr3.raw", header = T)
geno.gsa.topmed = geno_trans(geno.gsa.topmed)
geno.gsa.topmed = geno.gsa.topmed[,grep("NA10861", colnames(geno.gsa.topmed))]
geno.gsa.topmed$toy = 1

geno.gsa.hrc = read.table("Data/missingness_check/Glaucoma_01_SBayesRC_top1000_SNPs_in_QIMR_GSA_HRCr1.1.raw", header = T)
geno.gsa.hrc = geno_trans(geno.gsa.hrc)
geno.gsa.hrc = geno.gsa.hrc[,grep("NA10861", colnames(geno.gsa.hrc))]



# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.topmed)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.topmed.unique <- geno.gsa.topmed[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.topmed) - ncol(geno.gsa.topmed.unique), "\n")


# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.hrc)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.hrc.unique <- geno.gsa.hrc[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.hrc) - ncol(geno.gsa.hrc.unique), "\n")
merge_glaucoma_geno <- function(glaucoma.effect, geno) {
  individuals <- colnames(geno)
  output <- matrix(NA, nrow = nrow(glaucoma.effect), ncol = length(individuals))
  rownames(output) <- glaucoma.effect$Name
  colnames(output) <- individuals

  for (i in seq_len(nrow(glaucoma.effect))) {
    snp_a1 <- glaucoma.effect$SNP_Risk_Allele[i]
    snp_a2 <- glaucoma.effect$SNP_Bene_Allele[i]

    if (snp_a1 %in% rownames(geno)) {
      output[i, ] <- as.numeric(geno[snp_a1, , drop = FALSE])
    } else if (snp_a2 %in% rownames(geno)) {
      geno_vals <- as.numeric(geno[snp_a2, , drop = FALSE])
      geno_flipped <- ifelse(geno_vals == 2, 0,
                       ifelse(geno_vals == 0, 2, geno_vals))  # keep 1 as-is
      output[i, ] <- geno_flipped
    }
  }

  geno_df <- as.data.frame(output)
  final <- cbind(glaucoma.effect, geno_df)
  return(final)
}


colnames(geno.gsa.hrc.unique) = paste0(colnames(geno.gsa.hrc.unique), "_HRC")
colnames(geno.gsa.topmed.unique) = paste0(colnames(geno.gsa.topmed.unique), "_TopMed")


merged_hrc <- merge_glaucoma_geno(glaucoma.effect, geno.gsa.hrc.unique)

merged_hrc_top  =  merge_glaucoma_geno(merged_hrc, geno.gsa.topmed.unique)
merged_hrc_top_wgs  =  merge_glaucoma_geno(merged_hrc_top, geno.wgs)

merged_hrc_top_wgs=merged_hrc_top_wgs[,grep("toy", colnames(merged_hrc_top_wgs), invert = T)]

merged_hrc_top_wgs = merged_hrc_top_wgs %>% arrange(-Risk_Allele_effect)

merged_hrc_top_wgs$Index = 1:1000


last6_cols <- tail(colnames(merged_hrc_top_wgs), 6)

# Keep rows where the values in the last 6 columns are not all the same
filtered_df <- merged_hrc_top_wgs[
  apply(merged_hrc_top_wgs[, last6_cols], 1, function(row) {
    length(unique(row)) > 1
  }),
]

# write.csv(filtered_df, "Tables/SNPs_with_different_geno_in_top1000_of_Glaucoma.csv", row.names = F)
filtered_df = read.csv("Tables/SNPs_with_different_geno_in_top1000_of_Glaucoma.csv")

colnames(filtered_df) = gsub("MelanomaMDD_NCI_GSA_" , "", colnames(filtered_df))

filtered_df  %>%
  select(-Pi1, -Pi2, -Pi3, -Pi4, -Pi5, ) %>% 
  kable() %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
  column_spec(17:22, width = "100px", extra_css = "word-wrap: break-word; white-space: normal;")
Index Name Chrom Position A1 A2 A1Frq A1Effect SE VarExplained PIP SNP_A1 SNP_A2 Risk_Allele SNP_Risk_Allele Risk_Allele_effect Bene_Allele SNP_Bene_Allele NA10861_WGS NA10861_PC08411_TopMed NA10861_PC08411_HRC NA10861_PC08621_HRC NA10861_PC11092_HRC NA10861_PC11139_HRC
44 rs4842266 12 79951566 A G 0.696275 0.034213 0.033722 5.44e-05 0.5307170 rs4842266_A rs4842266_G A rs4842266_A 0.034213 G rs4842266_G 1 1 2 2 2 2
117 rs965652 6 134368953 A G 0.496475 0.018888 0.029578 3.42e-05 0.3260560 rs965652_A rs965652_G A rs965652_A 0.018888 G rs965652_G 2 2 2 1 2 1
131 rs11755754 6 164325492 G A 0.399175 0.017493 0.025092 2.52e-05 0.3695080 rs11755754_G rs11755754_A G rs11755754_G 0.017493 A rs11755754_A 0 0 0 0 1 0
151 rs11614730 12 79925802 A G 0.696100 0.015212 0.027911 2.35e-05 0.2567350 rs11614730_A rs11614730_G A rs11614730_A 0.015212 G rs11614730_G 1 1 2 2 2 2
257 rs2811687 6 134369822 A G 0.497950 0.009840 0.023446 1.80e-05 0.1748420 rs2811687_A rs2811687_G A rs2811687_A 0.009840 G rs2811687_G 2 2 2 1 2 1
319 rs2207441 6 164325217 A G 0.401375 0.008405 0.018563 1.10e-05 0.1442670 rs2207441_A rs2207441_G A rs2207441_A 0.008405 G rs2207441_G 0 0 0 0 1 0
354 rs34852247 6 164324711 A G 0.373500 0.007586 0.020255 1.17e-05 0.1303810 rs34852247_A rs34852247_G A rs34852247_A 0.007586 G rs34852247_G 0 0 0 0 1 0
454 rs3800902 7 2144378 T C 0.400550 -0.006281 0.019049 8.30e-06 0.1571720 rs3800902_T rs3800902_C C rs3800902_C 0.006281 T rs3800902_T 2 0 0 0 0 0
460 rs34913183 12 58166590 T C 0.011150 -0.006232 0.031416 1.10e-06 0.0520962 rs34913183_T rs34913183_C C rs34913183_C 0.006232 T rs34913183_T 1 1 2 2 2 2
776 rs8016358 14 80895880 A C 0.427800 0.004200 0.012595 4.80e-06 0.1101090 rs8016358_A rs8016358_C A rs8016358_A 0.004200 C rs8016358_C 0 0 1 1 1 0
841 rs4842318 12 80044167 T C 0.696550 0.003971 0.015281 5.60e-06 0.0611977 rs4842318_T rs4842318_C T rs4842318_T 0.003971 C rs4842318_C 1 1 2 2 2 2
867 rs8176822 12 80048478 T C 0.696575 0.003860 0.015743 5.90e-06 0.0535097 rs8176822_T rs8176822_C T rs8176822_T 0.003860 C rs8176822_C 1 1 2 2 2 2
897 rs2253480 7 6686780 C T 0.957700 0.003743 0.019280 1.00e-06 0.0605963 rs2253480_C rs2253480_T C rs2253480_C 0.003743 T rs2253480_T 1 1 2 2 2 2

5 check genotype of Hair color

The top 1000 SNPs were picked from Hair color SBsyesRC weights. Genotype of these 1000 SNPs were extracted from the three data sets that included sample NA10861: WGS, HRC imputed and TopMed imputed QIMR Malanoma NCI cohort genotyped with GSAv2 chip.

The genotype value is in additive model as the number of risk alleles. We removed all the events that have exactly the same genotype in each data set, which resulted with 1 event from WGS, XX event from TopMed imputation, and XX event from HRC imputation. Then we matched these 6 events, and removed all the SNPs that have exactly the same calling among all of them. There are XX SNPs remained.

Their genotype are merged with the SNP information, and sorted based on the SBayesRC SNP weight. All the genotype callings that are different from the calling in WGS are highlighted with red.

We can see that both the count of risk alleles and the partial PGS calculated from these SNPs are higher in HRC imputed events, which is consistent with the observation in whole PGS.

## run in bunya
library(data.table)
library(dplyr)
predictor = data.frame(fread("/QRISdata/Q6913/Uno_Traits/Binary_Traits/HairColor_01/SBayesRC/GCTB_v2.5.2/HairColor_01_SBayesRC.snpRes"))

top1000 <- predictor %>%
  arrange(desc(abs(A1Effect))) %>%
  slice_head(n = 1000)

top1000$Risk_Allele <- ifelse(top1000$A1Effect > 0,
                                      top1000$A1,
                                      top1000$A2)


top1000$Bene_Allele <- ifelse(top1000$A1Effect < 0,
                                      top1000$A1,
                                      top1000$A2)


top1000$Risk_Allele_effect = abs(top1000$A1Effect)

write.table(top1000[,c("Name", "Risk_Allele")], row.names = F, col.names = F, quote = F, sep ="\t", file = "top1000SNP_Risk_Allele_of_HairColor.txt")

top1000$SNP_Risk_Allele = paste0(top1000$Name, "_", top1000$Risk_Allele)
top1000$SNP_Bene_Allele = paste0(top1000$Name, "_", top1000$Bene_Allele)

write.csv(top1000, "Data/missingness_check/HairColr_top1000.csv")

## set reference allele
plink=Genotype

plink \
--bfile ${plink}/GSA_TOPMedr3_autosomes_7.3M_subset \
--extract top1000SNP_of_HairColor.txt \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt  \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_QIMR_GSA_TOPMedr3 


plink \
--bfile ${plink}/GSA_HRCr1.1_autosomes \
--extract top1000SNP_of_HairColor.txt  \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt  \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_QIMR_GSA_HRCr1.1 

plink \
--vcf ${plink}/NA10861_combined_SBRC_SNPs.vcf \
--extract top1000SNP_of_HairColor.txt  \
--exclude Lists/missing_in_any_SNPs_for_NA10861.txt   \
--recode A \
--a1-allele top1000SNP_Risk_Allele_of_HairColor.txt 1 2 \
--out HairColor_top1000_SNPs_in_WGS_NA10861 
geno_trans = function(raw){
# Remove non-SNP columns (e.g., FID, IID, PAT, MAT, SEX, PHENOTYPE)
snp_data <- raw[, 7:ncol(raw)]  # SNP columns start at col 7 in PLINK .raw

# Get individual IDs to set as colnames
individual_ids <- raw$IID

# Transpose
t_snp <- t(snp_data)
colnames(t_snp) <- individual_ids
rownames(t_snp) <- colnames(snp_data)

# Convert to data frame if needed
t_snp_df <- as.data.frame(t_snp)
return(t_snp_df)
}


# Read PLINK raw file
geno.wgs <- read.table("Data/missingness_check/HairColor_top1000_SNPs_in_WGS_NA10861.raw", header = TRUE)
geno.wgs = geno_trans(geno.wgs)
geno.wgs$toy2 = 1
#geno.wgs$effect_size = "NA"


geno.gsa.topmed = read.table("Data/missingness_check/HairColor_top1000_SNPs_in_QIMR_GSA_TOPMedr3.raw", header = T)
geno.gsa.topmed = geno_trans(geno.gsa.topmed)
geno.gsa.topmed = geno.gsa.topmed[,grep("NA10861", colnames(geno.gsa.topmed))]
geno.gsa.topmed$toy = 1

geno.gsa.hrc = read.table("Data/missingness_check/HairColor_top1000_SNPs_in_QIMR_GSA_HRCr1.1.raw", header = T)
geno.gsa.hrc = geno_trans(geno.gsa.hrc)
geno.gsa.hrc = geno.gsa.hrc[,grep("NA10861", colnames(geno.gsa.hrc))]



# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.topmed)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.topmed.unique <- geno.gsa.topmed[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.topmed) - ncol(geno.gsa.topmed.unique), "\n")


# Transpose so samples are rows and SNPs are columns
geno_t <- t(geno.gsa.hrc)
# Find unique rows (i.e., unique samples), then get their indices
unique_indices <- !duplicated(geno_t)
# Subset back to keep only unique columns in the original matrix
geno.gsa.hrc.unique <- geno.gsa.hrc[, unique_indices]
# Optional: check how many duplicates were removed
cat("Number of duplicate samples removed:", ncol(geno.gsa.hrc) - ncol(geno.gsa.hrc.unique), "\n")
merge_geno <- function(top1000, geno) {
  individuals <- colnames(geno)
  output <- matrix(NA, nrow = nrow(top1000), ncol = length(individuals))
  rownames(output) <- top1000$Name
  colnames(output) <- individuals

  for (i in seq_len(nrow(top1000))) {
    snp_a1 <- top1000$SNP_Risk_Allele[i]
    snp_a2 <- top1000$SNP_Bene_Allele[i]

    if (snp_a1 %in% rownames(geno)) {
      output[i, ] <- as.numeric(geno[snp_a1, , drop = FALSE])
    } else if (snp_a2 %in% rownames(geno)) {
      geno_vals <- as.numeric(geno[snp_a2, , drop = FALSE])
      geno_flipped <- ifelse(geno_vals == 2, 0,
                       ifelse(geno_vals == 0, 2, geno_vals))  # keep 1 as-is
      output[i, ] <- geno_flipped
    }
  }

  geno_df <- as.data.frame(output)
  final <- cbind(top1000, geno_df)
  return(final)
}


top1000 = read.csv("Data/missingness_check/HairColr_top1000.csv", row.names =1)

colnames(geno.gsa.hrc.unique) = paste0(colnames(geno.gsa.hrc.unique), "_HRC")
colnames(geno.gsa.topmed.unique) = paste0(colnames(geno.gsa.topmed.unique), "_TopMed")


merged_hrc <- merge_geno(top1000, geno.gsa.hrc.unique)

merged_hrc_top  =  merge_geno(merged_hrc, geno.gsa.topmed.unique)
merged_hrc_top_wgs  =  merge_geno(merged_hrc_top, geno.wgs)

merged_hrc_top_wgs=merged_hrc_top_wgs[,grep("toy", colnames(merged_hrc_top_wgs), invert = T)]

merged_hrc_top_wgs = merged_hrc_top_wgs %>% arrange(-Risk_Allele_effect)

merged_hrc_top_wgs$Index = 1:1000


last8_cols <- tail(colnames(merged_hrc_top_wgs), 8)

# Keep rows where the values in the last 6 columns are not all the same
filtered_df <- merged_hrc_top_wgs[
  apply(merged_hrc_top_wgs[, last8_cols], 1, function(row) {
    length(unique(row)) > 1
  }),
]

write.csv(filtered_df, "Tables/SNPs_with_different_geno_in_top1000_of_HairColor.csv", row.names = F)
filtered_df = read.csv("Tables/SNPs_with_different_geno_in_top1000_of_HairColor.csv")

colnames(filtered_df) = gsub("MelanomaMDD_NCI_GSA_" , "", colnames(filtered_df))

filtered_df  %>%
  select(-Pi1, -Pi2, -Pi3, -Pi4, -Pi5, ) %>% 
  kable() %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
  column_spec(17:22, width = "100px", extra_css = "word-wrap: break-word; white-space: normal;")
Index Name Chrom Position A1 A2 A1Frq A1Effect SE VarExplained PIP Risk_Allele Bene_Allele Risk_Allele_effect SNP_Risk_Allele SNP_Bene_Allele NA10861_PC08411_HRC NA10861_PC11071_HRC NA10861_PC11075_HRC NA10861_PC11129_HRC NA10861_PC11139_HRC NA10861_PC08411_TopMed NA10861_PC11138_TopMed NA10861
38 rs10179295 2 222033115 A G 0.405730 -0.011539 0.001074 0.0002685 1.0000000 G A 0.011539 rs10179295_G rs10179295_A 1 1 1 1 2 1 1 1
66 rs9971729 12 23979791 C A 0.566359 -0.004921 0.002216 0.0000593 0.8550120 A C 0.004921 rs9971729_A rs9971729_C 0 0 0 1 0 0 0 0
174 rs142661815 2 99811314 A G 0.010089 0.001114 0.003635 0.0000012 0.1107170 A G 0.001114 rs142661815_A rs142661815_G 0 0 0 0 0 0 1 0
258 rs10423391 19 1252451 C T 0.441128 0.000749 0.001361 0.0000049 0.2655930 C T 0.000749 rs10423391_C rs10423391_T 2 1 1 2 1 2 2 2
306 rs10416809 19 1252311 T C 0.422649 0.000626 0.001366 0.0000046 0.2211020 T C 0.000626 rs10416809_T rs10416809_C 2 1 1 2 1 2 2 2
334 rs12609746 19 1252827 C T 0.241425 0.000567 0.001365 0.0000033 0.1982280 C T 0.000567 rs12609746_C rs12609746_T 1 0 1 1 0 1 1 1
356 rs10771034 12 23979199 A T 0.550739 -0.000523 0.001755 0.0000069 0.0884168 T A 0.000523 rs10771034_T rs10771034_A 0 0 0 1 0 0 0 0
448 rs58837341 19 1248920 T G 0.428600 0.000415 0.001054 0.0000026 0.1678550 T G 0.000415 rs58837341_T rs58837341_G 2 1 1 2 1 2 2 2
639 rs112363642 19 1248348 A G 0.221800 0.000301 0.000948 0.0000014 0.1306690 A G 0.000301 rs112363642_A rs112363642_G 1 0 1 1 0 1 1 1
756 rs1888379 9 27353599 G T 0.405515 0.000263 0.000973 0.0000020 0.1171700 G T 0.000263 rs1888379_G rs1888379_T 0 0 0 0 0 1 1 1
885 rs16832679 3 181511339 A T 0.287000 -0.000232 0.001151 0.0000023 0.0587273 T A 0.000232 rs16832679_T rs16832679_A 1 1 1 1 1 0 0 0

Also see: Imputation panel comparison ยท PGS:QS