# Importing genotypic data
<- 'data/barley_genbank_2388variants_554individuals_biallelic.hmp.txt.gz'
geno_file
<- read.csv(geno_file, sep = "\t")
geno_data
# Show the first 50 rows/columns only
create_dt(geno_data[1:50,1:50])
20 Module 6.1: Genomic Selection
Genomic selection enables early and accurate prediction of complex traits based on genome-wide SNP data. This can allow us to predict performance immediately after genotyping seedlings, which can speed up selection decisions and reduce the cost of field trials. We will be using the rrBBLUP
package for this part, which is a is a package for genomic prediction with the ridge regression best linear unbiased prediction or RR-BLUP mixed linear model (Endelman, 2011). One application is to estimate marker effects by ridge regression; alternatively, BLUPs can be calculated based on an additive relationship matrix or a Gaussian kernel.
20.1 Read, Filter and Impute Marker Data
The genotypic matrix for rrBLUP has to be coded as {-1, 0, 1}, meaning we will need a conversion step if our data is not yet in that format. Moreover, we will be carrying out a mean value imputation.
<- geno_data[, -c(1:11)]
M rownames(M) <- geno_data[, 1]
<- t(M)
M
# Reformatting matrix to numeric
<- ASRgenomics::snp.recode(M, na.string = 'NN', rename.markers = FALSE)$Mrecode M
A total of 87800 values were identified as missing with the string NN and were replaced by NA.
Matrix M was recoded from bi-allelic nucleotide bases to numeric.
# Filtering genotypic matrix
<- ASRgenomics::qc.filtering(M,
M marker.callrate = 0.1,
ind.callrate = 0.2,
maf = 0.05,
impute = TRUE)$M.clean
Initial marker matrix M contains 554 individuals and 2388 markers.
A total of 674 markers were removed because their proportion of missing values was equal or larger than 0.1.
A total of 5 individuals were removed because their proportion of missing values was equal or larger than 0.2.
A total of 34 markers were removed because their MAF was smaller than 0.05.
A total of 0 markers were removed because their heterozygosity was larger than 1.
A total of 0 markers were removed because their |F| was larger than 1.
Final cleaned marker matrix M contains 2.84% of missing SNPs.
Final cleaned marker matrix M contains 549 individuals and 1680 markers.
A total of 26221 missing values were imputed, corresponding to 2.84% of the total number of SNPs.
# Reformatting to required format
<- as.matrix(M) - 1
M
# Show the first 50 rows/columns only
create_dt(M[1:50,1:50])
20.2 Read and Align Phenotypic Data
# Importing phenotypic data
<- 'barley_genbank_phenotypic_mean_data.csv'
pheno_file
<- read.csv(pheno_file)
pheno_data
<- merge(pheno_data, as.data.frame(rownames(M)), by.x = 'sample_id', by.y = 1, all.y = TRUE)
pheno_data
# Extracting requires trait
<- pheno_data$Yield
y
# Printing phenotypic data
create_dt(pheno_data)
20.3 Model 1
The first model assumes that all marker effects are normally distributed and have identical variance. Parameters are estimated as a solution to the optimization problem. Predictions from this method are equivalent to BLUP values from an animal model (i.e., GBLUP). The observed relationship matrix is calculated from the markers using the formula by VanRaden (2008).
# Fitting a Model using rrBLUP (case 1: all SNPs in the random effect)
<- rrBLUP::mixed.solve(y, Z = M, X = NULL, method = 'REML')
GS
# Obtaining predictions
<- matrix(data = GS$beta, nrow = length(y), ncol = 1) + M %*% GS$u
predGS
# Printing predictions
create_dt(cbind(round(predGS, 3), y))
# Plotting predictions
plot(y, predGS)
20.3.1 Goodness of Fit Statistics
# Heritability
# Trait variance
<- var(y, na.rm = TRUE)) (var_y
[1] 2.107743
# Error variance
<- GS$Ve) (var_e
[1] 1.465679
# Proportion of explained variance (High = good fit, potential heritability)
<- 1 - var_e / var_y) (h2_GS
[1] 0.3046215
<- cbind(rownames(M), y, predGS)
Anas_table
# Predictive ability (High = good prediction accuracy)
<- cor(y, predGS, method = 'pearson', use = 'complete.obs')) (PA
[,1]
[1,] 0.7800125
20.4 Model 2
This model handles some markers as having a fixed effect (e.g. significant SNPs from GWAS output). It splits the genotypic data into X matrix (fixed effects) and Z matrix (random effects). To calculate the REML solution for the model you solve y = X b + Z u + e. We predict BLUE(b) and BLUP(u) solutions for the fixed and random effects, respectively, using standard formulas (Searle et al. 1992).
# Defining markers
<- c('S7H_63131260', 'S5H_573915992')
markers <- colnames(M)
SNPs
# Filtering matrices
<- M[, SNPs %in% markers]
X <- M[,!SNPs %in% markers]
Z
# Fitting a Model using rrBLUP by exclude markers from the Z matrix (for the Random effects) and include them in the X matrix (for the fixed effect)
<- rrBLUP::mixed.solve(y, Z = Z, X = X, method = 'REML')
GS
# Obtaining predictions
<- X %*% GS$beta + Z %*% GS$u
predGS
# Printing predictions
create_dt(cbind(round(predGS, 3), y))
# Plot to verify
plot(y, predGS)
20.4.1 Goodness of Fit Statistics
# Heritability (GS)
# Trait variance
<- var(y, na.rm = TRUE)) (var_y
[1] 2.107743
# Error variance
<- GS$Ve) (var_e
[1] 1.018018
# Proportion of explained variance (High = good fit, potential heritability)
<- 1 - var_e / var_y) (h2_GS
[1] 0.5170105
# Predictive Ability (correlation between actual and predicted response)
<- cor(y, predGS, method = 'pearson', use = 'complete.obs')) (PA
[,1]
[1,] 0.8952614
20.5 K-Fold Cross Validation
# Set the total number of folds, and get the total number of individuals
<- 5
k <- length(y)
n
# Set randomly the fold group for each observation
# That tells when it will be used for validation
<- sample(rep(1:k, length.out = n), n)
group head(group, 20)
[1] 5 1 3 2 1 4 2 4 1 4 4 4 2 4 3 3 2 5 1 5
table(group)
group
1 2 3 4 5
110 110 110 110 109
# An empty matrix to save calculated heritability in each fold, and GS prediction that calculated when individual be in validation group
<- matrix(data = NA, nrow = n, ncol = 1)
predGS <- matrix(data = NA, nrow = k, ncol = 1)
h2_cv
for (g in 1:k) {
# Reset response variable, and exclude validation set values for the given k fold
<- y
y_cv == g] <- NA
y_cv[group
# Fitting a Model using rrBLUP (model 1: all SNPs in the random effect)
# GS <- mixed.solve(y_cv, Z = M, X = NULL, method = 'REML')
# predGS_cv <- matrix(data = GS$beta, nrow = length(y), ncol = 1) + M %*% GS$u
# Model 2: have some markers to be handled as a fixed effect
<- rrBLUP::mixed.solve(y_cv, Z = Z, X = X, method = 'REML')
GS
# Obtaining predictions
<- X %*% GS$beta + Z %*% GS$u
predGS_cv
# Heritability (GS)
<- var(y_cv, na.rm = TRUE)
var_y <- GS$Ve
var_e <- 1 - var_e / var_y
h2_cv[g]
# Get the GS prediction for all validation set individuals
== g] <- predGS_cv[group == g]
predGS[group
}
# Plot to verify
plot(y, predGS)
# Heritability (GS cross-validation)
h2_cv
[,1]
[1,] 0.5669156
[2,] 0.4867977
[3,] 0.5506054
[4,] 0.5487734
[5,] 0.5748752
mean(h2_cv)
[1] 0.5455935
# Predictive Ability (correlation between actual and predicted response)
<- cor(y, predGS, method = 'pearson', use = 'complete.obs')) (PA
[,1]
[1,] 0.398479
20.6 Predict Breeding Values
# Calculate the additive relationship matrix (it is the kinship matrix multiplied by 2) it can also perform imputation for missing marker data by
# Define impute.method (mean or EM) (see also: min.MAF, max.missing, and return.imputed parameters)
<- rrBLUP::A.mat(M)
A heatmap(A)
<- rrBLUP::mixed.solve(y, K = A.mat(M), SE = TRUE)
GS
# Genomic Estimated Breeding Value (GEBV)
<- cbind(round(GS$u, 3), round(GS$u.SE, 3))
GEBV
colnames(GEBV) <- c('GEBV', 'SE')
create_dt(GEBV)