Short lecture on methods to detect selection
We will use fst outlier scans- these are common and relatively simply to conduct. It is first useful to see the distribution of the fst estimates between our populations. We can do this with histograms/ridgeplots from the values we calcuate in snpR
library(snpR)
library(ggplot2)
library(tidyverse)
library(ggridges)
# these are mostly the same steps as with the population structure.
<- read_vcf("~/shared_materials/tutorial_files/variants.vcf")
dat
<- data.frame(pop = substr(colnames(dat), 1, 2))
sample_meta $pop <- factor(sample_meta$pop, levels=c("GA", "HP", "BC", "PC", "TR"))
sample_metasample.meta(dat) <- sample_meta
# we now do the same calculations as before, but now faceting by population and chromosome.
<- calc_pairwise_fst(dat, facets="pop.CHROM", method = "WC")
my.dat <- get.snpR.stats(my.dat, facets = "pop.CHROM", stats = "fst")$pairwise
fst_out # the pairwise, above says to grab the pariwise fst estimates.
# there are a bunch of NA's, because there isn't variation between some of the populations remove these
sum(is.na(fst_out$fst))
<- fst_out %>% drop_na(fst)
fst_out sum(is.na(fst_out$fst))
# there are also negative values, these are due to the sample size correction in the fst calculation
## set these to zero
$fst[fst_out$fst < 0] <- 0
fst_out
# density plots:
# we can first plot density plots of the estimates
ggplot( fst_out, aes(x=fst, y=comparison)) +
geom_density_ridges(quantile_lines=TRUE, quantiles=2)
# and we can grab means for each
%>%
fst_out group_by(comparison) %>%
summarize(mean = mean(fst), sd = sd(fst), median=median(fst))
Lets talk about the fst distributions.
We can next look at each snp as ask if it is an outlier and putatively under selection.
For simplicity, we’ll first just look at chromosome 1.
# filter down to only chromosome 1
<- fst_out %>% filter(CHROM == "chr1")
chr1
# make plot
<- ggplot(chr1, aes(x = position, y = fst, color = comparison)) +
p geom_point(alpha = 0.1) +
theme_bw() +
facet_wrap(~comparison, ncol=1) +
ylim(0,1)
p
How would we identify outliers?
And we can add a smoothed average to get broad patterns across the genome.
<- calc_smoothed_averages(x = my.dat,
chr1 facets = "pop.CHROM",
sigma = 50, # using a window size of 50 kb
step = 25) # using a step size of 10kb between windows
# pull out the smoothed values
<- get.snpR.stats(chr1, facets = "pop.CHROM", stats ="fst")$pairwise.window
fst_smooth
# parse down to just chromosome 1 again
<- fst_smooth %>% filter(snp.subfacet == "chr1")
chr1_smooth
# have to change the column names to match:
colnames(chr1_smooth)[2] <- c("comparison")
# add our smoothed values to the previous plot
+ geom_line(data=chr1_smooth,
p aes(x=position,y=fst), color="black")
Are there any genomic regions that appear to be under selection?
Our previous plot was only one chromosome. We might want to plot all our chromosomes. We can use the qqman package to do this. This might be useful for some of you in your projects later.
install.packages("qqman")
library(qqman)
# subset data to just one comparison:
<- fst_out %>% filter(comparison == "HP~PC")
HP_PC
# make a new dataframe that fits the qqman requirements:
##
<- data.frame(SNP = HP_PC$ID,
qqdf CHR = as.numeric(substr(HP_PC$CHROM, 4,4)),
BP = HP_PC$position,
P = HP_PC$fst
)# make the manhattan plot
manhattan(qqdf,logp = FALSE, ylim=c(0,1))
We can use multiple programs and test to determine if a variant is an outlier. In the most simple test, we could just take the top 1% (or whatever threshold). But there are better ways to actually determine if a locus is an outlier.
Common methods include:
We will use outFLANK today.
# thanks to Rachael Bay's tutorial for some of this: https://baylab.github.io/MarineGenomics/week-7-fst-and-outlier-analysis.html
library(OutFLANK)
# we need to get our genotypes to the format 0, 1, 2, which means homozygous ref, heterozygote, homozygous alternate
# it is probably easiest to use the package vcfR to do this:
library(vcfR)
<- read.vcfR("~/shared_materials/tutorial_files/variants.vcf")
dat
# pull out the genotypes:
<- extract.gt(dat)
geno dim(geno)
head(geno[,1:10])
# we need to convert the genotypes from 0/0, 0/1, 1/1 to 0, 1, 2
%in% c("0/0")] <- 0
geno[geno %in% c("0/1", "1/0")] <- 1
geno[geno %in% c("1/1")] <- 2
geno[geno
# and missing values need to be 9
is.na(geno)] <- 9
geno[
# finally, the matrix needs to be converted to longforamt where SNPs as columns and individuals as rows
<- t(geno)
tgeno dim(tgeno)
We can now calculate Fst (again). But first we need to subset down our genotypes to just the populations we care about. We can drop the rows that have the wrong indivs, the rownames have the individual ids.
Let’s compare HP and PC, because they are close together, but have very different phenotypes
# read in the population files :
<- read.table("~/shared_materials/tutorial_files/HP.pop", header=F)
HP <- read.table("~/shared_materials/tutorial_files/PC.pop", header=F)
PC
# subset. Find just the rownames that are in our HP and PC df's
<- tgeno[row.names(tgeno) %in% c(HP$V1, PC$V1),]
genoSub
1:10,1:10]
genoSub[
# and then get the population ids:
<- substr(row.names(genoSub), 1, 2) pops
Because we subset down, some of our variants are no longer vairable in our populations. We need to drop these.
# get the column indices where all values are 0 (homozygous)
<- which(apply(genoSub, 2, function(x) all(x %in% c("0"))))
remove_0
#apply takes:
# 1. The data object you want to apply the function to.
# 2. A value of either 1 or 2, which specifies whether you want to apply the function to the rows or columns of the data object, respectively.
# 3. The function you want to apply.
# remove the columns from the dataframe
<- genoSub[, -remove_0]
genoSub 1:10,1:10]
genoSub[
<- which(apply(genoSub, 2, function(x) all(x %in% c("2"))))
remove_2 <- genoSub[, -remove_2]
genoSub 1:10,1:10]
genoSub[
# remove when all hets:
<- which(apply(genoSub, 2, function(x) all(x %in% c("1"))))
remove_h <- genoSub[, -remove_h]
genoSub
# finally, covert our genotype matrix to numeric:
<- apply(genoSub, c(1, 2), as.numeric) # c(1,2) means do over both rows and columns genoSub
We are now ready to run outflank.
# run outflank:
# first calculate fst
<- MakeDiploidFSTMat(genoSub,locusNames=colnames(genoSub),popNames=pops)
fst
head(fst)
# and we can look at the histogram of the values.
hist(fst$FST,breaks=100)
summary(fst$FST)
# lets fit our model to determine if we have outliers:
# fitting a Chi-Squared distribution, are there more snps in the tail than we expect by chance?
<- OutFLANK(fst,LeftTrimFraction=0.05,RightTrimFraction=0.05,
OF Hmin=0.1,NumberOfSamples=2,qthreshold=0.05)
OutFLANKResultsPlotter(OF,withOutliers=T,
NoCorr=T,Hmin=0.1,binwidth=0.005,
Zoom=F,RightZoomFraction=0.05,titletext=NULL)
Talk about what outflank is doing.
Now we can pull out the outliers
sum(OF$results$OutlierFlag == TRUE, na.rm=TRUE)
<- pOutlierFinderChiSqNoCorr(fst,Fstbar=OF$FSTNoCorrbar,
P1 dfInferred=OF$dfInferred,qthreshold=0.05,Hmin=0.1)
<- P1$LocusName[which(P1$OutlierFlag==TRUE)] #which of the SNPs are outliers?
outliers length(outliers)
plot(P1$LocusName,P1$FST,xlab="Position",ylab="FST",col=rgb(0,0,0,alpha=0.1))
points(P1$LocusName[outliers],P1$FST[outliers],col="magenta")
But you should notice that our plot doesn’t show chromsosomes. Let’s go back to our manhattan plot with qqman and make this look better:
<- fst_out %>% filter(comparison == "HP~PC")
HP_PC
# make a new dataframe that fits the qqman requirements:
##
<- data.frame(SNP = HP_PC$ID,
qqdf CHR = as.numeric(substr(HP_PC$CHROM, 4,4)),
BP = HP_PC$position,
P = HP_PC$fst)
)
# make the manhattan plot
manhattan(qqdf,logp = FALSE, ylim=c(0,1), highlight = outliers)
We could also look at our outlier regions more closely:
manhattan(subset(qqdf, CHR == 2), highlight = outliers,
xlim = c(31716537, 32043405), main = "Chr 2", logp = FALSE)
Look at the other outlier regions more closely like we did with qqman. Do you think they’re reliable?