<- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", version = 86)
ensembl <- c(seq(1,22), "X", "Y")
chr
<- getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position", "end_position"),
gene_coor filters = "hgnc_symbol", values = rownames(res_sh_vs_ctrl), mart = ensembl,
useCache = FALSE)
# patched chromosomes are coming up, therefore not unique, need to filter out
<- gene_coor[gene_coor$chromosome_name %in% chr,] gene_coor
3 Functional Analysis
What do do once you have a list of genes?
3.1 Gene name conversions
Getting genomic coordinates from a list of gene names.
There are many ways of doing this, they’re all slightly different depending on the database.
3.2 Enrichment Analysis
Many of the functional analysis is achieved using ClusterProfiler. While the vignette is very comprehensive, it can be overwhelming to see which functions are useful, so I will point out the most useful ones here.
3.2.1 Overrepresention Analysis (ORA)
A very good explanation and accompanying paper of why we need background lists. Which genes to use for backgrounds? Check this.
<- as.data.frame(peakAnno)$geneId # get the genes from peak annotation
genes
<- rownames(res_lrt) # all genes that were tested for significance
geneUniverse <- enrichGO (gene = genes,
go keyType = "SYMBOL",
universe = geneUniverse,
ont = "BP",
OrgDb = org.Hs.eg.db,
qvalueCutoff = 0.05)
<- simplify(go) # to get rid of redundant GO terms go_simplified
To wrap long labels into a few lines
dotplot(go) + scale_y_discrete(labels = function(x) str_wrap(x, width = 30)) + ggtitle("") +
theme(plot.title = element_text(hjust = 0.5))
To use KEGG, you need to convert gene symbols to entrezid or use the geneId column from annotatePeaks.
<- lapply(deg_list, function(x) {
deg_list_entrez mapIds(org.Hs.eg.db, rownames(x), "ENTREZID", "SYMBOL")
})<- enrichKEGG(gene = deg_list_entrez$up, organism = "hsa") kegg
Or use bit_kegg https://guangchuangyu.github.io/2016/05/convert-biological-id-with-kegg-api-using-clusterprofiler/. I have not tried this method yet.
3.2.2 Gene Set Enrichment Analysis (GSEA)
Explanation of GSEA and how to do it in R https://sbc.shef.ac.uk/workshops/2019-01-14-rna-seq-r/rna-seq-gene-set-testing.nb.html.
How I first did it https://stephenturner.github.io/deseq-to-fgsea/.
I rank the genes by the stat column. For the Wald test, stat is the Wald statistic: the log2FoldChange divided by lfcSE.
<- degenes %>% as.data.frame() %>% rownames_to_column("symbol") %>% dplyr::select(symbol, stat)
ranked_genes <- ranked_genes %>% arrange(-stat) %>% deframe() ranked_genes
We usually perform GSEA with the Molecular Signature Database and there is a package for easy retrieval.
library(msigdbr)
<- msigdbr(species = "Homo sapiens", category = "H") %>%
h ::select(gs_name, gene_symbol)
dplyr<- GSEA(ranked_genes, TERM2GENE = h)
gsea ridgeplot(gsea)
::gseaplot2(gsea, geneSetID = "HALLMARK_") enrichplot