Get the released version from CRAN:
install.packages("GeoTcgaData")
Or the development version from GitHub:
if(!requireNamespace("devtools", quietly = TRUE))
install.packages("devtools")
devtools::install_github("huerqiang/GeoTcgaData")
GEO and TCGA provide us with a wealth of data, such as RNA-seq, DNA methylation, single nucleotide variation, and copy number variation data. It’s easy to download data from TCGA using the gdc tool or TCGAbiolinks, but processing these data into a format suitable for bioinformatics analysis requires more work. This R package was developed to handle these data.
library(GeoTcgaData)
#> Hello, friend! welcome to use!
This is a basic example which shows you how to solve a common problem:
Use TCGAbiolinks or GDCRNATools to download and analyze gene expression data. TCGAbiolinks uses the edgeR package for differential expression analysis, while GDCRNATools implements the three most commonly used methods (limma, edgeR, and DESeq2) to identify differentially expressed genes (DEGs).
Use TCGAbiolinks to download TCGA data:
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-ACC",
data.category = "DNA Methylation",
data.type = "Methylation Beta Value",
platform = "Illumina Human Methylation 450")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
The function Merge_methy_tcga merges methylation data downloaded from the TCGA official website or with TCGAbiolinks. This makes it easier to extract differentially methylated genes in the downstream analysis. For example:
merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data)
Then use the ChAMP package to do differential analysis:
library(ChAMP)
diff_gene <- methyDiff(cpgData = merge_result, sampleGroup = sample(c("C","T"),
ncol(merge_result[[1]]), replace = TRUE))
Use clusterProfiler to do enrichment analysis:
diff_gene$p.adj <- p.adjust(diff_gene$pvalue)
genes <- diff_gene[diff_gene$p.adj < 0.05, "gene"]
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
dotplot(ego)
Use TCGAbiolinks to download TCGA data (Gene Level Copy Number Scores):
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-LGG",
data.category = "Copy Number Variation",
data.type = "Gene Level Copy Number Scores")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
data <- GDCprepare(query = query,
directory = "Your_Path")
Do differential analysis of gene-level copy number variation data using diff_CNV:
class(data) <- "data.frame"
cnvData <- data[, -c(1,2,3)]
rownames(cnvData) <- data[, 1]
sampleGroup = sample(c("A","B"), ncol(cnvData), replace = TRUE)
diffCnv <- diff_CNV(cnvData, sampleGroup)
Use clusterProfiler to do enrichment analysis:
pvalues <- diffCnv$pvalue * sign(diffCnv$odds)
genes <- rownames(diffCnv)[diffCnv$pvalue < 0.05]
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
dotplot(ego)
Use TCGAbiolinks to download TCGA data:
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-ACC",
data.category = "Simple Nucleotide Variation",
data.type = "Masked Somatic Mutation",
workflow.type = "MuSE Variant Aggregation and Masking")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
data_snp <- GDCprepare(query = query,
directory = "Your_Path")
Use diff_SNP_tcga to do differential analysis:
samples <- unique(data_snp$Tumor_Sample_Barcode)
sampleType <- sample(c("A","B"), length(samples), replace = TRUE)
names(sampleType) <- samples
pvalue <- diff_SNP_tcga(snpData = data_snp, sampleType = sampleType)
Use clusterProfiler to do enrichment analysis:
pvalue2 <- sort(pvalue, decreasing = TRUE)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
gsego <- gseGO(pvalue2, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
dotplot(gsego)
The function gene_ave averages the expression data of different ids that correspond to the same gene in GEO chip data. For example:
aa <- c("MARCH1","MARC1","MARCH1","MARCH1","MARCH1")
bb <- c(2.969058399,4.722410064,8.165514853,8.24243893,8.60815086)
cc <- c(3.969058399,5.722410064,7.165514853,6.24243893,7.60815086)
file_gene_ave <- data.frame(aa=aa,bb=bb,cc=cc)
colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983")
result <- gene_ave(file_gene_ave, 1)
Multiple gene symbols may correspond to the same chip id. The function rep1 assigns the expression of this id to each gene, while the function rep2 deletes the expression. For example:
aa <- c("MARCH1 /// MMA","MARC1","MARCH2 /// MARCH3",
"MARCH3 /// MARCH4","MARCH1")
bb <- c("2.969058399","4.722410064","8.165514853","8.24243893","8.60815086")
cc <- c("3.969058399","5.722410064","7.165514853","6.24243893","7.60815086")
input_file <- data.frame(aa=aa,bb=bb,cc=cc)
rep1_result <- rep1(input_file," /// ")
rep2_result <- rep2(input_file," /// ")
id_conversion_vector converts gene ids from one of symbol, RefSeq_ID, Ensembl_ID, NCBI_Gene_ID, UCSC_ID, UniProt_ID, etc. to another. Use id_ava() to get all the convertible ids. For example:
id_conversion_vector("symbol", "ensembl_gene_id", c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"))
#> 80% were successfully converted.
#> from to
#> 1 A2ML1 ENSG00000166535
#> 2 A2ML1-AS1 ENSG00000256661
#> 3 A4GALT ENSG00000128274
#> 4 A12M1 <NA>
#> 5 AAAS ENSG00000094914
When the user converts an Ensembl ID to other ids, the version number needs to be removed. For example, “ENSG00000186092.4” does not work; you need to change it to “ENSG00000186092”.
In particular, the function id_conversion converts Ensembl gene ids to gene symbols in TCGA data. For example:
result <- id_conversion(profile)
#>
#> 载入需要的程辑包:org.Hs.eg.db
#> 载入需要的程辑包:AnnotationDbi
#> 载入需要的程辑包:stats4
#> 载入需要的程辑包:BiocGenerics
#> 载入需要的程辑包:parallel
#>
#> 载入程辑包:'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#> lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#> pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#> tapply, union, unique, unsplit, which.max, which.min
#> 载入需要的程辑包:Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 载入需要的程辑包:IRanges
#> 载入需要的程辑包:S4Vectors
#>
#> 载入程辑包:'S4Vectors'
#> The following objects are masked from 'package:base':
#>
#> I, expand.grid, unname
#>
#> 载入程辑包:'IRanges'
#> The following object is masked from 'package:grDevices':
#>
#> windows
#>
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in clusterProfiler::bitr(rownames(profiles), fromType = "ENSEMBL", :
#> 33.33% of input gene IDs are fail to map...
The parameter profile is a data.frame or matrix of gene expression data in TCGA.
Note: In previous versions (< 1.0.0), id_conversion and id_conversion_vector used HGNC data to convert human gene ids. In future versions, we will use clusterProfiler::bitr for ID conversion.
library(clusterProfiler)
#> clusterProfiler v4.0.0 For help: https://guangchuangyu.github.io/software/clusterProfiler
#>
#> If you use clusterProfiler in published research, please cite:
#> Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#>
#> 载入程辑包:'clusterProfiler'
#> The following object is masked from 'package:AnnotationDbi':
#>
#> select
#> The following object is masked from 'package:IRanges':
#>
#> slice
#> The following object is masked from 'package:S4Vectors':
#>
#> rename
#> The following object is masked from 'package:stats':
#>
#> filter
bitr(c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"), fromType = "SYMBOL", toType = "ENSEMBL", OrgDb = org.Hs.eg.db, drop = FALSE)
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in bitr(c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"), fromType =
#> "SYMBOL", : 40% of input gene IDs are fail to map...
#> SYMBOL ENSEMBL
#> 1 A2ML1 ENSG00000166535
#> 2 A2ML1-AS1 <NA>
#> 3 A4GALT ENSG00000128274
#> 4 A12M1 <NA>
#> 5 AAAS ENSG00000094914
countToFpkm_matrix and countToTpm_matrix convert count data to FPKM or TPM data, respectively.
lung_squ_count2 <- matrix(c(1,2,3,4,5,6,7,8,9),ncol=3)
rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
jieguo <- countToFpkm_matrix(lung_squ_count2)
lung_squ_count2 <- matrix(c(0.11,0.22,0.43,0.14,0.875,0.66,0.77,0.18,0.29),ncol=3)
rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
jieguo <- countToTpm_matrix(lung_squ_count2)
tcga_cli_deal combines clinical information obtained from TCGA and extracts survival data. For example:
tcga_cli <- tcga_cli_deal(system.file(file.path("extdata","tcga_cli"),package="GeoTcgaData"))