-
Notifications
You must be signed in to change notification settings - Fork 0
/
count2TPM.R
58 lines (54 loc) · 2.58 KB
/
count2TPM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
rm(list=ls())
library(tidyverse)
expMatrix <- read.table("FastQ/GSE102349_featurecounts.txt",comment.char = "#",
row.names = 1, header = TRUE, sep="\t",check.names = F)
expMatrix[1:5,1:6]
# See the read count for the first five genes
exprSet <- expMatrix[,-c(1:5)]
exprSet[1:5,1:6]
colnames(exprSet) <- gsub("(.*?)\\/(.*?)\\_sorted.bam","\\2\\",colnames(exprSet) )
exprSet_out <- exprSet %>% as.data.frame()%>% rownames_to_column("gene_id")
exprSet_out[1:5,1:6]
write.table(exprSet_out,file = "01rawData/GSE102349_featurecounts.txt",row.names = F,quote = F,sep = "\t")
# The count value is converted to TPM (Transcripts Per Million)
x <- exprSet / expMatrix$Length
exprSet_tpm <- t( t(x) / colSums(x) ) * 1e6
exprSet_tpm[1:3,1:5]
exprSet_tpm_out <- exprSet_tpm %>% as.data.frame()%>% rownames_to_column("gene_id")
exprSet_tpm_out[1:3,1:5]
write.table(exprSet_tpm_out,file = "01rawData/GSE102349_featurecounts_TPM.txt",row.names = F,quote = F,sep = "\t")
# ensemble id is converted to symbol id
library("rtracklayer")
gtf1 = import('D:/TCGA/Homo_sapiens.GRCh38.100.chr.gtf')
gtf_df <- as.data.frame(gtf1)
gtf_df[1:9,1:3]
gtf_df <- gtf_df[gtf_df$gene_biotype == "protein_coding",c("gene_id","gene_biotype","gene_name")]
# gtf_df$"gene_id" <- substr(gtf_df$"gene_id", 1,15)
gtf_df <- unique(gtf_df)
exprSet_symbol <- exprSet_out %>%
dplyr::inner_join(gtf_df,by="gene_id") %>%
# tidyr::unite(gene_id,gene_id,gene_type,gene_name,sep = "|")%>%
# Get rid of redundant information
dplyr::select(-c(gene_id,gene_biotype))%>%
# realignment
dplyr::select(gene_name,everything()) %>%
# The expression levels of duplicate genes were averaged
dplyr::group_by(gene_name) %>%
dplyr::summarise_all(mean)
exprSet_symbol[1:9,1:3]
write.table(exprSet_symbol,file = "01rawData/GSE102349_counts_symbol.txt",row.names = F,quote = F,sep = "\t")
save(exprSet_symbol,file = "01rawData/GSE102349_counts_symbol.Rdata")
# TPM symbol
exprSet_TPM_symbol <- exprSet_tpm_out %>%
dplyr::inner_join(gtf_df,by="gene_id") %>%
# tidyr::unite(gene_id,gene_id,gene_type,gene_name,sep = "|")%>%
# Get rid of redundant information
dplyr::select(-c(gene_id,gene_biotype))%>%
#realignment
dplyr::select(gene_name,everything()) %>%
# The expression levels of duplicate genes were averaged
dplyr::group_by(gene_name) %>%
dplyr::summarise_all(mean)
exprSet_TPM_symbol[1:9,1:3]
write.table(exprSet_TPM_symbol,file = "01rawData/GSE102349_TPM_symbol.txt",row.names = F,quote = F,sep = "\t")
save(exprSet_TPM_symbol,file = "01rawData/GSE102349_TPM_symbol.Rdata")