Skip to content

Commit

Permalink
Merge branch 'dev' into 'master'
Browse files Browse the repository at this point in the history
prepare release 0.6.9

See merge request tron/bnt_neoants/splice2neo!131
  • Loading branch information
franla23 committed Jul 1, 2024
2 parents dfd87f6 + ae7245e commit 357821d
Show file tree
Hide file tree
Showing 12 changed files with 198 additions and 37 deletions.
4 changes: 3 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ testing:
- testing.log
stage: check
script:
- sudo apt-get update
- sudo apt-get install -y qpdf
- R -e 'install.packages("devtools", verbose = FALSE, quiet = TRUE)'
- R -e 'devtools::install(quiet=TRUE)'
- R -e 'devtools::install(quiet=TRUE, dependencies = TRUE)'
- R -e 'devtools::check()'

pages:
Expand Down
4 changes: 2 additions & 2 deletions R/annotate_mut_effect.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ annotate_mut_effect <- function(effect_df,
# gene and transcripts
gene_transcript_mapping <-
tibble::tibble(
gene_id = unlist(transcripts_gr@elementMetadata$gene_id),
tx_name = transcripts_gr@elementMetadata$tx_name
gene_id = unlist(transcripts_gr$gene_id),
tx_name = transcripts_gr$tx_name
)

# consider only transcripts that relate to gene that is provided in the
Expand Down
19 changes: 7 additions & 12 deletions R/combine_mut_junc.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,17 @@ combine_mut_junc <- function(junc_data_list){
# select distinct junctions by the indicator columns
purrr::map(dplyr::distinct, mut_id, tx_id, junc_id) %>%

# combine into a single data.frame with tool column
dplyr::bind_rows(.id = "tool") %>%
# make sure to add detected column for each spliceai tool
purrr::imap(~dplyr::mutate(.x, "{.y}_detected" := TRUE)) %>%

# mark as detected
mutate(detected = TRUE) %>%
# join lists
purrr::reduce(dplyr::full_join, by = c("mut_id", "tx_id", "junc_id")) %>%

# expand by junction and tool
# set not detected to FALSE
complete(
nesting(mut_id, tx_id, junc_id),
tool,
fill = list(detected = FALSE)
) %>%

# add tools as separate columns
pivot_wider(names_from = "tool", values_from = "detected",
names_glue = "{.name}_detected")
fill = names(junc_data_list) %>% purrr::map( ~ FALSE) %>% set_names(paste0(names(junc_data_list), "_detected"))
)


# rename annotation columns
Expand Down
32 changes: 28 additions & 4 deletions R/format_cispliceai.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#'
#' @param cispliceai_variants [tibble][tibble::tibble-package] with parsed
#' CI-SpliceAI mutations from \code{\link{parse_spliceai}}
#' @param transcripts_gr *Optional.* A GRanges object with transcript ranges created by
#' `GenomicFeatures::transcripts(txdb)`. If provided, full gene ids are annotated
#' in the formatted table, which allows considering only transcripts related to
#' the annotated gene in `annotate_mut_effect`.
#'
#' @return A [tibble][tibble::tibble-package] with splicing effects per row
#'
Expand All @@ -17,7 +21,7 @@
#'
#' @seealso \code{\link{parse_cispliceai_thresh}}, \code{\link{annotate_mut_effect}}
#' @export
format_cispliceai_thresh <- function(cispliceai_variants){
format_cispliceai_thresh <- function(cispliceai_variants, transcripts_gr = NULL){

#format columns
cispliceai_variants <- cispliceai_variants %>%
Expand All @@ -33,9 +37,29 @@ format_cispliceai_thresh <- function(cispliceai_variants){
mut_id = str_c(CHROM, POS, REF, ALT, sep = "_"),
chr = CHROM,
pos = as.integer(POS) + pos_rel
) %>%
)

# keep only relevant columns
dplyr::select(mut_id, effect, score, chr, pos_rel, pos) %>%
if(!is.null(transcripts_gr)){

stopifnot("gene_id" %in% names(S4Vectors::mcols(transcripts_gr)))

# CI-SpliceAI does not return the version part of the gene id if the gene table from CI-SpliceAI is used as annotation.
# To keep the full gene_id, we need to map the unversioned ids back to the
# versioned gene ids found in the transcripts_gr object.
gene_table <- tibble::tibble(gene_id = unique(unlist(transcripts_gr$gene_id))) %>%
dplyr::mutate(gene_id_short = gsub("\\..*", "", gene_id))

formated_variants <- cispliceai_variants %>%
dplyr::left_join(gene_table, by = c("SYMBOL" = "gene_id_short")) %>%
dplyr::select(mut_id, effect, score, chr, pos_rel, pos, gene_id)

} else{

formated_variants <- cispliceai_variants %>%
dplyr::select(mut_id, effect, score, chr, pos_rel, pos)

}

formated_variants %>%
dplyr::distinct()
}
2 changes: 1 addition & 1 deletion R/standardize_junctions_RNA_tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' format.
#'
#' @return A combined table with unique junctions. The columns
#' identified_by_{name} contains information which tools identified the given
#' identified_by_\{name\} contains information which tools identified the given
#' junction
#'
#' @examples
Expand Down
21 changes: 11 additions & 10 deletions inst/extdata/cispliceai_thresh_output.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@
##contig=<ID=chrX,length=155270560>
##contig=<ID=chrY,length=59373566>
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
##INFO=<ID=CI-SpliceAI,Number=.,Type=String,Description="CI-SpliceAI V1.1.0 variant annotation for a maximum distance of 50 nucleotides from the variant; masking off; remove nucs outside tx all effects. Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL (DS=Delta Score, DP=Delta Position, AG/AL=Acceptor Gain/Acceptor Loss, DG/DL=Donor Gain/Donor Loss).">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 25000 . A C,G,T . . . CI-SpliceAI=C|ENSG00000189409
chr2 152389953 . T A,C,G . . CI-SpliceAI=A|ENSG00000188157|DL:0.02:-18
chr2 179415988 . C CA . . CI-SpliceAI=CA|ENSG00000188157|AG:0.03:-41|AL:0.8:-2|DG:0.08:8|DL:0.5:12
chr2 179446218 . ATACT A . . CI-SpliceAI=A|ENSG00000162512|AL:0.02:27|AG:0.03:14|DG:0.02:-48
chr2 179446218 . ATACT AT,ATA . . CI-SpliceAI=AT|ENSG00000162512|AL:0.02:27
chr2 179642185 . G A . . CI-SpliceAI=A|ENSG00000259030,A|ENSG00000116783
chr19 38958362 . C T . . CI-SpliceAI=T|+,T|-
chr21 47406854 . CCA C . . CI-SpliceAI=C|ENSG00000118733|DL:0.03:14|AG:0.02:-48|AG:0.8:-2
chr21 47406856 . A AT . . CI-SpliceAI=AT|+|AL:0.02:27,AT|-|DL:0.5:12
chrX 129274636 . A C,G,T . . CI-SpliceAI=C|ENSG00000116299|AG:0.7:-41,G|ENSG00000116299|DG:0.03:10,T|ENSG00000116299||AL:0.03:40
chr1 25000 . A C,G,T . . CI-SpliceAI=C|+,C|-,G|+,G|-,T|+,T|-
chr2 152389953 . T A,C,G . . SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;CI-SpliceAI=A|ENSG00000183091|DL:0.96:3,C|ENSG00000183091|DL:0.90:3,G|ENSG00000183091|DL:0.97:3
chr2 179415988 . C CA . . SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;CI-SpliceAI=CA|ENSG00000155657|AL:1.00:-1|AG:0.05:0|AG:0.03:7|AG:0.04:9
chr2 179446218 . ATACT A . . SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;CI-SpliceAI=A|ENSG00000155657|DG:0.03:-11|DL:0.87:8
chr2 179446218 . ATACT AT,ATA . . SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.;CI-SpliceAI=AT|ENSG00000155657|DG:0.05:-11|DL:0.22:8,ATA|ENSG00000155657|DL:0.60:8
chr2 179642185 . G A . . SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;CI-SpliceAI=A|ENSG00000155657|DL:0.77:-38|DG:0.61:2
chr19 38958362 . C T . . SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;CI-SpliceAI=T|ENSG00000196218|DG:0.98:-2
chr21 47406854 . CCA C . . SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;CI-SpliceAI=C|ENSG00000142156|AG:0.03:-19|AL:0.98:4
chr21 47406856 . A AT . . SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;CI-SpliceAI=AT|ENSG00000142156|AG:0.02:-21|AL:0.98:2
chrX 129274636 . A C,G,T . . SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;CI-SpliceAI=C|ENSG00000156709|AL:0.06:-44,G|ENSG00000156709|AL:0.06:-44,T|ENSG00000156709|AL:0.03:-44
7 changes: 6 additions & 1 deletion man/format_cispliceai_thresh.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/generate_combined_dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 39 additions & 5 deletions tests/testthat/test-annotate_mut_effect.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp

gene_transcript_mapping <-
tibble::tibble(
gene_id = unlist(toy_transcripts_gr@elementMetadata$gene_id),
tx_name = toy_transcripts_gr@elementMetadata$tx_name
gene_id = unlist(toy_transcripts_gr$gene_id),
tx_name = toy_transcripts_gr$tx_name
)

pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")
Expand All @@ -45,12 +45,12 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp

# we need to do a dirty fix because different versions of gencode annotations in effect_df and toy_transcripts
toy_transcripts_gr_fix <- toy_transcripts_gr
toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr@elementMetadata$gene_id))
toy_transcripts_gr_fix$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr$gene_id))
effect_df$gene_id <- gsub("_.*", "", effect_df$gene_id)
effect_df$gene_id <- gsub("\\...*", "", effect_df$gene_id)
toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix@elementMetadata$gene_id))
toy_transcripts_gr_fix$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix$gene_id))
effect_df <- effect_df %>%
filter(gene_id %in% toy_transcripts_gr_fix@elementMetadata$gene_id)
filter(gene_id %in% toy_transcripts_gr_fix$gene_id)


# without gene mapping
Expand All @@ -72,6 +72,40 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp

})

test_that("annotate_mut_effect works on toy example with CI-SpliceAI with gene_mapping = TRUE", {

  # Lookup table relating every transcript name to the gene id stored in the
  # toy transcript annotation.
  gene_transcript_mapping <-
    tibble::tibble(
      gene_id = unlist(toy_transcripts_gr$gene_id),
      tx_name = toy_transcripts_gr$tx_name
    )

  # Parse the bundled CI-SpliceAI example and format it with gene ids attached.
  cispliceai_file <- system.file("extdata", "cispliceai_thresh_output.vcf", package = "splice2neo")
  df_raw <- parse_cispliceai_thresh(cispliceai_file)
  df_with_gene <- format_cispliceai_thresh(df_raw, transcripts_gr = toy_transcripts_gr)

  # without gene mapping
  annot_df <- annotate_mut_effect(df_with_gene, toy_transcripts, toy_transcripts_gr, gene_mapping = FALSE)
  # with gene mapping
  annot_df_map <- annotate_mut_effect(df_with_gene, toy_transcripts, toy_transcripts_gr, gene_mapping = TRUE)

  # expect_gte reports both row counts on failure (expect_true would only
  # report "not TRUE").
  expect_gte(nrow(annot_df_map), nrow(df_with_gene))

  # check that additional rows if gene_mapping = FALSE are because of not
  # fitting gene-transcript pair
  df_not_mapped <- annot_df %>%
    dplyr::select(mut_id, junc_id, tx_id, gene_id) %>%
    # get additional rows in not mapped table
    dplyr::anti_join(annot_df_map, by = c("mut_id", "junc_id", "tx_id")) %>%
    # map genes related to those transcripts
    dplyr::left_join(gene_transcript_mapping, by = c("tx_id" = "tx_name"), suffix = c("_cisplicai", "_mapped"))

  # Every row present only without gene mapping must pair a transcript with a
  # gene different from the one CI-SpliceAI reported.
  expect_true(all(df_not_mapped$gene_id_cisplicai != df_not_mapped$gene_id_mapped))

})



test_that("annotate_mut_effect works on empty tibble", {

spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")
Expand Down
79 changes: 79 additions & 0 deletions tests/testthat/test-combine_mut_junc.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,89 @@ test_that("combine_mut_junc works", {
"mmsplice" = mmsplice_annot_df
)

n_unique <- bind_rows(junc_data_list) %>% select(mut_id, junc_id, tx_id) %>% n_distinct()

df_comb <- combine_mut_junc(junc_data_list)

expect_true(nrow(df_comb) >= 1)

})

test_that("combine_mut_junc works if one of the input table is empty", {

  # get spliceAI junctions
  spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")

  spliceai_annot_df <- parse_spliceai(spliceai_file) %>%
    format_spliceai() %>%
    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)

  # get pangolin junctions
  pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")

  pangolin_annot_df <- parse_pangolin(pangolin_file) %>%
    format_pangolin() %>%
    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)

  # get mmsplice junctions
  mmsplice_file <- system.file("extdata", "mmsplice_pred.csv", package = "splice2neo")
  mmsplice_df <- parse_mmsplice(mmsplice_file)
  mmsplice_annot_df <- mmsplice_df %>%
    annotate_mmsplice(toy_transcripts)

  # Zero-row table that keeps the mmsplice column layout; namespaced so that
  # stats::filter can never be picked up instead of the dplyr verb.
  mmsplice_annot_df_empty <- mmsplice_annot_df %>%
    dplyr::filter(dplyr::row_number() < 1)

  junc_data_list <- list(
    "spliceai" = spliceai_annot_df,
    "pangolin" = pangolin_annot_df,
    "mmsplice" = mmsplice_annot_df_empty
  )

  junc_data_list_wo_mmsplice <- list(
    "spliceai" = spliceai_annot_df,
    "pangolin" = pangolin_annot_df
  )

  df_comb <- combine_mut_junc(junc_data_list)
  df_comb_wo_mmsplice <- combine_mut_junc(junc_data_list_wo_mmsplice)

  # An empty tool table must not add or remove junction rows.
  expect_gte(nrow(df_comb), 1)
  expect_equal(nrow(df_comb), nrow(df_comb_wo_mmsplice))

  # column detected returned for all tools?
  expect_length(grep("detected", names(df_comb)), length(junc_data_list))

})


test_that("combine_mut_junc provides expected number of rows", {

  # get spliceAI junctions
  spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")

  spliceai_annot_df <- parse_spliceai(spliceai_file) %>%
    format_spliceai() %>%
    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)

  # get pangolin junctions
  pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")

  pangolin_annot_df <- parse_pangolin(pangolin_file) %>%
    format_pangolin() %>%
    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)

  # Reference result: a plain full join of both tools on the junction key.
  merged_df <- spliceai_annot_df %>%
    dplyr::full_join(pangolin_annot_df, by = c("mut_id", "tx_id", "junc_id"), relationship = "many-to-many")

  junc_data_list_wo_mmsplice <- list(
    "spliceai" = spliceai_annot_df,
    "pangolin" = pangolin_annot_df
  )

  df_comb <- combine_mut_junc(junc_data_list_wo_mmsplice)

  # expect_equal reports both row counts when they differ.
  expect_equal(nrow(df_comb), nrow(merged_df))

})
19 changes: 19 additions & 0 deletions tests/testthat/test-format_cispliceai.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,23 @@ test_that("format_cispliceai_thresh works on CI-SpliceAI example file", {

expect_true(nrow(df) >= 10)

expected_columns <- c("mut_id", "effect", "score", "chr", "pos_rel", "pos")

expect_equal( names(df), expected_columns )

})

test_that("format_cispliceai_thresh works on CI-SpliceAI example file with keep_gene_id option", {

  # Parse the bundled CI-SpliceAI example once and format it both ways.
  parsed_variants <- parse_cispliceai_thresh(
    system.file("extdata", "cispliceai_thresh_output.vcf", package = "splice2neo")
  )
  formatted_plain <- format_cispliceai_thresh(parsed_variants)
  formatted_with_gene <- format_cispliceai_thresh(
    parsed_variants,
    transcripts_gr = toy_transcripts_gr
  )

  # Supplying transcripts_gr must leave the number of rows unchanged ...
  expect_equal(nrow(formatted_plain), nrow(formatted_with_gene))

  # ... and append gene_id as the last column.
  expect_equal(
    names(formatted_with_gene),
    c("mut_id", "effect", "score", "chr", "pos_rel", "pos", "gene_id")
  )

})
2 changes: 2 additions & 0 deletions tests/testthat/test-liftover_junc_id.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test_that("liftover_junc_id works on toy example data", {

chain_file = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
stopifnot(chain_file != "")

junc_df <- toy_junc_df

Expand All @@ -16,6 +17,7 @@ test_that("liftover_junc_id works on toy example data", {
test_that("liftover_junc_id works with non-unique mappings", {

chain_file = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
stopifnot(chain_file != "")

junc_df <- toy_junc_df %>%
head(3) %>%
Expand Down

0 comments on commit 357821d

Please sign in to comment.