Merge branch 'dev' into 'master'

prepare release 0.6.9 See merge request tron/bnt_neoants/splice2neo!131
TRON-Bioinformatics · Jul 1, 2024 · 357821d · 357821d
2 parents dfd87f6 + ae7245e
commit 357821d
Show file tree

Hide file tree

Showing 12 changed files with 198 additions and 37 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -11,8 +11,10 @@ testing:
       - testing.log
   stage: check
   script:
+    - sudo apt-get update
+    - sudo apt-get install -y qpdf
     - R -e 'install.packages("devtools", verbose = FALSE, quiet = TRUE)'
-    - R -e 'devtools::install(quiet=TRUE)'
+    - R -e 'devtools::install(quiet=TRUE, dependencies = TRUE)'
     - R -e 'devtools::check()'
 
 pages:

diff --git a/R/annotate_mut_effect.R b/R/annotate_mut_effect.R
@@ -134,8 +134,8 @@ annotate_mut_effect <- function(effect_df,
       # gene and transcripts
       gene_transcript_mapping <-
         tibble::tibble(
-          gene_id = unlist(transcripts_gr@elementMetadata$gene_id),
-          tx_name = transcripts_gr@elementMetadata$tx_name
+          gene_id = unlist(transcripts_gr$gene_id),
+          tx_name = transcripts_gr$tx_name
         )
 
       # consider only transcripts that relate to gene that is provided in the

diff --git a/R/combine_mut_junc.R b/R/combine_mut_junc.R
@@ -35,22 +35,17 @@ combine_mut_junc <- function(junc_data_list){
     # select distinct junctions by the indicator columns
     purrr::map(dplyr::distinct, mut_id, tx_id, junc_id) %>%
 
-    # combine into a single data.frame with tool column
-    dplyr::bind_rows(.id = "tool") %>%
+    # add a `<name>_detected` column (set to TRUE) for each tool in the input list
+    purrr::imap(~dplyr::mutate(.x, "{.y}_detected" := TRUE)) %>%
 
-    # mark as detected
-    mutate(detected = TRUE) %>%
+    # join lists
+    purrr::reduce(dplyr::full_join, by = c("mut_id", "tx_id", "junc_id")) %>%
 
-    # expand by junction and tool
+    # set not detected to FALSE
     complete(
       nesting(mut_id, tx_id, junc_id),
-      tool,
-      fill = list(detected = FALSE)
-    ) %>%
-
-    # add tools as separate columns
-    pivot_wider(names_from = "tool", values_from = "detected",
-                names_glue = "{.name}_detected")
+      fill = names(junc_data_list) %>% purrr::map( ~ FALSE) %>% set_names(paste0(names(junc_data_list), "_detected"))
+    )
 
 
   # rename annotation columns

diff --git a/R/format_cispliceai.R b/R/format_cispliceai.R
@@ -7,6 +7,10 @@
 #'
 #' @param cispliceai_variants [tibble][tibble::tibble-package] with parsed
 #' CI-SpliceAI mutations from \code{\link{parse_spliceai}}
+#' @param transcripts_gr *Optional*. A GRanges object with transcript ranges created by
+#'   `GenomicFeatures::transcripts(txdb)`, containing a `gene_id` metadata column. If provided, full gene ids
+#'   are annotated to the formatted table, which allows to consider only transcripts related to the annotated gene in `annotate_mut_effect`.
+#'   This parameter is optional.
 #'
 #' @return A [tibble][tibble::tibble-package] with splicing effects per row
 #'
@@ -17,7 +21,7 @@
 #'
 #' @seealso \code{\link{parse_cispliceai_thresh}}, \code{\link{annotate_mut_effect}}
 #' @export
-format_cispliceai_thresh <- function(cispliceai_variants){
+format_cispliceai_thresh <- function(cispliceai_variants, transcripts_gr = NULL){
 
   #format columns
   cispliceai_variants <- cispliceai_variants %>%
@@ -33,9 +37,29 @@ format_cispliceai_thresh <- function(cispliceai_variants){
       mut_id = str_c(CHROM, POS, REF, ALT, sep = "_"),
       chr = CHROM,
       pos = as.integer(POS) + pos_rel
-    ) %>%
+    )
 
-    # keep only relevant columns
-    dplyr::select(mut_id, effect, score, chr, pos_rel, pos) %>%
+  if(!is.null(transcripts_gr)){
+
+    stopifnot("gene_id" %in% names(S4Vectors::mcols(transcripts_gr)))
+
+    # CI-SpliceAI does not return the version part of the gene id if the gene table from CI-SpliceAI is used as annotation.
+    # If the gene_id should be kept, we need to map the gene ids
+    # based on the transcripts_gr object.
+    gene_table <- tibble::tibble(gene_id = unique(unlist(transcripts_gr$gene_id))) %>%
+      dplyr::mutate(gene_id_short = gsub("\\..*", "", gene_id))
+
+    formated_variants <- cispliceai_variants %>%
+      dplyr::left_join(gene_table, by = c("SYMBOL" = "gene_id_short")) %>%
+      dplyr::select(mut_id, effect, score, chr, pos_rel, pos, gene_id)
+
+  } else{
+
+    formated_variants <- cispliceai_variants %>%
+      dplyr::select(mut_id, effect, score, chr, pos_rel, pos)
+
+  }
+
+  formated_variants %>%
     dplyr::distinct()
 }
diff --git a/R/standardize_junctions_RNA_tools.R b/R/standardize_junctions_RNA_tools.R
@@ -9,7 +9,7 @@
 #'   format.
 #'
 #' @return A combined table with unique junctions. The columns
-#'   identified_by_{name} contains information which tools identified the given
+#'   identified_by_\{name\} contains information which tools identified the given
 #'   junction
 #'
 #' @examples

diff --git a/inst/extdata/cispliceai_thresh_output.vcf b/inst/extdata/cispliceai_thresh_output.vcf
@@ -27,14 +27,15 @@
 ##contig=<ID=chrX,length=155270560>
 ##contig=<ID=chrY,length=59373566>
 ##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
+##INFO=<ID=CI-SpliceAI,Number=.,Type=String,Description="CI-SpliceAI V1.1.0 variant annotation for a maximum distance of 50 nucleotides from the variant; masking off; remove nucs outside tx all effects. Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL (DS=Delta Score, DP=Delta Position, AG/AL=Acceptor Gain/Acceptor Loss, DG/DL=Donor Gain/Donor Loss).">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	25000	.	A	C,G,T	.	.	.   CI-SpliceAI=C|ENSG00000189409
-chr2	152389953	.	T	A,C,G	.	.	CI-SpliceAI=A|ENSG00000188157|DL:0.02:-18
-chr2	179415988	.	C	CA	.	.	CI-SpliceAI=CA|ENSG00000188157|AG:0.03:-41|AL:0.8:-2|DG:0.08:8|DL:0.5:12
-chr2	179446218	.	ATACT	A	.	.	CI-SpliceAI=A|ENSG00000162512|AL:0.02:27|AG:0.03:14|DG:0.02:-48
-chr2	179446218	.	ATACT	AT,ATA	.	.	CI-SpliceAI=AT|ENSG00000162512|AL:0.02:27
-chr2	179642185	.	G	A	.	.	CI-SpliceAI=A|ENSG00000259030,A|ENSG00000116783
-chr19	38958362	.	C	T	.	.	CI-SpliceAI=T|+,T|-
-chr21	47406854	.	CCA	C	.	.	CI-SpliceAI=C|ENSG00000118733|DL:0.03:14|AG:0.02:-48|AG:0.8:-2
-chr21	47406856	.	A	AT	.	.	CI-SpliceAI=AT|+|AL:0.02:27,AT|-|DL:0.5:12
-chrX	129274636	.	A	C,G,T	.	.	CI-SpliceAI=C|ENSG00000116299|AG:0.7:-41,G|ENSG00000116299|DG:0.03:10,T|ENSG00000116299||AL:0.03:40
+chr1	25000	.	A	C,G,T	.	.	CI-SpliceAI=C|+,C|-,G|+,G|-,T|+,T|-
+chr2	152389953	.	T	A,C,G	.	.	SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;CI-SpliceAI=A|ENSG00000183091|DL:0.96:3,C|ENSG00000183091|DL:0.90:3,G|ENSG00000183091|DL:0.97:3
+chr2	179415988	.	C	CA	.	.	SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;CI-SpliceAI=CA|ENSG00000155657|AL:1.00:-1|AG:0.05:0|AG:0.03:7|AG:0.04:9
+chr2	179446218	.	ATACT	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;CI-SpliceAI=A|ENSG00000155657|DG:0.03:-11|DL:0.87:8
+chr2	179446218	.	ATACT	AT,ATA	.	.	SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.;CI-SpliceAI=AT|ENSG00000155657|DG:0.05:-11|DL:0.22:8,ATA|ENSG00000155657|DL:0.60:8
+chr2	179642185	.	G	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;CI-SpliceAI=A|ENSG00000155657|DL:0.77:-38|DG:0.61:2
+chr19	38958362	.	C	T	.	.	SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;CI-SpliceAI=T|ENSG00000196218|DG:0.98:-2
+chr21	47406854	.	CCA	C	.	.	SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;CI-SpliceAI=C|ENSG00000142156|AG:0.03:-19|AL:0.98:4
+chr21	47406856	.	A	AT	.	.	SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;CI-SpliceAI=AT|ENSG00000142156|AG:0.02:-21|AL:0.98:2
+chrX	129274636	.	A	C,G,T	.	.	SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;CI-SpliceAI=C|ENSG00000156709|AL:0.06:-44,G|ENSG00000156709|AL:0.06:-44,T|ENSG00000156709|AL:0.03:-44
diff --git a/man/format_cispliceai_thresh.Rd b/man/format_cispliceai_thresh.Rd
diff --git a/man/generate_combined_dataset.Rd b/man/generate_combined_dataset.Rd
diff --git a/tests/testthat/test-annotate_mut_effect.R b/tests/testthat/test-annotate_mut_effect.R
@@ -31,8 +31,8 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
 
   gene_transcript_mapping <-
     tibble::tibble(
-      gene_id = unlist(toy_transcripts_gr@elementMetadata$gene_id),
-      tx_name = toy_transcripts_gr@elementMetadata$tx_name
+      gene_id = unlist(toy_transcripts_gr$gene_id),
+      tx_name = toy_transcripts_gr$tx_name
     )
 
   pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")
@@ -45,12 +45,12 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
 
   # we need to do a dirty fix because different versions of gencode annotations in effect_df and toy_transcripts
   toy_transcripts_gr_fix <- toy_transcripts_gr
-  toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr@elementMetadata$gene_id))
+  toy_transcripts_gr_fix$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr$gene_id))
   effect_df$gene_id <- gsub("_.*", "", effect_df$gene_id)
   effect_df$gene_id <- gsub("\\...*", "", effect_df$gene_id)
-  toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix@elementMetadata$gene_id))
+  toy_transcripts_gr_fix$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix$gene_id))
   effect_df <- effect_df %>%
-    filter(gene_id %in% toy_transcripts_gr_fix@elementMetadata$gene_id)
+    filter(gene_id %in% toy_transcripts_gr_fix$gene_id)
 
 
   # without gene mapping
@@ -72,6 +72,40 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
 
 })
 
+test_that("annotate_mut_effect works on toy example with CI-SpliceAI with gene_mapping = TRUE", {
+
+  gene_transcript_mapping <-
+    tibble::tibble(
+      gene_id = unlist(toy_transcripts_gr$gene_id),
+      tx_name = toy_transcripts_gr$tx_name
+    )
+
+  cispliceai_file <- system.file("extdata", "cispliceai_thresh_output.vcf", package = "splice2neo")
+  df_raw <- parse_cispliceai_thresh(cispliceai_file)
+  df_with_gene <- format_cispliceai_thresh(df_raw, transcripts_gr = toy_transcripts_gr)
+
+  # without gene mapping
+  annot_df <- annotate_mut_effect(df_with_gene, toy_transcripts, toy_transcripts_gr, gene_mapping = FALSE)
+  # with gene mapping
+  annot_df_map <- annotate_mut_effect(df_with_gene, toy_transcripts, toy_transcripts_gr, gene_mapping = TRUE)
+
+  expect_true(nrow(annot_df_map) >= nrow(df_with_gene))
+
+
+  # check that the additional rows obtained with gene_mapping = FALSE are due to non-matching gene-transcript pairs
+  df_not_mapped <- annot_df %>%
+    dplyr::select(mut_id, junc_id, tx_id, gene_id) %>%
+    #get additional rows in not mapped table
+    anti_join(annot_df_map, by = c("mut_id", "junc_id", "tx_id")) %>%
+    #map genes related to those transcripts
+    left_join(gene_transcript_mapping, by = c("tx_id" = "tx_name"), suffix = c("_cisplicai", "_mapped"))
+
+  expect_true(all(df_not_mapped$gene_id_cisplicai != df_not_mapped$gene_id_mapped))
+
+})
+
+
+
 test_that("annotate_mut_effect works on empty tibble", {
 
   spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")

diff --git a/tests/testthat/test-combine_mut_junc.R b/tests/testthat/test-combine_mut_junc.R
@@ -30,10 +30,89 @@ test_that("combine_mut_junc works", {
     "mmsplice" = mmsplice_annot_df
   )
 
+  n_unique <- bind_rows(junc_data_list) %>% select(mut_id, junc_id, tx_id) %>% n_distinct()
+
   df_comb <- combine_mut_junc(junc_data_list)
 
   expect_true(nrow(df_comb) >= 1)
 
 })
 
+test_that("combine_mut_junc works if one of the input table is empty", {
+
+  # get spliceAI junctions
+  spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")
+
+  spliceai_annot_df <- parse_spliceai(spliceai_file) %>%
+    format_spliceai() %>%
+    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)
+
+  # get pangolin junctions
+  pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")
+
+  pangolin_annot_df <- parse_pangolin(pangolin_file) %>%
+    format_pangolin() %>%
+    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)
 
+  # get mmsplice junctions
+  mmsplice_file <- system.file("extdata", "mmsplice_pred.csv", package = "splice2neo")
+  mmsplice_df <- parse_mmsplice(mmsplice_file)
+  mmsplice_annot_df <- mmsplice_df %>%
+    annotate_mmsplice(toy_transcripts)
+
+  mmsplice_annot_df_empty <- mmsplice_annot_df %>%
+    filter(row_number() < 1)
+
+  junc_data_list = list(
+    "spliceai" = spliceai_annot_df,
+    "pangolin" = pangolin_annot_df,
+    "mmsplice" = mmsplice_annot_df_empty
+  )
+
+  junc_data_list_wo_mmsplice = list(
+    "spliceai" = spliceai_annot_df,
+    "pangolin" = pangolin_annot_df
+  )
+
+  df_comb <- combine_mut_junc(junc_data_list)
+  df_comb_wo_mmsplice <- combine_mut_junc(junc_data_list_wo_mmsplice)
+
+  expect_true(nrow(df_comb) >= 1)
+  expect_true(nrow(df_comb) == nrow(df_comb_wo_mmsplice))
+
+  # column detected returned for all tools?
+  expect_true(length(grep("detected", names(df_comb))) == length(junc_data_list))
+
+})
+
+
+test_that("combine_mut_junc provides expected number of rows", {
+
+  # get spliceAI junctions
+  spliceai_file <- system.file("extdata", "spliceai_output.vcf", package = "splice2neo")
+
+  spliceai_annot_df <- parse_spliceai(spliceai_file) %>%
+    format_spliceai() %>%
+    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)
+
+  # get pangolin junctions
+  pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")
+
+  pangolin_annot_df <- parse_pangolin(pangolin_file) %>%
+    format_pangolin() %>%
+    annotate_mut_effect(toy_transcripts, toy_transcripts_gr)
+
+
+  merged_df <- spliceai_annot_df %>%
+    dplyr::full_join(pangolin_annot_df, by = c("mut_id", "tx_id", "junc_id"), relationship = "many-to-many")
+
+  junc_data_list_wo_mmsplice = list(
+    "spliceai" = spliceai_annot_df,
+    "pangolin" = pangolin_annot_df
+  )
+
+  df_comb <- combine_mut_junc(junc_data_list_wo_mmsplice)
+
+  expect_true(nrow(df_comb) == nrow(merged_df))
+
+})
diff --git a/tests/testthat/test-format_cispliceai.R b/tests/testthat/test-format_cispliceai.R
@@ -6,4 +6,23 @@ test_that("format_cispliceai_thresh works on CI-SpliceAI example file", {
 
   expect_true(nrow(df) >= 10)
 
+  expected_columns <- c("mut_id", "effect", "score", "chr", "pos_rel", "pos")
+
+  expect_equal( names(df), expected_columns )
+
+})
+
+test_that("format_cispliceai_thresh works on CI-SpliceAI example file with keep_gene_id option", {
+
+  cispliceai_file <- system.file("extdata", "cispliceai_thresh_output.vcf", package = "splice2neo")
+  df_raw <- parse_cispliceai_thresh(cispliceai_file)
+  df <- format_cispliceai_thresh(df_raw)
+  df_with_gene <- format_cispliceai_thresh(df_raw, transcripts_gr = toy_transcripts_gr )
+
+  expect_equal(nrow(df), nrow(df_with_gene))
+
+  expected_columns <- c("mut_id", "effect", "score", "chr", "pos_rel", "pos", "gene_id")
+
+  expect_equal( names(df_with_gene), expected_columns )
+
 })
diff --git a/tests/testthat/test-liftover_junc_id.R b/tests/testthat/test-liftover_junc_id.R
@@ -1,6 +1,7 @@
 test_that("liftover_junc_id works on toy example data", {
 
   chain_file = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
+  stopifnot(chain_file != "")
 
   junc_df <- toy_junc_df
 
@@ -16,6 +17,7 @@ test_that("liftover_junc_id works on toy example data", {
 test_that("liftover_junc_id works with non-unique mappings", {
 
   chain_file = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
+  stopifnot(chain_file != "")
 
   junc_df <- toy_junc_df %>%
     head(3) %>%