Merge branch 'dev' into 'master'

release 0.6.3 See merge request tron/splice2neo!106
TRON-Bioinformatics · Jul 6, 2023 · 4f2473f · 4f2473f
2 parents a100752 + 8f9f6eb
commit 4f2473f
Show file tree

Hide file tree

Showing 9 changed files with 75 additions and 59 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: splice2neo
 Title: Aberrant splice junction analysis with associated mutations
-Version: 0.6.2
+Version: 0.6.3
 Authors@R: 
     c(
     person(given = "Jonas",

diff --git a/R/add_peptide.R b/R/add_peptide.R
@@ -181,7 +181,10 @@ add_peptide <- function(df, cds, flanking_size = 14, bsg = NULL, keep_ranges = F
     dplyr::left_join(df_annotated_peptide, by = c("junc_id", "tx_id")) %>%
     dplyr::mutate(
       cds_description = ifelse(is.na(protein_wt), "no wt cds", cds_description)
-    )
+    ) %>%
+  # return protein seq only until first stop codon
+    dplyr::mutate(protein = protein %>% str_extract("[^*]+"))
+
 
   return(df)
 

diff --git a/R/annotate_mut_effect.R b/R/annotate_mut_effect.R
@@ -96,7 +96,8 @@ annotate_mut_effect <- function(effect_df, transcripts, transcripts_gr, gene_map
       # remove predicted effects with missing values
       filter(!is.na(left) & !is.na(right) & !is.na(tx_strand) & !is.na(chr)) %>%
       # remove predicted effects outside of transcript range
-      filter((!is.na(downstream_start) & !is.na(downstream_end)) | (!is.na(upstream_start) & !is.na(upstream_end)))%>%
+      filter(!(is.na(downstream_start) & is.na(downstream_end))) %>%
+      filter(!(is.na(upstream_start) & is.na(upstream_end))) %>%
 
       # add junction IDs
       mutate(

diff --git a/R/format_pangolin.R b/R/format_pangolin.R
@@ -3,7 +3,7 @@
 #'
 #' Reformat the data for each annotated effect per row. As pangolin does not
 #' provide information on donor or acceptor, use both for gain and loss annotations.
-#' Effects with score = 0 are filtered out.
+#' Effects with score = 0 are filtered out. The resulting `score` represents the absolute value of the Pangolin score.
 #'
 #' @param variants [tibble][tibble::tibble-package] with parsed
 #' pangolin mutations from \code{\link{parse_pangolin}}
@@ -21,23 +21,14 @@ format_pangolin <- function(variants, keep_gene_id = FALSE){
 
   # get all splicing affects for each variant in rows
   variants %>%
-    pivot_longer(
-      cols = c("increase_pos", "increase_score", "decrease_pos", "decrease_score"),
-      names_sep = "_",
-      # names_to = c("effect", "score_pos"),
-      names_to = c("effect_direction", ".value"),
-      # names_pattern = "(DS|DP)_(\\w*)",
-    ) %>%
-    dplyr::rename(pos_rel = pos) %>%
+    mutate(effect_direction = ifelse(pangolin_score < 0, "decrease", "increase")) %>%
 
-    # filter out effects without score or scores <= 0
-    filter(!is.na(score) & score > 0) %>%
+    # filter out effects without score or score == 0
+    filter(!is.na(pangolin_score) & pangolin_score != 0) %>%
 
     # as no annotation on acceptor or donor existst in pangolin, both are considered
-    left_join(
-      pangolin_effect_translation,
-      by = "effect_direction"
-    ) %>%
+    left_join(pangolin_effect_translation,
+              by = "effect_direction") %>%
 
     # add unique IDs for mutation
     mutate(
@@ -46,10 +37,14 @@ format_pangolin <- function(variants, keep_gene_id = FALSE){
       pos = as.integer(POS) + pos_rel
     ) %>%
 
+    # use only absolute value of pangolin score as the +/- is now integrated int the effect column
+    mutate(score = abs(pangolin_score)) %>%
+    select(-pangolin_score) %>%
+
     # keep only relevant columns
     {
       if (keep_gene_id)
-        dplyr::select(. ,mut_id, effect, score, chr, pos_rel, pos, gene_id)
+        dplyr::select(. , mut_id, effect, score, chr, pos_rel, pos, gene_id)
       else
         dplyr::select(. , mut_id, effect, score, chr, pos_rel, pos)
     } %>%

diff --git a/R/parse_pangolin.R b/R/parse_pangolin.R
@@ -17,15 +17,24 @@
 parse_pangolin <- function(vcf_file){
 
   # Format: Pangolin=ENSG00000237298.10_8|33:0.0|-50:0.0|Wa
-  # Format: gene|pos:largest_increase|pos:largest_decrease|
+  # Format: gene|pos:largest_increase|pos:largest_decrease|,gene|pos:largest_increase|pos:largest_decrease|
+  # if pangolin option -s was used, there can be multiple increase/decrease scores!
 
   col_names <- c("gene_id",
                  "pos_largest_increase","pos_largest_decrease")
 
   vcf <- vcfR::read.vcfR(vcf_file, verbose = FALSE)
 
   # get fixed fields for variants
-  fix_df <-  vcfR::getFIX(vcf) %>%
+  fix_df <- vcfR::getFIX(vcf)
+
+  # required for vcf with only one mutation
+  if(is.null(nrow(fix_df))){
+    fix_df <- fix_df %>%
+      t()
+  }
+
+  fix_df <- fix_df %>%
     tibble::as_tibble() %>%
     mutate(
       Key = row_number()
@@ -35,33 +44,35 @@ parse_pangolin <- function(vcf_file){
   pangolin_df <- vcf %>%
     vcfR::extract_info_tidy(info_fields = c("Pangolin")) %>%
     mutate(
-      annot = Pangolin %>%
+      annot_gene = Pangolin %>%
 
         # remove warning massage
         str_replace_all("Warnings:NoAnnotatedSitesToMaskForThisGene", "") %>%
+        str_replace_all("Warnings:", "") %>%
 
-        # extract each annotation per variant
-        str_extract_all("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|")
+        # extract all gene-wise annotations per variant
+        str_split(",")
     ) %>%
     select(-Pangolin) %>%
-    unnest(annot) %>%
+    unnest(annot_gene) %>%
     # pares individual annotation values
     mutate(
-      gene_id = annot %>%
-        str_replace("([^|]*)\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|", "\\1"),
-      increase_pos = annot %>%
-        str_replace("[^|]*\\|(-?\\d+):\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|", "\\1") %>%
-        as.integer(),
-      increase_score = annot %>%
-        str_replace("[^|]*\\|-?\\d+:(\\d\\.+\\d+)\\|-?\\d+:\\d\\.+\\d+\\|", "\\1") %>%
-        as.numeric(),
-      decrease_pos = annot %>%
-        str_replace("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|(-?\\d+):\\d\\.+\\d+\\|", "\\1") %>%
+      gene_id = annot_gene %>%
+        str_replace("([^|]*)\\|-?\\d+:-?\\d\\.+\\d+\\|-?\\d+:-?\\d\\.+\\d+\\|", "\\1"),
+      # extract all annotations per variant - gene
+      annot = annot_gene %>%
+        str_extract_all("-?\\d+:-?\\d\\.+\\d+")
+    ) %>%
+    unnest(annot) %>%
+    mutate(
+      pos_rel = annot %>%
+        str_replace("(-?\\d+):-?\\d\\.+\\d+", "\\1") %>%
         as.integer(),
-      decrease_score = annot %>%
-        str_replace("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:(\\d\\.+\\d+)\\|", "\\1") %>%
-        as.numeric(),
-    )
+      pangolin_score = annot %>%
+        str_replace("-?\\d+:(-?\\d\\.+\\d+)", "\\1") %>%
+        as.numeric()
+    ) %>%
+    select(-annot, -annot_gene)
 
 
   fix_df %>%

diff --git a/inst/extdata/spliceai_output.pangolin.vcf b/inst/extdata/spliceai_output.pangolin.vcf
@@ -2,7 +2,7 @@
 ##fileDate=20191004
 ##reference=GRCh37/hg19
 ##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
-##INFO=<ID=Pangolin,Number=.,Type=String,Description="Pangolin splice scores. Format: gene|pos:score_change|pos:score_change|...">
+##INFO=<ID=Pangolin,Number=.,Type=String,Description="Pangolin splice scores. Format: gene|pos:score_change|pos:score_change|warnings,...">
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=249250621>
 ##contig=<ID=chr2,length=243199373>
@@ -29,13 +29,13 @@
 ##contig=<ID=chrX,length=155270560>
 ##contig=<ID=chrY,length=59373566>
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	25000	.	A	C,G,T	.	.	Pangolin=ENSG00000227232.5_3|8:0.01|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr2	152389953	.	T	A,C,G	.	.	SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;Pangolin=ENSG00000183091.19_10|2:0.01|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr2	179415988	.	C	CA	.	.	SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;Pangolin=ENSG00000237298.10_8|33:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|-26:0.08|-49:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr2	179446218	.	ATACT	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;Pangolin=ENSG00000237298.10_8|14:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|-7:0.08|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
+chr1	25000	.	A	C,G,T	.	.	Pangolin=ENSG00000227232.5_5|8:0.009999999776482582|-50:0.0|Warnings:
+chr2	152389953	.	T	A,C,G	.	.	SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;Pangolin=ENSG00000183091.20_14|2:0.009999999776482582|3:-0.33000001311302185|Warnings:
+chr2	179415988	.	C	CA	.	.	SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;Pangolin=ENSG00000237298.10_10|33:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|-26:0.07999999821186066|-1:-0.8600000143051147|Warnings:
+chr2	179446218	.	ATACT	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;Pangolin=ENSG00000237298.10_10|14:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|-7:0.08|8:-0.81|Warnings:
 chr2	179446218	.	ATACT	AT,ATA	.	.	SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.
-chr2	179642185	.	G	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;Pangolin=ENSG00000237298.10_8|-2:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|2:0.54|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr19	38958362	.	C	T	.	.	SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;Pangolin=ENSG00000196218.13_8|-2:0.85|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr21	47406854	.	CCA	C	.	.	SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;Pangolin=ENSG00000142156.15_4|-38:0.06|-40:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chr21	47406856	.	A	AT	.	.	SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;Pangolin=ENSG00000142156.15_4|-40:0.08|-42:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
-chrX	129274636	.	A	C,G,T	.	.	SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;Pangolin=ENSG00000156709.14_6|-41:0.01|-49:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
+chr2	179642185	.	G	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;Pangolin=ENSG00000237298.10_10|-2:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|2:0.5400000214576721|-38:-0.3499999940395355|Warnings:
+chr19	38958362	.	C	T	.	.	SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;Pangolin=ENSG00000196218.13_12|-2:0.8500000238418579|-50:0.0|Warnings:
+chr21	47406854	.	CCA	C	.	.	SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;Pangolin=ENSG00000142156.16_6|-38:0.06|4:-0.82|Warnings:
+chr21	47406856	.	A	AT	.	.	SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;Pangolin=ENSG00000142156.16_6|-40:0.07999999821186066|2:-0.8100000023841858|Warnings:
+chrX	129274636	.	A	C,G,T	.	.	SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;Pangolin=ENSG00000156709.15_10|-41:0.009999999776482582|-44:-0.2800000011920929|Warnings:
diff --git a/man/format_pangolin.Rd b/man/format_pangolin.Rd
diff --git a/tests/testthat/test-annotate_mut_effect.R b/tests/testthat/test-annotate_mut_effect.R
@@ -35,20 +35,27 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
       tx_name = toy_transcripts_gr@elementMetadata$tx_name
     )
 
-
   pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")
 
   effect_df <- parse_pangolin(pangolin_file)
   # modify : same mutation for two distinct genes + different scores
 
   effect_df <- effect_df %>% format_pangolin(keep_gene_id = TRUE)
 
+  # we need to do a dirty fix because different versions of gencode annotations in effect_df and toy_transcripts
+  toy_transcripts_gr_fix <- toy_transcripts_gr
+  toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr@elementMetadata$gene_id))
+  effect_df$gene_id <- gsub("_.*", "", effect_df$gene_id)
+  effect_df$gene_id <- gsub("\\...*", "", effect_df$gene_id)
+  toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix@elementMetadata$gene_id))
+  effect_df <- effect_df %>%
+    filter(gene_id %in% toy_transcripts_gr_fix@elementMetadata$gene_id)
+
+
+  # without gene mapping
   annot_df <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr, gene_mapping = FALSE)
-  # annot_df1 <- annot_df %>%
-  #   mutate(id = paste0(mut_id,  gene_id))
-  annot_df_map <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr, gene_mapping = TRUE)
-  # annot_df_map1 <- annot_df_map %>%
-  #   mutate(id = paste0(mut_id,  gene_id))
+  # with gene mapping
+  annot_df_map <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr_fix, gene_mapping = TRUE)
 
   expect_true(nrow(annot_df) >= nrow(effect_df))
   expect_true(nrow(annot_df_map) >= nrow(effect_df))
@@ -64,8 +71,6 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
 
   expect_true(all(df_not_mapped$gene_id.x != df_not_mapped$gene_id))
 
-
-
 })
 
 test_that("annotate_mut_effect works on empty tibble", {

diff --git a/tests/testthat/test-parse_pangolin.R b/tests/testthat/test-parse_pangolin.R
@@ -4,9 +4,10 @@ test_that("parse_pangolin works on provided example file", {
   pangolin_df <- parse_pangolin(pangolin_file)
 
   expect_true(nrow(pangolin_df) > 0)
+  expect_true(ncol(pangolin_df) == 11)
 
   expect_true(all(
-    c("increase_pos", "increase_score", "decrease_pos", "decrease_score") %in% names(pangolin_df)
+    c("pos_rel", "pangolin_score") %in% names(pangolin_df)
     ))
 
 })