Skip to content

Commit

Permalink
Merge branch 'dev' into 'master'
Browse files Browse the repository at this point in the history
release 0.6.3

See merge request tron/splice2neo!106
  • Loading branch information
franla23 committed Jul 6, 2023
2 parents a100752 + 8f9f6eb commit 4f2473f
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 59 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: splice2neo
Title: Aberrant splice junction analysis with associated mutations
Version: 0.6.2
Version: 0.6.3
Authors@R:
c(
person(given = "Jonas",
Expand Down
5 changes: 4 additions & 1 deletion R/add_peptide.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,10 @@ add_peptide <- function(df, cds, flanking_size = 14, bsg = NULL, keep_ranges = F
dplyr::left_join(df_annotated_peptide, by = c("junc_id", "tx_id")) %>%
dplyr::mutate(
cds_description = ifelse(is.na(protein_wt), "no wt cds", cds_description)
)
) %>%
# return protein seq only until first stop codon
dplyr::mutate(protein = protein %>% str_extract("[^*]+"))


return(df)

Expand Down
3 changes: 2 additions & 1 deletion R/annotate_mut_effect.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ annotate_mut_effect <- function(effect_df, transcripts, transcripts_gr, gene_map
# remove predicted effects with missing values
filter(!is.na(left) & !is.na(right) & !is.na(tx_strand) & !is.na(chr)) %>%
# remove predicted effects outside of transcript range
filter((!is.na(downstream_start) & !is.na(downstream_end)) | (!is.na(upstream_start) & !is.na(upstream_end)))%>%
filter(!(is.na(downstream_start) & is.na(downstream_end))) %>%
filter(!(is.na(upstream_start) & is.na(upstream_end))) %>%

# add junction IDs
mutate(
Expand Down
27 changes: 11 additions & 16 deletions R/format_pangolin.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#'
#' Reformat the data for each annotated effect per row. As pangolin does not
#' provide information on donor or acceptor, use both for gain and loss annotations.
#' Effects with score = 0 are filtered out.
#' Effects with score = 0 are filtered out. The resulting `score` represents the absolute value of the Pangolin score.
#'
#' @param variants [tibble][tibble::tibble-package] with parsed
#' pangolin mutations from \code{\link{parse_pangolin}}
Expand All @@ -21,23 +21,14 @@ format_pangolin <- function(variants, keep_gene_id = FALSE){

# get all splicing affects for each variant in rows
variants %>%
pivot_longer(
cols = c("increase_pos", "increase_score", "decrease_pos", "decrease_score"),
names_sep = "_",
# names_to = c("effect", "score_pos"),
names_to = c("effect_direction", ".value"),
# names_pattern = "(DS|DP)_(\\w*)",
) %>%
dplyr::rename(pos_rel = pos) %>%
mutate(effect_direction = ifelse(pangolin_score < 0, "decrease", "increase")) %>%

# filter out effects without score or scores <= 0
filter(!is.na(score) & score > 0) %>%
# filter out effects without score or score == 0
filter(!is.na(pangolin_score) & pangolin_score != 0) %>%

# as no annotation on acceptor or donor existst in pangolin, both are considered
left_join(
pangolin_effect_translation,
by = "effect_direction"
) %>%
left_join(pangolin_effect_translation,
by = "effect_direction") %>%

# add unique IDs for mutation
mutate(
Expand All @@ -46,10 +37,14 @@ format_pangolin <- function(variants, keep_gene_id = FALSE){
pos = as.integer(POS) + pos_rel
) %>%

# use only absolute value of pangolin score as the +/- is now integrated int the effect column
mutate(score = abs(pangolin_score)) %>%
select(-pangolin_score) %>%

# keep only relevant columns
{
if (keep_gene_id)
dplyr::select(. ,mut_id, effect, score, chr, pos_rel, pos, gene_id)
dplyr::select(. , mut_id, effect, score, chr, pos_rel, pos, gene_id)
else
dplyr::select(. , mut_id, effect, score, chr, pos_rel, pos)
} %>%
Expand Down
51 changes: 31 additions & 20 deletions R/parse_pangolin.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,24 @@
parse_pangolin <- function(vcf_file){

# Format: Pangolin=ENSG00000237298.10_8|33:0.0|-50:0.0|Wa
# Format: gene|pos:largest_increase|pos:largest_decrease|
# Format: gene|pos:largest_increase|pos:largest_decrease|,gene|pos:largest_increase|pos:largest_decrease|
# if pangolin option -s was used, there can be multiple increase/decrease scores!

col_names <- c("gene_id",
"pos_largest_increase","pos_largest_decrease")

vcf <- vcfR::read.vcfR(vcf_file, verbose = FALSE)

# get fixed fields for variants
fix_df <- vcfR::getFIX(vcf) %>%
fix_df <- vcfR::getFIX(vcf)

# required for vcf with only one mutation
if(is.null(nrow(fix_df))){
fix_df <- fix_df %>%
t()
}

fix_df <- fix_df %>%
tibble::as_tibble() %>%
mutate(
Key = row_number()
Expand All @@ -35,33 +44,35 @@ parse_pangolin <- function(vcf_file){
pangolin_df <- vcf %>%
vcfR::extract_info_tidy(info_fields = c("Pangolin")) %>%
mutate(
annot = Pangolin %>%
annot_gene = Pangolin %>%

# remove warning massage
str_replace_all("Warnings:NoAnnotatedSitesToMaskForThisGene", "") %>%
str_replace_all("Warnings:", "") %>%

# extract each annotation per variant
str_extract_all("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|")
# extract all gene-wise annotations per variant
str_split(",")
) %>%
select(-Pangolin) %>%
unnest(annot) %>%
unnest(annot_gene) %>%
# pares individual annotation values
mutate(
gene_id = annot %>%
str_replace("([^|]*)\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|", "\\1"),
increase_pos = annot %>%
str_replace("[^|]*\\|(-?\\d+):\\d\\.+\\d+\\|-?\\d+:\\d\\.+\\d+\\|", "\\1") %>%
as.integer(),
increase_score = annot %>%
str_replace("[^|]*\\|-?\\d+:(\\d\\.+\\d+)\\|-?\\d+:\\d\\.+\\d+\\|", "\\1") %>%
as.numeric(),
decrease_pos = annot %>%
str_replace("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|(-?\\d+):\\d\\.+\\d+\\|", "\\1") %>%
gene_id = annot_gene %>%
str_replace("([^|]*)\\|-?\\d+:-?\\d\\.+\\d+\\|-?\\d+:-?\\d\\.+\\d+\\|", "\\1"),
# extract all annotations per variant - gene
annot = annot_gene %>%
str_extract_all("-?\\d+:-?\\d\\.+\\d+")
) %>%
unnest(annot) %>%
mutate(
pos_rel = annot %>%
str_replace("(-?\\d+):-?\\d\\.+\\d+", "\\1") %>%
as.integer(),
decrease_score = annot %>%
str_replace("[^|]*\\|-?\\d+:\\d\\.+\\d+\\|-?\\d+:(\\d\\.+\\d+)\\|", "\\1") %>%
as.numeric(),
)
pangolin_score = annot %>%
str_replace("-?\\d+:(-?\\d\\.+\\d+)", "\\1") %>%
as.numeric()
) %>%
select(-annot, -annot_gene)


fix_df %>%
Expand Down
20 changes: 10 additions & 10 deletions inst/extdata/spliceai_output.pangolin.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
##fileDate=20191004
##reference=GRCh37/hg19
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
##INFO=<ID=Pangolin,Number=.,Type=String,Description="Pangolin splice scores. Format: gene|pos:score_change|pos:score_change|...">
##INFO=<ID=Pangolin,Number=.,Type=String,Description="Pangolin splice scores. Format: gene|pos:score_change|pos:score_change|warnings,...">
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=249250621>
##contig=<ID=chr2,length=243199373>
Expand All @@ -29,13 +29,13 @@
##contig=<ID=chrX,length=155270560>
##contig=<ID=chrY,length=59373566>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 25000 . A C,G,T . . Pangolin=ENSG00000227232.5_3|8:0.01|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr2 152389953 . T A,C,G . . SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;Pangolin=ENSG00000183091.19_10|2:0.01|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr2 179415988 . C CA . . SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;Pangolin=ENSG00000237298.10_8|33:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|-26:0.08|-49:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr2 179446218 . ATACT A . . SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;Pangolin=ENSG00000237298.10_8|14:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|-7:0.08|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr1 25000 . A C,G,T . . Pangolin=ENSG00000227232.5_5|8:0.009999999776482582|-50:0.0|Warnings:
chr2 152389953 . T A,C,G . . SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3;Pangolin=ENSG00000183091.20_14|2:0.009999999776482582|3:-0.33000001311302185|Warnings:
chr2 179415988 . C CA . . SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29;Pangolin=ENSG00000237298.10_10|33:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|-26:0.07999999821186066|-1:-0.8600000143051147|Warnings:
chr2 179446218 . ATACT A . . SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8;Pangolin=ENSG00000237298.10_10|14:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|-7:0.08|8:-0.81|Warnings:
chr2 179446218 . ATACT AT,ATA . . SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.
chr2 179642185 . G A . . SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;Pangolin=ENSG00000237298.10_8|-2:0.0|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGeneENSG00000155657.27_6|2:0.54|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr19 38958362 . C T . . SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;Pangolin=ENSG00000196218.13_8|-2:0.85|-50:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr21 47406854 . CCA C . . SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;Pangolin=ENSG00000142156.15_4|-38:0.06|-40:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr21 47406856 . A AT . . SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;Pangolin=ENSG00000142156.15_4|-40:0.08|-42:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chrX 129274636 . A C,G,T . . SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;Pangolin=ENSG00000156709.14_6|-41:0.01|-49:0.0|Warnings:NoAnnotatedSitesToMaskForThisGene
chr2 179642185 . G A . . SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38;Pangolin=ENSG00000237298.10_10|-2:0.0|-50:0.0|Warnings:,ENSG00000155657.28_10|2:0.5400000214576721|-38:-0.3499999940395355|Warnings:
chr19 38958362 . C T . . SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31;Pangolin=ENSG00000196218.13_12|-2:0.8500000238418579|-50:0.0|Warnings:
chr21 47406854 . CCA C . . SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4;Pangolin=ENSG00000142156.16_6|-38:0.06|4:-0.82|Warnings:
chr21 47406856 . A AT . . SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2;Pangolin=ENSG00000142156.16_6|-40:0.07999999821186066|2:-0.8100000023841858|Warnings:
chrX 129274636 . A C,G,T . . SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45;Pangolin=ENSG00000156709.15_10|-41:0.009999999776482582|-44:-0.2800000011920929|Warnings:
2 changes: 1 addition & 1 deletion man/format_pangolin.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 13 additions & 8 deletions tests/testthat/test-annotate_mut_effect.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,27 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp
tx_name = toy_transcripts_gr@elementMetadata$tx_name
)


pangolin_file <- system.file("extdata", "spliceai_output.pangolin.vcf", package = "splice2neo")

effect_df <- parse_pangolin(pangolin_file)
# modify : same mutation for two distinct genes + different scores

effect_df <- effect_df %>% format_pangolin(keep_gene_id = TRUE)

# we need to do a dirty fix because different versions of gencode annotations in effect_df and toy_transcripts
toy_transcripts_gr_fix <- toy_transcripts_gr
toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("_.*", "", unlist(toy_transcripts_gr@elementMetadata$gene_id))
effect_df$gene_id <- gsub("_.*", "", effect_df$gene_id)
effect_df$gene_id <- gsub("\\...*", "", effect_df$gene_id)
toy_transcripts_gr_fix@elementMetadata$gene_id = gsub("\\...*", "", unlist(toy_transcripts_gr_fix@elementMetadata$gene_id))
effect_df <- effect_df %>%
filter(gene_id %in% toy_transcripts_gr_fix@elementMetadata$gene_id)


# without gene mapping
annot_df <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr, gene_mapping = FALSE)
# annot_df1 <- annot_df %>%
# mutate(id = paste0(mut_id, gene_id))
annot_df_map <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr, gene_mapping = TRUE)
# annot_df_map1 <- annot_df_map %>%
# mutate(id = paste0(mut_id, gene_id))
# with gene mapping
annot_df_map <- annotate_mut_effect(effect_df, toy_transcripts, toy_transcripts_gr_fix, gene_mapping = TRUE)

expect_true(nrow(annot_df) >= nrow(effect_df))
expect_true(nrow(annot_df_map) >= nrow(effect_df))
Expand All @@ -64,8 +71,6 @@ test_that("annotate_mut_effect works on toy example with pangolin with gene_mapp

expect_true(all(df_not_mapped$gene_id.x != df_not_mapped$gene_id))



})

test_that("annotate_mut_effect works on empty tibble", {
Expand Down
3 changes: 2 additions & 1 deletion tests/testthat/test-parse_pangolin.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ test_that("parse_pangolin works on provided example file", {
pangolin_df <- parse_pangolin(pangolin_file)

expect_true(nrow(pangolin_df) > 0)
expect_true(ncol(pangolin_df) == 11)

expect_true(all(
c("increase_pos", "increase_score", "decrease_pos", "decrease_score") %in% names(pangolin_df)
c("pos_rel", "pangolin_score") %in% names(pangolin_df)
))

})

0 comments on commit 4f2473f

Please sign in to comment.