-
Notifications
You must be signed in to change notification settings - Fork 0
/
fusion_genes_compare.py
executable file
·111 lines (96 loc) · 4.26 KB
/
fusion_genes_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
import os
import re
import argparse
import sys
def read_files_store_data(input_files,output_file):
    """Collect fusion names from the given tool outputs and write a report.

    Files are recognised by their name ending:

    * ``star-fusion.fusion_candidates.final.abridged`` -- STAR-Fusion output;
      lines starting with ``#`` are skipped and the first tab-separated
      column is used as the fusion name.
    * ``summary_candidate_fusions.txt`` -- FusionCatcher summary; only the
      bullet lines are read and the fourth space-separated token is used as
      the fusion name.

    Any other file is reported on stdout and ignored.

    Each fusion is labelled 'STAR', 'FusionCatcher' or 'Both' and the
    resulting mapping is handed to ``make_report`` together with
    ``output_file``.

    A fusion is labelled 'Both' only when it was seen in at least one file of
    *each* tool.  Repeated entries from the same tool (for example several
    rows for one gene pair) no longer promote a fusion to 'Both'.  Names are
    stripped of trailing whitespace before comparison so that a trailing
    newline on one side does not hide a match.
    """
    star_seen = set()
    fc_seen = set()
    first_seen = []  # fusion names in the order they were first encountered

    def remember(raw_name, seen_by_tool):
        # Normalise before comparing: a name taken from the end of a line
        # still carries its newline.
        name = raw_name.rstrip()
        if not name:
            # Blank line / empty token: not a fusion.
            return
        if name not in star_seen and name not in fc_seen:
            first_seen.append(name)
        seen_by_tool.add(name)

    for input_file in input_files:
        if input_file.endswith("star-fusion.fusion_candidates.final.abridged"):
            # We have a STAR-Fusion file
            with open(input_file, 'r') as f:
                for line in f:
                    if line.startswith("#"):
                        # Skip header
                        continue
                    # If we want to store metadata, that can be inserted here
                    remember(line.split("\t")[0], star_seen)
        elif input_file.endswith("summary_candidate_fusions.txt"):
            # We have a FusionCatcher file
            with open(input_file, 'r') as f:
                for line in f:
                    # NOTE(review): the bullet prefix and the token index
                    # below are kept exactly as found; confirm them against a
                    # real FusionCatcher summary (the amount of leading
                    # whitespace decides which token is the fusion name).
                    if line.startswith(" * "):
                        remember(line.split(" ")[3], fc_seen)
        else:
            # Call form works as a statement in Python 2 and a function in 3.
            print("Found file with incorect file ending, omitting file {}".format(input_file))

    fusion_dict = {}
    for name in first_seen:
        if name in star_seen and name in fc_seen:
            fusion_dict[name] = 'Both'
        elif name in star_seen:
            fusion_dict[name] = 'STAR'
        else:
            fusion_dict[name] = 'FusionCatcher'
    make_report(fusion_dict, output_file)
def group_NGI_files(input_files,outputfile):
    """Write one fusion comparison report per NGI sample.

    The sample name is the leading ``P<digits>_<digits>`` part of each file's
    basename.  Files whose basename does not start with such a name are
    ignored.  For every sample, its files are passed to
    ``read_files_store_data`` with the output path
    ``<sample>.fusion_comparison.txt`` (relative to the working directory).

    ``outputfile`` is accepted for interface compatibility but is not used;
    the per-sample output names are derived from the sample names.

    Files are grouped by the exact sample name extracted from the basename,
    so sample ``P1_1`` does not pick up the files of ``P1_10``, and every
    sample is processed once regardless of how many files it has.
    """
    sample_pattern = re.compile(r"^(P[0-9]+_[0-9]+)")
    sample_order = []      # samples in first-seen order
    files_by_sample = {}   # sample name -> list of its input files

    for input_file in input_files:
        match = sample_pattern.search(os.path.basename(input_file))
        if match is None:
            # Not an NGI-style file name.
            continue
        sample = match.group(1)
        if sample not in files_by_sample:
            files_by_sample[sample] = []
            sample_order.append(sample)
        files_by_sample[sample].append(input_file)

    for sample in sample_order:
        outfile = "{}.fusion_comparison.txt".format(sample)
        read_files_store_data(files_by_sample[sample], outfile)
def make_report(fusion_dict, output_file):
    """Write a tab-separated comparison report of detected fusions.

    ``fusion_dict`` maps a fusion name to one of 'Both', 'STAR' or
    'FusionCatcher'; entries with any other value are ignored.  The report
    written to ``output_file`` consists of:

    * two ``##`` lines with the number of fusions seen by STAR-Fusion and by
      FusionCatcher (fusions labelled 'Both' count towards each tool),
    * a ``##FUSIONCATCHER / STAR-FUSION / BOTH`` column header,
    * one row per index, with the n-th FusionCatcher-only, STAR-only and
      shared fusion side by side; shorter columns are padded with empty
      cells.

    Fusion names are stripped of trailing whitespace before being written.
    """
    gene_in_both = []
    gene_star_only = []
    gene_fc_only = []
    for fusion_gene in fusion_dict:
        origin = fusion_dict[fusion_gene]
        name = fusion_gene.rstrip()  # drop any newline carried over from parsing
        if origin == 'Both':
            gene_in_both.append(name)
        elif origin == 'STAR':
            gene_star_only.append(name)
        elif origin == 'FusionCatcher':
            gene_fc_only.append(name)

    len_star = len(gene_in_both) + len(gene_star_only)
    len_fc = len(gene_in_both) + len(gene_fc_only)

    lines = [
        "## Number of Fusion genes detected with STAR-fusion: {} \n".format(len_star),
        "## Number of Fusion genes detected with FusionCatcher: {} \n".format(len_fc),
        "##FUSIONCATCHER\tSTAR-FUSION\tBOTH\n",
    ]

    def cell(column, idx):
        # Empty cell once a column has run out of entries.
        return column[idx] if idx < len(column) else ''

    maxlen = max(len(gene_in_both), len(gene_star_only), len(gene_fc_only))
    # Iterate over every row, including the last one of the longest column.
    for idx in range(maxlen):
        lines.append("{}\t{}\t{}\n".format(
            cell(gene_fc_only, idx),
            cell(gene_star_only, idx),
            cell(gene_in_both, idx)))

    with open(output_file, 'w') as f:
        f.write("".join(lines))
if __name__ == "__main__":
    # Command-line entry point: parse the file arguments, then write the
    # per-sample reports followed by one report over all given files.
    parser = argparse.ArgumentParser(
        description="""Compare two list of fusion genes and give which fusions are found in both """
    )
    parser.add_argument(
        "input_files",
        metavar='Input file',
        nargs='+',
        default='.',
        help="Input files from STAR fusion and Fusion catcher ",
    )
    parser.add_argument(
        "output_file",
        metavar='Output file',
        nargs='?',
        default='fusion_comparison.txt',
        help="File to save output to. ",
    )
    args = parser.parse_args()

    # One report per NGI sample (named after the sample) ...
    group_NGI_files(args.input_files, args.output_file)
    # ... and one report built from every input file together.
    read_files_store_data(args.input_files, args.output_file)