-
Notifications
You must be signed in to change notification settings - Fork 0
/
output.py
129 lines (114 loc) · 3.62 KB
/
output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/env python3.5
# -*- coding: utf-8 -*-
__author__ = "Tobias Tilly"
__name__ = "Output"
'''
The output module works with the structures created by the analyzer. It will
plot the statistics and write human readable output files, if necessary.
If the time allows it, error handling will happen here as well.
'''
import matplotlib.pyplot as plt
HUMAN_READABLE = {
'sample_count': 'Number of samples',
'record_count': 'Number of records',
'snp_count': 'Number of SNPs',
'indel_count': 'Number of INDELs',
'tstv': 'Ts/Tv ratio',
'tstv_1st_alt': 'Ts/TV ratio of the 1st ALT',
}
""" Takes the statistic summary parsed by the parser module and print it to the
command line.
Args:
stat_dict (dictionary):
the parsed output of the bcftools stats command
returns:
n/a
ToDo:
How to represent the allele frequencies with 20 or more bins
"""
def print_statistic_summary(stat_dict):
v = stat_dict.pop('filename')
output = "Statistic summary for {}: \n".format(v)
for k in stat_dict:
output += "\t{}:\t{}\n".format(HUMAN_READABLE[k], stat_dict[k].rstrip())
print(output)
def plot_allele_frequencies(af_stats, legend, output, maf=False):
af_bins = []
n_of_snps = []
n_of_indels = []
for line in af_stats:
line = line.split('\t')
# collect data if maf is True and af is <= 0.5 or maf is False
if not(maf and line[2] > 0.5):
af_bins.append(line[2])
n_of_snps.append(line[3])
n_of_indels.append(line[6])
else:
break
fig = plt.figure()
plot1 = fig.add_subplot(211)
plot1.set_title('SNPs per allele frequency')
plot1.semilogy(af_bins, n_of_snps, marker='D', label=legend)
plot1.set_ylabel('Number of SNPs')
plot1.set_xlabel('Allele frequencies')
plot1.legend()
plot2 = fig.add_subplot(212)
plot2.set_title('INDELs per allele frequency')
plot2.semilogy(af_bins, n_of_indels, marker='D', label=legend)
plot2.set_ylabel('Number of INDELs')
plot2.set_xlabel('Allele frequencies')
plot2.legend()
if output is not '':
plt.savefig(output)
else:
plt.show()
def plot_allele_frequency_comparison(stats1, stats2, legend, output, maf=False):
af_bins1 = []
n_of_snps1 = []
n_of_indels1 = []
af_bins2 = []
n_of_snps2 = []
n_of_indels2 = []
for line in stats1:
line = line.split('\t')
# collect data if maf is True and af is <= 0.5 or maf is False
if not(maf and float(line[2]) > 0.5):
af_bins1.append(line[2])
n_of_snps1.append(line[3])
n_of_indels1.append(line[6])
else:
break
for line in stats2:
line = line.split('\t')
# collect data if maf is True and af is <= 0.5 or maf is False
if not(maf and float(line[2]) > 0.5):
af_bins2.append(line[2])
n_of_snps2.append(line[3])
n_of_indels2.append(line[6])
else:
break
fig = plt.figure()
plot1 = fig.add_subplot(211)
plot1.set_title('SNPs per allele frequency')
plot1.semilogy(af_bins1, n_of_snps1, marker='D', color='red', label=legend[0])
plot1.semilogy(af_bins2, n_of_snps2, marker='*', color='blue', label=legend[1])
plot1.set_ylabel('Number of SNPs')
plot1.set_xlabel('Allele frequencies')
plot1.legend()
plot2 = fig.add_subplot(212)
plot2.set_title('INDELs per allele frequency')
plot2.semilogy(af_bins1, n_of_indels1, marker='D', color='red', label=legend[0])
plot2.semilogy(af_bins2, n_of_indels2, marker='*', color='blue', label=legend[1])
plot2.set_ylabel('Number of INDELs')
plot2.set_xlabel('Allele frequencies')
plot2.legend()
if output is not '':
plt.savefig(output)
else:
plt.show()
def print_hwe_stat(kept, total):
kept = int(kept)
total = int(total)
kept = total-kept
print("HWE Quality:")
print("{} out of {} Sites have a HWE below 0.05 and an MAF of 0.5".format(kept, total))