-
Notifications
You must be signed in to change notification settings - Fork 78
/
aligner_metrics.h
375 lines (338 loc) · 11.2 KB
/
aligner_metrics.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/*
* aligner_metrics.h
*/
#ifndef ALIGNER_METRICS_H_
#define ALIGNER_METRICS_H_
#include <math.h>
#include <iostream>
#include "alphabet.h"
#include "sstring.h"
#include "timer.h"
using namespace std;
/**
* Borrowed from http://www.johndcook.com/standard_deviation.html,
* which in turn is borrowed from Knuth.
*/
class RunningStat {
public:
RunningStat() : m_n(0), m_tot(0.0) { }
void clear() {
m_n = 0;
m_tot = 0.0;
}
void push(float x) {
m_n++;
m_tot += x;
// See Knuth TAOCP vol 2, 3rd edition, page 232
if (m_n == 1) {
m_oldM = m_newM = x;
m_oldS = 0.0;
} else {
m_newM = m_oldM + (x - m_oldM)/m_n;
m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
// set up for next iteration
m_oldM = m_newM;
m_oldS = m_newS;
}
}
int num() const {
return m_n;
}
double tot() const {
return m_tot;
}
double mean() const {
return (m_n > 0) ? m_newM : 0.0;
}
double variance() const {
return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
}
double stddev() const {
return sqrt(variance());
}
private:
int m_n;
double m_tot;
double m_oldM, m_newM, m_oldS, m_newS;
};
/**
* Encapsulates a set of metrics that we would like an aligner to keep
* track of, so that we can possibly use it to diagnose performance
* issues.
*/
class AlignerMetrics {
public:
AlignerMetrics() :
curBacktracks_(0),
curBwtOps_(0),
first_(true),
curIsLowEntropy_(false),
curIsHomoPoly_(false),
curHadRanges_(false),
curNumNs_(0),
reads_(0),
homoReads_(0),
lowEntReads_(0),
hiEntReads_(0),
alignedReads_(0),
unalignedReads_(0),
threeOrMoreNReads_(0),
lessThanThreeNRreads_(0),
bwtOpsPerRead_(),
backtracksPerRead_(),
bwtOpsPerHomoRead_(),
backtracksPerHomoRead_(),
bwtOpsPerLoEntRead_(),
backtracksPerLoEntRead_(),
bwtOpsPerHiEntRead_(),
backtracksPerHiEntRead_(),
bwtOpsPerAlignedRead_(),
backtracksPerAlignedRead_(),
bwtOpsPerUnalignedRead_(),
backtracksPerUnalignedRead_(),
bwtOpsPer0nRead_(),
backtracksPer0nRead_(),
bwtOpsPer1nRead_(),
backtracksPer1nRead_(),
bwtOpsPer2nRead_(),
backtracksPer2nRead_(),
bwtOpsPer3orMoreNRead_(),
backtracksPer3orMoreNRead_(),
timer_(cout, "", false)
{ }
void printSummary() {
if(!first_) {
finishRead();
}
cout << "AlignerMetrics:" << endl;
cout << " # Reads: " << reads_ << endl;
float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0);
hopct *= 100.0;
cout << " % homo-polymeric: " << (hopct) << endl;
float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0);
lopct *= 100.0;
cout << " % low-entropy: " << (lopct) << endl;
float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0);
unpct *= 100.0;
cout << " % unaligned: " << (unpct) << endl;
float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0);
npct *= 100.0;
cout << " % with 3 or more Ns: " << (npct) << endl;
cout << endl;
cout << " Total BWT ops: avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
cout << " Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
time_t elapsed = timer_.elapsed();
cout << " BWT ops per second: " << (bwtOpsPerRead_.tot()/elapsed) << endl;
cout << " Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
cout << endl;
cout << " Homo-poly:" << endl;
cout << " BWT ops: avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
cout << " Low-entropy:" << endl;
cout << " BWT ops: avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
cout << " High-entropy:" << endl;
cout << " BWT ops: avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
cout << endl;
cout << " Unaligned:" << endl;
cout << " BWT ops: avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
cout << " Aligned:" << endl;
cout << " BWT ops: avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
cout << endl;
cout << " 0 Ns:" << endl;
cout << " BWT ops: avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
cout << " 1 N:" << endl;
cout << " BWT ops: avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
cout << " 2 Ns:" << endl;
cout << " BWT ops: avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
cout << " >2 Ns:" << endl;
cout << " BWT ops: avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
cout << " Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
cout << endl;
}
/**
*
*/
void nextRead(const BTDnaString& read) {
if(!first_) {
finishRead();
}
first_ = false;
float ent = entropyDna5(read);
curIsLowEntropy_ = (ent < 0.75f);
curIsHomoPoly_ = (ent < 0.001f);
curHadRanges_ = false;
curBwtOps_ = 0;
curBacktracks_ = 0;
// Count Ns
curNumNs_ = 0;
const size_t len = read.length();
for(size_t i = 0; i < len; i++) {
if((int)read[i] == 4) curNumNs_++;
}
}
inline float entropyDna5(const BTDnaString& read) {
size_t cs[5] = {0, 0, 0, 0, 0};
size_t readLen = read.length();
for(size_t i = 0; i < readLen; i++) {
int c = (int)read[i];
assert_lt(c, 5);
assert_geq(c, 0);
cs[c]++;
}
if(cs[4] > 0) {
// Charge the Ns to the non-N character with maximal count and
// then exclude them from the entropy calculation (i.e.,
// penalize Ns as much as possible)
if(cs[0] >= cs[1] && cs[0] >= cs[2] && cs[0] >= cs[3]) {
// Charge Ns to As
cs[0] += cs[4];
} else if(cs[1] >= cs[2] && cs[1] >= cs[3]) {
// Charge Ns to Cs
cs[1] += cs[4];
} else if(cs[2] >= cs[3]) {
// Charge Ns to Gs
cs[2] += cs[4];
} else {
// Charge Ns to Ts
cs[3] += cs[4];
}
}
float ent = 0.0;
for(int i = 0; i < 4; i++) {
if(cs[i] > 0) {
float frac = (float)cs[i] / (float)readLen;
ent += (frac * log(frac));
}
}
ent = -ent;
assert_geq(ent, 0.0);
return ent;
}
/**
*
*/
void setReadHasRange() {
curHadRanges_ = true;
}
/**
* Commit the running statistics for this read to
*/
void finishRead() {
reads_++;
if(curIsHomoPoly_) homoReads_++;
else if(curIsLowEntropy_) lowEntReads_++;
else hiEntReads_++;
if(curHadRanges_) alignedReads_++;
else unalignedReads_++;
bwtOpsPerRead_.push((float)curBwtOps_);
backtracksPerRead_.push((float)curBacktracks_);
// Drill down by entropy
if(curIsHomoPoly_) {
bwtOpsPerHomoRead_.push((float)curBwtOps_);
backtracksPerHomoRead_.push((float)curBacktracks_);
} else if(curIsLowEntropy_) {
bwtOpsPerLoEntRead_.push((float)curBwtOps_);
backtracksPerLoEntRead_.push((float)curBacktracks_);
} else {
bwtOpsPerHiEntRead_.push((float)curBwtOps_);
backtracksPerHiEntRead_.push((float)curBacktracks_);
}
// Drill down by whether it aligned
if(curHadRanges_) {
bwtOpsPerAlignedRead_.push((float)curBwtOps_);
backtracksPerAlignedRead_.push((float)curBacktracks_);
} else {
bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
backtracksPerUnalignedRead_.push((float)curBacktracks_);
}
if(curNumNs_ == 0) {
lessThanThreeNRreads_++;
bwtOpsPer0nRead_.push((float)curBwtOps_);
backtracksPer0nRead_.push((float)curBacktracks_);
} else if(curNumNs_ == 1) {
lessThanThreeNRreads_++;
bwtOpsPer1nRead_.push((float)curBwtOps_);
backtracksPer1nRead_.push((float)curBacktracks_);
} else if(curNumNs_ == 2) {
lessThanThreeNRreads_++;
bwtOpsPer2nRead_.push((float)curBwtOps_);
backtracksPer2nRead_.push((float)curBacktracks_);
} else {
threeOrMoreNReads_++;
bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
backtracksPer3orMoreNRead_.push((float)curBacktracks_);
}
}
// Running-total of the number of backtracks and BWT ops for the
// current read
uint32_t curBacktracks_;
uint32_t curBwtOps_;
protected:
bool first_;
// true iff the current read is low entropy
bool curIsLowEntropy_;
// true if current read is all 1 char (or very close)
bool curIsHomoPoly_;
// true iff the current read has had one or more ranges reported
bool curHadRanges_;
// number of Ns in current read
int curNumNs_;
// # reads
uint32_t reads_;
// # homo-poly reads
uint32_t homoReads_;
// # low-entropy reads
uint32_t lowEntReads_;
// # high-entropy reads
uint32_t hiEntReads_;
// # reads with alignments
uint32_t alignedReads_;
// # reads without alignments
uint32_t unalignedReads_;
// # reads with 3 or more Ns
uint32_t threeOrMoreNReads_;
// # reads with < 3 Ns
uint32_t lessThanThreeNRreads_;
// Distribution of BWT operations per read
RunningStat bwtOpsPerRead_;
RunningStat backtracksPerRead_;
// Distribution of BWT operations per homo-poly read
RunningStat bwtOpsPerHomoRead_;
RunningStat backtracksPerHomoRead_;
// Distribution of BWT operations per low-entropy read
RunningStat bwtOpsPerLoEntRead_;
RunningStat backtracksPerLoEntRead_;
// Distribution of BWT operations per high-entropy read
RunningStat bwtOpsPerHiEntRead_;
RunningStat backtracksPerHiEntRead_;
// Distribution of BWT operations per read that "aligned" (for
// which a range was arrived at - range may not have necessarily
// lead to an alignment)
RunningStat bwtOpsPerAlignedRead_;
RunningStat backtracksPerAlignedRead_;
// Distribution of BWT operations per read that didn't align
RunningStat bwtOpsPerUnalignedRead_;
RunningStat backtracksPerUnalignedRead_;
// Distribution of BWT operations/backtracks per read with no Ns
RunningStat bwtOpsPer0nRead_;
RunningStat backtracksPer0nRead_;
// Distribution of BWT operations/backtracks per read with one N
RunningStat bwtOpsPer1nRead_;
RunningStat backtracksPer1nRead_;
// Distribution of BWT operations/backtracks per read with two Ns
RunningStat bwtOpsPer2nRead_;
RunningStat backtracksPer2nRead_;
// Distribution of BWT operations/backtracks per read with three or
// more Ns
RunningStat bwtOpsPer3orMoreNRead_;
RunningStat backtracksPer3orMoreNRead_;
Timer timer_;
};
#endif /* ALIGNER_METRICS_H_ */