-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-overlapping-reads.py
executable file
·79 lines (66 loc) · 2.4 KB
/
extract-overlapping-reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
import os
import sys
import glob
from Bio.SeqIO.QualityIO import FastqGeneralIterator
accession, seeds_file = sys.argv[1:]
K=40
END_LENGTH=100
def rc(s):
return "".join({'T':'A',
'G':'C',
'A':'T',
'C':'G',
'N':'N'}[x] for x in reversed(s))
# kmer -> [target, should_rc]
to_find = {}
def search_for_rc(kmer, target, should_rc):
if kmer in to_find:
to_find[kmer] = None, None
else:
to_find[kmer] = target, should_rc
def search_for(kmer, target):
search_for_rc(kmer, target, False)
search_for_rc(rc(kmer), target, True)
def kmers(seq):
for i in range(len(seq) - K + 1):
yield seq[i:K+i]
files = {}
with open(seeds_file) as inf:
for line in inf:
target = line.split(".fasta")[0]
if os.path.exists("%s/final_contig.seq" % target):
continue
iteration = max(int(fname.split("/")[-1].split(".")[0])
for fname in glob.glob("%s/*.contig.seq" % target))
target_seq_fname = "%s/%s.contig.seq" % (target, iteration)
files[target] = open(
"%s/%s.%s.fasta" % (target, accession, iteration), "w")
with open(target_seq_fname) as seqf:
contig, = seqf
contig = contig.strip()
# As we keep iterating the middle of the contig is eventually
# settled, and we only care about the ends. If it's a short contig,
# extract reads matching anywhere, but if it's a long contig
# extract only for the ends.
if len(contig) <= (END_LENGTH+K)*2:
for kmer in kmers(contig):
search_for(kmer, target)
else:
for kmer in kmers(contig[:END_LENGTH+K]):
search_for(kmer, target)
for kmer in kmers(contig[-(END_LENGTH+K):]):
search_for(kmer, target)
for title, seq, quality in FastqGeneralIterator(sys.stdin):
targets = {} # target -> should_rc
for kmer in kmers(seq):
target, should_rc = to_find.get(kmer, (None, None))
if target:
targets[target] = should_rc
for target, should_rc in targets.items():
files[target].write(
">%s%s\n%s\n" % (
title,
":rc" if should_rc else "",
rc(seq) if should_rc else seq,
))