-
Notifications
You must be signed in to change notification settings - Fork 0
/
LONG-Genome Assembly as Shortest Superstring.py
142 lines (118 loc) · 3.87 KB
/
LONG-Genome Assembly as Shortest Superstring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Rosalind Problems: Genome Assembly as Shortest Superstring
Problem ID: LONG
http://rosalind.info/problems/long/
For a collection of strings, a larger string containing every one of the smaller strings as a substring is called a superstring.
By the assumption of parsimony, a shortest possible superstring over a collection of reads serves as a candidate chromosome.
Given: At most 50 DNA strings of approximately equal length, not exceeding 1 kbp, in FASTA format (which represent reads deriving from the same strand of a single linear chromosome).
The dataset is guaranteed to satisfy the following condition: there exists a unique way to reconstruct the entire chromosome from these reads by gluing together pairs of reads that overlap by more than half their length.
Return: A shortest superstring containing all the given strings (thus corresponding to a reconstructed chromosome)
"""
'''
#traverse all permutations: too slow and memory load is too high
def shortestSuperstring(seq):
length = len(seq)
overlap_pool = [[0] * length for x in range(length)]
#print(overlap_pool)
original = {}
for i, x in enumerate(seq):
original[i] = x
for j, y in enumerate(seq):
if i!=j:
for ans in range(min(len(x),len(y)), -1, -1):
if x.endswith(y[:ans]):
overlap_pool[i][j] = ans
break
print('Overlap matrix is:')
for i in overlap_pool:
print(i)
#overlap_pool[i][j] is the number of max overlap characters. seq[i] gives suffix and seq[j] gives prefix.
#print(original)
keys = [i for i in original.keys()]
#full permutation: build all paths
def perm(data):
if len(data) == 1:
return [data]
r = []
for i in range(len(data)):
s = data[:i] + data[i+1:]
p = perm(s)
for x in p:
r.append(data[i:i+1]+x)
return r
paths = {}
for k,v in enumerate(perm(keys)):
paths.update({k:v})
print(paths)
print(paths)
scoreboard = {}
score = 0
for path in paths.keys():
for i in range(len(paths[path])-1):
if overlap_pool[paths[path][i]][paths[path][i+1]] > 5:
score = score + overlap_pool[paths[path][i]][paths[path][i+1]]
else:
break
scoreboard.update({path:score})
score = 0
print('Scoreboard:')
print(scoreboard) #Overlap chars for every possible path
#pick the highest score and get its path
highest_score = max(scoreboard.values())
best_path_no = [k for k,v in scoreboard.items() if v == highest_score]
best_path = []
for x in best_path_no:
best_path.append(paths[x])
#print('best path is:')
#print(best_path)
for alter_path in best_path:
overlap_path = []
for i in range(len(alter_path)-1):
overlap_path.append(overlap_pool[alter_path[i]][alter_path[i+1]])
#print(overlap_path)
for i in alter_path:
if alter_path.index(i) == 0:
print(seq[i], end='')
else:
print(seq[i][overlap_path[i-1]:], end='')
def getfile(file):
with open(file, 'r') as f:
seq = []
for line in f:
if line.startswith('>'):
pass
else:
seq.append(line.replace('\n',''))
return seq
seq = getfile('output.fasta')
shortestSuperstring(seq)
'''
def getfile(file):
with open(file, 'r') as f:
seq = []
for line in f:
if line.startswith('>'):
pass
else:
seq.append(line.replace('\n',''))
return seq
def concatenate_overlap(str1, str2):
less = min(len(str1), len(str2))
for i in range(less, int(less/2), -1):
if str1[-i:] == str2[:i]: #suffix of str1 == prefix of str2 (max overlap)
return str1 + str2[i:]
break
elif str2[-i:] == str1[:i]:
return str2 + str1[i:]
break
seq = getfile('output.fasta')
#print(seq)
while(len(seq)>1):
saver = (float('inf'),'','')
for x in seq[1:]:
temp = concatenate_overlap(seq[0],x)
if temp and (len(temp) < saver[0]):
saver = (len(temp), temp, x)
seq[0] = saver[1]
seq.pop(seq.index(saver[2]))
print(len(seq))