forked from ctb/eel-pond
-
Notifications
You must be signed in to change notification settings - Fork 0
/
accession_lookup.py
53 lines (40 loc) · 1.34 KB
/
accession_lookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
#
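# argv[1]: genes of interest (tsv with an 'accession' column; the 'symbol'
#          column is used when reporting skipped rows)
# argv[2]: output path; the fetched FASTA records are written here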
import csv
import sys
from Bio import Entrez, SeqIO
# Look up protein FASTA records at NCBI for the accessions listed in a TSV.
#
# Reads the 'accession' column of the TSV given as argv[1], searches the
# Entrez `protein` database for all of them in one query (results are kept on
# the NCBI history server), then downloads the matches as FASTA into argv[2].
# Rows whose accession is 'N/A' (or missing) are reported and skipped.

# Entrez requests carry a contact address.
# NOTE(review): the value below looks like a redacted placeholder -- confirm
# it is a real mailbox before running this against NCBI.
Entrez.email = '[email protected]'

# Upper bound on accessions per search; anything larger would require paging.
MAX_ACCESSIONS = 100000

if len(sys.argv) < 3:
    sys.exit('usage: %s <genes.tsv> <out.fasta>' % sys.argv[0])
in_path, out_path = sys.argv[1], sys.argv[2]

# The 'U' (universal newlines) flag only matters on Python 2; Python 3 text
# mode already translates newlines and Python 3.11+ rejects the flag outright.
read_mode = 'rU' if sys.version_info[0] < 3 else 'r'

with open(in_path, read_mode) as in_file, open(out_path, 'w') as out_file:
    goi = csv.DictReader(in_file, delimiter='\t')
    if 'accession' not in (goi.fieldnames or []):
        sys.exit("%s: no 'accession' column found in header" % in_path)

    accessions = []
    for row in goi:
        accession = row['accession']
        # A short row yields None here; treat it like an explicit 'N/A'.
        if not accession or accession == 'N/A':
            print('Skipping `%s\' - No applicable accession number.'
                  % (row.get('symbol', '?')))
            continue
        accessions.append(accession + '[accn]')

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not accessions:
        sys.exit('%s: no usable accession numbers, nothing to fetch' % in_path)
    if len(accessions) >= MAX_ACCESSIONS:
        sys.exit('%s: %d accessions; %d or more would require paging'
                 % (in_path, len(accessions), MAX_ACCESSIONS))

    print('\nSearching by accession in `protein` database...')
    search_opts = {
        'db': 'protein',
        'term': ' OR '.join(accessions),
        # Keep the result set server-side so efetch can refer to it by key.
        'usehistory': 'y',
    }
    search = Entrez.esearch(**search_opts)
    try:
        print('Parsing search results...')
        result = Entrez.read(search)
    finally:
        # Release the handle even if parsing the response fails.
        search.close()
    print('Done!')

    print('\nFetching FASTA records...')
    fetch_opts = {
        'db': 'protein',
        'query_key': result['QueryKey'],
        'webenv': result['WebEnv'],
        'retmode': 'text',
        'rettype': 'fasta',
    }
    fetch = Entrez.efetch(**fetch_opts)
    try:
        out_file.write(fetch.read())
    finally:
        # Release the handle even if the download or the write fails.
        fetch.close()

    print('Done!\nFASTA records saved to %s' % (out_path))