-
Notifications
You must be signed in to change notification settings - Fork 0
/
gene_exp_10x.py
126 lines (95 loc) · 2.84 KB
/
gene_exp_10x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def _read_nonblank_lines(filename):
    '''
    Returns the whitespace-stripped, non-empty lines of a text file.
    '''
    with open(filename, 'r') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def _label_duplicates(names):
    '''
    Appends '_1', '_2', ... (in order of appearance) to every name that occurs
    more than once in names; names that occur once are returned unchanged.
    '''
    from collections import Counter

    totals = Counter(names)
    seen = Counter()
    labeled = []
    for name in names:
        if totals[name] > 1:
            seen[name] += 1
            labeled.append(name + '_' + str(seen[name]))
        else:
            labeled.append(name)
    return labeled


def _parse_literals(labels):
    '''
    Parses every label as a Python literal (e.g. a tuple written with str()).
    This is all-or-nothing: if any label is not a valid literal, the original
    list of strings is returned unchanged.
    '''
    from ast import literal_eval

    try:
        return [literal_eval(label) for label in labels]
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # these are the errors literal_eval raises for non-literal input;
        # plain gene names / barcodes end up here and are kept as strings
        return labels


def load_gene_exp_to_df(inst_path):
    '''
    Loads gene expression data from 10x in sparse matrix format and returns a
    Pandas dataframe

    inst_path is used as a plain string prefix: the files read are
    inst_path + 'matrix.mtx', inst_path + 'genes.tsv' and
    inst_path + 'barcodes.tsv', so a directory must include its trailing
    path separator.

    Rows of the returned dataframe are labeled with genes and columns with
    cell barcodes:
    - each genes.tsv line is split on whitespace; the second field is used as
    the gene name when present, otherwise the first
    - gene names that occur more than once get a '_1', '_2', ... suffix
    - only the first tab-separated field of each barcodes.tsv line is used,
    truncated at the first '-'
    - blank lines in genes.tsv and barcodes.tsv are ignored
    - if all gene labels (or all barcode labels) parse as Python literals,
    e.g. tuples, the parsed values are used as labels instead of the strings
    '''
    import pandas as pd
    from scipy import io
    from scipy import sparse

    # matrix (mmread returns a dense array for array-format files, so only
    # densify when the result is actually sparse)
    matrix = io.mmread(inst_path + 'matrix.mtx')
    mat = matrix.toarray() if sparse.issparse(matrix) else matrix

    # genes
    ini_genes = []
    for inst_line in _read_nonblank_lines(inst_path + 'genes.tsv'):
        fields = inst_line.split()
        ini_genes.append(fields[1] if len(fields) > 1 else fields[0])

    # add unique id only to duplicate genes
    genes = _label_duplicates(ini_genes)

    # barcodes (remove dash suffix from barcodes if necessary)
    cell_barcodes = []
    for inst_line in _read_nonblank_lines(inst_path + 'barcodes.tsv'):
        inst_bc = inst_line.split('\t')[0]
        cell_barcodes.append(inst_bc.split('-')[0])

    # parse tuples if necessary
    cell_barcodes = _parse_literals(cell_barcodes)
    genes = _parse_literals(genes)

    # make dataframe
    df = pd.DataFrame(mat, index=genes, columns=cell_barcodes)

    return df
def save_gene_exp_to_mtx_dir(inst_path, df):
    '''
    Saves a dataframe as three files: genes.tsv (the row labels),
    barcodes.tsv (the column labels) and matrix.mtx (the values as a sparse
    Matrix Market file).

    inst_path is used as a plain string prefix for the three filenames, so a
    directory must include its trailing path separator. The path is created
    if it does not exist yet.
    '''
    import os
    from scipy import io
    from scipy import sparse

    # an empty prefix means the current directory; os.makedirs('') would raise
    if inst_path and not os.path.exists(inst_path):
        os.makedirs(inst_path)

    genes = df.index.tolist()
    barcodes = df.columns.tolist()

    save_list_to_tsv(genes, inst_path + 'genes.tsv')
    save_list_to_tsv(barcodes, inst_path + 'barcodes.tsv')

    # DataFrame.get_values() was removed in pandas 1.0; .values works in
    # both old and new pandas versions
    mat_ge = df.values
    mat_ge_sparse = sparse.coo_matrix(mat_ge)

    io.mmwrite(inst_path + 'matrix.mtx', mat_ge_sparse)
def save_list_to_tsv(inst_list, filename):
    '''
    Writes each item of inst_list to filename, one per line, converting every
    item with str(). An existing file is overwritten.
    '''
    # the with-block closes the file even if str() or a write raises
    with open(filename, 'w') as f:
        f.writelines(str(inst_line) + '\n' for inst_line in inst_list)