Skip to content

Commit

Permalink
Merge pull request #23 from maxplanck-ie/fix_export
Browse files Browse the repository at this point in the history
fixed export functions
  • Loading branch information
fidelram committed Mar 23, 2016
2 parents aca6b5a + 8f868ec commit ab6664a
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 37 deletions.
70 changes: 44 additions & 26 deletions hicexplorer/HiCMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def __init__(self, matrixFile=None, format=None, skiprows=None, chrnameList=None
self.correction_factors = None # this value is set in case a matrix was iteratively corrected
self.non_homogeneous_warning_already_printed = False
self.distanceCounts = None # only defined when getCountsByDistance is called
self.bin_size = None
self.bin_size_homogeneous = None # track if the bins are equally spaced or not

if matrixFile:
self.nan_bins = np.array([])
Expand Down Expand Up @@ -146,16 +148,18 @@ def getBinSize(self):
bin at the en of the chromosomes) a warning is issued.
In case of uneven bins, the median is returned.
"""
chrom, start, end, extra = zip(*self.cut_intervals)
median = int(np.median(np.diff(start)))
diff = np.array(end) - np.array(start)
# check if the bin size is homogeneous
if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
if self.non_homogeneous_warning_already_printed is False:
sys.stderr.write('WARNING: bin size is not homogeneous. Median {}\n'.format(median))
self.non_homogeneous_warning_already_printed = True
# raise Exception('bin size is not homogeneous')
return median
if self.bin_size is None:
chrom, start, end, extra = zip(*self.cut_intervals)
median = int(np.median(np.diff(start)))
diff = np.array(end) - np.array(start)
# check if the bin size is homogeneous
if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
self.bin_size_homogeneous = False
if self.non_homogeneous_warning_already_printed is False:
sys.stderr.write('WARNING: bin size is not homogeneous. Median {}\n'.format(median))
self.non_homogeneous_warning_already_printed = True
self.bin_size = median
return self.bin_size

def getDekkerBins(self, fileName):
"""
Expand Down Expand Up @@ -640,6 +644,9 @@ def save_bing_ren(self, fileName):
Saves the matrix using bing ren's
method which is chrom_name\tstart_bin\tend_bin\tvalues...
"""
if os.path.isdir(fileName):
exit('Please specify a file name to save the data. The given file name is a folder: \n{}\n'.format(fileName))

if fileName[-3:] != '.gz':
fileName += '.gz'

Expand All @@ -664,6 +671,9 @@ def save_dekker(self, fileName):
"""
Saves the matrix using dekker format
"""
if os.path.isdir(fileName):
exit('Please specify a file name to save the data. The given file name is a folder: \n{}\n'.format(fileName))

if fileName[-3:] != '.gz':
fileName += '.gz'

Expand All @@ -676,9 +686,9 @@ def save_dekker(self, fileName):
colNames = []
for x in range(self.matrix.shape[0]):
chrom, start, end = self.cut_intervals[x][0:3]
colNames.append("{}|dm3|{}:{}-{}".format(x, chrom, start, end)) # adds dm3 to the end (?problem..)
colNames.append("{}|--|{}:{}-{}".format(x, chrom, start, end)) # adds dm3 to the end (?problem..)

fileh.write("#converted from npz\n")
fileh.write("#converted from hicexplorer\n")
fileh.write("\t" + "\t".join(colNames) + "\n")
for row in range(self.matrix.shape[0]):
values = [str(x) for x in self.matrix[row, :].toarray().flatten()]
Expand All @@ -689,27 +699,35 @@ def save_dekker(self, fileName):
def save_lieberman(self, fileName):
"""
Saves the matrix using lieberman format. Given an output directory name and resolution of the matrix.
In contrast to other methods, when saving using liebermans format a folder is required
where the data is saved per chromosome
"""

os.mkdir(fileName)
if os.path.isfile(fileName):
exit("a file with the same name as the given path exists ({}). Please choose a folder.".format(fileName))

if not os.path.isdir(fileName):
os.mkdir(fileName)

lib_mat = self.matrix
resolution = self.getBinSize()
if not self.bin_size_homogeneous:
sys.stderr.write("*WARNING* The hic matrix contains bins of difference size but\n"
"the 'lieberman' format requires equally spaced bins. The program\n"
"will proceed but the results may be unreliable.\n")

for chrom in self.interval_trees.keys():
fileh = gzip.open("{}/chr{}_{}.gz".format(fileName,chrom,resolution), 'w')
rowNames = []
chrstart, chrend = lib_mat.getChrBinRange(chrom)
chrwise_mat = lib_mat.matrix[chrstart:chrend, chrstart:chrend]
chrstart, chrend = self.getChrBinRange(chrom)
chrwise_mat = self.matrix[chrstart:chrend, chrstart:chrend]
if len(chrwise_mat.data) > 0:
sys.stderr.write("Saving chromosome {}...\n".format(chrom))
chrwise_mat_coo = triu(chrwise_mat, k=0, format='csr').tocoo()
for x in range(chrwise_mat_coo.shape[0]):
start = chrwise_mat_coo.row[x,]*resolution
end = chrwise_mat_coo.col[x,]*resolution
data = chrwise_mat_coo.data[x,]
rowNames.append("{}\t{}\t{}".format(start, end,data))

fileh.write("#converted from npz")
fileh.write("\n" + "\n".join(rowNames) + "\n")
start = chrwise_mat_coo.row * resolution
end = chrwise_mat_coo.col * resolution
fileh = gzip.open("{}/chr{}.gz".format(fileName, chrom), 'w')
fileh.write("#converted from HiCExplorer format\n")
for idx in range(len(start)):
fileh.write("{}\t{}\t{}\n".format(start[idx], end[idx], chrwise_mat_coo.data[idx]))
fileh.close()

def save(self, filename):
Expand Down
21 changes: 10 additions & 11 deletions hicexplorer/hicExport.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ def parse_arguments(args=None):
)

parser.add_argument('--outFileName', '-o',
help='File name to save the plain text matrix',
type=argparse.FileType('w'),
help='File name to save the plain text matrix. In the case of "lieberman" '
'output format this should be the path of a folder where the information '
'per chromosome is stored.',
required=True)

parser.add_argument('--chromosomeOrder',
Expand All @@ -52,7 +53,7 @@ def parse_arguments(args=None):
'with three columns : contact start, contact end, and raw observed score. '
'This corresponds to the RawObserved files from lieberman group. ',
default='dekker',
choices=['dekker', 'ren', 'lieberman' , 'hicexplorer'])
choices=['dekker', 'ren', 'lieberman', 'hicexplorer'])

parser.add_argument('--clearMaskedBins',
help='if set, masked bins are removed from the matrix. Masked bins '
Expand All @@ -73,9 +74,9 @@ def main():
if (args.chrNameList is None ):
exit("Error: --chrNameList is required when the input format is lieberman. ")
else:
hic_ma = hm.hiCMatrix(matrixFile=args.inFile, format = 'lieberman',chrnameList = args.chrNameList)
hic_ma = hm.hiCMatrix(matrixFile=args.inFile, format='lieberman', chrnameList=args.chrNameList)
else:
hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], format = 'npz')
hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], format='npz')

if args.chromosomeOrder:
hic_ma.keepOnlyTheseChr(args.chromosomeOrder)
Expand All @@ -84,14 +85,12 @@ def main():
hic_ma.maskBins(hic_ma.nan_bins)

sys.stderr.write('saving...\n')
matrix_name = args.outFileName.name
args.outFileName.close()

if args.outputFormat == 'dekker':
hic_ma.save_dekker(matrix_name)
hic_ma.save_dekker(args.outFileName)
elif args.outputFormat == 'ren':
hic_ma.save_bing_ren(matrix_name)
hic_ma.save_bing_ren(args.outFileName)
elif args.outputFormat == 'lieberman':
hic_ma.save_lieberman(matrix_name)
hic_ma.save_lieberman(args.outFileName)
else:
hic_ma.save(matrix_name)
hic_ma.save(args.outFileName)

0 comments on commit ab6664a

Please sign in to comment.