diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0a29b17e..1a98874e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -7,7 +7,11 @@ assignees: '' --- -**Welcome to the HiCExplorer GitHub repository! Before opening the issue please check +**Welcome to the HiCExplorer GitHub repository!** + +If your issue concern `HiCPlotTADs`, please post your issue to the [pyGenomeTracks github repo](https://github.com/deeptools/pyGenomeTracks/issues/new/choose). + +**Before opening the issue please check that the following requirements are met :** - [ ] Search whether this issue (or a similar issue) has been solved before using the search tab above. Link the previous issue if appropriate below. diff --git a/README.rst b/README.rst index be2c2f36..e2f1ed3f 100644 --- a/README.rst +++ b/README.rst @@ -37,6 +37,9 @@ The `scHiCExplorer `_. Citation: ^^^^^^^^^ +Joachim Wolff, Rolf Backofen, Björn Grüning. +**Loop detection using Hi-C data with HiCExplorer**, GigaScience, Volume 11, 2022, giac061, https://doi.org/10.1093/gigascience/giac061 + Joachim Wolff, Leily Rabbani, Ralf Gilsbach, Gautier Richard, Thomas Manke, Rolf Backofen, Björn A Grüning. **Galaxy HiCExplorer 3: a web server for reproducible Hi-C, capture Hi-C and single-cell Hi-C data analysis, quality control and visualization, Nucleic Acids Research**, Nucleic Acids Research, Volume 48, Issue W1, 02 July 2020, Pages W177–W184, https://doi.org/10.1093/nar/gkaa220 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f5faacf8..fc09c773 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,12 +9,13 @@ jobs: vmImage: 'ubuntu-latest' strategy: matrix: - Python36: - python.version: '3.6' - Python37: - python.version: '3.7' Python38: python.version: '3.8' + Python39: + python.version: '3.9' + Python310: + python.version: '3.10' + steps: - bash: | @@ -23,14 +24,16 @@ jobs: displayName: Add conda to PATH - bash: | conda config --set always_yes yes --set changeps1 no - conda info -a - conda create -n hicexplorer --yes -c conda-forge -c bioconda python=$(python.version) --file requirements.txt + conda install mamba --yes -c conda-forge + hash -r + mamba info -a + mamba create -n hicexplorer --yes -c conda-forge -c bioconda python=$(python.version) --file requirements.txt source activate hicexplorer - conda install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked - conda install --yes -c conda-forge -c bioconda nose - conda install --yes pathlib - conda install --yes -c defaults -c conda-forge -c bioconda configparser - python setup.py install + mamba install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked + mamba install --yes -c conda-forge -c bioconda nose + mamba install --yes pathlib + mamba install --yes -c defaults -c conda-forge -c bioconda configparser + pip install . displayName: installing dependencies - script: | source activate hicexplorer @@ -44,22 +47,25 @@ jobs: - script: | source activate hicexplorer py.test hicexplorer/test/general/ --doctest-modules --capture=sys -n 4 --ignore=hicexplorer/test/general/test_hicTADClassifier.py --ignore=hicexplorer/test/general/test_hicTrainTADClassifier.py + displayName: pytest + - script: | + source activate hicexplorer py.test hicexplorer/test/general/test_hicTADClassifier.py py.test hicexplorer/test/general/test_hicTrainTADClassifier.py - displayName: pytest + displayName: pytest_tad_classifier - job: 'OSX' timeoutInMinutes: 0 pool: - vmImage: 'macOS-10.14' + vmImage: 'macOS-latest' strategy: matrix: - Python36: - python.version: '3.6' - Python37: - python.version: '3.7' Python38: python.version: '3.8' + Python39: + python.version: '3.9' + Python310: + python.version: '3.10' steps: - bash: | @@ -75,7 +81,7 @@ jobs: conda install --yes -c conda-forge -c bioconda nose conda install --yes pathlib conda install --yes -c defaults -c conda-forge -c bioconda configparser - python setup.py install + pip install . displayName: installing dependencies - script: | source activate hicexplorer @@ -89,6 +95,9 @@ jobs: - script: | source activate hicexplorer py.test hicexplorer/test/general/ --doctest-modules --capture=sys -n 4 --ignore=hicexplorer/test/general/test_hicTADClassifier.py --ignore=hicexplorer/test/general/test_hicTrainTADClassifier.py + displayName: pytest + - script: | + source activate hicexplorer py.test hicexplorer/test/general/test_hicTADClassifier.py py.test hicexplorer/test/general/test_hicTrainTADClassifier.py - displayName: pytest \ No newline at end of file + displayName: pytest_tad_classifier \ No newline at end of file diff --git a/docs/content/News.rst b/docs/content/News.rst index fc051a77..de2bd567 100644 --- a/docs/content/News.rst +++ b/docs/content/News.rst @@ -2,6 +2,14 @@ News and Developments ===================== +Release 3.7.3 +------------- +**17 November 2023** + +- Maintenance update for HiCExplorer to keep up to date with APIs of dependencies +- Add additional of the polarization ratio to the output of hicCompartmentalization. Thanks @contessoto. + + Release 3.7.2 ------------- **1 October 2021** diff --git a/hicexplorer/_version.py b/hicexplorer/_version.py index f6d68cf3..7c743e79 100644 --- a/hicexplorer/_version.py +++ b/hicexplorer/_version.py @@ -2,4 +2,4 @@ # This file is originally generated from Git information by running 'setup.py # version'. Distribution tarballs contain a pre-generated copy of this file. -__version__ = '3.7.2' +__version__ = '3.7.3' diff --git a/hicexplorer/chicExportData.py b/hicexplorer/chicExportData.py index 380c556c..b12689b7 100644 --- a/hicexplorer/chicExportData.py +++ b/hicexplorer/chicExportData.py @@ -148,7 +148,8 @@ def exportData(pFileList, pArgs, pViewpointObject, pDecimalPlace, pChromosomeSiz file_content_string = header_information for key in key_list: - file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, np.float) else str(x) for x in data[1][key]) + '\n' + file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, float) else str(x) for x in data[1][key]) + '\n' + # breakpoint() else: for key in key_list: chromosome_name.append(str(data[1][key][0])) @@ -224,7 +225,9 @@ def exportData(pFileList, pArgs, pViewpointObject, pDecimalPlace, pChromosomeSiz line_content, data = pViewpointObject.readAggregatedFileHDF(pArgs.file, sample) file_content_string = header_information for line in line_content: - file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, np.float) else str(x) for x in line) + '\n' + file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, float) else str(x) for x in line) + '\n' + # breakpoint() + file_content_list.append(file_content_string) file_name = '_'.join(sample) + '_' + pFileType + '.txt' @@ -241,7 +244,9 @@ def exportData(pFileList, pArgs, pViewpointObject, pDecimalPlace, pChromosomeSiz file_content_string = header_information for line in item: - file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, np.float) else str(x) for x in line) + '\n' + file_content_string += '\t'.join('{:.{decimal_places}f}'.format(x, decimal_places=pDecimalPlace) if isinstance(x, float) else str(x) for x in line) + '\n' + # breakpoint() + file_content_list.append(file_content_string) file_name = '_'.join(file) + '_' + item_classification[i] + '_' + pFileType + '.txt' file_list.append(file_name) diff --git a/hicexplorer/hicAdjustMatrix.py b/hicexplorer/hicAdjustMatrix.py index 0f4eb6fe..caded430 100644 --- a/hicexplorer/hicAdjustMatrix.py +++ b/hicexplorer/hicAdjustMatrix.py @@ -4,7 +4,7 @@ import argparse from hicmatrix import HiCMatrix as hm from hicexplorer._version import __version__ -from hicmatrix.HiCMatrix import check_cooler +from hicexplorer.utilities import check_cooler import numpy as np import cooler import logging @@ -132,6 +132,7 @@ def adjustMatrix(pArgs): if len(genomic_regions) == 0: log.error('No valid chromosome given. Available: {}'.format(chromosomes_list)) exit(1) + matrix_indices_regions = [] for region in genomic_regions: log.debug('region {}'.format(region)) diff --git a/hicexplorer/hicAggregateContacts.py b/hicexplorer/hicAggregateContacts.py index 6f43769f..0810b5e1 100755 --- a/hicexplorer/hicAggregateContacts.py +++ b/hicexplorer/hicAggregateContacts.py @@ -8,7 +8,7 @@ matplotlib.use('Agg') import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec -import matplotlib.cm as cm +from matplotlib import colormaps as cm from mpl_toolkits.mplot3d import Axes3D # from scipy.cluster.vq import vq, kmeans @@ -515,7 +515,7 @@ def compute_clusters(updated_info, k, method="kmeans", how='full', max_deviation # shape = (num_submatrices, submatrix.shape[0] * submatrix.shape[1] # In other words, each submatrix is converted into a row of the matrix submat_vectors.append(submatrix.reshape((1, shape[0] * shape[1]))) - matrix = np.vstack(submat_vectors) + matrix = np.asarray(np.vstack(submat_vectors)) if how == 'diagonal': assert matrix.shape == (len(updated_info["submatrices"]), shape[0]) elif how == 'center': @@ -600,7 +600,7 @@ def cluster_matrices(agg_info, k, method='kmeans', how='full', perChr=False, max updated_info[chrom1] = {"coords": full_coords, "centers": agg_info["agg_center_values"][chrom1][chrom2], "submatrices": agg_info["agg_matrix"][chrom1][chrom2], "clustered_dict": [], "diagonal": agg_info["agg_diagonals"][chrom1][chrom2]} - assert(chrom1 == chrom2) + assert (chrom1 == chrom2) log.info("Length of entry on chr {}: {}".format(chrom1, len(agg_info["agg_matrix"][chrom1][chrom2]))) if len(agg_info["agg_matrix"][chrom1][chrom2]) < k: log.info("number of the submatrices on chromosome {} is less than {}. Clustering is skipped.".format(chrom1, k)) @@ -660,7 +660,7 @@ def plot_aggregated_contacts(clustered_info, num_clusters, M_half, args): log.debug("vmax: {}, vmin: {}".format(vmax, vmin)) chrom_avg = OrderedDict() for idx, (chrom1, v1) in enumerate(clustered_info.items()): - assert(v1 != {}) + assert (v1 != {}) if chrom1 not in chrom_avg.keys(): chrom_avg[chrom1] = [] @@ -925,7 +925,7 @@ def main(args=None): exit("No susbmatrix found to be aggregated.") if args.kmeans is not None: - assert(args.kmeans > 1) + assert (args.kmeans > 1) if args.perChr == True: clustered_info = cluster_matrices(agg_info, k=args.kmeans, method='kmeans', how=args.howToCluster, @@ -938,7 +938,7 @@ def main(args=None): keep_outlier=args.keep_outlier) num_clusters = args.kmeans elif args.hclust is not None: - assert(args.hclust > 1) + assert (args.hclust > 1) log.info("Performing hierarchical clustering." "Please note that it might be very slow for large datasets.\n") if args.perChr == True: diff --git a/hicexplorer/hicAverageRegions.py b/hicexplorer/hicAverageRegions.py index 7a489d7a..81cadaec 100644 --- a/hicexplorer/hicAverageRegions.py +++ b/hicexplorer/hicAverageRegions.py @@ -7,7 +7,7 @@ import logging log = logging.getLogger(__name__) import numpy as np -from scipy.sparse import csr_matrix, save_npz, lil_matrix +from scipy.sparse import csr_matrix, save_npz, lil_matrix, coo_matrix def parse_arguments(args=None): @@ -196,11 +196,13 @@ def main(args=None): if orientation is None or orientation == '+': summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end] elif orientation == '-': - summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end].T summed_matrix /= count_matrix - summed_matrix = np.array(summed_matrix) - data = summed_matrix[np.nonzero(summed_matrix)] + + summed_matrix = coo_matrix(summed_matrix) + + data = summed_matrix.data + row = np.nonzero(summed_matrix)[0] col = np.nonzero(summed_matrix)[1] summed_matrix = csr_matrix((data, (row, col)), shape=(dimensions_new_matrix, dimensions_new_matrix)) diff --git a/hicexplorer/hicCompartmentalization.py b/hicexplorer/hicCompartmentalization.py index c3a3c148..244e87c9 100644 --- a/hicexplorer/hicCompartmentalization.py +++ b/hicexplorer/hicCompartmentalization.py @@ -99,7 +99,7 @@ def count_interactions(obs_exp, pc1, quantiles_number, offset): number_of_bins = np.zeros((quantiles_number, quantiles_number)) if offset: for dist in offset: - assert(dist >= 0) + assert (dist >= 0) indices = np.arange(0, obs_exp.matrix.shape[0] - dist) obs_exp.matrix[indices, indices + dist] = np.nan obs_exp.matrix[indices + dist, indices] = np.nan @@ -220,3 +220,4 @@ def main(args=None): np.savez(args.outputMatrix, [matrix for matrix in output_matrices]) plot_polarization_ratio( polarization_ratio, args.outputFileName, labels, args.quantile) + np.savetxt(args.outputFileName + '_' + 'dat', polarization_ratio) diff --git a/hicexplorer/hicCorrectMatrix.py b/hicexplorer/hicCorrectMatrix.py index 9037faee..a3221368 100644 --- a/hicexplorer/hicCorrectMatrix.py +++ b/hicexplorer/hicCorrectMatrix.py @@ -368,7 +368,7 @@ def __init__(self, points): diff = np.sum((points - self.median), axis=-1) self.med_abs_deviation = np.median(np.abs(diff)) - self.modified_z_score = self.mad_b_value * diff / self.med_abs_deviation + self.modified_z_score = np.multiply(self.mad_b_value, np.divide(diff, self.med_abs_deviation)) def get_motified_zscores(self): @@ -665,7 +665,7 @@ def main(args=None): if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1: chrom, _, _, coverage = zip(*ma.cut_intervals) - assert type(coverage[0]) == np.float64 + assert type(coverage[0]) is np.float64 failed_bins = np.flatnonzero( np.array(coverage) < args.sequencedCountCutoff) @@ -703,7 +703,7 @@ def main(args=None): correction_factors.append(_corr_factors) else: # Set the kr matrix along with its correction factors vector - assert(args.correctionMethod == 'KR') + assert (args.correctionMethod == 'KR') log.debug("Loading a float sparse matrix for KR balancing") kr = kr_balancing(chr_submatrix.shape[0], chr_submatrix.shape[1], @@ -729,7 +729,7 @@ def main(args=None): ma.matrix, args) ma.setMatrixValues(corrected_matrix) else: - assert(args.correctionMethod == 'KR') + assert (args.correctionMethod == 'KR') log.debug("Loading a float sparse matrix for KR balancing") kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1], ma.matrix.count_nonzero(), ma.matrix.indptr.astype(np.int64, copy=False), diff --git a/hicexplorer/hicCorrelate.py b/hicexplorer/hicCorrelate.py index 510fa096..0dfce7c7 100644 --- a/hicexplorer/hicCorrelate.py +++ b/hicexplorer/hicCorrelate.py @@ -16,6 +16,7 @@ # for plotting from matplotlib import use as mplt_use import matplotlib as mpl +from matplotlib import colormaps as cm mplt_use('Agg') @@ -165,7 +166,7 @@ def plot_correlation(corr_matrix, labels, plot_filename, vmax=None, link_color_func=lambda k: 'black') axdendro.set_xticks([]) axdendro.set_yticks([]) - cmap = plt.get_cmap(colormap) + cmap = cm.get_cmap(colormap) # this line simply makes a new cmap, based on the original # colormap that goes from 0.0 to 0.9 diff --git a/hicexplorer/hicDetectLoops.py b/hicexplorer/hicDetectLoops.py index 7ffcf403..da66dbf2 100644 --- a/hicexplorer/hicDetectLoops.py +++ b/hicexplorer/hicDetectLoops.py @@ -331,7 +331,7 @@ def compute_long_range_contacts(pHiCMatrix, pObsExpMatrix, pWindowSize, for i in range(pThreads): if queue[i] is not None and not queue[i].empty(): mask_threads = queue[i].get() - if 'Fail: ' in mask_threads: + if isinstance(mask_threads, str) and 'Fail: ' in mask_threads: fail_flag = True fail_message = mask_threads else: @@ -368,7 +368,7 @@ def compute_long_range_contacts(pHiCMatrix, pObsExpMatrix, pWindowSize, candidates, _ = neighborhood_merge( candidates, pWindowSize, pObsExpMatrix, pThreads) - if 'Fail: ' in candidates: + if isinstance(candidates, str) and 'Fail: ' in candidates: return candidates, None if len(candidates) == 0: return None, None @@ -476,7 +476,7 @@ def neighborhood_merge(pCandidates, pWindowSize, pInteractionCountMatrix, pThrea for i in range(pThreads): if queue[i] is not None and not queue[i].empty(): new_candidate_list_threads[i] = queue[i].get() - if 'Fail: ' in new_candidate_list_threads[i]: + if isinstance(new_candidate_list_threads[i], str) and 'Fail: ' in new_candidate_list_threads[i]: fail_flag = True fail_message = new_candidate_list_threads[i] queue[i] = None @@ -718,7 +718,7 @@ def candidate_region_test(pHiCMatrix, pCandidates, pWindowSize, pPValue, for i in range(pThreads): if queue[i] is not None and not queue[i].empty(): result_thread = queue[i].get() - if 'Fail: ' in result_thread: + if isinstance(result_thread, str) and 'Fail: ' in result_thread: fail_flag = True fail_message = result_thread else: @@ -914,10 +914,10 @@ def compute_loops(pHiCMatrix, pRegion, pArgs, pIsCooler, pQueue=None): else: pQueue.put([None]) return - elif 'Fail: ' in candidates and pQueue is not None: + elif isinstance(candidates, str) and 'Fail: ' in candidates and pQueue is not None: pQueue.put(candidates) return - elif 'Fail: ' in candidates and pQueue is None: + elif isinstance(candidates, str) and 'Fail: ' in candidates and pQueue is None: return candidates mapped_loops = cluster_to_genome_position_mapping( pHiCMatrix, candidates, pValueList, pArgs.maxLoopDistance) @@ -1024,7 +1024,7 @@ def main(args=None): if loops is None: log.error('No loops could be detected. Please change your input parameters, use a matrix with a better read coverage or contact the develops on https://github.com/deeptools/HiCExplorer/issues') exit(1) - if 'Fail: ' in loops: + if isinstance(loops, str) and 'Fail: ' in loops: log.error(loops[6:]) exit(1) if loops is not None: @@ -1059,7 +1059,7 @@ def main(args=None): all_data_processed = True elif queue[i] is not None and not queue[i].empty(): result = queue[i].get() - if result is not None and 'Fail: ' in result: + if result is not None and isinstance(result, str) and 'Fail: ' in result: fail_flag = True fail_message = result if result[0] is not None: diff --git a/hicexplorer/hicInfo.py b/hicexplorer/hicInfo.py index 9aa91d45..2a3acb91 100644 --- a/hicexplorer/hicInfo.py +++ b/hicexplorer/hicInfo.py @@ -9,7 +9,7 @@ from hicmatrix import HiCMatrix as hm from hicexplorer._version import __version__ from hicexplorer.utilities import toString -from hicmatrix.HiCMatrix import check_cooler +from hicexplorer.utilities import check_cooler import logging log = logging.getLogger(__name__) diff --git a/hicexplorer/hicMergeLoops.py b/hicexplorer/hicMergeLoops.py index 40efecda..8195ff1f 100644 --- a/hicexplorer/hicMergeLoops.py +++ b/hicexplorer/hicMergeLoops.py @@ -156,7 +156,7 @@ def main(args=None): if dataframe is None: dataframe = readFile(file) else: - dataframe = dataframe.append(readFile(file), ignore_index=True) + dataframe = pd.concat([dataframe, readFile(file)]) dataframe_bedtool = BedTool.from_dataframe(dataframe) dataframe = dataframe_bedtool.sort().to_dataframe( diff --git a/hicexplorer/hicNormalize.py b/hicexplorer/hicNormalize.py index ff508bd6..3c90b3f2 100644 --- a/hicexplorer/hicNormalize.py +++ b/hicexplorer/hicNormalize.py @@ -84,7 +84,7 @@ def main(args=None): min_max_difference = np.float64(max_value - min_value) hic_matrix.matrix.data -= min_value - hic_matrix.matrix.data /= min_max_difference + hic_matrix.matrix.data = np.divide(hic_matrix.matrix.data, min_max_difference) mask = np.isnan(hic_matrix.matrix.data) hic_matrix.matrix.data[mask] = 0 @@ -110,7 +110,7 @@ def main(args=None): mask = np.isinf(hic_matrix.matrix.data) hic_matrix.matrix.data[mask] = 0 adjust_factor = sum_list[i] / sum_list[argmin] - hic_matrix.matrix.data /= adjust_factor + hic_matrix.matrix.data = np.divide(hic_matrix.matrix.data, adjust_factor) mask = np.isnan(hic_matrix.matrix.data) mask = np.isnan(hic_matrix.matrix.data) diff --git a/hicexplorer/hicPCA.py b/hicexplorer/hicPCA.py index 2aa7d956..4cb19a9d 100644 --- a/hicexplorer/hicPCA.py +++ b/hicexplorer/hicPCA.py @@ -229,7 +229,7 @@ def correlateEigenvectorWithHistonMarkTrack(pEigenvector, bwTrack, chromosome, vector[pos_indices] = np.negative(vector[pos_indices]) vector[neg_indices] = np.negative(vector[neg_indices]) else: - assert(pHistonMarkType == 'inactive') + assert (pHistonMarkType == 'inactive') if (pos_mean > neg_mean) and (neg_mean != 0) and (pos_mean != 0): # flip the sign vector[pos_indices] = -1 * vector[pos_indices] @@ -318,7 +318,7 @@ def main(args=None): eigenvectors_correlate = np.hstack((eigenvectors_correlate, eigs[:, int(id) - 1:int(id)])) if args.extraTrack and (args.extraTrack.endswith('.bw') or args.extraTrack.endswith('.bigwig')): - assert(len(end) == len(start)) + assert (len(end) == len(start)) correlateEigenvectorWithHistonMarkTrack(eigenvectors_correlate.transpose(), bwTrack, chrname, start, end, args.extraTrack, @@ -357,12 +357,12 @@ def main(args=None): if args.format == 'bedgraph': for idx, outfile in enumerate(args.outputFileName): - assert(len(vecs_list) == len(chrom_list)) + assert (len(vecs_list) == len(chrom_list)) with open(outfile, 'w') as fh: for i, value in enumerate(vecs_list): if len(value) == len(args.whichEigenvectors): - if isinstance(value[idx], np.complex): + if isinstance(value[idx], complex): value[idx] = value[idx].real fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]), start_list[i], end_list[i], value[idx])) @@ -383,7 +383,7 @@ def main(args=None): log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list))) log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list))) - assert(len(vecs_list) == len(chrom_list)) + assert (len(vecs_list) == len(chrom_list)) _chrom_list = [] _start_list = [] _end_list = [] @@ -396,16 +396,16 @@ def main(args=None): for i, value in enumerate(vecs_list): # it can happen that some 'value' is having less dimensions than it should if len(value) == len(args.whichEigenvectors): - if isinstance(value[idx], np.complex): + if isinstance(value[idx], complex): value[idx] = value[idx].real values.append(value[idx]) _chrom_list.append(toString(chrom_list[i])) _start_list.append(start_list[i]) _end_list.append(end_list[i]) - # write entries bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values) + bw.close() else: log.error("Output format not known: {}".format(args.format)) diff --git a/hicexplorer/hicPlotMatrix.py b/hicexplorer/hicPlotMatrix.py index 36218f3d..9151f911 100644 --- a/hicexplorer/hicPlotMatrix.py +++ b/hicexplorer/hicPlotMatrix.py @@ -2,7 +2,7 @@ from collections import OrderedDict from mpl_toolkits.axes_grid1 import make_axes_locatable import matplotlib.gridspec as gridspec -import matplotlib.cm as cm +from matplotlib import colormaps as cm import matplotlib.pyplot as plt from matplotlib.colors import LogNorm import matplotlib @@ -241,8 +241,16 @@ def plotHeatmap(ma, chrBinBoundaries, fig, position, args, cmap, xlabel=None, start_pos2 = start_pos xmesh, ymesh = np.meshgrid(start_pos, start_pos2) + + # patch for newer matplotlib versions where vmin and vmax is for norm not allowed anymore + vmin = None + vmax = None + if pNorm is None: + vmin = args.vMin + vmax = args.vMax + img3 = axHeat2.pcolormesh( - xmesh.T, ymesh.T, ma, vmin=args.vMin, vmax=args.vMax, cmap=cmap, norm=pNorm) + xmesh.T, ymesh.T, ma, vmin=vmin, vmax=vmax, cmap=cmap, norm=pNorm) img3.set_rasterized(True) if args.region: @@ -519,10 +527,10 @@ def plotPerChr(hic_matrix, cmap, args, pBigwig, pResolution): np.isinf(matrix).any())) if args.log1p: matrix += 1 - norm = LogNorm() + norm = LogNorm(vmin=args.vMin, vmax=args.vMax) elif args.log: - norm = LogNorm() + norm = LogNorm(vmin=args.vMin, vmax=args.vMax) chr_bin_boundary = OrderedDict() chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname] @@ -785,9 +793,9 @@ def main(args=None): np.isinf(matrix).any())) if args.log1p: matrix += 1 - norm = LogNorm() + norm = LogNorm(vmin=args.vMin, vmax=args.vMax) elif args.log: - norm = LogNorm() + norm = LogNorm(vmin=args.vMin, vmax=args.vMax) fig_height = 7 fig_width = 8 diff --git a/hicexplorer/hicPrepareQCreport.py b/hicexplorer/hicPrepareQCreport.py index 51bbc9eb..2afc267e 100644 --- a/hicexplorer/hicPrepareQCreport.py +++ b/hicexplorer/hicPrepareQCreport.py @@ -21,7 +21,7 @@ def parse_arguments(): 'hicBuildMatrix log files within an HTML output', add_help=False, usage='%(prog)s --logfiles matrix1_QCfolder/QC.log matrix2_QCfolder/QC.log ' - '--labels "sample 1" "sample 2" --outputFolder QC_all_samples)') + '--labels "sample 1" "sample 2" --outputFolder QC_all_samples') parserRequired = parser.add_argument_group('Required arguments') # define the arguments @@ -64,17 +64,17 @@ def save_html(filename, unmap_table, discard_table, distance_table, orientation_ html_content = html.read() # the html code has a placeholder for the html table html_content = html_content.replace("%%TABLE_UNMAP%%", unmap_table.style - .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).render()) + .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).to_html(classes='df')) html_content = html_content.replace("%%TABLE_DISCARDED%%", discard_table.style - .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).render()) + .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).to_html(classes='df')) html_content = html_content.replace("%%TABLE_DISTANCE%%", distance_table.style - .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).render()) + .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).to_html(classes='df')) html_content = html_content.replace("%%TABLE_ORIENTATION%%", orientation_table.style - .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).render()) + .format(lambda x: '{:,}'.format(x) if x > 1 else '{:.2%}'.format(x)).to_html(classes='df')) all_table = all_table.drop(['Min rest. site distance', 'Max library insert size'], axis=1) - html_content = html_content.replace("%%TABLE%%", all_table.style.render()) + html_content = html_content.replace("%%TABLE%%", all_table.style.to_html(classes='df')) with open(filename, 'w') as fh: fh.write(html_content) html.close() diff --git a/hicexplorer/hicTADClassifier.py b/hicexplorer/hicTADClassifier.py index ab121665..268e3ec6 100644 --- a/hicexplorer/hicTADClassifier.py +++ b/hicexplorer/hicTADClassifier.py @@ -9,7 +9,7 @@ def parse_arguments(args=None): """ - get command line arguments + get command line arguments """ parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, diff --git a/hicexplorer/hicTransform.py b/hicexplorer/hicTransform.py index 42baf9c5..ba141c0c 100644 --- a/hicexplorer/hicTransform.py +++ b/hicexplorer/hicTransform.py @@ -92,6 +92,8 @@ def parse_arguments(args=None): def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount): + if len(pSubmatrix.data) == 0: + return pSubmatrix obs_exp_matrix_ = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount) obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_)) obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)) @@ -101,6 +103,8 @@ def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount): def _pearson(pSubmatrix): + if len(pSubmatrix.data) == 0: + return pSubmatrix pearson_correlation_matrix = np.corrcoef(pSubmatrix) pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)) pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)) @@ -111,6 +115,8 @@ def _pearson(pSubmatrix): def _obs_exp(pSubmatrix): + if len(pSubmatrix.data) == 0: + return pSubmatrix obs_exp_matrix_ = obs_exp_matrix(pSubmatrix) obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_)) obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)) @@ -122,7 +128,8 @@ def _obs_exp(pSubmatrix): def _obs_exp_non_zero(pSubmatrix, ligation_factor): - + if len(pSubmatrix.data) == 0: + return pSubmatrix obs_exp_matrix_ = obs_exp_matrix_non_zero(pSubmatrix, ligation_factor) obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_)) obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)) diff --git a/hicexplorer/iterativeCorrection.py b/hicexplorer/iterativeCorrection.py index 588497a4..8b350575 100644 --- a/hicexplorer/iterativeCorrection.py +++ b/hicexplorer/iterativeCorrection.py @@ -75,7 +75,7 @@ def iterativeCorrection(matrix, v=None, M=50, tolerance=1e-5, verbose=False): # scale the total bias such that the sum is 1.0 corr = total_bias[total_bias != 0].mean() - total_bias /= corr + total_bias = np.divide(total_bias, corr) W.data = W.data * corr * corr if np.any(W.data > 1e10): log.error("*Error* matrix correction produced extremely large values. " diff --git a/hicexplorer/lib/tadClassifier.py b/hicexplorer/lib/tadClassifier.py index ef916ed2..aec45eac 100644 --- a/hicexplorer/lib/tadClassifier.py +++ b/hicexplorer/lib/tadClassifier.py @@ -38,7 +38,7 @@ from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve, auc -from cleanlab.classification import LearningWithNoisyLabels +from cleanlab.classification import CleanLearning from imblearn.under_sampling import * # PCA @@ -66,7 +66,7 @@ def __init__(self, domain_file, protein_file=None, domain_file) # read protein file and intersect if necessary - if(protein_file is not None): + if (protein_file is not None): # if pAddRemoveChrPrexix: self.protein_df = TADClassifier.MP_Domain_Data.readProtein( @@ -95,7 +95,7 @@ def readProtein(pFile, pAddChr): # protein_df[0] = 'chr' + protein_df[0].astype(str) protein_df[0] = protein_df[0].str.lstrip('chr') log.debug('protein_df {}'.format(protein_df)) - if(protein_df.size < 1): + if (protein_df.size < 1): raise ValueError('empty protein file passed') # use bedtools sorting and apply proper column names @@ -115,7 +115,7 @@ def read_domain_file(tad_file): domains_by_hicfindtads = domains_by_hicfindtads.sort_values( by=["Chrom", "Start"]) - if(domains_by_hicfindtads.size < 1): + if (domains_by_hicfindtads.size < 1): raise ValueError('empty domain file passed') return domains_by_hicfindtads @@ -275,7 +275,7 @@ def get_positions(self): def get_resolution(self): """resolution of matrix""" - if(self.resolution is None): + if (self.resolution is None): self.resolution = self.hic_matrix.getBinSize() return self.resolution @@ -305,7 +305,7 @@ def get_features(self, distance, use_gradient=False): if features is None: return None # experimental: use gradient matrix too - if(use_gradient): + if (use_gradient): x_gradient, y_gradient = self.build_gradient_features(distance) features = np.concatenate( (features, x_gradient, y_gradient), axis=1) @@ -453,7 +453,7 @@ def __init__(self, resolution=10000, self.distance = distance self.is_default_classifier = alternative_classifier is None - if(alternative_classifier is None): + if (alternative_classifier is None): self.classifier = BaggingClassifier( base_estimator=AdaBoostClassifier(), n_estimators=0, @@ -467,7 +467,7 @@ def __init__(self, resolution=10000, estimators = self.classifier.get_params( )['n_estimators'] + self.estimators_per_step self.classifier.set_params(n_estimators=estimators) - self.classifier = LearningWithNoisyLabels( + self.classifier = CleanLearning( clf=self.classifier, n_jobs=self.threads) def single_run(self, positions, X, y, test_size_n=0.2): @@ -492,7 +492,7 @@ def single_run(self, positions, X, y, test_size_n=0.2): # train model # if input contains only one class, an exception is thrown log.debug('fitting') - if(np.unique(y).shape[0] >= 2): + if (np.unique(y).shape[0] >= 2): self.classifier.fit(X, y) else: @@ -513,12 +513,12 @@ def incremental_fit(self, X, y, resample=True): X = TADClassifier.MP_Classifier.impute(X, self.impute_value) # enact oversample imbalance strategy - if(resample): + if (resample): X, y = TADClassifier.MP_Classifier.resample( X, y, self.resampling_method, self.alternative_resampling_method, threads=self.threads) # introduce additional estimators to accomodate for new dataset - if(self.use_cleanlab): + if (self.use_cleanlab): estimators = self.classifier.clf.get_params( )['n_estimators'] + self.estimators_per_step self.classifier.clf.set_params(n_estimators=estimators) @@ -557,7 +557,7 @@ def predict_test(self, positions, X, y_test): # test model log.debug('predicting') - if(not y_test[y_test == False].shape[0] <= 0 and not y_test[y_test == True].shape[0] <= 0): + if (not y_test[y_test == False].shape[0] <= 0 and not y_test[y_test == True].shape[0] <= 0): y_pred = self.classifier.predict(X) return positions, X, y_test, y_pred else: @@ -583,19 +583,21 @@ def resample(X, y, method='undersample_random', '''enact resample method chosen with method''' # resample method: KMEANs - if(method == 'undersample_cluster_centroids'): + if (method == 'undersample_cluster_centroids'): log.debug('resampling with: ' + method) - cc = ClusterCentroids(random_state=42, n_jobs=threads) + # Uses k-means of sklearn in the background which sets + # the number of cores via system environment variable OMP_NUM_THREADS + cc = ClusterCentroids(random_state=42) return cc.fit_resample(X, y) # resample method: random - elif(method == 'undersample_random'): + elif (method == 'undersample_random'): log.debug('resampling with: ' + method) rus = RandomUnderSampler(random_state=42) return rus.fit_resample(X, y) # use users method - elif(method == 'passed_method' and passed_method is not None): + elif (method == 'passed_method' and passed_method is not None): log.debug('resampling with: ' + method) return passed_method.fit_resample(X, y) @@ -631,7 +633,7 @@ def print_results(self, y_test, y_pred, out_file=None): log.debug(classification) log.debug(conf) - if(out_file): + if (out_file): f = open(out_file, "w") f.writelines(acc) @@ -642,7 +644,7 @@ def print_results(self, y_test, y_pred, out_file=None): def get_feature_importance(self, out_file): '''print feature importances''' - if(self.is_default_classifier and not self.use_cleanlab): + if (self.is_default_classifier and not self.use_cleanlab): # only use on Bagging Classifier feature_importances = np.mean( [est.feature_importances_ for est in self.classifier.estimators_], axis=0) @@ -654,7 +656,7 @@ def get_feature_importance(self, out_file): off = 0 barlist[off].set_color('red') - while(d > 0): + while (d > 0): off = d + off barlist[off].set_color('red') d = d - 1 @@ -664,7 +666,7 @@ def get_feature_importance(self, out_file): def get_roc(self, X_test, y_test, out_file): '''get receiver operating characteristic''' - if(self.is_default_classifier and not self.use_cleanlab): + if (self.is_default_classifier and not self.use_cleanlab): # from # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html fpr = dict() @@ -724,8 +726,8 @@ def __init__(self, mode, self.concatenate_before_resample = concatenate_before_resample self.addRemoveChrPrexix = pAddRemoveChrPrexix - if(mode == 'predict' or mode == 'train_existing' or mode == 'predict_test'): - if(saved_classifier is not None): + if (mode == 'predict' or mode == 'train_existing' or mode == 'predict_test'): + if (saved_classifier is not None): try: self.classifier = pickle.load(open(saved_classifier, 'rb')) except Exception as exp: @@ -735,7 +737,7 @@ def __init__(self, mode, else: model_location = site.getsitepackages()[0] + '/hicexplorer/trained_models/' - if(normalization_method == 'obs_exp'): + if (normalization_method == 'obs_exp'): if resolution == 10000: model_location += '10kb_model_cleanlab_proteins_obs_exp.BIN' elif resolution == 25000: @@ -768,16 +770,16 @@ def __init__(self, mode, except Exception as exp: log.error('Tried to load ML model but failed! {}'.format(str(exp))) exit(1) - if(mode == 'train_new' or mode == 'train_test'): - if(alternative_classifier is not None and not isinstance(alternative_classifier, sklearn.base.BaseEstimator)): + if (mode == 'train_new' or mode == 'train_test'): + if (alternative_classifier is not None and not isinstance(alternative_classifier, sklearn.base.BaseEstimator)): raise ValueError('the passed classifier is not valid') - if(not resampling_method == 'passed_method' and alternative_resampling_method is not None): + if (not resampling_method == 'passed_method' and alternative_resampling_method is not None): warnings.warn( 'default resampling method chosen, but custom method passed; using passed method') resampling_method = 'passed_method' - if(alternative_resampling_method is not None and not isinstance(alternative_resampling_method, imblearn.base.BaseCleaningSampler)): + if (alternative_resampling_method is not None and not isinstance(alternative_resampling_method, imblearn.base.BaseCleaningSampler)): raise ValueError('the passed resampling method is not valid') self.classifier = TADClassifier.MP_Classifier( @@ -821,14 +823,14 @@ def prepare_train(self, matrix_file, domain_file, protein_file, pChromosome, ): return matrix, None, features, is_boundary positions = matrix.positions - if(not self.classifier.resolution == matrix.get_resolution()): + if (not self.classifier.resolution == matrix.get_resolution()): warnings.warn( 'training matrix with resolution {} on classifier with resolution {}'.format( self.classifier.resolution, matrix.get_resolution())) # get rid of cases at the border of matrix - if(self.unselect_border_cases): + if (self.unselect_border_cases): features = TADClassifier.MP_Matrix.unselect_border_cases( features, self.classifier.distance) positions = TADClassifier.MP_Matrix.unselect_border_cases( @@ -854,7 +856,7 @@ def prepare_predict(self, matrix_file, pChromosome): positions = matrix.positions # get rid of cases at the border of matrix - if(self.unselect_border_cases): + if (self.unselect_border_cases): features = TADClassifier.MP_Matrix.unselect_border_cases( features, self.classifier.distance) positions = TADClassifier.MP_Matrix.unselect_border_cases( @@ -871,7 +873,7 @@ def train_test(self, matrix_list, domain_list, protein_list, pChromosome): out_file_s = self.out_file.split('.') - if(out_file_s[-1] == 'txt'): + if (out_file_s[-1] == 'txt'): out_file = self.out_file else: out_file = self.out_file + '.txt' @@ -910,7 +912,7 @@ def multi_train(self, matrix_list, domain_list, matrix.hic_matrix = None matrix = None - if(conc_features is None): + if (conc_features is None): conc_features = features conc_is_boundary = is_boundary @@ -920,7 +922,7 @@ def multi_train(self, matrix_list, domain_list, conc_is_boundary = np.concatenate( (conc_is_boundary, is_boundary), axis=0) - if(i == nm_conc_features): + if (i == nm_conc_features): resampled_X, resampled_y = self.incremental_resample( resampled_X, resampled_y, conc_features, conc_is_boundary) i = 0 @@ -968,7 +970,7 @@ def multi_train_concatenate_before_resample( matrix.hic_matrix = None matrix = None - if(conc_features is None): + if (conc_features is None): conc_features = features conc_is_boundary = is_boundary @@ -978,7 +980,7 @@ def multi_train_concatenate_before_resample( conc_is_boundary = np.concatenate( (conc_is_boundary, is_boundary), axis=0) - if(i == nm_conc_features): + if (i == nm_conc_features): try: self.classifier.incremental_fit( conc_features, conc_is_boundary, resample=True) @@ -1033,7 +1035,7 @@ def multi_test_predict(self, matrix_list, domain_list, protein_list, pChromosome positions, features, y_test, y_pred = self.classifier.predict_test( positions, features, y_test=is_boundary) - if(conc_test_boundary is None): + if (conc_test_boundary is None): conc_test_boundary = y_test conc_is_boundary = y_pred conc_positions = positions @@ -1079,7 +1081,7 @@ def incremental_resample(self, X, y, X_i, y_i): '''resample without fitting''' try: - if(X is None): + if (X is None): X, y = TADClassifier.MP_Classifier.resample( X_i, y_i, method=self.classifier.resampling_method, passed_method=self.classifier.alternative_resampling_method, threads=self.classifier.threads) @@ -1101,7 +1103,7 @@ def persist(self, out_file): out_file_s = out_file.split('.') - if(not out_file_s[-1] == 'BIN'): + if (not out_file_s[-1] == 'BIN'): out_file = out_file + '.BIN' pickle.dump(self.classifier, open(out_file, 'wb')) @@ -1109,25 +1111,25 @@ def persist(self, out_file): def run_hicTrainClassifier(self, matrix_list, domain_list, protein_list, pChromosomes): '''run hicTrainClassifier with specified mode on file''' - if(isinstance(matrix_list, str)): + if (isinstance(matrix_list, str)): matrix_list = [matrix_list] - if(isinstance(domain_list, str)): + if (isinstance(domain_list, str)): domain_list = [domain_list] - if(len(domain_list) == 1): + if (len(domain_list) == 1): domain_list = domain_list * len(matrix_list) - if(protein_list is None): + if (protein_list is None): protein_list = [None] * len(matrix_list) - elif(isinstance(protein_list, str)): + elif (isinstance(protein_list, str)): protein_list = [protein_list] - if(len(protein_list) == 1): + if (len(protein_list) == 1): protein_list = protein_list * len(matrix_list) - if(not (len(matrix_list) == len(domain_list) and len(protein_list) == len(domain_list))): + if (not (len(matrix_list) == len(domain_list) and len(protein_list) == len(domain_list))): raise ValueError( 'please pass domain (,optional protein) and matrix lists of same length or pass a single domain (and optional protein) file') @@ -1139,25 +1141,25 @@ def run_hicTrainClassifier(self, matrix_list, domain_list, protein_list, pChromo cooler_obj = cooler.Cooler(matrix) chromosome_list.append(cooler_obj.chromnames) - if(self.mode == 'train_new' or self.mode == 'train_existing'): + if (self.mode == 'train_new' or self.mode == 'train_existing'): - if(self.concatenate_before_resample): + if (self.concatenate_before_resample): self.multi_train_concatenate_before_resample( matrix_list, domain_list, protein_list, pChromosome=chromosome_list) else: self.multi_train(matrix_list, domain_list, protein_list, pChromosome=chromosome_list) - elif(self.mode == 'train_test'): + elif (self.mode == 'train_test'): self.train_test(matrix_list, domain_list, protein_list, pChromosome=chromosome_list) - elif(self.mode == 'predict_test'): + elif (self.mode == 'predict_test'): self.multi_test_predict(matrix_list, domain_list, protein_list, pChromosome=chromosome_list) def run_hicTADClassifier(self, matrix_list, pChromosomes): '''predict a dataset''' - if(isinstance(matrix_list, str)): + if (isinstance(matrix_list, str)): matrix_list = [matrix_list] chromosome_list = [] @@ -1263,7 +1265,7 @@ def get_domains(positions, y): # print('domain_df 1 {}'.format(domain_df)) def rgb_helper(i): - if(i == 0): + if (i == 0): return '31,120,180' else: return '51,160,44' diff --git a/hicexplorer/lib/viewpoint.py b/hicexplorer/lib/viewpoint.py index 9c470a9c..fb0745bd 100644 --- a/hicexplorer/lib/viewpoint.py +++ b/hicexplorer/lib/viewpoint.py @@ -537,7 +537,7 @@ def smoothInteractionValues(self, pData, pWindowSize): Adds -pWindowsSize/2 and +pWindowsSize/2 around pData[i] and averages pData[i] by pWindowSize to smooth the interaction values. ''' - window_size = np.int(np.floor(pWindowSize / 2)) + window_size = np.int32(np.floor(pWindowSize / 2)) window_size_upstream = window_size if pWindowSize % 2 == 0: window_size_upstream -= 1 diff --git a/hicexplorer/test/general/test_chicViewpointBackgroundModel.py b/hicexplorer/test/general/test_chicViewpointBackgroundModel.py index 83eac8a0..dfa7a554 100644 --- a/hicexplorer/test/general/test_chicViewpointBackgroundModel.py +++ b/hicexplorer/test/general/test_chicViewpointBackgroundModel.py @@ -32,8 +32,8 @@ def are_files_equal(file1, file2, delta=1, skip=0, eps=0.1): line1_list = np.array(line1.split('\t')) line2_list = np.array(line2.split('\t')) - line1_list = line1_list.astype(np.float) - line2_list = line2_list.astype(np.float) + line1_list = line1_list.astype(np.float64) + line2_list = line2_list.astype(np.float64) for value1, value2 in zip(line1_list, line2_list): if np.abs(value1 - value2) < eps: diff --git a/hicexplorer/test/general/test_hicBuildMatrix.py b/hicexplorer/test/general/test_hicBuildMatrix.py index 2008faaa..cd869360 100644 --- a/hicexplorer/test/general/test_hicBuildMatrix.py +++ b/hicexplorer/test/general/test_hicBuildMatrix.py @@ -14,6 +14,7 @@ sam_R1 = ROOT + "small_test_R1_unsorted.bam" sam_R2 = ROOT + "small_test_R2_unsorted.bam" dpnii_file = ROOT + "DpnII.bed" +delta = 80000 def are_files_equal(file1, file2, delta=1): @@ -57,8 +58,8 @@ def test_build_matrix(capsys): assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log") assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder)) - # accept delta of 60 kb, file size is around 4.5 MB - assert abs(os.path.getsize(ROOT + "small_test_matrix_result.bam") - os.path.getsize(outfile_bam.name)) < 64000 + # accept delta of 80 kb, file size is around 4.5 MB + assert abs(os.path.getsize(ROOT + "small_test_matrix_result.bam") - os.path.getsize(outfile_bam.name)) < delta os.unlink(outfile.name) shutil.rmtree(qc_folder) @@ -86,8 +87,8 @@ def test_build_matrix_restriction_enzyme(capsys): assert are_files_equal(ROOT + "QC_multi_restriction/QC.log", qc_folder + "/QC.log") assert set(os.listdir(ROOT + "QC_multi_restriction/")) == set(os.listdir(qc_folder)) - # accept delta of 60 kb, file size is around 4.5 MB - assert abs(os.path.getsize(ROOT + "small_test_matrix_result.bam") - os.path.getsize(outfile_bam.name)) < 64000 + # accept delta of 80 kb, file size is around 4.5 MB + assert abs(os.path.getsize(ROOT + "small_test_matrix_result.bam") - os.path.getsize(outfile_bam.name)) < delta os.unlink(outfile.name) shutil.rmtree(qc_folder) @@ -119,8 +120,8 @@ def test_build_matrix_restriction_enzyme_region(capsys): assert are_files_equal(ROOT + "QC_region/QC.log", qc_folder + "/QC.log") assert set(os.listdir(ROOT + "QC_region/")) == set(os.listdir(qc_folder)) - # accept delta of 60 kb, file size is around 4.5 MB - assert abs(os.path.getsize(ROOT + "build_region.bam") - os.path.getsize(outfile_bam.name)) < 64000 + # accept delta of 80 kb, file size is around 4.5 MB + assert abs(os.path.getsize(ROOT + "build_region.bam") - os.path.getsize(outfile_bam.name)) < delta os.unlink(outfile.name) shutil.rmtree(qc_folder) @@ -148,8 +149,8 @@ def test_build_matrix_chromosome_sizes(capsys): assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log") assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder)) - # accept delta of 60 kb, file size is around 4.5 MB - assert abs(os.path.getsize(ROOT + "hicBuildMatrix/chromosome_sizes/test.bam") - os.path.getsize(outfile_bam.name)) < 64000 + # accept delta of 80 kb, file size is around 4.5 MB + assert abs(os.path.getsize(ROOT + "hicBuildMatrix/chromosome_sizes/test.bam") - os.path.getsize(outfile_bam.name)) < delta os.unlink(outfile.name) shutil.rmtree(qc_folder) diff --git a/hicexplorer/test/general/test_hicDifferentialTAD.py b/hicexplorer/test/general/test_hicDifferentialTAD.py index 9318bc07..6339d086 100644 --- a/hicexplorer/test/general/test_hicDifferentialTAD.py +++ b/hicexplorer/test/general/test_hicDifferentialTAD.py @@ -489,8 +489,8 @@ def test_cool_all_multi_core_all_multichr_chromosome(): # test on the intersection to exclude the case of duplicated lines assert number_of_tads == all_tads_present(ROOT + "untreated_R1_domains_chr1_chr2.bed", outfile_pref.name + '_accepted.diff_tad', outfile_pref.name + '_rejected.diff_tad') - assert are_files_equal(outfile_pref.name + '_accepted.diff_tad', ROOT + 'multichromosome_accepted.diff_tad', delta=0, skip=4) - assert are_files_equal(outfile_pref.name + '_rejected.diff_tad', ROOT + 'multichromosome_rejected.diff_tad', delta=0, skip=4) + assert are_files_equal(outfile_pref.name + '_accepted.diff_tad', ROOT + 'multichromosome_accepted.diff_tad', delta=2, skip=4) + assert are_files_equal(outfile_pref.name + '_rejected.diff_tad', ROOT + 'multichromosome_rejected.diff_tad', delta=2, skip=4) # mode_all_reject_all__t4_cool_multi_chromosomes_rejected.diff_tad @@ -522,5 +522,5 @@ def test_cool_all_one_core_all_multichr_chromosome(): assert number_of_tads == all_tads_present(ROOT + "untreated_R1_domains_chr1_chr2.bed", outfile_pref.name + '_accepted.diff_tad', outfile_pref.name + '_rejected.diff_tad') # assert number_of_tads == all_tads_present(ROOT + "untreated_R1_domains_chr1_chr2.bed", ROOT + 'mode_all_reject_all__t4_cool_multi_chromosomes_accepted.diff_tad', ROOT + 'mode_all_reject_all__t4_cool_multi_chromosomes_rejected.diff_tad') - assert are_files_equal(outfile_pref.name + '_accepted.diff_tad', ROOT + 'multichromosome_accepted.diff_tad', delta=0, skip=4) - assert are_files_equal(outfile_pref.name + '_rejected.diff_tad', ROOT + 'multichromosome_rejected.diff_tad', delta=0, skip=4) + assert are_files_equal(outfile_pref.name + '_accepted.diff_tad', ROOT + 'multichromosome_accepted.diff_tad', delta=2, skip=4) + assert are_files_equal(outfile_pref.name + '_rejected.diff_tad', ROOT + 'multichromosome_rejected.diff_tad', delta=2, skip=4) diff --git a/hicexplorer/test/general/test_hicPCA.py b/hicexplorer/test/general/test_hicPCA.py index c29f5596..b2ca8831 100644 --- a/hicexplorer/test/general/test_hicPCA.py +++ b/hicexplorer/test/general/test_hicPCA.py @@ -30,7 +30,7 @@ def are_files_equal(file1, file2): split_y = y.split('\t') if split_x[0] == split_y[0] and split_x[1] == split_y[1] and split_x[2] == split_y[2]: # to ignore rounding errors after 2th digit - if 0 <= abs(abs(float(split_x[3].strip())) - abs(float(split_y[3].strip()))) <= 0.01: + if 0 <= abs(abs(np.complex128(split_x[3].strip()).real) - abs(np.complex128(split_y[3].strip()).real)) <= 0.01: continue else: log.debug('split_x {} split_y {}'.format(split_x, split_y)) diff --git a/hicexplorer/test/general/test_hicTADClassifier.py b/hicexplorer/test/general/test_hicTADClassifier.py index deab5971..aba38138 100644 --- a/hicexplorer/test/general/test_hicTADClassifier.py +++ b/hicexplorer/test/general/test_hicTADClassifier.py @@ -156,7 +156,7 @@ def test_hicTADClassifier(): '-n', 'range', '--saved_classifier', - ROOT + 'trained_model.bin.BIN', + ROOT + 'trained_model.BIN', '--threads', '4' ] @@ -198,7 +198,7 @@ def test_hicTADClassifier_two_matrices(): '-n', 'range', '--saved_classifier', - ROOT + 'trained_model.bin.BIN', + ROOT + 'trained_model.BIN', '--unselect_border_cases', # 'True', '--threads', @@ -227,6 +227,8 @@ def test_hicTADClassifier_load_model_obs_exp(): outfile.name, '-n', 'obs_exp', + '--saved_classifier', + ROOT + 'trained_model_obs_exp.BIN', '--threads', '4' ] diff --git a/hicexplorer/test/general/test_hicTrainTADClassifier.py b/hicexplorer/test/general/test_hicTrainTADClassifier.py index 0e7f1968..870763af 100644 --- a/hicexplorer/test/general/test_hicTrainTADClassifier.py +++ b/hicexplorer/test/general/test_hicTrainTADClassifier.py @@ -101,5 +101,4 @@ def test_hicTrainClassifier_train_test(): compute(hicTrainTADClassifier.main, args, 5) f = open(test_folder + 'predict_test_results.txt', "r") assert f.readline().split()[0] == 'accuracy' - -# shutil.rmtree(test_folder) + # shutil.rmtree(test_folder) diff --git a/hicexplorer/test/test_compute_function.py b/hicexplorer/test/test_compute_function.py index 6263cb44..39beeaad 100644 --- a/hicexplorer/test/test_compute_function.py +++ b/hicexplorer/test/test_compute_function.py @@ -1,15 +1,17 @@ import time from random import randint +import traceback def compute(pFunction, pParameters, pTries): exception_string = '' + pTries = 1 for i in range(pTries): try: pFunction(pParameters) return - except Exception as exp: + except Exception: time.sleep(1 + randint(0, 5)) - exception_string = str(exp) + exception_string = traceback.format_exc() raise Exception('Tries {}, but still got an exception: {}'.format(pTries, exception_string)) return diff --git a/hicexplorer/test/test_data/hicTADClassifier/trained_model.BIN b/hicexplorer/test/test_data/hicTADClassifier/trained_model.BIN new file mode 100644 index 00000000..fea5a546 Binary files /dev/null and b/hicexplorer/test/test_data/hicTADClassifier/trained_model.BIN differ diff --git a/hicexplorer/test/test_data/hicTADClassifier/trained_model.bin.BIN b/hicexplorer/test/test_data/hicTADClassifier/trained_model.bin.BIN deleted file mode 100644 index 383a1743..00000000 Binary files a/hicexplorer/test/test_data/hicTADClassifier/trained_model.bin.BIN and /dev/null differ diff --git a/hicexplorer/test/test_data/hicTADClassifier/trained_model_obs_exp.BIN b/hicexplorer/test/test_data/hicTADClassifier/trained_model_obs_exp.BIN new file mode 100644 index 00000000..4ec5b841 Binary files /dev/null and b/hicexplorer/test/test_data/hicTADClassifier/trained_model_obs_exp.BIN differ diff --git a/hicexplorer/test/test_data/number_of_tests.txt b/hicexplorer/test/test_data/number_of_tests.txt index 5ae48373..e88ff725 100644 --- a/hicexplorer/test/test_data/number_of_tests.txt +++ b/hicexplorer/test/test_data/number_of_tests.txt @@ -1 +1 @@ -585 \ No newline at end of file +602 \ No newline at end of file diff --git a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs.py b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs.py index 28c3f747..8dbe1480 100644 --- a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs.py +++ b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs.py @@ -41,7 +41,7 @@ @pytest.mark.parametrize("BED2", [BED2]) @pytest.mark.parametrize("numberOfBins", [30]) @pytest.mark.parametrize("transform", sorted(['total-counts'])) -@pytest.mark.parametrize("operationType", ['sum', 'mean', 'median']) +@pytest.mark.parametrize("operationType", sorted(['sum', 'mean', 'median'])) @pytest.mark.parametrize("outFilePrefixMatrix", ['outFilePrefix']) @pytest.mark.parametrize("outFileContactPairs", ['outFileContactPairs']) @pytest.mark.parametrize("diagnosticHeatmapFile", [diagnosticHeatmapFile]) diff --git a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_five.py b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_five.py index 1b82e4ab..adc9258a 100644 --- a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_five.py +++ b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_five.py @@ -41,7 +41,7 @@ @pytest.mark.parametrize("BED2", [BED2]) @pytest.mark.parametrize("numberOfBins", [30]) @pytest.mark.parametrize("transform", sorted(['z-score'])) -@pytest.mark.parametrize("operationType", ['sum', 'mean', 'median']) +@pytest.mark.parametrize("operationType", sorted(['sum', 'mean', 'median'])) @pytest.mark.parametrize("outFilePrefixMatrix", ['outFilePrefix']) @pytest.mark.parametrize("outFileContactPairs", ['outFileContactPairs']) @pytest.mark.parametrize("diagnosticHeatmapFile", [diagnosticHeatmapFile]) diff --git a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_four.py b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_four.py index f931cec1..e8786a0b 100644 --- a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_four.py +++ b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_four.py @@ -41,7 +41,7 @@ @pytest.mark.parametrize("BED2", [BED2]) @pytest.mark.parametrize("numberOfBins", [30]) @pytest.mark.parametrize("transform", sorted(['obs/exp'])) -@pytest.mark.parametrize("operationType", ['sum', 'mean', 'median']) +@pytest.mark.parametrize("operationType", sorted(['sum', 'mean', 'median'])) @pytest.mark.parametrize("outFilePrefixMatrix", ['outFilePrefix']) @pytest.mark.parametrize("outFileContactPairs", ['outFileContactPairs']) @pytest.mark.parametrize("diagnosticHeatmapFile", [diagnosticHeatmapFile]) diff --git a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_six.py b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_six.py index 80d00a11..ff572633 100755 --- a/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_six.py +++ b/hicexplorer/test/trivial_runs/test_hicAggregateContacts_trivial_runs_six.py @@ -41,7 +41,7 @@ @pytest.mark.parametrize("BED2", [BED2]) @pytest.mark.parametrize("numberOfBins", [30]) @pytest.mark.parametrize("transform", sorted(['none'])) -@pytest.mark.parametrize("operationType", ['sum', 'mean', 'median']) +@pytest.mark.parametrize("operationType", sorted(['sum', 'mean', 'median'])) @pytest.mark.parametrize("outFilePrefixMatrix", ['outFilePrefix']) @pytest.mark.parametrize("outFileContactPairs", ['outFileContactPairs']) @pytest.mark.parametrize("diagnosticHeatmapFile", [diagnosticHeatmapFile]) diff --git a/hicexplorer/utilities.py b/hicexplorer/utilities.py index 54e56670..16e62d15 100644 --- a/hicexplorer/utilities.py +++ b/hicexplorer/utilities.py @@ -263,13 +263,13 @@ def expected_interactions_in_distance(pLength_chromosome, pChromosome_count, pSu expected_interactions[distance_] += pSubmatrix.data[i] count_times_i = np.arange(float(len(expected_interactions))) - pChromosome_count = np.int(pChromosome_count) - pLength_chromosome = np.int(pLength_chromosome) + pChromosome_count = np.int32(pChromosome_count) + pLength_chromosome = np.int32(pLength_chromosome) count_times_i *= pChromosome_count count_times_i -= pLength_chromosome - count_times_i *= np.int(-1) + count_times_i *= np.int32(-1) - expected_interactions /= count_times_i + expected_interactions = np.divide(expected_interactions, count_times_i) # log.debug('exp_obs_matrix_lieberman {}'.format(expected_interactions)) return expected_interactions @@ -289,7 +289,7 @@ def expected_interactions_non_zero(pSubmatrix): for i, distance_ in enumerate(distance): expected_interactions[distance_] += pSubmatrix.data[i] occurences[distance_] += 1 - expected_interactions /= occurences + expected_interactions = np.divide(expected_interactions, occurences) mask = np.isnan(expected_interactions) expected_interactions[mask] = 0 @@ -363,7 +363,7 @@ def expected_interactions(pSubmatrix, pThreads=None): for i in range(pThreads): if queue[i] is not None and not queue[i].empty(): expected_interactions_thread_ = queue[i].get() - if 'Fail: ' in expected_interactions_thread_: + if isinstance(expected_interactions_thread_, str) and 'Fail: ' in expected_interactions_thread_: fail_flag = True fail_message = expected_interactions_thread_ else: @@ -389,7 +389,7 @@ def expected_interactions(pSubmatrix, pThreads=None): # for i, distance_ in enumerate(distance): # expected_interactions[distance_] += pSubmatrix.data[i] # occurences[distance_] += 1 - expected_interactions /= occurrences + expected_interactions = np.divide(expected_interactions, occurrences) mask = np.isnan(expected_interactions) expected_interactions[mask] = 0 @@ -463,7 +463,7 @@ def obs_exp_matrix_lieberman(pSubmatrix, pLength_chromosome, pChromosome_count): expected = expected_interactions_in_distance_[distance] pSubmatrix.data = pSubmatrix.data.astype(np.float32) - pSubmatrix.data /= expected + pSubmatrix.data = np.divide(pSubmatrix.data, expected) pSubmatrix.data = convertInfsToZeros_ArrayFloat(pSubmatrix.data).astype(data_type) return pSubmatrix @@ -498,7 +498,7 @@ def obs_exp_matrix_non_zero(pSubmatrix, ligation_factor=False, pInplace=True, pT if ligation_factor: expected *= row_sums[row[i]] * row_sums[col[i]] / total_interactions - submatrix.data[i] /= expected + submatrix.data[i] = np.divide(submatrix.data[i], expected) if pToEpsilon: epsilon = 0.000000001 @@ -545,11 +545,11 @@ def obs_exp_matrix(pSubmatrix, pInplace=True, pToEpsilon=False, pThreads=None, p expected = expected_interactions_in_distance_[distance] if pInplace: pSubmatrix.data = pSubmatrix.data.astype(np.float32) - pSubmatrix.data /= expected + pSubmatrix.data = np.divide(pSubmatrix.data, expected) pSubmatrix.data = convertInfsToZeros_ArrayFloat(pSubmatrix.data, pToEpsilon).astype(data_type) else: pSubmatrix_copy.data = pSubmatrix_copy.data.astype(np.float32) - pSubmatrix_copy.data /= expected + pSubmatrix_copy.data = np.divide(pSubmatrix_copy.data, expected) pSubmatrix_copy.data = convertInfsToZeros_ArrayFloat(pSubmatrix_copy.data, pToEpsilon).astype(data_type) del expected diff --git a/requirements.txt b/requirements.txt index 66559064..016f53e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,28 @@ -python >= 3.6 +python >= 3.8 numpy >= 1.19.* -scipy >= 1.5.* -matplotlib-base >= 3.1.* -ipykernel >= 5.3.0 -pysam >= 0.16.* -intervaltree >= 3.1.* +scipy >= 1.10 +matplotlib-base >= 3.6 +ipykernel >= 6.25.2 +pysam >= 0.21 +intervaltree >= 3.1 biopython -pytables >= 3.6.* -pandas >= 1.1.* -pybigwig >= 0.3.* -jinja2 >= 2.11 -unidecode >= 1.1.* -hicmatrix >= 16 +pytables >= 3.8 +pandas >= 2.0 +pybigwig >= 0.3 +jinja2 >= 3.1.2 +unidecode >= 1.3 +hicmatrix >= 17 hic2cool >= 0.8.3 -psutil >= 5.7.* -pygenometracks >= 3.5 -fit_nbinom >= 1.1 -cooler >= 0.8.10 +psutil >= 5.9 +pygenometracks >= 3.8 +fit_nbinom >= 1.2 +cooler >= 0.9.3 krbalancing >= 0.0.5 -pybedtools >= 0.8.* +pybedtools >= 0.9 future >= 0.18 -tqdm >= 4.50 -hyperopt >= 0.2.4 -python-graphviz >= 0.14 -scikit-learn >= 0.23.2 -imbalanced-learn >= 0.7.* -cleanlab >= 0.1.* +tqdm >= 4.66 +hyperopt >= 0.2.7 +python-graphviz >= 0.20 +scikit-learn >= 1.3.1 +imbalanced-learn >= 0.11 +cleanlab >= 2.5 diff --git a/setup.py b/setup.py index c8657a47..87304cd4 100755 --- a/setup.py +++ b/setup.py @@ -92,33 +92,33 @@ def checkProgramIsInstalled(self, program, args, where_to_download, sys.stderr.write("Error: {}".format(e)) -install_requires_py = ["numpy >= 1.19.*", - "scipy >= 1.5.*", - "matplotlib >= 3.1.*", - "ipykernel >= 5.3.0", - "pysam >= 0.16", - "intervaltree >= 3.1.*", +install_requires_py = ["numpy >= 1.19", + "scipy >= 1.10", + "matplotlib >= 3.6", + "ipykernel >= 6.25.2", + "pysam >= 0.21", + "intervaltree >= 3.1", "biopython", - "tables >= 3.6.*", - "pandas >= 1.1.*", - "pyBigWig >= 0.3.*", - "cooler >= 0.8.10", - "jinja2 >= 2.11.*", - "unidecode >= 1.1.*", - "hicmatrix >= 16", - "pygenometracks >= 3.5", - "psutil >= 5.7.*", - "fit_nbinom >= 1.1", + "tables >= 3.8", + "pandas >= 2.0", + "pybigwig >= 0.3", + "jinja2 >= 3.1.2", + "unidecode >= 1.3", + "hicmatrix >= 17", "hic2cool >= 0.8.3", + "psutil >= 5.9", + "pygenometracks >= 3.8", + "fit_nbinom >= 1.2", + "cooler >= 0.9.3", "krbalancing >= 0.0.5", - "pybedtools >= 0.8", + "pybedtools >= 0.9", "future >= 0.18", - "tqdm >= 4.50", - "hyperopt >= 0.2.4", - "graphviz >= 0.14", - "scikit-learn >= 0.23.2", - "imbalanced-learn >= 0.7.*", - "cleanlab >= 0.1.*" + "tqdm >= 4.66", + "hyperopt >= 0.2.7", + "graphviz >= 0.20", + "scikit-learn >= 1.3.1", + "imbalanced-learn >= 0.11", + "cleanlab >= 2.5" ]