diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..4fe66e5 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,36 @@ +name: pypi +on: + push: + branches: + - main + pull_request: + branches: + - main +jobs: + pypi: + name: upload to pypi + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Set up conda + uses: conda-incubator/setup-miniconda@v2 + - name: Create env + run: | + conda create -n upload -c conda-forge -q --yes python=3.8 twine hatch + - name: sdist + run: | + conda activate upload + conda info + rm -f dist/* + hatch build -t sdist + - name: upload + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + twine upload dist/* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..24c767a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,55 @@ +name: test + +on: + push: + branches: + - develop + pull_request: + branches: + - develop + - main + +jobs: + build: + name: Test installation + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + max-parallel: 5 + matrix: + os: [ubuntu-latest, macos-latest] + python-version: [ "3.8" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Set up conda + uses: conda-incubator/setup-miniconda@v2 + with: + channels: conda-forge,bioconda,defaults + auto-activate-base: false + activate-environment: sincei + environment-file: requirements.yml + - name: Check environment + run: | + conda info + - name: List tools + run: | + conda list + - name: Install sincei + run: | + pip install . + which sincei + sincei --help + - name: Test with pytest + run: | + conda install pytest + pytest + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable diff --git a/.gitignore b/.gitignore index a5fabd6..e02b023 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ bin/__pycache__ notebooks .DS_Store .ipynb_checkpoints/ +.pytest_cache +build +docs/_build diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..36b3ebe --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,16 @@ +# yaml file to configure readthedocs build +version: 2 +build: + os: ubuntu-20.04 + tools: + python: "3.8" +sphinx: + configuration: docs/conf.py + # disable this for more lenient docs builds + fail_on_warning: false +python: + install: + - method: pip + path: . + extra_requirements: + - doc diff --git a/LICENCE.txt b/LICENCE.txt index 96e1ff7..ea2730c 100644 --- a/LICENCE.txt +++ b/LICENCE.txt @@ -1,6 +1,6 @@ MIT license: -Copyright 2021 +Copyright 2021 Vivek Bhardwaj Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/README.md b/README.md index 3254d7d..02cde4a 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,29 @@ -sincei logo - -Bhardwaj V. 
(2022) sincei: A user-friendly toolkit for QC, counting, clustering and plotting of single-cell (epi)genomics data. [![DOI](https://zenodo.org/badge/271841139.svg)](https://zenodo.org/badge/latestdoi/271841139) +## sincei: A user-friendly toolkit for QC, counting, clustering and plotting of single-cell (epi)genomics data. +[![DOI](https://zenodo.org/badge/271841139.svg)](https://zenodo.org/badge/latestdoi/271841139) [![Documentation Status](https://readthedocs.org/projects/sincei/badge/?version=latest)](https://sincei.readthedocs.io/en/latest/?badge=latest) [![PyPI Version](https://img.shields.io/pypi/v/sincei.svg?style=plastic)](https://pypi.org/project/sincei/) [![test](https://github.com/vivekbhr/sincei/actions/workflows/test.yml/badge.svg)](https://github.com/vivekbhr/sincei/actions/workflows/test.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +## [Full Documentation](http://sincei.rtfd.io/). ## Installation sincei is a command line toolkit based on python3, and can be installed using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). -Create a new conda environment and install sincei using: +Create a new conda environment and install sincei stable release from github using: ``` -cd -conda create -n sincei -c bioconda -c conda-forge scanpy deeptools +cd +conda create -n sincei -c conda-forge -c bioconda scanpy deeptools conda activate sincei (sincei): pip install --editable=git+https://github.com/vivekbhr/sincei.git@master#egg=sincei ``` +For the development version, try: + +``` +(sincei): pip install --editable=git+https://github.com/vivekbhr/sincei.git@develop#egg=sincei +``` + ## Usage **Get the tool list with `sincei --help`** @@ -26,14 +32,4 @@ Each tool begins with the prefix sc, such as: $ scBulkCoverage -b file1.bam -g groupinfo.txt -o coverage -[ Tools for a typical single-cell analysis workflow ] (WIP: work in progress/not available yet) - - scFilterBarcodes Identify and filter cell barcodes from BAM file (for droplet-based single-cell seq) - scFilterStats Produce per-cell statistics after filtering reads by user-defined criteria. - scCountReads Counts reads for each barcode on genomic bins or user-defined features. - scCountQC Perform quality control and filter the output of scCountReads. - scCombineCounts [WIP] Concatenate/merge the counts from different samples/batches or modalities - scClusterCells Perform dimensionality reduction and clustering on the output of scCountReads. - scFindMarkers [WIP] Find marker genes per group, given the output of scCountReads and a user-defined group. - scBulkCoverage Get pseudo-bulk coverage per group using a user-supplied cell->group mapping (output of scClusterCells). - scFeaturePlot [WIP] Plot the counts for a given feature on a UMAP or on a (IGV-style) genomic-track. 
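As a reading aid for the tool list above, here is a rough sketch of how these commands could be chained into a typical single-cell workflow. It is assembled only from the usage strings quoted elsewhere in this patch; all file names (sample.bam, whitelist.txt, counts, umap.png, etc.) are placeholders, and the exact options for a given dataset may differ.

```
# detect cell barcodes present in a BAM file (droplet-based protocols)
scFilterBarcodes -b sample.bam -w whitelist.txt > barcodes_detected.txt

# count reads per barcode on genomic bins -> counts.loom
scCountReads bins --bamfiles sample.bam --barcodes barcodes_detected.txt -o counts

# compute per-cell and per-region QC metrics (add -fc/-fr and -o to filter)
scCountQC -i counts.loom -om qc_metrics.tsv

# dimensionality reduction, clustering and UMAP; also writes umap.tsv with cluster ids
scClusterCells -i counts.loom -o clustered.loom -op umap.png

# pseudo-bulk coverage per cluster, using the barcode->cluster mapping from scClusterCells
scBulkCoverage -b sample.bam --labels sample -g umap.tsv -o coverage.bw
```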
+ diff --git a/bin/scBulkCoverage b/bin/scBulkCoverage deleted file mode 100755 index 8219907..0000000 --- a/bin/scBulkCoverage +++ /dev/null @@ -1,465 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# own tools -import argparse -import sys -import os -import itertools -import numpy as np -import pandas as pd -from deeptools.getScaleFactor import get_scale_factor -from deeptools.bamHandler import openBam -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -## own Functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -import ParserCommon -import WriteBedGraph - -debug = 0 - - -def parseArguments(): - io_args = ParserCommon.inputOutputOptions(opts=['bamfiles', 'groupInfo', 'outFilePrefix']) - bam_args = ParserCommon.bamOptions(default_opts={'binSize': 100}) - read_args = ParserCommon.readOptions() - filter_args = ParserCommon.filterOptions() - other_args = ParserCommon.otherOptions() - parser = \ - argparse.ArgumentParser( - parents=[io_args, get_args(), bam_args, filter_args, read_args, other_args], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool takes alignments of reads or fragments ' - 'as input (BAM files), along with cell grouping information, such as ' - 'barcode -> batch, or barcode -> cluster, as tsv file, and generates a ' - 'coverage track (bigWig or bedGraph) per group as output. ' - 'The coverage is calculated as the number of reads per bin, ' - 'where bins are short consecutive counting windows of a defined ' - 'size. It is possible to extended/change the length of the reads ' - 'to better reflect the actual fragment length. *scBulkCoverage* ' - 'offers normalization per cluster using different methods \n', - usage='An example usage is:' - '$ scBulkCoverage -b file1.bam file2.bam --labels file1 file2 -g scClusterCells_output.tsv -o coverage.bw', - add_help=False) - - return parser - - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - - - optional = parser.add_argument_group('Coverage-related Options') - optional.add_argument('--outFileFormat', '-of', - help='Output file type. Either "bigwig" or "bedgraph".', - choices=['bigwig', 'bedgraph'], - default='bigwig') - - optional.add_argument('--normalizeUsing', '-n', - help='How to normalize the pseudo-bulk counts. Options are ' - '"CPM": normalized each bin to the counts per million mapped reads in that group.\n' - '"Frequency": binarize the coverage per bin and normalize to the total no. of cells per group. \n' - '"Mean": get mean signal per bin across cells in each group.\n' - '"None": simply return the sum of coverage per group.', - choices=['CPM', 'Frequency', 'Mean', 'None'], - default='CPM') - - optional.add_argument('--ignoreForNormalization', '-ig', - help='Chromosomes to skip while calculating normalization factors', - nargs='+', - default=None) - - optional.add_argument('--normalizeByReference', '-nr', - help='NOT IMPLEMENTED: Normalize each group of cells by a reference group (which must be present in the --groupinfo file)' - 'Note that the --normalizeUsing method is applied beforehand.', - choices=['ratio', 'log2_ratio', 'difference', 'None'], - default=None) - - optional.add_argument('--scaleFactor', - help='The computed scaling factor (or 1, if not applicable) will ' - 'be multiplied by this. (Default: %(default)s)', - default=1.0, - type=float, - required=False) - - optional.add_argument('--MNase', - help='Determine nucleosome positions from MNase-seq/CUTnRUN data. 
' - 'Only 3 nucleotides at the center of each fragment are counted. ' - 'The fragment ends are defined by the two mate reads. Only fragment lengths' - 'between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. ' - 'By default, any fragments smaller or larger than this are ignored. To ' - 'over-ride this, use the --minFragmentLength and --maxFragmentLength options, ' - 'which will default to 130 and 200 if not otherwise specified in the presence ' - 'of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.', - action='store_true') - - optional.add_argument('--Offset', - help='Uses this offset inside of each read as the signal. This is useful in ' - 'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the ' - 'start of the read. This can be paired with the --filterRNAstrand option. ' - 'Note that negative values indicate offsets from the end of each read. A value ' - 'of 1 indicates the first base of the alignment (taking alignment orientation ' - 'into account). Likewise, a value of -1 is the last base of the alignment. An ' - 'offset of 0 is not permitted. If two values are specified, then they will be ' - 'used to specify a range of positions. Note that specifying something like ' - '--Offset 5 -1 will result in the 5th through last position being used, which ' - 'is equivalent to trimming 4 bases from the 5-prime end of alignments. Note ' - 'that if you specify --centerReads, the centering will be performed before the ' - 'offset.', - metavar='INT', - type=int, - nargs='+', - required=False) - - - return parser - - - -def main(args=None): - - args = ParserCommon.validateInputs(parseArguments().parse_args(args), process_barcodes=False)#newlabels are discarded, since they are re-created from groupInfo file - global debug - if args.verbose: - sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor)) - debug = 1 - else: - debug = 0 - - ## read the group info file (make it more robust) - ## if the no. of columns are 3, expect "sample", "barcode", "cluster", if 4, expect sample:bc, umap1, umap2, cluster - df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#") - if len(df.columns) == 3: - df.columns = ['sample', 'barcode', 'cluster'] - elif len(df.columns) == 4: - df.columns = ['barcode', 'umap1', 'umap2', 'cluster'] - df['sample'] = [x.split("::")[:-1] for x in df.barcode] - df['barcode'] = [x.split("::")[-1] for x in df.barcode] - else: - sys.exit("*Error*:No. of columns in --groupInfo file not recognized. " - "Please provide either 3 (sample, barcode, group) or 4 (sample::barcode, umap1, umap2, group) column file") - df.index = df[['sample', 'barcode']].apply(lambda x: '::'.join(x), axis=1) - #barcodes = groupInfo.barcode.unique().tolist() - - ## match the sample labels with groupInfo labels before proceeding - ## create new DF with user-provided bam+labels (this would match the counts) - - # check that sample names in df and --labels match - labels_not_in_df = set(args.labels).difference(set(df['sample'])) - df_not_in_labels = set(df['sample']).difference(set(args.labels)) - if bool(df_not_in_labels): - sys.stderr.write("Some (or all) of the samples indicated in the groupInfo file " - "are absent from the bam file labels! \n" - "Mismatched samples are: {} \n".format(df_not_in_labels)) - elif bool(labels_not_in_df): - sys.stderr.write("Some (or all) of the samples indicated in --labels " - "are absent from the in the groupInfo file! 
\n" - "Mismatched samples are: {} \n".format(labels_not_in_df)) - - # make another df, containing union of all barcodes, and in the same order per sample - barcodes = df['barcode'].unique().tolist() - sm = list(itertools.chain.from_iterable(itertools.repeat(lab, len(barcodes)) for lab in args.labels)) - bc = barcodes*len(args.labels) - groupInfo = pd.DataFrame({"sample":sm, "barcode":bc}) - groupInfo.index = groupInfo[['sample', 'barcode']].apply(lambda x: '::'.join(x), axis=1) - groupInfo = pd.merge(groupInfo, df['cluster'], how="left", left_index=True, right_index=True, sort=False) - groupInfo = groupInfo.reset_index()[['sample', 'barcode', 'cluster']] - # re-construct new labels (sample+bc) - newlabels = ['::'.join([x,y]) for x, y in zip(groupInfo['sample'], groupInfo['barcode'])] - # Normalization options - scale_factor = args.scaleFactor - func_args = {'scaleFactor': args.scaleFactor } - - coverageAsFrequency=False - if not args.ignoreForNormalization: - args.ignoreForNormalization = [] - if args.normalizeUsing == 'None': - args.normalizeUsing = None # For the sake of sanity - elif args.normalizeUsing == 'Frequency': - coverageAsFrequency = True - args.normalizeUsing = 'Mean'# mean of binarized counts shall give us frequency - - - # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used - if args.filterRNAstrand and not args.Offset: - args.Offset = [1, -1] - - if args.MNase: - # check that library is paired end - # using getFragmentAndReadSize - from deeptools.getFragmentAndReadSize import get_read_and_fragment_length - fraglengths = [get_read_and_fragment_length(file, - return_lengths=False, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) for file in args.bamfiles] - if any([x[0] is None for x in fraglengths]): - sys.exit("*Error*: For the --MNAse function a paired end library is required. ") - - # Set some default fragment length bounds - if args.minFragmentLength == 0: - args.minFragmentLength = 130 - if args.maxFragmentLength == 0: - args.maxFragmentLength = 200 - - wr = CenterFragment(args.bamfiles, - binLength=args.binSize, - stepSize=args.binSize, - barcodes=barcodes, - clusterInfo=groupInfo, - cellTag=args.cellTag, - groupTag=args.groupTag, - groupLabels=newlabels, - motifFilter=args.motifFilter, - genome2bit=args.genome2bit, - GCcontentFilter=args.GCcontentFilter, - region=args.region, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - duplicateFilter=args.duplicateFilter, - center_read=args.centerReads, - zerosToNans=False, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - chrsToSkip=args.ignoreForNormalization, - binarizeCoverage=coverageAsFrequency, - verbose=args.verbose, - ) - - elif args.Offset: - if len(args.Offset) > 1: - if args.Offset[0] == 0: - sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.") - if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]: - sys.exit("'Error*: The right side bound is less than the left-side bound. 
This is inappropriate.") - else: - if args.Offset[0] == 0: - sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.") - wr = OffsetFragment(args.bamfiles, - binLength=args.binSize, - stepSize=args.binSize, - barcodes=barcodes, - clusterInfo=groupInfo, - cellTag=args.cellTag, - groupTag=args.groupTag, - groupLabels=newlabels, - motifFilter=args.motifFilter, - genome2bit=args.genome2bit, - GCcontentFilter=args.GCcontentFilter, - region=args.region, - numberOfProcessors=args.numberOfProcessors, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - duplicateFilter=args.duplicateFilter, - center_read=args.centerReads, - zerosToNans=False, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - chrsToSkip=args.ignoreForNormalization, - binarizeCoverage=coverageAsFrequency, - verbose=args.verbose) - wr.filter_strand = args.filterRNAstrand - wr.Offset = args.Offset - else: - wr = WriteBedGraph.WriteBedGraph(args.bamfiles, - binLength=args.binSize, - stepSize=args.binSize, - barcodes=barcodes, - clusterInfo=groupInfo, - cellTag=args.cellTag, - groupTag=args.groupTag, - groupLabels=newlabels, - motifFilter=args.motifFilter, - genome2bit=args.genome2bit, - GCcontentFilter=args.GCcontentFilter, - region=args.region, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - duplicateFilter=args.duplicateFilter, - center_read=args.centerReads, - zerosToNans=False, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - chrsToSkip=args.ignoreForNormalization, - binarizeCoverage=coverageAsFrequency, - verbose=args.verbose - ) - - wr.run(WriteBedGraph.scaleCoverage, func_args, args.outFilePrefix, - blackListFileName=args.blackListFileName, normUsing=args.normalizeUsing, - format=args.outFileFormat, smoothLength=None) - - -class OffsetFragment(WriteBedGraph.WriteBedGraph): - """ - Class to redefine the get_fragment_from_read for the --Offset case - """ - def filterStrand(self, read, rv): - """ - A generic read filtering function that gets used by everything in this class. - - rv is returned if the strand is correct, otherwise [(None, None)] - """ - # Filter by RNA strand, if desired - if read.is_paired: - if self.filter_strand == 'forward': - if read.flag & 144 == 128 or read.flag & 96 == 64: - return rv - elif self.filter_strand == 'reverse': - if read.flag & 144 == 144 or read.flag & 96 == 96: - return rv - else: - return rv - else: - if self.filter_strand == 'forward': - if read.flag & 16 == 16: - return rv - elif self.filter_strand == 'reverse': - if read.flag & 16 == 0: - return rv - else: - return rv - - return [(None, None)] - - def get_fragment_from_read_list(self, read, offset): - """ - Return the range of exons from the 0th through 1st bases, inclusive. 
Positions are 1-based - """ - rv = [(None, None)] - blocks = read.get_blocks() - blockLen = sum([x[1] - x[0] for x in blocks]) - - if self.defaultFragmentLength != 'read length': - if self.is_proper_pair(read, self.maxPairedFragmentLength): - if read.is_reverse: - foo = (read.next_reference_start, read.reference_start) - if foo[0] < foo[1]: - blocks.insert(0, foo) - else: - foo = (read.reference_end, read.reference_end + abs(read.template_length) - read.infer_query_length()) - if foo[0] < foo[1]: - blocks.append(foo) - - # Extend using the default fragment length - else: - if read.is_reverse: - foo = (read.reference_start - self.defaultFragmentLength + read.infer_query_length(), read.reference_start) - if foo[0] < 0: - foo = (0, foo[1]) - if foo[0] < foo[1]: - blocks.insert(0, foo) - else: - foo = (read.reference_end, read.reference_end + self.defaultFragmentLength - read.infer_query_length()) - if foo[0] < foo[1]: - blocks.append(foo) - - stretch = [] - # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40] - # Then subset accordingly - for block in blocks: - stretch.extend(range(block[0], block[1])) - if read.is_reverse: - stretch = stretch[::-1] - - # Handle --centerReads - if self.center_read: - _ = (len(stretch) - blockLen) // 2 - stretch = stretch[_:_ + blockLen] - - # Subset by --Offset - try: - foo = stretch[offset[0]:offset[1]] - except: - return rv - - if len(foo) == 0: - return rv - if read.is_reverse: - foo = foo[::-1] - - # Convert the stretch back to a list of tuples - foo = np.array(foo) - d = foo[1:] - foo[:-1] - idx = np.argwhere(d > 1).flatten().tolist() # This now holds the interval bounds as a list - idx.append(-1) - last = 0 - rv = [] - for i in idx: - rv.append((foo[last].astype("int"), foo[i].astype("int") + 1)) - last = i + 1 - - # Handle strand filtering, if needed - return self.filterStrand(read, rv) - - def get_fragment_from_read(self, read): - """ - This is mostly a wrapper for self.get_fragment_from_read_list(), - which needs a list and for the offsets to be tweaked by 1. - """ - offset = [x for x in self.Offset] - if len(offset) > 1: - if offset[0] > 0: - offset[0] -= 1 - if offset[1] < 0: - offset[1] += 1 - else: - if offset[0] > 0: - offset[0] -= 1 - offset = [offset[0], offset[0] + 1] - else: - if offset[0] < -1: - offset = [offset[0], offset[0] + 1] - else: - offset = [offset[0], None] - if offset[1] == 0: - # -1 gets switched to 0, which screws things up - offset = (offset[0], None) - return self.get_fragment_from_read_list(read, offset) - - -class CenterFragment(WriteBedGraph.WriteBedGraph): - """ - Class to redefine the get_fragment_from_read for the --MNase case - - The coverage of the fragment is defined as the 2 or 3 basepairs at the - center of the fragment length. 
- """ - def get_fragment_from_read(self, read): - """ - Takes a proper pair fragment of high quality and limited - to a certain length and outputs the center - """ - fragment_start = fragment_end = None - - # only paired forward reads are considered - # Fragments have already been filtered according to length - if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen): - # distance between pairs is even return two bases at the center - if read.tlen % 2 == 0: - fragment_start = read.pos + read.tlen / 2 - 1 - fragment_end = fragment_start + 2 - - # distance is odd return three bases at the center - else: - fragment_start = read.pos + read.tlen / 2 - 1 - fragment_end = fragment_start + 3 - - return [(fragment_start, fragment_end)] - - -if __name__ == "__main__": - main() diff --git a/bin/scClusterCells b/bin/scClusterCells deleted file mode 100755 index c993647..0000000 --- a/bin/scClusterCells +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import os -import argparse -import numpy as np -import pandas as pd -from scipy import sparse, io -from sklearn.preprocessing import binarize -# plotting -import matplotlib -import matplotlib.pyplot as plt -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' - -# single-cell stuff -import anndata -import scanpy as sc - - -## own Functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -import ParserCommon -from Clustering import LSA_gensim - -def parseArguments(): - io_args = ParserCommon.inputOutputOptions(opts=['loomfile', 'outFile'], requiredOpts=['outFile']) - plot_args = ParserCommon.plotOptions() - other_args = ParserCommon.otherOptions() - - parser = argparse.ArgumentParser(parents=[io_args, get_args(), plot_args, other_args], - formatter_class=argparse.ArgumentDefaultsHelpFormatter,#argparse.RawDescriptionHelpFormatter, - description=""" - This tool clusters the cells based on the input count matrix (output of scCountReads) and performs dimentionality reduction, - community detection and 2D projection (UMAP) of the cells. The result is an updated loom object, and (optionally) a plot file - and a tsv file with UMAP coordinates and corresponding cluster id for each barcode. - """, - usage='Example usage: scClusterCells -i cellCounts.loom -o clustered.loom -op .png > log.txt', - add_help=False) - - return parser - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - - general = parser.add_argument_group('Clustering Options') - general.add_argument('--outFileUMAP', '-op', - type=str, - required=False, - help='The output plot file (for UMAP). If you specify this option, another file with the same ' - 'prefix (and .txt extention) is also created with the raw UMAP coordinates.') - - general.add_argument('--outFileTrainedModel', '-om', - type=argparse.FileType('w'), - required=False, - help='The output file for the trained LSI model. The saved model can be used later to embed/compare new cells ' - 'to the existing cluster of cells.') - - general.add_argument('--method', '-m', - type=str, - choices=['LSA'], - default='LSA', - help='The dimentionality reduction method for clustering. 
(Default: %(default)s)') - - general.add_argument('--binarize', - action='store_true', - help='Binarize the counts per region before dimentionality reduction (only for LSA/LDA)') - - general.add_argument('--nPrinComps', '-n', - default=20, - type=int, - help='Number of principle components to reduce the dimentionality to. ' - 'Use higher number for samples with more expected heterogenity. (Default: %(default)s)') - - general.add_argument('--nNeighbors', '-nk', - default=30, - type=int, - help='Number of nearest neighbours to consider for clustering and UMAP. This number should be chosen considering ' - 'the total number of cells and expected number of clusters. Smaller number will lead to more fragmented clusters. ' - '(Default: %(default)s)') - - general.add_argument('--clusterResolution', '-cr', - default=1.0, - type=float, - help='Resolution parameter for clustering. Values lower than 1.0 would result in less clusters, ' - 'while higher values lead to splitting of clusters. In most cases, the optimum value would be between ' - '0.8 and 1.2. (Default: %(default)s)') - - return parser - - -def main(args=None): - args = parseArguments().parse_args(args) - - adata = sc.read_loom(args.input, obs_names='obs_names', var_names='var_names') - - ## LSA and clustering based on gensim - mtx = sparse.csr_matrix(adata.X.transpose()) - if args.binarize: - mtx=binarize(mtx, copy=True) - corpus_lsi, cell_topic, corpus_tfidf = LSA_gensim(mtx, list(adata.obs.index), list(adata.var.index), nTopics = args.nPrinComps, smartCode='lfu') - - ## update the anndata object, drop cells which are not in the anndata - adata=adata[cell_topic.index] - adata.obsm['X_pca']=np.asarray(cell_topic.iloc[:,1:args.nPrinComps]) - sc.pp.neighbors(adata, use_rep='X_pca', n_neighbors=args.nNeighbors) - sc.tl.leiden(adata, resolution=args.clusterResolution) - sc.tl.paga(adata) - sc.pl.paga(adata, plot=False, threshold=0.1) - sc.tl.umap(adata, min_dist=0.1, spread=5, init_pos='paga') - - adata.write_loom(args.outFile, write_obsm_varm=True) - - if args.outFileUMAP: - ## plot UMAP - fig=sc.pl.umap(adata, color='leiden', legend_loc='on data', return_fig=True, show=False) - fig.savefig(args.outFileUMAP, dpi=200, format=args.plotFileFormat) - prefix=args.outFileUMAP.split(".")[0] - umap_lsi = pd.DataFrame(adata.obsm['X_umap'], columns=['UMAP1', 'UMAP2'], index=adata.obs.index) - umap_lsi['cluster'] = adata.obs['leiden'] - umap_lsi.to_csv(prefix+".tsv", sep = "\t", index_label='cell_id') - #plt.rcParams['font.size'] = 8.0 - # convert cm values to inches - #fig = plt.figure(figsize=(args.plotWidth / 2.54, args.plotHeight / 2.54)) - #fig.suptitle('LSA-UMAP', y=(1 - (0.06 / args.plotHeight))) - #plt.scatter(umap_lsi.UMAP1, umap_lsi.UMAP2, s=5, alpha = 0.8, c=[sns.color_palette()[x] for x in list(umap_lsi.cluster)]) - #plt.tight_layout() - #plt.savefig(args.outFileUMAP, dpi=200, format=args.plotFileFormat) - #plt.close() - - # save if asked - if args.outFileTrainedModel: - corpus_lsi.save(args.outFileTrainedModel) -# if args.outGraph: -# 'The output file for the Graph object (lgl format) which can be used for further clustering/integration.' 
-# graph.write_lgl(args.outGraph) - - return 0 - -if __name__ == "__main__": - main() diff --git a/bin/scCombineCounts b/bin/scCombineCounts deleted file mode 100755 index d218bb7..0000000 --- a/bin/scCombineCounts +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import os -import argparse -import numpy as np -import pandas as pd -from scipy import sparse, io -# single-cell stuff -import anndata -import scanpy as sc - - -## own Functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -import ParserCommon -from Clustering import LSA_gensim - -def parseArguments(): - other_args = ParserCommon.otherOptions() - - parser = argparse.ArgumentParser(parents=[get_args(), other_args], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=""" - This tool combines multiple count matrices (output of scCountReads) into one, either assuming they are different samples (multi-sample) - or different measurements on the same set of cells (multi-modal). The result is a .loom file with combined counts. NOTE: it doesn't perform - any 'batch effect correction' or 'integration' of data from different technologies, which requires more sophisticated methods. - """, - usage='Example usage: scCombineCounts -i sample1.loom sample2.loom -o combined.loom > log.txt', - add_help=False) - - return parser - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - - general = parser.add_argument_group('General Options') - - general.add_argument('--input', '-i', - metavar='LOOM', - help='Input files in .loom format', - nargs='+', - required=True) - - general.add_argument('--outFile', '-o', - type=str, - help='The file to write results to. For method: `multi-sample`, the output ' - 'file is an updated .loom object, which can be used by other tools. ' - 'For method: `multi-omic`, the output file is an .hdf5 file. This file can only be ' - 'used by scClusterCells, to perform multi-modal clustering. ', - required=True) - - general.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. Multiple labels have to be separated by a space, e.g. ' - '--labels sample1 sample2 sample3', - nargs='+') - - general.add_argument('--method', '-m', - type=str, - choices=['multi-sample', 'multi-modal'], - default='multi-sample', - help='How to merge the counts from the provided samples. ' - '`multi-sample`: assumes that each sample is the independent, ' - 'but were counted in the same manner (i.e. on same features), therefore ' - 'it looks for feature overlaps, but not for barcode overlaps. 
' - '`multi-modal`: assumes that the counts were generated in 2 different ways, ' - 'but from the same set of cells (for example, using a multi-omic technology), ' - 'therefore it looks for the overlap of cell barcodes, but not for the overlaps ' - 'of features (Default: %(default)s)') - - - return parser - - -def main(args=None): - args = parseArguments().parse_args(args) - if args.method != 'multi-sample': - sys.stderr.write("Only multi-sample method is currently implemented") - sys.exit(1) - - if args.labels and len(args.input) != len(args.labels): - print("The number of labels does not match the number of input files.") - sys.exit(1) - if not args.labels: - if args.smartLabels: - args.labels = smartLabels(args.input) - else: - args.labels = [os.path.basename(x) for x in args.input] - adata_list = [sc.read_loom(x,obs_names='obs_names', var_names='var_names') for x in args.input] - - ## concatenate labels and match chrom, start, end - var_list=[] - var_cols=['chrom', 'start', 'end'] - for lab, ad in zip(args.labels, adata_list): - obs = ad.obs_names.to_list() - lab = [lab]*len(obs) - new_names=['_'.join([x,y]) for x,y in zip(lab, obs)] - ad.obs_names = new_names - hasinfo = all([x in ad.var.columns for x in var_cols]) - var_list.append(hasinfo) - - ## keep the chrom, start, end from original sample if present - adata = anndata.concat(adata_list) - if all(var_list): - var_df = adata_list[0].var[var_cols] - adata.var = adata.var.join(var_df) - else: - sys.stderr.write("WARNING: Not all input files contain the 'chrom', 'start', 'end' information. " - "The output will lack these fields. This might cause an error in some downstream tools") - - sys.stdout.write("Combined cells: {} \n".format(adata.shape[0])) - sys.stdout.write("Combined features: {} \n".format(adata.shape[1])) - adata.write_loom(args.outFile) - return 0 - -if __name__ == "__main__": - main() diff --git a/bin/scCountQC b/bin/scCountQC deleted file mode 100755 index c9f34ae..0000000 --- a/bin/scCountQC +++ /dev/null @@ -1,232 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import os -import re -import argparse -import numpy as np -import pandas as pd -from scipy import sparse, io -from itertools import compress -import copy -import scanpy as sc -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -## own Functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -import ParserCommon -from Utilities import gini - -### ------ Functions ------ -def filter_adata(adata, filter_region_dict=None, - filter_cell_dict=None, bad_chrom=None, bad_cells=None): - # 1. regions - if bad_chrom: - adata=adata[:, ~adata.var.chrom.isin(bad_chrom)] - if filter_region_dict: - for key in filter_region_dict.keys(): - adata=adata[:, (adata.var[key] >= filter_region_dict[key][0]) & - (adata.var[key] <= filter_region_dict[key][1]) ] - - # 2. 
Cells - if bad_cells: - adata=adata[~adata.obs.index.isin(bad_cells)] - if filter_cell_dict: - for key in filter_cell_dict.keys(): - adata=adata[(adata.obs[key] >= filter_cell_dict[key][0]) & - (adata.obs[key] <= filter_cell_dict[key][1]), : ] - - return adata - -''' -def make_plots(adata, fname=None): - # plotting - import matplotlib - import matplotlib.pyplot as plt - matplotlib.use('Agg') - matplotlib.rcParams['pdf.fonttype'] = 42 - matplotlib.rcParams['svg.fonttype'] = 'none' - import seaborn as sns# seaborn colormaps conflicts with deeptools colormaps - - ## filtering of regions - plt.figure() - pl1=sns.violinplot(data=adata.var, x='total_counts', y='chrom') - plt.figure() - pl2=sns.distplot(np.log10(adata.var['total_counts']+1)) - plt.figure() - pl3=sns.distplot(adata.var['mean_counts']) - plt.figure() - pl4=sns.distplot(adata.var['fraction_cells_with_signal']) - ## cells - plt.figure() - pl5=sns.scatterplot(data=adata.obs, x='total_counts', y='pct_counts_in_top_100_genes') - pl5.set(xscale="log") - plt.figure() - pl6=sns.distplot(adata.obs['fraction_regions_with_signal']) - plt.figure() - pl7=sns.scatterplot(data=adata.obs, x='total_counts', y='fraction_regions_with_signal') - - plist=[pl1, pl2, pl3, pl4, pl5, pl6, pl7] - if fname: - with PdfPages(fname) as pp: - for plot in plist: - pp.savefig(plot.figure) - return plist - - general.add_argument('--outPlot', '-op', - type=str, - help='The output plot file. This describes the distribution of filtering metrics pre and post filtering') - if args.outPlot: - make_plots(adata, fname=args.outPlot) - if cellfilter or regionfilter or badcells or badchrom: - make_plots(adata_filt, fname=args.outPlot+".filtered.pdf") -''' - -def parseArguments(): - io_args = ParserCommon.inputOutputOptions(opts=['loomfile', 'outFile']) - plot_args = ParserCommon.plotOptions() - other_args = ParserCommon.otherOptions() - parser = argparse.ArgumentParser(parents=[io_args, get_args(), plot_args, other_args], - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" - This tool performs calculates multiple quality controls metrics on the input .loom file (output of scCountReads) - and (optionally) filters the input file based on filterCellArgs/filterRegionArgs. The output is either an - updated .loom object (if filtering is requested) or the filtering metrics (--outMetrics) and plots (--outPlot). - """, - usage='Example usage: scCountQC -i cellCounts.loom -om qc_metrics.tsv > log.txt', - add_help=False) - - return parser - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - - general = parser.add_argument_group('Filtering arguments') - general.add_argument('--describe', '-d', - action='store_true', - help='Print a list of cell and region metrics available for QC/filtering.') - - general.add_argument('--outMetrics', '-om', - type=str, - help='Prefix of the output file with calculated QC metrics. If given, the cell metrics are printed in ' - '.cell.tsv and region metrics as .region.tsv') - - general.add_argument('--filterCellArgs', '-fc', - type=str, - help='List of arguments to filter cells. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue, maxvalue; ...." ' - 'where arg_name is the QC metric for cells present in the input loom file. In order to view all available ' - 'cell filtering metrics, run scCountFilter with "--describe". The two arguments are supplied (minvalue, maxvalue) ' - 'they are used as lower and upper bounds to filter cells. 
Make sure they are float/integer numbers.') - - general.add_argument('--filterRegionArgs', '-fr', - type=str, - help='List of arguments to filter regions. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue; ...." ' - 'where arg_name is the QC metric for regions present in the input loom file. In order to view all available ' - 'cell filtering metrics, run scCountFilter with "--describe". The two arguments are supplied (minvalue, maxvalue) ' - 'they are used as lower and upper bounds to filter cells. Make sure they are float/integer numbers.') - - general.add_argument('--cell_blacklist', '-cb', - default=None, - type=argparse.FileType('r'), - help='A list of barcodes to be included for the clustering. The barcodes ' - '(along with sample labels) must be present in the input object.') - - general.add_argument('--chrom_blacklist', '-chb', - default=None, - type=str, - help='A comma separated list of chromosomes to exclude. eg. chrM, chrUn') - - return parser - - -def main(args=None): - args = parseArguments().parse_args(args) - - adata = sc.read_loom(args.input, obs_names='obs_names', var_names='var_names') - ## add QC stats to the anndata object - # 1. scanpy metrics # fraction of regions/genes with signal are included in the metrics (pct_dropouts/n_genes_by_counts) - try: - sc.pp.calculate_qc_metrics(adata, inplace=True) - except IndexError:# not enough genes/regions - sys.stderr.write('\n Error: Too few regions in the input file to perform QC \n') - exit() - #pool = multiprocessing.Pool(args.numberOfProcessors) - - #func=partial(gini, X=adata.X) - #gini_list = pool.map(func, range(adata.shape[0]) ) - gini_list = [gini(i, adata.X) for i in range(adata.shape[0])] - adata.obs['gini_coefficient'] = gini_list - - if args.outMetrics: - mat=re.sub(".txt|.tsv|.csv","",args.outMetrics) - obs_tsv = mat+".cells.tsv" - var_tsv = mat+".regions.tsv" - adata.obs.to_csv(obs_tsv, sep="\t", index_label='cell_id') - adata.var.to_csv(var_tsv, sep="\t", index_label='feature_id') - - # if --describe is asked, only print the numeric vars and obs columns - if args.describe: - - is_num_col=[(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.obs.dtypes] - cols=adata.obs.loc[:, is_num_col] - sys.stdout.write("\n Cell metrics: \n") - sys.stdout.write("Total cells: {} \n".format(adata.shape[0])) - print(pd.DataFrame({'min': cols.min(), 'max': cols.max()})) - - is_num_row=[(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.var.dtypes] - rows=adata.var.loc[:, is_num_row] - sys.stdout.write("\n Feature metrics: \n") - sys.stdout.write("Total features: {} \n".format(adata.shape[1])) - print(pd.DataFrame({'min': rows.min(), 'max': rows.max()})) - exit() - - if args.filterCellArgs: - cellfilter=dict() - for x in args.filterCellArgs.strip().split(";"): - key=x.strip().split(":")[0] - v=x.strip().split(":")[1] - value=[float(x) for x in v.strip().split(",")] - cellfilter[key] = value - else: - cellfilter=None - if args.filterRegionArgs: - regionfilter=dict() - for x in args.filterRegionArgs.strip().split(";"): - key=x.strip().split(":")[0] - v=x.strip().split(":")[1] - value=[float(x) for x in v.strip().split(",")] - regionfilter[key] = value - else: - regionfilter=None - - if args.cell_blacklist: - ## read the barcode file - with open(args.cell_blacklist, 'r') as f: - badcells = f.read().splitlines() - f.close() - else: - badcells=None - - if args.chrom_blacklist: - badchrom=args.chrom_blacklist.strip().split(",") - else: - badchrom=None - - if 
cellfilter or regionfilter or badcells or badchrom: - sys.stdout.write("Applying filters \n") - adata_filt=filter_adata(adata, - filter_region_dict=regionfilter, - filter_cell_dict=cellfilter, - bad_chrom=badchrom, - bad_cells=badcells) - sys.stdout.write("Remaining cells: {} \n".format(adata_filt.shape[0])) - sys.stdout.write("Remaining features: {} \n".format(adata_filt.shape[1])) - adata_filt.write_loom(args.outFile) - - return 0 - -if __name__ == "__main__": - main() diff --git a/bin/scCountReads b/bin/scCountReads deleted file mode 100755 index 48e25d4..0000000 --- a/bin/scCountReads +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import os -import sys -import argparse -import numpy as np -from scipy import sparse, io -import re -import pandas as pd -import anndata as ad -import scanpy as sc -from deeptools import parserCommon -from deeptools.utilities import smartLabels -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -# own functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) - -sys.path.append(scriptdir) -import ReadCounter as countR -import ParserCommon -old_settings = np.seterr(all='ignore') - - -def parseArguments(args=None): - parser = \ - argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" - ``scCountReads`` computes the read coverages per cell barcode for genomic regions in the provided BAM file(s). - The analysis can be performed for the entire genome by running the program in 'bins' mode. - If you want to count the read coverage for specific regions only, use the ``features`` mode instead. - The standard output of ``scCountReads`` is a ".loom" file with counts, along with rowName (features) and colNames (cell barcodes). - - A detailed sub-commands help is available by typing: - - scCountReads bins -h - - scCountReads features -h - - """, - epilog='example usages:\n' - 'scCountReads bins --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results \n\n' - 'scCountReads features --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt \n' - '-o results' - ' \n\n', - conflict_handler='resolve') - - subparsers = parser.add_subparsers( - title="commands", - dest='command', - description='subcommands', - help='subcommands', - metavar='') - - read_args = ParserCommon.readOptions(suppress_args=['filterRNAstrand']) - filter_args = ParserCommon.filterOptions() - other_args = ParserCommon.otherOptions() - - # bins mode options - subparsers.add_parser( - 'bins', - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[ParserCommon.inputOutputOptions(opts=['bamfiles', 'barcodes', 'outFilePrefix', 'BED'], - requiredOpts=['barcodes','outFilePrefix'], - suppress_args=['BED']), - parserCommon.gtf_options(suppress=True), - ParserCommon.bamOptions(default_opts={'binSize':10000, - 'distanceBetweenBins':0}), - read_args, filter_args, - get_args(), other_args - ], - help="The reads are counted in bins of equal size. 
The bin size and distance between bins can be adjusted.", - add_help=False, - usage='%(prog)s -bs 100000 --bamfiles file1.bam file2.bam -o results \n') - - # BED file arguments - subparsers.add_parser( - 'features', - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[ParserCommon.inputOutputOptions(opts=['bamfiles', 'barcodes', 'outFilePrefix', 'BED'], - requiredOpts=['barcodes','outFilePrefix', 'BED']), - parserCommon.gtf_options(), - ParserCommon.bamOptions(suppress_args=['binSize', 'distanceBetweenBins'], - default_opts={'binSize':10000, - 'distanceBetweenBins':0}), - read_args, filter_args, - get_args(), other_args - ], - help="The user provides a BED/GTF file containing all regions " - "that should be counted. A common use would be to count scRNA-seq reads on Genes.", - usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results\n', - add_help=False) - - return parser - - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Misc arguments') - optional.add_argument('--genomeChunkSize', - type=int, - default=None, - help='Manually specify the size of the genome provided to each processor. ' - 'The default value of None specifies that this is determined by read ' - 'density of the BAM file.') - - optional.add_argument('--outFileFormat', - type=str, - default='loom', - choices=['loom', 'mtx'], - help='Output file format. Default is to write an anndata object of name ' - '.loom, which can either be opened in scanpy, or by downstream tools. ' - '"mtx" refers to the MatrixMarket sparse-matrix format. The output in this case would be ' - '.counts.mtx, along with .rownames.txt and .colnames.txt') - - return parser - - -def main(args=None): - """ - 1. get read counts at different positions either - all of same length or from genomic regions from the BED file - - 2. 
save data for further plotting - - """ - args, newlabels = ParserCommon.validateInputs(parseArguments().parse_args(args)) - - if 'BED' in args: - bed_regions = args.BED - else: - bed_regions = None - - ## create row/colNames - if args.outFileFormat == "mtx": - mtxFile = args.outFilePrefix + ".counts.mtx" - rowNamesFile = args.outFilePrefix + ".rownames.txt" - colNamesFile = args.outFilePrefix + ".colnames.txt" - else: - rowNamesFile=None - - stepSize = args.binSize + args.distanceBetweenBins - c = countR.CountReadsPerBin( - args.bamfiles, - binLength=args.binSize, - stepSize=stepSize, - barcodes=args.barcodes, - cellTag=args.cellTag, - groupTag=args.groupTag, - groupLabels=newlabels, - motifFilter=args.motifFilter, - genome2bit=args.genome2bit, - GCcontentFilter=args.GCcontentFilter, - numberOfSamples=None, - genomeChunkSize=args.genomeChunkSize, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region, - bedFile=bed_regions, - blackListFileName=args.blackListFileName, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - duplicateFilter=args.duplicateFilter, - center_read=args.centerReads, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - zerosToNans=False, - out_file_for_raw_data=rowNamesFile) - - num_reads_per_bin, regionList = c.run(allArgs=args) - - sys.stderr.write("Number of bins " - "found: {}\n".format(num_reads_per_bin.shape[0])) - - if num_reads_per_bin.shape[0] < 1: - exit("ERROR: too few non zero bins found.\n" - "If using --region please check that this " - "region is covered by reads.\n") - - ## write mtx/rownames if asked - if args.outFileFormat == 'mtx': - f = open(colNamesFile, "w") - f.write("\n".join(newlabels)) - f.write("\n") - f.close() - ## write the matrix as .mtx - sp = sparse.csr_matrix(num_reads_per_bin) - io.mmwrite(mtxFile, sp, field="integer") - else: - # write anndata - adata = ad.AnnData(num_reads_per_bin.T) - adata.obs = pd.DataFrame({"sample":[x.split('::')[-2] for x in newlabels], - "barcodes":[x.split('::')[-1] for x in newlabels] - },index=newlabels) - - rows=list(regionList) - - adata.var = pd.DataFrame({"chrom":[x.split('_')[0] for x in rows], - "start":[x.split('_')[1] for x in rows], - "end":[x.split('_')[2] for x in rows] - }, index=rows) - - # export as loom - adata.write_loom(args.outFilePrefix+".loom") - -if __name__ == "__main__": - main() diff --git a/bin/scFilterBarcodes b/bin/scFilterBarcodes deleted file mode 100755 index 67d2537..0000000 --- a/bin/scFilterBarcodes +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -import argparse -import sys -import os - -from deeptools import parserCommon, bamHandler, utilities -from deeptools.mapReduce import mapReduce -from deeptoolsintervals import GTF -import numpy as np -import pandas as pd -from collections import Counter -from functools import reduce -import matplotlib.pyplot as plt -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -## own functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -from Utilities import * -import ParserCommon - -def parseArguments(): - io_args = ParserCommon.inputOutputOptions(opts=['bamfile', 'whitelist', 'outfile'], - requiredOpts=['bamfile']) - bam_args = ParserCommon.bamOptions(suppress_args=['labels', 'smartLabels', 'distanceBetweenBins', 'region'], - default_opts={'binSize': 100000}) - other_args = 
ParserCommon.otherOptions() - - parser = argparse.ArgumentParser( - parents=[io_args, get_args(), bam_args, other_args], - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" - This tool identifies barcodes present in a BAM files and produces a list. You can optionally filter these - barcodes by matching them to a whitelist or based on total counts. - """, - usage='Example usage: scFilterBarcodes -b sample.bam -w whitelist.txt > barcodes_detected.txt', - add_help=False) - - return parser - -def get_args(): - parser = argparse.ArgumentParser(add_help=False) - general = parser.add_argument_group('Counting Options') - - general.add_argument('--minHammingDist', '-d', - help='Minimum hamming distance to match the barcode in whitelist. Note that increasing the ' - 'hamming distance really slows down the barcode detection process.', - metavar="INT", - type=int, - default=0, - required=False) - - general.add_argument('--minCount', '-mc', - help='Minimum no. of bins with non-zero counts, in order to report a barcode. Note that this number would range ' - 'from 0 to genome size/binSize. ', - metavar="INT", - type=int, - default=0, - required=False) - - general.add_argument('--minMappingQuality', '-mq', - metavar='INT', - help='If set, only reads that have a mapping ' - 'quality score of at least this are ' - 'considered.', - type=int) - - general.add_argument('--rankPlot', '-rp', - type=parserCommon.writableFile, - help='The output file name to plot the ranked counts per barcode (similar to the \"knee plot\",' - 'but counts in this case would be the number of non-zero bins)') - - return parser - -## get hamming dist between 2 sequences -def ham_dist(s1, s2): - if len(s1) != len(s2): - raise ValueError("Undefined") - return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2)) - -def getFiltered_worker(arglist): - chrom, start, end, args = arglist - # Fix the bounds - if end <= start: - end = start + 1 - ## open blacklist file - blackList = None - if args.blackListFileName is not None: - blackList = GTF(args.blackListFileName) - - - fh = bamHandler.openBam(args.bamfile) - chromUse = utilities.mungeChromosome(chrom, fh.references) - - BCset = set() - for read in fh.fetch(chromUse, start, end): - if read.pos < start: - # ensure that we never double count (in case distanceBetweenBins == 0) - continue - - if read.flag & 4: - # Ignore unmapped reads, they were counted already - continue - - if args.minMappingQuality and read.mapq < args.minMappingQuality: - continue - - ## reads in blacklisted regions - if blackList and blackList.findOverlaps(chrom, read.reference_start, read.reference_start + read.infer_query_length(always=False) - 1): - continue - - ## get barcode and append to set - try: - bc = read.get_tag(args.cellTag) - except KeyError: - continue - if args.whitelist: - # match barcode to whitelist - if args.minHammingDist == 0: - if bc in args.whitelist: - BCset.add(bc) - else: - try: - hamdist = [ham_dist(x, bc) for x in args.whitelist] - if min(hamdist) <= args.minHammingDist: - BCset.add(bc) - except ValueError: - continue - else: - BCset.add(bc) - fh.close() - - # return the set with barcodes detected - return BCset - - -def main(args=None): - args = parseArguments().parse_args(args) - - # open barcode file and read the content in a list - if args.whitelist: - with open(args.whitelist, 'r') as f: - barcodes = f.read().splitlines() - args.whitelist = barcodes - - bhs = bamHandler.openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)[0] - chrom_sizes = 
list(zip(bhs.references, bhs.lengths)) - bhs.close() - - # Get the remaining metrics - res = mapReduce([args], - getFiltered_worker, - chrom_sizes, - genomeChunkLength=args.binSize, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) - ## res, should be a list of sets - #final_set = list(set().union(*res)) - df=pd.DataFrame(Counter(reduce(lambda x,y: list(x)+list(y),res)).items(), - columns=['barcode','count']) - - if args.minCount: - if args.minCount > len(res): - print("minCount bigger than No. of bins. Reducing to maximum") - args.minCount = len(res) - final_set=df.loc[df['count']>=args.minCount]['barcode'].to_list() - else: - final_set=df['barcode'].tolist() - - # convert count to log10 - df['count']=np.log10(df['count']) - if args.rankPlot: - df['count_rank']=df['count'].rank(method='min', ascending=False) - fig, ax = plt.subplots() - plt.plot('count_rank', 'count', data=df, linestyle='none', marker='o', color='grey', markersize=6) - ax.set_xlabel('Barcode Rank', fontsize=15) - ax.set_ylabel('No. of nonzero bins (log10)',fontsize=15) - ax.set_title('Ranked counts (#bins) for detected Barcodes', fontsize=15) - plt.xticks(np.arange(0, max(df['count']), int(max(df['count_rank'])/10)), fontsize=10) - plt.yticks(np.arange(0, 4, 0.1), fontsize=10) - - - # Annotation - if args.minCount: - plt.axhline(args.minCount, color='r') - - fig.tight_layout() - plt.savefig(args.rankPlot, dpi='figure', - format=None, metadata=None, - pad_inches=0.2, - facecolor='auto', edgecolor='auto', - backend=None) - - if args.outFile is None: - of=sys.stdout - else: - of = open(args.outFile, "w") - - for x in final_set: - of.write(str(x)+'\n') - df.to_csv("~/programs/sincei/test_df.csv") - return 0 - -if __name__ == "__main__": - main() diff --git a/bin/scJSD b/bin/scJSD deleted file mode 100755 index 0024f98..0000000 --- a/bin/scJSD +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import os -import time -import sys -import argparse -import numpy as np -import pandas as pd -from deeptools.plotFingerprint import getSyntheticJSD -from deeptools import parserCommon - -from matplotlib.pyplot import plot -import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -## own functions -import ReadCounter as countR -import ParserCommon - -## plot KDE of JSD values -from numpy import array, linspace -from sklearn import neighbors - -old_settings = np.seterr(all='ignore') -MAXLEN = 10000000 - - -def get_args(): - parser = argparse.ArgumentParser(add_help=False, - conflict_handler='resolve') - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument('--numberOfSamples', '-n', - help='The number of bins that are sampled from the genome, ' - 'for which the overlapping number of reads is computed. (Default: %(default)s)', - default=1e5, - type=int) - - optional.add_argument('--skipZeros', - help='If set, then regions with zero overlapping reads' - 'for *all* given BAM files are ignored. 
This ' - 'will result in a reduced number of read ' - 'counts than that specified in --numberOfSamples', - action='store_true') - - - return parser - -def parse_arguments(args=None): - io_args = ParserCommon.inputOutputOptions(opts=['bamfiles', 'barcodes', 'outFile'], - requiredOpts=['bamfiles', 'barcodes', 'outFile']) - bam_args = ParserCommon.bamOptions(suppress_args=['region', 'distanceBetweenBins'], - default_opts={'binSize': 10000}) - read_args = ParserCommon.readOptions(suppress_args=['filterRNAstrand', 'extendReads', 'centerReads']) - filter_args = ParserCommon.filterOptions(suppress_args=['motifFilter', 'genome2bit', 'GCcontentFilter', 'minAlignedFraction']) - other_args = ParserCommon.otherOptions() - - parser = argparse.ArgumentParser( - parents=[io_args, bam_args, read_args, - filter_args, get_args(), other_args], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool samples regions in the genome from BAM files ' - 'and compares the cumulative read coverages for each cell on those regions. ' - 'to a synthetic cell with poisson distributed reads using Jansson Shannon Distance. ' - 'Cells with high enrichment of signals show a higher JSD compared to cells whose signal ' - 'is homogenously distrubuted.', - conflict_handler='resolve', - usage='An example usage is: plotFingerprint -b treatment.bam control.bam ' - '-plot fingerprint.png', - add_help=False) - - return parser - - -def main(args=None): - args = ParserCommon.process_args(parse_arguments().parse_args(args)) - - ## read the barcode file - with open(args.barcodes, 'r') as f: - barcodes = f.read().splitlines() - f.close() - - ## Count - c = countR.CountReadsPerBin( - args.bamfiles, - binLength=args.binSize, - numberOfSamples=args.numberOfSamples, - barcodes=barcodes, - cellTag=args.cellTag, - motifFilter=None, - genome2bit=None, - GCcontentFilter=None, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=None, - bedFile=None, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - duplicateFilter=args.duplicateFilter, - center_read=False, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - zerosToNans=False, - sumCoveragePerBin=True) - - num_reads_per_bin, _ = c.run(allArgs=None) - - if num_reads_per_bin.sum() == 0: - import sys - sys.stderr.write( - "\nNo reads were found in {} regions sampled. 
Check that the\n" - "min mapping quality is not overly high and that the \n" - "chromosome names between bam files are consistant.\n" - "For small genomes, decrease the --numberOfSamples.\n" - "\n".format(num_reads_per_bin.shape[0])) - exit(1) - - if args.skipZeros: - num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin) - - total = len(num_reads_per_bin[:, 0]) - x = np.arange(total).astype('float') / total # normalize from 0 to 1 - - jsd_all = [] - for i in range(0, num_reads_per_bin.shape[1]): - jsd_all.append(getSyntheticJSD(num_reads_per_bin[:, i])) - - ## create colnames (sampleLabel+barcode) - newlabels = ["{}_{}".format(a, b) for a in args.labels for b in barcodes ] - df = pd.DataFrame({'cell': newlabels, 'jsd': jsd_all}) - df.to_csv(args.outFile, sep = "\t") - - - -if __name__ == "__main__": - main() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..b00ee39 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- + +# Configuration file for the Sphinx documentation builder. +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import re +import sys + +# sys.path.insert(0, os.path.abspath('../')) + +VPATH = "../sincei/_version.py" + + +def get_version(path=VPATH): + try: + f = open(path) + except EnvironmentError: + return None + for line in f.readlines(): + mo = re.match('__version__ = "([^//"]+)"', line) + if mo: + ver = mo.group(1) + return ver + return None + + +# -- Project information ----------------------------------------------------- + +project = "sincei" +copyright = "2023, Vivek Bhardwaj" +author = "Vivek Bhardwaj" + +# The full version, including alpha/beta/rc tags +release = get_version() + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.autosummary", + "sphinxarg.ext", +] +# 'numpydoc' + +# This is needed to suppress autosummary reordering +numpydoc_show_class_members = False + +# Add any paths that contain templates here, relative to this directory. 
+# templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +on_rtd = os.environ.get("READTHEDOCS", None) == "True" + +# if not on_rtd: # only import and set the theme if we're building docs locally +# import sphinx_rtd_theme +# html_theme = 'sphinx_rtd_theme' +# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] diff --git a/sincei.png b/docs/content/images/sincei.png similarity index 100% rename from sincei.png rename to docs/content/images/sincei.png diff --git a/docs/content/list_of_tools.rst b/docs/content/list_of_tools.rst new file mode 100644 index 0000000..f3badfb --- /dev/null +++ b/docs/content/list_of_tools.rst @@ -0,0 +1,44 @@ +List of tools +================= + +The following tools use **BAM files** as input. These BAM files could come from any single-cell genomics protocol, as long as they have a **tag** that specifies the cell barcodes. + +.. toctree:: + :maxdepth: 1 + + tools/scFilterBarcodes + tools/scFilterStats + tools/scCountReads + tools/scBulkCoverage + tools/scJSD + +The following tools use the **scloom** output produced within the sincei analysis workflow. The format of **scloom** file is same as **.loom**, but it contains extra metadata that's needed for sincei tools. + +.. toctree:: + :maxdepth: 1 + + tools/scCountQC + tools/scCombineCounts + tools/scClusterCells + tools/scPlotRegion + +.. contents:: + :local: + ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +| tool | type | input files | main output file(s) | application | ++======================================+==================+=======================+=============================================+=============================================================================================================+ +|:ref:`scFilterBarcodes` | preprocessing | BAM/SAM files | text file with filtered cell barcodes | Identify and filter cell barcodes (for droplet-based single-cell seq) | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scFilterStats` | QC | BAM/SAM files | text file with QC per cell | Produce per-cell statistics after filtering reads by user-defined criteria. 
| ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scCountReads` | preprocessing | BAM/SAM files | scloom object with cellxregion counts | Counts reads for each barcode on genomic bins or user-defined features. | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scCountQC` | QC | scloom object | QC metrics / filtered scloom file | Perform quality control and filter the output of scCountReads. | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scCombineCounts` | preprocessing | scloom objects | merged scloom object | Concatenate/merge the counts from different samples/batches or modalities | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scClusterCells` | analysis | scloom object | .tsv file with clusters, png with UMAP | Perform dimensionality reduction and clustering on the output of scCountReads. | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ +|:ref:`scBulkCoverage` | analysis | tsv file + BAM file | bigwig files | Get pseudo-bulk coverage per group using a user-supplied cell->group mapping (output of scClusterCells). | ++--------------------------------------+------------------+-----------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+ diff --git a/docs/content/modules/modules.rst b/docs/content/modules/modules.rst new file mode 100644 index 0000000..c5cab60 --- /dev/null +++ b/docs/content/modules/modules.rst @@ -0,0 +1,9 @@ +Python API +=========== + +Following modules are available for users who would like to use sincei directly in python. + +.. toctree:: + :maxdepth: 2 + + sincei diff --git a/docs/content/modules/sincei.rst b/docs/content/modules/sincei.rst new file mode 100644 index 0000000..ebcd49f --- /dev/null +++ b/docs/content/modules/sincei.rst @@ -0,0 +1,182 @@ +sincei package +============== + + +Submodules +---------- + +sincei.ExponentialFamily module +------------------------------- + +.. automodule:: sincei.ExponentialFamily + :members: + :undoc-members: + :show-inheritance: + +sincei.FragmentFFT module +------------------------- + +.. automodule:: sincei.FragmentFFT + :members: + :undoc-members: + :show-inheritance: + +sincei.GLMPCA module +-------------------- + +.. automodule:: sincei.GLMPCA + :members: + :undoc-members: + :show-inheritance: + +sincei.GetStats module +---------------------- + +.. automodule:: sincei.GetStats + :members: + :undoc-members: + :show-inheritance: + +sincei.ParserCommon module +-------------------------- + +.. 
automodule:: sincei.ParserCommon + :members: + :undoc-members: + :show-inheritance: + +sincei.ReadCounter module +------------------------- + +.. automodule:: sincei.ReadCounter + :members: + :undoc-members: + :show-inheritance: + +sincei.RegionQuery module +------------------------- + +.. automodule:: sincei.RegionQuery + :members: + :undoc-members: + :show-inheritance: + +sincei.TopicModels module +------------------------- + +.. automodule:: sincei.TopicModels + :members: + :undoc-members: + :show-inheritance: + +sincei.Utilities module +----------------------- + +.. automodule:: sincei.Utilities + :members: + :undoc-members: + :show-inheritance: + +sincei.WriteBedGraph module +--------------------------- + +.. automodule:: sincei.WriteBedGraph + :members: + :undoc-members: + :show-inheritance: + +sincei.multimodalClustering module +---------------------------------- + +.. automodule:: sincei.multimodalClustering + :members: + :undoc-members: + :show-inheritance: + +sincei.scBAMops module +---------------------- + +.. automodule:: sincei.scBAMops + :members: + :undoc-members: + :show-inheritance: + +sincei.scBulkCoverage module +---------------------------- + +.. automodule:: sincei.scBulkCoverage + :members: + :undoc-members: + :show-inheritance: + +sincei.scClusterCells module +---------------------------- + +.. automodule:: sincei.scClusterCells + :members: + :undoc-members: + :show-inheritance: + +sincei.scCombineCounts module +----------------------------- + +.. automodule:: sincei.scCombineCounts + :members: + :undoc-members: + :show-inheritance: + +sincei.scCountQC module +----------------------- + +.. automodule:: sincei.scCountQC + :members: + :undoc-members: + :show-inheritance: + +sincei.scCountReads module +-------------------------- + +.. automodule:: sincei.scCountReads + :members: + :undoc-members: + :show-inheritance: + +sincei.scFilterBarcodes module +------------------------------ + +.. automodule:: sincei.scFilterBarcodes + :members: + :undoc-members: + :show-inheritance: + +sincei.scFilterStats module +--------------------------- + +.. automodule:: sincei.scFilterStats + :members: + :undoc-members: + :show-inheritance: + +sincei.scJSD module +------------------- + +.. automodule:: sincei.scJSD + :members: + :undoc-members: + :show-inheritance: + +sincei.sincei module +-------------------- + +.. automodule:: sincei.sincei + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: sincei + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/content/news.rst b/docs/content/news.rst new file mode 100644 index 0000000..2165264 --- /dev/null +++ b/docs/content/news.rst @@ -0,0 +1,2 @@ +News and Updates +================= diff --git a/docs/content/tools/scBulkCoverage.rst b/docs/content/tools/scBulkCoverage.rst new file mode 100644 index 0000000..437688a --- /dev/null +++ b/docs/content/tools/scBulkCoverage.rst @@ -0,0 +1,9 @@ +.. _scBulkCoverage: + +scBulkCoverage +==================== + +.. argparse:: + :ref: sincei.scBulkCoverage.parseArguments + :prog: scBulkCoverage + :nodefault: diff --git a/docs/content/tools/scClusterCells.rst b/docs/content/tools/scClusterCells.rst new file mode 100644 index 0000000..27c1d46 --- /dev/null +++ b/docs/content/tools/scClusterCells.rst @@ -0,0 +1,9 @@ +.. _scClusterCells: + +scClusterCells +==================== + +.. 
argparse:: + :ref: sincei.scClusterCells.parseArguments + :prog: scClusterCells + :nodefault: diff --git a/docs/content/tools/scCombineCounts.rst b/docs/content/tools/scCombineCounts.rst new file mode 100644 index 0000000..2d467d5 --- /dev/null +++ b/docs/content/tools/scCombineCounts.rst @@ -0,0 +1,9 @@ +.. _scCombineCounts: + +scCombineCounts +==================== + +.. argparse:: + :ref: sincei.scCombineCounts.parseArguments + :prog: scCombineCounts + :nodefault: diff --git a/docs/content/tools/scCountQC.rst b/docs/content/tools/scCountQC.rst new file mode 100644 index 0000000..456a2c7 --- /dev/null +++ b/docs/content/tools/scCountQC.rst @@ -0,0 +1,9 @@ +.. _scCountQC: + +scCountQC +==================== + +.. argparse:: + :ref: sincei.scCountQC.parseArguments + :prog: scCountQC + :nodefault: diff --git a/docs/content/tools/scCountReads.rst b/docs/content/tools/scCountReads.rst new file mode 100644 index 0000000..9bad6a9 --- /dev/null +++ b/docs/content/tools/scCountReads.rst @@ -0,0 +1,9 @@ +.. _scCountReads: + +scCountReads +==================== + +.. argparse:: + :ref: sincei.scCountReads.parseArguments + :prog: scCountReads + :nodefault: diff --git a/docs/content/tools/scFilterBarcodes.rst b/docs/content/tools/scFilterBarcodes.rst new file mode 100644 index 0000000..aaf0364 --- /dev/null +++ b/docs/content/tools/scFilterBarcodes.rst @@ -0,0 +1,9 @@ +.. _scFilterBarcodes: + +scFilterBarcodes +==================== + +.. argparse:: + :ref: sincei.scFilterBarcodes.parseArguments + :prog: scFilterBarcodes + :nodefault: diff --git a/docs/content/tools/scFilterStats.rst b/docs/content/tools/scFilterStats.rst new file mode 100644 index 0000000..5bfddfb --- /dev/null +++ b/docs/content/tools/scFilterStats.rst @@ -0,0 +1,9 @@ +.. _scFilterStats: + +scFilterStats +==================== + +.. argparse:: + :ref: sincei.scFilterStats.parseArguments + :prog: scFilterStats + :nodefault: diff --git a/docs/content/tools/scJSD.rst b/docs/content/tools/scJSD.rst new file mode 100644 index 0000000..b0c6fad --- /dev/null +++ b/docs/content/tools/scJSD.rst @@ -0,0 +1,9 @@ +.. _scJSD: + +scJSD +==================== + +.. argparse:: + :ref: sincei.scJSD.parseArguments + :prog: scJSD + :nodefault: diff --git a/docs/content/tools/scPlotRegion.rst b/docs/content/tools/scPlotRegion.rst new file mode 100644 index 0000000..977a37d --- /dev/null +++ b/docs/content/tools/scPlotRegion.rst @@ -0,0 +1,4 @@ +.. _scPlotRegion: + +scPlotRegion +==================== diff --git a/docs/content/tutorials.rst b/docs/content/tutorials.rst new file mode 100644 index 0000000..d1a1e85 --- /dev/null +++ b/docs/content/tutorials.rst @@ -0,0 +1,2 @@ +Tutorials +=========== diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..055a80d --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,67 @@ + +sincei +======= + +.. image:: ./content/images/sincei.png + + +.. image:: https://zenodo.org/badge/271841139.svg + :target: https://zenodo.org/badge/latestdoi/271841139 + :alt: DOI + +Mourragui S. and Bhardwaj V. (2023) sincei: A user-friendly toolkit for QC, counting, clustering and plotting of single-cell (epi)genomics data. + + +Installation +------------------- +sincei is a command line toolkit based on python3, and can be installed using `conda `_ + +Create a new conda environment and install sincei using: + +.. 
code-block:: bash + + cd + conda create -n sincei -c bioconda -c conda-forge scanpy deeptools + conda activate sincei + (sincei): pip install --editable=git+https://github.com/vivekbhr/sincei.git@master#egg=sincei + + +Getting Help +------------ + +* For all kind of questions, suggesting changes/enhancements and to report bugs, please create an issue on `our GitHub repository `_ + +**Please Note that sincei is under active development.** Some features might be incomplete, untested or might be removed as we move towards a stable version. + + + +The list of tools available in sincei +--------------------------------------- + +Tools for a typical single-cell analysis workflow (WIP: work in progress/not available yet) + +========================== ============================================================================================================ +tool description +========================== ============================================================================================================ +:ref:`scFilterBarcodes` Identify and filter cell barcodes from BAM file (for droplet-based single-cell seq) +:ref:`scFilterStats` Produce per-cell statistics after filtering reads by user-defined criteria. +:ref:`scCountReads` Counts reads for each barcode on genomic bins or user-defined features. +:ref:`scCountQC` Perform quality control and filter the output of scCountReads. +:ref:`scCombineCounts` Concatenate/merge the counts from different samples/batches or modalities +:ref:`scClusterCells` Perform dimensionality reduction and clustering on the output of scCountReads. +:ref:`scBulkCoverage` Get pseudo-bulk coverage per group using a user-supplied cell->group mapping (output of scClusterCells). + scFindMarkers [WIP] Find marker genes per group, given the output of scCountReads and a user-defined group. + scFeaturePlot [WIP] Plot the counts for a given feature on a UMAP or on a (IGV-style) genomic-track. +========================== ============================================================================================================ + + +Contents: +--------- + +.. toctree:: + :maxdepth: 2 + + content/list_of_tools + content/tutorials + content/modules/modules + content/news diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..1acbce1 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx >=5.0 +sphinx-argparse >=0.2.5 +sphinx_rtd_theme +numpydoc diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fa9e083 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,125 @@ +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling"] + + +[project] +name = "sincei" +description = "A user-friendly toolkit for QC, counting, clustering and plotting of single-cell (epi)genomics data " +readme = "README.md" +requires-python = ">=3.8" +dynamic = ["version"] +license = {file = "LICENCE.txt"} +authors = [ + {name = "Vivek Bhardwaj"}, +] +maintainers = [ + {name = "Vivek Bhardwaj", email = "vivbhr@gmail.com"}, +] +classifiers = [ + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Programming Language :: Python :: 3", +] +urls.Documentation = "https://sincei.readthedocs.io/" +urls.Source = "https://github.com/vivekbhr/sincei" +urls.Home-page = "https://github.com/vivekbhr/sincei" +dependencies = [ + "anndata", + "scanpy", + "loompy", + "umap-learn", + "pandas", + "deeptools", + "leidenalg", + "networkx", + "community", + "python-igraph", + # for debug logging (referenced from the issue template) + "session-info", + "torch", + "mctorch-lib", + "gensim" + ] + +[project.scripts] +scBulkCoverage = "sincei.scBulkCoverage:main" +scClusterCells = "sincei.scClusterCells:main" +scCombineCounts = "sincei.scCombineCounts:main" +scCountQC = "sincei.scCountQC:main" +scCountReads = "sincei.scCountReads:main" +scFilterBarcodes = "sincei.scFilterBarcodes:main" +scFilterStats = "sincei.scFilterStats:main" +scJSD = "sincei.scJSD:main" +sincei = "sincei.sincei:main" + + +[project.optional-dependencies] +dev = [ + # CLI for bumping the version number + "bump2version", + "pre-commit", + "twine>=4.0.2" +] +doc = [ + "sphinx>=4", + "sphinx-book-theme>=0.3.3", + "myst-nb", + "sphinxcontrib-bibtex>=1.0.0", + "sphinx-autodoc-typehints", + "readthedocs-sphinx-ext", + "sphinx-argparse", + # For notebooks + # "ipykernel", + # "ipython", + # "sphinx-copybutton", +] +test = [ + "pytest" +] + +[tool.hatch.build.targets.sdist] +include = [ + "/sincei", +] + +[tool.hatch.build.targets.wheel] +packages = ['sincei'] + +[tool.hatch.version] +path = "sincei/_version.py" + +[tool.pytest.ini_options] +testpaths = ["sincei/test"] +xfail_strict = true +addopts = [ + "--import-mode=importlib", # allow using test files with same name +] + +[tool.isort] +include_trailing_comma = true +multi_line_output = 3 +profile = "black" +skip_glob = ["docs/*"] + +[tool.black] +line-length = 120 +target-version = ['py38'] +include = '\.pyi?$' +exclude = ''' +( + /( + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + )/ +) +''' + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8cbf05c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +scanpy +loompy +umap-learn +pandas +deeptools +gensim +leidenalg +networkx +community +python-igraph diff --git a/requirements.yaml b/requirements.yml similarity index 51% rename 
from requirements.yaml rename to requirements.yml index c081e51..9797a71 100644 --- a/requirements.yaml +++ b/requirements.yml @@ -1,11 +1,9 @@ name: sincei channels: - - bioconda - conda-forge - - anaconda + - bioconda - defaults dependencies: + - deeptools - scanpy - pip - - pip: - - "git+https://github.com/vivekbhr/sincei.git@master#egg=sincei" diff --git a/setup.py b/setup.py deleted file mode 100755 index 8855cae..0000000 --- a/setup.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- - -import re -# read the contents of your README file -from pathlib import Path -from setuptools import setup, find_packages -from setuptools.command.sdist import sdist as _sdist -from setuptools.command.install import install as _install - -VERSION_PY = """ -# This file is originally generated from Git information by running 'setup.py -# version'. Distribution tarballs contain a pre-generated copy of this file. -__version__ = '%s' -""" - - -def get_version(): - try: - f = open("sincei/_version.py") - except EnvironmentError: - return None - for line in f.readlines(): - mo = re.match("__version__ = '([^']+)'", line) - if mo: - ver = mo.group(1) - return ver - return None - - -class sdist(_sdist): - - def run(self): - self.distribution.metadata.version = get_version() - return _sdist.run(self) - - -class install(_install): - - def run(self): - self.distribution.metadata.version = get_version() - _install.run(self) - return - - -this_directory = Path(__file__).parent -long_description = (this_directory / "README.md").read_text() - - -setup( - name='sincei', - version=get_version(), - author=' Vivek Bhardwaj', - author_email='vivbhr@gmail.com', - packages=find_packages(), - scripts=['bin/scBulkCoverage', 'bin/scClusterCells', 'bin/scCombineCounts', 'bin/scCountQC', - 'bin/scCountReads', 'bin/scFilterBarcodes', 'bin/scFilterStats', 'bin/scJSD', 'bin/sincei', - ], - include_package_data=True, - url='https://github.com/vivekbhr/sincei', - license='LICENSE.txt', - description='A user-friendly toolkit for QC, counting, clustering and plotting of single-cell (epi)genomics data ', - long_description=long_description, - long_description_content_type='text/markdown', - classifiers=[ - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics'], - install_requires=[ - "scanpy >= 1.7.2", - "loompy >= 3.0.6", - "umap-learn==0.5.1", - "pandas >= 1.4", - "deeptools", - "gensim", - "leidenalg", - "networkx", - "community", - "python-igraph" - ], - zip_safe=True, - cmdclass={'sdist': sdist, 'install': install} -) diff --git a/sincei/Clustering.py b/sincei/Clustering.py deleted file mode 100644 index f42845e..0000000 --- a/sincei/Clustering.py +++ /dev/null @@ -1,78 +0,0 @@ -# topic models -import numpy as np -import pandas as pd -import scanpy as sc -from gensim import corpora, matutils, models - -# Louvain clustering and UMAP -from networkx import convert_matrix -from sklearn.metrics import pairwise_distances -import leidenalg as la -import community -import umap -from scanpy._utils import get_igraph_from_adjacency -from scanpy.neighbors import _compute_connectivities_umap, _get_indices_distances_from_dense_matrix - -### ------ Functions ------ - - -def LSA_gensim(mat, cells, regions, nTopics, smartCode='lfu'): - r""" - Computes LSA for a given matrix and returns the cell-topic matrix - - Parameters - ---------- - mat : numpy array - Matrix of shape (cells, regions) - - cells : list - List of cells - - regions : list - List of regions - - nTopics : int - Number of topics - - 
smartCode : str - Smart code for tfidf - - Returns - ------- - corpus_lsi : gensim corpus - LSA corpus - - cell_topic : pandas dataframe - Cell-topic matrix - - corpus_tfidf : gensim corpus - TFIDF corpus - """ - # LSA - regions_dict = corpora.dictionary.Dictionary([regions]) - corpus = matutils.Sparse2Corpus(mat) - tfidf = models.TfidfModel(corpus, id2word=regions_dict, normalize=True, smartirs=smartCode) - corpus_tfidf = tfidf[corpus] - lsi_model = models.LsiModel(corpus_tfidf, id2word=regions_dict, num_topics=nTopics) - corpus_lsi = lsi_model[corpus_tfidf] - - # Compute Coherence Score - coherence_model_lsa = models.CoherenceModel(model=lsi_model, corpus=corpus, - dictionary=regions_dict, coherence='u_mass') - coherence_lsa = coherence_model_lsa.get_coherence() - print('\nCoherence Score: ', coherence_lsa) - - ## make cell-topic df - li = [[tup[0] for tup in x] for x in corpus_lsi] - li_val = [[tup[1] for tup in x] for x in corpus_lsi] - if len(set([len(x) for x in li_val])) > 1: # if all documents don't have same set of topics - bad_idx = sorted([i for i,v in enumerate(li_val) if len(v) != nTopics], reverse=True) - print("{} Cells were detected which don't contribute to all {} topics. Removing them!".format(len(bad_idx), nTopics)) - [li_val.pop(x) for x in bad_idx] - [li.pop(x) for x in bad_idx] - [cells.pop(x) for x in bad_idx] - li_val = np.stack(li_val) - cell_topic = pd.DataFrame(li_val, columns=li[0]) - cell_topic.index = cells - - return corpus_lsi, cell_topic, corpus_tfidf diff --git a/sincei/ExponentialFamily.py b/sincei/ExponentialFamily.py new file mode 100644 index 0000000..c4a9cf1 --- /dev/null +++ b/sincei/ExponentialFamily.py @@ -0,0 +1,320 @@ +import torch +from copy import deepcopy +import numpy as np +import scipy +from tqdm import tqdm +from joblib import Parallel, delayed + +saturation_eps = 10**-10 + + +class ExponentialFamily: + def __init__(self, family_params=None, **kwargs): + self.family_name = "base" + self.family_params = family_params if family_params else {} + + def sufficient_statistics(self, X: torch.Tensor): + return X + + def natural_parametrization(self, theta: torch.Tensor): + return theta + + def log_partition(self, theta: torch.Tensor): + return 0.0 + + def base_measure(self, X: torch.Tensor): + return torch.ones(size=X.shape) + + def invert_g(self, X: torch.Tensor): + return X + + def exponential_term(self, X: torch.Tensor, theta: torch.Tensor): + return torch.multiply(self.sufficient_statistics(X), self.natural_parametrization(theta)) + + def distribution(self, X: torch.Tensor, theta: torch.Tensor): + f = self.base_measure(X) + expt = self.exponential_term(X, theta) - self.log_partition(theta) + + return torch.multiply(f, torch.exp(expt)) + + def log_distribution(self, X: torch.Tensor, theta: torch.Tensor): + f = self.base_measure(X) + expt = self.exponential_term(X, theta) - self.log_partition(theta) + + return expt - torch.log(f) + + def log_likelihood(self, X: torch.Tensor, theta: torch.Tensor): + """Computes negative log-likelihood between dataset X and parameters theta""" + expt = self.exponential_term(X, theta) - self.log_partition(theta) + return -torch.sum(expt) + + def load_family_params_to_gpu(self, device): + self.family_params = { + k: self.family_params[k].to(device) + if type(self.family_params[k]) is torch.Tensor + else self.family_params[k] + for k in self.family_params + } + + def initialize_family_parameters(self, X: torch.Tensor = None): + """General method to initialize certain parameters (e.g. 
for Beta or Negative Binomial""" + pass + + +class Gaussian(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "gaussian" + self.family_params = family_params if family_params else {} + + def sufficient_statistics(self, X: torch.Tensor): + return X + + def natural_parametrization(self, theta: torch.Tensor): + return theta + + def log_partition(self, theta: torch.Tensor): + return torch.square(theta) / 2.0 + + def base_measure(self, X: torch.Tensor): + return torch.exp(-torch.square(X) / 2.0) / np.sqrt(2.0 * torch.pi) + + def invert_g(self, X: torch.Tensor): + return X + + +class Bernoulli(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "bernoulli" + self.family_params = family_params if family_params else {"max_val": 30} + + def sufficient_statistics(self, X: torch.Tensor): + return X + + def natural_parametrization(self, theta: torch.Tensor): + return theta + + def log_partition(self, theta: torch.Tensor): + return torch.log(1.0 + torch.exp(theta)) + + def base_measure(self, X: torch.Tensor): + return 1.0 + + def invert_g(self, X: torch.Tensor): + return torch.log(X / (X - 1)).clip(-self.family_params["max_val"], self.family_params["max_val"]) + + def log_likelihood(self, X: torch.Tensor, theta: torch.Tensor): + """Computes negative log-likelihood between dataset X and parameters theta""" + expt = self.exponential_term(X, theta) - self.log_partition(theta) + return -torch.sum(expt) + + +class Poisson(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "poisson" + self.family_params = family_params if family_params else {"min_val": 1e-20} + + def sufficient_statistics(self, X: torch.Tensor): + return X + + def natural_parametrization(self, theta: torch.Tensor): + return theta + + def log_partition(self, theta: torch.Tensor): + return torch.exp(theta) + + def base_measure(self, X: torch.Tensor): + return 1 / scipy.special.gamma(X + 1) + + def invert_g(self, X: torch.Tensor): + return torch.log(X).clip(self.family_params["min_val"]) + + def log_distribution(self, X: torch.Tensor, theta: torch.Tensor): + """The computation of gamma function for the base measure (h) would lead to inf, hence a re-design + of the method.""" + log_f = torch.lgamma(X + 1) + expt = self.exponential_term(X, theta) - self.log_partition(theta) + + return expt - log_f + + +class Beta(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "beta" + if family_params is None or "nu" not in family_params: + print("Beta distribution not initialized yet") + default_family_params = {"min_val": 1e-5, "n_jobs": 1, "eps": 1e-4, "maxiter": 100} + self.family_params = family_params if family_params else default_family_params + for k in kwargs: + self.family_params[k] = kwargs[k] + + def sufficient_statistics(self, X: torch.Tensor): + return torch.stack([torch.log(X), torch.log(1 - X)]) + + def natural_parametrization(self, theta: torch.Tensor): + nat_params = torch.stack([theta * self.family_params["nu"], (1 - theta) * self.family_params["nu"]]) + if nat_params.shape[1] == 1: + nat_params = nat_params.flatten() + return nat_params + + def base_measure(self, X: torch.Tensor): + return torch.mul(X, 1 - X) + + def log_partition(self, theta: torch.Tensor): + numerator = torch.sum(torch.lgamma(self.natural_parametrization(theta)), axis=0) + numerator = torch.sum(numerator, axis=0) + denominator = torch.lgamma(self.family_params["nu"]) + return numerator - denominator + + 
def exponential_term(self, X: torch.Tensor, theta: torch.Tensor): + return torch.sum(torch.multiply(self.sufficient_statistics(X), self.natural_parametrization(theta)), axis=0) + + def _derivative_log_likelihood(self, X: torch.Tensor, theta: torch.Tensor): + return ( + torch.log(X / (1 - X)) + + torch.digamma((1 - theta) * self.family_params["nu"]) + - torch.digamma(theta * self.family_params["nu"]) + ) + + def initialize_family_parameters(self, X: torch.Tensor = None): + p = X.shape[1] + + self.family_params["nu"] = torch.Tensor( + Parallel(n_jobs=self.family_params["n_jobs"])( + delayed(scipy.stats.beta.fit)(X[:, idx], floc=0, fscale=1) for idx in tqdm(range(p)) + ) + ) + self.family_params["nu"] = torch.sum(self.family_params["nu"][:, :2], axis=1) + assert self.family_params["nu"].shape[0] == p + + return True + + def invert_g(self, X: torch.Tensor = None): + """Dichotomy to find where derivative maxes out""" + + # Initialize dichotomy parameters. + min_val = torch.zeros(X.shape) + max_val = torch.ones(X.shape) + theta = (min_val + max_val) / 2 + + llik = self._derivative_log_likelihood(X, theta) + for idx in tqdm(range(self.family_params["maxiter"])): + min_val[llik > 0] = theta[llik > 0] + max_val[llik < 0] = theta[llik < 0] + theta = (min_val + max_val) / 2 + llik = self._derivative_log_likelihood(X, theta) + + if torch.max(torch.abs(llik)) < self.family_params["eps"]: + print("CONVERGENCE AFTER %s ITERATIONS" % (idx)) + break + + if idx == self.family_params["maxiter"]: + print("CONVERGENCE NOT REACHED") + + return theta + + +class Gamma(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "gamma" + if family_params is None or "nu" not in family_params: + print("Gamma distribution not initialized yet") + default_family_params = {"min_val": 1e-5, "n_jobs": 1, "eps": 1e-4, "maxiter": 100, "maxval": 10e6} + self.family_params = family_params if family_params else default_family_params + for k in kwargs: + self.family_params[k] = kwargs[k] + + def sufficient_statistics(self, X: torch.Tensor): + return torch.stack([torch.log(X), X]) + + def natural_parametrization(self, theta: torch.Tensor): + nat_params = torch.stack([theta, -torch.ones(theta.shape) * self.family_params["nu"]]) + if nat_params.shape[1] == 1: + nat_params = nat_params.flatten() + return nat_params + + def log_partition(self, theta: torch.Tensor): + return torch.lgamma(theta + 1) - (theta + 1) * torch.log(self.family_params["nu"]) + + def exponential_term(self, X: torch.Tensor, theta: torch.Tensor): + return torch.sum(torch.multiply(self.sufficient_statistics(X), self.natural_parametrization(theta)), axis=0) + + def _digamma_implicit_function(self, X: torch.Tensor, theta: torch.Tensor): + return torch.digamma(theta + 1) - torch.log(X * self.family_params["nu"]) + + def invert_g(self, X: torch.Tensor = None): + """Dichotomy to compute inverse of digamma function.""" + + # Initialize dichotomy parameters. 
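+        # Bisection ("dichotomy") search: digamma is monotonically increasing, so the root of
+        # digamma(theta + 1) - log(X * nu) can be bracketed between min_val and max_val and the
+        # bracket halved each iteration, until the residual falls below `eps` or `maxiter` is hit.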
+ min_val = torch.zeros(X.shape) + max_val = torch.ones(X.shape) * self.family_params["maxval"] + theta = (min_val + max_val) / 2 + + llik = self._digamma_implicit_function(X, theta) + for idx in tqdm(range(self.family_params["maxiter"])): + max_val[llik > 0] = theta[llik > 0] + min_val[llik < 0] = theta[llik < 0] + theta = (min_val + max_val) / 2 + llik = self._digamma_implicit_function(X, theta) + + if torch.max(torch.abs(llik)) < self.family_params["eps"]: + print("CONVERGENCE AFTER %s ITERATIONS" % (idx)) + break + + if idx == self.family_params["maxiter"]: + print("CONVERGENCE NOT REACHED") + + return theta + + def initialize_family_parameters(self, X: torch.Tensor = None): + p = X.shape[1] + + self.family_params["nu"] = torch.Tensor( + Parallel(n_jobs=self.family_params["n_jobs"])( + delayed(scipy.stats.gamma.fit)(X[:, idx], floc=0) for idx in tqdm(range(p)) + ) + ) + self.family_params["nu"] = 1.0 / self.family_params["nu"][:, -1] + assert self.family_params["nu"].shape[0] == p + + return True + + +class LogNormal(ExponentialFamily): + def __init__(self, family_params=None, **kwargs): + self.family_name = "log_normal" + if family_params is None or "nu" not in family_params: + print("Log Normal distribution not initialized yet") + default_family_params = {"min_val": 1e-8, "n_jobs": 1, "eps": 1e-4, "maxiter": 100, "maxval": 10e6} + self.family_params = family_params if family_params else default_family_params + for k in kwargs: + self.family_params[k] = kwargs[k] + + def sufficient_statistics(self, X: torch.Tensor): + log_X = torch.log(X) + return torch.stack([log_X, torch.square(log_X)]) + + def natural_parametrization(self, theta: torch.Tensor): + nat_params = torch.stack( + [ + theta / torch.square(self.family_params["nu"]), + -torch.ones(theta.shape) / (2 * torch.square(self.family_params["nu"])), + ] + ) + if nat_params.shape[1] == 1: + nat_params = nat_params.flatten() + return nat_params + + def base_measure(self, X: torch.Tensor): + return 1 / (np.sqrt(2 * torch.pi) * X) + + def log_partition(self, theta: torch.Tensor): + return torch.square(theta) / (2 * torch.square(self.family_params["nu"])) + torch.log(self.family_params["nu"]) + + def exponential_term(self, X: torch.Tensor, theta: torch.Tensor): + return torch.sum(torch.multiply(self.sufficient_statistics(X), self.natural_parametrization(theta)), axis=0) + + def invert_g(self, X: torch.Tensor = None): + return torch.log(X.clip(self.family_params["min_val"])) + + def initialize_family_parameters(self, X: torch.Tensor = None): + self.family_params["nu"] = torch.sqrt(torch.var(torch.log(X), axis=0)) diff --git a/sincei/FragmentFFT.py b/sincei/FragmentFFT.py index d252b82..039bfdf 100644 --- a/sincei/FragmentFFT.py +++ b/sincei/FragmentFFT.py @@ -1,5 +1,6 @@ import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt import scipy as sp @@ -7,6 +8,7 @@ import numpy as np import pandas as pd + ## fast fouriour transform def ffttable(selected): r"""Computes the FFT of the fragment length distribution @@ -39,14 +41,15 @@ def ffttable(selected): 5 5.0 1.812500 """ selected = np.log2(selected + 1) - selected2 = [y-x for x, y in zip(selected, selected[1:])] + selected2 = [y - x for x, y in zip(selected, selected[1:])] fragment_fft = sp.fftpack.fft(selected2) fragment_psd = np.abs(fragment_fft) ** 2 fftfreq = sp.fftpack.fftfreq(len(fragment_psd), 10.0) - d = pd.DataFrame({'freq':fftfreq, 'value':fragment_psd}) - d2 = d[d.freq >0] + d = pd.DataFrame({"freq": fftfreq, "value": fragment_psd}) + d2 = 
d[d.freq > 0] return d2 + # get fragment size disribution and fft value from dict{barcode:fragment_size_list} def fragment_distribution(fragment_len_dict, length_plot): r"""Plot fragment length distribution @@ -73,37 +76,38 @@ def fragment_distribution(fragment_len_dict, length_plot): ... 'AAACCTGAGAGGTTCT': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]} >>> fragment_distribution(fragment_len_dict, test.length_plot) """ - plt.style.use('classic') - fig=plt.figure() - ax1=fig.add_subplot(1,1,1) + plt.style.use("classic") + fig = plt.figure() + ax1 = fig.add_subplot(1, 1, 1) - outdict=dict.fromkeys(fragment_len_dict.keys()) + outdict = dict.fromkeys(fragment_len_dict.keys()) for barcode in outdict.keys(): fragment_len = fragment_len_dict[barcode] - n, bins, patches = ax1.hist(fragment_len,bins=100,color='orange',range=(0,1000), alpha=0.3) - #calculating fft - dflist=[] + n, bins, patches = ax1.hist(fragment_len, bins=100, color="orange", range=(0, 1000), alpha=0.3) + # calculating fft + dflist = [] for num in range(80, 101, 1): dflist.append(ffttable(n[10:num])) - d2 = pd.concat(dflist,ignore_index=True) - d2 = d2.sort_values(by=['freq'],ascending=False) + d2 = pd.concat(dflist, ignore_index=True) + d2 = d2.sort_values(by=["freq"], ascending=False) outdict[barcode] = d2 # plot fragment sizes - ax1.xaxis.set_ticks_position('bottom') - ax1.yaxis.set_ticks_position('left') - plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) - plt.xlabel('Fragment length', fontsize=20) - plt.ylabel('Fragment count', fontsize=20) + ax1.xaxis.set_ticks_position("bottom") + ax1.yaxis.set_ticks_position("left") + plt.ticklabel_format(style="sci", axis="y", scilimits=(0, 0)) + plt.xlabel("Fragment length", fontsize=20) + plt.ylabel("Fragment count", fontsize=20) plt.xticks(fontsize=20) plt.yticks(fontsize=20) - ax1.set_title('Fragment distribution', fontsize=20) - plt.savefig(length_plot, dpi=300, bbox_inches='tight') + ax1.set_title("Fragment distribution", fontsize=20) + plt.savefig(length_plot, dpi=300, bbox_inches="tight") plt.close(fig) return outdict + ## plot the fragment periodicity per barcode using the output of def fftplot(outdict, plot): r"""Computes the periodicity of the fragment distribution @@ -129,40 +133,40 @@ def fftplot(outdict, plot): >>> fftplot(d, 'test.png') """ - plt.style.use('classic') + plt.style.use("classic") fig = plt.figure() - ax = fig.add_subplot(1,1,1) + ax = fig.add_subplot(1, 1, 1) periodicity = dict.fromkeys(outdict.keys()) for barcode in outdict.keys(): d2 = outdict[barcode] - max_y_pos = 1 / d2.loc[d2.value.idxmax(),'freq'] + max_y_pos = 1 / d2.loc[d2.value.idxmax(), "freq"] mean_value = np.mean(d2.value) ## color barcodes where max periodicity is not between 120 and 160, or 240 and 320 p1 = range(120, 160, 1) p2 = range(240, 320, 1) - mononuc_periodicity = np.mean(d2.loc[[int(x) in p1 for x in 1/d2.freq], 'value']) - dinuc_periodicity = np.mean(d2.loc[[int(x) in p2 for x in 1/d2.freq], 'value']) + mononuc_periodicity = np.mean(d2.loc[[int(x) in p1 for x in 1 / d2.freq], "value"]) + dinuc_periodicity = np.mean(d2.loc[[int(x) in p2 for x in 1 / d2.freq], "value"]) - #if int(mean_value) in p1: + # if int(mean_value) in p1: # col='red' - #elif int(mean_value) in p2: + # elif int(mean_value) in p2: # col='blue' - #else: - col='grey' - ax.plot(1/d2.freq, 10 * np.log10(d2.value + 1), color=col, alpha=0.2) + # else: + col = "grey" + ax.plot(1 / d2.freq, 10 * np.log10(d2.value + 1), color=col, alpha=0.2) periodicity[barcode] = [mononuc_periodicity, dinuc_periodicity] - 
#ax.axvline(max_y_pos,color = 'r',linestyle= '--') + # ax.axvline(max_y_pos,color = 'r',linestyle= '--') plt.xticks(fontsize=20) plt.yticks(fontsize=20) ax.set_xlim(0, 400) - ax.set_xlabel('Period (bp)', fontsize=20) - ax.set_ylabel('Power', fontsize=20) - ax.set_title('Period of fragment distribution', fontsize=20) + ax.set_xlabel("Period (bp)", fontsize=20) + ax.set_ylabel("Power", fontsize=20) + ax.set_title("Period of fragment distribution", fontsize=20) - plt.savefig(plot,dpi=300,bbox_inches='tight') + plt.savefig(plot, dpi=300, bbox_inches="tight") return periodicity diff --git a/sincei/GLMPCA.py b/sincei/GLMPCA.py new file mode 100644 index 0000000..9dac606 --- /dev/null +++ b/sincei/GLMPCA.py @@ -0,0 +1,235 @@ +import numpy as np +import pandas as pd +import torch, os +import torch.optim +from copy import deepcopy +from joblib import Parallel, delayed +import mctorch.nn as mnn +from pickle import dump, load +import mctorch.optim as moptim +from torch.utils.data import Dataset, TensorDataset, DataLoader +from tqdm import tqdm +from scipy.stats import beta as beta_dst +from scipy.stats import lognorm +from scipy.stats import gamma as gamma_dst + +from sincei.ExponentialFamily import Gaussian, Poisson, Bernoulli, Beta, Gamma, LogNormal + +EXPONENTIAL_FAMILY_DICT = { + "gaussian": Gaussian, + "poisson": Poisson, + "bernoulli": Bernoulli, + "beta": Beta, + "gamma": Gamma, + "lognormal": LogNormal, + "log_normal": LogNormal, +} +LEARNING_RATE_LIMIT = 10 ** (-10) + + +class GLMPCA: + def __init__( + self, + n_pc, + family, + family_params=None, + max_iter=1000, + learning_rate=0.02, + batch_size=128, + step_size=20, + gamma=0.5, + n_init=1, + init="spectral", + n_jobs=1, + ): + self.n_pc = n_pc + self.family = family + self.family_params = family_params + self.log_part_theta_matrices_ = None + self.max_iter = np.abs(max_iter) + self.learning_rate_ = learning_rate + self.initial_learning_rate_ = learning_rate + self.n_jobs = n_jobs + self.batch_size = batch_size + self.n_init = n_init + self.gamma = gamma + self.step_size = step_size + self.init = init + + self.saturated_loadings_ = None + # saturated_intercept_: before projecting + self.saturated_intercept_ = None + # reconstruction_intercept: after projecting + self.reconstruction_intercept_ = None + + # Whether to perform sample or gene projection + self.sample_projection = False + + self.exp_family_params = None + self.loadings_learning_scores_ = [] + self.loadings_learning_rates_ = [] + + # Initialize device + self.device = None + + # Set up exponential family + if type(family) is str: + self.exponential_family = EXPONENTIAL_FAMILY_DICT[family](self.family_params) + else: + self.exponential_family = family + + def fit(self, X): + # Fit exponential family params (e.g., dispersion for negative binomial) + self.exponential_family.initialize_family_parameters(X) + + # Compute saturated parameters, alongside exponential family parameters (if needed) + saturated_parameters = self.exponential_family.invert_g(X) + + # Use saturated parameters to find loadings by projected gradient descent + self.saturated_loadings_ = [] + self.saturated_intercept_ = [] + + # Initialize the learning procedure + self.learning_rate_ = self.initial_learning_rate_ + self.loadings_learning_scores_ = [] + self.loadings_learning_rates_ = [] + + # Compute loadings for different parameters + for _ in range(self.n_init): + self._compute_saturated_loadings(X) + + # Select best model + training_cost = torch.Tensor( + [ + self._optim_cost(loadings, intercept, X, 
saturated_parameters) + for loadings, intercept in zip(self.saturated_loadings_, self.saturated_intercept_) + ] + ) + best_model_idx = torch.argmin(training_cost) + self.saturated_intercept_ = self.saturated_intercept_[best_model_idx] + self.saturated_loadings_ = self.saturated_loadings_[best_model_idx] + + def transform(self, X): + saturated_parameters = self.exponential_family.invert_g(X) + + # Compute intercept term + n = X.shape[0] + intercept_term = self.saturated_intercept_.unsqueeze(0).repeat(n, 1).to(self.device) + + projected_parameters = saturated_parameters - intercept_term + projected_parameters = projected_parameters.matmul(self.saturated_loadings_) + + return projected_parameters + + def _compute_saturated_loadings(self, X): + # Compute saturated parameters and load on divide + saturated_parameters = self.exponential_family.invert_g(X) + + # Train GLM-PCA with mcTorch. + loadings_ = self._saturated_loading_iter(saturated_parameters, X) + self.saturated_loadings_.append(loadings_[0]) + self.saturated_intercept_.append(loadings_[1]) + + def _saturated_loading_iter(self, saturated_parameters: torch.Tensor, X: torch.Tensor): + if self.learning_rate_ < LEARNING_RATE_LIMIT: + raise ValueError("LEARNING RATE IS TOO SMALL : DID NOT CONVERGE") + + # Set device for GPU usage + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Set up list of saving scores + self.loadings_learning_scores_.append([]) + self.loadings_learning_rates_.append([]) + + _optimizer, _loadings, _intercept, _lr_scheduler = self._create_saturated_loading_optim( + parameters=saturated_parameters.data.clone(), X=X + ) + + train_data = TensorDataset(X, saturated_parameters.data.clone()) + train_loader = DataLoader(dataset=train_data, batch_size=self.batch_size, shuffle=True) + + for _ in tqdm(range(self.max_iter)): + loss_val = [] + for batch_data, batch_parameters in train_loader: + cost_step = self._optim_cost( + loadings=_loadings, + intercept=_intercept, + batch_data=batch_data, + batch_parameters=batch_parameters, + ) + + if "cuda" in str(self.device): + self.loadings_learning_scores_[-1].append(cost_step.cpu().detach().numpy()) + else: + self.loadings_learning_scores_[-1].append(cost_step.detach().numpy()) + cost_step.backward() + _optimizer.step() + _optimizer.zero_grad() + self.loadings_learning_rates_[-1].append(_lr_scheduler.get_last_lr()) + _lr_scheduler.step() + + if np.isinf(self.loadings_learning_scores_[-1][-1]) or np.isnan(self.loadings_learning_scores_[-1][-1]): + print("\tRESTART BECAUSE INF/NAN FOUND", flush=True) + self.learning_rate_ = self.learning_rate_ * self.gamma + self.loadings_learning_scores_ = self.loadings_learning_scores_[:-1] + self.loadings_learning_rates_ = self.loadings_learning_rates_[:-1] + + # Remove memory + del train_data, train_loader, _optimizer, _loadings, _intercept, _lr_scheduler + if "cuda" in str(self.device): + torch.cuda.empty_cache() + + return self._saturated_loading_iter( + saturated_parameters=saturated_parameters, + X=X, + ) + + return (_loadings, _intercept) + + def _create_saturated_loading_optim(self, parameters: torch.Tensor, X: torch.Tensor): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize loadings with spectrum + random_idx = np.random.choice(np.arange(parameters.shape[0]), replace=False, size=self.batch_size) + if self.init == "spectral": + _, _, v = torch.linalg.svd(parameters[random_idx] - torch.mean(parameters[random_idx], axis=0)) + loadings = mnn.Parameter(data=v[: self.n_pc, 
:].T, manifold=mnn.Stiefel(parameters.shape[1], self.n_pc)).to( + self.device + ) + elif self.init == "random": + loadings = mnn.Parameter(manifold=mnn.Stiefel(parameters.shape[1], self.n_pc)).to(self.device) + + # Initialize intercept + if self.exponential_family.family_name in ["poisson"]: + intercept = mnn.Parameter( + torch.median(parameters[random_idx], axis=0).values, manifold=mnn.Euclidean(parameters.shape[1]) + ).to(self.device) + else: + intercept = mnn.Parameter( + torch.mean(parameters[random_idx], axis=0), manifold=mnn.Euclidean(parameters.shape[1]) + ).to(self.device) + + # Load ExponentialFamily params to GPU (if they exist) + self.exponential_family.load_family_params_to_gpu(self.device) + + # Create optimizer + # TODO: allow for other optimizer to be used. + print("LEARNING RATE: %s" % (self.learning_rate_)) + optimizer = moptim.rAdagrad( + params=[{"params": loadings, "lr": self.learning_rate_}, {"params": intercept, "lr": self.learning_rate_}] + ) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.step_size, gamma=self.gamma) + + return optimizer, loadings, intercept, lr_scheduler + + def _optim_cost( + self, loadings: torch.Tensor, intercept: torch.Tensor, batch_data: torch.Tensor, batch_parameters: torch.Tensor + ): + n = batch_data.shape[0] + intercept_term = intercept.unsqueeze(0).repeat(n, 1).to(self.device) + + projected_parameters = batch_parameters - intercept_term + projected_parameters = projected_parameters.matmul(loadings).matmul(loadings.T) + projected_parameters = projected_parameters + intercept_term + + return self.exponential_family.log_likelihood(batch_data, projected_parameters) diff --git a/sincei/GetStats.py b/sincei/GetStats.py index 6e4cdfe..009f438 100644 --- a/sincei/GetStats.py +++ b/sincei/GetStats.py @@ -8,9 +8,11 @@ import numpy as np import py2bit import pandas as pd + ## own functions -scriptdir=os.path.join(os.path.abspath(os.pardir), "sincei") -from Utilities import * +scriptdir = os.path.join(os.path.abspath(os.pardir), "sincei") +from sincei.Utilities import * + def getStats_worker(arglist): r"""Computes statistics for each read in a bam file @@ -63,7 +65,7 @@ def getStats_worker(arglist): prev_pos = set() lpos = None ## initiate a dict with all values to keep per read - info_list = []#dict.fromkeys(['barcode', 'position', 'duplicate', 'GCcontent', 'strand']) + info_list = [] # dict.fromkeys(['barcode', 'position', 'duplicate', 'GCcontent', 'strand']) for read in fh.fetch(chromUse, start, end): ## general filtering @@ -78,16 +80,20 @@ def getStats_worker(arglist): if args.minAlignedFraction: if not checkAlignedFraction(read, args.minAlignedFraction): continue - if blackList and blackList.findOverlaps(chrom, read.reference_start, read.reference_start + read.infer_query_length(always=False) - 1): + if blackList and blackList.findOverlaps( + chrom, + read.reference_start, + read.reference_start + read.infer_query_length(always=False) - 1, + ): continue if args.motifFilter: - test = [ checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter ] + test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter] # if none given motif found, return true if not any(test): continue # now collect info - info = [None for n in range(0,8)] + info = [None for n in range(0, 8)] info[0] = chromUse if args.barcodes is not None: bc = read.get_tag(args.cellTag) @@ -102,9 +108,8 @@ def getStats_worker(arglist): ## Duplicates tup = getDupFilterTuple(read, bc, args.duplicateFilter) - if lpos is 
not None and lpos == read.reference_start \ - and tup in prev_pos: - info[3] = True# read is duplicate + if lpos is not None and lpos == read.reference_start and tup in prev_pos: + info[3] = True # read is duplicate info[4] = 0 else: info[3] = False diff --git a/sincei/ParserCommon.py b/sincei/ParserCommon.py index 2904e0b..cc8b848 100644 --- a/sincei/ParserCommon.py +++ b/sincei/ParserCommon.py @@ -1,111 +1,138 @@ import argparse import os from deeptools.utilities import smartLabels -from _version import __version__ +from sincei._version import __version__ + def inputOutputOptions(args=None, opts=None, requiredOpts=[], suppress_args=None): parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Input/Output options') + group = parser.add_argument_group("Input/Output options") ## inputs - if 'loomfile' in opts: - group.add_argument('--input', '-i', - metavar='LOOM', - help='Input file in .loom format', - required=True) - elif 'bamfiles' in opts: - group.add_argument('--bamfiles', '-b', - metavar='FILE1 FILE2', - help='List of indexed bam files separated by spaces.', - nargs='+', - required=True) - elif 'bamfile' in opts: - group.add_argument('--bamfile', '-b', - metavar='FILE', - help='Indexed BAM file', - required=True) - if 'whitelist' in opts: - group.add_argument('--whitelist', '-w', - help="A single-column file containing the whitelist of barcodes to be used", - metavar="TXT", - default=None, - required=True if 'whitelist' in requiredOpts else False) - elif 'barcodes' in opts: - group.add_argument('--barcodes', '-bc', - help="A single-column file containing barcodes (whitelist) to be used for the analysis.", - metavar="TXT", - required=True if 'barcodes' in requiredOpts else False) - if 'groupInfo' in opts: - group.add_argument('--groupInfo', '-i', - help='A 3-column tsv file with Cell grouping information in the ' - 'format: `sample, barcode, group`. Coverages will be ' - 'computed per group.', - metavar='TXT file', - required=True) - - if 'BED' in opts: - group.add_argument('--BED', - help=show_or_hide('Limits the coverage analysis to ' - 'the regions specified in these files.', - 'BED' , suppress_args), - metavar='FILE1.bed FILE2.bed', - nargs='+', - required=True if 'BED' in requiredOpts else False) + if "loomfile" in opts: + group.add_argument( + "--input", + "-i", + metavar="LOOM", + help="Input file in .loom format", + required=True, + ) + elif "bamfiles" in opts: + group.add_argument( + "--bamfiles", + "-b", + metavar="FILE1 FILE2", + help="List of indexed bam files separated by spaces.", + nargs="+", + required=True, + ) + elif "bamfile" in opts: + group.add_argument("--bamfile", "-b", metavar="FILE", help="Indexed BAM file", required=True) + if "whitelist" in opts: + group.add_argument( + "--whitelist", + "-w", + help="A single-column file containing the whitelist of barcodes to be used", + metavar="TXT", + default=None, + required=True if "whitelist" in requiredOpts else False, + ) + elif "barcodes" in opts: + group.add_argument( + "--barcodes", + "-bc", + help="A single-column file containing barcodes (whitelist) to be used for the analysis.", + metavar="TXT", + required=True if "barcodes" in requiredOpts else False, + ) + if "groupInfo" in opts: + group.add_argument( + "--groupInfo", + "-i", + help="A 3-column tsv file with Cell grouping information in the " + "format: `sample, barcode, group`. 
Coverages will be " + "computed per group.", + metavar="TXT file", + required=True, + ) + + if "BED" in opts: + group.add_argument( + "--BED", + help=show_or_hide( + "Limits the coverage analysis to " "the regions specified in these files.", + "BED", + suppress_args, + ), + metavar="FILE1.bed FILE2.bed", + nargs="+", + required=True if "BED" in requiredOpts else False, + ) ## outputs - if 'outFilePrefix' in opts: - group.add_argument('--outFilePrefix', '-o', - help='Output file name prefix.', - metavar='FILEPREFIX', - type=str, - required=True if 'outFilePrefix' in requiredOpts else False) - - elif 'outFile' in opts: - group.add_argument('--outFile', '-o', - type=str, - help='The file to write results to. For `scFilterStats`, `scFilterBarcodes` ' - 'and `scJSD`, the output file is a .txt file. For other tools, the output file is ' - 'an updated .loom object with the result of the requested operation. ', - required=True if 'outFile' in requiredOpts else False) + if "outFilePrefix" in opts: + group.add_argument( + "--outFilePrefix", + "-o", + help="Output file name prefix.", + metavar="FILEPREFIX", + type=str, + required=True if "outFilePrefix" in requiredOpts else False, + ) + + elif "outFile" in opts: + group.add_argument( + "--outFile", + "-o", + type=str, + help="The file to write results to. For `scFilterStats`, `scFilterBarcodes` " + "and `scJSD`, the output file is a .txt file. For other tools, the output file is " + "an updated .loom object with the result of the requested operation. ", + required=True if "outFile" in requiredOpts else False, + ) return parser + def otherOptions(args=None): parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Other options') + group = parser.add_argument_group("Other options") - group.add_argument("--help", "-h", action="help", - help="show this help message and exit") + group.add_argument("--help", "-h", action="help", help="show this help message and exit") - group.add_argument('--verbose', '-v', - help='Set to see processing messages.', - action='store_true') + group.add_argument("--verbose", "-v", help="Set to see processing messages.", action="store_true") - group.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + group.add_argument("--version", action="version", version="%(prog)s {}".format(__version__)) return parser + def plotOptions(args=None): parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Plot options') - - group.add_argument('--plotWidth', - default=10, - type=float, - help='Output plot width (in cm). (Default: %(default)s)') - - group.add_argument('--plotHeight', - default=10, - type=float, - help='Output plot height (in cm). (Default: %(default)s)') - - group.add_argument('--plotFileFormat', - metavar='FILETYPE', - type=str, - choices=['png', 'pdf', 'svg', 'eps'], - default='png', - help='Image format type. If given, this option ' - 'overrides the image format based on the plotFile ' - 'ending. (Default: %(default)s)') + group = parser.add_argument_group("Plot options") + + group.add_argument( + "--plotWidth", + default=10, + type=float, + help="Output plot width (in cm). (Default: %(default)s)", + ) + + group.add_argument( + "--plotHeight", + default=10, + type=float, + help="Output plot height (in cm). (Default: %(default)s)", + ) + + group.add_argument( + "--plotFileFormat", + metavar="FILETYPE", + type=str, + choices=["png", "pdf", "svg", "eps"], + default="png", + help="Image format type. 
If given, this option " + "overrides the image format based on the plotFile " + "ending. (Default: %(default)s)", + ) return parser @@ -116,244 +143,358 @@ def bamOptions(args=None, suppress_args=None, default_opts=None): """ parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('BAM processing options') - - group.add_argument('--cellTag', '-ct', - metavar='STR', - help='Name of the BAM tag from which to extract barcodes.', - type=str, - default='BC') - - group.add_argument('--groupTag', '-gt', - metavar='STR', - help=show_or_hide('In case of a groupped BAM file, such as the one containing Read Group (`RG`) or Sample (`SM`) tag,' - 'it is possible to process group the reads using the provided --groupTag argument. NOTE: In case of such input, ' - 'please ensure that the --labels argument indicates the expected group labels contained in the BAM files. ' - 'The --groupTag along with the --cellTag is then used to identify unique samples (cells) from the input.', - 'groupTag',suppress_args), - type=str, - default=None) - - group.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. Type "max/2" to ' - 'use half the maximum number of processors or "max" ' - 'to use all available processors. (Default: %(default)s)', - metavar="INT", - type=numberOfProcessors, - default=1, - required=False) - - group.add_argument('--labels', '-l', - metavar='sample1 sample2', - help=show_or_hide('User defined labels instead of default labels from ' - 'file names. Multiple labels have to be separated by a space, e.g. ' - '--labels sample1 sample2 sample3', - 'labels', suppress_args), - nargs='+') - - group.add_argument('--smartLabels', - action='store_true', - help=show_or_hide('Instead of manually specifying labels for the input ' - 'BAM files, this causes sincei to use the file name ' - 'after removing the path and extension.', - 'smartLabels', suppress_args) - ) - - group.add_argument('--region', '-r', - help=show_or_hide('Region of the genome to limit the operation ' - 'to - this is useful when testing parameters to ' - 'reduce the computing time. The format is ' - 'chr:start:end, for example --region chr10 or --region chr10:456700:891000.', - 'region', suppress_args), - metavar="CHR:START:END", - required=False, - type=genomicRegion) - - group.add_argument('--blackListFileName', '-bl', - help=show_or_hide("A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", - 'blackListFileName', suppress_args), - metavar="BED file", - nargs="+", - required=False) - - group.add_argument('--binSize', '-bs', - help=show_or_hide('Size of the bins, in bases, to calculate coverage (Default: %(default)s)', - 'binSize', suppress_args), - metavar="INT bp", - type=int, - default=get_default('binSize', default_opts)) - - group.add_argument('--distanceBetweenBins', - metavar='INT', - help=show_or_hide('The gap distance between bins during counting. ' - 'Larger numbers can be used to sample the genome for input files with high coverage ' - 'while smaller values are useful for lower coverage data. Note that if you specify a value that ' - 'results in too few (<1000) reads sampled, the value will be decreased. 
(Default: %(default)s)', - 'distanceBetweenBins', suppress_args), - default=get_default('distanceBetweenBins', default_opts), - type=int) + group = parser.add_argument_group("BAM processing options") + + group.add_argument( + "--cellTag", + "-ct", + metavar="STR", + help="Name of the BAM tag from which to extract barcodes.", + type=str, + default="BC", + ) + + group.add_argument( + "--groupTag", + "-gt", + metavar="STR", + help=show_or_hide( + "In case of a groupped BAM file, such as the one containing Read Group (`RG`) or Sample (`SM`) tag," + "it is possible to process group the reads using the provided --groupTag argument. NOTE: In case of such input, " + "please ensure that the --labels argument indicates the expected group labels contained in the BAM files. " + "The --groupTag along with the --cellTag is then used to identify unique samples (cells) from the input.", + "groupTag", + suppress_args, + ), + type=str, + default=None, + ) + + group.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. (Default: %(default)s)", + metavar="INT", + type=numberOfProcessors, + default=1, + required=False, + ) + + group.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help=show_or_hide( + "User defined labels instead of default labels from " + "file names. Multiple labels have to be separated by a space, e.g. " + "--labels sample1 sample2 sample3", + "labels", + suppress_args, + ), + nargs="+", + ) + + group.add_argument( + "--smartLabels", + action="store_true", + help=show_or_hide( + "Instead of manually specifying labels for the input " + "BAM files, this causes sincei to use the file name " + "after removing the path and extension.", + "smartLabels", + suppress_args, + ), + ) + + group.add_argument( + "--region", + "-r", + help=show_or_hide( + "Region of the genome to limit the operation " + "to - this is useful when testing parameters to " + "reduce the computing time. The format is " + "chr:start:end, for example --region chr10 or --region chr10:456700:891000.", + "region", + suppress_args, + ), + metavar="CHR:START:END", + required=False, + type=genomicRegion, + ) + + group.add_argument( + "--blackListFileName", + "-bl", + help=show_or_hide( + "A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", + "blackListFileName", + suppress_args, + ), + metavar="BED file", + nargs="+", + required=False, + ) + + group.add_argument( + "--binSize", + "-bs", + help=show_or_hide( + "Size of the bins, in bases, to calculate coverage (Default: %(default)s)", + "binSize", + suppress_args, + ), + metavar="INT bp", + type=int, + default=get_default("binSize", default_opts), + ) + + group.add_argument( + "--distanceBetweenBins", + metavar="INT", + help=show_or_hide( + "The gap distance between bins during counting. " + "Larger numbers can be used to sample the genome for input files with high coverage " + "while smaller values are useful for lower coverage data. Note that if you specify a value that " + "results in too few (<1000) reads sampled, the value will be decreased. 
(Default: %(default)s)", + "distanceBetweenBins", + suppress_args, + ), + default=get_default("distanceBetweenBins", default_opts), + type=int, + ) return parser + ## Tools: scBulkCoverage, scCountReads def filterOptions(args=None, suppress_args=None): parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Read Filtering Options') - - group.add_argument('--duplicateFilter', - help=show_or_hide('How to filter for duplicates? Different combinations (using start/end/umi) are possible. ' - 'Read start position and read barcode are always considered. Default (None) would consider all reads. ' - 'Note that in case of paired end data, both reads in the fragment are considered (and kept). So if you wish ' - 'to keep only read1, combine this option with samFlagInclude ', - 'duplicateFilter' , suppress_args), - type=str, - choices=['start_bc', 'start_bc_umi', 'start_end_bc', 'start_end_bc_umi'], - default=None) - - group.add_argument('--motifFilter', '-m', - metavar='STR', - help=show_or_hide('Check whether a given motif is present in the read and the corresponding reference genome. ' - 'This option checks for the motif at the 5-end of the read and at the 5-overhang in the genome, ' - 'which is useful in identifying reads properly cut by a restriction-enzyme or MNAse. ' - 'For example, if you want to search for an "A" at the 5\'-end of the read and "TA" at 5\'-overhang, ' - 'use "-m \'A,TA\'". Reads not containing the given motif are discarded. ', - 'motifFilter' , suppress_args), - type=str, - nargs='+', - default=None) - - group.add_argument('--genome2bit', '-g', - metavar='STR', - help=show_or_hide('If --motifFilter is provided, please also provide the genome sequence (in 2bit format). ', - 'genome2bit' , suppress_args), - type=str, - default=None) - - group.add_argument('--GCcontentFilter', '-gc', - metavar='STR', - help=show_or_hide('Check whether the GC content of the read falls within the provided range. ' - 'If the GC content of the reads fall outside the range, they are discarded. ', - 'GCcontentFilter' , suppress_args), - type=str, - default=None) - - group.add_argument('--minAlignedFraction', - help=show_or_hide('Minimum fraction of the reads which should be aligned to be counted. This includes ' - 'mismatches tolerated by the aligners, but excludes InDels/Clippings (Default: %(default)s)', - 'minAlignedFraction' , suppress_args), - metavar='FLOAT', - default=None, - type=float, - required=False) + group = parser.add_argument_group("Read Filtering Options") + + group.add_argument( + "--duplicateFilter", + help=show_or_hide( + "How to filter for duplicates? Different combinations (using start/end/umi) are possible. " + "Read start position and read barcode are always considered. Default (None) would consider all reads. " + "Note that in case of paired end data, both reads in the fragment are considered (and kept). So if you wish " + "to keep only read1, combine this option with samFlagInclude ", + "duplicateFilter", + suppress_args, + ), + type=str, + choices=["start_bc", "start_bc_umi", "start_end_bc", "start_end_bc_umi"], + default=None, + ) + + group.add_argument( + "--motifFilter", + "-m", + metavar="STR", + help=show_or_hide( + "Check whether a given motif is present in the read and the corresponding reference genome. " + "This option checks for the motif at the 5-end of the read and at the 5-overhang in the genome, " + "which is useful in identifying reads properly cut by a restriction-enzyme or MNAse. 
" + 'For example, if you want to search for an "A" at the 5\'-end of the read and "TA" at 5\'-overhang, ' + "use \"-m 'A,TA'\". Reads not containing the given motif are discarded. ", + "motifFilter", + suppress_args, + ), + type=str, + nargs="+", + default=None, + ) + + group.add_argument( + "--genome2bit", + "-g", + metavar="STR", + help=show_or_hide( + "If --motifFilter is provided, please also provide the genome sequence (in 2bit format). ", + "genome2bit", + suppress_args, + ), + type=str, + default=None, + ) + + group.add_argument( + "--GCcontentFilter", + "-gc", + metavar="STR", + help=show_or_hide( + "Check whether the GC content of the read falls within the provided range. " + "If the GC content of the reads fall outside the range, they are discarded. ", + "GCcontentFilter", + suppress_args, + ), + type=str, + default=None, + ) + + group.add_argument( + "--minAlignedFraction", + help=show_or_hide( + "Minimum fraction of the reads which should be aligned to be counted. This includes " + "mismatches tolerated by the aligners, but excludes InDels/Clippings (Default: %(default)s)", + "minAlignedFraction", + suppress_args, + ), + metavar="FLOAT", + default=None, + type=float, + required=False, + ) return parser + ## Tools: scBulkCoverage, scCountReads def readOptions(args=None, suppress_args=None): """Common arguments related to BAM files and the interpretation of the read coverage """ parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Read Processing Options') - - group.add_argument('--minMappingQuality', - metavar='INT', - help=show_or_hide('If set, only reads that have a mapping ' - 'quality score of at least this are considered.', - 'minMappingQuality', suppress_args), - type=int, - ) - - group.add_argument('--samFlagInclude', - help=show_or_hide('Include reads based on the SAM flag. For example, ' - 'to get only reads that are the first mate, use a flag of 64. ' - 'This is useful to count properly paired reads only once, ' - 'as otherwise the second mate will be also considered for the ' - 'coverage. (Default: %(default)s)', - 'samFlagInclude' , suppress_args), - metavar='INT', - default=None, - type=int, - required=False) - - group.add_argument('--samFlagExclude', - help=show_or_hide('Exclude reads based on the SAM flag. For example, ' - 'to get only reads that map to the forward strand, use ' - '--samFlagExclude 16, where 16 is the SAM flag for reads ' - 'that map to the reverse strand. (Default: %(default)s)', - 'samFlagExclude' , suppress_args), - metavar='INT', - default=None, - type=int, - required=False) - - group.add_argument('--minFragmentLength', - help=show_or_hide('The minimum fragment length needed for read/pair ' - 'inclusion. This option is primarily useful ' - 'in ATACseq experiments, for filtering mono- or ' - 'di-nucleosome fragments. (Default: %(default)s)', - 'minFragmentLength' , suppress_args), - metavar='INT', - default=0, - type=int, - required=False) - - group.add_argument('--maxFragmentLength', - help=show_or_hide('The maximum fragment length needed for read/pair ' - 'inclusion. (Default: %(default)s)', - 'maxFragmentLength' , suppress_args), - metavar='INT', - default=0, - type=int, - required=False) - - group.add_argument('--filterRNAstrand', - help=show_or_hide('Selects RNA-seq reads (single-end or paired-end) originating from genes ' - 'on the given strand. 
This option assumes a standard dUTP-based library ' - 'preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, ' - 'which originally came from genes on the forward strand using a dUTP-based ' - 'method). Consider using --samExcludeFlag instead for filtering by strand in ' - 'other contexts.', - 'filterRNAstrand' , suppress_args), - choices=['forward', 'reverse'], - default=None) - - group.add_argument('--extendReads', '-e', - help=show_or_hide('This parameter allows the extension of reads to ' - 'fragment size. If set, each read is ' - 'extended, without exception.\n' - '*NOTE*: This feature is generally NOT recommended for ' - 'spliced-read data, such as RNA-seq, as it would ' - 'extend reads over skipped regions.\n' - '*Single-end*: Requires a user specified value for the ' - 'final fragment length. Reads that already exceed this ' - 'fragment length will not be extended.\n' - '*Paired-end*: Reads with mates are always extended to ' - 'match the fragment size defined by the two read mates. ' - 'Unmated reads, mate reads that map too far apart ' - '(>4x fragment length) or even map to different ' - 'chromosomes are treated like single-end reads. The input ' - 'of a fragment length value is optional. If ' - 'no value is specified, it is estimated from the ' - 'data (mean of the fragment size of all mate reads).\n', - 'extendReads' , suppress_args), - type=int, - nargs='?', - const=True, - default=False, - metavar="INT bp") - - group.add_argument('--centerReads', - help=show_or_hide('By adding this option, reads are centered with ' - 'respect to the fragment length. For paired-end data, ' - 'the read is centered at the fragment length defined ' - 'by the two ends of the fragment. For single-end data, the ' - 'given fragment length is used. This option is ' - 'useful to get a sharper signal around enriched regions.', - 'centerReads' , suppress_args), - action='store_true') + group = parser.add_argument_group("Read Processing Options") + + group.add_argument( + "--minMappingQuality", + metavar="INT", + help=show_or_hide( + "If set, only reads that have a mapping " "quality score of at least this are considered.", + "minMappingQuality", + suppress_args, + ), + type=int, + ) + + group.add_argument( + "--samFlagInclude", + help=show_or_hide( + "Include reads based on the SAM flag. For example, " + "to get only reads that are the first mate, use a flag of 64. " + "This is useful to count properly paired reads only once, " + "as otherwise the second mate will be also considered for the " + "coverage. (Default: %(default)s)", + "samFlagInclude", + suppress_args, + ), + metavar="INT", + default=None, + type=int, + required=False, + ) + + group.add_argument( + "--samFlagExclude", + help=show_or_hide( + "Exclude reads based on the SAM flag. For example, " + "to get only reads that map to the forward strand, use " + "--samFlagExclude 16, where 16 is the SAM flag for reads " + "that map to the reverse strand. (Default: %(default)s)", + "samFlagExclude", + suppress_args, + ), + metavar="INT", + default=None, + type=int, + required=False, + ) + + group.add_argument( + "--minFragmentLength", + help=show_or_hide( + "The minimum fragment length needed for read/pair " + "inclusion. This option is primarily useful " + "in ATACseq experiments, for filtering mono- or " + "di-nucleosome fragments. 
(Default: %(default)s)", + "minFragmentLength", + suppress_args, + ), + metavar="INT", + default=0, + type=int, + required=False, + ) + + group.add_argument( + "--maxFragmentLength", + help=show_or_hide( + "The maximum fragment length needed for read/pair " "inclusion. (Default: %(default)s)", + "maxFragmentLength", + suppress_args, + ), + metavar="INT", + default=0, + type=int, + required=False, + ) + + group.add_argument( + "--filterRNAstrand", + help=show_or_hide( + "Selects RNA-seq reads (single-end or paired-end) originating from genes " + "on the given strand. This option assumes a standard dUTP-based library " + "preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, " + "which originally came from genes on the forward strand using a dUTP-based " + "method). Consider using --samExcludeFlag instead for filtering by strand in " + "other contexts.", + "filterRNAstrand", + suppress_args, + ), + choices=["forward", "reverse"], + default=None, + ) + + group.add_argument( + "--extendReads", + "-e", + help=show_or_hide( + "This parameter allows the extension of reads to " + "fragment size. If set, each read is " + "extended, without exception.\n" + "*NOTE*: This feature is generally NOT recommended for " + "spliced-read data, such as RNA-seq, as it would " + "extend reads over skipped regions.\n" + "*Single-end*: Requires a user specified value for the " + "final fragment length. Reads that already exceed this " + "fragment length will not be extended.\n" + "*Paired-end*: Reads with mates are always extended to " + "match the fragment size defined by the two read mates. " + "Unmated reads, mate reads that map too far apart " + "(>4x fragment length) or even map to different " + "chromosomes are treated like single-end reads. The input " + "of a fragment length value is optional. If " + "no value is specified, it is estimated from the " + "data (mean of the fragment size of all mate reads).\n", + "extendReads", + suppress_args, + ), + type=int, + nargs="?", + const=True, + default=False, + metavar="INT bp", + ) + + group.add_argument( + "--centerReads", + help=show_or_hide( + "By adding this option, reads are centered with " + "respect to the fragment length. For paired-end data, " + "the read is centered at the fragment length defined " + "by the two ends of the fragment. For single-end data, the " + "given fragment length is used. 
This option is " + "useful to get a sharper signal around enriched regions.", + "centerReads", + suppress_args, + ), + action="store_true", + ) return parser + ## helper functions def show_or_hide(help, name, arglist): if arglist and name in arglist: @@ -361,12 +502,14 @@ def show_or_hide(help, name, arglist): else: return help + def get_default(name, argdict): if argdict and name in argdict.keys(): return argdict[name] else: return None + def process_args(args=None): if not args.labels: if args.smartLabels: @@ -379,10 +522,11 @@ def process_args(args=None): return args + def genomicRegion(string): # remove whitespaces using split,join trick - region = ''.join(string.split()) - if region == '': + region = "".join(string.split()) + if region == "": return None # remove undesired characters that may be present and # replace - by : @@ -392,12 +536,13 @@ def genomicRegion(string): except: region = region.translate({ord(i): None for i in ",;|!{}()"}) if len(region) == 0: - raise argparse.ArgumentTypeError( - "{} is not a valid region".format(string)) + raise argparse.ArgumentTypeError("{} is not a valid region".format(string)) return region + def numberOfProcessors(string): import multiprocessing + availProc = multiprocessing.cpu_count() if string == "max/2": # default case @@ -410,27 +555,29 @@ def numberOfProcessors(string): try: numberOfProcessors = int(string) except ValueError: - raise argparse.ArgumentTypeError( - "{} is not a valid number of processors".format(string)) + raise argparse.ArgumentTypeError("{} is not a valid number of processors".format(string)) except Exception as e: - raise argparse.ArgumentTypeError("the given value {} is not valid. " - "Error message: {}\nThe number of " - "available processors in your " - "computer is {}.".format(string, e, availProc)) + raise argparse.ArgumentTypeError( + "the given value {} is not valid. " + "Error message: {}\nThe number of " + "available processors in your " + "computer is {}.".format(string, e, availProc) + ) if numberOfProcessors > availProc: numberOfProcessors = availProc return numberOfProcessors + def smartLabel(label): """ Remove the path name and the last extension from the file name /pth/to/file.name.bam -> file.name """ lab = os.path.splitext(os.path.basename(label))[0] - if lab == '': + if lab == "": # Maybe we have a dot file? lab = os.path.basename(label) return lab @@ -439,10 +586,13 @@ def smartLabel(label): def smartLabels(labels): smrt = [smartLabel(x) for x in labels] if len(smrt) != len(set(smrt)): - print("Labels inferred from file names are not unique. " - "Please be aware that in case of overlapping barcodes the counts will be merged.") + print( + "Labels inferred from file names are not unique. " + "Please be aware that in case of overlapping barcodes the counts will be merged." + ) return smrt + def validateInputs(args, process_barcodes=True): """ Ensure that right input is provided from argparse @@ -459,8 +609,10 @@ def validateInputs(args, process_barcodes=True): exit(0) else: if args.labels and len(args.bamfiles) != len(args.labels): - print("The number of labels does not match the number of bam files. " - "This is only allowed if a single BAM file is provided and --groupTag is specified.") + print( + "The number of labels does not match the number of bam files. " + "This is only allowed if a single BAM file is provided and --groupTag is specified." 
+ ) exit(0) if not args.labels: args.labels = smartLabels(args.bamfiles) @@ -471,7 +623,7 @@ def validateInputs(args, process_barcodes=True): print("MotifFilter asked but genome (2bit) file not provided.") sys.exit(1) else: - args.motifFilter = [ x.strip(" ").split(",") for x in args.motifFilter ] + args.motifFilter = [x.strip(" ").split(",") for x in args.motifFilter] if args.GCcontentFilter: gc = args.GCcontentFilter.strip(" ").split(",") @@ -479,11 +631,11 @@ def validateInputs(args, process_barcodes=True): ## Barcodes + new labels (if required) if process_barcodes: - with open(args.barcodes, 'r') as f: + with open(args.barcodes, "r") as f: barcodes = f.read().splitlines() f.close() args.barcodes = barcodes - newlabels = ["{}::{}".format(a, b) for a in args.labels for b in barcodes ] + newlabels = ["{}::{}".format(a, b) for a in args.labels for b in barcodes] return args, newlabels else: return args diff --git a/sincei/ReadCounter.py b/sincei/ReadCounter.py index 62d57ee..e4891ef 100644 --- a/sincei/ReadCounter.py +++ b/sincei/ReadCounter.py @@ -4,7 +4,8 @@ import sys import multiprocessing import numpy as np -#import scipy as sc + +# import scipy as sc # deepTools packages import deeptools.utilities from deeptools import bamHandler @@ -14,13 +15,14 @@ import py2bit ## own functions -from Utilities import * +from sincei.Utilities import * debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") ####----------- Functions needed inside the class ------------------ + def remove_row_of_zeros(matrix): """ remove rows containing all zeros or all nans @@ -50,8 +52,7 @@ def estimateSizeFactors(m): loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans) # DESeq2 ratio-based size factor sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0)) - return 1. 
/ sf - + return 1.0 / sf def countReadsInRegions_wrapper(args): @@ -65,10 +66,11 @@ def countReadsInRegions_wrapper(args): """ return CountReadsPerBin.count_reads_in_region(*args) + ######### --------------- Class definitions -------------- -class CountReadsPerBin(object): +class CountReadsPerBin(object): r"""Collects coverage over multiple bam files using multiprocessing This function collects read counts (coverage) from several bam files and returns @@ -215,41 +217,46 @@ class CountReadsPerBin(object): [0., 1., 1., 2.]]) """ - def __init__(self, bamFilesList, binLength=50, - barcodes=None, - cellTag=None, - groupTag=None, - groupLabels=None, - clusterInfo=None, - motifFilter=None, - genome2bit=None, - GCcontentFilter=None, - numberOfSamples=None, - numberOfProcessors=1, - verbose=False, region=None, - bedFile=None, extendReads=False, - genomeChunkSize=None, - blackListFileName=None, - minMappingQuality=None, - duplicateFilter=None, - chrsToSkip=[], - stepSize=None, - center_read=False, - samFlag_include=None, - samFlag_exclude=None, - zerosToNans=False, - skipZeroOverZero=False, - smoothLength=0, - minFragmentLength=0, - maxFragmentLength=0, - minAlignedFraction=0, - out_file_for_raw_data=None, - bed_and_bin=False, - sumCoveragePerBin=False, - binarizeCoverage=False, - statsList=[], - mappedList=[]): - + def __init__( + self, + bamFilesList, + binLength=50, + barcodes=None, + cellTag=None, + groupTag=None, + groupLabels=None, + clusterInfo=None, + motifFilter=None, + genome2bit=None, + GCcontentFilter=None, + numberOfSamples=None, + numberOfProcessors=1, + verbose=False, + region=None, + bedFile=None, + extendReads=False, + genomeChunkSize=None, + blackListFileName=None, + minMappingQuality=None, + duplicateFilter=None, + chrsToSkip=[], + stepSize=None, + center_read=False, + samFlag_include=None, + samFlag_exclude=None, + zerosToNans=False, + skipZeroOverZero=False, + smoothLength=0, + minFragmentLength=0, + maxFragmentLength=0, + minAlignedFraction=0, + out_file_for_raw_data=None, + bed_and_bin=False, + sumCoveragePerBin=False, + binarizeCoverage=False, + statsList=[], + mappedList=[], + ): self.bamFilesList = bamFilesList self.binLength = binLength self.numberOfSamples = numberOfSamples @@ -262,25 +269,34 @@ def __init__(self, bamFilesList, binLength=50, if extendReads and len(bamFilesList): from deeptools.getFragmentAndReadSize import get_read_and_fragment_length - frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0], - return_lengths=False, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose) + + frag_len_dict, read_len_dict = get_read_and_fragment_length( + bamFilesList[0], + return_lengths=False, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + ) if extendReads is True: # try to guess fragment length if the bam file contains paired end reads if frag_len_dict: - self.defaultFragmentLength = int(frag_len_dict['median']) + self.defaultFragmentLength = int(frag_len_dict["median"]) else: exit("*ERROR*: library is not paired-end. Please provide an extension length.") if verbose: - print(("Fragment length based on paired en data " - "estimated to be {}".format(frag_len_dict['median']))) - - elif extendReads < read_len_dict['median']: - sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). 
" - "Reads will not be extended.\n".format(int(read_len_dict['median']))) - self.defaultFragmentLength = 'read length' + print( + ( + "Fragment length based on paired en data " + "estimated to be {}".format(frag_len_dict["median"]) + ) + ) + + elif extendReads < read_len_dict["median"]: + sys.stderr.write( + "*WARNING*: read extension is smaller than read length (read length = {}). " + "Reads will not be extended.\n".format(int(read_len_dict["median"])) + ) + self.defaultFragmentLength = "read length" elif extendReads > 2000: exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads)) @@ -288,7 +304,7 @@ def __init__(self, bamFilesList, binLength=50, self.defaultFragmentLength = int(extendReads) else: - self.defaultFragmentLength = 'read length' + self.defaultFragmentLength = "read length" self.numberOfProcessors = numberOfProcessors self.verbose = verbose @@ -310,9 +326,9 @@ def __init__(self, bamFilesList, binLength=50, self.cellTag = cellTag self.groupTag = groupTag self.groupLabels = groupLabels - self.clusterInfo=clusterInfo - self.motifFilter = motifFilter# list of [readMotif, refMotif] - self.GCcontentFilter = GCcontentFilter# list of [readMotif, refMotif] + self.clusterInfo = clusterInfo + self.motifFilter = motifFilter # list of [readMotif, refMotif] + self.GCcontentFilter = GCcontentFilter # list of [readMotif, refMotif] self.genome = genome2bit self.sumCoveragePerBin = sumCoveragePerBin self.binarizeCoverage = binarizeCoverage @@ -328,7 +344,7 @@ def __init__(self, bamFilesList, binLength=50, if numberOfSamples is None and stepSize is None and bedFile is None: raise ValueError("either stepSize, numberOfSamples or bedFile have to be set") - if self.defaultFragmentLength != 'read length': + if self.defaultFragmentLength != "read length": self.maxPairedFragmentLength = 4 * self.defaultFragmentLength else: self.maxPairedFragmentLength = 1000 @@ -338,7 +354,9 @@ def __init__(self, bamFilesList, binLength=50, if len(self.mappedList) == 0: try: for fname in self.bamFilesList: - bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors) + bam, mapped, unmapped, stats = bamHandler.openBam( + fname, returnStats=True, nThreads=self.numberOfProcessors + ) self.mappedList.append(mapped) self.statsList.append(stats) bam.close() @@ -433,35 +451,44 @@ def run(self, allArgs=None): self.region += ":{}".format(self.binLength) # Handle GTF options - transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs) + ( + transcriptID, + exonID, + transcript_id_designator, + keepExons, + ) = deeptools.utilities.gtfOptions(allArgs) # use map reduce to call countReadsInRegions_wrapper - imap_res, _ = mapReduce.mapReduce([], - countReadsInRegions_wrapper, - chromsizes, - self_=self, - genomeChunkLength=chunkSize, - bedFile=self.bedFile, - blackListFileName=self.blackListFileName, - region=self.region, - includeLabels=True, - numberOfProcessors=self.numberOfProcessors, - transcriptID=transcriptID, - exonID=exonID, - keepExons=keepExons, - transcript_id_designator=transcript_id_designator) + imap_res, _ = mapReduce.mapReduce( + [], + countReadsInRegions_wrapper, + chromsizes, + self_=self, + genomeChunkLength=chunkSize, + bedFile=self.bedFile, + blackListFileName=self.blackListFileName, + region=self.region, + includeLabels=True, + numberOfProcessors=self.numberOfProcessors, + transcriptID=transcriptID, + exonID=exonID, + keepExons=keepExons, + 
transcript_id_designator=transcript_id_designator, + ) if self.out_file_for_raw_data: if len(non_common): - sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for " - "the chromosomes that were not common between the bigwig files\n") + sys.stderr.write( + "*Warning*\nThe resulting bed file does not contain information for " + "the chromosomes that were not common between the bigwig files\n" + ) # concatenate intermediary bedgraph files ofile = open(self.out_file_for_raw_data, "w") for _values, tempFileName, regions in imap_res: if tempFileName: # concatenate all intermediate tempfiles into one - _foo = open(tempFileName, 'r') + _foo = open(tempFileName, "r") shutil.copyfileobj(_foo, ofile) _foo.close() os.remove(tempFileName) @@ -475,13 +502,16 @@ def run(self, allArgs=None): except ValueError: if self.bedFile: - sys.exit('\nNo coverage values could be computed.\n\n' - 'Please check that the chromosome names in the BED file are found on the bam files.\n\n' - 'The valid chromosome names are:\n{}'.format(chrNames)) + sys.exit( + "\nNo coverage values could be computed.\n\n" + "Please check that the chromosome names in the BED file are found on the bam files.\n\n" + "The valid chromosome names are:\n{}".format(chrNames) + ) else: - sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and ' - 'contain mapped reads.') - + sys.exit( + "\nNo coverage values could be computed.\n\nCheck that all bam files are valid and " + "contain mapped reads." + ) def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): """Counts the reads in each bam file at each 'stepSize' position @@ -561,7 +591,7 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): # A list of lists of tuples transcriptsToConsider = [] if bed_regions_list is not None: - #print(bed_regions_list) + # print(bed_regions_list) # bed/gtf file is provided # in this case the column entry can be optionally saved in the output regionNames = [x[2] for x in bed_regions_list] @@ -586,18 +616,22 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): continue transcriptsToConsider.append([(i, i + self.binLength)]) -# if self.save_data: -# _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') -# _file_name = _file.name -# else: -# _file_name = '' + # if self.save_data: + # _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') + # _file_name = _file.name + # else: + # _file_name = '' # array to keep the read counts for the regions subnum_reads_per_bin = [] for trans in transcriptsToConsider: for bam in bam_handles: - tcov = self.get_coverage_of_region(bam, chrom, trans)# tcov is supposed to be an np.array, but now it's a dict(barcode:array) - tcov_stack = np.stack(list(tcov.values()), axis=0)# col-bind the output (rownames = barcode, colnames = bins ) + tcov = self.get_coverage_of_region( + bam, chrom, trans + ) # tcov is supposed to be an np.array, but now it's a dict(barcode:array) + tcov_stack = np.stack( + list(tcov.values()), axis=0 + ) # col-bind the output (rownames = barcode, colnames = bins ) if bed_regions_list is not None and not self.bed_and_bin: # output should be list of arrays. 
length = nRegions, values.shape=[nBAMs*nbarcodes] subnum_reads_per_bin.append([np.sum(s) for s in tcov_stack]) @@ -612,14 +646,18 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): if not self.bed_and_bin: if self.groupTag and self.groupLabels: # stack the arrays column-wise, output rows=barcodes*groups*nBam, col=regions then reshape them so the regions are rows now - subnum_reads_per_bin = np.asarray(subnum_reads_per_bin).reshape((-1, len(self.groupLabels)*len(self.bamFilesList)), order='C') + subnum_reads_per_bin = np.asarray(subnum_reads_per_bin).reshape( + (-1, len(self.groupLabels) * len(self.bamFilesList)), order="C" + ) else: # stack the arrays column-wise, output rows=barcodes(*nBam), col=regions then reshape them so the regions are rows now - subnum_reads_per_bin = np.asarray(subnum_reads_per_bin).reshape((-1, len(self.barcodes)*len(self.bamFilesList)), order='C') + subnum_reads_per_bin = np.asarray(subnum_reads_per_bin).reshape( + (-1, len(self.barcodes) * len(self.bamFilesList)), order="C" + ) else: subnum_reads_per_bin = np.concatenate(subnum_reads_per_bin).transpose() ## convert the output to a sparse matrix - #subnum_reads_per_bin = sc.sparse.csr_matrix(subnum_reads_per_bin) + # subnum_reads_per_bin = sc.sparse.csr_matrix(subnum_reads_per_bin) ## prepare list of regions regionList = [] idx = 0 @@ -634,7 +672,7 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): ends = ",".join([str(x[1]) for x in trans]) name = "{}_{}_{}::{}".format(chrom, starts, ends, bedname) regionList.append(name) - #_file.write(name + "\n") + # _file.write(name + "\n") else: for exon in trans: for startPos in range(exon[0], exon[1], exon[2]): @@ -642,29 +680,37 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size # Counts there are added to the bin before them, but range() will still try to include them. 
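                             # (hence no separate region name is emitted for this truncated final bin)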
break - name = "{}_{}_{}::{}".format(chrom, startPos, min(startPos + exon[2], exon[1]), bedname ) + name = "{}_{}_{}::{}".format(chrom, startPos, min(startPos + exon[2], exon[1]), bedname) regionList.append(name) - #_file.write(name+"\n") + # _file.write(name+"\n") idx += 1 # save region data as text (if the mtx file is asked) if self.save_data: - _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') - _file_name = _file.name - for name in regionList: - _file.write(name+"\n") - _file.close() - regionList=None + _file = open(deeptools.utilities.getTempFileName(suffix=".bed"), "w+t") + _file_name = _file.name + for name in regionList: + _file.write(name + "\n") + _file.close() + regionList = None else: - _file_name = '' + _file_name = "" if self.verbose: endTime = time.time() rows = subnum_reads_per_bin.shape[0] - print("%s countReadsInRegions_worker: processing %d " - "(%.1f per sec) @ %s:%s-%s" % - (multiprocessing.current_process().name, - rows, rows / (endTime - start_time), chrom, start, end)) + print( + "%s countReadsInRegions_worker: processing %d " + "(%.1f per sec) @ %s:%s-%s" + % ( + multiprocessing.current_process().name, + rows, + rows / (endTime - start_time), + chrom, + start, + end, + ) + ) return subnum_reads_per_bin, _file_name, regionList @@ -712,17 +758,17 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun nbins += (reg[1] - reg[0]) // reg[2] if (reg[1] - reg[0]) % reg[2] > 0: nbins += 1 - #coverages = np.zeros(nbins, dtype='float64') + # coverages = np.zeros(nbins, dtype='float64') ## instead of an array, the coverages object is a dict with keys = barcodes, values = np arrays coverages = {} - if self.groupTag and self.groupLabels:# multi-sample BAM input, use the reconstructed labels + if self.groupTag and self.groupLabels: # multi-sample BAM input, use the reconstructed labels for b in self.groupLabels: - coverages[b] = np.zeros(nbins, dtype='float64') + coverages[b] = np.zeros(nbins, dtype="float64") else: for b in self.barcodes: - coverages[b] = np.zeros(nbins, dtype='float64') + coverages[b] = np.zeros(nbins, dtype="float64") - if self.defaultFragmentLength == 'read length': + if self.defaultFragmentLength == "read length": extension = 0 else: extension = self.maxPairedFragmentLength @@ -774,17 +820,21 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun except: # bigWig input, currently unUSED if bamHandle.chroms(chrom): - _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float) + _ = np.array( + bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), + dtype=np.float, + ) _[np.isnan(_)] = 0.0 _ = _ * tileSize coverages[bc] += _ continue else: - raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms())) - + raise NameError( + "chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()) + ) prev_pos = set() - lpos = None # of previous processed read pair + lpos = None # of previous processed read pair for read in bamHandle.fetch(chrom, regStart, regEnd): if read.is_unmapped: @@ -807,7 +857,7 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun # Motif filter if self.motifFilter: - test = [ checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in self.motifFilter ] + test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in self.motifFilter] if not any(test): continue # GC content filter @@ -825,7 +875,7 @@ def 
get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun bc = read.get_tag(self.cellTag) if self.groupTag: grp = read.get_tag(self.groupTag) - new_bc = '::'.join([grp, bc])# new barcode tag = sample+bc tag + new_bc = "::".join([grp, bc]) # new barcode tag = sample+bc tag if new_bc not in self.groupLabels: if self.verbose: print("Encountered group: {}, not in provided labels. skipping..".format(grp)) @@ -842,9 +892,8 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun # get rid of duplicate reads with same barcode, startpos and optionally, endpos/umi if self.duplicateFilter: tup = getDupFilterTuple(read, new_bc, self.duplicateFilter) - if lpos is not None and lpos == read.reference_start \ - and tup in prev_pos: - continue + if lpos is not None and lpos == read.reference_start and tup in prev_pos: + continue if lpos != read.reference_start: prev_pos.clear() lpos = read.reference_start @@ -877,7 +926,10 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun if fragmentEnd > reg[0] + len(coverages[new_bc]) * tileSize: fragmentEnd = reg[0] + len(coverages[new_bc]) * tileSize sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0) - eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins) + eIdx = vector_start + min( + np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype("int"), + nRegBins, + ) if last_eIdx is not None: sIdx = max(last_eIdx, sIdx) if sIdx >= eIdx: @@ -918,8 +970,17 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_fun if self.verbose: endTime = time.time() - print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % ( - multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1])) + print( + "%s, processing %s (%.1f per sec) reads @ %s:%s-%s" + % ( + multiprocessing.current_process().name, + c, + c / (endTime - start_time), + chrom, + reg[0], + reg[1], + ) + ) vector_start += nRegBins @@ -1084,7 +1145,7 @@ def get_fragment_from_read(self, read): # E.g for a cigar of 40M260N22M # get blocks return two elements for the first 40 matches # and the for the last 22 matches. - if self.defaultFragmentLength == 'read length': + if self.defaultFragmentLength == "read length": return read.get_blocks() else: @@ -1112,8 +1173,9 @@ def get_fragment_from_read(self, read): fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2) fragmentEnd = fragmentStart + read.infer_query_length(always=False) - assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \ - "end for read {}".format(read.query_name) + assert fragmentStart < fragmentEnd, "fragment start greater than fragment" "end for read {}".format( + read.query_name + ) return [(fragmentStart, fragmentEnd)] def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition): @@ -1173,6 +1235,7 @@ def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition): indexEnd = min(maxPosition, tileIndex + smoothTilesRight) return (indexStart, indexEnd) + def remove_row_of_zeros(matrix): r"remove rows containing all zeros or all nans" _mat = np.nan_to_num(matrix) @@ -1200,11 +1263,10 @@ def estimateSizeFactors(m): loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans) # DESeq2 ratio-based size factor sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0)) - return 1. 
/ sf + return 1.0 / sf class Tester(object): - def __init__(self): """ The distribution of reads between the two bam files is as follows. @@ -1226,20 +1288,19 @@ def __init__(self): self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" - self.chrom = '3R' + self.chrom = "3R" global debug debug = 0 def getRead(self, readType): - """ prepare arguments for test - """ + """prepare arguments for test""" bam = bamHandler.openBam(self.bamFile_PE) - if readType == 'paired-reverse': - read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0] - elif readType == 'single-forward': - read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0] - elif readType == 'single-reverse': - read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0] + if readType == "paired-reverse": + read = [x for x in bam.fetch("chr2", 5000081, 5000082)][0] + elif readType == "single-forward": + read = [x for x in bam.fetch("chr2", 5001491, 5001492)][0] + elif readType == "single-reverse": + read = [x for x in bam.fetch("chr2", 5001700, 5001701)][0] else: # by default a forward paired read is returned - read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0] + read = [x for x in bam.fetch("chr2", 5000027, 5000028)][0] return read diff --git a/sincei/RegionQuery.py b/sincei/RegionQuery.py index 8fd897a..9d45e1a 100644 --- a/sincei/RegionQuery.py +++ b/sincei/RegionQuery.py @@ -1,5 +1,6 @@ from itertools import compress + ## get overlap of GTF object (deeptoolsintervals) with anndata object (from sincei) ## output: dict (region->gene mapping) def get_gtf_adata_olaps(adata, gtf): @@ -28,20 +29,25 @@ def get_gtf_adata_olaps(adata, gtf): >>> olaps['Gm37381'] [('ENSMUSG00000064372', '+'), ('ENSMUSG00000064372', '-')] """ - var=adata.var - olaps=dict.fromkeys(var.index) + var = adata.var + olaps = dict.fromkeys(var.index) for i, key in enumerate(var.index): try: - chrom, start, end = var['chrom'][i], int(var['start'][i]), int(var['end'][i]) - ol=gtf.findOverlaps(chrom, start, end, includeStrand=True) + chrom, start, end = ( + var["chrom"][i], + int(var["start"][i]), + int(var["end"][i]), + ) + ol = gtf.findOverlaps(chrom, start, end, includeStrand=True) if ol: - genelist=[(x[2], x[5]) for x in ol] - olaps[key]=genelist + genelist = [(x[2], x[5]) for x in ol] + olaps[key] = genelist except ValueError: - olaps[key]=None + olaps[key] = None continue return olaps + ## Search for bins by gene name, return either the first bin (promoter) or all overlapping bins def get_bins_by_gene(dict, gene, firstBin=False): r""" @@ -72,24 +78,24 @@ def get_bins_by_gene(dict, gene, firstBin=False): >>> get_bins_by_gene(dict, 'gene1', firstBin=True) 'chr1_1' """ - klist=[] + klist = [] for k, v in dict.items(): if v: - vlist=[x[0] for x in v]# overlapping genes - slist=[x[1] for x in v]# overlapping gene strands - match=[x.lower()==gene.lower() for x in vlist] + vlist = [x[0] for x in v] # overlapping genes + slist = [x[1] for x in v] # overlapping gene strands + match = [x.lower() == gene.lower() for x in vlist] if any(match): klist.append(k) # get strand of the gene - strand=list(compress(slist, match))[0] + strand = list(compress(slist, match))[0] else: - strand=None + strand = None # if firstBin is asked, sort the bins by start pos and # return only the firstBin by strand if klist and firstBin: - spos = [x.split('_')[1] for x in klist] - if strand == '+': + spos = [x.split("_")[1] for x in klist] + if strand == "+": first_bin = spos.index(min(spos)) else: first_bin = 
spos.index(max(spos)) diff --git a/sincei/TopicModels.py b/sincei/TopicModels.py new file mode 100644 index 0000000..bdd2cfa --- /dev/null +++ b/sincei/TopicModels.py @@ -0,0 +1,130 @@ +# topic models +import numpy as np +import pandas as pd +import scanpy as sc +from gensim import corpora, matutils, models +import copy + +# Louvain clustering and UMAP +from networkx import convert_matrix +from sklearn.metrics import pairwise_distances +import leidenalg as la + +# import community +import umap +from scanpy._utils import get_igraph_from_adjacency +from scanpy.neighbors import ( + _compute_connectivities_umap, + _get_indices_distances_from_dense_matrix, +) + +### ------ Functions ------ + + +class TOPICMODEL: + r""" + Computes LSA or LDA for a given matrix and returns the cell-topic matrix + + Parameters + ---------- + mat : numpy array + Matrix of shape (cells, regions) + + cells : list + List of cells + + regions : list + List of regions + + nTopics : int + Number of topics + + smartCode : str + Smart code for tfidf + + Returns + ------- + corpus_lsi : gensim corpus + LSA corpus + + cell_topic : pandas dataframe + Cell-topic matrix + + corpus_tfidf : gensim corpus + TFIDF corpus + """ + + def __init__( + self, + mat, + cells, + regions, + n_topics, + smart_code="lfu", + n_passes=1, + n_workers=1, + ): + self.n_topics = n_topics + self.smart_code = smart_code + self.cells = cells + self.regions_dict = corpora.dictionary.Dictionary([regions]) + self.corpus = matutils.Sparse2Corpus(mat) + self.n_passes = n_passes + self.n_workers = n_workers + self.lsi_model = None + self.lda_model = None + self.cell_topic_dist = None + self.topic_region_dist = None + + def runLSA(self): + r"Computes LSA for a given matrix and returns the updated object" + + # LSA + tfidf = models.TfidfModel(self.corpus, id2word=self.regions_dict, normalize=True, smartirs=self.smart_code) + corpus_tfidf = tfidf[self.corpus] + self.lsi_model = models.LsiModel(corpus_tfidf, id2word=self.regions_dict, num_topics=self.n_topics) + self.cell_topic_dist = self.lsi_model[corpus_tfidf] + + # Compute Coherence Score + coherence_model_lsa = models.CoherenceModel( + model=self.lsi_model, corpus=self.corpus, dictionary=self.regions_dict, coherence="u_mass" + ) + coherence_lsa = coherence_model_lsa.get_coherence() + print("\nCoherence Score: ", coherence_lsa) + + def runLDA(self): + r"Computes LDA model for a given matrix and returns the updated object" + + self.lda_model = models.LdaMulticore( + corpus=self.corpus, num_topics=self.n_topics, passes=self.n_passes, workers=self.n_workers + ) + # get topic distributions for each document + self.cell_topic_dist = self.lda_model[self.corpus] + # get topic-word distributions + self.topic_region_dist = self.lda_model.get_topics() + + def get_cell_topic(self, pop_sparse_cells=False): + r"Returns cell-topic matrix from the updated object" + + cells = copy.deepcopy(self.cells) + ## make cell-topic df + li = [[tup[0] for tup in x] for x in self.cell_topic_dist] + li_val = [[tup[1] for tup in x] for x in self.cell_topic_dist] + + # if all documents don't have same set of topics, (optionally) remove them + if len(set([len(x) for x in li_val])) > 1: + bad_idx = sorted([i for i, v in enumerate(li_val) if len(v) != self.n_topics], reverse=True) + print("{} Cells were detected which don't contribute to all {} topics.".format(len(bad_idx), self.n_topics)) + if pop_sparse_cells: + print("Removing these cells from the analysis") + [li_val.pop(x) for x in bad_idx] + [li.pop(x) for x in bad_idx] + [cells.pop(x) 
for x in bad_idx] + else: + print("Not implemented! Need to fill these entries with zeros") + + li_val = np.stack(li_val) + cell_topic = pd.DataFrame(li_val, columns=li[0]) + cell_topic.index = cells + + return cell_topic diff --git a/sincei/Utilities.py b/sincei/Utilities.py index 68fde1e..7ff5257 100644 --- a/sincei/Utilities.py +++ b/sincei/Utilities.py @@ -3,67 +3,69 @@ import numpy as np import sys + def checkBAMtag(bam, name, tag): bctag = [read.has_tag(tag) for read in bam.head(1000)] if not any(bctag): sys.stderr.write("WARNING: Input file {} seems to lack the tag {}. Output might be empty. \n".format(name, tag)) return None + def checkMotifs(read, chrom, genome, readMotif, refMotif): - """ - Check whether a given motif is present in the read and the corresponding reference genome. - For example, in MNAse (scChIC-seq) data, we expect the reads to have an 'A' at the 5'-end, - while the genome has a 'TA' over hang (where the 'A' aligns with 'A' in the forward read), - like this below. - - Forwards aligned read: read has 'A', upstream has T - R1 ........A-------> - ----------TA------------\ Ref (+) - - Rev aligned read: read has 'T', downstream has A - - <-------T....... R1 - --------TA------------\ Ref (+) - - This function can look for any arbitrary motif in read and corresponding genome, but in the - same orientation as described above. - - :return: bool - - >>> import pysam - >>> import os - >>> from scDeepTools.scReadCounter import CountReadsPerBin as cr - >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" - >>> bam = pysam.AlignmentFile("{}/test_TA_filtering.bam".format(root)) - >>> iter = bam.fetch() - >>> read = next(iter) - >>> cr.checkMotifs(read, 'A', 'TA') # for valid scChIC read - True - >>> read = next(iter) - >>> cr.checkMotifs(read, 'A', 'TA') # for invalid scChIC read - False - """ - # get read and ref motif pos - read_motif = read.get_forward_sequence()[0:len(readMotif)] - ref_motifLen = len(refMotif) - 1 - - if(read.is_reverse): - # for reverse reads ref motif begins at read-end and ends downstream - endpos = read.reference_end + ref_motifLen - if endpos > genome.chroms()[chrom]: - endpos = read.reference_end #fail without error - ref_motif = genome.sequence(chrom, read.reference_end - 1, endpos) - else: - # for forward reads ref motif begins upstream and ends at read-start - startpos = read.reference_start - ref_motifLen - if startpos < 0: - startpos = read.reference_start #fail without error - ref_motif = genome.sequence(chrom, startpos, read.reference_start + 1) + """ + Check whether a given motif is present in the read and the corresponding reference genome. + For example, in MNAse (scChIC-seq) data, we expect the reads to have an 'A' at the 5'-end, + while the genome has a 'TA' over hang (where the 'A' aligns with 'A' in the forward read), + like this below. - if read_motif == readMotif and ref_motif == refMotif: - return True - else: - return False + Forwards aligned read: read has 'A', upstream has T + R1 ........A-------> + ----------TA------------\ Ref (+) + + Rev aligned read: read has 'T', downstream has A + + <-------T....... R1 + --------TA------------\ Ref (+) + + This function can look for any arbitrary motif in read and corresponding genome, but in the + same orientation as described above. 
+ + :return: bool + + >>> import pysam + >>> import os + >>> from scDeepTools.scReadCounter import CountReadsPerBin as cr + >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" + >>> bam = pysam.AlignmentFile("{}/test_TA_filtering.bam".format(root)) + >>> iter = bam.fetch() + >>> read = next(iter) + >>> cr.checkMotifs(read, 'A', 'TA') # for valid scChIC read + True + >>> read = next(iter) + >>> cr.checkMotifs(read, 'A', 'TA') # for invalid scChIC read + False + """ + # get read and ref motif pos + read_motif = read.get_forward_sequence()[0 : len(readMotif)] + ref_motifLen = len(refMotif) - 1 + + if read.is_reverse: + # for reverse reads ref motif begins at read-end and ends downstream + endpos = read.reference_end + ref_motifLen + if endpos > genome.chroms()[chrom]: + endpos = read.reference_end # fail without error + ref_motif = genome.sequence(chrom, read.reference_end - 1, endpos) + else: + # for forward reads ref motif begins upstream and ends at read-start + startpos = read.reference_start - ref_motifLen + if startpos < 0: + startpos = read.reference_start # fail without error + ref_motif = genome.sequence(chrom, startpos, read.reference_start + 1) + + if read_motif == readMotif and ref_motif == refMotif: + return True + else: + return False def checkGCcontent(read, lowFilter, highFilter, returnGC=False): @@ -98,8 +100,8 @@ def checkGCcontent(read, lowFilter, highFilter, returnGC=False): """ seq = read.get_forward_sequence() total_bases = len(seq) - gc_bases = len([x for x in seq if x == 'C' or x == 'G']) - gc_frac = float(gc_bases)/total_bases + gc_bases = len([x for x in seq if x == "C" or x == "G"]) + gc_frac = float(gc_bases) / total_bases if returnGC: return gc_frac else: @@ -120,12 +122,13 @@ def checkAlignedFraction(read, lowFilter): cig = read.cigartuples tot = read.infer_read_length() matchPos = [i[0] == 0 for i in cig] - matchSum = sum([i[1] for i in list(compress(cig, matchPos)) ]) - if matchSum/tot >= lowFilter: + matchSum = sum([i[1] for i in list(compress(cig, matchPos))]) + if matchSum / tot >= lowFilter: return True else: return False + def colorPicker(name): r""" This function returns a list of colors for plotting. 
@@ -150,15 +153,62 @@ def colorPicker(name): ['#0072B2', '#009E73', '#D55E00', '#CC79A7', '#F0E442', '#56B4E9', '#E69F00', '#000000'] """ - colors = {'twentyfive': ["#dd4e34", "#78d545", "#b047dc", "#d6d94d", "#5d48c6", "#74d88b", "#c944af", - "#8a9739", "#542c75", "#d3953c", "#607dc7", "#487f46", "#d04774", "#7bd2c8", - "#6b2737", "#cfcb9a", "#332a42", "#d7928a", "#343d25", "#cc8ad3", "#7b6a43", - "#b5bad9", "#99472b", "#4e8290", "#936987"], - 'colorblind': ["#ffaec0", "#cd7600", "#893558", "#195f37", "#da71f9", "#a2d391", "#881e9b", - "#b9d05f", "#524d7e", "#f2bf4b", "#01d6c2", "#f54040", "#0097fb", "#756400", - "#4b44aa", "#f0bd79", "#a1008b", "#6d4d02", "#ff9afc", "#01df8f", "#e2b8ed", - "#6e9d00", "#f4177b", "#01b65f", "#9b2532"] - } + colors = { + "twentyfive": [ + "#dd4e34", + "#78d545", + "#b047dc", + "#d6d94d", + "#5d48c6", + "#74d88b", + "#c944af", + "#8a9739", + "#542c75", + "#d3953c", + "#607dc7", + "#487f46", + "#d04774", + "#7bd2c8", + "#6b2737", + "#cfcb9a", + "#332a42", + "#d7928a", + "#343d25", + "#cc8ad3", + "#7b6a43", + "#b5bad9", + "#99472b", + "#4e8290", + "#936987", + ], + "colorblind": [ + "#ffaec0", + "#cd7600", + "#893558", + "#195f37", + "#da71f9", + "#a2d391", + "#881e9b", + "#b9d05f", + "#524d7e", + "#f2bf4b", + "#01d6c2", + "#f54040", + "#0097fb", + "#756400", + "#4b44aa", + "#f0bd79", + "#a1008b", + "#6d4d02", + "#ff9afc", + "#01df8f", + "#e2b8ed", + "#6e9d00", + "#f4177b", + "#01b65f", + "#9b2532", + ], + } return colors[name] @@ -198,7 +248,7 @@ def getDupFilterTuple(read, bc, filter): """ tLenDup = getTLen(read, notAbs=True) - filt = filter.split('_') + filt = filter.split("_") ## get read (or fragment) start/end # get fragment start and end for that read if tLenDup >= 0: @@ -209,15 +259,15 @@ def getDupFilterTuple(read, bc, filter): e = s - tLenDup if read.reference_id != read.next_reference_id: e = read.pnext - if 'end' not in filt: + if "end" not in filt: # use only read (or fragment) start if read.is_reverse: s = None else: e = None ## get UMI if asked - if 'umi' in filt: - umi = read.get_tag('RX') + if "umi" in filt: + umi = read.get_tag("RX") else: umi = None tup = (bc, umi, s, e, read.next_reference_id, read.is_reverse) @@ -256,16 +306,16 @@ def gini(i, X): # from: # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm # All values are treated equally, arrays must be 1d: - array=X[i,:].A.flatten()# get all bins from i'th cell - array=array[array.nonzero()] + array = X[i, :].A.flatten() # get all bins from i'th cell + array = array[array.nonzero()] if array.shape[0] <= 1: return np.nan else: array = np.sort(array) # Index per array element: - index = np.arange(1,array.shape[0]+1) + index = np.arange(1, array.shape[0] + 1) # Number of array elements: n = array.shape[0] # Gini coefficient: - return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) + return (np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)) diff --git a/sincei/WriteBedGraph.py b/sincei/WriteBedGraph.py index a97545f..7f09b45 100644 --- a/sincei/WriteBedGraph.py +++ b/sincei/WriteBedGraph.py @@ -14,7 +14,7 @@ from deeptools.writeBedGraph import bedGraphToBigWig, getGenomeChunkLength # own modules -import ReadCounter as cr +from sincei import ReadCounter as cr debug = 0 @@ -24,7 +24,8 @@ def scaleCoverage(tile_coverage, args): Return coverage per cluster as sum of cells. 
tileCoverage should be an list with only one element """ - return args['scaleFactor'] * tile_coverage + return args["scaleFactor"] * tile_coverage + def writeBedGraph_wrapper(args): r""" @@ -38,9 +39,7 @@ def writeBedGraph_wrapper(args): return WriteBedGraph.writeBedGraph_worker(*args) - class WriteBedGraph(cr.CountReadsPerBin): - r"""Reads bam files coverages and writes a bedgraph or bigwig file Extends the CountReadsPerBin object such that the coverage @@ -102,7 +101,16 @@ class WriteBedGraph(cr.CountReadsPerBin): """ - def run(self, func_to_call, func_args, out_file_prefix, blackListFileName=None, format="bedgraph", smoothLength=0, normUsing=None): + def run( + self, + func_to_call, + func_args, + out_file_prefix, + blackListFileName=None, + format="bedgraph", + smoothLength=0, + normUsing=None, + ): r""" Given a list of bamfiles, a function and a function arguments, this method writes a bedgraph file (or bigwig) file @@ -123,7 +131,7 @@ def run(self, func_to_call, func_args, out_file_prefix, blackListFileName=None, """ - #self.__dict__["smoothLength"] = smoothLength + # self.__dict__["smoothLength"] = smoothLength getStats = len(self.mappedList) < len(self.bamFilesList) bam_handles = [] for x in self.bamFilesList: @@ -145,20 +153,28 @@ def run(self, func_to_call, func_args, out_file_prefix, blackListFileName=None, self.region += ":{}".format(self.binLength) for x in list(self.__dict__.keys()): - if x in ["mappedList", "statsList", "barcodes", "clusterInfo", "groupLabels"]: + if x in [ + "mappedList", + "statsList", + "barcodes", + "clusterInfo", + "groupLabels", + ]: continue sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x))) # below we get the same ouput as in deeptools, except that the 3rd list # element contains multiple tmp file names, one tmp file per cluster - res = mapReduce.mapReduce([func_to_call, func_args], - writeBedGraph_wrapper, - chrom_names_and_size, - self_=self, - genomeChunkLength=genome_chunk_length, - region=self.region, - blackListFileName=blackListFileName, - numberOfProcessors=self.numberOfProcessors) + res = mapReduce.mapReduce( + [func_to_call, func_args], + writeBedGraph_wrapper, + chrom_names_and_size, + self_=self, + genomeChunkLength=genome_chunk_length, + region=self.region, + blackListFileName=blackListFileName, + numberOfProcessors=self.numberOfProcessors, + ) # Determine the sorted order of the temp files chrom_order = dict() @@ -172,15 +188,15 @@ def run(self, func_to_call, func_args, out_file_prefix, blackListFileName=None, clusters = cluster_info.cluster.unique().tolist() prefix = os.path.splitext(os.path.basename(out_file_prefix))[0] for cl in clusters: - print('Writing output for group: {}'.format(str(cl))) + print("Writing output for group: {}".format(str(cl))) if pd.isna(cl): continue # concatenate the coverages tmp_out = "/tmp/{}_{}.tmp".format(prefix, cl) - out_file = open(tmp_out, 'wb') + out_file = open(tmp_out, "wb") for r in res: if r[3][cl]: - _foo = open(r[3][cl], 'rb') + _foo = open(r[3][cl], "rb") shutil.copyfileobj(_foo, out_file) _foo.close() os.remove(r[3][cl]) @@ -189,34 +205,39 @@ def run(self, func_to_call, func_args, out_file_prefix, blackListFileName=None, ## read back and normalize cl_idx = cluster_info.index[pd.Series(cluster_info.cluster == cl)].tolist() nCells = float(len(cl_idx)) - out_file = pd.read_csv(tmp_out, sep = "\t", index_col=None, header = None) + out_file = pd.read_csv(tmp_out, sep="\t", index_col=None, header=None) # CPM norm - if normUsing=='CPM': + if normUsing == "CPM": mil_reads_mapped = 
float(np.sum(out_file[3])) / 1e6 if mil_reads_mapped < 0.00001: - sys.stderr.write("\n No or too few reads counted for group: " + cl + - ". If this persists for all groups, please double-check that your barcodes" - " match between the groupInfo file and the BAM files and you specified the correct " - " --cellTag \n") + sys.stderr.write( + "\n No or too few reads counted for group: " + + cl + + ". If this persists for all groups, please double-check that your barcodes" + " match between the groupInfo file and the BAM files and you specified the correct " + " --cellTag \n" + ) continue else: # per mil counts out_file[3] *= 1.0 / (mil_reads_mapped) - elif normUsing=='Mean': + elif normUsing == "Mean": # divided by nCells - out_file[3] *= 1.0/(nCells) + out_file[3] *= 1.0 / (nCells) # out bg_out = "{}_{}.bedgraph".format(out_file_prefix, cl) - out_file.to_csv(bg_out, sep ="\t", index = False, header = False) + out_file.to_csv(bg_out, sep="\t", index=False, header=False) os.remove(tmp_out) - if format == 'bigwig': - bedGraphToBigWig(chrom_names_and_size, [bg_out], - "{}_{}.bw".format(out_file_prefix, cl)) + if format == "bigwig": + bedGraphToBigWig( + chrom_names_and_size, + [bg_out], + "{}_{}.bw".format(out_file_prefix, cl), + ) def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args, bed_regions_list=None): - r"""Writes a bedgraph based on the read coverage per group of cells, indicated by cluster_info data frame. The given func is called to compute the desired bedgraph value @@ -267,8 +288,7 @@ def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args, bed_r >>> os.remove(tempFile[3]) """ if start > end: - raise NameError("start position ({0}) bigger " - "than end position ({1})".format(start, end)) + raise NameError("start position ({0}) bigger " "than end position ({1})".format(start, end)) coverage, _, r = self.count_reads_in_region(chrom, start, end) ## get groups (clusters) @@ -279,11 +299,11 @@ def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args, bed_r for cl in clusters: if pd.isna(cl): continue - _file = open(utilities.getTempFileName(suffix='.bg'), 'w') + _file = open(utilities.getTempFileName(suffix=".bg"), "w") previous_value = None line_string = "{}\t{}\t{}\t{:g}\n" cl_idx = cluster_info.index[pd.Series(cluster_info.cluster == cl)].tolist() -# nCells = len(cl_idx) + # nCells = len(cl_idx) for tileIndex in range(coverage.shape[0]): ## smoothing disabled for now @@ -303,15 +323,14 @@ def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args, bed_r elif previous_value != value: if not np.isnan(previous_value): - _file.write( - line_string.format(chrom, writeStart, writeEnd, previous_value)) + _file.write(line_string.format(chrom, writeStart, writeEnd, previous_value)) previous_value = value writeStart = writeEnd writeEnd = min(writeStart + self.binLength, end) # write remaining value if not a nan if previous_value is not None and writeStart != end and not np.isnan(previous_value): - _file.write(line_string.format(chrom, writeStart,end, previous_value)) + _file.write(line_string.format(chrom, writeStart, end, previous_value)) tempfilenames[cl] = _file.name _file.close() diff --git a/sincei/__init__.py b/sincei/__init__.py new file mode 100644 index 0000000..0dae0b3 --- /dev/null +++ b/sincei/__init__.py @@ -0,0 +1,4 @@ +import logging + +logging.basicConfig(level=logging.INFO) +logging.getLogger("cooler").setLevel(logging.WARNING) diff --git a/sincei/_deprecated.py b/sincei/_deprecated.py index 1a331f7..fbbfc37 
100644 --- a/sincei/_deprecated.py +++ b/sincei/_deprecated.py @@ -31,6 +31,7 @@ def read_mtx(prefix): return mtx, rownames, colnames + # from mtx def preprocess_mtx(sparse_mtx, rownames, colnames, min_cell_sum, min_region_sum): r"""Preprocesses a sparse matrix for use with scanpy @@ -69,6 +70,7 @@ def preprocess_mtx(sparse_mtx, rownames, colnames, min_cell_sum, min_region_sum) """ from itertools import compress + ## binarize nonzero_mask = np.array(sparse_mtx[sparse_mtx.nonzero()] > 1)[0] rows = sparse_mtx.nonzero()[0][nonzero_mask] @@ -76,8 +78,8 @@ def preprocess_mtx(sparse_mtx, rownames, colnames, min_cell_sum, min_region_sum) sparse_mtx[rows, cols] = 1 ## filter low counts - colmask = np.array(np.sum(sparse_mtx, axis = 0) >= min_cell_sum)[0] - rowmask = np.array(np.sum(sparse_mtx, axis = 1) >= min_region_sum) + colmask = np.array(np.sum(sparse_mtx, axis=0) >= min_cell_sum)[0] + rowmask = np.array(np.sum(sparse_mtx, axis=1) >= min_region_sum) rowmask = np.array([x[0] for x in rowmask]) sparse_mtx = sparse_mtx[rowmask, :] sparse_mtx = sparse_mtx[:, colmask] @@ -85,14 +87,23 @@ def preprocess_mtx(sparse_mtx, rownames, colnames, min_cell_sum, min_region_sum) ## create anndata row_subset = list(compress(rownames, rowmask)) col_subset = list(compress(colnames, colmask)) - adata = anndata.AnnData(sparse_mtx.transpose(), - obs=pd.DataFrame(col_subset), - var=pd.DataFrame(row_subset)) + adata = anndata.AnnData( + sparse_mtx.transpose(), + obs=pd.DataFrame(col_subset), + var=pd.DataFrame(row_subset), + ) return adata -def cluster_LSA(cell_topic, modularityAlg = 'leiden', distance_metric='cosine', nk=30, resolution=1.0, connectivity_graph=True): +def cluster_LSA( + cell_topic, + modularityAlg="leiden", + distance_metric="cosine", + nk=30, + resolution=1.0, + connectivity_graph=True, +): r"""Cluster cells using the output of LSA_gensim Parameters @@ -127,43 +138,50 @@ def cluster_LSA(cell_topic, modularityAlg = 'leiden', distance_metric='cosine', # cluster on cel-topic dist _distances = pairwise_distances(cell_topic.iloc[:, 1:], metric=distance_metric) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix(_distances, nk) - distances, connectivities = _compute_connectivities_umap(knn_indices, - knn_distances, - _distances.shape[0], nk) + distances, connectivities = _compute_connectivities_umap(knn_indices, knn_distances, _distances.shape[0], nk) - - if modularityAlg == 'leiden': + if modularityAlg == "leiden": if connectivity_graph: G = get_igraph_from_adjacency(connectivities, directed=True) else: G = get_igraph_from_adjacency(distances, directed=True) - partition = la.find_partition(G, - la.RBConfigurationVertexPartition, - weights='weight', - seed=42, - resolution_parameter=resolution) - cell_topic['cluster'] = partition.membership + partition = la.find_partition( + G, + la.RBConfigurationVertexPartition, + weights="weight", + seed=42, + resolution_parameter=resolution, + ) + cell_topic["cluster"] = partition.membership else: if connectivity_graph: G = convert_matrix.from_numpy_array(connectivities) else: G = convert_matrix.from_numpy_array(distances) partition = community.best_partition(G, resolution=resolution, random_state=42) - cell_topic['cluster'] = partition.values() + cell_topic["cluster"] = partition.values() # umap on cell-topic dist - um = umap.UMAP(spread = 5, min_dist=0.1, n_neighbors=nk, metric=distance_metric, init='random', random_state=42) - umfit = um.fit(cell_topic.iloc[:, 0:(len(cell_topic.columns) - 1)]) + um = umap.UMAP( + spread=5, + min_dist=0.1, + 
n_neighbors=nk, + metric=distance_metric, + init="random", + random_state=42, + ) + umfit = um.fit(cell_topic.iloc[:, 0 : (len(cell_topic.columns) - 1)]) umap_df = pd.DataFrame(umfit.embedding_) - umap_df.columns = ['UMAP1', 'UMAP2'] - umap_df['cluster'] = list(cell_topic.cluster) + umap_df.columns = ["UMAP1", "UMAP2"] + umap_df["cluster"] = list(cell_topic.cluster) umap_df.index = cell_topic.index return umap_df, G + ## LSA using scipy (older version) -#def runLSA(sparse_mtx, nPCs, scaleFactor): - #tf = (sparse_mtx.transpose() / sparse_mtx.sum(axis=0)).transpose() +# def runLSA(sparse_mtx, nPCs, scaleFactor): +# tf = (sparse_mtx.transpose() / sparse_mtx.sum(axis=0)).transpose() # tf = sparse_mtx / sparse_mtx.sum(axis=0) # tf = np.log1p(tf * scaleFactor) @@ -175,7 +193,7 @@ def cluster_LSA(cell_topic, modularityAlg = 'leiden', distance_metric='cosine', # return pca, tfidf ## for anndata -#def lsa_anndata(adata, n_pcs, scale_factor): +# def lsa_anndata(adata, n_pcs, scale_factor): # from scipy.sparse import issparse, coo_matrix, csr_matrix # mtx = sparse.csr_matrix(adata.X) # lsa_out, tfidf = runLSA(mtx.transpose(), n_pcs, scale_factor) @@ -188,18 +206,18 @@ def cluster_LSA(cell_topic, modularityAlg = 'leiden', distance_metric='cosine', # return adata -#def UMAP_clustering(adata): +# def UMAP_clustering(adata): - # make umap on PCA +# make umap on PCA # lsa_out = adata.obsm['X_pca'].transpose()[1:, :] # reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, spread=1.0, metric='euclidean', init = 'random') # embeddings = reducer.fit_transform(lsa_out.transpose()) # adata.obsm['X_umap'] = embeddings - # louvain (from scanpy) +# louvain (from scanpy) # scp.pp.neighbors(adata) # scp.tl.louvain(adata) # cluster_id = [int(x) for x in adata.obs['louvain'].to_list()] - # return +# return # return adata diff --git a/sincei/_version.py b/sincei/_version.py index 80db2c4..6a35e85 100644 --- a/sincei/_version.py +++ b/sincei/_version.py @@ -1 +1 @@ -__version__ = 0.2 +__version__ = "0.3" diff --git a/sincei/multimodalClustering.py b/sincei/multimodalClustering.py index f885bc8..3b1aba1 100644 --- a/sincei/multimodalClustering.py +++ b/sincei/multimodalClustering.py @@ -1,16 +1,22 @@ import umap import pandas as pd import scanpy as sc -from scanpy.neighbors import _compute_connectivities_umap, _get_indices_distances_from_dense_matrix +from scanpy.neighbors import ( + _compute_connectivities_umap, + _get_indices_distances_from_dense_matrix, +) from scanpy._utils import get_igraph_from_adjacency import igraph as ig import leidenalg as la from scipy import sparse import sys -from scClusterCells import LSA_gensim, cluster_LSA -#umap.__version__ : should be >= 0.5.1 +# own modules +from sincei.TopicModels import TOPICMODEL +from sincei._deprecated import cluster_LSA + +# umap.__version__ : should be >= 0.5.1 ## literature on multi-graph clustering """ @@ -26,8 +32,11 @@ https://web.media.mit.edu/~xdong/paper/tsp14.pdf """ -def multiModal_clustering(mode1_adata, mode2_adata, column_key='barcode_nla', nK=20): - r""" Performs multi-graph clustering on matched keys(barcodes) bw two anndata objects. + +def multiModal_clustering(mode1_adata, mode2_adata, column_key="barcode_nla", nK=20): + r""" + Performs multi-graph clustering on matched keys(barcodes) bw two anndata objects. 
+ Parameters     ----------     mode1_adata : AnnData @@ -50,51 +59,59 @@ def multiModal_clustering(mode1_adata, mode2_adata, column_key='barcode_nla', nK     """      # subset RNA anndata -    mode1_adata=mode1_adata[[i for i,v in enumerate(mode1_adata.obs[column_key]) if v in mode2_adata.obs.index]] +    mode1_adata = mode1_adata[[i for i, v in enumerate(mode1_adata.obs[column_key]) if v in mode2_adata.obs.index]]     # re-compute distances/clusters/umap     sc.pp.neighbors(mode1_adata, n_neighbors=nK)     sc.tl.louvain(mode1_adata) -    sc.tl.umap(mode1_adata, min_dist=0.5, spread=5, init_pos='random', random_state=42) +    sc.tl.umap(mode1_adata, min_dist=0.5, spread=5, init_pos="random", random_state=42)     # get graph -    G_rna = get_igraph_from_adjacency(mode1_adata.obsp['connectivities'], directed=True) +    G_rna = get_igraph_from_adjacency(mode1_adata.obsp["connectivities"], directed=True)      # subset chic anndata -    #mode2_adata=mode2_adata[[i for i,v in enumerate(mode2_adata.obs_names.to_list()) if v in set(mode1_adata.obs[column_key])]] -    mode2_adata=mode2_adata[mode1_adata.obs[column_key].tolist()] +    # mode2_adata=mode2_adata[[i for i,v in enumerate(mode2_adata.obs_names.to_list()) if v in set(mode1_adata.obs[column_key])]] +    mode2_adata = mode2_adata[mode1_adata.obs[column_key].tolist()]      # re-do LSA     mtx = sparse.csr_matrix(mode2_adata.X.transpose()) -    corpus_lsi, cell_topic = LSA_gensim(mtx, list(mode2_adata.obs.index), -                                        list(mode2_adata.var.index), nTopics = 20, smartCode='lfu') +    dat = TOPICMODEL( +        mtx, +        list(mode2_adata.obs.index), +        list(mode2_adata.var.index), +        n_topics=20, +        smart_code="lfu", +    ) +    dat.runLSA() +    cell_topic = dat.get_cell_topic() +      # get graph -    chic_umap, G_chic = cluster_LSA(cell_topic, modularityAlg='leiden', resolution=1, nk=nK) -    mode2_adata.obsm['X_pca'] = cell_topic -    mode2_adata.obsm['X_umap'] = chic_umap +    chic_umap, G_chic = cluster_LSA(cell_topic, modularityAlg="leiden", resolution=1, nk=nK) +    mode2_adata.obsm["X_pca"] = cell_topic +    mode2_adata.obsm["X_umap"] = chic_umap      # leiden multi-layer clustering -    optimiser=la.Optimiser() +    optimiser = la.Optimiser()     part_rna = la.ModularityVertexPartition(G_rna)     part_chic = la.ModularityVertexPartition(G_chic)     # -    #part_rna = la.CPMVertexPartition(G_rna, resolution=res_layer1) -    #part_chic = la.CPMVertexPartition(G_chic, resolution=res_layer2) -    optimiser.optimise_partition_multiplex([part_rna, part_chic], -                                           layer_weights=[2, 1], -                                           n_iterations=-1) +    # part_rna = la.CPMVertexPartition(G_rna, resolution=res_layer1) +    # part_chic = la.CPMVertexPartition(G_chic, resolution=res_layer2) +    optimiser.optimise_partition_multiplex([part_rna, part_chic], layer_weights=[2, 1], n_iterations=-1)     print("Detected clusters: ", set(part_chic.membership)) -    chic_umap['cluster_multi'] = part_chic.membership +    chic_umap["cluster_multi"] = part_chic.membership      # merge RNA and ChIC UMAPs -    rna_umap = pd.DataFrame(mode1_adata.obsm['X_umap'], columns=['RNA_UMAP1', 'RNA_UMAP2']) +    rna_umap = pd.DataFrame(mode1_adata.obsm["X_umap"], columns=["RNA_UMAP1", "RNA_UMAP2"])     rna_umap.index = mode1_adata.obs.index -    rna_umap['cluster_mode1'] = mode1_adata.obs.louvain +    rna_umap["cluster_mode1"] = mode1_adata.obs.louvain     rna_umap[column_key] = mode1_adata.obs[column_key] -    multi_umap=rna_umap.merge(chic_umap, left_index=False, right_index=True, left_on=column_key) -    multi_umap['cluster_mode1']=[int(x) for x in multi_umap['cluster_mode1'].to_list()] +    multi_umap = rna_umap.merge(chic_umap, left_index=False, right_index=True, left_on=column_key) +    multi_umap["cluster_mode1"] = [int(x)
for x in multi_umap["cluster_mode1"].to_list()] return multi_umap, mode1_adata, mode2_adata + ## run aligned UMAPs between two dfs with PCs -def umap_aligned(pca_mode1, pca_mode2, nK=15, distance_metric='eucledian'): - r"""Aligns two UMAP embeddings using the UMAP AlignedUMAP class +def umap_aligned(pca_mode1, pca_mode2, nK=15, distance_metric="eucledian"): + r""" + Aligns two UMAP embeddings using the UMAP AlignedUMAP class Parameters ---------- @@ -126,27 +143,34 @@ def umap_aligned(pca_mode1, pca_mode2, nK=15, distance_metric='eucledian'): 1 0. """ - pca_mode1=pca_mode1.loc[pca_mode2.index] - keys=[x for x in range(len(pca_mode1.index))] - values=[] + pca_mode1 = pca_mode1.loc[pca_mode2.index] + keys = [x for x in range(len(pca_mode1.index))] + values = [] for x in pca_mode1.index: - values.append([i for i,v in enumerate(pca_mode2.index) if v==x]) + values.append([i for i, v in enumerate(pca_mode2.index) if v == x]) values = [x[0] for x in values] - relation_dict = {i:v for i,v in zip(keys, values)} + relation_dict = {i: v for i, v in zip(keys, values)} chic_idx = pca_mode2.index rna_idx = pca_mode1.index - pca_mode2=pca_mode2.reset_index().drop('index', axis=1) - pca_mode1=pca_mode1.reset_index().drop('index', axis=1) + pca_mode2 = pca_mode2.reset_index().drop("index", axis=1) + pca_mode1 = pca_mode1.reset_index().drop("index", axis=1) # UMAP - UA = umap.AlignedUMAP(spread = 10, min_dist=0.01, n_neighbors=nK, metric=distance_metric, init='random', random_state=42) + UA = umap.AlignedUMAP( + spread=10, + min_dist=0.01, + n_neighbors=nK, + metric=distance_metric, + init="random", + random_state=42, + ) aligned_umap = UA.fit([pca_mode1, pca_mode2], relations=[relation_dict]) mode1_umap_aligned = pd.DataFrame(aligned_umap.embeddings_[0]) mode2_umap_aligned = pd.DataFrame(aligned_umap.embeddings_[1]) - mode2_umap_aligned.index=chic_idx - mode1_umap_aligned.index=rna_idx - mode2_umap_aligned.columns=['aligned_UMAP1', 'aligned_UMAP2'] - mode1_umap_aligned.columns=['aligned_UMAP1', 'aligned_UMAP2'] + mode2_umap_aligned.index = chic_idx + mode1_umap_aligned.index = rna_idx + mode2_umap_aligned.columns = ["aligned_UMAP1", "aligned_UMAP2"] + mode1_umap_aligned.columns = ["aligned_UMAP1", "aligned_UMAP2"] return mode1_umap_aligned, mode2_umap_aligned diff --git a/bin/alignmentSieve.py b/sincei/scBAMops.py similarity index 71% rename from bin/alignmentSieve.py rename to sincei/scBAMops.py index fb0b68a..cd617cd 100755 --- a/bin/alignmentSieve.py +++ b/sincei/scBAMops.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- + import argparse import pysam import os @@ -10,63 +12,87 @@ from deeptools.mapReduce import mapReduce from deeptools.utilities import getTLen, smartLabels, getTempFileName import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) + +warnings.simplefilter(action="ignore", category=FutureWarning) # own functions -scriptdir = os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -import ParserCommon -from Utilities import checkMotifs, checkGCcontent, getDupFilterTuple -from _version import __version__ +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) + +from sincei import ParserCommon +from sincei.Utilities import checkMotifs, checkGCcontent, getDupFilterTuple +from sincei._version import __version__ ## UPDATE: add group tag to BAM file based on a 2-columns mapping file (barcode -> group) + def parseArguments(): internalParser = get_args() - ioParser = 
ParserCommon.inputOutputOptions(opts=['bamfile', 'groupInfo', 'outFile']) - bamParser = ParserCommon.bamOptions(suppress_args=['binSize', 'distanceBetweenBins']) + ioParser = ParserCommon.inputOutputOptions(opts=["bamfile", "groupInfo", "outFile"]) + bamParser = ParserCommon.bamOptions(suppress_args=["binSize", "distanceBetweenBins"]) filterParser = ParserCommon.filterOptions() - readParser = ParserCommon.readOptions(suppress_args=['extendReads', 'centerReads']) + readParser = ParserCommon.readOptions(suppress_args=["extendReads", "centerReads"]) otherParser = ParserCommon.otherOptions() parser = argparse.ArgumentParser( - parents=[ioParser, internalParser, bamParser, filterParser, readParser, otherParser], + parents=[ + ioParser, + internalParser, + bamParser, + filterParser, + readParser, + otherParser, + ], formatter_class=argparse.RawDescriptionHelpFormatter, description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. It can optionally output to BEDPE format.", - usage='Example usage: alignmentSieve.py -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt', add_help=False) + usage="Example usage: scBAMops -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt", + add_help=False, + ) return parser def get_args(): parser = argparse.ArgumentParser(add_help=False) - mod = parser.add_argument_group('Read modifiction arguments') - mod.add_argument('--shift', - nargs='+', - type=int, - help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.') - - mod.add_argument('--ATACshift', - action='store_true', - help='Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.') - - output = parser.add_argument_group('Optional output arguments') - output.add_argument('--BED', - action='store_true', - help='Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.') - - output.add_argument('--filterMetrics', - metavar="FILE.log", - help="The number of entries in total and filtered are saved to this file") - - output.add_argument('--filteredOutReads', - metavar="filtered.bam", - help="If desired, all reads NOT passing the filtering criteria can be written to this file.") - - output.add_argument('--outTagName', - metavar="STR", - type=str, - help="In case where you want to group the BAM files based on user-defined --groupInfo, specify the output BAM tag which would contain the group name.", - default=None) + mod = parser.add_argument_group("Read modifiction arguments") + mod.add_argument( + "--shift", + nargs="+", + type=int, + help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). 
A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.', + ) + + mod.add_argument( + "--ATACshift", + action="store_true", + help="Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.", + ) + + output = parser.add_argument_group("Optional output arguments") + output.add_argument( + "--BED", + action="store_true", + help="Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.", + ) + + output.add_argument( + "--filterMetrics", + metavar="FILE.log", + help="The number of entries in total and filtered are saved to this file", + ) + + output.add_argument( + "--filteredOutReads", + metavar="filtered.bam", + help="If desired, all reads NOT passing the filtering criteria can be written to this file.", + ) + + output.add_argument( + "--outTagName", + metavar="STR", + type=str, + help="In case where you want to group the BAM files based on user-defined --groupInfo, specify the output BAM tag which would contain the group name.", + default=None, + ) return parser @@ -135,12 +161,12 @@ def filterWorker(arglist): if chrom not in twoBitGenome.chroms().keys(): raise NameError("chromosome {} not found in 2bit file".format(chrom)) - mode = 'wbu' - oname = getTempFileName(suffix='.bam') + mode = "wbu" + oname = getTempFileName(suffix=".bam") ofh = pysam.AlignmentFile(oname, mode=mode, template=fh) if args.filteredOutReads: - onameFiltered = getTempFileName(suffix='.bam') + onameFiltered = getTempFileName(suffix=".bam") ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh) else: onameFiltered = None @@ -152,7 +178,6 @@ def filterWorker(arglist): nFiltered = 0 total = 0 for read in fh.fetch(chrom, start, end): - if read.pos < start: # ensure that we never double count (in case distanceBetweenBins == 0) continue @@ -162,9 +187,11 @@ def filterWorker(arglist): if isinstance(args.groupInfo, pd.DataFrame): smpl = read.get_tag(args.groupTag) try: - tag_to_add = args.groupInfo.loc[(args.groupInfo['sample'] == smpl) & \ - (args.groupInfo['barcode'] == bc), 'cluster'].values.item() - read.set_tag(args.outTagName, tag_to_add, value_type='Z', replace=True) + tag_to_add = args.groupInfo.loc[ + (args.groupInfo["sample"] == smpl) & (args.groupInfo["barcode"] == bc), + "cluster", + ].values.item() + read.set_tag(args.outTagName, tag_to_add, value_type="Z", replace=True) except: if args.verbose: sys.stderr.write("Encountered read tags not in groupInfo file, skipped..") @@ -212,8 +239,7 @@ def filterWorker(arglist): if args.duplicateFilter: tup = getDupFilterTuple(read, bc, args.duplicateFilter) - if lpos is not None and lpos == read.reference_start \ - and tup in prev_pos: + if lpos is not None and lpos == read.reference_start and tup in prev_pos: nFiltered += 1 if ofiltered: ofiltered.write(read) @@ 
-242,7 +268,7 @@ def filterWorker(arglist): # filterRNAstrand if args.filterRNAstrand: if read.is_paired: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: pass else: @@ -250,7 +276,7 @@ def filterWorker(arglist): if ofiltered: ofiltered.write(read) continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: pass else: @@ -259,7 +285,7 @@ def filterWorker(arglist): ofiltered.write(read) continue else: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 16 == 16: pass else: @@ -267,7 +293,7 @@ def filterWorker(arglist): if ofiltered: ofiltered.write(read) continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 16 == 0: pass else: @@ -329,16 +355,20 @@ def main(args=None): # grouping and tagging the output alignments if args.groupInfo: if not args.outTagName: - sys.stderr.write("--groupInfo provided without --outTagName. By default, groups will be added to the 'RG' tag in the output BAM file.") - args.outTagName = 'RG' + sys.stderr.write( + "--groupInfo provided without --outTagName. By default, groups will be added to the 'RG' tag in the output BAM file." + ) + args.outTagName = "RG" df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#") if len(df.columns) == 3: - df.columns = ['sample', 'barcode', 'cluster'] - df.index = df[['sample', 'barcode']].apply(lambda x: '::'.join(x), axis=1) + df.columns = ["sample", "barcode", "cluster"] + df.index = df[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1) args.groupInfo = df else: - sys.stderr.write("Error: The input to --groupInfo must be a 3-column tsv file with and ") + sys.stderr.write( + "Error: The input to --groupInfo must be a 3-column tsv file with and " + ) # shifting the output alignments if args.shift: @@ -349,8 +379,7 @@ def main(args=None): elif args.ATACshift: args.shift = [4, -5, 5, -4] - bam, mapped, unmapped, stats = openBam( - args.bamfile, returnStats=True, nThreads=args.numberOfProcessors) + bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors) total = mapped + unmapped chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)] chromDict = {x: y for x, y in zip(bam.references, bam.lengths)} @@ -368,13 +397,15 @@ def main(args=None): args.GCcontentFilter = [float(x) for x in gc] # Filter, writing the results to a bunch of temporary files - res = mapReduce([args, chromDict], - filterWorker, - chrom_sizes, - region=args.region, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + res = mapReduce( + [args, chromDict], + filterWorker, + chrom_sizes, + region=args.region, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) res = sorted(res) # The temp files are now in order for concatenation nFiltered = sum([x[3] for x in res]) @@ -415,7 +446,3 @@ def main(args=None): of.close() return 0 - - -if __name__ == "__main__": - main() diff --git a/sincei/scBulkCoverage.py b/sincei/scBulkCoverage.py new file mode 100755 index 0000000..56fb419 --- /dev/null +++ b/sincei/scBulkCoverage.py @@ -0,0 +1,522 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import sys +import os +import itertools +import numpy as np +import pandas as pd +from 
deeptools.getScaleFactor import get_scale_factor +from deeptools.bamHandler import openBam +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +## own Functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) + +from sincei import ParserCommon +from sincei import WriteBedGraph + +debug = 0 + + +def parseArguments(): + io_args = ParserCommon.inputOutputOptions(opts=["bamfiles", "groupInfo", "outFilePrefix"]) + bam_args = ParserCommon.bamOptions(default_opts={"binSize": 100}) + read_args = ParserCommon.readOptions() + filter_args = ParserCommon.filterOptions() + other_args = ParserCommon.otherOptions() + parser = argparse.ArgumentParser( + parents=[io_args, get_args(), bam_args, filter_args, read_args, other_args], + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="This tool takes alignments of reads or fragments " + "as input (BAM files), along with cell grouping information, such as " + "barcode -> batch, or barcode -> cluster, as tsv file, and generates a " + "coverage track (bigWig or bedGraph) per group as output. " + "The coverage is calculated as the number of reads per bin, " + "where bins are short consecutive counting windows of a defined " + "size. It is possible to extended/change the length of the reads " + "to better reflect the actual fragment length. *scBulkCoverage* " + "offers normalization per cluster using different methods \n", + usage="An example usage is:" + "$ scBulkCoverage -b file1.bam file2.bam --labels file1 file2 -g scClusterCells_output.tsv -o coverage.bw", + add_help=False, + ) + + return parser + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False) + + optional = parser.add_argument_group("Coverage-related Options") + optional.add_argument( + "--outFileFormat", + "-of", + help='Output file type. Either "bigwig" or "bedgraph".', + choices=["bigwig", "bedgraph"], + default="bigwig", + ) + + optional.add_argument( + "--normalizeUsing", + "-n", + help="How to normalize the pseudo-bulk counts. Options are " + '"CPM": normalized each bin to the counts per million mapped reads in that group.\n' + '"Frequency": binarize the coverage per bin and normalize to the total no. of cells per group. \n' + '"Mean": get mean signal per bin across cells in each group.\n' + '"None": simply return the sum of coverage per group.', + choices=["CPM", "Frequency", "Mean", "None"], + default="CPM", + ) + + optional.add_argument( + "--ignoreForNormalization", + "-ig", + help="Chromosomes to skip while calculating normalization factors", + nargs="+", + default=None, + ) + + optional.add_argument( + "--normalizeByReference", + "-nr", + help="NOT IMPLEMENTED: Normalize each group of cells by a reference group (which must be present in the --groupinfo file)" + "Note that the --normalizeUsing method is applied beforehand.", + choices=["ratio", "log2_ratio", "difference", "None"], + default=None, + ) + + optional.add_argument( + "--scaleFactor", + help="The computed scaling factor (or 1, if not applicable) will " + "be multiplied by this. (Default: %(default)s)", + default=1.0, + type=float, + required=False, + ) + + optional.add_argument( + "--MNase", + help="Determine nucleosome positions from MNase-seq/CUTnRUN data. " + "Only 3 nucleotides at the center of each fragment are counted. " + "The fragment ends are defined by the two mate reads. Only fragment lengths" + "between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. 
" + "By default, any fragments smaller or larger than this are ignored. To " + "over-ride this, use the --minFragmentLength and --maxFragmentLength options, " + "which will default to 130 and 200 if not otherwise specified in the presence " + "of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.", + action="store_true", + ) + + optional.add_argument( + "--Offset", + help="Uses this offset inside of each read as the signal. This is useful in " + "cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the " + "start of the read. This can be paired with the --filterRNAstrand option. " + "Note that negative values indicate offsets from the end of each read. A value " + "of 1 indicates the first base of the alignment (taking alignment orientation " + "into account). Likewise, a value of -1 is the last base of the alignment. An " + "offset of 0 is not permitted. If two values are specified, then they will be " + "used to specify a range of positions. Note that specifying something like " + "--Offset 5 -1 will result in the 5th through last position being used, which " + "is equivalent to trimming 4 bases from the 5-prime end of alignments. Note " + "that if you specify --centerReads, the centering will be performed before the " + "offset.", + metavar="INT", + type=int, + nargs="+", + required=False, + ) + + return parser + + +def main(args=None): + args = ParserCommon.validateInputs( + parseArguments().parse_args(args), process_barcodes=False + ) # newlabels are discarded, since they are re-created from groupInfo file + global debug + if args.verbose: + sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor)) + debug = 1 + else: + debug = 0 + + ## read the group info file (make it more robust) + ## if the no. of columns are 3, expect "sample", "barcode", "cluster", if 4, expect sample:bc, umap1, umap2, cluster + df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#") + if len(df.columns) == 3: + df.columns = ["sample", "barcode", "cluster"] + elif len(df.columns) == 4: + df.columns = ["barcode", "umap1", "umap2", "cluster"] + df["sample"] = [x.split("::")[:-1] for x in df.barcode] + df["barcode"] = [x.split("::")[-1] for x in df.barcode] + else: + sys.exit( + "*Error*:No. of columns in --groupInfo file not recognized. " + "Please provide either 3 (sample, barcode, group) or 4 (sample::barcode, umap1, umap2, group) column file" + ) + df.index = df[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1) + # barcodes = groupInfo.barcode.unique().tolist() + + ## match the sample labels with groupInfo labels before proceeding + ## create new DF with user-provided bam+labels (this would match the counts) + + # check that sample names in df and --labels match + labels_not_in_df = set(args.labels).difference(set(df["sample"])) + df_not_in_labels = set(df["sample"]).difference(set(args.labels)) + if bool(df_not_in_labels): + sys.stderr.write( + "Some (or all) of the samples indicated in the groupInfo file " + "are absent from the bam file labels! \n" + "Mismatched samples are: {} \n".format(df_not_in_labels) + ) + df = df.loc[df["sample"].isin(set(args.labels)),] + elif bool(labels_not_in_df): + sys.stderr.write( + "Some (or all) of the samples indicated in --labels " + "are absent from the in the groupInfo file! 
\n" + "Mismatched samples are: {} \n".format(labels_not_in_df) + ) + + # make another df, containing union of all barcodes, and in the same order per sample + barcodes = df["barcode"].unique().tolist() + sm = list(itertools.chain.from_iterable(itertools.repeat(lab, len(barcodes)) for lab in args.labels)) + bc = barcodes * len(args.labels) + groupInfo = pd.DataFrame({"sample": sm, "barcode": bc}) + groupInfo.index = groupInfo[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1) + groupInfo = pd.merge( + groupInfo, + df["cluster"], + how="left", + left_index=True, + right_index=True, + sort=False, + ) + groupInfo = groupInfo.reset_index()[["sample", "barcode", "cluster"]] + # re-construct new labels (sample+bc) + newlabels = ["::".join([x, y]) for x, y in zip(groupInfo["sample"], groupInfo["barcode"])] + # Normalization options + scale_factor = args.scaleFactor + func_args = {"scaleFactor": args.scaleFactor} + + coverageAsFrequency = False + if not args.ignoreForNormalization: + args.ignoreForNormalization = [] + if args.normalizeUsing == "None": + args.normalizeUsing = None # For the sake of sanity + elif args.normalizeUsing == "Frequency": + coverageAsFrequency = True + args.normalizeUsing = "Mean" # mean of binarized counts shall give us frequency + + # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used + if args.filterRNAstrand and not args.Offset: + args.Offset = [1, -1] + + if args.MNase: + # check that library is paired end + # using getFragmentAndReadSize + from deeptools.getFragmentAndReadSize import get_read_and_fragment_length + + fraglengths = [ + get_read_and_fragment_length( + file, + return_lengths=False, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) + for file in args.bamfiles + ] + if any([x[0] is None for x in fraglengths]): + sys.exit("*Error*: For the --MNAse function a paired end library is required. ") + + # Set some default fragment length bounds + if args.minFragmentLength == 0: + args.minFragmentLength = 130 + if args.maxFragmentLength == 0: + args.maxFragmentLength = 200 + + wr = CenterFragment( + args.bamfiles, + binLength=args.binSize, + stepSize=args.binSize, + barcodes=barcodes, + clusterInfo=groupInfo, + cellTag=args.cellTag, + groupTag=args.groupTag, + groupLabels=newlabels, + motifFilter=args.motifFilter, + genome2bit=args.genome2bit, + GCcontentFilter=args.GCcontentFilter, + region=args.region, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + duplicateFilter=args.duplicateFilter, + center_read=args.centerReads, + zerosToNans=False, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + binarizeCoverage=coverageAsFrequency, + verbose=args.verbose, + ) + + elif args.Offset: + if len(args.Offset) > 1: + if args.Offset[0] == 0: + sys.exit( + "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment." + ) + if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]: + sys.exit("'Error*: The right side bound is less than the left-side bound. This is inappropriate.") + else: + if args.Offset[0] == 0: + sys.exit( + "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment." 
+ ) + wr = OffsetFragment( + args.bamfiles, + binLength=args.binSize, + stepSize=args.binSize, + barcodes=barcodes, + clusterInfo=groupInfo, + cellTag=args.cellTag, + groupTag=args.groupTag, + groupLabels=newlabels, + motifFilter=args.motifFilter, + genome2bit=args.genome2bit, + GCcontentFilter=args.GCcontentFilter, + region=args.region, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + duplicateFilter=args.duplicateFilter, + center_read=args.centerReads, + zerosToNans=False, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + binarizeCoverage=coverageAsFrequency, + verbose=args.verbose, + ) + wr.filter_strand = args.filterRNAstrand + wr.Offset = args.Offset + else: + wr = WriteBedGraph.WriteBedGraph( + args.bamfiles, + binLength=args.binSize, + stepSize=args.binSize, + barcodes=barcodes, + clusterInfo=groupInfo, + cellTag=args.cellTag, + groupTag=args.groupTag, + groupLabels=newlabels, + motifFilter=args.motifFilter, + genome2bit=args.genome2bit, + GCcontentFilter=args.GCcontentFilter, + region=args.region, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + duplicateFilter=args.duplicateFilter, + center_read=args.centerReads, + zerosToNans=False, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + binarizeCoverage=coverageAsFrequency, + verbose=args.verbose, + ) + + wr.run( + WriteBedGraph.scaleCoverage, + func_args, + args.outFilePrefix, + blackListFileName=args.blackListFileName, + normUsing=args.normalizeUsing, + format=args.outFileFormat, + smoothLength=None, + ) + + +class OffsetFragment(WriteBedGraph.WriteBedGraph): + """ + Class to redefine the get_fragment_from_read for the --Offset case + """ + + def filterStrand(self, read, rv): + """ + A generic read filtering function that gets used by everything in this class. + + rv is returned if the strand is correct, otherwise [(None, None)] + """ + # Filter by RNA strand, if desired + if read.is_paired: + if self.filter_strand == "forward": + if read.flag & 144 == 128 or read.flag & 96 == 64: + return rv + elif self.filter_strand == "reverse": + if read.flag & 144 == 144 or read.flag & 96 == 96: + return rv + else: + return rv + else: + if self.filter_strand == "forward": + if read.flag & 16 == 16: + return rv + elif self.filter_strand == "reverse": + if read.flag & 16 == 0: + return rv + else: + return rv + + return [(None, None)] + + def get_fragment_from_read_list(self, read, offset): + """ + Return the range of exons from the 0th through 1st bases, inclusive. 
Positions are 1-based + """ + rv = [(None, None)] + blocks = read.get_blocks() + blockLen = sum([x[1] - x[0] for x in blocks]) + + if self.defaultFragmentLength != "read length": + if self.is_proper_pair(read, self.maxPairedFragmentLength): + if read.is_reverse: + foo = (read.next_reference_start, read.reference_start) + if foo[0] < foo[1]: + blocks.insert(0, foo) + else: + foo = ( + read.reference_end, + read.reference_end + abs(read.template_length) - read.infer_query_length(), + ) + if foo[0] < foo[1]: + blocks.append(foo) + + # Extend using the default fragment length + else: + if read.is_reverse: + foo = ( + read.reference_start - self.defaultFragmentLength + read.infer_query_length(), + read.reference_start, + ) + if foo[0] < 0: + foo = (0, foo[1]) + if foo[0] < foo[1]: + blocks.insert(0, foo) + else: + foo = ( + read.reference_end, + read.reference_end + self.defaultFragmentLength - read.infer_query_length(), + ) + if foo[0] < foo[1]: + blocks.append(foo) + + stretch = [] + # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40] + # Then subset accordingly + for block in blocks: + stretch.extend(range(block[0], block[1])) + if read.is_reverse: + stretch = stretch[::-1] + + # Handle --centerReads + if self.center_read: + _ = (len(stretch) - blockLen) // 2 + stretch = stretch[_ : _ + blockLen] + + # Subset by --Offset + try: + foo = stretch[offset[0] : offset[1]] + except: + return rv + + if len(foo) == 0: + return rv + if read.is_reverse: + foo = foo[::-1] + + # Convert the stretch back to a list of tuples + foo = np.array(foo) + d = foo[1:] - foo[:-1] + idx = np.argwhere(d > 1).flatten().tolist() # This now holds the interval bounds as a list + idx.append(-1) + last = 0 + rv = [] + for i in idx: + rv.append((foo[last].astype("int"), foo[i].astype("int") + 1)) + last = i + 1 + + # Handle strand filtering, if needed + return self.filterStrand(read, rv) + + def get_fragment_from_read(self, read): + """ + This is mostly a wrapper for self.get_fragment_from_read_list(), + which needs a list and for the offsets to be tweaked by 1. + """ + offset = [x for x in self.Offset] + if len(offset) > 1: + if offset[0] > 0: + offset[0] -= 1 + if offset[1] < 0: + offset[1] += 1 + else: + if offset[0] > 0: + offset[0] -= 1 + offset = [offset[0], offset[0] + 1] + else: + if offset[0] < -1: + offset = [offset[0], offset[0] + 1] + else: + offset = [offset[0], None] + if offset[1] == 0: + # -1 gets switched to 0, which screws things up + offset = (offset[0], None) + return self.get_fragment_from_read_list(read, offset) + + +class CenterFragment(WriteBedGraph.WriteBedGraph): + """ + Class to redefine the get_fragment_from_read for the --MNase case + + The coverage of the fragment is defined as the 2 or 3 basepairs at the + center of the fragment length. 
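+
+    For example (illustrative numbers only): a proper-pair, forward read with read.pos = 100 and an even template length read.tlen = 146 yields fragment_start = 100 + 146/2 - 1 = 172 and fragment_end = 174, i.e. the two central bases; with an odd template length (e.g. read.tlen = 147) the window is widened to the three central bases instead.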
+    """ + +    def get_fragment_from_read(self, read): +        """ +        Takes a proper pair fragment of high quality and limited +        to a certain length and outputs the center +        """ +        fragment_start = fragment_end = None + +        # only paired forward reads are considered +        # Fragments have already been filtered according to length +        if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen): +            # distance between pairs is even return two bases at the center +            if read.tlen % 2 == 0: +                fragment_start = read.pos + read.tlen / 2 - 1 +                fragment_end = fragment_start + 2 + +            # distance is odd return three bases at the center +            else: +                fragment_start = read.pos + read.tlen / 2 - 1 +                fragment_end = fragment_start + 3 + +        return [(fragment_start, fragment_end)] diff --git a/sincei/scClusterCells.py b/sincei/scClusterCells.py new file mode 100755 index 0000000..1110558 --- /dev/null +++ b/sincei/scClusterCells.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import copy +import argparse +import numpy as np +import pandas as pd +import torch +from scipy import sparse, io +from sklearn.preprocessing import binarize + +# plotting +import matplotlib +import matplotlib.pyplot as plt + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" + +# single-cell stuff +import anndata +import scanpy as sc + + +## own Functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) + +from sincei import ParserCommon +from sincei.TopicModels import TOPICMODEL +from sincei.GLMPCA import GLMPCA, EXPONENTIAL_FAMILY_DICT + + +def parseArguments(): +    io_args = ParserCommon.inputOutputOptions(opts=["loomfile", "outFile"], requiredOpts=["outFile"]) +    plot_args = ParserCommon.plotOptions() +    other_args = ParserCommon.otherOptions() + +    parser = argparse.ArgumentParser( +        parents=[io_args, get_args(), plot_args, other_args], +        formatter_class=argparse.ArgumentDefaultsHelpFormatter,  # argparse.RawDescriptionHelpFormatter, +        description=""" +        This tool clusters the cells based on the input count matrix (output of scCountReads) and performs dimensionality reduction, +        community detection and 2D projection (UMAP) of the cells. The result is an updated loom object, and (optionally) a plot file +        and a tsv file with UMAP coordinates and corresponding cluster id for each barcode. +        """, +        usage="Example usage: scClusterCells -i cellCounts.loom -o clustered.loom -op .png > log.txt", +        add_help=False, +    ) + +    return parser + + +def get_args(): +    parser = argparse.ArgumentParser(add_help=False) + +    general = parser.add_argument_group("Clustering Options") +    general.add_argument( +        "--outFileUMAP", +        "-op", +        type=str, +        required=False, +        help="The output plot file (for UMAP). If you specify this option, another file with the same " +        "prefix (and .txt extension) is also created with the raw UMAP coordinates.", +    ) + +    general.add_argument( +        "--outFileTrainedModel", +        "-om", +        type=argparse.FileType("w"), +        required=False, +        help="The output file for the trained LSI model. The saved model can be used later to embed/compare new cells " +        "to the existing cluster of cells.", +    ) + +    general.add_argument( +        "--method", +        "-m", +        type=str, +        choices=["LSA", "LDA", "glmPCA"], +        default="LSA", +        help="The dimensionality reduction method for clustering. 
(Default: %(default)s)", +    ) + +    general.add_argument( +        "--glmPCAfamily", +        "-gf", +        type=str, +        choices=EXPONENTIAL_FAMILY_DICT.keys(), +        default="poisson", +        help="The choice of exponential family distribution to use for glmPCA method. (Default: %(default)s)", +    ) + +    general.add_argument( +        "--binarize", +        action="store_true", +        help="Binarize the counts per region before dimensionality reduction (only for LSA/LDA)", +    ) + +    general.add_argument( +        "--nPrinComps", +        "-n", +        default=20, +        type=int, +        help="Number of principal components to reduce the dimensionality to. " +        "Use higher number for samples with more expected heterogeneity. (Default: %(default)s)", +    ) + +    general.add_argument( +        "--nNeighbors", +        "-nk", +        default=30, +        type=int, +        help="Number of nearest neighbours to consider for clustering and UMAP. This number should be chosen considering " +        "the total number of cells and expected number of clusters. Smaller number will lead to more fragmented clusters. " +        "(Default: %(default)s)", +    ) + +    general.add_argument( +        "--clusterResolution", +        "-cr", +        default=1.0, +        type=float, +        help="Resolution parameter for clustering. Values lower than 1.0 would result in fewer clusters, " +        "while higher values lead to splitting of clusters. In most cases, the optimum value would be between " +        "0.8 and 1.2. (Default: %(default)s)", +    ) + +    return parser + + +def main(args=None): +    args = parseArguments().parse_args(args) + +    adata = sc.read_loom(args.input, obs_names="obs_names", var_names="var_names") + +    mtx = sparse.csr_matrix(adata.X.copy().transpose())  # features x cells +    cells = copy.deepcopy(adata.obs_names.to_list()) +    regions = copy.deepcopy(adata.var_names.to_list()) + +    if args.binarize: +        mtx = binarize(mtx, copy=True) + +    if args.method == "LSA": +        ## LSA using gensim +        model_object = TOPICMODEL( +            mtx, +            cells, +            regions, +            n_topics=args.nPrinComps, +            smart_code="lfu", +        ) +        model_object.runLSA() +        cell_topic = model_object.get_cell_topic(pop_sparse_cells=True) +        ## update the anndata object, drop cells which are not in the anndata, drop 1st PC +        adata = adata[cell_topic.index] +        adata.obsm["X_pca"] = np.asarray(cell_topic.iloc[:, 1 : args.nPrinComps]) + +    elif args.method == "LDA": +        ## LDA using gensim +        model_object = TOPICMODEL( +            mtx, +            cells, +            regions, +            n_topics=args.nPrinComps, +            n_passes=2, +            n_workers=4, +        ) +        model_object.runLDA() +        cell_topic = model_object.get_cell_topic(pop_sparse_cells=True) +        ## update the anndata object, drop cells which are not in the anndata, drop 1st PC +        adata = adata[cell_topic.index] +        adata.obsm["X_pca"] = np.asarray(cell_topic.iloc[:, 1 : args.nPrinComps]) + +    elif args.method == "glmPCA": +        # convert mtx to torch tensor +        mtx = torch.tensor(mtx.todense())  # feature*cell tensor + +        ## glmPCA using mctorch +        model_object = GLMPCA( +            n_pc=args.nPrinComps, +            family=args.glmPCAfamily, +        ) +        model_object.fit(mtx) +        cell_pcs = model_object.saturated_loadings_.detach().numpy() + +        ## update the anndata object +        adata.obsm["X_pca"] = np.asarray(cell_pcs) + +    sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=args.nNeighbors) +    sc.tl.leiden(adata, resolution=args.clusterResolution) +    sc.tl.paga(adata) +    sc.pl.paga(adata, plot=False, threshold=0.1) +    sc.tl.umap(adata, min_dist=0.1, spread=5, init_pos="paga") + +    adata.write_loom(args.outFile, write_obsm_varm=True) + +    if args.outFileUMAP: +        ## plot UMAP +        fig = sc.pl.umap(adata, color="leiden", legend_loc="on data", return_fig=True, show=False) +        fig.savefig(args.outFileUMAP, dpi=200,
format=args.plotFileFormat) + prefix = args.outFileUMAP.split(".")[0] + umap_lsi = pd.DataFrame(adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"], index=adata.obs.index) + umap_lsi["cluster"] = adata.obs["leiden"] + umap_lsi.to_csv(prefix + ".tsv", sep="\t", index_label="cell_id") + # plt.rcParams['font.size'] = 8.0 + # convert cm values to inches + # fig = plt.figure(figsize=(args.plotWidth / 2.54, args.plotHeight / 2.54)) + # fig.suptitle('LSA-UMAP', y=(1 - (0.06 / args.plotHeight))) + # plt.scatter(umap_lsi.UMAP1, umap_lsi.UMAP2, s=5, alpha = 0.8, c=[sns.color_palette()[x] for x in list(umap_lsi.cluster)]) + # plt.tight_layout() + # plt.savefig(args.outFileUMAP, dpi=200, format=args.plotFileFormat) + # plt.close() + + # save if asked + if args.outFileTrainedModel: + model_object.lsi_model.save(args.outFileTrainedModel) + # if args.outGraph: + # 'The output file for the Graph object (lgl format) which can be used for further clustering/integration.' + # graph.write_lgl(args.outGraph) + + return 0 diff --git a/sincei/scCombineCounts.py b/sincei/scCombineCounts.py new file mode 100755 index 0000000..24b4de9 --- /dev/null +++ b/sincei/scCombineCounts.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import argparse +import numpy as np +import pandas as pd +from scipy import sparse, io + +# single-cell stuff +import anndata +import scanpy as sc + + +## own Functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) +from sincei import ParserCommon + + +def parseArguments(): + other_args = ParserCommon.otherOptions() + + parser = argparse.ArgumentParser( + parents=[get_args(), other_args], + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=""" + This tool combines multiple count matrices (output of scCountReads) into one, either assuming they are different samples (multi-sample) + or different measurements on the same set of cells (multi-modal). The result is a .loom file with combined counts. NOTE: it doesn't perform + any 'batch effect correction' or 'integration' of data from different technologies, which requires more sophisticated methods. + """, + usage="Example usage: scCombineCounts -i sample1.loom sample2.loom -o combined.loom > log.txt", + add_help=False, + ) + + return parser + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False) + + general = parser.add_argument_group("General Options") + + general.add_argument( + "--input", + "-i", + metavar="LOOM", + help="Input files in .loom format", + nargs="+", + required=True, + ) + + general.add_argument( + "--outFile", + "-o", + type=str, + help="The file to write results to. For method: `multi-sample`, the output " + "file is an updated .loom object, which can be used by other tools. " + "For method: `multi-omic`, the output file is an .hdf5 file. This file can only be " + "used by scClusterCells, to perform multi-modal clustering. ", + required=True, + ) + + general.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. Multiple labels have to be separated by a space, e.g. " + "--labels sample1 sample2 sample3", + nargs="+", + ) + + general.add_argument( + "--method", + "-m", + type=str, + choices=["multi-sample", "multi-modal"], + default="multi-sample", + help="How to merge the counts from the provided samples. " + "`multi-sample`: assumes that each sample is the independent, " + "but were counted in the same manner (i.e. 
on same features), therefore " + "it looks for feature overlaps, but not for barcode overlaps. " + "`multi-modal`: assumes that the counts were generated in 2 different ways, " + "but from the same set of cells (for example, using a multi-omic technology), " + "therefore it looks for the overlap of cell barcodes, but not for the overlaps " + "of features (Default: %(default)s)", + ) + + return parser + + +def main(args=None): + args = parseArguments().parse_args(args) + if args.method != "multi-sample": + sys.stderr.write("Only multi-sample method is currently implemented") + sys.exit(1) + + if args.labels and len(args.input) != len(args.labels): + print("The number of labels does not match the number of input files.") + sys.exit(1) + if not args.labels: + if args.smartLabels: + args.labels = smartLabels(args.input) + else: + args.labels = [os.path.basename(x) for x in args.input] + adata_list = [sc.read_loom(x, obs_names="obs_names", var_names="var_names") for x in args.input] + + ## concatenate labels and match chrom, start, end + var_list = [] + var_cols = ["chrom", "start", "end"] + for lab, ad in zip(args.labels, adata_list): + obs = ad.obs_names.to_list() + lab = [lab] * len(obs) + new_names = ["_".join([x, y]) for x, y in zip(lab, obs)] + ad.obs_names = new_names + hasinfo = all([x in ad.var.columns for x in var_cols]) + var_list.append(hasinfo) + + ## keep the chrom, start, end from original sample if present + adata = anndata.concat(adata_list) + if all(var_list): + var_df = adata_list[0].var[var_cols] + adata.var = adata.var.join(var_df) + else: + sys.stderr.write( + "WARNING: Not all input files contain the 'chrom', 'start', 'end' information. " + "The output will lack these fields. This might cause an error in some downstream tools" + ) + + sys.stdout.write("Combined cells: {} \n".format(adata.shape[0])) + sys.stdout.write("Combined features: {} \n".format(adata.shape[1])) + adata.write_loom(args.outFile) + return 0 diff --git a/sincei/scCountQC.py b/sincei/scCountQC.py new file mode 100755 index 0000000..31f32e4 --- /dev/null +++ b/sincei/scCountQC.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import re +import argparse +import numpy as np +import pandas as pd +from scipy import sparse, io +from itertools import compress +import copy +import scanpy as sc +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +## own Functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) +from sincei import ParserCommon +from sincei.Utilities import gini + + +### ------ Functions ------ +def filter_adata( + adata, + filter_region_dict=None, + filter_cell_dict=None, + bad_chrom=None, + bad_cells=None, +): + # 1. regions + if bad_chrom: + adata = adata[:, ~adata.var.chrom.isin(bad_chrom)] + if filter_region_dict: + for key in filter_region_dict.keys(): + adata = adata[ + :, + (adata.var[key] >= filter_region_dict[key][0]) & (adata.var[key] <= filter_region_dict[key][1]), + ] + + # 2. 
Cells + if bad_cells: + adata = adata[~adata.obs.index.isin(bad_cells)] + if filter_cell_dict: + for key in filter_cell_dict.keys(): + adata = adata[ + (adata.obs[key] >= filter_cell_dict[key][0]) & (adata.obs[key] <= filter_cell_dict[key][1]), + :, + ] + + return adata + + +""" +def make_plots(adata, fname=None): + # plotting + import matplotlib + import matplotlib.pyplot as plt + matplotlib.use('Agg') + matplotlib.rcParams['pdf.fonttype'] = 42 + matplotlib.rcParams['svg.fonttype'] = 'none' + import seaborn as sns# seaborn colormaps conflicts with deeptools colormaps + + ## filtering of regions + plt.figure() + pl1=sns.violinplot(data=adata.var, x='total_counts', y='chrom') + plt.figure() + pl2=sns.distplot(np.log10(adata.var['total_counts']+1)) + plt.figure() + pl3=sns.distplot(adata.var['mean_counts']) + plt.figure() + pl4=sns.distplot(adata.var['fraction_cells_with_signal']) + ## cells + plt.figure() + pl5=sns.scatterplot(data=adata.obs, x='total_counts', y='pct_counts_in_top_100_genes') + pl5.set(xscale="log") + plt.figure() + pl6=sns.distplot(adata.obs['fraction_regions_with_signal']) + plt.figure() + pl7=sns.scatterplot(data=adata.obs, x='total_counts', y='fraction_regions_with_signal') + + plist=[pl1, pl2, pl3, pl4, pl5, pl6, pl7] + if fname: + with PdfPages(fname) as pp: + for plot in plist: + pp.savefig(plot.figure) + return plist + + general.add_argument('--outPlot', '-op', + type=str, + help='The output plot file. This describes the distribution of filtering metrics pre and post filtering') + if args.outPlot: + make_plots(adata, fname=args.outPlot) + if cellfilter or regionfilter or badcells or badchrom: + make_plots(adata_filt, fname=args.outPlot+".filtered.pdf") +""" + + +def parseArguments(): + io_args = ParserCommon.inputOutputOptions(opts=["loomfile", "outFile"]) + plot_args = ParserCommon.plotOptions() + other_args = ParserCommon.otherOptions() + parser = argparse.ArgumentParser( + parents=[io_args, get_args(), plot_args, other_args], + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + This tool performs calculates multiple quality controls metrics on the input .loom file (output of scCountReads) + and (optionally) filters the input file based on filterCellArgs/filterRegionArgs. The output is either an + updated .loom object (if filtering is requested) or the filtering metrics (--outMetrics) and plots (--outPlot). + """, + usage="Example usage: scCountQC -i cellCounts.loom -om qc_metrics.tsv > log.txt", + add_help=False, + ) + + return parser + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False) + + general = parser.add_argument_group("Filtering arguments") + general.add_argument( + "--describe", + "-d", + action="store_true", + help="Print a list of cell and region metrics available for QC/filtering.", + ) + + general.add_argument( + "--outMetrics", + "-om", + type=str, + help="Prefix of the output file with calculated QC metrics. If given, the cell metrics are printed in " + ".cell.tsv and region metrics as .region.tsv", + ) + + general.add_argument( + "--filterCellArgs", + "-fc", + type=str, + help='List of arguments to filter cells. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue, maxvalue; ...." ' + "where arg_name is the QC metric for cells present in the input loom file. In order to view all available " + 'cell filtering metrics, run scCountFilter with "--describe". The two arguments are supplied (minvalue, maxvalue) ' + "they are used as lower and upper bounds to filter cells. 
Make sure they are float/integer numbers.",
+    )
+
+    general.add_argument(
+        "--filterRegionArgs",
+        "-fr",
+        type=str,
+        help='List of arguments to filter regions. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue, maxvalue; ...." '
+        "where arg_name is the QC metric for regions present in the input loom file. In order to view all available "
+        'region filtering metrics, run scCountQC with "--describe". If the two arguments (minvalue, maxvalue) are supplied, '
+        "they are used as lower and upper bounds to filter regions. Make sure they are float/integer numbers.",
+    )
+
+    general.add_argument(
+        "--cell_blacklist",
+        "-cb",
+        default=None,
+        type=argparse.FileType("r"),
+        help="A file containing a list of barcodes to be excluded from the output. The barcodes "
+        "(along with sample labels) must be present in the input object.",
+    )
+
+    general.add_argument(
+        "--chrom_blacklist",
+        "-chb",
+        default=None,
+        type=str,
+        help="A comma separated list of chromosomes to exclude. eg. chrM, chrUn",
+    )
+
+    return parser
+
+
+def main(args=None):
+    args = parseArguments().parse_args(args)
+
+    adata = sc.read_loom(args.input, obs_names="obs_names", var_names="var_names")
+    ## add QC stats to the anndata object
+    # 1. scanpy metrics # fraction of regions/genes with signal are included in the metrics (pct_dropouts/n_genes_by_counts)
+    try:
+        sc.pp.calculate_qc_metrics(adata, inplace=True)
+    except IndexError:  # not enough genes/regions
+        sys.stderr.write("\n Error: Too few regions in the input file to perform QC \n")
+        exit()
+    # pool = multiprocessing.Pool(args.numberOfProcessors)
+
+    # func=partial(gini, X=adata.X)
+    # gini_list = pool.map(func, range(adata.shape[0]) )
+    gini_list = [gini(i, adata.X) for i in range(adata.shape[0])]
+    adata.obs["gini_coefficient"] = gini_list
+
+    if args.outMetrics:
+        mat = re.sub(".txt|.tsv|.csv", "", args.outMetrics)
+        obs_tsv = mat + ".cells.tsv"
+        var_tsv = mat + ".regions.tsv"
+        adata.obs.to_csv(obs_tsv, sep="\t", index_label="cell_id")
+        adata.var.to_csv(var_tsv, sep="\t", index_label="feature_id")
+
+    # if --describe is asked, only print the numeric vars and obs columns
+    if args.describe:
+        is_num_col = [(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.obs.dtypes]
+        cols = adata.obs.loc[:, is_num_col]
+        sys.stdout.write("\n Cell metrics: \n")
+        sys.stdout.write("Total cells: {} \n".format(adata.shape[0]))
+        print(pd.DataFrame({"min": cols.min(), "max": cols.max()}))
+
+        is_num_row = [(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.var.dtypes]
+        rows = adata.var.loc[:, is_num_row]
+        sys.stdout.write("\n Feature metrics: \n")
+        sys.stdout.write("Total features: {} \n".format(adata.shape[1]))
+        print(pd.DataFrame({"min": rows.min(), "max": rows.max()}))
+        exit()
+
+    if args.filterCellArgs:
+        cellfilter = dict()
+        for x in args.filterCellArgs.strip().split(";"):
+            key = x.strip().split(":")[0]
+            v = x.strip().split(":")[1]
+            value = [float(x) for x in v.strip().split(",")]
+            cellfilter[key] = value
+    else:
+        cellfilter = None
+    if args.filterRegionArgs:
+        regionfilter = dict()
+        for x in args.filterRegionArgs.strip().split(";"):
+            key = x.strip().split(":")[0]
+            v = x.strip().split(":")[1]
+            value = [float(x) for x in v.strip().split(",")]
+            regionfilter[key] = value
+    else:
+        regionfilter = None
+
+    if args.cell_blacklist:
+        ## the blacklist is already opened by argparse (FileType), so read the barcodes from it directly
+        badcells = args.cell_blacklist.read().splitlines()
+        args.cell_blacklist.close()
+    else:
+        badcells = None
+
+    if args.chrom_blacklist:
+        badchrom = 
args.chrom_blacklist.strip().split(",") + else: + badchrom = None + + if cellfilter or regionfilter or badcells or badchrom: + sys.stdout.write("Applying filters \n") + adata_filt = filter_adata( + adata, + filter_region_dict=regionfilter, + filter_cell_dict=cellfilter, + bad_chrom=badchrom, + bad_cells=badcells, + ) + sys.stdout.write("Remaining cells: {} \n".format(adata_filt.shape[0])) + sys.stdout.write("Remaining features: {} \n".format(adata_filt.shape[1])) + adata_filt.write_loom(args.outFile) + + return 0 diff --git a/sincei/scCountReads.py b/sincei/scCountReads.py new file mode 100755 index 0000000..6b09bd1 --- /dev/null +++ b/sincei/scCountReads.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import argparse +import numpy as np +from scipy import sparse, io +import re +import pandas as pd +import anndata as ad +import scanpy as sc +from deeptools import parserCommon +from deeptools.utilities import smartLabels +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +# own functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) + +from sincei import ReadCounter as countR +from sincei import ParserCommon + +old_settings = np.seterr(all="ignore") + + +def parseArguments(args=None): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + ``scCountReads`` computes the read coverages per cell barcode for genomic regions in the provided BAM file(s). + The analysis can be performed for the entire genome by running the program in 'bins' mode. + If you want to count the read coverage for specific regions only, use the ``features`` mode instead. + The standard output of ``scCountReads`` is a ".loom" file with counts, along with rowName (features) and colNames (cell barcodes). + + A detailed sub-commands help is available by typing: + + scCountReads bins -h + + scCountReads features -h + + """, + epilog="example usages:\n" + "scCountReads bins --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results \n\n" + "scCountReads features --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt \n" + "-o results" + " \n\n", + conflict_handler="resolve", + ) + + subparsers = parser.add_subparsers( + title="commands", + dest="command", + description="subcommands", + help="subcommands", + metavar="", + ) + + read_args = ParserCommon.readOptions(suppress_args=["filterRNAstrand"]) + filter_args = ParserCommon.filterOptions() + other_args = ParserCommon.otherOptions() + + # bins mode options + subparsers.add_parser( + "bins", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[ + ParserCommon.inputOutputOptions( + opts=["bamfiles", "barcodes", "outFilePrefix", "BED"], + requiredOpts=["barcodes", "outFilePrefix"], + suppress_args=["BED"], + ), + parserCommon.gtf_options(suppress=True), + ParserCommon.bamOptions(default_opts={"binSize": 10000, "distanceBetweenBins": 0}), + read_args, + filter_args, + get_args(), + other_args, + ], + help="The reads are counted in bins of equal size. 
The bin size and distance between bins can be adjusted.", + add_help=False, + usage="%(prog)s -bs 100000 --bamfiles file1.bam file2.bam -o results \n", + ) + + # BED file arguments + subparsers.add_parser( + "features", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[ + ParserCommon.inputOutputOptions( + opts=["bamfiles", "barcodes", "outFilePrefix", "BED"], + requiredOpts=["barcodes", "outFilePrefix", "BED"], + ), + parserCommon.gtf_options(), + ParserCommon.bamOptions( + suppress_args=["binSize", "distanceBetweenBins"], + default_opts={"binSize": 10000, "distanceBetweenBins": 0}, + ), + read_args, + filter_args, + get_args(), + other_args, + ], + help="The user provides a BED/GTF file containing all regions " + "that should be counted. A common use would be to count scRNA-seq reads on Genes.", + usage="%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results\n", + add_help=False, + ) + + return parser + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False) + optional = parser.add_argument_group("Misc arguments") + optional.add_argument( + "--genomeChunkSize", + type=int, + default=None, + help="Manually specify the size of the genome provided to each processor. " + "The default value of None specifies that this is determined by read " + "density of the BAM file.", + ) + + optional.add_argument( + "--outFileFormat", + type=str, + default="loom", + choices=["loom", "mtx"], + help="Output file format. Default is to write an anndata object of name " + ".loom, which can either be opened in scanpy, or by downstream tools. " + '"mtx" refers to the MatrixMarket sparse-matrix format. The output in this case would be ' + ".counts.mtx, along with .rownames.txt and .colnames.txt", + ) + + return parser + + +def main(args=None): + """ + 1. get read counts at different positions either + all of same length or from genomic regions from the BED file + + 2. 
save data for further plotting + + """ + args, newlabels = ParserCommon.validateInputs(parseArguments().parse_args(args)) + + if "BED" in args: + bed_regions = args.BED + else: + bed_regions = None + + ## create row/colNames + if args.outFileFormat == "mtx": + mtxFile = args.outFilePrefix + ".counts.mtx" + rowNamesFile = args.outFilePrefix + ".rownames.txt" + colNamesFile = args.outFilePrefix + ".colnames.txt" + else: + rowNamesFile = None + + stepSize = args.binSize + args.distanceBetweenBins + c = countR.CountReadsPerBin( + args.bamfiles, + binLength=args.binSize, + stepSize=stepSize, + barcodes=args.barcodes, + cellTag=args.cellTag, + groupTag=args.groupTag, + groupLabels=newlabels, + motifFilter=args.motifFilter, + genome2bit=args.genome2bit, + GCcontentFilter=args.GCcontentFilter, + numberOfSamples=None, + genomeChunkSize=args.genomeChunkSize, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + bedFile=bed_regions, + blackListFileName=args.blackListFileName, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + duplicateFilter=args.duplicateFilter, + center_read=args.centerReads, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + zerosToNans=False, + out_file_for_raw_data=rowNamesFile, + ) + + num_reads_per_bin, regionList = c.run(allArgs=args) + + sys.stderr.write("Number of bins " "found: {}\n".format(num_reads_per_bin.shape[0])) + + if num_reads_per_bin.shape[0] < 1: + exit( + "ERROR: too few non zero bins found.\n" + "If using --region please check that this " + "region is covered by reads.\n" + ) + + ## write mtx/rownames if asked + if args.outFileFormat == "mtx": + f = open(colNamesFile, "w") + f.write("\n".join(newlabels)) + f.write("\n") + f.close() + ## write the matrix as .mtx + sp = sparse.csr_matrix(num_reads_per_bin) + io.mmwrite(mtxFile, sp, field="integer") + else: + # write anndata + adata = ad.AnnData(num_reads_per_bin.T) + adata.obs = pd.DataFrame( + { + "sample": [x.split("::")[-2] for x in newlabels], + "barcodes": [x.split("::")[-1] for x in newlabels], + }, + index=newlabels, + ) + + rows = list(regionList) + + adata.var = pd.DataFrame( + { + "chrom": [x.split("_")[0] for x in rows], + "start": [x.split("_")[1] for x in rows], + "end": [x.split("_")[2] for x in rows], + }, + index=rows, + ) + + # export as loom + adata.write_loom(args.outFilePrefix + ".loom") diff --git a/sincei/scFilterBarcodes.py b/sincei/scFilterBarcodes.py new file mode 100755 index 0000000..69e825b --- /dev/null +++ b/sincei/scFilterBarcodes.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import sys +import os + +from deeptools import parserCommon, bamHandler, utilities +from deeptools.mapReduce import mapReduce +from deeptoolsintervals import GTF +import numpy as np +import pandas as pd +from collections import Counter +from functools import reduce +import matplotlib.pyplot as plt +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +## own functions +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) +from sincei.Utilities import * +from sincei import ParserCommon + + +def parseArguments(): + io_args = ParserCommon.inputOutputOptions(opts=["bamfile", "whitelist", "outFile"], requiredOpts=["bamfile"]) + bam_args = ParserCommon.bamOptions( + suppress_args=["labels", "smartLabels", 
"distanceBetweenBins", "region"], + default_opts={"binSize": 100000}, + ) + other_args = ParserCommon.otherOptions() + + parser = argparse.ArgumentParser( + parents=[io_args, get_args(), bam_args, other_args], + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + This tool identifies barcodes present in a BAM files and produces a list. You can optionally filter these + barcodes by matching them to a whitelist or based on total counts. + """, + usage="Example usage: scFilterBarcodes -b sample.bam -w whitelist.txt > barcodes_detected.txt", + add_help=False, + ) + + return parser + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False) + general = parser.add_argument_group("Counting Options") + + general.add_argument( + "--minHammingDist", + "-d", + help="Minimum hamming distance to match the barcode in whitelist. Note that increasing the " + "hamming distance really slows down the barcode detection process.", + metavar="INT", + type=int, + default=0, + required=False, + ) + + general.add_argument( + "--minCount", + "-mc", + help="Minimum no. of bins with non-zero counts, in order to report a barcode. Note that this number would range " + "from 0 to genome size/binSize. ", + metavar="INT", + type=int, + default=0, + required=False, + ) + + general.add_argument( + "--minMappingQuality", + "-mq", + metavar="INT", + help="If set, only reads that have a mapping " "quality score of at least this are " "considered.", + type=int, + ) + + general.add_argument( + "--rankPlot", + "-rp", + type=parserCommon.writableFile, + help='The output file name to plot the ranked counts per barcode (similar to the "knee plot",' + "but counts in this case would be the number of non-zero bins)", + ) + + return parser + + +## get hamming dist between 2 sequences +def ham_dist(s1, s2): + if len(s1) != len(s2): + raise ValueError("Undefined") + return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2)) + + +def getFiltered_worker(arglist): + chrom, start, end, args = arglist + # Fix the bounds + if end <= start: + end = start + 1 + ## open blacklist file + blackList = None + if args.blackListFileName is not None: + blackList = GTF(args.blackListFileName) + + fh = bamHandler.openBam(args.bamfile) + chromUse = utilities.mungeChromosome(chrom, fh.references) + + BCset = set() + for read in fh.fetch(chromUse, start, end): + if read.pos < start: + # ensure that we never double count (in case distanceBetweenBins == 0) + continue + + if read.flag & 4: + # Ignore unmapped reads, they were counted already + continue + + if args.minMappingQuality and read.mapq < args.minMappingQuality: + continue + + ## reads in blacklisted regions + if blackList and blackList.findOverlaps( + chrom, + read.reference_start, + read.reference_start + read.infer_query_length(always=False) - 1, + ): + continue + + ## get barcode and append to set + try: + bc = read.get_tag(args.cellTag) + except KeyError: + continue + if args.whitelist: + # match barcode to whitelist + if args.minHammingDist == 0: + if bc in args.whitelist: + BCset.add(bc) + else: + try: + hamdist = [ham_dist(x, bc) for x in args.whitelist] + if min(hamdist) <= args.minHammingDist: + BCset.add(bc) + except ValueError: + continue + else: + BCset.add(bc) + fh.close() + + # return the set with barcodes detected + return BCset + + +def main(args=None): + args = parseArguments().parse_args(args) + + # open barcode file and read the content in a list + if args.whitelist: + with open(args.whitelist, "r") as f: + barcodes = f.read().splitlines() + args.whitelist = 
barcodes + + bhs = bamHandler.openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)[0] + chrom_sizes = list(zip(bhs.references, bhs.lengths)) + bhs.close() + + # Get the remaining metrics + res = mapReduce( + [args], + getFiltered_worker, + chrom_sizes, + genomeChunkLength=args.binSize, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) + ## res, should be a list of sets + # final_set = list(set().union(*res)) + df = pd.DataFrame( + Counter(reduce(lambda x, y: list(x) + list(y), res)).items(), + columns=["barcode", "count"], + ) + + if args.minCount: + if args.minCount > len(res): + print("minCount bigger than No. of bins. Reducing to maximum") + args.minCount = len(res) + final_set = df.loc[df["count"] >= args.minCount]["barcode"].to_list() + else: + final_set = df["barcode"].tolist() + + # convert count to log10 + df["count"] = np.log10(df["count"]) + if args.rankPlot: + df["count_rank"] = df["count"].rank(method="min", ascending=False) + fig, ax = plt.subplots() + plt.plot( + "count_rank", + "count", + data=df, + linestyle="none", + marker="o", + color="grey", + markersize=6, + ) + ax.set_xlabel("Barcode Rank", fontsize=15) + ax.set_ylabel("No. of nonzero bins (log10)", fontsize=15) + ax.set_title("Ranked counts (#bins) for detected Barcodes", fontsize=15) + plt.xticks(np.arange(0, max(df["count"]), int(max(df["count_rank"]) / 10)), fontsize=10) + plt.yticks(np.arange(0, 4, 0.1), fontsize=10) + + # Annotation + if args.minCount: + plt.axhline(args.minCount, color="r") + + fig.tight_layout() + plt.savefig( + args.rankPlot, + dpi="figure", + format=None, + metadata=None, + pad_inches=0.2, + facecolor="auto", + edgecolor="auto", + backend=None, + ) + + if args.outFile is None: + of = sys.stdout + else: + of = open(args.outFile, "w") + + for x in final_set: + of.write(str(x) + "\n") + # df.to_csv("~/programs/sincei/test_df.csv") + return 0 diff --git a/bin/scFilterStats b/sincei/scFilterStats.py similarity index 75% rename from bin/scFilterStats rename to sincei/scFilterStats.py index c0d7d1c..9125a1b 100755 --- a/bin/scFilterStats +++ b/sincei/scFilterStats.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- + import argparse import sys import os @@ -12,24 +14,33 @@ import py2bit import pandas as pd import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) + +warnings.simplefilter(action="ignore", category=FutureWarning) ## own functions -scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) -sys.path.append(scriptdir) -from Utilities import * -import ParserCommon +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) +from sincei.Utilities import * +from sincei import ParserCommon + def parseArguments(): filterParser = ParserCommon.filterOptions() - io_args = ParserCommon.inputOutputOptions(opts=['bamfiles', 'barcodes', 'outFile'], - requiredOpts=['barcodes']) - bam_args = ParserCommon.bamOptions(suppress_args=['region'], - default_opts={'binSize': 100000, - 'distanceBetweenBins': 1000000}) + io_args = ParserCommon.inputOutputOptions(opts=["bamfiles", "barcodes", "outFile"], requiredOpts=["barcodes"]) + bam_args = ParserCommon.bamOptions( + suppress_args=["region"], + default_opts={"binSize": 100000, "distanceBetweenBins": 1000000}, + ) filter_args = ParserCommon.filterOptions() - read_args = ParserCommon.readOptions(suppress_args=['minFragmentLength', 'maxFragmentLength', 'extendReads', 'centerReads']) + 
read_args = ParserCommon.readOptions( + suppress_args=[ + "minFragmentLength", + "maxFragmentLength", + "extendReads", + "centerReads", + ] + ) other_args = ParserCommon.otherOptions() parser = argparse.ArgumentParser( @@ -55,8 +66,9 @@ def parseArguments(): The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart. """, - usage='Example usage: scFilterStats.py -b sample1.bam sample2.bam -bc barcodes.txt > log.txt', - add_help=False) + usage="Example usage: scFilterStats.py -b sample1.bam sample2.bam -bc barcodes.txt > log.txt", + add_help=False, + ) return parser @@ -116,7 +128,6 @@ def getFiltered_worker(arglist): nFiltered[b] = 0 total[b] = 0 # This is only used to estimate the percentage affected - for read in fh.fetch(chromUse, start, end): bc = read.get_tag(args.cellTag) # also keep a counter for barcodes not in whitelist? @@ -148,15 +159,18 @@ def getFiltered_worker(arglist): minAlignedFraction[bc] += 1 ## reads in blacklisted regions - if blackList and blackList.findOverlaps(chrom, read.reference_start, read.reference_start + read.infer_query_length(always=False) - 1): + if blackList and blackList.findOverlaps( + chrom, + read.reference_start, + read.reference_start + read.infer_query_length(always=False) - 1, + ): filtered[bc] = 1 blacklisted[bc] += 1 ## Duplicates if args.duplicateFilter: tup = getDupFilterTuple(read, bc, args.duplicateFilter) - if lpos is not None and lpos == read.reference_start \ - and tup in prev_pos: + if lpos is not None and lpos == read.reference_start and tup in prev_pos: filtered[bc] = 1 internalDupes[bc] += 1 if lpos != read.reference_start: @@ -178,7 +192,7 @@ def getFiltered_worker(arglist): ## remove reads that don't pass the motif filter if args.motifFilter: - test = [ checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter ] + test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter] # if none given motif found, return true if not any(test): filtered[bc] = 1 @@ -187,26 +201,26 @@ def getFiltered_worker(arglist): # filterRNAstrand if args.filterRNAstrand: if read.is_paired: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: pass else: filtered[bc] = 1 filterRNAstrand[bc] += 1 - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: pass else: filtered[bc] = 1 filterRNAstrand[bc] += 1 else: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 16 == 16: pass else: filtered[bc] = 1 filterRNAstrand[bc] += 1 - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 16 == 0: pass else: @@ -218,7 +232,21 @@ def getFiltered_worker(arglist): fh.close() # first make a tuple where each entry is a dict of barcodes:value - tup = (total, nFiltered, blacklisted, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand, filterMotifs, filterGC, minAlignedFraction) + tup = ( + total, + nFiltered, + blacklisted, + minMapq, + samFlagInclude, + samFlagExclude, + internalDupes, + externalDupes, + singletons, + filterRNAstrand, + filterMotifs, + filterGC, + minAlignedFraction, + ) # now simplify it merged = {} @@ -226,14 +254,13 @@ def getFiltered_worker(arglist): merged[b] = tuple(merged[b] for merged in tup) # now merged is a dict with each key = 
barcode, values = tuple of stats # Now convert it to array - out = np.stack([ v for k, v in merged.items() ]) + out = np.stack([v for k, v in merged.items()]) # out is an array with row = len(barcode) [384], column = len(stats) [11] o.append(out) return o def main(args=None): - args, rowLabels = ParserCommon.validateInputs(parseArguments().parse_args(args)) if args.outFile is None: @@ -253,35 +280,43 @@ def main(args=None): x.close() # Get the remaining metrics - res = mapReduce([args], - getFiltered_worker, - chrom_sizes, - genomeChunkLength=args.binSize + args.distanceBetweenBins, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + res = mapReduce( + [args], + getFiltered_worker, + chrom_sizes, + genomeChunkLength=args.binSize + args.distanceBetweenBins, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) ## res, should be the list of np.arrays of length (len(barcodes) * 9) ## final output is an array where nrows = bamfiles*barcodes, ncol = No. of stats - final_array = np.asarray(res).sum(axis = 0) + final_array = np.asarray(res).sum(axis=0) ## get final row/col Names (bamnames_barcode) - colLabels = ["Total_sampled","Filtered","Blacklisted", "Low_MAPQ", - "Missing_Flags","Excluded_Flags","Internal_Duplicates", - "Marked_Duplicates","Singletons","Wrong_strand", - "Wrong_motif", "Unwanted_GC_content", "Low_aligned_fraction"] - - final_df = pd.DataFrame(data = np.concatenate(final_array), - index = rowLabels, - columns = colLabels) + colLabels = [ + "Total_sampled", + "Filtered", + "Blacklisted", + "Low_MAPQ", + "Missing_Flags", + "Excluded_Flags", + "Internal_Duplicates", + "Marked_Duplicates", + "Singletons", + "Wrong_strand", + "Wrong_motif", + "Unwanted_GC_content", + "Low_aligned_fraction", + ] + + final_df = pd.DataFrame(data=np.concatenate(final_array), index=rowLabels, columns=colLabels) ## since stats are approximate, present results as % - final_df.iloc[:,1:] = final_df.iloc[:,1:].div(final_df.Total_sampled, axis=0)*100 + final_df.iloc[:, 1:] = final_df.iloc[:, 1:].div(final_df.Total_sampled, axis=0) * 100 if args.outFile is not None: - final_df.to_csv(args.outFile, sep = "\t") + final_df.to_csv(args.outFile, sep="\t") else: print(final_df) return 0 - -if __name__ == "__main__": - main() diff --git a/sincei/scJSD.py b/sincei/scJSD.py new file mode 100755 index 0000000..55d7344 --- /dev/null +++ b/sincei/scJSD.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import time +import sys +import argparse +import numpy as np +import pandas as pd +from deeptools.plotFingerprint import getSyntheticJSD +from deeptools import parserCommon + +from matplotlib.pyplot import plot +import matplotlib + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei")) +# sys.path.append(scriptdir) +## own functions +from sincei import ReadCounter as countR +from sincei import ParserCommon + +## plot KDE of JSD values +from numpy import array, linspace +from sklearn import neighbors + +old_settings = np.seterr(all="ignore") +MAXLEN = 10000000 + + +def get_args(): + parser = argparse.ArgumentParser(add_help=False, conflict_handler="resolve") + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + 
"--numberOfSamples", + "-n", + help="The number of bins that are sampled from the genome, " + "for which the overlapping number of reads is computed. (Default: %(default)s)", + default=1e5, + type=int, + ) + + optional.add_argument( + "--skipZeros", + help="If set, then regions with zero overlapping reads" + "for *all* given BAM files are ignored. This " + "will result in a reduced number of read " + "counts than that specified in --numberOfSamples", + action="store_true", + ) + + return parser + + +def parseArguments(args=None): + io_args = ParserCommon.inputOutputOptions( + opts=["bamfiles", "barcodes", "outFile"], + requiredOpts=["bamfiles", "barcodes", "outFile"], + ) + bam_args = ParserCommon.bamOptions(suppress_args=["region", "distanceBetweenBins"], default_opts={"binSize": 10000}) + read_args = ParserCommon.readOptions(suppress_args=["filterRNAstrand", "extendReads", "centerReads"]) + filter_args = ParserCommon.filterOptions( + suppress_args=[ + "motifFilter", + "genome2bit", + "GCcontentFilter", + "minAlignedFraction", + ] + ) + other_args = ParserCommon.otherOptions() + + parser = argparse.ArgumentParser( + parents=[io_args, bam_args, read_args, filter_args, get_args(), other_args], + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="This tool samples regions in the genome from BAM files " + "and compares the cumulative read coverages for each cell on those regions. " + "to a synthetic cell with poisson distributed reads using Jansson Shannon Distance. " + "Cells with high enrichment of signals show a higher JSD compared to cells whose signal " + "is homogenously distrubuted.", + conflict_handler="resolve", + usage="An example usage is: plotFingerprint -b treatment.bam control.bam " "-plot fingerprint.png", + add_help=False, + ) + + return parser + + +def main(args=None): + args = ParserCommon.process_args(parseArguments().parse_args(args)) + + ## read the barcode file + with open(args.barcodes, "r") as f: + barcodes = f.read().splitlines() + f.close() + + ## Count + c = countR.CountReadsPerBin( + args.bamfiles, + binLength=args.binSize, + numberOfSamples=args.numberOfSamples, + barcodes=barcodes, + cellTag=args.cellTag, + motifFilter=None, + genome2bit=None, + GCcontentFilter=None, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=None, + bedFile=None, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + duplicateFilter=args.duplicateFilter, + center_read=False, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + zerosToNans=False, + sumCoveragePerBin=True, + ) + + num_reads_per_bin, _ = c.run(allArgs=None) + + if num_reads_per_bin.sum() == 0: + import sys + + sys.stderr.write( + "\nNo reads were found in {} regions sampled. 
Check that the\n"
+            "min mapping quality is not overly high and that the \n"
+            "chromosome names between bam files are consistent.\n"
+            "For small genomes, decrease the --numberOfSamples.\n"
+            "\n".format(num_reads_per_bin.shape[0])
+        )
+        exit(1)
+
+    if args.skipZeros:
+        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)
+
+    total = len(num_reads_per_bin[:, 0])
+    x = np.arange(total).astype("float") / total  # normalize from 0 to 1
+
+    jsd_all = []
+    for i in range(0, num_reads_per_bin.shape[1]):
+        jsd_all.append(getSyntheticJSD(num_reads_per_bin[:, i]))
+
+    ## create colnames (sampleLabel+barcode)
+    newlabels = ["{}_{}".format(a, b) for a in args.labels for b in barcodes]
+    df = pd.DataFrame({"cell": newlabels, "jsd": jsd_all})
+    df.to_csv(args.outFile, sep="\t")
diff --git a/bin/scPlotRegion b/sincei/scPlotRegion
similarity index 58%
rename from bin/scPlotRegion
rename to sincei/scPlotRegion
index 7b81800..d1ae07a 100755
--- a/bin/scPlotRegion
+++ b/sincei/scPlotRegion
@@ -1,12 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import sys
 import warnings
 
 warnings.simplefilter(action="ignore", category=RuntimeWarning)
 warnings.simplefilter(action="ignore", category=PendingDeprecationWarning)
 
-from pygenometracks import plotTracks
-
+#from pygenometracks import plotTracks
 
 def main(args=None):
-    plotTracks.main(args)
+    #plotTracks.main(args)
+    sys.stderr.write("Not implemented yet\n")
+
+#if __name__ == "__main__":
+#    main()
diff --git a/bin/sincei b/sincei/sincei.py
similarity index 85%
rename from bin/sincei
rename to sincei/sincei.py
index 8d2eaf4..007cb2a 100755
--- a/bin/sincei
+++ b/sincei/sincei.py
@@ -4,10 +4,11 @@ import argparse
 import sys
 import os
 
+
 ## own functions
-scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
-sys.path.append(scriptdir)
-from _version import __version__
+# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
+# sys.path.append(scriptdir)
+from sincei._version import __version__
 
 
 def parse_arguments(args=None):
@@ -29,12 +30,15 @@ def parse_arguments(args=None):
     scCountQC            Perform quality control and filter the output of scCountReads.
     scCombineCounts      [WIP] Concatenate/merge the counts from different samples/batches or modalities
     scClusterCells       Perform dimensionality reduction and clustering on the output of scCountReads.
-    scFindMarkers        [WIP] Find marker genes per group, given the output of scCountReads and a user-defined group.
     scBulkCoverage       Get pseudo-bulk coverage per group using a user-supplied cell->group mapping (output of scClusterCells).
+    scBAMops             Modify a BAM file to group cells (using cell barcodes), or filter/shift mapped reads.
+    scFindMarkers        [WIP] Find marker genes per group, given the output of scCountReads and a user-defined group.
     scFeaturePlot        [WIP] Plot the counts for a given feature on a UMAP or on a (IGV-style) genomic-track. 
-""".format(__version__)) - +""".format( + __version__ + ), + ) return parser @@ -49,6 +53,3 @@ def main(args=None): if args is None and len(sys.argv) == 1: args = ["--help"] process_args(args) - -if __name__ == "__main__": - main() diff --git a/sincei/test/__init__.py b/sincei/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sincei/test/__pycache__/__init__.cpython-310.pyc b/sincei/test/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..63eded1 Binary files /dev/null and b/sincei/test/__pycache__/__init__.cpython-310.pyc differ diff --git a/sincei/test/__pycache__/test_tools.cpython-310-pytest-7.2.1.pyc b/sincei/test/__pycache__/test_tools.cpython-310-pytest-7.2.1.pyc new file mode 100644 index 0000000..f295900 Binary files /dev/null and b/sincei/test/__pycache__/test_tools.cpython-310-pytest-7.2.1.pyc differ diff --git a/sincei/test/test_tools.py b/sincei/test/test_tools.py new file mode 100644 index 0000000..91fb0ed --- /dev/null +++ b/sincei/test/test_tools.py @@ -0,0 +1,16 @@ +import subprocess +import os + +ROOT = os.path.dirname(os.path.abspath(__file__)) + "/../../bin" + + +def test_tools(): + """ + Checks everything that is in /bin/ + and tries to run it + """ + if os.path.isdir(ROOT): + for _file in os.listdir(ROOT): + print(_file) + if os.path.isfile(os.path.join(ROOT, _file)): + subprocess.check_call("{}/{} -h".format(ROOT, _file).split()) diff --git a/testdata/test_barcodes.txt b/testdata/test_barcodes.txt new file mode 100644 index 0000000..2ff7435 --- /dev/null +++ b/testdata/test_barcodes.txt @@ -0,0 +1,14 @@ +ACGGTAAT +AGATATTG +ATATAACT +ATGTCGTG +ATTCTCGA +CGTGGTAA +GCGAGCAT +GCGCTGAC +GGACACCG +GGAGTCAT +GTCAAGCA +GTGCCGCG +TAGACTTG +TCTTCTGT