Merge branch 'dev' into 'master'

Merge for release 0.5.4 See merge request tron/easyquant!26
TRON-Bioinformatics · Jul 1, 2024 · 96da140 · 96da140
2 parents cb04b14 + d8bb1d3
commit 96da140
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -212,4 +212,17 @@ Hint: This is just an example to illustrate the design of the table. Results may
  - **within_interval** reads mapping fully onto the interval
  - **coverage_perc** percentual coverage of the interval by aligned reads
  - **coverage_mean** average coverage per base for the interval (fold coverage)
- - **coverage_median** median coverage per base for the interval
+ - **coverage_median** median coverage per base for the interval
+
+
+### Things to consider
+
+When choosing the aligner you have to take into account that there are several differences among them:
+
+- STAR: 
+  - end-to-end alignment with no soft-clipping
+  - very slow for small reference sequences
+  - several available alignment parameters to optimize your results, which can be used while starting the pipeline
+- bowtie2: 
+  - end-to-end alignment might lead to insertions where the context sequence starts/ends
+  - faster than STAR for short reference sequences (index creation parameters are calculated automatically)
diff --git a/environment.yml b/environment.yml
@@ -4,8 +4,8 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python=3.7
-  - bowtie2=2.4.4
+  - python=3.8
+  - bowtie2=2.5.3
   - bwa=0.7.17
   - samtools=1.9
   - pysam=0.21.0

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bp_quant"
-version = "0.5.3"
+version = "0.5.4"
 authors = [
   {name = "TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz", email = "[email protected]"},
 ]

diff --git a/src/bp_quant/requantify.py b/src/bp_quant/requantify.py
@@ -15,6 +15,15 @@
 logger = logging.getLogger(__name__)
 
 
+def get_aligner(bam_file):
+    """
+    Uses pysam to detect the aligner used to create the input BAM file
+    """
+    header_dict = pysam.AlignmentFile(bam_file, "rb").header.to_dict()
+    aligner = header_dict["PG"][0]["ID"].lower()
+    return aligner
+
+
 def perc_true(lst):
     n = len(lst)
     num_true = sum(1 for val in lst if val > 0)
@@ -117,6 +126,7 @@ class Quantification(object):
     def __init__(self, seq_table_file, bam_file, output_path, bp_dist, allow_mismatches, interval_mode):
         self.seq_table_file = seq_table_file
         self.bam_file = bam_file
+        self.aligner = get_aligner(bam_file)
         self.output_path = os.path.abspath(output_path)
         self.quant_file = os.path.join(output_path, "quantification.tsv")
         self.reads_file = os.path.join(output_path, "read_info.tsv.gz")
@@ -186,9 +196,7 @@ def parse_alignment(self):
         missing_refs = {}
         r1 = None
         r2 = None
-
         for read in bam.fetch():
-
             if read.flag > 511:
                 continue
             # Handle missing reference sequences which occur in SAM/BAM
@@ -197,10 +205,31 @@ def parse_alignment(self):
                 if read.reference_name not in missing_refs:
                     missing_refs[read.reference_name] = 0
                 missing_refs[read.reference_name] += 1
-            if not r1:
-                r1 = read
-            elif r1 and not r2:
-                r2 = read
+            # Bowtie reports alignments in the following order,
+            # therefore the following if statement prevents
+            # mismatching issues for read pairs where
+            # there are secondary alignments:
+            # R1
+            # R1_secondary_1
+            # R1_secondary_2
+            # R1_secondary_3
+            # R2
+            # ...
+            # Secondary alignments will be excluded from the matching process
+            # and unmapped mates will be simulated as they are not reported
+            # within the bowtie2 alignment
+            if read.is_secondary and self.aligner == "bowtie2":
+                r1_sec = read
+                r2_sec = pysam.AlignedSegment(bam.header)
+                r2_sec.reference_name = r1_sec.reference_name
+                r2_sec.query_name = r1_sec.query_name
+                r2_sec.is_unmapped = True
+                self.quantify(r1_sec, r2_sec)
+            else:
+                if not r1:
+                    r1 = read
+                elif r1 and not r2:
+                    r2 = read
 
 
             if r1 and r2:
@@ -210,11 +239,13 @@ def parse_alignment(self):
                 # and therefore needs more relaxed clause
                 if not (r1.is_unmapped and r2.is_unmapped or
                     r1.query_name != r2.query_name):
-
                     if (r1.reference_name == r2.reference_name or
                         r1.is_unmapped and not r2.is_unmapped or
                         not r1.is_unmapped and r2.is_unmapped):
                         self.quantify(r1, r2)
+                elif r1.query_name != r2.query_name:
+                    print("MISMATCHING QUERY NAME!", r1, r2)
+                    break
                 r1 = None
                 r2 = None
         logger.info("Quantification done.")

diff --git a/src/bp_quant/version.py b/src/bp_quant/version.py
@@ -1,2 +1,2 @@
-version_info = (0, 5, 3)
+version_info = (0, 5, 4)
 version = '.'.join(str(c) for c in version_info)
diff --git a/tests/tests.py b/tests/tests.py
@@ -5,11 +5,16 @@
 
 SEQ_TABLE_FILE = os.path.join("example_data", "CLDN18_Context_seq.csv")
 
-from bp_quant.requantify import perc_true, get_seq_to_pos, classify_read
+from bp_quant.requantify import get_aligner, perc_true, get_seq_to_pos, classify_read
 
 
 class TestRequantify(unittest.TestCase):
 
+    def test_get_aligner(self):
+        bam_file = os.path.join("example_data", "example_rna-seq.bam")
+        self.assertEqual(get_aligner(bam_file), "star")
+
+
     def test_perc_true(self):
         data = [4, 3, 0, 5, 9, 0, 1, 0, 0, 0]
         self.assertEqual(perc_true(data), 0.5)