feat(derive): new readlen subcommand

stjude-rust-labs · Dec 3, 2023 · 0c01978 · 0c01978
1 parent b70da70
commit 0c01978
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 0 deletions.
diff --git a/src/derive.rs b/src/derive.rs
@@ -2,3 +2,4 @@
 
 pub mod command;
 pub mod instrument;
+pub mod readlen;
diff --git a/src/derive/command.rs b/src/derive/command.rs
@@ -1,6 +1,7 @@
 //! Functionality related to the `ngs derive` subcommand itself.
 
 pub mod instrument;
+pub mod readlen;
 
 use clap::Args;
 use clap::Subcommand;
@@ -22,4 +23,7 @@ pub struct DeriveArgs {
 pub enum DeriveSubcommand {
     /// Derives the instrument used to produce the file.
     Instrument(self::instrument::DeriveInstrumentArgs),
+
+    /// Derives the read length of the file.
+    Readlen(self::readlen::DeriveReadlenArgs),
 }
diff --git a/src/derive/command/readlen.rs b/src/derive/command/readlen.rs
@@ -0,0 +1,85 @@
+//! Functionality relating to the `ngs derive readlen` subcommand itself.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use clap::Args;
+use tracing::info;
+
+use crate::derive::readlen::compute;
+use crate::utils::formats::bam::ParsedBAMFile;
+use crate::utils::formats::utils::IndexCheck;
+
+/// Utility method to parse the Majority Vote Cutoff passed in on the command line and
+/// ensure the cutoff is within the range [0.0, 1.0].
+pub fn cutoff_in_range(cutoff_raw: &str) -> Result<f64, String> {
+    let cutoff: f64 = cutoff_raw
+        .parse()
+        .map_err(|_| format!("{} isn't a float", cutoff_raw))?;
+
+    match (0.0..=1.0).contains(&cutoff) {
+        true => Ok(cutoff),
+        false => Err(String::from("Error rate must be between 0.0 and 1.0")),
+    }
+}
+
+/// Clap arguments for the `ngs derive readlen` subcommand.
+#[derive(Args)]
+pub struct DeriveReadlenArgs {
+    // Source BAM.
+    #[arg(value_name = "BAM")]
+    src: PathBuf,
+
+    /// Only examine the first n records in the file.
+    #[arg(short, long, value_name = "USIZE")]
+    num_records: Option<usize>,
+
+    /// Majority vote cutoff value as a fraction between [0.0, 1.0].
+    #[arg(short, long, value_name = "F64", default_value = "0.7")]
+    #[arg(value_parser = cutoff_in_range)]
+    majority_vote_cutoff: Option<f64>,
+}
+
+/// Main function for the `ngs derive readlen` subcommand.
+pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> {
+    let mut read_lengths = HashMap::new();
+
+    info!("Starting derive readlen subcommand.");
+
+    let ParsedBAMFile {
+        mut reader, header, ..
+    } = crate::utils::formats::bam::open_and_parse(args.src, IndexCheck::Full)?;
+
+    // (1) Collect read lengths from reads within the
+    // file. Support for sampling only a portion of the reads is provided.
+    let mut samples = 0;
+    let mut sample_max = 0;
+
+    if let Some(s) = args.num_records {
+        sample_max = s;
+    }
+
+    for result in reader.records(&header.parsed) {
+        let record = result?;
+        let len = record.sequence().len();
+
+        read_lengths.entry(len).and_modify(|e| *e += 1).or_insert(1);
+
+        if sample_max > 0 {
+            samples += 1;
+            if samples > sample_max {
+                break;
+            }
+        }
+    }
+
+    // (2) Derive the consensus read length based on the read lengths gathered.
+    let result = compute::predict(read_lengths, args.majority_vote_cutoff.unwrap()).unwrap();
+
+    // (3) Print the output to stdout as JSON (more support for different output
+    // types may be added in the future, but for now, only JSON).
+    let output = serde_json::to_string_pretty(&result).unwrap();
+    print!("{}", output);
+
+    Ok(())
+}
diff --git a/src/derive/readlen.rs b/src/derive/readlen.rs
@@ -0,0 +1,3 @@
+//! Supporting functionality for the `ngs derive instrument` subcommand.
+
+pub mod compute;
diff --git a/src/derive/readlen/compute.rs b/src/derive/readlen/compute.rs
@@ -0,0 +1,124 @@
+//! Module holding the logic for computing the consensus read length.
+
+use anyhow::bail;
+use serde::Serialize;
+use std::collections::HashMap;
+
+/// Struct holding the final results for an `ngs derive readlen` subcommand
+/// call.
+#[derive(Debug, Serialize)]
+pub struct DerivedReadlenResult {
+    /// Whether or not the `ngs derive readlen` subcommand succeeded.
+    pub succeeded: bool,
+
+    /// The concsensus read length, if available.
+    pub consensus_read_length: Option<usize>,
+
+    /// The majority vote percentage of the consensus read length, if available.
+    pub majority_pct_detected: f64,
+
+    /// Status of the evidence that supports (or does not support) this
+    /// read length, if available.
+    pub evidence: Vec<(usize, i32)>,
+}
+
+impl DerivedReadlenResult {
+    /// Creates a new [`DerivedReadlenResult`].
+    pub fn new(
+        succeeded: bool,
+        consensus_read_length: Option<usize>,
+        majority_pct_detected: f64,
+        evidence: Vec<(usize, i32)>,
+    ) -> Self {
+        DerivedReadlenResult {
+            succeeded,
+            consensus_read_length,
+            majority_pct_detected,
+            evidence,
+        }
+    }
+}
+
+/// Main method to evaluate the collected read lengths and
+/// return a result for the consensus read length. This may fail, and the
+/// resulting [`DerivedReadlenResult`] should be evaluated accordingly.
+pub fn predict(
+    read_lengths: HashMap<usize, i32>,
+    majority_vote_cutoff: f64,
+) -> Result<DerivedReadlenResult, anyhow::Error> {
+    let mut num_records = 0;
+    let mut max_count = 0;
+    let mut max_read_length = 0;
+
+    for (read_length, count) in &read_lengths {
+        num_records += *count;
+        if *read_length > max_read_length {
+            max_read_length = *read_length;
+            max_count = *count;
+        }
+    }
+
+    if num_records == 0 {
+        bail!("No read lengths were detected in the file.");
+    }
+
+    let consensus_read_length = max_read_length;
+    let majority_detected = max_count as f64 / num_records as f64;
+
+    // Sort the read lengths by their key for output.
+    let mut read_lengths: Vec<(usize, i32)> = read_lengths.into_iter().collect();
+    read_lengths.sort_by(|a, b| b.0.cmp(&a.0));
+
+    let mut result =
+        DerivedReadlenResult::new(false, None, majority_detected * 100.0, read_lengths);
+
+    if majority_detected >= majority_vote_cutoff {
+        result.succeeded = true;
+        result.consensus_read_length = Some(consensus_read_length);
+        result.majority_pct_detected = majority_detected * 100.0;
+    }
+
+    Ok(result)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_derive_readlen_from_empty_hashmap() {
+        let read_lengths = HashMap::new();
+        let result = predict(read_lengths, 0.7);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_derive_readlen_when_all_readlengths_equal() {
+        let read_lengths = HashMap::from([(100, 10)]);
+        let result = predict(read_lengths, 1.0).unwrap();
+        assert!(result.succeeded);
+        assert_eq!(result.consensus_read_length, Some(100));
+        assert_eq!(result.majority_pct_detected, 100.0);
+        assert_eq!(result.evidence, Vec::from([(100, 10)]));
+    }
+
+    #[test]
+    fn test_derive_readlen_success_when_not_all_readlengths_equal() {
+        let read_lengths = HashMap::from([(101, 1000), (100, 5), (99, 5)]);
+        let result = predict(read_lengths, 0.7).unwrap();
+        assert!(result.succeeded);
+        assert_eq!(result.consensus_read_length, Some(101));
+        assert!(result.majority_pct_detected > 99.0);
+        assert_eq!(result.evidence, Vec::from([(101, 1000), (100, 5), (99, 5)]));
+    }
+
+    #[test]
+    fn test_derive_readlen_fail_when_not_all_readlengths_equal() {
+        let read_lengths = HashMap::from([(101, 5), (100, 1000), (99, 5)]);
+        let result = predict(read_lengths, 0.7).unwrap();
+        assert!(!result.succeeded);
+        assert_eq!(result.consensus_read_length, None);
+        assert!(result.majority_pct_detected < 1.0);
+        assert_eq!(result.evidence, Vec::from([(101, 5), (100, 1000), (99, 5)]));
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -92,6 +92,9 @@ fn main() -> anyhow::Result<()> {
             derive::command::DeriveSubcommand::Instrument(args) => {
                 derive::command::instrument::derive(args)?
             }
+            derive::command::DeriveSubcommand::Readlen(args) => {
+                derive::command::readlen::derive(args)?
+            }
         },
         Subcommands::Generate(args) => generate::command::generate(args)?,
         Subcommands::Index(args) => index::command::index(args)?,
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@

		pub mod command;
		pub mod instrument;
		pub mod readlen;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		//! Supporting functionality for the `ngs derive instrument` subcommand.

		pub mod compute;