Skip to content

Commit

Permalink
feat(derive): new readlen subcommand
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Dec 3, 2023
1 parent b70da70 commit 0c01978
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/derive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
pub mod command;
pub mod instrument;
pub mod readlen;
4 changes: 4 additions & 0 deletions src/derive/command.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Functionality related to the `ngs derive` subcommand itself.
pub mod instrument;
pub mod readlen;

use clap::Args;
use clap::Subcommand;
Expand All @@ -22,4 +23,7 @@ pub struct DeriveArgs {
pub enum DeriveSubcommand {
/// Derives the instrument used to produce the file.
Instrument(self::instrument::DeriveInstrumentArgs),

/// Derives the read length of the file.
Readlen(self::readlen::DeriveReadlenArgs),
}
85 changes: 85 additions & 0 deletions src/derive/command/readlen.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
//! Functionality relating to the `ngs derive readlen` subcommand itself.
use std::collections::HashMap;
use std::path::PathBuf;

use clap::Args;
use tracing::info;

use crate::derive::readlen::compute;
use crate::utils::formats::bam::ParsedBAMFile;
use crate::utils::formats::utils::IndexCheck;

/// Utility method to parse the Majority Vote Cutoff passed in on the command line and
/// ensure the cutoff is within the range [0.0, 1.0].
pub fn cutoff_in_range(cutoff_raw: &str) -> Result<f64, String> {
let cutoff: f64 = cutoff_raw
.parse()
.map_err(|_| format!("{} isn't a float", cutoff_raw))?;

match (0.0..=1.0).contains(&cutoff) {
true => Ok(cutoff),
false => Err(String::from("Error rate must be between 0.0 and 1.0")),
}
}

/// Clap arguments for the `ngs derive readlen` subcommand.
#[derive(Args)]
pub struct DeriveReadlenArgs {
// Source BAM.
#[arg(value_name = "BAM")]
src: PathBuf,

/// Only examine the first n records in the file.
#[arg(short, long, value_name = "USIZE")]
num_records: Option<usize>,

/// Majority vote cutoff value as a fraction between [0.0, 1.0].
#[arg(short, long, value_name = "F64", default_value = "0.7")]
#[arg(value_parser = cutoff_in_range)]
majority_vote_cutoff: Option<f64>,
}

/// Main function for the `ngs derive readlen` subcommand.
pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> {
let mut read_lengths = HashMap::new();

info!("Starting derive readlen subcommand.");

let ParsedBAMFile {
mut reader, header, ..
} = crate::utils::formats::bam::open_and_parse(args.src, IndexCheck::Full)?;

// (1) Collect read lengths from reads within the
// file. Support for sampling only a portion of the reads is provided.
let mut samples = 0;
let mut sample_max = 0;

if let Some(s) = args.num_records {
sample_max = s;
}

for result in reader.records(&header.parsed) {
let record = result?;
let len = record.sequence().len();

read_lengths.entry(len).and_modify(|e| *e += 1).or_insert(1);

if sample_max > 0 {
samples += 1;
if samples > sample_max {
break;
}
}
}

// (2) Derive the consensus read length based on the read lengths gathered.
let result = compute::predict(read_lengths, args.majority_vote_cutoff.unwrap()).unwrap();

// (3) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
print!("{}", output);

Ok(())
}
3 changes: 3 additions & 0 deletions src/derive/readlen.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Supporting functionality for the `ngs derive instrument` subcommand.
pub mod compute;
124 changes: 124 additions & 0 deletions src/derive/readlen/compute.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
//! Module holding the logic for computing the consensus read length.
use anyhow::bail;
use serde::Serialize;
use std::collections::HashMap;

/// Struct holding the final results for an `ngs derive readlen` subcommand
/// call.
#[derive(Debug, Serialize)]
pub struct DerivedReadlenResult {
/// Whether or not the `ngs derive readlen` subcommand succeeded.
pub succeeded: bool,

/// The concsensus read length, if available.
pub consensus_read_length: Option<usize>,

/// The majority vote percentage of the consensus read length, if available.
pub majority_pct_detected: f64,

/// Status of the evidence that supports (or does not support) this
/// read length, if available.
pub evidence: Vec<(usize, i32)>,
}

impl DerivedReadlenResult {
/// Creates a new [`DerivedReadlenResult`].
pub fn new(
succeeded: bool,
consensus_read_length: Option<usize>,
majority_pct_detected: f64,
evidence: Vec<(usize, i32)>,
) -> Self {
DerivedReadlenResult {
succeeded,
consensus_read_length,
majority_pct_detected,
evidence,
}
}
}

/// Main method to evaluate the collected read lengths and
/// return a result for the consensus read length. This may fail, and the
/// resulting [`DerivedReadlenResult`] should be evaluated accordingly.
pub fn predict(
read_lengths: HashMap<usize, i32>,
majority_vote_cutoff: f64,
) -> Result<DerivedReadlenResult, anyhow::Error> {
let mut num_records = 0;
let mut max_count = 0;
let mut max_read_length = 0;

for (read_length, count) in &read_lengths {
num_records += *count;
if *read_length > max_read_length {
max_read_length = *read_length;
max_count = *count;
}
}

if num_records == 0 {
bail!("No read lengths were detected in the file.");
}

let consensus_read_length = max_read_length;
let majority_detected = max_count as f64 / num_records as f64;

// Sort the read lengths by their key for output.
let mut read_lengths: Vec<(usize, i32)> = read_lengths.into_iter().collect();
read_lengths.sort_by(|a, b| b.0.cmp(&a.0));

let mut result =
DerivedReadlenResult::new(false, None, majority_detected * 100.0, read_lengths);

if majority_detected >= majority_vote_cutoff {
result.succeeded = true;
result.consensus_read_length = Some(consensus_read_length);
result.majority_pct_detected = majority_detected * 100.0;
}

Ok(result)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_derive_readlen_from_empty_hashmap() {
let read_lengths = HashMap::new();
let result = predict(read_lengths, 0.7);
assert!(result.is_err());
}

#[test]
fn test_derive_readlen_when_all_readlengths_equal() {
let read_lengths = HashMap::from([(100, 10)]);
let result = predict(read_lengths, 1.0).unwrap();
assert!(result.succeeded);
assert_eq!(result.consensus_read_length, Some(100));
assert_eq!(result.majority_pct_detected, 100.0);
assert_eq!(result.evidence, Vec::from([(100, 10)]));
}

#[test]
fn test_derive_readlen_success_when_not_all_readlengths_equal() {
let read_lengths = HashMap::from([(101, 1000), (100, 5), (99, 5)]);
let result = predict(read_lengths, 0.7).unwrap();
assert!(result.succeeded);
assert_eq!(result.consensus_read_length, Some(101));
assert!(result.majority_pct_detected > 99.0);
assert_eq!(result.evidence, Vec::from([(101, 1000), (100, 5), (99, 5)]));
}

#[test]
fn test_derive_readlen_fail_when_not_all_readlengths_equal() {
let read_lengths = HashMap::from([(101, 5), (100, 1000), (99, 5)]);
let result = predict(read_lengths, 0.7).unwrap();
assert!(!result.succeeded);
assert_eq!(result.consensus_read_length, None);
assert!(result.majority_pct_detected < 1.0);
assert_eq!(result.evidence, Vec::from([(101, 5), (100, 1000), (99, 5)]));
}
}
3 changes: 3 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ fn main() -> anyhow::Result<()> {
derive::command::DeriveSubcommand::Instrument(args) => {
derive::command::instrument::derive(args)?
}
derive::command::DeriveSubcommand::Readlen(args) => {
derive::command::readlen::derive(args)?
}
},
Subcommands::Generate(args) => generate::command::generate(args)?,
Subcommands::Index(args) => index::command::index(args)?,
Expand Down

0 comments on commit 0c01978

Please sign in to comment.