-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(derive): new readlen subcommand
- Loading branch information
Showing
6 changed files
with
220 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ | |
pub mod command; | ||
pub mod instrument; | ||
pub mod readlen; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
//! Functionality relating to the `ngs derive readlen` subcommand itself. | ||
use std::collections::HashMap; | ||
use std::path::PathBuf; | ||
|
||
use clap::Args; | ||
use tracing::info; | ||
|
||
use crate::derive::readlen::compute; | ||
use crate::utils::formats::bam::ParsedBAMFile; | ||
use crate::utils::formats::utils::IndexCheck; | ||
|
||
/// Utility method to parse the Majority Vote Cutoff passed in on the command line and | ||
/// ensure the cutoff is within the range [0.0, 1.0]. | ||
pub fn cutoff_in_range(cutoff_raw: &str) -> Result<f64, String> { | ||
let cutoff: f64 = cutoff_raw | ||
.parse() | ||
.map_err(|_| format!("{} isn't a float", cutoff_raw))?; | ||
|
||
match (0.0..=1.0).contains(&cutoff) { | ||
true => Ok(cutoff), | ||
false => Err(String::from("Error rate must be between 0.0 and 1.0")), | ||
} | ||
} | ||
|
||
/// Clap arguments for the `ngs derive readlen` subcommand. | ||
#[derive(Args)] | ||
pub struct DeriveReadlenArgs { | ||
// Source BAM. | ||
#[arg(value_name = "BAM")] | ||
src: PathBuf, | ||
|
||
/// Only examine the first n records in the file. | ||
#[arg(short, long, value_name = "USIZE")] | ||
num_records: Option<usize>, | ||
|
||
/// Majority vote cutoff value as a fraction between [0.0, 1.0]. | ||
#[arg(short, long, value_name = "F64", default_value = "0.7")] | ||
#[arg(value_parser = cutoff_in_range)] | ||
majority_vote_cutoff: Option<f64>, | ||
} | ||
|
||
/// Main function for the `ngs derive readlen` subcommand. | ||
pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> { | ||
let mut read_lengths = HashMap::new(); | ||
|
||
info!("Starting derive readlen subcommand."); | ||
|
||
let ParsedBAMFile { | ||
mut reader, header, .. | ||
} = crate::utils::formats::bam::open_and_parse(args.src, IndexCheck::Full)?; | ||
|
||
// (1) Collect read lengths from reads within the | ||
// file. Support for sampling only a portion of the reads is provided. | ||
let mut samples = 0; | ||
let mut sample_max = 0; | ||
|
||
if let Some(s) = args.num_records { | ||
sample_max = s; | ||
} | ||
|
||
for result in reader.records(&header.parsed) { | ||
let record = result?; | ||
let len = record.sequence().len(); | ||
|
||
read_lengths.entry(len).and_modify(|e| *e += 1).or_insert(1); | ||
|
||
if sample_max > 0 { | ||
samples += 1; | ||
if samples > sample_max { | ||
break; | ||
} | ||
} | ||
} | ||
|
||
// (2) Derive the consensus read length based on the read lengths gathered. | ||
let result = compute::predict(read_lengths, args.majority_vote_cutoff.unwrap()).unwrap(); | ||
|
||
// (3) Print the output to stdout as JSON (more support for different output | ||
// types may be added in the future, but for now, only JSON). | ||
let output = serde_json::to_string_pretty(&result).unwrap(); | ||
print!("{}", output); | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
//! Supporting functionality for the `ngs derive instrument` subcommand. | ||
pub mod compute; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
//! Module holding the logic for computing the consensus read length. | ||
use anyhow::bail; | ||
use serde::Serialize; | ||
use std::collections::HashMap; | ||
|
||
/// Struct holding the final results for an `ngs derive readlen` subcommand | ||
/// call. | ||
#[derive(Debug, Serialize)] | ||
pub struct DerivedReadlenResult { | ||
/// Whether or not the `ngs derive readlen` subcommand succeeded. | ||
pub succeeded: bool, | ||
|
||
/// The concsensus read length, if available. | ||
pub consensus_read_length: Option<usize>, | ||
|
||
/// The majority vote percentage of the consensus read length, if available. | ||
pub majority_pct_detected: f64, | ||
|
||
/// Status of the evidence that supports (or does not support) this | ||
/// read length, if available. | ||
pub evidence: Vec<(usize, i32)>, | ||
} | ||
|
||
impl DerivedReadlenResult { | ||
/// Creates a new [`DerivedReadlenResult`]. | ||
pub fn new( | ||
succeeded: bool, | ||
consensus_read_length: Option<usize>, | ||
majority_pct_detected: f64, | ||
evidence: Vec<(usize, i32)>, | ||
) -> Self { | ||
DerivedReadlenResult { | ||
succeeded, | ||
consensus_read_length, | ||
majority_pct_detected, | ||
evidence, | ||
} | ||
} | ||
} | ||
|
||
/// Main method to evaluate the collected read lengths and | ||
/// return a result for the consensus read length. This may fail, and the | ||
/// resulting [`DerivedReadlenResult`] should be evaluated accordingly. | ||
pub fn predict( | ||
read_lengths: HashMap<usize, i32>, | ||
majority_vote_cutoff: f64, | ||
) -> Result<DerivedReadlenResult, anyhow::Error> { | ||
let mut num_records = 0; | ||
let mut max_count = 0; | ||
let mut max_read_length = 0; | ||
|
||
for (read_length, count) in &read_lengths { | ||
num_records += *count; | ||
if *read_length > max_read_length { | ||
max_read_length = *read_length; | ||
max_count = *count; | ||
} | ||
} | ||
|
||
if num_records == 0 { | ||
bail!("No read lengths were detected in the file."); | ||
} | ||
|
||
let consensus_read_length = max_read_length; | ||
let majority_detected = max_count as f64 / num_records as f64; | ||
|
||
// Sort the read lengths by their key for output. | ||
let mut read_lengths: Vec<(usize, i32)> = read_lengths.into_iter().collect(); | ||
read_lengths.sort_by(|a, b| b.0.cmp(&a.0)); | ||
|
||
let mut result = | ||
DerivedReadlenResult::new(false, None, majority_detected * 100.0, read_lengths); | ||
|
||
if majority_detected >= majority_vote_cutoff { | ||
result.succeeded = true; | ||
result.consensus_read_length = Some(consensus_read_length); | ||
result.majority_pct_detected = majority_detected * 100.0; | ||
} | ||
|
||
Ok(result) | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn test_derive_readlen_from_empty_hashmap() { | ||
let read_lengths = HashMap::new(); | ||
let result = predict(read_lengths, 0.7); | ||
assert!(result.is_err()); | ||
} | ||
|
||
#[test] | ||
fn test_derive_readlen_when_all_readlengths_equal() { | ||
let read_lengths = HashMap::from([(100, 10)]); | ||
let result = predict(read_lengths, 1.0).unwrap(); | ||
assert!(result.succeeded); | ||
assert_eq!(result.consensus_read_length, Some(100)); | ||
assert_eq!(result.majority_pct_detected, 100.0); | ||
assert_eq!(result.evidence, Vec::from([(100, 10)])); | ||
} | ||
|
||
#[test] | ||
fn test_derive_readlen_success_when_not_all_readlengths_equal() { | ||
let read_lengths = HashMap::from([(101, 1000), (100, 5), (99, 5)]); | ||
let result = predict(read_lengths, 0.7).unwrap(); | ||
assert!(result.succeeded); | ||
assert_eq!(result.consensus_read_length, Some(101)); | ||
assert!(result.majority_pct_detected > 99.0); | ||
assert_eq!(result.evidence, Vec::from([(101, 1000), (100, 5), (99, 5)])); | ||
} | ||
|
||
#[test] | ||
fn test_derive_readlen_fail_when_not_all_readlengths_equal() { | ||
let read_lengths = HashMap::from([(101, 5), (100, 1000), (99, 5)]); | ||
let result = predict(read_lengths, 0.7).unwrap(); | ||
assert!(!result.succeeded); | ||
assert_eq!(result.consensus_read_length, None); | ||
assert!(result.majority_pct_detected < 1.0); | ||
assert_eq!(result.evidence, Vec::from([(101, 5), (100, 1000), (99, 5)])); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters