Thomas hai 3 meses
pai
achega
671e2ea3db

+ 3 - 7
src/collection/bam.rs

@@ -7,11 +7,9 @@ use std::{
 
 use anyhow::{anyhow, Context};
 use chrono::{DateTime, Utc};
-use dashmap::DashMap;
 use glob::glob;
 use log::{debug, info, warn};
 use rand::{rng, Rng};
-use rayon::prelude::*;
 use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
 use serde::{Deserialize, Serialize};
 
@@ -24,11 +22,9 @@ pub struct WGSBam {
     pub id: String,
     pub time_point: String,
     pub reference_genome: String,
-    // pub bam_type: BamType,
     pub path: PathBuf,
     pub modified: DateTime<Utc>,
     pub bam_stats: WGSBamStats,
-    // pub cramino: Option<CraminoRes>,
     pub composition: Vec<(String, String, f64)>, // acquisition id, fn
 }
 
@@ -661,7 +657,7 @@ pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Ve
 /// - pass vendor quality checks (`!BAM_FQCFAIL`),
 /// - not marked as duplicate (`!BAM_FDUP`),
 /// - and with mapping quality ≥ `bam_min_mapq`,
-/// are counted as mapped reads.
+///   are counted as mapped reads.
 ///
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct WGSBamStats {
@@ -766,7 +762,7 @@ impl WGSBamStats {
             mapped_lengths.push(len);
             lengths_by_tid.entry(r.tid()).or_default().push(len);
 
-            if n_reads % 500_000 == 0 {
+            if n_reads.is_multiple_of(500_000) {
                 info!("{case_id} {time}: processed {n_reads} mapped reads");
             }
         }
@@ -1073,7 +1069,7 @@ pub fn base_at_new(
 /// * `bam`           – an open `rust_htslib::bam::IndexedReader`.
 /// * `chr` / `pos`   – reference contig name and coordinate.
 /// * `with_next_ins` – if `true`, report `PileBase::Ins` when an insertion starts
-///                     **right after** the queried base.
+///   **right after** the queried base.
 ///
 /// The function bounds the internal pile-up depth to 10 000 reads to protect
 /// against malformed BAM files that could otherwise allocate unbounded memory.

+ 10 - 8
src/collection/flowcells.rs

@@ -1,7 +1,7 @@
 use std::{
-    collections::{HashMap, HashSet},
+    collections::HashSet,
     fmt,
-    fs::{self, File, OpenOptions},
+    fs::File,
     io::{BufReader, Read, Write},
     os::unix::fs::MetadataExt,
     path::Path,
@@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize};
 use crate::{
     collection::minknow::{parse_pore_activity_from_reader, parse_throughput_from_reader},
     helpers::{find_files, list_directories},
-    io::{readers::{get_gz_reader, get_reader}, writers::{get_gz_writer, get_writer}},
+    io::{readers::get_reader, writers::get_writer},
 };
 
 use super::minknow::{MinKnowSampleSheet, PoreStateEntry, PoreStateEntryExt, ThroughputEntry};
@@ -182,8 +182,6 @@ impl FlowCells {
         let mut file = get_writer(&tmp_path)
             .with_context(|| format!("Failed to open archive file for writing: {archive_path}"))?;
 
-
-
         serde_json::to_writer_pretty(&mut file, &self.flow_cells)
             .with_context(|| format!("Failed to write FlowCells to: {archive_path}"))?;
         file.flush()?;
@@ -281,7 +279,7 @@ impl FlowCells {
         //     .flow_cells
         //     .drain(..)
         //     .map(|fc| {
-        //         (fc.sample_sheet.flow_cell_id.clone(), fc) 
+        //         (fc.sample_sheet.flow_cell_id.clone(), fc)
         //     })
         //     .collect();
         //
@@ -498,7 +496,8 @@ impl FlowCell {
         match &self.experiment {
             FlowCellExperiment::WGSPod5Mux(p) => p,
             FlowCellExperiment::WGSPod5Demux(p) => p,
-        }.to_string()
+        }
+        .to_string()
     }
 }
 
@@ -541,7 +540,10 @@ impl FlowCellExperiment {
     /// - `Some(FlowCellExperiment)` if a known subdirectory is found.
     /// - `None` if no match is detected.
     pub fn from_path(flowcell_path: &str) -> Option<Self> {
-        for dir in list_directories(flowcell_path.into()).ok().unwrap_or_default() {
+        for dir in list_directories(flowcell_path.into())
+            .ok()
+            .unwrap_or_default()
+        {
             if dir == "pod5" {
                 return Some(FlowCellExperiment::WGSPod5Mux(dir.to_string()));
             }

+ 5 - 5
src/collection/pod5.rs

@@ -183,9 +183,9 @@ impl Pod5sFlowCell {
         })
     }
     /// Compute summary statistics for the collection.
-    pub fn stats(&self) -> Pod5FlowCellStats {
+    pub fn stats(&self) -> Pod5sFlowCellStats {
         if self.pod5s.is_empty() {
-            return Pod5FlowCellStats {
+            return Pod5sFlowCellStats {
                 flow_cell_id: self.flow_cell_id.clone(),
                 count: 0,
                 total_size: 0,
@@ -219,7 +219,7 @@ impl Pod5sFlowCell {
         let avg_sample_rate =
             Some(self.pod5s.iter().map(|p| p.sample_rate as f64).sum::<f64>() / count as f64);
 
-        Pod5FlowCellStats {
+        Pod5sFlowCellStats {
             flow_cell_id: self.flow_cell_id.clone(),
             count,
             total_size,
@@ -233,7 +233,7 @@ impl Pod5sFlowCell {
 }
 
 #[derive(Debug, Clone)]
-pub struct Pod5FlowCellStats {
+pub struct Pod5sFlowCellStats {
     pub flow_cell_id: String,
     pub count: usize,
     pub total_size: u64,
@@ -244,7 +244,7 @@ pub struct Pod5FlowCellStats {
     pub avg_sample_rate: Option<f64>,
 }
 
-impl fmt::Display for Pod5FlowCellStats {
+impl fmt::Display for Pod5sFlowCellStats {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "Pod5 Flow Cell Stats")?;
         writeln!(f, "---------------------")?;

+ 3 - 3
src/collection/run.rs

@@ -1,7 +1,7 @@
 use std::{
     collections::{hash_map::Entry, HashMap, HashSet},
     fs::{self, File},
-    io::{Read, Write},
+    io::Read,
     path::{Path, PathBuf},
 };
 
@@ -15,7 +15,7 @@ use crate::collection::minknow::{parse_pore_activity_from_reader, parse_throughp
 use super::minknow::{MinKnowSampleSheet, PoreStateEntry, ThroughputEntry};
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
-enum PodDir {
+pub enum PodDir {
     Muxed {
         dir: PathBuf,
         source: Pod5Source,
@@ -29,7 +29,7 @@ enum PodDir {
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
-enum Pod5Source {
+pub enum Pod5Source {
     LocalDir,
     Tar,
 }

+ 0 - 6
src/commands/mod.rs

@@ -681,9 +681,3 @@ where
 
     Ok(results)
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-}
-

+ 23 - 7
src/io/pod5_footer_generated.rs

@@ -112,7 +112,7 @@ pub mod minknow {
             }
         }
 
-        impl<'a> flatbuffers::Verifiable for ContentType {
+        impl flatbuffers::Verifiable for ContentType {
             #[inline]
             fn run_verifier(
                 v: &mut flatbuffers::Verifier,
@@ -199,7 +199,7 @@ pub mod minknow {
             }
         }
 
-        impl<'a> flatbuffers::Verifiable for Format {
+        impl flatbuffers::Verifiable for Format {
             #[inline]
             fn run_verifier(
                 v: &mut flatbuffers::Verifier,
@@ -235,6 +235,14 @@ pub mod minknow {
             pub const VT_CONTENT_TYPE: flatbuffers::VOffsetT = 10;
 
             #[inline]
+            /// # Safety
+            ///
+            /// `table` must be a valid FlatBuffers `EmbeddedFile` table. It is assumed that:
+            /// - `table` was constructed from a buffer containing a well-formed `EmbeddedFile`.
+            /// - The lifetime `'a` matches the underlying buffer's lifetime.
+            ///
+            /// Calling this with an invalid table may lead to undefined behavior when
+            /// accessing fields through the resulting `EmbeddedFile`.
             pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self {
                 EmbeddedFile { _tab: table }
             }
@@ -322,7 +330,7 @@ pub mod minknow {
             pub format: Format,
             pub content_type: ContentType,
         }
-        impl<'a> Default for EmbeddedFileArgs {
+        impl Default for EmbeddedFileArgs {
             #[inline]
             fn default() -> Self {
                 EmbeddedFileArgs {
@@ -413,6 +421,14 @@ pub mod minknow {
             pub const VT_CONTENTS: flatbuffers::VOffsetT = 10;
 
             #[inline]
+            /// # Safety
+            ///
+            /// `table` must be a valid FlatBuffers `Footer` table. It is assumed that:
+            /// - `table` was constructed from a buffer containing a well-formed `Footer`.
+            /// - The lifetime `'a` matches the underlying buffer's lifetime.
+            ///
+            /// Calling this with an invalid table may lead to undefined behavior when
+            /// accessing fields through the resulting `Footer`.
             pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self {
                 Footer { _tab: table }
             }
@@ -594,7 +610,7 @@ pub mod minknow {
         /// catch every error, or be maximally performant. For the
         /// previous, unchecked, behavior use
         /// `root_as_footer_unchecked`.
-        pub fn root_as_footer(buf: &[u8]) -> Result<Footer, flatbuffers::InvalidFlatbuffer> {
+        pub fn root_as_footer(buf: &[u8]) -> Result<Footer<'_>, flatbuffers::InvalidFlatbuffer> {
             flatbuffers::root::<Footer>(buf)
         }
         #[inline]
@@ -606,7 +622,7 @@ pub mod minknow {
         /// `size_prefixed_root_as_footer_unchecked`.
         pub fn size_prefixed_root_as_footer(
             buf: &[u8],
-        ) -> Result<Footer, flatbuffers::InvalidFlatbuffer> {
+        ) -> Result<Footer<'_>, flatbuffers::InvalidFlatbuffer> {
             flatbuffers::size_prefixed_root::<Footer>(buf)
         }
         #[inline]
@@ -639,14 +655,14 @@ pub mod minknow {
         /// Assumes, without verification, that a buffer of bytes contains a Footer and returns it.
         /// # Safety
         /// Callers must trust the given bytes do indeed contain a valid `Footer`.
-        pub unsafe fn root_as_footer_unchecked(buf: &[u8]) -> Footer {
+        pub unsafe fn root_as_footer_unchecked(buf: &[u8]) -> Footer<'_> {
             flatbuffers::root_unchecked::<Footer>(buf)
         }
         #[inline]
         /// Assumes, without verification, that a buffer of bytes contains a size prefixed Footer and returns it.
         /// # Safety
         /// Callers must trust the given bytes do indeed contain a valid size prefixed `Footer`.
-        pub unsafe fn size_prefixed_root_as_footer_unchecked(buf: &[u8]) -> Footer {
+        pub unsafe fn size_prefixed_root_as_footer_unchecked(buf: &[u8]) -> Footer<'_> {
             flatbuffers::size_prefixed_root_unchecked::<Footer>(buf)
         }
         #[inline]

+ 3 - 3
src/lib.rs

@@ -647,7 +647,7 @@ mod tests {
 
         let row = "chr2\t207968575\tID_16420_1\ta\ta]chr11:41497080]\t.\tPASS\tSVTYPE=BND;MATEID=ID_16420_2;TUMOUR_READ_SUPPORT=4;TUMOUR_ALN_SUPPORT=4;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=0;BP_NOTATION=++;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=4;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.433;ORIGIN_MAPQ_MEAN=56.25;ORIGIN_EVENT_SIZE_STD_DEV=0;ORIGIN_EVENT_SIZE_MEDIAN=0;ORIGIN_EVENT_SIZE_MEAN=0;END_STARTS_STD_DEV=17.754;END_MAPQ_MEAN=56.25;END_EVENT_SIZE_STD_DEV=0;END_EVENT_SIZE_MEDIAN=0;END_EVENT_SIZE_MEAN=0;TUMOUR_DP_BEFORE=8,11;TUMOUR_DP_AT=4,11;TUMOUR_DP_AFTER=4,11;NORMAL_DP_BEFORE=7,16;NORMAL_DP_AT=7,16;NORMAL_DP_AFTER=7,16;TUMOUR_AF=1,0.364;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=1,3,0;NORMAL_TOTAL_HP_AT=4,3,0;TUMOUR_ALT_HP=2,1,1;TUMOUR_PS=207946665;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
         let variant: VcfVariant = row.parse()?;
-        let var_string = variant.into_vcf_row();
+        // let var_string = variant.into_vcf_row();
         let u = variant.n_alt_depth();
         println!("{u:?}");
 
@@ -773,7 +773,7 @@ mod tests {
 
         let deleted_genes = bedrow_overlaps_par(
             &r,
-            &vec![&GenomeRange {
+            &[&GenomeRange {
                 contig: variant.position.contig,
                 range: del.start..del.end,
             }],
@@ -1157,7 +1157,7 @@ mod tests {
             let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
             match variant_collection::Variants::load_from_file(&path) {
                 Ok(mut variants) => {
-                    let (mut high_depth_ranges, _) = somatic_depth_quality_ranges(&id, &config)?;
+                    let (mut high_depth_ranges, _) = somatic_depth_quality_ranges(id, &config)?;
                     high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
 
                     let res = VariantsStats::new(&mut variants, id, &config, &high_depth_ranges)?

+ 1 - 1
src/math.rs

@@ -113,7 +113,7 @@ pub fn compute_median(data: &[f64]) -> f64 {
     let len = data.len();
 
     // Calculate the median based on whether the length is even or odd
-    if len % 2 == 0 {
+    if len.is_multiple_of(2) {
         // If even, average the two middle values
         (data[len / 2 - 1] + data[len / 2]) / 2.0
     } else {

+ 11 - 17
src/slurm_helpers.rs

@@ -4,14 +4,14 @@ use std::process::Command;
 use anyhow::Context;
 
 #[derive(Debug)]
-struct NodeInfo {
-    partition: String,
-    node: String,
-    cpu_total: i32,
-    cpu_free: i32,
-    gpu_total: i32,
-    gpu_free: i32,
-    gpu_names: Vec<String>, // <- models/types for GRES
+pub struct NodeInfo {
+    pub partition: String,
+    pub node: String,
+    pub cpu_total: i32,
+    pub cpu_free: i32,
+    pub gpu_total: i32,
+    pub gpu_free: i32,
+    pub gpu_names: Vec<String>, // <- models/types for GRES
 }
 
 fn parse_kv_line(line: &str) -> HashMap<String, String> {
@@ -25,8 +25,6 @@ fn parse_kv_line(line: &str) -> HashMap<String, String> {
     map
 }
 
-// ---- GPU count helpers (as before) ----
-
 fn parse_gpu_from_tres(s: &str) -> u32 {
     // handles "gres/gpu=4" and "gres/gpu:V100=4"
     if let Some(idx) = s.find("gres/gpu") {
@@ -44,7 +42,7 @@ fn parse_gpu_from_gres(s: &str) -> u32 {
     if let Some(idx) = s.find("gpu:") {
         let rest = &s[idx + "gpu:".len()..];
         // could be "V100:4" or "4"
-        let parts: Vec<&str> = rest.split(|c| c == ':' || c == '(').collect();
+        let parts: Vec<&str> = rest.split([':', '(']).collect();
         // last numeric field is count
         for p in parts.iter().rev() {
             if let Ok(v) = p.parse() {
@@ -81,8 +79,6 @@ fn extract_alloc_gpus(fields: &HashMap<String, String>) -> u32 {
     0
 }
 
-// ---- GPU name / type extraction ----
-
 fn extract_gpu_names_from_gres(s: &str) -> Vec<String> {
     // Examples:
     //   "gpu:V100:4(S:0-3)"
@@ -150,8 +146,6 @@ fn extract_gpu_names(fields: &HashMap<String, String>) -> Vec<String> {
     Vec::new()
 }
 
-// ---- Node parsing ----
-
 fn parse_node(fields: HashMap<String, String>) -> NodeInfo {
     let partition = fields.get("Partitions").cloned().unwrap_or_default();
     let node = fields.get("NodeName").cloned().unwrap_or_default();
@@ -216,8 +210,8 @@ mod tests {
     #[test]
     fn slurm_info() -> anyhow::Result<()> {
         println!(
-            "{:15} {:15} {:>7} {:>8} {:>7} {:>8}  {}",
-            "Partition", "Node", "CPU_TOT", "CPU_FREE", "GPU_TOT", "GPU_FREE", "GPU_NAMES"
+            "{:15} {:15} {:>7} {:>8} {:>7} {:>8}  GPU_NAMES",
+            "Partition", "Node", "CPU_TOT", "CPU_FREE", "GPU_TOT", "GPU_FREE"
         );
 
         for info in slurm_availables()? {

+ 4 - 10
src/variant/variant_collection.rs

@@ -11,7 +11,6 @@ use bitcode::{Decode, Encode};
 use csv::ReaderBuilder;
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
-use pandora_lib_assembler::assembler::calculate_shannon_entropy;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
@@ -38,7 +37,6 @@ use crate::{
     },
     io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
     positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
-    variant::variant::VariantId,
 };
 
 /// A collection of VCF variants along with associated metadata.
@@ -257,7 +255,7 @@ impl VariantCollection {
     /// * `annotations` - A reference to the Annotations structure to store the results.
     /// * `reference` - Path to the reference FASTA file.
     /// * `seq_len` - Length of the sequence context to consider for entropy calculation
-    ///               (must be >= 3 to extract trinucleotide context).
+    ///   (must be >= 3 to extract trinucleotide context).
     /// * `max_threads` - Maximum number of threads to use for parallel processing.
     ///
     /// # Behavior
@@ -949,7 +947,7 @@ pub fn group_variants_by_bnd_rc(
                 .keys()
                 .find(|key| {
                     key.a_contig == desc.b_contig
-                        && key.a_sens == !desc.b_sens
+                        && key.a_sens != desc.b_sens
                         && (key.a_position as i64 - desc.b_position as i64).abs() < threshold
                 })
                 .cloned();
@@ -1786,9 +1784,7 @@ impl ExternalAnnotation {
                         } else {
                             warn!(
                                 "No VEP entry for {}\t{}\t{}",
-                                entry.position.to_string(),
-                                entry.reference.to_string(),
-                                entry.alternative.to_string()
+                                entry.position, entry.reference, entry.alternative
                             );
                             n_not_vep += 1;
                         }
@@ -1910,9 +1906,7 @@ fn process_vep_chunk(
         } else {
             warn!(
                 "No VEP entry for {}\t{}\t{}",
-                entry.position.to_string(),
-                entry.reference.to_string(),
-                entry.alternative.to_string()
+                entry.position, entry.reference, entry.alternative
             );
             n_not_vep += 1;
         }

+ 3 - 46
src/variant/variants_stats.rs

@@ -1,7 +1,6 @@
 use std::{
     collections::{BTreeMap, BTreeSet},
     io::BufRead,
-    sync::Arc,
 };
 
 use anyhow::Context;
@@ -81,7 +80,7 @@ where
 }
 
 impl VariantsStats {
-    pub fn new(variants: &mut Variants, id: &str, config: &Config, high_depth_ranges: &[GenomeRange]) -> anyhow::Result<Self> {
+    pub fn new(variants: &mut Variants, _id: &str, config: &Config, high_depth_ranges: &[GenomeRange]) -> anyhow::Result<Self> {
         let n = variants.data.len() as u32;
         let alteration_categories: DashMap<String, u32> = DashMap::new();
         let vep_impact: DashMap<String, u32> = DashMap::new();
@@ -203,7 +202,7 @@ impl VariantsStats {
         // HighDepths
         let ann = Annotation::HighDepth;
         let res = variants.annotate_with_ranges(
-            &high_depth_ranges,
+            high_depth_ranges,
             Some(ann.clone()),
             config.min_n_callers,
             Vec::new(),
@@ -456,7 +455,7 @@ pub fn somatic_rates(
 /// reads paired “normal” (MRD) and “tumoral” (Diag) count files, then:
 /// 1. Marks positions where both depths ≥ `config.min_high_quality_depth` as high‑depth.
 /// 2. Marks positions where both depths < `config.max_depth_low_quality` as low‑quality.
-/// Consecutive runs of true values are merged into `GenomeRange`s.
+///    Consecutive runs of true values are merged into `GenomeRange`s.
 ///
 /// # Arguments
 ///
@@ -638,48 +637,6 @@ pub fn merge_adjacent_ranges(mut ranges: Vec<GenomeRange>) -> Vec<GenomeRange> {
     merged
 }
 
-/// Converts a slice of booleans into a list of `GenomeRange`s representing
-/// consecutive `true` values, offset by a `start` position and tagged with a contig ID.
-///
-/// # Arguments
-/// - `vec`: A slice of booleans (`true` means "active" position)
-/// - `start`: The offset to add to all index positions (e.g., bin or genomic start position)
-/// - `contig`: The contig name (e.g., "chr1", "chrX") to be mapped to a numerical ID
-///
-/// # Returns
-/// A vector of `GenomeRange` objects, each corresponding to a consecutive sequence of `true` values.
-fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<GenomeRange> {
-    let contig = contig_to_num(contig);
-    let mut ranges = Vec::new();
-    let mut current_start: Option<u32> = None;
-
-    for (idx, &is_true) in vec.iter().enumerate() {
-        let i = idx as u32 + start;
-
-        match (is_true, current_start) {
-            (true, None) => current_start = Some(i), // Start new range
-            (false, Some(start_idx)) => {
-                ranges.push(GenomeRange {
-                    contig,
-                    range: start_idx..i,
-                });
-                current_start = None;
-            }
-            _ => {} // Continue range or stay idle
-        }
-    }
-
-    // If the final values were true, we need to flush the last range
-    if let Some(start_idx) = current_start {
-        ranges.push(GenomeRange {
-            contig,
-            range: start_idx..(vec.len() as u32 + start),
-        });
-    }
-
-    ranges
-}
-
 /// A region-level data structure for modeling mutation rates using GLMs or other statistical models.
 ///
 /// Each `GlmRow` represents a genomic interval (e.g., from a BED file) and contains: