Thomas hai 2 semanas
pai
achega
334057b7e1

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 220 - 158
Cargo.lock


+ 1 - 1
Cargo.toml

@@ -65,7 +65,7 @@ filetime = "0.2.27"
 opt-level = 0
 debug = false
 
-[patch.crates-io]
+[target.'cfg(windows)'.patch.crates-io]
 hts-sys = { path = "patches/hts-sys" }
 rust-htslib = { path = "patches/rust-htslib" }
 

+ 111 - 45
src/annotation/mod.rs

@@ -10,7 +10,8 @@ use std::{
     collections::{HashMap, HashSet},
     fmt,
     fs::File,
-    io::{Read, Write},
+    io::{BufWriter, Read, Write},
+    path::Path,
     str::FromStr,
     sync::{
         atomic::{AtomicU32, Ordering},
@@ -19,7 +20,7 @@ use std::{
 };
 
 use crate::{
-    helpers::{mean, Blake3BuildHasher, Hash128},
+    helpers::{format_count, mean, Blake3BuildHasher, Hash128},
     variant::{variant_collection::VariantCollection, vcf_variant::AlterationCategory},
 };
 use bitcode::{Decode, Encode};
@@ -93,6 +94,15 @@ pub enum Annotation {
 
     /// RepeatMasker
     Repeat,
+
+    /// NanomonSV Target Site Duplication
+    TSD(String),
+
+    /// NanomonSV insertion type
+    InsertionType(String),
+
+    /// query_start,query_end,class,class...
+    RepeatMasker(String),
 }
 
 /// Denotes the biological sample type associated with a variant call.
@@ -231,6 +241,9 @@ impl fmt::Display for Annotation {
             HighDepth => "HighDepth".into(),
             Panel(name) => format!("Panel_{name}"),
             LowMAPQ => "LowMAPQ".to_string(),
+            TSD(_) => "TSD".to_string(),
+            InsertionType(v) => format!("InsertionType_{v}"),
+            RepeatMasker(_) => "RepeatMasker".to_string(),
         };
         write!(f, "{}", s)
     }
@@ -326,15 +339,95 @@ pub struct AnnotationsStats {
     pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
 }
 
+impl fmt::Display for AnnotationsStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let total: u64 = self.categorical.iter().map(|e| *e.value()).sum();
+        let max_key_len = self
+            .categorical
+            .iter()
+            .map(|e| e.key().len())
+            .max()
+            .unwrap_or(0);
+
+        writeln!(f, "\nCallers stats:")?;
+        writeln!(f, "  n categories: {}\n", self.categorical.len())?;
+
+        let mut lines: Vec<String> = self
+            .categorical
+            .iter()
+            .map(|e| {
+                let k = e.key();
+                let v = *e.value();
+                let pct = 100.0 * v as f64 / total as f64;
+                let mut num_str = Vec::new();
+                if let Some(nums) = self.numeric.get(k) {
+                    num_str.extend(
+                        nums.iter()
+                            .map(|(k_n, v_n)| format!("{k_n}: {:.2}", mean(v_n))),
+                    );
+                }
+                num_str.sort();
+                let num_part = if num_str.is_empty() {
+                    String::new()
+                } else {
+                    format!("  [{}]", num_str.join(", "))
+                };
+                format!(
+                    "  {:<width$}  {:>10}  {:>5.1}%{num_part}",
+                    k,
+                    format_count(v),
+                    pct,
+                    width = max_key_len
+                )
+            })
+            .collect();
+
+        lines.sort();
+        writeln!(f, "{}", lines.join("\n"))?;
+        writeln!(f, "  {:-<width$}", "", width = max_key_len + 22)?;
+        writeln!(
+            f,
+            "  {:<width$}  {:>10}  100.0%",
+            "Total",
+            format_count(total),
+            width = max_key_len
+        )
+    }
+}
+
+pub enum AnnotationsStatCat {
+    Categorical(String),
+    Numerical(String, f64),
+}
+
+impl Annotation {
+    pub fn stat(&self) -> AnnotationsStatCat {
+        use Annotation::*;
+        match self {
+            ShannonEntropy(v) => AnnotationsStatCat::Numerical(self.to_string(), *v),
+            ConstitDepth(v) => AnnotationsStatCat::Numerical(self.to_string(), *v as f64),
+            ConstitAlt(v) => AnnotationsStatCat::Numerical(self.to_string(), *v as f64),
+            Cosmic(c) => AnnotationsStatCat::Numerical(self.to_string(), c.cosmic_cnt as f64),
+            Callers(caller, sample) => {
+                AnnotationsStatCat::Categorical(format!("{caller} {sample}"))
+            }
+            AlterationCategory(alt_cat) => AnnotationsStatCat::Categorical(alt_cat.to_string()),
+            _ => AnnotationsStatCat::Categorical(self.to_string()),
+        }
+    }
+}
+
 impl AnnotationsStats {
-    pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
-        let json = serde_json::to_string_pretty(self)?;
-        let mut file = File::create(file_path)?;
-        file.write_all(json.as_bytes())?;
+    /// Serializes the statistics to a pretty-printed JSON file.
+    pub fn save_to_json(&self, file_path: impl AsRef<Path>) -> anyhow::Result<()> {
+        let file = File::create(file_path)?;
+        let writer = BufWriter::new(file);
+        serde_json::to_writer_pretty(writer, self)?;
         Ok(())
     }
 
-    pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
+    /// Deserializes statistics from a JSON file produced by [`save_to_json`](Self::save_to_json).
+    pub fn load_from_json(file_path: impl AsRef<Path>) -> anyhow::Result<Self> {
         let mut file = File::open(file_path)?;
         let mut contents = String::new();
         file.read_to_string(&mut contents)?;
@@ -345,6 +438,9 @@ impl AnnotationsStats {
 
 #[allow(clippy::type_complexity)]
 impl Annotations {
+    /// Inserts or updates annotations for a given variant key.
+    ///
+    /// If the key already exists, the new annotations are appended to the existing ones.
     pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
         self.store
             .entry(key)
@@ -356,7 +452,6 @@ impl Annotations {
         &self,
         annotations: Option<Box<dyn Fn(&Annotation) -> bool + Send + Sync>>,
     ) -> AnnotationsStats {
-        use Annotation::*;
         let map: DashMap<String, u64> = DashMap::new();
         let num_maps: DashMap<String, HashMap<String, Vec<f64>>> = DashMap::new();
 
@@ -370,17 +465,9 @@ impl Annotations {
             let mut categorical = Vec::new();
             let mut numerical = Vec::new();
             for ann in anns.iter() {
-                match ann {
-                    LowConstitDepth | LowEntropy | GnomAD(_) | VEP(_) | TriNucleotides(_)
-                    | ReplicationTiming(_) | HighDepth | CpG | VNTR | Repeat | Panel(_)
-                    | LowMAPQ | HighConstitAlt => categorical.push(ann.to_string()),
-                    Callers(caller, sample) => categorical.push(format!("{caller} {sample}")),
-                    ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
-                    ConstitDepth(v) | Annotation::ConstitAlt(v) => {
-                        numerical.push((ann.to_string(), *v as f64));
-                    }
-                    Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)),
-                    AlterationCategory(alt_cat) => categorical.push(alt_cat.to_string()),
+                match ann.stat() {
+                    AnnotationsStatCat::Categorical(s) => categorical.push(s),
+                    AnnotationsStatCat::Numerical(k, v) => numerical.push((k, v)),
                 }
             }
 
@@ -400,34 +487,13 @@ impl Annotations {
             }
         });
 
-        println!("\nCallers stats:");
-        println!("\tn categories: {}", map.len());
-        let mut n = 0;
-        let lines: Vec<String> = map
-            .iter()
-            .map(|e| {
-                let k = e.key();
-                let v = e.value();
-                n += v;
-                let mut num_str = Vec::new();
-                if let Some(nums) = num_maps.get(k) {
-                    num_str.extend(
-                        nums.iter()
-                            .map(|(k_n, v_n)| format!("{k_n} {:.2}", mean(v_n))),
-                    )
-                }
-                num_str.sort();
-                format!("\t- {k}\t{v}\t{}", num_str.join("\t"))
-            })
-            .collect();
-
-        println!("{}", lines.join("\n"));
-        println!("Total\t{n}\n");
-
-        AnnotationsStats {
+        let stats = AnnotationsStats {
             categorical: map,
             numeric: num_maps,
-        }
+        };
+
+        info!("\n{stats}");
+        stats
     }
 
     pub fn vep_stats(&self) -> anyhow::Result<VepStats> {

+ 1 - 1
src/annotation/vep.rs

@@ -709,6 +709,7 @@ impl JobCommand for VepJob {
         )
     }
 }
+
 impl LocalRunner for VepJob {}
 impl LocalBatchRunner for VepJob {}
 impl SbatchRunner for VepJob {
@@ -732,7 +733,6 @@ impl Run for VepJob {
             .context("Can't save VEP logs")?;
         Ok(())
     }
-    // add code here
 }
 
 /// Selects the "best" VEP annotation from a list, based on biological impact and transcript priority.

+ 6 - 8
src/callers/clairs.rs

@@ -166,8 +166,7 @@ use crate::{
         bcftools::{BcftoolsConcat, BcftoolsKeepPass},
         exec_jobs,
         samtools::SamtoolsMergeMany,
-        AnyJob, Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams,
-        SlurmRunner,
+        AnyJob, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams,
     },
     config::Config,
     helpers::{
@@ -742,8 +741,8 @@ impl Version for ClairS {
             }
         }
 
-        impl SlurmRunner for ClairSVersionJob<'_> {
-            fn slurm_args(&self) -> Vec<String> {
+        impl SbatchRunner for ClairSVersionJob<'_> {
+            fn slurm_params(&self) -> SlurmParams {
                 SlurmParams {
                     job_name: Some("clairs_version".into()),
                     partition: Some("shortq".into()),
@@ -751,13 +750,12 @@ impl Version for ClairS {
                     mem: Some("10G".into()),
                     gres: None,
                 }
-                .to_args()
             }
         }
 
         let mut job = ClairSVersionJob { config };
         let out =
-            SlurmRunner::exec(&mut job).context("Failed to run ClairS --version via Slurm")?;
+            SbatchRunner::exec(&mut job).context("Failed to run ClairS --version via Slurm")?;
 
         let mut combined = out.stdout.clone();
         if let Some(epilog) = &out.slurm_epilog {
@@ -1194,9 +1192,9 @@ mod tests {
         test_init();
         let id = "CHAHA";
         let config = Config::default();
-        let mut clairs = ClairS::initialize(id, &config)?;
+        let  clairs = ClairS::initialize(id, &config)?;
 
-        clairs.run_chunked_resume(60);
+        clairs.run_chunked_resume(60)?;
         Ok(())
     }
 }

+ 9 - 21
src/callers/nanomonsv.rs

@@ -85,7 +85,7 @@ use crate::{
     collection::vcf::Vcf,
     commands::{
         bcftools::{BcftoolsConcat, BcftoolsKeepPass},
-        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
+        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams,
     },
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
@@ -141,19 +141,6 @@ impl JobCommand for NanomonSV {
 
 impl LocalRunner for NanomonSV {}
 
-impl SlurmRunner for NanomonSV {
-    fn slurm_args(&self) -> Vec<String> {
-        let mem = self.memory.unwrap_or(20);
-        SlurmParams {
-            job_name: Some(format!("nanomonsv_{}", self.id)),
-            cpus_per_task: Some(self.threads as u32),
-            mem: Some(format!("{mem}G")),
-            partition: Some("mediumq".into()),
-            gres: None,
-        }
-        .to_args()
-    }
-}
 impl SbatchRunner for NanomonSV {
     fn slurm_params(&self) -> SlurmParams {
         let mem = self.memory.unwrap_or(20);
@@ -166,6 +153,7 @@ impl SbatchRunner for NanomonSV {
         }
     }
 }
+
 impl Initialize for NanomonSV {
     /// Initializes a new NanomonSV pipeline for the given sample ID and config.
     ///
@@ -902,26 +890,26 @@ pub fn nanomonsv_create_pon(config: &Config, pon_path: &str) -> anyhow::Result<(
 /// # Errors
 /// Returns an error if command execution fails.
 pub fn nanomonsv_insert_classify(
-    input_vcf: &str,
-    output_vcf: &str,
+    input_vcf: impl AsRef<Path>,
+    output_vcf: impl AsRef<Path>,
     config: &Config,
 ) -> anyhow::Result<CapturedOutput> {
     let args = vec![
         "insert_classify".to_string(),
-        input_vcf.to_string(),
-        output_vcf.to_string(),
+        input_vcf.as_ref().to_string_lossy().to_string(),
+        output_vcf.as_ref().to_string_lossy().to_string(),
         config.reference.clone(),
         config.refseq_gtf.clone(),
         config.nanomonsv_line1_bed.clone(),
     ];
 
     let mut runner = NanomonSV {
-        id: "nanomonsv_insert_classify".to_string(),
+        id: "insert_classify".to_string(),
         log_dir: config.tmp_dir.clone(),
         config: config.clone(),
         job_args: args,
-        threads: 1,
-        memory: Some(8),
+        threads: 4,
+        memory: Some(40),
     };
 
     run!(config, &mut runner)

+ 66 - 22
src/callers/savana.rs

@@ -77,11 +77,11 @@ use crate::{
         bcftools::BcftoolsKeepPass,
         longphase::{LongphaseHap, LongphasePhase},
         samtools::SamtoolsIndex,
-        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
+        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams,
     },
     config::Config,
-    helpers::{is_file_older, remove_dir_if_exists},
-    io::{readers::get_gz_reader, vcf::read_vcf},
+    helpers::{find_file, is_file_older, remove_dir_if_exists},
+    io::{fasta::fasta_records_to_hasmap, readers::get_gz_reader, vcf::read_vcf},
     locker::SampleLock,
     pipes::{Initialize, InitializeSolo, ShouldRun, Version},
     positions::{num_to_contig, GenomeRange},
@@ -89,7 +89,7 @@ use crate::{
     runners::Run,
     variant::{
         variant_collection::VariantCollection,
-        vcf_variant::{Label, Variants},
+        vcf_variant::{Info, Label, Variants},
     },
 };
 use anyhow::Context;
@@ -97,7 +97,7 @@ use itertools::Itertools;
 use log::{debug, info, warn};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
-use std::io::Write;
+use std::{collections::HashMap, io::Write};
 use std::{
     fs::{self, File},
     io::{BufRead, BufWriter},
@@ -124,7 +124,6 @@ pub struct Savana {
     pub config: Config,
     /// Directory for log file storage
     pub log_dir: String,
-
     /// Command-line arguments for Savana executable (populated during run)
     job_args: Vec<String>,
 }
@@ -204,19 +203,6 @@ impl JobCommand for Savana {
 
 impl LocalRunner for Savana {}
 
-impl SlurmRunner for Savana {
-    fn slurm_args(&self) -> Vec<String> {
-        SlurmParams {
-            job_name: Some(format!("savana_{}", self.id)),
-            cpus_per_task: Some(self.config.savana_threads as u32),
-            mem: Some(format!("{}G", self.config.savana_mem)),
-            partition: Some("mediumq".into()),
-            gres: None,
-        }
-        .to_args()
-    }
-}
-
 impl SbatchRunner for Savana {
     fn slurm_params(&self) -> SlurmParams {
         SlurmParams {
@@ -407,9 +393,37 @@ impl Variants for Savana {
         let add = vec![caller.clone()];
         let passed_vcf = self.config.savana_passed_vcf(&self.id);
 
+        // Adding INSSEQ to the variant so subsequent inserted sequence qualification can apply
+        let insertion_store = if let Some(insertion_fasta) =
+            find_file(Path::new(&self.config.savana_output_dir(&self.id)), |f| {
+                f.contains("inserted_sequences.fa")
+            }) {
+            fasta_records_to_hasmap(&insertion_fasta).with_context(|| {
+                format!("Failed to read fasta from: {}", insertion_fasta.display())
+            })?
+        } else {
+            HashMap::new()
+        };
+
         info!("Loading variants from {}: {}", caller, &passed_vcf);
-        let variants = read_vcf(&passed_vcf)
-            .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?;
+
+        let variants: Vec<crate::variant::vcf_variant::VcfVariant> = read_vcf(&passed_vcf)
+            .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?
+            .into_iter()
+            .map(|mut v| {
+                if let Some(crate::variant::vcf_variant::SVType::INS) = v.svtype() {
+                    if let Some(sequence) = get_longest_sequence(&insertion_store, &v.id) {
+                        if let Ok(parsed) = sequence.parse() {
+                            v.alternative = parsed;
+                            v.infos.0.push(Info::INSLEN(sequence.len() as i32));
+                            v.infos.0.push(Info::INSSEQ(sequence));
+                        }
+                    }
+                }
+                v
+            })
+            .collect();
+
         variants.par_iter().for_each(|v| {
             annotations.insert_update(v.hash(), &add);
         });
@@ -423,6 +437,19 @@ impl Variants for Savana {
     }
 }
 
+fn extract_id(s: &str) -> Option<u32> {
+    s.strip_prefix("ID_")?.split('_').next()?.parse().ok()
+}
+
+fn get_longest_sequence(map: &HashMap<String, String>, seq_id: &str) -> Option<String> {
+    let seq_id = extract_id(seq_id)?.to_string();
+    map.iter()
+        .filter(|(k, _)| k.contains(&seq_id))
+        .map(|(_, v)| v)
+        .max_by_key(|v| v.len())
+        .cloned()
+}
+
 impl Label for Savana {
     /// Returns the string label for this caller.
     fn label(&self) -> String {
@@ -970,7 +997,7 @@ pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::helpers::test_init;
+    use crate::{helpers::test_init, variant::vcf_variant::SVType};
 
     #[test]
     fn savana_version() -> anyhow::Result<()> {
@@ -989,4 +1016,21 @@ mod tests {
         caller.run()?;
         Ok(())
     }
+
+    #[test]
+    fn savana_load() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let caller = Savana::initialize("CHAHA", &config)?;
+        let annotations = Annotations::default();
+        let v = caller.variants(&annotations)?;
+
+        v.variants
+            .into_iter()
+            .filter(|v| v.svtype() == Some(SVType::INS))
+            .inspect(|v| println!("{v:?}"));
+
+        Ok(())
+    }
 }

+ 11 - 0
src/helpers.rs

@@ -1057,6 +1057,7 @@ fn remove_tracked_path(path: &Path) -> anyhow::Result<()> {
 
     if path.extension().and_then(|e| e.to_str()) == Some("bam") {
         remove_existing_file(path.with_extension("bam.bai"));
+        remove_existing_file(path.with_extension(".tbi"));
         remove_existing_file(path.with_extension("bai"));
     }
 
@@ -1325,6 +1326,16 @@ pub fn revcomp(s: &str) -> String {
     String::from_utf8(v).unwrap()
 }
 
+pub fn format_count(n: u64) -> String {
+    let s = n.to_string();
+    let mut result = String::new();
+    for (i, c) in s.chars().rev().enumerate() {
+        if i > 0 && i % 3 == 0 { result.push('_'); }
+        result.push(c);
+    }
+    result.chars().rev().collect()
+}   
+
 #[cfg(test)]
 mod tests {
     use hashbrown::HashSet;

+ 57 - 0
src/io/fasta.rs

@@ -4,6 +4,7 @@
 //! throughout the codebase. Conversion to the 1-based noodles API is done
 //! transparently inside each function.
 
+use std::collections::HashMap;
 use std::io::{BufRead, Write};
 use std::{
     fs::File,
@@ -178,11 +179,31 @@ pub fn read_single_contig_fasta(path: &Path) -> anyhow::Result<Vec<u8>> {
     Ok(seq)
 }
 
+/// A FASTA index (`.fai`) entry containing a sequence name and length.
 pub struct FaiEntry {
+    /// Sequence name.
     pub name: String,
+
+    /// Sequence length in bases or residues.
     pub len: usize,
 }
 
+/// Reads a FASTA index (`.fai`) file.
+///
+/// # Arguments
+///
+/// * `path` - Path to the `.fai` file.
+///
+/// # Returns
+///
+/// A vector of [`FaiEntry`] parsed from the index file.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - the file cannot be opened
+/// - a line is malformed
+/// - the sequence length cannot be parsed
 pub fn read_fai(path: &Path) -> anyhow::Result<Vec<FaiEntry>> {
     use std::io::BufRead;
     let f = std::fs::File::open(path)?;
@@ -201,3 +222,39 @@ pub fn read_fai(path: &Path) -> anyhow::Result<Vec<FaiEntry>> {
         })
         .collect()
 }
+
+/// Reads a FASTA file and returns a map of record names to sequences.
+///
+/// # Arguments
+///
+/// * `path` - Path to the FASTA file.
+///
+/// # Returns
+///
+/// A `HashMap<String, String>` where:
+/// - keys are FASTA record names
+/// - values are nucleotide or amino acid sequences
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - the file cannot be opened
+/// - a FASTA record cannot be read
+/// - record names or sequences are not valid UTF-8
+pub fn fasta_records_to_hasmap(path: &Path) -> anyhow::Result<HashMap<String, String>> {
+    let mut reader = File::open(path)
+        .map(BufReader::new)
+        .map(noodles_fasta::io::Reader::new)?;
+
+    let mut store = HashMap::new();
+
+    for (i, result) in reader.records().enumerate() {
+        let record = result?;
+        let k = String::from_utf8(record.name().to_vec())
+            .with_context(|| format!("Error while parsing fasta record {} name.", i + 1))?;
+        let v = String::from_utf8(record.sequence().as_ref().to_vec())
+            .with_context(|| format!("Error while parsing fasta sequence {k}"))?;
+        store.insert(k, v);
+    }
+    Ok(store)
+}

+ 11 - 5
src/io/vcf.rs

@@ -15,7 +15,7 @@
 //! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
 //! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
 
-use std::io::{BufRead, BufReader};
+use std::{borrow::Borrow, io::{BufRead, BufReader}};
 
 use anyhow::Context;
 use log::{info, warn};
@@ -93,7 +93,10 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 ///
 /// Returns an error if writing fails or if the variants are not coordinate-sorted
 /// (required by Tabix).
-pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
+pub fn write_vcf<B: Borrow<VcfVariant>>(
+    variants: &[B], 
+    path: &str
+) -> anyhow::Result<()> {
     info!("Writing: {path}");
 
     let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
@@ -101,10 +104,12 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
 
     for variant in variants {
-        let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
-        let rname = variant.position.contig();
+        let variant_ref = variant.borrow();
+        let row = format!("{}\n", variant_ref.commun_deepvariant_clairs().into_vcf_row());
+        let rname = variant_ref.position.contig();
+
         // GenomePosition.position is 0-based; VCF POS is 1-based
-        let vcf_pos = variant.position.position as usize + 1;
+        let vcf_pos = variant_ref.position.position as usize + 1;
         let pos = Position::try_from(vcf_pos)
             .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
         writer.write_record(row.as_bytes(), &rname, pos, pos)?;
@@ -254,6 +259,7 @@ pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
 
     Ok(header)
 }
+
 #[cfg(test)]
 mod tests {
     use crate::config::Config;

+ 18 - 2
src/pipes/somatic.rs

@@ -1,5 +1,5 @@
 use crate::{
-    annotation::is_gnomad_and_constit_alt,
+    annotation::{is_gnomad_and_constit_alt, Caller},
     create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
     io::bed::read_bed,
     pipes::{Initialize, InitializeSolo, ShouldRun},
@@ -330,7 +330,6 @@ impl Run for SomaticPipe {
         //     info!("\t - {} {}: {}", coll.vcf.caller, coll.vcf.time, rm);
         // });
 
-        println!("mlfjs");
         // MASK mapq
         let (mut high_depth_ranges, mut low_quality_ranges) =
             somatic_depth_quality_ranges(&id, &config)?;
@@ -510,6 +509,23 @@ impl Run for SomaticPipe {
                 Ok(())
             })?;
 
+        // Insertion annotation
+        info!("Annotation of insertions with NanomonSV (RepeatMasker and Minimap2 on reference genome).");
+        variants_collections
+            .iter()
+            .filter(|c| {
+                matches!(
+                    c.caller,
+                    Annotation::Callers(Caller::Savana, Sample::Somatic)
+                        | Annotation::Callers(Caller::Severus, Sample::Somatic)
+                        | Annotation::Callers(Caller::NanomonSV, Sample::Somatic)
+                )
+            })
+            .try_for_each(|c| -> anyhow::Result<()> {
+                c.annotate_insertions_with_nanomonsv(&annotations, &config)?;
+                Ok(())
+            })?;
+
         annotations
             .callers_stat(Some(Box::new(caller_cat_anns)))
             .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;

+ 112 - 11
src/variant/variant_collection.rs

@@ -6,9 +6,12 @@ use std::{
     sync::Arc,
 };
 
+use crate::{
+    callers::nanomonsv::nanomonsv_insert_classify,
+    io::{tsv::TsvLine, vcf::read_vcf},
+    runners::Run,
+};
 use anyhow::Context;
-// use bgzip::{BGZFReader, BGZFWriter};
-use crate::{io::tsv::TsvLine, runners::Run};
 use bitcode::{Decode, Encode};
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
@@ -46,7 +49,6 @@ use crate::{
         writers::{finalize_bgzf_file, get_gz_writer},
     },
     positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
-    run,
 };
 
 /// A collection of VCF variants along with associated metadata.
@@ -245,6 +247,68 @@ impl VariantCollection {
         (total_bp, overlaps.len())
     }
 
+    /// Annotates insertion variants using NanomonSV `insert_classify`.
+    ///
+    /// Writes insertions to a temporary VCF, runs `insert_classify`, then stores
+    /// `INSERT_TYPE`, `RMSK_INFO`, and `TSD` annotations keyed by variant hash.
+    ///
+    /// # Errors
+    /// Fails if VCF header generation, file I/O, `insert_classify`, or VCF parsing fails.
+    pub fn annotate_insertions_with_nanomonsv(
+        &self,
+        annotations: &Annotations,
+        config: &Config,
+    ) -> anyhow::Result<()> {
+        let mut guard = TempFileGuard::new();
+        let in_tmp = guard.tmp_path(".vcf", &config.tmp_dir);
+        let output_vcf = guard.tmp_path(".vcf", &config.tmp_dir);
+
+        let variants: Vec<String> = self
+            .variants
+            .iter()
+            .filter(|v| v.alteration_category() == AlterationCategory::INS)
+            .map(|v| v.into_vcf_row())
+            .collect();
+
+        let header = vcf_header(&config.dict_file)?.join("\n");
+        let mut vcf = File::create(&in_tmp)?;
+        writeln!(vcf, "{}", header)?;
+        for row in variants {
+            println!("{row}");
+            writeln!(vcf, "{row}",)?;
+        }
+
+        nanomonsv_insert_classify(&in_tmp, &output_vcf, config).with_context(|| {
+            format!(
+                "Failed to run NanomonsSv insert classify for: {}",
+                in_tmp.display()
+            )
+        })?;
+
+        let variants = read_vcf(&output_vcf.to_string_lossy()).map_err(|e| {
+            anyhow::anyhow!(
+                "Failed to read NanomonSV VCF after annotating insertions {}.\n{e}",
+                output_vcf.display()
+            )
+        })?;
+
+        for variant in variants {
+            for info in variant.infos.0.iter() {
+                let annotation = match info {
+                    Info::INSERT_TYPE(v) => Annotation::InsertionType(v.to_string()),
+                    Info::RMSK_INFO(v) => Annotation::RepeatMasker(v.to_string()),
+                    Info::TSD(v) => Annotation::TSD(v.to_string()),
+                    _ => continue,
+                };
+                let key = variant.hash;
+                let mut anns = annotations.store.entry(key).or_default();
+                anns.push(annotation);
+            }
+        }
+
+        Ok(())
+    }
+
     fn chunk_size(&self, max_threads: u8) -> usize {
         let total_items = self.variants.len();
         let min_chunk_size = 1000;
@@ -1943,7 +2007,7 @@ impl ExternalAnnotation {
         let (sv, unfound): (Vec<VcfVariant>, Vec<VcfVariant>) =
             unfound.into_iter().partition(|v| v.has_svtype());
 
-        let min_chunk_size = 1000;
+        let min_chunk_size = 5000;
         let max_chunks = 150;
 
         let config: &Config = &self.config;
@@ -1955,7 +2019,10 @@ impl ExternalAnnotation {
                 .div_ceil(max_chunks as usize)
                 .max(min_chunk_size);
 
-            debug!("{} chunks to process.", unfound.len() / optimal_chunk_size);
+            debug!(
+                "{} chunks to process.",
+                unfound.len().div_ceil(optimal_chunk_size)
+            );
             unfound
                 .par_chunks(optimal_chunk_size)
                 .enumerate()
@@ -2008,7 +2075,8 @@ impl ExternalAnnotation {
                         }
 
                         let mut vep_job = VepJob::new(&in_tmp, &out_vep, &case_id, config);
-                        run!(config, &mut vep_job).context("Error while running VEP.")?;
+                        vep_job.run()?;
+                        // run!(config, &mut vep_job).context("Error while running VEP.")?;
 
                         let mut vep_rdr = std::io::BufReader::new(
                             fs::File::open(&out_vep).context("Can't open result file.")?,
@@ -2037,8 +2105,14 @@ impl ExternalAnnotation {
                             let k = (i + 1) as u64;
 
                             if let Some(vep_lines) = lines.get(&k) {
-                                if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
-                                    chunk_results.push((entry.hash, veps));
+                                // if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
+                                //     chunk_results.push((entry.hash, veps));
+                                // }
+                                match vep_lines.iter().map(VEP::try_from).collect() {
+                                    Ok(veps) => chunk_results.push((entry.hash, veps)),
+                                    Err(e) => {
+                                        warn!("VEP conversion failed for {}: {e}", entry.position)
+                                    }
                                 }
                             } else {
                                 warn!(
@@ -2128,8 +2202,12 @@ fn process_vep_chunk(
         )?;
     }
 
-    let mut vep_job = VepJob::new(&in_tmp, &out_vep, &case_id, config);
-    vep_job.run();
+    let mut vep_job = VepJob::new(&in_tmp, &out_vep, case_id, config);
+    vep_job.run().context(format!(
+        "Failed to run VEP annotation chunk for {case_id}: {} -> {}",
+        in_tmp.display(),
+        out_vep.display()
+    ))?;
     // match run!(config, &mut vep_job) {
     //     Ok(c) => {
     //         c.save_to_file(format!("{}/{}/log/"), config.result_dir, );
@@ -2200,7 +2278,10 @@ fn process_vep_chunk(
 mod tests {
     use super::*;
     use crate::{
-        annotation::Caller, callers::clairs::ClairS, helpers::test_init, pipes::Initialize,
+        annotation::Caller,
+        callers::{clairs::ClairS, savana::Savana},
+        helpers::test_init,
+        pipes::Initialize,
         variant::vcf_variant::Variants,
     };
 
@@ -2245,4 +2326,24 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn insertion_annot() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+        let annotations = Annotations::default();
+        let variant_collection = Savana::initialize("CHAHA", &config)?.variants(&annotations)?;
+        variant_collection.annotate_insertions_with_nanomonsv(&annotations, &config)?;
+        for (_, annot) in annotations.store.into_iter() {
+            println!(
+                "{}",
+                annot
+                    .iter()
+                    .map(|a| a.to_string())
+                    .collect::<Vec<String>>()
+                    .join(", ")
+            );
+        }
+        Ok(())
+    }
 }

+ 1 - 3
src/variant/variants_stats.rs

@@ -575,14 +575,12 @@ pub fn somatic_depth_quality_ranges(
     id: &str,
     config: &Config,
 ) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
-    println!("somatic_depth_quality_ranges");
-    // chr1..chr22 + X,Y,M
+    // chr1..chr22 + X,Y
     let contigs: Vec<String> = read_dict(&config.dict_file)?
         .into_iter()
         .filter(|(ctg, _)| ctg != "chrM")
         .map(|(sn, _ln)| sn)
         .collect();
-    println!("{contigs:#?}");
 
     let cfg = config;
 

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio