Преглед изворни кода

dbsnp parse even if no FREQ and echtvar use exec_jobs

Thomas пре 5 часа
родитељ
комит
2a7460a44d

+ 1 - 1
src/callers/clairs.rs

@@ -1192,7 +1192,7 @@ mod tests {
         test_init();
         let id = "DUMCO";
         let config = Config::default();
-        let  clairs = ClairS::initialize(id, &config)?;
+        let clairs = ClairS::initialize(id, &config)?;
 
         clairs.run_chunked_resume(60)?;
         Ok(())

+ 4 - 2
src/helpers.rs

@@ -1330,11 +1330,13 @@ pub fn format_count(n: u64) -> String {
     let s = n.to_string();
     let mut result = String::new();
     for (i, c) in s.chars().rev().enumerate() {
-        if i > 0 && i % 3 == 0 { result.push('_'); }
+        if i > 0 && i % 3 == 0 {
+            result.push('_');
+        }
         result.push(c);
     }
     result.chars().rev().collect()
-}   
+}
 
 #[cfg(test)]
 mod tests {

+ 2 - 2
src/io/readers.rs

@@ -335,8 +335,8 @@ impl TabixReader {
             None => return Ok(()),
         };
 
-        let interval_start = Position::try_from(region.range.start as usize + 1)
-            .context("region start overflow")?;
+        let interval_start =
+            Position::try_from(region.range.start as usize + 1).context("region start overflow")?;
         let interval_end =
             Position::try_from(region.range.end as usize).context("region end overflow")?;
         let interval = Interval::from(interval_start..=interval_end);

+ 2 - 2
src/io/somaticpipe_container.rs

@@ -704,8 +704,8 @@ struct PathSource(PathBuf);
 
 impl SectionSource for PathSource {
     fn open(&self) -> anyhow::Result<Box<dyn ReadSeek>> {
-        let file = File::open(&self.0)
-            .with_context(|| format!("failed to open {}", self.0.display()))?;
+        let file =
+            File::open(&self.0).with_context(|| format!("failed to open {}", self.0.display()))?;
         Ok(Box::new(file))
     }
 }

+ 9 - 6
src/io/vcf.rs

@@ -15,7 +15,10 @@
 //! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
 //! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
 
-use std::{borrow::Borrow, io::{BufRead, BufReader}};
+use std::{
+    borrow::Borrow,
+    io::{BufRead, BufReader},
+};
 
 use anyhow::Context;
 use log::{info, warn};
@@ -93,10 +96,7 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 ///
 /// Returns an error if writing fails or if the variants are not coordinate-sorted
 /// (required by Tabix).
-pub fn write_vcf<B: Borrow<VcfVariant>>(
-    variants: &[B], 
-    path: &str
-) -> anyhow::Result<()> {
+pub fn write_vcf<B: Borrow<VcfVariant>>(variants: &[B], path: &str) -> anyhow::Result<()> {
     info!("Writing: {path}");
 
     let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
@@ -105,7 +105,10 @@ pub fn write_vcf<B: Borrow<VcfVariant>>(
 
     for variant in variants {
         let variant_ref = variant.borrow();
-        let row = format!("{}\n", variant_ref.commun_deepvariant_clairs().into_vcf_row());
+        let row = format!(
+            "{}\n",
+            variant_ref.commun_deepvariant_clairs().into_vcf_row()
+        );
         let rname = variant_ref.position.contig();
 
         // GenomePosition.position is 0-based; VCF POS is 1-based

+ 3 - 1
src/pipes/somatic.rs

@@ -59,7 +59,9 @@
 //! - Pandora export helpers
 //! - Annotation matrix generation utilities
 use crate::{
-    annotation::{dbsnp::annotate_dbsnp, is_dbsnp_and_constit_alt, is_gnomad_and_constit_alt, Caller},
+    annotation::{
+        dbsnp::annotate_dbsnp, is_dbsnp_and_constit_alt, is_gnomad_and_constit_alt, Caller,
+    },
     create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
     io::bed::read_bed,
     pipes::{Initialize, InitializeSolo, ShouldRun},

+ 41 - 14
src/scan/scan.rs

@@ -48,10 +48,11 @@
 use std::collections::HashMap;
 use std::path::Path;
 use std::sync::atomic::{AtomicU64, Ordering};
-use std::{fmt, fs, io::Write};
+use std::{fmt, fs};
 
 use anyhow::Context;
 use log::{debug, error, info};
+use noodles_core::Position;
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 use rayon::slice::ParallelSliceMut;
 use rust_htslib::bam::ext::BamRecordExtensions;
@@ -61,7 +62,7 @@ use crate::helpers::{bam_contigs, get_genome_sizes, is_file_older};
 use crate::io::bam::fb_inv_from_record;
 use crate::io::readers::get_gz_reader;
 use crate::io::tsv::TsvLine;
-use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
+use crate::io::writers::{BgzTabixWriter, IndexFormat};
 use crate::math::filter_outliers_modified_z_score_with_indices;
 
 use crate::config::Config;
@@ -727,8 +728,10 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
         .par_iter()
         .try_for_each(|(contig, length)| -> anyhow::Result<()> {
             let out_file = config.somatic_scan_solo_count_file(id, time_point, contig);
+            let out_tbi_file = format!("{out_file}.tbi");
 
             if !is_file_older(&out_file, bam_path, false).unwrap_or(true)
+                && !is_file_older(&out_tbi_file, bam_path, false).unwrap_or(true)
                 && !config.somatic_scan_force
             {
                 return Ok(());
@@ -773,9 +776,8 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
                 .with_context(|| format!("temp path is not valid UTF-8: {tmp_path:?}"))?;
 
             {
-                let mut writer = get_gz_writer(tmp_path_str, true).map_err(|e| {
-                    anyhow::anyhow!("failed to open BGZF temp file: {tmp_path_str}: {e:#}")
-                })?;
+                let mut writer = BgzTabixWriter::new(tmp_path_str, IndexFormat::Tbi, true)
+                    .with_context(|| format!("failed to open BGZF/tabix temp file: {tmp_path_str}"))?;
 
                 for bin in &bins {
                     let row = bin.to_tsv_row();
@@ -803,18 +805,25 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
                         bin.contig
                     );
 
-                    writeln!(writer, "{row}")
-                        .with_context(|| format!("failed writing {contig} row to {tmp_path_str}"))?;
+                    let tabix_start = bin.start as usize + 1;
+                    let tabix_end = bin.end as usize + 1;
+                    let start = Position::try_from(tabix_start).with_context(|| {
+                        format!("invalid tabix start for {contig}: {tabix_start}")
+                    })?;
+                    let end = Position::try_from(tabix_end)
+                        .with_context(|| format!("invalid tabix end for {contig}: {tabix_end}"))?;
+                    let line = format!("{row}\n");
+
+                    writer
+                        .write_record(line.as_bytes(), contig, start, end)
+                        .with_context(|| {
+                            format!("failed writing {contig} row to {tmp_path_str}")
+                        })?;
                 }
 
                 writer
-                    .flush()
-                    .with_context(|| format!("failed flushing BGZF writer: {tmp_path_str}"))?;
-
-                finalize_bgzf_file(writer, tmp_path_str)?;
-                // writer.close().with_context(|| {
-                //     format!("failed closing/finalizing BGZF writer: {tmp_path_str}")
-                // })?;
+                    .finish()
+                    .with_context(|| format!("failed finalizing BGZF/tabix temp file: {tmp_path_str}"))?;
             }
 
             validate_count_file(tmp_path_str, contig, bins.len())
@@ -825,8 +834,26 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
                     )
                 })?;
 
+            let tmp_tbi_path = format!("{tmp_path_str}.tbi");
+
+            if Path::new(&out_file).exists() {
+                std::fs::remove_file(&out_file)
+                    .with_context(|| format!("failed to remove existing output: {out_file}"))?;
+            }
+            if Path::new(&out_tbi_file).exists() {
+                std::fs::remove_file(&out_tbi_file).with_context(|| {
+                    format!("failed to remove existing tabix index: {out_tbi_file}")
+                })?;
+            }
+
             std::fs::rename(&tmp_path, &out_file)
                 .with_context(|| format!("failed atomic rename {tmp_path_str} -> {out_file}"))?;
+            if let Err(e) = std::fs::rename(&tmp_tbi_path, &out_tbi_file) {
+                let _ = std::fs::remove_file(&out_file);
+                return Err(e).with_context(|| {
+                    format!("failed atomic rename {tmp_tbi_path} -> {out_tbi_file}")
+                });
+            }
 
             Ok(())
         })?;

+ 1 - 1
src/variant/mod.rs

@@ -40,6 +40,6 @@
 //! ```
 
 pub mod variant_collection;
+pub mod variant_representation;
 pub mod variants_stats;
 pub mod vcf_variant;
-pub mod variant_representation;

+ 190 - 183
src/variant/variant_collection.rs

@@ -8,8 +8,10 @@ use std::{
 
 use crate::{
     callers::nanomonsv::nanomonsv_insert_classify,
+    commands::{exec_jobs, AnyJob},
     io::{somaticpipe_container::PandoraReader, tsv::TsvLine, vcf::read_vcf},
-    runners::Run, variant::vcf_variant::VariantId,
+    runners::Run,
+    variant::vcf_variant::VariantId,
 };
 use anyhow::Context;
 use bitcode::{Decode, Encode};
@@ -26,7 +28,7 @@ use super::vcf_variant::{
 use crate::{
     annotation::{
         cosmic::Cosmic,
-        echtvar::{parse_echtvar_val, run_echtvar},
+        echtvar::{parse_echtvar_val, EchtvarJob},
         gnomad::GnomAD,
         parse_trinuc,
         vep::{get_best_vep, VepJob, VepLine, VEP},
@@ -548,148 +550,143 @@ impl VariantCollection {
         let chunk_size = self.chunk_size();
         let reference = config.reference.clone();
 
-        self.variants
-            .par_chunks(chunk_size)
-            .for_each_init(
-                || {
-                    let bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path);
-                    let fasta = noodles_fasta::io::indexed_reader::Builder::default()
-                        .build_from_path(&reference);
-                    (bam, fasta)
-                },
-                |(bam_res, fasta_res), chunk| {
-                    let Ok(ref mut bam) = bam_res else {
-                        error!("BAM reader unavailable for chunk");
-                        return;
-                    };
-                    let Ok(ref mut fasta_reader) = fasta_res else {
-                        error!("FASTA reader unavailable for chunk");
-                        return;
-                    };
+        self.variants.par_chunks(chunk_size).for_each_init(
+            || {
+                let bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path);
+                let fasta = noodles_fasta::io::indexed_reader::Builder::default()
+                    .build_from_path(&reference);
+                (bam, fasta)
+            },
+            |(bam_res, fasta_res), chunk| {
+                let Ok(ref mut bam) = bam_res else {
+                    error!("BAM reader unavailable for chunk");
+                    return;
+                };
+                let Ok(ref mut fasta_reader) = fasta_res else {
+                    error!("FASTA reader unavailable for chunk");
+                    return;
+                };
 
-                    for var in chunk {
-                        let key = var.hash;
-                        let mut anns = annotations.store.entry(key).or_default();
+                for var in chunk {
+                    let key = var.hash;
+                    let mut anns = annotations.store.entry(key).or_default();
 
-                        if anns
-                            .iter()
-                            .filter(|e| {
-                                matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
-                            })
-                            .count()
-                            == 2
-                        {
-                            continue;
-                        }
+                    if anns
+                        .iter()
+                        .filter(|e| {
+                            matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
+                        })
+                        .count()
+                        == 2
+                    {
+                        continue;
+                    }
 
-                        let result = (|| -> anyhow::Result<()> {
-                            match var.alteration_category() {
-                                AlterationCategory::SNV => {
-                                    let pileup = counts_at(
+                    let result = (|| -> anyhow::Result<()> {
+                        match var.alteration_category() {
+                            AlterationCategory::SNV => {
+                                let pileup =
+                                    counts_at(bam, &var.position.contig(), var.position.position)?;
+                                let alt_seq = var.alternative.to_string();
+                                let (depth, alt) =
+                                    pileup.into_iter().fold((0, 0), folder(&alt_seq));
+                                anns.push(Annotation::ConstitDepth(depth as u16));
+                                anns.push(Annotation::ConstitAlt(alt as u16));
+                            }
+                            AlterationCategory::DEL => {
+                                if let Some(del_repr) = var.deletion_desc() {
+                                    let len = var.deletion_len().unwrap_or_default();
+
+                                    let pileup_start = crate::collection::bam::nt_pileup_new(
                                         bam,
                                         &var.position.contig(),
-                                        var.position.position,
+                                        del_repr.start.saturating_sub(1),
+                                        false,
                                     )?;
-                                    let alt_seq = var.alternative.to_string();
-                                    let (depth, alt) =
-                                        pileup.into_iter().fold((0, 0), folder(&alt_seq));
-                                    anns.push(Annotation::ConstitDepth(depth as u16));
-                                    anns.push(Annotation::ConstitAlt(alt as u16));
-                                }
-                                AlterationCategory::DEL => {
-                                    if let Some(del_repr) = var.deletion_desc() {
-                                        let len = var.deletion_len().unwrap_or_default();
 
-                                        let pileup_start = crate::collection::bam::nt_pileup_new(
-                                            bam,
-                                            &var.position.contig(),
-                                            del_repr.start.saturating_sub(1),
-                                            false,
-                                        )?;
+                                    let pileup_end = crate::collection::bam::nt_pileup_new(
+                                        bam,
+                                        &var.position.contig(),
+                                        del_repr.end.saturating_sub(1),
+                                        false,
+                                    )?;
 
-                                        let pileup_end = crate::collection::bam::nt_pileup_new(
-                                            bam,
+                                    let tol = if len > 1 {
+                                        let seq = crate::io::fasta::sequence_range(
+                                            fasta_reader,
                                             &var.position.contig(),
-                                            del_repr.end.saturating_sub(1),
-                                            false,
+                                            del_repr.start.saturating_sub(1) as usize,
+                                            del_repr.end.saturating_sub(1) as usize,
                                         )?;
-
-                                        let tol = if len > 1 {
-                                            let seq = crate::io::fasta::sequence_range(
-                                                fasta_reader,
-                                                &var.position.contig(),
-                                                del_repr.start.saturating_sub(1) as usize,
-                                                del_repr.end.saturating_sub(1) as usize,
-                                            )?;
-                                            match detect_repetition(&seq) {
-                                                Repeat::None => 0,
-                                                Repeat::RepOne(_, _) => 3,
-                                                Repeat::RepTwo(_, _) => 3,
-                                            }
-                                        } else {
-                                            0
-                                        };
-
-                                        let alt: u32 = pileup_start
-                                            .iter()
-                                            .map(|pb| match pb {
-                                                crate::collection::bam::PileBase::Del((_qn, l))
-                                                    if *l >= len.saturating_sub(tol).max(1)
-                                                        && *l <= len + tol =>
-                                                {
-                                                    1
-                                                }
-                                                _ => 0,
-                                            })
-                                            .sum();
-
-                                        let depth = pileup_start.len().max(pileup_end.len());
-                                        anns.push(Annotation::ConstitAlt(alt as u16));
-                                        anns.push(Annotation::ConstitDepth(depth as u16));
+                                        match detect_repetition(&seq) {
+                                            Repeat::None => 0,
+                                            Repeat::RepOne(_, _) => 3,
+                                            Repeat::RepTwo(_, _) => 3,
+                                        }
                                     } else {
-                                        anns.push(Annotation::ConstitAlt(0_u16));
-                                        anns.push(Annotation::ConstitDepth(111_u16));
-                                    }
+                                        0
+                                    };
+
+                                    let alt: u32 = pileup_start
+                                        .iter()
+                                        .map(|pb| match pb {
+                                            crate::collection::bam::PileBase::Del((_qn, l))
+                                                if *l >= len.saturating_sub(tol).max(1)
+                                                    && *l <= len + tol =>
+                                            {
+                                                1
+                                            }
+                                            _ => 0,
+                                        })
+                                        .sum();
+
+                                    let depth = pileup_start.len().max(pileup_end.len());
+                                    anns.push(Annotation::ConstitAlt(alt as u16));
+                                    anns.push(Annotation::ConstitDepth(depth as u16));
+                                } else {
+                                    anns.push(Annotation::ConstitAlt(0_u16));
+                                    anns.push(Annotation::ConstitDepth(111_u16));
                                 }
-                                AlterationCategory::INS => {
-                                    let normal_pileup = crate::collection::bam::nt_pileup_new(
-                                        bam,
-                                        &var.position.contig(),
-                                        var.position.position,
-                                        false,
-                                    )?;
-                                    let normal_depth = normal_pileup.len();
-                                    let alt_seq = var.inserted_seq().unwrap_or_default();
-                                    let alt_seq = alt_seq.as_bytes().to_vec();
-
-                                    let mut normal_n_alt = 0;
-                                    for pile in normal_pileup {
-                                        if let PileBase::Ins { len, seq } = pile {
-                                            if let Some(seq) = seq {
-                                                let dist = levenshtein_exp(&alt_seq, &seq);
-                                                let dist_frac = dist as f64 / len as f64;
-                                                if dist_frac < 0.1 {
-                                                    normal_n_alt += 1;
-                                                }
-                                            } else if alt_seq.len() as u32 == len {
+                            }
+                            AlterationCategory::INS => {
+                                let normal_pileup = crate::collection::bam::nt_pileup_new(
+                                    bam,
+                                    &var.position.contig(),
+                                    var.position.position,
+                                    false,
+                                )?;
+                                let normal_depth = normal_pileup.len();
+                                let alt_seq = var.inserted_seq().unwrap_or_default();
+                                let alt_seq = alt_seq.as_bytes().to_vec();
+
+                                let mut normal_n_alt = 0;
+                                for pile in normal_pileup {
+                                    if let PileBase::Ins { len, seq } = pile {
+                                        if let Some(seq) = seq {
+                                            let dist = levenshtein_exp(&alt_seq, &seq);
+                                            let dist_frac = dist as f64 / len as f64;
+                                            if dist_frac < 0.1 {
                                                 normal_n_alt += 1;
                                             }
+                                        } else if alt_seq.len() as u32 == len {
+                                            normal_n_alt += 1;
                                         }
                                     }
-                                    anns.push(Annotation::ConstitAlt(normal_n_alt as u16));
-                                    anns.push(Annotation::ConstitDepth(normal_depth as u16));
                                 }
-                                _ => (),
+                                anns.push(Annotation::ConstitAlt(normal_n_alt as u16));
+                                anns.push(Annotation::ConstitDepth(normal_depth as u16));
                             }
-                            Ok(())
-                        })();
-
-                        if let Err(e) = result {
-                            warn!("BAM annotation failed for {}: {e}", var.variant_id());
+                            _ => (),
                         }
+                        Ok(())
+                    })();
+
+                    if let Err(e) = result {
+                        warn!("BAM annotation failed for {}: {e}", var.variant_id());
                     }
-                },
-            );
+                }
+            },
+        );
         Ok(())
     }
 
@@ -1884,71 +1881,81 @@ impl ExternalAnnotation {
         let optimal_chunk_size = unfound.len().div_ceil(max_chunks as usize);
         let optimal_chunk_size = optimal_chunk_size.max(min_chunk_size);
 
-        let config = Arc::new(&self.config);
-
-        let results = unfound
-            .par_chunks(optimal_chunk_size)
-            .flat_map(|chunk| -> anyhow::Result<Vec<_>> {
-                let in_tmp = temp_dir.join(format!("echtvar_in_{}.vcf", Uuid::new_v4()));
-                let out_tmp = temp_dir.join(format!("echtvar_out_{}.vcf.gz", Uuid::new_v4()));
-
-                // Write input VCF
-                let mut vcf = File::create(&in_tmp)?;
-                writeln!(vcf, "{}", header)?;
-                for (i, row) in chunk.iter().enumerate() {
-                    writeln!(
-                        vcf,
-                        "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
-                        row.position.contig(),
-                        row.position.position + 1, // vcf
-                        i + 1,
-                        row.reference,
-                        row.alternative
-                    )?;
-                }
+        let mut chunk_metadata = Vec::new();
+        let mut jobs: Vec<Vec<Box<dyn AnyJob>>> = Vec::new();
+
+        for chunk in unfound.chunks(optimal_chunk_size) {
+            let in_tmp = temp_dir.join(format!("echtvar_in_{}.vcf", Uuid::new_v4()));
+            let out_tmp = temp_dir.join(format!("echtvar_out_{}.vcf.gz", Uuid::new_v4()));
+
+            // Write input VCF
+            let mut vcf = File::create(&in_tmp)?;
+            writeln!(vcf, "{}", header)?;
+            for (i, row) in chunk.iter().enumerate() {
+                writeln!(
+                    vcf,
+                    "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
+                    row.position.contig(),
+                    row.position.position + 1, // vcf
+                    i + 1,
+                    row.reference,
+                    row.alternative
+                )?;
+            }
 
-                // Run echtvar
-                run_echtvar(&in_tmp, &out_tmp, &config)?;
-                // fs::remove_file(in_tmp)?;
+            let chunk_hashes = chunk.iter().map(|row| row.hash).collect::<Vec<_>>();
+            let job = EchtvarJob {
+                input_vcf: in_tmp.clone(),
+                output_vcf: out_tmp.clone(),
+                config: self.config.clone(),
+            };
 
-                // Parse echtvar output
-                let mut echtvar_rdr =
-                    std::io::BufReader::new(get_reader(out_tmp.to_str().unwrap())?);
-                let mut echtvar_line = TsvLine::new();
-                let mut chunk_results = Vec::new();
-                let mut i = 0usize;
+            chunk_metadata.push((chunk_hashes, out_tmp));
+            jobs.push(crate::jobs_seq![job]);
+        }
 
-                while echtvar_line.read(&mut echtvar_rdr)? {
-                    if echtvar_line.as_str().starts_with('#') || echtvar_line.as_str().is_empty() {
-                        continue;
-                    }
-                    let row: crate::io::vcf::VCFRow = echtvar_line
-                        .as_str()
-                        .parse()
-                        .context("Failed to parse echtvar VCF row")?;
-
-                    // Verify that the ID corresponds to the input
-                    let id: usize = row.id.parse().context("Failed to parse ID")?;
-                    if id != i + 1 {
-                        return Err(anyhow::anyhow!(
-                            "Echtvar output ID {} does not match expected ID {}",
-                            id,
-                            i + 1
-                        ));
-                    }
+        exec_jobs(
+            jobs,
+            self.config.slurm_runner,
+            self.config.slurm_max_par.into(),
+        )
+        .context("Failed to run EchtVar chunked annotation")?;
+
+        let mut results = Vec::new();
+        for (chunk_hashes, out_tmp) in chunk_metadata {
+            // Parse echtvar output
+            let mut echtvar_rdr = std::io::BufReader::new(get_reader(out_tmp.to_str().unwrap())?);
+            let mut echtvar_line = TsvLine::new();
+            let mut i = 0usize;
+
+            while echtvar_line.read(&mut echtvar_rdr)? {
+                if echtvar_line.as_str().starts_with('#') || echtvar_line.as_str().is_empty() {
+                    continue;
+                }
+                let row: crate::io::vcf::VCFRow = echtvar_line
+                    .as_str()
+                    .parse()
+                    .context("Failed to parse echtvar VCF row")?;
+
+                // Verify that the ID corresponds to the input
+                let id: usize = row.id.parse().context("Failed to parse ID")?;
+                if id != i + 1 {
+                    return Err(anyhow::anyhow!(
+                        "Echtvar output ID {} does not match expected ID {}",
+                        id,
+                        i + 1
+                    ));
+                }
 
-                    let (cosmic, gnomad) = parse_echtvar_val(&row.info)?;
-                    let hash = chunk[i].hash;
-                    i += 1;
+                let (cosmic, gnomad) = parse_echtvar_val(&row.info)?;
+                let hash = chunk_hashes[i];
+                i += 1;
 
-                    chunk_results.push((hash, cosmic, gnomad));
-                }
+                results.push((hash, cosmic, gnomad));
+            }
 
-                // fs::remove_file(out_tmp)?;
-                Ok(chunk_results)
-            })
-            .flatten()
-            .collect::<Vec<_>>();
+            // fs::remove_file(out_tmp)?;
+        }
 
         for (hash, cosmic, gnomad) in results {
             // let modified = chrono::Utc::now().to_rfc3339();

+ 203 - 200
src/variant/variant_representation.rs

@@ -1,200 +1,203 @@
-use ordered_float::OrderedFloat;
-use serde::Serialize;
-
-use crate::{
-    helpers::estimate_shannon_entropy,
-    positions::contig_to_num,
-    variant::{variant_collection::Variant, vcf_variant::{AlterationCategory, BNDDesc}},
-};
-
-#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize)]
-pub enum Representation {
-    #[default]
-    SNV,
-    Deletion {
-        start: u32,
-        end: u32,
-        len: u32,
-        inserted_sequence: Option<String>,
-    },
-    DeletionInversion {
-        start: u32,
-        start_sens: bool,
-        end: u32,
-        end_sens: bool,
-        len: u32,
-        inserted_sequence: Option<String>,
-    },
-    Insertion {
-        len: u32,
-        entropy: OrderedFloat<f64>,
-        inserted_sequence: String,
-    },
-    Translocation {
-        left_sens: bool,
-        left_contig: String,
-        left_position: u32,
-        left_genes: Vec<String>,
-        right_contig: String,
-        right_position: u32,
-        right_sens: bool,
-        right_genes: Vec<String>,
-        inserted_sequence: Option<String>,
-    },
-    Duplication {
-        start: u32,
-        end: u32,
-        len: u32,
-    },
-    Inversion {
-        inserted_sequence: Option<String>,
-        start: u32,
-        start_sens: bool,
-        end: u32,
-        end_sens: bool,
-        len: u32,
-    },
-}
-
-impl TryFrom<&Variant> for Representation {
-    type Error = anyhow::Error;
-
-    fn try_from(variant: &Variant) -> anyhow::Result<Self> {
-        let mut contig = variant.position.contig;
-        let mut position = variant.position.position + 1;
-
-        use AlterationCategory::*;
-
-        let category = variant
-            .alteration_category()
-            .first()
-            .cloned()
-            .ok_or_else(|| anyhow::anyhow!("variant has no alteration category"))?;
-
-        let repr: Option<Representation> = match category {
-            SNV => Some(Representation::SNV),
-
-            DEL => variant
-                .vcf_variants
-                .iter()
-                .find_map(|v| v.deletion_desc())
-                .map(|d| Representation::Deletion {
-                    start: d.start,
-                    end: d.end,
-                    len: variant.deletion_length().unwrap_or_default(),
-                    inserted_sequence: None,
-                }),
-
-            INS => variant.vcf_variants.iter().find_map(|v| {
-                v.inserted_seq().map(|inserted_sequence| {
-                    let entropy = estimate_shannon_entropy(&inserted_sequence).into();
-
-                    Representation::Insertion {
-                        len: inserted_sequence.len() as u32,
-                        entropy,
-                        inserted_sequence,
-                    }
-                })
-            }),
-
-            TRL => variant
-                .vcf_variants
-                .iter()
-                .find_map(|v| v.bnd_desc().ok())
-                .map(|d| {
-                    let d = normalize_bnd(d);
-
-                    contig = contig_to_num(&d.a_contig);
-                    position = d.a_position;
-
-                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
-
-                    Representation::Translocation {
-                        left_sens: d.a_sens,
-                        left_contig: d.a_contig,
-                        left_position: d.a_position,
-                        right_contig: d.b_contig,
-                        right_position: d.b_position,
-                        right_sens: d.b_sens,
-                        inserted_sequence,
-                        left_genes: Vec::new(),
-                        right_genes: Vec::new(),
-                    }
-                }),
-
-            DUP => variant
-                .vcf_variants
-                .iter()
-                .find_map(|v| v.bnd_desc().ok())
-                .map(|d| Representation::Duplication {
-                    start: d.b_position,
-                    end: d.a_position,
-                    len: d.a_position.saturating_sub(d.b_position),
-                }),
-
-            DELINV => variant
-                .vcf_variants
-                .iter()
-                .find_map(|v| v.bnd_desc().ok())
-                .map(|d| {
-                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
-
-                    let d = normalize_bnd(d);
-
-                    position = d.a_position;
-
-                    Representation::DeletionInversion {
-                        inserted_sequence,
-                        start: d.a_position,
-                        start_sens: d.a_sens,
-                        end: d.b_position,
-                        end_sens: d.b_sens,
-                        len: d.b_position.saturating_sub(d.a_position),
-                    }
-                }),
-
-            INV => variant
-                .vcf_variants
-                .iter()
-                .find_map(|v| v.bnd_desc().ok())
-                .map(|d| {
-                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
-
-                    let d = normalize_bnd(d);
-
-                    position = d.a_position;
-
-                    Representation::Inversion {
-                        inserted_sequence,
-                        start: d.a_position,
-                        start_sens: d.a_sens,
-                        end: d.b_position,
-                        end_sens: d.b_sens,
-                        len: d.b_position.saturating_sub(d.a_position),
-                    }
-                }),
-
-            CNV | BND | Other => None,
-        };
-
-        repr.ok_or_else(|| {
-            anyhow::anyhow!(
-                "could not build Representation for category {:?} at {}:{}",
-                category,
-                contig,
-                position
-            )
-        })
-    }
-}
-
-fn normalize_bnd(d: BNDDesc) -> BNDDesc {
-    let a = contig_to_num(&d.a_contig);
-    let b = contig_to_num(&d.b_contig);
-
-    if a > b || (a == b && d.a_position > d.b_position) {
-        d.rc()
-    } else {
-        d
-    }
-}
+use ordered_float::OrderedFloat;
+use serde::Serialize;
+
+use crate::{
+    helpers::estimate_shannon_entropy,
+    positions::contig_to_num,
+    variant::{
+        variant_collection::Variant,
+        vcf_variant::{AlterationCategory, BNDDesc},
+    },
+};
+
+#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize)]
+pub enum Representation {
+    #[default]
+    SNV,
+    Deletion {
+        start: u32,
+        end: u32,
+        len: u32,
+        inserted_sequence: Option<String>,
+    },
+    DeletionInversion {
+        start: u32,
+        start_sens: bool,
+        end: u32,
+        end_sens: bool,
+        len: u32,
+        inserted_sequence: Option<String>,
+    },
+    Insertion {
+        len: u32,
+        entropy: OrderedFloat<f64>,
+        inserted_sequence: String,
+    },
+    Translocation {
+        left_sens: bool,
+        left_contig: String,
+        left_position: u32,
+        left_genes: Vec<String>,
+        right_contig: String,
+        right_position: u32,
+        right_sens: bool,
+        right_genes: Vec<String>,
+        inserted_sequence: Option<String>,
+    },
+    Duplication {
+        start: u32,
+        end: u32,
+        len: u32,
+    },
+    Inversion {
+        inserted_sequence: Option<String>,
+        start: u32,
+        start_sens: bool,
+        end: u32,
+        end_sens: bool,
+        len: u32,
+    },
+}
+
+impl TryFrom<&Variant> for Representation {
+    type Error = anyhow::Error;
+
+    fn try_from(variant: &Variant) -> anyhow::Result<Self> {
+        let mut contig = variant.position.contig;
+        let mut position = variant.position.position + 1;
+
+        use AlterationCategory::*;
+
+        let category = variant
+            .alteration_category()
+            .first()
+            .cloned()
+            .ok_or_else(|| anyhow::anyhow!("variant has no alteration category"))?;
+
+        let repr: Option<Representation> = match category {
+            SNV => Some(Representation::SNV),
+
+            DEL => variant
+                .vcf_variants
+                .iter()
+                .find_map(|v| v.deletion_desc())
+                .map(|d| Representation::Deletion {
+                    start: d.start,
+                    end: d.end,
+                    len: variant.deletion_length().unwrap_or_default(),
+                    inserted_sequence: None,
+                }),
+
+            INS => variant.vcf_variants.iter().find_map(|v| {
+                v.inserted_seq().map(|inserted_sequence| {
+                    let entropy = estimate_shannon_entropy(&inserted_sequence).into();
+
+                    Representation::Insertion {
+                        len: inserted_sequence.len() as u32,
+                        entropy,
+                        inserted_sequence,
+                    }
+                })
+            }),
+
+            TRL => variant
+                .vcf_variants
+                .iter()
+                .find_map(|v| v.bnd_desc().ok())
+                .map(|d| {
+                    let d = normalize_bnd(d);
+
+                    contig = contig_to_num(&d.a_contig);
+                    position = d.a_position;
+
+                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
+
+                    Representation::Translocation {
+                        left_sens: d.a_sens,
+                        left_contig: d.a_contig,
+                        left_position: d.a_position,
+                        right_contig: d.b_contig,
+                        right_position: d.b_position,
+                        right_sens: d.b_sens,
+                        inserted_sequence,
+                        left_genes: Vec::new(),
+                        right_genes: Vec::new(),
+                    }
+                }),
+
+            DUP => variant
+                .vcf_variants
+                .iter()
+                .find_map(|v| v.bnd_desc().ok())
+                .map(|d| Representation::Duplication {
+                    start: d.b_position,
+                    end: d.a_position,
+                    len: d.a_position.saturating_sub(d.b_position),
+                }),
+
+            DELINV => variant
+                .vcf_variants
+                .iter()
+                .find_map(|v| v.bnd_desc().ok())
+                .map(|d| {
+                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
+
+                    let d = normalize_bnd(d);
+
+                    position = d.a_position;
+
+                    Representation::DeletionInversion {
+                        inserted_sequence,
+                        start: d.a_position,
+                        start_sens: d.a_sens,
+                        end: d.b_position,
+                        end_sens: d.b_sens,
+                        len: d.b_position.saturating_sub(d.a_position),
+                    }
+                }),
+
+            INV => variant
+                .vcf_variants
+                .iter()
+                .find_map(|v| v.bnd_desc().ok())
+                .map(|d| {
+                    let inserted_sequence = (!d.added_nt.is_empty()).then(|| d.added_nt.clone());
+
+                    let d = normalize_bnd(d);
+
+                    position = d.a_position;
+
+                    Representation::Inversion {
+                        inserted_sequence,
+                        start: d.a_position,
+                        start_sens: d.a_sens,
+                        end: d.b_position,
+                        end_sens: d.b_sens,
+                        len: d.b_position.saturating_sub(d.a_position),
+                    }
+                }),
+
+            CNV | BND | Other => None,
+        };
+
+        repr.ok_or_else(|| {
+            anyhow::anyhow!(
+                "could not build Representation for category {:?} at {}:{}",
+                category,
+                contig,
+                position
+            )
+        })
+    }
+}
+
+fn normalize_bnd(d: BNDDesc) -> BNDDesc {
+    let a = contig_to_num(&d.a_contig);
+    let b = contig_to_num(&d.b_contig);
+
+    if a > b || (a == b && d.a_position > d.b_position) {
+        d.rc()
+    } else {
+        d
+    }
+}