//! # 🧬 Long-read Somatic Variant Calling and Analysis Framework
//!
//! This Rust library provides a modular, parallelizable framework for somatic variant calling,
//! annotation, and interpretation from long-read sequencing data. It is designed to support full
//! pipelines for research and clinical workflows across multiple variant callers and analysis stages.
//!
//! The library also serves as an extensible platform that developers can leverage to add custom
//! features, integrate new tools, and tailor workflows to specific use cases.
//!
//! ## 🧩 Key Features
//!
//! - **POD5 Demultiplexing and Alignment**: End-to-end support for processing ONT POD5 files:
//!   - Barcode-aware demultiplexing using metadata CSVs
//!   - POD5 subsetting and organization by case
//!   - Integration with basecallers (e.g., Dorado) for read alignment
//! - **Pipeline Management**: Full orchestration of Dockerized execution pipelines for tools such as ClairS, Nanomonsv, DeepVariant, Savana, Modkit, and Severus.
//! - **Flexible Configuration**: Centralized configuration system (`Config`, `CollectionsConfig`) for all modules and pipelines.
//! - **Input Abstraction**: Unified handling of BAM, POD5, and VCF file collections across cohorts and directories.
//! - **Variant Processing**: Modular loading, filtering, statistical analysis, and annotation of somatic and germline variants.
//! - **Haplotype Phasing and Methylation**: LongPhase-based phasing and Modkit methylation pileups, with multi-threaded pileup and aggregation.
//! - **Parallel Execution**: Uses `rayon` for efficient multicore parallelization over large cohorts and tasks.
//!
//! ## 📚 Module Highlights
//!
//! - `callers`: Interfaces to variant calling tools (ClairS, DeepVariant, Nanomonsv, Savana, etc.)
//! - `runners`: Pipeline runners (e.g., `Somatic`, `SeverusSolo`, `LongphasePhase`) that manage end-to-end execution.
//! - `collection`: Organizes input data across BAMs, VCFs, and POD5 files with auto-detection of completed runs.
//! - `annotation`: VEP line parsing and high-level annotation aggregation.
//! - `pipes`: Composition modules for executing pipelines across callers and post-processing steps.
//! - `functions`: Custom logic for genome assembly, entropy estimation, and internal tooling.
//! - `positions`, `variant`, `helpers`: Utilities for SV modeling, variant filtering, position overlap logic, and helper methods.
//!
//! ## ⚡ Workflow Overview
//!
//! ### 1. 📦 From POD5 to BAM Alignment
//!
//! - **Demultiplexing**: POD5 files are subset and demultiplexed using barcodes (via CSV metadata).
//! - **Flowcell Case Management**: Each sample is identified by a [`collection::pod5::FlowCellCase`] containing its ID, time point, and POD5 directory.
//! - **Alignment**: The [`commands::dorado::Dorado`] module handles alignment of POD5 reads to the reference genome, producing BAMs.
//!
//! ```rust
//! let case = FlowCellCase {
//!     id: "PATIENT1".to_string(),
//!     time_point: "diag".to_string(),
//!     barcode: "01".to_string(),
//!     pod_dir: "...".into(),
//! };
//! Dorado::init(case, Config::default())?.run_pipe()?;
//! ```
//!
//! ### 2. 🧬 Variant Calling (BAM ➝ VCF)
//!
//! Using the aligned BAMs, multiple variant callers can be run in parallel. The [`callers`] and [`runners`] modules support:
//!
//! - **ClairS** – somatic small variant calling with LongPhase haplotagging
//! - **Nanomonsv** – structural variants (SV)
//! - **DeepVariant** – germline small variants
//! - **Savana** – SVs and copy number variations (CNV)
//! - **Modkit** – methylation pileups
//! - **LongPhase** – phasing and modcalling
//!
//! All workflows can be triggered per-case, as below, or per-cohort using the `Collections` or `Somatic` runners (see the sketch that follows).
//!
//! ```rust
//! ClairS::initialize("PATIENT1", Config::default())?.run()?;
//! NanomonSV::initialize("PATIENT1", Config::default())?.run()?;
//! ```
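//!
//! Per-cohort execution can be parallelized with `rayon`. The following is a minimal sketch that
//! mirrors the accessors used in the integration tests (`Collections::new`, `bam.by_id_completed`,
//! `ClairS::initialize`); the `15.0`/`10.0` completion thresholds and the error handling are
//! illustrative, not a prescribed API.
//!
//! ```rust
//! use rayon::prelude::*;
//!
//! let collections = Collections::new(CollectionsConfig::default())?;
//! // Restrict to cases whose BAMs look complete, then run ClairS on each case in parallel.
//! collections.bam.by_id_completed(15.0, 10.0).par_iter().for_each(|bam| {
//!     match ClairS::initialize(&bam.id, Config::default()) {
//!         Ok(mut clairs) => {
//!             if let Err(e) = clairs.run() {
//!                 log::error!("{} {e}", bam.id);
//!             }
//!         }
//!         Err(e) => log::error!("{} {e}", bam.id),
//!     }
//! });
//! ```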
//!
//! ### 3. 📈 Aggregation & Statistics (VCF ➝ JSON / Stats)
//!
//! After variant calling:
//!
//! - Annotate with VEP ([`annotation`] module)
//! - Load and filter with [`variant::variant_collection`]
//! - Compute variant and region-level stats (e.g., mutation rates, alteration categories, coding overlaps)
//!
//! ```rust
//! let variants = Variants::load_from_json("/path/to/somatic_variants.json.gz")?;
//! let stats = VariantsStats::new(&variants, "PATIENT1", &config)?;
//! stats.save_to_json("/output/path/stats.json.gz")?;
//! ```
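//!
//! A single line of VEP text output can also be parsed directly into the intermediate
//! [`annotation::vep::VepLine`] and the aggregated `VEP` record. This is a minimal sketch that
//! mirrors the `vep_line` integration test below; the sample line is taken verbatim from that test.
//!
//! ```rust
//! use annotation::vep::{VepLine, VEP};
//!
//! let line = "chr2_1922358_-/T\tchr2:1922357-1922358\tT\tMYT1L\tNM_001303052.2\tTranscript\tintron_variant\t-\t-\t-\t-\t-\t-\tIMPACT=MODIFIER;STRAND=-1;SYMBOL=MYT1L;SOURCE=chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz;HGVSc=NM_001303052.2:c.1619-613dup";
//! let vep_line: VepLine = line.parse()?; // raw tab-separated VEP fields
//! let vep: VEP = VEP::try_from(&vep_line)?; // typed annotation record
//! ```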
//!
//! ### 4. 🧠 Intelligent Task Management (`collection` module)
//!
//! - Auto-discovers available samples, POD5s, BAMs, and VCFs
//! - Detects missing outputs and creates task lists
//! - Tasks are parallelizable using Rayon and can be run on-demand
//!
//! ```rust
//! let mut collections = Collections::new(CollectionsConfig::default())?;
//! collections.todo()?; // Identify missing steps
//! collections.run()?;  // Run them automatically
//! ```
//!
//! ## 🔬 Testing
//!
//! Integration tests demonstrate the entire pipeline. Run with logging enabled:
//!
//! ```bash
//! export RUST_LOG=debug
//! cargo test -- --nocapture
//! ```
//!
//! ## 🧪 Example Use Cases
//!
//! - Full somatic variant calling pipeline on matched tumor/normal samples
//! - POD5-based pipeline from raw signal to variants
//! - Aggregation and annotation of SVs across a clinical cohort
//! - Methylation analysis using nanopore-specific tools
//! - Variant calling and analysis in large-scale longitudinal studies
//!
//! ## 🚀 Getting Started
//!
//! All workflows are initialized from `Config` and driven by the `Collections` structure:
//!
//! ```rust
//! let mut collections = Collections::new(CollectionsConfig::default())?;
//! collections.todo()?;
//! collections.run()?;
//! ```
//!
//! ## 🔗 References
//!
//! **Basecalling and alignment**
//! - Dorado:
//!
//! **Variant callers**
//! - ClairS:
//! - Nanomonsv:
//! - Savana:
//! - DeepVariant:
//! - DeepSomatic:
//! - LongPhase:
//! - Modkit:
//!
//! **Variant annotation**
//! - VEP:
//!
//! ---

use std::sync::{Arc, Mutex};

pub mod commands;
pub mod config;
pub mod modkit;
pub mod callers;
pub mod runners;
pub mod collection;
pub mod functions;
pub mod helpers;
pub mod variant;
pub mod io;
pub mod pipes;
pub mod positions;
pub mod annotation;
pub mod cases;
pub mod scan;
pub mod math;

#[macro_use]
extern crate lazy_static;

// DOCKER_ID lock used to kill running Docker containers when ctrl-c is pressed
lazy_static! {
    static ref DOCKER_ID: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
}

#[cfg(test)]
mod tests {
    use std::{collections::HashMap, fs, path::Path};

    use annotation::{vep::{VepLine, VEP}, Annotations};
    use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
    use collection::{bam::{counts_at, counts_ins_at, nt_pileup, WGSBam, WGSBamStats}, Initialize, InitializeSolo, Version};
    use commands::{longphase::{LongphaseConfig, LongphaseHap, LongphaseModcallSolo, LongphasePhase}, modkit::{bed_methyl, ModkitConfig}};
    use functions::assembler::{Assembler, AssemblerConfig};
    use helpers::estimate_shannon_entropy;
    use io::bed::read_bed;
    use itertools::Itertools;
    use log::{error, info, warn};
    use pipes::somatic::SomaticPipe;
    use positions::{overlaps_par, GenomePosition, GenomeRange};
    use rayon::prelude::*;
    use runners::Run;
    use variant::{variant::{Variants, VcfVariant}, variant_collection};

    use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
    use super::*;
    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam::{self, nt_pileup_new}, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{bed::bedrow_overlaps_par, dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::{AlterationCategory, BNDDesc, BNDGraph, GroupByThreshold, ToBNDGraph}, variant_collection::{group_variants_by_bnd_desc, group_variants_by_bnd_rc, Variant}, variants_stats::{self, somatic_depth_quality_ranges, VariantsStats}}};

    // export RUST_LOG="debug"
    fn init() {
        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
            .is_test(true)
            .try_init();
    }

    #[test]
    fn it_works() {
        let bam_path = "/data/longreads_basic_pipe/PARACHINI/diag/PARACHINI_diag_hs1.bam";
        modkit::modkit(bam_path);
    }

    #[test]
    fn run_dorado() -> anyhow::Result<()> {
        let case = FlowCellCase {
            id: "CONSIGNY".to_string(),
            time_point: "mrd".to_string(),
            barcode: "07".to_string(),
            pod_dir: "/data/run_data/20240326-CL/CONSIGNY-MRD-NB07_RICCO-DIAG-NB08/20240326_1355_1E_PAU78333_bc25da25/pod5_pass/barcode07".into(),
        };
        dorado::Dorado::init(case, Config::default())?.run_pipe()
    }

    #[test]
    fn pod5() -> anyhow::Result<()> {
        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).build();
        let coll = Pod5Collection::new(
            "/data/run_data",
            "/data/flow_cells.tsv",
            "/data/longreads_basic_pipe",
        )?;
        println!("{coll:#?}");
        // let runs = Runs::import_dir("/home/prom/store/banana-pool/run_data", "/data/flow_cells.tsv")?;
        Ok(())
    }

    #[test]
    fn bam() -> anyhow::Result<()> {
        init();
        let bam_collection = bam::load_bam_collection("/data/longreads_basic_pipe");
        bam_collection
            .bams
            .iter()
            // .filter(|b| matches!(b.bam_type, BamType::Panel(_)))
            .for_each(|b| println!("{b:#?}"));
        let u = bam_collection.get("PARACHINI", "mrd");
        println!("{u:#?}");
        Ok(())
    }

    #[test]
    fn vcf() -> anyhow::Result<()> {
        init();
        let mut vcf_collection = VcfCollection::new("/data/longreads_basic_pipe");
        vcf_collection.sort_by_id();
        vcf_collection
            .vcfs
            .iter()
            .for_each(|v| v.println().unwrap());
        Ok(())
    }
    // pod5 view -I /data/run_data/20240903-CL/ARMEM-DG-N02_ASSJU-DG-N03/20240903_1428_1B_PAW47629_fc24c3cf/pod5/PAW47629_fc24c3cf_77b07847_0.pod5 | head -5000 | awk '{if(NR==1){print "target,"$0}else{print "subset_1.pod5,"$0}}' > /tmp/subset_ids.csv
    // pod5 subset /data/run_data/20240903-CL/ARMEM-DG-N02_ASSJU-DG-N03/20240903_1428_1B_PAW47629_fc24c3cf/pod5/PAW47629_fc24c3cf_77b07847_0.pod5 --csv /tmp/subset_ids.csv -o /data/test_suite/pod5/muxed/
    #[test]
    fn mux() -> anyhow::Result<()> {
        init();
        let result_dir = "/data/test_suite/results".to_string();
        let cases = vec![
            FlowCellCase {
                id: "test_02".to_string(),
                time_point: "diag".to_string(),
                barcode: "02".to_string(),
                pod_dir: "/data/test_suite/pod5/muxed".into(),
            },
            FlowCellCase {
                id: "test_03".to_string(),
                time_point: "diag".to_string(),
                barcode: "03".to_string(),
                pod_dir: "/data/test_suite/pod5/muxed".into(),
            },
        ];
        cases.iter().for_each(|c| {
            let dir = format!("{result_dir}/{}", c.id);
            if Path::new(&dir).exists() {
                fs::remove_dir_all(dir).unwrap();
            }
        });
        let config = Config { result_dir, ..Default::default() };
        Dorado::from_mux(cases, config)
    }

    // #[test_log::test]
    // fn clairs() -> anyhow::Result<()> {
    //     let config = ClairSConfig {
    //         result_dir: "/data/test".to_string(),
    //         ..ClairSConfig::default()
    //     };
    //     ClairS::new("test_a", "/data/test_data/subset.bam", "/data/test_data/subset_mrd.bam", config).run()
    // }

    #[test]
    fn nanomonsv() -> anyhow::Result<()> {
        init();
        let id = "HAMROUNE";
        NanomonSV::initialize(id, Config::default())?.run()
    }

    #[test]
    fn nanomonsv_solo() -> anyhow::Result<()> {
        init();
        NanomonSVSolo::initialize("BRETON", "diag", Config::default())?.run()
    }

    // cargo test run -- --nocapture; ~/run_scripts/notify_finish.sh &
    #[test]
    fn todo_all() -> anyhow::Result<()> {
        init();
        // let config = CollectionsConfig::default();
        let config = CollectionsConfig { pod_dir: "/data/run_data".to_string(), ..Default::default() };
        info!("Running todo with config: {:#?}", config);
        let mut collections = Collections::new(config)?;
        collections.todo()?;
        collections.tasks.iter().for_each(|t| println!("{t}"));
        println!("{}", collections.tasks.len());
        Ok(())
    }

    // #[test]
    // fn todo_agg() -> anyhow::Result<()> {
    //     init();
    //     let config = CollectionsConfig::default();
    //     info!("Running todo with config: {:#?}", config);
    //     let collections = Collections::new(config)?;
    //     let agg_tasks = collections.todo_variants_agg()?;
    //     println!("{:#?}", agg_tasks);
    //     println!("{}", agg_tasks.len());
    //     Ok(())
    // }

    // #[test]
    // fn run_agg() -> anyhow::Result<()> {
    //     init();
    //     let config = CollectionsConfig {
    //         id_black_list: vec!["MANCUSO".to_string(),"HAMROUNE".to_string()],
    //         ..Default::default()
    //     };
    //     info!("Running todo with config: {:#?}", config);
    //     let mut collections = Collections::new(config)?;
    //     collections.tasks = collections.todo_variants_agg()?;
    //     collections.run()?;
    //
    //     Ok(())
    // }

    // export RUST_LOG="debug"
    #[test]
    fn run_t() -> anyhow::Result<()> {
        init();
        // let config = CollectionsConfig::default();
        let config = CollectionsConfig { pod_dir: "/data/run_data".to_string(), ..Default::default() };
        run_tasks(config)
    }

    // #[test_log::test]
    // fn bcftools_pass() {
    //     let config = BcftoolsConfig::default();
    //     let id = "RICCO";
    //     let time = "diag";
    //     let caller = "DeepVariant";
    //     // Config::default();
    //     // // let (i, o) =
    //     // let i = format!("/data/longreads_basic_pipe/{id}/{time}/nanomonsv/{id}_diag.nanomonsv.result.vcf");
    //     // let o = format!("/data/longreads_basic_pipe/{id}/{time}/nanomonsv/{id}_diag_nanomonsv_PASSED.vcf.gz");
    //     bcftools_keep_pass(&i,
&o, config).unwrap(); // } #[test] fn bam_ok() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; let mut res: Vec<_> = collections.bam.by_id_completed(15.0, 10.0).iter().map(|b| { (b.id.to_string(), b.time_point.to_string(), b.path.to_str().unwrap().to_string()) }).collect(); res.sort_by_key(|b| b.1.clone()); res.sort_by_key(|b| b.0.clone()); res.iter().for_each(|(id, tp, path)| println!("{id}\t{tp}\t{path}")); Ok(()) } #[test] fn todo_assembler() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; collections.todo_assembler()?; Ok(()) } #[test] fn sv_pon() -> anyhow::Result<()> { init(); nanomonsv_create_pon(&Config::default(), "/data/ref/hs1/nanomonsv_pon.vcf.gz") } #[test] fn todo_mod() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; collections.todo_mod_pileup(); Ok(()) } #[test] fn todo_deepv() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; let tasks = collections.todo_deepvariants(); tasks.iter().for_each(|t| info!("{t}")); info!("n tasks {}", tasks.len()); Ok(()) } #[test] fn todo_clairs() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; collections.todo_clairs().iter().for_each(|t| info!("{t}")); Ok(()) } #[test] fn run_assemblers() -> anyhow::Result<()> { Assembler::new("CAMEL".to_string(), "diag".to_string(), AssemblerConfig::default()).run() } // #[test] // fn run_dmr_par() -> anyhow::Result<()> { // init(); // let collections = Collections::new( // CollectionsConfig::default() // )?; // let tasks = collections.todo_dmr_c_diag_mrd(); // tasks.iter().for_each(|t| info!("{t}")); // let len = tasks.len(); // // let pool = ThreadPoolBuilder::new().num_threads(10).build().unwrap(); // // pool.install(|| { // // tasks.par_iter().enumerate().for_each(|(i, t)| { // // let config = ModkitConfig {threads: 2, ..Default::default() }; // // if let collection::CollectionsTasks::DMRCDiagMrd { id, .. } = t { let _ = dmr_c_mrd_diag(id, &config); } // // println!("⚡ {i}/{len}"); // // }); // // }); // Ok(()) // } #[test] fn run_mod_par() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; let tasks = collections.todo_mod_pileup(); let len = tasks.len(); tasks.par_iter().enumerate().for_each(|(i, t)| { let config = ModkitConfig {threads: 2, ..Default::default() }; if let collection::CollectionsTasks::ModPileup { bam, .. 
} = t { let _ = bed_methyl(bam.to_owned(), &config); } println!("⚡ {i}/{len}"); }); Ok(()) } #[test] fn run_severus() -> anyhow::Result<()> { init(); Severus::initialize("CAMEL", Config::default())?.run() } #[test] fn run_severus_solo() -> anyhow::Result<()> { init(); SeverusSolo::initialize("CAMEL","diag", Config::default())?.run() } #[test] fn run_savana() -> anyhow::Result<()> { init(); let collections = Collections::new( CollectionsConfig::default() )?; for bam in collections.bam.by_id_completed(15.0, 10.0).iter() { let id = &bam.id; match ClairS::initialize(id, Config::default())?.run() { Ok(_) => match Savana::initialize(id, Config::default())?.run() { Ok(_) => (), Err(e) => error!("{e}"), }, Err(e) => error!("{e}"), } ; } Ok(()) } #[test] fn check_versions() -> anyhow::Result<()> { init(); let config = Config::default(); let v = Savana::version(&config)?; info!("Savanna version {v}"); let v = Severus::version(&config)?; info!("Severus version {v}"); Ok(()) } #[test] fn run_multi_deepvariant() -> anyhow::Result<()> { init(); let mut collections = Collections::new( CollectionsConfig::default() )?; collections.run_deepvariant() } #[test] fn run_deepvariant() -> anyhow::Result<()> { init(); DeepVariant::initialize("HAMROUNE", "diag", Config::default())?.run() } #[test] fn run_clairs() -> anyhow::Result<()> { init(); ClairS::initialize("ADJAGBA", Config::default())?.run() } #[test] fn run_longphase() -> anyhow::Result<()> { init(); let id = "BECERRA"; let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam"); let vcf = format!("/data/longreads_basic_pipe/{id}/diag/ClairS/clair3_normal_tumoral_germline_output.vcf.gz"); let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam"); LongphaseHap::new(id, &diag_bam, &vcf, LongphaseConfig::default()).run()?; LongphaseHap::new(id, &mrd_bam, &vcf, LongphaseConfig::default()).run() } #[test] fn run_longphase_modcall() -> anyhow::Result<()> { init(); let id = "ADJAGBA"; let time = "diag"; LongphaseModcallSolo::initialize(id, time, Config::default())?.run() } #[test] fn run_longphase_phase() -> anyhow::Result<()> { init(); let id = "ADJAGBA"; LongphasePhase::initialize(id, Config::default())?.run() } #[test] fn variant_parse() -> anyhow::Result<()> { let row = "chr1\t1366\t.\tC\tCCCT\t8.2\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t1/1:4:6:1,4:0.66667:6,4,0"; let variant: VcfVariant = row.parse()?; let var_string = variant.into_vcf_row(); assert_eq!(row, &var_string); let row = "chr1\t1366\t.\tC\tCCCT\t8.2\tPASS\t."; let variant: VcfVariant = row.parse()?; let var_string = variant.into_vcf_row(); assert_eq!(row, &var_string); let row = "chr1\t2628434\t.\tC\tT\t17.973\tPASS\tH;FAU=0;FCU=7;FGU=0;FTU=7;RAU=0;RCU=2;RGU=0;RTU=2\tGT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU\t0/1:17:18:0.5:0,9:0:11:0,0:0:9:0:9:0:11:0:0"; let variant: VcfVariant = row.parse()?; let var_string = variant.into_vcf_row(); assert_eq!(row, &var_string); let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/.:1:24:3,5:0.208333"; let variant: VcfVariant = row.parse()?; let var_string = variant.into_vcf_row(); assert_eq!(row, &var_string); let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/1:1:24:3,5:0.208333"; let variant_b: VcfVariant = row.parse()?; assert_eq!(variant, variant_b); let row = "chr1\t475157\t.\tA\tG\t12.301\tPASS\tH;FAU=2;FCU=0;FGU=2;FTU=0;RAU=3;RCU=0;RGU=3;RTU=0\tGT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU\t0/1:12:10:0.5:0,5:0.0769:13:0,1:5:0:5:0:12:0:1:0"; let variant: VcfVariant = row.parse()?; 
        let var_string = variant.into_vcf_row();
        assert_eq!(row, &var_string);

        let row = "chr1\t161417408\tr_10_0\tT\t[chr1:161417447[TTGGCAGGTTCC\t.\tPASS\tSVTYPE=BND;MATEID=r_10_1;SVINSLEN=11;SVINSSEQ=TTGGCAGGTTC\tTR:VR\t22:3\t12:0";
        let variant: VcfVariant = row.parse()?;
        println!("{variant:#?}");
        let u = variant.bnd_desc();
        println!("{u:#?}");

        // Severus mates are not in RC
        let vcf = "chr7\t27304522\tseverus_BND6747_1\tN\t[chr6:32688062[N\t60\tPASS\tPRECISE;SVTYPE=BND;MATE_ID=severus_BND6747_2;STRANDS=--;MAPQ=60;CLUSTERID=severus_2\tGT:VAF:hVAF:DR:DV\t0/1:0.29:0.29,0,0:12:5";
        let variant: VcfVariant = vcf.parse()?;
        let bnd_a = variant.bnd_desc()?;
        let vcf = "chr6\t32688062\tseverus_BND6747_2\tN\t[chr7:27304522[N\t60\tPASS\tPRECISE;SVTYPE=BND;MATE_ID=severus_BND6747_1;STRANDS=--;MAPQ=60;CLUSTERID=severus_2\tGT:VAF:hVAF:DR:DV\t0/1:0.29:0.29,0,0:12:5";
        let variant: VcfVariant = vcf.parse()?;
        let bnd_b = variant.bnd_desc()?;
        assert_eq!(bnd_a, bnd_b.rc());
        println!("{bnd_a}\n{bnd_b}");

        // Savana: here each mate is in RC
        let vcf = "chr10\t102039096\tID_35957_2\tG\t]chr10:101973386]G\t.\tPASS\tSVTYPE=BND;MATEID=ID_35957_1;TUMOUR_READ_SUPPORT=7;TUMOUR_ALN_SUPPORT=7;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=65710;BP_NOTATION=+-;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=7;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.35;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=7.248;ORIGIN_EVENT_SIZE_MEDIAN=65710;ORIGIN_EVENT_SIZE_MEAN=65705.4;END_STARTS_STD_DEV=7.007;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=7.248;END_EVENT_SIZE_MEDIAN=65710;END_EVENT_SIZE_MEAN=65705.4;TUMOUR_DP_BEFORE=38,29;TUMOUR_DP_AT=44,21;TUMOUR_DP_AFTER=44,21;NORMAL_DP_BEFORE=13,15;NORMAL_DP_AT=13,15;NORMAL_DP_AFTER=13,15;TUMOUR_AF=0.159,0.333;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=20,16,8;NORMAL_TOTAL_HP_AT=6,7,0;TUMOUR_ALT_HP=0,1,6;TUMOUR_PS=101917152;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
        let variant: VcfVariant = vcf.parse()?;
        let bnd_a = variant.bnd_desc()?;
        let vcf = "chr10\t101973386\tID_35957_1\tA\tA[chr10:102039096[\t.\tPASS\tSVTYPE=BND;MATEID=ID_35957_2;TUMOUR_READ_SUPPORT=7;TUMOUR_ALN_SUPPORT=7;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=65710;BP_NOTATION=+-;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=7;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.35;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=7.248;ORIGIN_EVENT_SIZE_MEDIAN=65710;ORIGIN_EVENT_SIZE_MEAN=65705.4;END_STARTS_STD_DEV=7.007;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=7.248;END_EVENT_SIZE_MEDIAN=65710;END_EVENT_SIZE_MEAN=65705.4;TUMOUR_DP_BEFORE=29,38;TUMOUR_DP_AT=21,44;TUMOUR_DP_AFTER=21,44;NORMAL_DP_BEFORE=15,13;NORMAL_DP_AT=15,13;NORMAL_DP_AFTER=15,13;TUMOUR_AF=0.333,0.159;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=17,0,4;NORMAL_TOTAL_HP_AT=5,7,3;TUMOUR_ALT_HP=0,6,1;TUMOUR_PS=101917152;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
        let variant: VcfVariant = vcf.parse()?;
        let bnd_b = variant.bnd_desc()?;
        assert_eq!(bnd_a, bnd_b);
        println!("{bnd_a}\n{bnd_b}");

        // Deletions
        // Severus
        let vcf = "chr7\t143674704\tseverus_DEL7318\tN\t<DEL>\t60\tPASS\tPRECISE;SVTYPE=DEL;SVLEN=3642;END=143678346;STRANDS=+-;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:114:0.39:0.39,0,0:14:9";
        let variant: VcfVariant = vcf.parse()?;
        println!("{:?}", variant.infos);
        println!("{:?}", variant.formats);
        let del = variant.deletion_desc().unwrap();
        println!("{:?}", del);
        println!("{:?} {:?}", del.len(), variant.formats.n_alt_depth());
        assert_eq!("chr7:143674704_143678346_del", variant.deletion_desc().unwrap().to_string());
        println!("--\n");
        let vcf = "chr7\t144003249\tr_106\tC\t<DEL>\t.\tPASS\tEND=144142733;SVTYPE=DEL;SVLEN=-139484;SVINSLEN=4;SVINSSEQ=GCCA\tTR:VR\t12:10\t51:0";
        let variant: VcfVariant = vcf.parse()?;
        println!("{:?}", variant.infos);
        println!("{:?}", variant.formats);
        let del = variant.deletion_desc().unwrap();
        println!("{:?}", del);
        println!("{:?} {:?}", del.len(), variant.formats.n_alt_depth());

        let path = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_Genes.bed";
        let r = read_bed(path)?;
        let deleted_genes = bedrow_overlaps_par(&r, &vec![&GenomeRange { contig: variant.position.contig, range: del.start..del.end }])
            .into_iter()
            .filter_map(|e| e.name)
            .collect::<Vec<_>>()
            .join(", ");
        println!("{deleted_genes}");
        Ok(())
    }

    #[test]
    fn variant_load_deepvariant() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let time = "diag";
        let dv = DeepVariant::initialize(id, time, Config::default())?;
        let annotations = Annotations::default();
        let variant_collection = dv.variants(&annotations)?;
        println!("Deepvariant for {id} {time}: variants {} {}", variant_collection.variants.len(), variant_collection.vcf.n_variants);
        Ok(())
    }

    #[test]
    fn variant_load_clairs() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let clairs = ClairS::initialize(id, Config::default())?;
        let annotations = Annotations::default();
        let variant_collection = clairs.variants(&annotations)?;
        println!("ClairS for {id}: variants {} {}", variant_collection.variants.len(), variant_collection.vcf.n_variants);
        Ok(())
    }

    #[test]
    fn variant_load_nanomonsv() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let nanomonsv = NanomonSV::initialize(id, Config::default())?;
        let annotations = Annotations::default();
        let variant_collection = nanomonsv.variants(&annotations)?;
        println!("NanomonSV for {id}: variants {} {}", variant_collection.variants.len(), variant_collection.vcf.n_variants);
        println!("{:?}", variant_collection.variants.first());
        Ok(())
    }

    #[test]
    fn variant_load_clairs_germline() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let clairs = ClairS::initialize(id, Config::default())?;
        let annotations = Annotations::default();
        let germline_variant_collection = clairs.germline(&annotations)?;
        println!("ClairS for {id}: variants {} {}", germline_variant_collection.variants.len(), germline_variant_collection.vcf.n_variants);
        Ok(())
    }

    #[test]
    fn pipe_somatic() -> anyhow::Result<()> {
        init();
        let collections = Collections::new(CollectionsConfig::default())?;
        for (a, _) in collections.bam_pairs().iter() {
            if ["AUBERT", "BAFFREAU", "BAILLEUL"].contains(&a.id.as_str()) {
                continue;
            }
            if let Err(e) = SomaticPipe::initialize(&a.id, Config::default()).map(|mut p| {
                if p.should_run() {
                    if let Err(e) = p.run() {
                        error!("{e}");
                    }
                }
            }) {
                error!("{e}");
            }
        }
        Ok(())
        // let id = "VILI";
        // SomaticPipe::initialize(id, Config::default())?.run()
    }

    #[test]
    fn overlaps() {
        init();
        let positions = vec![
            &GenomePosition { contig: 1, position: 100 },
            &GenomePosition { contig: 1, position: 150 },
            &GenomePosition { contig: 1, position: 200 },
            &GenomePosition { contig: 2, position: 150 },
        ];
        let ranges = vec![
            &GenomeRange { contig: 1, range: 50..150 },
            &GenomeRange { contig: 2, range: 100..200 },
        ];
        let parallel_overlapping_indices = overlaps_par(&positions, &ranges);
        assert_eq!(parallel_overlapping_indices, vec![0, 3])
    }

    #[test]
    fn bed_read() -> anyhow::Result<()> {
        init();
        let path = &Config::default().mask_bed("ADJAGBA");
        let r = read_bed(path)?;
        println!("{}", r.len());
        Ok(())
    }

    #[test]
    fn test_read_dict() -> anyhow::Result<()> {
        init();
        let genome = read_dict(&Config::default().dict_file)?;
        let genome_length: usize = genome.into_iter().map(|(_, len)| len as usize).sum();
        println!("{genome_length}");
        Ok(())
    }

    #[test]
    fn bases_at() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let c = Config::default();
        let chr = "chr3";
        let position = 62416039; // 1-based
        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "diag"))?;
        let p = nt_pileup(&mut bam, chr, position - 1, false)?
            .iter()
            .map(|e| String::from_utf8(vec![*e]).unwrap())
            .collect::<Vec<String>>();
        let mut counts = HashMap::new();
        for item in p.iter() {
            *counts.entry(item.as_str()).or_insert(0) += 1;
        }
        for (key, value) in &counts {
            println!("{}: {}", key, value);
        }
        assert_eq!(8, *counts.get("C").unwrap());
        assert_eq!(13, *counts.get("G").unwrap());
        assert_eq!(6, *counts.get("D").unwrap());

        let chr = "chr1";
        let position = 3220; // 1-based
        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
        let p = counts_at(&mut bam, chr, position - 1)?;
        println!("{p:#?}");
        Ok(())
    }

    #[test]
    fn seq_at() -> anyhow::Result<()> {
        init();
        let c = Config::default();
        let chr = "chr3";
        let position = 716766;
        let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default().build_from_path(c.reference)?;
        let r = io::fasta::sequence_at(&mut fasta_reader, chr, position, 10)?;
        println!("{r} ({} {:.2})", r.len(), estimate_shannon_entropy(r.as_str()));
        Ok(())
    }

    #[test]
    fn ins_at() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let c = Config::default();
        let chr = "chr1";
        let position = 52232; // 1-based like in vcf
        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
        // let p = ins_pileup(&mut bam, chr, position - 1, true)?.iter().map(|e| String::from_utf8(vec![*e]).unwrap()).collect::<Vec<String>>();
        let counts = counts_ins_at(&mut bam, chr, position - 1)?;
        for (key, value) in &counts {
            println!("{}: {}", key, value);
        }
        Ok(())
    }

    #[test]
    fn vep_line() -> anyhow::Result<()> {
        init();
        let line = "chr2_1922358_-/T\tchr2:1922357-1922358\tT\tMYT1L\tNM_001303052.2\tTranscript\tintron_variant\t-\t-\t-\t-\t-\t-\tIMPACT=MODIFIER;STRAND=-1;SYMBOL=MYT1L;SOURCE=chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz;HGVSc=NM_001303052.2:c.1619-613dup";
        let vep_line: VepLine = line.parse()?;
        println!("{vep_line:#?}");
        let vep: VEP = VEP::try_from(&vep_line)?;
        println!("{vep:#?}");
        Ok(())
    }

    #[test]
    fn savana_cn() -> anyhow::Result<()> {
        init();
        let id = "CAMARA";
        // let s = SavanaCopyNumber::load_id(id, Config::default())?;
        let s = SavanaReadCounts::load_id(id, Config::default())?;
        println!("tumoral reads: {}", s.n_tumoral_reads());
        println!("normal reads: {}", s.n_normal_reads());
        println!("tumoral:\n{:#?}", s.norm_chr_counts());
        Ok(())
    }

    #[test]
    fn coverage() -> anyhow::Result<()> {
        init();
        let id = "CAMARA";
        let time = "diag";
        WGSBamStats::new(id, time, Config::default())?.print();
        Ok(())
    }

    #[test]
    fn load_bam() -> anyhow::Result<()> {
        init();
        let id = "ADJAGBA";
        let time = "diag";
        let bam_path = Config::default().solo_bam(id, time);
        WGSBam::new(Path::new(&bam_path).to_path_buf())?;
        Ok(())
    }

    #[test]
    fn tar() -> anyhow::Result<()> {
        init();
        scan_archive("/data/lto/20241030_CUNVI-DG-N20_SEBZA-DG-N21.tar")?;
        Ok(())
    }

    #[test]
    fn run_somatic() -> anyhow::Result<()> {
        init();
        let collections = Collections::new(CollectionsConfig::default())?;
        let bams = collections.bam.by_id_completed(15.0, 10.0);
        let n = bams.len();
        let mut config = Config::default();
        // config.somatic_scan_force = true;
        warn!("{n} cases");
        for (i, bam) in bams.iter().enumerate() {
            let id = &bam.id;
            warn!("{i}/{n} {id}");
            if id == "BANGA" {
                continue;
            }
            if id == "ARM" {
                continue;
            }
            match SomaticPipe::initialize(id, config.clone())?.run() {
                Ok(_) => (),
                Err(e) => error!("{id} {e}"),
            };
        }
        Ok(())
    }

    #[test]
    fn somatic_cases() -> anyhow::Result<()> {
        init();
        let id = "ACHITE";
        let config = Config { somatic_pipe_force: true, ..Default::default() };
        match SomaticPipe::initialize(id, config)?.run() {
            Ok(_) => (),
            Err(e) => error!("{id} {e}"),
        };
        Ok(())
    }

    #[test]
    fn load_variants() -> anyhow::Result<()> {
        init();
        let id = "ACHITE";
        let config = Config::default();
        let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
        let variants = variant_collection::Variants::load_from_file(&path)?;
        println!("n variants {}", variants.data.len());
        let n_vep: usize = variants.data.iter().map(|v| v.vep().len()).sum();
        println!("VEP: {n_vep}");
        let translocations = variants.get_alteration_cat(AlterationCategory::TRL);
        println!("{} translocations", translocations.len());
        let threshold = 5;
        let res = group_variants_by_bnd_desc(&translocations, 5);
        let rres = group_variants_by_bnd_rc(&res, threshold);
        rres.iter().for_each(|group| {
            println!("{} {}", group.0.len(), group.1.len());
        });
        Ok(())
    }

    #[test]
    fn load_fc() -> anyhow::Result<()> {
        init();
        // FlowCells::load_archive_from_scan("/data/lto", "/data/archives.json")?;
        let r = FlowCells::load("/home/prom/mnt/store", "/data/pandora_id_inputs.json", "/data/archives.json.gz")?;
        println!("{r:#?}");
        Ok(())
    }

    #[test]
    fn alt_cat() -> anyhow::Result<()> {
        let id = "ADJAGBA";
        let config = Config::default();
        let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
        let variants = variant_collection::Variants::load_from_json(&path)?;
        println!("n variants {}", variants.data.len());
        variants.data.iter()
            .filter(|v| v.alteration_category().contains(&AlterationCategory::TRL))
            .for_each(|v| {
                println!(
                    "{:?} {}",
                    v.vcf_variants.iter().map(|v| v.bnd_desc()).collect::<Vec<_>>(),
                    v.annotations.iter().filter(|a| matches!(a, Annotation::Callers(..))).map(|a| a.to_string()).collect::<Vec<_>>().join(";")
                )
            });
        Ok(())
    }

    #[test]
    fn variants_stats() -> anyhow::Result<()> {
        init();
        let config = Config::default();
        let all_variants_bit = find_files(&format!("{}/*/diag/*_somatic_variants.bit", config.result_dir))?;
        for v in all_variants_bit.into_iter() {
            let id = v.file_name().unwrap().to_str().unwrap().split("_somatic").next().unwrap();
            println!("{id}");
            let config = Config::default();
            let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
            match variant_collection::Variants::load_from_file(&path) {
                Ok(mut variants) => {
                    let (mut high_depth_ranges, _) = somatic_depth_quality_ranges(&id, &config)?;
                    high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
                    let res = VariantsStats::new(&mut variants, id, &config, &high_depth_ranges)?.save_to_json(&format!(
                        "{}/{id}/diag/{id}_somatic_variants_stats.json.gz",
                        config.result_dir
                    ));
                    if res.is_err() {
                        info!("{:#?}", res);
                    }
                },
                Err(e) => error!("{e}"),
            }
        }
        Ok(())
    }

    #[test]
    fn constit_stats() {
        init();
        let id = "ADJAGBA";
        let config = Config::default();
        let _ = const_stats(id.to_string(), config);
    }

    #[test]
    fn test_bnd() -> anyhow::Result<()> {
        init();
        let id = "COIFFET";
        let config = Config::default();
        let annotations = Annotations::default();
        let s = Savana::initialize(id, config)?.variants(&annotations)?;
        s.variants.iter().for_each(|e| {
            if let Ok(bnd) = e.bnd_desc() {
                println!("{}\t{}\t{}", e.position, e.reference, e.alternative);
                println!("{:#?}", bnd);
            }
        });
        Ok(())
    }

    #[test]
    fn parse_savana_seg() {
        init();
        let r =
SavanaCN::parse_file("ADJAGBA", &config::Config::default()).unwrap().segments; println!("{} lines", r.len()); println!("{:#?}", r.first().unwrap()); } #[test] fn whole_scan() -> anyhow::Result<()> { init(); let id = "CHENU"; let mut config = Config::default(); let u = config.solo_bam(id, "mrd"); println!("{u}"); config.somatic_scan_force = true; somatic_scan(id, &config)?; Ok(()) } #[test] fn parse_gff() -> anyhow::Result<()> { init(); let id = "ADJAGBA"; let config = Config::default(); let path = format!("{}/{id}/diag/somatic_variants.bit", config.result_dir); let exon_ranges = features_ranges("exon", &config)?; let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges); let variants = variant_collection::Variants::load_from_file(&path)?; let full = variants_stats::somatic_rates(&variants.data, &exon_ranges, &config); info!("{full:#?}"); // let restrained: Vec = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2) // .cloned().collect(); // let min_2 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config); // info!("{min_2:#?}"); // // let restrained: Vec = restrained.iter().filter(|v| v.vcf_variants.len() >= 3) // .cloned().collect(); // let min_3 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config); // info!("{min_3:#?}"); // let mut high_depth_ranges = variants_stats::high_depth_somatic(id, &config)?; // high_depth_ranges.par_sort_by_key(|r| ( r.contig, r.range.start )); // // let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect(); // let exons_high_depth = range_intersection_par(&high_depth_ranges.iter().collect::>(), &exon_ranges_ref); // // let full = variants_stats::somatic_rates(&variants.data, &exons_high_depth, &config); // info!("{full:#?}"); // // info!("n variants loaded: {}", variants.data.len()); // // let r = features_ranges("exon", &config::Config::default())?; // info!("n exon: {}", r.len()); // // let merged = merge_overlapping_genome_ranges(&r); // info!("n merged exon: {}", merged.len()); // // let ol = par_overlaps(&variants.data, &r); // info!("variants in exon {}", ol.len()); // // let n_coding = ol.iter().filter_map(|i| variants.data[*i].best_vep().ok() ).filter_map(|bv| bv.impact()).filter(|impact| *impact <= VepImpact::MODERATE).count(); // info!("coding variants {n_coding}"); // // let n_bases_m = merged.par_iter().map(|gr| gr.length()).sum::(); // info!("{n_bases_m}nt"); // // let mega_base_m = n_bases_m as f64 / 10.0e6; // // let wgs_len = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum::(); // info!("wgs len {wgs_len}"); // let rate_wgs = variants.data.len() as f64 / (wgs_len as f64 / 10.0e6); // info!("somatic mutation rate {rate_wgs:.2}/mb"); // // let n_exons_mb = ol.len() as f64 / mega_base_m; // info!("somatic mutation rate in the coding region {n_exons_mb:.2}/mb"); // // let n_exons_mb = n_coding as f64 / mega_base_m; // info!("somatic non synonymous mutation rate in the coding region {n_exons_mb:.2}/mb"); Ok(()) } fn gr(contig: u8, start: u32, end: u32) -> GenomeRange { GenomeRange { contig, range: start..end, } } #[test] fn test_both_empty() { let a: Vec<&GenomeRange> = vec![]; let b: Vec<&GenomeRange> = vec![]; let result = range_intersection_par(&a, &b); assert!(result.is_empty()); } #[test] fn test_one_empty() { let a = [gr(1, 0, 100)]; let b: Vec<&GenomeRange> = vec![]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let mut result = range_intersection_par(&a_refs, &b); sort_ranges(&mut result); assert!(result.is_empty()); } #[test] fn test_single_range_no_overlap() { let a = 
[gr(1, 0, 100)]; let b = [gr(1, 100, 200)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); sort_ranges(&mut result); assert!(result.is_empty()); } #[test] fn test_single_range_full_overlap() { let a = [gr(1, 0, 100)]; let b = [gr(1, 0, 100)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 0, 100)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn test_different_contigs() { let a = [gr(1, 0, 100)]; let b = [gr(2, 0, 100)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); sort_ranges(&mut result); assert!(result.is_empty()); } #[test] fn test_touching_ranges() { let a = [gr(1, 0, 100)]; let b = [gr(1, 100, 200)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); sort_ranges(&mut result); assert!(result.is_empty()); } #[test] fn test_complete_subrange() { let a = [gr(1, 0, 200)]; let b = [gr(1, 50, 150)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 50, 150)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn test_multiple_overlaps_same_contig() { let a = [gr(1, 0, 50), gr(1, 75, 125)]; let b = [gr(1, 25, 100), gr(1, 150, 200)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 25, 50), gr(1, 75, 100)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn test_multiple_contigs() { let a = [gr(1, 0, 100), gr(2, 50, 150), gr(3, 200, 300)]; let b = [gr(1, 50, 150), gr(2, 0, 100), gr(4, 0, 100)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 50, 100), gr(2, 50, 100)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn test_adjacent_ranges() { let a = [gr(1, 0, 50), gr(1, 50, 100)]; let b = [gr(1, 25, 75)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 25, 50), gr(1, 50, 75)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn test_minimal_overlap() { let a = [gr(1, 0, 100)]; let b = [gr(1, 99, 200)]; let a_refs: Vec<&GenomeRange> = a.iter().collect(); let b_refs: Vec<&GenomeRange> = b.iter().collect(); let mut result = range_intersection_par(&a_refs, &b_refs); let mut expected = [gr(1, 99, 100)]; sort_ranges(&mut result); sort_ranges(&mut expected); assert_eq!(result, expected); } #[test] fn todo_scan() -> anyhow::Result<()> { init(); let mut collections = Collections::new( CollectionsConfig::default() )?; collections.todo_bam_count(&Config::default())?; collections.tasks.iter().for_each(|t| info!("{t}")); let pool = 
            rayon::ThreadPoolBuilder::new()
            .num_threads(100)
            .build()
            .unwrap();
        pool.install(move || {
            collections.tasks.into_iter().for_each(|t| {
                // info!("{t}");
                if let Err(e) = t.run() {
                    error!("{e}");
                }
            });
        });
        Ok(())
    }

    /// helper to build a forward‑strand BND (same contig) where
    /// A = pos, B = pos + 5
    fn fwd(contig: &str, pos: u32) -> BNDDesc {
        BNDDesc {
            a_contig: contig.into(),
            a_position: pos,
            a_sens: true,
            b_contig: contig.into(),
            b_position: pos + 5,
            b_sens: true,
            added_nt: String::new(),
        }
    }

    /// Build a six‑node *forward* chain relying **only** on `auto_connect()`
    /// (no manual edges) and assert the Hamiltonian path spans all nodes.
    #[test]
    fn hamiltonian_chain_auto() {
        // positions 10,15 20,25 … 60,65 satisfy B(u) ≤ A(v)
        let bnds: Vec<BNDDesc> = (1..=6).map(|i| fwd("chr1", i * 10)).collect();
        let g: BNDGraph<()> = bnds.to_bnd_graph(); // trait uses auto_connect()

        // ensure auto_connect produced 5 edges in a line
        assert_eq!(g.inner().edge_count(), 5);

        let path = g.hamiltonian_path().expect("chain should be Hamiltonian");
        assert_eq!(path.len(), 6);
    }

    /// Two disconnected "V" shapes -> auto_connect creates 2 edges in each
    /// component, no Hamiltonian path, components sorted by size.
    #[test]
    fn components_after_auto_connect() {
        // comp1: a->b<-c on reverse strand of chrX
        let a = BNDDesc { a_contig: "chrX".into(), a_position: 300, a_sens: false, b_contig: "chrX".into(), b_position: 250, b_sens: false, added_nt: String::new() };
        let b = BNDDesc { a_contig: "chrX".into(), a_position: 200, a_sens: false, b_contig: "chrX".into(), b_position: 150, b_sens: false, added_nt: String::new() };
        let c = BNDDesc { a_contig: "chrX".into(), a_position: 100, a_sens: false, b_contig: "chrX".into(), b_position: 50, b_sens: false, added_nt: String::new() };

        // comp2: three‑node forward chain on chrY
        let d = fwd("chrY", 10);
        let e = fwd("chrY", 20);
        let f = fwd("chrY", 30);

        let g: BNDGraph<()> = vec![a, b, c, d, e, f].to_bnd_graph();
        assert!(g.hamiltonian_path().is_none());

        let comps = g.components_by_size();
        comps.iter().for_each(|a| println!("{}", g.fmt_path(a)));
        assert_eq!(comps.len(), 2);
        assert_eq!(comps[0].len(), 3);
        assert_eq!(comps[1].len(), 3);
    }

    #[test]
    fn bnd_connections() {
        // comp1: mixed-sense BNDs connecting chrX, chr14 and chrZ
        let a = BNDDesc { a_contig: "chrX".into(), a_position: 300, a_sens: true, b_contig: "chr14".into(), b_position: 250, b_sens: false, added_nt: String::new() };
        let b = BNDDesc { a_contig: "chr14".into(), a_position: 200, a_sens: false, b_contig: "chrX".into(), b_position: 301, b_sens: true, added_nt: String::new() };
        let c = BNDDesc { a_contig: "chrX".into(), a_position: 302, a_sens: true, b_contig: "chrZ".into(), b_position: 50, b_sens: false, added_nt: String::new() };

        // comp2: three‑node forward chain on chrY
        let d = fwd("chrY", 10);
        let e = fwd("chrY", 20);
        let f = fwd("chrY", 30);

        let g: BNDGraph<()> = vec![a, b, c, d, e, f].to_bnd_graph();
        assert!(g.hamiltonian_path().is_none());

        let comps = g.components_by_size();
        comps.iter().for_each(|a| println!("{}", g.fmt_path(a)));
        assert_eq!(comps.len(), 2);
        assert_eq!(comps[0].len(), 3);
        assert_eq!(comps[1].len(), 3);
    }
}