lib.rs 65 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548
  1. //! # 🧬 Long-read Somatic Variant Calling and Analysis Framework
  2. //!
  3. //! This Rust library provides a modular, parallelizable framework for somatic variant calling, annotation, and interpretation from long-read sequencing data. It is designed to support full pipelines for research and clinical workflows across multiple variant callers and analysis stages.
  4. //!
  5. //! The library also serves as an extensible platform that developers can leverage to add custom features, integrate new tools, and tailor workflows to specific use cases.
  6. //!
  7. //! ## 🧩 Key Features
  8. //!
  9. //! - **POD5 Demultiplexing and Alignment**: End-to-end support for processing ONT POD5 files:
  10. //! - Barcode-aware demultiplexing using metadata CSVs
  11. //! - POD5 subsetting and organization by case
  12. //! - Integration with basecallers (e.g., Dorado) for read alignment
  13. //! - **Pipeline Management**: Full orchestration of Dockerized execution pipelines for tools such as ClairS, Nanomonsv, DeepVariant, Savana, Modkit, and Severus.
  14. //! - **Flexible Configuration**: Centralized configuration system (`Config`, `CollectionsConfig`) for all modules and pipelines.
  15. //! - **Input Abstraction**: Unified handling of BAM, POD5, and VCF file collections across cohorts and directories.
  16. //! - **Variant Processing**: Modular loading, filtering, statistical analysis, and annotation of somatic and germline variants.
  17. //! - **Haplotype Phasing and Methylation**: Support for LongPhase-based phasing and Modkit methylation pileups with support for multi-threaded pileup and aggregation.
  18. //! - **Parallel Execution**: Uses `rayon` for efficient multicore parallelization over large cohorts and tasks.
  19. //!
  20. //! ## 📚 Module Highlights
  21. //!
  22. //! - `callers`: Interfaces to variant calling tools (ClairS, DeepVariant, Nanomonsv, Savana, etc...)
  23. //! - `runners`: Pipeline runners (e.g. `Somatic`, `SeverusSolo`, `LongphasePhase`) that manage end-to-end execution.
  24. //! - `collection`: Organizes input data across BAMs, VCFs, and POD5 files with auto-detection of completed runs.
  25. //! - `annotation`: VEP line parsing and high-level annotation aggregation.
  26. //! - `pipes`: Composition modules for executing pipelines across callers and post-processing steps.
  27. //! - `functions`: Custom logic for genome assembly, entropy estimation, and internal tooling.
  28. //! - `positions`, `variant`, `helpers`: Utilities for SV modeling, variant filtering, position overlap logic, and helper methods.
  29. //!
  30. //! ## ⚡ Workflow Overview
  31. //!
  32. //! ### 1. 📦 From POD5 to BAM Alignment
  33. //!
  34. //! - **Demultiplexing**: POD5 files are subset and demuxed using barcodes (via CSV metadata).
  35. //! - **Flowcell Case Management**: Each sample is identified by a [`collection::pod5::FlowCellCase`] containing its ID, time point, and POD5 directory.
  36. //! - **Alignment**: The [`commands::dorado::Dorado`] module handles alignment of POD5 reads to reference genome, producing BAMs.
  37. //!
  38. //! ```rust
  39. //! let case = FlowCellCase { id: "PATIENT1", time_point: "diag", barcode: "01", pod_dir: "...".into() };
  40. //! Dorado::init(case, Config::default())?.run_pipe()?;
  41. //! ```
  42. //!
  43. //! ### 2. 🧬 Variant Calling (BAM ➝ VCF)
  44. //!
  45. //! Using the aligned BAMs, multiple variant callers can be run in parallel. The [`callers`] and [`runners`] modules support:
  46. //!
  47. //! - **ClairS** – somatic small variant calling with LongPhase haplotagging
  48. //! - **Nanomonsv** – structural variants (SV)
  49. //! - **DeepVariant** – germline small variants
  50. //! - **Savana** – SVs and copy number variations (CNV)
  51. //! - **Modkit** – methylation pileups
  52. //! - **LongPhase** – phasing and modcalling
  53. //!
  54. //! All workflows can be triggered per-case or per-cohort using `Collections` or `Somatic` runners.
  55. //!
  56. //! ```rust
  57. //! ClairS::initialize("PATIENT1", Config::default())?.run()?;
  58. //! NanomonSV::initialize("PATIENT1", Config::default())?.run()?;
  59. //! ```
  60. //!
  61. //! ### 3. 📈 Aggregation & Statistics (VCF ➝ JSON / Stats)
  62. //!
  63. //! After variant calling:
  64. //!
  65. //! - Annotate with VEP ([`annotation`] module)
  66. //! - Load and filter with [`variant::variant_collection`]
  67. //! - Compute variant and region-level stats (e.g., mutation rates, alteration categories, coding overlaps)
  68. //!
  69. //! ```rust
  70. //! let variants = Variants::load_from_json("/path/to/somatic_variants.json.gz")?;
  71. //! let stats = VariantsStats::new(&variants, "PATIENT1", &config)?;
  72. //! stats.save_to_json("/output/path/stats.json.gz")?;
  73. //! ```
  74. //!
  75. //! ### 4. 🧠 Intelligent Task Management (`collection` module)
  76. //!
  77. //! - Auto-discovers available samples, POD5s, BAMs, and VCFs
  78. //! - Detects missing outputs and creates task lists
  79. //! - Tasks are parallelizable using Rayon and can be run on-demand
  80. //!
  81. //! ```rust
  82. //! let mut collections = Collections::new(CollectionsConfig::default())?;
  83. //! collections.todo()?; // Identify missing steps
  84. //! collections.run()?; // Run them automatically
  85. //! ```
  86. //!
  87. //! ## 🔬 Testing
  88. //!
  89. //! Integration tests demonstrate the entire pipeline. Run with logging enabled:
  90. //!
  91. //! ```bash
  92. //! export RUST_LOG=debug
  93. //! cargo test -- --nocapture
  94. //! ```
  95. //!
  96. //! ## 🧪 Example Use Cases
  97. //!
  98. //! - Full somatic variant calling pipeline on matched tumor/normal samples
  99. //! - POD5-based pipeline from raw signal to variants
  100. //! - Aggregation and annotation of SVs across a clinical cohort
  101. //! - Methylation analysis using nanopore-specific tools
  102. //! - Variant calling and analysis in large-scale longitudinal studies
  103. //!
  104. //! ## 🚀 Getting Started
  105. //!
  106. //! All workflows are initialized from `Config` and driven by the `Collections` structure:
  107. //!
  108. //! ```rust
  109. //! let collections = Collections::new(CollectionsConfig::default())?;
  110. //! collections.todo()?;
  111. //! collections.run()?;
  112. //! ```
  113. //!
  114. //! ## 🔗 References
  115. //!
  116. //! **Basecalling and alignment**
  117. //! - Dorado: <https://github.com/nanoporetech/dorado>
  118. //!
  119. //! **Variants Callers**
  120. //! - ClairS: <https://github.com/HKU-BAL/ClairS>
  121. //! - Nanomonsv: <https://github.com/friend1ws/nanomonsv>
  122. //! - Savana: <https://github.com/cortes-ciriano-lab/savana>
  123. //! - DeepVariant: <https://github.com/google/deepvariant>
  124. //! - DeepSomatic: <https://github.com/google/deepsomatic>
  125. //! - LongPhase: <https://github.com/PorubskyResearch/LongPhase>
  126. //! - Modkit: <https://github.com/nanoporetech/modkit>
  127. //!
  128. //! **Variants annotation**
  129. //! - VEP: <https://www.ensembl.org/info/docs/tools/vep/index.html>
  130. //!
  131. //! ---
  132. use std::sync::{Arc, Mutex};
  133. pub mod annotation;
  134. pub mod callers;
  135. pub mod collection;
  136. pub mod commands;
  137. pub mod config;
  138. pub mod functions;
  139. pub mod helpers;
  140. pub mod io;
  141. pub mod locker;
  142. pub mod math;
  143. pub mod pipes;
  144. pub mod positions;
  145. pub mod runners;
  146. pub mod scan;
  147. pub mod slurm_helpers;
  148. pub mod variant;
  149. pub mod de_novo;
  150. pub mod aligner;
  151. #[macro_use]
  152. extern crate lazy_static;
  153. // Define DOCKER_ID lock for handling Docker kill when ctrlc is pressed
  154. lazy_static! {
  155. static ref DOCKER_ID: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
  156. static ref TEST_DIR: String = "/mnt/beegfs02/scratch/t_steimle/test_data".to_string();
  157. }
  158. #[cfg(test)]
  159. mod tests {
  160. use std::{collections::HashMap, path::Path};
  161. use annotation::{
  162. vep::{VepLine, VEP},
  163. Annotations,
  164. };
  165. use collection::bam::{counts_at, counts_ins_at, nt_pileup, WGSBam};
  166. use functions::assembler::{Assembler, AssemblerConfig};
  167. use helpers::estimate_shannon_entropy;
  168. use io::bed::read_bed;
  169. use log::{error, info};
  170. use positions::{overlaps_par, GenomePosition, GenomeRange};
  171. use rayon::prelude::*;
  172. use variant::{variant_collection, vcf_variant::VcfVariant};
  173. use self::{/* collection::pod5::{FlowCellCase, Pod5Collection}, */ config::Config};
  174. use super::*;
  175. use crate::{
  176. annotation::Annotation,
  177. collection::{
  178. bam::{self},
  179. flowcells::{scan_archive, FlowCells},
  180. vcf::VcfCollection,
  181. },
  182. helpers::find_files,
  183. io::{bed::bedrow_overlaps_par, dict::read_dict, gff::features_ranges},
  184. positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges},
  185. scan::scan::somatic_scan,
  186. variant::{
  187. variant_collection::{
  188. group_variants_by_bnd_desc, group_variants_by_bnd_rc, VariantCollection,
  189. },
  190. variants_stats::{self, somatic_depth_quality_ranges, VariantsStats},
  191. vcf_variant::{AlterationCategory, BNDDesc, BNDGraph, ToBNDGraph},
  192. },
  193. };
  194. // export RUST_LOG="debug"
  195. fn init() {
  196. let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
  197. .try_init();
  198. }
  199. // #[test]
  200. // fn run_dorado() -> anyhow::Result<()> {
  201. // init();
  202. // let case = FlowCellCase {
  203. // id: "CONSIGNY".to_string(),
  204. // time_point: "mrd".to_string(), barcode: "07".to_string(), pod_dir: "/mnt/beegfs01/ data/run_data/20240326-CL/CONSIGNY-MRD-NB07_RICCO-DIAG-NB08/20240326_1355_1E_PAU78333_bc25da25/pod5_pass/barcode07".into()
  205. // };
  206. // dorado::Dorado::init(case, Config::default())?.run_pipe()
  207. // }
  208. // #[test]
  209. // fn pod5() -> anyhow::Result<()> {
  210. // let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
  211. // .build();
  212. //
  213. // let coll = Pod5Collection::new(
  214. // "/data/run_data",
  215. // "/data/flow_cells.tsv",
  216. // "/data/longreads_basic_pipe",
  217. // )?;
  218. // println!("{coll:#?}");
  219. // // let runs = Runs::import_dir("/home/prom/store/banana-pool/run_data", "/data/flow_cells.tsv")?;
  220. // Ok(())
  221. // }
  222. #[test]
  223. fn bam() -> anyhow::Result<()> {
  224. init();
  225. let bam_collection = bam::load_bam_collection("/data/longreads_basic_pipe");
  226. bam_collection
  227. .bams
  228. .iter()
  229. // .filter(|b| matches!(b.bam_type, BamType::Panel(_)))
  230. .for_each(|b| println!("{b:#?}"));
  231. let u = bam_collection.get("PARACHINI", "mrd");
  232. println!("{u:#?}");
  233. Ok(())
  234. }
  235. #[test]
  236. fn vcf() -> anyhow::Result<()> {
  237. init();
  238. let mut vcf_collection = VcfCollection::new("/data/longreads_basic_pipe");
  239. vcf_collection.sort_by_id();
  240. vcf_collection
  241. .vcfs
  242. .iter()
  243. .for_each(|v| v.println().unwrap());
  244. Ok(())
  245. }
  246. // pod5 view -I /data/run_data/20240903-CL/ARMEM-DG-N02_ASSJU-DG-N03/20240903_1428_1B_PAW47629_fc24c3cf/pod5/PAW47629_fc24c3cf_77b07847_0.pod5 | head -5000 | awk '{if(NR==1){print "target,"$0}else{print "subset_1.pod5,"$0}}' > /tmp/subset_ids.csv
  247. // pod5 subset /data/run_data/20240903-CL/ARMEM-DG-N02_ASSJU-DG-N03/20240903_1428_1B_PAW47629_fc24c3cf/pod5/PAW47629_fc24c3cf_77b07847_0.pod5 --csv /tmp/subset_ids.csv -o /data/test_suite/pod5/muxed/
  248. // #[test]
  249. // fn mux() -> anyhow::Result<()> {
  250. // init();
  251. // let result_dir = "/data/test_suite/results".to_string();
  252. // let cases = vec![
  253. // FlowCellCase { id: "test_02".to_string(), time_point: "diag".to_string(), barcode: "02".to_string(), pod_dir: "/data/test_suite/pod5/muxed".into() },
  254. // FlowCellCase { id: "test_03".to_string(), time_point: "diag".to_string(), barcode: "03".to_string(), pod_dir: "/data/test_suite/pod5/muxed".into() },
  255. // ];
  256. //
  257. // cases.iter().for_each(|c| {
  258. // let dir = format!("{result_dir}/{}", c.id);
  259. // if Path::new(&dir).exists() {
  260. // fs::remove_dir_all(dir).unwrap();
  261. // }
  262. // });
  263. // let config = Config { result_dir, ..Default::default() };
  264. // Dorado::from_mux(cases, config)
  265. // }
  266. // #[test_log::test]
  267. // fn clairs() -> anyhow::Result<()> {
  268. // let config = ClairSConfig {
  269. // result_dir: "/data/test".to_string(),
  270. // ..ClairSConfig::default()
  271. // };
  272. // ClairS::new("test_a", "/data/test_data/subset.bam", "/data/test_data/subset_mrd.bam", config).run()
  273. // }
  274. // #[test]
  275. // fn nanomonsv() -> anyhow::Result<()> {
  276. // init();
  277. // let id = "HAMROUNE";
  278. // NanomonSV::initialize(id, Config::default())?.run()
  279. // }
  280. //
  281. // #[test]
  282. // fn nanomonsv_version() -> anyhow::Result<()> {
  283. // init();
  284. // let v = NanomonSV::version(&Config::default())?;
  285. // println!("NanomonSV version: {v}");
  286. // let v = DeepVariant::version(&Config::default())?;
  287. // println!("DeepVariant version: {v}");
  288. // let v = Savana::version(&Config::default())?;
  289. // println!("Savana version: {v}");
  290. // let v = DeepSomatic::version(&Config::default())?;
  291. // println!("DeepSomatic version: {v}");
  292. // let v = ClairS::version(&Config::default())?;
  293. // println!("ClairS version: {v}");
  294. // Ok(())
  295. // }
  296. //
  297. // #[test]
  298. // fn nanomonsv_solo() -> anyhow::Result<()> {
  299. // init();
  300. // NanomonSVSolo::initialize("LAKHDHAR", "diag", Config::default())?.run()
  301. // }
  302. #[test]
  303. fn run_assemblers() -> anyhow::Result<()> {
  304. Assembler::new(
  305. "CAMEL".to_string(),
  306. "diag".to_string(),
  307. AssemblerConfig::default(),
  308. )
  309. .run()
  310. }
  311. // #[test]
  312. // fn run_dmr_par() -> anyhow::Result<()> {
  313. // init();
  314. // let collections = Collections::new(
  315. // CollectionsConfig::default()
  316. // )?;
  317. // let tasks = collections.todo_dmr_c_diag_mrd();
  318. // tasks.iter().for_each(|t| info!("{t}"));
  319. // let len = tasks.len();
  320. // // let pool = ThreadPoolBuilder::new().num_threads(10).build().unwrap();
  321. // // pool.install(|| {
  322. // // tasks.par_iter().enumerate().for_each(|(i, t)| {
  323. // // let config = ModkitConfig {threads: 2, ..Default::default() };
  324. // // if let collection::CollectionsTasks::DMRCDiagMrd { id, .. } = t { let _ = dmr_c_mrd_diag(id, &config); }
  325. // // println!("⚡ {i}/{len}");
  326. // // });
  327. // // });
  328. // Ok(())
  329. // }
  330. // #[test]
  331. // fn run_severus() -> anyhow::Result<()> {
  332. // init();
  333. // Severus::initialize("BANGA", Config::default())?.run()
  334. // }
  335. //
  336. // #[test]
  337. // fn run_severus_solo() -> anyhow::Result<()> {
  338. // init();
  339. // SeverusSolo::initialize("LAKHDHAR", "diag", Config::default())?.run()
  340. // }
  341. //
  342. // #[test]
  343. // fn check_versions() -> anyhow::Result<()> {
  344. // init();
  345. // let config = Config::default();
  346. // let v = Savana::version(&config)?;
  347. // info!("Savanna version {v}");
  348. // let v = Severus::version(&config)?;
  349. // info!("Severus version {v}");
  350. // Ok(())
  351. // }
  352. // #[test]
  353. // fn run_deepvariant() -> anyhow::Result<()> {
  354. // init();
  355. // DeepVariant::initialize("HAMROUNE", "diag", Config::default())?.run()
  356. // }
  357. // #[test]
  358. // fn run_clairs() -> anyhow::Result<()> {
  359. // init();
  360. // ClairS::initialize("ADJAGBA", Config::default())?.run()
  361. // }
  362. // #[test]
  363. // fn run_longphase() -> anyhow::Result<()> {
  364. // init();
  365. // let id = "BECERRA";
  366. // let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
  367. // let vcf = format!("/data/longreads_basic_pipe/{id}/diag/ClairS/clair3_normal_tumoral_germline_output.vcf.gz");
  368. // let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
  369. //
  370. // LongphaseHap::new(id, &diag_bam, &vcf, LongphaseConfig::default()).run()?;
  371. // LongphaseHap::new(id, &mrd_bam, &vcf, LongphaseConfig::default()).run()
  372. // }
  373. // #[test]
  374. // fn run_longphase_modcall() -> anyhow::Result<()> {
  375. // init();
  376. // let id = "ADJAGBA";
  377. // let time = "diag";
  378. // LongphaseModcallSolo::initialize(id, time, Config::default())?.run()
  379. // }
  380. //
  381. // #[test]
  382. // fn run_longphase_phase() -> anyhow::Result<()> {
  383. // init();
  384. // let id = "ADJAGBA";
  385. // LongphasePhase::initialize(id, Config::default())?.run()
  386. // }
  387. #[test]
  388. fn snv_parse() -> anyhow::Result<()> {
  389. init();
  390. // ClairS
  391. let row = "chr1\t10407\t.\tA\tG\t10.1\tPASS\tF\tGT:GQ:DP:AD:AF\t0/1:10:31:10,20:0.6452";
  392. let variant: VcfVariant = row.parse()?;
  393. let var_string = variant.into_vcf_row();
  394. assert_eq!(row, &var_string);
  395. let mut variant_col = VariantCollection {
  396. variants: vec![variant],
  397. vcf: collection::vcf::Vcf::new(
  398. "/data/longreads_basic_pipe/ACHITE/diag/ClairS/ACHITE_diag_clairs_PASSED.vcf.gz"
  399. .into(),
  400. )?,
  401. caller: Annotation::Callers(annotation::Caller::ClairS, annotation::Sample::Somatic),
  402. };
  403. let annotations = Annotations::default();
  404. variant_col.annotate_with_constit_bam(
  405. &annotations,
  406. "/data/longreads_basic_pipe/ACHITE/mrd/ACHITE_mrd_hs1.bam",
  407. 1,
  408. )?;
  409. // DeepVariant
  410. let row =
  411. "chr1\t10407\t.\tA\tG\t7.4\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t1/1:5:9:1,8:0.888889:5,5,0";
  412. variant_col.variants.push(row.parse()?);
  413. let anns: Vec<Vec<Annotation>> = variant_col
  414. .variants
  415. .iter()
  416. .filter_map(|e| {
  417. annotations
  418. .store
  419. .get(&e.hash())
  420. .map(|v| v.value().to_owned())
  421. })
  422. .collect();
  423. assert_eq!(anns[0], anns[1]);
  424. Ok(())
  425. }
  426. #[test]
  427. fn deletion_parse() -> anyhow::Result<()> {
  428. init();
  429. // Clairs
  430. let row = "chr1\t16760\t.\tAaag\tA\t8.48\tPASS\tF\tGT:GQ:DP:AD:AF\t0/1:8:39:22,17:0.4359";
  431. let variant: VcfVariant = row.parse()?;
  432. let var_string = variant.into_vcf_row();
  433. // case are not keeped
  434. assert_eq!(
  435. &var_string,
  436. "chr1\t16760\t.\tAAAG\tA\t8.48\tPASS\tF\tGT:GQ:DP:AD:AF\t0/1:8:39:22,17:0.4359"
  437. );
  438. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  439. let mut variant_col = VariantCollection {
  440. variants: vec![variant],
  441. vcf: collection::vcf::Vcf::new(
  442. "/data/longreads_basic_pipe/ACHITE/diag/ClairS/ACHITE_diag_clairs_PASSED.vcf.gz"
  443. .into(),
  444. )?,
  445. caller: Annotation::Callers(annotation::Caller::ClairS, annotation::Sample::Somatic),
  446. };
  447. let annotations = Annotations::default();
  448. // DeepVariant, VAF is not an f32 at it should be, sometimes too long removed the last number for eq
  449. let row = "chr12\t326911\t.\tGTGTA\tG\t4.7\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t0/1:5:8:5,3:0.37500:2,0,21";
  450. let variant: VcfVariant = row.parse()?;
  451. let var_string = variant.into_vcf_row();
  452. assert_eq!(row, &var_string);
  453. variant_col.variants.push(row.parse()?);
  454. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  455. // Severus, 0000 added to VAF and hVAF
  456. let row = "chr10\t108974982\tseverus_DEL885\tN\t<DEL>\t60\tPASS\tPRECISE;SVTYPE=DEL;SVLEN=474;END=108975456;STRANDS=+-;INSIDE_VNTR=TRUE;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:61:0.30000:0.30000,0.00000,0.00000:7:3";
  457. let variant: VcfVariant = row.parse()?;
  458. let var_string = variant.into_vcf_row();
  459. assert_eq!(row, &var_string);
  460. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  461. variant_col.variants.push(row.parse()?);
  462. // Nanomonsv dont parse last format remove: \t22:0 and nt putted in uppercase
  463. let row = "chr14\t5247080\td_106\tG\t<DEL>\t.\tPASS\tEND=5247255;SVTYPE=DEL;SVLEN=-175;SVINSLEN=39;SVINSSEQ=ATAACCCAGGTGATATAACACTTCTTTAGGCTCTGCCTA\tTR:VR\t18:3";
  464. let variant: VcfVariant = row.parse()?;
  465. let var_string = variant.into_vcf_row();
  466. assert_eq!(row, &var_string);
  467. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  468. variant_col.variants.push(row.parse()?);
  469. let row = "chr1\t20654667\t.\tTG\tT\t3.5\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t0/1:3:23:20,3:0.13044:0,0,16";
  470. let variant: VcfVariant = row.parse()?;
  471. let var_string = variant.into_vcf_row();
  472. assert_eq!(row, &var_string);
  473. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  474. variant_col.variants.push(row.parse()?);
  475. let row = "chr1\t2160094\t.\tTCTGACAGCCTGGAACAGCACCCACAACCGCAGGTGAGCATCTGACAGCCCGCAGCAGCACCCACACGCACAGGTGAGAATCTGACAGCCCGGAGCAGCACCCACACAGGCAGGGGAGCATCTGACATCCTGGAGCAGCACCGACAACCCCAGGTGAGCAACTGAGAGCCTGGAACAGCACCCACACCCCCAGGTGAGAATCTGACAGCCTGGAAGAGCACCCCACATCCCCGGGTGAGCATCTGACAGCCTGGAACAGCATCAACACCCCCAGGTGAGCATCCGATAGCCTGGAGCAGCACCCACACCCTCAGGTGAGCATCTGACAGCCTGGAACAGCAACCACACCCCCAGGTGAACATCTGACAGCCCGGAGCAGCACCCACACCCCCAGGTGAGCATCTGACAGCCTGGAACAGCACCCACACCCCCAGGAGAGCATCCGGTAGCCTGGAGCAGAACCCACACCCACAGGCGAGCATCTGACAGCCTGGGTTGGCACCCACACCCCCAGGTGAGCATCTGATGGTCTGGAGCAGCACCCACACCTACAGGAGAGCATCTGACAACCTGCAACAGAACCCAAACCCCCAGGTGAGCATCTGACAGACTGGAACAGCACCCTGCACCCCCAGGTGAGTATCTGACGGCCTGGAACAGAACACACAAGCCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATCAGACAGCCTGGAGTAGCACCCCACACCCCCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACTCACCAGGTGAGCATGTGACATGCTGGAACAGCACCCACACCCCCAGGCGAGCATCTGACAGCCTGGAGCGGCACCCCACACCCCCAGGTGAGCATCGGACAGCCTGGAGCAGTACCCACACCCCCAGGTGAGCATCCGACAGCCTGGAACAGCACCCACACCCCCAGGTGAGCATCTGACAGACAGGAACAGCACCCACATGTCCAGGTGAGCCTCTGACAGACTGGAACAGCACGCGCACCCCCAGGTGAGCATCTGACAGGCTGGAACAGCACCCACACCCCCAGGAGAGCATCTTACAGCATGTAACAGCACCCACACACCCACGTGAGCATCTGACAGCCTGGAACAGCACCCTGCACCCCCAGGTGCGCACGTGACAGCCTGGAACAGCACCCACACCCCCAGGCGAGCATCGGACAGCCTGGAGCAGCACCCCACACCCGCAGGTGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATGTGACAGCCTGGAACAGCACCCACACCCCCAGGCGAGCATCTGACAGCCTGGAGCAACACCCCACACCCCCAGGTGAGCATCGAACTGTCTGGAGCAGCACCCACAACCACAGGTGGGCATCGGAGAGAGTGGAGCAGCGCCCAGACCACCAGGCAAGCATCTGACAGCCTGGAGCAGTGCCCACACCCCCATGTGAGCATCTGACAGTATGGAGCAGCACCCACAGCCCAAGGTGAGCATCTGACAACCTGGAGCAGCACCCACACCCCCAGGCGAGCATCTGAACGCACAGAGCAGCACCCACACCCCCAGGCGAGCATCCGACAGCGTGGAGCAGCACCCACACTCCCAGGTGCGCGTGTGATGGTCTGGGGCAGCACCCACACACACAGGTGAGCCTGCGACAGCCTGGAGCAGCACCCACAGCCCCAGGTGAGCATCTGATGGTCTGGAGCAGCACCCACAACCACAGGTGAGCATCGGAGAGACTGGAGCAGCGCCGAAACCCCCAGGCGAACATTTGAGAGCCTGGAGCAGTGCCGACACCCCCAGGTGAGCATCTGACACCGTGGAGCAGCACCCACAGCCCAAGGTGAGCATCTGACAACCTGGAGCAGCACCCACAGCCCCAGGCGAGCATTTGAACGCACGGAGCAGGACCTACAGCCCCAGGCGAGCATCCGACAGCCTGGAGCAGCACCCACACACCCAGGTGAGCATCTGACAGCCTGGAGCAGCACCCACAACCCCAGGGGAGCATCTGACCGCATGGAATGGCATCCTCACCCGTAGGTGAACGTCCGACAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATCTGACAGCCTGGAACAGCACCTGCACCCCCGGGTGAGGATCAGATAGCCTGGAGCAGCACCCACACTCCAGGTGAGCATCTGACAGCCTGAAGCAGCACCCACACCAACAGGTGAGCATCTGACAGCCTGGAAGAGCACCCACAACCCCACGTGAGCATCTGACAGCCTGGAACAGGACCCTGCACCCCAAAGTGAGCATCTGACAACCTGGAGCAGGAACCACAACCCCAGGTGAGCATCTGATAGCCTGGAATAGCACCCACACACCCAGGTGAGCATCTGAGAGCCTGGAGCAGCACCCACACCCCCAGGTGAGCATCCGACAGCCTGGAACAGTACCCACACACCCAGGCGAGCATCTGACAGCCTGAAACAGCACACACACTCCCAGGTGAGCATCTCATATGCTGGAACAGCACCCACACCCCCAGGTGAGCATCTGACTGCCCGGAGCAGCACGCACACCCCCGGGTGAGCATCTGATAGCCTGGAACAGCACCCACACCCCCAGATGAGCATCCGACAGCCTGGAGCGGAGCCCACAGCCCCAGGCGAGCATCTGACAGCCTGGAACAGCACCCTGCATCCCCGGGTGAGGATCAGACAGCCTGGAGCAGCACCCACACTCCAGGTGAGCATCTGACAGCCTGAAGCAGCACCCACACCAACAGGTGAGCCTCTGACAGCCTGCAACAGCACCCACACCCCAAGGTGAGCATCTGACAGCCTGGAAGAGGACCCTGCAGCCCCAAGTGAGCATCTGACAACCTGAAGCAGCAACCACACTCCAAGGTGAGCATCCAACAGCCTGGAACAGCACCCACACACCCAGGTGAGCATCTGACAGCCTGGAGCAGCACCACACCCCCAGGTGAGCATCCGACAGCCTGGAACAGCACCTACAAACCCAGGAGAGCATCCGACAGCCTGGAGCAGCACCCACACCCCCAGGCGAGCATCTGACAGCCCGGAGCAGCACGCAAACCCCCAGGTGAGCATCTGATACCCTGGAACAGCACCCACACACCCAGGTGAGCATCCGACAGCCTAGAGCTGAACCCACACCAACAGGAGAGCATCTGACAGCCTGGGTCGGCACCCACACCCCCAGGTGAGCATCTGACAGCCTGGAACAGCACCCTGCACCCTCAGGTGCGCACGTGACAGCCTGGAACAGCACCCACACACCCAGGCGAGCATCTGACGGCCTGGAACGGCACCCACACCCCGAGGCGAGCATGGGACAGCCTGGAGAGCAGCCACACCCTCAGATGAGCATCTGACAGCCTGGAACAGCACCCTGCACCCCCAGGTGAGCATCTGACAGCCTGGAACAGGACCCACGTCCCCAGGCGAGCAA\tT\t3.1\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t0/1:3:15:11,4:0.26667:0,0,22";
  476. let variant: VcfVariant = row.parse()?;
  477. let var_string = variant.into_vcf_row();
  478. assert_eq!(row, &var_string);
  479. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  480. variant_col.variants.push(row.parse()?);
  481. let row = "chr1\t27073397\t.\tTGGGG\tT\t28.9\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t./.:2:13:1,3:0.23077:24,2,26";
  482. let variant: VcfVariant = row.parse()?;
  483. let var_string = variant.into_vcf_row();
  484. assert_eq!(row, &var_string);
  485. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  486. variant_col.variants.push(row.parse()?);
  487. let row = "chrY\t639648\t.\tCTTTTTT\tC\t10.4\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t0/1:1:7:0,2:0.28571:4,0,2";
  488. let variant: VcfVariant = row.parse()?;
  489. let var_string = variant.into_vcf_row();
  490. assert_eq!(row, &var_string);
  491. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  492. variant_col.variants.push(row.parse()?);
  493. let row = "chr11\t31260259\tseverus_BND1077_1\tN\t]chr11:36214171]N\t60\tPASS\tPRECISE;SVTYPE=BND;SVLEN=4953912;MATE_ID=severus_BND1077_2;STRANDS=+-;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:68:0.67000:0.67000,0.00000,0.00000:4:8";
  494. let variant: VcfVariant = row.parse()?;
  495. let var_string = variant.into_vcf_row();
  496. assert_eq!(row, &var_string);
  497. assert_eq!(AlterationCategory::DEL, variant.alteration_category());
  498. println!("{:#?}", variant.deletion_desc());
  499. variant_col.variants.push(row.parse()?);
  500. variant_col.annotate_with_constit_bam(
  501. &annotations,
  502. "/data/longreads_basic_pipe/ACHITE/mrd/ACHITE_mrd_hs1.bam",
  503. 1,
  504. )?;
  505. Ok(())
  506. }
  507. #[test]
  508. fn insertion_parse() -> anyhow::Result<()> {
  509. init();
  510. // Clairs
  511. let row = "chr1\t23005\t.\tT\tTCAC\t3.1\tPASS\tF\tGT:GQ:DP:AD:A\t0/1:3:13:9,4:0.3077";
  512. let variant: VcfVariant = row.parse()?;
  513. let var_string = variant.into_vcf_row();
  514. assert_eq!(&var_string, row);
  515. assert_eq!(AlterationCategory::INS, variant.alteration_category());
  516. let mut variant_col = VariantCollection {
  517. variants: vec![variant],
  518. vcf: collection::vcf::Vcf::new(
  519. "/data/longreads_basic_pipe/ACHITE/diag/ClairS/ACHITE_diag_clairs_PASSED.vcf.gz"
  520. .into(),
  521. )?,
  522. caller: Annotation::Callers(annotation::Caller::ClairS, annotation::Sample::Somatic),
  523. };
  524. let annotations = Annotations::default();
  525. // DeepVariant, VAF is not an f32 at it should be, sometimes too long removed the last number for eq
  526. let row = "chrX\t152732993\t.\tG\tGTTTTTTTTT\t18.9\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t1/1:3:6:0,5:0.50000:15,7,3";
  527. let variant: VcfVariant = row.parse()?;
  528. let var_string = variant.into_vcf_row();
  529. assert_eq!(row, &var_string);
  530. // variant_col.variants.push(row.parse()?);
  531. assert_eq!(AlterationCategory::INS, variant.alteration_category());
  532. // Severus, 0000 added to VAF and hVAF
  533. let row = "chr11\t26669932\tseverus_INS9403\tN\tT\t60\tPASS\tPRECISE;SVTYPE=INS;SVLEN=460;INSIDE_VNTR=TRUE;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:71:0.27000:0.27000,0.00000,0.00000:8:3";
  534. let variant: VcfVariant = row.parse()?;
  535. let var_string = variant.into_vcf_row();
  536. assert_eq!(row, &var_string);
  537. assert_eq!(AlterationCategory::INS, variant.alteration_category());
  538. // variant_col.variants.push(row.parse()?);
  539. // Nanomonsv dont parse last format remove: \t22:0 and nt putted in uppercase
  540. let row = "chr8\t87940084\td_333\tT\t<INS>\t.\tPASS\tEND=87940201;SVTYPE=INS;SVINSLEN=172;SVINSSEQ=TA\tTR:VR\t9:5";
  541. let variant: VcfVariant = row.parse()?;
  542. let var_string = variant.into_vcf_row();
  543. assert_eq!(row, &var_string);
  544. assert_eq!(AlterationCategory::INS, variant.alteration_category());
  545. // variant_col.variants.push(row.parse()?);
  546. //
  547. let row = "chr5\t36122736\t.\tT\tTGCTCCG\t3.9\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t0/1:4:28:11,16:0.57143:1,0,37";
  548. let variant: VcfVariant = row.parse()?;
  549. let var_string = variant.into_vcf_row();
  550. assert_eq!(row, &var_string);
  551. assert_eq!(AlterationCategory::INS, variant.alteration_category());
  552. variant_col.variants = Vec::new();
  553. variant_col.variants.push(row.parse()?);
  554. variant_col.annotate_with_constit_bam(
  555. &annotations,
  556. "/data/longreads_basic_pipe/PASSARD/mrd/PASSARD_mrd_hs1.bam",
  557. 1,
  558. )?;
  559. println!("{variant_col:?}");
  560. println!("{annotations:?}");
  561. Ok(())
  562. }
  563. #[test]
  564. fn trl_parse() -> anyhow::Result<()> {
  565. init();
  566. let row = "chr2\t207968575\tID_16420_1\ta\ta]chr11:41497080]\t.\tPASS\tSVTYPE=BND;MATEID=ID_16420_2;TUMOUR_READ_SUPPORT=4;TUMOUR_ALN_SUPPORT=4;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=0;BP_NOTATION=++;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=4;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.433;ORIGIN_MAPQ_MEAN=56.25;ORIGIN_EVENT_SIZE_STD_DEV=0;ORIGIN_EVENT_SIZE_MEDIAN=0;ORIGIN_EVENT_SIZE_MEAN=0;END_STARTS_STD_DEV=17.754;END_MAPQ_MEAN=56.25;END_EVENT_SIZE_STD_DEV=0;END_EVENT_SIZE_MEDIAN=0;END_EVENT_SIZE_MEAN=0;TUMOUR_DP_BEFORE=8,11;TUMOUR_DP_AT=4,11;TUMOUR_DP_AFTER=4,11;NORMAL_DP_BEFORE=7,16;NORMAL_DP_AT=7,16;NORMAL_DP_AFTER=7,16;TUMOUR_AF=1,0.364;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=1,3,0;NORMAL_TOTAL_HP_AT=4,3,0;TUMOUR_ALT_HP=2,1,1;TUMOUR_PS=207946665;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
  567. let variant: VcfVariant = row.parse()?;
  568. // let var_string = variant.into_vcf_row();
  569. let u = variant.n_alt_depth();
  570. println!("{u:?}");
  571. Ok(())
  572. }
  573. #[test]
  574. fn dup_parse() -> anyhow::Result<()> {
  575. init();
  576. let row = "chr9\t91352364\tID_14667_1\tC\t]chr9:98584394]C\t.\tPASS\tSVTYPE=BND;MATEID=ID_14667_2;TUMOUR_READ_SUPPORT=3;TUMOUR_ALN_SUPPORT=3;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=7232030;BP_NOTATION=-+;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=3;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.471;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=8.524;ORIGIN_EVENT_SIZE_MEDIAN=7232030;ORIGIN_EVENT_SIZE_MEAN=7232020;END_STARTS_STD_DEV=8.731;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=8.524;END_EVENT_SIZE_MEDIAN=7232030;END_EVENT_SIZE_MEAN=7232020;TUMOUR_DP_BEFORE=16,38;TUMOUR_DP_AT=20,33;TUMOUR_DP_AFTER=20,33;NORMAL_DP_BEFORE=26,22;NORMAL_DP_AT=26,22;NORMAL_DP_AFTER=26,22;TUMOUR_AF=0.15,0.091;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=7,12,1;NORMAL_TOTAL_HP_AT=13,13,0;TUMOUR_ALT_HP=0,2,1;TUMOUR_PS=90796185;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
  577. let variant: VcfVariant = row.parse()?;
  578. assert_eq!(AlterationCategory::DUP, variant.alteration_category());
  579. let row = "chr9\t98584394\tID_14667_2\tC\tC[chr9:91352364[\t.\tPASS\tSVTYPE=BND;MATEID=ID_14667_1;TUMOUR_READ_SUPPORT=3;TUMOUR_ALN_SUPPORT=3;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=7232030;BP_NOTATION=-+;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=3;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.471;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=8.524;ORIGIN_EVENT_SIZE_MEDIAN=7232030;ORIGIN_EVENT_SIZE_MEAN=7232020;END_STARTS_STD_DEV=8.731;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=8.524;END_EVENT_SIZE_MEDIAN=7232030;END_EVENT_SIZE_MEAN=7232020;TUMOUR_DP_BEFORE=38,16;TUMOUR_DP_AT=33,20;TUMOUR_DP_AFTER=33,20;NORMAL_DP_BEFORE=22,26;NORMAL_DP_AT=22,26;NORMAL_DP_AFTER=22,26;TUMOUR_AF=0.091,0.15;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=15,18,0;NORMAL_TOTAL_HP_AT=11,10,1;TUMOUR_ALT_HP=1,0,2;TUMOUR_PS=98176815;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
  580. let variant: VcfVariant = row.parse()?;
  581. assert_eq!(AlterationCategory::DUP, variant.alteration_category());
  582. let row = "chr1\t9218455\tr_0\tC\t<DUP>\t.\tPASS\tSVTYPE=DUP;SVLEN=12572;END=9231027\tTR:VR\t37:9";
  583. let variant: VcfVariant = row.parse()?;
  584. println!("{:#?}", variant.alteration_category());
  585. println!("{:#?}", variant.bnd_desc());
  586. let row = "chr1\t47110295\tseverus_BND178_1\tN\t]chr1:47192169]N\t60.00\tPASS\tPRECISE;SVTYPE=BND;SVLEN=81874;MATE_ID=severus_BND178_2;STRANDS=+-;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:66:0.38000:0.38000,0.00000,0.00000:8:5";
  587. let variant: VcfVariant = row.parse()?;
  588. println!("{:#?}", variant.alteration_category());
  589. println!("{:#?}", variant.bnd_desc());
  590. Ok(())
  591. }
  592. #[test]
  593. fn variant_parse() -> anyhow::Result<()> {
  594. let row =
  595. "chr1\t1366\t.\tC\tCCCT\t8.2\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t1/1:4:6:1,4:0.66667:6,4,0";
  596. let variant: VcfVariant = row.parse()?;
  597. let var_string = variant.into_vcf_row();
  598. assert_eq!(row, &var_string);
  599. let row = "chr1\t1366\t.\tC\tCCCT\t8.2\tPASS\t.";
  600. let variant: VcfVariant = row.parse()?;
  601. let var_string = variant.into_vcf_row();
  602. assert_eq!(row, &var_string);
  603. let row = "chr1\t2628434\t.\tC\tT\t17.973\tPASS\tH;FAU=0;FCU=7;FGU=0;FTU=7;RAU=0;RCU=2;RGU=0;RTU=2\tGT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU\t0/1:17:18:0.5:0,9:0:11:0,0:0:9:0:9:0:11:0:0";
  604. let variant: VcfVariant = row.parse()?;
  605. let var_string = variant.into_vcf_row();
  606. assert_eq!(row, &var_string);
  607. let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/.:1:24:3,5:0.208333";
  608. let variant: VcfVariant = row.parse()?;
  609. let var_string = variant.into_vcf_row();
  610. assert_eq!(row, &var_string);
  611. let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/1:1:24:3,5:0.208333";
  612. let variant_b: VcfVariant = row.parse()?;
  613. assert_eq!(variant, variant_b);
  614. let row = "chr1\t475157\t.\tA\tG\t12.301\tPASS\tH;FAU=2;FCU=0;FGU=2;FTU=0;RAU=3;RCU=0;RGU=3;RTU=0\tGT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU\t0/1:12:10:0.5:0,5:0.0769:13:0,1:5:0:5:0:12:0:1:0";
  615. let variant: VcfVariant = row.parse()?;
  616. let var_string = variant.into_vcf_row();
  617. assert_eq!(row, &var_string);
  618. let row = "chr1\t161417408\tr_10_0\tT\t[chr1:161417447[TTGGCAGGTTCC\t.\tPASS\tSVTYPE=BND;MATEID=r_10_1;SVINSLEN=11;SVINSSEQ=TTGGCAGGTTC\tTR:VR\t22:3\t12:0";
  619. let variant: VcfVariant = row.parse()?;
  620. println!("{variant:#?}");
  621. let u = variant.bnd_desc();
  622. println!("{u:#?}");
  623. // Severus mates are not in RC
  624. let vcf = "chr7\t27304522\tseverus_BND6747_1\tN\t[chr6:32688062[N\t60\tPASS\tPRECISE;SVTYPE=BND;MATE_ID=severus_BND6747_2;STRANDS=--;MAPQ=60;CLUSTERID=severus_2\tGT:VAF:hVAF:DR:DV\t0/1:0.29:0.29,0,0:12:5";
  625. let variant: VcfVariant = vcf.parse()?;
  626. let bnd_a = variant.bnd_desc()?;
  627. let vcf = "chr6\t32688062\tseverus_BND6747_2\tN\t[chr7:27304522[N\t60\tPASS\tPRECISE;SVTYPE=BND;MATE_ID=severus_BND6747_1;STRANDS=--;MAPQ=60;CLUSTERID=severus_2 GT:VAF:hVAF:DR:DV\t0/1:0.29:0.29,0,0:12:5";
  628. let variant: VcfVariant = vcf.parse()?;
  629. let bnd_b = variant.bnd_desc()?;
  630. assert_eq!(bnd_a, bnd_b.rc());
  631. println!("{bnd_a}\n{bnd_b}");
  632. // Savana here each mate are in RC
  633. let vcf = "chr10\t102039096\tID_35957_2\tG\t]chr10:101973386]G\t.\tPASS\tSVTYPE=BND;MATEID=ID_35957_1;TUMOUR_READ_SUPPORT=7;TUMOUR_ALN_SUPPORT=7;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=65710;BP_NOTATION=+-;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=7;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.35;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=7.248;ORIGIN_EVENT_SIZE_MEDIAN=65710;ORIGIN_EVENT_SIZE_MEAN=65705.4;END_STARTS_STD_DEV=7.007;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=7.248;END_EVENT_SIZE_MEDIAN=65710;END_EVENT_SIZE_MEAN=65705.4;TUMOUR_DP_BEFORE=38,29;TUMOUR_DP_AT=44,21;TUMOUR_DP_AFTER=44,21;NORMAL_DP_BEFORE=13,15;NORMAL_DP_AT=13,15;NORMAL_DP_AFTER=13,15;TUMOUR_AF=0.159,0.333;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=20,16,8;NORMAL_TOTAL_HP_AT=6,7,0;TUMOUR_ALT_HP=0,1,6;TUMOUR_PS=101917152;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
  634. let variant: VcfVariant = vcf.parse()?;
  635. let bnd_a = variant.bnd_desc()?;
  636. let vcf = "chr10\t101973386\tID_35957_1\tA\tA[chr10:102039096[\t.\tPASS\tSVTYPE=BND;MATEID=ID_35957_2;TUMOUR_READ_SUPPORT=7;TUMOUR_ALN_SUPPORT=7;NORMAL_READ_SUPPORT=0;NORMAL_ALN_SUPPORT=0;SVLEN=65710;BP_NOTATION=+-;SOURCE=SUPPLEMENTARY;CLUSTERED_READS_TUMOUR=7;CLUSTERED_READS_NORMAL=0;ORIGIN_STARTS_STD_DEV=0.35;ORIGIN_MAPQ_MEAN=60;ORIGIN_EVENT_SIZE_STD_DEV=7.248;ORIGIN_EVENT_SIZE_MEDIAN=65710;ORIGIN_EVENT_SIZE_MEAN=65705.4;END_STARTS_STD_DEV=7.007;END_MAPQ_MEAN=60;END_EVENT_SIZE_STD_DEV=7.248;END_EVENT_SIZE_MEDIAN=65710;END_EVENT_SIZE_MEAN=65705.4;TUMOUR_DP_BEFORE=29,38;TUMOUR_DP_AT=21,44;TUMOUR_DP_AFTER=21,44;NORMAL_DP_BEFORE=15,13;NORMAL_DP_AT=15,13;NORMAL_DP_AFTER=15,13;TUMOUR_AF=0.333,0.159;NORMAL_AF=0,0;TUMOUR_TOTAL_HP_AT=17,0,4;NORMAL_TOTAL_HP_AT=5,7,3;TUMOUR_ALT_HP=0,6,1;TUMOUR_PS=101917152;NORMAL_ALT_HP=0,0,0;CLASS=PREDICTED_SOMATIC\tGT\t0/1";
  637. let variant: VcfVariant = vcf.parse()?;
  638. let bnd_b = variant.bnd_desc()?;
  639. assert_eq!(bnd_a, bnd_b);
  640. println!("{bnd_a}\n{bnd_b}");
  641. // Deletions
  642. // Severus
  643. let vcf = "chr7\t143674704\tseverus_DEL7318\tN\t<DEL>\t60\tPASS≥\tPRECISE;SVTYPE=DEL;SVLEN=3642;END=143678346;STRANDS=+-;MAPQ=60\tGT:GQ:VAF:hVAF:DR:DV\t0/1:114:0.39:0.39,0,0:14:9";
  644. let variant: VcfVariant = vcf.parse()?;
  645. println!("{:?}", variant.infos);
  646. println!("{:?}", variant.formats);
  647. let del = variant.deletion_desc().unwrap();
  648. println!("{:?}", del);
  649. println!("{:?} {:?}", del.len(), variant.formats.n_alt_depth());
  650. assert_eq!(
  651. "chr7:143674704_143678346_del",
  652. variant.deletion_desc().unwrap().to_string()
  653. );
  654. println!("--\n");
  655. let vcf="chr7\t144003249\tr_106\tC\t<DEL>\t.\tPASS\tEND=144142733;SVTYPE=DEL;SVLEN=-139484;SVINSLEN=4;SVINSSEQ=GCCA\tTR:VR\t12:10\t51:0";
  656. let variant: VcfVariant = vcf.parse()?;
  657. println!("{:?}", variant.infos);
  658. println!("{:?}", variant.formats);
  659. let del = variant.deletion_desc().unwrap();
  660. println!("{:?}", del);
  661. println!("{:?} {:?}", del.len(), variant.formats.n_alt_depth());
  662. let path = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_Genes.bed";
  663. let r = read_bed(path)?;
  664. let deleted_genes = bedrow_overlaps_par(
  665. &r,
  666. &[&GenomeRange {
  667. contig: variant.position.contig,
  668. range: del.start..del.end,
  669. }],
  670. )
  671. .into_iter()
  672. .filter_map(|e| e.name)
  673. .collect::<Vec<String>>()
  674. .join(", ");
  675. println!("{deleted_genes}");
  676. Ok(())
  677. }
  678. #[test]
  679. // fn variant_load_deepvariant() -> anyhow::Result<()> {
  680. // init();
  681. // let id = "ADJAGBA";
  682. // let time = "diag";
  683. // let dv = DeepVariant::initialize(id, time, Config::default())?;
  684. // let annotations = Annotations::default();
  685. // let variant_collection = dv.variants(&annotations)?;
  686. // println!(
  687. // "Deepvariant for {id} {time}: variants {} {}",
  688. // variant_collection.variants.len(),
  689. // variant_collection.vcf.n_variants
  690. // );
  691. // Ok(())
  692. // }
  693. // #[test]
  694. // fn variant_load_clairs() -> anyhow::Result<()> {
  695. // init();
  696. // let id = "ELKLIFI";
  697. // let clairs = NanomonSV::initialize(id, Config::default())?;
  698. // let u = clairs.should_run();
  699. // info!("should_run {u:?}");
  700. // // let annotations = Annotations::default();
  701. // // let variant_collection = clairs.variants(&annotations)?;
  702. // // println!("ClairS for {id}: variants {} {}", variant_collection.variants.len(), variant_collection.vcf.n_variants);
  703. // Ok(())
  704. // }
  705. //
  706. // #[test]
  707. // fn variant_load_nanomonsv() -> anyhow::Result<()> {
  708. // init();
  709. // let id = "ADJAGBA";
  710. // let nanomonsv = NanomonSV::initialize(id, Config::default())?;
  711. // let annotations = Annotations::default();
  712. // let variant_collection = nanomonsv.variants(&annotations)?;
  713. // println!(
  714. // "NanomonSV for {id}: variants {} {}",
  715. // variant_collection.variants.len(),
  716. // variant_collection.vcf.n_variants
  717. // );
  718. // println!("{:?}", variant_collection.variants.first());
  719. // Ok(())
  720. // }
  721. // #[test]
  722. // fn variant_load_clairs_germline() -> anyhow::Result<()> {
  723. // init();
  724. // let id = "ADJAGBA";
  725. // let clairs = ClairS::initialize(id, Config::default())?;
  726. // let annotations = Annotations::default();
  727. // let germline_variant_collection = clairs.germline(&annotations)?;
  728. // println!("ClairS for {id}: variants {} {}", germline_variant_collection.variants.len(), germline_variant_collection.vcf.n_variants);
  729. // Ok(())
  730. // }
  731. #[test]
  732. fn overlaps() {
  733. init();
  734. let positions = vec![
  735. &GenomePosition {
  736. contig: 1,
  737. position: 100,
  738. },
  739. &GenomePosition {
  740. contig: 1,
  741. position: 150,
  742. },
  743. &GenomePosition {
  744. contig: 1,
  745. position: 200,
  746. },
  747. &GenomePosition {
  748. contig: 2,
  749. position: 150,
  750. },
  751. ];
  752. let ranges = vec![
  753. &GenomeRange {
  754. contig: 1,
  755. range: 50..150,
  756. },
  757. &GenomeRange {
  758. contig: 2,
  759. range: 100..200,
  760. },
  761. ];
  762. let parallel_overlapping_indices = overlaps_par(&positions, &ranges);
  763. assert_eq!(parallel_overlapping_indices, vec![0, 3])
  764. }
  765. #[test]
  766. fn bed_read() -> anyhow::Result<()> {
  767. init();
  768. let path = &Config::default().mask_bed("ADJAGBA");
  769. let r = read_bed(path)?;
  770. println!("{}", r.len());
  771. Ok(())
  772. }
  773. #[test]
  774. fn test_read_dict() -> anyhow::Result<()> {
  775. init();
  776. let genome = read_dict(&Config::default().dict_file)?;
  777. let genome_length: usize = genome.into_iter().map(|(_, len)| len as usize).sum();
  778. println!("{genome_length}");
  779. Ok(())
  780. }
  781. #[test]
  782. fn bases_at() -> anyhow::Result<()> {
  783. init();
  784. let id = "ADJAGBA";
  785. let c = Config::default();
  786. let chr = "chr3";
  787. let position = 62416039; // 1-based
  788. let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "diag"))?;
  789. let p = nt_pileup(&mut bam, chr, position - 1, false)?
  790. .iter()
  791. .map(|e| String::from_utf8(vec![*e]).unwrap())
  792. .collect::<Vec<_>>();
  793. let mut counts = HashMap::new();
  794. for item in p.iter() {
  795. *counts.entry(item.as_str()).or_insert(0) += 1;
  796. }
  797. for (key, value) in &counts {
  798. println!("{}: {}", key, value);
  799. }
  800. assert_eq!(8, *counts.get("C").unwrap());
  801. assert_eq!(13, *counts.get("G").unwrap());
  802. assert_eq!(6, *counts.get("D").unwrap());
  803. let chr = "chr1";
  804. let position = 3220; // 1-based
  805. let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
  806. let p = counts_at(&mut bam, chr, position - 1)?;
  807. println!("{p:#?}");
  808. Ok(())
  809. }
  810. #[test]
  811. fn seq_at() -> anyhow::Result<()> {
  812. init();
  813. let c = Config::default();
  814. let chr = "chr1";
  815. let position = 16761;
  816. let mut fasta_reader =
  817. noodles_fasta::io::indexed_reader::Builder::default().build_from_path(c.reference)?;
  818. let r = io::fasta::sequence_at(&mut fasta_reader, chr, position, 3)?;
  819. println!(
  820. "{r} ({} {:.2})",
  821. r.len(),
  822. estimate_shannon_entropy(r.as_str())
  823. );
  824. Ok(())
  825. }
  826. #[test]
  827. fn ins_at() -> anyhow::Result<()> {
  828. init();
  829. let id = "PASSARD";
  830. let c = Config::default();
  831. let chr = "chr5";
  832. let position = 36122736; // 1-based like in vcf
  833. let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
  834. // let p = ins_pileup(&mut bam, chr, position - 1, true)?.iter().map(|e| String::from_utf8(vec![*e]).unwrap()).collect::<Vec<_>>();
  835. let counts = counts_ins_at(&mut bam, chr, position - 1)?;
  836. println!("{counts:?}");
  837. for (key, value) in &counts {
  838. println!("{}: {}", key, value);
  839. }
  840. Ok(())
  841. }
  842. // #[test]
  843. // fn del_at() -> anyhow::Result<()> {
  844. // let id = "PASSARD";
  845. // let c = Config::default();
  846. //
  847. // let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
  848. //
  849. // let pileup_start = crate::collection::bam::nt_pileup(
  850. // &mut bam,
  851. // "chr5",
  852. // 36122735,
  853. // false,
  854. // )?;
  855. //
  856. // let uu = crate::collection::bam::nt_pileup_new(
  857. // &mut bam,
  858. // "chr5",
  859. // 36122735,
  860. // false,
  861. // )?;
  862. //
  863. // let pileup_end = crate::collection::bam::nt_pileup_new(
  864. // &mut bam,
  865. // &var.position.contig(),
  866. // del_repr.end.saturating_sub(1),
  867. // false,
  868. // )?;
  869. //
  870. // println!("{pileup_start:?}");
  871. // println!("{uu:?}");
  872. // let depth = uu.len().max(pileup_end.len());
  873. //
  874. // Ok(())
  875. //
  876. // }
  877. #[test]
  878. fn vep_line() -> anyhow::Result<()> {
  879. init();
  880. let line = "chr2_1922358_-/T\tchr2:1922357-1922358\tT\tMYT1L\tNM_001303052.2\tTranscript\tintron_variant\t-\t-\t-\t-\t-\t-\tIMPACT=MODIFIER;STRAND=-1;SYMBOL=MYT1L;SOURCE=chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz;HGVSc=NM_001303052.2:c.1619-613dup";
  881. let vep_line: VepLine = line.parse()?;
  882. println!("{vep_line:#?}");
  883. let vep: VEP = VEP::try_from(&vep_line)?;
  884. println!("{vep:#?}");
  885. Ok(())
  886. }
  887. // #[test]
  888. // fn savana_cn() -> anyhow::Result<()> {
  889. // init();
  890. // let id = "CAMARA";
  891. // // let s = SavanaCopyNumber::load_id(id, Config::default())?;
  892. // let s = SavanaReadCounts::load_id(id, Config::default())?;
  893. // println!("tumoral reads: {}", s.n_tumoral_reads());
  894. // println!("normal reads: {}", s.n_normal_reads());
  895. // println!("tumoral:\n{:#?}", s.norm_chr_counts());
  896. // Ok(())
  897. // }
  898. #[test]
  899. fn load_bam() -> anyhow::Result<()> {
  900. init();
  901. let id = "ADJAGBA";
  902. let time = "diag";
  903. let bam_path = Config::default().solo_bam(id, time);
  904. WGSBam::new(Path::new(&bam_path).to_path_buf())?;
  905. Ok(())
  906. }
  907. #[test]
  908. fn tar() -> anyhow::Result<()> {
  909. init();
  910. scan_archive("/data/lto/20241030_CUNVI-DG-N20_SEBZA-DG-N21.tar")?;
  911. Ok(())
  912. }
  913. // #[test]
  914. // fn somatic_cases() -> anyhow::Result<()> {
  915. // init();
  916. // let id = "PASSARD";
  917. // let config = Config {
  918. // somatic_pipe_force: true,
  919. // ..Default::default()
  920. // };
  921. // match SomaticPipe::initialize(id, config)?.run() {
  922. // Ok(_) => (),
  923. // Err(e) => error!("{id} {e}"),
  924. // };
  925. // Ok(())
  926. // }
  927. #[test]
  928. fn load_variants() -> anyhow::Result<()> {
  929. init();
  930. let id = "ACHITE";
  931. let config = Config::default();
  932. let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
  933. let variants = variant_collection::Variants::load_from_file(&path)?;
  934. println!("n variants {}", variants.data.len());
  935. let n_vep: usize = variants.data.iter().map(|v| v.vep().len()).sum();
  936. println!("VEP: {n_vep}");
  937. let translocations = variants.get_alteration_cat(AlterationCategory::TRL);
  938. println!("{} translocations", translocations.len());
  939. let threshold = 5;
  940. let res = group_variants_by_bnd_desc(&translocations, 5);
  941. let rres = group_variants_by_bnd_rc(&res, threshold);
  942. rres.iter().for_each(|group| {
  943. println!("{} {}", group.0.len(), group.1.len());
  944. });
  945. Ok(())
  946. }
  947. #[test]
  948. fn load_fc() -> anyhow::Result<()> {
  949. init();
  950. // FlowCells::load_archive_from_scan("/data/lto", "/data/archives.json")?;
  951. let r = FlowCells::load(
  952. "/home/prom/mnt/store",
  953. "/data/pandora_id_inputs.json",
  954. "/data/archives.json.gz",
  955. )?;
  956. println!("{r:#?}");
  957. Ok(())
  958. }
  959. #[test]
  960. fn alt_cat() -> anyhow::Result<()> {
  961. let id = "ADJAGBA";
  962. let config = Config::default();
  963. let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
  964. let variants = variant_collection::Variants::load_from_json(&path)?;
  965. println!("n variants {}", variants.data.len());
  966. variants
  967. .data
  968. .iter()
  969. .filter(|v| v.alteration_category().contains(&AlterationCategory::TRL))
  970. .for_each(|v| {
  971. println!(
  972. "{:?} {}",
  973. v.vcf_variants
  974. .iter()
  975. .map(|v| v.bnd_desc())
  976. .collect::<Vec<_>>(),
  977. v.annotations
  978. .iter()
  979. .filter(|a| matches!(a, Annotation::Callers(..)))
  980. .map(|a| a.to_string())
  981. .collect::<Vec<String>>()
  982. .join(";")
  983. )
  984. });
  985. Ok(())
  986. }
  987. #[test]
  988. fn variants_stats() -> anyhow::Result<()> {
  989. init();
  990. let config = Config::default();
  991. let all_variants_bit = find_files(&format!(
  992. "{}/*/diag/*_somatic_variants.bit",
  993. config.result_dir
  994. ))?;
  995. for v in all_variants_bit.into_iter() {
  996. let id = v
  997. .file_name()
  998. .unwrap()
  999. .to_str()
  1000. .unwrap()
  1001. .split("_somatic")
  1002. .next()
  1003. .unwrap();
  1004. println!("{id}");
  1005. let config = Config::default();
  1006. let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
  1007. match variant_collection::Variants::load_from_file(&path) {
  1008. Ok(mut variants) => {
  1009. let (mut high_depth_ranges, _) = somatic_depth_quality_ranges(id, &config)?;
  1010. high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  1011. let res = VariantsStats::new(&mut variants, id, &config, &high_depth_ranges)?
  1012. .save_to_json(&format!(
  1013. "{}/{id}/diag/{id}_somatic_variants_stats.json.gz",
  1014. config.result_dir
  1015. ));
  1016. if res.is_err() {
  1017. info!("{:#?}", res);
  1018. }
  1019. }
  1020. Err(e) => error!("{e}"),
  1021. }
  1022. }
  1023. Ok(())
  1024. }
  1025. // #[test]
  1026. // fn constit_stats() {
  1027. // init();
  1028. // let id = "ADJAGBA";
  1029. // let config = Config::default();
  1030. //
  1031. // let _ = const_stats(id.to_string(), config);
  1032. // }
  1033. // #[test]
  1034. // fn test_bnd() -> anyhow::Result<()> {
  1035. // init();
  1036. // let id = "COIFFET";
  1037. // let config = Config::default();
  1038. //
  1039. // let annotations = Annotations::default();
  1040. // let s = Savana::initialize(id, config)?.variants(&annotations)?;
  1041. // s.variants.iter().for_each(|e| {
  1042. // if let Ok(bnd) = e.bnd_desc() {
  1043. // println!("{}\t{}\t{}", e.position, e.reference, e.alternative);
  1044. // println!("{:#?}", bnd);
  1045. // }
  1046. // });
  1047. // Ok(())
  1048. // }
  1049. // #[test]
  1050. // fn parse_savana_seg() {
  1051. // init();
  1052. // let r = SavanaCN::parse_file("ADJAGBA", &config::Config::default())
  1053. // .unwrap()
  1054. // .segments;
  1055. // println!("{} lines", r.len());
  1056. // println!("{:#?}", r.first().unwrap());
  1057. // }
  1058. #[test]
  1059. fn whole_scan() -> anyhow::Result<()> {
  1060. init();
  1061. let id = "CHENU";
  1062. let mut config = Config::default();
  1063. let u = config.solo_bam(id, "mrd");
  1064. println!("{u}");
  1065. config.somatic_scan_force = true;
  1066. somatic_scan(id, &config)?;
  1067. Ok(())
  1068. }
  1069. #[test]
  1070. fn parse_gff() -> anyhow::Result<()> {
  1071. init();
  1072. let id = "ADJAGBA";
  1073. let config = Config::default();
  1074. let path = format!("{}/{id}/diag/somatic_variants.bit", config.result_dir);
  1075. let exon_ranges = features_ranges("exon", &config)?;
  1076. let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
  1077. let variants = variant_collection::Variants::load_from_file(&path)?;
  1078. let full = variants_stats::somatic_rates(&variants.data, &exon_ranges, &config);
  1079. info!("{full:#?}");
  1080. // let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
  1081. // .cloned().collect();
  1082. // let min_2 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config);
  1083. // info!("{min_2:#?}");
  1084. //
  1085. // let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
  1086. // .cloned().collect();
  1087. // let min_3 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config);
  1088. // info!("{min_3:#?}");
  1089. // let mut high_depth_ranges = variants_stats::high_depth_somatic(id, &config)?;
  1090. // high_depth_ranges.par_sort_by_key(|r| ( r.contig, r.range.start ));
  1091. //
  1092. // let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
  1093. // let exons_high_depth = range_intersection_par(&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(), &exon_ranges_ref);
  1094. //
  1095. // let full = variants_stats::somatic_rates(&variants.data, &exons_high_depth, &config);
  1096. // info!("{full:#?}");
  1097. //
  1098. // info!("n variants loaded: {}", variants.data.len());
  1099. //
  1100. // let r = features_ranges("exon", &config::Config::default())?;
  1101. // info!("n exon: {}", r.len());
  1102. //
  1103. // let merged = merge_overlapping_genome_ranges(&r);
  1104. // info!("n merged exon: {}", merged.len());
  1105. //
  1106. // let ol = par_overlaps(&variants.data, &r);
  1107. // info!("variants in exon {}", ol.len());
  1108. //
  1109. // let n_coding = ol.iter().filter_map(|i| variants.data[*i].best_vep().ok() ).filter_map(|bv| bv.impact()).filter(|impact| *impact <= VepImpact::MODERATE).count();
  1110. // info!("coding variants {n_coding}");
  1111. //
  1112. // let n_bases_m = merged.par_iter().map(|gr| gr.length()).sum::<u32>();
  1113. // info!("{n_bases_m}nt");
  1114. //
  1115. // let mega_base_m = n_bases_m as f64 / 10.0e6;
  1116. //
  1117. // let wgs_len = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum::<u32>();
  1118. // info!("wgs len {wgs_len}");
  1119. // let rate_wgs = variants.data.len() as f64 / (wgs_len as f64 / 10.0e6);
  1120. // info!("somatic mutation rate {rate_wgs:.2}/mb");
  1121. //
  1122. // let n_exons_mb = ol.len() as f64 / mega_base_m;
  1123. // info!("somatic mutation rate in the coding region {n_exons_mb:.2}/mb");
  1124. //
  1125. // let n_exons_mb = n_coding as f64 / mega_base_m;
  1126. // info!("somatic non synonymous mutation rate in the coding region {n_exons_mb:.2}/mb");
  1127. Ok(())
  1128. }
  1129. fn gr(contig: u8, start: u32, end: u32) -> GenomeRange {
  1130. GenomeRange {
  1131. contig,
  1132. range: start..end,
  1133. }
  1134. }
  1135. #[test]
  1136. fn test_both_empty() {
  1137. let a: Vec<&GenomeRange> = vec![];
  1138. let b: Vec<&GenomeRange> = vec![];
  1139. let result = range_intersection_par(&a, &b);
  1140. assert!(result.is_empty());
  1141. }
  1142. #[test]
  1143. fn test_one_empty() {
  1144. let a = [gr(1, 0, 100)];
  1145. let b: Vec<&GenomeRange> = vec![];
  1146. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1147. let mut result = range_intersection_par(&a_refs, &b);
  1148. sort_ranges(&mut result);
  1149. assert!(result.is_empty());
  1150. }
  1151. #[test]
  1152. fn test_single_range_no_overlap() {
  1153. let a = [gr(1, 0, 100)];
  1154. let b = [gr(1, 100, 200)];
  1155. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1156. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1157. let mut result = range_intersection_par(&a_refs, &b_refs);
  1158. sort_ranges(&mut result);
  1159. assert!(result.is_empty());
  1160. }
  1161. #[test]
  1162. fn test_single_range_full_overlap() {
  1163. let a = [gr(1, 0, 100)];
  1164. let b = [gr(1, 0, 100)];
  1165. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1166. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1167. let mut result = range_intersection_par(&a_refs, &b_refs);
  1168. let mut expected = [gr(1, 0, 100)];
  1169. sort_ranges(&mut result);
  1170. sort_ranges(&mut expected);
  1171. assert_eq!(result, expected);
  1172. }
  1173. #[test]
  1174. fn test_different_contigs() {
  1175. let a = [gr(1, 0, 100)];
  1176. let b = [gr(2, 0, 100)];
  1177. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1178. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1179. let mut result = range_intersection_par(&a_refs, &b_refs);
  1180. sort_ranges(&mut result);
  1181. assert!(result.is_empty());
  1182. }
  1183. #[test]
  1184. fn test_touching_ranges() {
  1185. let a = [gr(1, 0, 100)];
  1186. let b = [gr(1, 100, 200)];
  1187. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1188. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1189. let mut result = range_intersection_par(&a_refs, &b_refs);
  1190. sort_ranges(&mut result);
  1191. assert!(result.is_empty());
  1192. }
  1193. #[test]
  1194. fn test_complete_subrange() {
  1195. let a = [gr(1, 0, 200)];
  1196. let b = [gr(1, 50, 150)];
  1197. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1198. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1199. let mut result = range_intersection_par(&a_refs, &b_refs);
  1200. let mut expected = [gr(1, 50, 150)];
  1201. sort_ranges(&mut result);
  1202. sort_ranges(&mut expected);
  1203. assert_eq!(result, expected);
  1204. }
  1205. #[test]
  1206. fn test_multiple_overlaps_same_contig() {
  1207. let a = [gr(1, 0, 50), gr(1, 75, 125)];
  1208. let b = [gr(1, 25, 100), gr(1, 150, 200)];
  1209. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1210. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1211. let mut result = range_intersection_par(&a_refs, &b_refs);
  1212. let mut expected = [gr(1, 25, 50), gr(1, 75, 100)];
  1213. sort_ranges(&mut result);
  1214. sort_ranges(&mut expected);
  1215. assert_eq!(result, expected);
  1216. }
  1217. #[test]
  1218. fn test_multiple_contigs() {
  1219. let a = [gr(1, 0, 100), gr(2, 50, 150), gr(3, 200, 300)];
  1220. let b = [gr(1, 50, 150), gr(2, 0, 100), gr(4, 0, 100)];
  1221. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1222. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1223. let mut result = range_intersection_par(&a_refs, &b_refs);
  1224. let mut expected = [gr(1, 50, 100), gr(2, 50, 100)];
  1225. sort_ranges(&mut result);
  1226. sort_ranges(&mut expected);
  1227. assert_eq!(result, expected);
  1228. }
  1229. #[test]
  1230. fn test_adjacent_ranges() {
  1231. let a = [gr(1, 0, 50), gr(1, 50, 100)];
  1232. let b = [gr(1, 25, 75)];
  1233. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1234. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1235. let mut result = range_intersection_par(&a_refs, &b_refs);
  1236. let mut expected = [gr(1, 25, 50), gr(1, 50, 75)];
  1237. sort_ranges(&mut result);
  1238. sort_ranges(&mut expected);
  1239. assert_eq!(result, expected);
  1240. }
  1241. #[test]
  1242. fn test_minimal_overlap() {
  1243. let a = [gr(1, 0, 100)];
  1244. let b = [gr(1, 99, 200)];
  1245. let a_refs: Vec<&GenomeRange> = a.iter().collect();
  1246. let b_refs: Vec<&GenomeRange> = b.iter().collect();
  1247. let mut result = range_intersection_par(&a_refs, &b_refs);
  1248. let mut expected = [gr(1, 99, 100)];
  1249. sort_ranges(&mut result);
  1250. sort_ranges(&mut expected);
  1251. assert_eq!(result, expected);
  1252. }
  1253. /// helper to build a forward‑strand BND (same contig) where
  1254. /// A = pos, B = pos + 5
  1255. fn fwd(contig: &str, pos: u32) -> BNDDesc {
  1256. BNDDesc {
  1257. a_contig: contig.into(),
  1258. a_position: pos,
  1259. a_sens: true,
  1260. b_contig: contig.into(),
  1261. b_position: pos + 5,
  1262. b_sens: true,
  1263. added_nt: String::new(),
  1264. }
  1265. }
  1266. /// Build a six‑node *forward* chain relying **only** on `auto_connect()`
  1267. /// (no manual edges) and assert the Hamiltonian path spans all nodes.
  1268. #[test]
  1269. fn hamiltonian_chain_auto() {
  1270. // positions 10,15 20,25 … 60,65 satisfy B(u) ≤ A(v)
  1271. let bnds: Vec<BNDDesc> = (1..=6).map(|i| fwd("chr1", i * 10)).collect();
  1272. let g: BNDGraph<()> = bnds.to_bnd_graph(); // trait uses auto_connect()
  1273. // ensure auto_connect produced 5 edges in a line
  1274. assert_eq!(g.inner().edge_count(), 5);
  1275. let path = g.hamiltonian_path().expect("chain should be Hamiltonian");
  1276. assert_eq!(path.len(), 6);
  1277. }
  1278. /// Two disconnected "V" shapes -> auto_connect creates 2 edges in each
  1279. /// component, no Hamiltonian path, components sorted by size.
  1280. #[test]
  1281. fn components_after_auto_connect() {
  1282. // comp1: a->b<-c on reverse strand of chrX
  1283. let a = BNDDesc {
  1284. a_contig: "chrX".into(),
  1285. a_position: 300,
  1286. a_sens: false,
  1287. b_contig: "chrX".into(),
  1288. b_position: 250,
  1289. b_sens: false,
  1290. added_nt: String::new(),
  1291. };
  1292. let b = BNDDesc {
  1293. a_contig: "chrX".into(),
  1294. a_position: 200,
  1295. a_sens: false,
  1296. b_contig: "chrX".into(),
  1297. b_position: 150,
  1298. b_sens: false,
  1299. added_nt: String::new(),
  1300. };
  1301. let c = BNDDesc {
  1302. a_contig: "chrX".into(),
  1303. a_position: 100,
  1304. a_sens: false,
  1305. b_contig: "chrX".into(),
  1306. b_position: 50,
  1307. b_sens: false,
  1308. added_nt: String::new(),
  1309. };
  1310. // comp2: three‑node forward chain on chrY
  1311. let d = fwd("chrY", 10);
  1312. let e = fwd("chrY", 20);
  1313. let f = fwd("chrY", 30);
  1314. let g: BNDGraph<()> = vec![a, b, c, d, e, f].to_bnd_graph();
  1315. assert!(g.hamiltonian_path().is_none());
  1316. let comps = g.components_by_size();
  1317. comps.iter().for_each(|a| println!("{}", g.fmt_path(a)));
  1318. assert_eq!(comps.len(), 2);
  1319. assert_eq!(comps[0].len(), 3);
  1320. assert_eq!(comps[1].len(), 3);
  1321. }
  1322. #[test]
  1323. fn bnd_connections() {
  1324. // comp1: a->b<-c on reverse strand of chrX
  1325. let a = BNDDesc {
  1326. a_contig: "chrX".into(),
  1327. a_position: 300,
  1328. a_sens: true,
  1329. b_contig: "chr14".into(),
  1330. b_position: 250,
  1331. b_sens: false,
  1332. added_nt: String::new(),
  1333. };
  1334. let b = BNDDesc {
  1335. a_contig: "chr14".into(),
  1336. a_position: 200,
  1337. a_sens: false,
  1338. b_contig: "chrX".into(),
  1339. b_position: 301,
  1340. b_sens: true,
  1341. added_nt: String::new(),
  1342. };
  1343. let c = BNDDesc {
  1344. a_contig: "chrX".into(),
  1345. a_position: 302,
  1346. a_sens: true,
  1347. b_contig: "chrZ".into(),
  1348. b_position: 50,
  1349. b_sens: false,
  1350. added_nt: String::new(),
  1351. };
  1352. // comp2: three‑node forward chain on chrY
  1353. let d = fwd("chrY", 10);
  1354. let e = fwd("chrY", 20);
  1355. let f = fwd("chrY", 30);
  1356. let g: BNDGraph<()> = vec![a, b, c, d, e, f].to_bnd_graph();
  1357. assert!(g.hamiltonian_path().is_none());
  1358. let comps = g.components_by_size();
  1359. comps.iter().for_each(|a| println!("{}", g.fmt_path(a)));
  1360. assert_eq!(comps.len(), 2);
  1361. assert_eq!(comps[0].len(), 3);
  1362. assert_eq!(comps[1].len(), 3);
  1363. }
  1364. #[test]
  1365. fn snv_hg38() -> anyhow::Result<()> {
  1366. init();
  1367. let id = "CHALO";
  1368. let config = Config::default();
  1369. let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
  1370. let mut variants = variant_collection::Variants::load_from_file(&path)?;
  1371. info!("All: {}", variants.len());
  1372. variants.retain(|v| *v.alteration_category().first().unwrap_or(&AlterationCategory::Other) == AlterationCategory::SNV);
  1373. info!("SNV: {}", variants.len());
  1374. variants.in_place_merge();
  1375. info!("SNV: {}", variants.len());
  1376. // variants.write_vcf("/home/t_steimle/CHALO_hs1.vcf.gz", "/home/t_steimle/ref/hs1/chm13v2.0.dict", true)
  1377. variants.save_into_lifted(
  1378. "/home/t_steimle/CHALO_SNV_hg38.vcf",
  1379. "./unmapped",
  1380. "/home/t_steimle/ref/hs1/chm13v2-grch38.chain",
  1381. "/home/t_steimle/ref/hg38/hg38.fa",
  1382. 100,
  1383. )
  1384. }
  1385. }