< 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. use std::{
  2. collections::HashMap,
  3. fmt,
  4. fs::{self, metadata},
  5. path::{Path, PathBuf},
  6. time::SystemTime,
  7. };
  8. use anyhow::Context;
  9. use chrono::{DateTime, Utc};
  10. use glob::glob;
  11. use log::{info, warn};
  12. use self::{bam::BamCollection, pod5::Pod5Collection, vcf::VcfCollection};
  13. use crate::{
  14. callers::{
  15. clairs::{ClairS, ClairSConfig},
  16. deep_variant::{DeepVariant, DeepVariantConfig},
  17. nanomonsv::{NanomonSV, NanomonSVConfig},
  18. },
  19. collection::pod5::FlowCellCase,
  20. commands::dorado::Dorado as BasecallAlign,
  21. config::Config,
  22. functions::{
  23. assembler::{Assembler, AssemblerConfig},
  24. variants::{Variants, VariantsConfig},
  25. whole_scan::{WholeScan, WholeScanConfig},
  26. },
  27. };
  28. pub mod bam;
  29. pub mod pod5;
  30. pub mod variants;
  31. pub mod vcf;
  32. #[derive(Debug, Clone)]
  33. pub struct CollectionsConfig {
  34. pub pod_dir: String,
  35. pub corrected_fc_path: String,
  36. pub result_dir: String,
  37. pub dict_file: String,
  38. pub min_diag_cov: f32,
  39. pub min_mrd_cov: f32,
  40. }
  41. impl Default for CollectionsConfig {
  42. fn default() -> Self {
  43. Self {
  44. pod_dir: "/data/run_data".to_string(),
  45. corrected_fc_path: "/data/flow_cells.tsv".to_string(),
  46. result_dir: "/data/longreads_basic_pipe".to_string(),
  47. dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
  48. min_diag_cov: 12.0,
  49. min_mrd_cov: 10.0,
  50. }
  51. }
  52. }
  53. #[derive(Debug)]
  54. pub struct Collections {
  55. pub config: CollectionsConfig,
  56. pub pod5: Pod5Collection,
  57. pub bam: BamCollection,
  58. pub vcf: VcfCollection,
  59. pub tasks: Vec<CollectionsTasks>,
  60. }
  61. impl Collections {
  62. pub fn new(config: CollectionsConfig) -> anyhow::Result<Self> {
  63. let CollectionsConfig {
  64. pod_dir,
  65. corrected_fc_path,
  66. result_dir,
  67. ..
  68. } = &config;
  69. let pod5 = Pod5Collection::new(pod_dir, corrected_fc_path, result_dir)?;
  70. let bam = BamCollection::new(result_dir);
  71. let vcf = VcfCollection::new(result_dir);
  72. Ok(Self {
  73. pod5,
  74. bam,
  75. vcf,
  76. tasks: Vec::new(),
  77. config,
  78. })
  79. }
  80. pub fn todo(&mut self) -> anyhow::Result<()> {
  81. info!("Looking for base calling tasks...");
  82. let mut tasks = Vec::new();
  83. let mut to_demux = Vec::new();
  84. for run in self.pod5.runs.iter() {
  85. for fc in run.flowcells.iter() {
  86. let acq_id = fc.pod5_info.acquisition_id.clone();
  87. for case in fc.cases.iter() {
  88. let bams_ids: Vec<String> = self
  89. .bam
  90. .get(&case.id, &case.time_point)
  91. .iter()
  92. .flat_map(|b| {
  93. b.composition
  94. .iter()
  95. .map(|c| c.0.clone())
  96. .collect::<Vec<String>>()
  97. })
  98. .filter(|id| *id == acq_id)
  99. .collect();
  100. if bams_ids.is_empty() {
  101. match fc.pod5_type {
  102. pod5::Pod5Type::Raw => to_demux.push(case.clone()),
  103. pod5::Pod5Type::Demuxed => {
  104. tasks.push(CollectionsTasks::Align(case.clone()))
  105. }
  106. }
  107. }
  108. }
  109. }
  110. }
  111. // Group for muxed and push task with all cases
  112. let mut grouped: HashMap<PathBuf, Vec<FlowCellCase>> = HashMap::new();
  113. for case in to_demux {
  114. grouped.entry(case.pod_dir.clone()).or_default().push(case);
  115. }
  116. grouped
  117. .into_values()
  118. .for_each(|data| tasks.push(CollectionsTasks::DemuxAlign(data)));
  119. // Whole scan
  120. for bam in self
  121. .bam
  122. .by_id_completed(self.config.min_diag_cov, self.config.min_mrd_cov)
  123. {
  124. let config = WholeScanConfig::default();
  125. let scan_dir = format!(
  126. "{}/{}/{}/{}",
  127. &config.result_dir, bam.id, bam.time_point, config.scan_dir
  128. );
  129. if PathBuf::from(&scan_dir).exists() {
  130. let dir_mod: DateTime<Utc> = fs::metadata(&scan_dir)?.modified()?.into();
  131. if bam.modified > dir_mod {
  132. fs::remove_dir_all(&scan_dir)?;
  133. }
  134. }
  135. if !PathBuf::from(&scan_dir).exists() {
  136. tasks.push(CollectionsTasks::WholeScan {
  137. id: bam.id,
  138. time_point: bam.time_point,
  139. bam: bam
  140. .path
  141. .to_str()
  142. .context("Cant convert path to string")?
  143. .to_string(),
  144. config: WholeScanConfig::default(),
  145. });
  146. }
  147. }
  148. // Remove VCF anterior to BAM
  149. let vcf_by_id = self.vcf.group_by_id();
  150. vcf_by_id.iter().for_each(|(id, vcfs)| {
  151. if let (Some(diag), Some(mrd)) = (
  152. self.bam.get(id, "diag").first(),
  153. self.bam.get(id, "mrd").first(),
  154. ) {
  155. let diag_modified = diag.modified;
  156. let mrd_modified = mrd.modified;
  157. let mut rm_paths: Vec<&Path> = vcfs
  158. .iter()
  159. .flat_map(|vcf| {
  160. let vcf_mod: DateTime<Utc> = vcf
  161. .file_metadata
  162. .modified()
  163. .expect("Can't read VCF modified time.")
  164. .into();
  165. // For somatic caller erase if one bam (diag or mrd) is more recent.
  166. if vcf.caller != "DeepVariant" {
  167. if vcf_mod < diag_modified || vcf_mod < mrd_modified {
  168. vec![vcf.path.parent().unwrap()]
  169. } else {
  170. vec![]
  171. }
  172. } else if (vcf.time_point == "diag" && vcf_mod < diag_modified)
  173. || (vcf.time_point == "mrd" && vcf_mod < mrd_modified)
  174. {
  175. vec![vcf.path.parent().unwrap()]
  176. } else {
  177. vec![]
  178. }
  179. })
  180. .collect();
  181. rm_paths.sort();
  182. rm_paths.dedup();
  183. rm_paths
  184. .iter()
  185. .for_each(|p| fs::remove_dir_all(p).expect("Can't erase caller dir."))
  186. }
  187. });
  188. // Variant calling
  189. info!("Looking for variant calling tasks...");
  190. self.bam.bams.iter().map(|b| b.id.clone()).for_each(|id| {
  191. if let (Some(diag), Some(mrd)) = (
  192. self.bam.get(&id, "diag").first(),
  193. self.bam.get(&id, "mrd").first(),
  194. ) {
  195. if let (Some(diag_cramino), Some(mrd_cramino)) = (&diag.cramino, &mrd.cramino) {
  196. if diag_cramino.mean_coverage >= self.config.min_diag_cov.into()
  197. && mrd_cramino.mean_coverage >= self.config.min_mrd_cov.into()
  198. {
  199. let caller_time: Vec<(&str, &str)> = vcf_by_id
  200. .iter()
  201. .filter(|(i, _)| *i == id)
  202. .flat_map(|(_, vcfs)| {
  203. vcfs.iter()
  204. .map(|vcf| (vcf.caller.as_str(), vcf.time_point.as_str()))
  205. })
  206. .collect();
  207. if !caller_time.contains(&("clairs", "diag"))
  208. || !caller_time.contains(&("clairs_indel", "diag"))
  209. {
  210. tasks.push(CollectionsTasks::ClairS {
  211. id: id.to_string(),
  212. diag_bam: diag.path.to_str().unwrap().to_string(),
  213. mrd_bam: mrd.path.to_str().unwrap().to_string(),
  214. config: ClairSConfig::default(),
  215. });
  216. }
  217. if !caller_time.contains(&("DeepVariant", "diag")) {
  218. tasks.push(CollectionsTasks::DeepVariant {
  219. id: id.to_string(),
  220. time_point: "diag".to_string(),
  221. bam: diag.path.to_str().unwrap().to_string(),
  222. config: DeepVariantConfig::default(),
  223. });
  224. }
  225. if !caller_time.contains(&("DeepVariant", "mrd")) {
  226. tasks.push(CollectionsTasks::DeepVariant {
  227. id: id.to_string(),
  228. time_point: "mrd".to_string(),
  229. bam: mrd.path.to_str().unwrap().to_string(),
  230. config: DeepVariantConfig::default(),
  231. });
  232. }
  233. if !caller_time.contains(&("nanomonsv", "diag")) {
  234. tasks.push(CollectionsTasks::NanomonSV {
  235. id: id.to_string(),
  236. diag_bam: diag.path.to_str().unwrap().to_string(),
  237. mrd_bam: mrd.path.to_str().unwrap().to_string(),
  238. config: NanomonSVConfig::default(),
  239. });
  240. }
  241. }
  242. }
  243. }
  244. });
  245. // Variants aggregation
  246. // info!("Looking for variants aggregation tasks...");
  247. // self.bam.bams.iter().filter(|b| b.time_point == "diag" ).for_each(|bam| {
  248. // let id = bam.id;
  249. // });
  250. // de novo
  251. tasks.extend(self.todo_assembler()?);
  252. // Tasks sorting and dedup
  253. let mut hs = HashMap::new();
  254. tasks.into_iter().for_each(|t| {
  255. hs.insert(t.to_string(), t);
  256. });
  257. self.tasks = hs.into_values().collect();
  258. Ok(())
  259. }
  260. pub fn tasks_dedup(&mut self) {
  261. let mut hs = HashMap::new();
  262. self.tasks.clone().into_iter().for_each(|t| {
  263. hs.insert(t.to_string(), t);
  264. });
  265. self.tasks = hs.into_values().collect();
  266. }
  267. pub fn todo_assembler(&mut self) -> anyhow::Result<Vec<CollectionsTasks>> {
  268. let mut tasks = Vec::new();
  269. let config = AssemblerConfig::default();
  270. for b in &self.bam.bams {
  271. let assemblies_dir = format!(
  272. "{}/{}/{}/{}",
  273. config.result_dir, b.id, b.time_point, config.output_dir_name
  274. );
  275. if !Path::new(&assemblies_dir).exists() {
  276. tasks.push(CollectionsTasks::Assemble {
  277. id: b.id.clone(),
  278. time_point: b.time_point.clone(),
  279. config: config.clone(),
  280. });
  281. continue;
  282. }
  283. let pattern = format!("{assemblies_dir}/*/*.bam");
  284. let mut mtimes: Vec<SystemTime> = glob(&pattern)?
  285. .filter_map(|entry| entry.ok())
  286. .filter_map(|path| metadata(path).ok()?.modified().ok())
  287. .collect();
  288. if mtimes.is_empty() {
  289. tasks.push(CollectionsTasks::Assemble {
  290. id: b.id.clone(),
  291. time_point: b.time_point.clone(),
  292. config: config.clone(),
  293. });
  294. continue;
  295. }
  296. mtimes.sort_unstable();
  297. mtimes.dedup();
  298. let max_mtime: DateTime<Utc> =
  299. mtimes.last().context("No modified time")?.to_owned().into();
  300. if b.modified > max_mtime {
  301. tasks.push(CollectionsTasks::Assemble {
  302. id: b.id.clone(),
  303. time_point: b.time_point.clone(),
  304. config: config.clone(),
  305. });
  306. }
  307. }
  308. Ok(tasks)
  309. }
  310. pub fn run(&mut self) -> anyhow::Result<()> {
  311. // self.tasks.reverse();
  312. if self.tasks.is_empty() {
  313. self.todo()?;
  314. if self.tasks.is_empty() {
  315. return Ok(());
  316. } else {
  317. self.run()?;
  318. }
  319. } else {
  320. let n_tasks = self.tasks.len();
  321. warn!("{n_tasks} tasks to run");
  322. let mut i = 1;
  323. while let Some(task) = self.tasks.pop() {
  324. warn!("Running {i}/{n_tasks}");
  325. info!("{task:#?}");
  326. task.run()?;
  327. i += 1;
  328. }
  329. }
  330. Ok(())
  331. }
  332. }
  333. use strum_macros::EnumOrd;
  334. #[derive(Debug, Clone, EnumOrd)]
  335. pub enum CollectionsTasks {
  336. Align(FlowCellCase),
  337. DemuxAlign(Vec<FlowCellCase>),
  338. WholeScan {
  339. id: String,
  340. time_point: String,
  341. bam: String,
  342. config: WholeScanConfig,
  343. },
  344. Assemble {
  345. id: String,
  346. time_point: String,
  347. config: AssemblerConfig,
  348. },
  349. DeepVariant {
  350. id: String,
  351. time_point: String,
  352. bam: String,
  353. config: DeepVariantConfig,
  354. },
  355. ClairS {
  356. id: String,
  357. diag_bam: String,
  358. mrd_bam: String,
  359. config: ClairSConfig,
  360. },
  361. NanomonSV {
  362. id: String,
  363. diag_bam: String,
  364. mrd_bam: String,
  365. config: NanomonSVConfig,
  366. },
  367. Variants {
  368. id: String,
  369. config: VariantsConfig,
  370. },
  371. }
  372. impl CollectionsTasks {
  373. pub fn run(self) -> anyhow::Result<()> {
  374. match self {
  375. CollectionsTasks::Align(case) => {
  376. BasecallAlign::init(case.clone(), Config::default())?.run_pipe()?;
  377. }
  378. CollectionsTasks::DemuxAlign(cases) => {
  379. BasecallAlign::from_mux(cases, Config::default())?;
  380. }
  381. CollectionsTasks::DeepVariant {
  382. id,
  383. time_point,
  384. bam,
  385. config,
  386. } => {
  387. DeepVariant::new(&id, &time_point, &bam, config).run();
  388. }
  389. CollectionsTasks::ClairS {
  390. id,
  391. diag_bam,
  392. mrd_bam,
  393. config,
  394. } => {
  395. ClairS::new(&id, &diag_bam, &mrd_bam, config).run();
  396. }
  397. CollectionsTasks::NanomonSV {
  398. id,
  399. diag_bam,
  400. mrd_bam,
  401. config,
  402. } => {
  403. NanomonSV::new(&id, &diag_bam, &mrd_bam, config).run();
  404. }
  405. CollectionsTasks::WholeScan {
  406. id,
  407. time_point,
  408. bam,
  409. config,
  410. } => {
  411. WholeScan::new(id, time_point, bam, config)?.run()?;
  412. }
  413. CollectionsTasks::Variants { id, config } => {
  414. Variants::new(id, config).run()?;
  415. }
  416. CollectionsTasks::Assemble {
  417. id,
  418. time_point,
  419. config,
  420. } => {
  421. Assembler::new(id, time_point, config).run()?;
  422. }
  423. }
  424. Ok(())
  425. }
  426. }
  427. // Implement Display for CollectionsTasks
  428. impl fmt::Display for CollectionsTasks {
  429. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  430. use CollectionsTasks::*;
  431. match self {
  432. Align(case) => write!(f, "Align task with: {:#?}", case),
  433. DemuxAlign(cases) => write!(f, "DemuxAlign task with: {:#?}", cases),
  434. DeepVariant {
  435. id,
  436. time_point,
  437. bam,
  438. ..
  439. } => {
  440. write!(
  441. f,
  442. "DeepVariant task with id: {}, time_point: {}, bam: {}",
  443. id, time_point, bam
  444. )
  445. }
  446. ClairS {
  447. id,
  448. diag_bam,
  449. mrd_bam,
  450. ..
  451. } => {
  452. write!(
  453. f,
  454. "ClairS task with id: {}, diag_bam: {}, mrd_bam: {}",
  455. id, diag_bam, mrd_bam
  456. )
  457. }
  458. NanomonSV {
  459. id,
  460. diag_bam,
  461. mrd_bam,
  462. ..
  463. } => {
  464. write!(
  465. f,
  466. "NanomonSV task with id: {}, diag_bam: {}, mrd_bam: {}",
  467. id, diag_bam, mrd_bam
  468. )
  469. }
  470. WholeScan { id, bam, .. } => write!(f, "Whole scan for id: {}, bam: {}", id, bam),
  471. Variants { id, .. } => write!(f, "Variants aggregation for id: {}", id),
  472. Assemble { id, time_point, .. } => {
  473. write!(f, "Assembly for id: {}, time point: {}", id, time_point)
  474. }
  475. }
  476. }
  477. }
  478. pub fn run_tasks(config: CollectionsConfig) -> anyhow::Result<()> {
  479. let mut last_n = Vec::new();
  480. loop {
  481. let mut collection = Collections::new(config.clone())?;
  482. collection.todo()?;
  483. if collection.tasks.is_empty() {
  484. warn!("All results are update");
  485. break;
  486. }
  487. let n_tasks = collection.tasks.len();
  488. warn!("{n_tasks} tasks to run");
  489. if last_n.len() > 2
  490. && last_n[last_n.len() - 1] == n_tasks
  491. && last_n[last_n.len() - 2] == n_tasks
  492. {
  493. warn!("Tasks don't progress");
  494. break;
  495. }
  496. last_n.push(n_tasks);
  497. collection.run()?;
  498. }
  499. Ok(())
  500. }