vcf.rs 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. use anyhow::Context;
  2. use chrono::{DateTime, Utc};
  3. use csi::binning_index::ReferenceSequence;
  4. use glob::glob;
  5. use log::warn;
  6. use std::{collections::HashMap, fs::Metadata, os::unix::fs::MetadataExt, path::PathBuf};
  7. use noodles_csi as csi;
  8. use num_format::{Locale, ToFormattedString};
  9. use crate::commands::bcftools::{bcftools_index, BcftoolsConfig};
  10. #[derive(Debug, Clone)]
  11. pub struct Vcf {
  12. pub id: String,
  13. pub caller: String,
  14. pub time: String,
  15. pub path: PathBuf,
  16. pub file_metadata: Metadata,
  17. pub n_variants: u64,
  18. }
  19. impl Vcf {
  20. pub fn new(path: PathBuf) -> anyhow::Result<Self> {
  21. let stem = path
  22. .file_stem()
  23. .context("Can't parse stem")?
  24. .to_string_lossy()
  25. .to_string();
  26. let stem_splt: Vec<&str> = stem.split('_').collect();
  27. let id = stem_splt[0].to_string();
  28. let time = stem_splt[1].to_string();
  29. let caller = stem_splt[2..stem_splt.len() - 1].join("_");
  30. if !PathBuf::from(format!("{}.csi", path.display())).exists() {
  31. bcftools_index(&path.display().to_string(), &BcftoolsConfig::default())?;
  32. }
  33. let n_variants = n_variants(path.to_str().context("Can't convert path to str")?)?;
  34. let file_metadata = path
  35. .metadata()
  36. .context(format!("Can't access metadata for {}", path.display()))?;
  37. Ok(Self {
  38. id,
  39. caller,
  40. time,
  41. path,
  42. file_metadata,
  43. n_variants,
  44. })
  45. }
  46. pub fn modified(&self) -> anyhow::Result<DateTime<Utc>> {
  47. Ok(self.file_metadata.modified().unwrap().into())
  48. }
  49. pub fn size(&self) -> u64 {
  50. self.file_metadata.size()
  51. }
  52. pub fn tsv(&self) -> anyhow::Result<String> {
  53. Ok([
  54. self.id.clone(),
  55. self.time.clone(),
  56. self.caller.clone(),
  57. self.n_variants.to_string(),
  58. self.modified()?.to_string(),
  59. self.size().to_string(),
  60. self.path.display().to_string(),
  61. ]
  62. .join("\t"))
  63. }
  64. pub fn println(&self) -> anyhow::Result<()> {
  65. let formated_n_variants = self.n_variants.to_formatted_string(&Locale::en);
  66. let formated_modified = self.modified()?.naive_local().to_string();
  67. let formated_size = format!("{:#}", byte_unit::Byte::from_u64(self.size()));
  68. println!(
  69. "{}",
  70. [
  71. self.id.to_string(),
  72. self.time.to_string(),
  73. self.caller.to_string(),
  74. formated_n_variants,
  75. formated_modified,
  76. formated_size,
  77. self.path.display().to_string()
  78. ]
  79. .join("\t")
  80. );
  81. Ok(())
  82. }
  83. }
  84. #[derive(Debug)]
  85. pub struct VcfCollection {
  86. pub vcfs: Vec<Vcf>,
  87. }
  88. impl VcfCollection {
  89. pub fn new(result_dir: &str) -> Self {
  90. let mut vcfs = Vec::new();
  91. let pattern = format!("{}/*/*/*/*_PASSED.vcf.gz", result_dir);
  92. for entry in glob(&pattern).expect("Failed to read glob pattern") {
  93. match entry {
  94. Ok(path) => match Vcf::new(path) {
  95. Ok(vcf) => vcfs.push(vcf),
  96. Err(e) => warn!("{e}"),
  97. },
  98. Err(e) => warn!("Error: {:?}", e),
  99. }
  100. }
  101. VcfCollection { vcfs }
  102. }
  103. pub fn sort_by_id(&mut self) {
  104. self.vcfs.sort_by_key(|v| v.id.clone());
  105. }
  106. pub fn group_by_id(&self) -> Vec<(String, Vec<Vcf>)> {
  107. let mut vcf_by_ids: HashMap<String, Vec<Vcf>> = HashMap::new();
  108. self.vcfs.iter().for_each(|v| {
  109. vcf_by_ids.entry(v.id.clone()).or_default().push(v.clone());
  110. });
  111. vcf_by_ids.into_iter().collect()
  112. }
  113. }
  114. pub fn n_variants(path: &str) -> anyhow::Result<u64> {
  115. let csi_src = format!("{path}.csi");
  116. let index = csi::read(csi_src).context(format!("can't read index of {path}"))?;
  117. let mut n = 0;
  118. for reference_sequence in index.reference_sequences() {
  119. if let Some(metadata) = reference_sequence.metadata() {
  120. n += metadata.mapped_record_count()
  121. }
  122. }
  123. Ok(n)
  124. }