vcf.rs 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. use anyhow::Context;
  2. use chrono::{DateTime, Utc};
  3. use csi::binning_index::ReferenceSequence;
  4. use glob::glob;
  5. use log::warn;
  6. use std::{collections::HashMap, fs::Metadata, path::PathBuf};
  7. use noodles_csi as csi;
  8. use num_format::{Locale, ToFormattedString};
  9. #[derive(Debug, Clone)]
  10. pub struct Vcf {
  11. pub id: String,
  12. pub caller: String,
  13. pub time: String,
  14. pub path: PathBuf,
  15. pub file_metadata: Metadata,
  16. pub n_variants: u64,
  17. }
  18. impl Vcf {
  19. pub fn new(path: PathBuf) -> anyhow::Result<Self> {
  20. let stem = path
  21. .file_stem()
  22. .context("Can't parse stem")?
  23. .to_string_lossy()
  24. .to_string();
  25. let stem_splt: Vec<&str> = stem.split('_').collect();
  26. let id = stem_splt[0].to_string();
  27. let time = stem_splt[1].to_string();
  28. let caller = stem_splt[2..stem_splt.len() - 1].join("_");
  29. if !PathBuf::from(format!("{}.csi", path.display())).exists() {
  30. anyhow::bail!("CSI index for {} doesn't exist.", path.display())
  31. }
  32. let n_variants = n_variants(path.to_str().context("Can't convert path to str")?)?;
  33. let file_metadata = path
  34. .metadata()
  35. .context(format!("Can't access metadata for {}", path.display()))?;
  36. Ok(Self {
  37. id,
  38. caller,
  39. time,
  40. path,
  41. file_metadata,
  42. n_variants,
  43. })
  44. }
  45. pub fn modified(&self) -> anyhow::Result<DateTime<Utc>> {
  46. Ok(self.file_metadata.modified().unwrap().into())
  47. }
  48. pub fn size(&self) -> u64 {
  49. self.file_metadata.len()
  50. }
  51. pub fn tsv(&self) -> anyhow::Result<String> {
  52. Ok([
  53. self.id.clone(),
  54. self.time.clone(),
  55. self.caller.clone(),
  56. self.n_variants.to_string(),
  57. self.modified()?.to_string(),
  58. self.size().to_string(),
  59. self.path.display().to_string(),
  60. ]
  61. .join("\t"))
  62. }
  63. pub fn println(&self) -> anyhow::Result<()> {
  64. let formated_n_variants = self.n_variants.to_formatted_string(&Locale::en);
  65. let formated_modified = self.modified()?.naive_local().to_string();
  66. let formated_size = format!("{:#}", byte_unit::Byte::from_u64(self.size()));
  67. println!(
  68. "{}",
  69. [
  70. self.id.to_string(),
  71. self.time.to_string(),
  72. self.caller.to_string(),
  73. formated_n_variants,
  74. formated_modified,
  75. formated_size,
  76. self.path.display().to_string()
  77. ]
  78. .join("\t")
  79. );
  80. Ok(())
  81. }
  82. }
  83. #[derive(Debug, Default)]
  84. pub struct VcfCollection {
  85. pub vcfs: Vec<Vcf>,
  86. }
  87. impl VcfCollection {
  88. pub fn new(result_dir: &str) -> Self {
  89. let mut vcfs = Vec::new();
  90. let pattern = format!("{}/*/*/*/*_PASSED.vcf.gz", result_dir);
  91. for entry in glob(&pattern).expect("Failed to read glob pattern") {
  92. match entry {
  93. Ok(path) => match Vcf::new(path) {
  94. Ok(vcf) => vcfs.push(vcf),
  95. Err(e) => warn!("{e}"),
  96. },
  97. Err(e) => warn!("Error: {:?}", e),
  98. }
  99. }
  100. VcfCollection { vcfs }
  101. }
  102. pub fn sort_by_id(&mut self) {
  103. self.vcfs.sort_by_key(|v| v.id.clone());
  104. }
  105. pub fn group_by_id(&self) -> Vec<(String, Vec<Vcf>)> {
  106. let mut vcf_by_ids: HashMap<String, Vec<Vcf>> = HashMap::new();
  107. self.vcfs.iter().for_each(|v| {
  108. vcf_by_ids.entry(v.id.clone()).or_default().push(v.clone());
  109. });
  110. vcf_by_ids.into_iter().collect()
  111. }
  112. }
  113. pub fn n_variants(path: &str) -> anyhow::Result<u64> {
  114. let csi_src = format!("{path}.csi");
  115. let index = csi::fs::read(&csi_src).with_context(|| format!("can't read index of {path}"))?;
  116. let n = index
  117. .reference_sequences()
  118. .iter()
  119. .filter_map(|rs| rs.metadata())
  120. .map(|m| m.mapped_record_count())
  121. .sum();
  122. Ok(n)
  123. }