lib.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. use anyhow::{Ok, Result};
  2. use log::info;
  3. use minimap2::{Aligner, Mapping};
  4. use noodles_fasta as fasta;
  5. use rust_htslib::bam::{self, Record};
  6. use std::{
  7. collections::{HashMap, VecDeque},
  8. fmt::{self, format},
  9. fs::{self, File},
  10. io::{BufWriter, Write},
  11. process::{Command, Stdio},
  12. };
  13. use uuid::Uuid;
  14. #[derive(Debug, Clone)]
  15. pub struct Genome {
  16. pub chromosomes: HashMap<String, Chromosome>,
  17. }
  18. #[derive(Debug, Clone)]
  19. pub struct Chromosome {
  20. contigs: Vec<Contig>,
  21. }
  22. #[derive(Debug, Clone)]
  23. pub struct Contig {
  24. pub id: String,
  25. // contig seq on ref
  26. pub mappings: Vec<Mapping>,
  27. // reads on ref
  28. pub supporting_records: Vec<Record>,
  29. pub sequence: String,
  30. pub contig_ref: ContigRef,
  31. }
  32. #[derive(Debug, Clone)]
  33. pub enum ContigRef {
  34. Unique(Mapping),
  35. Chimeric((Mapping, Mapping)),
  36. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  37. LeftAmbiguity((Vec<Mapping>, Mapping)),
  38. RightAmbiguity((Mapping, Vec<Mapping>)),
  39. Ambigous(Vec<Mapping>),
  40. }
  41. impl fmt::Display for ContigRef {
  42. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  43. let str = match self {
  44. ContigRef::Unique(m) => mapping_to_string(m),
  45. ContigRef::Chimeric((a, b)) => {
  46. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  47. }
  48. ContigRef::ChimericMultiple((a, v, b)) => format!(
  49. "{}<->{}<->{}",
  50. mapping_to_string(a),
  51. mappings_to_string(v),
  52. mapping_to_string(b)
  53. ),
  54. ContigRef::LeftAmbiguity((v, b)) => {
  55. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  56. }
  57. ContigRef::RightAmbiguity((a, v)) => {
  58. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  59. }
  60. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  61. };
  62. fmt.write_str(&str).unwrap();
  63. std::result::Result::Ok(())
  64. }
  65. }
  66. impl ContigRef {
  67. pub fn hgvs(&self) -> Option<String> {
  68. match self {
  69. ContigRef::Unique(_) => None,
  70. ContigRef::Chimeric((a, b)) => {
  71. if a.target_name == b.target_name {
  72. let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  73. let end = a.target_end;
  74. let start = b.target_start;
  75. Some(format!("{chr}:{end}_{start}"))
  76. } else {
  77. None
  78. }
  79. }
  80. ContigRef::ChimericMultiple(_) => None,
  81. ContigRef::LeftAmbiguity(_) => None,
  82. ContigRef::RightAmbiguity(_) => None,
  83. ContigRef::Ambigous(_) => None,
  84. }
  85. }
  86. }
  87. pub fn mapping_to_string(mapping: &Mapping) -> String {
  88. let uk = "UNKNOWN".to_string();
  89. format!(
  90. "{}:{}-{}({}:{}-{})",
  91. mapping.target_name.clone().unwrap_or(uk.clone()),
  92. mapping.target_start,
  93. mapping.target_end,
  94. mapping.query_name.clone().unwrap_or(uk),
  95. mapping.query_start,
  96. mapping.query_end
  97. )
  98. }
  99. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  100. let v = mappings
  101. .iter()
  102. .map(mapping_to_string)
  103. .collect::<Vec<String>>();
  104. v.join("//")
  105. }
  106. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  107. let mut mappings = mappings;
  108. if mappings.len() == 1 {
  109. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  110. } else {
  111. let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  112. if grouped.len() == 1 {
  113. let r = grouped.into_iter().flat_map(|e| e).collect();
  114. return Ok(ContigRef::Ambigous(r));
  115. } else if grouped.len() >= 2 {
  116. let first = grouped.pop_back().unwrap();
  117. let last = grouped.pop_front().unwrap();
  118. if grouped.len() == 0 {
  119. if first.len() == 1 && last.len() == 1 {
  120. return Ok(ContigRef::Chimeric((
  121. first.get(0).unwrap().clone(),
  122. last.get(0).unwrap().clone(),
  123. )));
  124. } else if first.len() == 1 {
  125. return Ok(ContigRef::RightAmbiguity((
  126. first.get(0).unwrap().clone(),
  127. last.clone(),
  128. )));
  129. } else if last.len() == 1 {
  130. return Ok(ContigRef::LeftAmbiguity((
  131. first.clone(),
  132. last.get(0).unwrap().clone(),
  133. )));
  134. } else {
  135. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  136. return Ok(ContigRef::Ambigous(all));
  137. }
  138. } else {
  139. }
  140. if first.len() == 1 && last.len() == 1 {
  141. return Ok(ContigRef::ChimericMultiple((
  142. first.get(0).unwrap().clone(),
  143. grouped.into_iter().flat_map(|e| e).collect(),
  144. last.get(0).unwrap().clone(),
  145. )));
  146. } else if first.len() == 1 {
  147. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  148. .into_iter()
  149. .flat_map(|e| e)
  150. .collect();
  151. return Ok(ContigRef::RightAmbiguity((
  152. first.get(0).unwrap().clone(),
  153. right,
  154. )));
  155. } else if last.len() == 1 {
  156. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  157. .into_iter()
  158. .flat_map(|e| e)
  159. .collect();
  160. return Ok(ContigRef::LeftAmbiguity((
  161. left,
  162. last.get(0).unwrap().clone(),
  163. )));
  164. } else {
  165. let all: Vec<Mapping> =
  166. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  167. .into_iter()
  168. .flat_map(|e| e)
  169. .collect();
  170. return Ok(ContigRef::Ambigous(all));
  171. }
  172. } else {
  173. return Ok(ContigRef::Ambigous(
  174. grouped.into_iter().flat_map(|e| e).collect(),
  175. ));
  176. }
  177. }
  178. }
  179. impl Genome {
  180. pub fn new() -> Self {
  181. Genome {
  182. chromosomes: HashMap::new(),
  183. }
  184. }
  185. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  186. self.chromosomes.iter()
  187. }
  188. pub fn add_contig(
  189. &mut self,
  190. id: String,
  191. mappings: Vec<Mapping>,
  192. supporting_records: Vec<Record>,
  193. sequence: String,
  194. ) -> Result<()> {
  195. let new_contig = Contig {
  196. id,
  197. mappings: mappings.clone(),
  198. supporting_records,
  199. sequence,
  200. contig_ref: get_ref_pos(mappings)?,
  201. };
  202. // get the category of Mapping
  203. match new_contig.contig_ref.clone() {
  204. ContigRef::Unique(contig_mapping) => {
  205. match self
  206. .chromosomes
  207. .get_mut(&contig_mapping.target_name.unwrap())
  208. {
  209. Some(chromosome) => {
  210. chromosome.contigs.push(new_contig);
  211. }
  212. None => (),
  213. }
  214. }
  215. ContigRef::Chimeric((a, b)) => {
  216. let a_target_name = a.target_name.unwrap();
  217. let b_target_name = b.target_name.unwrap();
  218. if a_target_name == b_target_name {
  219. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  220. chromosome.contigs.push(new_contig);
  221. } else {
  222. self.chromosomes.insert(
  223. a_target_name,
  224. Chromosome {
  225. contigs: vec![new_contig],
  226. },
  227. );
  228. }
  229. } else {
  230. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  231. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  232. chromosome.contigs.push(new_contig);
  233. } else {
  234. self.chromosomes.insert(
  235. chimeric_name,
  236. Chromosome {
  237. contigs: vec![new_contig],
  238. },
  239. );
  240. }
  241. }
  242. }
  243. ContigRef::ChimericMultiple((left, _, right)) => {
  244. let left_target_name = left.target_name.unwrap();
  245. let right_target_name = right.target_name.unwrap();
  246. if left_target_name == right_target_name {
  247. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  248. chromosome.contigs.push(new_contig);
  249. } else {
  250. self.chromosomes.insert(
  251. left_target_name,
  252. Chromosome {
  253. contigs: vec![new_contig],
  254. },
  255. );
  256. }
  257. } else {
  258. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  259. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  260. chromosome.contigs.push(new_contig);
  261. } else {
  262. self.chromosomes.insert(
  263. chimeric_name,
  264. Chromosome {
  265. contigs: vec![new_contig],
  266. },
  267. );
  268. }
  269. }
  270. }
  271. _ => {
  272. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  273. chromosome.contigs.push(new_contig);
  274. } else {
  275. self.chromosomes.insert(
  276. "Ambigous".to_string(),
  277. Chromosome {
  278. contigs: vec![new_contig],
  279. },
  280. );
  281. }
  282. }
  283. };
  284. Ok(())
  285. }
  286. pub fn stats(&self) {
  287. for (k, v) in self.chromosomes.iter() {
  288. info!("{}:{}", k, v.contigs.len());
  289. }
  290. }
  291. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  292. if let Some(chr) = self.chromosomes.get(chromosome) {
  293. Some(chr.contigs.clone())
  294. } else {
  295. None
  296. }
  297. }
  298. }
  299. impl Chromosome {
  300. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  301. self.contigs.iter()
  302. }
  303. }
  304. impl Contig {
  305. pub fn sort(&mut self) {
  306. self.mappings
  307. .sort_by(|a, b| a.target_start.cmp(&b.target_start));
  308. }
  309. pub fn to_igv(&self, dir_path: &str) -> Result<()> {
  310. let contig_dir = format!("{dir_path}/{}", self.id);
  311. fs::create_dir_all(contig_dir.clone())?;
  312. let fasta_path = format!("{contig_dir}/contig.fa");
  313. write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
  314. write_fai(&fasta_path);
  315. let reads_path = format!("{contig_dir}/reads.fa");
  316. let n_reads = self.supporting_records.clone().into_iter().map(|r| {
  317. (
  318. String::from_utf8(r.qname().to_vec()).unwrap(),
  319. String::from_utf8(r.seq().as_bytes()).unwrap(),
  320. )
  321. }).collect();
  322. write_fasta(&reads_path, &n_reads);
  323. let bam_path = format!("{contig_dir}/{}.bam", self.id);
  324. create_bam(&fasta_path, &reads_path, &bam_path)?;
  325. let bed_path = format!("{contig_dir}/contig.bed");
  326. match &self.contig_ref {
  327. ContigRef::Chimeric((a, b)) => {
  328. let d = vec![
  329. (self.id.clone(), a.query_start, a.query_end, format!("{}:{}-{}", a.target_name.clone().unwrap(), a.target_start, a.target_end)),
  330. (self.id.clone(), b.query_start, b.query_end, format!("{}:{}-{}", b.target_name.clone().unwrap(), b.target_start, b.target_end))
  331. ];
  332. write_bed(&bed_path, &d)?;
  333. },
  334. _ => ()
  335. }
  336. Ok(())
  337. }
  338. // bug cigar len != seq len
  339. pub fn write_bam(&self, path: &str) -> Result<()> {
  340. let aligner = Aligner::builder()
  341. .asm5()
  342. .with_threads(8)
  343. .with_cigar()
  344. .with_seq(self.sequence.as_bytes())
  345. .expect("Unable to build index");
  346. let mut mappings = Vec::new();
  347. for record in self.supporting_records.iter() {
  348. let seq = record.seq().as_bytes();
  349. let alignment = aligner
  350. .map(&seq, false, false, None, None)
  351. .expect("Unable to align");
  352. mappings.push(alignment);
  353. }
  354. let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
  355. mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
  356. let mut header = bam::Header::new();
  357. let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
  358. sq_record.push_tag(b"SN", self.id.clone());
  359. sq_record.push_tag(b"LN", self.sequence.len());
  360. header.push_record(&sq_record);
  361. let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  362. // copy reverse reads to new BAM file
  363. for mapping in mappings.iter() {
  364. let record = minimap2::htslib::mapping_to_record(
  365. Some(mapping),
  366. self.sequence.as_bytes(),
  367. header.clone(),
  368. None,
  369. Some(Uuid::new_v4().as_bytes()),
  370. );
  371. let _ = out.write(&record);
  372. }
  373. rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
  374. Ok(())
  375. }
  376. pub fn hgvs(&self) -> Option<String> {
  377. self.contig_ref.hgvs()
  378. }
  379. }
  380. fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
  381. // sort alignments by query_start
  382. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  383. let mut alignments: Vec<Vec<Mapping>> = vec![];
  384. // group by overlapps > 30
  385. for aln in mappings.iter() {
  386. let last = alignments.last_mut();
  387. if let Some(l) = last {
  388. if l.iter()
  389. .filter(|a| a.query_end - aln.query_start > 30)
  390. .count()
  391. > 0
  392. {
  393. l.push(aln.clone());
  394. } else {
  395. alignments.push(vec![aln.clone()]);
  396. }
  397. } else {
  398. alignments.push(vec![aln.clone()]);
  399. }
  400. }
  401. Ok(alignments)
  402. }
  403. pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> {
  404. let file = File::create(path).unwrap();
  405. let mut writer = BufWriter::new(file);
  406. for (chr, start, end, value) in d.iter() {
  407. let row = format!("{}\n", vec![chr.to_string(), start.to_string(), end.to_string(), value.to_string()].join("\t"));
  408. writer.write_all(row.as_bytes())?;
  409. }
  410. Ok(())
  411. }
  412. // unique
  413. pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
  414. let file = File::create(fastq_path)?;
  415. let mut writer = BufWriter::new(file);
  416. for record in d {
  417. let name = String::from_utf8(record.qname().to_vec()).unwrap();
  418. writer.write_all(format!("@{name}\n").as_bytes())?;
  419. let seq = record.seq().as_bytes();
  420. writer.write_all(&seq)?;
  421. writer.write_all(b"\n+\n")?;
  422. let qual = record.qual();
  423. writer.write_all(qual)?;
  424. }
  425. Ok(())
  426. }
  427. pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
  428. let file = File::create(fasta_path).unwrap();
  429. let mut writer = fasta::writer::Builder::default().build_with_writer(file);
  430. let mut passed = Vec::new();
  431. for (name, sequence) in d {
  432. let name = name.to_string();
  433. if sequence.len() == 0 {
  434. continue;
  435. }
  436. if passed.contains(&name) {
  437. continue;
  438. }
  439. passed.push(name.clone());
  440. let record = fasta::Record::new(
  441. fasta::record::Definition::new(name.as_str(), None),
  442. fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
  443. );
  444. writer.write_record(&record).unwrap();
  445. }
  446. }
  447. pub fn write_fai(path: &str) {
  448. let mut faidx = Command::new("samtools").arg("faidx").arg(path).spawn().expect("Samtools faidx failed to start");
  449. faidx.wait().unwrap();
  450. }
  451. // pub fn write_aln_sh()
  452. pub fn create_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
  453. let rg_id = uuid::Uuid::new_v4();
  454. let mm2 = Command::new("minimap2")
  455. .arg("-t")
  456. .arg("128")
  457. .arg("-ax")
  458. .arg("map-ont")
  459. .arg("-R")
  460. .arg(format!(
  461. "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
  462. ))
  463. .arg(ref_path)
  464. .arg(reads_path)
  465. .stdout(Stdio::piped())
  466. // .stdout(Stdio::from(output_file))
  467. .spawn()
  468. .expect("Minimap2 failed to start");
  469. // mm2.wait().unwrap();
  470. let view = Command::new("sambamba")
  471. .arg("view")
  472. .arg("-h")
  473. .arg("-S")
  474. .arg("-t")
  475. .arg("20")
  476. .arg("--format=bam")
  477. .arg("/dev/stdin")
  478. .stdin(Stdio::from(mm2.stdout.unwrap()))
  479. .stdout(Stdio::piped())
  480. .spawn()
  481. .expect("Sambamba view failed to start");
  482. let mut sort = Command::new("sambamba")
  483. .arg("sort")
  484. .arg("-t")
  485. .arg("20")
  486. .arg("/dev/stdin")
  487. .arg("-o")
  488. .arg(bam_path)
  489. .stdin(Stdio::from(view.stdout.unwrap()))
  490. .spawn()
  491. .expect("Sambamba sort failed to start");
  492. sort.wait().unwrap();
  493. Ok(())
  494. }
  495. #[cfg(test)]
  496. mod tests {
  497. use super::*;
  498. #[test]
  499. fn it_works() {
  500. // let result = add(2, 2);
  501. // assert_eq!(result, 4);
  502. }
  503. }