lib.rs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. use anyhow::{Ok, Result};
  2. use log::info;
  3. use minimap2::{Aligner, Mapping};
  4. use noodles_fasta as fasta;
  5. use rust_htslib::bam::{self, Record};
  6. use std::{
  7. collections::{HashMap, VecDeque},
  8. fmt::{self, format},
  9. fs::{self, File},
  10. io::{BufWriter, Write},
  11. process::{Command, Stdio},
  12. };
  13. use uuid::Uuid;
  14. #[derive(Debug, Clone)]
  15. pub struct Genome {
  16. pub chromosomes: HashMap<String, Chromosome>,
  17. }
  18. #[derive(Debug, Clone)]
  19. pub struct Chromosome {
  20. contigs: Vec<Contig>,
  21. }
  22. #[derive(Debug, Clone)]
  23. pub struct Contig {
  24. pub id: String,
  25. // contig seq on ref
  26. pub mappings: Vec<Mapping>,
  27. // reads on ref
  28. pub supporting_records: Vec<Record>,
  29. pub sequence: String,
  30. pub contig_ref: ContigRef,
  31. }
  32. #[derive(Debug, Clone)]
  33. pub enum ContigRef {
  34. Unique(Mapping),
  35. Chimeric((Mapping, Mapping)),
  36. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  37. LeftAmbiguity((Vec<Mapping>, Mapping)),
  38. RightAmbiguity((Mapping, Vec<Mapping>)),
  39. Ambigous(Vec<Mapping>),
  40. }
  41. impl fmt::Display for ContigRef {
  42. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  43. let str = match self {
  44. ContigRef::Unique(m) => mapping_to_string(m),
  45. ContigRef::Chimeric((a, b)) => {
  46. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  47. }
  48. ContigRef::ChimericMultiple((a, v, b)) => format!(
  49. "{}<->{}<->{}",
  50. mapping_to_string(a),
  51. mappings_to_string(v),
  52. mapping_to_string(b)
  53. ),
  54. ContigRef::LeftAmbiguity((v, b)) => {
  55. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  56. }
  57. ContigRef::RightAmbiguity((a, v)) => {
  58. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  59. }
  60. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  61. };
  62. fmt.write_str(&str).unwrap();
  63. std::result::Result::Ok(())
  64. }
  65. }
  66. impl ContigRef {
  67. pub fn hgvs(&self) -> Option<String> {
  68. match self {
  69. ContigRef::Unique(_) => None,
  70. ContigRef::Chimeric((a, b)) => {
  71. if a.target_name == b.target_name {
  72. let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  73. let del_start = a.target_end;
  74. let del_end = b.target_start;
  75. let hgvs = format!("{chr}:{del_start}_{del_end}");
  76. Some(hgvs)
  77. } else {
  78. let a_chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  79. let a_bp = a.target_end;
  80. let b_chr = b.target_name.clone().unwrap_or("UNKNOWN".to_string());
  81. let b_bp = b.target_end;
  82. let hgvs = format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]");
  83. Some(hgvs)
  84. }
  85. }
  86. ContigRef::ChimericMultiple(_) => None,
  87. ContigRef::LeftAmbiguity(_) => None,
  88. ContigRef::RightAmbiguity(_) => None,
  89. ContigRef::Ambigous(_) => None,
  90. }
  91. }
  92. }
  93. pub fn mapping_to_string(mapping: &Mapping) -> String {
  94. let uk = "UNKNOWN".to_string();
  95. format!(
  96. "{}:{}-{}({}:{}-{})",
  97. mapping.target_name.clone().unwrap_or(uk.clone()),
  98. mapping.target_start,
  99. mapping.target_end,
  100. mapping.query_name.clone().unwrap_or(uk),
  101. mapping.query_start,
  102. mapping.query_end
  103. )
  104. }
  105. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  106. let v = mappings
  107. .iter()
  108. .map(mapping_to_string)
  109. .collect::<Vec<String>>();
  110. v.join("//")
  111. }
  112. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  113. let mut mappings = mappings;
  114. if mappings.len() == 1 {
  115. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  116. } else {
  117. let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  118. if grouped.len() == 1 {
  119. let r = grouped.into_iter().flat_map(|e| e).collect();
  120. return Ok(ContigRef::Ambigous(r));
  121. } else if grouped.len() >= 2 {
  122. let first = grouped.pop_back().unwrap();
  123. let last = grouped.pop_front().unwrap();
  124. if grouped.len() == 0 {
  125. if first.len() == 1 && last.len() == 1 {
  126. return Ok(ContigRef::Chimeric((
  127. first.get(0).unwrap().clone(),
  128. last.get(0).unwrap().clone(),
  129. )));
  130. } else if first.len() == 1 {
  131. return Ok(ContigRef::RightAmbiguity((
  132. first.get(0).unwrap().clone(),
  133. last.clone(),
  134. )));
  135. } else if last.len() == 1 {
  136. return Ok(ContigRef::LeftAmbiguity((
  137. first.clone(),
  138. last.get(0).unwrap().clone(),
  139. )));
  140. } else {
  141. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  142. return Ok(ContigRef::Ambigous(all));
  143. }
  144. } else {
  145. }
  146. if first.len() == 1 && last.len() == 1 {
  147. return Ok(ContigRef::ChimericMultiple((
  148. first.get(0).unwrap().clone(),
  149. grouped.into_iter().flat_map(|e| e).collect(),
  150. last.get(0).unwrap().clone(),
  151. )));
  152. } else if first.len() == 1 {
  153. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  154. .into_iter()
  155. .flat_map(|e| e)
  156. .collect();
  157. return Ok(ContigRef::RightAmbiguity((
  158. first.get(0).unwrap().clone(),
  159. right,
  160. )));
  161. } else if last.len() == 1 {
  162. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  163. .into_iter()
  164. .flat_map(|e| e)
  165. .collect();
  166. return Ok(ContigRef::LeftAmbiguity((
  167. left,
  168. last.get(0).unwrap().clone(),
  169. )));
  170. } else {
  171. let all: Vec<Mapping> =
  172. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  173. .into_iter()
  174. .flat_map(|e| e)
  175. .collect();
  176. return Ok(ContigRef::Ambigous(all));
  177. }
  178. } else {
  179. return Ok(ContigRef::Ambigous(
  180. grouped.into_iter().flat_map(|e| e).collect(),
  181. ));
  182. }
  183. }
  184. }
  185. impl Genome {
  186. pub fn new() -> Self {
  187. Genome {
  188. chromosomes: HashMap::new(),
  189. }
  190. }
  191. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  192. self.chromosomes.iter()
  193. }
  194. pub fn add_contig(
  195. &mut self,
  196. id: String,
  197. mappings: Vec<Mapping>,
  198. supporting_records: Vec<Record>,
  199. sequence: String,
  200. ) -> Result<()> {
  201. let new_contig = Contig {
  202. id,
  203. mappings: mappings.clone(),
  204. supporting_records,
  205. sequence,
  206. contig_ref: get_ref_pos(mappings)?,
  207. };
  208. // get the category of Mapping
  209. match new_contig.contig_ref.clone() {
  210. ContigRef::Unique(contig_mapping) => {
  211. match self
  212. .chromosomes
  213. .get_mut(&contig_mapping.target_name.unwrap())
  214. {
  215. Some(chromosome) => {
  216. chromosome.contigs.push(new_contig);
  217. }
  218. None => (),
  219. }
  220. }
  221. ContigRef::Chimeric((a, b)) => {
  222. let a_target_name = a.target_name.unwrap();
  223. let b_target_name = b.target_name.unwrap();
  224. if a_target_name == b_target_name {
  225. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  226. chromosome.contigs.push(new_contig);
  227. } else {
  228. self.chromosomes.insert(
  229. a_target_name,
  230. Chromosome {
  231. contigs: vec![new_contig],
  232. },
  233. );
  234. }
  235. } else {
  236. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  237. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  238. chromosome.contigs.push(new_contig);
  239. } else {
  240. self.chromosomes.insert(
  241. chimeric_name,
  242. Chromosome {
  243. contigs: vec![new_contig],
  244. },
  245. );
  246. }
  247. }
  248. }
  249. ContigRef::ChimericMultiple((left, _, right)) => {
  250. let left_target_name = left.target_name.unwrap();
  251. let right_target_name = right.target_name.unwrap();
  252. if left_target_name == right_target_name {
  253. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  254. chromosome.contigs.push(new_contig);
  255. } else {
  256. self.chromosomes.insert(
  257. left_target_name,
  258. Chromosome {
  259. contigs: vec![new_contig],
  260. },
  261. );
  262. }
  263. } else {
  264. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  265. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  266. chromosome.contigs.push(new_contig);
  267. } else {
  268. self.chromosomes.insert(
  269. chimeric_name,
  270. Chromosome {
  271. contigs: vec![new_contig],
  272. },
  273. );
  274. }
  275. }
  276. }
  277. _ => {
  278. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  279. chromosome.contigs.push(new_contig);
  280. } else {
  281. self.chromosomes.insert(
  282. "Ambigous".to_string(),
  283. Chromosome {
  284. contigs: vec![new_contig],
  285. },
  286. );
  287. }
  288. }
  289. };
  290. Ok(())
  291. }
  292. pub fn stats(&self) {
  293. for (k, v) in self.chromosomes.iter() {
  294. info!("{}:{}", k, v.contigs.len());
  295. }
  296. }
  297. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  298. if let Some(chr) = self.chromosomes.get(chromosome) {
  299. Some(chr.contigs.clone())
  300. } else {
  301. None
  302. }
  303. }
  304. }
  305. impl Chromosome {
  306. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  307. self.contigs.iter()
  308. }
  309. }
  310. impl Contig {
  311. pub fn sort(&mut self) {
  312. self.mappings
  313. .sort_by(|a, b| a.target_start.cmp(&b.target_start));
  314. }
  315. pub fn to_igv(&self, dir_path: &str) -> Result<()> {
  316. let contig_name = if let Some(hgvs) = self.hgvs() {
  317. hgvs
  318. } else {
  319. self.id.clone()
  320. };
  321. let contig_dir = format!("{dir_path}/{contig_name}");
  322. fs::create_dir_all(contig_dir.clone())?;
  323. let fasta_path = format!("{contig_dir}/contig.fa");
  324. write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
  325. write_fai(&fasta_path);
  326. let reads_path = format!("{contig_dir}/reads.fa");
  327. let n_reads = self
  328. .supporting_records
  329. .clone()
  330. .into_iter()
  331. .map(|r| {
  332. (
  333. String::from_utf8(r.qname().to_vec()).unwrap(),
  334. String::from_utf8(r.seq().as_bytes()).unwrap(),
  335. )
  336. })
  337. .collect();
  338. write_fasta(&reads_path, &n_reads);
  339. let bam_path = format!("{contig_dir}/{}.bam", self.id);
  340. write_bam(&fasta_path, &reads_path, &bam_path)?;
  341. let bed_path = format!("{contig_dir}/contig.bed");
  342. match &self.contig_ref {
  343. ContigRef::Chimeric((a, b)) => {
  344. let d = vec![
  345. (
  346. self.id.clone(),
  347. a.query_start,
  348. a.query_end,
  349. format!(
  350. "{}:{}-{}",
  351. a.target_name.clone().unwrap(),
  352. a.target_start,
  353. a.target_end
  354. ),
  355. ),
  356. (
  357. self.id.clone(),
  358. b.query_start,
  359. b.query_end,
  360. format!(
  361. "{}:{}-{}",
  362. b.target_name.clone().unwrap(),
  363. b.target_start,
  364. b.target_end
  365. ),
  366. ),
  367. ];
  368. write_bed(&bed_path, &d)?;
  369. }
  370. _ => (),
  371. }
  372. Ok(())
  373. }
  374. // bug cigar len != seq len
  375. pub fn write_bam(&self, path: &str) -> Result<()> {
  376. let aligner = Aligner::builder()
  377. .asm5()
  378. .with_threads(8)
  379. .with_cigar()
  380. .with_seq(self.sequence.as_bytes())
  381. .expect("Unable to build index");
  382. let mut mappings = Vec::new();
  383. for record in self.supporting_records.iter() {
  384. let seq = record.seq().as_bytes();
  385. let alignment = aligner
  386. .map(&seq, false, false, None, None)
  387. .expect("Unable to align");
  388. mappings.push(alignment);
  389. }
  390. let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
  391. mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
  392. let mut header = bam::Header::new();
  393. let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
  394. sq_record.push_tag(b"SN", self.id.clone());
  395. sq_record.push_tag(b"LN", self.sequence.len());
  396. header.push_record(&sq_record);
  397. let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  398. // copy reverse reads to new BAM file
  399. for mapping in mappings.iter() {
  400. let record = minimap2::htslib::mapping_to_record(
  401. Some(mapping),
  402. self.sequence.as_bytes(),
  403. header.clone(),
  404. None,
  405. Some(Uuid::new_v4().as_bytes()),
  406. );
  407. let _ = out.write(&record);
  408. }
  409. rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
  410. Ok(())
  411. }
  412. pub fn hgvs(&self) -> Option<String> {
  413. self.contig_ref.hgvs()
  414. }
  415. }
  416. fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
  417. // sort alignments by query_start
  418. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  419. let mut alignments: Vec<Vec<Mapping>> = vec![];
  420. // group by overlapps > 30
  421. for aln in mappings.iter() {
  422. let last = alignments.last_mut();
  423. if let Some(l) = last {
  424. if l.iter()
  425. .filter(|a| a.query_end - aln.query_start > 30)
  426. .count()
  427. > 0
  428. {
  429. l.push(aln.clone());
  430. } else {
  431. alignments.push(vec![aln.clone()]);
  432. }
  433. } else {
  434. alignments.push(vec![aln.clone()]);
  435. }
  436. }
  437. Ok(alignments)
  438. }
  439. pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> {
  440. let file = File::create(path).unwrap();
  441. let mut writer = BufWriter::new(file);
  442. for (chr, start, end, value) in d.iter() {
  443. let row = format!(
  444. "{}\n",
  445. vec![
  446. chr.to_string(),
  447. start.to_string(),
  448. end.to_string(),
  449. value.to_string()
  450. ]
  451. .join("\t")
  452. );
  453. writer.write_all(row.as_bytes())?;
  454. }
  455. Ok(())
  456. }
  457. // unique
  458. pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
  459. let file = File::create(fastq_path)?;
  460. let mut writer = BufWriter::new(file);
  461. for record in d {
  462. let name = String::from_utf8(record.qname().to_vec()).unwrap();
  463. writer.write_all(format!("@{name}\n").as_bytes())?;
  464. let seq = record.seq().as_bytes();
  465. writer.write_all(&seq)?;
  466. writer.write_all(b"\n+\n")?;
  467. let qual = record.qual();
  468. writer.write_all(qual)?;
  469. }
  470. Ok(())
  471. }
  472. pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
  473. let file = File::create(fasta_path).unwrap();
  474. let mut writer = fasta::writer::Builder::default().build_with_writer(file);
  475. let mut passed = Vec::new();
  476. for (name, sequence) in d {
  477. let name = name.to_string();
  478. if sequence.len() == 0 {
  479. continue;
  480. }
  481. if passed.contains(&name) {
  482. continue;
  483. }
  484. passed.push(name.clone());
  485. let record = fasta::Record::new(
  486. fasta::record::Definition::new(name.as_str(), None),
  487. fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
  488. );
  489. writer.write_record(&record).unwrap();
  490. }
  491. }
  492. pub fn write_fai(path: &str) {
  493. let mut faidx = Command::new("samtools")
  494. .arg("faidx")
  495. .arg(path)
  496. .spawn()
  497. .expect("Samtools faidx failed to start");
  498. faidx.wait().unwrap();
  499. }
  500. pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
  501. let rg_id = uuid::Uuid::new_v4();
  502. let mm2 = Command::new("minimap2")
  503. .arg("-t")
  504. .arg("128")
  505. .arg("-ax")
  506. .arg("map-ont")
  507. .arg("-R")
  508. .arg(format!(
  509. "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
  510. ))
  511. .arg(ref_path)
  512. .arg(reads_path)
  513. .stdout(Stdio::piped())
  514. .stderr(Stdio::piped())
  515. .spawn()
  516. .expect("Minimap2 failed to start");
  517. let view = Command::new("sambamba")
  518. .arg("view")
  519. .arg("-h")
  520. .arg("-S")
  521. .arg("-t")
  522. .arg("20")
  523. .arg("--format=bam")
  524. .arg("/dev/stdin")
  525. .stdin(Stdio::from(mm2.stdout.unwrap()))
  526. .stdout(Stdio::piped())
  527. .stderr(Stdio::piped())
  528. .spawn()
  529. .expect("Sambamba view failed to start");
  530. let mut sort = Command::new("sambamba")
  531. .arg("sort")
  532. .arg("-t")
  533. .arg("20")
  534. .arg("/dev/stdin")
  535. .arg("-o")
  536. .arg(bam_path)
  537. .stderr(Stdio::piped())
  538. .stdin(Stdio::from(view.stdout.unwrap()))
  539. .spawn()
  540. .expect("Sambamba sort failed to start");
  541. sort.wait().unwrap();
  542. Ok(())
  543. }
  544. #[cfg(test)]
  545. mod tests {
  546. use super::*;
  547. #[test]
  548. fn it_works() {
  549. // let result = add(2, 2);
  550. // assert_eq!(result, 4);
  551. }
  552. }