lib.rs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. use anyhow::{Ok, Result};
  2. use log::info;
  3. use minimap2::{Aligner, Mapping};
  4. use noodles_fasta as fasta;
  5. use rust_htslib::bam::{self, Record};
  6. use std::{
  7. collections::{HashMap, VecDeque},
  8. fmt::{self, format},
  9. fs::{self, File},
  10. io::{BufWriter, Write},
  11. process::{Command, Stdio},
  12. };
  13. use uuid::Uuid;
  14. #[derive(Debug, Clone)]
  15. pub struct Genome {
  16. pub chromosomes: HashMap<String, Chromosome>,
  17. }
  18. #[derive(Debug, Clone)]
  19. pub struct Chromosome {
  20. contigs: Vec<Contig>,
  21. }
  22. #[derive(Debug, Clone)]
  23. pub struct Contig {
  24. pub id: String,
  25. // contig seq on ref
  26. pub mappings: Vec<Mapping>,
  27. // reads on ref
  28. pub supporting_records: Vec<Record>,
  29. pub sequence: String,
  30. pub contig_ref: ContigRef,
  31. }
  32. #[derive(Debug, Clone)]
  33. pub enum ContigRef {
  34. Unique(Mapping),
  35. Chimeric((Mapping, Mapping)),
  36. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  37. LeftAmbiguity((Vec<Mapping>, Mapping)),
  38. RightAmbiguity((Mapping, Vec<Mapping>)),
  39. Ambigous(Vec<Mapping>),
  40. }
  41. impl fmt::Display for ContigRef {
  42. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  43. let str = match self {
  44. ContigRef::Unique(m) => mapping_to_string(m),
  45. ContigRef::Chimeric((a, b)) => {
  46. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  47. }
  48. ContigRef::ChimericMultiple((a, v, b)) => format!(
  49. "{}<->{}<->{}",
  50. mapping_to_string(a),
  51. mappings_to_string(v),
  52. mapping_to_string(b)
  53. ),
  54. ContigRef::LeftAmbiguity((v, b)) => {
  55. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  56. }
  57. ContigRef::RightAmbiguity((a, v)) => {
  58. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  59. }
  60. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  61. };
  62. fmt.write_str(&str).unwrap();
  63. std::result::Result::Ok(())
  64. }
  65. }
  66. impl ContigRef {
  67. pub fn hgvs(&self) -> Option<String> {
  68. match self {
  69. ContigRef::Unique(_) => None,
  70. ContigRef::Chimeric((a, b)) => {
  71. if a.target_name == b.target_name {
  72. let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  73. let del_start = a.target_end;
  74. let del_end = b.target_start;
  75. Some(format!("{chr}:{del_start}_{del_end}"))
  76. } else {
  77. let a_chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  78. let a_bp = a.target_end;
  79. let b_chr = b.target_name.clone().unwrap_or("UNKNOWN".to_string());
  80. let b_bp = b.target_end;
  81. Some(format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]"))
  82. }
  83. }
  84. ContigRef::ChimericMultiple(_) => None,
  85. ContigRef::LeftAmbiguity(_) => None,
  86. ContigRef::RightAmbiguity(_) => None,
  87. ContigRef::Ambigous(_) => None,
  88. }
  89. }
  90. }
  91. pub fn mapping_to_string(mapping: &Mapping) -> String {
  92. let uk = "UNKNOWN".to_string();
  93. format!(
  94. "{}:{}-{}({}:{}-{})",
  95. mapping.target_name.clone().unwrap_or(uk.clone()),
  96. mapping.target_start,
  97. mapping.target_end,
  98. mapping.query_name.clone().unwrap_or(uk),
  99. mapping.query_start,
  100. mapping.query_end
  101. )
  102. }
  103. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  104. let v = mappings
  105. .iter()
  106. .map(mapping_to_string)
  107. .collect::<Vec<String>>();
  108. v.join("//")
  109. }
  110. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  111. let mut mappings = mappings;
  112. if mappings.len() == 1 {
  113. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  114. } else {
  115. let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  116. if grouped.len() == 1 {
  117. let r = grouped.into_iter().flat_map(|e| e).collect();
  118. return Ok(ContigRef::Ambigous(r));
  119. } else if grouped.len() >= 2 {
  120. let first = grouped.pop_back().unwrap();
  121. let last = grouped.pop_front().unwrap();
  122. if grouped.len() == 0 {
  123. if first.len() == 1 && last.len() == 1 {
  124. return Ok(ContigRef::Chimeric((
  125. first.get(0).unwrap().clone(),
  126. last.get(0).unwrap().clone(),
  127. )));
  128. } else if first.len() == 1 {
  129. return Ok(ContigRef::RightAmbiguity((
  130. first.get(0).unwrap().clone(),
  131. last.clone(),
  132. )));
  133. } else if last.len() == 1 {
  134. return Ok(ContigRef::LeftAmbiguity((
  135. first.clone(),
  136. last.get(0).unwrap().clone(),
  137. )));
  138. } else {
  139. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  140. return Ok(ContigRef::Ambigous(all));
  141. }
  142. } else {
  143. }
  144. if first.len() == 1 && last.len() == 1 {
  145. return Ok(ContigRef::ChimericMultiple((
  146. first.get(0).unwrap().clone(),
  147. grouped.into_iter().flat_map(|e| e).collect(),
  148. last.get(0).unwrap().clone(),
  149. )));
  150. } else if first.len() == 1 {
  151. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  152. .into_iter()
  153. .flat_map(|e| e)
  154. .collect();
  155. return Ok(ContigRef::RightAmbiguity((
  156. first.get(0).unwrap().clone(),
  157. right,
  158. )));
  159. } else if last.len() == 1 {
  160. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  161. .into_iter()
  162. .flat_map(|e| e)
  163. .collect();
  164. return Ok(ContigRef::LeftAmbiguity((
  165. left,
  166. last.get(0).unwrap().clone(),
  167. )));
  168. } else {
  169. let all: Vec<Mapping> =
  170. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  171. .into_iter()
  172. .flat_map(|e| e)
  173. .collect();
  174. return Ok(ContigRef::Ambigous(all));
  175. }
  176. } else {
  177. return Ok(ContigRef::Ambigous(
  178. grouped.into_iter().flat_map(|e| e).collect(),
  179. ));
  180. }
  181. }
  182. }
  183. impl Genome {
  184. pub fn new() -> Self {
  185. Genome {
  186. chromosomes: HashMap::new(),
  187. }
  188. }
  189. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  190. self.chromosomes.iter()
  191. }
  192. pub fn add_contig(
  193. &mut self,
  194. id: String,
  195. mappings: Vec<Mapping>,
  196. supporting_records: Vec<Record>,
  197. sequence: String,
  198. ) -> Result<()> {
  199. let new_contig = Contig {
  200. id,
  201. mappings: mappings.clone(),
  202. supporting_records,
  203. sequence,
  204. contig_ref: get_ref_pos(mappings)?,
  205. };
  206. // get the category of Mapping
  207. match new_contig.contig_ref.clone() {
  208. ContigRef::Unique(contig_mapping) => {
  209. match self
  210. .chromosomes
  211. .get_mut(&contig_mapping.target_name.unwrap())
  212. {
  213. Some(chromosome) => {
  214. chromosome.contigs.push(new_contig);
  215. }
  216. None => (),
  217. }
  218. }
  219. ContigRef::Chimeric((a, b)) => {
  220. let a_target_name = a.target_name.unwrap();
  221. let b_target_name = b.target_name.unwrap();
  222. if a_target_name == b_target_name {
  223. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  224. chromosome.contigs.push(new_contig);
  225. } else {
  226. self.chromosomes.insert(
  227. a_target_name,
  228. Chromosome {
  229. contigs: vec![new_contig],
  230. },
  231. );
  232. }
  233. } else {
  234. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  235. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  236. chromosome.contigs.push(new_contig);
  237. } else {
  238. self.chromosomes.insert(
  239. chimeric_name,
  240. Chromosome {
  241. contigs: vec![new_contig],
  242. },
  243. );
  244. }
  245. }
  246. }
  247. ContigRef::ChimericMultiple((left, _, right)) => {
  248. let left_target_name = left.target_name.unwrap();
  249. let right_target_name = right.target_name.unwrap();
  250. if left_target_name == right_target_name {
  251. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  252. chromosome.contigs.push(new_contig);
  253. } else {
  254. self.chromosomes.insert(
  255. left_target_name,
  256. Chromosome {
  257. contigs: vec![new_contig],
  258. },
  259. );
  260. }
  261. } else {
  262. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  263. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  264. chromosome.contigs.push(new_contig);
  265. } else {
  266. self.chromosomes.insert(
  267. chimeric_name,
  268. Chromosome {
  269. contigs: vec![new_contig],
  270. },
  271. );
  272. }
  273. }
  274. }
  275. _ => {
  276. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  277. chromosome.contigs.push(new_contig);
  278. } else {
  279. self.chromosomes.insert(
  280. "Ambigous".to_string(),
  281. Chromosome {
  282. contigs: vec![new_contig],
  283. },
  284. );
  285. }
  286. }
  287. };
  288. Ok(())
  289. }
  290. pub fn stats(&self) {
  291. for (k, v) in self.chromosomes.iter() {
  292. info!("{}:{}", k, v.contigs.len());
  293. }
  294. }
  295. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  296. if let Some(chr) = self.chromosomes.get(chromosome) {
  297. Some(chr.contigs.clone())
  298. } else {
  299. None
  300. }
  301. }
  302. }
  303. impl Chromosome {
  304. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  305. self.contigs.iter()
  306. }
  307. }
  308. impl Contig {
  309. pub fn sort(&mut self) {
  310. self.mappings
  311. .sort_by(|a, b| a.target_start.cmp(&b.target_start));
  312. }
  313. pub fn to_igv(&self, dir_path: &str) -> Result<()> {
  314. let contig_dir = format!("{dir_path}/{}", self.id);
  315. fs::create_dir_all(contig_dir.clone())?;
  316. let fasta_path = format!("{contig_dir}/contig.fa");
  317. write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
  318. write_fai(&fasta_path);
  319. let reads_path = format!("{contig_dir}/reads.fa");
  320. let n_reads = self
  321. .supporting_records
  322. .clone()
  323. .into_iter()
  324. .map(|r| {
  325. (
  326. String::from_utf8(r.qname().to_vec()).unwrap(),
  327. String::from_utf8(r.seq().as_bytes()).unwrap(),
  328. )
  329. })
  330. .collect();
  331. write_fasta(&reads_path, &n_reads);
  332. let bam_path = format!("{contig_dir}/{}.bam", self.id);
  333. create_bam(&fasta_path, &reads_path, &bam_path)?;
  334. let bed_path = format!("{contig_dir}/contig.bed");
  335. match &self.contig_ref {
  336. ContigRef::Chimeric((a, b)) => {
  337. let d = vec![
  338. (
  339. self.id.clone(),
  340. a.query_start,
  341. a.query_end,
  342. format!(
  343. "{}:{}-{}",
  344. a.target_name.clone().unwrap(),
  345. a.target_start,
  346. a.target_end
  347. ),
  348. ),
  349. (
  350. self.id.clone(),
  351. b.query_start,
  352. b.query_end,
  353. format!(
  354. "{}:{}-{}",
  355. b.target_name.clone().unwrap(),
  356. b.target_start,
  357. b.target_end
  358. ),
  359. ),
  360. ];
  361. write_bed(&bed_path, &d)?;
  362. }
  363. _ => (),
  364. }
  365. Ok(())
  366. }
  367. // bug cigar len != seq len
  368. pub fn write_bam(&self, path: &str) -> Result<()> {
  369. let aligner = Aligner::builder()
  370. .asm5()
  371. .with_threads(8)
  372. .with_cigar()
  373. .with_seq(self.sequence.as_bytes())
  374. .expect("Unable to build index");
  375. let mut mappings = Vec::new();
  376. for record in self.supporting_records.iter() {
  377. let seq = record.seq().as_bytes();
  378. let alignment = aligner
  379. .map(&seq, false, false, None, None)
  380. .expect("Unable to align");
  381. mappings.push(alignment);
  382. }
  383. let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
  384. mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
  385. let mut header = bam::Header::new();
  386. let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
  387. sq_record.push_tag(b"SN", self.id.clone());
  388. sq_record.push_tag(b"LN", self.sequence.len());
  389. header.push_record(&sq_record);
  390. let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  391. // copy reverse reads to new BAM file
  392. for mapping in mappings.iter() {
  393. let record = minimap2::htslib::mapping_to_record(
  394. Some(mapping),
  395. self.sequence.as_bytes(),
  396. header.clone(),
  397. None,
  398. Some(Uuid::new_v4().as_bytes()),
  399. );
  400. let _ = out.write(&record);
  401. }
  402. rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
  403. Ok(())
  404. }
  405. pub fn hgvs(&self) -> Option<String> {
  406. self.contig_ref.hgvs()
  407. }
  408. }
  409. fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
  410. // sort alignments by query_start
  411. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  412. let mut alignments: Vec<Vec<Mapping>> = vec![];
  413. // group by overlapps > 30
  414. for aln in mappings.iter() {
  415. let last = alignments.last_mut();
  416. if let Some(l) = last {
  417. if l.iter()
  418. .filter(|a| a.query_end - aln.query_start > 30)
  419. .count()
  420. > 0
  421. {
  422. l.push(aln.clone());
  423. } else {
  424. alignments.push(vec![aln.clone()]);
  425. }
  426. } else {
  427. alignments.push(vec![aln.clone()]);
  428. }
  429. }
  430. Ok(alignments)
  431. }
  432. pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> {
  433. let file = File::create(path).unwrap();
  434. let mut writer = BufWriter::new(file);
  435. for (chr, start, end, value) in d.iter() {
  436. let row = format!(
  437. "{}\n",
  438. vec![
  439. chr.to_string(),
  440. start.to_string(),
  441. end.to_string(),
  442. value.to_string()
  443. ]
  444. .join("\t")
  445. );
  446. writer.write_all(row.as_bytes())?;
  447. }
  448. Ok(())
  449. }
  450. // unique
  451. pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
  452. let file = File::create(fastq_path)?;
  453. let mut writer = BufWriter::new(file);
  454. for record in d {
  455. let name = String::from_utf8(record.qname().to_vec()).unwrap();
  456. writer.write_all(format!("@{name}\n").as_bytes())?;
  457. let seq = record.seq().as_bytes();
  458. writer.write_all(&seq)?;
  459. writer.write_all(b"\n+\n")?;
  460. let qual = record.qual();
  461. writer.write_all(qual)?;
  462. }
  463. Ok(())
  464. }
  465. pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
  466. let file = File::create(fasta_path).unwrap();
  467. let mut writer = fasta::writer::Builder::default().build_with_writer(file);
  468. let mut passed = Vec::new();
  469. for (name, sequence) in d {
  470. let name = name.to_string();
  471. if sequence.len() == 0 {
  472. continue;
  473. }
  474. if passed.contains(&name) {
  475. continue;
  476. }
  477. passed.push(name.clone());
  478. let record = fasta::Record::new(
  479. fasta::record::Definition::new(name.as_str(), None),
  480. fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
  481. );
  482. writer.write_record(&record).unwrap();
  483. }
  484. }
  485. pub fn write_fai(path: &str) {
  486. let mut faidx = Command::new("samtools")
  487. .arg("faidx")
  488. .arg(path)
  489. .spawn()
  490. .expect("Samtools faidx failed to start");
  491. faidx.wait().unwrap();
  492. }
  493. // pub fn write_aln_sh()
  494. pub fn create_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
  495. let rg_id = uuid::Uuid::new_v4();
  496. let mm2 = Command::new("minimap2")
  497. .arg("-t")
  498. .arg("128")
  499. .arg("-ax")
  500. .arg("map-ont")
  501. .arg("-R")
  502. .arg(format!(
  503. "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
  504. ))
  505. .arg(ref_path)
  506. .arg(reads_path)
  507. .stdout(Stdio::piped())
  508. .stderr(Stdio::piped())
  509. .spawn()
  510. .expect("Minimap2 failed to start");
  511. let view = Command::new("sambamba")
  512. .arg("view")
  513. .arg("-h")
  514. .arg("-S")
  515. .arg("-t")
  516. .arg("20")
  517. .arg("--format=bam")
  518. .arg("/dev/stdin")
  519. .stdin(Stdio::from(mm2.stdout.unwrap()))
  520. .stdout(Stdio::piped())
  521. .stderr(Stdio::piped())
  522. .spawn()
  523. .expect("Sambamba view failed to start");
  524. let mut sort = Command::new("sambamba")
  525. .arg("sort")
  526. .arg("-t")
  527. .arg("20")
  528. .arg("/dev/stdin")
  529. .arg("-o")
  530. .arg(bam_path)
  531. .stderr(Stdio::piped())
  532. .stdin(Stdio::from(view.stdout.unwrap()))
  533. .spawn()
  534. .expect("Sambamba sort failed to start");
  535. sort.wait().unwrap();
  536. Ok(())
  537. }
  538. #[cfg(test)]
  539. mod tests {
  540. use super::*;
  541. #[test]
  542. fn it_works() {
  543. // let result = add(2, 2);
  544. // assert_eq!(result, 4);
  545. }
  546. }