lib.rs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. use anyhow::{Ok, Result};
  2. use log::info;
  3. use minimap2::{Aligner, Mapping};
  4. use rust_htslib::bam::{self, Record};
  5. use std::{
  6. collections::{HashMap, VecDeque},
  7. fmt,
  8. };
  9. #[derive(Debug, Clone)]
  10. pub struct Genome {
  11. pub chromosomes: HashMap<String, Chromosome>,
  12. }
  13. #[derive(Debug, Clone)]
  14. pub struct Chromosome {
  15. contigs: Vec<Contig>,
  16. }
  17. #[derive(Debug, Clone)]
  18. pub struct Contig {
  19. pub id: String,
  20. // contig seq on ref
  21. pub mappings: Vec<Mapping>,
  22. // reads on ref
  23. pub supporting_records: Vec<Record>,
  24. pub sequence: String,
  25. pub contig_ref: ContigRef,
  26. }
  27. #[derive(Debug, Clone)]
  28. pub enum ContigRef {
  29. Unique(Mapping),
  30. Chimeric((Mapping, Mapping)),
  31. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  32. LeftAmbiguity((Vec<Mapping>, Mapping)),
  33. RightAmbiguity((Mapping, Vec<Mapping>)),
  34. Ambigous(Vec<Mapping>),
  35. }
  36. impl fmt::Display for ContigRef {
  37. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  38. let str = match self {
  39. ContigRef::Unique(m) => mapping_to_string(m),
  40. ContigRef::Chimeric((a, b)) => {
  41. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  42. }
  43. ContigRef::ChimericMultiple((a, v, b)) => format!(
  44. "{}<->{}<->{}",
  45. mapping_to_string(a),
  46. mappings_to_string(v),
  47. mapping_to_string(b)
  48. ),
  49. ContigRef::LeftAmbiguity((v, b)) => {
  50. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  51. }
  52. ContigRef::RightAmbiguity((a, v)) => {
  53. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  54. }
  55. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  56. };
  57. fmt.write_str(&str).unwrap();
  58. std::result::Result::Ok(())
  59. }
  60. }
  61. impl ContigRef {
  62. pub fn hgvs(&self) -> Option<String> {
  63. match self {
  64. ContigRef::Unique(_) => None,
  65. ContigRef::Chimeric((a, b)) => {
  66. if a.target_name == b.target_name {
  67. let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  68. let end = a.target_end;
  69. let start = b.target_start;
  70. Some(format!("{chr}:{end}_{start}"))
  71. } else {
  72. None
  73. }
  74. }
  75. ContigRef::ChimericMultiple(_) => None,
  76. ContigRef::LeftAmbiguity(_) => None,
  77. ContigRef::RightAmbiguity(_) => None,
  78. ContigRef::Ambigous(_) => None,
  79. }
  80. }
  81. }
  82. pub fn mapping_to_string(mapping: &Mapping) -> String {
  83. let uk = "UNKNOWN".to_string();
  84. format!(
  85. "{}:{}-{}({}:{}-{})",
  86. mapping.target_name.clone().unwrap_or(uk.clone()),
  87. mapping.target_start,
  88. mapping.target_end,
  89. mapping.query_name.clone().unwrap_or(uk),
  90. mapping.query_start,
  91. mapping.query_end
  92. )
  93. }
  94. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  95. let v = mappings
  96. .iter()
  97. .map(mapping_to_string)
  98. .collect::<Vec<String>>();
  99. v.join("//")
  100. }
  101. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  102. let mut mappings = mappings;
  103. if mappings.len() == 1 {
  104. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  105. } else {
  106. let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  107. if grouped.len() == 1 {
  108. let r = grouped.into_iter().flat_map(|e| e).collect();
  109. return Ok(ContigRef::Ambigous(r));
  110. } else if grouped.len() >= 2 {
  111. let first = grouped.pop_back().unwrap();
  112. let last = grouped.pop_front().unwrap();
  113. if grouped.len() == 0 {
  114. if first.len() == 1 && last.len() == 1 {
  115. return Ok(ContigRef::Chimeric((
  116. first.get(0).unwrap().clone(),
  117. last.get(0).unwrap().clone(),
  118. )));
  119. } else if first.len() == 1 {
  120. return Ok(ContigRef::RightAmbiguity((
  121. first.get(0).unwrap().clone(),
  122. last.clone(),
  123. )));
  124. } else if last.len() == 1 {
  125. return Ok(ContigRef::LeftAmbiguity((
  126. first.clone(),
  127. last.get(0).unwrap().clone(),
  128. )));
  129. } else {
  130. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  131. return Ok(ContigRef::Ambigous(all));
  132. }
  133. } else {
  134. }
  135. if first.len() == 1 && last.len() == 1 {
  136. return Ok(ContigRef::ChimericMultiple((
  137. first.get(0).unwrap().clone(),
  138. grouped.into_iter().flat_map(|e| e).collect(),
  139. last.get(0).unwrap().clone(),
  140. )));
  141. } else if first.len() == 1 {
  142. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  143. .into_iter()
  144. .flat_map(|e| e)
  145. .collect();
  146. return Ok(ContigRef::RightAmbiguity((
  147. first.get(0).unwrap().clone(),
  148. right,
  149. )));
  150. } else if last.len() == 1 {
  151. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  152. .into_iter()
  153. .flat_map(|e| e)
  154. .collect();
  155. return Ok(ContigRef::LeftAmbiguity((
  156. left,
  157. last.get(0).unwrap().clone(),
  158. )));
  159. } else {
  160. let all: Vec<Mapping> =
  161. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  162. .into_iter()
  163. .flat_map(|e| e)
  164. .collect();
  165. return Ok(ContigRef::Ambigous(all));
  166. }
  167. } else {
  168. return Ok(ContigRef::Ambigous(
  169. grouped.into_iter().flat_map(|e| e).collect(),
  170. ));
  171. }
  172. }
  173. }
  174. impl Genome {
  175. pub fn new() -> Self {
  176. Genome {
  177. chromosomes: HashMap::new(),
  178. }
  179. }
  180. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  181. self.chromosomes.iter()
  182. }
  183. pub fn add_contig(
  184. &mut self,
  185. id: String,
  186. mappings: Vec<Mapping>,
  187. supporting_records: Vec<Record>,
  188. sequence: String,
  189. ) -> Result<()> {
  190. let new_contig = Contig {
  191. id,
  192. mappings: mappings.clone(),
  193. supporting_records,
  194. sequence,
  195. contig_ref: get_ref_pos(mappings)?,
  196. };
  197. // get the category of Mapping
  198. match new_contig.contig_ref.clone() {
  199. ContigRef::Unique(contig_mapping) => {
  200. match self
  201. .chromosomes
  202. .get_mut(&contig_mapping.target_name.unwrap())
  203. {
  204. Some(chromosome) => {
  205. chromosome.contigs.push(new_contig);
  206. }
  207. None => (),
  208. }
  209. }
  210. ContigRef::Chimeric((a, b)) => {
  211. let a_target_name = a.target_name.unwrap();
  212. let b_target_name = b.target_name.unwrap();
  213. if a_target_name == b_target_name {
  214. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  215. chromosome.contigs.push(new_contig);
  216. } else {
  217. self.chromosomes.insert(
  218. a_target_name,
  219. Chromosome {
  220. contigs: vec![new_contig],
  221. },
  222. );
  223. }
  224. } else {
  225. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  226. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  227. chromosome.contigs.push(new_contig);
  228. } else {
  229. self.chromosomes.insert(
  230. chimeric_name,
  231. Chromosome {
  232. contigs: vec![new_contig],
  233. },
  234. );
  235. }
  236. }
  237. }
  238. ContigRef::ChimericMultiple((left, _, right)) => {
  239. let left_target_name = left.target_name.unwrap();
  240. let right_target_name = right.target_name.unwrap();
  241. if left_target_name == right_target_name {
  242. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  243. chromosome.contigs.push(new_contig);
  244. } else {
  245. self.chromosomes.insert(
  246. left_target_name,
  247. Chromosome {
  248. contigs: vec![new_contig],
  249. },
  250. );
  251. }
  252. } else {
  253. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  254. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  255. chromosome.contigs.push(new_contig);
  256. } else {
  257. self.chromosomes.insert(
  258. chimeric_name,
  259. Chromosome {
  260. contigs: vec![new_contig],
  261. },
  262. );
  263. }
  264. }
  265. }
  266. _ => {
  267. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  268. chromosome.contigs.push(new_contig);
  269. } else {
  270. self.chromosomes.insert(
  271. "Ambigous".to_string(),
  272. Chromosome {
  273. contigs: vec![new_contig],
  274. },
  275. );
  276. }
  277. }
  278. };
  279. Ok(())
  280. }
  281. pub fn stats(&self) {
  282. for (k, v) in self.chromosomes.iter() {
  283. info!("{}:{}", k, v.contigs.len());
  284. }
  285. }
  286. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  287. if let Some(chr) = self.chromosomes.get(chromosome) {
  288. Some(chr.contigs.clone())
  289. } else {
  290. None
  291. }
  292. }
  293. }
  294. impl Chromosome {
  295. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  296. self.contigs.iter()
  297. }
  298. }
  299. impl Contig {
  300. pub fn sort(&mut self) {
  301. self.mappings
  302. .sort_by(|a, b| a.target_start.cmp(&b.target_start));
  303. }
  304. pub fn write_bam(&self, path: &str) -> Result<()> {
  305. let aligner = Aligner::builder()
  306. .asm5()
  307. .with_threads(8)
  308. .with_cigar()
  309. .with_seq(self.sequence.as_bytes())
  310. .expect("Unable to build index");
  311. let mut mappings = Vec::new();
  312. for record in self.supporting_records.iter() {
  313. let seq = record.seq().as_bytes();
  314. let alignment = aligner
  315. .map(&seq, false, false, None, None)
  316. .expect("Unable to align");
  317. mappings.push(alignment);
  318. }
  319. let mappings: Vec<_> = mappings.into_iter().flatten().collect();
  320. let mut header = bam::Header::new();
  321. let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
  322. sq_record.push_tag(b"SN", self.id.clone());
  323. sq_record.push_tag(b"LN", self.sequence.len());
  324. header.push_record(&sq_record);
  325. let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  326. // copy reverse reads to new BAM file
  327. for mapping in mappings.iter() {
  328. let record = minimap2::htslib::mapping_to_record(
  329. Some(mapping),
  330. self.sequence.as_bytes(),
  331. header.clone(),
  332. None,
  333. Some(self.id.as_bytes()),
  334. );
  335. out.write(&record)?;
  336. }
  337. rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 5)?;
  338. Ok(())
  339. }
  340. pub fn hgvs(&self) -> Option<String> {
  341. self.contig_ref.hgvs()
  342. }
  343. }
  344. fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
  345. // sort alignments by query_start
  346. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  347. let mut alignments: Vec<Vec<Mapping>> = vec![];
  348. // group by overlapps > 30
  349. for aln in mappings.iter() {
  350. let last = alignments.last_mut();
  351. if let Some(l) = last {
  352. if l.iter()
  353. .filter(|a| a.query_end - aln.query_start > 30)
  354. .count()
  355. > 0
  356. {
  357. l.push(aln.clone());
  358. } else {
  359. alignments.push(vec![aln.clone()]);
  360. }
  361. } else {
  362. alignments.push(vec![aln.clone()]);
  363. }
  364. }
  365. Ok(alignments)
  366. }
  367. #[cfg(test)]
  368. mod tests {
  369. use super::*;
  370. #[test]
  371. fn it_works() {
  372. // let result = add(2, 2);
  373. // assert_eq!(result, 4);
  374. }
  375. }