lib.rs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. use anyhow::{Result, Ok};
  2. use log::info;
  3. use minimap2::Mapping;
  4. use rust_htslib::bam::{self, Record};
  5. use std::{
  6. collections::{HashMap, VecDeque},
  7. fmt,
  8. };
  9. #[derive(Debug, Clone)]
  10. pub struct Genome {
  11. pub chromosomes: HashMap<String, Chromosome>,
  12. }
  13. #[derive(Debug, Clone)]
  14. pub struct Chromosome {
  15. contigs: Vec<Contig>,
  16. }
  17. #[derive(Debug, Clone)]
  18. pub struct Contig {
  19. pub id: String,
  20. pub mappings: Vec<Mapping>,
  21. pub supporting_records: Vec<Record>,
  22. pub sequence: String,
  23. pub contig_ref: ContigRef,
  24. }
  25. #[derive(Debug, Clone)]
  26. pub enum ContigRef {
  27. Unique(Mapping),
  28. Chimeric((Mapping, Mapping)),
  29. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  30. LeftAmbiguity((Vec<Mapping>, Mapping)),
  31. RightAmbiguity((Mapping, Vec<Mapping>)),
  32. Ambigous(Vec<Mapping>),
  33. }
  34. impl fmt::Display for ContigRef {
  35. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  36. let str = match self {
  37. ContigRef::Unique(m) => mapping_to_string(m),
  38. ContigRef::Chimeric((a, b)) => {
  39. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  40. }
  41. ContigRef::ChimericMultiple((a, v, b)) => format!(
  42. "{}<->{}<->{}",
  43. mapping_to_string(a),
  44. mappings_to_string(v),
  45. mapping_to_string(b)
  46. ),
  47. ContigRef::LeftAmbiguity((v, b)) => {
  48. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  49. }
  50. ContigRef::RightAmbiguity((a, v)) => {
  51. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  52. }
  53. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  54. };
  55. fmt.write_str(&str).unwrap();
  56. std::result::Result::Ok(())
  57. }
  58. }
  59. impl ContigRef {
  60. pub fn hgvs(&self) -> Option<String> {
  61. match self {
  62. ContigRef::Unique(_) => None,
  63. ContigRef::Chimeric((a, b)) => {
  64. if a.target_name == b.target_name {
  65. let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
  66. let end = a.target_end;
  67. let start = b.target_start;
  68. Some(format!("{chr}:{end}_{start}"))
  69. } else {
  70. None
  71. }
  72. }
  73. ContigRef::ChimericMultiple(_) => None,
  74. ContigRef::LeftAmbiguity(_) => None,
  75. ContigRef::RightAmbiguity(_) => None,
  76. ContigRef::Ambigous(_) => None,
  77. }
  78. }
  79. }
  80. pub fn mapping_to_string(mapping: &Mapping) -> String {
  81. let uk = "UNKNOWN".to_string();
  82. format!(
  83. "{}:{}-{}({}:{}-{})",
  84. mapping.target_name.clone().unwrap_or(uk.clone()),
  85. mapping.target_start,
  86. mapping.target_end,
  87. mapping.query_name.clone().unwrap_or(uk),
  88. mapping.query_start,
  89. mapping.query_end
  90. )
  91. }
  92. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  93. let v = mappings
  94. .iter()
  95. .map(mapping_to_string)
  96. .collect::<Vec<String>>();
  97. v.join("//")
  98. }
  99. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  100. let mut mappings = mappings;
  101. if mappings.len() == 1 {
  102. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  103. } else {
  104. let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  105. if grouped.len() == 1 {
  106. let r = grouped.into_iter().flat_map(|e| e).collect();
  107. return Ok(ContigRef::Ambigous(r));
  108. } else if grouped.len() >= 2 {
  109. let first = grouped.pop_back().unwrap();
  110. let last = grouped.pop_front().unwrap();
  111. if grouped.len() == 0 {
  112. if first.len() == 1 && last.len() == 1 {
  113. return Ok(ContigRef::Chimeric((
  114. first.get(0).unwrap().clone(),
  115. last.get(0).unwrap().clone(),
  116. )));
  117. } else if first.len() == 1 {
  118. return Ok(ContigRef::RightAmbiguity((
  119. first.get(0).unwrap().clone(),
  120. last.clone(),
  121. )));
  122. } else if last.len() == 1 {
  123. return Ok(ContigRef::LeftAmbiguity((
  124. first.clone(),
  125. last.get(0).unwrap().clone(),
  126. )));
  127. } else {
  128. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  129. return Ok(ContigRef::Ambigous(all));
  130. }
  131. } else {
  132. }
  133. if first.len() == 1 && last.len() == 1 {
  134. return Ok(ContigRef::ChimericMultiple((
  135. first.get(0).unwrap().clone(),
  136. grouped.into_iter().flat_map(|e| e).collect(),
  137. last.get(0).unwrap().clone(),
  138. )));
  139. } else if first.len() == 1 {
  140. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  141. .into_iter()
  142. .flat_map(|e| e)
  143. .collect();
  144. return Ok(ContigRef::RightAmbiguity((
  145. first.get(0).unwrap().clone(),
  146. right,
  147. )));
  148. } else if last.len() == 1 {
  149. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  150. .into_iter()
  151. .flat_map(|e| e)
  152. .collect();
  153. return Ok(ContigRef::LeftAmbiguity((
  154. left,
  155. last.get(0).unwrap().clone(),
  156. )));
  157. } else {
  158. let all: Vec<Mapping> =
  159. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  160. .into_iter()
  161. .flat_map(|e| e)
  162. .collect();
  163. return Ok(ContigRef::Ambigous(all));
  164. }
  165. } else {
  166. return Ok(ContigRef::Ambigous(
  167. grouped.into_iter().flat_map(|e| e).collect(),
  168. ));
  169. }
  170. }
  171. }
  172. impl Genome {
  173. pub fn new() -> Self {
  174. Genome {
  175. chromosomes: HashMap::new(),
  176. }
  177. }
  178. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  179. self.chromosomes.iter()
  180. }
  181. pub fn add_contig(
  182. &mut self,
  183. id: String,
  184. mappings: Vec<Mapping>,
  185. supporting_records: Vec<Record>,
  186. sequence: String,
  187. ) -> Result<()> {
  188. let new_contig = Contig {
  189. id,
  190. mappings: mappings.clone(),
  191. supporting_records,
  192. sequence,
  193. contig_ref: get_ref_pos(mappings)?,
  194. };
  195. // get the category of Mapping
  196. match new_contig.contig_ref.clone() {
  197. ContigRef::Unique(contig_mapping) => {
  198. match self
  199. .chromosomes
  200. .get_mut(&contig_mapping.target_name.unwrap())
  201. {
  202. Some(chromosome) => {
  203. chromosome.contigs.push(new_contig);
  204. }
  205. None => (),
  206. }
  207. }
  208. ContigRef::Chimeric((a, b)) => {
  209. let a_target_name = a.target_name.unwrap();
  210. let b_target_name = b.target_name.unwrap();
  211. if a_target_name == b_target_name {
  212. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  213. chromosome.contigs.push(new_contig);
  214. } else {
  215. self.chromosomes.insert(
  216. a_target_name,
  217. Chromosome {
  218. contigs: vec![new_contig],
  219. },
  220. );
  221. }
  222. } else {
  223. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  224. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  225. chromosome.contigs.push(new_contig);
  226. } else {
  227. self.chromosomes.insert(
  228. chimeric_name,
  229. Chromosome {
  230. contigs: vec![new_contig],
  231. },
  232. );
  233. }
  234. }
  235. }
  236. ContigRef::ChimericMultiple((left, _, right)) => {
  237. let left_target_name = left.target_name.unwrap();
  238. let right_target_name = right.target_name.unwrap();
  239. if left_target_name == right_target_name {
  240. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  241. chromosome.contigs.push(new_contig);
  242. } else {
  243. self.chromosomes.insert(
  244. left_target_name,
  245. Chromosome {
  246. contigs: vec![new_contig],
  247. },
  248. );
  249. }
  250. } else {
  251. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  252. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  253. chromosome.contigs.push(new_contig);
  254. } else {
  255. self.chromosomes.insert(
  256. chimeric_name,
  257. Chromosome {
  258. contigs: vec![new_contig],
  259. },
  260. );
  261. }
  262. }
  263. }
  264. _ => {
  265. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  266. chromosome.contigs.push(new_contig);
  267. } else {
  268. self.chromosomes.insert(
  269. "Ambigous".to_string(),
  270. Chromosome {
  271. contigs: vec![new_contig],
  272. },
  273. );
  274. }
  275. }
  276. };
  277. Ok(())
  278. }
  279. pub fn stats(&self) {
  280. for (k, v) in self.chromosomes.iter() {
  281. info!("{}:{}", k, v.contigs.len());
  282. }
  283. }
  284. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  285. if let Some(chr) = self.chromosomes.get(chromosome) {
  286. Some(chr.contigs.clone())
  287. } else {
  288. None
  289. }
  290. }
  291. }
  292. impl Chromosome {
  293. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  294. self.contigs.iter()
  295. }
  296. }
  297. impl Contig {
  298. pub fn sort(&mut self) {
  299. self.mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
  300. }
  301. pub fn write_bam(&self, path: &str) -> Result<()> {
  302. let mut header = bam::Header::new();
  303. let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"@SQ");
  304. sq_record.push_tag(b"SN", self.id.clone());
  305. sq_record.push_tag(b"LN", self.sequence.len());
  306. header.push_record(&sq_record);
  307. let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  308. // copy reverse reads to new BAM file
  309. for mapping in self.mappings.iter() {
  310. let record = minimap2::htslib::mapping_to_record(Some(mapping), self.sequence.as_bytes(), header.clone(), None, Some(self.id.as_bytes()));
  311. out.write(&record)?;
  312. }
  313. rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 5)?;
  314. Ok(())
  315. }
  316. }
  317. fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
  318. // sort alignments by query_start
  319. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  320. let mut alignments: Vec<Vec<Mapping>> = vec![];
  321. // group by overlapps > 30
  322. for aln in mappings.iter() {
  323. let last = alignments.last_mut();
  324. if let Some(l) = last {
  325. if l.iter()
  326. .filter(|a| a.query_end - aln.query_start > 30)
  327. .count()
  328. > 0
  329. {
  330. l.push(aln.clone());
  331. } else {
  332. alignments.push(vec![aln.clone()]);
  333. }
  334. } else {
  335. alignments.push(vec![aln.clone()]);
  336. }
  337. }
  338. Ok(alignments)
  339. }
  340. #[cfg(test)]
  341. mod tests {
  342. use super::*;
  343. #[test]
  344. fn it_works() {
  345. // let result = add(2, 2);
  346. // assert_eq!(result, 4);
  347. }
  348. }