< 40 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201
  1. pub mod breakpoint;
  2. pub mod genomic_graph;
  3. // mod phase;
  4. use anyhow::{anyhow, Ok, Result};
  5. use fasta::record::Sequence;
  6. use log::info;
  7. use minimap2::{Aligner, Mapping};
  8. use noodles_fasta as fasta;
  9. use num_format::{CustomFormat, Grouping, ToFormattedString, WriteFormatted};
  10. use petgraph::{dot::Dot, prelude::*};
  11. use rust_htslib::bam::{self, Record};
  12. use std::{
  13. collections::HashMap,
  14. fmt,
  15. fs::{self, File},
  16. io::{BufReader, BufWriter, Write},
  17. path::PathBuf,
  18. process::{Command, Stdio},
  19. };
  20. use uuid::Uuid;
  21. #[derive(Debug, Clone)]
  22. pub struct Genome {
  23. pub chromosomes: HashMap<String, Chromosome>,
  24. }
  25. #[derive(Debug, Clone)]
  26. pub struct Chromosome {
  27. contigs: Vec<Contig>,
  28. }
  29. #[derive(Debug, Clone)]
  30. pub struct Contig {
  31. pub id: String,
  32. // contig seq on ref
  33. pub mappings: Vec<Mapping>,
  34. // reads on ref
  35. pub supporting_records: Option<Vec<Record>>,
  36. pub sequence: String,
  37. pub contig_ref: ContigRef,
  38. }
  39. #[derive(Debug, Clone)]
  40. pub enum ContigRef {
  41. Unique(Mapping),
  42. Chimeric((Mapping, Mapping)),
  43. ChimericTriple((Mapping, Mapping, Mapping)),
  44. ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
  45. LeftAmbiguity((Vec<Mapping>, Mapping)),
  46. RightAmbiguity((Mapping, Vec<Mapping>)),
  47. Ambigous(Vec<Mapping>),
  48. }
  49. impl fmt::Display for ContigRef {
  50. fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
  51. let str = match self {
  52. ContigRef::Unique(m) => mapping_to_string(m),
  53. ContigRef::Chimeric((a, b)) => {
  54. format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
  55. }
  56. ContigRef::ChimericMultiple((a, v, b)) => format!(
  57. "{}<->{}<->{}",
  58. mapping_to_string(a),
  59. mappings_to_string(v),
  60. mapping_to_string(b)
  61. ),
  62. ContigRef::LeftAmbiguity((v, b)) => {
  63. format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
  64. }
  65. ContigRef::RightAmbiguity((a, v)) => {
  66. format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
  67. }
  68. ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
  69. ContigRef::ChimericTriple((a, b, c)) => format!(
  70. "{}<->{}<->{}",
  71. mapping_to_string(a),
  72. mapping_to_string(b),
  73. mapping_to_string(c)
  74. ),
  75. };
  76. fmt.write_str(&str).unwrap();
  77. std::result::Result::Ok(())
  78. }
  79. }
  80. impl ContigRef {
  81. pub fn breakpoints(&self) -> Option<Vec<Mapping>> {
  82. match self {
  83. ContigRef::Unique(_) => None,
  84. ContigRef::Chimeric((a, b)) => Some(vec![a.clone(), b.clone()]),
  85. ContigRef::ChimericTriple((a, b, c)) => Some(vec![a.clone(), b.clone(), c.clone()]),
  86. ContigRef::ChimericMultiple(_) => None,
  87. ContigRef::LeftAmbiguity(_) => None,
  88. ContigRef::RightAmbiguity(_) => None,
  89. ContigRef::Ambigous(_) => None,
  90. }
  91. }
  92. pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
  93. let left = "►";
  94. let right = "◄";
  95. let bp_right = "▐";
  96. let bp_left = "▌";
  97. let format = CustomFormat::builder()
  98. .grouping(Grouping::Standard)
  99. .minus_sign("-")
  100. .separator("_")
  101. .build()
  102. .unwrap();
  103. let get_sign = |m: &Mapping| -> &str {
  104. match m.strand {
  105. minimap2::Strand::Forward => left,
  106. minimap2::Strand::Reverse => right,
  107. }
  108. };
  109. let uk = "UNKNOWN".to_string();
  110. let mut res = Vec::new();
  111. if let Some(breakpoints) = self.breakpoints() {
  112. for v in breakpoints.windows(2) {
  113. let mut bp_string = format!("{}:", v[0].target_name.clone().unwrap_or(uk.clone()));
  114. let _ = bp_string
  115. .write_formatted(&v[0].target_end, &format)
  116. .unwrap();
  117. bp_string = format!(
  118. "{bp_string}{}{bp_left}{bp_right}{}{}:",
  119. get_sign(&v[0]),
  120. get_sign(&v[1]),
  121. v[1].target_name.clone().unwrap_or(uk.clone())
  122. );
  123. let _ = bp_string
  124. .write_formatted(&v[1].target_start, &format)
  125. .unwrap();
  126. res.push(bp_string);
  127. }
  128. }
  129. if !res.is_empty() {
  130. Some(res)
  131. } else {
  132. None
  133. }
  134. }
  135. pub fn desc(&self) -> Option<String> {
  136. let uk = "UNKNOWN".to_string();
  137. let to_desc = |v: &mut Vec<Mapping>| -> String {
  138. v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  139. let v: Vec<String> = v
  140. .into_iter()
  141. .map(|e| {
  142. let strand = match e.strand {
  143. minimap2::Strand::Forward => "",
  144. minimap2::Strand::Reverse => "_rev",
  145. };
  146. format!(
  147. "{}:{}_{}{}",
  148. e.target_name.clone().unwrap_or(uk.clone()),
  149. e.target_start,
  150. e.target_end,
  151. strand
  152. )
  153. })
  154. .collect();
  155. format!("[{}]", v.join(";"))
  156. };
  157. match self {
  158. ContigRef::Unique(a) => Some(format!(
  159. "{}:{}_{}",
  160. a.target_name.clone().unwrap_or(uk.clone()),
  161. a.target_start,
  162. a.target_end
  163. )),
  164. ContigRef::Chimeric((a, b)) => Some(to_desc(&mut vec![a.to_owned(), b.to_owned()])),
  165. ContigRef::ChimericTriple((a, b, c)) => {
  166. Some(to_desc(&mut vec![a.to_owned(), b.to_owned(), c.to_owned()]))
  167. }
  168. ContigRef::ChimericMultiple(_) => None,
  169. ContigRef::LeftAmbiguity(_) => None,
  170. ContigRef::RightAmbiguity(_) => None,
  171. ContigRef::Ambigous(a) => Some(to_desc(&mut a.to_owned())),
  172. }
  173. }
  174. pub fn hgvs(&self) -> Option<String> {
  175. let uk = "UNKNOWN".to_string();
  176. match self {
  177. ContigRef::Unique(_) => None,
  178. ContigRef::Chimeric((a, b)) => {
  179. if a.target_name == b.target_name {
  180. let chr = a.target_name.clone().unwrap_or(uk.clone());
  181. let del_start = a.target_end;
  182. let del_end = b.target_start;
  183. let hgvs = format!("{chr}:{del_start}_{del_end}");
  184. Some(hgvs)
  185. } else {
  186. let a_chr = a.target_name.clone().unwrap_or(uk.clone());
  187. let a_bp = a.target_end;
  188. let b_chr = b.target_name.clone().unwrap_or(uk.clone());
  189. let b_bp = b.target_end;
  190. let hgvs = format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]");
  191. Some(hgvs)
  192. }
  193. }
  194. ContigRef::ChimericMultiple(_) => None,
  195. ContigRef::LeftAmbiguity(_) => None,
  196. ContigRef::RightAmbiguity(_) => None,
  197. ContigRef::Ambigous(_) => None,
  198. ContigRef::ChimericTriple((a, b, c)) => {
  199. let mut v = [a, b, c];
  200. v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  201. let (a, b, c) = (
  202. *v.get(0).clone().unwrap(),
  203. *v.get(1).clone().unwrap(),
  204. *v.get(2).clone().unwrap(),
  205. );
  206. let a_target_name = a.target_name.clone().unwrap_or(uk.clone());
  207. let b_target_name = b.target_name.clone().unwrap_or(uk.clone());
  208. let c_target_name = c.target_name.clone().unwrap_or(uk.clone());
  209. // if a_target_name != b_target_name {}
  210. // Insertions
  211. // prioritize first len
  212. let (bp_a_1, bp_a_2) = if a.query_end <= b.query_end {
  213. // TODO add inserted nt
  214. (
  215. (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
  216. (b.target_name.clone().unwrap_or(uk.clone()), b.target_start),
  217. )
  218. } else {
  219. let diff = a.query_end - b.query_start;
  220. (
  221. (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
  222. (
  223. b.target_name.clone().unwrap_or(uk.clone()),
  224. b.target_start + diff,
  225. ),
  226. )
  227. };
  228. let (bp_b_1, bp_b_2) = if b.query_end <= c.query_end {
  229. // TODO add inserted nt
  230. (
  231. (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
  232. (c.target_name.clone().unwrap_or(uk.clone()), c.target_start),
  233. )
  234. } else {
  235. let diff = b.query_end - c.query_start;
  236. (
  237. (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
  238. (
  239. c.target_name.clone().unwrap_or(uk.clone()),
  240. c.target_start + diff,
  241. ),
  242. )
  243. };
  244. if bp_a_1.0 == bp_b_2.0 {
  245. let hgvs = format!(
  246. "{}:{}_{}ins[{}:{}_{}]",
  247. bp_a_1.0, bp_a_1.1, bp_b_2.1, bp_a_2.0, bp_a_2.1, bp_b_1.1
  248. );
  249. Some(hgvs)
  250. } else {
  251. None
  252. }
  253. }
  254. }
  255. }
  256. }
  257. pub fn mapping_to_string(mapping: &Mapping) -> String {
  258. let uk = "UNKNOWN".to_string();
  259. format!(
  260. "{}:{}-{}({}:{}-{})",
  261. mapping.target_name.clone().unwrap_or(uk.clone()),
  262. mapping.target_start,
  263. mapping.target_end,
  264. mapping.query_name.clone().unwrap_or(uk),
  265. mapping.query_start,
  266. mapping.query_end
  267. )
  268. }
  269. fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
  270. let v = mappings
  271. .iter()
  272. .map(mapping_to_string)
  273. .collect::<Vec<String>>();
  274. v.join("//")
  275. }
  276. pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
  277. let mut mappings = mappings;
  278. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  279. if mappings.len() == 1 {
  280. return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
  281. } else {
  282. let mut grouped: Vec<Vec<Mapping>> = group_mappings(&mut mappings)?;
  283. // let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
  284. if grouped.len() == 1 {
  285. let r = grouped.into_iter().flat_map(|e| e).collect();
  286. return Ok(ContigRef::Ambigous(r));
  287. } else if grouped.len() >= 2 {
  288. // let first = grouped.pop_back().unwrap();
  289. // let last = grouped.pop_front().unwrap();
  290. let first = grouped.first().unwrap().to_vec();
  291. let last = grouped.last().unwrap().to_vec();
  292. grouped.remove(0);
  293. grouped.remove(grouped.len() - 1);
  294. assert!(first[0].query_start < last[0].query_start);
  295. if grouped.len() == 0 {
  296. if first.len() == 1 && last.len() == 1 {
  297. return Ok(ContigRef::Chimeric((
  298. first.get(0).unwrap().clone(),
  299. last.get(0).unwrap().clone(),
  300. )));
  301. } else if first.len() == 1 {
  302. return Ok(ContigRef::RightAmbiguity((
  303. first.get(0).unwrap().clone(),
  304. last.clone(),
  305. )));
  306. } else if last.len() == 1 {
  307. return Ok(ContigRef::LeftAmbiguity((
  308. first.clone(),
  309. last.get(0).unwrap().clone(),
  310. )));
  311. } else {
  312. let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
  313. return Ok(ContigRef::Ambigous(all));
  314. }
  315. }
  316. if first.len() == 1 && last.len() == 1 {
  317. if grouped.len() == 1 {
  318. return Ok(ContigRef::ChimericTriple((
  319. first.get(0).unwrap().clone(),
  320. grouped.get(0).unwrap().get(0).unwrap().clone(),
  321. last.get(0).unwrap().clone(),
  322. )));
  323. } else {
  324. return Ok(ContigRef::ChimericMultiple((
  325. first.get(0).unwrap().clone(),
  326. grouped.into_iter().flat_map(|e| e).collect(),
  327. last.get(0).unwrap().clone(),
  328. )));
  329. }
  330. } else if first.len() == 1 {
  331. let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
  332. .into_iter()
  333. .flat_map(|e| e)
  334. .collect();
  335. return Ok(ContigRef::RightAmbiguity((
  336. first.get(0).unwrap().clone(),
  337. right,
  338. )));
  339. } else if last.len() == 1 {
  340. let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
  341. .into_iter()
  342. .flat_map(|e| e)
  343. .collect();
  344. return Ok(ContigRef::LeftAmbiguity((
  345. left,
  346. last.get(0).unwrap().clone(),
  347. )));
  348. } else {
  349. let all: Vec<Mapping> =
  350. vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
  351. .into_iter()
  352. .flat_map(|e| e)
  353. .collect();
  354. return Ok(ContigRef::Ambigous(all));
  355. }
  356. } else {
  357. return Ok(ContigRef::Ambigous(
  358. grouped.into_iter().flat_map(|e| e).collect(),
  359. ));
  360. }
  361. }
  362. }
  363. impl Genome {
  364. pub fn new() -> Self {
  365. Genome {
  366. chromosomes: HashMap::new(),
  367. }
  368. }
  369. pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
  370. self.chromosomes.iter()
  371. }
  372. pub fn contigs(&self) -> impl Iterator<Item = &Contig> {
  373. self.chromosomes.iter().flat_map(|(_, e)| e.iter())
  374. }
  375. pub fn add_contig(
  376. &mut self,
  377. id: String,
  378. mappings: Vec<Mapping>,
  379. supporting_records: Option<Vec<Record>>,
  380. sequence: String,
  381. ) -> Result<()> {
  382. let mut mappings = mappings;
  383. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  384. let new_contig = Contig {
  385. id,
  386. mappings: mappings.clone(),
  387. supporting_records,
  388. sequence,
  389. contig_ref: get_ref_pos(mappings)?,
  390. };
  391. // get the category of Mapping
  392. match new_contig.contig_ref.clone() {
  393. ContigRef::Unique(contig_mapping) => {
  394. match self
  395. .chromosomes
  396. .get_mut(&contig_mapping.target_name.unwrap())
  397. {
  398. Some(chromosome) => {
  399. chromosome.contigs.push(new_contig);
  400. }
  401. None => (),
  402. }
  403. }
  404. ContigRef::Chimeric((a, b)) => {
  405. let a_target_name = a.target_name.unwrap();
  406. let b_target_name = b.target_name.unwrap();
  407. if a_target_name == b_target_name {
  408. if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
  409. chromosome.contigs.push(new_contig);
  410. } else {
  411. self.chromosomes.insert(
  412. a_target_name,
  413. Chromosome {
  414. contigs: vec![new_contig],
  415. },
  416. );
  417. }
  418. } else {
  419. let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
  420. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  421. chromosome.contigs.push(new_contig);
  422. } else {
  423. self.chromosomes.insert(
  424. chimeric_name,
  425. Chromosome {
  426. contigs: vec![new_contig],
  427. },
  428. );
  429. }
  430. }
  431. }
  432. ContigRef::ChimericMultiple((left, _, right)) => {
  433. let left_target_name = left.target_name.unwrap();
  434. let right_target_name = right.target_name.unwrap();
  435. if left_target_name == right_target_name {
  436. if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
  437. chromosome.contigs.push(new_contig);
  438. } else {
  439. self.chromosomes.insert(
  440. left_target_name,
  441. Chromosome {
  442. contigs: vec![new_contig],
  443. },
  444. );
  445. }
  446. } else {
  447. let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
  448. if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
  449. chromosome.contigs.push(new_contig);
  450. } else {
  451. self.chromosomes.insert(
  452. chimeric_name,
  453. Chromosome {
  454. contigs: vec![new_contig],
  455. },
  456. );
  457. }
  458. }
  459. }
  460. _ => {
  461. if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
  462. chromosome.contigs.push(new_contig);
  463. } else {
  464. self.chromosomes.insert(
  465. "Ambigous".to_string(),
  466. Chromosome {
  467. contigs: vec![new_contig],
  468. },
  469. );
  470. }
  471. }
  472. };
  473. Ok(())
  474. }
  475. pub fn add_contig_from_seq(
  476. &mut self,
  477. name: String,
  478. sequence: &[u8],
  479. aligner: impl Fn(String) -> Result<Vec<Mapping>>,
  480. ) -> Result<()> {
  481. let mappings = aligner(String::from_utf8(sequence.to_vec())?)?;
  482. // println!("{mappings:?}");
  483. self.add_contig(name, mappings, None, String::from_utf8(sequence.to_vec())?)?;
  484. Ok(())
  485. }
  486. pub fn write_contigs_sequences(&self, dir: &str) {
  487. for contig in self.contigs() {
  488. contig.write_fasta(&format!("{dir}/{}.fasta", contig.id))
  489. }
  490. // self.iter().for_each(|(_, chr)| {
  491. // chr.iter().for_each(|c| c.write_fasta(&format!("{dir}/{}.fasta", c.id)))
  492. // });
  493. }
  494. pub fn from_contigs_sequences(dir: &str) -> Result<Self> {
  495. let aligner_url = "http://localhost:4444/align";
  496. let aligner = aligner_client::dist_align(aligner_url.to_string());
  497. let mut genome = Self::new();
  498. // let paths = get_ext_paths(dir, "fasta")?;
  499. let paths = get_contigs_fa_paths(dir)?;
  500. for path in paths {
  501. let fa = read_fasta(path.to_str().unwrap())?;
  502. for (name, sequence) in fa {
  503. genome.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
  504. }
  505. }
  506. Ok(genome)
  507. }
  508. //
  509. // pub fn from_dir_bed(dir: &str) {
  510. //
  511. // }
  512. // pub fn write_records(&self, file: &str) {
  513. // let mut records = Vec::new();
  514. // for (name, chromosome) in self.chromosomes.iter() {
  515. // for contig in chromosome.iter() {
  516. // if let Some(rec) = &contig.supporting_records {
  517. // records.extend(rec.iter());
  518. // };
  519. // }
  520. // }
  521. // // header
  522. //
  523. // // writer
  524. // rust_htslib::bam::Writer::from_path(path, header, format)
  525. // }
  526. pub fn stats(&self) {
  527. for (k, v) in self.chromosomes.iter() {
  528. info!("{}:{}", k, v.contigs.len());
  529. }
  530. }
  531. pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
  532. self.chromosomes
  533. .get(chromosome)
  534. .map(|chr| chr.contigs.clone())
  535. }
  536. }
  537. impl Chromosome {
  538. pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
  539. self.contigs.iter()
  540. }
  541. }
  542. impl Contig {
  543. // pub fn sort(&mut self) {
  544. // // sorting by target order
  545. // self.mappings
  546. // .sort_by(|a, b| a.target_start.cmp(&b.target_start));
  547. // }
  548. pub fn to_igv(&self, dir_path: &str) -> Result<()> {
  549. let supporting_records = self.supporting_records.clone().ok_or(anyhow!("no reads"))?;
  550. let contig_name = if let Some(hgvs) = self.hgvs() {
  551. hgvs
  552. } else {
  553. self.id.clone()
  554. };
  555. let contig_dir = format!("{dir_path}/{contig_name}");
  556. fs::create_dir_all(contig_dir.clone())?;
  557. let fasta_path = format!("{contig_dir}/contig.fa");
  558. write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
  559. write_fai(&fasta_path);
  560. let reads_path = format!("{contig_dir}/reads.fa");
  561. let n_reads = supporting_records
  562. .clone()
  563. .into_iter()
  564. .map(|r| {
  565. (
  566. String::from_utf8(r.qname().to_vec()).unwrap(),
  567. String::from_utf8(r.seq().as_bytes()).unwrap(),
  568. )
  569. })
  570. .collect();
  571. write_fasta(&reads_path, &n_reads);
  572. let bam_path = format!("{contig_dir}/{}.bam", self.id);
  573. write_bam(&fasta_path, &reads_path, &bam_path)?;
  574. let bed_path = format!("{contig_dir}/contig.bed");
  575. match &self.contig_ref {
  576. ContigRef::Chimeric((a, b)) => {
  577. let d = vec![
  578. (
  579. self.id.clone(),
  580. a.query_start,
  581. a.query_end,
  582. format!(
  583. "{}:{}-{}",
  584. a.target_name.clone().unwrap(),
  585. a.target_start,
  586. a.target_end
  587. ),
  588. ),
  589. (
  590. self.id.clone(),
  591. b.query_start,
  592. b.query_end,
  593. format!(
  594. "{}:{}-{}",
  595. b.target_name.clone().unwrap(),
  596. b.target_start,
  597. b.target_end
  598. ),
  599. ),
  600. ];
  601. write_bed(&bed_path, &d)?;
  602. }
  603. ContigRef::ChimericTriple((a, b, c)) => {
  604. let d: Vec<(String, i32, i32, String)> = [a, b, c]
  605. .iter()
  606. .map(|r| {
  607. (
  608. self.id.clone(),
  609. r.query_start,
  610. r.query_end,
  611. format!(
  612. "{}:{}-{}",
  613. r.target_name.clone().unwrap(),
  614. r.target_start,
  615. r.target_end
  616. ),
  617. )
  618. })
  619. .collect();
  620. write_bed(&bed_path, &d)?;
  621. }
  622. _ => (),
  623. }
  624. Ok(())
  625. }
  626. // bug cigar len != seq len
  627. // pub fn write_bam(&self, path: &str) -> Result<()> {
  628. // let aligner = Aligner::builder()
  629. // .asm5()
  630. // .with_threads(8)
  631. // .with_cigar()
  632. // .with_seq(self.sequence.as_bytes())
  633. // .expect("Unable to build index");
  634. //
  635. // let mut mappings = Vec::new();
  636. // let supporting_records = self
  637. // .supporting_records
  638. // .clone()
  639. // .ok_or(anyhow!("no supporting records"))?;
  640. // for record in supporting_records.iter() {
  641. // let seq = record.seq().as_bytes();
  642. // let alignment = aligner
  643. // .map(&seq, false, false, None, None)
  644. // .expect("Unable to align");
  645. // mappings.push(alignment);
  646. // }
  647. // let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
  648. // mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
  649. //
  650. // let mut header = bam::Header::new();
  651. // let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
  652. // sq_record.push_tag(b"SN", self.id.clone());
  653. // sq_record.push_tag(b"LN", self.sequence.len());
  654. // header.push_record(&sq_record);
  655. //
  656. // let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
  657. //
  658. // // copy reverse reads to new BAM file
  659. // for mapping in mappings.iter() {
  660. // let record = minimap2::htslib::mapping_to_record(
  661. // Some(mapping),
  662. // self.sequence.as_bytes(),
  663. // header.clone(),
  664. // None,
  665. // Some(Uuid::new_v4().as_bytes()),
  666. // );
  667. // let _ = out.write(&record);
  668. // }
  669. // rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
  670. // Ok(())
  671. // }
  672. pub fn hgvs(&self) -> Option<String> {
  673. self.contig_ref.hgvs()
  674. }
  675. pub fn desc(&self) -> Option<String> {
  676. self.contig_ref.desc()
  677. }
  678. pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
  679. self.contig_ref.breakpoints_repr()
  680. }
  681. pub fn write_fasta(&self, fasta_path: &str) {
  682. write_fasta(fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
  683. }
  684. }
  685. fn group_mappings(mappings: &mut [Mapping]) -> Result<Vec<Vec<Mapping>>> {
  686. // sort alignments by query_start
  687. mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
  688. let mut alignments: Vec<Vec<Mapping>> = vec![];
  689. // group by overlapps > 30
  690. for aln in mappings.iter() {
  691. let last = alignments.last_mut();
  692. if let Some(l) = last {
  693. if l.iter()
  694. .filter(|a| a.query_end - aln.query_start > 30)
  695. .count()
  696. > 0
  697. {
  698. l.push(aln.clone());
  699. } else {
  700. alignments.push(vec![aln.clone()]);
  701. }
  702. } else {
  703. alignments.push(vec![aln.clone()]);
  704. }
  705. }
  706. Ok(alignments)
  707. }
  708. pub fn write_bed(path: &str, d: &[(String, i32, i32, String)]) -> Result<()> {
  709. let file = File::create(path).unwrap();
  710. let mut writer = BufWriter::new(file);
  711. for (chr, start, end, value) in d.iter() {
  712. let row = format!(
  713. "{}\n",
  714. [
  715. chr.to_string(),
  716. start.to_string(),
  717. end.to_string(),
  718. value.to_string()
  719. ]
  720. .join("\t")
  721. );
  722. writer.write_all(row.as_bytes())?;
  723. }
  724. Ok(())
  725. }
  726. // unique
  727. pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
  728. let file = File::create(fastq_path)?;
  729. let mut writer = BufWriter::new(file);
  730. for record in d {
  731. let name = String::from_utf8(record.qname().to_vec()).unwrap();
  732. writer.write_all(format!("@{name}\n").as_bytes())?;
  733. let seq = record.seq().as_bytes();
  734. writer.write_all(&seq)?;
  735. writer.write_all(b"\n+\n")?;
  736. let qual = record.qual();
  737. writer.write_all(qual)?;
  738. }
  739. Ok(())
  740. }
  741. pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
  742. let file = File::create(fasta_path).unwrap();
  743. let mut writer = fasta::writer::Builder::default().build_with_writer(file);
  744. let mut passed = Vec::new();
  745. for (name, sequence) in d {
  746. let name = name.to_string();
  747. if sequence.is_empty() {
  748. continue;
  749. }
  750. if passed.contains(&name) {
  751. continue;
  752. }
  753. passed.push(name.clone());
  754. let record = fasta::Record::new(
  755. fasta::record::Definition::new(name.as_str(), None),
  756. fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
  757. );
  758. writer.write_record(&record).unwrap();
  759. }
  760. }
  761. pub fn write_fai(path: &str) {
  762. let mut faidx = Command::new("samtools")
  763. .arg("faidx")
  764. .arg(path)
  765. .spawn()
  766. .expect("Samtools faidx failed to start");
  767. faidx.wait().unwrap();
  768. }
  769. pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
  770. let rg_id = uuid::Uuid::new_v4();
  771. let mm2 = Command::new("minimap2")
  772. .arg("-t")
  773. .arg("128")
  774. .arg("-ax")
  775. .arg("map-ont")
  776. .arg("-R")
  777. .arg(format!(
  778. "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
  779. ))
  780. .arg(ref_path)
  781. .arg(reads_path)
  782. .stdout(Stdio::piped())
  783. .stderr(Stdio::piped())
  784. .spawn()
  785. .expect("Minimap2 failed to start");
  786. let view = Command::new("sambamba")
  787. .arg("view")
  788. .arg("-h")
  789. .arg("-S")
  790. .arg("-t")
  791. .arg("20")
  792. .arg("--format=bam")
  793. .arg("/dev/stdin")
  794. .stdin(Stdio::from(mm2.stdout.unwrap()))
  795. .stdout(Stdio::piped())
  796. .stderr(Stdio::piped())
  797. .spawn()
  798. .expect("Sambamba view failed to start");
  799. let mut sort = Command::new("sambamba")
  800. .arg("sort")
  801. .arg("-t")
  802. .arg("20")
  803. .arg("/dev/stdin")
  804. .arg("-o")
  805. .arg(bam_path)
  806. .stderr(Stdio::piped())
  807. .stdin(Stdio::from(view.stdout.unwrap()))
  808. .spawn()
  809. .expect("Sambamba sort failed to start");
  810. sort.wait().unwrap();
  811. Ok(())
  812. }
  813. pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
  814. let mut reader = File::open(path)
  815. .map(BufReader::new)
  816. .map(fasta::Reader::new)?;
  817. let mut res = Vec::new();
  818. for result in reader.records() {
  819. let record = result?;
  820. let u = String::from_utf8(record.name().to_vec())?;
  821. let s = record.sequence().to_owned();
  822. res.push((u, s));
  823. }
  824. Ok(res)
  825. }
  826. // fn get_ext_paths(dir: &str, ext: &str) -> Result<Vec<PathBuf>> {
  827. // let paths = std::fs::read_dir(dir)?
  828. // // Filter out all those directory entries which couldn't be read
  829. // .filter_map(|res| res.ok())
  830. // // Map the directory entries to paths
  831. // .map(|dir_entry| dir_entry.path())
  832. // // Filter out all paths with extensions other than `csv`
  833. // .filter_map(|path| {
  834. // if path.extension().map_or(false, |xt| xt == ext) {
  835. // Some(path)
  836. // } else {
  837. // None
  838. // }
  839. // })
  840. // .collect::<Vec<_>>();
  841. // Ok(paths)
  842. // }
  843. fn get_contigs_fa_paths(dir: &str) -> Result<Vec<PathBuf>> {
  844. let pattern = format!("{}/**/*_flye.fa", dir);
  845. let fa_paths: Vec<PathBuf> = glob::glob(&pattern)
  846. .expect("Failed to read glob pattern")
  847. .filter_map(Result::ok)
  848. .collect();
  849. Ok(fa_paths)
  850. }
  851. pub fn dot_graph(
  852. graph: &StableGraph<String, String>,
  853. way: &[NodeIndex],
  854. value: &str,
  855. color: &str,
  856. erase: bool,
  857. ) -> String {
  858. let mut g = graph.clone();
  859. // erase labels
  860. if erase {
  861. g.edge_weights_mut().for_each(|e| *e = "".to_string());
  862. }
  863. let mut labels = Vec::new();
  864. for window in way.windows(2) {
  865. let edge_id = g.find_edge(window[0], window[1]).unwrap();
  866. let edge = g.edge_weight_mut(edge_id).unwrap();
  867. let v = if !edge.is_empty() {
  868. let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
  869. v.push(value);
  870. v.join(", ")
  871. } else {
  872. value.to_string()
  873. };
  874. labels.push(v.clone());
  875. *edge = v;
  876. }
  877. if erase {
  878. g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
  879. g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
  880. }
  881. let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
  882. labels.sort_by_key(|b| std::cmp::Reverse(b.len()));
  883. // labels.sort_by(|a, b| b.len().cmp(&a.len()));
  884. for label in labels {
  885. let label_str = format!("label = \"{label}\"");
  886. dot = dot.replace(
  887. &format!("{label_str} ]\n"),
  888. &format!("{label_str}, color = \"{color}\" ]\n"),
  889. );
  890. }
  891. dot
  892. }
  893. /// for non overlapping ways
  894. pub fn dot_graph_biall(
  895. graph: &StableGraph<String, String>,
  896. ways: &[Vec<NodeIndex>],
  897. values: Vec<&str>,
  898. colors: Vec<&str>,
  899. erase: bool,
  900. ) -> String {
  901. let mut g = graph.clone();
  902. // erase labels
  903. if erase {
  904. g.edge_weights_mut().for_each(|e| *e = "".to_string());
  905. }
  906. let mut labels = Vec::new();
  907. for ((way, value), color) in ways.iter().zip(values.into_iter()).zip(colors.into_iter()) {
  908. for window in way.windows(2) {
  909. let edge_id = g.find_edge(window[0], window[1]).unwrap();
  910. let edge = g.edge_weight_mut(edge_id).unwrap();
  911. let v = if !edge.is_empty() {
  912. let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
  913. v.push(value);
  914. v.join(", ")
  915. } else {
  916. value.to_string()
  917. };
  918. labels.push((v.clone(), color));
  919. *edge = v;
  920. }
  921. }
  922. // g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
  923. g.retain_edges(|g, i| !g.edge_weight(i).unwrap().is_empty());
  924. g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
  925. let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
  926. labels.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
  927. for (label, color) in labels {
  928. let label_str = format!("label = \"{label}\"");
  929. dot = dot.replace(
  930. &format!("{label_str} ]\n"),
  931. &format!("{label_str}, color = \"{color}\" ]\n"),
  932. );
  933. }
  934. dot
  935. }
  936. #[cfg(test)]
  937. mod tests {
  938. use env_logger::Env;
  939. use super::*;
  940. use crate::{
  941. genomic_graph::GenomicGraph,
  942. // phase::{variants_phasing, write_phases_bed},
  943. };
  944. fn init() {
  945. let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
  946. .is_test(true)
  947. .try_init();
  948. }
  949. #[test]
  950. fn it_works() -> Result<()> {
  951. let _ = env_logger::builder().is_test(true).try_init();
  952. let contig_fa = "./data_test/contig_2.fa";
  953. let aligner_url = "http://localhost:4444/align";
  954. let mut genome = Genome::new();
  955. let aligner = aligner_client::dist_align(aligner_url.to_string());
  956. let sequences = read_fasta(contig_fa)?;
  957. for (name, seq) in sequences {
  958. genome.add_contig_from_seq(name.clone(), &seq.as_ref().to_vec(), &aligner)?;
  959. let mut seqc: Vec<u8> = seq.complement().map(|e| e.unwrap()).collect();
  960. seqc.reverse();
  961. genome.add_contig_from_seq(format!("{name}_rev"), &seqc, &aligner)?;
  962. println!("Sending");
  963. }
  964. genome.iter().for_each(|(_, c)| {
  965. c.iter().for_each(|cont| {
  966. println!("{}", cont.contig_ref.desc().unwrap());
  967. });
  968. });
  969. Ok(())
  970. }
  971. #[test]
  972. fn test_graph() -> Result<()> {
  973. init();
  974. let case = "SALICETTO";
  975. let chrom = vec!["chr10"];
  976. info!("This record will be captured by `cargo test`");
  977. let dir = format!("/data/longreads_basic_pipe/{case}/diag/asm_contis");
  978. // Load from fasta in dir.
  979. let genome = Genome::from_contigs_sequences(&dir)?;
  980. genome.stats();
  981. let mut genomic_graph = GenomicGraph::from_genome(&genome);
  982. let sens = vec![true, false];
  983. let pos = vec![0, i32::MAX];
  984. let mut all_ways = Vec::new();
  985. if chrom.len() > 1 {
  986. (0..4).into_iter().for_each(|i| {
  987. let start_pos = if i < 2 { 0 } else { i32::MAX };
  988. let end_pos = pos[i % 2];
  989. (0..4).into_iter().for_each(|i| {
  990. let start_sens = if i < 2 { true } else { false };
  991. let end_sens = sens[i % 2];
  992. (0..4).into_iter().for_each(|i| {
  993. let start_chr = if i < 2 { chrom[0] } else { chrom[1] };
  994. let end_chr = chrom[i % 2];
  995. let start = (start_sens, start_chr, start_pos);
  996. let end = (end_sens, end_chr, end_pos);
  997. let (oriented_graph, _integrated_graph, ways) =
  998. genomic_graph.ways(start, end);
  999. let dot = oriented_graph.dot_graph();
  1000. println!("dot\n{dot}");
  1001. for (_i, way) in ways.iter().enumerate() {
  1002. let s = way
  1003. .iter()
  1004. .map(|(_, _, _, s)| s.to_string())
  1005. .collect::<Vec<String>>()
  1006. .join("");
  1007. all_ways.push(s);
  1008. }
  1009. });
  1010. });
  1011. });
  1012. } else {
  1013. let start_chr = chrom[0];
  1014. let end_chr = chrom[0];
  1015. let start = (true, start_chr, 0);
  1016. let end = (true, end_chr, i32::MAX);
  1017. let (oriented_graph, _integrated_graph, ways) = genomic_graph.ways(start, end);
  1018. let dot = oriented_graph.dot_graph();
  1019. println!("dot\n{dot}");
  1020. for (_i, way) in ways.iter().enumerate() {
  1021. let s = way
  1022. .iter()
  1023. .map(|(_, _, _, s)| s.to_string())
  1024. .collect::<Vec<String>>()
  1025. .join("");
  1026. all_ways.push(s);
  1027. }
  1028. }
  1029. all_ways.dedup();
  1030. all_ways
  1031. .iter()
  1032. .enumerate()
  1033. .for_each(|(i, s)| println!("{i}.\t{s}"));
  1034. // let s = Dot::new(&integrated_graph).to_string().replace("\\\"", "");
  1035. // let x11_colors: Vec<String> = vec![
  1036. // String::from("Red"),
  1037. // String::from("Green"),
  1038. // String::from("Blue"),
  1039. // String::from("Cyan"),
  1040. // String::from("Magenta"),
  1041. // String::from("Yellow"),
  1042. // String::from("DarkRed"),
  1043. // String::from("DarkGreen"),
  1044. // String::from("DarkBlue"),
  1045. // String::from("DarkCyan"),
  1046. // String::from("DarkMagenta"),
  1047. // String::from("DarkYellow"),
  1048. // String::from("LightRed"),
  1049. // String::from("LightGreen"),
  1050. // String::from("LightBlue"),
  1051. // String::from("LightCyan"),
  1052. // String::from("LightMagenta"),
  1053. // String::from("LightYellow"),
  1054. // String::from("Orange"),
  1055. // String::from("Brown"),
  1056. // String::from("Beige"),
  1057. // ];
  1058. // let mut s = s.clone();
  1059. // ways.iter().enumerate().for_each(|(i, _)| {
  1060. // s = s.replace(
  1061. // &format!("[ label = \"{}\" ]", i + 1),
  1062. // &format!(
  1063. // "[ label = \"{}\" color = \"{}\" ]",
  1064. // i + 1,
  1065. // x11_colors[i].to_string()
  1066. // ),
  1067. // );
  1068. // });
  1069. // println!("{s}");
  1070. //
  1071. // for (i, way) in ways.iter().enumerate() {
  1072. // let s = way
  1073. // .iter()
  1074. // .map(|(_, _, _, s)| s.to_string())
  1075. // .collect::<Vec<String>>()
  1076. // .join("");
  1077. // println!("{}.\t{s}", i + 1);
  1078. // }
  1079. Ok(())
  1080. }
  1081. #[test]
  1082. fn dir() {
  1083. init();
  1084. let id = "ROBIN";
  1085. info!("This record will be captured by `cargo test`");
  1086. let dir = format!("/data/longreads_basic_pipe/{id}/diag/scan/reads",);
  1087. // Load from fasta in dir.
  1088. let genome = Genome::from_contigs_sequences(&dir).unwrap();
  1089. genome.stats();
  1090. let mut res: Vec<String> = genome
  1091. .iter()
  1092. .flat_map(|(_s, chrom)| {
  1093. chrom.iter().filter_map(|c| c.hgvs())
  1094. // .map(|c| println!("{c}"))
  1095. })
  1096. .collect();
  1097. res.sort();
  1098. res.dedup();
  1099. res.iter().for_each(|s| println!("{s}"));
  1100. // println!("{genome:#?}");
  1101. }
  1102. }