Your Name 1 жил өмнө
parent
commit
46a1e63ce9
2 өөрчлөгдсөн 80 нэмэгдсэн , 1221 устгасан
  1. 0 1201
      <
  2. 80 20
      src/lib.rs

+ 0 - 1201
<

@@ -1,1201 +0,0 @@
-pub mod breakpoint;
-pub mod genomic_graph;
-// mod phase;
-
-use anyhow::{anyhow, Ok, Result};
-use fasta::record::Sequence;
-use log::info;
-use minimap2::{Aligner, Mapping};
-use noodles_fasta as fasta;
-use num_format::{CustomFormat, Grouping, ToFormattedString, WriteFormatted};
-use petgraph::{dot::Dot, prelude::*};
-use rust_htslib::bam::{self, Record};
-use std::{
-    collections::HashMap,
-    fmt,
-    fs::{self, File},
-    io::{BufReader, BufWriter, Write},
-    path::PathBuf,
-    process::{Command, Stdio},
-};
-use uuid::Uuid;
-
-#[derive(Debug, Clone)]
-pub struct Genome {
-    pub chromosomes: HashMap<String, Chromosome>,
-}
-
-#[derive(Debug, Clone)]
-pub struct Chromosome {
-    contigs: Vec<Contig>,
-}
-
-#[derive(Debug, Clone)]
-pub struct Contig {
-    pub id: String,
-    // contig seq on ref
-    pub mappings: Vec<Mapping>,
-    // reads on ref
-    pub supporting_records: Option<Vec<Record>>,
-    pub sequence: String,
-    pub contig_ref: ContigRef,
-}
-
-#[derive(Debug, Clone)]
-pub enum ContigRef {
-    Unique(Mapping),
-    Chimeric((Mapping, Mapping)),
-    ChimericTriple((Mapping, Mapping, Mapping)),
-    ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
-    LeftAmbiguity((Vec<Mapping>, Mapping)),
-    RightAmbiguity((Mapping, Vec<Mapping>)),
-    Ambigous(Vec<Mapping>),
-}
-
-impl fmt::Display for ContigRef {
-    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
-        let str = match self {
-            ContigRef::Unique(m) => mapping_to_string(m),
-            ContigRef::Chimeric((a, b)) => {
-                format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
-            }
-            ContigRef::ChimericMultiple((a, v, b)) => format!(
-                "{}<->{}<->{}",
-                mapping_to_string(a),
-                mappings_to_string(v),
-                mapping_to_string(b)
-            ),
-            ContigRef::LeftAmbiguity((v, b)) => {
-                format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
-            }
-            ContigRef::RightAmbiguity((a, v)) => {
-                format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
-            }
-            ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
-            ContigRef::ChimericTriple((a, b, c)) => format!(
-                "{}<->{}<->{}",
-                mapping_to_string(a),
-                mapping_to_string(b),
-                mapping_to_string(c)
-            ),
-        };
-        fmt.write_str(&str).unwrap();
-
-        std::result::Result::Ok(())
-    }
-}
-
-impl ContigRef {
-    pub fn breakpoints(&self) -> Option<Vec<Mapping>> {
-        match self {
-            ContigRef::Unique(_) => None,
-            ContigRef::Chimeric((a, b)) => Some(vec![a.clone(), b.clone()]),
-            ContigRef::ChimericTriple((a, b, c)) => Some(vec![a.clone(), b.clone(), c.clone()]),
-            ContigRef::ChimericMultiple(_) => None,
-            ContigRef::LeftAmbiguity(_) => None,
-            ContigRef::RightAmbiguity(_) => None,
-            ContigRef::Ambigous(_) => None,
-        }
-    }
-    pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
-        let left = "►";
-        let right = "◄";
-        let bp_right = "▐";
-        let bp_left = "▌";
-        let format = CustomFormat::builder()
-            .grouping(Grouping::Standard)
-            .minus_sign("-")
-            .separator("_")
-            .build()
-            .unwrap();
-
-        let get_sign = |m: &Mapping| -> &str {
-            match m.strand {
-                minimap2::Strand::Forward => left,
-                minimap2::Strand::Reverse => right,
-            }
-        };
-
-        let uk = "UNKNOWN".to_string();
-        let mut res = Vec::new();
-        if let Some(breakpoints) = self.breakpoints() {
-            for v in breakpoints.windows(2) {
-                let mut bp_string = format!("{}:", v[0].target_name.clone().unwrap_or(uk.clone()));
-                let _ = bp_string
-                    .write_formatted(&v[0].target_end, &format)
-                    .unwrap();
-                bp_string = format!(
-                    "{bp_string}{}{bp_left}{bp_right}{}{}:",
-                    get_sign(&v[0]),
-                    get_sign(&v[1]),
-                    v[1].target_name.clone().unwrap_or(uk.clone())
-                );
-                let _ = bp_string
-                    .write_formatted(&v[1].target_start, &format)
-                    .unwrap();
-                res.push(bp_string);
-            }
-        }
-
-        if !res.is_empty() {
-            Some(res)
-        } else {
-            None
-        }
-    }
-    pub fn desc(&self) -> Option<String> {
-        let uk = "UNKNOWN".to_string();
-        let to_desc = |v: &mut Vec<Mapping>| -> String {
-            v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
-            let v: Vec<String> = v
-                .into_iter()
-                .map(|e| {
-                    let strand = match e.strand {
-                        minimap2::Strand::Forward => "",
-                        minimap2::Strand::Reverse => "_rev",
-                    };
-                    format!(
-                        "{}:{}_{}{}",
-                        e.target_name.clone().unwrap_or(uk.clone()),
-                        e.target_start,
-                        e.target_end,
-                        strand
-                    )
-                })
-                .collect();
-            format!("[{}]", v.join(";"))
-        };
-
-        match self {
-            ContigRef::Unique(a) => Some(format!(
-                "{}:{}_{}",
-                a.target_name.clone().unwrap_or(uk.clone()),
-                a.target_start,
-                a.target_end
-            )),
-            ContigRef::Chimeric((a, b)) => Some(to_desc(&mut vec![a.to_owned(), b.to_owned()])),
-            ContigRef::ChimericTriple((a, b, c)) => {
-                Some(to_desc(&mut vec![a.to_owned(), b.to_owned(), c.to_owned()]))
-            }
-            ContigRef::ChimericMultiple(_) => None,
-            ContigRef::LeftAmbiguity(_) => None,
-            ContigRef::RightAmbiguity(_) => None,
-            ContigRef::Ambigous(a) => Some(to_desc(&mut a.to_owned())),
-        }
-    }
-
-    pub fn hgvs(&self) -> Option<String> {
-        let uk = "UNKNOWN".to_string();
-        match self {
-            ContigRef::Unique(_) => None,
-            ContigRef::Chimeric((a, b)) => {
-                if a.target_name == b.target_name {
-                    let chr = a.target_name.clone().unwrap_or(uk.clone());
-                    let del_start = a.target_end;
-                    let del_end = b.target_start;
-                    let hgvs = format!("{chr}:{del_start}_{del_end}");
-                    Some(hgvs)
-                } else {
-                    let a_chr = a.target_name.clone().unwrap_or(uk.clone());
-                    let a_bp = a.target_end;
-                    let b_chr = b.target_name.clone().unwrap_or(uk.clone());
-                    let b_bp = b.target_end;
-                    let hgvs = format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]");
-                    Some(hgvs)
-                }
-            }
-            ContigRef::ChimericMultiple(_) => None,
-            ContigRef::LeftAmbiguity(_) => None,
-            ContigRef::RightAmbiguity(_) => None,
-            ContigRef::Ambigous(_) => None,
-            ContigRef::ChimericTriple((a, b, c)) => {
-                let mut v = [a, b, c];
-                v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
-                let (a, b, c) = (
-                    *v.get(0).clone().unwrap(),
-                    *v.get(1).clone().unwrap(),
-                    *v.get(2).clone().unwrap(),
-                );
-                let a_target_name = a.target_name.clone().unwrap_or(uk.clone());
-                let b_target_name = b.target_name.clone().unwrap_or(uk.clone());
-                let c_target_name = c.target_name.clone().unwrap_or(uk.clone());
-
-                // if a_target_name != b_target_name {}
-
-                // Insertions
-                // prioritize first len
-                let (bp_a_1, bp_a_2) = if a.query_end <= b.query_end {
-                    // TODO add inserted nt
-                    (
-                        (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
-                        (b.target_name.clone().unwrap_or(uk.clone()), b.target_start),
-                    )
-                } else {
-                    let diff = a.query_end - b.query_start;
-                    (
-                        (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
-                        (
-                            b.target_name.clone().unwrap_or(uk.clone()),
-                            b.target_start + diff,
-                        ),
-                    )
-                };
-                let (bp_b_1, bp_b_2) = if b.query_end <= c.query_end {
-                    // TODO add inserted nt
-                    (
-                        (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
-                        (c.target_name.clone().unwrap_or(uk.clone()), c.target_start),
-                    )
-                } else {
-                    let diff = b.query_end - c.query_start;
-                    (
-                        (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
-                        (
-                            c.target_name.clone().unwrap_or(uk.clone()),
-                            c.target_start + diff,
-                        ),
-                    )
-                };
-                if bp_a_1.0 == bp_b_2.0 {
-                    let hgvs = format!(
-                        "{}:{}_{}ins[{}:{}_{}]",
-                        bp_a_1.0, bp_a_1.1, bp_b_2.1, bp_a_2.0, bp_a_2.1, bp_b_1.1
-                    );
-                    Some(hgvs)
-                } else {
-                    None
-                }
-            }
-        }
-    }
-}
-
-pub fn mapping_to_string(mapping: &Mapping) -> String {
-    let uk = "UNKNOWN".to_string();
-    format!(
-        "{}:{}-{}({}:{}-{})",
-        mapping.target_name.clone().unwrap_or(uk.clone()),
-        mapping.target_start,
-        mapping.target_end,
-        mapping.query_name.clone().unwrap_or(uk),
-        mapping.query_start,
-        mapping.query_end
-    )
-}
-
-fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
-    let v = mappings
-        .iter()
-        .map(mapping_to_string)
-        .collect::<Vec<String>>();
-    v.join("//")
-}
-
-pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
-    let mut mappings = mappings;
-    mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
-
-    if mappings.len() == 1 {
-        return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
-    } else {
-        let mut grouped: Vec<Vec<Mapping>> = group_mappings(&mut mappings)?;
-        // let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
-
-        if grouped.len() == 1 {
-            let r = grouped.into_iter().flat_map(|e| e).collect();
-            return Ok(ContigRef::Ambigous(r));
-        } else if grouped.len() >= 2 {
-            // let first = grouped.pop_back().unwrap();
-            // let last = grouped.pop_front().unwrap();
-            let first = grouped.first().unwrap().to_vec();
-            let last = grouped.last().unwrap().to_vec();
-            grouped.remove(0);
-            grouped.remove(grouped.len() - 1);
-            assert!(first[0].query_start < last[0].query_start);
-
-            if grouped.len() == 0 {
-                if first.len() == 1 && last.len() == 1 {
-                    return Ok(ContigRef::Chimeric((
-                        first.get(0).unwrap().clone(),
-                        last.get(0).unwrap().clone(),
-                    )));
-                } else if first.len() == 1 {
-                    return Ok(ContigRef::RightAmbiguity((
-                        first.get(0).unwrap().clone(),
-                        last.clone(),
-                    )));
-                } else if last.len() == 1 {
-                    return Ok(ContigRef::LeftAmbiguity((
-                        first.clone(),
-                        last.get(0).unwrap().clone(),
-                    )));
-                } else {
-                    let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
-                    return Ok(ContigRef::Ambigous(all));
-                }
-            }
-            if first.len() == 1 && last.len() == 1 {
-                if grouped.len() == 1 {
-                    return Ok(ContigRef::ChimericTriple((
-                        first.get(0).unwrap().clone(),
-                        grouped.get(0).unwrap().get(0).unwrap().clone(),
-                        last.get(0).unwrap().clone(),
-                    )));
-                } else {
-                    return Ok(ContigRef::ChimericMultiple((
-                        first.get(0).unwrap().clone(),
-                        grouped.into_iter().flat_map(|e| e).collect(),
-                        last.get(0).unwrap().clone(),
-                    )));
-                }
-            } else if first.len() == 1 {
-                let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
-                    .into_iter()
-                    .flat_map(|e| e)
-                    .collect();
-                return Ok(ContigRef::RightAmbiguity((
-                    first.get(0).unwrap().clone(),
-                    right,
-                )));
-            } else if last.len() == 1 {
-                let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
-                    .into_iter()
-                    .flat_map(|e| e)
-                    .collect();
-                return Ok(ContigRef::LeftAmbiguity((
-                    left,
-                    last.get(0).unwrap().clone(),
-                )));
-            } else {
-                let all: Vec<Mapping> =
-                    vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
-                        .into_iter()
-                        .flat_map(|e| e)
-                        .collect();
-                return Ok(ContigRef::Ambigous(all));
-            }
-        } else {
-            return Ok(ContigRef::Ambigous(
-                grouped.into_iter().flat_map(|e| e).collect(),
-            ));
-        }
-    }
-}
-
-impl Genome {
-    pub fn new() -> Self {
-        Genome {
-            chromosomes: HashMap::new(),
-        }
-    }
-
-    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
-        self.chromosomes.iter()
-    }
-
-    pub fn contigs(&self) -> impl Iterator<Item = &Contig> {
-        self.chromosomes.iter().flat_map(|(_, e)| e.iter())
-    }
-
-    pub fn add_contig(
-        &mut self,
-        id: String,
-        mappings: Vec<Mapping>,
-        supporting_records: Option<Vec<Record>>,
-        sequence: String,
-    ) -> Result<()> {
-        let mut mappings = mappings;
-        mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
-        let new_contig = Contig {
-            id,
-            mappings: mappings.clone(),
-            supporting_records,
-            sequence,
-            contig_ref: get_ref_pos(mappings)?,
-        };
-        // get the category of Mapping
-        match new_contig.contig_ref.clone() {
-            ContigRef::Unique(contig_mapping) => {
-                match self
-                    .chromosomes
-                    .get_mut(&contig_mapping.target_name.unwrap())
-                {
-                    Some(chromosome) => {
-                        chromosome.contigs.push(new_contig);
-                    }
-                    None => (),
-                }
-            }
-            ContigRef::Chimeric((a, b)) => {
-                let a_target_name = a.target_name.unwrap();
-                let b_target_name = b.target_name.unwrap();
-                if a_target_name == b_target_name {
-                    if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
-                        chromosome.contigs.push(new_contig);
-                    } else {
-                        self.chromosomes.insert(
-                            a_target_name,
-                            Chromosome {
-                                contigs: vec![new_contig],
-                            },
-                        );
-                    }
-                } else {
-                    let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
-                    if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
-                        chromosome.contigs.push(new_contig);
-                    } else {
-                        self.chromosomes.insert(
-                            chimeric_name,
-                            Chromosome {
-                                contigs: vec![new_contig],
-                            },
-                        );
-                    }
-                }
-            }
-            ContigRef::ChimericMultiple((left, _, right)) => {
-                let left_target_name = left.target_name.unwrap();
-                let right_target_name = right.target_name.unwrap();
-                if left_target_name == right_target_name {
-                    if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
-                        chromosome.contigs.push(new_contig);
-                    } else {
-                        self.chromosomes.insert(
-                            left_target_name,
-                            Chromosome {
-                                contigs: vec![new_contig],
-                            },
-                        );
-                    }
-                } else {
-                    let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
-                    if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
-                        chromosome.contigs.push(new_contig);
-                    } else {
-                        self.chromosomes.insert(
-                            chimeric_name,
-                            Chromosome {
-                                contigs: vec![new_contig],
-                            },
-                        );
-                    }
-                }
-            }
-            _ => {
-                if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
-                    chromosome.contigs.push(new_contig);
-                } else {
-                    self.chromosomes.insert(
-                        "Ambigous".to_string(),
-                        Chromosome {
-                            contigs: vec![new_contig],
-                        },
-                    );
-                }
-            }
-        };
-
-        Ok(())
-    }
-
-    pub fn add_contig_from_seq(
-        &mut self,
-        name: String,
-        sequence: &[u8],
-        aligner: impl Fn(String) -> Result<Vec<Mapping>>,
-    ) -> Result<()> {
-        let mappings = aligner(String::from_utf8(sequence.to_vec())?)?;
-        // println!("{mappings:?}");
-        self.add_contig(name, mappings, None, String::from_utf8(sequence.to_vec())?)?;
-        Ok(())
-    }
-
-    pub fn write_contigs_sequences(&self, dir: &str) {
-        for contig in self.contigs() {
-            contig.write_fasta(&format!("{dir}/{}.fasta", contig.id))
-        }
-        // self.iter().for_each(|(_, chr)| {
-        //     chr.iter().for_each(|c| c.write_fasta(&format!("{dir}/{}.fasta", c.id)))
-        // });
-    }
-
-    pub fn from_contigs_sequences(dir: &str) -> Result<Self> {
-        let aligner_url = "http://localhost:4444/align";
-        let aligner = aligner_client::dist_align(aligner_url.to_string());
-        let mut genome = Self::new();
-        // let paths = get_ext_paths(dir, "fasta")?;
-        let paths = get_contigs_fa_paths(dir)?;
-        for path in paths {
-            let fa = read_fasta(path.to_str().unwrap())?;
-            for (name, sequence) in fa {
-                genome.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
-            }
-        }
-        Ok(genome)
-    }
-    //
-    // pub fn from_dir_bed(dir: &str)  {
-    //
-    // }
-
-    // pub fn write_records(&self, file: &str) {
-    //     let mut records =  Vec::new();
-    //     for (name, chromosome) in self.chromosomes.iter() {
-    //         for contig in chromosome.iter() {
-    //             if let Some(rec) = &contig.supporting_records {
-    //                 records.extend(rec.iter());
-    //             };
-    //         }
-    //     }
-    //     // header
-    //
-    //     // writer
-    //     rust_htslib::bam::Writer::from_path(path, header, format)
-    // }
-
-    pub fn stats(&self) {
-        for (k, v) in self.chromosomes.iter() {
-            info!("{}:{}", k, v.contigs.len());
-        }
-    }
-
-    pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
-        self.chromosomes
-            .get(chromosome)
-            .map(|chr| chr.contigs.clone())
-    }
-}
-
-impl Chromosome {
-    pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
-        self.contigs.iter()
-    }
-}
-
-impl Contig {
-    // pub fn sort(&mut self) {
-    //     // sorting by target order
-    //     self.mappings
-    //         .sort_by(|a, b| a.target_start.cmp(&b.target_start));
-    // }
-
-    pub fn to_igv(&self, dir_path: &str) -> Result<()> {
-        let supporting_records = self.supporting_records.clone().ok_or(anyhow!("no reads"))?;
-        let contig_name = if let Some(hgvs) = self.hgvs() {
-            hgvs
-        } else {
-            self.id.clone()
-        };
-        let contig_dir = format!("{dir_path}/{contig_name}");
-        fs::create_dir_all(contig_dir.clone())?;
-
-        let fasta_path = format!("{contig_dir}/contig.fa");
-        write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
-        write_fai(&fasta_path);
-
-        let reads_path = format!("{contig_dir}/reads.fa");
-        let n_reads = supporting_records
-            .clone()
-            .into_iter()
-            .map(|r| {
-                (
-                    String::from_utf8(r.qname().to_vec()).unwrap(),
-                    String::from_utf8(r.seq().as_bytes()).unwrap(),
-                )
-            })
-            .collect();
-        write_fasta(&reads_path, &n_reads);
-
-        let bam_path = format!("{contig_dir}/{}.bam", self.id);
-        write_bam(&fasta_path, &reads_path, &bam_path)?;
-
-        let bed_path = format!("{contig_dir}/contig.bed");
-        match &self.contig_ref {
-            ContigRef::Chimeric((a, b)) => {
-                let d = vec![
-                    (
-                        self.id.clone(),
-                        a.query_start,
-                        a.query_end,
-                        format!(
-                            "{}:{}-{}",
-                            a.target_name.clone().unwrap(),
-                            a.target_start,
-                            a.target_end
-                        ),
-                    ),
-                    (
-                        self.id.clone(),
-                        b.query_start,
-                        b.query_end,
-                        format!(
-                            "{}:{}-{}",
-                            b.target_name.clone().unwrap(),
-                            b.target_start,
-                            b.target_end
-                        ),
-                    ),
-                ];
-                write_bed(&bed_path, &d)?;
-            }
-            ContigRef::ChimericTriple((a, b, c)) => {
-                let d: Vec<(String, i32, i32, String)> = [a, b, c]
-                    .iter()
-                    .map(|r| {
-                        (
-                            self.id.clone(),
-                            r.query_start,
-                            r.query_end,
-                            format!(
-                                "{}:{}-{}",
-                                r.target_name.clone().unwrap(),
-                                r.target_start,
-                                r.target_end
-                            ),
-                        )
-                    })
-                    .collect();
-                write_bed(&bed_path, &d)?;
-            }
-            _ => (),
-        }
-
-        Ok(())
-    }
-
-    // bug cigar len != seq len
-    // pub fn write_bam(&self, path: &str) -> Result<()> {
-    //     let aligner = Aligner::builder()
-    //         .asm5()
-    //         .with_threads(8)
-    //         .with_cigar()
-    //         .with_seq(self.sequence.as_bytes())
-    //         .expect("Unable to build index");
-    //
-    //     let mut mappings = Vec::new();
-    //     let supporting_records = self
-    //         .supporting_records
-    //         .clone()
-    //         .ok_or(anyhow!("no supporting records"))?;
-    //     for record in supporting_records.iter() {
-    //         let seq = record.seq().as_bytes();
-    //         let alignment = aligner
-    //             .map(&seq, false, false, None, None)
-    //             .expect("Unable to align");
-    //         mappings.push(alignment);
-    //     }
-    //     let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
-    //     mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
-    //
-    //     let mut header = bam::Header::new();
-    //     let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
-    //     sq_record.push_tag(b"SN", self.id.clone());
-    //     sq_record.push_tag(b"LN", self.sequence.len());
-    //     header.push_record(&sq_record);
-    //
-    //     let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
-    //
-    //     // copy reverse reads to new BAM file
-    //     for mapping in mappings.iter() {
-    //         let record = minimap2::htslib::mapping_to_record(
-    //             Some(mapping),
-    //             self.sequence.as_bytes(),
-    //             header.clone(),
-    //             None,
-    //             Some(Uuid::new_v4().as_bytes()),
-    //         );
-    //         let _ = out.write(&record);
-    //     }
-    //     rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
-    //     Ok(())
-    // }
-
-    pub fn hgvs(&self) -> Option<String> {
-        self.contig_ref.hgvs()
-    }
-
-    pub fn desc(&self) -> Option<String> {
-        self.contig_ref.desc()
-    }
-
-    pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
-        self.contig_ref.breakpoints_repr()
-    }
-
-    pub fn write_fasta(&self, fasta_path: &str) {
-        write_fasta(fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
-    }
-}
-
-fn group_mappings(mappings: &mut [Mapping]) -> Result<Vec<Vec<Mapping>>> {
-    // sort alignments by query_start
-    mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
-
-    let mut alignments: Vec<Vec<Mapping>> = vec![];
-    // group by overlapps > 30
-    for aln in mappings.iter() {
-        let last = alignments.last_mut();
-        if let Some(l) = last {
-            if l.iter()
-                .filter(|a| a.query_end - aln.query_start > 30)
-                .count()
-                > 0
-            {
-                l.push(aln.clone());
-            } else {
-                alignments.push(vec![aln.clone()]);
-            }
-        } else {
-            alignments.push(vec![aln.clone()]);
-        }
-    }
-
-    Ok(alignments)
-}
-
-pub fn write_bed(path: &str, d: &[(String, i32, i32, String)]) -> Result<()> {
-    let file = File::create(path).unwrap();
-    let mut writer = BufWriter::new(file);
-    for (chr, start, end, value) in d.iter() {
-        let row = format!(
-            "{}\n",
-            [
-                chr.to_string(),
-                start.to_string(),
-                end.to_string(),
-                value.to_string()
-            ]
-            .join("\t")
-        );
-        writer.write_all(row.as_bytes())?;
-    }
-    Ok(())
-}
-
-// unique
-pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
-    let file = File::create(fastq_path)?;
-    let mut writer = BufWriter::new(file);
-    for record in d {
-        let name = String::from_utf8(record.qname().to_vec()).unwrap();
-        writer.write_all(format!("@{name}\n").as_bytes())?;
-        let seq = record.seq().as_bytes();
-        writer.write_all(&seq)?;
-        writer.write_all(b"\n+\n")?;
-        let qual = record.qual();
-        writer.write_all(qual)?;
-    }
-    Ok(())
-}
-
-pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
-    let file = File::create(fasta_path).unwrap();
-    let mut writer = fasta::writer::Builder::default().build_with_writer(file);
-    let mut passed = Vec::new();
-    for (name, sequence) in d {
-        let name = name.to_string();
-        if sequence.is_empty() {
-            continue;
-        }
-        if passed.contains(&name) {
-            continue;
-        }
-        passed.push(name.clone());
-        let record = fasta::Record::new(
-            fasta::record::Definition::new(name.as_str(), None),
-            fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
-        );
-        writer.write_record(&record).unwrap();
-    }
-}
-
-pub fn write_fai(path: &str) {
-    let mut faidx = Command::new("samtools")
-        .arg("faidx")
-        .arg(path)
-        .spawn()
-        .expect("Samtools faidx failed to start");
-    faidx.wait().unwrap();
-}
-
-pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
-    let rg_id = uuid::Uuid::new_v4();
-
-    let mm2 = Command::new("minimap2")
-        .arg("-t")
-        .arg("128")
-        .arg("-ax")
-        .arg("map-ont")
-        .arg("-R")
-        .arg(format!(
-            "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
-        ))
-        .arg(ref_path)
-        .arg(reads_path)
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        .spawn()
-        .expect("Minimap2 failed to start");
-
-    let view = Command::new("sambamba")
-        .arg("view")
-        .arg("-h")
-        .arg("-S")
-        .arg("-t")
-        .arg("20")
-        .arg("--format=bam")
-        .arg("/dev/stdin")
-        .stdin(Stdio::from(mm2.stdout.unwrap()))
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        .spawn()
-        .expect("Sambamba view failed to start");
-
-    let mut sort = Command::new("sambamba")
-        .arg("sort")
-        .arg("-t")
-        .arg("20")
-        .arg("/dev/stdin")
-        .arg("-o")
-        .arg(bam_path)
-        .stderr(Stdio::piped())
-        .stdin(Stdio::from(view.stdout.unwrap()))
-        .spawn()
-        .expect("Sambamba sort failed to start");
-
-    sort.wait().unwrap();
-    Ok(())
-}
-
-pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
-    let mut reader = File::open(path)
-        .map(BufReader::new)
-        .map(fasta::Reader::new)?;
-
-    let mut res = Vec::new();
-    for result in reader.records() {
-        let record = result?;
-        let u = String::from_utf8(record.name().to_vec())?;
-        let s = record.sequence().to_owned();
-        res.push((u, s));
-    }
-
-    Ok(res)
-}
-
-// fn get_ext_paths(dir: &str, ext: &str) -> Result<Vec<PathBuf>> {
-//     let paths = std::fs::read_dir(dir)?
-//         // Filter out all those directory entries which couldn't be read
-//         .filter_map(|res| res.ok())
-//         // Map the directory entries to paths
-//         .map(|dir_entry| dir_entry.path())
-//         // Filter out all paths with extensions other than `csv`
-//         .filter_map(|path| {
-//             if path.extension().map_or(false, |xt| xt == ext) {
-//                 Some(path)
-//             } else {
-//                 None
-//             }
-//         })
-//         .collect::<Vec<_>>();
-//     Ok(paths)
-// }
-
-fn get_contigs_fa_paths(dir: &str) -> Result<Vec<PathBuf>> {
-    let pattern = format!("{}/**/*_flye.fa", dir);
-    let fa_paths: Vec<PathBuf> = glob::glob(&pattern)
-        .expect("Failed to read glob pattern")
-        .filter_map(Result::ok)
-        .collect();
-
-    Ok(fa_paths)
-}
-
-pub fn dot_graph(
-    graph: &StableGraph<String, String>,
-    way: &[NodeIndex],
-    value: &str,
-    color: &str,
-    erase: bool,
-) -> String {
-    let mut g = graph.clone();
-
-    // erase labels
-    if erase {
-        g.edge_weights_mut().for_each(|e| *e = "".to_string());
-    }
-
-    let mut labels = Vec::new();
-    for window in way.windows(2) {
-        let edge_id = g.find_edge(window[0], window[1]).unwrap();
-        let edge = g.edge_weight_mut(edge_id).unwrap();
-        let v = if !edge.is_empty() {
-            let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
-            v.push(value);
-            v.join(", ")
-        } else {
-            value.to_string()
-        };
-        labels.push(v.clone());
-        *edge = v;
-    }
-
-    if erase {
-        g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
-        g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
-    }
-
-    let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
-
-    labels.sort_by_key(|b| std::cmp::Reverse(b.len()));
-    // labels.sort_by(|a, b| b.len().cmp(&a.len()));
-    for label in labels {
-        let label_str = format!("label = \"{label}\"");
-        dot = dot.replace(
-            &format!("{label_str} ]\n"),
-            &format!("{label_str}, color = \"{color}\" ]\n"),
-        );
-    }
-    dot
-}
-
-/// for non overlapping ways
-pub fn dot_graph_biall(
-    graph: &StableGraph<String, String>,
-    ways: &[Vec<NodeIndex>],
-    values: Vec<&str>,
-    colors: Vec<&str>,
-    erase: bool,
-) -> String {
-    let mut g = graph.clone();
-    // erase labels
-    if erase {
-        g.edge_weights_mut().for_each(|e| *e = "".to_string());
-    }
-
-    let mut labels = Vec::new();
-    for ((way, value), color) in ways.iter().zip(values.into_iter()).zip(colors.into_iter()) {
-        for window in way.windows(2) {
-            let edge_id = g.find_edge(window[0], window[1]).unwrap();
-            let edge = g.edge_weight_mut(edge_id).unwrap();
-            let v = if !edge.is_empty() {
-                let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
-                v.push(value);
-                v.join(", ")
-            } else {
-                value.to_string()
-            };
-            labels.push((v.clone(), color));
-            *edge = v;
-        }
-    }
-
-    // g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
-    g.retain_edges(|g, i| !g.edge_weight(i).unwrap().is_empty());
-    g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
-
-    let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
-
-    labels.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
-    for (label, color) in labels {
-        let label_str = format!("label = \"{label}\"");
-        dot = dot.replace(
-            &format!("{label_str} ]\n"),
-            &format!("{label_str}, color = \"{color}\" ]\n"),
-        );
-    }
-    dot
-}
-
-#[cfg(test)]
-mod tests {
-    use env_logger::Env;
-
-    use super::*;
-    use crate::{
-        genomic_graph::GenomicGraph,
-        // phase::{variants_phasing, write_phases_bed},
-    };
-
-    fn init() {
-        let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
-            .is_test(true)
-            .try_init();
-    }
-
-    #[test]
-    fn it_works() -> Result<()> {
-        let _ = env_logger::builder().is_test(true).try_init();
-        let contig_fa = "./data_test/contig_2.fa";
-        let aligner_url = "http://localhost:4444/align";
-
-        let mut genome = Genome::new();
-        let aligner = aligner_client::dist_align(aligner_url.to_string());
-
-        let sequences = read_fasta(contig_fa)?;
-        for (name, seq) in sequences {
-            genome.add_contig_from_seq(name.clone(), &seq.as_ref().to_vec(), &aligner)?;
-            let mut seqc: Vec<u8> = seq.complement().map(|e| e.unwrap()).collect();
-            seqc.reverse();
-            genome.add_contig_from_seq(format!("{name}_rev"), &seqc, &aligner)?;
-            println!("Sending");
-        }
-        genome.iter().for_each(|(_, c)| {
-            c.iter().for_each(|cont| {
-                println!("{}", cont.contig_ref.desc().unwrap());
-            });
-        });
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_graph() -> Result<()> {
-        init();
-
-        let case = "SALICETTO";
-        let chrom = vec!["chr10"];
-        info!("This record will be captured by `cargo test`");
-
-        let dir = format!("/data/longreads_basic_pipe/{case}/diag/asm_contis");
-
-        // Load from fasta in dir.
-        let genome = Genome::from_contigs_sequences(&dir)?;
-        genome.stats();
-        let mut genomic_graph = GenomicGraph::from_genome(&genome);
-
-        let sens = vec![true, false];
-        let pos = vec![0, i32::MAX];
-        let mut all_ways = Vec::new();
-        if chrom.len() > 1 {
-            (0..4).into_iter().for_each(|i| {
-                let start_pos = if i < 2 { 0 } else { i32::MAX };
-                let end_pos = pos[i % 2];
-
-                (0..4).into_iter().for_each(|i| {
-                    let start_sens = if i < 2 { true } else { false };
-                    let end_sens = sens[i % 2];
-                    (0..4).into_iter().for_each(|i| {
-                        let start_chr = if i < 2 { chrom[0] } else { chrom[1] };
-                        let end_chr = chrom[i % 2];
-                        let start = (start_sens, start_chr, start_pos);
-                        let end = (end_sens, end_chr, end_pos);
-
-                        let (oriented_graph, _integrated_graph, ways) =
-                            genomic_graph.ways(start, end);
-
-                        let dot = oriented_graph.dot_graph();
-                        println!("dot\n{dot}");
-
-                        for (_i, way) in ways.iter().enumerate() {
-                            let s = way
-                                .iter()
-                                .map(|(_, _, _, s)| s.to_string())
-                                .collect::<Vec<String>>()
-                                .join("");
-                            all_ways.push(s);
-                        }
-                    });
-                });
-            });
-        } else {
-            let start_chr = chrom[0];
-            let end_chr = chrom[0];
-            let start = (true, start_chr, 0);
-            let end = (true, end_chr, i32::MAX);
-
-            let (oriented_graph, _integrated_graph, ways) = genomic_graph.ways(start, end);
-
-            let dot = oriented_graph.dot_graph();
-            println!("dot\n{dot}");
-
-            for (_i, way) in ways.iter().enumerate() {
-                let s = way
-                    .iter()
-                    .map(|(_, _, _, s)| s.to_string())
-                    .collect::<Vec<String>>()
-                    .join("");
-                all_ways.push(s);
-            }
-        }
-
-        all_ways.dedup();
-        all_ways
-            .iter()
-            .enumerate()
-            .for_each(|(i, s)| println!("{i}.\t{s}"));
-
-        // let s = Dot::new(&integrated_graph).to_string().replace("\\\"", "");
-        // let x11_colors: Vec<String> = vec![
-        //     String::from("Red"),
-        //     String::from("Green"),
-        //     String::from("Blue"),
-        //     String::from("Cyan"),
-        //     String::from("Magenta"),
-        //     String::from("Yellow"),
-        //     String::from("DarkRed"),
-        //     String::from("DarkGreen"),
-        //     String::from("DarkBlue"),
-        //     String::from("DarkCyan"),
-        //     String::from("DarkMagenta"),
-        //     String::from("DarkYellow"),
-        //     String::from("LightRed"),
-        //     String::from("LightGreen"),
-        //     String::from("LightBlue"),
-        //     String::from("LightCyan"),
-        //     String::from("LightMagenta"),
-        //     String::from("LightYellow"),
-        //     String::from("Orange"),
-        //     String::from("Brown"),
-        //     String::from("Beige"),
-        // ];
-        // let mut s = s.clone();
-        // ways.iter().enumerate().for_each(|(i, _)| {
-        //     s = s.replace(
-        //         &format!("[ label = \"{}\" ]", i + 1),
-        //         &format!(
-        //             "[ label = \"{}\" color = \"{}\" ]",
-        //             i + 1,
-        //             x11_colors[i].to_string()
-        //         ),
-        //     );
-        // });
-        // println!("{s}");
-        //
-        // for (i, way) in ways.iter().enumerate() {
-        //     let s = way
-        //         .iter()
-        //         .map(|(_, _, _, s)| s.to_string())
-        //         .collect::<Vec<String>>()
-        //         .join("");
-        //     println!("{}.\t{s}", i + 1);
-        // }
-        Ok(())
-    }
-
-    #[test]
-    fn dir() {
-        init();
-        let id = "ROBIN";
-        info!("This record will be captured by `cargo test`");
-
-        let dir = format!("/data/longreads_basic_pipe/{id}/diag/scan/reads",);
-
-        // Load from fasta in dir.
-        let genome = Genome::from_contigs_sequences(&dir).unwrap();
-        genome.stats();
-        let mut res: Vec<String> = genome
-            .iter()
-            .flat_map(|(_s, chrom)| {
-                chrom.iter().filter_map(|c| c.hgvs())
-                // .map(|c| println!("{c}"))
-            })
-            .collect();
-        res.sort();
-        res.dedup();
-        res.iter().for_each(|s| println!("{s}"));
-
-        // println!("{genome:#?}");
-    }
-}

+ 80 - 20
src/lib.rs

@@ -520,19 +520,22 @@ impl Genome {
         // });
     }
 
-    pub fn from_contigs_sequences(dir: &str) -> Result<Self> {
+    pub fn from_dir(dir: &str) -> Result<Self> {
+        let mut genome = Self::new();
+        genome.add_dir(dir)?;
+        Ok(genome)
+    }
+
+    pub fn add_dir(&mut self, dir: &str) -> Result<()> {
         let aligner_url = "http://localhost:4444/align";
         let aligner = aligner_client::dist_align(aligner_url.to_string());
-        let mut genome = Self::new();
-        // let paths = get_ext_paths(dir, "fasta")?;
-        let paths = get_contigs_fa_paths(dir)?;
-        for path in paths {
+        for path in get_contigs_fa_paths(dir)? {
             let fa = read_fasta(path.to_str().unwrap())?;
             for (name, sequence) in fa {
-                genome.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
+                self.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
             }
         }
-        Ok(genome)
+        Ok(())
     }
     //
     // pub fn from_dir_bed(dir: &str)  {
@@ -1008,6 +1011,19 @@ pub fn dot_graph_biall(
     dot
 }
 
+pub fn load_genome(id: &str) -> Genome {
+    let mut chr: Vec<String> = (1..=22).map(|p| format!("chr{p}")).collect();
+    chr.extend(["chrX", "chrY", "chrM"].iter().map(|&s| s.to_string()));
+
+    let mut genome = Genome::new();
+    // Load from fasta in dir.
+    chr.iter().for_each(|chr| {
+        let dir = format!("/data/longreads_basic_pipe/{id}/diag/assemblies/{chr}",);
+        genome.add_dir(&dir).unwrap();
+    });
+    genome
+}
+
 #[cfg(test)]
 mod tests {
     use env_logger::Env;
@@ -1024,6 +1040,22 @@ mod tests {
             .try_init();
     }
 
+    #[test]
+    fn info() -> Result<()> {
+        init();
+        let id = "RIVOALEN";
+        let genome = load_genome(id);
+
+        genome.iter().for_each(|(_, chr)| {
+            chr.iter().for_each(|c| {
+                if let Some(d) = c.desc() {
+                    println!("{d}")
+                }
+            })
+        });
+        Ok(())
+    }
+
     #[test]
     fn it_works() -> Result<()> {
         let _ = env_logger::builder().is_test(true).try_init();
@@ -1055,14 +1087,15 @@ mod tests {
         init();
 
         let case = "SALICETTO";
-        let chrom = vec!["chr10"];
+        let chrom = ["chr10"];
         info!("This record will be captured by `cargo test`");
+        let genome = load_genome(case);
 
-        let dir = format!("/data/longreads_basic_pipe/{case}/diag/asm_contis");
-
-        // Load from fasta in dir.
-        let genome = Genome::from_contigs_sequences(&dir)?;
-        genome.stats();
+        // let dir = format!("/data/longreads_basic_pipe/{case}/diag/asm_contis");
+        //
+        // // Load from fasta in dir.
+        // let genome = Genome::from_dir(&dir)?;
+        // genome.stats();
         let mut genomic_graph = GenomicGraph::from_genome(&genome);
 
         let sens = vec![true, false];
@@ -1178,14 +1211,35 @@ mod tests {
     fn dir() {
         init();
 
-        let id = "ROBIN";
-        let chr = "chrM";
-        let dir = format!("/data/longreads_basic_pipe/{id}/diag/assemblies/{chr}",);
-        // let dir = "/data/spades".to_string();
-        // let dir = "/data/wtdbg2".to_string();
+        let id = "RIVOALEN";
+
+        let mut chr: Vec<String> = (1..=22).map(|p| format!("chr{p}")).collect();
+        chr.extend(["chrX", "chrY", "chrM"].iter().map(|&s| s.to_string()));
+
+        let mut genome_constit = Genome::new();
+        chr.iter().for_each(|chr| {
+            let dir = format!("/data/longreads_basic_pipe/{id}/mrd/assemblies/{chr}",);
+            genome_constit.add_dir(&dir).unwrap();
+        });
+        genome_constit.stats();
+        let mut res_constit: Vec<String> = genome_constit
+            .iter()
+            .flat_map(|(_s, chrom)| {
+                chrom.iter().filter_map(|c| c.hgvs())
+                // .map(|c| println!("{c}"))
+            })
+            .collect();
+        res_constit.sort();
+        res_constit.dedup();
+
+        let res_constit = Vec::new();
 
+        let mut genome = Genome::new();
         // Load from fasta in dir.
-        let genome = Genome::from_contigs_sequences(&dir).unwrap();
+        chr.iter().for_each(|chr| {
+            let dir = format!("/data/longreads_basic_pipe/{id}/diag/assemblies/{chr}",);
+            genome.add_dir(&dir).unwrap();
+        });
         genome.stats();
         let mut res: Vec<String> = genome
             .iter()
@@ -1196,7 +1250,13 @@ mod tests {
             .collect();
         res.sort();
         res.dedup();
-        res.iter().for_each(|s| println!("{s}"));
+        res.iter().for_each(|s| {
+            if res_constit.contains(s) {
+                info!("constit {s}");
+            } else {
+                println!("{s}");
+            }
+        });
 
         // println!("{genome:#?}");
     }