Thomas hai 1 ano
pai
achega
4ecb20d095
Modificáronse 3 ficheiros con 133 adicións e 12 borrados
  1. 30 6
      src/lib.rs
  2. 32 0
      src/utils.rs
  3. 71 6
      src/variants.rs

+ 30 - 6
src/lib.rs

@@ -11,11 +11,9 @@ mod tests {
     use anyhow::{Ok, Result};
     use indicatif::MultiProgress;
     use indicatif_log_bridge::LogWrapper;
+    use noodles_core::{Position, Region};
 
-    use crate::{
-        config::Config,
-        sql::variants_sql::load_variants_name,
-    };
+    use crate::{config::Config, sql::variants_sql::load_variants_name, utils::count_repetitions};
 
     use super::*;
     #[test]
@@ -27,7 +25,7 @@ mod tests {
 
     #[test]
     fn load_from_vcf() -> Result<()> {
-        let name = "FAVOT";
+        let name = "HATTAB";
 
         let logger =
             env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
@@ -36,7 +34,7 @@ mod tests {
         LogWrapper::new(multi.clone(), logger).try_init().unwrap();
 
         variants::run_pipe(name, &multi)?;
-        
+
         Ok(())
     }
 
@@ -56,4 +54,30 @@ mod tests {
         println!("{} variants loaded from db.", variants.len());
         Ok(())
     }
+
+    use noodles_fasta::indexed_reader::Builder as FastaBuilder;
+
+    #[test]
+    fn check_stretch() -> Result<()> {
+        let contig = "chr7";
+        let pos = 105303375;
+
+        let cfg = Config::get()?;
+        let mut genome_reader = FastaBuilder::default().build_from_path(&cfg.reference_fa)?;
+        let start = Position::try_from((pos - 20) as usize).unwrap();
+        let end = Position::try_from((pos + 19) as usize).unwrap();
+        let r = Region::new(contig.to_string(), start..=end);
+        if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
+            let seq = String::from_utf8(reg.sequence().as_ref().to_vec()).unwrap();
+            let left = &seq[0..20];
+            let right = &seq[21..seq.len() - 1];
+
+            let r_plet = count_repetitions(right, 3);
+
+            println!("{left:?}");
+            println!("{right:?} {r_plet:?}");
+        }
+
+        Ok(())
+    }
 }

+ 32 - 0
src/utils.rs

@@ -219,3 +219,35 @@ pub fn new_pg_bytes(len: u64) -> ProgressBar {
     pg.enable_steady_tick(Duration::from_millis(200));
     pg
 }
+
+pub fn count_repetitions(sequence: &str, pattern_size: usize) -> usize {
+    if sequence.len() < pattern_size {
+        return 0;
+    }
+
+    let chars: Vec<char> = sequence.chars().collect();
+    let mut max_repetitions = 0;
+    let mut i = 0;
+
+    while i <= chars.len() - pattern_size {
+        let current_pattern = &chars[i..i + pattern_size];
+        let mut j = i + pattern_size;
+        let mut current_repetitions = 1;
+
+        while j <= chars.len() - pattern_size && &chars[j..j + pattern_size] == current_pattern {
+            current_repetitions += 1;
+            j += pattern_size;
+        }
+
+        if current_repetitions > 1 {
+            max_repetitions = max_repetitions.max(current_repetitions);
+        }
+
+        i += pattern_size * current_repetitions; // Move to the next new pattern
+    }
+
+    max_repetitions
+}
+
+
+

+ 71 - 6
src/variants.rs

@@ -22,8 +22,7 @@ use crate::{
     },
     sql::{stats_sql::insert_stats, variants_sql::insert_variants},
     utils::{
-        chi_square_test_for_proportions, estimate_shannon_entropy, get_hts_nt_pileup, new_pg,
-        new_pg_speed, print_stat_cat,
+        chi_square_test_for_proportions, count_repetitions, estimate_shannon_entropy, get_hts_nt_pileup, new_pg, new_pg_speed, print_stat_cat
     },
 };
 use anyhow::{anyhow, Context, Ok, Result};
@@ -38,7 +37,6 @@ use noodles_gff as gff;
 
 use rayon::prelude::*;
 use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
-use utoipa::ToSchema;
 use std::io::Write;
 use std::{
     env::temp_dir,
@@ -50,6 +48,7 @@ use std::{
         Arc,
     },
 };
+use utoipa::ToSchema;
 
 // chr12:25116542|G>T KRAS
 #[derive(Debug, Clone)]
@@ -362,17 +361,63 @@ impl Variants {
                                 let ent = estimate_shannon_entropy(&s.to_lowercase());
 
                                 if ent < cfg.min_diversity {
-                                    if tumoral.position == 148725437 {
-                                        warn!("POS {}", ent);
-                                    }
                                     n_low_diversity.fetch_add(1, Ordering::SeqCst);
                                     tumoral.annotations.push(AnnotationType::VariantCategory(
                                         VariantCategory::LowDiversity,
                                     ));
                                     continue;
                                 }
+
+                                // Check triplets or doublets if DeepVariant
+                                let callers = tumoral.callers();
+
+                                if callers.len() == 1 {
+                                    if callers[0] == "DeepVariant".to_string() {
+                                        let seq_left = &s[0..20];
+                                        let seq_right = &s[21..s.len() - 1];
+
+                                        // Triplet right
+                                        if count_repetitions(seq_right, 3) >= 3 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                    ));
+                                            continue;
+                                        }
+
+                                        // Doublet right
+                                        if count_repetitions(seq_right, 2) >= 4 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                    ));
+                                            continue;
+                                        }
+
+                                        // Triplet left
+                                        if count_repetitions(seq_left, 3) >= 3 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                    ));
+                                            continue;
+                                        }
+
+                                        // Doublet left
+                                        if count_repetitions(seq_left, 2) >= 4 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                    ));
+                                            continue;
+                                        }
+
+                                    }
+                                }
                             }
 
+                            
+
                             // Check if the base is in constitutionnal pileup
                             if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
                                 let alt_b = alt_b.clone().into_u8();
@@ -1044,6 +1089,18 @@ impl FromStr for VCFSource {
     }
 }
 
+impl ToString for VCFSource {
+    fn to_string(&self) -> String {
+        let s = match self {
+            VCFSource::DeepVariant => "DeepVariant",
+            VCFSource::ClairS => "ClairS",
+            VCFSource::Sniffles => "Sniffles",
+            VCFSource::Nanomonsv => "Nanomonsv",
+        };
+        s.to_string()
+    }
+}
+
 impl Variant {
     pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
         let callers_data = vec![CallerData {
@@ -1059,6 +1116,7 @@ impl Variant {
                 row.value
             ))?,
         }];
+
         Ok(Variant {
             contig: row.chr.to_string(),
             position: row.pos,
@@ -1262,6 +1320,13 @@ impl Variant {
         }
         vec_bools.iter().all(|&x| x)
     }
+
+    pub fn callers(&self) -> Vec<String> {
+        self.source
+            .iter()
+            .map(|source| source.to_string())
+            .collect()
+    }
 }
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 enum AlterationCategory {