Переглянути джерело

completing info and format

Thomas 1 рік тому
батько
коміт
f83fe9a647
7 змінених файлів з 206 додано та 86 видалено
  1. 7 2
      README.md
  2. 2 0
      jq_filters/jq_bam.jq
  3. 12 2
      jq_filters/jq_variants.jq
  4. 1 1
      src/config.rs
  5. 5 1
      src/lib.rs
  6. 3 0
      src/runners.rs
  7. 176 80
      src/variant/variant.rs

+ 7 - 2
README.md

@@ -20,10 +20,15 @@ sudo apt install cmake libclang-dev
 
 ## Usage
 
-### Use jq for selecting variants:
+### Use jq for selecting variants
 
 * Somatic Variants of chrM (25) 
 ```
 zcat /data/longreads_basic_pipe/*/diag/somatic_variants.json.gz | \
-jq -L ./jq_filters -C 'include "jq_filters"; [.data[] | select(contig("chrM") and n_in_constit <= 1) | format]'
+jq -L ./jq_filters -C 'include "jq_variants"; [.data[] | select(contig("chrM") and n_in_constit <= 1) | format]'
+```
+
+### Using jq and find to look for chrM norm coverage
+```
+find /data/longreads_basic_pipe/ -name "*_diag_hs1_info.json" -type f -exec sh -c 'basename $(dirname $(dirname "{}")) | tr -d "\n"' \; -printf "\t" -exec jq -L ./jq_filters -r 'include "jq_bam"; contig_coverage("chrM")' {} \;
 ```

+ 2 - 0
jq_filters/jq_bam.jq

@@ -0,0 +1,2 @@
+def contig_coverage(contig):
+	.bam_stats as $stats | $stats.karyotype[] | select(.[2] == contig) | (.[3] / .[1]) / $stats.global_coverage;

+ 12 - 2
jq_filters/jq_filters.jq → jq_filters/jq_variants.jq

@@ -10,6 +10,14 @@ def num_to_contig(n_contig):
   elif n_contig == 25 then "chrM"
   else (n_contig | "chr" + tostring // "other") end;
 
+def ref_alt_vcf(ref_alt):
+	ref_alt | keys[0] as $k | .[$k] as $r |
+	if ($r | type) == "array" then
+		$r | join("")
+	else
+		$r
+	end;
+
 def format:
 	. + {pos: .position} + . | del(.position) |
 	{position: (.pos.position + 1)} + . |
@@ -18,10 +26,12 @@ def format:
   .vcf_variants[] |= (del(.hash,.position,.reference,.alternative) |
 	.infos |= map(select(. != "Empty")));
 
+def vcf:
+	.[] | [.contig, .position, ref_alt_vcf(.reference), ref_alt_vcf(.alternative)] |
+	@tsv;
+
 def n_alt_in_constit:
 	(.annotations | map(select(.ConstitAlt != null).ConstitAlt) // [0] | .[0]);
 
-
 def contig(contig_str):
 	.position.contig == contig_to_num(contig_str);
-

+ 1 - 1
src/config.rs

@@ -108,7 +108,7 @@ impl Default for Config {
 
             // DeepVariant
             deepvariant_output_dir: "{result_dir}/{id}/{time}/DeepVariant".to_string(),
-            deepvariant_threads: 100,
+            deepvariant_threads: 150,
             deepvariant_bin_version: "1.8.0".to_string(),
             deepvariant_model_type: "ONT_R104".to_string(),
             deepvariant_force: false,

+ 5 - 1
src/lib.rs

@@ -430,9 +430,13 @@ mod tests {
         let var_string = variant.into_vcf_row();
         assert_eq!(row, &var_string);
 
-         let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/1:1:24:3,5:0.208333";
+        let row = "chr1\t52232\t.\tC\tCT\t18\t.\t.\tGT:GQ:DP:AD:AF\t1/1:1:24:3,5:0.208333";
         let variant_b: VcfVariant = row.parse()?;
         assert_eq!(variant, variant_b);
+        let row = "chr1\t475157\t.\tA\tG\t12.301\tPASS\tH;FAU=2;FCU=0;FGU=2;FTU=0;RAU=3;RCU=0;RGU=3;RTU=0\tGT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU\t0/1:12:10:0.5:0,5:0.0769:13:0,1:5:0:5:0:12:0:1:0";
+        let variant: VcfVariant = row.parse()?;
+        let var_string = variant.into_vcf_row();
+        assert_eq!(row, &var_string);
 
         Ok(())
     }

+ 3 - 0
src/runners.rs

@@ -105,6 +105,9 @@ impl Run for DockerRun {
             std::process::exit(1);
         });
 
+        // limit memory
+        self.args.insert(0, "--memory=400g".to_string());
+
         // Spawn the main command
         let output = Command::new("docker")
             .args(&self.args)

+ 176 - 80
src/variant/variant.rs

@@ -354,6 +354,7 @@ impl fmt::Display for Infos {
     }
 }
 
+#[allow(non_camel_case_types)]
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum Info {
     Empty,
@@ -369,9 +370,42 @@ pub enum Info {
     RGU(u32),
     RTU(u32),
     SVTYPE(SVType),
+    MATEID(String),
+    NORMAL_READ_SUPPORT(u32),
+    TUMOUR_READ_SUPPORT(u32),
+    NORMAL_ALN_SUPPORT(u32),
+    TUMOUR_ALN_SUPPORT(u32),
     SVLEN(i32),
+    TUMOUR_DP_BEFORE(Vec<u32>),
+    TUMOUR_DP_AT(Vec<u32>),
+    TUMOUR_DP_AFTER(Vec<u32>),
+    NORMAL_DP_BEFORE(Vec<u32>),
+    NORMAL_DP_AT(Vec<u32>),
+    NORMAL_DP_AFTER(Vec<u32>),
+    TUMOUR_AF(Vec<f32>),
+    NORMAL_AF(Vec<f32>),
+    BP_NOTATION(String),
+    SOURCE(String),
+    CLUSTERED_READS_TUMOUR(u32),
+    CLUSTERED_READS_NORMAL(u32),
+    TUMOUR_ALT_HP(Vec<u32>),
+    TUMOUR_PS(Vec<String>),
+    NORMAL_ALT_HP(Vec<u32>),
+    NORMAL_PS(Vec<String>),
+    TUMOUR_TOTAL_HP_AT(Vec<u32>),
+    NORMAL_TOTAL_HP_AT(Vec<u32>),
+    ORIGIN_STARTS_STD_DEV(f32),
+    ORIGIN_MAPQ_MEAN(f32),
+    ORIGIN_EVENT_SIZE_STD_DEV(f32),
+    ORIGIN_EVENT_SIZE_MEDIAN(f32),
+    ORIGIN_EVENT_SIZE_MEAN(f32),
+    END_STARTS_STD_DEV(f32),
+    END_MAPQ_MEAN(f32),
+    END_EVENT_SIZE_STD_DEV(f32),
+    END_EVENT_SIZE_MEDIAN(f32),
+    END_EVENT_SIZE_MEAN(f32),
+    CLASS(String),
     END(u32),
-    MATEID(String),
     SVINSLEN(u32),
     SVINSSEQ(String),
 }
@@ -380,70 +414,63 @@ impl FromStr for Info {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> anyhow::Result<Self> {
-        if s.contains("=") {
+        if s.contains('=') {
             let (key, value) = s
                 .split_once('=')
-                .context(format!("Can't split with `=` {s}"))?;
+                .context(format!("Can't split with `=` in string: {s}"))?;
+
             Ok(match key {
-                "FAU" => Info::FAU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "FCU" => Info::FCU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "FGU" => Info::FGU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "FTU" => Info::FTU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "RAU" => Info::RAU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "RCU" => Info::RCU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "RGU" => Info::RGU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "RTU" => Info::RTU(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
+                "FAU" => Info::FAU(parse_value(value, key)?),
+                "FCU" => Info::FCU(parse_value(value, key)?),
+                "FGU" => Info::FGU(parse_value(value, key)?),
+                "FTU" => Info::FTU(parse_value(value, key)?),
+                "RAU" => Info::RAU(parse_value(value, key)?),
+                "RCU" => Info::RCU(parse_value(value, key)?),
+                "RGU" => Info::RGU(parse_value(value, key)?),
+                "RTU" => Info::RTU(parse_value(value, key)?),
+                "SVLEN" => Info::SVLEN(parse_value(value, key)?),
+                "END" => Info::END(parse_value(value, key)?),
+                "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
                 "SVTYPE" => Info::SVTYPE(value.parse()?),
-                "SVLEN" => Info::SVLEN(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
-                "END" => Info::END(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
                 "MATEID" => Info::MATEID(value.to_string()),
-                "SVINSLEN" => Info::SVINSLEN(
-                    value
-                        .parse()
-                        .context(format!("Can't parse into u32: {value}"))?,
-                ),
+                "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
+                "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
+                "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
+                "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
                 "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
-
+                "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
+                "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
+                "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
+                "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
+                "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
+                "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
+                "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
+                "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
+                "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
+                "SOURCE" => Info::SOURCE(value.to_string()),
+                "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
+                "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
+                "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
+                "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
+                "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
+                "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
+                "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
+                "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
+                "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
+                "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
+                "ORIGIN_EVENT_SIZE_STD_DEV" => {
+                    Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
+                }
+                "ORIGIN_EVENT_SIZE_MEDIAN" => {
+                    Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
+                }
+                "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
+                "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
+                "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
+                "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
+                "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
+                "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
+                "CLASS" => Info::CLASS(value.to_string()),
                 _ => Info::Empty,
             })
         } else {
@@ -451,7 +478,6 @@ impl FromStr for Info {
                 "H" => Info::H,
                 "F" => Info::F,
                 "P" => Info::P,
-
                 _ => Info::Empty,
             })
         }
@@ -479,10 +505,50 @@ impl fmt::Display for Info {
             Info::MATEID(v) => write!(f, "MATEID={v}"),
             Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
             Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
+            Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
+            Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
+            Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
+            Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
+            Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
+            Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
+            Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
+            Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
+            Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
+            Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
+            Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
+            Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
+            Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
+            Info::SOURCE(v) => write!(f, "SOURCE={v}"),
+            Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
+            Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
+            Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
+            Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
+            Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
+            Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
+            Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
+            Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
+            Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
+            Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
+            Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
+            Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
+            Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
+            Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
+            Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
+            Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
+            Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
+            Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
+            Info::CLASS(v) => write!(f, "CLASS={v}"),
         }
     }
 }
 
+pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
+    v.iter()
+        .map(|n| n.to_string())
+        .collect::<Vec<String>>()
+        .join(",")
+}
+
 /// Format
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum Format {
@@ -495,8 +561,9 @@ pub enum Format {
     PL(Vec<u32>),
 
     // Clairs
+    // when format begins with N: normal
     AF(f32),
-    NAF(u32),
+    NAF(f32), // DP(u32),
     NDP(u32),
     NAD(Vec<u32>),
     AU(u32),
@@ -558,28 +625,57 @@ impl TryFrom<(&str, &str)> for Format {
     type Error = anyhow::Error;
 
     fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
-        Ok(match key {
+        let format = match key {
             "GT" => Format::GT(value.to_string()),
-            "GQ" => Format::GQ(value.parse().context(format!("Can't parse GQ: {value}"))?),
-            "DP" => Format::DP(value.parse().context(format!("Can't parse DP: {value}"))?),
-            "AD" => Format::AD(
-                value
-                    .split(',')
-                    .map(|e| e.parse().context("Failed to parse AD"))
-                    .collect::<anyhow::Result<Vec<_>>>()?,
-            ),
-            "VAF" => Format::VAF(value.parse().context(format!("Can't parse VAF: {value}"))?),
-            "PL" => Format::PL(
-                value
-                    .split(',')
-                    .map(|e| e.parse().context("Failed to parse AD"))
-                    .collect::<anyhow::Result<Vec<_>>>()?,
-            ),
-            "TR" => Format::TR(value.parse()?),
-            "VR" => Format::TR(value.parse()?),
+            "GQ" => Format::GQ(parse_value(value, key)?),
+            "DP" => Format::DP(parse_value(value, key)?),
+            "AD" => Format::AD(parse_vec_value(value, key)?),
+            "VAF" => Format::VAF(parse_value(value, key)?),
+            "AF" => Format::AF(parse_value(value, key)?),
+            "NAF" => Format::NAF(parse_value(value, key)?),
+            "NDP" => Format::NDP(parse_value(value, key)?),
+            "NAD" => Format::NAD(parse_vec_value(value, key)?),
+            "AU" => Format::AU(parse_value(value, key)?),
+            "CU" => Format::CU(parse_value(value, key)?),
+            "GU" => Format::GU(parse_value(value, key)?),
+            "TU" => Format::TU(parse_value(value, key)?),
+            "NAU" => Format::NAU(parse_value(value, key)?),
+            "NCU" => Format::NCU(parse_value(value, key)?),
+            "NGU" => Format::NGU(parse_value(value, key)?),
+            "NTU" => Format::NTU(parse_value(value, key)?),
+            "PL" => Format::PL(parse_vec_value(value, key)?),
+            "TR" => Format::TR(parse_value(value, key)?),
+            "VR" => Format::VR(parse_value(value, key)?),
             _ => Format::Other((key.to_string(), value.to_string())),
+        };
+        Ok(format)
+    }
+}
+
+// Helper function to parse a single value (DeepSeek)
+fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
+where
+    T::Err: std::fmt::Debug,
+{
+    value
+        .parse()
+        .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
+        .context(format!("Can't parse {}: {}", key, value)) // Add context
+}
+
+// Helper function to parse comma-separated values (DeepSeek)
+fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
+where
+    T::Err: std::fmt::Debug,
+{
+    value
+        .split(',')
+        .map(|e| {
+            e.parse()
+                .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
+                .context(format!("Failed to parse {}: {}", key, e)) // Add context
         })
-    }
+        .collect()
 }
 
 impl From<Format> for (String, String) {