
callers: straglr

Thomas, 3 days ago · parent · commit 25db0587a1
1 changed file with 58 additions and 36 deletions

src/callers/straglr.rs (+58 -36)

@@ -640,11 +640,11 @@ impl StraglrSolo {
 
 /// Runs Straglr in parallel chunks for genome-wide STR genotyping.
 ///
-/// Splits the genome into `n_parts` regions, creates temporary BED files for each region,
-/// runs Straglr in parallel, and merges the results into a single TSV file.
+/// Splits the input BED file (`config.straglr_loci_bed`) into `n_parts` chunks,
+/// runs Straglr in parallel on each chunk, and merges the results into a single TSV file.
 ///
-/// This function is designed for whole-genome STR genotyping where processing the entire
-/// genome at once would be too slow. By splitting into chunks, multiple Straglr instances
+/// This function is designed for whole-genome STR genotyping where processing all loci
+/// at once would be too slow. By splitting into chunks, multiple Straglr instances
 /// can run in parallel.
 ///
 /// # Arguments
@@ -652,7 +652,7 @@ impl StraglrSolo {
 /// * `id` - Sample identifier
 /// * `time_point` - Time point label (e.g., "norm", "diag")
 /// * `config` - Global pipeline configuration
-/// * `n_parts` - Number of parallel chunks (will be adjusted if genome is smaller)
+/// * `n_parts` - Number of parallel chunks
 ///
 /// # Returns
 ///
@@ -662,7 +662,7 @@ impl StraglrSolo {
 ///
 /// Returns an error if:
 /// - `n_parts` is 0
-/// - BAM file cannot be opened or has no header
+/// - Input BED file cannot be read
 /// - Temporary BED files cannot be created
 /// - Any Straglr chunk fails to execute
 /// - TSV merging fails
@@ -670,9 +670,9 @@ impl StraglrSolo {
 ///
 /// # Implementation Details
 ///
-/// 1. Reads BAM header to determine genome sizes
-/// 2. Splits genome into approximately equal-sized regions
-/// 3. Creates temporary BED file for each region in `{tmp_dir}/straglr_chunk_{id}_{time}_{i}.bed`
+/// 1. Reads input BED file (`config.straglr_loci_bed`)
+/// 2. Splits BED lines into N approximately equal chunks
+/// 3. Creates temporary BED file for each chunk in `{tmp_dir}/straglr_chunk_{id}_{time}_{i}.bed`
 /// 4. Runs Straglr in parallel via `run_many!` macro
 /// 5. Concatenates all output TSV files (skipping headers from parts 2+)
 /// 6. Removes temporary BED and partial TSV files
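
For orientation, steps 1-3 of the updated doc comment can be sketched as a small standalone function. This is illustrative only: the function name, the temp-file pattern, and the plain io::Result error handling are simplified relative to the pipeline, which threads `id`, `time_point` and `config.tmp_dir` through the filenames and wraps errors with anyhow contexts.

    use std::fs::File;
    use std::io::{BufRead, BufReader, Write};

    /// Split the loci of a BED file into at most `n_parts` chunk files of
    /// roughly equal size; returns the paths of the chunk files written.
    fn split_bed_into_chunks(
        bed_path: &str,
        tmp_dir: &str,
        n_parts: usize,
    ) -> std::io::Result<Vec<String>> {
        assert!(n_parts > 0, "n_parts must be non-zero");

        // Keep only real locus lines (no comments, no blank lines).
        let reader = BufReader::new(File::open(bed_path)?);
        let loci: Vec<String> = reader
            .lines()
            .collect::<std::io::Result<Vec<_>>>()?
            .into_iter()
            .filter(|l| !l.starts_with('#') && !l.trim().is_empty())
            .collect();
        if loci.is_empty() {
            return Ok(Vec::new());
        }

        // Ceiling division: every chunk holds chunk_size loci except possibly the last.
        let chunk_size = (loci.len() + n_parts - 1) / n_parts;

        let mut chunk_paths = Vec::new();
        for (i, chunk) in loci.chunks(chunk_size).enumerate() {
            let path = format!("{}/straglr_chunk_part{}.bed", tmp_dir, i + 1);
            let mut f = File::create(&path)?;
            for line in chunk {
                writeln!(f, "{}", line)?;
            }
            f.flush()?;
            chunk_paths.push(path);
        }
        Ok(chunk_paths)
    }

Note that the number of chunks actually produced is ceil(total_loci / chunk_size), which is at most `n_parts` and can be smaller when there are fewer loci than requested parts; the patched code computes this as `actual_n_parts` before creating jobs.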
@@ -702,20 +702,35 @@ pub fn run_straglr_chunked(
         n_parts, id, time_point
     );
 
-    // Get genome sizes from BAM header
-    let bam_path = config.solo_bam(id, time_point);
-    let reader = bam::Reader::from_path(&bam_path)
-        .with_context(|| format!("Failed to open BAM: {}", bam_path))?;
-    let header = bam::Header::from_template(reader.header());
-    let genome_sizes = get_genome_sizes(&header)?;
+    // Read input BED file
+    let bed_path = &config.straglr_loci_bed;
+    let bed_file = File::open(bed_path)
+        .context(format!("Failed to open BED file: {}", bed_path))?;
+    let reader = BufReader::new(bed_file);
+
+    // Read all BED lines (skip comments and empty lines)
+    let mut bed_lines: Vec<String> = Vec::new();
+    for line in reader.lines() {
+        let line = line.context("Failed to read line from BED file")?;
+        if !line.starts_with('#') && !line.trim().is_empty() {
+            bed_lines.push(line);
+        }
+    }
+
+    let total_loci = bed_lines.len();
+    if total_loci == 0 {
+        anyhow::bail!("Input BED file is empty: {}", bed_path);
+    }
+
+    info!("Read {} loci from {}", total_loci, bed_path);
 
-    // Split genome into regions
-    let region_chunks = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
-    let actual_n_parts = region_chunks.len();
+    // Calculate chunk sizes
+    let chunk_size = (total_loci + n_parts - 1) / n_parts; // ceil division
+    let actual_n_parts = (total_loci + chunk_size - 1) / chunk_size;
 
     info!(
-        "Split genome into {} chunks for Straglr processing",
-        actual_n_parts
+        "Splitting {} loci into {} chunks (~{} loci per chunk)",
+        total_loci, actual_n_parts, chunk_size
     );
 
     // Create output directory
@@ -723,36 +738,43 @@ pub fn run_straglr_chunked(
     fs::create_dir_all(&output_dir)
         .context(format!("Failed to create output directory: {}", output_dir))?;
 
+    let bam_path = config.solo_bam(id, time_point);
+
     // Create temporary BED files and jobs
     let mut jobs = Vec::with_capacity(actual_n_parts);
     let mut temp_bed_files = Vec::with_capacity(actual_n_parts);
     let mut temp_tsv_files = Vec::with_capacity(actual_n_parts);
 
-    for (i, regions) in region_chunks.into_iter().enumerate() {
+    for i in 0..actual_n_parts {
         let part_num = i + 1;
+        let start_idx = i * chunk_size;
+        let end_idx = ((i + 1) * chunk_size).min(total_loci);
 
         // Create temporary BED file for this chunk
-        let bed_path = format!(
+        let bed_chunk_path = format!(
             "{}/straglr_chunk_{}_{}_part{}.bed",
             config.tmp_dir, id, time_point, part_num
         );
 
-        let mut bed_file = File::create(&bed_path)
-            .context(format!("Failed to create temporary BED file: {}", bed_path))?;
+        let mut bed_chunk_file = File::create(&bed_chunk_path)
+            .context(format!("Failed to create temporary BED file: {}", bed_chunk_path))?;
 
-        // Write regions to BED file (format: chr\tstart\tend)
-        for region_str in &regions {
-            // Parse region format: "chr1:1000-2000" -> "chr1\t1000\t2000"
-            if let Some((chr, range)) = region_str.split_once(':') {
-                if let Some((start, end)) = range.split_once('-') {
-                    writeln!(bed_file, "{}\t{}\t{}", chr, start, end)
-                        .context("Failed to write to BED file")?;
-                }
-            }
+        // Write chunk of BED lines
+        for line in &bed_lines[start_idx..end_idx] {
+            writeln!(bed_chunk_file, "{}", line)
+                .context("Failed to write to BED chunk file")?;
         }
-        bed_file.flush().context("Failed to flush BED file")?;
+        bed_chunk_file.flush().context("Failed to flush BED chunk file")?;
+
+        debug!(
+            "Created chunk {} with {} loci (lines {}-{})",
+            part_num,
+            end_idx - start_idx,
+            start_idx + 1,
+            end_idx
+        );
 
-        temp_bed_files.push(bed_path.clone());
+        temp_bed_files.push(bed_chunk_path.clone());
 
         // Create job for this chunk
         let output_prefix = format!("{}/{}_{}_part{}", output_dir, id, time_point, part_num);
@@ -764,7 +786,7 @@ pub fn run_straglr_chunked(
             straglr_bin: config.straglr_bin.clone(),
             bam: bam_path.clone(),
             reference: config.reference.clone(),
-            loci_bed: bed_path, // Use the chunk BED file instead of global loci
+            loci_bed: bed_chunk_path,
             output_prefix,
             min_support: config.straglr_min_support,
             min_cluster_size: config.straglr_min_cluster_size,
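
The excerpt stops inside the per-chunk job construction; the concatenation described in steps 5-6 of the doc comment happens further down in the function and is not shown here. A minimal sketch of a header-skipping merge is below, assuming each part's TSV begins with '#'-prefixed header lines; the helper name and the exact header convention are assumptions, not taken from this commit.

    use std::fs::File;
    use std::io::{BufRead, BufReader, Write};

    /// Concatenate per-chunk TSV outputs into a single file, keeping the
    /// '#'-prefixed header lines only from the first part.
    fn merge_tsv_parts(part_paths: &[String], merged_path: &str) -> std::io::Result<()> {
        let mut out = File::create(merged_path)?;
        for (i, path) in part_paths.iter().enumerate() {
            let reader = BufReader::new(File::open(path)?);
            for line in reader.lines() {
                let line = line?;
                // Assumed convention: header lines start with '#'; drop them for parts 2+.
                if i > 0 && line.starts_with('#') {
                    continue;
                }
                writeln!(out, "{}", line)?;
            }
        }
        out.flush()
    }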