@@ -640,11 +640,11 @@ impl StraglrSolo {

/// Runs Straglr in parallel chunks for genome-wide STR genotyping.
///
-/// Splits the genome into `n_parts` regions, creates temporary BED files for each region,
-/// runs Straglr in parallel, and merges the results into a single TSV file.
+/// Splits the input BED file (`config.straglr_loci_bed`) into `n_parts` chunks,
+/// runs Straglr in parallel on each chunk, and merges the results into a single TSV file.
///
-/// This function is designed for whole-genome STR genotyping where processing the entire
-/// genome at once would be too slow. By splitting into chunks, multiple Straglr instances
+/// This function is designed for whole-genome STR genotyping where processing all loci
+/// at once would be too slow. By splitting into chunks, multiple Straglr instances
/// can run in parallel.
///
/// # Arguments
@@ -652,7 +652,7 @@ impl StraglrSolo {
/// * `id` - Sample identifier
/// * `time_point` - Time point label (e.g., "norm", "diag")
/// * `config` - Global pipeline configuration
-/// * `n_parts` - Number of parallel chunks (will be adjusted if genome is smaller)
+/// * `n_parts` - Requested number of parallel chunks (the actual number of chunks may be lower)
///
/// # Returns
///
@@ -662,7 +662,7 @@ impl StraglrSolo {
///
/// Returns an error if:
/// - `n_parts` is 0
-/// - BAM file cannot be opened or has no header
+/// - Input BED file cannot be read
/// - Temporary BED files cannot be created
/// - Any Straglr chunk fails to execute
/// - TSV merging fails
@@ -670,9 +670,9 @@ impl StraglrSolo {
///
/// # Implementation Details
///
-/// 1. Reads BAM header to determine genome sizes
-/// 2. Splits genome into approximately equal-sized regions
-/// 3. Creates temporary BED file for each region in `{tmp_dir}/straglr_chunk_{id}_{time}_{i}.bed`
+/// 1. Reads the input BED file (`config.straglr_loci_bed`), skipping comments and empty lines
+/// 2. Splits the BED lines into up to `n_parts` approximately equal chunks
+/// 3. Creates a temporary BED file for each chunk at `{tmp_dir}/straglr_chunk_{id}_{time_point}_part{N}.bed`
/// 4. Runs Straglr in parallel via `run_many!` macro
/// 5. Concatenates all output TSV files (skipping headers from parts 2+)
/// 6. Removes temporary BED and partial TSV files
@@ -702,20 +702,35 @@ pub fn run_straglr_chunked(
        n_parts, id, time_point
    );

-    // Get genome sizes from BAM header
-    let bam_path = config.solo_bam(id, time_point);
-    let reader = bam::Reader::from_path(&bam_path)
-        .with_context(|| format!("Failed to open BAM: {}", bam_path))?;
-    let header = bam::Header::from_template(reader.header());
-    let genome_sizes = get_genome_sizes(&header)?;
+    // Read input BED file
+    let bed_path = &config.straglr_loci_bed;
+    let bed_file = File::open(bed_path)
+        .context(format!("Failed to open BED file: {}", bed_path))?;
+    let reader = BufReader::new(bed_file);
+
+    // Read all BED lines (skip comments and empty lines)
+    let mut bed_lines: Vec<String> = Vec::new();
+    for line in reader.lines() {
+        let line = line.context("Failed to read line from BED file")?;
+        if !line.starts_with('#') && !line.trim().is_empty() {
+            bed_lines.push(line);
+        }
+    }
+
+    let total_loci = bed_lines.len();
+    if total_loci == 0 {
+        anyhow::bail!("Input BED file is empty: {}", bed_path);
+    }
+
+    info!("Read {} loci from {}", total_loci, bed_path);

-    // Split genome into regions
-    let region_chunks = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
-    let actual_n_parts = region_chunks.len();
+    // Calculate chunk sizes
+    let chunk_size = (total_loci + n_parts - 1) / n_parts; // ceil division
+    let actual_n_parts = (total_loci + chunk_size - 1) / chunk_size;

    info!(
-        "Split genome into {} chunks for Straglr processing",
-        actual_n_parts
+        "Splitting {} loci into {} chunks (~{} loci per chunk)",
+        total_loci, actual_n_parts, chunk_size
    );

    // Create output directory
@@ -723,36 +738,43 @@ pub fn run_straglr_chunked(
    fs::create_dir_all(&output_dir)
        .context(format!("Failed to create output directory: {}", output_dir))?;

+    let bam_path = config.solo_bam(id, time_point);
+
    // Create temporary BED files and jobs
    let mut jobs = Vec::with_capacity(actual_n_parts);
    let mut temp_bed_files = Vec::with_capacity(actual_n_parts);
    let mut temp_tsv_files = Vec::with_capacity(actual_n_parts);

-    for (i, regions) in region_chunks.into_iter().enumerate() {
+    for i in 0..actual_n_parts {
        let part_num = i + 1;
+        let start_idx = i * chunk_size;
+        let end_idx = ((i + 1) * chunk_size).min(total_loci);

        // Create temporary BED file for this chunk
-        let bed_path = format!(
+        let bed_chunk_path = format!(
            "{}/straglr_chunk_{}_{}_part{}.bed",
            config.tmp_dir, id, time_point, part_num
        );

-        let mut bed_file = File::create(&bed_path)
-            .context(format!("Failed to create temporary BED file: {}", bed_path))?;
+        let mut bed_chunk_file = File::create(&bed_chunk_path)
+            .context(format!("Failed to create temporary BED file: {}", bed_chunk_path))?;

-        // Write regions to BED file (format: chr\tstart\tend)
-        for region_str in &regions {
-            // Parse region format: "chr1:1000-2000" -> "chr1\t1000\t2000"
-            if let Some((chr, range)) = region_str.split_once(':') {
-                if let Some((start, end)) = range.split_once('-') {
-                    writeln!(bed_file, "{}\t{}\t{}", chr, start, end)
-                        .context("Failed to write to BED file")?;
-                }
-            }
+        // Write chunk of BED lines
+        for line in &bed_lines[start_idx..end_idx] {
+            writeln!(bed_chunk_file, "{}", line)
+                .context("Failed to write to BED chunk file")?;
        }
-        bed_file.flush().context("Failed to flush BED file")?;
+        bed_chunk_file.flush().context("Failed to flush BED chunk file")?;
+
+        debug!(
+            "Created chunk {} with {} loci (lines {}-{})",
+            part_num,
+            end_idx - start_idx,
+            start_idx + 1,
+            end_idx
+        );

-        temp_bed_files.push(bed_path.clone());
+        temp_bed_files.push(bed_chunk_path.clone());

        // Create job for this chunk
        let output_prefix = format!("{}/{}_{}_part{}", output_dir, id, time_point, part_num);
@@ -764,7 +786,7 @@ pub fn run_straglr_chunked(
            straglr_bin: config.straglr_bin.clone(),
            bam: bam_path.clone(),
            reference: config.reference.clone(),
-            loci_bed: bed_path, // Use the chunk BED file instead of global loci
+            loci_bed: bed_chunk_path,
            output_prefix,
            min_support: config.straglr_min_support,
            min_cluster_size: config.straglr_min_cluster_size,
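
For reference, a minimal standalone sketch (not part of the patch) of the ceiling-division chunking the new code relies on; the `split_into_chunks` helper and the example values below are illustrative assumptions:

// Illustrative sketch only: mirrors the chunk-size arithmetic introduced in
// run_straglr_chunked (chunk_size = ceil(total / n_parts)); names and values
// here are hypothetical.
fn split_into_chunks(lines: &[String], n_parts: usize) -> Vec<&[String]> {
    assert!(n_parts > 0, "n_parts must be non-zero");
    assert!(!lines.is_empty(), "the patch bails out earlier on an empty BED file");
    // Ceiling division: every chunk except possibly the last holds `chunk_size` lines.
    let chunk_size = (lines.len() + n_parts - 1) / n_parts;
    lines.chunks(chunk_size).collect()
}

fn main() {
    // 10 hypothetical loci split into 4 chunks -> sizes 3, 3, 3, 1.
    let loci: Vec<String> = (1..=10)
        .map(|i| format!("chr1\t{}\t{}", i * 100, i * 100 + 50))
        .collect();
    for (i, chunk) in split_into_chunks(&loci, 4).iter().enumerate() {
        println!("chunk {}: {} loci", i + 1, chunk.len());
    }
}

The number of slices produced by `chunks(chunk_size)` equals ceil(total / chunk_size), which is the same quantity the patch computes as `actual_n_parts`.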