use std::{collections::HashMap, fs::File, io::{BufRead, BufReader}, path::Path}; use anyhow::{anyhow, Context, Result}; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Strand { Plus, Minus, Unknown, } #[derive(Clone, Copy, Debug)] pub struct Interval { pub start: u64, // 1-based inclusive pub end: u64, // 1-based inclusive } #[derive(Debug)] pub struct GeneModel { pub gene_id: String, pub chrom: String, pub strand: Strand, pub start: u64, pub end: u64, pub exons: Vec, // merged, sorted, non-overlapping (genomic coords) } pub fn load_gene_model_from_gff3>( path: P, transcript_id: &str, ) -> Result { let f = File::open(&path) .with_context(|| format!("Failed to open GFF3 at {}", path.as_ref().display()))?; let reader = BufReader::new(f); let mut chrom: Option = None; let mut strand: Option = None; let mut tx_start: Option = None; let mut tx_end: Option = None; let mut exon_intervals: Vec = Vec::new(); let mut gene_name: Option = None; for (lineno, line) in reader.lines().enumerate() { let line = line.with_context(|| format!("Error reading line {}", lineno + 1))?; if line.is_empty() || line.starts_with('#') { continue; } let mut fields = line.split('\t'); let seqid = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (seqid) at line {}", lineno + 1))?; let _source = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (source) at line {}", lineno + 1))?; let ftype = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (type) at line {}", lineno + 1))?; let start_s = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (start) at line {}", lineno + 1))?; let end_s = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (end) at line {}", lineno + 1))?; let _score = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (score) at line {}", lineno + 1))?; let strand_s = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (strand) at line {}", lineno + 1))?; let _phase = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (phase) at line {}", lineno + 1))?; let attrs_s = fields .next() .ok_or_else(|| anyhow!("Malformed GFF3 (attributes) at line {}", lineno + 1))?; let start: u64 = start_s .parse() .with_context(|| format!("Invalid start at line {}: {}", lineno + 1, start_s))?; let end: u64 = end_s .parse() .with_context(|| format!("Invalid end at line {}: {}", lineno + 1, end_s))?; let s = match strand_s { "+" => Strand::Plus, "-" => Strand::Minus, "." => Strand::Unknown, _ => Strand::Unknown, }; let attrs = parse_attrs(attrs_s); // Capture transcript row if ftype.eq_ignore_ascii_case("transcript") { if let Some(id) = attrs.get("ID") { if id == transcript_id { chrom = Some(seqid.to_string()); strand = Some(s); tx_start = Some(start); tx_end = Some(end); if let Some(gene) = attrs.get("gene") { gene_name = Some(gene.clone()); } } } } // Capture exons of this transcript if ftype.eq_ignore_ascii_case("exon") { if let Some(parent) = attrs.get("Parent") { if parent == transcript_id { if chrom.is_none() { chrom = Some(seqid.to_string()); } if strand.is_none() { strand = Some(s); } exon_intervals.push(Interval { start, end }); // If no explicit transcript record, derive span from exons tx_start = Some(tx_start.map_or(start, |v| v.min(start))); tx_end = Some(tx_end.map_or(end, |v| v.max(end))); if gene_name.is_none() { if let Some(gene) = attrs.get("gene") { gene_name = Some(gene.clone()); } } } } } } if exon_intervals.is_empty() { return Err(anyhow!( "No exons found for transcript ID '{}' in {}", transcript_id, path.as_ref().display() )); } // Merge/sort exons exon_intervals.sort_by_key(|iv| (iv.start, iv.end)); let mut merged: Vec = Vec::with_capacity(exon_intervals.len()); for iv in exon_intervals { if let Some(last) = merged.last_mut() { if iv.start <= last.end.saturating_add(1) { if iv.end > last.end { last.end = iv.end; } } else { merged.push(iv); } } else { merged.push(iv); } } let gm = GeneModel { gene_id: gene_name.unwrap_or_else(|| transcript_id.to_string()), chrom: chrom.ok_or_else(|| anyhow!("Chromosome not determined for '{}'", transcript_id))?, strand: strand.unwrap_or(Strand::Unknown), start: tx_start.ok_or_else(|| anyhow!("Start not determined for '{}'", transcript_id))?, end: tx_end.ok_or_else(|| anyhow!("End not determined for '{}'", transcript_id))?, exons: merged, }; Ok(gm) } fn parse_attrs(attr_field: &str) -> HashMap { let mut m = HashMap::new(); for kv in attr_field.split(';') { if kv.is_empty() { continue; } let mut it = kv.splitn(2, '='); let k = it.next().unwrap().trim(); if let Some(v) = it.next() { // GFF3 allows percent-encoding; here we keep it simple. m.insert(k.to_string(), v.trim().to_string()); } else { m.insert(k.to_string(), String::new()); } } m }