| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185 |
- use std::{collections::HashMap, fs::File, io::{BufRead, BufReader}, path::Path};
- use anyhow::{anyhow, Context, Result};
- #[derive(Clone, Copy, Debug, PartialEq, Eq)]
- pub enum Strand {
- Plus,
- Minus,
- Unknown,
- }
- #[derive(Clone, Copy, Debug)]
- pub struct Interval {
- pub start: u64, // 1-based inclusive
- pub end: u64, // 1-based inclusive
- }
- #[derive(Debug)]
- pub struct GeneModel {
- pub gene_id: String,
- pub chrom: String,
- pub strand: Strand,
- pub start: u64,
- pub end: u64,
- pub exons: Vec<Interval>, // merged, sorted, non-overlapping (genomic coords)
- }
- pub fn load_gene_model_from_gff3<P: AsRef<Path>>(
- path: P,
- transcript_id: &str,
- ) -> Result<GeneModel> {
- let f = File::open(&path)
- .with_context(|| format!("Failed to open GFF3 at {}", path.as_ref().display()))?;
- let reader = BufReader::new(f);
- let mut chrom: Option<String> = None;
- let mut strand: Option<Strand> = None;
- let mut tx_start: Option<u64> = None;
- let mut tx_end: Option<u64> = None;
- let mut exon_intervals: Vec<Interval> = Vec::new();
- let mut gene_name: Option<String> = None;
- for (lineno, line) in reader.lines().enumerate() {
- let line = line.with_context(|| format!("Error reading line {}", lineno + 1))?;
- if line.is_empty() || line.starts_with('#') {
- continue;
- }
- let mut fields = line.split('\t');
- let seqid = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (seqid) at line {}", lineno + 1))?;
- let _source = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (source) at line {}", lineno + 1))?;
- let ftype = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (type) at line {}", lineno + 1))?;
- let start_s = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (start) at line {}", lineno + 1))?;
- let end_s = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (end) at line {}", lineno + 1))?;
- let _score = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (score) at line {}", lineno + 1))?;
- let strand_s = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (strand) at line {}", lineno + 1))?;
- let _phase = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (phase) at line {}", lineno + 1))?;
- let attrs_s = fields
- .next()
- .ok_or_else(|| anyhow!("Malformed GFF3 (attributes) at line {}", lineno + 1))?;
- let start: u64 = start_s
- .parse()
- .with_context(|| format!("Invalid start at line {}: {}", lineno + 1, start_s))?;
- let end: u64 = end_s
- .parse()
- .with_context(|| format!("Invalid end at line {}: {}", lineno + 1, end_s))?;
- let s = match strand_s {
- "+" => Strand::Plus,
- "-" => Strand::Minus,
- "." => Strand::Unknown,
- _ => Strand::Unknown,
- };
- let attrs = parse_attrs(attrs_s);
- // Capture transcript row
- if ftype.eq_ignore_ascii_case("transcript") {
- if let Some(id) = attrs.get("ID") {
- if id == transcript_id {
- chrom = Some(seqid.to_string());
- strand = Some(s);
- tx_start = Some(start);
- tx_end = Some(end);
- if let Some(gene) = attrs.get("gene") {
- gene_name = Some(gene.clone());
- }
- }
- }
- }
- // Capture exons of this transcript
- if ftype.eq_ignore_ascii_case("exon") {
- if let Some(parent) = attrs.get("Parent") {
- if parent == transcript_id {
- if chrom.is_none() {
- chrom = Some(seqid.to_string());
- }
- if strand.is_none() {
- strand = Some(s);
- }
- exon_intervals.push(Interval { start, end });
- // If no explicit transcript record, derive span from exons
- tx_start = Some(tx_start.map_or(start, |v| v.min(start)));
- tx_end = Some(tx_end.map_or(end, |v| v.max(end)));
- if gene_name.is_none() {
- if let Some(gene) = attrs.get("gene") {
- gene_name = Some(gene.clone());
- }
- }
- }
- }
- }
- }
- if exon_intervals.is_empty() {
- return Err(anyhow!(
- "No exons found for transcript ID '{}' in {}",
- transcript_id,
- path.as_ref().display()
- ));
- }
- // Merge/sort exons
- exon_intervals.sort_by_key(|iv| (iv.start, iv.end));
- let mut merged: Vec<Interval> = Vec::with_capacity(exon_intervals.len());
- for iv in exon_intervals {
- if let Some(last) = merged.last_mut() {
- if iv.start <= last.end.saturating_add(1) {
- if iv.end > last.end {
- last.end = iv.end;
- }
- } else {
- merged.push(iv);
- }
- } else {
- merged.push(iv);
- }
- }
- let gm = GeneModel {
- gene_id: gene_name.unwrap_or_else(|| transcript_id.to_string()),
- chrom: chrom.ok_or_else(|| anyhow!("Chromosome not determined for '{}'", transcript_id))?,
- strand: strand.unwrap_or(Strand::Unknown),
- start: tx_start.ok_or_else(|| anyhow!("Start not determined for '{}'", transcript_id))?,
- end: tx_end.ok_or_else(|| anyhow!("End not determined for '{}'", transcript_id))?,
- exons: merged,
- };
- Ok(gm)
- }
- fn parse_attrs(attr_field: &str) -> HashMap<String, String> {
- let mut m = HashMap::new();
- for kv in attr_field.split(';') {
- if kv.is_empty() {
- continue;
- }
- let mut it = kv.splitn(2, '=');
- let k = it.next().unwrap().trim();
- if let Some(v) = it.next() {
- // GFF3 allows percent-encoding; here we keep it simple.
- m.insert(k.to_string(), v.trim().to_string());
- } else {
- m.insert(k.to_string(), String::new());
- }
- }
- m
- }
|