gene_model.rs 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. use std::{collections::HashMap, fs::File, io::{BufRead, BufReader}, path::Path};
  2. use anyhow::{anyhow, Context, Result};
  3. #[derive(Clone, Copy, Debug, PartialEq, Eq)]
  4. pub enum Strand {
  5. Plus,
  6. Minus,
  7. Unknown,
  8. }
  9. #[derive(Clone, Copy, Debug)]
  10. pub struct Interval {
  11. pub start: u64, // 1-based inclusive
  12. pub end: u64, // 1-based inclusive
  13. }
  14. #[derive(Debug)]
  15. pub struct GeneModel {
  16. pub gene_id: String,
  17. pub chrom: String,
  18. pub strand: Strand,
  19. pub start: u64,
  20. pub end: u64,
  21. pub exons: Vec<Interval>, // merged, sorted, non-overlapping (genomic coords)
  22. }
  23. pub fn load_gene_model_from_gff3<P: AsRef<Path>>(
  24. path: P,
  25. transcript_id: &str,
  26. ) -> Result<GeneModel> {
  27. let f = File::open(&path)
  28. .with_context(|| format!("Failed to open GFF3 at {}", path.as_ref().display()))?;
  29. let reader = BufReader::new(f);
  30. let mut chrom: Option<String> = None;
  31. let mut strand: Option<Strand> = None;
  32. let mut tx_start: Option<u64> = None;
  33. let mut tx_end: Option<u64> = None;
  34. let mut exon_intervals: Vec<Interval> = Vec::new();
  35. let mut gene_name: Option<String> = None;
  36. for (lineno, line) in reader.lines().enumerate() {
  37. let line = line.with_context(|| format!("Error reading line {}", lineno + 1))?;
  38. if line.is_empty() || line.starts_with('#') {
  39. continue;
  40. }
  41. let mut fields = line.split('\t');
  42. let seqid = fields
  43. .next()
  44. .ok_or_else(|| anyhow!("Malformed GFF3 (seqid) at line {}", lineno + 1))?;
  45. let _source = fields
  46. .next()
  47. .ok_or_else(|| anyhow!("Malformed GFF3 (source) at line {}", lineno + 1))?;
  48. let ftype = fields
  49. .next()
  50. .ok_or_else(|| anyhow!("Malformed GFF3 (type) at line {}", lineno + 1))?;
  51. let start_s = fields
  52. .next()
  53. .ok_or_else(|| anyhow!("Malformed GFF3 (start) at line {}", lineno + 1))?;
  54. let end_s = fields
  55. .next()
  56. .ok_or_else(|| anyhow!("Malformed GFF3 (end) at line {}", lineno + 1))?;
  57. let _score = fields
  58. .next()
  59. .ok_or_else(|| anyhow!("Malformed GFF3 (score) at line {}", lineno + 1))?;
  60. let strand_s = fields
  61. .next()
  62. .ok_or_else(|| anyhow!("Malformed GFF3 (strand) at line {}", lineno + 1))?;
  63. let _phase = fields
  64. .next()
  65. .ok_or_else(|| anyhow!("Malformed GFF3 (phase) at line {}", lineno + 1))?;
  66. let attrs_s = fields
  67. .next()
  68. .ok_or_else(|| anyhow!("Malformed GFF3 (attributes) at line {}", lineno + 1))?;
  69. let start: u64 = start_s
  70. .parse()
  71. .with_context(|| format!("Invalid start at line {}: {}", lineno + 1, start_s))?;
  72. let end: u64 = end_s
  73. .parse()
  74. .with_context(|| format!("Invalid end at line {}: {}", lineno + 1, end_s))?;
  75. let s = match strand_s {
  76. "+" => Strand::Plus,
  77. "-" => Strand::Minus,
  78. "." => Strand::Unknown,
  79. _ => Strand::Unknown,
  80. };
  81. let attrs = parse_attrs(attrs_s);
  82. // Capture transcript row
  83. if ftype.eq_ignore_ascii_case("transcript") {
  84. if let Some(id) = attrs.get("ID") {
  85. if id == transcript_id {
  86. chrom = Some(seqid.to_string());
  87. strand = Some(s);
  88. tx_start = Some(start);
  89. tx_end = Some(end);
  90. if let Some(gene) = attrs.get("gene") {
  91. gene_name = Some(gene.clone());
  92. }
  93. }
  94. }
  95. }
  96. // Capture exons of this transcript
  97. if ftype.eq_ignore_ascii_case("exon") {
  98. if let Some(parent) = attrs.get("Parent") {
  99. if parent == transcript_id {
  100. if chrom.is_none() {
  101. chrom = Some(seqid.to_string());
  102. }
  103. if strand.is_none() {
  104. strand = Some(s);
  105. }
  106. exon_intervals.push(Interval { start, end });
  107. // If no explicit transcript record, derive span from exons
  108. tx_start = Some(tx_start.map_or(start, |v| v.min(start)));
  109. tx_end = Some(tx_end.map_or(end, |v| v.max(end)));
  110. if gene_name.is_none() {
  111. if let Some(gene) = attrs.get("gene") {
  112. gene_name = Some(gene.clone());
  113. }
  114. }
  115. }
  116. }
  117. }
  118. }
  119. if exon_intervals.is_empty() {
  120. return Err(anyhow!(
  121. "No exons found for transcript ID '{}' in {}",
  122. transcript_id,
  123. path.as_ref().display()
  124. ));
  125. }
  126. // Merge/sort exons
  127. exon_intervals.sort_by_key(|iv| (iv.start, iv.end));
  128. let mut merged: Vec<Interval> = Vec::with_capacity(exon_intervals.len());
  129. for iv in exon_intervals {
  130. if let Some(last) = merged.last_mut() {
  131. if iv.start <= last.end.saturating_add(1) {
  132. if iv.end > last.end {
  133. last.end = iv.end;
  134. }
  135. } else {
  136. merged.push(iv);
  137. }
  138. } else {
  139. merged.push(iv);
  140. }
  141. }
  142. let gm = GeneModel {
  143. gene_id: gene_name.unwrap_or_else(|| transcript_id.to_string()),
  144. chrom: chrom.ok_or_else(|| anyhow!("Chromosome not determined for '{}'", transcript_id))?,
  145. strand: strand.unwrap_or(Strand::Unknown),
  146. start: tx_start.ok_or_else(|| anyhow!("Start not determined for '{}'", transcript_id))?,
  147. end: tx_end.ok_or_else(|| anyhow!("End not determined for '{}'", transcript_id))?,
  148. exons: merged,
  149. };
  150. Ok(gm)
  151. }
  152. fn parse_attrs(attr_field: &str) -> HashMap<String, String> {
  153. let mut m = HashMap::new();
  154. for kv in attr_field.split(';') {
  155. if kv.is_empty() {
  156. continue;
  157. }
  158. let mut it = kv.splitn(2, '=');
  159. let k = it.next().unwrap().trim();
  160. if let Some(v) = it.next() {
  161. // GFF3 allows percent-encoding; here we keep it simple.
  162. m.insert(k.to_string(), v.trim().to_string());
  163. } else {
  164. m.insert(k.to_string(), String::new());
  165. }
  166. }
  167. m
  168. }