|
|
@@ -2,6 +2,8 @@ use log::{info, warn};
|
|
|
use noodles_bgzf as bgzf;
|
|
|
use noodles_csi::{self as csi};
|
|
|
use noodles_tabix as tabix;
|
|
|
+use noodles_vcf::variant::record::info::field::key::{ALLELE_FREQUENCIES, TOTAL_DEPTH};
|
|
|
+use noodles_vcf::variant::record_buf::info::field::Value;
|
|
|
use vcf::variant::io::Write;
|
|
|
use vcf::variant::record_buf::AlternateBases;
|
|
|
use vcf::variant::RecordBuf;
|
|
|
@@ -18,156 +20,63 @@ use noodles_core::position::Position;
|
|
|
use noodles_vcf::{
|
|
|
self as vcf,
|
|
|
header::record::value::{
|
|
|
- map::{Contig, Format},
|
|
|
+ map::{Contig, Info},
|
|
|
Map,
|
|
|
},
|
|
|
};
|
|
|
-use vcf::variant::record::samples::keys::key;
|
|
|
use vcf::Header;
|
|
|
|
|
|
fn get_vcf_header(dict_file: &str) -> Result<Header> {
|
|
|
- // let mut header: Vec<String> = vec!["##fileformat=VCFv4.2".to_string()];
|
|
|
- //
|
|
|
- // header.extend(
|
|
|
- // read_dict(&dict_file)?
|
|
|
- // .iter()
|
|
|
- // .map(|(sn, len)| format!("##contig=<ID={},length={}>", sn, len)),
|
|
|
- // );
|
|
|
- //
|
|
|
- // header.push("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">".to_string());
|
|
|
- // header.push(
|
|
|
- // "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Read depth for each allele\">"
|
|
|
- // .to_string(),
|
|
|
- // );
|
|
|
- //
|
|
|
- // header.push(
|
|
|
- // vec![
|
|
|
- // "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE",
|
|
|
- // ]
|
|
|
- // .join("\t"),
|
|
|
- // );
|
|
|
-
|
|
|
let mut header = vcf::Header::builder()
|
|
|
- .add_format(key::READ_DEPTH, Map::<Format>::from(key::READ_DEPTH))
|
|
|
- .add_format(key::READ_DEPTHS, Map::<Format>::from(key::READ_DEPTHS))
|
|
|
- .add_sample_name("LOH")
|
|
|
+ .add_info(TOTAL_DEPTH, Map::<Info>::from(TOTAL_DEPTH))
|
|
|
+ .add_info(ALLELE_FREQUENCIES, Map::<Info>::from(ALLELE_FREQUENCIES))
|
|
|
.build();
|
|
|
|
|
|
for (ctg, len) in read_dict(dict_file)? {
|
|
|
let mut contig = Map::<Contig>::new();
|
|
|
*contig.length_mut() = Some(len as usize);
|
|
|
header.contigs_mut().insert(ctg.parse()?, contig);
|
|
|
-
|
|
|
- // header.add_contig(ctg.parse().unwrap(), contig);
|
|
|
}
|
|
|
- // read_dict(&dict_file)?.iter().for_each(|(ctg, len)| {
|
|
|
- // let mut contig = Map::<Contig>::new();
|
|
|
- // *contig.length_mut() = Some(*len as usize);
|
|
|
- // header.add_contig(ctg.parse().unwrap(), contig);
|
|
|
- // });
|
|
|
-
|
|
|
- // header.add_format(key::READ_DEPTH, Map::<Format>::from(&key::READ_DEPTH));
|
|
|
- // header.formats_mut().insert(key::READ_DEPTH, Map::<Format>::from(&key::READ_DEPTH));
|
|
|
-
|
|
|
- // header.add_format(key::READ_DEPTHS, Map::<Format>::from(&key::READ_DEPTHS));
|
|
|
Ok(header)
|
|
|
- // Ok(header.join("\n"))
|
|
|
}
|
|
|
|
|
|
pub fn write_vcf(path: &str, data: &mut [Variant], dict_file: &str) -> Result<()> {
|
|
|
let mut writer = File::create(path).map(bgzf::Writer::new)?;
|
|
|
let mut writer_vcf = vcf::io::Writer::new(&mut writer);
|
|
|
|
|
|
- // let mut indexer = csi::binning_index::Indexer::default().set_header(csi::binning_index::index::header::Builder::vcf().build());
|
|
|
-
|
|
|
- // let mut indexer = tabix::index::Indexer::default();
|
|
|
- // indexer.set_header(csi::binning_index::index::header::Builder::vcf().build());
|
|
|
-
|
|
|
- // indexer.set_header(csi::binning_index::index::header::Builder::vcf())
|
|
|
- // indexer.set_header(csi::binning_index::index::header::Builder::vcf().build());
|
|
|
-
|
|
|
let header = get_vcf_header(dict_file)?;
|
|
|
writer_vcf.write_header(&header)?;
|
|
|
- // indexer.set_header(csi::binning_index::index::header::Builder::);
|
|
|
- // writer.write_all(&buf)?;
|
|
|
- // buf.clear();
|
|
|
-
|
|
|
- // let mut start_position = writer_vcf.get_ref().virtual_position();
|
|
|
- // let mut actual_contig = String::new();
|
|
|
- // let mut actual_id = 0;
|
|
|
|
|
|
for (i, row) in data.iter_mut().enumerate() {
|
|
|
- // if actual_contig != row.contig {
|
|
|
- // actual_contig = row.contig.clone();
|
|
|
- // actual_id += 1;
|
|
|
- // }
|
|
|
-
|
|
|
let record = RecordBuf::builder()
|
|
|
.set_reference_sequence_name(&row.contig)
|
|
|
.set_variant_start(Position::new(row.position as usize).unwrap())
|
|
|
.set_ids([i.to_string()].into_iter().collect())
|
|
|
.set_reference_bases(format!("{}", row.reference))
|
|
|
.set_alternate_bases(AlternateBases::from(vec![format!("{}", row.alternative)]))
|
|
|
- // .set_genotypes(Genotypes::parse(&row.to_min_string(), &header)?)
|
|
|
+ .set_info(
|
|
|
+ [
|
|
|
+ (
|
|
|
+ String::from(TOTAL_DEPTH),
|
|
|
+ Some(Value::Integer(row.get_depth() as i32)),
|
|
|
+ ),
|
|
|
+ (
|
|
|
+ String::from(ALLELE_FREQUENCIES),
|
|
|
+ Some(Value::Float(row.vaf())),
|
|
|
+ ),
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect(),
|
|
|
+ )
|
|
|
.build();
|
|
|
|
|
|
writer_vcf.write_variant_record(&header, &record)?;
|
|
|
- // writer.write_all(record.to_string().as_bytes())?;
|
|
|
- // writer.write_all("\n".to_string().as_bytes())?;
|
|
|
- // let end_position = writer.virtual_position();
|
|
|
-
|
|
|
- // let chunk = Chunk::new(start_position, end_position);
|
|
|
-
|
|
|
- // let reference_sequence_name = record.chromosome().to_string();
|
|
|
- // let start = noodles_core::Position::try_from(usize::from(record.position()))
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
|
|
|
- // let start = record.;
|
|
|
- // let start = record.variant_start().unwrap();
|
|
|
- // let end = vcf::variant::Record::variant_end(&record, &header)?;
|
|
|
- // let end = record.variant_end(&header)
|
|
|
- // .end()
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
|
|
|
- // .and_then(|position| {
|
|
|
- // noodles_core::Position::try_from(usize::from(position))
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
|
|
|
- // })?;
|
|
|
-
|
|
|
- // indexer.add_record(Some((actual_id - 1, start, end, true)), chunk)?;
|
|
|
- // indexer.add_record(&row.contig, start, end, chunk)?;
|
|
|
-
|
|
|
- // writer.write_record(&header, &record);
|
|
|
-
|
|
|
- // start_position = end_position;
|
|
|
}
|
|
|
- // let index = indexer.build(read_dict(&dict_file)?.len());
|
|
|
- // let index = indexer.build();
|
|
|
|
|
|
- // let index_file = File::create(&format!("{}.csi", path)).expect("error creating index file");
|
|
|
- // let mut writer = csi::Writer::new(index_file);
|
|
|
- // csi::write(&format!("{}.csi", path), &index)?;
|
|
|
- // let index = noodles_vcf::io::reader::Builder::default().build_from_path(path)?;
|
|
|
let index = noodles_vcf::index(path)?;
|
|
|
|
|
|
tabix::write(format!("{}.tbi", path), &index)?;
|
|
|
|
|
|
- // writer.write_index(&index)?;
|
|
|
-
|
|
|
- // writeln!(vcf, "{}", get_vcf_header(dict_file)?).unwrap();
|
|
|
-
|
|
|
- // for (i, row) in data.iter_mut().enumerate() {
|
|
|
- // writeln!(
|
|
|
- // vcf,
|
|
|
- // "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}",
|
|
|
- // row.contig.to_string(),
|
|
|
- // row.position.to_string(),
|
|
|
- // i + 1,
|
|
|
- // row.reference.to_string(),
|
|
|
- // row.alternative.to_string(),
|
|
|
- // ".", // qual
|
|
|
- // row.to_min_string()
|
|
|
- // )?;
|
|
|
- // }
|
|
|
- // let index = vcf::index(path)?;
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
@@ -175,8 +84,6 @@ pub struct VariantWritter {
|
|
|
path: String,
|
|
|
writer: vcf::io::Writer<bgzf::Writer<File>>,
|
|
|
header: Header,
|
|
|
- // indexer: Indexer,
|
|
|
- // start_position: VirtualPosition,
|
|
|
id: usize,
|
|
|
}
|
|
|
|
|
|
@@ -184,25 +91,47 @@ impl VariantWritter {
|
|
|
pub fn new(path: &str, dict_file: &str) -> Result<Self> {
|
|
|
let mut writer = vcf::io::Writer::new(File::create(path).map(bgzf::Writer::new)?);
|
|
|
|
|
|
- // let mut writer = File::create(path).map(bgzf::Writer::new)?;
|
|
|
let mut indexer = tabix::index::Indexer::default();
|
|
|
indexer.set_header(csi::binning_index::index::header::Builder::vcf().build());
|
|
|
|
|
|
let header = get_vcf_header(dict_file)?;
|
|
|
writer.write_header(&header)?;
|
|
|
- // let hs = header.to_string();
|
|
|
- // writer.write_all(hs.as_bytes())?;
|
|
|
|
|
|
- // let start_position = writer.virtual_position();
|
|
|
Ok(Self {
|
|
|
path: path.to_string(),
|
|
|
writer,
|
|
|
header,
|
|
|
- // indexer,
|
|
|
- // start_position,
|
|
|
id: 0,
|
|
|
})
|
|
|
}
|
|
|
+
|
|
|
+ // pub fn parse_variants(&mut self, variants: Variants) -> Result<()> {
|
|
|
+ //
|
|
|
+ // let record = RecordBuf::builder()
|
|
|
+ // .set_reference_sequence_name(&row.contig)
|
|
|
+ // .set_variant_start(Position::new(row.position as usize).unwrap())
|
|
|
+ // // .set_ids(Ids::default())
|
|
|
+ // .set_ids([self.id.to_string()].into_iter().collect())
|
|
|
+ // .set_reference_bases(format!("{}", row.reference))
|
|
|
+ // .set_alternate_bases(AlternateBases::from(vec![format!("{}", row.alternative)]))
|
|
|
+ // .set_info(
|
|
|
+ // [
|
|
|
+ // (
|
|
|
+ // String::from(TOTAL_DEPTH),
|
|
|
+ // Some(Value::Integer(row.get_depth() as i32)),
|
|
|
+ // ),
|
|
|
+ // (
|
|
|
+ // String::from(ALLELE_FREQUENCIES),
|
|
|
+ // Some(Value::Float(row.vaf())),
|
|
|
+ // ),
|
|
|
+ // ]
|
|
|
+ // .into_iter()
|
|
|
+ // .collect(),
|
|
|
+ // )
|
|
|
+ // .build();
|
|
|
+ // Ok(())
|
|
|
+ // }
|
|
|
+
|
|
|
pub fn write_variant(&mut self, row: &mut Variant) -> Result<()> {
|
|
|
let record = RecordBuf::builder()
|
|
|
.set_reference_sequence_name(&row.contig)
|
|
|
@@ -211,46 +140,30 @@ impl VariantWritter {
|
|
|
.set_ids([self.id.to_string()].into_iter().collect())
|
|
|
.set_reference_bases(format!("{}", row.reference))
|
|
|
.set_alternate_bases(AlternateBases::from(vec![format!("{}", row.alternative)]))
|
|
|
- // .set_genotypes(Genotypes::parse(&row.to_min_string(), &self.header)?)
|
|
|
+ .set_info(
|
|
|
+ [
|
|
|
+ (
|
|
|
+ String::from(TOTAL_DEPTH),
|
|
|
+ Some(Value::Integer(row.get_depth() as i32)),
|
|
|
+ ),
|
|
|
+ (
|
|
|
+ String::from(ALLELE_FREQUENCIES),
|
|
|
+ Some(Value::Float(row.vaf())),
|
|
|
+ ),
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect(),
|
|
|
+ )
|
|
|
.build();
|
|
|
|
|
|
- // self.writer.write
|
|
|
|
|
|
self.writer.write_variant_record(&self.header, &record)?;
|
|
|
- // self.writer.write(record.to_string().as_bytes()).unwrap();
|
|
|
- // info!("{:?}", record);
|
|
|
- // self.writer.write("\n".to_string().as_bytes())?;
|
|
|
- // self.writer.flush()?;
|
|
|
self.id += 1;
|
|
|
- // let end_position = self.writer.virtual_position();
|
|
|
-
|
|
|
- // let chunk = Chunk::new(self.start_position, end_position);
|
|
|
-
|
|
|
- // let reference_sequence_name = record.chromosome().to_string();
|
|
|
- // let start = noodles_core::Position::try_from(usize::from(record.position()))
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
|
|
|
- // let end = record
|
|
|
- // .end()
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
|
|
|
- // .and_then(|position| {
|
|
|
- // noodles_core::Position::try_from(usize::from(position))
|
|
|
- // .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
|
|
|
- // })?;
|
|
|
- //
|
|
|
- // // indexer.add_record(Some((actual_id - 1, start, end, true)), chunk)?;
|
|
|
- // self.indexer.add_record(&row.contig, start, end, chunk)?;
|
|
|
- //
|
|
|
- // // writer.write_record(&header, &record);
|
|
|
- //
|
|
|
- // self.start_position = end_position;
|
|
|
+
|
|
|
Ok(())
|
|
|
}
|
|
|
+
|
|
|
pub fn write_index_finish(&mut self) -> Result<()> {
|
|
|
- // self.writer.finish();
|
|
|
- // let mut idx = Indexer::default();
|
|
|
- // std::mem::swap(&mut idx, &mut self.indexer);
|
|
|
- // // std::mem::replace(&mut self.indexer, Indexer::default());
|
|
|
- // let index = idx.build();
|
|
|
let index_path = format!("{}.tbi", &self.path);
|
|
|
if Path::new(&index_path).exists() {
|
|
|
fs::remove_file(&index_path)?;
|
|
|
@@ -260,10 +173,9 @@ impl VariantWritter {
|
|
|
if let Err(err) = tabix::write(&index_path, &index) {
|
|
|
warn!("Can't write VCF index {index_path} {err}");
|
|
|
}
|
|
|
- },
|
|
|
+ }
|
|
|
Err(err) => warn!("Can't write VCF index {index_path} {err}"),
|
|
|
}
|
|
|
-
|
|
|
|
|
|
Ok(())
|
|
|
}
|