| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.6.rna.gbff.gz
- import fs from 'fs'
- import readline from 'readline'
- import { Buffer } from 'buffer'
- import genbankParser from 'genbank-parser'
/**
 * Builds a readline interface over the file at `path`, suitable for
 * `for await (const line of line$(path))` iteration.
 *
 * `crlfDelay: Infinity` is passed to readline so that a `\r` followed by
 * `\n` is always reported as a single line break.
 *
 * @param path File to read.
 * @returns The readline interface wrapping a fresh read stream on `path`.
 */
const line$ = (path: string) => {
  const input = fs.createReadStream(path)
  return readline.createInterface({ input, crlfDelay: Infinity })
}
/**
 * Reads the byte range `[from, to)` of the file at `path` and returns it
 * decoded as UTF-8.
 *
 * The file is opened read-only. If the file ends before `to`, only the bytes
 * that actually exist are returned (no NUL padding).
 *
 * @param path File to read from.
 * @param from Absolute byte offset of the first byte to read (inclusive).
 * @param to   Absolute byte offset at which to stop (exclusive).
 * @returns The decoded text of the requested range.
 * @throws RangeError when the offsets are not non-negative integers or `to < from`.
 * @throws Any error raised while opening or reading the file (the returned
 *         promise rejects instead of hanging).
 */
const readOffset = async (path: string, from: number, to: number): Promise<string> => {
  const size = to - from
  if (!Number.isInteger(from) || !Number.isInteger(to) || from < 0 || size < 0) {
    throw new RangeError(`Invalid byte range: from=${from}, to=${to}`)
  }
  const buffer = Buffer.alloc(size)
  const filehandle = await fs.promises.open(path, 'r')
  try {
    // A single read() may return fewer bytes than requested, so keep reading
    // until the range is filled or end-of-file is reached.
    let filled = 0
    while (filled < size) {
      const { bytesRead } = await filehandle.read(buffer, filled, size - filled, from + filled)
      if (bytesRead === 0) break
      filled += bytesRead
    }
    return buffer.toString('utf8', 0, filled)
  } finally {
    await filehandle.close()
  }
}
- /*
- * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
- *
- */
/**
 * Scans a GenBank flat file and writes a tab-separated byte-offset index with
 * one row per record: `<second token of the LOCUS line>\t<from>\t<to>`.
 *
 * `from` is the absolute byte offset of the record's `LOCUS` line and `to` is
 * the offset of the next `LOCUS` line (or the file size for the last record).
 * Offsets are computed on the raw bytes, so they stay exact for multi-byte
 * characters, CRLF line endings and files without a trailing newline.
 *
 * The index file is (re)written in one go, replacing any previous content;
 * rows are separated by `\n` with no trailing newline. A file without any
 * `LOCUS` line produces an empty index.
 *
 * @param filePath  GenBank flat file to index.
 * @param lineSize  Unused; kept so existing positional callers keep working.
 * @param indexPath Where to write the index; defaults to `filePath + '.jsi'`.
 * @returns An array holding only the last record found (with `to` set), or an
 *          empty array when the file has no `LOCUS` line.
 */
const makeGbffIndex = async (filePath: string, lineSize = 80, indexPath?: string) => {
  interface entry {
    filePath: string;
    value : string;
    from : number;
    to ?: number;
  }
  const NEWLINE = 0x0a
  const target = indexPath || filePath + '.jsi'
  const rows: string[] = []
  // Record whose end offset is not known yet (held in an object so the
  // helpers below can update it).
  const state: { open?: entry } = {}

  // Ends the record in progress at byte `end` and queues its index row.
  const closeAt = (end: number) => {
    const rec = state.open
    if (rec) {
      rec.to = end
      rows.push([rec.value, rec.from, rec.to].join('\t'))
    }
  }

  // Examines one line (newline excluded) that starts at absolute byte `start`.
  const visit = (line: Buffer, start: number) => {
    if (line.length < 5 || line.toString('latin1', 0, 5) !== 'LOCUS') return
    closeAt(start)
    state.open = {
      filePath,
      value: line.toString('utf8').split(/\s+/)[1] ?? '',
      from: start
    }
  }

  let carry: Buffer = Buffer.alloc(0) // bytes of a line split across chunks
  let carryStart = 0                  // absolute byte offset of carry[0]
  for await (const chunk of fs.createReadStream(filePath)) {
    const data: Buffer = carry.length > 0 ? Buffer.concat([carry, chunk]) : chunk
    let lineStart = 0
    let nl = data.indexOf(NEWLINE, lineStart)
    while (nl !== -1) {
      visit(data.subarray(lineStart, nl), carryStart + lineStart)
      lineStart = nl + 1
      nl = data.indexOf(NEWLINE, lineStart)
    }
    carry = data.subarray(lineStart)
    carryStart += lineStart
  }
  // Last line when the file does not end with a newline.
  if (carry.length > 0) visit(carry, carryStart)
  closeAt(carryStart + carry.length)

  await fs.promises.writeFile(target, rows.join('\n'))
  return state.open ? [state.open] : []
}
/**
 * Looks up accession `acc` in a tab-separated index file whose rows are
 * `<accession>\t<from>\t<to>`.
 *
 * Rows with fewer than three columns are skipped. The read stream and the
 * readline interface are released on every exit path, including the early
 * return on a match.
 *
 * @param indexPath Path of the index file.
 * @param acc       Accession to search for (exact match on the first column).
 * @returns `[dbPath, from, to]` for the first matching row, where `dbPath` is
 *          `indexPath` with a trailing `.jsi` removed (or `indexPath` itself
 *          when it has no such suffix) and `from`/`to` are the offsets as the
 *          strings stored in the index; `undefined` when nothing matches.
 */
const getOffset = async (indexPath: string, acc: string): Promise<string[] | undefined> => {
  const suffix = '.jsi'
  // Strip only the extension, not the first ".jsi" appearing anywhere in the path.
  const dbFile = indexPath.endsWith(suffix)
    ? indexPath.slice(0, -suffix.length)
    : indexPath
  const input = fs.createReadStream(indexPath)
  const rows = readline.createInterface({ input, crlfDelay: Infinity })
  try {
    for await (const row of rows) {
      const [name, from, to] = row.split('\t')
      if (name === acc && from !== undefined && to !== undefined) {
        return [dbFile, from, to]
      }
    }
    return undefined
  } finally {
    rows.close()
    // Closing readline does not close the underlying file stream.
    input.destroy()
  }
}
/**
 * Fetches the record for accession `acc` from one or more GenBank flat files
 * using their byte-offset indexes.
 *
 * When `indexPath` is omitted, each database is expected to have an index at
 * `<dbPath>.jsi`; a missing one is built with `makeGbffIndex` first. When
 * `indexPath` is given, it must list exactly one index per database, in the
 * same order. Databases are searched in order and the first hit wins.
 *
 * The record text is always read from the caller-supplied database path
 * paired with the matching index, not from a path derived from the index
 * file name.
 *
 * @param acc       Accession to look up.
 * @param dbPath    One database file path or an array of them.
 * @param indexPath Optional index path(s) matching `dbPath` one-to-one.
 * @returns The first element of `genbankParser` applied to the record text,
 *          or `undefined` when no index contains `acc`.
 * @throws Error when the number of index paths differs from the number of
 *         database paths, or when a matching index row has invalid offsets.
 */
const getFromAcc = async (acc: string, dbPath: string | string[], indexPath?: string | string[]) => {
  const dbPaths = Array.isArray(dbPath) ? dbPath : [dbPath]
  let indexPaths: string[]
  if (!indexPath) {
    indexPaths = []
    for (const db of dbPaths) {
      const derived = db + '.jsi'
      if (!fs.existsSync(derived)) {
        console.log('Writing index: ' + derived);
        await makeGbffIndex(db)
      }
      indexPaths.push(derived)
    }
  } else {
    indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath]
    if (indexPaths.length !== dbPaths.length) {
      throw new Error(
        `Expected one index per database: got ${indexPaths.length} index path(s) for ${dbPaths.length} database path(s)`
      )
    }
  }

  for (const [i, db] of dbPaths.entries()) {
    const hit = await getOffset(indexPaths[i], acc)
    if (!hit) continue
    const from = Number(hit[1])
    const to = Number(hit[2])
    if (!Number.isInteger(from) || !Number.isInteger(to) || from < 0 || to < from) {
      throw new Error(`Invalid offsets for ${acc} in ${indexPaths[i]}: ${hit[1]}..${hit[2]}`)
    }
    const text = await readOffset(db, from, to)
    return genbankParser(text)[0]
  }
  return undefined
}
- export { getFromAcc }
|