// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz import fs from 'fs' import readline from 'readline' import { Buffer } from 'buffer' import genbankParser from 'genbank-parser' const line$ = (path: string) => readline.createInterface({ input: fs.createReadStream(path), crlfDelay: Infinity }); const readOffset = (path: string, from:number, to:number) => { return new Promise(async (resolve, reject) => { const size = to - from const buffer = Buffer.alloc(size); let filehandle = null; try { filehandle = await fs.promises.open(path, 'r+'); await filehandle.read(buffer, 0, buffer.length, from); } finally { if (filehandle) { await filehandle.close() resolve(buffer.toString()) } } }) } /* * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index * */ const makeGbffIndex = async (filePath: string, indexPath?: string) => { interface entry { filePath: string; value : string; from : number; to ?: number; } indexPath = indexPath || filePath + '.jsi' let entries = [] as entry[] let lineN = 0 let byteAcc = 0 for await (const line of line$(filePath)) { if(line.match(/^LOCUS/)) { entries.push({ filePath, value : line.split(/\s+/)[1], from : byteAcc }) if(lineN !== 0) { entries[entries.length - 2]["to"] = byteAcc await fs.promises.appendFile(indexPath, [ entries[entries.length - 2]["value"], entries[entries.length - 2]["from"], entries[entries.length - 2]["to"]].join('\t') + '\n') entries = entries.splice(1) } } byteAcc += (line.length + 1) lineN++ } entries[entries.length - 1]["to"] = byteAcc await fs.promises.appendFile(indexPath, [ entries[entries.length - 1]["value"], entries[entries.length - 1]["from"], entries[entries.length - 1]["to"]].join('\t')) return entries } const getOffset = async (indexPath: string, acc: string) => { let res for await (const line of line$(indexPath)) { const tmp = line.split('\t') if (tmp[0] === acc) { res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]] break } } return res } const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath] if (!indexPath) { indexPath = [] for (const p of dbPath) { const iP = p + '.jsi' if (!fs.existsSync(iP)) { console.log('Writing index: ' + iP); await makeGbffIndex(p) } indexPath.push(iP) } } else { indexPath = Array.isArray(indexPath) ? indexPath : [indexPath] if (indexPath.length !== dbPath.length) throw 'Error' } for (const iP of indexPath) { const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined] if (filePath) { const txt = await readOffset(filePath, Number(from), Number(to)) return genbankParser(txt)[0] } } return undefined } interface selector {[key: string]: string} const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => { const results:any = []; (await fs.promises.readFile(tablePath)).toString().split('\n') .map((line:string) => { const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any) if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj) }) return results } const getIndex = async (symbol:string, LRGPath:string) => { interface geneIndex { GeneID : string; GeneAcc : string; TranscriptsAcc: string[]; ProteinAcc : string[] } return (await fs.promises.readFile(LRGPath)).toString() .split('\n') .filter((line:string) => line.match(new RegExp(symbol, 'g'))) .reduce((p:any,c:any) => { const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='') return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]} }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex) } const getSymbol = async ( symbol:string, LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz ) => { const geneIndex = await getIndex(symbol, LRGPath) // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath) // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4)) const headerTablePath = [ 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome', 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq', 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length', 'product_length','attributes' ] const allFeatures = await getPos({symbol}, tablePath, headerTablePath) for (let index = 0; index < allFeatures.length; index++) { const {feature, product_accession} = allFeatures[index] let tmp switch (feature) { case 'gene': allFeatures[index].product_accession = geneIndex.GeneAcc tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath) await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp break; case 'mRNA': tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath) await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp break; default: break; } } return allFeatures } export { getFromAcc, getSymbol }