// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz import fs from 'fs' import os from 'os' import path from 'path' import { spawn } from 'child_process' import readline from 'readline' import { Buffer } from 'buffer' import genbankParser from 'genbank-parser' import { writeSequence } from 'aligner' import jsonata from 'jsonata' const async_exec = (prog: string, args: string[], onData: Function) => { return new Promise((resolve, reject) => { const child = spawn(prog, args, {shell: true}) child.stdout.on('data', data => onData(data.toString().trim())) child.stderr.on('data', data => onData(data.toString().trim())) child.on('error', err => reject(err)) child.on('exit', code => resolve(code)) }) } const line$ = (path: string) => readline.createInterface({ input: fs.createReadStream(path), crlfDelay: Infinity }); const readOffset = (path: string, from:number, to:number) => { return new Promise(async (resolve, reject) => { const size = to - from const buffer = Buffer.alloc(size) let filehandle = null try { filehandle = await fs.promises.open(path, 'r+'); await filehandle.read(buffer, 0, buffer.length, from) } finally { if (filehandle) { await filehandle.close() resolve(buffer.toString()) } } }) } /* * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index * */ const makeGbffIndex = async (filePath: string, indexPath?: string) => { interface entry { filePath: string; value : string; from : number; to ?: number; } indexPath = indexPath || filePath + '.jsi' let entries = [] as entry[] let lineN = 0 let byteAcc = 0 for await (const line of line$(filePath)) { if(line.match(/^LOCUS/)) { entries.push({ filePath, value : line.split(/\s+/)[1], from : byteAcc }) if(lineN !== 0) { entries[entries.length - 2]["to"] = byteAcc await fs.promises.appendFile(indexPath, [ entries[entries.length - 2]["value"], entries[entries.length - 2]["from"], entries[entries.length - 2]["to"]].join('\t') + '\n') entries = entries.splice(1) } } byteAcc += (line.length + 1) lineN++ } entries[entries.length - 1]["to"] = byteAcc await fs.promises.appendFile(indexPath, [ entries[entries.length - 1]["value"], entries[entries.length - 1]["from"], entries[entries.length - 1]["to"]].join('\t')) return entries } const getOffset = async (indexPath: string, acc: string) => { let res for await (const line of line$(indexPath)) { const tmp = line.split('\t') if (tmp[0] === acc) { res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]] break } } return res } const getOffsets = async (indexPath: string | string[], accessions: RegExp) => { let res: string[][] = [] const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath] for (const iP of indexPaths) { for await (const line of line$(iP)) { const tmp = line.split('\t') if (accessions.test(tmp[0])) { res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]]) } } } return res } const getData = async (dbPath: string | string[], accessionRegex: RegExp, outPath: string, query?: string) => { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath] const allOffsets = await getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex) console.log(allOffsets.length + ' entry to parse.'); fs.promises.appendFile(outPath, '[\n') for (let index = 0; index < allOffsets.length; index++) { const offset = allOffsets[index]; const txt = await readOffset(offset[0], Number(offset[1]), Number(offset[2])) const json = genbankParser(txt)[0] const tmp = query ? jsonata(query).evaluate(json) : json const end = index + 1 === allOffsets.length ? '' : ',' fs.promises.appendFile(outPath, JSON.stringify(tmp, null, 4) + end + '\n') if ((index + 1)%100 === 0) console.log('Already ' + ( index + 1) + ' sequence parsed') } fs.promises.appendFile(outPath, ']') return 0 } const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath] if (!indexPath) { indexPath = await getJSI(dbPath) } else { indexPath = Array.isArray(indexPath) ? indexPath : [indexPath] if (indexPath.length !== dbPath.length) throw 'Error' } for (const iP of indexPath) { const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined] if (filePath) { const txt = await readOffset(filePath, Number(from), Number(to)) return genbankParser(txt)[0] } } return undefined } interface selector {[key: string]: string} const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => { const results:any = []; (await fs.promises.readFile(tablePath)).toString().split('\n') .map((line:string) => { const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any) if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj) }) return results } const getIndex = async (symbol:string, LRGPath:string) => { interface geneIndex { GeneID : string; GeneAcc : string; TranscriptsAcc: string[]; ProteinAcc : string[] } return (await fs.promises.readFile(LRGPath)).toString() .split('\n') .filter((line:string) => line.match(new RegExp(symbol, 'g'))) .reduce((p:any,c:any) => { const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='') return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]} }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex) } const getSymbol = async ( symbol:string, LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz ) => { const geneIndex = await getIndex(symbol, LRGPath) // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath) // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4)) const headerTablePath = [ 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome', 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq', 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length', 'product_length','attributes' ] const allFeatures = await getPos({symbol}, tablePath, headerTablePath) for (let index = 0; index < allFeatures.length; index++) { const {feature, product_accession} = allFeatures[index] let tmp switch (feature) { case 'gene': allFeatures[index].product_accession = geneIndex.GeneAcc tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath) // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp break; case 'mRNA': tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath) // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp break; default: break; } } return allFeatures } const getJSI = async (dbPath: string | string[]) => { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath] const indexPath: string[] = [] for (const p of dbPath) { const iP = p + '.jsi' if (!fs.existsSync(iP)) { console.log('Writing index: ' + iP); await makeGbffIndex(p) } indexPath.push(iP) } return indexPath } // Todo: add progress const makeRefSeqFromReg = async ( dbPath: string | string[], reg: RegExp, distFile:string, limit?: number ) => { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath] const jsiFiles = await getJSI(dbPath) const tmpDir = path.join(os.tmpdir(), 'parser-' + Math.random()) await fs.promises.mkdir(tmpDir) const createdFiles: string[] = [] let counter = 0 for (const jsiFile of jsiFiles) { console.log('reading ' + jsiFile) for await (const line of line$(jsiFile)) { if(line.match(reg)) { const [accession, from, to] = line.split('\t') const res = await getFromAcc(accession, jsiFile.split('.jsi')[0]) if (res?.sequence) { try { const file = path.join(tmpDir, res?.version || res.accession + '.fa') if (!createdFiles.includes(file)) { if (createdFiles.length === 0) if (fs.existsSync(distFile)) await fs.promises.rm(distFile) await writeSequence(res?.version || res.accession, res?.sequence, file) createdFiles.push(file) const tmp = await fs.promises.readFile(file) await fs.promises.appendFile(distFile, tmp.toString() + '\n') await fs.promises.rm(file) counter++ if (counter%100 === 0) console.log('Already ' + counter + ' sequence parsed') } } catch (error) { console.log(error) } } } if (limit) if (counter === limit) break } if (limit) if (counter === limit) break } console.log(createdFiles.length + ' sequences were extracted') await fs.promises.rm(tmpDir, {recursive: true}) await async_exec('bwa', ['index', distFile], () => console.log) } export { getFromAcc, getSymbol, makeRefSeqFromReg, getOffsets, getData }