- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- import fs from 'fs'
- import os from 'os'
- import path from 'path'
- import { spawn } from 'child_process'
- import readline from 'readline'
- import { Buffer } from 'buffer'
- import genbankParser from 'genbank-parser'
- import { writeSequence } from 'aligner'
- import jsonata from 'jsonata'
- const async_exec = (prog: string, args: string[], onData: Function) => {
- return new Promise((resolve, reject) => {
- const child = spawn(prog, args, {shell: true})
- child.stdout.on('data', data => onData(data.toString().trim()))
- child.stderr.on('data', data => onData(data.toString().trim()))
- child.on('error', err => reject(err))
- child.on('exit', code => resolve(code))
- })
- }
- const line$ = (path: string) => readline.createInterface({
- input: fs.createReadStream(path),
- crlfDelay: Infinity
- });
- const readOffset = (path: string, from:number, to:number) => {
- return new Promise<string>(async (resolve, reject) => {
- const size = to - from
- const buffer = Buffer.alloc(size)
- let filehandle = null
- try {
- filehandle = await fs.promises.open(path, 'r+');
- await filehandle.read(buffer, 0, buffer.length, from)
- } finally {
- if (filehandle) {
- await filehandle.close()
- resolve(buffer.toString())
- }
- }
- })
- }
- /*
- * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
- *
- */
- const makeGbffIndex = async (filePath: string, indexPath?: string) => {
- interface entry {
- filePath: string;
- value : string;
- from : number;
- to ?: number;
- }
- indexPath = indexPath || filePath + '.jsi'
- let entries = [] as entry[]
- let lineN = 0
- let byteAcc = 0
- for await (const line of line$(filePath)) {
- if(line.match(/^LOCUS/)) {
- entries.push({
- filePath,
- value : line.split(/\s+/)[1],
- from : byteAcc
- })
- if(lineN !== 0) {
- entries[entries.length - 2]["to"] = byteAcc
- await fs.promises.appendFile(indexPath, [
- entries[entries.length - 2]["value"],
- entries[entries.length - 2]["from"],
- entries[entries.length - 2]["to"]].join('\t') + '\n')
- entries = entries.splice(1)
- }
- }
- byteAcc += (line.length + 1)
- lineN++
- }
- entries[entries.length - 1]["to"] = byteAcc
-
- await fs.promises.appendFile(indexPath, [
- entries[entries.length - 1]["value"],
- entries[entries.length - 1]["from"],
- entries[entries.length - 1]["to"]].join('\t'))
- return entries
- }
- const getOffset = async (indexPath: string, acc: string) => {
- let res
- for await (const line of line$(indexPath)) {
- const tmp = line.split('\t')
- if (tmp[0] === acc) {
- res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
- break
- }
- }
- return res
- }
- const getOffsets = async (indexPath: string | string[], accessions: RegExp) => {
- let res: string[][] = []
- const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath]
- for (const iP of indexPaths) {
- for await (const line of line$(iP)) {
- const tmp = line.split('\t')
- if (accessions.test(tmp[0])) {
- res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]])
- }
- }
- }
- return res
- }
- const getData = async (dbPath: string | string[], accessionRegex: RegExp, query?: string) => {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
- const results = []
- const allOffsets = await getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex)
- for (const offset of allOffsets) {
- const txt = await readOffset(offset[0], Number(offset[1]), Number(offset[2]))
- const json = genbankParser(txt)[0]
- query ? results.push(jsonata(query).evaluate(json)) : results.push(json)
- }
- return results
- }
- const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
- if (!indexPath) {
- indexPath = await getJSI(dbPath)
- } else {
- indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
- if (indexPath.length !== dbPath.length) throw 'Error'
- }
- for (const iP of indexPath) {
- const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined]
- if (filePath) {
- const txt = await readOffset(filePath, Number(from), Number(to))
- return genbankParser(txt)[0]
- }
- }
- return undefined
- }
- interface selector {[key: string]: string}
- const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => {
- const results:any = [];
- (await fs.promises.readFile(tablePath)).toString().split('\n')
- .map((line:string) => {
- const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any)
- if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj)
- })
- return results
- }
- const getIndex = async (symbol:string, LRGPath:string) => {
- interface geneIndex {
- GeneID : string;
- GeneAcc : string;
- TranscriptsAcc: string[];
- ProteinAcc : string[]
- }
- return (await fs.promises.readFile(LRGPath)).toString()
- .split('\n')
- .filter((line:string) => line.match(new RegExp(symbol, 'g')))
- .reduce((p:any,c:any) => {
- const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='')
- return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]}
- }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex)
- }
- const getSymbol = async (
- symbol:string,
- LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
- rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- ) => {
- const geneIndex = await getIndex(symbol, LRGPath)
- // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
- // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
- const headerTablePath = [
- 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
- 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
- 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
- 'product_length','attributes'
- ]
- const allFeatures = await getPos({symbol}, tablePath, headerTablePath)
- for (let index = 0; index < allFeatures.length; index++) {
- const {feature, product_accession} = allFeatures[index]
- let tmp
- switch (feature) {
- case 'gene':
- allFeatures[index].product_accession = geneIndex.GeneAcc
- tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
- // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp
- break;
- case 'mRNA':
- tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
- // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp
- break;
- default:
- break;
- }
-
- }
- return allFeatures
- }
- const getJSI = async (dbPath: string | string[]) => {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
- const indexPath: string[] = []
- for (const p of dbPath) {
- const iP = p + '.jsi'
- if (!fs.existsSync(iP)) {
- console.log('Writing index: ' + iP);
- await makeGbffIndex(p)
- }
- indexPath.push(iP)
- }
- return indexPath
- }
- // Todo: add progress
- const makeRefSeqFromReg = async (
- dbPath: string | string[], reg: RegExp, distFile:string, limit?: number
- ) => {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
- const jsiFiles = await getJSI(dbPath)
- const tmpDir = path.join(os.tmpdir(), 'parser-' + Math.random())
- await fs.promises.mkdir(tmpDir)
- const createdFiles: string[] = []
- let counter = 0
- for (const jsiFile of jsiFiles) {
- console.log('reading ' + jsiFile)
- for await (const line of line$(jsiFile)) {
- if(line.match(reg)) {
- const [accession, from, to] = line.split('\t')
- const res = await getFromAcc(accession, jsiFile.split('.jsi')[0])
- if (res?.sequence) {
- try {
- const file = path.join(tmpDir, res?.version || res.accession + '.fa')
- if (!createdFiles.includes(file)) {
- if (createdFiles.length === 0) if (fs.existsSync(distFile)) await fs.promises.rm(distFile)
- await writeSequence(res?.version || res.accession, res?.sequence, file)
- createdFiles.push(file)
- const tmp = await fs.promises.readFile(file)
- await fs.promises.appendFile(distFile, tmp.toString() + '\n')
- await fs.promises.rm(file)
- counter++
- if (counter%100 === 0) console.log('Already ' + counter + ' sequence parsed')
- }
- } catch (error) {
- console.log(error)
- }
- }
- }
- if (limit) if (counter === limit) break
- }
- if (limit) if (counter === limit) break
- }
- console.log(createdFiles.length + ' sequences were extracted')
-
- await fs.promises.rm(tmpDir, {recursive: true})
- await async_exec('bwa', ['index', distFile], () => console.log)
- }
// Public API: record lookup, gene-symbol assembly, and FASTA/bwa-index generation.
export { getFromAcc, getSymbol, makeRefSeqFromReg, getOffsets, getData }