| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- import fs from 'fs'
- import readline from 'readline'
- import { Buffer } from 'buffer'
- import genbankParser from 'genbank-parser'
- const line$ = (path: string) => readline.createInterface({
- input: fs.createReadStream(path),
- crlfDelay: Infinity
- });
- const readOffset = (path: string, from:number, to:number) => {
- return new Promise<string>(async (resolve, reject) => {
- const size = to - from
- const buffer = Buffer.alloc(size);
- let filehandle = null;
- try {
- filehandle = await fs.promises.open(path, 'r+');
- await filehandle.read(buffer, 0, buffer.length, from);
- } finally {
- if (filehandle) {
- await filehandle.close()
- resolve(buffer.toString())
- }
- }
- })
- }
- /*
- * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
- *
- */
- const makeGbffIndex = async (filePath: string, indexPath?: string) => {
- interface entry {
- filePath: string;
- value : string;
- from : number;
- to ?: number;
- }
- indexPath = indexPath || filePath + '.jsi'
- let entries = [] as entry[]
- let lineN = 0
- let byteAcc = 0
- for await (const line of line$(filePath)) {
- if(line.match(/^LOCUS/)) {
- entries.push({
- filePath,
- value : line.split(/\s+/)[1],
- from : byteAcc
- })
- if(lineN !== 0) {
- entries[entries.length - 2]["to"] = byteAcc
- await fs.promises.appendFile(indexPath, [
- entries[entries.length - 2]["value"],
- entries[entries.length - 2]["from"],
- entries[entries.length - 2]["to"]].join('\t') + '\n')
- entries = entries.splice(1)
- }
- }
- byteAcc += (line.length + 1)
- lineN++
- }
- entries[entries.length - 1]["to"] = byteAcc
-
- await fs.promises.appendFile(indexPath, [
- entries[entries.length - 1]["value"],
- entries[entries.length - 1]["from"],
- entries[entries.length - 1]["to"]].join('\t'))
- return entries
- }
- const getOffset = async (indexPath: string, acc: string) => {
- let res
- for await (const line of line$(indexPath)) {
- const tmp = line.split('\t')
- if (tmp[0] === acc) {
- res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
- break
- }
- }
- return res
- }
- const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
- if (!indexPath) {
- indexPath = []
- for (const p of dbPath) {
- const iP = p + '.jsi'
- if (!fs.existsSync(iP)) {
- console.log('Writing index: ' + iP);
- await makeGbffIndex(p)
- }
- indexPath.push(iP)
- }
- } else {
- indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
- if (indexPath.length !== dbPath.length) throw 'Error'
- }
- for (const iP of indexPath) {
- const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined]
- if (filePath) {
- const txt = await readOffset(filePath, Number(from), Number(to))
- return genbankParser(txt)[0]
- }
- }
- return undefined
- }
- interface selector {[key: string]: string}
- const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => {
- const results:any = [];
- (await fs.promises.readFile(tablePath)).toString().split('\n')
- .map((line:string) => {
- const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any)
- if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj)
- })
- return results
- }
- const getIndex = async (symbol:string, LRGPath:string) => {
- interface geneIndex {
- GeneID : string;
- GeneAcc : string;
- TranscriptsAcc: string[];
- ProteinAcc : string[]
- }
- return (await fs.promises.readFile(LRGPath)).toString()
- .split('\n')
- .filter((line:string) => line.match(new RegExp(symbol, 'g')))
- .reduce((p:any,c:any) => {
- const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='')
- return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]}
- }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex)
- }
- const getSymbol = async (
- symbol:string,
- LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
- rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- ) => {
- const geneIndex = await getIndex(symbol, LRGPath)
- // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
- // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
- const headerTablePath = [
- 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
- 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
- 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
- 'product_length','attributes'
- ]
- const allFeatures = await getPos({symbol}, tablePath, headerTablePath)
- for (let index = 0; index < allFeatures.length; index++) {
- const {feature, product_accession} = allFeatures[index]
- let tmp
- switch (feature) {
- case 'gene':
- allFeatures[index].product_accession = geneIndex.GeneAcc
- tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
- await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp
- break;
- case 'mRNA':
- tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
- await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp
- break;
- default:
- break;
- }
-
- }
- return allFeatures
- }
- export { getFromAcc, getSymbol }
|