"use strict"; // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __asyncValues = (this && this.__asyncValues) || function (o) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var m = o[Symbol.asyncIterator], i; return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i); function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; } function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); } }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.getData = exports.getOffsets = exports.makeRefSeqFromReg = exports.getSymbol = exports.getFromAcc = void 0; const fs_1 = __importDefault(require("fs")); const os_1 = __importDefault(require("os")); const path_1 = __importDefault(require("path")); const child_process_1 = require("child_process"); const readline_1 = __importDefault(require("readline")); const buffer_1 = require("buffer"); const genbank_parser_1 = __importDefault(require("genbank-parser")); const aligner_1 = require("aligner"); const jsonata_1 = __importDefault(require("jsonata")); const async_exec = (prog, args, onData) => { return new Promise((resolve, reject) => { const child = (0, child_process_1.spawn)(prog, args, { shell: true }); child.stdout.on('data', data => onData(data.toString().trim())); child.stderr.on('data', data => onData(data.toString().trim())); child.on('error', err => reject(err)); child.on('exit', code => resolve(code)); }); }; const line$ = (path) => readline_1.default.createInterface({ input: fs_1.default.createReadStream(path), crlfDelay: Infinity }); const readOffset = (path, from, to) => { return new Promise((resolve, reject) => __awaiter(void 0, void 0, void 0, function* () { const size = to - from; const buffer = buffer_1.Buffer.alloc(size); let filehandle = null; try { filehandle = yield fs_1.default.promises.open(path, 'r+'); yield filehandle.read(buffer, 0, buffer.length, from); } finally { if (filehandle) { yield filehandle.close(); resolve(buffer.toString()); } } })); }; /* * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index * */ const makeGbffIndex = (filePath, indexPath) => __awaiter(void 0, void 0, void 0, function* () { var e_1, _a; indexPath = indexPath || filePath + '.jsi'; let entries = []; let lineN = 0; let byteAcc = 0; try { for (var _b = __asyncValues(line$(filePath)), _c; _c = yield _b.next(), !_c.done;) { const line = _c.value; if (line.match(/^LOCUS/)) { entries.push({ filePath, value: line.split(/\s+/)[1], from: byteAcc }); if (lineN !== 0) { entries[entries.length - 2]["to"] = byteAcc; yield fs_1.default.promises.appendFile(indexPath, [ entries[entries.length - 2]["value"], entries[entries.length - 2]["from"], entries[entries.length - 2]["to"] ].join('\t') + '\n'); entries = entries.splice(1); } } byteAcc += (line.length + 1); lineN++; } } catch (e_1_1) { e_1 = { error: e_1_1 }; } finally { try { if (_c && !_c.done && (_a = _b.return)) yield _a.call(_b); } finally { if (e_1) throw e_1.error; } } entries[entries.length - 1]["to"] = byteAcc; yield fs_1.default.promises.appendFile(indexPath, [ entries[entries.length - 1]["value"], entries[entries.length - 1]["from"], entries[entries.length - 1]["to"] ].join('\t')); return entries; }); const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function* () { var e_2, _d; let res; try { for (var _e = __asyncValues(line$(indexPath)), _f; _f = yield _e.next(), !_f.done;) { const line = _f.value; const tmp = line.split('\t'); if (tmp[0] === acc) { res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]; break; } } } catch (e_2_1) { e_2 = { error: e_2_1 }; } finally { try { if (_f && !_f.done && (_d = _e.return)) yield _d.call(_e); } finally { if (e_2) throw e_2.error; } } return res; }); const getOffsets = (indexPath, accessions) => __awaiter(void 0, void 0, void 0, function* () { var e_3, _g; let res = []; const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath]; for (const iP of indexPaths) { try { for (var _h = (e_3 = void 0, __asyncValues(line$(iP))), _j; _j = yield _h.next(), !_j.done;) { const line = _j.value; const tmp = line.split('\t'); if (accessions.test(tmp[0])) { res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]]); } } } catch (e_3_1) { e_3 = { error: e_3_1 }; } finally { try { if (_j && !_j.done && (_g = _h.return)) yield _g.call(_h); } finally { if (e_3) throw e_3.error; } } } return res; }); exports.getOffsets = getOffsets; const getData = (dbPath, accessionRegex, outPath, query) => __awaiter(void 0, void 0, void 0, function* () { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]; const allOffsets = yield getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex); console.log(allOffsets.length + ' entry to parse.'); fs_1.default.promises.appendFile(outPath, '[\n'); for (let index = 0; index < allOffsets.length; index++) { const offset = allOffsets[index]; const txt = yield readOffset(offset[0], Number(offset[1]), Number(offset[2])); const json = (0, genbank_parser_1.default)(txt)[0]; const tmp = query ? (0, jsonata_1.default)(query).evaluate(json) : json; const end = index + 1 === allOffsets.length ? '' : ','; fs_1.default.promises.appendFile(outPath, JSON.stringify(tmp, null, 4) + end + '\n'); if ((index + 1) % 100 === 0) console.log('Already ' + (index + 1) + ' sequence parsed'); } fs_1.default.promises.appendFile(outPath, ']'); return 0; }); exports.getData = getData; const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]; if (!indexPath) { indexPath = yield getJSI(dbPath); } else { indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]; if (indexPath.length !== dbPath.length) throw 'Error'; } for (const iP of indexPath) { const [filePath, from, to] = (yield getOffset(iP, accession)) || [undefined, undefined, undefined]; if (filePath) { const txt = yield readOffset(filePath, Number(from), Number(to)); return (0, genbank_parser_1.default)(txt)[0]; } } return undefined; }); exports.getFromAcc = getFromAcc; const getPos = (selector, tablePath, headerTablePath) => __awaiter(void 0, void 0, void 0, function* () { const results = []; (yield fs_1.default.promises.readFile(tablePath)).toString().split('\n') .map((line) => { const lineObj = line.split('\t').reduce((p, c, i) => (Object.assign(Object.assign({}, p), { [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c })), {}); if (lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj); }); return results; }); const getIndex = (symbol, LRGPath) => __awaiter(void 0, void 0, void 0, function* () { return (yield fs_1.default.promises.readFile(LRGPath)).toString() .split('\n') .filter((line) => line.match(new RegExp(symbol, 'g'))) .reduce((p, c) => { const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e) => e !== ''); return { GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))] }; }, { GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: [] }); }); const getSymbol = (symbol, LRGPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene tablePath, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz geneDBPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz rnaDBPath // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz ) => __awaiter(void 0, void 0, void 0, function* () { const geneIndex = yield getIndex(symbol, LRGPath); // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath) // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4)) const headerTablePath = [ 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome', 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq', 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length', 'product_length', 'attributes' ]; const allFeatures = yield getPos({ symbol }, tablePath, headerTablePath); for (let index = 0; index < allFeatures.length; index++) { const { feature, product_accession } = allFeatures[index]; let tmp; switch (feature) { case 'gene': allFeatures[index].product_accession = geneIndex.GeneAcc; tmp = yield getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath); // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp; break; case 'mRNA': tmp = yield getFromAcc(product_accession.split('.')[0], rnaDBPath); // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4)) allFeatures[index].data = tmp; break; default: break; } } return allFeatures; }); exports.getSymbol = getSymbol; const getJSI = (dbPath) => __awaiter(void 0, void 0, void 0, function* () { dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]; const indexPath = []; for (const p of dbPath) { const iP = p + '.jsi'; if (!fs_1.default.existsSync(iP)) { console.log('Writing index: ' + iP); yield makeGbffIndex(p); } indexPath.push(iP); } return indexPath; }); // Todo: add progress const makeRefSeqFromReg = (dbPath, reg, distFile, limit) => __awaiter(void 0, void 0, void 0, function* () { var e_4, _k; dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]; const jsiFiles = yield getJSI(dbPath); const tmpDir = path_1.default.join(os_1.default.tmpdir(), 'parser-' + Math.random()); yield fs_1.default.promises.mkdir(tmpDir); const createdFiles = []; let counter = 0; for (const jsiFile of jsiFiles) { console.log('reading ' + jsiFile); try { for (var _l = (e_4 = void 0, __asyncValues(line$(jsiFile))), _m; _m = yield _l.next(), !_m.done;) { const line = _m.value; if (line.match(reg)) { const [accession, from, to] = line.split('\t'); const res = yield getFromAcc(accession, jsiFile.split('.jsi')[0]); if (res === null || res === void 0 ? void 0 : res.sequence) { try { const file = path_1.default.join(tmpDir, (res === null || res === void 0 ? void 0 : res.version) || res.accession + '.fa'); if (!createdFiles.includes(file)) { if (createdFiles.length === 0) if (fs_1.default.existsSync(distFile)) yield fs_1.default.promises.rm(distFile); yield (0, aligner_1.writeSequence)((res === null || res === void 0 ? void 0 : res.version) || res.accession, res === null || res === void 0 ? void 0 : res.sequence, file); createdFiles.push(file); const tmp = yield fs_1.default.promises.readFile(file); yield fs_1.default.promises.appendFile(distFile, tmp.toString() + '\n'); yield fs_1.default.promises.rm(file); counter++; if (counter % 100 === 0) console.log('Already ' + counter + ' sequence parsed'); } } catch (error) { console.log(error); } } } if (limit) if (counter === limit) break; } } catch (e_4_1) { e_4 = { error: e_4_1 }; } finally { try { if (_m && !_m.done && (_k = _l.return)) yield _k.call(_l); } finally { if (e_4) throw e_4.error; } } if (limit) if (counter === limit) break; } console.log(createdFiles.length + ' sequences were extracted'); yield fs_1.default.promises.rm(tmpDir, { recursive: true }); yield async_exec('bwa', ['index', distFile], () => console.log); }); exports.makeRefSeqFromReg = makeRefSeqFromReg;