- "use strict";
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
- // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
- return new (P || (P = Promise))(function (resolve, reject) {
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
- step((generator = generator.apply(thisArg, _arguments || [])).next());
- });
- };
- var __asyncValues = (this && this.__asyncValues) || function (o) {
- if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
- var m = o[Symbol.asyncIterator], i;
- return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
- function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
- function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
- };
- var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
- };
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.getData = exports.getOffsets = exports.makeRefSeqFromReg = exports.getSymbol = exports.getFromAcc = void 0;
- const fs_1 = __importDefault(require("fs"));
- const os_1 = __importDefault(require("os"));
- const path_1 = __importDefault(require("path"));
- const child_process_1 = require("child_process");
- const readline_1 = __importDefault(require("readline"));
- const buffer_1 = require("buffer");
- const genbank_parser_1 = __importDefault(require("genbank-parser"));
- const aligner_1 = require("aligner");
- const jsonata_1 = __importDefault(require("jsonata"));
- const async_exec = (prog, args, onData) => {
- return new Promise((resolve, reject) => {
- const child = (0, child_process_1.spawn)(prog, args, { shell: true });
- child.stdout.on('data', data => onData(data.toString().trim()));
- child.stderr.on('data', data => onData(data.toString().trim()));
- child.on('error', err => reject(err));
- child.on('exit', code => resolve(code));
- });
- };
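- // Usage sketch (illustrative only; assumes "bwa" is installed and "ref.fa" exists): inside an async function,
- //   const exitCode = await async_exec('bwa', ['index', 'ref.fa'], line => console.log(line));
- // The returned promise resolves with the exit code and only rejects if the process fails to spawn.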
- const line$ = (path) => readline_1.default.createInterface({
- input: fs_1.default.createReadStream(path),
- crlfDelay: Infinity
- });
- // Read the byte range [from, to) of a file and return it as a string.
- const readOffset = (path, from, to) => __awaiter(void 0, void 0, void 0, function* () {
- const size = to - from;
- const buffer = buffer_1.Buffer.alloc(size);
- let filehandle = null;
- try {
- // Open read-only: the file is never written here.
- filehandle = yield fs_1.default.promises.open(path, 'r');
- yield filehandle.read(buffer, 0, buffer.length, from);
- return buffer.toString();
- }
- finally {
- if (filehandle)
- yield filehandle.close();
- }
- });
- /*
- * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
- *
- */
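- // makeGbffIndex below writes a comparable index to "<gbff path>.jsi": one tab-separated line per
- // LOCUS record, "accession<TAB>start byte<TAB>end byte". A line might look like (values illustrative):
- //   NM_000546	0	51234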
- const makeGbffIndex = (filePath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
- var e_1, _a;
- indexPath = indexPath || filePath + '.jsi';
- let entries = [];
- let lineN = 0;
- let byteAcc = 0;
- try {
- for (var _b = __asyncValues(line$(filePath)), _c; _c = yield _b.next(), !_c.done;) {
- const line = _c.value;
- if (line.match(/^LOCUS/)) {
- entries.push({
- filePath,
- value: line.split(/\s+/)[1],
- from: byteAcc
- });
- if (lineN !== 0) {
- entries[entries.length - 2]["to"] = byteAcc;
- yield fs_1.default.promises.appendFile(indexPath, [
- entries[entries.length - 2]["value"],
- entries[entries.length - 2]["from"],
- entries[entries.length - 2]["to"]
- ].join('\t') + '\n');
- entries = entries.slice(1); // drop the entry that was just written out
- }
- }
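- // The byte offset assumes single-byte characters and "\n" line endings, which these
- // GenBank flat files are expected to use.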
- byteAcc += (line.length + 1);
- lineN++;
- }
- }
- catch (e_1_1) { e_1 = { error: e_1_1 }; }
- finally {
- try {
- if (_c && !_c.done && (_a = _b.return)) yield _a.call(_b);
- }
- finally { if (e_1) throw e_1.error; }
- }
- // Flush the final entry (guard against files that contain no LOCUS line at all).
- if (entries.length > 0) {
- entries[entries.length - 1]["to"] = byteAcc;
- yield fs_1.default.promises.appendFile(indexPath, [
- entries[entries.length - 1]["value"],
- entries[entries.length - 1]["from"],
- entries[entries.length - 1]["to"]
- ].join('\t'));
- }
- return entries;
- });
- const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function* () {
- var e_2, _d;
- let res;
- try {
- for (var _e = __asyncValues(line$(indexPath)), _f; _f = yield _e.next(), !_f.done;) {
- const line = _f.value;
- const tmp = line.split('\t');
- if (tmp[0] === acc) {
- res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]];
- break;
- }
- }
- }
- catch (e_2_1) { e_2 = { error: e_2_1 }; }
- finally {
- try {
- if (_f && !_f.done && (_d = _e.return)) yield _d.call(_e);
- }
- finally { if (e_2) throw e_2.error; }
- }
- return res;
- });
- const getOffsets = (indexPath, accessions) => __awaiter(void 0, void 0, void 0, function* () {
- var e_3, _g;
- let res = [];
- const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath];
- for (const iP of indexPaths) {
- try {
- for (var _h = (e_3 = void 0, __asyncValues(line$(iP))), _j; _j = yield _h.next(), !_j.done;) {
- const line = _j.value;
- const tmp = line.split('\t');
- if (accessions.test(tmp[0])) {
- res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]]);
- }
- }
- }
- catch (e_3_1) { e_3 = { error: e_3_1 }; }
- finally {
- try {
- if (_j && !_j.done && (_g = _h.return)) yield _g.call(_h);
- }
- finally { if (e_3) throw e_3.error; }
- }
- }
- return res;
- });
- exports.getOffsets = getOffsets;
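- // Usage sketch for getOffsets (index paths and pattern are illustrative):
- //   const hits = await getOffsets(['human.1.rna.gbff.jsi', 'human.2.rna.gbff.jsi'], /^NM_/);
- //   // each hit: [gbff path, start byte, end byte, accession]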
- const getData = (dbPath, accessionRegex, outPath, query) => __awaiter(void 0, void 0, void 0, function* () {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
- const allOffsets = yield getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex);
- console.log(allOffsets.length + ' entries to parse.');
- yield fs_1.default.promises.appendFile(outPath, '[\n');
- for (let index = 0; index < allOffsets.length; index++) {
- const offset = allOffsets[index];
- const txt = yield readOffset(offset[0], Number(offset[1]), Number(offset[2]));
- const json = (0, genbank_parser_1.default)(txt)[0];
- const tmp = query ? (0, jsonata_1.default)(query).evaluate(json) : json;
- const end = index + 1 === allOffsets.length ? '' : ',';
- yield fs_1.default.promises.appendFile(outPath, JSON.stringify(tmp, null, 4) + end + '\n');
- if ((index + 1) % 100 === 0)
- console.log('Already ' + (index + 1) + ' sequences parsed');
- }
- yield fs_1.default.promises.appendFile(outPath, ']');
- return 0;
- });
- exports.getData = getData;
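- // Usage sketch for getData (paths, regex and jsonata query are illustrative; the query, when
- // given, is applied to each parsed GenBank record before it is written to outPath):
- //   await getData('human.1.rna.gbff', /^NM_/, 'out.json', 'features');
- // Note: getData expects the ".jsi" indexes to exist already (see getJSI / makeGbffIndex).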
- const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
- if (!indexPath) {
- indexPath = yield getJSI(dbPath);
- }
- else {
- indexPath = Array.isArray(indexPath) ? indexPath : [indexPath];
- if (indexPath.length !== dbPath.length)
- throw new Error('indexPath must contain one index per entry of dbPath');
- }
- for (const iP of indexPath) {
- const [filePath, from, to] = (yield getOffset(iP, accession)) || [undefined, undefined, undefined];
- if (filePath) {
- const txt = yield readOffset(filePath, Number(from), Number(to));
- return (0, genbank_parser_1.default)(txt)[0];
- }
- }
- return undefined;
- });
- exports.getFromAcc = getFromAcc;
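- // Usage sketch for getFromAcc (accession and path are illustrative): looks the accession up in
- // the ".jsi" index (building it first when missing) and returns the parsed GenBank record, or
- // undefined when no index contains it.
- //   const record = await getFromAcc('NM_000546', 'human.1.rna.gbff');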
- const getPos = (selector, tablePath, headerTablePath) => __awaiter(void 0, void 0, void 0, function* () {
- const results = [];
- (yield fs_1.default.promises.readFile(tablePath)).toString().split('\n')
- .forEach((line) => {
- const lineObj = line.split('\t').reduce((p, c, i) => (Object.assign(Object.assign({}, p), { [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c })), {});
- if (lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]])
- results.push(lineObj);
- });
- return results;
- });
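- // getIndex scans the LRG_RefSeqGene table for lines matching the symbol and collects the GeneID,
- // the RefSeqGene (genomic) accession and the sets of transcript and protein accessions.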
- const getIndex = (symbol, LRGPath) => __awaiter(void 0, void 0, void 0, function* () {
- return (yield fs_1.default.promises.readFile(LRGPath)).toString()
- .split('\n')
- .filter((line) => line.match(new RegExp(symbol, 'g')))
- .reduce((p, c) => {
- const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e) => e !== '');
- return { GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))] };
- }, { GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: [] });
- });
- const getSymbol = (symbol, LRGPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
- tablePath, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
- // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
- geneDBPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
- rnaDBPath // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
- ) => __awaiter(void 0, void 0, void 0, function* () {
- const geneIndex = yield getIndex(symbol, LRGPath);
- // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
- // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
- const headerTablePath = [
- 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
- 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
- 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
- 'product_length', 'attributes'
- ];
- const allFeatures = yield getPos({ symbol }, tablePath, headerTablePath);
- for (let index = 0; index < allFeatures.length; index++) {
- const { feature, product_accession } = allFeatures[index];
- let tmp;
- switch (feature) {
- case 'gene':
- allFeatures[index].product_accession = geneIndex.GeneAcc;
- tmp = yield getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath);
- // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp;
- break;
- case 'mRNA':
- tmp = yield getFromAcc(product_accession.split('.')[0], rnaDBPath);
- // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
- allFeatures[index].data = tmp;
- break;
- default:
- break;
- }
- }
- return allFeatures;
- });
- exports.getSymbol = getSymbol;
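- // Usage sketch for getSymbol (file names are illustrative; they correspond to the uncompressed
- // downloads listed in the parameter comments above):
- //   const features = await getSymbol('TP53', 'LRG_RefSeqGene',
- //       'GCF_000001405.39_GRCh38.p13_feature_table.txt',
- //       ['refseqgene.1.genomic.gbff'], ['human.1.rna.gbff']);
- // It returns every feature-table row for the symbol; "gene" and "mRNA" rows are augmented with
- // the parsed GenBank record in a "data" property.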
- const getJSI = (dbPath) => __awaiter(void 0, void 0, void 0, function* () {
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
- const indexPath = [];
- for (const p of dbPath) {
- const iP = p + '.jsi';
- if (!fs_1.default.existsSync(iP)) {
- console.log('Writing index: ' + iP);
- yield makeGbffIndex(p);
- }
- indexPath.push(iP);
- }
- return indexPath;
- });
- // Todo: add progress
- const makeRefSeqFromReg = (dbPath, reg, distFile, limit) => __awaiter(void 0, void 0, void 0, function* () {
- var e_4, _k;
- dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
- const jsiFiles = yield getJSI(dbPath);
- const tmpDir = yield fs_1.default.promises.mkdtemp(path_1.default.join(os_1.default.tmpdir(), 'parser-'));
- const createdFiles = [];
- let counter = 0;
- for (const jsiFile of jsiFiles) {
- console.log('reading ' + jsiFile);
- try {
- for (var _l = (e_4 = void 0, __asyncValues(line$(jsiFile))), _m; _m = yield _l.next(), !_m.done;) {
- const line = _m.value;
- if (line.match(reg)) {
- const [accession, from, to] = line.split('\t');
- // The byte offsets are already on this index line, so read and parse the record directly
- // instead of re-scanning the index through getFromAcc.
- const res = (0, genbank_parser_1.default)(yield readOffset(jsiFile.split('.jsi')[0], Number(from), Number(to)))[0];
- if (res === null || res === void 0 ? void 0 : res.sequence) {
- try {
- const file = path_1.default.join(tmpDir, ((res === null || res === void 0 ? void 0 : res.version) || res.accession) + '.fa');
- if (!createdFiles.includes(file)) {
- if (createdFiles.length === 0)
- if (fs_1.default.existsSync(distFile))
- yield fs_1.default.promises.rm(distFile);
- yield (0, aligner_1.writeSequence)((res === null || res === void 0 ? void 0 : res.version) || res.accession, res === null || res === void 0 ? void 0 : res.sequence, file);
- createdFiles.push(file);
- const tmp = yield fs_1.default.promises.readFile(file);
- yield fs_1.default.promises.appendFile(distFile, tmp.toString() + '\n');
- yield fs_1.default.promises.rm(file);
- counter++;
- if (counter % 100 === 0)
- console.log('Already ' + counter + ' sequences parsed');
- }
- }
- catch (error) {
- console.log(error);
- }
- }
- }
- if (limit)
- if (counter === limit)
- break;
- }
- }
- catch (e_4_1) { e_4 = { error: e_4_1 }; }
- finally {
- try {
- if (_m && !_m.done && (_k = _l.return)) yield _k.call(_l);
- }
- finally { if (e_4) throw e_4.error; }
- }
- if (limit)
- if (counter === limit)
- break;
- }
- console.log(createdFiles.length + ' sequences were extracted');
- yield fs_1.default.promises.rm(tmpDir, { recursive: true });
- yield async_exec('bwa', ['index', distFile], data => console.log(data));
- });
- exports.makeRefSeqFromReg = makeRefSeqFromReg;
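- // Usage sketch for makeRefSeqFromReg (paths and pattern are illustrative; assumes the "bwa"
- // executable is available, since the resulting FASTA is indexed with "bwa index"):
- //   await makeRefSeqFromReg(['human.1.rna.gbff'], /^NM_/, 'refseq.fa');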
|