index.js 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. "use strict";
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  4. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  8. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
/**
 * Drives a generator function as an asynchronous routine.
 *
 * The generator is started with `thisArg` / `_arguments`; every value it
 * yields is turned into a promise, and the settled result is fed back into
 * the generator (`next` on fulfilment, `throw` on rejection).
 *
 * Reuses an existing `this.__awaiter` when one is present.
 *
 * @param thisArg     `this` value used when invoking the generator function.
 * @param _arguments  Argument list for the generator function (defaults to []).
 * @param P           Promise constructor to use; falls back to the global Promise.
 * @param generator   Generator function to run.
 * @returns A promise resolved with the generator's return value, or rejected
 *          with whatever the generator throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a P promise unless it already is an instance of P.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value; a synchronous throw rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Resume the generator by throwing the rejection reason into it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Either finish with the return value or wait for the next yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // Instantiate the generator and run it up to its first yield.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/**
 * Returns an async iterator for `o`.
 *
 * When `o` exposes `Symbol.asyncIterator`, that iterator is returned directly.
 * Otherwise the synchronous iterator of `o` is wrapped in an object whose
 * `next` / `throw` / `return` methods return promises.
 *
 * Reuses an existing `this.__asyncValues` when one is present.
 *
 * @param o  Async-iterable or iterable object.
 * @throws {TypeError} When `Symbol.asyncIterator` is not available.
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
    var m = o[Symbol.asyncIterator], i;
    // Prefer the native async iterator; otherwise build a promise-returning facade over the sync iterator.
    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
    // Expose method `n` on the facade only when the underlying iterator implements it.
    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
    // Wait for the (possibly promise) value before emitting the { value, done } record.
    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
  25. var __importDefault = (this && this.__importDefault) || function (mod) {
  26. return (mod && mod.__esModule) ? mod : { "default": mod };
  27. };
  28. Object.defineProperty(exports, "__esModule", { value: true });
  29. exports.getData = exports.getOffsets = exports.makeRefSeqFromReg = exports.getSymbol = exports.getFromAcc = void 0;
  30. const fs_1 = __importDefault(require("fs"));
  31. const os_1 = __importDefault(require("os"));
  32. const path_1 = __importDefault(require("path"));
  33. const child_process_1 = require("child_process");
  34. const readline_1 = __importDefault(require("readline"));
  35. const buffer_1 = require("buffer");
  36. const genbank_parser_1 = __importDefault(require("genbank-parser"));
  37. const aligner_1 = require("aligner");
  38. const jsonata_1 = __importDefault(require("jsonata"));
  39. const async_exec = (prog, args, onData) => {
  40. return new Promise((resolve, reject) => {
  41. const child = (0, child_process_1.spawn)(prog, args, { shell: true });
  42. child.stdout.on('data', data => onData(data.toString().trim()));
  43. child.stderr.on('data', data => onData(data.toString().trim()));
  44. child.on('error', err => reject(err));
  45. child.on('exit', code => resolve(code));
  46. });
  47. };
  48. const line$ = (path) => readline_1.default.createInterface({
  49. input: fs_1.default.createReadStream(path),
  50. crlfDelay: Infinity
  51. });
  52. const readOffset = (path, from, to) => {
  53. return new Promise((resolve, reject) => __awaiter(void 0, void 0, void 0, function* () {
  54. const size = to - from;
  55. const buffer = buffer_1.Buffer.alloc(size);
  56. let filehandle = null;
  57. try {
  58. filehandle = yield fs_1.default.promises.open(path, 'r+');
  59. yield filehandle.read(buffer, 0, buffer.length, from);
  60. }
  61. finally {
  62. if (filehandle) {
  63. yield filehandle.close();
  64. resolve(buffer.toString());
  65. }
  66. }
  67. }));
  68. };
  69. /*
  70. * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
  71. *
  72. */
  73. const makeGbffIndex = (filePath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
  74. var e_1, _a;
  75. indexPath = indexPath || filePath + '.jsi';
  76. let entries = [];
  77. let lineN = 0;
  78. let byteAcc = 0;
  79. try {
  80. for (var _b = __asyncValues(line$(filePath)), _c; _c = yield _b.next(), !_c.done;) {
  81. const line = _c.value;
  82. if (line.match(/^LOCUS/)) {
  83. entries.push({
  84. filePath,
  85. value: line.split(/\s+/)[1],
  86. from: byteAcc
  87. });
  88. if (lineN !== 0) {
  89. entries[entries.length - 2]["to"] = byteAcc;
  90. yield fs_1.default.promises.appendFile(indexPath, [
  91. entries[entries.length - 2]["value"],
  92. entries[entries.length - 2]["from"],
  93. entries[entries.length - 2]["to"]
  94. ].join('\t') + '\n');
  95. entries = entries.splice(1);
  96. }
  97. }
  98. byteAcc += (line.length + 1);
  99. lineN++;
  100. }
  101. }
  102. catch (e_1_1) { e_1 = { error: e_1_1 }; }
  103. finally {
  104. try {
  105. if (_c && !_c.done && (_a = _b.return)) yield _a.call(_b);
  106. }
  107. finally { if (e_1) throw e_1.error; }
  108. }
  109. entries[entries.length - 1]["to"] = byteAcc;
  110. yield fs_1.default.promises.appendFile(indexPath, [
  111. entries[entries.length - 1]["value"],
  112. entries[entries.length - 1]["from"],
  113. entries[entries.length - 1]["to"]
  114. ].join('\t'));
  115. return entries;
  116. });
  117. const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function* () {
  118. var e_2, _d;
  119. let res;
  120. try {
  121. for (var _e = __asyncValues(line$(indexPath)), _f; _f = yield _e.next(), !_f.done;) {
  122. const line = _f.value;
  123. const tmp = line.split('\t');
  124. if (tmp[0] === acc) {
  125. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]];
  126. break;
  127. }
  128. }
  129. }
  130. catch (e_2_1) { e_2 = { error: e_2_1 }; }
  131. finally {
  132. try {
  133. if (_f && !_f.done && (_d = _e.return)) yield _d.call(_e);
  134. }
  135. finally { if (e_2) throw e_2.error; }
  136. }
  137. return res;
  138. });
  139. const getOffsets = (indexPath, accessions) => __awaiter(void 0, void 0, void 0, function* () {
  140. var e_3, _g;
  141. let res = [];
  142. const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath];
  143. for (const iP of indexPaths) {
  144. try {
  145. for (var _h = (e_3 = void 0, __asyncValues(line$(iP))), _j; _j = yield _h.next(), !_j.done;) {
  146. const line = _j.value;
  147. const tmp = line.split('\t');
  148. if (accessions.test(tmp[0])) {
  149. res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]]);
  150. }
  151. }
  152. }
  153. catch (e_3_1) { e_3 = { error: e_3_1 }; }
  154. finally {
  155. try {
  156. if (_j && !_j.done && (_g = _h.return)) yield _g.call(_h);
  157. }
  158. finally { if (e_3) throw e_3.error; }
  159. }
  160. }
  161. return res;
  162. });
  163. exports.getOffsets = getOffsets;
  164. const getData = (dbPath, accessionRegex, outPath, query) => __awaiter(void 0, void 0, void 0, function* () {
  165. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  166. const allOffsets = yield getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex);
  167. console.log(allOffsets.length + ' entry to parse.');
  168. fs_1.default.promises.appendFile(outPath, '[\n');
  169. for (let index = 0; index < allOffsets.length; index++) {
  170. const offset = allOffsets[index];
  171. const txt = yield readOffset(offset[0], Number(offset[1]), Number(offset[2]));
  172. const json = (0, genbank_parser_1.default)(txt)[0];
  173. const tmp = query ? (0, jsonata_1.default)(query).evaluate(json) : json;
  174. const end = index + 1 === allOffsets.length ? '' : ',';
  175. fs_1.default.promises.appendFile(outPath, JSON.stringify(tmp, null, 4) + end + '\n');
  176. if ((index + 1) % 100 === 0)
  177. console.log('Already ' + (index + 1) + ' sequence parsed');
  178. }
  179. fs_1.default.promises.appendFile(outPath, ']');
  180. return 0;
  181. });
  182. exports.getData = getData;
  183. const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
  184. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  185. if (!indexPath) {
  186. indexPath = yield getJSI(dbPath);
  187. }
  188. else {
  189. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath];
  190. if (indexPath.length !== dbPath.length)
  191. throw 'Error';
  192. }
  193. for (const iP of indexPath) {
  194. const [filePath, from, to] = (yield getOffset(iP, accession)) || [undefined, undefined, undefined];
  195. if (filePath) {
  196. const txt = yield readOffset(filePath, Number(from), Number(to));
  197. return (0, genbank_parser_1.default)(txt)[0];
  198. }
  199. }
  200. return undefined;
  201. });
  202. exports.getFromAcc = getFromAcc;
  203. const getPos = (selector, tablePath, headerTablePath) => __awaiter(void 0, void 0, void 0, function* () {
  204. const results = [];
  205. (yield fs_1.default.promises.readFile(tablePath)).toString().split('\n')
  206. .map((line) => {
  207. const lineObj = line.split('\t').reduce((p, c, i) => (Object.assign(Object.assign({}, p), { [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c })), {});
  208. if (lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]])
  209. results.push(lineObj);
  210. });
  211. return results;
  212. });
  213. const getIndex = (symbol, LRGPath) => __awaiter(void 0, void 0, void 0, function* () {
  214. return (yield fs_1.default.promises.readFile(LRGPath)).toString()
  215. .split('\n')
  216. .filter((line) => line.match(new RegExp(symbol, 'g')))
  217. .reduce((p, c) => {
  218. const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e) => e !== '');
  219. return { GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))] };
  220. }, { GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: [] });
  221. });
  222. const getSymbol = (symbol, LRGPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  223. tablePath, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  224. // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  225. geneDBPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  226. rnaDBPath // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  227. ) => __awaiter(void 0, void 0, void 0, function* () {
  228. const geneIndex = yield getIndex(symbol, LRGPath);
  229. // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  230. // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  231. const headerTablePath = [
  232. 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
  233. 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
  234. 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
  235. 'product_length', 'attributes'
  236. ];
  237. const allFeatures = yield getPos({ symbol }, tablePath, headerTablePath);
  238. for (let index = 0; index < allFeatures.length; index++) {
  239. const { feature, product_accession } = allFeatures[index];
  240. let tmp;
  241. switch (feature) {
  242. case 'gene':
  243. allFeatures[index].product_accession = geneIndex.GeneAcc;
  244. tmp = yield getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath);
  245. // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
  246. allFeatures[index].data = tmp;
  247. break;
  248. case 'mRNA':
  249. tmp = yield getFromAcc(product_accession.split('.')[0], rnaDBPath);
  250. // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
  251. allFeatures[index].data = tmp;
  252. break;
  253. default:
  254. break;
  255. }
  256. }
  257. return allFeatures;
  258. });
  259. exports.getSymbol = getSymbol;
  260. const getJSI = (dbPath) => __awaiter(void 0, void 0, void 0, function* () {
  261. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  262. const indexPath = [];
  263. for (const p of dbPath) {
  264. const iP = p + '.jsi';
  265. if (!fs_1.default.existsSync(iP)) {
  266. console.log('Writing index: ' + iP);
  267. yield makeGbffIndex(p);
  268. }
  269. indexPath.push(iP);
  270. }
  271. return indexPath;
  272. });
  273. // Todo: add progress
  274. const makeRefSeqFromReg = (dbPath, reg, distFile, limit) => __awaiter(void 0, void 0, void 0, function* () {
  275. var e_4, _k;
  276. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  277. const jsiFiles = yield getJSI(dbPath);
  278. const tmpDir = path_1.default.join(os_1.default.tmpdir(), 'parser-' + Math.random());
  279. yield fs_1.default.promises.mkdir(tmpDir);
  280. const createdFiles = [];
  281. let counter = 0;
  282. for (const jsiFile of jsiFiles) {
  283. console.log('reading ' + jsiFile);
  284. try {
  285. for (var _l = (e_4 = void 0, __asyncValues(line$(jsiFile))), _m; _m = yield _l.next(), !_m.done;) {
  286. const line = _m.value;
  287. if (line.match(reg)) {
  288. const [accession, from, to] = line.split('\t');
  289. const res = yield getFromAcc(accession, jsiFile.split('.jsi')[0]);
  290. if (res === null || res === void 0 ? void 0 : res.sequence) {
  291. try {
  292. const file = path_1.default.join(tmpDir, (res === null || res === void 0 ? void 0 : res.version) || res.accession + '.fa');
  293. if (!createdFiles.includes(file)) {
  294. if (createdFiles.length === 0)
  295. if (fs_1.default.existsSync(distFile))
  296. yield fs_1.default.promises.rm(distFile);
  297. yield (0, aligner_1.writeSequence)((res === null || res === void 0 ? void 0 : res.version) || res.accession, res === null || res === void 0 ? void 0 : res.sequence, file);
  298. createdFiles.push(file);
  299. const tmp = yield fs_1.default.promises.readFile(file);
  300. yield fs_1.default.promises.appendFile(distFile, tmp.toString() + '\n');
  301. yield fs_1.default.promises.rm(file);
  302. counter++;
  303. if (counter % 100 === 0)
  304. console.log('Already ' + counter + ' sequence parsed');
  305. }
  306. }
  307. catch (error) {
  308. console.log(error);
  309. }
  310. }
  311. }
  312. if (limit)
  313. if (counter === limit)
  314. break;
  315. }
  316. }
  317. catch (e_4_1) { e_4 = { error: e_4_1 }; }
  318. finally {
  319. try {
  320. if (_m && !_m.done && (_k = _l.return)) yield _k.call(_l);
  321. }
  322. finally { if (e_4) throw e_4.error; }
  323. }
  324. if (limit)
  325. if (counter === limit)
  326. break;
  327. }
  328. console.log(createdFiles.length + ' sequences were extracted');
  329. yield fs_1.default.promises.rm(tmpDir, { recursive: true });
  330. yield async_exec('bwa', ['index', distFile], () => console.log);
  331. });
  332. exports.makeRefSeqFromReg = makeRefSeqFromReg;