index.js 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. "use strict";
  2. // refactor with https://github.com/piscinajs/piscina
  3. var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
  4. function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
  5. return new (P || (P = Promise))(function (resolve, reject) {
  6. function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
  7. function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
  8. function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
  9. step((generator = generator.apply(thisArg, _arguments || [])).next());
  10. });
  11. };
  12. var __importDefault = (this && this.__importDefault) || function (mod) {
  13. return (mod && mod.__esModule) ? mod : { "default": mod };
  14. };
  15. Object.defineProperty(exports, "__esModule", { value: true });
  16. exports.saveMultifastaFromIds = void 0;
  17. const https_1 = __importDefault(require("https"));
  18. const fs_1 = __importDefault(require("fs"));
  19. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Download and save multifasta from query
  21. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  22. // save_multifasta_from_ids('Viruses[Organism]+AND+srcdb_refseq[PROP]+AND+vhost_human[Filter]', '/home/thomas/viralHuman.fna')
  23. // Configs
// NCBI E-utilities endpoints used below: `esearch` is queried for the record
// count and record IDs of a search term, `efetch` for the FASTA text of one ID.
const esearch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi';
const efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi';
// Whitelist of single characters (plus newline and tab) that survive
// sanitising of downloaded FASTA text; every other character is replaced via
// invReplace. NOTE(review): the name suggests this mirrors the SAM format's
// allowed characters - confirm against the SAM specification.
const regex_sam_restriction = /[>0-9A-Za-z!#$%&+\./:;?@^_|~-]|[\n\t]/g;
// Upper bound, in milliseconds, on the accumulated polling time spent waiting
// for a work queue to drain: 300 * 60 * 1000 ms = 300 minutes (5 hours).
const maxWaiting = 300 * 60 * 1000;
  28. const invReplace = (regex, string, by = '_') => {
  29. return string.split('').map(letter => letter.match(regex) ? letter : by).join('');
  30. };
  31. //get_seq_from_id('12175745', (res) => {console.log(res)})
  32. const get_seq_from_id = (arg, cb) => {
  33. const id = arg.id;
  34. const NCBI_API = arg.NCBI_API;
  35. https_1.default.get(`${efetch}?db=nuccore&id=${id}&rettype=fasta&retmode=text&api_key=${NCBI_API}`, (resp) => {
  36. let data = '';
  37. resp.on('data', (chunk) => { data += chunk; });
  38. resp.on('end', () => {
  39. cb(invReplace(regex_sam_restriction, data));
  40. });
  41. }).on("error", (err) => { console.log("Error: " + err.message); });
  42. };
  43. const sleep = (ms) => { return new Promise(resolve => setTimeout(resolve, ms)); };
  44. const async_get_multiseq_from_ids = (ids, NCBI_API, onProgress, concurrency = 2) => {
  45. return new Promise((resolve, reject) => __awaiter(void 0, void 0, void 0, function* () {
  46. const q = require('fastq')(get_seq_from_id, concurrency);
  47. let data = '';
  48. ids.forEach(id => q.push({ id, NCBI_API }, (arg) => { data += arg; }));
  49. let currQL = 0;
  50. let timePassed = 0;
  51. while (!q.idle()) {
  52. yield sleep(300);
  53. timePassed += 300;
  54. if (currQL != q.length()) {
  55. currQL = q.length();
  56. onProgress(`${ids.length - currQL}/${ids.length}`);
  57. }
  58. if (timePassed > maxWaiting)
  59. reject('timeout');
  60. }
  61. resolve(data.replace(/(^[ \t]*\n)/gm, ''));
  62. }));
  63. };
  64. const get_ids_from_query = (arg, cb) => {
  65. const { query, from, max, NCBI_API } = arg;
  66. const url = `${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}&retstart=${from}&retmax=${max}`;
  67. https_1.default.get(url, (resp) => {
  68. let data = '';
  69. resp.on('data', (chunk) => { data += chunk; });
  70. resp.on('end', () => {
  71. const keysMatches = data.matchAll(/<Id>(\d+)<\/Id>/g);
  72. let keys = [];
  73. for (const key of keysMatches)
  74. keys.push(key[1]);
  75. cb(keys);
  76. });
  77. });
  78. };
  79. // async
  80. const get_multipage_ids_from_query = (query, onProgress, NCBI_API, retMax = 20, concurrency = 2) => {
  81. return new Promise((resolve, reject) => {
  82. https_1.default.get(`${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}`, (resp) => __awaiter(void 0, void 0, void 0, function* () {
  83. let data = '';
  84. resp.on('data', (chunk) => { data += chunk; });
  85. resp.on('end', () => __awaiter(void 0, void 0, void 0, function* () {
  86. const tmp = data.match(/<Count>(\d+)<\/Count>/) || [0, 0];
  87. const count = parseInt(tmp[1]);
  88. const nIter = count % retMax === 0 ? count / retMax : Math.trunc(count / retMax) + 1;
  89. const q = require('fastq')(get_ids_from_query, concurrency);
  90. let ids = [];
  91. const callback = (arg) => {
  92. const tmp = Array.isArray(arg) ? arg : [arg];
  93. ids = [...ids, ...tmp];
  94. };
  95. for (let index = 0; index < nIter; index++) {
  96. q.push({ query, from: (index * retMax), max: retMax, NCBI_API }, callback);
  97. }
  98. let currQL = 0;
  99. let timePassed = 0;
  100. while (!q.idle()) {
  101. yield sleep(300);
  102. timePassed += 300;
  103. if (currQL != q.length()) {
  104. currQL = q.length();
  105. onProgress(`${nIter - currQL}/${nIter}`);
  106. }
  107. if (timePassed > maxWaiting)
  108. reject('timeout');
  109. }
  110. if (ids.length === count) {
  111. resolve(ids);
  112. }
  113. else {
  114. reject(['Error ', ids.length, count].join(' '));
  115. }
  116. }));
  117. })).on("error", (err) => {
  118. console.log("Error: " + err.message);
  119. reject(err);
  120. });
  121. });
  122. };
  123. const saveMultifastaFromIds = (query, path, NCBI_API, onProgress) => {
  124. return new Promise((resolve, reject) => __awaiter(void 0, void 0, void 0, function* () {
  125. try {
  126. console.log('Fetching NCBI IDs from : ', query);
  127. const ids = yield get_multipage_ids_from_query(query, onProgress, NCBI_API);
  128. if (Array.isArray(ids)) {
  129. console.log(`Fetching NCBI Sequences (${ids.length}) of : `, ids);
  130. const seq = yield async_get_multiseq_from_ids(ids, NCBI_API, onProgress);
  131. if (typeof (seq) === 'string') {
  132. yield fs_1.default.promises.writeFile(path, seq);
  133. resolve(true);
  134. }
  135. else {
  136. reject('');
  137. }
  138. }
  139. else {
  140. reject('');
  141. }
  142. }
  143. catch (error) {
  144. reject(error);
  145. }
  146. }));
  147. };
  148. exports.saveMultifastaFromIds = saveMultifastaFromIds;
  149. // https://linsalrob.github.io/ComputationalGenomicsManual/Databases/NCBI_Edirect.html
  150. // https://www.ncbi.nlm.nih.gov/books/NBK21091/
  151. // https://www.ncbi.nlm.nih.gov/books/NBK50679/
  152. // ""Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]"
  153. // ""Homo sapiens"[Organism] AND srcdb_refseq[prop] AND biomol_rna[prop] "
  154. // (async () => {
  155. // const NCBI_API = '5b283f20e48000e0e9f20874125d4cced808'
  156. // await saveMultifastaFromIds(
  157. // '"Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]',
  158. // '/home/thomas/Human_Transcriptome_RefSeq.fna',
  159. // NCBI_API,
  160. // console.log
  161. // )
  162. // })()