| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- "use strict";
- // refactor with https://github.com/piscinajs/piscina
// TypeScript-emitted helper that runs a generator as if it were an async
// function: every yielded value is awaited and its outcome is fed back into
// the generator, and the returned promise settles with the generator's result.
//   thisArg     - `this` value used when starting the generator function
//   _arguments  - argument list passed to the generator function (may be absent)
//   P           - promise constructor to use; defaults to the native Promise
//   generator   - the generator function to drive
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Fall back to the built-in Promise when no constructor was supplied.
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so that whatever was yielded can be chained with `.then`.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    return new P((resolve, reject) => {
        // Either finish with the generator's return value or wait for the
        // yielded value and resume the generator with its outcome.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        // Instantiate the generator (note: `generator` is rebound to the iterator) and take the first step.
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted interop helper: a module flagged with `__esModule` is
// returned untouched, anything else is wrapped as `{ default: mod }` so that
// callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { default: mod };
};
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.saveMultifastaFromIds = void 0;
- const https_1 = __importDefault(require("https"));
- const fs_1 = __importDefault(require("fs"));
- ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // Download and save multifasta from query
- ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // save_multifasta_from_ids('Viruses[Organism]+AND+srcdb_refseq[PROP]+AND+vhost_human[Filter]', '/home/thomas/viralHuman.fna')
- // Configs
// Overall time budget for one batch of queued requests: 300 minutes, in milliseconds.
const maxWaiting = 5 * 60 * 60 * 1000;
// Single characters that are kept when a downloaded record is cleaned with
// invReplace; every other character is replaced.
// NOTE(review): presumably the character set allowed in SAM reference names - confirm.
const regex_sam_restriction = /[>0-9A-Za-z!#$%&+\./:;?@^_|~-]|[\n\t]/g;
// NCBI E-utilities endpoints: esearch resolves a query to record ids, efetch
// downloads the records themselves.
const efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi';
const esearch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi';
/**
 * "Inverse replace": keep every UTF-16 code unit of `string` that matches
 * `regex` and substitute `by` for every one that does not.
 *
 * @param {RegExp} regex - pattern tested against each single character.
 * @param {string} string - text to clean.
 * @param {string} [by='_'] - replacement for non-matching characters.
 * @returns {string} the cleaned text, one output item per input code unit.
 */
const invReplace = (regex, string, by = '_') => {
    const kept = [];
    for (const unit of string.split('')) {
        // String#match is used on purpose: unlike RegExp#test it does not
        // carry `lastIndex` state between calls when the pattern is global.
        if (unit.match(regex)) {
            kept.push(unit);
        }
        else {
            kept.push(by);
        }
    }
    return kept.join('');
};
/**
 * fastq worker: download one nuccore record as FASTA text and sanitise it.
 *
 * Every character of the response body that is not matched by
 * `regex_sam_restriction` is replaced with `_` (see `invReplace`).
 * Failures (socket error, stream error, non-200 status) are reported through
 * the callback instead of being only logged, so the queue can never stall on
 * a request that will not complete.
 *
 * @param {{id: string, NCBI_API: string}} arg - record id and NCBI API key.
 * @param {(err: Error|null, seq?: string) => void} cb - Node-style completion
 *   callback, invoked exactly once.
 * @example
 * get_seq_from_id({ id: '12175745', NCBI_API }, (err, seq) => { console.log(err || seq); });
 */
const get_seq_from_id = (arg, cb) => {
    const { id, NCBI_API } = arg;
    // A request can fail at several points; make sure the queue is notified
    // once and only once.
    let finished = false;
    const done = (err, seq) => {
        if (finished) {
            return;
        }
        finished = true;
        cb(err, seq);
    };
    https_1.default.get(`${efetch}?db=nuccore&id=${id}&rettype=fasta&retmode=text&api_key=${NCBI_API}`, (resp) => {
        if (resp.statusCode !== 200) {
            // Drain the body so the socket is released, and do not let an
            // error payload end up inside the FASTA output.
            resp.resume();
            done(new Error(`efetch for id ${id} failed with HTTP status ${resp.statusCode}`));
            return;
        }
        // Decode as a stream so multi-byte characters split across chunks are not corrupted.
        resp.setEncoding('utf8');
        let data = '';
        resp.on('data', (chunk) => { data += chunk; });
        resp.on('error', (err) => { done(err); });
        resp.on('end', () => { done(null, invReplace(regex_sam_restriction, data)); });
    }).on("error", (err) => {
        console.log("Error: " + err.message);
        done(err);
    });
};
/**
 * Resolve (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => { return new Promise(resolve => setTimeout(resolve, ms)); };
/**
 * Download the FASTA records for `ids` through a fastq queue and return them
 * as one multi-FASTA string with blank lines removed.
 *
 * Records appear in the same order as `ids`, independent of the order in
 * which the downloads finish.
 *
 * @param {string[]} ids - nuccore record ids.
 * @param {string} NCBI_API - NCBI API key.
 * @param {(progress: string) => void} [onProgress] - called with
 *   "<completed>/<total>" after each record completes.
 * @param {number} [concurrency=2] - number of simultaneous downloads.
 * @returns {Promise<string>} the concatenated records. Rejects with the first
 *   download error, with an error thrown by `onProgress`, or with the string
 *   'timeout' when `maxWaiting` milliseconds elapse first.
 */
const async_get_multiseq_from_ids = (ids, NCBI_API, onProgress, concurrency = 2) => {
    return new Promise((resolve, reject) => {
        const total = ids.length;
        if (total === 0) {
            resolve('');
            return;
        }
        const q = require('fastq')(get_seq_from_id, concurrency);
        const sequences = new Array(total);
        let completed = 0;
        let settled = false;
        let timer = null;
        // Settle with a failure once: stop the clock and drop tasks that have not started yet.
        const fail = (reason) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            q.kill();
            reject(reason);
        };
        timer = setTimeout(() => { fail('timeout'); }, maxWaiting);
        ids.forEach((id, index) => {
            q.push({ id, NCBI_API }, (err, seq) => {
                if (settled) {
                    // A late answer after a failure or timeout is ignored.
                    return;
                }
                if (err) {
                    fail(err);
                    return;
                }
                sequences[index] = seq;
                completed += 1;
                if (typeof onProgress === 'function') {
                    try {
                        onProgress(`${completed}/${total}`);
                    }
                    catch (progressError) {
                        fail(progressError);
                        return;
                    }
                }
                if (completed === total) {
                    settled = true;
                    clearTimeout(timer);
                    resolve(sequences.join('').replace(/(^[ \t]*\n)/gm, ''));
                }
            });
        });
    });
};
// Private helper: GET `url` and hand the complete body, decoded as UTF-8, to
// cb(err, body). The callback is invoked exactly once; a non-200 status is an error.
const fetch_esearch_xml = (url, cb) => {
    let finished = false;
    const done = (err, body) => {
        if (finished) {
            return;
        }
        finished = true;
        cb(err, body);
    };
    https_1.default.get(url, (resp) => {
        if (resp.statusCode !== 200) {
            // Drain the body so the socket is released.
            resp.resume();
            done(new Error(`esearch request failed with HTTP status ${resp.statusCode}`));
            return;
        }
        // Decode as a stream so multi-byte characters split across chunks are not corrupted.
        resp.setEncoding('utf8');
        let data = '';
        resp.on('data', (chunk) => { data += chunk; });
        resp.on('error', (err) => { done(err); });
        resp.on('end', () => { done(null, data); });
    }).on("error", (err) => { done(err); });
};
/**
 * fastq worker: fetch one page of an esearch result and extract its ids.
 *
 * `query` is interpolated into the URL as given (callers may pass
 * pre-encoded terms such as `a+AND+b`), so it is deliberately not re-encoded.
 *
 * @param {{query: string, from: number, max: number, NCBI_API: string}} arg -
 *   search term, `retstart` offset, `retmax` page size and NCBI API key.
 * @param {(err: Error|null, ids?: string[]) => void} cb - Node-style
 *   completion callback receiving the contents of every `<Id>` element.
 */
const get_ids_from_query = (arg, cb) => {
    const { query, from, max, NCBI_API } = arg;
    const url = `${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}&retstart=${from}&retmax=${max}`;
    fetch_esearch_xml(url, (err, body) => {
        if (err) {
            cb(err);
            return;
        }
        const keys = Array.from(body.matchAll(/<Id>(\d+)<\/Id>/g), (match) => match[1]);
        cb(null, keys);
    });
};
/**
 * Collect every id matching `query` by reading the total `<Count>` from a
 * first esearch request and then fetching all pages through a fastq queue.
 *
 * Ids are returned in page order, independent of the order in which the page
 * requests finish.
 *
 * @param {string} query - esearch term, interpolated into the URL as given.
 * @param {(progress: string) => void} [onProgress] - called with
 *   "<pages done>/<pages total>" after each page completes.
 * @param {string} NCBI_API - NCBI API key.
 * @param {number} [retMax=20] - page size; must be a positive integer.
 * @param {number} [concurrency=2] - number of simultaneous page requests.
 * @returns {Promise<string[]>} all ids. Rejects with a RangeError for an
 *   invalid `retMax`, with the first request error, with the string
 *   'timeout' after `maxWaiting` milliseconds, or with an "Error <got> <expected>"
 *   string when the number of collected ids differs from `<Count>`.
 */
const get_multipage_ids_from_query = (query, onProgress, NCBI_API, retMax = 20, concurrency = 2) => {
    return new Promise((resolve, reject) => {
        // A zero or negative page size would make the page count infinite.
        if (!Number.isInteger(retMax) || retMax <= 0) {
            reject(new RangeError(`retMax must be a positive integer, got ${retMax}`));
            return;
        }
        fetch_esearch_xml(`${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}`, (err, body) => {
            if (err) {
                console.log("Error: " + err.message);
                reject(err);
                return;
            }
            try {
                // Only the first <Count> element is used; a response without one counts as zero hits.
                const countMatch = body.match(/<Count>(\d+)<\/Count>/);
                const count = countMatch ? Number.parseInt(countMatch[1], 10) : 0;
                const nIter = Math.ceil(count / retMax);
                if (nIter === 0) {
                    resolve([]);
                    return;
                }
                const q = require('fastq')(get_ids_from_query, concurrency);
                const pages = new Array(nIter);
                let completed = 0;
                let settled = false;
                let timer = null;
                // Settle with a failure once: stop the clock and drop pages that have not started yet.
                const fail = (reason) => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    clearTimeout(timer);
                    q.kill();
                    reject(reason);
                };
                timer = setTimeout(() => { fail('timeout'); }, maxWaiting);
                for (let index = 0; index < nIter; index++) {
                    q.push({ query, from: (index * retMax), max: retMax, NCBI_API }, (pageErr, keys) => {
                        if (settled) {
                            // A late answer after a failure or timeout is ignored.
                            return;
                        }
                        if (pageErr) {
                            fail(pageErr);
                            return;
                        }
                        pages[index] = keys;
                        completed += 1;
                        if (typeof onProgress === 'function') {
                            try {
                                onProgress(`${completed}/${nIter}`);
                            }
                            catch (progressError) {
                                fail(progressError);
                                return;
                            }
                        }
                        if (completed < nIter) {
                            return;
                        }
                        settled = true;
                        clearTimeout(timer);
                        const ids = pages.flat();
                        if (ids.length === count) {
                            resolve(ids);
                        }
                        else {
                            reject(['Error ', ids.length, count].join(' '));
                        }
                    });
                }
            }
            catch (setupError) {
                // Runs inside an event callback: without this the promise would never settle.
                reject(setupError);
            }
        });
    });
};
/**
 * Run an NCBI nucleotide search, download every matching record as FASTA and
 * write the concatenated result to `path`.
 *
 * Written as a native async function (the file already requires a runtime
 * with `String#matchAll` and `fs.promises`), so any error thrown along the
 * way rejects the returned promise instead of being lost inside a Promise
 * executor.
 *
 * @param {string} query - esearch term.
 * @param {string} path - destination file for the multi-FASTA text.
 * @param {string} NCBI_API - NCBI API key.
 * @param {(progress: string) => void} onProgress - progress callback handed
 *   to both the id search and the sequence download.
 * @returns {Promise<boolean>} resolves to `true` once the file is written;
 *   rejects with the underlying search, download or write error.
 */
const saveMultifastaFromIds = async (query, path, NCBI_API, onProgress) => {
    console.log('Fetching NCBI IDs from : ', query);
    const ids = await get_multipage_ids_from_query(query, onProgress, NCBI_API);
    if (!Array.isArray(ids)) {
        // Previously rejected with an empty string, which gave callers nothing to act on.
        throw new TypeError('NCBI id search did not return an array of ids');
    }
    console.log(`Fetching NCBI Sequences (${ids.length}) of : `, ids);
    const seq = await async_get_multiseq_from_ids(ids, NCBI_API, onProgress);
    if (typeof seq !== 'string') {
        throw new TypeError('NCBI sequence download did not return FASTA text');
    }
    await fs_1.default.promises.writeFile(path, seq);
    return true;
};
- exports.saveMultifastaFromIds = saveMultifastaFromIds;
- // https://linsalrob.github.io/ComputationalGenomicsManual/Databases/NCBI_Edirect.html
- // https://www.ncbi.nlm.nih.gov/books/NBK21091/
- // https://www.ncbi.nlm.nih.gov/books/NBK50679/
- // ""Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]"
- // ""Homo sapiens"[Organism] AND srcdb_refseq[prop] AND biomol_rna[prop] "
- // (async () => {
- // const NCBI_API = '5b283f20e48000e0e9f20874125d4cced808'
- // await saveMultifastaFromIds(
- // '"Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]',
- // '/home/thomas/Human_Transcriptome_RefSeq.fna',
- // NCBI_API,
- // console.log
- // )
- // })()
|