| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- // refactor with https://github.com/piscinajs/piscina
- import https from 'https'
- import fs from 'fs'
- // Types
- import type { queue, done } from 'fastq'
/** Work item for the sequence-download queue: one record id plus the NCBI API key sent with the request. */
type Task = {
  id: string;
  NCBI_API: string;
};

/** Work item for the id-search queue: one result page (`from`, `max`) of a query plus the NCBI API key. */
type TaskBis = {
  query: string;
  from: number;
  max: number;
  NCBI_API: string;
};
- ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // Download and save multifasta from query
- ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // save_multifasta_from_ids('Viruses[Organism]+AND+srcdb_refseq[PROP]+AND+vhost_human[Filter]', '/home/thomas/viralHuman.fna')
- // Configs
/** NCBI E-utilities endpoint used to search for record ids. */
const esearch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

/** NCBI E-utilities endpoint used to fetch record contents. */
const efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

/**
 * Matches each single character that is kept verbatim in downloaded text
 * (a fixed ASCII set plus newline and tab); anything else gets substituted.
 * NOTE(review): judging by the name this mirrors the SAM character restriction - confirm.
 */
const regex_sam_restriction = /[>0-9A-Za-z!#$%&+\./:;?@^_|~-]|[\n\t]/g;

/** Maximum time, in milliseconds, the polling loops wait for a queue to drain (5 hours). */
const maxWaiting = 5 * 60 * 60 * 1000
/**
 * "Inverse replace": returns a copy of `string` in which every UTF-16 code unit
 * that does NOT match `regex` is replaced by `by`; matching units are kept as-is.
 *
 * @param regex - Pattern tested against each code unit individually.
 * @param string - Text to sanitise.
 * @param by - Replacement for non-matching units (defaults to `_`).
 * @returns The sanitised text.
 */
const invReplace = (regex: RegExp, string: string, by = '_') => {
  let sanitised = ''
  for (let pos = 0; pos < string.length; pos++) {
    const unit = string.charAt(pos)
    // String#match resets lastIndex for global patterns, so a /g regex is safe to reuse here.
    sanitised += unit.match(regex) === null ? by : unit
  }
  return sanitised
}
- //get_seq_from_id('12175745', (res) => {console.log(res)})
/**
 * Queue worker: downloads one `nuccore` record as FASTA text from the efetch endpoint.
 *
 * The response body is sanitised with `invReplace(regex_sam_restriction, ...)` and
 * handed to `cb` as its FIRST argument (this file's queue callbacks read the payload
 * from that slot).
 *
 * On a request or response-stream error the error is logged and `cb('')` is invoked,
 * so the worker slot is always released; previously a failed request never called
 * `cb`, leaving the queue permanently non-idle.
 *
 * @param arg - Record id and NCBI API key.
 * @param cb - Completion callback, called exactly once with the sanitised text
 *             (empty string when the download failed).
 */
const get_seq_from_id = (arg: Task, cb: Function) => {
  const { id, NCBI_API } = arg
  const url = `${efetch}?db=nuccore&id=${id}&rettype=fasta&retmode=text&api_key=${NCBI_API}`

  // Both the request and the response can emit 'error'; make sure cb fires only once.
  let settled = false
  const finish = (text: string) => {
    if (settled) return
    settled = true
    cb(text)
  }
  const fail = (err: Error) => {
    console.log("Error: " + err.message)
    finish('')
  }

  https.get(url, (resp) => {
    // Decode as a stream so a multi-byte character split across chunks is not corrupted.
    resp.setEncoding('utf8')
    let data = ''
    resp.on('data', (chunk) => { data += chunk })
    resp.on('end', () => { finish(invReplace(regex_sam_restriction, data)) })
    resp.on('error', fail)
  }).on("error", fail)
}
/** Returns a promise that resolves (with no value) once a `setTimeout` of `ms` milliseconds fires. */
const sleep = (ms: number) =>
  new Promise((wake) => {
    setTimeout(wake, ms)
  })
/**
 * Downloads the sequence of every id through a fastq queue running `get_seq_from_id`
 * and concatenates the results (in completion order) into one multi-FASTA string.
 *
 * The queue is polled every 300 ms; whenever the number of waiting tasks changes,
 * `onProgress` is called with a `"done/total"` string.
 *
 * @param ids - Record ids to download.
 * @param NCBI_API - NCBI API key forwarded to every request.
 * @param onProgress - Called with a progress string.
 * @param concurrency - Number of simultaneous downloads (default 2).
 * @returns Promise resolving to the concatenated text with blank lines removed.
 *          Rejects with the string `'timeout'` when the queue has not drained after
 *          `maxWaiting` ms, or with whatever `onProgress`/queue creation throws.
 */
const async_get_multiseq_from_ids = async (
  ids: any[],
  NCBI_API: string,
  onProgress: Function,
  concurrency = 2
): Promise<unknown> => {
  const pollMs = 300
  const q: queue<Task> = require('fastq')(get_seq_from_id, concurrency)

  let data = ''
  for (const id of ids) {
    // The worker delivers its payload in the first callback argument.
    q.push({ id, NCBI_API }, (arg) => { data += arg })
  }

  let currQL = 0
  let timePassed = 0
  while (!q.idle()) {
    await sleep(pollMs)
    timePassed += pollMs
    if (currQL != q.length()) {
      currQL = q.length()
      onProgress(`${ids.length - currQL}/${ids.length}`)
    }
    if (timePassed > maxWaiting) {
      // Drop tasks that have not started yet and stop polling; previously the loop
      // kept spinning after the rejection. Requests already in flight may still finish.
      q.kill()
      // String reason kept as-is for callers that compare against 'timeout'.
      return Promise.reject('timeout')
    }
  }

  // Strip empty / whitespace-only lines between records.
  return data.replace(/(^[ \t]*\n)/gm, '')
}
/**
 * Queue worker: fetches one page of an esearch query on the `nucleotide` database
 * and extracts every `<Id>…</Id>` value from the response.
 *
 * The id list is handed to `cb` as its FIRST argument (this file's queue callbacks
 * read the payload from that slot).
 *
 * On a request or response-stream error the error is logged and `cb([])` is invoked.
 * Previously the request had no `'error'` listener, so a network failure raised an
 * uncaught exception and the queue task never completed.
 *
 * The query is interpolated into the URL as given (not re-encoded), so terms already
 * written with `+` separators keep their meaning.
 *
 * @param arg - Query, page window (`from`, `max`) and NCBI API key.
 * @param cb - Completion callback, called exactly once with the array of id strings
 *             (empty when the request failed).
 */
const get_ids_from_query = (arg: TaskBis, cb: Function) => {
  const { query, from, max, NCBI_API } = arg
  const url = `${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}&retstart=${from}&retmax=${max}`

  // Both the request and the response can emit 'error'; make sure cb fires only once.
  let settled = false
  const finish = (keys: string[]) => {
    if (settled) return
    settled = true
    cb(keys)
  }
  const fail = (err: Error) => {
    console.log("Error: " + err.message)
    finish([])
  }

  https.get(url, (resp) => {
    // Decode as a stream so a multi-byte character split across chunks is not corrupted.
    resp.setEncoding('utf8')
    let data = ''
    resp.on('data', (chunk) => { data += chunk })
    resp.on('end', () => {
      const keys: string[] = []
      for (const found of data.matchAll(/<Id>(\d+)<\/Id>/g)) keys.push(found[1])
      finish(keys)
    })
    resp.on('error', fail)
  }).on("error", fail)
}
/**
 * Collects every record id matching `query` on the `nucleotide` database.
 *
 * A first esearch request reads the total `<Count>`; the ids are then fetched page
 * by page (`retMax` ids per page) through a fastq queue running `get_ids_from_query`.
 * The queue is polled every 300 ms; whenever the number of waiting pages changes,
 * `onProgress` is called with a `"done/total"` string.
 *
 * @param query - esearch term, interpolated into the URL as given.
 * @param onProgress - Called with a progress string.
 * @param NCBI_API - NCBI API key forwarded to every request.
 * @param retMax - Page size (default 20); must be a positive integer.
 * @param concurrency - Number of simultaneous page requests (default 2).
 * @returns Promise resolving to the array of ids (in page-completion order).
 *          Rejects with a `RangeError` for an invalid `retMax`, with the request
 *          error on network failure, with the string `'timeout'` when the queue has
 *          not drained after `maxWaiting` ms, or with an `'Error  <got> <expected>'`
 *          string when fewer/more ids than `<Count>` were collected.
 */
const get_multipage_ids_from_query = (
  query: string,
  onProgress: Function,
  NCBI_API: string,
  retMax = 20,
  concurrency = 2
) => {
  return new Promise((resolve, reject) => {
    // A non-positive page size would yield an infinite or NaN number of pages.
    if (!Number.isInteger(retMax) || retMax <= 0) {
      reject(new RangeError(`retMax must be a positive integer, got ${retMax}`))
      return
    }

    // Pages through all ids once the body of the counting request is available.
    const collectIds = async (body: string): Promise<any[]> => {
      const pollMs = 300
      const found = body.match(/<Count>(\d+)<\/Count>/)
      // A response without <Count> is treated as zero hits.
      const count = found ? parseInt(found[1], 10) : 0
      const nIter = Math.ceil(count / retMax)

      const q: queue<TaskBis> = require('fastq')(get_ids_from_query, concurrency)
      const ids: any[] = []
      // The worker delivers its payload in the first callback argument.
      const callback: done = (arg) => {
        const page: any[] = Array.isArray(arg) ? arg : [arg]
        ids.push(...page)
      }
      for (let index = 0; index < nIter; index++) {
        q.push({ query, from: index * retMax, max: retMax, NCBI_API }, callback)
      }

      let currQL = 0
      let timePassed = 0
      while (!q.idle()) {
        await sleep(pollMs)
        timePassed += pollMs
        if (currQL != q.length()) {
          currQL = q.length()
          onProgress(`${nIter - currQL}/${nIter}`)
        }
        if (timePassed > maxWaiting) {
          // Drop pages that have not started yet and stop polling; previously the
          // loop kept spinning after the rejection.
          q.kill()
          // String reason kept as-is for callers that compare against 'timeout'.
          return Promise.reject('timeout')
        }
      }

      if (ids.length !== count) {
        return Promise.reject(['Error ', ids.length, count].join(' '))
      }
      return ids
    }

    https.get(`${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}`, (resp) => {
      // Decode as a stream so a multi-byte character split across chunks is not corrupted.
      resp.setEncoding('utf8')
      let data = ''
      resp.on('data', (chunk) => { data += chunk })
      // Route every failure of the paging step (including thrown exceptions) to reject
      // instead of losing it inside an async event handler.
      resp.on('end', () => { collectIds(data).then(resolve, reject) })
      resp.on('error', reject)
    }).on("error", (err) => {
      console.log("Error: " + err.message);
      reject(err);
    })
  });
}
/**
 * Runs the whole pipeline for one query: collects the matching ids with
 * `get_multipage_ids_from_query`, downloads their sequences with
 * `async_get_multiseq_from_ids`, and writes the resulting text to `path`.
 *
 * @param query - Search term passed on to the id lookup.
 * @param path - Destination file for the multi-FASTA text.
 * @param NCBI_API - NCBI API key forwarded to every request.
 * @param onProgress - Progress callback forwarded to both stages.
 * @returns Promise resolving to `true` once the file is written. Rejects with the
 *          underlying error of any stage, or with `''` if a stage yields a value of
 *          an unexpected type.
 */
const saveMultifastaFromIds = async (
  query: string,
  path: string,
  NCBI_API: string,
  onProgress: Function
): Promise<unknown> => {
  console.log('Fetching NCBI IDs from : ', query)
  const ids = await get_multipage_ids_from_query(query, onProgress, NCBI_API)
  if (!Array.isArray(ids)) {
    return Promise.reject('')
  }

  console.log(`Fetching NCBI Sequences (${ids.length}) of : `, ids)
  const seq = await async_get_multiseq_from_ids(ids, NCBI_API, onProgress)
  if (typeof seq !== 'string') {
    return Promise.reject('')
  }

  await fs.promises.writeFile(path, seq)
  return true
}
- export { saveMultifastaFromIds }
- // https://linsalrob.github.io/ComputationalGenomicsManual/Databases/NCBI_Edirect.html
- // https://www.ncbi.nlm.nih.gov/books/NBK21091/
- // https://www.ncbi.nlm.nih.gov/books/NBK50679/
- // ""Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]"
- // ""Homo sapiens"[Organism] AND srcdb_refseq[prop] AND biomol_rna[prop] "
- // (async () => {
- // const NCBI_API = '5b283f20e48000e0e9f20874125d4cced808'
- // await saveMultifastaFromIds(
- // '"Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]',
- // '/home/thomas/Human_Transcriptome_RefSeq.fna',
- // NCBI_API,
- // console.log
- // )
- // })()
|