index.ts 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. // refactor with https://github.com/piscinajs/piscina
  2. import https from 'https'
  3. import fs from 'fs'
  4. // Types
  5. import type { queue, done } from 'fastq'
  /** Work item for the sequence queue: one record id plus the NCBI API key used to fetch it. */
  type Task = {
    id: string
    NCBI_API: string
  }
  /** Work item for the id-paging queue: one esearch page of `query`, starting at offset `from` and holding at most `max` ids. */
  type TaskBis = {
    query: string
    from: number
    max: number
    NCBI_API: string
  }
  8. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  9. // Download and save multifasta from query
  10. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  11. // Example: saveMultifastaFromIds('Viruses[Organism]+AND+srcdb_refseq[PROP]+AND+vhost_human[Filter]', '/home/thomas/viralHuman.fna', NCBI_API, console.log)
  12. // Configs
  // Upper bound on how long a queue is polled before giving up: 300 minutes, in milliseconds.
  const maxWaiting = 300 * 60 * 1000
  // Whitelist of characters kept in downloaded FASTA text; invReplace swaps everything else for a filler.
  // NOTE(review): the name suggests this mirrors the SAM format's character restrictions — confirm.
  const regex_sam_restriction = /[>0-9A-Za-z!#$%&+\./:;?@^_|~-]|[\n\t]/g;
  // NCBI E-utilities endpoints: efetch downloads records, esearch resolves a query to record ids.
  const efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
  const esearch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
  /**
   * Inverse replace: keeps every UTF-16 code unit of `string` that `regex`
   * matches and substitutes `by` for every one it does not.
   *
   * @param regex  pattern tested against each single code unit
   * @param string text to filter
   * @param by     filler written in place of non-matching units (default `'_'`)
   * @returns the filtered text
   */
  const invReplace = (regex: RegExp, string: string, by = '_') => {
    let filtered = ''
    for (let i = 0; i < string.length; i++) {
      const unit = string.charAt(i)
      // String.match is used rather than RegExp.test: test() on a global regex is stateful via lastIndex.
      filtered += unit.match(regex) === null ? by : unit
    }
    return filtered
  }
  20. // Example: get_seq_from_id({id: '12175745', NCBI_API}, (res) => {console.log(res)})
  /**
   * Queue worker: downloads one nuccore record as FASTA text through efetch and
   * hands the text to `cb` after replacing every character outside
   * `regex_sam_restriction` with `_`.
   *
   * `cb` is invoked exactly once on every path, so the queue slot running this
   * worker is always released. The text is passed as the FIRST callback
   * argument because that is what the code pushing tasks in this file reads.
   * For the same reason a failed download (request error, HTTP status >= 400,
   * response error, or a connection closed mid-body) is logged and delivered as
   * an empty string instead of an error object.
   *
   * @param arg record id and NCBI API key
   * @param cb  completion callback, called with the sanitised FASTA text (or `''` on failure)
   */
  const get_seq_from_id = (arg: Task, cb: Function) => {
    const { id, NCBI_API } = arg
    let settled = false
    // Several failure events can fire for one request; only the first outcome is reported.
    const finish = (text: string) => {
      if (settled) return
      settled = true
      cb(text)
    }
    const fail = (message: string) => {
      console.log("Error: " + message)
      finish('')
    }
    https.get(`${efetch}?db=nuccore&id=${id}&rettype=fasta&retmode=text&api_key=${NCBI_API}`, (resp) => {
      if ((resp.statusCode ?? 0) >= 400) {
        // An error body is not sequence data; drain it so the socket is freed and report the failure.
        resp.resume()
        fail(`efetch returned HTTP ${resp.statusCode} for id ${id}`)
        return
      }
      // Decode as a stream so a multi-byte character split across two chunks is not corrupted.
      resp.setEncoding('utf8')
      let data = ''
      resp.on('data', (chunk) => { data += chunk })
      resp.on('end', () => {
        finish(invReplace(regex_sam_restriction, data))
      })
      resp.on('error', (err) => { fail(err.message) })
      resp.on('close', () => {
        // A close before the message is complete means the body was truncated and 'end' may never fire.
        if (!resp.complete) fail(`connection closed before the response for id ${id} was complete`)
      })
    }).on("error", (err) => { fail(err.message) })
  }
  /** Returns a promise that settles (with no value) once `ms` milliseconds have elapsed. */
  const sleep = (ms: number) =>
    new Promise(wake => {
      setTimeout(wake, ms)
    })
  /**
   * Downloads the FASTA record of every id through a fastq queue running
   * `get_seq_from_id`, and returns all records concatenated with blank lines
   * removed.
   *
   * Records are stored per input position, so the output follows the order of
   * `ids` regardless of which download finishes first.
   *
   * @param ids         record ids to download
   * @param NCBI_API    NCBI API key forwarded to every task
   * @param onProgress  called with a `"<n>/<total>"` string whenever the number of still-queued tasks changes
   * @param concurrency number of simultaneous downloads (default 2)
   * @returns the concatenated multi-FASTA text
   * @throws the string `'timeout'` when the queue is still busy after `maxWaiting` ms;
   *         tasks that have not started yet are dropped at that point
   */
  const async_get_multiseq_from_ids = async (
    ids: any[],
    NCBI_API: string,
    onProgress: Function,
    concurrency = 2
  ) => {
    const pollInterval = 300
    const q: queue<Task> = require('fastq')(get_seq_from_id, concurrency)
    const records: string[] = new Array(ids.length).fill('')
    ids.forEach((id, slot) => {
      // get_seq_from_id hands its text over as the first callback argument.
      q.push({id, NCBI_API}, (arg) => { records[slot] = `${arg}` })
    })
    let currQL = 0
    let timePassed = 0
    while (!q.idle()) {
      await sleep(pollInterval)
      timePassed += pollInterval
      if (currQL !== q.length()) {
        currQL = q.length()
        onProgress(`${ids.length - currQL}/${ids.length}`)
      }
      if (timePassed > maxWaiting) {
        // Stop for good: drop the tasks still waiting and leave the polling loop.
        q.kill()
        // Kept as a bare string so existing handlers that compare against 'timeout' keep working.
        throw 'timeout'
      }
    }
    return records.join('').replace(/(^[ \t]*\n)/gm, '')
  }
  /**
   * Queue worker: fetches one esearch result page for `query` (offset `from`,
   * page size `max`) and passes the ids found in `<Id>` elements to `cb` as an
   * array.
   *
   * `cb` is invoked exactly once on every path, so the queue slot running this
   * worker is always released. The ids are passed as the FIRST callback
   * argument because that is what the code pushing tasks in this file reads.
   * A failed request or truncated response is logged and reported as an empty page.
   *
   * @param arg page description and NCBI API key
   * @param cb  completion callback, called with the array of id strings
   */
  const get_ids_from_query = (arg: TaskBis, cb: Function) => {
    const {query, from, max, NCBI_API} = arg
    // `query` is interpolated verbatim: callers may already use '+' as the E-utilities space separator.
    const url = `${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}&retstart=${from}&retmax=${max}`
    let settled = false
    // Several failure events can fire for one request; only the first outcome is reported.
    const finish = (keys: string[]) => {
      if (settled) return
      settled = true
      cb(keys)
    }
    const fail = (message: string) => {
      console.log("Error: " + message)
      finish([])
    }
    https.get(url, (resp) => {
      // Decode as a stream so a multi-byte character split across two chunks is not corrupted.
      resp.setEncoding('utf8')
      let data = ''
      resp.on('data', (chunk) => { data += chunk })
      resp.on('end', () => {
        const keys: string[] = []
        for (const key of data.matchAll(/<Id>(\d+)<\/Id>/g)) keys.push(key[1])
        finish(keys)
      })
      resp.on('error', (err) => { fail(err.message) })
      resp.on('close', () => {
        // A close before the message is complete means the body was truncated and 'end' may never fire.
        if (!resp.complete) fail(`connection closed before esearch page ${from} was complete`)
      })
    }).on("error", (err) => { fail(err.message) })
  }
  71. // async
  /**
   * Resolves an esearch query to the full list of matching nucleotide ids.
   *
   * A first request is made only to read the total hit count (`<Count>`); the
   * ids are then fetched page by page (`retMax` per page) through a fastq queue
   * running `get_ids_from_query`.
   *
   * @param query       esearch term, interpolated verbatim into the URL
   * @param onProgress  called with a `"<n>/<pages>"` string whenever the number of still-queued pages changes
   * @param NCBI_API    NCBI API key
   * @param retMax      ids per page; must be a positive integer (default 20)
   * @param concurrency number of simultaneous page requests (default 2)
   * @returns a promise resolving to the array of ids. It rejects with:
   *          - an `Error` for an invalid `retMax` or a failed or truncated first request;
   *          - the string `'timeout'` when paging is still running after `maxWaiting` ms;
   *          - an `'Error  <got> <expected>'` string when the number of ids collected differs from the reported count.
   */
  const get_multipage_ids_from_query = (
    query: string,
    onProgress: Function,
    NCBI_API: string,
    retMax = 20,
    concurrency = 2
  ) => {
    // Fetches every page needed for `count` hits and returns the collected ids.
    const collectPages = async (count: number) => {
      const nIter = Math.ceil(count / retMax)
      const q: queue<TaskBis> = require('fastq')(get_ids_from_query, concurrency)
      const ids: any[] = []
      const callback: done = (arg) => {
        // get_ids_from_query passes the page's ids as the first callback argument.
        const page = Array.isArray(arg) ? arg : [arg]
        // Append in place; rebuilding the whole array per page is quadratic in the page count.
        for (const id of page) ids.push(id)
      }
      for (let index = 0; index < nIter; index++) {
        q.push({query, from: (index * retMax), max: retMax, NCBI_API}, callback)
      }
      let currQL = 0
      let timePassed = 0
      while (!q.idle()) {
        await sleep(300)
        timePassed += 300
        if (currQL !== q.length()) {
          currQL = q.length()
          onProgress(`${nIter - currQL}/${nIter}`)
        }
        if (timePassed > maxWaiting) {
          // Stop for good: drop the pages still waiting and leave the polling loop.
          q.kill()
          // Kept as a bare string so existing handlers that compare against 'timeout' keep working.
          throw 'timeout'
        }
      }
      if (ids.length !== count) throw ['Error ', ids.length, count].join(' ')
      return ids
    }
    return new Promise((resolve, reject) => {
      if (!Number.isInteger(retMax) || retMax < 1) {
        // A zero or fractional page size would produce an unbounded or malformed page loop.
        reject(new Error(`retMax must be a positive integer, got ${retMax}`))
        return
      }
      https.get(`${esearch}?db=nucleotide&term=${query}&usehistory=y&api_key=${NCBI_API}`, (resp) => {
        // Decode as a stream so a multi-byte character split across two chunks is not corrupted.
        resp.setEncoding('utf8')
        let data = ''
        resp.on('data', (chunk) => { data += chunk })
        resp.on('end', () => {
          // The first <Count> element is used; a response without one is treated as zero hits.
          const found = data.match(/<Count>(\d+)<\/Count>/)
          const count = found ? parseInt(found[1], 10) : 0
          // Route every failure inside the paging step into this promise's rejection.
          collectPages(count).then(resolve, reject)
        })
        resp.on('error', reject)
        resp.on('close', () => {
          // No-op once the promise has settled; otherwise the body was truncated.
          if (!resp.complete) reject(new Error('connection closed before the esearch count response was complete'))
        })
      }).on("error", (err) => {
        console.log("Error: " + err.message);
        reject(err);
      })
    });
  }
  /**
   * Resolves `query` to NCBI ids, downloads the FASTA record of each id and
   * writes the concatenated text to `path`.
   *
   * @param query      esearch term
   * @param path       destination file
   * @param NCBI_API   NCBI API key
   * @param onProgress progress callback forwarded to both the id lookup and the sequence download
   * @returns a promise resolving to `true` once the file is written. It rejects with
   *          whatever the lookup, download or write step rejected or threw, or with `''`
   *          if a step yields a value of an unexpected type.
   */
  const saveMultifastaFromIds = (
    query: string,
    path: string,
    NCBI_API: string,
    onProgress: Function
  ) => {
    return new Promise(async (resolve, reject) => {
      try {
        console.log('Fetching NCBI IDs from : ', query)
        const foundIds = await get_multipage_ids_from_query(query, onProgress, NCBI_API)
        if (!Array.isArray(foundIds)) {
          reject('')
          return
        }
        console.log(`Fetching NCBI Sequences (${foundIds.length}) of : `, foundIds)
        const multifasta = await async_get_multiseq_from_ids(foundIds, NCBI_API, onProgress)
        if (typeof multifasta !== 'string') {
          reject('')
          return
        }
        await fs.promises.writeFile(path, multifasta)
        resolve(true)
      } catch (error) {
        reject(error)
      }
    })
  }
  140. export { saveMultifastaFromIds }
  141. // https://linsalrob.github.io/ComputationalGenomicsManual/Databases/NCBI_Edirect.html
  142. // https://www.ncbi.nlm.nih.gov/books/NBK21091/
  143. // https://www.ncbi.nlm.nih.gov/books/NBK50679/
  144. // ""Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]"
  145. // ""Homo sapiens"[Organism] AND srcdb_refseq[prop] AND biomol_rna[prop] "
  146. // (async () => {
  147. // const NCBI_API = '<YOUR_NCBI_API_KEY>' // placeholder: load the real key from the environment, never commit it
  148. // await saveMultifastaFromIds(
  149. // '"Homo sapiens"[Organism] AND biomol_transcribed_rna[PROP] AND refseq[filter]',
  150. // '/home/thomas/Human_Transcriptome_RefSeq.fna',
  151. // NCBI_API,
  152. // console.log
  153. // )
  154. // })()