index.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  4. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  8. import fs from 'fs'
  9. import os from 'os'
  10. import path from 'path'
  11. import { spawn } from 'child_process'
  12. import readline from 'readline'
  13. import { Buffer } from 'buffer'
  14. import genbankParser from 'genbank-parser'
  15. import { writeSequence } from 'aligner'
  16. const async_exec = (prog: string, args: string[], onData: Function) => {
  17. return new Promise((resolve, reject) => {
  18. const child = spawn(prog, args, {shell: true})
  19. child.stdout.on('data', data => onData(data.toString().trim()))
  20. child.stderr.on('data', data => onData(data.toString().trim()))
  21. child.on('error', err => reject(err))
  22. child.on('exit', code => resolve(code))
  23. })
  24. }
  25. const line$ = (path: string) => readline.createInterface({
  26. input: fs.createReadStream(path),
  27. crlfDelay: Infinity
  28. });
  29. const readOffset = (path: string, from:number, to:number) => {
  30. return new Promise<string>(async (resolve, reject) => {
  31. const size = to - from
  32. const buffer = Buffer.alloc(size);
  33. let filehandle = null;
  34. try {
  35. filehandle = await fs.promises.open(path, 'r+');
  36. await filehandle.read(buffer, 0, buffer.length, from);
  37. } finally {
  38. if (filehandle) {
  39. await filehandle.close()
  40. resolve(buffer.toString())
  41. }
  42. }
  43. })
  44. }
  45. /*
  46. * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
  47. *
  48. */
  49. const makeGbffIndex = async (filePath: string, indexPath?: string) => {
  50. interface entry {
  51. filePath: string;
  52. value : string;
  53. from : number;
  54. to ?: number;
  55. }
  56. indexPath = indexPath || filePath + '.jsi'
  57. let entries = [] as entry[]
  58. let lineN = 0
  59. let byteAcc = 0
  60. for await (const line of line$(filePath)) {
  61. if(line.match(/^LOCUS/)) {
  62. entries.push({
  63. filePath,
  64. value : line.split(/\s+/)[1],
  65. from : byteAcc
  66. })
  67. if(lineN !== 0) {
  68. entries[entries.length - 2]["to"] = byteAcc
  69. await fs.promises.appendFile(indexPath, [
  70. entries[entries.length - 2]["value"],
  71. entries[entries.length - 2]["from"],
  72. entries[entries.length - 2]["to"]].join('\t') + '\n')
  73. entries = entries.splice(1)
  74. }
  75. }
  76. byteAcc += (line.length + 1)
  77. lineN++
  78. }
  79. entries[entries.length - 1]["to"] = byteAcc
  80. await fs.promises.appendFile(indexPath, [
  81. entries[entries.length - 1]["value"],
  82. entries[entries.length - 1]["from"],
  83. entries[entries.length - 1]["to"]].join('\t'))
  84. return entries
  85. }
  86. const getOffset = async (indexPath: string, acc: string) => {
  87. let res
  88. for await (const line of line$(indexPath)) {
  89. const tmp = line.split('\t')
  90. if (tmp[0] === acc) {
  91. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
  92. break
  93. }
  94. }
  95. return res
  96. }
  97. const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
  98. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  99. if (!indexPath) {
  100. indexPath = await getJSI(dbPath)
  101. } else {
  102. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
  103. if (indexPath.length !== dbPath.length) throw 'Error'
  104. }
  105. for (const iP of indexPath) {
  106. const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined]
  107. if (filePath) {
  108. const txt = await readOffset(filePath, Number(from), Number(to))
  109. return genbankParser(txt)[0]
  110. }
  111. }
  112. return undefined
  113. }
  114. interface selector {[key: string]: string}
  115. const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => {
  116. const results:any = [];
  117. (await fs.promises.readFile(tablePath)).toString().split('\n')
  118. .map((line:string) => {
  119. const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any)
  120. if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj)
  121. })
  122. return results
  123. }
  124. const getIndex = async (symbol:string, LRGPath:string) => {
  125. interface geneIndex {
  126. GeneID : string;
  127. GeneAcc : string;
  128. TranscriptsAcc: string[];
  129. ProteinAcc : string[]
  130. }
  131. return (await fs.promises.readFile(LRGPath)).toString()
  132. .split('\n')
  133. .filter((line:string) => line.match(new RegExp(symbol, 'g')))
  134. .reduce((p:any,c:any) => {
  135. const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='')
  136. return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]}
  137. }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex)
  138. }
  139. const getSymbol = async (
  140. symbol:string,
  141. LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  142. tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  143. // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  144. geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  145. rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  146. ) => {
  147. const geneIndex = await getIndex(symbol, LRGPath)
  148. // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  149. // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  150. const headerTablePath = [
  151. 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
  152. 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
  153. 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
  154. 'product_length','attributes'
  155. ]
  156. const allFeatures = await getPos({symbol}, tablePath, headerTablePath)
  157. for (let index = 0; index < allFeatures.length; index++) {
  158. const {feature, product_accession} = allFeatures[index]
  159. let tmp
  160. switch (feature) {
  161. case 'gene':
  162. allFeatures[index].product_accession = geneIndex.GeneAcc
  163. tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
  164. // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
  165. allFeatures[index].data = tmp
  166. break;
  167. case 'mRNA':
  168. tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
  169. // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
  170. allFeatures[index].data = tmp
  171. break;
  172. default:
  173. break;
  174. }
  175. }
  176. return allFeatures
  177. }
  178. const getJSI = async (dbPath: string | string[]) => {
  179. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  180. const indexPath: string[] = []
  181. for (const p of dbPath) {
  182. const iP = p + '.jsi'
  183. if (!fs.existsSync(iP)) {
  184. console.log('Writing index: ' + iP);
  185. await makeGbffIndex(p)
  186. }
  187. indexPath.push(iP)
  188. }
  189. return indexPath
  190. }
  191. // Todo: add progress
  192. const makeRefSeqFromReg = async (
  193. dbPath: string | string[], reg: RegExp, distFile:string, limit?: number
  194. ) => {
  195. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  196. const jsiFiles = await getJSI(dbPath)
  197. const tmpDir = path.join(os.tmpdir(), 'parser-' + Math.random())
  198. await fs.promises.mkdir(tmpDir)
  199. const createdFiles: string[] = []
  200. let counter = 0
  201. for (const jsiFile of jsiFiles) {
  202. console.log('reading ' + jsiFile)
  203. for await (const line of line$(jsiFile)) {
  204. if(line.match(reg)) {
  205. const [accession, from, to] = line.split('\t')
  206. const res = await getFromAcc(accession, jsiFile.split('.jsi')[0])
  207. if (res?.sequence) {
  208. try {
  209. const file = path.join(tmpDir, res?.version || res.accession + '.fa')
  210. if (!createdFiles.includes(file)) {
  211. await writeSequence(res?.version || res.accession, res?.sequence, file)
  212. createdFiles.push(file)
  213. counter++
  214. if (counter%100 === 0) console.log('Already ' + counter + ' sequence parsed')
  215. }
  216. } catch (error) {
  217. console.log(error)
  218. }
  219. }
  220. }
  221. if (limit) if (counter === limit) break
  222. }
  223. if (limit) if (counter === limit) break
  224. }
  225. console.log(createdFiles.length + ' sequences')
  226. if (fs.existsSync(distFile)) await fs.promises.rm(distFile)
  227. for (const createdFile of createdFiles) {
  228. const tmp = await fs.promises.readFile(createdFile)
  229. await fs.promises.appendFile(distFile, tmp.toString() + '\n')
  230. }
  231. await fs.promises.rm(tmpDir, {recursive: true})
  232. await async_exec('bwa', ['index', distFile], () => console.log)
  233. }
  234. export { getFromAcc, getSymbol, makeRefSeqFromReg }