index.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  4. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  8. import fs from 'fs'
  9. import os from 'os'
  10. import path from 'path'
  11. import { spawn } from 'child_process'
  12. import readline from 'readline'
  13. import { Buffer } from 'buffer'
  14. import genbankParser from 'genbank-parser'
  15. import { writeSequence } from 'aligner'
  16. import jsonata from 'jsonata'
  17. const async_exec = (prog: string, args: string[], onData: Function) => {
  18. return new Promise((resolve, reject) => {
  19. const child = spawn(prog, args, {shell: true})
  20. child.stdout.on('data', data => onData(data.toString().trim()))
  21. child.stderr.on('data', data => onData(data.toString().trim()))
  22. child.on('error', err => reject(err))
  23. child.on('exit', code => resolve(code))
  24. })
  25. }
  26. const line$ = (path: string) => readline.createInterface({
  27. input: fs.createReadStream(path),
  28. crlfDelay: Infinity
  29. });
  30. const readOffset = (path: string, from:number, to:number) => {
  31. return new Promise<string>(async (resolve, reject) => {
  32. const size = to - from
  33. const buffer = Buffer.alloc(size)
  34. let filehandle = null
  35. try {
  36. filehandle = await fs.promises.open(path, 'r+');
  37. await filehandle.read(buffer, 0, buffer.length, from)
  38. } finally {
  39. if (filehandle) {
  40. await filehandle.close()
  41. resolve(buffer.toString())
  42. }
  43. }
  44. })
  45. }
  46. /*
  47. * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
  48. *
  49. */
  50. const makeGbffIndex = async (filePath: string, indexPath?: string) => {
  51. interface entry {
  52. filePath: string;
  53. value : string;
  54. from : number;
  55. to ?: number;
  56. }
  57. indexPath = indexPath || filePath + '.jsi'
  58. let entries = [] as entry[]
  59. let lineN = 0
  60. let byteAcc = 0
  61. for await (const line of line$(filePath)) {
  62. if(line.match(/^LOCUS/)) {
  63. entries.push({
  64. filePath,
  65. value : line.split(/\s+/)[1],
  66. from : byteAcc
  67. })
  68. if(lineN !== 0) {
  69. entries[entries.length - 2]["to"] = byteAcc
  70. await fs.promises.appendFile(indexPath, [
  71. entries[entries.length - 2]["value"],
  72. entries[entries.length - 2]["from"],
  73. entries[entries.length - 2]["to"]].join('\t') + '\n')
  74. entries = entries.splice(1)
  75. }
  76. }
  77. byteAcc += (line.length + 1)
  78. lineN++
  79. }
  80. entries[entries.length - 1]["to"] = byteAcc
  81. await fs.promises.appendFile(indexPath, [
  82. entries[entries.length - 1]["value"],
  83. entries[entries.length - 1]["from"],
  84. entries[entries.length - 1]["to"]].join('\t'))
  85. return entries
  86. }
  87. const getOffset = async (indexPath: string, acc: string) => {
  88. let res
  89. for await (const line of line$(indexPath)) {
  90. const tmp = line.split('\t')
  91. if (tmp[0] === acc) {
  92. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
  93. break
  94. }
  95. }
  96. return res
  97. }
  98. const getOffsets = async (indexPath: string | string[], accessions: RegExp) => {
  99. let res: string[][] = []
  100. const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath]
  101. for (const iP of indexPaths) {
  102. for await (const line of line$(iP)) {
  103. const tmp = line.split('\t')
  104. if (accessions.test(tmp[0])) {
  105. res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]])
  106. }
  107. }
  108. }
  109. return res
  110. }
  111. const getData = async (dbPath: string | string[], accessionRegex: RegExp, outPath: string, query?: string) => {
  112. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  113. const allOffsets = await getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex)
  114. console.log(allOffsets.length + ' entry to parse.');
  115. fs.promises.appendFile(outPath, '[\n')
  116. for (let index = 0; index < allOffsets.length; index++) {
  117. const offset = allOffsets[index];
  118. const txt = await readOffset(offset[0], Number(offset[1]), Number(offset[2]))
  119. const json = genbankParser(txt)[0]
  120. const tmp = query ? jsonata(query).evaluate(json) : json
  121. const end = index + 1 === allOffsets.length ? '' : ','
  122. fs.promises.appendFile(outPath, JSON.stringify(tmp, null, 4) + end + '\n')
  123. if ((index + 1)%100 === 0) console.log('Already ' + ( index + 1) + ' sequence parsed')
  124. }
  125. fs.promises.appendFile(outPath, ']')
  126. return 0
  127. }
  128. const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
  129. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  130. if (!indexPath) {
  131. indexPath = await getJSI(dbPath)
  132. } else {
  133. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
  134. if (indexPath.length !== dbPath.length) throw 'Error'
  135. }
  136. for (const iP of indexPath) {
  137. const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined]
  138. if (filePath) {
  139. const txt = await readOffset(filePath, Number(from), Number(to))
  140. return genbankParser(txt)[0]
  141. }
  142. }
  143. return undefined
  144. }
  145. interface selector {[key: string]: string}
  146. const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => {
  147. const results:any = [];
  148. (await fs.promises.readFile(tablePath)).toString().split('\n')
  149. .map((line:string) => {
  150. const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any)
  151. if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj)
  152. })
  153. return results
  154. }
  155. const getIndex = async (symbol:string, LRGPath:string) => {
  156. interface geneIndex {
  157. GeneID : string;
  158. GeneAcc : string;
  159. TranscriptsAcc: string[];
  160. ProteinAcc : string[]
  161. }
  162. return (await fs.promises.readFile(LRGPath)).toString()
  163. .split('\n')
  164. .filter((line:string) => line.match(new RegExp(symbol, 'g')))
  165. .reduce((p:any,c:any) => {
  166. const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='')
  167. return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]}
  168. }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex)
  169. }
  170. const getSymbol = async (
  171. symbol:string,
  172. LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  173. tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  174. // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  175. geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  176. rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  177. ) => {
  178. const geneIndex = await getIndex(symbol, LRGPath)
  179. // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  180. // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  181. const headerTablePath = [
  182. 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
  183. 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
  184. 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
  185. 'product_length','attributes'
  186. ]
  187. const allFeatures = await getPos({symbol}, tablePath, headerTablePath)
  188. for (let index = 0; index < allFeatures.length; index++) {
  189. const {feature, product_accession} = allFeatures[index]
  190. let tmp
  191. switch (feature) {
  192. case 'gene':
  193. allFeatures[index].product_accession = geneIndex.GeneAcc
  194. tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
  195. // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
  196. allFeatures[index].data = tmp
  197. break;
  198. case 'mRNA':
  199. tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
  200. // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
  201. allFeatures[index].data = tmp
  202. break;
  203. default:
  204. break;
  205. }
  206. }
  207. return allFeatures
  208. }
  209. const getJSI = async (dbPath: string | string[]) => {
  210. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  211. const indexPath: string[] = []
  212. for (const p of dbPath) {
  213. const iP = p + '.jsi'
  214. if (!fs.existsSync(iP)) {
  215. console.log('Writing index: ' + iP);
  216. await makeGbffIndex(p)
  217. }
  218. indexPath.push(iP)
  219. }
  220. return indexPath
  221. }
  222. // Todo: add progress
  223. const makeRefSeqFromReg = async (
  224. dbPath: string | string[], reg: RegExp, distFile:string, limit?: number
  225. ) => {
  226. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  227. const jsiFiles = await getJSI(dbPath)
  228. const tmpDir = path.join(os.tmpdir(), 'parser-' + Math.random())
  229. await fs.promises.mkdir(tmpDir)
  230. const createdFiles: string[] = []
  231. let counter = 0
  232. for (const jsiFile of jsiFiles) {
  233. console.log('reading ' + jsiFile)
  234. for await (const line of line$(jsiFile)) {
  235. if(line.match(reg)) {
  236. const [accession, from, to] = line.split('\t')
  237. const res = await getFromAcc(accession, jsiFile.split('.jsi')[0])
  238. if (res?.sequence) {
  239. try {
  240. const file = path.join(tmpDir, res?.version || res.accession + '.fa')
  241. if (!createdFiles.includes(file)) {
  242. if (createdFiles.length === 0) if (fs.existsSync(distFile)) await fs.promises.rm(distFile)
  243. await writeSequence(res?.version || res.accession, res?.sequence, file)
  244. createdFiles.push(file)
  245. const tmp = await fs.promises.readFile(file)
  246. await fs.promises.appendFile(distFile, tmp.toString() + '\n')
  247. await fs.promises.rm(file)
  248. counter++
  249. if (counter%100 === 0) console.log('Already ' + counter + ' sequence parsed')
  250. }
  251. } catch (error) {
  252. console.log(error)
  253. }
  254. }
  255. }
  256. if (limit) if (counter === limit) break
  257. }
  258. if (limit) if (counter === limit) break
  259. }
  260. console.log(createdFiles.length + ' sequences were extracted')
  261. await fs.promises.rm(tmpDir, {recursive: true})
  262. await async_exec('bwa', ['index', distFile], () => console.log)
  263. }
  264. export { getFromAcc, getSymbol, makeRefSeqFromReg, getOffsets, getData }