// index.ts

// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
import fs from 'fs'
import os from 'os'
import path from 'path'
import { spawn } from 'child_process'
import readline from 'readline'
import { Buffer } from 'buffer'
import genbankParser from 'genbank-parser'
import { writeSequence } from 'aligner'
import jsonata from 'jsonata'
// Run an external program, streaming trimmed stdout/stderr chunks to onData; resolves with the exit code.
const async_exec = (prog: string, args: string[], onData: (msg: string) => void) => {
  return new Promise((resolve, reject) => {
    const child = spawn(prog, args, { shell: true })
    child.stdout.on('data', data => onData(data.toString().trim()))
    child.stderr.on('data', data => onData(data.toString().trim()))
    child.on('error', err => reject(err))
    child.on('exit', code => resolve(code))
  })
}
const line$ = (path: string) => readline.createInterface({
  input: fs.createReadStream(path),
  crlfDelay: Infinity
})
// Read the byte range [from, to) of a file and return it as a string.
const readOffset = async (path: string, from: number, to: number): Promise<string> => {
  const buffer = Buffer.alloc(to - from)
  const filehandle = await fs.promises.open(path, 'r')
  try {
    await filehandle.read(buffer, 0, buffer.length, from)
  } finally {
    await filehandle.close()
  }
  return buffer.toString()
}
/*
 * Shell equivalent for locating LOCUS byte offsets (makeGbffIndex below builds a comparable index):
 * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
 */
// Build a .jsi index for a GBFF file: one tab-separated line (accession, byte-from, byte-to) per LOCUS record.
const makeGbffIndex = async (filePath: string, indexPath?: string) => {
  interface entry {
    filePath: string;
    value   : string;
    from    : number;
    to     ?: number;
  }
  indexPath = indexPath || filePath + '.jsi'
  const entries = [] as entry[]
  let byteAcc = 0
  for await (const line of line$(filePath)) {
    if (line.match(/^LOCUS/)) {
      entries.push({
        filePath,
        value: line.split(/\s+/)[1],
        from : byteAcc
      })
      if (entries.length > 1) {
        // the current LOCUS starts where the previous record ends
        const prev = entries[entries.length - 2]
        prev.to = byteAcc
        await fs.promises.appendFile(indexPath, [prev.value, prev.from, prev.to].join('\t') + '\n')
        entries.shift()
      }
    }
    byteAcc += line.length + 1 // +1 for the newline; assumes LF line endings
  }
  if (entries.length > 0) {
    const last = entries[entries.length - 1]
    last.to = byteAcc
    await fs.promises.appendFile(indexPath, [last.value, last.from, last.to].join('\t'))
  }
  return entries
}
const getOffset = async (indexPath: string, acc: string) => {
  let res
  for await (const line of line$(indexPath)) {
    const tmp = line.split('\t')
    if (tmp[0] === acc) {
      res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
      break
    }
  }
  return res
}
const getOffsets = async (indexPath: string | string[], accessions: RegExp) => {
  const res: string[][] = []
  const indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath]
  for (const iP of indexPaths) {
    for await (const line of line$(iP)) {
      const tmp = line.split('\t')
      if (accessions.test(tmp[0])) {
        res.push([iP.split('.jsi')[0], tmp[1], tmp[2], tmp[0]])
      }
    }
  }
  return res
}
// Fetch every record whose accession matches accessionRegex; the .jsi indices must already exist.
// An optional jsonata query projects each parsed record.
const getData = async (dbPath: string | string[], accessionRegex: RegExp, query?: string) => {
  dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  const results = []
  const allOffsets = await getOffsets(dbPath.map(e => e + '.jsi'), accessionRegex)
  for (const offset of allOffsets) {
    const txt = await readOffset(offset[0], Number(offset[1]), Number(offset[2]))
    const json = genbankParser(txt)[0]
    // evaluate() may return a promise in recent jsonata releases, so await the projection
    results.push(query ? await jsonata(query).evaluate(json) : json)
  }
  return results
}
// Resolve one accession to its parsed GenBank record, building missing .jsi indices on demand.
const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
  dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  if (!indexPath) {
    indexPath = await getJSI(dbPath)
  } else {
    indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
    if (indexPath.length !== dbPath.length) throw new Error('indexPath and dbPath must have the same length')
  }
  for (const iP of indexPath) {
    const [filePath, from, to] = await getOffset(iP, accession) || [undefined, undefined, undefined]
    if (filePath) {
      const txt = await readOffset(filePath, Number(from), Number(to))
      return genbankParser(txt)[0]
    }
  }
  return undefined
}
interface selector { [key: string]: string }
// Return every feature-table row whose columns match all key/value pairs of the selector.
const getPos = async (selector: selector, tablePath: string, headerTablePath: string[]) => {
  const results: any = []
  for (const line of (await fs.promises.readFile(tablePath)).toString().split('\n')) {
    const lineObj = line.split('\t').reduce(
      (p, c, i) => ({ ...p, [headerTablePath[i]]: c !== '' && isFinite(Number(c)) ? Number(c) : c }),
      {} as any
    )
    if (Object.keys(selector).every(key => lineObj[key] === selector[key])) results.push(lineObj)
  }
  return results
}
// Aggregate the LRG_RefSeqGene rows for a gene symbol into gene/transcript/protein accession lists.
const getIndex = async (symbol: string, LRGPath: string) => {
  interface geneIndex {
    GeneID        : string;
    GeneAcc       : string;
    TranscriptsAcc: string[];
    ProteinAcc    : string[]
  }
  return (await fs.promises.readFile(LRGPath)).toString()
    .split('\n')
    .filter((line: string) => line.match(new RegExp(symbol, 'g'))) // substring match on the whole line
    .reduce((p: any, c: any) => {
      const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e: any) => e !== '')
      return {
        GeneID,
        GeneAcc,
        TranscriptsAcc: [...new Set([...p.TranscriptsAcc, TranscriptsAcc])],
        ProteinAcc    : [...new Set([...p.ProteinAcc, ProteinAcc])]
      }
    }, { GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: [] } as geneIndex)
}
// For a gene symbol, gather its feature-table rows and attach parsed gene/mRNA GenBank records.
const getSymbol = async (
  symbol: string,
  LRGPath: string,   // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  tablePath: string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  geneDBPath: string | string[],      // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  rnaDBPath: string | string[]        // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
) => {
  const geneIndex = await getIndex(symbol, LRGPath)
  // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  const headerTablePath = [
    'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
    'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
    'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
    'product_length', 'attributes'
  ]
  const allFeatures = await getPos({ symbol }, tablePath, headerTablePath)
  for (let index = 0; index < allFeatures.length; index++) {
    const { feature, product_accession } = allFeatures[index]
    let tmp
    switch (feature) {
      case 'gene':
        allFeatures[index].product_accession = geneIndex.GeneAcc
        tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
        // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
        allFeatures[index].data = tmp
        break
      case 'mRNA':
        tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
        // await fs.promises.writeFile('test/test-rna-' + index + '.json', JSON.stringify(tmp, null, 4))
        allFeatures[index].data = tmp
        break
      default:
        break
    }
  }
  return allFeatures
}
const getJSI = async (dbPath: string | string[]) => {
  dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  const indexPath: string[] = []
  for (const p of dbPath) {
    const iP = p + '.jsi'
    if (!fs.existsSync(iP)) {
      console.log('Writing index: ' + iP)
      await makeGbffIndex(p)
    }
    indexPath.push(iP)
  }
  return indexPath
}
// TODO: add progress
// Extract every sequence whose accession matches reg into one multi-FASTA file, then index it with bwa.
const makeRefSeqFromReg = async (
  dbPath: string | string[], reg: RegExp, distFile: string, limit?: number
) => {
  dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  const jsiFiles = await getJSI(dbPath)
  const tmpDir = path.join(os.tmpdir(), 'parser-' + Math.random())
  await fs.promises.mkdir(tmpDir)
  const createdFiles: string[] = []
  let counter = 0
  for (const jsiFile of jsiFiles) {
    console.log('reading ' + jsiFile)
    for await (const line of line$(jsiFile)) {
      if (line.match(reg)) {
        const [accession] = line.split('\t')
        const res = await getFromAcc(accession, jsiFile.split('.jsi')[0])
        if (res?.sequence) {
          try {
            const file = path.join(tmpDir, (res.version || res.accession) + '.fa')
            if (!createdFiles.includes(file)) {
              // start with a clean output file on the first extracted sequence
              if (createdFiles.length === 0 && fs.existsSync(distFile)) await fs.promises.rm(distFile)
              await writeSequence(res.version || res.accession, res.sequence, file)
              createdFiles.push(file)
              const tmp = await fs.promises.readFile(file)
              await fs.promises.appendFile(distFile, tmp.toString() + '\n')
              await fs.promises.rm(file)
              counter++
              if (counter % 100 === 0) console.log(counter + ' sequences parsed so far')
            }
          } catch (error) {
            console.log(error)
          }
        }
      }
      if (limit && counter === limit) break
    }
    if (limit && counter === limit) break
  }
  console.log(createdFiles.length + ' sequences were extracted')
  await fs.promises.rm(tmpDir, { recursive: true })
  await async_exec('bwa', ['index', distFile], console.log)
}
export { getFromAcc, getSymbol, makeRefSeqFromReg, getOffsets, getData }
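
// Example usage sketch (commented out so the module stays side-effect free). The local file
// paths below are assumptions: uncompressed copies of the downloads listed at the top of this
// file. makeRefSeqFromReg additionally expects bwa to be available on the PATH.
//
// const run = async () => {
//   // Gather feature-table rows plus parsed gene/mRNA records for a symbol
//   const features = await getSymbol(
//     'TP53',
//     'data/LRG_RefSeqGene',
//     'data/GCF_000001405.39_GRCh38.p13_feature_table.txt',
//     'data/refseqgene.1.genomic.gbff',
//     ['data/human.1.rna.gbff', 'data/human.2.rna.gbff']
//   )
//   await fs.promises.writeFile('TP53.json', JSON.stringify(features, null, 2))
//
//   // Build a multi-FASTA of all NM_ transcripts and index it with bwa
//   await makeRefSeqFromReg('data/human.1.rna.gbff', /^NM_/, 'NM.fa')
// }
// run().catch(console.error)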