index.ts 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  4. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  8. import fs from 'fs'
  9. import readline from 'readline'
  10. import { Buffer } from 'buffer'
  11. import genbankParser from 'genbank-parser'
  12. const line$ = (path: string) => readline.createInterface({
  13. input: fs.createReadStream(path),
  14. crlfDelay: Infinity
  15. });
  16. const readOffset = (path: string, from:number, to:number) => {
  17. return new Promise<string>(async (resolve, reject) => {
  18. const size = to - from
  19. const buffer = Buffer.alloc(size);
  20. let filehandle = null;
  21. try {
  22. filehandle = await fs.promises.open(path, 'r+');
  23. await filehandle.read(buffer, 0, buffer.length, from);
  24. } finally {
  25. if (filehandle) {
  26. await filehandle.close()
  27. resolve(buffer.toString())
  28. }
  29. }
  30. })
  31. }
  32. /*
  33. * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
  34. *
  35. */
  36. const makeGbffIndex = async (filePath: string, indexPath?: string) => {
  37. interface entry {
  38. filePath: string;
  39. value : string;
  40. from : number;
  41. to ?: number;
  42. }
  43. indexPath = indexPath || filePath + '.jsi'
  44. let entries = [] as entry[]
  45. let lineN = 0
  46. let byteAcc = 0
  47. for await (const line of line$(filePath)) {
  48. if(line.match(/^LOCUS/)) {
  49. entries.push({
  50. filePath,
  51. value : line.split(/\s+/)[1],
  52. from : byteAcc
  53. })
  54. if(lineN !== 0) {
  55. entries[entries.length - 2]["to"] = byteAcc
  56. await fs.promises.appendFile(indexPath, [
  57. entries[entries.length - 2]["value"],
  58. entries[entries.length - 2]["from"],
  59. entries[entries.length - 2]["to"]].join('\t') + '\n')
  60. entries = entries.splice(1)
  61. }
  62. }
  63. byteAcc += (line.length + 1)
  64. lineN++
  65. }
  66. entries[entries.length - 1]["to"] = byteAcc
  67. await fs.promises.appendFile(indexPath, [
  68. entries[entries.length - 1]["value"],
  69. entries[entries.length - 1]["from"],
  70. entries[entries.length - 1]["to"]].join('\t'))
  71. return entries
  72. }
  73. const getOffset = async (indexPath: string, acc: string) => {
  74. let res
  75. for await (const line of line$(indexPath)) {
  76. const tmp = line.split('\t')
  77. if (tmp[0] === acc) {
  78. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
  79. break
  80. }
  81. }
  82. return res
  83. }
  84. const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
  85. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  86. if (!indexPath) {
  87. indexPath = []
  88. for (const p of dbPath) {
  89. const iP = p + '.jsi'
  90. if (!fs.existsSync(iP)) {
  91. console.log('Writing index: ' + iP);
  92. await makeGbffIndex(p)
  93. }
  94. indexPath.push(iP)
  95. }
  96. } else {
  97. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
  98. if (indexPath.length !== dbPath.length) throw 'Error'
  99. }
  100. for (const iP of indexPath) {
  101. const [filePath, from, to] = await getOffset(iP, accession) || [undefined,undefined,undefined]
  102. if (filePath) {
  103. const txt = await readOffset(filePath, Number(from), Number(to))
  104. return genbankParser(txt)[0]
  105. }
  106. }
  107. return undefined
  108. }
  109. interface selector {[key: string]: string}
  110. const getPos = async (selector: selector, tablePath: string, headerTablePath:string[]) => {
  111. const results:any = [];
  112. (await fs.promises.readFile(tablePath)).toString().split('\n')
  113. .map((line:string) => {
  114. const lineObj = line.split('\t').reduce((p,c,i) => ({...p, [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c}), {} as any)
  115. if(lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]]) results.push(lineObj)
  116. })
  117. return results
  118. }
  119. const getIndex = async (symbol:string, LRGPath:string) => {
  120. interface geneIndex {
  121. GeneID : string;
  122. GeneAcc : string;
  123. TranscriptsAcc: string[];
  124. ProteinAcc : string[]
  125. }
  126. return (await fs.promises.readFile(LRGPath)).toString()
  127. .split('\n')
  128. .filter((line:string) => line.match(new RegExp(symbol, 'g')))
  129. .reduce((p:any,c:any) => {
  130. const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e:any)=>e!=='')
  131. return {GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))]}
  132. }, {GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: []} as geneIndex)
  133. }
  134. const getSymbol = async (
  135. symbol:string,
  136. LRGPath:string, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  137. tablePath:string, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  138. // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  139. geneDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  140. rnaDBPath: string | string[] // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  141. ) => {
  142. const geneIndex = await getIndex(symbol, LRGPath)
  143. // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  144. // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  145. const headerTablePath = [
  146. 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
  147. 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
  148. 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
  149. 'product_length','attributes'
  150. ]
  151. const allFeatures = await getPos({symbol}, tablePath, headerTablePath)
  152. for (let index = 0; index < allFeatures.length; index++) {
  153. const {feature, product_accession} = allFeatures[index]
  154. let tmp
  155. switch (feature) {
  156. case 'gene':
  157. allFeatures[index].product_accession = geneIndex.GeneAcc
  158. tmp = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath)
  159. await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
  160. allFeatures[index].data = tmp
  161. break;
  162. case 'mRNA':
  163. tmp = await getFromAcc(product_accession.split('.')[0], rnaDBPath)
  164. await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
  165. allFeatures[index].data = tmp
  166. break;
  167. default:
  168. break;
  169. }
  170. }
  171. return allFeatures
  172. }
  173. export { getFromAcc, getSymbol }