index.ts 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.6.rna.gbff.gz
  2. import fs from 'fs'
  3. import readline from 'readline'
  4. import { Buffer } from 'buffer'
  5. import genbankParser from 'genbank-parser'
  6. const line$ = (path: string) => readline.createInterface({
  7. input: fs.createReadStream(path),
  8. crlfDelay: Infinity
  9. });
  10. const readOffset = (path: string, from:number, to:number) => {
  11. return new Promise<string>(async (resolve, reject) => {
  12. const size = to - from
  13. const buffer = Buffer.alloc(size);
  14. let filehandle = null;
  15. try {
  16. filehandle = await fs.promises.open(path, 'r+');
  17. await filehandle.read(buffer, 0, buffer.length, from);
  18. } finally {
  19. if (filehandle) {
  20. await filehandle.close()
  21. resolve(buffer.toString())
  22. }
  23. }
  24. })
  25. }
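/*
 * Rough shell equivalent for locating every record: prints the decimal byte
 * offset and accession of each LOCUS line as "<offset>\t<accession>". The
 * .jsi index written by makeGbffIndex uses "<accession>\t<from>\t<to>" instead.
 *
 *   strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
 */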
  30. const makeGbffIndex = async (filePath: string, lineSize = 80, indexPath?: string) => {
  31. interface entry {
  32. filePath: string;
  33. value : string;
  34. from : number;
  35. to ?: number;
  36. }
  37. indexPath = indexPath || filePath + '.jsi'
  38. let entries = [] as entry[]
  39. let lineN = 0
  40. let byteAcc = 0
  41. for await (const line of line$(filePath)) {
  42. if(line.match(/^LOCUS/)) {
  43. entries.push({
  44. filePath,
  45. value : line.split(/\s+/)[1],
  46. from : byteAcc
  47. })
  48. if(lineN !== 0) {
  49. entries[entries.length - 2]["to"] = byteAcc
  50. await fs.promises.appendFile(indexPath, [
  51. entries[entries.length - 2]["value"],
  52. entries[entries.length - 2]["from"],
  53. entries[entries.length - 2]["to"]].join('\t') + '\n')
  54. entries = entries.splice(1)
  55. }
  56. }
  57. byteAcc += (line.length + 1)
  58. lineN++
  59. }
  60. entries[entries.length - 1]["to"] = byteAcc
  61. await fs.promises.appendFile(indexPath, [
  62. entries[entries.length - 1]["value"],
  63. entries[entries.length - 1]["from"],
  64. entries[entries.length - 1]["to"]].join('\t'))
  65. return entries
  66. }
  67. const getOffset = async (indexPath: string, acc: string) => {
  68. let res
  69. for await (const line of line$(indexPath)) {
  70. const tmp = line.split('\t')
  71. if (tmp[0] === acc) {
  72. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]]
  73. break
  74. }
  75. }
  76. return res
  77. }
  78. const getFromAcc = async (acc: string, dbPath: string | string[], indexPath?: string | string[]) => {
  79. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
  80. if (!indexPath) {
  81. indexPath = []
  82. for (const p of dbPath) {
  83. const iP = p + '.jsi'
  84. if (!fs.existsSync(iP)) {
  85. console.log('Writing index: ' + iP);
  86. await makeGbffIndex(p)
  87. }
  88. indexPath.push(iP)
  89. }
  90. } else {
  91. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
  92. if (indexPath.length !== dbPath.length) throw 'Error'
  93. }
  94. let i = 0
  95. let res
  96. for (const p of dbPath) {
  97. res = await getOffset(indexPath[i], acc)
  98. if (res) break
  99. i++
  100. }
  101. if (res) {
  102. const rr = await readOffset(res[0], Number(res[1]), Number(res[2]))
  103. res = genbankParser(rr)[0]
  104. }
  105. return res
  106. }
  107. export { getFromAcc }