// index.js
  1. "use strict";
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  4. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  8. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  /**
   * Async helper in the shape emitted by the TypeScript compiler: runs a
   * generator function to completion, treating every yielded value as something
   * to wait on, and returns a promise for the generator's return value.
   *
   * @param {*} thisArg - `this` value the generator function is applied with.
   * @param {Array|undefined} _arguments - Arguments for the generator function (defaults to `[]`).
   * @param {Function|undefined} P - Promise constructor to use (defaults to the global `Promise`).
   * @param {Function} generator - Generator function to drive.
   * @returns {Promise<*>} Settles with the generator's return value, or rejects with whatever it throws.
   */
  var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
      var PromiseCtor = P || Promise;

      // Wrap plain values so that `.then` can always be called on the result.
      function toPromise(value) {
          if (value instanceof PromiseCtor) {
              return value;
          }
          return new PromiseCtor(function (resolve) { resolve(value); });
      }

      return new PromiseCtor(function (resolve, reject) {
          var iterator;

          // Either finish with the generator's return value or wait on the yielded one.
          function settle(result) {
              if (result.done) {
                  resolve(result.value);
                  return;
              }
              toPromise(result.value).then(onFulfilled, onRejected);
          }

          // Feed a resolved value back into the generator.
          function onFulfilled(value) {
              try {
                  settle(iterator.next(value));
              }
              catch (err) {
                  reject(err);
              }
          }

          // Raise a rejection inside the generator so its own try/catch can see it.
          function onRejected(reason) {
              try {
                  settle(iterator["throw"](reason));
              }
              catch (err) {
                  reject(err);
              }
          }

          iterator = generator.apply(thisArg, _arguments || []);
          settle(iterator.next());
      });
  };
  /**
   * Async-iteration helper in the shape emitted by the TypeScript compiler.
   * Returns `o`'s own async iterator when it has one; otherwise wraps its
   * synchronous iterator so that `next`/`throw`/`return` each return a promise
   * of `{ value, done }`, with `value` awaited.
   *
   * @param {*} o - An async iterable or a sync iterable.
   * @returns {*} An async iterator over `o`.
   * @throws {TypeError} When `Symbol.asyncIterator` is unavailable.
   */
  var __asyncValues = (this && this.__asyncValues) || function (o) {
      if (!Symbol.asyncIterator) {
          throw new TypeError("Symbol.asyncIterator is not defined.");
      }
      var asyncFactory = o[Symbol.asyncIterator];
      if (asyncFactory) {
          return asyncFactory.call(o);
      }

      // Fall back to the synchronous iterator (via `__values` when that helper exists).
      var source = typeof __values === "function" ? __values(o) : o[Symbol.iterator]();
      var wrapper = {};

      ["next", "throw", "return"].forEach(function (method) {
          // Only expose the methods the underlying iterator actually provides.
          wrapper[method] = source[method] && function (arg) {
              return new Promise(function (resolve, reject) {
                  var step = source[method](arg);
                  var done = step.done;
                  var pending = step.value;
                  Promise.resolve(pending).then(function (value) {
                      resolve({ value: value, done: done });
                  }, reject);
              });
          };
      });

      wrapper[Symbol.asyncIterator] = function () { return this; };
      return wrapper;
  };
  /**
   * Interop helper: returns `mod` untouched when it is flagged `__esModule`,
   * otherwise wraps it so the value is reachable as `.default`.
   *
   * @param {*} mod - The value returned by `require`.
   * @returns {*} `mod` itself, or `{ default: mod }`.
   */
  var __importDefault = (this && this.__importDefault) || function (mod) {
      if (mod && mod.__esModule) {
          return mod;
      }
      return { "default": mod };
  };
  28. Object.defineProperty(exports, "__esModule", { value: true });
  29. exports.getSymbol = exports.getFromAcc = void 0;
  30. const fs_1 = __importDefault(require("fs"));
  31. const readline_1 = __importDefault(require("readline"));
  32. const buffer_1 = require("buffer");
  33. const genbank_parser_1 = __importDefault(require("genbank-parser"));
  /**
   * Open a file as a readline interface so it can be consumed line by line.
   *
   * @param {string} path - File to read.
   * @returns {object} The interface returned by `readline.createInterface`.
   */
  const line$ = (path) => {
      const input = fs_1.default.createReadStream(path);
      // crlfDelay: Infinity makes "\r\n" always count as a single line break.
      return readline_1.default.createInterface({ input, crlfDelay: Infinity });
  };
  /**
   * Read the byte range [from, to) of a file and decode it as UTF-8 text.
   *
   * The file is opened read-only. If the range extends past the end of the
   * file, only the bytes that exist are returned (no NUL padding). Any failure
   * to open or read the file rejects the returned promise.
   *
   * @param {string} path - File to read from.
   * @param {number} from - Byte offset of the first byte to read (inclusive).
   * @param {number} to - Byte offset one past the last byte to read (exclusive).
   * @returns {Promise<string>} The decoded text of the requested range.
   * @throws {RangeError} When `from`/`to` are not integers with `0 <= from <= to`.
   */
  const readOffset = async (path, from, to) => {
      if (!Number.isInteger(from) || !Number.isInteger(to) || from < 0 || to < from) {
          throw new RangeError(`Invalid byte range: from=${from}, to=${to}`);
      }
      const buffer = buffer_1.Buffer.alloc(to - from);
      const filehandle = await fs_1.default.promises.open(path, 'r');
      try {
          let filled = 0;
          // A single read may return fewer bytes than requested, so keep
          // reading until the buffer is full or the end of the file is reached.
          while (filled < buffer.length) {
              const { bytesRead } = await filehandle.read(buffer, filled, buffer.length - filled, from + filled);
              if (bytesRead === 0) {
                  break;
              }
              filled += bytesRead;
          }
          // Decode only what was actually read.
          return buffer.toString('utf8', 0, filled);
      }
      finally {
          await filehandle.close();
      }
  };
  /**
   * Build a byte-offset index of the records in a GenBank flat file.
   *
   * A record starts at every line beginning with `LOCUS` and runs up to the
   * next such line (or the end of the file). The index file holds one line per
   * record, `<locus name>\t<from>\t<to>`, where `from`/`to` are byte offsets
   * into `filePath` (`to` exclusive). Lines are joined with "\n" and there is
   * no trailing newline. The index file is rewritten from scratch on each call.
   *
   * Roughly the same information as:
   *   strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
   *
   * The file is scanned as raw bytes and split on "\n" only, so the offsets
   * stay exact for multi-byte characters, "\r\n" line endings and a missing
   * final newline.
   *
   * @param {string} filePath - GenBank flat file to index.
   * @param {string} [indexPath] - Where to write the index; defaults to `filePath + '.jsi'`.
   * @returns {Promise<Array<{filePath: string, value: string, from: number, to: number}>>}
   *   An array holding only the last record's entry (as before), or `[]` when
   *   the file contains no `LOCUS` line.
   */
  const makeGbffIndex = async (filePath, indexPath) => {
      const target = indexPath || filePath + '.jsi';
      const NEWLINE = 0x0a;
      const MARKER = 'LOCUS';
      const rows = [];
      let current = null; // record whose end offset is not known yet
      let lineStart = 0; // absolute byte offset of the line being examined
      let carry = null; // bytes of an unterminated line left over from the previous chunk

      // The open record ends where the next one starts (or at end of file).
      const closeCurrent = (end) => {
          current.to = end;
          rows.push([current.value, current.from, current.to].join('\t'));
      };

      // Look at the line stored in data[start, end); it begins at byte `lineStart` of the file.
      const inspect = (data, start, end) => {
          // Cheap prefix check first; the full line is decoded only for record headers.
          if (end - start < MARKER.length || data.toString('latin1', start, start + MARKER.length) !== MARKER) {
              return;
          }
          if (current) {
              closeCurrent(lineStart);
          }
          current = {
              filePath,
              value: data.toString('utf8', start, end).split(/\s+/)[1],
              from: lineStart
          };
      };

      for await (const chunk of fs_1.default.createReadStream(filePath)) {
          const data = carry ? buffer_1.Buffer.concat([carry, chunk]) : chunk;
          let pos = 0;
          let nl = data.indexOf(NEWLINE, pos);
          while (nl !== -1) {
              inspect(data, pos, nl);
              lineStart += nl - pos + 1;
              pos = nl + 1;
              nl = data.indexOf(NEWLINE, pos);
          }
          carry = pos < data.length ? data.subarray(pos) : null;
      }
      // Final line without a terminating newline.
      if (carry) {
          inspect(carry, 0, carry.length);
          lineStart += carry.length;
      }
      if (current) {
          closeCurrent(lineStart);
      }

      // Written in one go so an interrupted run cannot leave a partial index behind.
      await fs_1.default.promises.writeFile(target, rows.join('\n'));
      return current ? [current] : [];
  };
  /**
   * Look up an accession in an index file whose lines are
   * `<name>\t<from>\t<to>`.
   *
   * The first line whose name equals `acc` wins. The data-file path in the
   * result is `indexPath` with a trailing `.jsi` removed (or `indexPath`
   * unchanged when it does not end in `.jsi`).
   *
   * @param {string} indexPath - Index file to search.
   * @param {string} acc - Name to look for (exact match on the first column).
   * @returns {Promise<[string, string, string]|undefined>}
   *   `[dataFilePath, from, to]` with the offsets as the strings read from the
   *   index, or `undefined` when there is no matching line.
   */
  const getOffset = async (indexPath, acc) => {
      const SUFFIX = '.jsi';
      // Strip only a trailing suffix; ".jsi" elsewhere in the path is part of the name.
      const dataPath = indexPath.endsWith(SUFFIX)
          ? indexPath.slice(0, -SUFFIX.length)
          : indexPath;
      // Reading the file in one call leaves no open stream behind on an early
      // return and turns a missing index into a rejection.
      const text = await fs_1.default.promises.readFile(indexPath, 'utf8');
      for (const row of text.split(/\r\n|[\r\n]/)) {
          const [name, from, to] = row.split('\t');
          if (name === acc) {
              return [dataPath, from, to];
          }
      }
      return undefined;
  };
  /**
   * Fetch and parse the record stored under `accession` in one or more
   * GenBank flat files.
   *
   * When `indexPath` is omitted, `<db>.jsi` is used for each database file and
   * is built with `makeGbffIndex` if it does not exist yet. When `indexPath`
   * is given, it must contain one index per database file, in the same order;
   * a hit in the i-th index is read from the i-th database file.
   *
   * @param {string} accession - Name to look up in the index files.
   * @param {string|string[]} dbPath - GenBank flat file(s) to search.
   * @param {string|string[]} [indexPath] - Index file(s) matching `dbPath`.
   * @returns {Promise<*>} The first record produced by `genbank-parser` for
   *   the matching byte range, or `undefined` when no index lists the accession.
   * @throws {Error} When the numbers of index and database paths differ.
   */
  const getFromAcc = async (accession, dbPath, indexPath) => {
      const dbPaths = Array.isArray(dbPath) ? dbPath : [dbPath];
      let indexPaths;
      if (!indexPath) {
          indexPaths = [];
          for (const db of dbPaths) {
              const iP = db + '.jsi';
              if (!fs_1.default.existsSync(iP)) {
                  console.log('Writing index: ' + iP);
                  await makeGbffIndex(db);
              }
              indexPaths.push(iP);
          }
      }
      else {
          indexPaths = Array.isArray(indexPath) ? indexPath : [indexPath];
          if (indexPaths.length !== dbPaths.length) {
              throw new Error(`Expected one index path per database path, got ${indexPaths.length} index path(s) for ${dbPaths.length} database path(s)`);
          }
      }
      for (let i = 0; i < indexPaths.length; i++) {
          const hit = await getOffset(indexPaths[i], accession);
          if (hit) {
              const [, from, to] = hit;
              // Read from the database file paired with this index rather than
              // from a path guessed from the index file's name.
              const txt = await readOffset(dbPaths[i], Number(from), Number(to));
              return (0, genbank_parser_1.default)(txt)[0];
          }
      }
      return undefined;
  };
  152. exports.getFromAcc = getFromAcc;
  /**
   * Load a tab-separated table and return the rows matching a selector.
   *
   * Each line becomes an object keyed by `headerTablePath[i]` for column `i`.
   * Cells that hold a finite number are converted to numbers; everything else,
   * including empty cells, stays a string. A row matches when its value under
   * the selector's first key is strictly equal to the selector's value; only
   * that first key is considered.
   *
   * @param {Object} selector - `{ columnName: wantedValue }`.
   * @param {string} tablePath - Tab-separated file to read.
   * @param {string[]} headerTablePath - Column names, in column order.
   * @returns {Promise<Object[]>} Matching rows, in file order.
   */
  const getPos = async (selector, tablePath, headerTablePath) => {
      const key = Object.keys(selector)[0];
      const wanted = selector[key];
      // Number('') and Number('  ') are 0, so blank cells are excluded explicitly.
      const toValue = (cell) => (cell.trim() !== '' && Number.isFinite(Number(cell)) ? Number(cell) : cell);
      const text = (await fs_1.default.promises.readFile(tablePath)).toString();
      const results = [];
      for (const line of text.split(/\r?\n/)) {
          const row = {};
          line.split('\t').forEach((cell, i) => {
              row[headerTablePath[i]] = toValue(cell);
          });
          if (row[key] === wanted) {
              results.push(row);
          }
      }
      return results;
  };
  /**
   * Collect the accessions listed for a gene symbol in a tab-separated
   * gene table (see the LRG_RefSeqGene download noted at the top of the file).
   *
   * Empty columns are dropped from each line; the remaining columns are read
   * positionally as tax id, gene id, gene symbol, gene accession, transcript
   * accession and protein accession. Only lines whose gene-symbol column is
   * exactly `symbol` contribute, so `TP53` does not pick up `TP53BP1`.
   *
   * @param {string} symbol - Gene symbol to look up (exact, case-sensitive).
   * @param {string} LRGPath - Path to the table.
   * @returns {Promise<{GeneID: string, GeneAcc: string, TranscriptsAcc: string[], ProteinAcc: string[]}>}
   *   `GeneID`/`GeneAcc` from the last matching line and the de-duplicated
   *   transcript and protein accessions in file order; empty strings and empty
   *   arrays when nothing matches.
   */
  const getIndex = async (symbol, LRGPath) => {
      const text = (await fs_1.default.promises.readFile(LRGPath)).toString();
      const transcripts = new Set();
      const proteins = new Set();
      let GeneID = '';
      let GeneAcc = '';
      for (const line of text.split(/\r?\n/)) {
          const [, geneId, geneName, geneAcc, transcriptAcc, proteinAcc] = line.split('\t').filter((e) => e !== '');
          if (geneName !== symbol) {
              continue;
          }
          GeneID = geneId;
          GeneAcc = geneAcc;
          transcripts.add(transcriptAcc);
          proteins.add(proteinAcc);
      }
      return { GeneID, GeneAcc, TranscriptsAcc: [...transcripts], ProteinAcc: [...proteins] };
  };
  /**
   * Gather every feature-table row for a gene symbol and attach the parsed
   * GenBank record to its `gene` and `mRNA` rows.
   *
   * - `gene` rows get `product_accession` set to the gene accession found by
   *   `getIndex` and `data` set to the record looked up in `geneDBPath`.
   * - `mRNA` rows get `data` set to the record looked up in `rnaDBPath` under
   *   the row's `product_accession` with its version suffix removed.
   * - Other rows are returned unchanged.
   *
   * Each record that is found is also written as JSON to
   * `test/test-gene.json` or `test/test-rna-<row index>.json`, relative to the
   * current working directory. Nothing is written for a record that was not
   * found (its `data` is `undefined`).
   *
   * @param {string} symbol - Gene symbol to look up.
   * @param {string} LRGPath - wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
   * @param {string} tablePath - wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
   * @param {string|string[]} geneDBPath - wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
   * @param {string|string[]} rnaDBPath - wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
   * @returns {Promise<Object[]>} The matching feature rows, in table order.
   */
  const getSymbol = async (symbol, LRGPath, tablePath, geneDBPath, rnaDBPath) => {
      const geneIndex = await getIndex(symbol, LRGPath);
      const headerTablePath = [
          'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
          'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
          'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
          'product_length', 'attributes'
      ];
      const allFeatures = await getPos({ symbol }, tablePath, headerTablePath);

      // JSON.stringify(undefined) is undefined, which writeFile rejects, so a
      // record that was not found is simply not dumped.
      const dumpRecord = async (file, record) => {
          if (record !== undefined) {
              await fs_1.default.promises.writeFile(file, JSON.stringify(record, null, 4));
          }
      };

      for (let index = 0; index < allFeatures.length; index++) {
          const entry = allFeatures[index];
          switch (entry.feature) {
              case 'gene': {
                  entry.product_accession = geneIndex.GeneAcc;
                  const record = await getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath);
                  await dumpRecord('test/test-gene.json', record);
                  entry.data = record;
                  break;
              }
              case 'mRNA': {
                  // String(): a table cell may have been converted to a number by getPos.
                  const accession = String(entry.product_accession).split('.')[0];
                  const record = await getFromAcc(accession, rnaDBPath);
                  await dumpRecord('test/test-rna-' + index + '.json', record);
                  entry.data = record;
                  break;
              }
              default:
                  break;
          }
      }
      return allFeatures;
  };
  209. exports.getSymbol = getSymbol;