index.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. "use strict";
  2. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  3. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
  4. // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
  5. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  6. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
  7. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  8. // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  9. var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
  10. function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
  11. return new (P || (P = Promise))(function (resolve, reject) {
  12. function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
  13. function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
  14. function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
  15. step((generator = generator.apply(thisArg, _arguments || [])).next());
  16. });
  17. };
  18. var __asyncValues = (this && this.__asyncValues) || function (o) {
  19. if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
  20. var m = o[Symbol.asyncIterator], i;
  21. return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
  22. function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
  23. function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
  24. };
  25. var __importDefault = (this && this.__importDefault) || function (mod) {
  26. return (mod && mod.__esModule) ? mod : { "default": mod };
  27. };
  28. Object.defineProperty(exports, "__esModule", { value: true });
  29. exports.makeRefSeqFromReg = exports.getSymbol = exports.getFromAcc = void 0;
  30. const fs_1 = __importDefault(require("fs"));
  31. const os_1 = __importDefault(require("os"));
  32. const path_1 = __importDefault(require("path"));
  33. const child_process_1 = require("child_process");
  34. const readline_1 = __importDefault(require("readline"));
  35. const buffer_1 = require("buffer");
  36. const genbank_parser_1 = __importDefault(require("genbank-parser"));
  37. const aligner_1 = require("aligner");
  38. const async_exec = (prog, args, onData) => {
  39. return new Promise((resolve, reject) => {
  40. const child = (0, child_process_1.spawn)(prog, args, { shell: true });
  41. child.stdout.on('data', data => onData(data.toString().trim()));
  42. child.stderr.on('data', data => onData(data.toString().trim()));
  43. child.on('error', err => reject(err));
  44. child.on('exit', code => resolve(code));
  45. });
  46. };
  47. const line$ = (path) => readline_1.default.createInterface({
  48. input: fs_1.default.createReadStream(path),
  49. crlfDelay: Infinity
  50. });
  51. const readOffset = (path, from, to) => {
  52. return new Promise((resolve, reject) => __awaiter(void 0, void 0, void 0, function* () {
  53. const size = to - from;
  54. const buffer = buffer_1.Buffer.alloc(size);
  55. let filehandle = null;
  56. try {
  57. filehandle = yield fs_1.default.promises.open(path, 'r+');
  58. yield filehandle.read(buffer, 0, buffer.length, from);
  59. }
  60. finally {
  61. if (filehandle) {
  62. yield filehandle.close();
  63. resolve(buffer.toString());
  64. }
  65. }
  66. }));
  67. };
  68. /*
  69. * strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
  70. *
  71. */
  72. const makeGbffIndex = (filePath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
  73. var e_1, _a;
  74. indexPath = indexPath || filePath + '.jsi';
  75. let entries = [];
  76. let lineN = 0;
  77. let byteAcc = 0;
  78. try {
  79. for (var _b = __asyncValues(line$(filePath)), _c; _c = yield _b.next(), !_c.done;) {
  80. const line = _c.value;
  81. if (line.match(/^LOCUS/)) {
  82. entries.push({
  83. filePath,
  84. value: line.split(/\s+/)[1],
  85. from: byteAcc
  86. });
  87. if (lineN !== 0) {
  88. entries[entries.length - 2]["to"] = byteAcc;
  89. yield fs_1.default.promises.appendFile(indexPath, [
  90. entries[entries.length - 2]["value"],
  91. entries[entries.length - 2]["from"],
  92. entries[entries.length - 2]["to"]
  93. ].join('\t') + '\n');
  94. entries = entries.splice(1);
  95. }
  96. }
  97. byteAcc += (line.length + 1);
  98. lineN++;
  99. }
  100. }
  101. catch (e_1_1) { e_1 = { error: e_1_1 }; }
  102. finally {
  103. try {
  104. if (_c && !_c.done && (_a = _b.return)) yield _a.call(_b);
  105. }
  106. finally { if (e_1) throw e_1.error; }
  107. }
  108. entries[entries.length - 1]["to"] = byteAcc;
  109. yield fs_1.default.promises.appendFile(indexPath, [
  110. entries[entries.length - 1]["value"],
  111. entries[entries.length - 1]["from"],
  112. entries[entries.length - 1]["to"]
  113. ].join('\t'));
  114. return entries;
  115. });
  116. const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function* () {
  117. var e_2, _d;
  118. let res;
  119. try {
  120. for (var _e = __asyncValues(line$(indexPath)), _f; _f = yield _e.next(), !_f.done;) {
  121. const line = _f.value;
  122. const tmp = line.split('\t');
  123. if (tmp[0] === acc) {
  124. res = [indexPath.split('.jsi')[0], tmp[1], tmp[2]];
  125. break;
  126. }
  127. }
  128. }
  129. catch (e_2_1) { e_2 = { error: e_2_1 }; }
  130. finally {
  131. try {
  132. if (_f && !_f.done && (_d = _e.return)) yield _d.call(_e);
  133. }
  134. finally { if (e_2) throw e_2.error; }
  135. }
  136. return res;
  137. });
  138. const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
  139. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  140. if (!indexPath) {
  141. indexPath = yield getJSI(dbPath);
  142. }
  143. else {
  144. indexPath = Array.isArray(indexPath) ? indexPath : [indexPath];
  145. if (indexPath.length !== dbPath.length)
  146. throw 'Error';
  147. }
  148. for (const iP of indexPath) {
  149. const [filePath, from, to] = (yield getOffset(iP, accession)) || [undefined, undefined, undefined];
  150. if (filePath) {
  151. const txt = yield readOffset(filePath, Number(from), Number(to));
  152. return (0, genbank_parser_1.default)(txt)[0];
  153. }
  154. }
  155. return undefined;
  156. });
  157. exports.getFromAcc = getFromAcc;
  158. const getPos = (selector, tablePath, headerTablePath) => __awaiter(void 0, void 0, void 0, function* () {
  159. const results = [];
  160. (yield fs_1.default.promises.readFile(tablePath)).toString().split('\n')
  161. .map((line) => {
  162. const lineObj = line.split('\t').reduce((p, c, i) => (Object.assign(Object.assign({}, p), { [headerTablePath[i]]: isFinite(Number(c)) ? Number(c) : c })), {});
  163. if (lineObj[Object.keys(selector)[0]] === selector[Object.keys(selector)[0]])
  164. results.push(lineObj);
  165. });
  166. return results;
  167. });
  168. const getIndex = (symbol, LRGPath) => __awaiter(void 0, void 0, void 0, function* () {
  169. return (yield fs_1.default.promises.readFile(LRGPath)).toString()
  170. .split('\n')
  171. .filter((line) => line.match(new RegExp(symbol, 'g')))
  172. .reduce((p, c) => {
  173. const [TaxID, GeneID, GeneName, GeneAcc, TranscriptsAcc, ProteinAcc, _] = c.split('\t').filter((e) => e !== '');
  174. return { GeneID, GeneAcc, TranscriptsAcc: [...(new Set([...p.TranscriptsAcc, TranscriptsAcc]))], ProteinAcc: [...(new Set([...p.ProteinAcc, ProteinAcc]))] };
  175. }, { GeneID: '', GeneAcc: '', TranscriptsAcc: [], ProteinAcc: [] });
  176. });
  177. const getSymbol = (symbol, LRGPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
  178. tablePath, // wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
  179. // regionDBPath: string | string[], // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
  180. geneDBPath, // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz
  181. rnaDBPath // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
  182. ) => __awaiter(void 0, void 0, void 0, function* () {
  183. const geneIndex = yield getIndex(symbol, LRGPath);
  184. // const regionData = await getFromAcc(geneIndex.GeneAcc.split('.')[0], regionDBPath)
  185. // if (regionData) await fs.promises.writeFile('test-region.json', JSON.stringify(regionData, null, 4))
  186. const headerTablePath = [
  187. 'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
  188. 'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
  189. 'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
  190. 'product_length', 'attributes'
  191. ];
  192. const allFeatures = yield getPos({ symbol }, tablePath, headerTablePath);
  193. for (let index = 0; index < allFeatures.length; index++) {
  194. const { feature, product_accession } = allFeatures[index];
  195. let tmp;
  196. switch (feature) {
  197. case 'gene':
  198. allFeatures[index].product_accession = geneIndex.GeneAcc;
  199. tmp = yield getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath);
  200. // await fs.promises.writeFile('test/test-gene.json', JSON.stringify(tmp, null, 4))
  201. allFeatures[index].data = tmp;
  202. break;
  203. case 'mRNA':
  204. tmp = yield getFromAcc(product_accession.split('.')[0], rnaDBPath);
  205. // await fs.promises.writeFile('test/test-rna-'+index+'.json', JSON.stringify(tmp, null, 4))
  206. allFeatures[index].data = tmp;
  207. break;
  208. default:
  209. break;
  210. }
  211. }
  212. return allFeatures;
  213. });
  214. exports.getSymbol = getSymbol;
  215. const getJSI = (dbPath) => __awaiter(void 0, void 0, void 0, function* () {
  216. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  217. const indexPath = [];
  218. for (const p of dbPath) {
  219. const iP = p + '.jsi';
  220. if (!fs_1.default.existsSync(iP)) {
  221. console.log('Writing index: ' + iP);
  222. yield makeGbffIndex(p);
  223. }
  224. indexPath.push(iP);
  225. }
  226. return indexPath;
  227. });
  228. // Todo: add progress
  229. const makeRefSeqFromReg = (dbPath, reg, distFile, limit) => __awaiter(void 0, void 0, void 0, function* () {
  230. var e_3, _g;
  231. dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
  232. const jsiFiles = yield getJSI(dbPath);
  233. const tmpDir = path_1.default.join(os_1.default.tmpdir(), 'parser-' + Math.random());
  234. yield fs_1.default.promises.mkdir(tmpDir);
  235. const createdFiles = [];
  236. let counter = 0;
  237. for (const jsiFile of jsiFiles) {
  238. console.log('reading ' + jsiFile);
  239. try {
  240. for (var _h = (e_3 = void 0, __asyncValues(line$(jsiFile))), _j; _j = yield _h.next(), !_j.done;) {
  241. const line = _j.value;
  242. if (line.match(reg)) {
  243. const [accession, from, to] = line.split('\t');
  244. const res = yield getFromAcc(accession, jsiFile.split('.jsi')[0]);
  245. if (res === null || res === void 0 ? void 0 : res.sequence) {
  246. try {
  247. const file = path_1.default.join(tmpDir, (res === null || res === void 0 ? void 0 : res.version) || res.accession + '.fa');
  248. if (!createdFiles.includes(file)) {
  249. yield (0, aligner_1.writeSequence)((res === null || res === void 0 ? void 0 : res.version) || res.accession, res === null || res === void 0 ? void 0 : res.sequence, file);
  250. createdFiles.push(file);
  251. counter++;
  252. if (counter % 100 === 0)
  253. console.log('Already ' + counter + ' sequence parsed');
  254. }
  255. }
  256. catch (error) {
  257. console.log(error);
  258. }
  259. }
  260. }
  261. if (limit)
  262. if (counter === limit)
  263. break;
  264. }
  265. }
  266. catch (e_3_1) { e_3 = { error: e_3_1 }; }
  267. finally {
  268. try {
  269. if (_j && !_j.done && (_g = _h.return)) yield _g.call(_h);
  270. }
  271. finally { if (e_3) throw e_3.error; }
  272. }
  273. if (limit)
  274. if (counter === limit)
  275. break;
  276. }
  277. console.log(createdFiles.length + ' sequences');
  278. if (fs_1.default.existsSync(distFile))
  279. yield fs_1.default.promises.rm(distFile);
  280. for (const createdFile of createdFiles) {
  281. const tmp = yield fs_1.default.promises.readFile(createdFile);
  282. yield fs_1.default.promises.appendFile(distFile, tmp.toString() + '\n');
  283. }
  284. yield fs_1.default.promises.rm(tmpDir, { recursive: true });
  285. yield async_exec('bwa', ['index', distFile], () => console.log);
  286. });
  287. exports.makeRefSeqFromReg = makeRefSeqFromReg;