|
|
@@ -1,5 +1,11 @@
|
|
|
"use strict";
|
|
|
-// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.6.rna.gbff.gz
|
|
|
+// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz
|
|
|
+// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
|
|
|
+// wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gbff.gz
|
|
|
+// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
|
|
|
+// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.1.genomic.gbff.gz
|
|
|
+// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/biological_region/human.biological_region.gbff.gz
|
|
|
+// wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.{1..10}.rna.gbff.gz
|
|
|
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
|
return new (P || (P = Promise))(function (resolve, reject) {
|
|
|
@@ -20,7 +26,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
|
};
|
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
-exports.getFromAcc = void 0;
|
|
|
+exports.getSymbol = exports.getFromAcc = void 0;
|
|
|
const fs_1 = __importDefault(require("fs"));
|
|
|
const readline_1 = __importDefault(require("readline"));
|
|
|
const buffer_1 = require("buffer");
|
|
|
@@ -50,7 +56,7 @@ const readOffset = (path, from, to) => {
|
|
|
* strings -a -t d human.1.rna.gbff | grep LOCUS | awk '{print $1"\t"$3}' > human.1.rna.gbff.index
|
|
|
*
|
|
|
*/
|
|
|
-const makeGbffIndex = (filePath, lineSize = 80, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
+const makeGbffIndex = (filePath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
var e_1, _a;
|
|
|
indexPath = indexPath || filePath + '.jsi';
|
|
|
let entries = [];
|
|
|
@@ -116,7 +122,7 @@ const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function
|
|
|
}
|
|
|
return res;
|
|
|
});
|
|
|
-const getFromAcc = (acc, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
+const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
|
|
|
if (!indexPath) {
|
|
|
indexPath = [];
|
|
|
@@ -134,18 +140,70 @@ const getFromAcc = (acc, dbPath, indexPath) => __awaiter(void 0, void 0, void 0,
|
|
|
if (indexPath.length !== dbPath.length)
|
|
|
throw 'Error';
|
|
|
}
|
|
|
- let i = 0;
|
|
|
- let res;
|
|
|
- for (const p of dbPath) {
|
|
|
- res = yield getOffset(indexPath[i], acc);
|
|
|
- if (res)
|
|
|
- break;
|
|
|
- i++;
|
|
|
- }
|
|
|
- if (res) {
|
|
|
- const rr = yield readOffset(res[0], Number(res[1]), Number(res[2]));
|
|
|
- res = (0, genbank_parser_1.default)(rr)[0];
|
|
|
+ for (const iP of indexPath) {
|
|
|
+ const [filePath, from, to] = (yield getOffset(iP, accession)) || [undefined, undefined, undefined];
|
|
|
+ if (filePath) {
|
|
|
+ const txt = yield readOffset(filePath, Number(from), Number(to));
|
|
|
+ return (0, genbank_parser_1.default)(txt)[0];
|
|
|
+ }
|
|
|
}
|
|
|
- return res;
|
|
|
+ return undefined;
|
|
|
});
|
|
|
exports.getFromAcc = getFromAcc;
|
|
|
/**
 * Read a tab-separated table and return every row matching `selector`.
 *
 * Each data line is turned into an object keyed by `headerTablePath`
 * (column i -> headerTablePath[i]). Fields that hold a finite number are
 * converted to numbers; everything else, including empty fields, stays a string.
 *
 * @param {Object} selector - column name -> required value. A row is kept only
 *   when every listed column is strictly equal to its required value
 *   (an empty selector keeps every data row).
 * @param {string} tablePath - path of the tab-separated file; it is read whole.
 * @param {string[]} headerTablePath - column names, in file order.
 * @returns {Promise<Object[]>} matching rows, in file order.
 */
const getPos = (selector, tablePath, headerTablePath) => fs_1.default.promises.readFile(tablePath).then((content) => {
    const selectorKeys = Object.keys(selector);
    // Number('') and Number('   ') are 0, so blank fields must be excluded
    // explicitly or they would silently become the number 0.
    const toValue = (field) => {
        if (field.trim() === '') {
            return field;
        }
        const asNumber = Number(field);
        return Number.isFinite(asNumber) ? asNumber : field;
    };
    const results = [];
    for (const line of content.toString().split(/\r?\n/)) {
        // Skip blank lines and '#'-prefixed header/comment lines: they are not data rows.
        if (line === '' || line.startsWith('#')) {
            continue;
        }
        const lineObj = {};
        line.split('\t').forEach((field, i) => {
            // Ignore columns that have no name in the header instead of
            // storing them under the key "undefined".
            if (i < headerTablePath.length) {
                lineObj[headerTablePath[i]] = toValue(field);
            }
        });
        if (selectorKeys.every((key) => lineObj[key] === selector[key])) {
            results.push(lineObj);
        }
    }
    return results;
});
|
|
|
/**
 * Collect the accessions listed for one gene symbol in an LRG_RefSeqGene file.
 *
 * Rows are matched on the Symbol column by exact string equality, so looking
 * up "TP53" does not pick up rows of "TP53BP1". Accessions are read from fixed
 * column positions, which keeps them correct whether or not the optional
 * LRG / t / p columns of a row are filled in.
 *
 * NOTE(review): assumes the tab-separated layout announced by the file's
 * header line: tax_id, GeneID, Symbol, RSG, LRG, RNA, t, Protein, p, Category.
 * Confirm against the downloaded file.
 *
 * @param {string} symbol - gene symbol to look up (compared verbatim).
 * @param {string} LRGPath - path of the LRG_RefSeqGene file; it is read whole.
 * @returns {Promise<{GeneID: string, GeneAcc: string, TranscriptsAcc: string[], ProteinAcc: string[]}>}
 *   GeneID / GeneAcc of the last matching row and the de-duplicated transcript
 *   and protein accessions in file order; empty strings / arrays when the
 *   symbol is not present.
 */
const getIndex = (symbol, LRGPath) => fs_1.default.promises.readFile(LRGPath).then((content) => {
    const GENE_ID_COLUMN = 1;
    const SYMBOL_COLUMN = 2;
    const GENE_ACC_COLUMN = 3;
    const TRANSCRIPT_COLUMN = 5;
    const PROTEIN_COLUMN = 7;
    let GeneID = '';
    let GeneAcc = '';
    const transcripts = new Set();
    const proteins = new Set();
    for (const line of content.toString().split(/\r?\n/)) {
        // The '#tax_id ...' header and blank lines carry no gene data.
        if (line === '' || line.startsWith('#')) {
            continue;
        }
        const columns = line.split('\t');
        if (columns[SYMBOL_COLUMN] !== symbol) {
            continue;
        }
        GeneID = columns[GENE_ID_COLUMN] || '';
        GeneAcc = columns[GENE_ACC_COLUMN] || '';
        // Rows without a transcript or protein leave the field empty; skip
        // those so the result lists real accessions only.
        if (columns[TRANSCRIPT_COLUMN]) {
            transcripts.add(columns[TRANSCRIPT_COLUMN]);
        }
        if (columns[PROTEIN_COLUMN]) {
            proteins.add(columns[PROTEIN_COLUMN]);
        }
    }
    return { GeneID, GeneAcc, TranscriptsAcc: [...transcripts], ProteinAcc: [...proteins] };
});
|
|
|
/**
 * Gather every feature-table row of a gene symbol and attach the parsed
 * GenBank record to its `gene` and `mRNA` rows.
 *
 * Steps: look the symbol up with getIndex, select its rows with getPos, then
 * for each row
 *   - `gene`: overwrite `product_accession` with the RefSeqGene accession from
 *     the index and load that record (version suffix stripped) from geneDBPath;
 *   - `mRNA`: load the row's own `product_accession` (version suffix stripped)
 *     from rnaDBPath;
 *   - anything else: left untouched.
 * The loaded record is stored on the row as `data` (undefined when no record
 * was found). Each found record is also dumped as JSON to `test/test-gene.json`
 * or `test/test-rna-<row index>.json`, relative to the working directory.
 *
 * A lookup in a biological-region database (regionDBPath,
 * human.biological_region.gbff.gz) is currently disabled.
 *
 * @param {string} symbol - gene symbol.
 * @param {string} LRGPath - LRG_RefSeqGene file
 *   (wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene).
 * @param {string} tablePath - uncompressed feature table
 *   (wget https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_feature_table.txt.gz).
 * @param {string|string[]} geneDBPath - RefSeqGene GenBank file(s)
 *   (wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.[1-7].genomic.gbff.gz).
 * @param {string|string[]} rnaDBPath - mRNA GenBank file(s)
 *   (wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.{1..10}.rna.gbff.gz).
 * @returns {Promise<Object[]>} the rows returned by getPos, with `data` added
 *   on `gene` and `mRNA` rows.
 */
const getSymbol = (symbol, LRGPath, tablePath, geneDBPath, rnaDBPath) => __awaiter(void 0, void 0, void 0, function* () {
    const geneIndex = yield getIndex(symbol, LRGPath);
    const headerTablePath = [
        'feature', 'class', 'assembly', 'assembly_unit', 'seq_type', 'chromosome',
        'genomic_accession', 'start', 'end', 'strand', 'product_accession', 'non_redundant_refseq',
        'related_accession', 'name', 'symbol', 'GeneID', 'locus_tag', 'feature_interval_length',
        'product_length', 'attributes'
    ];
    const allFeatures = yield getPos({ symbol }, tablePath, headerTablePath);
    // Debug dump of a loaded record. The directory is created on demand so a
    // missing `test/` folder no longer rejects the whole lookup, and nothing is
    // written for a missing record: JSON.stringify(undefined) is undefined,
    // which writeFile refuses.
    const dumpRecord = (fileName, record) => {
        if (record === undefined) {
            return Promise.resolve();
        }
        return fs_1.default.promises.mkdir('test', { recursive: true })
            .then(() => fs_1.default.promises.writeFile('test/' + fileName, JSON.stringify(record, null, 4)));
    };
    for (let index = 0; index < allFeatures.length; index++) {
        const entry = allFeatures[index];
        switch (entry.feature) {
            case 'gene': {
                entry.product_accession = geneIndex.GeneAcc;
                const record = yield getFromAcc(geneIndex.GeneAcc.split('.')[0], geneDBPath);
                yield dumpRecord('test-gene.json', record);
                entry.data = record;
                break;
            }
            case 'mRNA': {
                const accession = entry.product_accession;
                let record;
                // The table parser may hand back a number (or an empty string)
                // for a blank accession column; only a real accession string
                // can be split and looked up.
                if (typeof accession === 'string' && accession !== '') {
                    record = yield getFromAcc(accession.split('.')[0], rnaDBPath);
                }
                yield dumpRecord('test-rna-' + index + '.json', record);
                entry.data = record;
                break;
            }
            default:
                break;
        }
    }
    return allFeatures;
});
|
|
|
+exports.getSymbol = getSymbol;
|