浏览代码

makeRefSeqFromReg

Thomas 3 年之前
父节点
当前提交
038d521613
共有 6 个文件被更改,包括 155 次插入25 次删除
  1. 75 10
      index.js
  2. 67 10
      index.ts
  3. 1 0
      package.json
  4. 3 2
      test.js
  5. 5 3
      test.ts
  6. 4 0
      yarn.lock

+ 75 - 10
index.js

@@ -26,11 +26,24 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.getSymbol = exports.getFromAcc = void 0;
+exports.makeRefSeqFromReg = exports.getSymbol = exports.getFromAcc = void 0;
 const fs_1 = __importDefault(require("fs"));
+const os_1 = __importDefault(require("os"));
+const path_1 = __importDefault(require("path"));
+const child_process_1 = require("child_process");
 const readline_1 = __importDefault(require("readline"));
 const buffer_1 = require("buffer");
 const genbank_parser_1 = __importDefault(require("genbank-parser"));
+const aligner_1 = require("aligner");
+const async_exec = (prog, args, onData) => {
+    return new Promise((resolve, reject) => {
+        const child = (0, child_process_1.spawn)(prog, args, { shell: true });
+        child.stdout.on('data', data => onData(data.toString().trim()));
+        child.stderr.on('data', data => onData(data.toString().trim()));
+        child.on('error', err => reject(err));
+        child.on('exit', code => resolve(code));
+    });
+};
 const line$ = (path) => readline_1.default.createInterface({
     input: fs_1.default.createReadStream(path),
     crlfDelay: Infinity
@@ -125,15 +138,7 @@ const getOffset = (indexPath, acc) => __awaiter(void 0, void 0, void 0, function
 const getFromAcc = (accession, dbPath, indexPath) => __awaiter(void 0, void 0, void 0, function* () {
     dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
     if (!indexPath) {
-        indexPath = [];
-        for (const p of dbPath) {
-            const iP = p + '.jsi';
-            if (!fs_1.default.existsSync(iP)) {
-                console.log('Writing index: ' + iP);
-                yield makeGbffIndex(p);
-            }
-            indexPath.push(iP);
-        }
+        indexPath = yield getJSI(dbPath);
     }
     else {
         indexPath = Array.isArray(indexPath) ? indexPath : [indexPath];
@@ -207,3 +212,63 @@ rnaDBPath // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1
     return allFeatures;
 });
 exports.getSymbol = getSymbol;
+const getJSI = (dbPath) => __awaiter(void 0, void 0, void 0, function* () {
+    dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
+    const indexPath = [];
+    for (const p of dbPath) {
+        const iP = p + '.jsi';
+        if (!fs_1.default.existsSync(iP)) {
+            console.log('Writing index: ' + iP);
+            yield makeGbffIndex(p);
+        }
+        indexPath.push(iP);
+    }
+    return indexPath;
+});
+// Todo: add progress
+const makeRefSeqFromReg = (dbPath, reg, distFile) => __awaiter(void 0, void 0, void 0, function* () {
+    var e_3, _g;
+    dbPath = Array.isArray(dbPath) ? dbPath : [dbPath];
+    const jsiFiles = yield getJSI(dbPath);
+    const tmpDir = path_1.default.join(os_1.default.tmpdir(), 'parser-' + Math.random());
+    yield fs_1.default.promises.mkdir(tmpDir);
+    const createdFiles = [];
+    for (const jsiFile of jsiFiles) {
+        try {
+            for (var _h = (e_3 = void 0, __asyncValues(line$(jsiFile))), _j; _j = yield _h.next(), !_j.done;) {
+                const line = _j.value;
+                if (line.match(reg)) {
+                    const [accession, from, to] = line.split('\t');
+                    const res = yield getFromAcc(accession, jsiFile.split('.jsi')[0]);
+                    if (res === null || res === void 0 ? void 0 : res.sequence) {
+                        try {
+                            const file = path_1.default.join(tmpDir, (res === null || res === void 0 ? void 0 : res.version) || res.accession + '.fa');
+                            yield (0, aligner_1.writeSequence)((res === null || res === void 0 ? void 0 : res.version) || res.accession, res === null || res === void 0 ? void 0 : res.sequence, file);
+                            createdFiles.push(file);
+                        }
+                        catch (error) {
+                            console.log(error);
+                        }
+                    }
+                }
+            }
+        }
+        catch (e_3_1) { e_3 = { error: e_3_1 }; }
+        finally {
+            try {
+                if (_j && !_j.done && (_g = _h.return)) yield _g.call(_h);
+            }
+            finally { if (e_3) throw e_3.error; }
+        }
+    }
+    console.log(createdFiles.length + ' sequences');
+    if (fs_1.default.existsSync(distFile))
+        yield fs_1.default.promises.rm(distFile);
+    for (const createdFile of createdFiles) {
+        const tmp = yield fs_1.default.promises.readFile(createdFile);
+        yield fs_1.default.promises.appendFile(distFile, tmp.toString() + '\n');
+    }
+    yield fs_1.default.promises.rm(tmpDir, { recursive: true });
+    yield async_exec('bwa', ['index', distFile], () => console.log);
+});
+exports.makeRefSeqFromReg = makeRefSeqFromReg;

+ 67 - 10
index.ts

@@ -7,9 +7,25 @@
 // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
 
 import fs from 'fs'
+import os from 'os'
+import path from 'path'
+import { spawn } from 'child_process'
 import readline from 'readline'
 import { Buffer } from 'buffer'
 import genbankParser from 'genbank-parser'
+import { writeSequence } from 'aligner'
+
+/**
+ * Run an external program and forward its output, chunk by chunk, to `onData`.
+ *
+ * @param prog   Executable to launch.
+ * @param args   Arguments, handed to the program verbatim.
+ * @param onData Called with each trimmed stdout/stderr chunk.
+ * @returns Promise resolving with the exit code (null when the child was ended
+ *          by a signal); it rejects when the process cannot be spawned.
+ */
+const async_exec = (prog: string, args: string[], onData: Function) => {
+    return new Promise((resolve, reject) => {
+        // No `shell: true`: through a shell the arguments are joined unquoted, so
+        // a path with spaces or shell metacharacters is split or interpreted.
+        const child = spawn(prog, args)
+
+        child.stdout.on('data', data => onData(data.toString().trim()))
+        child.stderr.on('data', data => onData(data.toString().trim()))
+
+        child.on('error', err => reject(err))
+        // 'close' fires once the stdio streams have ended, so no output is lost.
+        child.on('close', code => resolve(code))
+    })
+}
 
 const line$ = (path: string) => readline.createInterface({
     input: fs.createReadStream(path),
@@ -91,15 +107,7 @@ const getOffset = async (indexPath: string, acc: string) => {
 const getFromAcc = async (accession: string, dbPath: string | string[], indexPath?: string | string[]) => {
     dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
     if (!indexPath) {
-        indexPath = []
-        for (const p of dbPath) {
-            const iP = p + '.jsi'
-            if (!fs.existsSync(iP)) {
-                console.log('Writing index: ' + iP);
-                await makeGbffIndex(p)
-            }
-            indexPath.push(iP)
-        }
+        indexPath = await getJSI(dbPath)
     } else {
         indexPath = Array.isArray(indexPath) ? indexPath : [indexPath]
         if (indexPath.length !== dbPath.length) throw 'Error'
@@ -186,4 +194,53 @@ const getSymbol = async (
     return allFeatures
 }
 
-export { getFromAcc, getSymbol }
+/**
+ * Map each database path to its `.jsi` index path, calling makeGbffIndex for
+ * every database whose index file does not exist yet.
+ *
+ * @param dbPath One database path or a list of them.
+ * @returns Index paths, in the same order as the databases.
+ */
+const getJSI = async (dbPath: string | string[]) => {
+    const databases = Array.isArray(dbPath) ? dbPath : [dbPath]
+    const indexFiles: string[] = databases.map(database => database + '.jsi')
+    for (let i = 0; i < databases.length; i++) {
+        if (fs.existsSync(indexFiles[i])) {
+            continue
+        }
+        console.log('Writing index: ' + indexFiles[i]);
+        await makeGbffIndex(databases[i])
+    }
+    return indexFiles
+}
+
+// Todo: add progress
+/**
+ * Build a FASTA file (`distFile`) from every record whose `.jsi` index line
+ * matches `reg`, then run `bwa index` on that file.
+ *
+ * @param dbPath   Database file(s); indexes come from getJSI.
+ * @param reg      Tested against each raw line of the `.jsi` files.
+ * @param distFile Output path; an existing file is replaced.
+ */
+const makeRefSeqFromReg = async (dbPath: string | string[], reg: RegExp, distFile:string ) => {
+    dbPath = Array.isArray(dbPath) ? dbPath : [dbPath]
+    const jsiFiles = await getJSI(dbPath)
+    // mkdtemp creates a uniquely named directory; a Math.random() suffix could collide.
+    const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'parser-'))
+    const createdFiles: string[] = []
+    try {
+        for (let i = 0; i < jsiFiles.length; i++) {
+            // getJSI returns one index per database, in the same order, so the
+            // database path is taken directly instead of cutting the index name.
+            const dbFile = dbPath[i]
+            for await (const line of line$(jsiFiles[i])) {
+                if (!line.match(reg)) continue
+                const [accession] = line.split('\t')
+                const res = await getFromAcc(accession, dbFile)
+                if (!res?.sequence) continue
+                // Resolve the name first: `a || b + '.fa'` parses as
+                // `a || (b + '.fa')` and dropped the extension for versions.
+                const name = res.version || res.accession
+                try {
+                    const file = path.join(tmpDir, name + '.fa')
+                    await writeSequence(name, res.sequence, file)
+                    createdFiles.push(file)
+                } catch (error) {
+                    // Best effort: a record that cannot be written is skipped.
+                    console.log(error)
+                }
+            }
+        }
+        console.log(createdFiles.length + ' sequences')
+
+        await fs.promises.rm(distFile, {force: true})
+        for (const createdFile of createdFiles) {
+            const tmp = await fs.promises.readFile(createdFile)
+            await fs.promises.appendFile(distFile, tmp.toString() + '\n')
+        }
+    } finally {
+        // Remove the scratch directory even when a step above failed.
+        await fs.promises.rm(tmpDir, {recursive: true, force: true})
+    }
+    // Pass a callback that actually logs; `() => console.log` printed nothing.
+    const code = await async_exec('bwa', ['index', distFile], (data: string) => console.log(data))
+    if (code !== 0) console.log('bwa index exited with code ' + code)
+}
+
+export { getFromAcc, getSymbol, makeRefSeqFromReg }

+ 1 - 0
package.json

@@ -18,6 +18,7 @@
   },
   "dependencies": {
     "@gmod/bgzf-filehandle": "^1.4.2",
+    "aligner": "http://git.t0m4.fr/Thomas/aligner.git",
     "blessed": "^0.1.81",
     "chalk": "^4.1.2",
     "figlet": "^1.5.2",

+ 3 - 2
test.js

@@ -9,7 +9,6 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-const _1 = require(".");
 (() => __awaiter(void 0, void 0, void 0, function* () {
     // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
     const LRGPath = '/home/thomas/NGS/ref/ncbi/LRG_RefSeqGene';
@@ -21,5 +20,7 @@ const _1 = require(".");
     const geneDBPath = [1, 2, 3, 4, 5, 6, 7].map(n => '/home/thomas/NGS/ref/ncbi/GENES/refseqgene.' + n + '.genomic.gbff');
     // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
     const rnaDBPath = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10].map(n => '/home/thomas/NGS/ref/ncbi/RNA/human.' + n + '.rna.gbff');
-    yield (0, _1.getSymbol)('NOTCH1', LRGPath, tablePath, geneDBPath, rnaDBPath);
+    // const res = await getSymbol('NOTCH1', LRGPath, tablePath, geneDBPath, rnaDBPath)
+    // await fs.promises.writeFile('test/test-getSymbol.json', JSON.stringify(res, null, 4))
+    // await makeRefSeqFromReg(rnaDBPath, /NM_/, '/home/thomas/NGS/ref/ncbi/RNA/human_NM.fa')
 }))();

+ 5 - 3
test.ts

@@ -1,5 +1,5 @@
-import { getSymbol } from '.'
-
+import { getSymbol, makeRefSeqFromReg } from '.'
+import fs from 'fs'
 ( async () => {
     // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
     const LRGPath = '/home/thomas/NGS/ref/ncbi/LRG_RefSeqGene'
@@ -14,6 +14,8 @@ import { getSymbol } from '.'
     // wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.[1-10].rna.gbff.gz
     const rnaDBPath  = [1,2,3,4,5,6,7,8,9,10].map(n => '/home/thomas/NGS/ref/ncbi/RNA/human.' + n + '.rna.gbff')
 
-    await getSymbol('NOTCH1', LRGPath, tablePath, geneDBPath, rnaDBPath)
+    // const res = await getSymbol('NOTCH1', LRGPath, tablePath, geneDBPath, rnaDBPath)
+    // await fs.promises.writeFile('test/test-getSymbol.json', JSON.stringify(res, null, 4))
+    // await makeRefSeqFromReg(rnaDBPath, /NM_/, '/home/thomas/NGS/ref/ncbi/RNA/human_NM.fa')
 })()
 

+ 4 - 0
yarn.lock

@@ -34,6 +34,10 @@
   resolved "https://registry.npmjs.org/@types/node/-/node-17.0.19.tgz"
   integrity sha512-PfeQhvcMR4cPFVuYfBN4ifG7p9c+Dlh3yUZR6k+5yQK7wX3gDgVxBly4/WkBRs9x4dmcy1TVl08SY67wwtEvmA==
 
+"aligner@http://git.t0m4.fr/Thomas/aligner.git":
+  version "1.0.0"
+  resolved "http://git.t0m4.fr/Thomas/aligner.git#217ebec880a91085edd9a71e4e198eafe33ac7c3"
+
 ansi-styles@^4.1.0:
   version "4.3.0"
   resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.3.0.tgz#edd803628ae71c04c85ae7a0906edad34b648937"