Thomas пре 3 година
родитељ
комит
b62480f264
4 измењених фајлова са 192 додато и 87 уклоњено
  1. 87 38
      index.js
  2. 99 45
      index.ts
  3. 3 2
      test.js
  4. 3 2
      test.ts

+ 87 - 38
index.js

@@ -24,7 +24,6 @@ exports.getInteractionsFromEntry = exports.getEntryFromGeneName = exports.getEnr
 const fs_1 = __importDefault(require("fs"));
 const readline_1 = __importDefault(require("readline"));
 const fast_xml_parser_1 = require("fast-xml-parser");
-const jsonata_1 = __importDefault(require("jsonata"));
 const line$ = (path) => readline_1.default.createInterface({
     input: fs_1.default.createReadStream(path),
     crlfDelay: Infinity
@@ -100,7 +99,7 @@ const getEntryOffset = (dbPath, accession) => __awaiter(void 0, void 0, void 0,
     }
     return [0, 0];
 });
-const getEnrty = (dbPath, accession) => __awaiter(void 0, void 0, void 0, function* () {
+const getEntry = (dbPath, accession) => __awaiter(void 0, void 0, void 0, function* () {
     const parser = new fast_xml_parser_1.XMLParser({
         ignoreAttributes: false,
         alwaysCreateTextNode: false,
@@ -111,8 +110,13 @@ const getEnrty = (dbPath, accession) => __awaiter(void 0, void 0, void 0, functi
     const offsets = yield getEntryOffset(dbPath, accession);
     return parser.parse(yield readOffset(dbPath, offsets[0], offsets[1]));
 });
-exports.getEnrty = getEnrty;
+exports.getEnrty = getEntry;
 const getEntryFromGeneName = (idmappingPath, dbPath, geneName) => __awaiter(void 0, void 0, void 0, function* () {
+    const accessions = yield getAccessFromGene(idmappingPath, geneName);
+    return yield getEntry(dbPath, accessions[0]); // seems to be always the first with entry
+});
+exports.getEntryFromGeneName = getEntryFromGeneName;
+const getAccessFromGene = (idmappingPath, geneName) => __awaiter(void 0, void 0, void 0, function* () {
     var e_3, _g;
     const sel = new RegExp('Gene_Name\t' + geneName);
     let accessions = [];
@@ -130,43 +134,88 @@ const getEntryFromGeneName = (idmappingPath, dbPath, geneName) => __awaiter(void
         }
         finally { if (e_3) throw e_3.error; }
     }
-    return yield getEnrty(dbPath, accessions[0]);
+    return accessions;
 });
-exports.getEntryFromGeneName = getEntryFromGeneName;
 const getInteractionsFromEntry = (json) => __awaiter(void 0, void 0, void 0, function* () {
+    const blaskList = ['DNA', 'PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS', 'GAMMA-SECRETASE', 'CALCIUM',
+        'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN', 'WORTMANNIN'];
     const uniprotIDs = Array.isArray(json.entry.accession) ? json.entry.accession : [json.entry.accession];
-    // Comment interactant
-    const res_inter = (0, jsonata_1.default)(`entry.comment[type="interaction"].interactant`).evaluate(json);
-    let genes_interactant = [];
-    if (res_inter) {
-        genes_interactant = [...new Set(res_inter
-                .filter((e) => !uniprotIDs.includes(e.label))
-                .map((e) => e.label)
-                .filter((e) => e))];
-    }
-    // Reference scope = INTERACTION WITH
-    const scope_inter = (0, jsonata_1.default)(`entry.reference[scope ~> /INTERACTION WITH/i ]`).evaluate(json);
-    let genes_scope_inter = [];
-    if (scope_inter) {
-        const comment_scope_inters = Array.isArray(scope_inter) ? scope_inter : [scope_inter];
-        const comment_scope_inters_genes = comment_scope_inters.map((e) => (Object.assign({ txt: Array.isArray(e.scope) ? e.scope.filter((ee) => ee.match(/INTERACTION\ WITH/)).join() : e.scope }, e))).map((e) => (Object.assign({ interaction: e.txt.substring(e.txt.indexOf("INTERACTION WITH ") + "INTERACTION WITH ".length, e.txt.length) }, e)));
-        genes_scope_inter = [...new Set(comment_scope_inters_genes.map((e) => e.interaction))].flatMap((e) => e.split(/; | AND /));
-    }
-    // Comment subunit
-    const comment_subunit = (0, jsonata_1.default)(`entry.comment[type="subunit"].text.value`).evaluate(json);
-    let comment_subunits_genes = [];
-    if (comment_subunit) {
-        const comment_subunits = Array.isArray(comment_subunit) ? comment_subunit : [comment_subunit];
-        comment_subunits_genes = comment_subunits
-            .flatMap((e) => e.replace(/ *\([^)]*\) */g, '').split(/\n/))
-            .filter((e) => /Interacts/.test(e))
-            .flatMap((e) => e.match(/ [A-Z][A-Z0-9\-]{2,}/g));
-    }
-    let res = [...new Set([...genes_scope_inter, ...genes_interactant, ...comment_subunits_genes])].sort().filter((e) => typeof e === 'string').filter(_ => _);
-    let filterOut = ['PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS',
-        'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN'];
-    if (res.length > 0)
-        res = res.flatMap((e) => e.match(/[A-Z]{2,}[A-Z0-9\-]{1,}/g)).filter((e) => !(filterOut.includes(e))).filter((e) => !/-$/.test(e));
-    return [...new Set(res.filter(_ => _))];
+    // geneName
+    const gnT = Array.isArray(json.entry.gene.name) ? json.entry.gene.name : [json.entry.gene.name];
+    const geneName = gnT.filter((e) => e.type === 'primary').map((e) => e.value)[0];
+    // Interactants
+    const interactants = json.entry.comment
+        .filter((e) => (e === null || e === void 0 ? void 0 : e.type) === 'interaction')
+        .flatMap((e) => ({
+        type: 'interactant',
+        fromProductId: e.interactant[0].id,
+        toProductId: e.interactant[1].id,
+        to: e.interactant[1].label,
+        nExperiments: Number(e.experiments)
+    }));
+    const regExp = new RegExp('INTERACTION WITH |Interacts with |complex with ', 'i');
+    const geneRegExp = new RegExp(/[A-Z]{1}[A-Z|0-9]{2,}$/);
+    // uniprot_comment_text_value
+    const commentInteractsWith = json.entry.comment
+        .filter((e) => { var _a; return (_a = e === null || e === void 0 ? void 0 : e.text) === null || _a === void 0 ? void 0 : _a.value; })
+        .filter((e) => regExp.test(e.text.value))
+        .map((e) => ({
+        to: e.text.value
+            .split(/\.|;/)
+            .flatMap((ee) => ee.replace(/ *\([^)]*\) */g, ' '))
+            .filter((ee) => regExp.test(ee))
+            .flatMap((ee) => ee.trim().split(regExp))
+            .flatMap((ee) => ee.split(/,| and | /))
+            .filter((_) => _)
+            .filter((ee) => geneRegExp.test(ee))
+            .filter((ee) => !blaskList.includes(ee) && ee !== geneName)
+            .map((ee) => ee.trim()),
+        text: e.text.value,
+        //evidences: e.text.evidence.split(' ')//.map((ee:string)=> json.entry.reference.filter((eee:any)=> eee.key === ee)) // Doesnt work with ref key
+    }))
+        .flatMap((e) => e.to.flatMap((ee) => ({
+        type: 'uniprot_comment_text_value',
+        to: ee,
+        text: e.text
+    })));
+    // uniprot_reference_scope
+    const referenceInteract = json.entry.reference
+        .map((e) => (Object.assign(Object.assign({}, e), { scope: Array.isArray(e.scope) ? e.scope : [e.scope] })))
+        .filter((e) => regExp.test(e.scope.join('')))
+        .map((e) => (Object.assign({ to: e.scope
+            //.split(/\.|;/)
+            .flatMap((ee) => regExp.test(ee) ? [ee] : [])
+            .filter((_) => _)
+            .flatMap((ee) => ee.replace(/ *\([^)]*\) */g, ' '))
+            .filter((ee) => regExp.test(ee))
+            .flatMap((ee) => ee.trim().split(regExp)[1])
+            .flatMap((ee) => ee.split(/,| and | /i))
+            .filter((_) => _)
+            .filter((ee) => geneRegExp.test(ee))
+            .filter((ee) => !blaskList.includes(ee) && ee !== geneName)
+            .map((ee) => ee.trim()) }, e)))
+        .flatMap((e) => e.to.flatMap((ee) => ({
+        type: 'reference_scope',
+        to: ee,
+        scope: e.scope,
+        //citation: e.citation
+    })));
+    // Group
+    const byTo = {};
+    [...interactants, ...referenceInteract, ...commentInteractsWith]
+        //.map((e:any)=> byTo[e.to] = byTo[e.to] ? [e, ...byTo[e.to]] : [e] )
+        .map((e) => byTo[e.to] = byTo[e.to] ? Object.assign(Object.assign({}, e), byTo[e.to]) : Object.assign({}, e));
+    const results = Object.keys(byTo).map((e) => {
+        var _a;
+        (_a = byTo[e]) === null || _a === void 0 ? true : delete _a.to;
+        return {
+            from: geneName,
+            to: e,
+            data: byTo[e]
+        };
+    })
+        .filter((e) => !blaskList.includes(e.to) && e.to !== geneName);
+    yield fs_1.default.promises.writeFile('test/tmp.json', JSON.stringify(results.map((e) => e.to), null, 4));
+    return results;
 });
 exports.getInteractionsFromEntry = getInteractionsFromEntry;

+ 99 - 45
index.ts

@@ -4,6 +4,7 @@ import fs from 'fs'
 import readline from 'readline'
 import { XMLParser } from 'fast-xml-parser'
 import jsonata from 'jsonata'
+import { text } from 'stream/consumers'
 
 const line$ = (path: string) => readline.createInterface({
     input: fs.createReadStream(path),
@@ -59,7 +60,7 @@ const getEntryOffset = async (dbPath:string, accession:string): Promise<number[]
     return [0, 0]
 }
 
-const getEnrty = async (dbPath:string, accession:string) => {
+const getEntry = async (dbPath:string, accession:string) => {
     const parser = new XMLParser({
         ignoreAttributes: false, 
         alwaysCreateTextNode: false, 
@@ -72,63 +73,116 @@ const getEnrty = async (dbPath:string, accession:string) => {
     return parser.parse(await readOffset(dbPath, offsets[0], offsets[1]))
 }
 
+
 const getEntryFromGeneName = async (idmappingPath: string, dbPath:string, geneName:string) => {
+    const accessions = await getAccessFromGene(idmappingPath, geneName)
+    return await getEntry(dbPath, accessions[0]) // seems to be always the first with entry
+}
+
+
+const getAccessFromGene = async (idmappingPath: string, geneName:string) => {
     const sel = new RegExp('Gene_Name\t' + geneName)
     let accessions: any[] = []
     for await (const line of line$(idmappingPath)) {
         if(sel.test(line)) accessions.push(line.split('\t')[0])
     }
-    return await getEnrty(dbPath, accessions[0])   
+    return accessions
 }
 
 const getInteractionsFromEntry = async (json:any) => {
-    const uniprotIDs = Array.isArray(json.entry.accession) ? json.entry.accession : [json.entry.accession]
+    const blaskList = 
+    ['DNA', 'PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS', 'GAMMA-SECRETASE', 'CALCIUM',
+    'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN', 'WORTMANNIN']
     
-    // Comment interactant
-    const res_inter = jsonata(`entry.comment[type="interaction"].interactant`).evaluate(json)
-    let genes_interactant:any[] = []
-    if (res_inter) {
-        genes_interactant = [...new Set(
-            res_inter
-            .filter((e:any) => !uniprotIDs.includes(e.label))
-            .map((e:any) => e.label)
-            .filter((e:any) => e)
-        )]
-    }
+    const uniprotIDs = Array.isArray(json.entry.accession) ? json.entry.accession : [json.entry.accession]
+
+    // geneName
+    const gnT = Array.isArray(json.entry.gene.name) ? json.entry.gene.name :  [json.entry.gene.name]
+    const geneName = gnT.filter((e:any)=> e.type === 'primary').map((e:any)=> e.value)[0]
+
+    // Interactants
+    const interactants = json.entry.comment
+    .filter((e:any)=> e?.type === 'interaction')
+    .flatMap((e:any) => ({
+        type         : 'interactant',
+        fromProductId: e.interactant[0].id,
+        toProductId  : e.interactant[1].id,
+        to           : e.interactant[1].label,
+        nExperiments : Number(e.experiments)
+    }))
+
+    const regExp = new RegExp('INTERACTION WITH |Interacts with |complex with ', 'i')
+    const geneRegExp = new RegExp(/[A-Z]{1}[A-Z|0-9]{2,}$/)
     
-    // Reference scope = INTERACTION WITH
-    const scope_inter = jsonata(`entry.reference[scope ~> /INTERACTION WITH/i ]`).evaluate(json)
-    let genes_scope_inter:any[] = []
-    if (scope_inter) {
-        const comment_scope_inters = Array.isArray(scope_inter) ? scope_inter : [scope_inter]
-        const comment_scope_inters_genes = comment_scope_inters.map((e:any) => ({
-            txt: Array.isArray(e.scope) ? e.scope.filter((ee:any) => ee.match(/INTERACTION\ WITH/)).join() : e.scope,
-            ...e
-        })).map((e:any) => ({
-            interaction: e.txt.substring(e.txt.indexOf("INTERACTION WITH ") + "INTERACTION WITH ".length, e.txt.length),
-            ...e
-        }))
-        genes_scope_inter = [...new Set(comment_scope_inters_genes.map((e:any) => e.interaction))].flatMap((e:any) => e.split(/; | AND /))
-    }
+    // uniprot_comment_text_value
+    const commentInteractsWith = json.entry.comment
+    .filter((e:any) => e?.text?.value)
+    .filter((e:any) => regExp.test(e.text.value))
+    .map((e:any) => ({
+        to  : e.text.value
+            .split(/\.|;/)
+            .flatMap((ee:any) => ee.replace(/ *\([^)]*\) */g, ' '))
+            .filter((ee:any) => regExp.test(ee))
+            .flatMap((ee:any) => ee.trim().split(regExp))
+            .flatMap((ee:any) => ee.split(/,| and | /))
+            .filter((_:any) => _)
+            .filter((ee:any) => geneRegExp.test(ee))
+            .filter((ee:any) => !blaskList.includes(ee) && ee !== geneName)
+            .map((ee:any) => ee.trim()),
+        text: e.text.value,
+        //evidences: e.text.evidence.split(' ')//.map((ee:string)=> json.entry.reference.filter((eee:any)=> eee.key === ee)) // Doesnt work with ref key
+    }))
+    .flatMap((e:any)=> e.to.flatMap((ee:any) => ({
+        type    : 'uniprot_comment_text_value',
+        to      : ee,
+        text   : e.text
+    })))
+
+    // uniprot_reference_scope
+    const referenceInteract = json.entry.reference
+    .map((e:any)=> ({...e, scope : Array.isArray(e.scope) ? e.scope : [e.scope]}))
+    .filter((e:any) => regExp.test(e.scope.join('')))
+    .map((e:any)=> ({
+        to: e.scope
+            //.split(/\.|;/)
+            .flatMap((ee:any) => regExp.test(ee) ? [ee] : [])
+            .filter((_:any) => _)
+            .flatMap((ee:any) => ee.replace(/ *\([^)]*\) */g, ' '))
+            .filter((ee:any) => regExp.test(ee))
+            .flatMap((ee:any) => ee.trim().split(regExp)[1])
+            .flatMap((ee:any) => ee.split(/,| and | /i))
+            .filter((_:any) => _)
+            .filter((ee:any) => geneRegExp.test(ee))
+            .filter((ee:any) => !blaskList.includes(ee) && ee !== geneName)
+            .map((ee:any) => ee.trim()),
+        ...e,
+    }))
+    .flatMap((e:any)=> e.to.flatMap((ee:any) => ({
+        type    : 'reference_scope',
+        to      : ee,
+        scope   : e.scope, 
+        //citation: e.citation
+    })))
+
+    // Group
+    const byTo = {} as {[key:string]: any}
+    [...interactants, ...referenceInteract, ...commentInteractsWith]
+    //.map((e:any)=> byTo[e.to] = byTo[e.to] ? [e, ...byTo[e.to]] : [e] )
+    .map((e:any)=> byTo[e.to] = byTo[e.to] ? {...e, ...byTo[e.to]} : {...e} )
     
-    // Comment subunit
-    const comment_subunit = jsonata(`entry.comment[type="subunit"].text.value`).evaluate(json)
-    let comment_subunits_genes:any[] = []
-    if (comment_subunit) {
-        const comment_subunits = Array.isArray(comment_subunit) ? comment_subunit : [comment_subunit]
-        comment_subunits_genes = comment_subunits
-        .flatMap((e:any) => e.replace(/ *\([^)]*\) */g, '').split(/\n/))
-        .filter((e:any) => /Interacts/.test(e))
-        .flatMap((e:any) => e.match(/ [A-Z][A-Z0-9\-]{2,}/g))
-    }
+    const results = Object.keys(byTo).map((e:any) => {
+        delete byTo[e]?.to
+        return {
+            from: geneName,
+            to: e,
+            data: byTo[e]
+        }
+    })
+    .filter((e:any) => !blaskList.includes(e.to) && e.to !== geneName)
 
-    let res = [...new Set([...genes_scope_inter, ...genes_interactant, ...comment_subunits_genes])].sort().filter((e:any) => typeof e === 'string').filter(_=>_)
-    
-    let filterOut = ['PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS',
-    'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN']
-    if(res.length > 0) res = res.flatMap((e:any) => e.match(/[A-Z]{2,}[A-Z0-9\-]{1,}/g)).filter((e:any) => !(filterOut.includes(e))).filter((e:any)=> !/-$/.test(e))
+    await fs.promises.writeFile('test/tmp.json', JSON.stringify(results.map((e:any)=>e.to), null,4))
     
-    return [...new Set(res.filter(_=>_))]
+    return results
 }
 
 // const findDistance = async (idmappingPath: string, dbPath:string, geneNameA:string, geneNameB:string, maxDistance = 6) => {
@@ -158,4 +212,4 @@ const getInteractionsFromEntry = async (json:any) => {
 //     return nIter
 // }
 
-export { makeIndex, readOffset, getEnrty, getEntryFromGeneName, getInteractionsFromEntry }
+export { makeIndex, readOffset, getEntry as getEnrty, getEntryFromGeneName, getInteractionsFromEntry }

+ 3 - 2
test.js

@@ -22,8 +22,9 @@ const fs_1 = __importDefault(require("fs"));
     // const n = await getEntryFromGeneName(idmappingPath, uniprotDB, 'TTC23L')
     // await fs.promises.writeFile('test/test-CITED2.json', JSON.stringify(n, null, 4))
     // console.log(await getInteractionsFromEntry(n))
-    const n = yield (0, _1.getEntryFromGeneName)(idmappingPath, uniprotDB, 'ZFP36L2');
-    yield fs_1.default.promises.writeFile('test/test-ZFP36L2.json', JSON.stringify(n, null, 4));
+    const geneName = 'HTT';
+    const n = yield (0, _1.getEntryFromGeneName)(idmappingPath, uniprotDB, geneName);
+    yield fs_1.default.promises.writeFile('test/test-' + geneName + '.json', JSON.stringify(n, null, 4));
     const tmp = yield (0, _1.getInteractionsFromEntry)(n);
     console.log(tmp);
 }))();

+ 3 - 2
test.ts

@@ -11,8 +11,9 @@ import jsonata from 'jsonata'
     // await fs.promises.writeFile('test/test-CITED2.json', JSON.stringify(n, null, 4))
     // console.log(await getInteractionsFromEntry(n))
 
-    const n = await getEntryFromGeneName(idmappingPath, uniprotDB, 'ZFP36L2')
-    await fs.promises.writeFile('test/test-ZFP36L2.json', JSON.stringify(n, null, 4))
+    const geneName = 'HTT'
+    const n = await getEntryFromGeneName(idmappingPath, uniprotDB, geneName)
+    await fs.promises.writeFile('test/test-' + geneName + '.json', JSON.stringify(n, null, 4))
     const tmp = await getInteractionsFromEntry(n)
     console.log(tmp);    
 })()