// Source data:
// https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.xml.gz
import fs from 'fs'
import readline from 'readline'
import { XMLParser } from 'fast-xml-parser'

/**
 * Opens `path` as an async-iterable stream of lines.
 * `crlfDelay: Infinity` makes readline treat `\r\n` as a single line break.
 */
const line$ = (path: string) =>
  readline.createInterface({ input: fs.createReadStream(path), crlfDelay: Infinity })

/**
 * Builds a byte-offset index for an entry-per-block XML file.
 *
 * One row is written per entry: `acc1;acc2;…<TAB>from<TAB>to<LF>`, where `from` is the
 * byte offset of the line that opens the entry and `to` is the offset just past the line
 * break that follows the closing line (exclusive end, as consumed by `readOffset`).
 *
 * The index is accumulated in memory and written with a single `writeFile`, so running
 * this again replaces the index instead of appending duplicate rows to it.
 *
 * @param filePath  Path of the uncompressed XML database.
 * @param indexPath Destination of the index; defaults to `filePath + '.jsi'`.
 */
const makeIndex = async (filePath: string, indexPath?: string): Promise<void> => {
  indexPath = indexPath || filePath + '.jsi'

  // NOTE(review): these three patterns are reconstructed. In the reviewed source the markup
  // inside the literals was lost (`"^"`, `''`, `"(.*?)"`) and `toSel` was never declared.
  // They follow the UniProt XML layout (<entry …> / <accession>…</accession> / </entry>);
  // confirm against the data file.
  const fromSel = /^\s*<entry[\s>]/
  const toSel = /^\s*<\/entry>/
  const valSel = /<accession>(.*?)<\/accession>/

  const rows: string[] = []
  let byteAcc = 0
  let from = 0
  let values: string[] = []

  for await (const line of line$(filePath)) {
    if (fromSel.test(line)) from = byteAcc

    // Offsets are later used as byte positions, so count UTF-8 bytes rather than UTF-16
    // code units. The `+ 1` is the line terminator: this assumes LF line endings (a CRLF
    // file would need `+ 2`).
    byteAcc += Buffer.byteLength(line, 'utf8') + 1

    const accession = valSel.exec(line)?.[1]
    if (accession) values.push(accession)

    if (toSel.test(line)) {
      rows.push(values.join(';') + '\t' + from + '\t' + byteAcc + '\n')
      values = []
    }
  }

  await fs.promises.writeFile(indexPath, rows.join(''))
}

/**
 * Reads the byte range `[from, to)` of a file and decodes it as UTF-8.
 *
 * @param path File to read.
 * @param from Offset of the first byte.
 * @param to   Offset one past the last byte.
 * @returns The decoded text; shorter than `to - from` bytes if the file ends early.
 * @throws RangeError if `to < from`; rejects with the underlying error if the file
 *         cannot be opened or read.
 */
const readOffset = async (path: string, from: number, to: number): Promise<string> => {
  if (to < from) throw new RangeError(`Invalid byte range: from=${from}, to=${to}`)

  const buffer = Buffer.alloc(to - from)
  // Read-only access is enough; 'r+' would fail on files without write permission.
  const filehandle = await fs.promises.open(path, 'r')
  try {
    const { bytesRead } = await filehandle.read(buffer, 0, buffer.length, from)
    return buffer.toString('utf8', 0, bytesRead)
  } finally {
    await filehandle.close()
  }
}

/**
 * Looks up the byte range of the entry that lists `accession`, building the
 * `dbPath + '.jsi'` index first when that file does not exist yet.
 *
 * @returns `[from, to]` of the first index row whose accession list contains exactly
 *          `accession`, or `[0, 0]` when no row does.
 */
const getEntryOffset = async (dbPath: string, accession: string): Promise<[number, number]> => {
  const indexPath = dbPath + '.jsi'
  if (!fs.existsSync(indexPath)) await makeIndex(dbPath)

  const input = fs.createReadStream(indexPath)
  const lines = readline.createInterface({ input, crlfDelay: Infinity })
  try {
    for await (const line of lines) {
      const [ids = '', from, to] = line.split('\t')
      // Compare whole accessions: a pattern match would also hit longer accessions that
      // merely contain the requested one, and would misread regex metacharacters.
      if (ids.split(';').includes(accession)) return [Number(from), Number(to)]
    }
    return [0, 0]
  } finally {
    // Returning early stops the iteration but does not release the underlying file stream.
    lines.close()
    input.destroy()
  }
}

/**
 * Loads one entry from the XML database and parses it with fast-xml-parser.
 *
 * Attributes are kept without a name prefix and element text is exposed under `value`.
 * When the accession is not in the index, the range `[0, 0]` (an empty string) is parsed.
 *
 * @param dbPath    Path of the uncompressed XML database.
 * @param accession Accession to look up in the index.
 * @returns Whatever `XMLParser.parse` produces for the entry's XML fragment.
 */
const getEntry = async (dbPath: string, accession: string) => {
  const parser = new XMLParser({
    ignoreAttributes: false,
    alwaysCreateTextNode: false,
    attributeNamePrefix: "",
    textNodeName: "value",
    allowBooleanAttributes: true,
  })
  const [from, to] = await getEntryOffset(dbPath, accession)
  return parser.parse(await readOffset(dbPath, from, to))
}
/**
 * Loads the database entry for a gene name.
 *
 * @param idmappingPath Tab-separated id-mapping file (`accession<TAB>id type<TAB>id`).
 * @param dbPath        Path of the uncompressed XML database.
 * @param geneName      Gene name to resolve, matched exactly and case-sensitively.
 * @throws Error when the mapping file has no `Gene_Name` row for `geneName`.
 */
const getEntryFromGeneName = async (idmappingPath: string, dbPath: string, geneName: string) => {
  const [accession] = await getAccessFromGene(idmappingPath, geneName)
  if (accession === undefined) {
    // Without this guard the missing accession was handed on and an unrelated entry came back.
    throw new Error(`No accession found for gene name "${geneName}" in ${idmappingPath}`)
  }
  return getEntry(dbPath, accession) // seems to be always the first with entry
}

/**
 * Collects, in file order, the accessions whose `Gene_Name` row equals `geneName`.
 * Columns are compared whole, so `TP53` no longer also collects `TP53BP1`.
 */
const getAccessFromGene = async (idmappingPath: string, geneName: string): Promise<string[]> => {
  const accessions: string[] = []
  for await (const line of line$(idmappingPath)) {
    const [accession, idType, id] = line.split('\t')
    if (accession !== undefined && idType === 'Gene_Name' && id === geneName) {
      accessions.push(accession)
    }
  }
  return accessions
}

/** Phrases that introduce interaction partners in comment texts and reference scopes. */
const INTERACTION_MARKER = new RegExp('INTERACTION WITH |Interacts with |complex with ', 'i')

/**
 * A token counts as a gene symbol when it ends in an upper-case letter followed by two or
 * more upper-case letters or digits. The original class `[A-Z|0-9]` also admitted a literal
 * `|`; that is removed. The start is left unanchored as before, so tokens such as
 * `HLA-DRB1` still pass.
 */
const GENE_TOKEN = /[A-Z][A-Z0-9]{2,}$/

/** Parenthesised asides, removed before tokenising. */
const PARENTHESISED = / *\([^)]*\) */g

/** Normalises "absent | single | many" values, as produced by the XML parser, to an array. */
const toArray = <T>(value: T | T[] | null | undefined): T[] =>
  value == null ? [] : Array.isArray(value) ? value : [value]

/**
 * Returns the text of a parsed element, whether it arrived as a bare string or as an
 * object carrying the text under `value`.
 * NOTE(review): with `alwaysCreateTextNode: false` (see getEntry) attribute-less elements
 * are expected to come back as bare strings; confirm against the parser documentation.
 */
const textOf = (node: any): string | undefined => {
  if (typeof node === 'string') return node
  return typeof node?.value === 'string' ? node.value : undefined
}

/** Splits text fragments into candidate symbols and drops non-symbols, noise words and the gene itself. */
const extractGeneTokens = (
  fragments: readonly string[],
  geneName: string | undefined,
  blackList: readonly string[],
): string[] =>
  fragments
    .flatMap((fragment) => fragment.split(/,| and | /i))
    .map((token) => token.trim())
    .filter((token) => token !== '' && GENE_TOKEN.test(token))
    .filter((token) => !blackList.includes(token) && token !== geneName)

/**
 * Derives interaction partners of an entry from three places in the parsed XML:
 *  1. `comment` elements of type `interaction` (type `interactant`),
 *  2. `reference` scopes that contain an interaction phrase (type `reference_scope`),
 *  3. free-text comments that contain an interaction phrase (type `uniprot_comment_text_value`).
 *
 * Hits are grouped by partner. When a partner is found more than once, the properties of
 * the earliest hit (in the order above) win and later hits only contribute properties that
 * are still missing.
 *
 * @param json Parsed entry as returned by `getEntry` (must have an `entry` property).
 * @returns One `{ from, to, data }` record per partner, where `from` is the entry's primary
 *          gene name and `data` holds the merged hit properties without `to`.
 * @throws Error when `json` has no `entry`.
 */
const getInteractionsFromEntry = async (json: any) => {
  const blackList = ['DNA', 'PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS', 'GAMMA-SECRETASE', 'CALCIUM', 'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN', 'WORTMANNIN', 'NUCLEOSOME', 'undefined']

  const entry = json?.entry
  if (entry == null) throw new Error('getInteractionsFromEntry: parsed document has no "entry"')

  // geneName: primary name of the first gene; undefined when the entry lists no gene.
  const firstGene = toArray<any>(entry.gene)[0]
  const geneName: string | undefined = toArray<any>(firstGene?.name).find(
    (name) => name?.type === 'primary',
  )?.value

  const comments = toArray<any>(entry.comment)

  // Interactants
  const interactants = comments
    .filter((comment) => comment?.type === 'interaction')
    .map((comment) => {
      const [source, target] = toArray<any>(comment.interactant)
      return {
        type: 'interactant',
        fromProductId: source?.id,
        toProductId: target?.id,
        to: target?.label,
        nExperiments: Number(comment.experiments),
      }
    })

  // uniprot_comment_text_value
  const commentInteractsWith = comments.flatMap((comment) => {
    const text = textOf(comment?.text)
    if (text === undefined || !INTERACTION_MARKER.test(text)) return []
    const fragments = text
      .split(/\.|;/)
      .map((sentence) => sentence.replace(PARENTHESISED, ' '))
      .filter((sentence) => INTERACTION_MARKER.test(sentence))
      .flatMap((sentence) => sentence.trim().split(INTERACTION_MARKER))
    return extractGeneTokens(fragments, geneName, blackList).map((to) => ({
      type: 'uniprot_comment_text_value',
      to,
      text,
    }))
  })

  // uniprot_reference_scope
  // `reference` is normalised like the other fields: a single or missing reference used to crash.
  const referenceInteract = toArray<any>(entry.reference).flatMap((reference) => {
    const scope = toArray<any>(reference?.scope)
    const fragments = scope
      .filter((item): item is string => typeof item === 'string' && INTERACTION_MARKER.test(item))
      .map((item) => item.replace(PARENTHESISED, ' '))
      .filter((item) => INTERACTION_MARKER.test(item))
      // Only the text after the first interaction phrase is tokenised.
      .map((item) => item.trim().split(INTERACTION_MARKER)[1] ?? '')
    return extractGeneTokens(fragments, geneName, blackList).map((to) => ({
      type: 'reference_scope',
      to,
      scope,
    }))
  })

  // Group
  const byTo = new Map<string, Record<string, any>>()
  for (const hit of [...interactants, ...referenceInteract, ...commentInteractsWith]) {
    // String() keeps a missing label under the key 'undefined', which the blacklist filters out.
    const key = String(hit.to)
    byTo.set(key, { ...hit, ...byTo.get(key) })
  }

  return [...byTo.entries()]
    .filter(([to]) => !blackList.includes(to) && to !== geneName)
    .map(([to, { to: _omitted, ...data }]) => ({ from: geneName, to, data }))
}

// Disabled draft of a breadth-first "interaction distance" search, kept for reference.
// It is incomplete as written (`Object.keys(a).map((gene) => )` has no body).
// const findDistance = async (idmappingPath: string, dbPath:string, geneNameA:string, geneNameB:string, maxDistance = 6) => {
//   let rounds = [[geneNameA]]
//   let tree = {[geneNameA]: {}} as {[key:string]:any}
//   let run = true
//   let a = tree
//   Object.keys(a).map((gene) => )
//   let nIter = 0
//   while(nIter <= maxDistance && run) {
//     for (const gA of rounds[nIter]) {
//       console.log(nIter,gA);
//       const tmp = await getInteractionsFromEntry(await getEntryFromGeneName(idmappingPath, dbPath, gA))
//       if (tmp.includes(geneNameB)) { run = false; break }
//       rounds.push(tmp)
//     }
//     nIter++
//   }
//   //console.log(rounds);
//   return nIter
// }

export {
  makeIndex,
  readOffset,
  getEntry,
  getEntry as getEnrty, // misspelled alias kept so existing importers keep working
  getEntryFromGeneName,
  getInteractionsFromEntry,
}