// https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.xml.gz

import fs from 'fs'
import readline from 'readline'
import { XMLParser } from 'fast-xml-parser'
import jsonata from 'jsonata'

/**
 * Yields the lines of a text file one at a time.
 *
 * Implemented as an async generator so that the underlying file stream is
 * destroyed when the consumer stops early (`break` / `return` / `throw`);
 * closing the readline interface alone does not close its input stream.
 */
async function* line$(path: string): AsyncGenerator<string> {
  const input = fs.createReadStream(path)
  const rl = readline.createInterface({ input, crlfDelay: Infinity })
  try {
    for await (const line of rl) yield line
  } finally {
    rl.close()
    input.destroy()
  }
}

/** Wraps a possibly-single, possibly-missing value into an array. */
const asArray = <T>(value: T | T[] | null | undefined): T[] => {
  if (value == null) return []
  return Array.isArray(value) ? value : [value]
}

/**
 * Builds a byte-offset index of the entries in an XML database file.
 *
 * Each index row is `acc1;acc2;...<TAB>from<TAB>to<LF>`, where `from` is the
 * byte offset of the line opening the entry and `to` is the offset just past
 * the newline of the line closing it.
 *
 * NOTE(review): the tag patterns below were reconstructed; the literals in the
 * previous revision had lost their markup (empty patterns, undeclared `toSel`).
 * Confirm them against the data file.
 *
 * Offsets are counted in UTF-8 bytes plus one byte per line terminator, so the
 * input is assumed to use LF line endings.
 *
 * @param filePath  Path of the XML database to scan.
 * @param indexPath Where to write the index; defaults to `filePath + '.jsi'`.
 */
const makeIndex = async (filePath: string, indexPath?: string): Promise<void> => {
  const target = indexPath || filePath + '.jsi'
  const entryOpen = /^<entry/
  const entryClose = /^<\/entry>/
  const accessionTag = /<accession>(.*?)<\/accession>/g

  const rows: string[] = []
  let byteAcc = 0
  let from: number | undefined
  let accessions: string[] = []

  for await (const line of line$(filePath)) {
    if (entryOpen.test(line)) {
      from = byteAcc
      accessions = []
    }
    // Count bytes, not UTF-16 code units: readOffset() seeks by byte position.
    byteAcc += Buffer.byteLength(line, 'utf8') + 1
    if (from === undefined) continue

    for (const found of line.matchAll(accessionTag)) accessions.push(found[1] ?? '')

    if (entryClose.test(line)) {
      rows.push(accessions.join(';') + '\t' + from + '\t' + byteAcc + '\n')
      from = undefined
    }
  }

  // Written once, after a complete scan: re-indexing replaces the old index
  // instead of appending duplicate rows, and a failed scan leaves no partial file.
  await fs.promises.writeFile(target, rows.join(''))
}

/**
 * Reads the bytes in `[from, to)` of a file and returns them decoded as UTF-8.
 *
 * @param path File to read.
 * @param from Byte offset of the first byte to read.
 * @param to   Byte offset just past the last byte to read.
 * @returns The decoded text; shorter than `to - from` bytes if the file ends early.
 * @throws RangeError if `to < from`; file-system errors from open/read are propagated.
 */
const readOffset = async (path: string, from: number, to: number): Promise<string> => {
  const size = to - from
  if (size < 0) throw new RangeError(`readOffset: "to" (${to}) is smaller than "from" (${from})`)

  const buffer = Buffer.alloc(size)
  // Read-only access is all that is needed here.
  const filehandle = await fs.promises.open(path, 'r')
  try {
    const { bytesRead } = await filehandle.read(buffer, 0, size, from)
    return buffer.toString('utf8', 0, bytesRead)
  } finally {
    await filehandle.close()
  }
}

/**
 * Looks up the byte range of an entry in the `.jsi` index next to `dbPath`,
 * building the index first when it does not exist yet.
 *
 * @returns `[from, to]` for the first index row listing exactly `accession`,
 *          or `[0, 0]` when no row does.
 */
const getEntryOffset = async (dbPath: string, accession: string): Promise<[number, number]> => {
  const indexPath = dbPath + '.jsi'
  if (!fs.existsSync(indexPath)) await makeIndex(dbPath)

  for await (const line of line$(indexPath)) {
    const [accessions = '', from, to] = line.split('\t')
    // Exact comparison: a pattern match would also hit longer accessions
    // sharing a prefix, or digits in the offset columns.
    if (accessions.split(';').includes(accession)) return [Number(from), Number(to)]
  }
  return [0, 0]
}

/**
 * Loads one entry from the XML database and parses it into a plain object.
 *
 * When the accession is not in the index, the offsets are `[0, 0]` and the
 * parser is handed an empty string.
 *
 * @param dbPath    Path of the XML database (its index lives at `dbPath + '.jsi'`).
 * @param accession Accession of the entry to load.
 */
const getEntry = async (dbPath: string, accession: string) => {
  const parser = new XMLParser({
    ignoreAttributes: false,
    alwaysCreateTextNode: false,
    attributeNamePrefix: "",
    textNodeName: "value",
    allowBooleanAttributes: true,
  })
  const [from, to] = await getEntryOffset(dbPath, accession)
  return parser.parse(await readOffset(dbPath, from, to))
}

/** @deprecated Misspelled name kept for existing callers; use `getEntry`. */
const getEnrty = getEntry

/**
 * Resolves a gene name to an accession through a tab-separated mapping file
 * (rows of `accession<TAB>type<TAB>id`) and loads that entry.
 *
 * Only the first row whose type is `Gene_Name` and whose id equals `geneName`
 * is used.
 *
 * @throws Error when the mapping file has no row for `geneName`.
 */
const getEntryFromGeneName = async (idmappingPath: string, dbPath: string, geneName: string) => {
  let accession: string | undefined
  for await (const line of line$(idmappingPath)) {
    const [candidate, idType, id] = line.split('\t')
    if (idType === 'Gene_Name' && id === geneName && candidate) {
      accession = candidate
      break
    }
  }
  if (accession === undefined) {
    throw new Error(`No accession found for gene name "${geneName}" in ${idmappingPath}`)
  }
  return await getEntry(dbPath, accession)
}

/** Upper-case words that the token patterns pick up but that are not gene names. */
const NON_GENE_TOKENS = new Set([
  'PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'MICROBIAL',
  'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX',
])

const INTERACTION_MARKER = "INTERACTION WITH "

/**
 * Heuristically collects names of interaction partners from a parsed entry.
 *
 * Three sources are combined:
 *  1. labels of `interactant` elements in comments of type "interaction"
 *     (labels equal to one of the entry's own accessions are skipped);
 *  2. the text following "INTERACTION WITH " in reference scopes;
 *  3. upper-case tokens on "Interacts ..." lines of comments of type "subunit",
 *     after parenthesised text has been removed.
 *
 * Every candidate is then reduced to tokens of two or more capital letters
 * followed by at least one capital letter or digit, so names with fewer than
 * two leading letters (e.g. "H2AX") do not survive. Tokens listed in
 * `NON_GENE_TOKENS` are dropped.
 *
 * @param json Parsed entry as returned by `getEntry`.
 * @returns Unique tokens in sorted order; empty when nothing is found.
 */
const getInteractionsFromEntry = async (json: any): Promise<string[]> => {
  const ownAccessions = asArray(json?.entry?.accession)

  // `await` keeps this working whether evaluate() returns a value or a promise.
  // Comment interactant
  const interactants = asArray(
    await jsonata(`entry.comment[type="interaction"].interactant`).evaluate(json)
  )
  const fromInteractants = interactants
    .map((interactant: any) => interactant?.label)
    .filter((label: any) => label && !ownAccessions.includes(label))

  // Reference scope = INTERACTION WITH
  const references = asArray(
    await jsonata(`entry.reference[scope ~> /INTERACTION WITH/i ]`).evaluate(json)
  )
  const fromScopes = references
    .flatMap((reference: any) => asArray(reference?.scope))
    .filter((scope: any): scope is string => typeof scope === 'string')
    .flatMap((scope: string) => {
      const at = scope.indexOf(INTERACTION_MARKER)
      // Skip scopes without the marker instead of slicing from a bogus offset.
      return at < 0 ? [] : [scope.slice(at + INTERACTION_MARKER.length)]
    })
    .flatMap((text: string) => text.split(/; | AND /))

  // Comment subunit
  const subunitTexts = asArray(
    await jsonata(`entry.comment[type="subunit"].text.value`).evaluate(json)
  )
  const fromSubunits = subunitTexts
    .filter((text: any): text is string => typeof text === 'string')
    .flatMap((text: string) => text.replace(/ *\([^)]*\) */g, '').split(/\n/))
    .filter((sentence: string) => /Interacts/.test(sentence))
    .flatMap((sentence: string) => sentence.match(/[A-Z][A-Z0-9]{2,}/g) ?? [])

  const candidates = [...fromScopes, ...fromInteractants, ...fromSubunits]
    .filter((candidate: any): candidate is string => typeof candidate === 'string')

  const tokens = candidates
    .flatMap((candidate: string) => candidate.match(/[A-Z]{2,}[A-Z0-9]{1,}/g) ?? [])
    .filter((token: string) => !NON_GENE_TOKENS.has(token))

  // Dedupe and sort after tokenising: splitting candidates can reintroduce
  // duplicates and break the ordering.
  return [...new Set(tokens)].sort()
}

// Unfinished draft, kept for reference:
// const findDistance = async (idmappingPath: string, dbPath:string, geneNameA:string, geneNameB:string, maxDistance = 6) => {
//   let rounds = [[geneNameA]]
//   let tree = {[geneNameA]: {}} as {[key:string]:any}
//   let run = true
//   let a = tree
//   Object.keys(a).map((gene) => )
//   let nIter = 0
//   while(nIter <= maxDistance && run) {
//     for (const gA of rounds[nIter]) {
//       console.log(nIter,gA);
//       const tmp = await getInteractionsFromEntry(await getEntryFromGeneName(idmappingPath, dbPath, gA))
//       if (tmp.includes(geneNameB)) { run = false; break }
//       rounds.push(tmp)
//     }
//     nIter++
//   }
//   //console.log(rounds);
//   return nIter
// }

export {
  makeIndex,
  readOffset,
  getEntry,
  getEnrty,
  getEntryFromGeneName,
  getInteractionsFromEntry
}