import fs from 'fs'
import readline from 'readline'
import {chi2test} from '@stdlib/stats'

// http://geneontology.org/docs/guide-go-evidence-codes/
// http://wiki.geneontology.org/index.php/Category:Evidence_Codes
// http://current.geneontology.org/ontology/go-basic.obo
// http://current.geneontology.org/annotations/goa_human.gaf.gz

/** Loosely typed record used for parsed annotation rows and OBO stanzas. */
type Dict = {[key: string]: any}

/**
 * Output keys assigned, in order, to the tab-separated columns of an
 * annotation line. The names are part of the objects returned to callers, so
 * they are kept exactly as they were. Columns beyond this list are ignored.
 */
const headerGOA = [
  'database', 'ID', 'Symbol', 'Qualifier', 'GO_Term', 'Evidence',
  'Evidence_Code', 'With', 'From', 'Name', 'Alternative_symbols', 'Class',
  'Taxon', 'Date', 'Origin'
]

// Zero-based positions of the columns used for matching (see headerGOA).
const SYMBOL_COLUMN = 2
const QUALIFIER_COLUMN = 3
const GO_TERM_COLUMN = 4

/** Opens `path` as a line-by-line async iterable (CRLF is treated as one break). */
const line$ = (path: string) =>
  readline.createInterface({
    input: fs.createReadStream(path),
    crlfDelay: Infinity
  })

/**
 * Zips the columns of one annotation line with `headerGOA`.
 *
 * Columns are matched by position, so empty columns are kept (as `''`) rather
 * than dropped; dropping them would shift every later value under the wrong
 * key. Columns without a header name are skipped.
 *
 * @param cols - The line split on tabs.
 * @param splitPipes - When true, values containing `|` become string arrays.
 */
const toRecord = (cols: string[], splitPipes: boolean): Dict => {
  const record: Dict = {}
  headerGOA.forEach((key, i) => {
    const value = cols[i]
    if (value === undefined) return
    record[key] = splitPipes && value.includes('|') ? value.split('|') : value
  })
  return record
}

/**
 * Collects every annotation row of `goaPath` whose Symbol column equals
 * `symbol`, and replaces each row's `GO_Term` id with the matching stanza
 * from `oboPath` (or `undefined` when the ontology file has no such id).
 *
 * The symbol is compared literally against the Symbol column only, so regex
 * metacharacters in the symbol and identical text in other columns cannot
 * produce false matches.
 *
 * @param symbol - Gene symbol to look up.
 * @param goaPath - Path to the tab-separated annotation file.
 * @param oboPath - Path to the OBO ontology file.
 * @returns One record per matching row, keyed by `headerGOA`; values that
 *   contain `|` are split into arrays.
 */
const getSymbol = async (symbol: string, goaPath: string, oboPath: string) => {
  const results: Dict[] = []

  for await (const line of line$(goaPath)) {
    if (line.startsWith('!')) continue // file header / comment line
    const cols = line.split('\t')
    if (cols[SYMBOL_COLUMN] !== symbol) continue
    results.push(toRecord(cols, true))
  }

  // Resolve each distinct GO id once, then index the stanzas by id.
  const wantedIds = [...new Set(results.map(row => row.GO_Term as string))]
  const stanzaById = new Map<string, Dict>()
  for (const stanza of await getGOterms(wantedIds, oboPath)) {
    if (!stanzaById.has(stanza.id)) stanzaById.set(stanza.id, stanza)
  }

  for (const row of results) {
    row.GO_Term = stanzaById.get(row.GO_Term)
  }

  return results
}

/**
 * Reads `oboPath` and returns the stanza of every requested term id.
 *
 * A stanza starts at a line that is exactly `id: <term>` and ends at the next
 * empty line (or at end of file). Only the primary `id:` line is matched, so
 * an `alt_id:` line mentioning a requested id does not start a capture.
 * Each `key: value` line becomes one property; the value is everything after
 * the first `": "`. When a key repeats inside a stanza the last occurrence
 * wins.
 *
 * @param terms - One term id or a list of term ids.
 * @param oboPath - Path to the OBO ontology file.
 * @returns The matching stanzas, in file order.
 */
const getGOterms = async (
  terms: string[] | string,
  oboPath: string
) => {
  const wanted = new Set(Array.isArray(terms) ? terms : [terms])
  const results: Dict[] = []
  let current: Dict | null = null

  for await (const line of line$(oboPath)) {
    if (current === null) {
      // Outside a wanted stanza: only a matching primary id line is relevant.
      if (!line.startsWith('id: ') || !wanted.has(line.slice(4))) continue
      current = {}
    } else if (line === '') {
      results.push(current)
      current = null
      continue
    }

    const sep = line.indexOf(': ')
    if (sep === -1) {
      current[line] = undefined
    } else {
      current[line.slice(0, sep)] = line.slice(sep + 2)
    }
  }

  // The file may end without a blank line after the last stanza.
  if (current !== null) results.push(current)

  return results
}

/**
 * For each requested term, collects the distinct gene symbols annotated to it
 * in `goaPath` and merges in the term's stanza from `oboPath`.
 *
 * A row counts for a term only when its GO_Term column equals the term id.
 * Both files are read once, regardless of how many terms are requested.
 *
 * @param terms - One term id or a list of term ids.
 * @param qualifiers - Accepted for interface compatibility; it does not
 *   affect the result (rows are counted whatever their qualifier).
 * @param goaPath - Path to the tab-separated annotation file.
 * @param oboPath - Path to the OBO ontology file.
 * @returns One object per requested term, in input order, with `nTotal`
 *   (number of distinct genes), `genes` (their symbols, in first-seen order)
 *   and the properties of the term's OBO stanza when one was found.
 */
const getN = async (
  terms: string | string[],
  qualifiers: string | string[],
  goaPath: string,
  oboPath: string
): Promise<any[]> => {
  const termList = Array.isArray(terms) ? terms : [terms]

  const stanzaById = new Map<string, Dict>()
  for (const stanza of await getGOterms(termList, oboPath)) {
    if (!stanzaById.has(stanza.id)) stanzaById.set(stanza.id, stanza)
  }

  const genesByTerm = new Map<string, Set<string>>()
  for (const term of termList) genesByTerm.set(term, new Set<string>())

  for await (const line of line$(goaPath)) {
    if (line.startsWith('!')) continue // file header / comment line
    const cols = line.split('\t')
    const goId = cols[GO_TERM_COLUMN]
    const symbol = cols[SYMBOL_COLUMN]
    if (goId === undefined || symbol === undefined) continue
    genesByTerm.get(goId)?.add(symbol)
  }

  return termList.map(term => {
    const genes = [...(genesByTerm.get(term) ?? [])]
    return {nTotal: genes.length, genes, ...stanzaById.get(term)}
  })
}

/**
 * Scores the GO terms annotated to a list of gene symbols.
 *
 * Rows of `goaPath` whose Symbol column is one of `symbols` and whose
 * Qualifier column equals `qualifier` select the candidate terms. For each
 * candidate term the genes annotated to it are collected with `getN`, and the
 * overlap with `symbols` is reported together with a chi-square p-value.
 * Duplicate entries in `symbols` are counted once.
 *
 * The number of distinct gene symbols in the annotation file is printed to
 * stdout, as before.
 *
 * @param symbols - Gene symbols of interest.
 * @param qualifier - Exact value required in the Qualifier column.
 * @param goaPath - Path to the tab-separated annotation file.
 * @param oboPath - Path to the OBO ontology file.
 * @returns One object per candidate term with `n` (observed genes), `prop`
 *   (`n` divided by the term's gene count), `chi2test` (p-value),
 *   `observedGenes`, plus everything returned by `getN` for the term; sorted
 *   by descending p-value.
 */
const getAllSymbols = async (
  symbols: string[],
  qualifier: string,
  goaPath: string,
  oboPath: string
) => {
  const symbolList: string[] = Array.isArray(symbols) ? symbols : [symbols]
  const uniqueSymbols = [...new Set(symbolList)]
  const wanted = new Set(uniqueSymbols)

  const allGoa: Dict[] = []
  const allGenes = new Set<string>()

  for await (const line of line$(goaPath)) {
    // Header lines carry no gene and must not be counted as one.
    if (line.startsWith('!')) continue
    const cols = line.split('\t')
    const symbol = cols[SYMBOL_COLUMN]
    if (symbol === undefined) continue

    allGenes.add(symbol)
    if (wanted.has(symbol) && cols[QUALIFIER_COLUMN] === qualifier) {
      allGoa.push(toRecord(cols, false))
    }
  }

  const nAllGenes = allGenes.size
  // NOTE(review): looks like leftover debug output; kept so stdout is unchanged.
  console.log(nAllGenes);

  const candidateTerms = [...new Set(allGoa.map(row => row.GO_Term as string))]
  const allIDs = await getN(candidateTerms, qualifier, goaPath, oboPath)

  return allIDs
    .map((term: any) => {
      const termGenes = new Set<string>(term.genes)
      const observedGenes = uniqueSymbols.filter(s => termGenes.has(s))
      const notObserved = Math.max(term.genes.length - observedGenes.length, 0)

      // NOTE(review): the table below is kept as originally written
      // ([observed, term genes not observed] vs [query size, all genes]).
      // It is not the usual 2x2 enrichment table, and sorting by descending
      // p-value puts the least significant terms first -- confirm both are
      // intended before relying on the ranking.
      return {
        n: observedGenes.length,
        prop: observedGenes.length / term.genes.length,
        chi2test: chi2test([
          [observedGenes.length, notObserved],
          [uniqueSymbols.length, nAllGenes]
        ]).pValue,
        observedGenes,
        ...term
      }
    })
    .sort((a, b) => b.chi2test - a.chi2test)
}

export {
  getSymbol,
  getGOterms,
  getN,
  getAllSymbols
}