// index.ts
import fs from 'fs'
import readline from 'readline'
import {chi2test} from '@stdlib/stats'
  4. const line$ = (path: string) => readline.createInterface({
  5. input: fs.createReadStream(path),
  6. crlfDelay: Infinity
  7. })
// Gene Ontology references:
//   Evidence codes guide:     http://geneontology.org/docs/guide-go-evidence-codes/
//   Evidence codes (wiki):    http://wiki.geneontology.org/index.php/Category:Evidence_Codes
//   Ontology file (OBO):      http://current.geneontology.org/ontology/go-basic.obo
//   Human annotations (GAF):  http://current.geneontology.org/annotations/goa_human.gaf.gz
  12. const getSymbol = async (symbol:string, goaPath:string, oboPath:string) => {
  13. const header = [
  14. 'database', 'ID', 'Symbol', 'Qualifier',
  15. 'GO_Term', 'Evidence', 'Evidence_Code',
  16. 'With', 'From','Name', 'Alternative_symbols',
  17. 'Class', 'Taxon', 'Date', 'Origin'
  18. ]
  19. const tester = new RegExp('\t'+symbol+'\t')
  20. const separator = new RegExp('\\|')
  21. const results = [] as Array<{[key:string]:any}>
  22. for await (const line of line$(goaPath)) {
  23. if(tester.test(line)) results.push(line.split('\t').filter((e:any)=>e).reduce((p,c,i) => ({...p, [header[i]] : separator.test(c) ? c.split('|') : c}), {}))
  24. }
  25. const subTerms = results.map(e => e?.GO_Term)
  26. const cacheTerms = await getGOterms(subTerms, oboPath)
  27. for (let index = 0; index < results.length; index++) {
  28. const goTerm = results[index]?.GO_Term
  29. results[index]['GO_Term'] = cacheTerms.filter(e => e.id === goTerm)[0]
  30. }
  31. return results
  32. }
  33. const getGOterms = async (
  34. terms: string[] | string,
  35. oboPath: string
  36. ) => {
  37. terms = Array.isArray(terms) ? terms : [terms]
  38. const testerList = terms.map(e => new RegExp('id: ' + e))
  39. let delim = false
  40. const results = []
  41. let result = {} as {[key:string]: any}
  42. for await (const line of line$(oboPath)) {
  43. if (testerList.some(rx => rx.test(line))) delim = true
  44. if (line === '' && delim) {
  45. delim = false
  46. results.push(result)
  47. result = {}
  48. }
  49. if (delim) result[line.split(': ')[0]] = line.split(': ')[1]
  50. }
  51. return results
  52. }
  53. const getN = async (
  54. terms: string | string[],
  55. qualifiers: string | string[],
  56. goaPath: string,
  57. oboPath: string
  58. ) => {
  59. terms = Array.isArray(terms) ? terms : [terms]
  60. qualifiers = Array.isArray(qualifiers) ? qualifiers : [qualifiers]
  61. let qualifs = {} as {[key:string]: {[key:string]:any}}
  62. let termsList: any[] = []
  63. for (const term of terms) {
  64. termsList.push({
  65. nTotal: 0,
  66. genes: [],
  67. ...(await getGOterms(term, oboPath))[0],
  68. test: new RegExp('\t' + term + '\t'),
  69. })
  70. }
  71. qualifiers.map((e:any)=> qualifs[e] = {})
  72. for await (const line of line$(goaPath)) {
  73. const t = termsList.filter((rx:any) => rx.test.test(line))
  74. if (t.length > 0) {
  75. termsList = termsList.map((rx:any) => {
  76. if (rx.test.test(line)) {
  77. const genes = [...new Set([...(rx?.genes), line.split('\t')[2]])]
  78. return {...rx, nTotal:genes.length, genes}
  79. } else {
  80. return rx
  81. }
  82. })
  83. }
  84. }
  85. return termsList.map((e:any)=> {
  86. delete e['test']
  87. return e
  88. })
  89. }
  90. const headerGOA = [
  91. 'database', 'ID', 'Symbol', 'Qualifier',
  92. 'GO_Term', 'Evidence', 'Evidence_Code',
  93. 'With', 'From','Name', 'Alternative_symbols',
  94. 'Class', 'Taxon', 'Date', 'Origin']
  95. const getAllSymbols = async (
  96. symbols: string[],
  97. qualifier: string,
  98. goaPath: string,
  99. oboPath: string
  100. ) => {
  101. symbols = Array.isArray(symbols) ? symbols : [symbols]
  102. const testerList = [...new Set(symbols)].map(e => new RegExp('\t' + e + '\t' + qualifier + '\t'))
  103. const allGoa = [] as Array<{[key:string]:any}>
  104. const allGenes = {} as {[key:string]: any}
  105. for await (const line of line$(goaPath)) {
  106. if (testerList.some(rx => rx.test(line))) {
  107. allGoa.push(
  108. line.split('\t').
  109. reduce((p,c,i) => ({...p, [headerGOA[i]] : c}), {})
  110. )
  111. }
  112. allGenes[line.split('\t')[2]] = {}
  113. }
  114. const nAllGenes = Object.keys(allGenes).length
  115. console.log(nAllGenes);
  116. const allIDs = await getN([...new Set(allGoa.map((e:any)=> e.GO_Term))], qualifier, goaPath, oboPath)
  117. return allIDs.map((e:any) => {
  118. const observedGenes = symbols.filter((ee:string)=> e.genes.includes(ee))
  119. // const observedGenesNotIn = symbols.filter((ee:string)=> !e.genes.includes(ee))
  120. // const allIn = Object.keys(allGenes).filter((ee:any)=>)
  121. const diff = e.genes.length - observedGenes.length
  122. return {
  123. n: observedGenes.length,
  124. prop: observedGenes.length/e.genes.length,
  125. chi2test: chi2test(
  126. [[observedGenes.length,
  127. diff < 0 ? 0 : diff],
  128. [symbols.length,
  129. nAllGenes]]
  130. ).pValue,
  131. observedGenes,
  132. ...e
  133. }
  134. }).sort((a,b) => b.chi2test - a.chi2test)
  135. }
  136. export { getSymbol, getGOterms, getN, getAllSymbols }