index.ts 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. // https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.xml.gz
  2. import fs from 'fs'
  3. import readline from 'readline'
  4. import { XMLParser } from 'fast-xml-parser'
  5. const line$ = (path: string) => readline.createInterface({
  6. input: fs.createReadStream(path),
  7. crlfDelay: Infinity
  8. })
  9. const makeIndex = async (filePath: string, indexPath?: string) => {
  10. indexPath = indexPath || filePath + '.jsi'
  11. let byteAcc = 0
  12. const fromSel = new RegExp("^<entry")
  13. const toSel = new RegExp("^</entry>")
  14. const valSel = new RegExp('<accession>')
  15. let tmp = {values:[]} as any
  16. for await (const line of line$(filePath)) {
  17. if(fromSel.test(line)) tmp['from'] = byteAcc
  18. byteAcc += (line.length + 1)
  19. if (valSel.test(line)) tmp['values'].push(line.match("<accession>(.*?)</accession>")![1]) // 'uck
  20. if(toSel.test(line)) {
  21. await fs.promises.appendFile(indexPath, tmp.values.join(';') + '\t' + tmp.from + '\t' + byteAcc + '\n')
  22. tmp = {values:[]}
  23. }
  24. }
  25. }
  26. const readOffset = (path: string, from:number, to:number) => {
  27. return new Promise<string>(async (resolve, reject) => {
  28. const size = to - from
  29. const buffer = Buffer.alloc(size);
  30. let filehandle
  31. try {
  32. filehandle = await fs.promises.open(path, 'r+');
  33. await filehandle.read(buffer, 0, buffer.length, from);
  34. } finally {
  35. if (filehandle) {
  36. await filehandle.close()
  37. resolve(buffer.toString())
  38. }
  39. }
  40. })
  41. }
  42. const getEntryOffset = async (dbPath:string, accession:string): Promise<number[]> => {
  43. const indexPath = dbPath + '.jsi'
  44. if (!fs.existsSync(indexPath)) await makeIndex(dbPath)
  45. const lineSel = new RegExp(accession)
  46. for await (const line of line$(indexPath)) {
  47. if (lineSel.test(line)) return [Number(line.split('\t')[1]),Number(line.split('\t')[2])]
  48. }
  49. return [0, 0]
  50. }
  51. const getEntry = async (dbPath:string, accession:string) => {
  52. const parser = new XMLParser({
  53. ignoreAttributes: false,
  54. alwaysCreateTextNode: false,
  55. attributeNamePrefix: "",
  56. textNodeName: "value",
  57. allowBooleanAttributes: true,
  58. })
  59. const offsets = await getEntryOffset(dbPath, accession)
  60. return parser.parse(await readOffset(dbPath, offsets[0], offsets[1]))
  61. }
  62. const getEntryFromGeneName = async (idmappingPath: string, dbPath:string, geneName:string) => {
  63. const accessions = await getAccessFromGene(idmappingPath, geneName)
  64. return await getEntry(dbPath, accessions[0]) // seems to be always the first with entry
  65. }
  66. const getAccessFromGene = async (idmappingPath: string, geneName:string) => {
  67. const sel = new RegExp('Gene_Name\t' + geneName)
  68. let accessions: any[] = []
  69. for await (const line of line$(idmappingPath)) {
  70. if(sel.test(line)) accessions.push(line.split('\t')[0])
  71. }
  72. return accessions
  73. }
  74. const getInteractionsFromEntry = async (json:any) => {
  75. const blaskList =
  76. ['DNA', 'PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'PROTEINS', 'GAMMA-SECRETASE', 'CALCIUM',
  77. 'MICROBIAL', 'VIRUS', 'HEPATITIS', 'HERPES', 'SIMPLEX', 'RELATED', 'AND', 'CLATHRIN', 'WORTMANNIN']
  78. const uniprotIDs = Array.isArray(json.entry.accession) ? json.entry.accession : [json.entry.accession]
  79. // geneName
  80. const gnT = Array.isArray(json.entry.gene.name) ? json.entry.gene.name : [json.entry.gene.name]
  81. const geneName = gnT.filter((e:any)=> e.type === 'primary').map((e:any)=> e.value)[0]
  82. // Interactants
  83. const jecT = Array.isArray(json.entry.comment) ? json.entry.comment : [json.entry.comment]
  84. const interactants = jecT
  85. .filter((e:any)=> e?.type === 'interaction')
  86. .flatMap((e:any) => ({
  87. type : 'interactant',
  88. fromProductId: e.interactant[0].id,
  89. toProductId : e.interactant[1].id,
  90. to : e.interactant[1].label,
  91. nExperiments : Number(e.experiments)
  92. }))
  93. const regExp = new RegExp('INTERACTION WITH |Interacts with |complex with ', 'i')
  94. const geneRegExp = new RegExp(/[A-Z]{1}[A-Z|0-9]{2,}$/)
  95. // uniprot_comment_text_value
  96. const commentInteractsWith = jecT
  97. .filter((e:any) => e?.text?.value)
  98. .filter((e:any) => regExp.test(e.text.value))
  99. .map((e:any) => ({
  100. to : e.text.value
  101. .split(/\.|;/)
  102. .flatMap((ee:any) => ee.replace(/ *\([^)]*\) */g, ' '))
  103. .filter((ee:any) => regExp.test(ee))
  104. .flatMap((ee:any) => ee.trim().split(regExp))
  105. .flatMap((ee:any) => ee.split(/,| and | /))
  106. .filter((_:any) => _)
  107. .filter((ee:any) => geneRegExp.test(ee))
  108. .filter((ee:any) => !blaskList.includes(ee) && ee !== geneName)
  109. .map((ee:any) => ee.trim()),
  110. text: e.text.value,
  111. //evidences: e.text.evidence.split(' ')//.map((ee:string)=> json.entry.reference.filter((eee:any)=> eee.key === ee)) // Doesnt work with ref key
  112. }))
  113. .flatMap((e:any)=> e.to.flatMap((ee:any) => ({
  114. type : 'uniprot_comment_text_value',
  115. to : ee,
  116. text : e.text
  117. })))
  118. // uniprot_reference_scope
  119. const referenceInteract = json.entry.reference
  120. .map((e:any)=> ({...e, scope : Array.isArray(e.scope) ? e.scope : [e.scope]}))
  121. .filter((e:any) => regExp.test(e.scope.join('')))
  122. .map((e:any)=> ({
  123. to: e.scope
  124. //.split(/\.|;/)
  125. .flatMap((ee:any) => regExp.test(ee) ? [ee] : [])
  126. .filter((_:any) => _)
  127. .flatMap((ee:any) => ee.replace(/ *\([^)]*\) */g, ' '))
  128. .filter((ee:any) => regExp.test(ee))
  129. .flatMap((ee:any) => ee.trim().split(regExp)[1])
  130. .flatMap((ee:any) => ee.split(/,| and | /i))
  131. .filter((_:any) => _)
  132. .filter((ee:any) => geneRegExp.test(ee))
  133. .filter((ee:any) => !blaskList.includes(ee) && ee !== geneName)
  134. .map((ee:any) => ee.trim()),
  135. ...e,
  136. }))
  137. .flatMap((e:any)=> e.to.flatMap((ee:any) => ({
  138. type : 'reference_scope',
  139. to : ee,
  140. scope : e.scope,
  141. //citation: e.citation
  142. })))
  143. // Group
  144. const byTo = {} as {[key:string]: any}
  145. [...interactants, ...referenceInteract, ...commentInteractsWith]
  146. //.map((e:any)=> byTo[e.to] = byTo[e.to] ? [e, ...byTo[e.to]] : [e] )
  147. .map((e:any)=> byTo[e.to] = byTo[e.to] ? {...e, ...byTo[e.to]} : {...e} )
  148. const results = Object.keys(byTo).map((e:any) => {
  149. delete byTo[e]?.to
  150. return {
  151. from: geneName,
  152. to: e,
  153. data: byTo[e]
  154. }
  155. })
  156. .filter((e:any) => !blaskList.includes(e.to) && e.to !== geneName)
  157. // await fs.promises.writeFile('test/tmp.json', JSON.stringify(results.map((e:any)=>e.to), null,4))
  158. return results
  159. }
  160. // const findDistance = async (idmappingPath: string, dbPath:string, geneNameA:string, geneNameB:string, maxDistance = 6) => {
  161. // let rounds = [[geneNameA]]
  162. // let tree = {[geneNameA]: {}} as {[key:string]:any}
  163. // let run = true
  164. // let a = tree
  165. // Object.keys(a).map((gene) => )
  166. // let nIter = 0
  167. // while(nIter <= maxDistance && run) {
  168. // for (const gA of rounds[nIter]) {
  169. // console.log(nIter,gA);
  170. // const tmp = await getInteractionsFromEntry(await getEntryFromGeneName(idmappingPath, dbPath, gA))
  171. // if (tmp.includes(geneNameB)) { run = false; break }
  172. // rounds.push(tmp)
  173. // }
  174. // nIter++
  175. // }
  176. // //console.log(rounds);
  177. // return nIter
  178. // }
  179. export { makeIndex, readOffset, getEntry as getEnrty, getEntryFromGeneName, getInteractionsFromEntry }