index.ts 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. // https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.xml.gz
  2. import fs from 'fs'
  3. import readline from 'readline'
  4. import { XMLParser } from 'fast-xml-parser'
  5. import jsonata from 'jsonata'
  /**
   * Opens `path` as a read stream and wraps it in a readline interface so the
   * file can be consumed one line at a time (e.g. with `for await`).
   *
   * @param path - File to read.
   * @returns A readline interface over the file's contents.
   */
  const line$ = (path: string) => {
    const source = fs.createReadStream(path)
    // crlfDelay: Infinity makes "\r\n" always count as a single line break.
    return readline.createInterface({ input: source, crlfDelay: Infinity })
  }
  /**
   * Builds a tab-separated offset index for an XML database file.
   *
   * For every `<entry ...> ... </entry>` block (both tags must start their line)
   * one row is appended to the index:
   * `accession1;accession2;...<TAB>fromByte<TAB>toByte`.
   *
   * Offsets are byte positions, because `readOffset` reads by byte. Each line is
   * assumed to end with a single "\n"; files with "\r\n" endings would drift by
   * one byte per line.
   *
   * @param filePath - Path of the XML file to index.
   * @param indexPath - Destination of the index; defaults to `filePath + '.jsi'`.
   *   Rows are appended, so an existing index file is extended, not replaced.
   */
  const makeIndex = async (filePath: string, indexPath?: string): Promise<void> => {
    indexPath = indexPath || filePath + '.jsi'
    const entryStart = /^<entry/
    const entryEnd = /^<\/entry>/
    const accessionSel = /<accession>(.*?)<\/accession>/

    let byteAcc = 0
    let from: number | undefined
    let accessions: string[] = []

    for await (const line of line$(filePath)) {
      if (entryStart.test(line)) from = byteAcc
      // Count bytes, not UTF-16 code units: multi-byte characters would otherwise
      // shift every subsequent offset.
      byteAcc += Buffer.byteLength(line, 'utf8') + 1

      // Lines with an unterminated <accession> tag are skipped instead of crashing.
      const found = accessionSel.exec(line)
      if (found && found[1] !== undefined) accessions.push(found[1])

      if (entryEnd.test(line)) {
        // A closing tag without a matching opening tag has no usable start offset.
        if (from !== undefined) {
          await fs.promises.appendFile(indexPath, accessions.join(';') + '\t' + from + '\t' + byteAcc + '\n')
        }
        from = undefined
        accessions = []
      }
    }
  }
  /**
   * Reads the byte range `[from, to)` of a file and returns it decoded as UTF-8.
   *
   * @param path - File to read from.
   * @param from - Byte offset of the first byte to read.
   * @param to - Byte offset one past the last byte to read.
   * @returns The decoded text. If the file ends before `to`, only the bytes that
   *   were actually read are returned.
   * @throws Rejects when the range is invalid (`to < from`) or the file cannot be
   *   opened or read.
   */
  const readOffset = async (path: string, from: number, to: number): Promise<string> => {
    const buffer = Buffer.alloc(to - from)
    // Read-only access is enough; 'r+' would fail on files without write permission.
    const filehandle = await fs.promises.open(path, 'r')
    try {
      const { bytesRead } = await filehandle.read(buffer, 0, buffer.length, from)
      // Decode only what was read so a short read does not append NUL padding.
      return buffer.subarray(0, bytesRead).toString()
    } finally {
      await filehandle.close()
    }
  }
  /**
   * Looks up the byte range of an entry in the `.jsi` index that sits next to the
   * database file, building the index first if it does not exist yet.
   *
   * @param dbPath - Path of the XML database; the index is `dbPath + '.jsi'`.
   * @param accession - Accession to look up. It must equal one of the
   *   `;`-separated accessions in the first column of an index row.
   * @returns `[from, to]` byte offsets of the first matching row, or `[0, 0]`
   *   when the accession is not indexed.
   */
  const getEntryOffset = async (dbPath: string, accession: string): Promise<number[]> => {
    const indexPath = dbPath + '.jsi'
    if (!fs.existsSync(indexPath)) await makeIndex(dbPath)
    for await (const line of line$(indexPath)) {
      const [ids, from, to] = line.split('\t')
      // Exact comparison per accession: a regex over the whole row would also hit
      // longer accessions sharing a prefix, and even the numeric offset columns.
      if (ids !== undefined && ids.split(';').includes(accession)) {
        return [Number(from), Number(to)]
      }
    }
    return [0, 0]
  }
  /**
   * Loads a single entry from the XML database and returns it parsed into a
   * plain object.
   *
   * @param dbPath - Path of the XML database file.
   * @param accession - Accession of the entry to load.
   * @returns The result of parsing the entry's XML slice.
   */
  const getEnrty = async (dbPath: string, accession: string) => {
    // Attributes are kept under their bare names; element text goes under "value".
    const parserOptions = {
      ignoreAttributes: false,
      alwaysCreateTextNode: false,
      attributeNamePrefix: "",
      textNodeName: "value",
      allowBooleanAttributes: true,
    }
    const xmlParser = new XMLParser(parserOptions)
    const range = await getEntryOffset(dbPath, accession)
    const xml = await readOffset(dbPath, range[0], range[1])
    return xmlParser.parse(xml)
  }
  /**
   * Resolves a gene name to an accession through a tab-separated id-mapping file
   * and returns the parsed database entry for that accession.
   *
   * A row matches when it contains a `Gene_Name` column immediately followed by a
   * column equal to `geneName`; the row's first column is taken as the accession.
   * The first matching row wins and scanning stops there.
   *
   * @param idmappingPath - Path of the id-mapping file.
   * @param dbPath - Path of the XML database file.
   * @param geneName - Gene name to resolve (exact, case-sensitive).
   * @returns The parsed entry of the first accession mapped to `geneName`.
   * @throws Error when no row maps `geneName` to an accession.
   */
  const getEntryFromGeneName = async (idmappingPath: string, dbPath: string, geneName: string) => {
    let accession: string | undefined
    for await (const line of line$(idmappingPath)) {
      const [id, ...rest] = line.split('\t')
      const tag = rest.indexOf('Gene_Name')
      // Whole-column comparison: a prefix/regex match would confuse e.g. TP53 and TP53BP1.
      if (id !== undefined && tag !== -1 && rest[tag + 1] === geneName) {
        accession = id
        break
      }
    }
    // Fail loudly: looking up an undefined accession would return an unrelated entry.
    if (accession === undefined) {
      throw new Error(`No accession found for gene name "${geneName}" in ${idmappingPath}`)
    }
    return await getEnrty(dbPath, accession)
  }
  /**
   * Collects candidate interaction-partner symbols from a parsed entry.
   *
   * Three sources are combined:
   *  1. `label` of every interactant in comments of type "interaction"
   *     (labels equal to one of the entry's own accessions are dropped);
   *  2. the text after "INTERACTION WITH " in matching reference scopes, split
   *     on "; " and " AND ";
   *  3. upper-case tokens in "Interacts ..." lines of comments of type "subunit",
   *     after parenthesised remarks are removed.
   *
   * The merged candidates are sorted, reduced to tokens matching
   * `[A-Z]{2,}[A-Z0-9]{1,}`, stripped of a small stop-word list and
   * de-duplicated.
   *
   * @param json - Parsed entry, expected to look like `{ entry: { ... } }`.
   * @returns Unique symbols; empty when `json` has no `entry`.
   */
  const getInteractionsFromEntry = async (json: any): Promise<string[]> => {
    // A query result may be undefined (no match), a bare value (one match) or an array.
    const toArray = (value: any): any[] =>
      value === undefined || value === null ? [] : Array.isArray(value) ? value : [value]
    const isString = (value: unknown): value is string => typeof value === 'string'

    if (!json?.entry) return []
    const uniprotIDs = toArray(json.entry.accession)

    // `evaluate` is awaited so both synchronous and Promise-returning jsonata versions work.
    // Comment interactant
    const interactants = toArray(await jsonata(`entry.comment[type="interaction"].interactant`).evaluate(json))
    const genesInteractant = interactants
      .map((e: any) => e?.label)
      .filter((label: any) => label && !uniprotIDs.includes(label))

    // Reference scope = INTERACTION WITH
    const references = toArray(await jsonata(`entry.reference[scope ~> /INTERACTION WITH/i ]`).evaluate(json))
    const interactionTexts = references
      .map((ref: any) =>
        Array.isArray(ref?.scope)
          ? ref.scope.filter((s: any) => isString(s) && /INTERACTION WITH/i.test(s)).join()
          : ref?.scope
      )
      .filter(isString)
      .flatMap((txt) => {
        // Same case-insensitivity as the selector above; skip texts without the marker.
        const marker = /INTERACTION WITH /i.exec(txt)
        return marker ? [txt.substring(marker.index + marker[0].length)] : []
      })
    const genesScopeInter = [...new Set(interactionTexts)].flatMap((txt) => txt.split(/; | AND /))

    // Comment subunit
    const subunitTexts = toArray(await jsonata(`entry.comment[type="subunit"].text.value`).evaluate(json))
    const genesSubunit = subunitTexts
      .filter(isString)
      .flatMap((txt) => txt.replace(/ *\([^)]*\) */g, '').split(/\n/))
      .filter((sentence) => /Interacts/.test(sentence))
      .flatMap((sentence) => sentence.match(/[A-Z][A-Z0-9]{2,}/g) ?? [])

    const candidates = [...new Set([...genesScopeInter, ...genesInteractant, ...genesSubunit])]
      .filter(isString)
      .sort()

    const filterOut = ['PHOSPHOSERINE', 'MOTIFS', 'INFECTION', 'PROTEIN', 'MICROBIAL']
    const symbols = candidates
      .flatMap((candidate) => candidate.match(/[A-Z]{2,}[A-Z0-9]{1,}/g) ?? [])
      .filter((symbol) => !filterOut.includes(symbol))

    // Different candidates can yield the same token, so de-duplicate once more.
    return [...new Set(symbols)]
  }
  113. // const findDistance = async (idmappingPath: string, dbPath:string, geneNameA:string, geneNameB:string, maxDistance = 6) => {
  114. // let rounds = [[geneNameA]]
  115. // let tree = {[geneNameA]: {}} as {[key:string]:any}
  116. // let run = true
  117. // let a = tree
  118. // Object.keys(a).map((gene) => )
  119. // let nIter = 0
  120. // while(nIter <= maxDistance && run) {
  121. // for (const gA of rounds[nIter]) {
  122. // console.log(nIter,gA);
  123. // const tmp = await getInteractionsFromEntry(await getEntryFromGeneName(idmappingPath, dbPath, gA))
  124. // if (tmp.includes(geneNameB)) { run = false; break }
  125. // rounds.push(tmp)
  126. // }
  127. // nIter++
  128. // }
  129. // //console.log(rounds);
  130. // return nIter
  131. // }
  132. export { makeIndex, readOffset, getEnrty, getEntryFromGeneName, getInteractionsFromEntry }