index.ts 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. import { spawn } from 'child_process';
  2. /* (c) Thomas Steimlé 2022
  3. * cat bwa_mem_splitters_on_HG38_Viral.sam | awk '$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}' | more
  4. * Requires these OS commands to be available on the PATH: cat, awk, sort, uniq
  5. *
  6. */
  /**
   * Run a command line through the system shell and stream its output to callbacks.
   *
   * `prog` and `args` are handed to `spawn` with `shell: true`, so shell syntax
   * passed as arguments (pipes, redirections, quoting) is interpreted by the shell.
   * NOTE(review): because of that, nothing here is escaped — never pass untrusted
   * text in `prog` or `args`.
   *
   * @param prog   Command to run.
   * @param args   Arguments appended to the command.
   * @param onData Called with every raw chunk the child writes to stdout (not
   *               converted, not trimmed).
   * @param onErr  Called with every stderr chunk, converted to a trimmed string.
   * @returns A promise resolved with the exit code once the child has ended AND
   *          its stdio streams are closed (`null` if it was ended by a signal).
   *          Rejects if the child emits an `error` event.
   */
  const async_exec = (
    prog: string,
    args: string[],
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers annotate the chunk as string or Buffer
    onData: (chunk: any) => void,
    onErr: (message: string) => void
  ): Promise<number | null> => {
    return new Promise<number | null>((resolve, reject) => {
      const child = spawn(prog, args, { shell: true })

      child.stdout.on('data', (data) => onData(data))
      child.stderr.on('data', (data) => onErr(data.toString().trim()))
      child.on('error', (err) => reject(err))
      // 'close' (unlike 'exit') is emitted only after the stdio streams have been
      // closed, so every chunk has been delivered to the callbacks before resolving.
      child.on('close', (code) => resolve(code))
    })
  }
  /**
   * Cluster alignment positions from one or more SAM files and group the reads
   * by the set of clusters they hit.
   *
   * The files are streamed through `cat | awk | sort | uniq`. The awk script skips
   * header lines (starting with `@`) and prints, tab separated, fields 1, 3 and 4
   * of every record (read name, reference/contig name, position). When a record
   * carries an `XA:Z:` tag, the first alternative hit listed in it is printed as
   * an extra line (contig and position with the strand sign stripped).
   *
   * For each contig the hits are sorted by position; a new cluster starts whenever
   * the distance to the previous hit exceeds `threshold`. A cluster is labelled
   * `contig:first-last(index)`, or `contig:position(index)` when the contig has a
   * single hit. Each read is then keyed by its sorted cluster labels joined with
   * `<--->`, and reads sharing the same key are grouped together.
   *
   * @param input_sam Path, or list of paths, of the SAM file(s) to read.
   * @param threshold Maximum distance between two consecutive positions of a cluster.
   * @param minReads  Groups with fewer reads than this are discarded.
   * @param blackList Contig names to ignore.
   * @returns Groups as `{ clusterName, rnames }`, largest group first.
   * @throws Rejects if the shell pipeline cannot be run.
   */
  const clusterSam = async (
    input_sam: string | Array<string>,
    threshold: number,
    minReads : number,
    blackList ?: string[]
  ): Promise<Array<{ clusterName: string; rnames: string[] }>> => {
    interface ReadHit {
      rname: string;     // read name (first field printed by awk)
      position: number;  // position of the hit on the contig
    }

    // NOTE(review): the paths are pasted unquoted into a shell command line, so a
    // path containing spaces or shell metacharacters will not be read correctly.
    const inputSam = Array.isArray(input_sam) ? input_sam.join(' ') : input_sam

    // Prototype-less dictionaries: keys come from the file, and this keeps the
    // iteration order of plain objects without clashing with Object.prototype.
    const byContigs: Record<string, ReadHit[]> = Object.create(null)

    // Parse one "read <TAB> contig <TAB> position [<TAB> XA]" line.
    const addLine = (line: string): void => {
      const [rname, contig, rawPosition] = line.split(/\t/)
      const position = Number(rawPosition)
      // Skip blank or malformed lines instead of storing empty names / NaN positions.
      if (!rname || !contig || !rawPosition || Number.isNaN(position)) return
      if (blackList && blackList.includes(contig)) return

      const hits = byContigs[contig]
      if (hits === undefined) {
        byContigs[contig] = [{ rname, position }]
      } else {
        hits.push({ rname, position })
      }
    }

    // Chunks do not end on line boundaries: keep the unfinished tail for the next one.
    let pending = ''
    await async_exec('cat', [
      inputSam,
      '|',
      'awk', '\'$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}\'', //skip header
      '|',
      'sort',
      '|',
      'uniq'
    ], (chunk: unknown) => {
      const lines = (pending + String(chunk)).split(/\n/)
      pending = lines.pop() ?? ''
      lines.forEach(addLine)
    }, console.log)
    // Flush a last line that was not terminated by a newline.
    if (pending !== '') addLine(pending)

    // read name -> ids ("index@contig") of the clusters it belongs to
    const byReads: Record<string, string[]> = Object.create(null)
    // cluster id -> display label "contig:range(index)"
    const clusterLabels: Record<string, string> = Object.create(null)

    for (const [contig, hits] of Object.entries(byContigs)) {
      hits.sort((a, b) => a.position - b.position)

      let clusterIndex = 0
      let clusterStart = 0
      let previous: ReadHit | undefined = undefined

      for (const hit of hits) {
        if (previous === undefined) {
          clusterStart = hit.position
        } else if (Math.abs(hit.position - previous.position) > threshold) {
          // Gap too large: close the current cluster and open the next one.
          clusterLabels[clusterIndex + '@' + contig] =
            contig + ':' + clusterStart + '-' + previous.position + '(' + clusterIndex + ')'
          clusterIndex += 1
          clusterStart = hit.position
        }

        const clusterId = clusterIndex + '@' + contig
        const ids = byReads[hit.rname]
        if (ids === undefined) {
          byReads[hit.rname] = [clusterId]
        } else if (!ids.includes(clusterId)) {
          ids.push(clusterId)
        }
        previous = hit
      }

      // Close the last open cluster; without this its range is never recorded and
      // the cluster name would contain "undefined".
      if (previous !== undefined) {
        const range = hits.length === 1
          ? String(clusterStart)
          : clusterStart + '-' + previous.position
        clusterLabels[clusterIndex + '@' + contig] = contig + ':' + range + '(' + clusterIndex + ')'
      }
    }

    // cluster-set name -> reads sharing exactly that set of clusters
    const byClusters: Record<string, string[]> = Object.create(null)
    for (const [rname, ids] of Object.entries(byReads)) {
      // Labels are looked up by id rather than re-split on '@', so contig names
      // that contain '@' are handled.
      const clusterName = [...ids].sort().map(id => clusterLabels[id]).join('<--->')
      const rnames = byClusters[clusterName]
      if (rnames === undefined) {
        byClusters[clusterName] = [rname]
      } else {
        rnames.push(rname)
      }
    }

    return Object.entries(byClusters)
      .filter(([, rnames]) => !(rnames.length < minReads))
      .map(([clusterName, rnames]) => ({ clusterName, rnames }))
      .sort((a, b) => b.rnames.length - a.rnames.length)
  }
  126. export { clusterSam }
  127. /*
  128. (async () => {
  129. console.log(await clusterSam('/home/thomas/Documents/Programmes/ttest/bwa_mem_splitters_on_HG38_Viral.sam', 333, 55,
  130. ['NR_145819.1', 'NR_145822.1']));
  131. })()
  132. */