import { spawn } from 'child_process'; /* (c) Thomas Steimlé 2022 * cat bwa_mem_splitters_on_HG38_Viral.sam | awk '$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}' | more * require os : cat, awk, sort, uniq * */ const async_exec = (prog: string, args: string[], onData: Function, onErr: Function) => { return new Promise((resolve, reject) => { const child = spawn(prog, args, {shell: true}) child.stdout.on('data', data => onData(data/*.toString().trim()*/)) child.stderr.on('data', data => onErr(data.toString().trim())) child.on('error', err => reject(err)) child.on('exit', code => resolve(code)) }) } const clusterSam = ( input_sam: string | Array, threshold: number, minReads : number ) => { return new Promise( async (resolve, _reject) => { let inputSam: string = Array.isArray(input_sam) ? input_sam.join(' ') : input_sam let lineAcc: string = '' interface position { rname: string; position: number; } interface byContigs { [key: string]: position[] } let byContigs: byContigs = {} await async_exec('cat', [ inputSam, '|', 'awk', '\'$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}\'', //skip header '|', 'sort', '|', 'uniq' ], (m: string) => { let tmpSeq: string[] = (lineAcc + m).split(/\n/) lineAcc = tmpSeq.pop() ! // 'uck typescript tmpSeq.map(e => { let tmpName: string = '' let tmpPos: position = {rname: '', position: 0} e.split(/\t/).map((el, i) => { switch (i) { case 0: tmpPos['rname'] = el break; case 1: tmpName = el break; case 2: tmpPos['position'] = Number(el) break; default: break; } }) if (Array.isArray(byContigs[tmpName])) { byContigs[tmpName].push(tmpPos) } else { byContigs[tmpName] = [tmpPos] } }) }, console.log) interface byReads { [key: string]: string[] } let byReads: byReads = {} Object .keys(byContigs) .map(name => { let cluster = 0 byContigs[name] .sort((a, b) => a.position - b.position) .map((e, i, a) => { cluster = Math.abs(e.position - a[i-1]?.position) > threshold ? cluster + 1 : cluster const clutserName = cluster + '@' + name byReads[e.rname] = Array.isArray(byReads[e.rname]) ? [... new Set([...byReads[e.rname], clutserName])] : [clutserName] }) }) interface byClusters { [key: string]: string[] } let byClusters: byClusters = {} Object.keys(byReads).map(rname => { const tmpClusterName = byReads[rname].sort().join('<--->') byClusters[tmpClusterName] = Array.isArray(byClusters[tmpClusterName]) ? [... new Set([...byClusters[tmpClusterName], rname])] : [rname] }) Object.keys(byClusters).map(e => byClusters[e].length < minReads ? delete byClusters[e] : null); resolve((Object.keys(byClusters).map(clusterName => ({clusterName, rnames: byClusters[clusterName]})).sort((a:any,b:any) => b.rnames.length - a.rnames.length)) ) }) } export { clusterSam } /* (async () => { console.log(await clusterSam('/home/thomas/Documents/Programmes/ttest/bwa_mem_splitters_on_HG38_Viral.sam', 333, 55)); })() */