// Thomas / clusterSam
							import { spawn } from 'child_process';
import fs from 'fs'
/* (c) Thomas Steimlé 2022 
 * cat bwa_mem_splitters_on_HG38_Viral.sam | awk '$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}' | more
 * Requires the following OS utilities to be available on the PATH: cat, awk, sort, uniq
 *
 */ 
/**
 * Runs a command through the system shell and streams its output to callbacks.
 *
 * NOTE(security): `shell: true` makes Node join `prog` and `args` into a single
 * shell command line. Callers rely on this to build pipelines (`|`), but it
 * also means nothing is escaped — never pass untrusted text in `prog`/`args`.
 *
 * @param prog   Program to launch (first word of the shell command line).
 * @param args   Remaining words of the command line; may contain shell operators.
 * @param onData Called with each stdout chunk, decoded as UTF-8 text. Chunks
 *               are arbitrary slices of the stream, not whole lines.
 * @param onErr  Called with each stderr chunk, converted to a trimmed string.
 * @returns A promise resolving to the exit code (or `null` if the process was
 *          killed by a signal) once the process has ended AND its stdio
 *          streams are closed; rejects if the process could not be spawned.
 */
const async_exec = (
    prog: string,
    args: string[],
    onData: (chunk: string) => void,
    onErr: (message: string) => void
): Promise<number | null> => {
    return new Promise((resolve, reject) => {
        const child = spawn(prog, args, {shell: true})

        // Decode in the stream so multi-byte characters split across two
        // chunks are reassembled instead of being corrupted.
        child.stdout.setEncoding('utf8')
        child.stdout.on('data', chunk => onData(chunk))
        child.stderr.on('data', chunk => onErr(chunk.toString().trim()))

        child.on('error', err => reject(err))
        // 'close' (unlike 'exit') fires only after stdout/stderr are fully
        // drained, so every chunk has reached onData before we resolve.
        child.on('close', code => resolve(code))
    })
}

/** One `read <TAB> contig <TAB> position` record produced by the awk step. */
interface SamHit {
    /** Read (query) name: first column of the pipeline output. */
    rname: string;
    /** Reference/contig name: second column of the pipeline output. */
    contig: string;
    /** Alignment position: third column of the pipeline output. */
    position: number;
}

/** A run of hits on one contig whose consecutive positions are within the threshold. */
interface PositionCluster {
    start: number;
    end: number;
    /** Read names in ascending position order (duplicates possible). */
    reads: string[];
}

/** A group of reads that hit exactly the same set of position clusters. */
interface ReadCluster {
    clusterName: string;
    rnames: string[];
}

/** Parses one pipeline output line; returns `undefined` for blank or malformed lines. */
const parseSamHit = (line: string): SamHit | undefined => {
    const [rname, contig, rawPosition] = line.split('\t')
    if (rname === undefined || contig === undefined || rawPosition === undefined) {
        return undefined
    }
    const position = Number(rawPosition)
    return Number.isFinite(position) ? {rname, contig, position} : undefined
}

/**
 * Sorts the hits of a single contig by position and cuts a new cluster every
 * time the gap to the previous hit is strictly greater than `threshold`.
 */
const clusterByPosition = (hits: readonly SamHit[], threshold: number): PositionCluster[] => {
    // Copy before sorting so the caller's array is left untouched.
    const ordered = [...hits].sort((a, b) => a.position - b.position)
    const clusters: PositionCluster[] = []
    let current: PositionCluster | undefined
    for (const {rname, position} of ordered) {
        if (current === undefined || position - current.end > threshold) {
            current = {start: position, end: position, reads: []}
            clusters.push(current)
        }
        // Hits are sorted, so the latest position is always the cluster maximum.
        current.end = position
        current.reads.push(rname)
    }
    return clusters
}

/**
 * Clusters the alignments of one or more SAM files by genomic position and
 * groups reads by the set of clusters they hit.
 *
 * The files are streamed through `cat | awk | sort | uniq`; the awk program
 * drops header lines and prints `read, contig, position` for every record,
 * plus one extra line for the first alternative hit of an `XA:Z:` tag.
 *
 * NOTE(security): the input paths are spliced into a shell command line
 * without quoting (see `async_exec`), so they must come from a trusted source
 * and must not contain spaces or shell metacharacters.
 *
 * @param input_sam One path, or several paths (joined with spaces for `cat`).
 * @param threshold Maximum gap between two consecutive positions of the same
 *                  contig for them to stay in the same cluster.
 * @param minReads  Groups containing fewer reads than this are discarded.
 * @param blackList Optional contig names whose hits are ignored.
 * @returns A promise of `{clusterName, rnames}[]` sorted by descending number
 *          of reads. `clusterName` is `contig:start-end(index)`, with several
 *          such labels joined by `<>` when the reads hit several clusters.
 *          Typed `any` to stay compatible with existing callers.
 */
const clusterSam = async (
    input_sam: string | Array<string>,
    threshold: number,
    minReads : number,
    blackList ?: string[]
): Promise<any> => {
    const inputSam = Array.isArray(input_sam) ? input_sam.join(' ') : input_sam
    const excludedContigs = new Set(blackList ?? [])
    const hitsByContig = new Map<string, SamHit[]>()

    const ingest = (line: string): void => {
        const hit = parseSamHit(line)
        if (hit === undefined || excludedContigs.has(hit.contig)) {
            return
        }
        const bucket = hitsByContig.get(hit.contig)
        if (bucket) {
            bucket.push(hit)
        } else {
            hitsByContig.set(hit.contig, [hit])
        }
    }

    // stdout arrives in arbitrary chunks: keep the unfinished tail of each
    // chunk and prepend it to the next one.
    let pending = ''
    await async_exec('cat', [
        inputSam,
        '|',
            'awk', '\'$0~/^@/{next}{lxa=split($0,xa,"XA:Z:"); print $1"\t"$3"\t"$4; if(lxa>1){split(xa[2],xap,","); print $1"\t"xap[1]"\t"substr(xap[2],2)"\tXA"}}\'', //skip header
        '|',
            'sort',
        '|',
            'uniq'
        ], (chunk: string) => {
            const lines = (pending + String(chunk)).split('\n')
            pending = lines.pop() ?? ''
            lines.forEach(ingest)
        }, console.log)
    // A last line without a trailing newline would otherwise be lost.
    if (pending !== '') {
        ingest(pending)
    }

    // Cluster key is `index@contig`; each read collects the keys it belongs to.
    const labelByKey = new Map<string, string>()
    const keysByRead = new Map<string, Set<string>>()
    for (const [contig, hits] of hitsByContig) {
        clusterByPosition(hits, threshold).forEach(({start, end, reads}, index) => {
            const key = `${index}@${contig}`
            labelByKey.set(key, `${contig}:${start}-${end}(${index})`)
            for (const read of reads) {
                const keys = keysByRead.get(read)
                if (keys) {
                    keys.add(key)
                } else {
                    keysByRead.set(read, new Set([key]))
                }
            }
        })
    }

    // Reads sharing the exact same (sorted) set of clusters form one group.
    const readsBySignature = new Map<string, string[]>()
    for (const [read, keys] of keysByRead) {
        const signature = [...keys]
            .sort()
            .map(key => labelByKey.get(key) ?? key)
            .join('<>')
        const members = readsBySignature.get(signature)
        if (members) {
            members.push(read)
        } else {
            readsBySignature.set(signature, [read])
        }
    }

    const result: ReadCluster[] = [...readsBySignature]
        .filter(([, rnames]) => !(rnames.length < minReads))
        .map(([clusterName, rnames]) => ({clusterName, rnames}))
        .sort((a, b) => b.rnames.length - a.rnames.length)
    return result
}

export { clusterSam }


/*(async () => {
    const bl = (await fs.promises.readFile('/home/thomas/Documents/Programmes/ttest/blackListRNA.txt')).toString().split('\n')
    console.log(bl);

    console.log(await clusterSam('/home/thomas/Documents/Programmes/ttest/bwa_mem_splitters_on_human_rna.sam', 333, 10, bl));
})()*/