/******************************************************************************* * Requires Groovy 2+ * * The easiest way to install Groovy is to install Gvmtool http://gvmtool.net/ * then run: gvm install groovy ******************************************************************************/ @Grab( 'com.bloidonia:groovy-common-extensions:0.5.5' ) @Grab('com.xlson.groovycsv:groovycsv:1.0') import static com.xlson.groovycsv.CsvParser.parseCsv @Grab( 'org.gsheets.kktec:gsheets:0.3.2a' ) import org.gsheets.building.* import groovy.transform.* // Add to File for showing progress whilst parsing large files File.metaClass.withSpinnerReader = { Closure cl -> new SpinnerReader( delegate.newReader() ).withClosable cl } // Get the root of this script File root = new File( getClass().protectionDomain.codeSource.location.toURI() ).parentFile // And the folders we're going to be writing to File inputs = new File( root, 'Inputs' ) File outputs = new File( root, 'Outputs' ) outputs.mkdirs() /******************************************************************************* * Check input data ******************************************************************************/ File cosmicData = new File( inputs, 'CosmicCellLineProject_v67_241013.tsv' ) File ccleData = new File( inputs, 'CCLE_hybrid_capture1650_hg19_NoCommonSNPs_NoNeutralVariants_CDS_2012-2.05.07.maf' ) File screenedData = new File( inputs, 'genes_screened_by_ccle_hybrid_capture.csv' ) File ccleClassification = new File( inputs, 'CCLE_sample_info_file_2012-10-18.txt' ) File censusData = new File( inputs, 'cancer_gene_census list.csv' ) File allVariantData = new File( inputs, 'CCLE_hybrid_capture1650_hg19_allVariants_2012.05.07.maf' ) Map gatkData = [ H2009: new File( inputs, 'JB3-1-H2009_CGATGT_L001.csv' ), H1437: new File( inputs, 'JB3-2-H1437_GCCAAT_L001.csv' ), H2122: new File( inputs, 'JB3-3-H2122_TAGCTT_L001.csv' ), H2087: new File( inputs, 'JB3-4-H2087_CTTGTA_L001.csv' ) ] if( [ cosmicData, ccleData, screenedData, ccleClassification, censusData, allVariantData ].collect { if( !it.exists() ) { println "$it does not exist. Please download and try again." ; true } else { false } }.any() ) { System.exit( -1 ) } boolean gatkAvailable = gatkData.values().every { it.exists() } /******************************************************************************* * Load Data ******************************************************************************/ Project cosmic = parse( cosmicData ) { row -> [ sample : row[ 'Sample name' ].replace( '-', '' ).replace( '(', '' ).replace( ')', '' ).toUpperCase(), gene : row[ 'Gene name' ].tokenize( '_' ).head(), amino : row[ 'AA_MUT_SYNTAX' ].trim(), location : row[ 'GRCh37 genome position' ].tokenize( ':' ).with { x, y -> "${( x == '23' ? 'X' : x == '24' ? 'Y' : x )}:$y" } ] } Project ccle = parse( ccleData ) { row -> [ sample : row[ 'Tumor_Sample_Barcode' ].tokenize( '_' ).head().toUpperCase(), gene : row[ 'Hugo_Symbol' ], amino : row[ 'Protein_Change' ].trim(), location : "${row[ 'Chromosome' ]}:${row[ 'Start_position' ]}-${row[ 'End_position' ]}" ] } Project gatk = !gatkAvailable ? null : parse( gatkData ) { row -> [ dbSNP_id : row[ 'dbSNP_id' ], func : row[ 'Func.refGene' ], amino : row[ 'AAChange.refGene' ].tokenize( ',' ).collect { v -> v ? v.tokenize( ':' ).with { c -> [ gene:c[ 0 ], amino:c[ 4 ] ] } : '' }.unique { it.gene }, location : "${row[ 'chr' ] - 'chr'}:${row['start']}-${row['end']}" ] } Set screened = screenedData.withSpinnerReader { r -> parseCsv( r, separator:',' ).collect { row -> row[ 'HGNC Symbol' ] ?: row[ 'Symbol' ] } } Map cellLineClassification = ccleClassification.withSpinnerReader { r -> parseCsv( r, separator:'\t' ).collectEntries { row -> [ row[ 'CCLE name' ].tokenize( '_' ).head().toUpperCase(), row[ 'Site Primary' ] ] } } Set cancerCensusGenes = censusData.withSpinnerReader { r -> parseCsv( r, separator:',' ).collect { row -> row[ 'Symbol' ] } } /******************************************************************************* * Filter by the CCLE screened list ******************************************************************************/ Set cosmicGenes = cosmic.cellLines.values().collectMany { it.mutations.keySet() } as Set Set ccleGenes = ccle.cellLines.values().collectMany { it.mutations.keySet() } as Set Set reportedByCCLEButNotInTheirScreenedList = ccleGenes - ccleGenes.intersect( screened ) int removed = 0 ccle.cellLines.each { clName, cl -> reportedByCCLEButNotInTheirScreenedList.each { badGene -> removed++ cl.mutations.remove( badGene ) } } println "Removed $removed CCLE mutations in genes not mentioned as screened by CCLE" println "" /******************************************************************************* * Cosmic Gene name alignment ******************************************************************************/ int renamed = renameGenes( cosmic ) println "Renamed $renamed cosmic genes to align with CCLE" println "" cosmicGenes = cosmic.cellLines.values().collectMany { it.mutations.keySet() } as Set ccleGenes = ccle.cellLines.values().collectMany { it.mutations.keySet() } as Set gatkGenes = gatk?.cellLines?.values()?.collectMany { it.mutations.keySet() } as Set println "${cosmicGenes.size()} cosmic genes found in CosmicCellLineProject_v67_241013" println "${ccleGenes.size()} ccle genes found in CCLE_hybrid_capture maf file" println "${cosmicGenes.intersect( ccleGenes ).size()} genes in both ccle and cosmic" println "${screened.size()} genes in the genes_screened_by_ccle_hybrid_capture file" println "" println "${cosmicGenes.intersect( screened ).size()} genes in both cosmic and the genes_screened_by_ccle_hybrid_capture file" println "${ccleGenes.intersect( screened ).size()} genes in both ccle and the genes_screened_by_ccle_hybrid_capture file" println "" /******************************************************************************* * Filter cell lines ******************************************************************************/ cosmic.cellLines = cosmic.cellLines.findAll { name, cl -> name in ccle.cellLines.keySet() } ccle.cellLines = ccle.cellLines.findAll { name, cl -> name in cosmic.cellLines.keySet() } /******************************************************************************* * Remove invalid or synonymous mutations ******************************************************************************/ cosmic.cellLines.each { cellLineName, cellLine -> cellLine.mutations = cellLine.mutations .subMap( cosmicGenes.intersect( screened ) ) .collectEntries { geneName, mutations -> [ geneName, mutations?.findAll { it.valid && !it.synonymous } ] } .findAll { it.value } } ccle.cellLines.each { cellLineName, cellLine -> cellLine.mutations = cellLine.mutations .subMap( ccleGenes.intersect( screened ) ) .collectEntries { geneName, mutations -> [ geneName, mutations?.findAll { it.valid && !it.synonymous } ] } .findAll { it.value } } gatk?.cellLines?.each { cellLineName, cellLine -> cellLine.mutations = cellLine.mutations .subMap( gatkGenes.intersect( screened ) ) .collectEntries { geneName, mutations -> [ geneName, mutations?.findAll { am -> def amatch = am.amino =~ /^p\.([A-Z]+)(\d*)([A-Z]+)$/ def avalid = amatch.matches() && amatch[0][1] != 'X' && amatch[0][3] != 'X' def asynon = avalid ? amatch[ 0 ][ 1 ] == amatch[ 0 ][ 3 ] : false avalid && !asynon } ] } .findAll { it.value } } if( gatk ) { dumpGatkIntersection( outputs, 'GATK_incDBSNP', gatk, cosmic, ccle ) /******************************************************************************* * Remove dbSNP mutations from GATK ******************************************************************************/ gatk.cellLines.each { cellLineName, cellLine -> cellLine.mutations = cellLine.mutations .collectEntries { geneName, mutations -> [ geneName, mutations?.findAll { it.valid && !it.synonymous } ] } .findAll { it.value } } dumpGatkIntersection( outputs, 'GATK', gatk, cosmic, ccle ) generateExcelFileFromGATKOutput( outputs ) } /******************************************************************************* * Generate CellLine overview ******************************************************************************/ int ccleTotal = 0 int cosmicTotal = 0 int intersectTotal = 0 cosmic.cellLines.each { lineName, cellLine -> def cos = cellLine.mutations.collectMany { k, v -> v } def ccl = ccle.cellLines[ lineName ].mutations.collectMany { k, v -> v } def inter = cos.intersect( ccl ) cos = cos - inter ccl = ccl - inter ccleTotal += ccl.size() cosmicTotal += cos.size() intersectTotal += inter.size() } println "" println "TOTAL ACROSS ALL LINES COSMIC:$cosmicTotal, CCLE:$ccleTotal, INTERSECT:$intersectTotal" File cellLineOutput = new File( outputs, 'CELLLINE' ) cellLineOutput.mkdirs() dumpCellLineData( new File( cellLineOutput, 'CellLineComparison.csv' ), cellLineClassification, cosmic, ccle ) dumpMutationData( outputs, inputs, cosmic, ccle, cancerCensusGenes, allVariantData ) /******************************************************************************* * Utility methods ******************************************************************************/ Project parse( Map gatkFiles, Closure data ) { int missing = 0 int synonymous = 0 int mutations = 0 new Project( name: 'GATK Data' ).with { project -> gatkFiles.each { cellLineName, f -> println "READING $f.name" cellLineName = "NCI$cellLineName".toString() f.withSpinnerReader { Reader r -> parseCsv( r, separator:',' ).eachWithIndex { row, idx -> def d = data( row ) if( d.func == 'exonic' && d.amino ) { d.amino.each { am -> // aminos is a list of AA mutations def amatch = am.amino =~ /^p\.([A-Z]+)(\d*)([A-Z]+)$/ def avalid = amatch.matches() && d.dbSNP_id == '.' && amatch[0][1] != 'X' && amatch[0][3] != 'X' def asynon = avalid ? amatch[ 0 ][ 1 ] == amatch[ 0 ][ 3 ] : false def mut = new Mutation( gene: am.gene, amino: am.amino, valid: avalid, location: d.location, synonymous: asynon ) project.cellLines[ cellLineName ].mutations[ am.gene ] << mut mutations++ if( avalid && asynon ) { synonymous++ } if( !avalid ) { missing++ } } } else { d.gene.each { gene -> def mut = new Mutation( gene: gene, amino:'', valid: false, location: d.location, synonymous: false ) project.cellLines[ cellLineName ] .mutations[ gene ] << mut mutations++ missing++ } } } } } if( mutations ) { println "- Successfully read $mutations mutations" } if( missing ) { println "- $missing mutations marked with invalid amino, dbSNP or cds changes" } if( synonymous ) { println "- $synonymous valid mutations marked as synonymous mutations" } println "" project } } Project parse( File f, Closure data ) { println "Parsing $f.name" int skipped = 0 int missing = 0 int synonymous = 0 int mutations = 0 Project p = new Project( name: f.name ).with { project -> f.withSpinnerReader { Reader r -> parseCsv( r, separator:'\t' ).eachWithIndex { row, idx -> if( row.values.size() < row.columns.size() ) { skipped++ } else { def d = data( row ) def amatch = d.amino =~ /^p\.([A-Z]+)(\d*)([A-WYZ]+)$/ def avalid = amatch.matches() def asynon = avalid ? amatch[ 0 ][ 1 ] == amatch[ 0 ][ 3 ] : false project.cellLines[ d.sample ] .mutations[ d.gene ] << new Mutation( gene : d.gene, amino : d.amino, valid : avalid, synonymous : asynon, location : d.location ) mutations++ if( avalid && asynon ) { synonymous++ } if( !avalid ) { missing++ } } } } if( mutations ) { println "- Successfully read $mutations mutations" } if( skipped ) { println "- Skipped $skipped bad rows" } if( missing ) { println "- $missing mutations marked with invalid amino or cds changes" } if( synonymous ) { println "- $synonymous valid mutations marked as synonymous mutations" } println "" project.cellLines.each { k, v -> v.mutations = v.mutations as TreeMap } project.cellLines = project.cellLines as TreeMap project } int adj = 0 int mon = 0 p.cellLines.each { name, cellLine -> cellLine.mutations.each { gene, muts -> muts.sort( false ) { it.start } .inject { prev, m -> if( prev.start == m.start - 1 ) { if( prev.valid ) { prev.valid = false adj++ } if( m.valid ) { m.valid = false adj++ } } m } muts.each { if( it.valid && it.start != it.end ) { it.valid = false mon++ } } } } println "Marked $adj adjacent mutations and $mon non-monomer mutations as invalid" p } int renameGenes( Project p ) { int renamed = 0 List changed = [] p.cellLines.each { cellLineName, cellLine -> cellLine.mutations = cellLine.mutations.collectEntries { geneName, mutations -> switch( geneName ) { case 'IGH': renamed++ ; changed << geneName ; return [ 'IGH@': mutations ] case 'IGK': renamed++ ; changed << geneName ; return [ 'IGK@': mutations ] case 'TRB': renamed++ ; changed << geneName ; return [ 'TRB@': mutations ] case 'TRA': renamed++ ; changed << geneName ; return [ 'TRA@': mutations ] case 'TRD': renamed++ ; changed << geneName ; return [ 'TRD@': mutations ] case 'POMK': renamed++ ; changed << geneName ; return [ 'SgK196': mutations ] case 'IGL': renamed++ ; changed << geneName ; return [ 'IGL@': mutations ] case 'DUX10': renamed++ ; changed << geneName ; return [ 'DUX4': mutations ] case 'MYST4': renamed++ ; changed << geneName ; return [ 'KAT6B': mutations ] case 'MYST3': renamed++ ; changed << geneName ; return [ 'KAT6A': mutations ] case 'TBCK': renamed++ ; changed << geneName ; return [ 'TBCKL': mutations ] case 'MGC52498': renamed++ ; changed << geneName ; return [ 'FAM159A': mutations ] case 'MYST2': renamed++ ; changed << geneName ; return [ 'KAT7': mutations ] case 'ITIH5L': renamed++ ; changed << geneName ; return [ 'ITIH6': mutations ] case 'NALP6': renamed++ ; changed << geneName ; return [ 'NLRP6': mutations ] case 'LPP-LRFT': renamed++ ; changed << geneName ; return [ 'C12orf9': mutations ] case 'XTP4': renamed++ ; changed << geneName ; return [ 'MEIN1': mutations ] case 'CNB': renamed++ ; changed << geneName ; return [ 'PPP3R1': mutations ] case 'RAGE': renamed++ ; changed << geneName ; return [ 'MOK': mutations ] case 'AMPK1': renamed++ ; changed << geneName ; return [ 'PRKAA1': mutations ] case 'BID3': renamed++ ; changed << geneName ; return [ 'HRK': mutations ] case 'TNG1': renamed++ ; changed << geneName ; return [ 'TCL6': mutations ] case 'SSX2A': renamed++ ; changed << geneName ; return [ 'SSX2': mutations ] case 'NESK': renamed++ ; changed << geneName ; return [ 'NRK': mutations ] case 'C15orf21': renamed++ ; changed << geneName ; return [ 'HMGN2P46': mutations ] // alt alt names case 'IGHDY1': renamed++ ; changed << geneName ; return [ 'IGH@': mutations ] case 'TCRB': renamed++ ; changed << geneName ; return [ 'TRB@': mutations ] case 'TCRA': renamed++ ; changed << geneName ; return [ 'TRA@': mutations ] case 'TCRD': renamed++ ; changed << geneName ; return [ 'TRD@': mutations ] case 'IGLC6': renamed++ ; changed << geneName ; return [ 'IGL@': mutations ] case 'RDX12': renamed++ ; changed << geneName ; return [ 'MEIN1': mutations ] case 'CALNB1': renamed++ ; changed << geneName ; return [ 'PPP3R1': mutations ] case 'AMPK': renamed++ ; changed << geneName ; return [ 'PRKAA1': mutations ] case 'MOZ2': renamed++ ; changed << geneName ; return [ 'KAT6B': mutations ] case 'MOZ': renamed++ ; changed << geneName ; return [ 'KAT6A': mutations ] case 'HBO1': renamed++ ; changed << geneName ; return [ 'KAT7': mutations ] case 'RDX12': renamed++ ; changed << geneName ; return [ 'MEIN1': mutations ] case 'CALNB1': renamed++ ; changed << geneName ; return [ 'PPP3R1': mutations ] case 'RAGE1': renamed++ ; changed << geneName ; return [ 'MOK': mutations ] case 'DP5': renamed++ ; changed << geneName ; return [ 'HRK': mutations ] case 'TNG2': renamed++ ; changed << geneName ; return [ 'TCL6': mutations ] // alt alt alt names case 'AMPKa1': renamed++ ; changed << geneName ; return [ 'PRKAA1': mutations ] default : return [ (geneName): mutations ] } } } println "${changed.unique().size()} gene names renamed ($renamed instances) from ${changed.unique()}" renamed } @Field FilenameFilter csvFilter = { d, f -> f.endsWith( '.csv' ) } void writeFilesToExcel( File outputFile, List csvs ) { def dumpRow = { l -> l.split( ',' ).with { toks -> toks.take( 1 ) + toks.drop( 1 ).collect { try { Double.valueOf( it ) } catch( e ) { it } } } } new WorkbookBuilder( true ).with { wb -> int maxCol = 0 def book = wb.workbook { csvs.each { csv -> nextRowNum = 0 sheet( csv.name ) { csv.file.eachLine { l -> dumpRow( l ).with { r -> maxCol = Math.max( maxCol, r.size() ) row( r ) } } autoColumnWidth( maxCol ) } } } outputFile.withOutputStream { os -> book.write( os ) } } } void generateExcelFileFromGATKOutput( File root ) { def nosnp = new File( root, 'GATK' ).listFiles( csvFilter ).sort { it.name } def snp = new File( root, 'GATK_incDBSNP' ).listFiles( csvFilter ).sort { it.name } def csvs = [nosnp,snp].transpose().collectMany { nosnpf, snpf -> [ [ name:( nosnpf.name - '.csv' ).capitalize() + ' dbSNPs removed', file: nosnpf ], [ name:( snpf.name - '.csv' ).capitalize() + ' dbSNPs included', file: snpf ] ] } writeFilesToExcel( new File( root, 'GATKIntersection.xlsx' ), csvs ) } @CompileStatic List intersect( String firstKey, List firstList, String secondKey, List secondList ) { firstList.findResults { Mutation i -> List others = secondList.findAll { Mutation it -> it == i }.toList() if( others ) { assert others.amino.size() == 1 if( i.amino instanceof Map ) { new Mutation( gene:i.gene, amino:((Map)i.amino) + [ (secondKey):others.amino[0] ], location:i.location, valid:i.valid, synonymous:i.synonymous ) } else { new Mutation( gene:i.gene, amino:[ (firstKey):i.amino, (secondKey):others.amino[0] ], location:i.location, valid:i.valid, synonymous:i.synonymous ) } } else { null } }.toList() } void dumpGatkIntersection( File root, String folder, Project gatk, Project cosmic, Project ccle ) { gatk.cellLines.each { cellLineName, cellLine -> List gatkMutations = cellLine.mutations.values()?.collectMany { it }?.sort { it.gene } ?: [] List cosmicMutations = cosmic.cellLines[ cellLineName ]?.mutations?.values()?.collectMany { it }?.sort { it.gene } ?: [] List ccleMutations = ccle.cellLines[ cellLineName ]?.mutations?.values()?.collectMany { it }?.sort { it.gene } ?: [] List gatkCosmicCcle = intersect( null, intersect( 'cosmic', cosmicMutations, 'gatk', gatkMutations ), 'ccle', ccleMutations ) List gatkCosmic = intersect( 'gatk', gatkMutations, 'cosmic', cosmicMutations ) - gatkCosmicCcle List gatkCcle = intersect( 'gatk', gatkMutations, 'ccle', ccleMutations ) - gatkCosmicCcle List ccleCosmic = intersect( 'ccle', ccleMutations, 'cosmic', cosmicMutations ) - gatkCosmicCcle List gatkOnly = gatkMutations - cosmicMutations - ccleMutations List cosmicOnly = cosmicMutations - ccleMutations - gatkMutations List ccleOnly = ccleMutations - cosmicMutations - gatkMutations new File( root, "$folder/${cellLineName}.csv" ).with { out -> out.parentFile.mkdirs() out.withWriter { w -> def dump = { mu -> mu.groupBy { it.gene }.sort { it.key }.each { g, m -> w.write "\n$g," m.each { w.write ",${it.aminoChange},${it.gc}" } } w.writeLine "" w.writeLine "${mu.size()} mutations" def avg = mu.gc.average() w.writeLine "AVG GC, ${String.format( '%.3g', avg.mean )},stdDev,${String.format( '%.3g', avg.stdDev )}" w.writeLine "" } w.writeLine "ALL THREE" dump( gatkCosmicCcle ) w.writeLine "GATK AND COSMIC" dump( gatkCosmic ) w.writeLine "GATK AND CCLE" dump( gatkCcle ) w.writeLine "CCLE AND COSMIC" dump( ccleCosmic ) w.writeLine "GATK ONLY" dump( gatkOnly ) w.writeLine "COSMIC ONLY" dump( cosmicOnly ) w.writeLine "CCLE ONLY" dump( ccleOnly ) } } } } void dumpCellLineData( File cellLineFile, Map cellLineClassification, Project cosmic, Project ccle ) { cellLineFile.withWriter { w -> w.writeLine( 'Cell Line,Classification,Gene Intersection,Cosmic Genes,CCLE Genes,Mutation Intersection,Cosmic Mutations,CCLE Mutations' ) cosmic.cellLines.each { cellLineName, cosmicCellLine -> w.write "$cellLineName,${cellLineClassification[ cellLineName ] ?: ''}," def ccleCellLine = ccle.cellLines[ cellLineName ] def cosmicGenes = cosmicCellLine.mutations.keySet() def ccleGenes = ccleCellLine.mutations.keySet() def geneIntersection = cosmicGenes.intersect( ccleGenes ) w.write "${geneIntersection.size()},${(cosmicGenes - geneIntersection).size()},${(ccleGenes - geneIntersection).size()}," def cosmicMutations = cosmicCellLine.mutations.collectMany { k, v -> v } def ccleMutations = ccleCellLine.mutations.collectMany { k, v -> v } def mutationIntersection = cosmicMutations.intersect( ccleMutations ) w.writeLine "${mutationIntersection.size()},${(cosmicMutations - mutationIntersection).size()},${(ccleMutations - mutationIntersection).size()}" } } } void dumpMutationData( File outputs, File inputs, Project cosmic, Project ccle, Set cancerCensusGenes, File allVariantData ) { def cosmicMutations = [:] def ccleMutations = [:] def intersectMutations = [:] cosmic.cellLines.each { cellLineName, cellLine -> def co = cellLine.mutations.collectMany { k, v -> v } def cc = ccle.cellLines[ cellLineName ].mutations.collectMany { k, v -> v } def ic = intersect( 'cosmic', co, 'ccle', cc ) cosmicMutations[ cellLineName ] = ( co - ic ).countBy { it } .collect { k, v -> k.count = v ; k } .sort { it.gene } .groupBy { it.gene } ccleMutations[ cellLineName ] = ( cc - ic ).countBy { it } .collect { k, v -> k.count = v ; k } .sort { it.gene } .groupBy { it.gene } intersectMutations[ cellLineName ] = ic.countBy { it } .collect { k, v -> k.count = v ; k } .sort { it.gene } .groupBy { it.gene } } TreeSet geneCoverage = cosmicMutations.collectMany { name, geneMap -> geneMap.keySet() } + ccleMutations.collectMany { name, geneMap -> geneMap.keySet() } + intersectMutations.collectMany { name, geneMap -> geneMap.keySet() } def mut = new File( outputs, 'MUTATION' ) mut.mkdirs() def dump = { writer, gene, data -> writer.write "$gene" data.each { writer.writeLine ",${it.join(',')}" } writer.writeLine "" } def process = { name, mutationMap -> println "Writing $name" boolean debug = true new File( mut, "${name}InCancerCensus.csv" ).withWriter { BufferedWriter wcc -> new File( mut, "${name}NotCancerCensus.csv" ).withWriter { BufferedWriter wncc -> new File( mut, "${name}.csv" ).withWriter { BufferedWriter w -> w.writeLine "${name.toUpperCase()}" wcc.writeLine "${name.toUpperCase()} (in census)" wcc.writeLine "" wcc.writeLine "Gene,Mutation,Cell Line,Mutation,Cell Line,Mutation,Cell Line" wcc.writeLine "" wncc.writeLine "${name.toUpperCase()} (not in census)" List gcs = [] List gcscc = [] List gcsncc = [] geneCoverage.each { gene -> def m = mutationMap.collectMany { cellLineName, geneMap -> geneMap[ gene ]?.collectMany { mutation -> [ "$mutation.aminoChange [$mutation.count] [$cellLineName]", String.format( '%.3g', mutation.gc ) ] } ?: [] } List localGC = mutationMap.collectMany { cellLineName, geneMap -> geneMap[ gene ]?.collectMany { [ it.gc ] * it.count } ?: [] } gcs += localGC if( m ) { List collated = m.collate( 6 ) dump( w, gene, collated ) if( gene in cancerCensusGenes ) { dump( wcc, gene, collated ) gcscc += localGC } else { dump( wncc, gene, collated ) gcsncc += localGC } } } def gcsa = gcs.average() w.writeLine "" w.writeLine "AVG GC,${String.format( '%.3g', gcsa.mean )},stdDev,${String.format( '%.3g', gcsa.stdDev )}" w.writeLine "Mutations,${gcs.size()}" def gcscca = gcscc.average() wcc.writeLine "" wcc.writeLine "AVG GC, ${String.format( '%.3g', gcscca.mean )},stdDev,${String.format( '%.3g', gcscca.stdDev )}" wcc.writeLine "Mutations,${gcscc.size()}" def gcsncca = gcsncc.average() wncc.writeLine "" wncc.writeLine "AVG GC,${String.format( '%.3g', gcsncca.mean )},stdDev,${String.format( '%.3g', gcsncca.stdDev )}" wncc.writeLine "Mutations,${gcsncc.size()}" } } } } process( "intersection", intersectMutations ) process( "cosmic", cosmicMutations ) process( "ccle", ccleMutations ) println "" println "Switching from gene to location based mapping for cosmic" def cosmicLocationMap = cosmicMutations.collectEntries { cellLine, genes -> [ cellLine, genes.values().flatten().groupBy { it.location } ] } println "Scanning allVariants MAF file" allVariantData.withSpinnerReader { r -> parseCsv( r, separator:'\t' ).each { row -> def pos = "${row[ 'Chromosome' ]}:${row[ 'Start_position' ]}-${row[ 'End_position' ]}".toString() def cel = row[ 'Tumor_Sample_Barcode' ].tokenize( '_' ).head().toUpperCase() def mutations = cosmicLocationMap[ cel ]?.remove( pos ) if( mutations ) { mutations.each { m -> if( intersectMutations[ cel ][ m.gene ] ) { intersectMutations[ cel ][ m.gene ] += m } else { intersectMutations[ cel ][ m.gene ] = [ m ] } } } } } def intersectionWriter = { c, w -> List gcs = [] geneCoverage.each { gene -> def m = c.collectMany { cellLineName, geneMap -> geneMap[ gene ]?.collectMany { mutation -> [ "$mutation.aminoChange [$mutation.count] [$cellLineName]", String.format( '%.3g', mutation.gc ) ] } ?: [] } gcs += c.collectMany { cellLineName, geneMap -> geneMap[ gene ]?.collectMany { [ it.gc ] * it.count } ?: [] } if( m ) { dump( w, gene, m.collate( 6 ) ) } } def gcsa = gcs.average() w.writeLine "" w.writeLine "AVG GC,${String.format( '%.3g', gcsa.mean )},stdDev,${String.format( '%.3g', gcsa.stdDev )}" w.writeLine "Mutations,${gcs.size()}" } new File( mut, "IntersectionAfterAllVariantsScan.csv" ).withWriter { w -> intersectionWriter( intersectMutations, w ) } cosmicLocationMap = cosmicLocationMap.collectEntries { cellLine, locations -> [ cellLine, locations.values().flatten().groupBy { it.gene } ] } new File( mut, "CosmicAfterAllVariantsScan.csv" ).withWriter { w -> intersectionWriter( cosmicLocationMap, w ) } def csvs = [ [ name: 'Cell Lines', file:new File( outputs, 'CELLLINE/CellLineComparison.csv' ) ] ] csvs += mut.listFiles( csvFilter ) .sort { it.name } .collect { [ name: ( it.name - '.csv' ).capitalize(), file: it ] } writeFilesToExcel( new File( outputs, 'CellLinesAndMutationIntersection.xlsx' ), csvs ) } /******************************************************************************* * Holding classes ******************************************************************************/ @EqualsAndHashCode( includes=[ "name" ]) @CompileStatic class Project implements Comparable { String name Map cellLines = [:].withDefault { String it -> new CellLine( name: it ) } int compareTo( Project other ) { name <=> other.name } String toString() { "$name (${cellLines.size()} cell-lines)" } } @EqualsAndHashCode( includes=[ "name" ]) @CompileStatic class CellLine implements Comparable { String name Map> mutations = [:].withDefault{ [] as TreeSet } int compareTo( CellLine other ) { name <=> other.name } String toString() { "$name (${mutations.size()} genes)" } } @EqualsAndHashCode( includes=[ "gene", "location" ]) @ToString( includeNames=true, includes=[ "gene", "amino", "location", "valid", "synonymous" ] ) @CompileStatic class Mutation implements Comparable { final static int gcwidth = 100 final static String ensemblVersion = '70' String gene def amino String location boolean valid boolean synonymous int count = 1 @CompileDynamic String getAminoChange() { if( amino instanceof Map ) { // Find the most occuring entr(ies) def mapCount = amino.countBy { it.value } def max = mapCount.max { it.value }.value def maxKeys = mapCount.findAll { it.value == max }.keySet() def uniq = amino.findAll { it.value in maxKeys }.values() // If there's just one winner, use it if( uniq.size() == 1 ) { uniq[ 0 ] } else { // User the ccle>cosmic>gatk precidence to pick an amino amino.ccle ?: amino.cosmic ?: amino.gatk } } else { amino } } @Lazy String chr = { java.util.regex.Matcher m = ( this.location =~ /(\w+):(\w+)-(\w+)/ ) if( m.matches() ) { m.group( 1 ) } }() @Lazy Integer start = { java.util.regex.Matcher m = ( this.location =~ /(\w+):(\w+)-(\w+)/ ) if( m.matches() ) { m.group( 2 ).toInteger() } }() @Lazy Integer end = { java.util.regex.Matcher m = ( this.location =~ /(\w+):(\w+)-(\w+)/ ) if( m.matches() ) { m.group( 3 ).toInteger() } }() @Lazy double gc = { def tokens = this.location.tokenize( ':' ) String chr = tokens[ 0 ] String pos = tokens[ 1 ] tokens = pos.tokenize( '-' ) Integer start = tokens[ 0 ].toInteger() Integer end = tokens[ 1 ].toInteger() start = start + ( end - start ).intdiv( 2 ) start = start - gcwidth.intdiv( 2 ) end = start + gcwidth - 1 def root = "http://annmap.cruk.manchester.ac.uk/sequence/mysequence/homo_sapiens/${ensemblVersion}" def count = "$root/$chr/1/${start}/${end}".toURL() .text .split( '[\r\n]' ) .toList() .drop( 1 ) .join('') .toList() .count { String ch -> ch == 'G' || ch == 'C' } ( count / (double)gcwidth ) as double }() int compareTo( Mutation other ) { gene <=> other.gene ?: location <=> other.location } } // Show some progress parsing large input files @InheritConstructors class SpinnerReader extends BufferedReader { private String output = '/-\\|' private int offset = 0 private void update() { print "\r${output[ offset ]}" System.out.flush() offset += 1 offset %= output.length() } int read() { update() ; super.read() } int read(char[] cbuf, int off, int len) { update() ; super.read( cbuf, off, len ) } String readLine() { update() ; super.readLine() } void close() { print "\r" super.close() } }