@Grab('com.xlson.groovycsv:groovycsv:1.0') import static com.xlson.groovycsv.CsvParser.parseCsv @Grab( 'org.apache.poi:poi:3.9' ) import static org.apache.poi.ss.usermodel.CellStyle.* import static org.apache.poi.ss.usermodel.IndexedColors.* import static org.apache.poi.hssf.usermodel.HSSFFont.* import org.apache.poi.hssf.usermodel.HSSFWorkbook import groovy.transform.* /*********************************************************************************** ******** Lookup Cosmic, Broad and GATK mutations ******** ***********************************************************************************/ File root = new File( getClass().protectionDomain.codeSource.location.path ).parentFile.parentFile /*********************************************************************************** ******** A lookup table to map NCBI genes to Ensembl Genes ******* ***********************************************************************************/ @ToString( includeNames=true ) class MartLookupEntry { String ensg String enst String symbol String ncbi } Map martLookup = new File( root, "Raw/mart_export.txt" ).withReader { r -> parseCsv( [ separator: '\t' ], r ).inject( [:].withDefault{ [] } ) { map, row -> if( row.'RefSeq mRNA' ) { map[ row.'RefSeq mRNA' ] << new MartLookupEntry( ensg : row.'Ensembl Gene ID', enst : row.'Ensembl Transcript ID', symbol: row.'HGNC symbol', ncbi : row.'RefSeq mRNA' ) } map } } /*********************************************************************************** ******** A class to hold each coding change ******* ***********************************************************************************/ @EqualsAndHashCode( includes=[ "original", "replacement", "position" ] ) @ToString class Coding implements Comparable { String coding private @Lazy Map data = { parse() }() String getOriginal() { data.original } String getReplacement() { data.replacement } Integer getPosition() { data.position } Boolean getErroneous() { data.erroneous } Coding( String coding ) { this.coding = coding } private Map parse() { def ret = [ original:'', replacement:'', position:-1, erroneous:true ] def match = coding =~ /^c\.(\d+)([TGAC])\>([TGAC])$/ if( match ) { def (pos,start,end) = match[ 0 ].drop( 1 ) ret.original = start ret.replacement = end ret.position = pos as Integer ret.erroneous = false return ret } match = coding =~ /^c\.([TGAC])(\d+)([TGAC])$/ if( match ) { def (start,pos,end) = match[ 0 ].drop( 1 ) ret.original = start ret.replacement = end ret.position = pos as Integer ret.erroneous = false } return ret } int compareTo( other ) { position <=> other.position ?: original <=> other.original ?: replacement <=> other.replacement } } /*********************************************************************************** ******** A class to hold each amino change ******* ***********************************************************************************/ @EqualsAndHashCode( includes=[ "amino" ] ) @ToString class Amino implements Comparable { String amino private @Lazy Map data = { parse() }() String getStartAmino() { data.start } String getEndAmino() { data.end } Integer getPosition() { data.position } Boolean getSynonymous() { data.synonymous } Boolean getErroneous() { data.erroneous } Amino( String amino ) { this.amino = amino } private Map parse() { def ret = [ start:'', end:'', position:-1, synonymous:false, erroneous:true ] def match = amino =~ /^p\.([A-Z]+)(\d*)([A-Z]+)$/ if( match ) { def (start,pos,end) = match[ 0 ].drop( 1 ) ret.start = start ret.end = end ?: null ret.position = pos ? pos as Integer : -1 ret.synonymous = ret.start == ret.end ret.erroneous = ret.position < 0 || ret.end == null } ret } int compareTo( other ) { amino <=> other.amino } } /*********************************************************************************** ******** A class to hold each mutation ******* ***********************************************************************************/ @EqualsAndHashCode( includes=[ "gene", "amino", "coding" ]) @ToString( includeNames=true ) class Mutation implements Comparable { String gene Amino amino Coding coding String transcript Integer location Boolean remove void setAmino( String amino ) { this.amino = new Amino( amino ) } void setCoding( String coding ) { this.coding = new Coding( coding ) } Integer getLocation() { if( location == null && coding.position ) { try { def ret = new URL( "http://beta.rest.ensembl.org/map/cds/${transcript}/${coding.position}..${coding.position}?content-type=text/plain" ).text def matcher = ret =~ /(?m)^.+?start: (\d+)$/ this.location = matcher[ 0 ][ 1 ] as Integer } catch( e ) { location = -1 } } location } int compareTo( other ) { gene <=> other.gene ?: amino <=> other.amino ?: coding <=> other.coding } } /*********************************************************************************** ******** A class to hold each cell line ******* ***********************************************************************************/ class CellLine { String name // Lists of all genes scanned TreeSet cosmicOnlyGenes // Just those in cosmic TreeSet ccleOnlyGenes // Just those in ccle TreeSet gatkOnlyGenes // Just those in GATK TreeSet ccleGatkGenes TreeSet cosmicGatkGenes TreeSet cosmicCcleGenes TreeSet combinedGenes // the intersection TreeSet cosmicMutations // Cosmic mutations for this cell line TreeSet ccleMutations // Ccle mutations for this cell line TreeSet gatkMutations // GATK mutations for this cell line CellLine( String name, File cosmicMutationFile, File cosmicNonMutationFile, File gatkMutationFile, File ccleMutationFile, TreeSet ccleGenes, Map martLookup ) { this.name = name // Read the mutations from the cosmicMutationFile try { cosmicMutations = cosmicMutationFile.withReader { r -> new TreeSet( parseCsv( r ).collect { row -> if( row.'AA Mutation' ) { new Mutation( gene : row.'Gene', amino : row.'AA Mutation', coding : row.'CDS Mutation', transcript: row.'Transcript'.startsWith( 'ENST' ) ? row.'Transcript' : ( martLookup[ row.'Transcript'.takeWhile { it != '.' } ]?.enst?.getAt( 0 ) ?: row.'Transcript'.takeWhile { it != '.' } ), remove : false ) } }.findAll() ) } } catch( e ) { println "Caught $e in $name" throw e } cosmicOnlyGenes = cosmicMutations*.gene // Add the list of genes in this file to the list of ccle genes cosmicOnlyGenes.addAll( cosmicNonMutationFile.withReader { r -> parseCsv( r ).collect { row -> row.'Gene Name' } } ) cosmicMutations = cosmicMutations.findAll { it && !it.amino.erroneous && !it.amino.synonymous } // Read the mutations from the gatkMutationFile Set funcs = [] gatkMutations = gatkMutationFile.withReader { r -> parseCsv( [ separator: ',' ], r ).inject([]) { list, row -> def aachange = row.'AAChange.refGene'?.tokenize(',') if( aachange?.size() ) { aachange.each { aa -> aa = aa.tokenize( ':' ) def mut = new Mutation( gene : aa[ 0 ], amino : aa[ 4 ], coding : aa[ 3 ], transcript : martLookup[ aa[ 1 ] ]?.enst ?: '???', location : row.start as Integer, remove : !row.'Func.refGene'.contains( 'exonic' ) || row.'dbSNP_id' != '.' || row.alleleFreq_esp6500si_all.length() > 0 || row.nci60.length() > 0 ) list << mut } } else if( row.'Func.refGene'.contains( 'UTR' ) || // UTR3 UTR5 row.'Func.refGene'.contains( 'onic' ) ) { // ncRNA_intronic intronic ncRNA_exonic row.'Gene.refGene'?.tokenize(',').each { gene -> list << new Mutation( gene : gene, amino : null, coding : null, transcript : null, location : -1, remove : true ) } } else { funcs << row.'Func.refGene' } list } } println funcs println "$name gatkMutations.size() == ${gatkMutations.size()}" gatkOnlyGenes = gatkMutations*.gene gatkMutations = gatkMutations.findAll { it && !it.amino.erroneous && !it.amino.synonymous && !it.remove } println "$name gatkMutations.size() == ${gatkMutations.size()}" // Read the mutations from the ccleMutationFile ccleMutations = ccleMutationFile.withReader { r -> parseCsv( [ separator: '\t' ], r ).collect { row -> if( row.Tumor_Sample_Barcode.contains( name ) ) { if( row.Protein_Change ) { new Mutation( gene : row.Hugo_Symbol, amino : row.Protein_Change, coding : row.cDNA_Change, transcript : martLookup[ row.Refseq_mRNA_Id ]?.enst ?: '???', location : row.Start_position as Integer, remove : false ) } } }.findAll() } combinedGenes = cosmicOnlyGenes.intersect( ccleGenes ).intersect( gatkOnlyGenes ) cosmicCcleGenes = cosmicOnlyGenes.intersect( ccleGenes ).findAll { !combinedGenes.contains( it ) } cosmicGatkGenes = cosmicOnlyGenes.intersect( gatkOnlyGenes ).findAll { !combinedGenes.contains( it ) } ccleGatkGenes = ccleGenes.intersect( gatkOnlyGenes ).findAll { !combinedGenes.contains( it ) } cosmicOnlyGenes = cosmicOnlyGenes.findAll { !combinedGenes.contains( it ) && !cosmicCcleGenes.contains( it ) && !cosmicGatkGenes.contains( it ) && !ccleGatkGenes.contains( it ) } ccleOnlyGenes = ccleOnlyGenes.findAll { !combinedGenes.contains( it ) && !cosmicCcleGenes.contains( it ) && !cosmicGatkGenes.contains( it ) && !ccleGatkGenes.contains( it ) } gatkOnlyGenes = gatkOnlyGenes.findAll { !combinedGenes.contains( it ) && !cosmicCcleGenes.contains( it ) && !cosmicGatkGenes.contains( it ) && !ccleGatkGenes.contains( it ) } ccleMutations = ccleMutations.findAll { it && !it.amino.erroneous && !it.amino.synonymous } } void writeToSheet( HSSFWorkbook workbook, Map styles ) { def addValues = { row, cellidx, style, ...values -> values.eachWithIndex { it, idx -> row.createCell( cellidx + idx ).with { cell -> cell.setCellValue( it ) if( style ) cell.setCellStyle( style ) } } } workbook.createSheet( name ).with { sheet -> println "Writing sheet $name" def dumpTypes = { rowidx -> sheet.createRow( rowidx ).with { row -> row.createCell( 0 ).with { cell -> cell.setCellValue "CellLine: $name" } row.createCell( 1 ).with { cell -> cell.setCellValue "ALL PLATFORMS" ; cell.cellStyle = styles.all } row.createCell( 2 ).with { cell -> cell.setCellValue "COSMIC ONLY" ; cell.cellStyle = styles.cosmic } row.createCell( 3 ).with { cell -> cell.setCellValue "CCLE ONLY" ; cell.cellStyle = styles.ccle } row.createCell( 4 ).with { cell -> cell.setCellValue "GATK ONLY" ; cell.cellStyle = styles.gatk } row.createCell( 5 ).with { cell -> cell.setCellValue "COSMIC/CCLE" ; cell.cellStyle = styles.cosmic_ccle } row.createCell( 6 ).with { cell -> cell.setCellValue "CCLE/GATK" ; cell.cellStyle = styles.ccle_gatk } row.createCell( 7 ).with { cell -> cell.setCellValue "COSMIC/GATK" ; cell.cellStyle = styles.cosmic_gatk } } rowidx + 1 } // Headings def rownum = dumpTypes( 0 ) // Stats sheet.createRow( 2 ).with { row -> row.createCell( 1 ).with { cell -> cell.setCellValue "Genes" } row.createCell( 2 ).with { cell -> cell.setCellValue "Mutations" } } def mutationIntersect = ccleMutations.intersect( cosmicMutations ).intersect( gatkMutations ) sheet.createRow( 3 ).with { row -> addValues( row, 0, styles.all, "ALL PLATFORMS", combinedGenes.size(), mutationIntersect.size() ) } sheet.createRow( 4 ).with { row -> addValues( row, 0, styles.cosmic, "COSMIC ONLY", cosmicOnlyGenes.size(), cosmicMutations.findAll { !ccleMutations.contains( it ) && !gatkMutations.contains( it ) }.size() ) } sheet.createRow( 5 ).with { row -> addValues( row, 0, styles.ccle, "CCLE ONLY", ccleOnlyGenes.size(), ccleMutations.findAll { !cosmicMutations.contains( it ) && !gatkMutations.contains( it ) }.size() ) } sheet.createRow( 6 ).with { row -> addValues( row, 0, styles.gatk, "GATK ONLY", gatkOnlyGenes.size(), gatkMutations.findAll { !cosmicMutations.contains( it ) && !ccleMutations.contains( it ) }.size() ) } sheet.createRow( 7 ).with { row -> addValues( row, 0, styles.cosmic_ccle, "COSMIC/CCLE", cosmicCcleGenes.size(), cosmicMutations.intersect( ccleMutations ).findAll { !gatkMutations.contains( it ) }.size() ) } sheet.createRow( 8 ).with { row -> addValues( row, 0, styles.ccle_gatk, "CCLE/GATK", ccleGatkGenes.size(), ccleMutations.intersect( gatkMutations ).findAll { !cosmicMutations.contains( it ) }.size() ) } sheet.createRow( 9 ).with { row -> addValues( row, 0, styles.cosmic_gatk, "COSMIC/GATK", cosmicGatkGenes.size(), cosmicMutations.intersect( gatkMutations ).findAll { !ccleMutations.contains( it ) }.size() ) } sheet.groupRow( 2, 9 ) def writeGenes = { name, rowidx, style, genes, filter={ true } -> def accidx = rowidx sheet.createRow( accidx++ ).with { row -> row.createCell( 0 ).with { cell -> cell.setCellValue( name ) cell.setCellStyle( style ) } } genes.each { gene -> sheet.createRow( accidx++ ).with { row -> def cellidx = 0 // The gene name row.createCell( cellidx++ ).with { cell -> cell.setCellValue( gene ) } // then all mutations def written = [] cosmicMutations.findAll { it.gene == gene && filter( it ) }.each { mut -> if( !written.contains( mut ) ) { def (gatk,ccle) = [ gatkMutations.contains( mut ), ccleMutations.contains( mut ) ] def st = gatk && ccle ? styles.all : gatk ? styles.cosmic_gatk : ccle ? styles.cosmic_ccle : styles.cosmic row.createCell( cellidx++ ).with { cell -> cell.setCellValue( mut.amino.amino ) cell.setCellStyle( st ) } written << mut } } ccleMutations.findAll { it.gene == gene && filter( it ) }.each { mut -> if( !written.contains( mut ) ) { def (gatk,cosm) = [ gatkMutations.contains( mut ), cosmicMutations.contains( mut ) ] def st = gatk && cosm ? styles.all : gatk ? styles.ccle_gatk : cosm ? styles.cosmic_ccle : styles.ccle if( !cosmicMutations.contains( mut ) ) { row.createCell( cellidx++ ).with { cell -> cell.setCellValue( mut.amino.amino ) cell.setCellStyle( st ) } } written << mut } } gatkMutations.findAll { it.gene == gene && filter( it ) }.each { mut -> if( !written.contains( mut ) ) { def (ccle,cosm) = [ ccleMutations.contains( mut ), cosmicMutations.contains( mut ) ] def st = ccle && cosm ? styles.all : ccle ? styles.ccle_gatk : cosm ? styles.cosmic_gatk : styles.gatk if( !cosmicMutations.contains( mut ) ) { row.createCell( cellidx++ ).with { cell -> cell.setCellValue( mut.amino.amino ) cell.setCellStyle( st ) } } written << mut } } } } sheet.groupRow( rowidx, accidx ) accidx + 2 } rownum = 11 // Both genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'ALL PLATFORMS', rownum, styles.all, combinedGenes ) // Cosmic only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'COSMIC', rownum, styles.cosmic, cosmicOnlyGenes ) // Ccle only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'CCLE', rownum, styles.ccle, ccleOnlyGenes ) // Gatk only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'GATK', rownum, styles.gatk, gatkOnlyGenes ) // Cosmic/CCLE only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'Cosmic/CCLE', rownum, styles.cosmic_ccle, cosmicCcleGenes ) // CCLE/Gatk only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'CCLE/GATK', rownum, styles.ccle_gatk, ccleGatkGenes ) // Cosmic/Gatk only genes and mutations rownum = dumpTypes( rownum ) rownum = writeGenes( 'Cosmic/GATK', rownum, styles.cosmic_gatk, cosmicGatkGenes ) (0..20).each { c -> sheet.autoSizeColumn( c ) } } } } // Load the screened list for ccle TreeSet ccleGenes = new File( root, 'Raw/broad data/ccle_genes.csv' ).withReader { r -> parseCsv( r ).collectMany { row -> [ row.'Symbol'.trim(), row.'HGNC Symbol'.trim() ] } as TreeSet } File mafFile = new File( root, 'Raw/broad data/CCLE_hybrid_capture1650_hg19_NoCommonSNPs_NoNeutralVariants_CDS_2012-2.05.07.maf' ) Map gatkFiles = new File( root, 'GATK' ).listFiles( { f, name -> name.endsWith( '.csv' ) } as FilenameFilter ).toList() .groupBy { ( it.name =~ /^.+(H[0-9]+)_.+\.csv$/ )[0][1].toUpperCase() } def cellLines = new File( root, 'Raw/cosmic data' ).listFiles( { f, name -> !name.startsWith( '.' ) && name.endsWith( '.csv' ) } as FilenameFilter ).toList() .groupBy { ( it.name - '-' - 'mutations' - 'mutation' - 'non' - '.csv' ).trim().toUpperCase() } .findAll { name, files -> gatkFiles[ name ] != null } .collect { name, files -> new CellLine( name, files.find { !it.name.contains( 'non' ) }, files.find { it.name.contains( 'non' ) }, gatkFiles[ name ].find(), mafFile, ccleGenes, martLookup ) } new HSSFWorkbook().with { workbook -> def whiteFont = workbook.createFont().with { font -> font.boldweight = BOLDWEIGHT_BOLD font.color = WHITE.index font } def styles = [ cosmic:[RED, true], ccle:[BLUE, true], gatk:[GREEN, true], cosmic_ccle:[PLUM, true], ccle_gatk:[AQUA, false], cosmic_gatk:[YELLOW, false], all:[GREY_25_PERCENT, false] ].collectEntries { name, st -> workbook.createCellStyle().with { style -> if( st[ 1 ] ) { style.font = whiteFont } style.fillForegroundColor = st[ 0 ].index style.fillPattern = SOLID_FOREGROUND [ name, style ] } } cellLines.each { it.writeToSheet( workbook, styles ) } new File( root, 'GATK/output.xls' ).withOutputStream { os -> workbook.write( os ) } } println "Done."