## Functions for HW3 ## Written by Anna Ritz ## Last Edited Feb 5, 2016 import sys ####################################################### ## Functions to Manipulate Datasets ####################################################### def getData(dataset): ''' Given a dataset name, returns five things: - DNA string - exonStarts list - intronStarts list - strand string (either '+' or '-') - mRNA string (for comparison) - peptide string (for comparison) NOTE: The datafiles/ directory must be in the same location as the .py file. ''' sequenceFile = 'datafiles/'+dataset+'-sequences.fasta' dna,rna,peptide = readFasta(sequenceFile) ## add a STOP codon to the peptide sequence. peptide = peptide + '*' tablefile = 'datafiles/'+dataset+'-exon-info.txt' exonStarts,intronStarts,strand = readTable(tablefile) return dna,exonStarts,intronStarts,strand,rna,peptide def readTable(infile): ''' Reads a knownGene table from the UCSC Genome Browser. Specifications: this table has two lines, one of row headers and one of a single isoform of a gene. The columns are specified by the UCSC genome browser. See the schema here: http://genome.ucsc.edu/cgi-bin/hgTables and click "describe table schema" Input: a file name, including the directory name. Outputs three things: exonStarts - a list of exon starting points intronStarts - a list of intron starting points strand - a string that is either '+' or '-'. Example: readTable('datafiles/testPositiveStrand-exon-info.txt') returns [[3, 19], [9, 28], '+']. Usage: exonStarts,intronStarts,strand = readTable('datafiles/testPositiveStrand-exon-info.txt') ''' # Read and parse the elements in the row. tableLines = open(infile,'r').readlines() row = tableLines[1].split() strand = row[2] transcriptionStart = int(row[3]) # convert the string to an int transcriptionEnd = int(row[4]) codingRegionStart = int(row[5]) codingRegionEnd = int(row[6]) exonStarts = [int(start) for start in row[8].split(',') if len(start) != 0] intronStarts = [int(start) for start in row[9].split(',') if len(start) != 0] ## The variables now have the original values from the table. We need ## to transform them by (1) moving indices of the non-coding portions ## of exons and (2) adjusting the indices to they start at 0. ## First, identify the positions of the coding regions. ## Sometimes entire exons are non-coding! Remove these. indicesAbove5PrimeUTR = [i for i in range(len(intronStarts)) if intronStarts[i]>codingRegionStart] indicesBelow3PrimeUTR = [i for i in range(len(exonStarts)) if exonStarts[i] transcriptionStart: exonStarts[0] = codingRegionStart if transcriptionEnd > codingRegionEnd: intronStarts[-1] = codingRegionEnd # Print the number of exons, if we have completely removed some. #if len(exonStarts) != int(row[7]): # print('After removing non-coding exons, %d exons left.' % (len(exonStarts))) ## shift the exon Starts and Ends to be 0. exonStarts = [start-transcriptionStart for start in exonStarts] intronStarts = [start-transcriptionStart for start in intronStarts] ## return the exonStarts and the intronStarts (indexed at 0) ## and the strand (+/-) return exonStarts,intronStarts,strand def readFasta(infile): ''' Returns the sequences in the FASTA file, with the line breaks (newlines) removed. Specifications: this file must have THREE sequences, labeled by three headers named '>dna','>rna','>peptide'. Input: a file name, including the directory name. Output: three strings representing the DNA, RNA, and peptide sequencs. Example: readFasta('testPos') ''' ## fileLines is a list of strings from the infile variable. fileLines = open(infile,'r').readlines() ## dna, rna, and peptide will eventually contain the sequences ## we will return. dna = '' rna = '' peptide = '' ## thisheader will contain the most recent header (line with a ">"). thisheader = '' ## go through each line in fileLines... for line in fileLines: ## remove newlines strippedLine = line.strip() ## update the header if this line starts with a ">" ## otherwise, add the line to either dna, rna, or peptide. if strippedLine[0] == '>': thisheader = strippedLine else: ## make line upper case strippedLine = strippedLine.upper() ## check the header to see which variable to add the string to. if thisheader == '>dna': dna = dna + strippedLine elif thisheader == '>rna': rna = rna + strippedLine elif thisheader == '>peptide': peptide = peptide + strippedLine else: ## The header wasn't one of the ones above - print an error message. sys.exit('Error! Header "'+thisheader+'" is not >dna, >rna, or >peptide. Exiting.\n') return dna,rna,peptide