import re, os, time def toDecimal(x): return sum(map(lambda z: int(x[z]) and 2**(len(x) - z - 1), range(len(x)-1, -1, -1))) def getprobabilitytable(filename): infile = open(filename,"r") indata = infile.readlines() infile.close() bounds = {} boundprobabilities = {} for line in indata[1:]: line = re.split(',',line) bounds[line[0]] = line[1] boundprobabilities[line[0]] = re.sub('\s','',line[2]) return bounds, boundprobabilities def getinputdata(infilename,bounds,boundprobabilities): infile = open(infilename,"r") indata = infile.readlines() infile.close() samplenames = re.sub('\n|\r','',indata[0]) samplenames = re.split(',',samplenames) if samplenames[-1] == '': samplenames = samplenames[:-1] samplenames = samplenames[3:] markernames = [] chromosomenames = [] geneticmap = [] priordata = [] transformeddata = [] for sample in samplenames: priordata.append({}) transformeddata.append({}) for line in indata[1:]: if line[0] != '': line = re.split(',',line) markernames.append(line[0]) if len(chromosomenames) == 0: chromosomenames.append(line[1]) geneticmap.append([]) elif chromosomenames[-1] != line[1]: chromosomenames.append(line[1]) geneticmap.append([]) geneticmap[-1].append(float(line[2])) i = 0 for sample in line[3:]: priordata[i][line[0]] = re.sub('\n|\r','',sample) if priordata[i][line[0]] == '': if transformeddata[i].has_key(line[0]) == False: transformeddata[i][line[0]] = '' else: print "Two markers have same name?" print line[0] else: for bound in bounds.keys(): if float(priordata[i][line[0]]) >= float(bound) and float(priordata[i][line[0]]) < float(bounds[bound]): if transformeddata[i].has_key(line[0]) == False: transformeddata[i][line[0]] = boundprobabilities[bound] else: print "Two markers have same name?" print line[0] i = i + 1 return chromosomenames, geneticmap, samplenames, markernames, priordata, transformeddata def goleftright(currentpositiononchromosome,currentoverallposition,chromosomelength,markernames,transformeddata,markerstoconsider): # Searches markers to the left and right of the current position for other markers with data # The number of left markers and right markers with data must be equal # Returns the position (relative to current) of the bounding markers # If the maximum number to be searched cannot be satisfied, returns largest number that can if transformeddata[markernames[currentoverallposition]] == '': return 0,0,0 possibleleftmarkers = 0 possiblerightmarkers = 0 for number in range(currentoverallposition+1, currentoverallposition+chromosomelength-currentpositiononchromosome): if transformeddata[markernames[number]] != '': possiblerightmarkers = possiblerightmarkers + 1 for number in range(currentoverallposition-1, currentoverallposition-currentpositiononchromosome-1,-1): if transformeddata[markernames[number]] != '': possibleleftmarkers = possibleleftmarkers + 1 if possibleleftmarkers >= markerstoconsider and possiblerightmarkers >= markerstoconsider: possibleleftmarkers = markerstoconsider possiblerightmarkers = markerstoconsider elif possibleleftmarkers > possiblerightmarkers: possibleleftmarkers = possiblerightmarkers elif possibleleftmarkers < possiblerightmarkers: possiblerightmarkers = possibleleftmarkers goodmarkers = 0 position = 0 while goodmarkers != possiblerightmarkers: position = position + 1 if transformeddata[markernames[currentoverallposition+position]] != '': goodmarkers = goodmarkers + 1 rightmarkerposition = position goodmarkers = 0 position = 0 while goodmarkers != possibleleftmarkers: position = position - 1 if transformeddata[markernames[currentoverallposition+position]] != '': goodmarkers = goodmarkers + 1 leftmarkerposition = position return rightmarkerposition,leftmarkerposition, goodmarkers def calculatepriorprobability(currentgenotype,chromosomemap,markernames,transformeddata,genotype_marker,marker,currentpositiononchromosome,currentoverallposition,currentpriorprobability,probability_given,skipped_markers,countupordown): # Test whether first marker has data if transformeddata[markernames[currentoverallposition+marker]] != '': next_marker_position = countupordown # Find next marker with data while transformeddata[markernames[currentoverallposition+marker+next_marker_position]] == '': next_marker_position = next_marker_position + countupordown # Calculate probability of recombination #print currentpositiononchromosome,marker,next_marker_position distance = abs(chromosomemap[currentpositiononchromosome+marker] - chromosomemap[currentpositiononchromosome+marker+next_marker_position]) # The cM distance between the markers is the probability of a recombination from the F1 to F2 generation. # The probability of a recombination in all generations is ~2x that. distance = distance * 2 / 100 if distance > .5: distance = .5 # If current assumed genotypes are equal, distance is for no recombination event if currentgenotype[genotype_marker] == currentgenotype[genotype_marker+countupordown]: distance = 1 - distance probability_given = probability_given * distance if currentgenotype[genotype_marker] == '0': currentpriorprobability = currentpriorprobability * float(transformeddata[markernames[currentoverallposition+marker]]) elif currentgenotype[genotype_marker] == '1': currentpriorprobability = currentpriorprobability * (1 - float(transformeddata[markernames[currentoverallposition+marker]])) genotype_marker = genotype_marker + countupordown else: skipped_markers = skipped_markers + 1 return probability_given,currentpriorprobability,skipped_markers,genotype_marker def calculateposteriorprobability(rightmarkerposition,leftmarkerposition,currentpositiononchromosome,currentoverallposition,markernames,transformeddata,chromosomemap,markersused): # Bayes's Theorem: P(A1|B) = P(B|A1)P(A1)/(P(B|A1)P(A1)+P(B|A2)P(A2)) # In this case, we calculate: P(currentoverallposition = Col|assumed genotype of surrounding markers) # Which equals P(assumed genotype|current = Col) * P(current = Col--prior) / P(assumed genotype based on genetic map) # Calculate P(assumed genotype) = P(assumed genotype|current = Col)*P(current = Col) + P(assumed genotype|current = Ler)*P(current = Ler) # This array represents the current assumed genotype; 0 = Col, 1 = Ler currentposteriorprobability = 0 currentgenotype = '0'*(2*markersused+1) for iteration in range(0,2**(2*markersused)): # probability_given_col equals the probability of the genotype given that the current marker is Columbia based on recombination frequency probability_given_col = 1. currentpriorprobability = 1. skipped_markers = 0 genotype_marker = 0 for marker in range(leftmarkerposition,0): probability_given_col,currentpriorprobability,skipped_markers,genotype_marker = calculatepriorprobability(currentgenotype,chromosomemap,markernames,transformeddata,genotype_marker,marker,currentpositiononchromosome,currentoverallposition,currentpriorprobability,probability_given_col,skipped_markers,1) genotype_marker = -1 for marker in range(rightmarkerposition,0,-1): probability_given_col,currentpriorprobability,skipped_markers,genotype_marker = calculatepriorprobability(currentgenotype,chromosomemap,markernames,transformeddata,genotype_marker,marker,currentpositiononchromosome,currentoverallposition,currentpriorprobability,probability_given_col,skipped_markers,-1) # The prior probability of each genotype will be the same for Col and Ler calculations! # This is since the genotype of the surrounding markers does not change, so their state probability from the given data does not change priorprobability = currentpriorprobability currentgenotype = currentgenotype[0:markersused] + '1' + currentgenotype[markersused+1:] # probability_given_ler equals the probability of the genotype given that the current marker is Landsberg based on recombination frequency probability_given_ler = 1. currentpriorprobability = 1. skipped_markers = 0 genotype_marker = 0 for marker in range(leftmarkerposition,0): probability_given_ler,currentpriorprobability,skipped_markers,genotype_marker = calculatepriorprobability(currentgenotype,chromosomemap,markernames,transformeddata,genotype_marker,marker,currentpositiononchromosome,currentoverallposition,currentpriorprobability,probability_given_ler,skipped_markers,1) genotype_marker = -1 for marker in range(rightmarkerposition,0,-1): probability_given_ler,currentpriorprobability,skipped_markers,genotype_marker = calculatepriorprobability(currentgenotype,chromosomemap,markernames,transformeddata,genotype_marker,marker,currentpositiononchromosome,currentoverallposition,currentpriorprobability,probability_given_ler,skipped_markers,-1) # P(genotype|marker = Col)*P(marker = Col) bayes_given_col = probability_given_col * float(transformeddata[markernames[currentoverallposition]]) # P(genotype}marker = Ler)*P(marker = Ler) bayes_given_ler = probability_given_ler * (1 - float(transformeddata[markernames[currentoverallposition]])) if bayes_given_col == 0 and bayes_given_ler == 0: bayes = 0 else: # P(marker = Col|genotype) = P(genotype|marker = Col)P(marker = Col)/(P(genotype|marker=Col)P(marker=Col)+P(genotype|marker=Ler)P(marker=Ler)) bayes = bayes_given_col/(bayes_given_col+bayes_given_ler) # posterior P(marker = Col) = P(marker=Col|genotype)P(genotype), then sum over all possible genotypes currentposteriorprobability = currentposteriorprobability + bayes * priorprobability # iterate to next genotype currentgenotype = currentgenotype[0:markersused]+currentgenotype[markersused+1:] settolength = len(currentgenotype) currentcount = toDecimal(currentgenotype) + 1 decimal2binary = lambda n: n>0 and decimal2binary(n>>1)+str(n&1) or '' currentgenotype = decimal2binary(currentcount) currentgenotype = '0' * (settolength-len(currentgenotype)) + currentgenotype currentgenotype = currentgenotype[0:markersused] + '0' + currentgenotype[markersused:] print currentposteriorprobability return currentposteriorprobability if __name__ == '__main__': # Import settings infile = open("GenotypeBayesSettings.txt","r") indata = infile.readlines() infile.close() i = 0 for line in indata: if re.search('Input Data Directory',line) != None: j = i if re.search('Cy3 Input Data',line) != None: l = i if re.search('Cy5 Input Data',line) != None: r = i if re.search('Output Data',line) != None: m = i if re.search('Cy3 Probability',line) != None: n = i if re.search('Cy5 Probability',line) != None: p = i if re.search('only',line) != None: q = i if re.search('Output Data Directory',line) != None: s = i if re.search('Probability File Directory',line) != None: t = i if re.search('rate for posterior',line) != None: u = i if re.search('rate for prior',line) != None: v = i i = i + 1 outputdirectory = re.sub('\r','',indata[s+1][:-1]) probabilitydirectory = re.sub('\r','',indata[t+1][:-1]) inputdirectory = re.sub('\r','',indata[j+1][:-1]) cy3inputdatafile = re.sub('\r','',indata[l+1][:-1]) cy5inputdatafile = re.sub('\r','',indata[r+1][:-1]) outputdatafile = re.sub('\r','',indata[m+1][:-1]) cy3probabilityfile = re.sub('\r','',indata[n+1][:-1]) cy5probabilityfile = re.sub('\r','',indata[p+1][:-1]) cy3orcy5 = re.sub('\r','',indata[q+1][:-1]) posteriorerrorrate = float(re.sub('\r','',indata[u+1][:-1]))/2 priorerrorrate = float(re.sub('\r','',indata[v+1][:-1]))/2 startdirectory = os.getcwd() # Change directory to where mapfiles are os.chdir(probabilitydirectory) print "Using",probabilitydirectory,"for probability tables" # Get probability matrices if re.search('Cy3|Both',cy3orcy5) != None: try: cy3bounds, cy3boundprobabilities = getprobabilitytable(cy3probabilityfile) except: print "Error Loading Cy3 Probability File!" time.sleep(1000) if re.search('Cy5|Both',cy3orcy5) != None: try: cy5bounds, cy5boundprobabilities = getprobabilitytable(cy5probabilityfile) except: print "Error Loading Cy5 Probability File!" time.sleep(1000) # Get input data and genetic map os.chdir(inputdirectory) print "Using",inputdirectory,"for Input files" if re.search('Cy3|Both',cy3orcy5) != None: chromosomenames, geneticmap, cy3samplenames, cy3markernames, cy3priordata, cy3transformeddata = getinputdata(cy3inputdatafile,cy3bounds,cy3boundprobabilities) samplenumber = len(cy3samplenames) if re.search('Cy5|Both',cy3orcy5) != None: chromosomenames, geneticmap, cy5samplenames, cy5markernames, cy5priordata, cy5transformeddata = getinputdata(cy5inputdatafile,cy5bounds,cy5boundprobabilities) samplenumber = len(cy5samplenames) # Check that samplenames are the same # Check that markers are the same if re.search('Both',cy3orcy5) != None: i = 0 for marker in cy3markernames: if cy5markernames[i] != marker: print "Cy3 and Cy5 marker names don't match!" time.sleep(10000) i = i + 1 for sample in range(0,len(cy3samplenames)): if cy5samplenames[sample] != cy3samplenames[sample]: print "Cy3 and Cy5 sample names don't match!" time.sleep(10000) if len(cy5samplenames) != len(cy3samplenames): print "Cy3 and Cy5 have different numbers of samples!" time.sleep(10000) cy3outdata = [] cy5outdata = [] if re.search('Cy3|Both',cy3orcy5) != None: for sample in range(0,samplenumber): cy3outdata.append({}) if re.search('Cy5|Both',cy3orcy5) != None: for sample in range(0,samplenumber): cy5outdata.append({}) for sample in range(0,samplenumber): currentoverallposition = 0 for chromosome in geneticmap: currentpositiononchromosome = 0 for marker in chromosome: if re.search('Cy3|Both',cy3orcy5) != None: print cy3markernames[currentoverallposition] cy3rightmarkerposition,cy3leftmarkerposition,markersused = goleftright(currentpositiononchromosome,currentoverallposition,len(chromosome),cy3markernames,cy3transformeddata[sample],1) if cy3rightmarkerposition == 0: if cy3transformeddata[sample][cy3markernames[currentoverallposition]] != '': currentposteriorprobability = float(cy3transformeddata[sample][cy3markernames[currentoverallposition]]) else: currentposteriorprobability = '' else: currentposteriorprobability = calculateposteriorprobability(cy3rightmarkerposition,cy3leftmarkerposition,currentpositiononchromosome,currentoverallposition,cy3markernames,cy3transformeddata[sample],chromosome,markersused) cy3outdata[sample][cy3markernames[currentoverallposition]] = currentposteriorprobability if re.search('Cy5|Both',cy3orcy5) != None: cy5rightmarkerposition,cy5leftmarkerposition,markersused = goleftright(currentpositiononchromosome,currentoverallposition,len(chromosome),cy5markernames,cy5transformeddata[sample],1) if cy5rightmarkerposition == 0: if cy5transformeddata[sample][cy5markernames[currentoverallposition]] != '': currentposteriorprobability = float(cy5transformeddata[sample][cy5markernames[currentoverallposition]]) else: currentposteriorprobability = '' else: currentposteriorprobability = calculateposteriorprobability(cy5rightmarkerposition,cy5leftmarkerposition,currentpositiononchromosome,currentoverallposition,cy5markernames,cy5transformeddata[sample],chromosome,markersused) cy5outdata[sample][cy5markernames[currentoverallposition]] = currentposteriorprobability currentpositiononchromosome = currentpositiononchromosome + 1 currentoverallposition = currentoverallposition + 1 os.chdir(outputdirectory) print "Using",outputdirectory,"for Output file" outfile = open(outputdatafile,'w') outfile.write("Chromosome,Position,Marker Name,") if re.search('Both',cy3orcy5) != None: for sample in range(0,len(cy3samplenames)): outfile.write('Cy3 Prior '+cy3samplenames[sample]+',Cy5 Prior '+cy5samplenames[sample]+',Cy3 Posterior '+cy3samplenames[sample]+',Cy5 Posterior '+cy5samplenames[sample]+',Call '+cy3samplenames[sample]+',') if re.search('Cy3',cy3orcy5) != None: for sample in range(0,len(cy3samplenames)): outfile.write('Cy3 Prior '+cy3samplenames[sample]+',Cy3 Posterior '+cy3samplenames[sample]+',Call '+cy3samplenames[sample]+',') if re.search('Cy5',cy3orcy5) != None: for sample in range(0,len(cy5samplenames)): outfile.write('Cy5 Prior '+cy5samplenames[sample]+',Cy5 Posterior '+cy5samplenames[sample]+',Call '+cy5samplenames[sample]+',') outfile.write('\n') currentoverallposition = 0 for chromosome in range(0,len(chromosomenames)): for markerposition in geneticmap[chromosome]: outfile.write(chromosomenames[chromosome]+','+repr(markerposition)+',') if re.search('Cy3',cy3orcy5) != None: outfile.write(cy3markernames[currentoverallposition]+',') for sample in range(0,len(cy3samplenames)): outfile.write(re.sub('\'','',repr(cy3transformeddata[sample][cy3markernames[currentoverallposition]]))+','+re.sub('\'\'','',repr(cy3outdata[sample][cy3markernames[currentoverallposition]]))+',') if cy3outdata[sample][cy3markernames[currentoverallposition]] == '': outfile.write(',') elif float(cy3outdata[sample][cy3markernames[currentoverallposition]]) > 1 - posteriorerrorrate or float(cy3transformeddata[sample][cy3markernames[currentoverallposition]]) > 1 - priorerrorrate: outfile.write('COL,') elif float(cy3outdata[sample][cy3markernames[currentoverallposition]]) < posteriorerrorrate or float(cy3transformeddata[sample][cy3markernames[currentoverallposition]]) < priorerrorrate: outfile.write('LER,') else: outfile.write(',') elif re.search('Cy5',cy3orcy5) != None: outfile.write(cy5markernames[currentoverallposition]+',') for sample in range(0,len(cy5samplenames)): outfile.write(re.sub('\'','',repr(cy5transformeddata[sample][cy5markernames[currentoverallposition]]))+','+re.sub('\'\'','',repr(cy5outdata[sample][cy5markernames[currentoverallposition]]))+',') if cy5outdata[sample][cy5markernames[currentoverallposition]] == '': outfile.write(',') elif float(cy5outdata[sample][cy5markernames[currentoverallposition]]) > 1 - posteriorerrorrate or float(cy5transformeddata[sample][cy5markernames[currentoverallposition]]) > 1 - priorerrorrate: outfile.write('COL,') elif float(cy5outdata[sample][cy5markernames[currentoverallposition]]) < posteriorerrorrate or float(cy5transformeddata[sample][cy5markernames[currentoverallposition]]) < priorerrorrate: outfile.write('LER,') else: outfile.write(',') elif re.search('Both',cy3orcy5) != None: outfile.write(cy3markernames[currentoverallposition]+',') for sample in range(0,len(cy3samplenames)): outfile.write(re.sub('\'','',repr(cy3transformeddata[sample][cy3markernames[currentoverallposition]]))+','+re.sub('\'','',repr(cy5transformeddata[sample][cy5markernames[currentoverallposition]]))+','+re.sub('\'\'','',repr(cy3outdata[sample][cy3markernames[currentoverallposition]]))+','+re.sub('\'\'','',repr(cy5outdata[sample][cy5markernames[currentoverallposition]]))+',') if cy3outdata[sample][cy3markernames[currentoverallposition]] == '' and cy5outdata[sample][cy3markernames[currentoverallposition]] == '': combinedprobability = 0.5 elif cy3outdata[sample][cy3markernames[currentoverallposition]] == '': combinedprobability = float(cy5outdata[sample][cy3markernames[currentoverallposition]]) elif cy5outdata[sample][cy3markernames[currentoverallposition]] == '': combinedprobability = float(cy3outdata[sample][cy3markernames[currentoverallposition]]) else: p1 = float(cy5outdata[sample][cy3markernames[currentoverallposition]]) p2 = float(cy3outdata[sample][cy3markernames[currentoverallposition]]) combinedprobability = p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2)) r1 = float(cy5transformeddata[sample][cy5markernames[currentoverallposition]]) r2 = float(cy3transformeddata[sample][cy3markernames[currentoverallposition]]) combinedpriorprobability = r1 * r2 / (r1 * r2 + (1 - r1) * (1 - r2)) if combinedprobability > 1 - posteriorerrorrate or combinedpriorprobability > 1 - priorerrorrate: outfile.write('COL,') elif combinedprobability < posteriorerrorrate or combinedpriorprobability < priorerrorrate: outfile.write('LER,') else: outfile.write(',') outfile.write('\n') currentoverallposition = currentoverallposition + 1 outfile.close()