The following functions are from last time:
def GetAlleles(filename):
""" Read the alleles in filename """
allele = dict()
with open(filename, 'rb') as tsvfile:
for row in tsvfile:
line = row[:row.find('#')].strip(' \r\n') # strip comments and line endings
if (len(line) == 0): # skip empty lines
continue
field = line.split('\t')
if (len(field) != 4): # print error for lines missing fields
print "ERROR: %s" % str(field)
continue
allele[field[0]] = field[3]
return allele
def GetCoordinates(filename):
""" Read the chromosome and positions of alleles in filename """
chrPos = dict()
with open(filename, 'rb') as tsvfile:
for row in tsvfile:
line = row[:row.find('#')].strip(' \r\n') # strip comments and line endings
if (len(line) == 0): # skip empty lines
continue
field = line.split('\t')
if (len(field) != 4): # print error for lines missing fields
print "ERROR: %s" % str(field)
continue
chrPos[field[0]] = (field[1], int(field[2]))
return chrPos
This is the phasing routine from last time.
alleles = dict()
for sample in "ABCDEFGHIQ":
alleles[sample] = GetAlleles("genome%s.txt" % sample)
coordinates = GetCoordinates("genomeG.txt")
The following code sorts the alleles by their genomic position
chromo = [str(i) for i in xrange(1,23)] + ['X', 'Y', 'MT']
def compareCoords(a1, a2):
a1Chr, a1Pos = coordinates[a1]
a2Chr, a2Pos = coordinates[a2]
a1Index = chromo.index(a1Chr)
a2Index = chromo.index(a2Chr)
if (a1Index == a2Index):
return a1Pos - a2Pos
else:
return a1Index - a2Index
orderedAlleles = sorted(coordinates.keys(), cmp=compareCoords)
A first attempt at a function to compute the relatedness of two samples
related = {
('A', 'A'): 1, ('A', 'AA'): 1, ('AA', 'A'): 1, ('AA', 'AA'): 2,
('C', 'C'): 1, ('C', 'CC'): 1, ('CC', 'C'): 1, ('CC', 'CC'): 2,
('G', 'G'): 1, ('G', 'GG'): 1, ('GG', 'G'): 1, ('GG', 'GG'): 2,
('T', 'T'): 1, ('T', 'TT'): 1, ('TT', 'T'): 1, ('TT', 'TT'): 2,
('I', 'I'): 1, ('I', 'II'): 1, ('II', 'I'): 1, ('II', 'II'): 2,
('D', 'D'): 1, ('D', 'DD'): 1, ('DD', 'D'): 1, ('DD', 'DD'): 2,
('AA', 'AC'): 1, ('AA', 'AG'): 1, ('AA', 'AT'): 1,
('AC', 'AA'): 1, ('AC', 'AC'): 2, ('AC', 'AG'): 1, ('AC', 'AT'): 1,
('AC', 'CC'): 1, ('AC', 'CG'): 1, ('AC', 'CT'): 1,
('AG', 'AA'): 1, ('AG', 'AC'): 1, ('AG', 'AG'): 2, ('AG', 'AT'): 1,
('AG', 'CG'): 1, ('AG', 'GG'): 1, ('AG', 'GT'): 1,
('AT', 'AA'): 1, ('AT', 'AC'): 1, ('AT', 'AG'): 1, ('AT', 'AT'): 2,
('AT', 'CT'): 1, ('AT', 'GT'): 1, ('AT', 'TT'): 1,
('CC', 'AC'): 1, ('CC', 'CG'): 1, ('CC', 'CT'): 1,
('CG', 'AC'): 1, ('CG', 'CC'): 1, ('CG', 'CG'): 2, ('CG', 'CT'): 1,
('CG', 'AG'): 1, ('CG', 'GG'): 1, ('CG', 'GT'): 1,
('CT', 'CC'): 1, ('CT', 'CG'): 1, ('CT', 'CT'): 2,
('CT', 'AT'): 1, ('CT', 'GT'): 1, ('CT', 'TT'): 1,
('GG', 'AG'): 1, ('GG', 'CG'): 1, ('GG', 'GT'): 1,
('GT', 'AT'): 1, ('GT', 'CT'): 1, ('GT', 'GT'): 2, ('GT', 'TT'): 1,
('GT', 'AG'): 1, ('GT', 'CG'): 1, ('GT', 'GG'): 1,
('TT', 'AT'): 1, ('TT', 'CT'): 1, ('TT', 'GT'): 1,
('DD', 'DI'): 1, ('DI', 'DD'): 1, ('DI', 'DI'): 2,
('DI', 'II'): 1, ('II', 'DI'): 1,
('AC', 'A'): 1, ('AC', 'C'): 1, ('AG', 'A'): 1, ('AG', 'G'): 1,
('AT', 'A'): 1, ('AT', 'T'): 1, ('CG', 'C'): 1, ('CG', 'G'): 1,
('CT', 'C'): 1, ('CT', 'T'): 1, ('GT', 'G'): 1, ('GT', 'T'): 1,
('A', 'AC'): 1, ('C', 'AC'): 1, ('A', 'AG'): 1, ('G', 'AG'): 1,
('A', 'AT'): 1, ('T', 'AT'): 1, ('C', 'CG'): 1, ('G', 'CG'): 1,
('C', 'CT'): 1, ('T', 'CT'): 1, ('G', 'GT'): 1, ('T', 'GT'): 1,
}
def relatedness(alleles1, alleles2):
N, R, B = 0, 0, 0 # counts
for gId in orderedAlleles:
try:
call = alleles1[gId]
other = alleles2[gId]
except:
continue
try:
R += related[call, other]
except KeyError:
if ('-' not in call) and ('-' not in other):
B += 1
continue
N += min(len(call), len(other))
return R, N, B
relResults = {}
sample = "ABCDEFGHIQ"
for i in xrange(len(sample)-1):
for j in xrange(i+1, len(sample)):
print sample[i]+sample[j],
R, N, B = relatedness(alleles[sample[i]], alleles[sample[j]])
print "%8d %8d %6d %5.2f" % (R, N, B, (100.0*R)/N)
relResults[sample[i]+sample[j]] = B
expResult = {
'BC': 0.5, 'BD': 0.5, 'CD': 0.5, 'HI': 0.5, # siblings
'DG': 0.5, 'EG': 0.5, # parents
'FH': 0.5, 'FI': 0.5, 'GH': 0.5, 'GI': 0.5, # parents
'DH': 0.25,'DI': 0.25,'EH': 0.25,'EI': 0.25, # grandparents
'AB': 0.125, 'AC': 0.125, 'AD': 0.125, # 1st cousin once removed
'AG': 0.0625, # 1st cousin twice removed
'AH': 0.03125, 'AI': 0.03125 # 1st cousin thrice removed
}
%matplotlib inline
import matplotlib.pyplot as plot
import numpyx = [relResults[key] for key in expResult.iterkeys()]
x = [relResults[key] for key in expResult.iterkeys()]
y = [expResult[key] for key in expResult.iterkeys()]
plot.scatter(x, y)
A = numpy.vstack([x, numpy.ones(len(x))]).T
m, c = numpy.linalg.lstsq(A,y)[0]
t = numpy.arange(0, 60000.0, 2500.0)
plot.plot(t, m*t+c, 'r:')
print m, c
for key, value in sorted(relResults.iteritems()):
print key, 100*(m*value + c)
A better model might be one of the form: R = 1/(k x + 1)
which can be rewritten as:
R*x k + R = 1
R*x k = 1 - R
A = [relResults[key]*expResult[key] for key in expResult.iterkeys()]
y2 = [1.0 - expResult[key] for key in expResult.iterkeys()]
A = numpy.vstack([A]).T
k = numpy.linalg.lstsq(A,y)[0][0]
print m
plot.scatter(x, y)
plot.plot(t, 1.0/(k*t + 1), 'r:')
plot.plot(t, 1.0/(2.0*k*t + 1), 'g:')
plot.plot(t, 1.0/(0.5*k*t + 1), 'b:')
Did I do something wrong?