Motif Finding

In [15]:
nucleotideMap = { 'a':0, 'c':1, 'g':2, 't':3 }
reverseMap = { 0:'a', 1:'c', 2:'g', 3:'t' }
In [16]:
ScoreCount = 0

def getAndClearScoreCount():
    global ScoreCount
    rval = ScoreCount
    ScoreCount = 0
    return rval
In [17]:
def Score(s, DNA, l):
    """ compute the consensus SCORE of a given l-base 
        alignment given offsets into each DNA string.
            s = list of starting indices, 1-based, 0 means ignore
            DNA = list of nucleotide strings
            l = Target Motif length"""
    global ScoreCount
    ScoreCount += 1
    score = 0
    for i in xrange(l):
       # loop over string positions
       cnt = [0 for x in xrange(4)]
       for j in xrange(len(s)):
           # loop over DNA strands
           sval = s[j]
           if (sval != 0):
               cnt[nucleotideMap[DNA[j][sval-1+i]]] += 1
       score += max(cnt)
    return score


def Consensus(s, DNA, l):
    """ compute the consensus STRING of a given l-base 
        alignment given offsets into each DNA string.
            s = list of starting indices, 1-based, 0 means ignore
            DNA = list of nucleotide strings
            l = Target Motif length """
    cstring = ''
    for i in xrange(l):
       # loop over string positions
       cnt = [0 for x in xrange(4)]
       for j in xrange(len(s)):
           # loop over DNA strands
           sval = s[j]
           if (sval != 0):
               cnt[nucleotideMap[DNA[j][sval-1+i]]] += 1
       cstring += reverseMap[cnt.index(max(cnt))]
    return cstring
In [18]:
def NextLeaf(a, L, k):
    """ generates all L^k permutations of 
        of the list of integers, "a", when
        initialized with k*[1] """
    for i in reversed(xrange(L)):
        if (a[i] < k):
            a[i] += 1
            break
        else:
            a[i] = 1
    return a


def NextVertex(a, i, L, k):
    """ generates all nodes in a
        search tree with L^k leafs """
    if (i < L):
        a[i] = 1
        return (a, i+1)
    else:
        for j in reversed(xrange(L)):
            if (a[j] < k):
                a[j] += 1
                return (a, j+1)
            a[j] = 0
    return (a, 0)


def Bypass(a, i, L, k):
    """ ignore the children of an interior node beyond i generations """
    for j in reversed(xrange(i)):
        if (a[j] < k):
            a[j] += 1
            return (a, j+1)
        a[j] = 0
    return (a, 0)

Brute Force search for the l-mer with the highest score

Exhaustively enumerate every possible position for the motif.

In [22]:
def BruteForceMotifSearchAgain(DNA,t,n,l):
    """ searches all possible alignments of 't'
        DNA fragments (of length 'n') for the
        'l'-mer with the maximum consensus score """
    s = [1 for i in xrange(t)]
    bestScore = Score(s, DNA, l)
    bestMotif = [x for x in s]
    while (True):
        s = NextLeaf(s,t,n-l+1)
        if (sum(s) == t):
            break
        score = Score(s, DNA, l)
        if (score > bestScore):
            print s, " = ", score
            bestScore = score
            bestMotif = [x for x in s]
    return bestMotif


Second version

A rewrite of the brute force method from above, but organized to allow for branching

In [23]:
def SimpleMotifSearch(DNA,t,n,l):
    """ traverses the full search tree of all possible alignments of 't' DNA
        fragments of length 'n' for the 'l'-mer with the maximum consensus score """
    s = [0 for i in xrange(t)]
    bestScore = 0
    i = 0
    while (i > 0 or bestScore == 0):
        if (i < t):
            s, i = NextVertex(s,i,t,n-l+1)
        else:
            score = Score(s, DNA, l)
            if (score > bestScore):
                print s, " = ", score
                bestScore = score
                bestMotif = [x for x in s]
            s, i = NextVertex(s,i,t,n-l+1)
    return bestMotif

Third Version

This rewrite of version 2 allows for pruning interior trees.

In [24]:
def BranchAndBoundMotifSearch(DNA,t,n,l):
    # traverses a search tree of alignments
    # of 't' DNA fragments (of length 'n')
    # while pruning hopless excursions in
    # search for the 'l'-mer with the
    # maximum consensus score
    s = [0 for i in xrange(t)]
    bestScore = 0
    i = 0
    while (i > 0 or bestScore == 0):
        if (i < t):
            optimisticScore = Score(s, DNA, l) + (t-i)*l
            if (optimisticScore < bestScore):
                s, i = Bypass(s,i,t,n-l+1)
            else:
                s, i = NextVertex(s,i,t,n-l+1)
        else:
            score = Score(s, DNA, l)
            if (score > bestScore):
                print s, " = ", score
                bestScore = score
                bestMotif = [x for x in s]
            s, i = NextVertex(s,i,t,n-l+1)
    return bestMotif
In [25]:
# Example from notes
DNA = ["cctgatagacgctatctggctatccaggtacttaggtcctctgtgcgaatctatgcgtttccaaccat",
       "agtactggtgtacatttgatccatacgtacaccggcaacctgaaacaaacgctcagaaccagaagtgc",
       "aaacgttagtgcaccctctttcttcgtggctctggccaacgagggctgatgtataagacgaaaatttt",
       "agcctccgatgtaagtcatagctgtaactattacctgccacccctattacatcttacgtccatataca",
       "ctgttatacaacgcgtcatggcggggtatgcgttttggtcgtcgtacgctcgatcgttaccgtacggc"]
In [26]:
print "Branch-and-Bound Search = ",
s = BranchAndBoundMotifSearch(DNA,5,68,8)
print s, "=", Consensus(s,DNA,8)
print "iterations =", getAndClearScoreCount()
Branch-and-Bound Search =  [1, 1, 1, 1, 1]  =  19
[1, 1, 1, 1, 2]  =  22
[1, 1, 1, 1, 14]  =  24
[1, 1, 1, 14, 2]  =  26
[1, 1, 1, 14, 14]  =  27
[1, 1, 2, 14, 2]  =  28
[1, 1, 4, 14, 14]  =  29
[1, 1, 22, 14, 14]  =  30
[2, 5, 46, 4, 1]  =  31
[2, 5, 46, 6, 1]  =  34
[2, 5, 46, 6, 1] = ctgatgta
iterations = 431698

In [27]:
print "Simple Brute Force:"
s = SimpleMotifSearch(DNA,5,16,8)
print s, "=", Consensus(s,DNA,8)
print "iterations =", getAndClearScoreCount()
Simple Brute Force:
[1, 1, 1, 1, 1]  =  19
[1, 1, 1, 1, 2]  =  22
[1, 1, 2, 1, 2]  =  24
[1, 1, 2, 8, 2]  =  25
[1, 1, 4, 7, 2]  =  27
[2, 2, 3, 9, 1]  =  28
[2, 5, 3, 6, 1]  =  29
[2, 5, 6, 6, 1]  =  30
[2, 5, 6, 6, 1] = ctgatgta
iterations = 59049

In [28]:
print "Branch-and-Bound Search:"
s = BranchAndBoundMotifSearch(DNA,5,16,8)
print s, "=", Consensus(s,DNA,8)
print "iterations =", getAndClearScoreCount()
Branch-and-Bound Search:
[1, 1, 1, 1, 1]  =  19
[1, 1, 1, 1, 2]  =  22
[1, 1, 2, 1, 2]  =  24
[1, 1, 2, 8, 2]  =  25
[1, 1, 4, 7, 2]  =  27
[2, 2, 3, 9, 1]  =  28
[2, 5, 3, 6, 1]  =  29
[2, 5, 6, 6, 1]  =  30
[2, 5, 6, 6, 1] = ctgatgta
iterations = 6274

How to enumerate all l-mers?

In [13]:
def lmers(l):
    s = [1 for i in xrange(l)]
    while (True):
        motif = "".join(["*acgt"[i] for i in s])
        print motif
        # test l-mer here
        s = NextLeaf(s,l,4)
        if (sum(s) == l):
            break
In [12]:
lmers(3)
aaa
aac
aag
aat
aca
acc
acg
act
aga
agc
agg
agt
ata
atc
atg
att
caa
cac
cag
cat
cca
ccc
ccg
cct
cga
cgc
cgg
cgt
cta
ctc
ctg
ctt
gaa
gac
gag
gat
gca
gcc
gcg
gct
gga
ggc
ggg
ggt
gta
gtc
gtg
gtt
taa
tac
tag
tat
tca
tcc
tcg
tct
tga
tgc
tgg
tgt
tta
ttc
ttg
ttt

In []: