Compgen Tool Suite

Problem Set #3

mcmillan / Version 14

Comp 555: BioAlgorithms -- Spring 2015

Problem Set #3

Issued: 4/10/2015 Due: In class 4/20/2015

Homework Information: Some of the problems are probably too long to attempt the night before the due date, so plan accordingly. No late homework will accepted. However, your lowest homework will be dropped. Feel free to work with others, but the work you hand in should be your own.

String Searching

Below I provide python code for implementing both Suffix Trees and Suffix Arrays:


In[0]:import string
import array
import sys

# The following classes are used to construct 
# and represent Suffix trees

class Edge:
    count = 0
    def __init__(self, dstNode, first, last):
        self.dstNode = dstNode
        self.first = first
        self.last = last
        Edge.count += 1
    def split(self, suffix, suffixTree):
        # add new explicit node
        newIndex = len(suffixTree.nodes)
        suffixTree.nodes.append(0)
        # add new suffix edge
        newFirst = self.first + len(suffix)
        suffixTree.edgeLookup[newIndex, suffixTree.string[newFirst]] = Edge(self.dstNode, newFirst, self.last)
        # shorten this edge
        self.last = newFirst - 1
        self.dstNode = newIndex
        return newIndex
    def isLeafEdge(self, suffixTree):
        return (self.last == suffixTree.lastIndex)
    def __len__(self):
        return self.last - self.first + 1
    def __repr__(self):
        return "Edge(%d, %d, %d)" % (self.dstNode, self.first, self.last)

class Suffix:
    def __init__(self, srcNode, first, last):
        self.srcNode = srcNode
        self.first = first
        self.last = last
    def is_explicit(self):
        return self.first > self.last
    def is_implicit(self):
        return self.first <= self.last
    def canonicalize(self, suffixTree):
        if self.is_implicit():
            edge = suffixTree.edgeLookup[self.srcNode, suffixTree.string[self.first]]
            if (len(edge) <= len(self)):
                self.first += len(edge)
                self.srcNode = edge.dstNode
                self.canonicalize(suffixTree)
    def __len__(self):
        return self.last - self.first + 1
    def __repr__(self):
        return "Suffix(%d, %d, %d)" % (self.srcNode, self.first, self.last)

class SuffixTree:
    def __init__(self, string):
        self.string = string            # save a pointer to the string
        self.alphabet = set()           # alphabet of string
        self.nodes = array.array('l')   # index of ith node's parent
        self.nodes.append(0)            # add root node
        self.edgeLookup = {}            # adjacency list indexed by (srcNode, char)
        self.lastIndex = len(string) - 1
        activePoint = Suffix(0, 0, -1)
        for i in xrange(len(string)):
            self.alphabet.add(string[i])
            self.addPrefix(i, activePoint)
    def addPrefix(self, last, activePoint):
        LastParentNode = -1
        while True:
            ParentNode = activePoint.srcNode
            if activePoint.is_explicit():
                if (activePoint.srcNode, self.string[last]) in self.edgeLookup:
                    break
            else:               #potentially split an implicit node
                edge = self.edgeLookup[activePoint.srcNode, self.string[activePoint.first]]
                if (self.string[edge.first + len(activePoint)] == self.string[last]):
                    break
                else:
                    ParentNode = edge.split(activePoint, self)
            self.nodes.append(-1)
            self.edgeLookup[ParentNode, self.string[last]] = Edge(len(self.nodes)-1, last, self.lastIndex)
            # add suffix link
            if (LastParentNode > 0):
                self.nodes[LastParentNode] = ParentNode
            LastParentNode = ParentNode
            if (activePoint.srcNode == 0):
                activePoint.first += 1
            else:
                activePoint.srcNode = self.nodes[activePoint.srcNode]
            activePoint.canonicalize(self)
        if (LastParentNode > 0):
            self.nodes[LastParentNode] = ParentNode
        activePoint.last += 1
        activePoint.canonicalize(self)
    def leafIndices(self, nodeIndex=0, lenSoFar=0):
        indexList = []
        if (self.nodes[nodeIndex] < 0):
            indexList.append(self.lastIndex + 1 - lenSoFar)
        else:
            for char in self.alphabet:
                try:
                    edge = self.edgeLookup[nodeIndex, char]
                    if edge.isLeafEdge(self):
                        indexList.append(self.lastIndex + 1 - len(edge) - lenSoFar)
                    else:
                        indexList += self.leafIndices(edge.dstNode, lenSoFar + len(edge))
                except KeyError:
                    continue
        return indexList
    def distinct(self, nodeIndex=0, lenSoFar=0):
        distinctList = []
        # examine children of node
        for char in self.alphabet:
            try:
                edge = self.edgeLookup[nodeIndex, char]
                if edge.isLeafEdge(self):
                    distinctList.append((edge.first-lenSoFar, lenSoFar+1))
                else:
                    distinctList += self.distinct(edge.dstNode, lenSoFar + len(edge))
            except:
                continue
        return distinctList
    def thread(self, target):
        nodeIndex = 0           # start from root
        i = 0                   # characters threaded so far
        while (i < len(target)):
            try:
                edge = self.edgeLookup[nodeIndex, target[i]]
                prefix = self.string[edge.first:edge.last+1]
                if (target[i:].startswith(prefix) or prefix.startswith(target[i:])):
                    i += len(prefix)
                else:
                    return []
                nodeIndex = edge.dstNode
            except KeyError:
                return []
        return self.leafIndices(nodeIndex, i)
    def printTree(self, nodeIndex=0, prefixLength=0, prefix=""):
        branches = list()
        for c in self.alphabet:
            try:
                edge = self.edgeLookup[nodeIndex, c]
                extent = prefix + "(%d)-%s-" % (nodeIndex, self.string[edge.first:edge.last+1])
                subtree = self.printTree(edge.dstNode, prefixLength+len(edge), extent)
                if (len(subtree) > 0):
                    branches.append(subtree)
                elif (edge.isLeafEdge(self)):
                    print extent + "(%d)[%d]" % (edge.dstNode, edge.first-prefixLength)
            except KeyError:
                pass
        return branches

In [3]:test = "imissmissmississippiskiss$"
suffixTree = SuffixTree(test)
print len(suffixTree.nodes)
suffixTree.printTree()


41
(0)-$-(40)[25]
(0)-i-(3)-missmissmississippiskiss$-(1)[0]
(0)-i-(3)-ppiskiss$-(29)[16]
(0)-i-(3)-s-(33)-kiss$-(34)[19]
(0)-i-(3)-s-(33)-s-(18)-$-(37)[22]
(0)-i-(3)-s-(33)-s-(18)-i-(23)-ppiskiss$-(24)[13]
(0)-i-(3)-s-(33)-s-(18)-i-(23)-ssippiskiss$-(19)[10]
(0)-i-(3)-s-(33)-s-(18)-miss-(10)-issippiskiss$-(11)[6]
(0)-i-(3)-s-(33)-s-(18)-miss-(10)-mississippiskiss$-(4)[2]
(0)-kiss$-(36)[21]
(0)-miss-(16)-issippiskiss$-(17)[9]
(0)-miss-(16)-miss-(8)-issippiskiss$-(9)[5]
(0)-miss-(16)-miss-(8)-mississippiskiss$-(2)[1]
(0)-p-(31)-iskiss$-(32)[18]
(0)-p-(31)-piskiss$-(30)[17]
(0)-s-(6)-$-(39)[24]
(0)-s-(6)-i-(27)-ppiskiss$-(28)[15]
(0)-s-(6)-i-(27)-ssippiskiss$-(22)[12]
(0)-s-(6)-kiss$-(35)[20]
(0)-s-(6)-miss-(14)-issippiskiss$-(15)[8]
(0)-s-(6)-miss-(14)-mississippiskiss$-(7)[4]
(0)-s-(6)-s-(20)-$-(38)[23]
(0)-s-(6)-s-(20)-i-(25)-ppiskiss$-(26)[14]
(0)-s-(6)-s-(20)-i-(25)-ssippiskiss$-(21)[11]
(0)-s-(6)-s-(20)-miss-(12)-issippiskiss$-(13)[7]
(0)-s-(6)-s-(20)-miss-(12)-mississippiskiss$-(5)[3]

In [4]:print suffixTree.thread("iss")


[22, 13, 10, 6, 2]

In [5]:# a one-line function to compute a Suffix array
def argsort(text):
    return sorted(range(len(text)), cmp=lambda i,j: -1 if text[i:] < text[j:] else 1)

def findFirst(pattern, text, sfa):
    """ Finds the index of the first occurence of pattern in the suffix array """
    hi = len(text)
    lo = 0
    while (lo < hi):
        mid = (lo+hi)//2
        if (pattern > text[sfa[mid]:]):
            lo = mid + 1
        else:
            hi = mid
    return lo

def findLast(pattern, text, sfa):
    """ Finds the index of the last occurence of pattern in the suffix array """
    hi = len(text)
    lo = 0
    m = len(pattern)
    while (lo < hi):
        mid = (lo+hi)//2
        i = sfa[mid]
        if (pattern >= text[i:i+m]):
            lo = mid + 1
        else:
            hi = mid
    return lo

In [5]:sufArray = argsort(test)
print sufArray


[25, 0, 16, 19, 22, 13, 10, 6, 2, 21, 9, 5, 1, 18, 17, 24, 15, 12, 20, 8, 4, 23, 14, 11, 7, 3]

In [6]:lo = findFirst("iss", test, sufArray)
hi = findLast("iss", test, sufArray)
print sufArray[lo:hi]


[22, 13, 10, 6, 2]

Problem #1:

Build suffix trees for substrings of Human Chromosome 1 starting from the first non-'N' nucleotide and of lengths 100, 1000, 10,000, 100,000, etc. until either the construction time for the suffix tree exceeds 100 secs, the length of 100,000,000 is reached, or the program crashes. For each such substring report the time required to build the suffix tree and the time to find all "CAT" substrings in the given substring. Also report the number of "CAT"s. (Note: It is likely that you will have to increase Python's recursion limit for larger strings).

Problem #2:

Build suffixarrays for substrings of Human Chromosome 1 starting from the first non-'N' nucleotide and of lengths 100, 1000, 10,000, 100,000, etc. until either the construction time for the suffix array exceeds 100 secs or the length of 100,000,000 is reached. For each such substring report the time required to build the suffix array and the time to find all "CAT" substrings in the given substring. Also report the number of "CAT"s.

Problem #3:

Write a program to extract the suffix array from the suffix tree. Time the combination of suffix tree construction and suffix array extraction for various sizes of text strings. Use a plot to estimate the length of a text where first constructing a suffix tree will be faster than the given algorithm of suffix array construction.

Submit your solutions, in the form of an iPython Notebook, here.