Homework Information: Some of the problems are probably too long to attempt the night before the due date, so plan accordingly. No late homework will accepted. However, your lowest homework will be dropped. Feel free to work with others, but the work you hand in should be your own.
String Searching
Below I provide python code for implementing both Suffix Trees and Suffix Arrays:
In[0]:
import string
import array
import sys
# The following classes are used to construct
# and represent Suffix trees
class Edge:
count = 0
def __init__(self, dstNode, first, last):
self.dstNode = dstNode
self.first = first
self.last = last
Edge.count += 1
def split(self, suffix, suffixTree):
# add new explicit node
newIndex = len(suffixTree.nodes)
suffixTree.nodes.append(0)
# add new suffix edge
newFirst = self.first + len(suffix)
suffixTree.edgeLookup[newIndex, suffixTree.string[newFirst]] = Edge(self.dstNode, newFirst, self.last)
# shorten this edge
self.last = newFirst - 1
self.dstNode = newIndex
return newIndex
def isLeafEdge(self, suffixTree):
return (self.last == suffixTree.lastIndex)
def __len__(self):
return self.last - self.first + 1
def __repr__(self):
return "Edge(%d, %d, %d)" % (self.dstNode, self.first, self.last)
class Suffix:
def __init__(self, srcNode, first, last):
self.srcNode = srcNode
self.first = first
self.last = last
def is_explicit(self):
return self.first > self.last
def is_implicit(self):
return self.first <= self.last
def canonicalize(self, suffixTree):
if self.is_implicit():
edge = suffixTree.edgeLookup[self.srcNode, suffixTree.string[self.first]]
if (len(edge) <= len(self)):
self.first += len(edge)
self.srcNode = edge.dstNode
self.canonicalize(suffixTree)
def __len__(self):
return self.last - self.first + 1
def __repr__(self):
return "Suffix(%d, %d, %d)" % (self.srcNode, self.first, self.last)
class SuffixTree:
def __init__(self, string):
self.string = string # save a pointer to the string
self.alphabet = set() # alphabet of string
self.nodes = array.array('l') # index of ith node's parent
self.nodes.append(0) # add root node
self.edgeLookup = {} # adjacency list indexed by (srcNode, char)
self.lastIndex = len(string) - 1
activePoint = Suffix(0, 0, -1)
for i in xrange(len(string)):
self.alphabet.add(string[i])
self.addPrefix(i, activePoint)
def addPrefix(self, last, activePoint):
LastParentNode = -1
while True:
ParentNode = activePoint.srcNode
if activePoint.is_explicit():
if (activePoint.srcNode, self.string[last]) in self.edgeLookup:
break
else: #potentially split an implicit node
edge = self.edgeLookup[activePoint.srcNode, self.string[activePoint.first]]
if (self.string[edge.first + len(activePoint)] == self.string[last]):
break
else:
ParentNode = edge.split(activePoint, self)
self.nodes.append(-1)
self.edgeLookup[ParentNode, self.string[last]] = Edge(len(self.nodes)-1, last, self.lastIndex)
# add suffix link
if (LastParentNode > 0):
self.nodes[LastParentNode] = ParentNode
LastParentNode = ParentNode
if (activePoint.srcNode == 0):
activePoint.first += 1
else:
activePoint.srcNode = self.nodes[activePoint.srcNode]
activePoint.canonicalize(self)
if (LastParentNode > 0):
self.nodes[LastParentNode] = ParentNode
activePoint.last += 1
activePoint.canonicalize(self)
def leafIndices(self, nodeIndex=0, lenSoFar=0):
indexList = []
if (self.nodes[nodeIndex] < 0):
indexList.append(self.lastIndex + 1 - lenSoFar)
else:
for char in self.alphabet:
try:
edge = self.edgeLookup[nodeIndex, char]
if edge.isLeafEdge(self):
indexList.append(self.lastIndex + 1 - len(edge) - lenSoFar)
else:
indexList += self.leafIndices(edge.dstNode, lenSoFar + len(edge))
except KeyError:
continue
return indexList
def distinct(self, nodeIndex=0, lenSoFar=0):
distinctList = []
# examine children of node
for char in self.alphabet:
try:
edge = self.edgeLookup[nodeIndex, char]
if edge.isLeafEdge(self):
distinctList.append((edge.first-lenSoFar, lenSoFar+1))
else:
distinctList += self.distinct(edge.dstNode, lenSoFar + len(edge))
except:
continue
return distinctList
def thread(self, target):
nodeIndex = 0 # start from root
i = 0 # characters threaded so far
while (i < len(target)):
try:
edge = self.edgeLookup[nodeIndex, target[i]]
prefix = self.string[edge.first:edge.last+1]
if (target[i:].startswith(prefix) or prefix.startswith(target[i:])):
i += len(prefix)
else:
return []
nodeIndex = edge.dstNode
except KeyError:
return []
return self.leafIndices(nodeIndex, i)
def printTree(self, nodeIndex=0, prefixLength=0, prefix=""):
branches = list()
for c in self.alphabet:
try:
edge = self.edgeLookup[nodeIndex, c]
extent = prefix + "(%d)-%s-" % (nodeIndex, self.string[edge.first:edge.last+1])
subtree = self.printTree(edge.dstNode, prefixLength+len(edge), extent)
if (len(subtree) > 0):
branches.append(subtree)
elif (edge.isLeafEdge(self)):
print extent + "(%d)[%d]" % (edge.dstNode, edge.first-prefixLength)
except KeyError:
pass
return branches
In [3]:
test = "imissmissmississippiskiss$"
suffixTree = SuffixTree(test)
print len(suffixTree.nodes)
suffixTree.printTree()
# a one-line function to compute a Suffix array
def argsort(text):
return sorted(range(len(text)), cmp=lambda i,j: -1 if text[i:] < text[j:] else 1)
def findFirst(pattern, text, sfa):
""" Finds the index of the first occurence of pattern in the suffix array """
hi = len(text)
lo = 0
while (lo < hi):
mid = (lo+hi)//2
if (pattern > text[sfa[mid]:]):
lo = mid + 1
else:
hi = mid
return lo
def findLast(pattern, text, sfa):
""" Finds the index of the last occurence of pattern in the suffix array """
hi = len(text)
lo = 0
m = len(pattern)
while (lo < hi):
mid = (lo+hi)//2
i = sfa[mid]
if (pattern >= text[i:i+m]):
lo = mid + 1
else:
hi = mid
return lo
lo = findFirst("iss", test, sufArray)
hi = findLast("iss", test, sufArray)
print sufArray[lo:hi]
[22, 13, 10, 6, 2]
Problem #1:
Build suffix trees for substrings of Human Chromosome 1 starting from the first non-'N' nucleotide and of lengths 100, 1000, 10,000, 100,000, etc. until either the construction time for the suffix tree exceeds 100 secs, the length of 100,000,000 is reached, or the program crashes. For each such substring report the time required to build the suffix tree and the time to find all "CAT" substrings in the given substring. Also report the number of "CAT"s. (Note: It is likely that you will have to increase Python's recursion limit for larger strings).
Problem #2:
Build suffixarrays for substrings of Human Chromosome 1 starting from the first non-'N' nucleotide and of lengths 100, 1000, 10,000, 100,000, etc. until either the construction time for the suffix array exceeds 100 secs or the length of 100,000,000 is reached. For each such substring report the time required to build the suffix array and the time to find all "CAT" substrings in the given substring. Also report the number of "CAT"s.
Problem #3:
Write a program to extract the suffix array from the suffix tree. Time the combination of suffix tree construction and suffix array extraction for various sizes of text strings. Use a plot to estimate the length of a text where first constructing a suffix tree will be faster than the given algorithm of suffix array construction.