# a one-line function to compute a Suffix array
def argsort(text):
return sorted(range(len(text)), cmp=lambda i,j: -1 if text[i:] < text[j:] else 1)
text = "mississippi$"
suffixArray = argsort(text)
for i, j in enumerate(suffixArray):
print i, text[j:]
def findFirst(pattern, text, sfa):
""" Finds the index of the first occurrence of pattern in the suffix array """
hi = len(text)
lo = 0
while (lo < hi):
mid = (lo+hi)//2
if (pattern > text[sfa[mid]:]):
lo = mid + 1
else:
hi = mid
return lo
def findLast(pattern, text, sfa):
""" Finds the index of the last occurrence of pattern in the suffix array """
hi = len(text)
lo = 0
m = len(pattern)
while (lo < hi):
mid = (lo+hi)//2
i = sfa[mid]
if (pattern >= text[i:i+m]):
lo = mid + 1
else:
hi = mid
return lo
target = "is"
start = findFirst(target, text, suffixArray)
end = findLast(target, text, suffixArray)
print str((start, end))
for i in xrange(start,end):
print i, text[suffixArray[i]:]
mGene = open("MouseGene.seq", 'r').read()
print mGene
%timeit argsort(mGene)
suffixArray = argsort(mGene)
target = "CAT"
start = findFirst(target, mGene, suffixArray)
end = findLast(target, mGene, suffixArray)
print str((start, end))
for i in xrange(start,end):
print "%6d %s" % (suffixArray[i], mGene[suffixArray[i]:min(len(suffixArray), suffixArray[i]+20)])
def BWT(s):
# create a table, with rows of all possible rotations of s
rotation = [s[i:] + s[:i] for i in xrange(len(s))]
# sort rows alphabetically
rotation.sort()
# return (last column of the table)
return "".join([r[-1] for r in rotation])
def inverseBWT(s):
# initialize table from s
table = [c for c in s]
# repeat length(s) - 1 times
for j in xrange(len(s)-1):
# sort rows of the table alphabetically
table.sort()
# insert s as the first column
table = [s[i]+table[i] for i in xrange(len(s))]
# return (row that ends with the 'EOS' character)
return table[[r[-1] for r in table].index('$')]
bwt = BWT("mississippi$")
print bwt
print inverseBWT(bwt)