I am following Cormen Leiserson Rivest Stein (clrs) book and came across "kmp algorithm" for string matching. I implemented it using Python (as-is).
However, it doesn't seem to work for some reason. where is my fault?
The code is given below:
def kmp_matcher(t,p):
n=len(t)
m=len(p)
# pi=[0]*n;
pi = compute_prefix_function(p)
q=-1
for i in range(n):
while(q>0 and p[q]!=t[i]):
q=pi[q]
if(p[q]==t[i]):
q=q+1
if(q==m):
print "pattern occurs with shift "+str(i-m)
q=pi[q]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
pi[1]=0
k=0
for q in range(2,m):
while(k>0 and p[k]!=p[q]):
k=pi[k]
if(p[k]==p[q]):
k=k+1
pi[q]=k
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t,p)
This is a class I wrote based on CLRs KMP algorithm, which contains what you are after. Note that only DNA "characters" are accepted here.
class KmpMatcher(object):
def __init__(self, pattern, string, stringName):
self.motif = pattern.upper()
self.seq = string.upper()
self.header = stringName
self.prefix = []
self.validBases = ['A', 'T', 'G', 'C', 'N']
#Matches the motif pattern against itself.
def computePrefix(self):
#Initialize prefix array
self.fillPrefixList()
k = 0
for pos in range(1, len(self.motif)):
#Check valid nt
if(self.motif[pos] not in self.validBases):
self.invalidMotif()
#Unique base in motif
while(k > 0 and self.motif[k] != self.motif[pos]):
k = self.prefix[k]
#repeat in motif
if(self.motif[k] == self.motif[pos]):
k += 1
self.prefix[pos] = k
#Initialize the prefix list and set first element to 0
def fillPrefixList(self):
self.prefix = [None] * len(self.motif)
self.prefix[0] = 0
#An implementation of the Knuth-Morris-Pratt algorithm for linear time string matching
def kmpSearch(self):
#Compute prefix array
self.computePrefix()
#Number of characters matched
match = 0
found = False
for pos in range(0, len(self.seq)):
#Check valid nt
if(self.seq[pos] not in self.validBases):
self.invalidSequence()
#Next character is not a match
while(match > 0 and self.motif[match] != self.seq[pos]):
match = self.prefix[match-1]
#A character match has been found
if(self.motif[match] == self.seq[pos]):
match += 1
#Motif found
if(match == len(self.motif)):
print(self.header)
print("Match found at position: " + str(pos-match+2) + ':' + str(pos+1))
found = True
match = self.prefix[match-1]
if(found == False):
print("Sorry '" + self.motif + "'" + " was not found in " + str(self.header))
#An invalid character in the motif message to the user
def invalidMotif(self):
print("Error: motif contains invalid DNA nucleotides")
exit()
#An invalid character in the sequence message to the user
def invalidSequence(self):
print("Error: " + str(self.header) + "sequence contains invalid DNA nucleotides")
exit()