Optimal Mismatch Algorithm
This algorithm works by scanning pattern characters from the least frequent one to the most frequent one. Doing so one may hope to have a mismatch most of the times and thus to scan the whole text very quickly. One needs to know the frequencies of each of the character of the alphabet.
Private Shared _x As Char(), _y As Char()
Private Shared _m As Integer, _n As Integer
Private Shared _adaptedGs As Integer(), _qsBc As Integer(), _frequency As Integer()
Private Shared _pattern As Pattern()
Private Shared Sub OrderPattern(x As Char(), pat As Pattern(), freq As Integer())
For i As Integer = 0 To x.Length - 1
Dim ptrn As New Pattern()
ptrn.Location = i
ptrn.Character = x(i)
pat(i) = ptrn
Next
QuickSortPattern(pat, 0, x.Length - 1, freq)
End Sub
Private Shared Sub QuickSortPattern(pat As Pattern(), low As Integer, n As Integer, freq As Integer())
Dim lo As Integer = low
Dim hi As Integer = n
If lo >= n Then
Return
End If
Dim mid As Pattern = pat((lo + hi) \ 2)
While lo < hi
While lo < hi AndAlso OptimalPatternCompare(pat(lo), mid, freq) < 0
lo += 1
End While
While lo < hi AndAlso OptimalPatternCompare(pat(hi), mid, freq) > 0
hi -= 1
End While
If lo < hi Then
Dim temp As Pattern = pat(lo)
pat(lo) = pat(hi)
pat(hi) = temp
End If
End While
If hi < lo Then
Dim temp As Integer = hi
hi = lo
lo = temp
End If
QuickSortPattern(pat, low, lo, freq)
QuickSortPattern(pat, If(lo = low, lo + 1, lo), n, freq)
End Sub
Private Shared Function OptimalPatternCompare(pat1 As Pattern, pat2 As Pattern, freq As Integer()) As Integer
Dim fx As Integer = freq(AscW(pat1.Character)) - freq(AscW(pat2.Character))
Return (If(fx <> 0, (If(fx > 0, 1, -1)), (pat2.Location - pat1.Location)))
End Function
Private Shared Function MatchShift(x As Char(), ploc As Integer, lShift As Integer, pat As Pattern()) As Integer
Dim i As Integer, j As Integer
While lShift < x.Length
i = ploc
While System.Threading.Interlocked.Decrement(i) >= 0
j = (pat(i).Location - lShift)
If j < 0 Then
Continue While
End If
If pat(i).Character <> x(j) Then
Exit While
End If
End While
If i < 0 Then
Exit While
End If
lShift += 1
End While
Return (lShift)
End Function
Private Shared Sub PreAdaptedGs(x As Char(), adaptedGs As Integer(), pat As Pattern())
Dim lShift As Integer, i As Integer, pLoc As Integer
lShift = 1
adaptedGs(0) = lShift
For pLoc = 1 To x.Length
lShift = MatchShift(x, pLoc, lShift, pat)
adaptedGs(pLoc) = lShift
Next
For pLoc = 0 To x.Length - 1
lShift = adaptedGs(pLoc)
While lShift < x.Length
i = pat(pLoc).Location - lShift
If i < 0 OrElse pat(pLoc).Character <> x(i) Then
Exit While
End If
lShift += 1
lShift = MatchShift(x, pLoc, lShift, pat)
End While
adaptedGs(pLoc) = lShift
Next
End Sub
Private Shared Function CalculateCharFrequency(x As Char(), y As Char(), z As Integer) As Integer()
Dim i As Integer
Dim freq As Integer() = New Integer(z - 1) {}
For i = 0 To x.Length - 1
freq(AscW(x(i))) += 1
Next
For i = 0 To y.Length - 1
freq(AscW(y(i))) += 1
Next
Return freq
End Function
Private Shared Sub PreQsBc(x As Char(), qsBc As Integer())
Dim i As Integer, m As Integer = x.Length
For i = 0 To qsBc.Length - 1
qsBc(i) = m + 1
Next
For i = 0 To m - 1
qsBc(AscW(x(i))) = m - i
Next
End Sub
Private Shared Sub SetupOptimalSearch()
OrderPattern(_x, _pattern, _frequency)
PreQsBc(_x, _qsBc)
PreAdaptedGs(_x, _adaptedGs, _pattern)
End Sub
Public Shared Sub InitOptimalSearch(pattern As String, source As String)
_x = pattern.ToCharArray()
_y = source.ToCharArray()
_m = _x.Length
_n = _y.Length
_adaptedGs = New Integer(_m) {}
_qsBc = New Integer(65535) {}
_frequency = CalculateCharFrequency(_x, _y, 65536)
_pattern = New Pattern(_m - 1) {}
End Sub
Public Shared Function FindAll() As Result
Dim i As Integer, j As Integer
Dim result As New List(Of Integer)()
SetupOptimalSearch()
j = 0
Dim jOld As Integer = 0
While j <= _n - _m
i = 0
While i < _m AndAlso _pattern(i).Character = _y(j + _pattern(i).Location)
i += 1
End While
If i >= _m Then
result.Add(j)
End If
jOld = j
If j < _n - _m Then
j += Math.Max(_adaptedGs(i), _qsBc(AscW(_y(j + _m))))
Else
j += _adaptedGs(i)
End If
End While
Return New Result(jOld, result)
End Function
Public Structure Result
Public Comp As Integer
Public Indexes As List(Of Integer)
Public Sub New(comp As Integer, indexes As List(Of Integer))
Me.Comp = comp
Me.Indexes = indexes
End Sub
End Structure
Public Structure Pattern
Public Location As Integer
Public Character As Char
End Structure
Example
Dim source As String = "GCATCGCAGAGAGTATACAGTACG"
Dim pattern As String = "GCAGAGAG"
InitOptimalSearch(pattern, source)
Dim result As Result = FindAll()
Output
result {
Comp: 14
Indexes: { 5 }
}