Optimal Mismatch Algorithm

This algorithm works by scanning pattern characters from the least frequent one to the most frequent one. Doing so one may hope to have a mismatch most of the times and thus to scan the whole text very quickly. One needs to know the frequencies of each of the character of the alphabet.



									Private Shared _x As Char(), _y As Char()
Private Shared _m As Integer, _n As Integer
Private Shared _adaptedGs As Integer(), _qsBc As Integer(), _frequency As Integer()
Private Shared _pattern As Pattern()

Private Shared Sub OrderPattern(x As Char(), pat As Pattern(), freq As Integer())
	For i As Integer = 0 To x.Length - 1
		Dim ptrn As New Pattern()
		ptrn.Location = i
		ptrn.Character = x(i)
		pat(i) = ptrn
	Next

	QuickSortPattern(pat, 0, x.Length - 1, freq)
End Sub

Private Shared Sub QuickSortPattern(pat As Pattern(), low As Integer, n As Integer, freq As Integer())
	Dim lo As Integer = low
	Dim hi As Integer = n

	If lo >= n Then
		Return
	End If

	Dim mid As Pattern = pat((lo + hi) \ 2)

	While lo < hi
		While lo < hi AndAlso OptimalPatternCompare(pat(lo), mid, freq) < 0
			lo += 1
		End While
		While lo < hi AndAlso OptimalPatternCompare(pat(hi), mid, freq) > 0
			hi -= 1
		End While

		If lo < hi Then
			Dim temp As Pattern = pat(lo)
			pat(lo) = pat(hi)
			pat(hi) = temp
		End If
	End While

	If hi < lo Then
		Dim temp As Integer = hi
		hi = lo
		lo = temp
	End If

	QuickSortPattern(pat, low, lo, freq)
	QuickSortPattern(pat, If(lo = low, lo + 1, lo), n, freq)
End Sub

Private Shared Function OptimalPatternCompare(pat1 As Pattern, pat2 As Pattern, freq As Integer()) As Integer
	Dim fx As Integer = freq(AscW(pat1.Character)) - freq(AscW(pat2.Character))
	Return (If(fx <> 0, (If(fx > 0, 1, -1)), (pat2.Location - pat1.Location)))
End Function

Private Shared Function MatchShift(x As Char(), ploc As Integer, lShift As Integer, pat As Pattern()) As Integer
	Dim i As Integer, j As Integer

	While lShift < x.Length
		i = ploc
		While System.Threading.Interlocked.Decrement(i) >= 0
			j = (pat(i).Location - lShift)
			If j < 0 Then
				Continue While
			End If
			If pat(i).Character <> x(j) Then
				Exit While
			End If
		End While
		If i < 0 Then
			Exit While
		End If
		lShift += 1
	End While

	Return (lShift)
End Function

Private Shared Sub PreAdaptedGs(x As Char(), adaptedGs As Integer(), pat As Pattern())
	Dim lShift As Integer, i As Integer, pLoc As Integer
	lShift = 1
	adaptedGs(0) = lShift

	For pLoc = 1 To x.Length
		lShift = MatchShift(x, pLoc, lShift, pat)
		adaptedGs(pLoc) = lShift
	Next

	For pLoc = 0 To x.Length - 1
		lShift = adaptedGs(pLoc)
		While lShift < x.Length
			i = pat(pLoc).Location - lShift
			If i < 0 OrElse pat(pLoc).Character <> x(i) Then
				Exit While
			End If
			lShift += 1
			lShift = MatchShift(x, pLoc, lShift, pat)
		End While
		adaptedGs(pLoc) = lShift
	Next
End Sub

Private Shared Function CalculateCharFrequency(x As Char(), y As Char(), z As Integer) As Integer()
	Dim i As Integer
	Dim freq As Integer() = New Integer(z - 1) {}
	For i = 0 To x.Length - 1
		freq(AscW(x(i))) += 1
	Next
	For i = 0 To y.Length - 1
		freq(AscW(y(i))) += 1
	Next
	Return freq
End Function

Private Shared Sub PreQsBc(x As Char(), qsBc As Integer())
	Dim i As Integer, m As Integer = x.Length

	For i = 0 To qsBc.Length - 1
		qsBc(i) = m + 1
	Next

	For i = 0 To m - 1
		qsBc(AscW(x(i))) = m - i
	Next
End Sub

Private Shared Sub SetupOptimalSearch()
	OrderPattern(_x, _pattern, _frequency)
	PreQsBc(_x, _qsBc)
	PreAdaptedGs(_x, _adaptedGs, _pattern)
End Sub

Public Shared Sub InitOptimalSearch(pattern As String, source As String)
	_x = pattern.ToCharArray()
	_y = source.ToCharArray()
	_m = _x.Length
	_n = _y.Length
	_adaptedGs = New Integer(_m) {}
	_qsBc = New Integer(65535) {}
	_frequency = CalculateCharFrequency(_x, _y, 65536)
	_pattern = New Pattern(_m - 1) {}
End Sub

Public Shared Function FindAll() As Result
	Dim i As Integer, j As Integer
	Dim result As New List(Of Integer)()
	SetupOptimalSearch()

	j = 0
	Dim jOld As Integer = 0
	While j <= _n - _m
		i = 0
		While i < _m AndAlso _pattern(i).Character = _y(j + _pattern(i).Location)
			i += 1
		End While

		If i >= _m Then
			result.Add(j)
		End If

		jOld = j
		If j < _n - _m Then
			j += Math.Max(_adaptedGs(i), _qsBc(AscW(_y(j + _m))))
		Else
			j += _adaptedGs(i)
		End If
	End While

	Return New Result(jOld, result)
End Function

Public Structure Result
	Public Comp As Integer
	Public Indexes As List(Of Integer)

	Public Sub New(comp As Integer, indexes As List(Of Integer))
		Me.Comp = comp
		Me.Indexes = indexes
	End Sub
End Structure

Public Structure Pattern
	Public Location As Integer
	Public Character As Char
End Structure
								


Example

									Dim source As String = "GCATCGCAGAGAGTATACAGTACG"
Dim pattern As String = "GCAGAGAG"
InitOptimalSearch(pattern, source)
Dim result As Result = FindAll()
								


Output

									result {
	Comp: 14
	Indexes: { 5 }
}