Optimal Mismatch Algorithm
This algorithm works by scanning pattern characters from the least frequent one to the most frequent one. Doing so one may hope to have a mismatch most of the times and thus to scan the whole text very quickly. One needs to know the frequencies of each of the character of the alphabet.
private static char[] _x, _y;
private static int _m, _n;
private static int[] _adaptedGs, _qsBc, _frequency;
private static Pattern[] _pattern;
private static void OrderPattern(char[] x, Pattern[] pat, int[] freq)
{
for (int i = 0; i < x.Length; ++i)
{
Pattern ptrn = new Pattern();
ptrn.Location = i;
ptrn.Character = x[i];
pat[i] = ptrn;
}
QuickSortPattern(pat, 0, x.Length - 1, freq);
}
private static void QuickSortPattern(Pattern[] pat, int low, int n, int[] freq)
{
int lo = low;
int hi = n;
if (lo >= n) return;
Pattern mid = pat[(lo + hi) / 2];
while (lo < hi)
{
while (lo < hi && OptimalPatternCompare(pat[lo], mid, freq) < 0) lo++;
while (lo < hi && OptimalPatternCompare(pat[hi], mid, freq) > 0) hi--;
if (lo < hi)
{
Pattern temp = pat[lo];
pat[lo] = pat[hi];
pat[hi] = temp;
}
}
if (hi < lo)
{
int temp = hi;
hi = lo;
lo = temp;
}
QuickSortPattern(pat, low, lo, freq);
QuickSortPattern(pat, lo == low ? lo + 1 : lo, n, freq);
}
private static int OptimalPatternCompare(Pattern pat1, Pattern pat2, int[] freq)
{
int fx = freq[pat1.Character] - freq[pat2.Character];
return (fx != 0 ? (fx > 0 ? 1 : -1) : (pat2.Location - pat1.Location));
}
private static int MatchShift(char[] x, int ploc, int lShift, Pattern[] pat)
{
int i, j;
for (; lShift < x.Length; ++lShift)
{
i = ploc;
while (--i >= 0)
{
if ((j = (pat[i].Location - lShift)) < 0) continue;
if (pat[i].Character != x[j]) break;
}
if (i < 0) break;
}
return (lShift);
}
private static void PreAdaptedGs(char[] x, int[] adaptedGs, Pattern[] pat)
{
int lShift, i, pLoc;
adaptedGs[0] = lShift = 1;
for (pLoc = 1; pLoc <= x.Length; ++pLoc)
{
lShift = MatchShift(x, pLoc, lShift, pat);
adaptedGs[pLoc] = lShift;
}
for (pLoc = 0; pLoc < x.Length; ++pLoc)
{
lShift = adaptedGs[pLoc];
while (lShift < x.Length)
{
i = pat[pLoc].Location - lShift;
if (i < 0 || pat[pLoc].Character != x[i]) break;
++lShift;
lShift = MatchShift(x, pLoc, lShift, pat);
}
adaptedGs[pLoc] = lShift;
}
}
private static int[] CalculateCharFrequency(char[] x, char[] y, int z)
{
int i;
int[] freq = new int[z];
for (i = 0; i < x.Length; i++) freq[x[i]]++;
for (i = 0; i < y.Length; i++) freq[y[i]]++;
return freq;
}
private static void PreQsBc(char[] x, int[] qsBc)
{
int i, m = x.Length;
for (i = 0; i < qsBc.Length; ++i)
qsBc[i] = m + 1;
for (i = 0; i < m; ++i)
qsBc[x[i]] = m - i;
}
private static void SetupOptimalSearch()
{
OrderPattern(_x, _pattern, _frequency);
PreQsBc(_x, _qsBc);
PreAdaptedGs(_x, _adaptedGs, _pattern);
}
public static void InitOptimalSearch(string pattern, string source)
{
_x = pattern.ToCharArray();
_y = source.ToCharArray();
_m = _x.Length;
_n = _y.Length;
_adaptedGs = new int[_m + 1];
_qsBc = new int[65536];
_frequency = CalculateCharFrequency(_x, _y, 65536);
_pattern = new Pattern[_m];
}
public static Result FindAll()
{
int i, j;
List<int> result = new List<int>();
SetupOptimalSearch();
j = 0;
int jOld = 0;
while (j <= _n - _m)
{
i = 0;
while (i < _m && _pattern[i].Character == _y[j + _pattern[i].Location]) ++i;
if (i >= _m)
result.Add(j);
jOld = j;
if (j < _n - _m)
j += Math.Max(_adaptedGs[i], _qsBc[_y[j + _m]]);
else
j += _adaptedGs[i];
}
return new Result(jOld, result);
}
public struct Result
{
public int Comp;
public List<int> Indexes;
public Result(int comp, List<int> indexes)
{
Comp = comp;
Indexes = indexes;
}
}
public struct Pattern
{
public int Location;
public char Character;
}
Example
string source = "GCATCGCAGAGAGTATACAGTACG";
string pattern = "GCAGAGAG";
InitOptimalSearch(pattern, source);
Result result = FindAll();
Output
result {
Comp: 14
Indexes: { 5 }
}